Example #1
def main():
    print('=========================')
    print(sys.argv[0])
    print('=========================')

    num_new_articles = 0

    with requests.Session() as session:
        session.headers.update(headers)

        articles = get_articles_on_pages(num_pages_to_check, session)

        print('\tgathering article info ...')
        for x in tqdm(articles):
            title = get_title(x)
            date = get_date(x)
            hash_str = make_hash(title, date)

            if is_article_new(hash_str):
                link = get_link(x)
                content = get_content(link, session)
                new_tup = (str(datetime.date.today()), title, content,
                           formatDate(date), hash_str, link, SOURCE)
                dbExecutor.insertOne(new_tup)
                num_new_articles += 1

        print(num_new_articles, 'new articles found,', len(articles),
              'articles checked,', num_errors, 'errors found\n')
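
All four examples rely on make_hash and is_article_new to skip articles that are already stored; neither helper is shown here. A minimal sketch of what they might look like, assuming a SHA-256 fingerprint over title and date and an in-memory set standing in for the database lookup (the real project presumably checks through dbExecutor):

import hashlib

_seen_hashes = set()

def make_hash(title, date):
    # Hypothetical sketch: fingerprint an article by hashing its title and
    # date so a later run can recognise items it has already stored.
    return hashlib.sha256((title + date).encode('utf-8')).hexdigest()

def is_article_new(hash_str):
    # Hypothetical sketch: the real helper presumably queries the database
    # behind dbExecutor; an in-memory set stands in for that lookup here.
    if hash_str in _seen_hashes:
        return False
    _seen_hashes.add(hash_str)
    return True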
Example #2
def main():
    print('=========================')
    print(sys.argv[0])
    print('=========================')

    num_new_articles = 0

    with requests.Session() as session:
        session.headers.update(headers)

        articles = get_articles_on_pages(num_pages_to_check, session)

        print('\tgathering articles ...')
        for x in tqdm(articles):
            title = get_title(x)
            date = get_date(x)
            # the date is not on the front page, so the hash is built from
            # base_url instead
            hash_str = make_hash(title, base_url)

            if is_article_new(hash_str):
                link = get_link(x)
                r = get_connection(link, session)
                soup = bs(r.text, 'html.parser')
                content = get_content(soup)
                new_tup = (str(datetime.date.today()), title, content, date,
                           hash_str, link, SOURCE)
                dbExecutor.insertOne(new_tup)
                num_new_articles += 1

        print(num_new_articles, 'new articles found,', len(articles),
              'articles checked,', num_errors, 'errors found\n')
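
Examples #2 and #4 fetch each article page themselves via get_connection and parse it with BeautifulSoup before extracting the body. A minimal sketch of those two helpers, assuming get_connection is a thin wrapper around session.get and that get_content simply joins the page's paragraph text (the real selectors are not part of the source):

def get_connection(link, session):
    # Hypothetical sketch: fetch the article through the shared session so
    # the headers set in main() are reused, and fail loudly on HTTP errors.
    r = session.get(link, timeout=10)
    r.raise_for_status()
    return r

def get_content(soup):
    # Hypothetical sketch: the real helper would target the site's article
    # container; joining the text of all <p> tags is a generic stand-in.
    return '\n'.join(p.get_text(strip=True) for p in soup.find_all('p'))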
Example #3
def main():
    print('=========================')
    print(sys.argv[0])
    print('=========================')

    num_new_articles = 0

    with requests.Session() as session:

        articles = getArticlesOn_n_pages(num_pages_to_check)

        print('\tgathering article info')
        for x in tqdm(articles):
            title = getTitle(x)
            date = getDate(x)
            hash_str = makeHash(title, date)

            if is_article_new(hash_str):
                link = getLink(x)
                content = getContent(link, session)
                tup = (str(datetime.date.today()), title, content, date,
                       hash_str, link, SOURCE)
                dbExecutor.insertOne(tup)
                num_new_articles += 1

        print(num_new_articles, 'new articles found', len(articles),
              'articles checked,', num_errors, 'errors found')
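
Every variant begins by collecting article entries from the first num_pages_to_check listing pages (example #3 calls the paginator without passing the session; the others pass it in). A minimal sketch of such a paginator, where the page URL template and the <article> tag are illustrative placeholders rather than the project's real markup:

from bs4 import BeautifulSoup as bs

def getArticlesOn_n_pages(num_pages, session):
    # Hypothetical sketch: walk the listing pages and collect every article
    # entry; the URL pattern and the <article> selector are placeholders.
    articles = []
    for page in range(1, num_pages + 1):
        r = session.get('https://example.com/news?page={}'.format(page),
                        timeout=10)
        soup = bs(r.text, 'html.parser')
        articles.extend(soup.find_all('article'))
    return articles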
Example #4
def main():
    print('=========================')
    print(sys.argv[0])
    print('=========================')

    num_new_articles = 0

    with requests.Session() as session:
        session.headers.update(headers)
        articles = getArticlesOn_n_pages(num_pages_to_check, session)
        articles_checked = len(articles)

        print('\tgathering article info ...')
        for x in tqdm(articles):
            title = getTitle(x)
            date = getDate(x)
            hash_str = makeHash(title)

            if is_article_new(hash_str):
                link = getLink(x)
                r = get_connection(link, session)
                soup = bs(r.text, 'html.parser')
                content = getContent(soup)
                tup = (str(datetime.date.today()), title, content,
                       formatDate(date), hash_str, link, SOURCE)
                dbExecutor.insertOne(tup)
                num_new_articles += 1

        print(num_new_articles, 'new articles found,', articles_checked,
              'articles checked', num_errors, 'errors found\n')
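
Examples #1 and #4 run the scraped date through formatDate before inserting, while #2 and #3 store it as-is. A minimal sketch of such a normaliser, assuming the site prints dates in a day.month.year form and the database expects ISO YYYY-MM-DD (both formats are guesses, not taken from the source):

import datetime

def formatDate(date):
    # Hypothetical sketch: parse a 'D. M. YYYY' style date string and re-emit
    # it in ISO form so every source stores dates consistently.
    day, month, year = (part.strip() for part in date.split('.') if part.strip())
    return str(datetime.date(int(year), int(month), int(day)))

The ISO output matches the str(datetime.date.today()) value already used for the crawl date in each inserted tuple.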