import logging
import re
import time

import archiveis
from tqdm import tqdm

# Project-internal helpers used below (get_selenium_driver, rule_based_crawl,
# page_in_archive, crawl_link_article, get_news_source_article, extract_twitter_url,
# crawl_from_fact_checking) and Constants are defined elsewhere in the repo.


def fix_slaute(db):
    """Re-crawl the article text for fact-check entries whose URL matches 'salute'."""
    news_collection = db['news_collection']
    search_items = []
    for doc in news_collection.find({'fact_url': {"$regex": "salute"}},
                                    {"news_id": 1, "fact_url": 1, "ref_source.text": 1}):
        search_items.append((doc.get("news_id"), doc.get("fact_url")))

    driver = get_selenium_driver()
    for index, (news_id, fact_url) in enumerate(tqdm(search_items), start=1):
        driver.get(fact_url)
        # The article body sits in <p> tags under the main column of the page.
        paragraphs = driver.find_elements_by_xpath("//div[@class='col-md-8']//p")
        content = " ".join(p.text for p in paragraphs)
        print(content)
        news_collection.find_one_and_update(
            {"news_id": news_id}, {"$set": {"ref_source.text": content}})
        # Restart the browser every 10 pages to keep memory usage in check.
        if index % 10 == 0:
            driver.close()
            driver = get_selenium_driver()

def fix_piaui(db):
    """Re-crawl 'poligrafo' and 'aosfatos' fact-check articles whose stored text is
    missing or too short."""
    news_collection = db['news_collection']

    for domain in ["poligrafo", "aosfatos"]:
        search_items = []
        for doc in news_collection.find({'fact_url': {"$regex": domain}},
                                        {"news_id": 1, "fact_url": 1, "ref_source.text": 1}):
            text = doc.get("ref_source", {}).get("text", "")
            # Only re-crawl entries whose stored article text is essentially empty.
            if len(text.split()) < 5:
                search_items.append((doc.get("news_id"), doc.get("fact_url")))

        driver = get_selenium_driver()
        for idx, (news_id, fact_url) in enumerate(tqdm(search_items), start=1):
            # rule_based_crawl returns (url_list, domain, content, lang, claim).
            content = rule_based_crawl(fact_url, driver, news_id, None, None)[2]
            print(fact_url, content)
            news_collection.find_one_and_update(
                {"news_id": news_id}, {"$set": {"ref_source.text": content}})
            # Restart the browser every 10 pages to keep memory usage in check.
            if idx % 10 == 0:
                driver.close()
                driver = get_selenium_driver()
        driver.close()

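
# A minimal usage sketch, not part of the original repo: the repair jobs above only
# need a pymongo database handle. The connection string and the "fake_news" database
# name are illustrative assumptions.
def run_repair_jobs():
    from pymongo import MongoClient

    client = MongoClient("mongodb://localhost:27017")
    db = client["fake_news"]
    fix_slaute(db)
    fix_piaui(db)
    fix_pesacheck(db)
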
def fetch_fact_check(idx, all_pages_chunkify, main_url,
                     db, crawl_func):
    """Crawl the chunk of listing pages assigned to worker `idx` and upsert each
    fact-check item (labelled 'fake') into the news collection."""
    idx_pages = all_pages_chunkify[idx]
    news_collection = db[Constants.NEWS_COLLECTION]
    driver = get_selenium_driver()
    for no_page in idx_pages:
        # Page -1 marks the un-paginated landing page; otherwise fill in the page number.
        url = main_url if no_page == -1 else main_url.format(no_page)
        return_element = crawl_func(driver, url)
        if return_element is None:
            continue

        print("Length for Page {} is {}".format(no_page, len(return_element['info_list'])))
        if len(return_element['info_list']) == 0:
            continue

        for item in return_element['info_list']:
            item['label'] = "fake"
            news_collection.find_one_and_update({'id': item['id']}, {"$set": item}, upsert=True)
            crawl_from_fact_checking(item, news_collection, driver)
        # Restart the browser after each listing page to keep memory usage in check.
        driver.close()
        driver = get_selenium_driver()
    driver.close()

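
# A sketch, assumed rather than taken from the repo, of how the chunked page lists
# consumed by fetch_fact_check might be built and dispatched. The page-numbering
# convention (landing page as -1, paginated pages starting at 2), n_workers, and the
# helper name are all assumptions; pymongo handles are thread-safe, so these
# I/O-bound workers can share `db`.
def dispatch_fetch_fact_check(total_pages, main_url, db, crawl_func, n_workers=4):
    from concurrent.futures import ThreadPoolExecutor

    pages = [-1] + list(range(2, total_pages + 1))
    # Round-robin split into one chunk of pages per worker.
    all_pages_chunkify = [pages[i::n_workers] for i in range(n_workers)]

    with ThreadPoolExecutor(max_workers=n_workers) as pool:
        futures = [pool.submit(fetch_fact_check, idx, all_pages_chunkify,
                               main_url, db, crawl_func)
                   for idx in range(n_workers)]
        for future in futures:
            future.result()  # surface any exception raised inside a worker
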
def job_from_factcheck_facebook(idx, fact_urls, db):
    """For the chunk of fact-check URLs assigned to worker `idx`, extract the first
    Facebook post link from each page and record it as the news source."""
    driver = get_selenium_driver()
    source_db = db['fake_news_source_article']
    format_db = db['fake_news_format']
    # URL path tokens that identify links to actual Facebook posts.
    tokens = ['posts', 'videos', 'photo', 'fbid', 'permalink']
    for news_id, url in fact_urls[idx]:
        if source_db.find_one({'id': news_id}) is not None:
            continue  # already processed
        try:
            driver.get(url)
            fb_urls = driver.find_elements_by_xpath(
                "//a[contains(@href, 'facebook.com')]")
            fb_urls = [a.get_attribute("href") for a in fb_urls]
            fb_urls = [u for u in fb_urls if any(t in u for t in tokens)]
            if len(fb_urls) == 0:
                continue
            fb_url = fb_urls[0]
            print("FB URL is {}".format(fb_url))
            format_db.find_one_and_update(
                {'id': news_id}, {'$set': {'url': fb_url, "type": "facebook"}})
            source_db.find_one_and_update(
                {'id': news_id}, {'$set': {'url': fb_url, 'id': news_id}}, upsert=True)
        except Exception:
            print("ERROR in Factchecking URL {}".format(url))

def fix_pesacheck(db):
    """Re-crawl the original source article for 'pesacheck' fact-check entries whose
    stored text is missing or too short."""
    news_collection = db['news_collection']
    search_items = []

    domain = "pesacheck"
    for doc in news_collection.find({'fact_url': {"$regex": domain}},
                                    {"news_id": 1, "fact_url": 1, "ref_source.text": 1}):
        text = doc.get("ref_source", {}).get("text", "")
        # Only re-crawl entries whose stored article text is essentially empty.
        if len(text.split()) < 5:
            search_items.append((doc.get("news_id"), doc.get("fact_url")))

    driver = get_selenium_driver()
    for idx, (news_id, fact_url) in enumerate(tqdm(search_items), start=1):
        # rule_based_crawl returns (url_list, domain, content, lang, claim);
        # take the first referenced source URL from url_list.
        url = rule_based_crawl(fact_url, driver, news_id, None, None)[0][0]
        return_dic = {'news_id': news_id, 'ref_source_url': url}
        try:
            return_dic['ref_source'] = page_in_archive(url=url, driver=driver)
            if "archive" in url:
                return_dic["ref_source"]['ref_archive_url'] = url
        except Exception:
            print("ERROR At {}".format(url))
        news_collection.find_one_and_update({"news_id": news_id},
                                            {"$set": return_dic})
        # Restart the browser every 10 pages to keep memory usage in check.
        if idx % 10 == 0:
            driver.close()
            driver = get_selenium_driver()

def job_from_facebook(idx, store_urls, db):
    """For the chunk of URLs assigned to worker `idx`, archive each Facebook URL via
    archive.is, then crawl the archived snapshot and store it as the news source."""
    driver = get_selenium_driver()
    news_collection = db[Constants.NEWS_COLLECTION]
    urls = store_urls[idx]
    with open("./fb_result.txt", 'a') as t:
        for news_id, url in urls:
            try:
                if "archive" not in url:
                    archive_url = archiveis.capture(url)
                    time.sleep(30)  # pause between archive.is capture requests
                else:
                    archive_url = url
                print(archive_url)
            except Exception as e:
                print(str(e))
                t.write(url + "\n")
                t.flush()
                print("Factcheck ERROR in {}".format(url))
                continue

            if "wip/" in archive_url:
                # Snapshot is still being generated ("work in progress"): store the
                # normalized URL so the entry can be revisited later instead of
                # crawling an unfinished page.
                news_collection.find_one_and_update(
                    {"news_id": news_id},
                    {"$set": {'archive_wip': archive_url.replace("wip/", ""),
                              'news_id': news_id}},
                    upsert=True)
            else:
                try:
                    return_dic = {'news_id': news_id}
                    return_dic['ref_source'] = get_news_source_article(
                        archive_url, driver)
                    return_dic['ref_source_url'] = return_dic['ref_source']['url']
                    print(news_id)
                    news_collection.find_one_and_update(
                        {"news_id": return_dic['news_id']}, {"$set": return_dic},
                        upsert=True)
                except Exception:
                    print("Problem in {}".format(archive_url))
                    continue

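
# A sketch, assumed rather than taken from the repo, of how the (id, url) chunks
# consumed by job_from_factcheck_facebook and job_from_facebook might be prepared.
# The field names ("news_id", "fact_url") and the round-robin split are assumptions.
def chunkify_id_urls(news_collection, n_workers=4):
    id_urls = [(doc["news_id"], doc["fact_url"])
               for doc in news_collection.find({"fact_url": {"$exists": True}},
                                               {"news_id": 1, "fact_url": 1})]
    # One chunk per worker, indexed by the `idx` argument of the job functions.
    return [id_urls[i::n_workers] for i in range(n_workers)]
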
def fetch_save_collection(idx, all_pages_chunkify, main_url, db, crawl_func):
    """Crawl the chunk of listing pages assigned to worker `idx`, fetch the referenced
    article for every item and upsert it into the news collection."""
    idx_pages = all_pages_chunkify[idx]
    return_list = []
    news_collection = db[Constants.NEWS_COLLECTION]
    driver = get_selenium_driver()
    for no_page in idx_pages:
        # Page -1 marks the un-paginated landing page; otherwise fill in the page number.
        url = main_url if no_page == -1 else main_url.format(no_page)
        try:
            return_element = crawl_func(driver, url)
        except Exception:
            break  # stop this worker if a listing page cannot be crawled
        if return_element is None:
            continue
        print("Length for Page {} is {}".format(
            no_page, len(return_element['info_list'])))
        if len(return_element['info_list']) == 0:
            continue

        # Make sure every item carries its fact-check URL before crawling it.
        return_element_list = []
        for info, fact_url in zip(return_element['info_list'],
                                  return_element['fact_url_list']):
            info.setdefault('url', fact_url)
            return_element_list.append(info)

        for item in return_element_list:
            try:
                item['ref_source'] = crawl_link_article(item['url'])
                item['ref_source']['ref_source_url'] = item['url']
                news_collection.find_one_and_update({'id': item['id']},
                                                    {"$set": item},
                                                    upsert=True)
            except Exception:
                continue

    driver.close()

    # Post-process any items accumulated in return_list (left empty by the loop above).
    for item in return_list:
        if news_collection.find_one({"id": item['id']}, {"id": 1}) is None:
            news_collection.update_one({'id': item['id']}, {'$set': item}, upsert=True)
            article_detail = crawl_link_article(item['url'])
            article_detail["agency"] = item["agency"]
            news_collection.find_one_and_update({'id': item['id']},
                                                {'$set': article_detail},
                                                upsert=True)
            logging.info(f"Success finish {item['id']}.")

def twitter_in_fact_check(news_collection, news_tweet_correlation, driver):
    """For fact-check entries with no reference source, recover the originating tweets
    from the fact-check page and record the news/tweet correlation."""
    tweet_ids = []
    list_kw = ['twitter', 'media']
    refresh_t = 0
    for doc in tqdm(news_collection.find({"ref_source_url": "NONE"})):
        refresh_t += 1
        try:
            originated = str(doc['orginated'])  # field name as stored in the collection
        except Exception:
            continue
        if any(kw in originated.lower() for kw in list_kw):
            try:
                twitter_urls = extract_twitter_url(fact_site_url=doc['fact_url'],
                                                   driver=driver)
            except Exception:
                logging.info("Exception in URL: {}".format(doc['fact_url']))
                print("Exception in URL: {}".format(doc['fact_url']))
                continue

            # Pull the numeric status id out of each tweet URL.
            tweet_list = []
            for turl in twitter_urls:
                try:
                    tweet_list.append(re.findall(r'\d+', turl.split("/")[-1])[0])
                except Exception:
                    continue
            if len(tweet_list) == 0:
                continue

            # Only the first tweet is kept as the reference source URL.
            news_collection.find_one_and_update({"news_id": doc['news_id']}, {
                "$set": {
                    'ref_source_url':
                    "https://twitter.com/i/web/status/{}".format(tweet_list[0])
                }
            })
            news_tweet = {
                "news_id": doc['news_id'],
                "tweet_list": [int(tweet) for tweet in tweet_list]
            }
            news_tweet_correlation.find_one_and_update(
                {'news_id': news_tweet['news_id']}, {"$set": news_tweet},
                upsert=True)
            tweet_ids += tweet_list
        # Restart the browser every 100 documents to keep memory usage in check.
        if refresh_t % 100 == 0:
            driver.close()
            driver = get_selenium_driver()
    return tweet_ids
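
# get_selenium_driver is used throughout but not shown here. A minimal sketch of what
# such a factory might look like, assuming headless Chrome and Selenium 3-style
# bindings (to match the find_elements_by_xpath calls above); the repo's real helper
# may configure the driver differently.
def get_selenium_driver_sketch():
    from selenium import webdriver

    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--no-sandbox")
    return webdriver.Chrome(options=options)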