def fix_slaute(db):
    """Re-crawl the article body for fact-checks whose fact_url matches 'salute'."""
    news_collection = db['news_collection']
    search_items = []
    for doc in news_collection.find({'fact_url': {"$regex": "salute"}},
                                    {"news_id": 1, "fact_url": 1, "ref_source.text": 1}):
        search_items.append((doc.get("news_id"), doc.get("fact_url")))

    driver = get_selenium_driver()
    index = 0
    for news_id, fact_url in tqdm(search_items):
        driver.get(fact_url)
        paragraphs = driver.find_elements_by_xpath("//div[@class='col-md-8']//p")
        content = " ".join(p.text for p in paragraphs)
        print(content)
        news_collection.find_one_and_update(
            {"news_id": news_id},
            {"$set": {"ref_source.text": content}})
        index += 1
        # Recycle the Selenium driver every 10 pages to avoid stale sessions.
        if index % 10 == 0:
            driver.close()
            driver = get_selenium_driver()

def fix_piaui(db):
    """Re-crawl poligrafo/aosfatos fact-checks whose stored text is essentially empty."""
    news_collection = db['news_collection']
    for domain in ["poligrafo", "aosfatos"]:
        search_items = []
        for doc in news_collection.find({'fact_url': {"$regex": domain}},
                                        {"news_id": 1, "fact_url": 1, "ref_source.text": 1}):
            text = doc.get("ref_source", {}).get("text", "")
            # Only re-crawl entries whose extracted text is essentially empty.
            if len(text.split()) < 5:
                search_items.append((doc.get("news_id"), doc.get("fact_url")))

        driver = get_selenium_driver()
        idx = 0
        for news_id, fact_url in tqdm(search_items):
            # rule_based_crawl returns (url_list, domain, content, lang, claim).
            content = rule_based_crawl(fact_url, driver, news_id, None, None)[2]
            print(fact_url, content)
            news_collection.find_one_and_update(
                {"news_id": news_id},
                {"$set": {"ref_source.text": content}})
            idx += 1
            # Recycle the Selenium driver every 10 pages.
            if idx % 10 == 0:
                driver.close()
                driver = get_selenium_driver()

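# The fix_* helpers above (and fix_pesacheck below) all recycle the Selenium
# driver every 10 items by hand. A minimal sketch of a helper that could factor
# this pattern out is given here; it is not part of the original module, and it
# assumes get_selenium_driver() is the module's existing driver factory.
def maybe_refresh_driver(driver, processed, every=10):
    """Return a fresh driver every `every` processed items, otherwise the same one."""
    if processed > 0 and processed % every == 0:
        driver.close()
        driver = get_selenium_driver()
    return driver
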
def fetch_fact_check(idx, all_pages_chunkify, main_url, db, crawl_func):
    """Crawl one chunk of listing pages and store every fact-checked item as fake."""
    idx_pages = all_pages_chunkify[idx]
    news_collection = db[Constants.NEWS_COLLECTION]
    driver = get_selenium_driver()
    for no_page in idx_pages:
        # -1 marks the un-paginated landing page; other values fill the page template.
        url = main_url if no_page == -1 else main_url.format(no_page)
        return_element = crawl_func(driver, url)
        if return_element is None:
            continue
        print("Length for Page {} is {}".format(no_page, len(return_element['info_list'])))
        if len(return_element['info_list']) == 0:
            continue
        for item in return_element['info_list']:
            item['label'] = "fake"
            news_collection.find_one_and_update({'id': item['id']}, {"$set": item}, upsert=True)
            crawl_from_fact_checking(item, news_collection, driver)
        # Recycle the Selenium driver after every page.
        driver.close()
        driver = get_selenium_driver()

def job_from_factcheck_facebook(idx, fact_urls, db):
    """Visit fact-check pages and extract the debunked Facebook post URL they link to."""
    driver = get_selenium_driver()
    source_db = db['fake_news_source_article']
    format_db = db['fake_news_format']
    # Href fragments that identify links to actual Facebook posts.
    tokens = ['posts', 'videos', 'photo', 'fbid', 'permalink']
    for news_id, url in fact_urls[idx]:
        if source_db.find_one({'id': news_id}) is not None:
            continue
        try:
            driver.get(url)
            anchors = driver.find_elements_by_xpath(
                "//a[contains(@href, 'facebook.com')]")
            fb_urls = [a.get_attribute("href") for a in anchors]
            fb_urls = [u for u in fb_urls if any(t in u for t in tokens)]
            if len(fb_urls) == 0:
                continue
            fb_url = fb_urls[0]
            print("FB URL is {}".format(fb_url))
            format_db.find_one_and_update(
                {'id': news_id},
                {'$set': {'url': fb_url, "type": "facebook"}})
            source_db.find_one_and_update(
                {'id': news_id},
                {'$set': {'url': fb_url, 'id': news_id}},
                upsert=True)
        except Exception:
            print("ERROR in Factchecking URL {}".format(url))

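# A hedged usage sketch (not in the original module) of how the idx-based jobs
# can be fanned out: job_from_factcheck_facebook takes a chunk index plus the
# full list of chunks, which fits a simple thread pool since the work is
# I/O-bound (Selenium + MongoDB, both usable from threads). The pool size is an
# illustrative choice, and run_factcheck_facebook_jobs is a hypothetical name.
def run_factcheck_facebook_jobs(fact_url_chunks, db, max_workers=4):
    from concurrent.futures import ThreadPoolExecutor
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        for idx in range(len(fact_url_chunks)):
            pool.submit(job_from_factcheck_facebook, idx, fact_url_chunks, db)
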
def fix_pesacheck(db):
    """Re-crawl the referenced source article for pesacheck fact-checks with missing text."""
    news_collection = db['news_collection']
    domain = "pesacheck"
    search_items = []
    for doc in news_collection.find({'fact_url': {"$regex": domain}},
                                    {"news_id": 1, "fact_url": 1, "ref_source.text": 1}):
        text = doc.get("ref_source", {}).get("text", "")
        # Only re-crawl entries whose extracted text is essentially empty.
        if len(text.split()) < 5:
            search_items.append((doc.get("news_id"), doc.get("fact_url")))

    driver = get_selenium_driver()
    idx = 0
    for news_id, fact_url in tqdm(search_items):
        return_dic = {}
        # rule_based_crawl returns (url_list, domain, content, lang, claim);
        # take the first referenced URL from the fact-check page.
        url = rule_based_crawl(fact_url, driver, news_id, None, None)[0][0]
        try:
            return_dic['ref_source'] = page_in_archive(url=url, driver=driver)
            return_dic['ref_source_url'] = return_dic['ref_source']['url']
            if "archive" in url:
                return_dic["ref_source"]['ref_archive_url'] = url
        except Exception:
            print("ERROR At {}".format(url))
        return_dic['news_id'] = news_id
        return_dic['ref_source_url'] = url
        news_collection.find_one_and_update({"news_id": news_id}, {"$set": return_dic})
        idx += 1
        # Recycle the Selenium driver every 10 items.
        if idx % 10 == 0:
            driver.close()
            driver = get_selenium_driver()

def job_from_facebook(idx, store_urls, db):
    """Archive Facebook post URLs via archive.today and store the crawled article."""
    driver = get_selenium_driver()
    news_collection = db[Constants.NEWS_COLLECTION]
    fail_log = open("./fb_result.txt", 'a')
    for news_id, url in store_urls[idx]:
        try:
            if "archive" not in url:
                archive_url = archiveis.capture(url)
                # Give archive.today time to finish the capture.
                time.sleep(30)
            else:
                archive_url = url
            print(archive_url)
        except Exception as e:
            print(str(e))
            fail_log.write(url + "\n")
            fail_log.flush()
            print("Factcheck ERROR in {}".format(url))
            continue

        # archive.today returns a ".../wip/..." URL while the capture is still
        # in progress; strip the marker to get the final snapshot URL.
        is_wip = "wip/" in archive_url
        archive_url = archive_url.replace("wip/", "")
        if not is_wip:
            try:
                return_dic = {'news_id': news_id,
                              'ref_source': get_news_source_article(archive_url, driver)}
                return_dic['ref_source_url'] = return_dic['ref_source']['url']
                print(news_id)
                news_collection.find_one_and_update(
                    {"news_id": return_dic['news_id']},
                    {"$set": return_dic}, upsert=True)
            except Exception:
                print("Problem in {}".format(archive_url))
                continue
        else:
            # Capture not finished yet: remember the eventual snapshot URL so the
            # article can be fetched in a later pass.
            news_collection.find_one_and_update(
                {"news_id": news_id},
                {"$set": {'archive_wip': archive_url, 'news_id': news_id}},
                upsert=True)
    fail_log.close()

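# Both job_from_facebook and job_from_factcheck_facebook expect their URL lists
# to be pre-chunked so each worker can index its own slice. A minimal sketch of
# such a chunking helper is shown here; it is an assumption, not the module's
# original partitioning code.
def chunkify(items, n_chunks):
    """Split items into n_chunks interleaved slices of roughly equal size."""
    return [items[i::n_chunks] for i in range(n_chunks)]
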
def fetch_save_collection(idx, all_pages_chunkify, main_url, db, crawl_func):
    """Crawl one chunk of listing pages and store each item with its source article."""
    idx_pages = all_pages_chunkify[idx]
    return_list = []
    news_collection = db[Constants.NEWS_COLLECTION]
    driver = get_selenium_driver()
    for no_page in idx_pages:
        # -1 marks the un-paginated landing page; other values fill the page template.
        url = main_url if no_page == -1 else main_url.format(no_page)
        try:
            return_element = crawl_func(driver, url)
        except Exception:
            break
        if return_element is None:
            continue
        print("Length for Page {} is {}".format(no_page, len(return_element['info_list'])))
        if len(return_element['info_list']) == 0:
            continue

        # Attach the fact-check URL to items that do not already carry one.
        return_element_list = []
        for item, fact_url in zip(return_element['info_list'], return_element['fact_url_list']):
            if "url" not in item:
                item['url'] = fact_url
            return_element_list.append(item)

        for item in return_element_list:
            try:
                item['ref_source'] = crawl_link_article(item['url'])
                item['ref_source']['ref_source_url'] = item['url']
                news_collection.find_one_and_update({'id': item['id']}, {"$set": item}, upsert=True)
            except Exception:
                continue
    driver.close()

    for item in return_list:
        if news_collection.find_one({"id": item['id']}, {"id": 1}) is None:
            news_collection.update_one({'id': item['id']}, {'$set': item}, upsert=True)
            article_detail = crawl_link_article(item['url'])
            article_detail['agency'] = item['agency']
            news_collection.find_one_and_update(
                {'id': item['id']}, {'$set': article_detail}, upsert=True)
            logging.info(f"Success finish {item['id']}.")

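# For reference, the shape of the dictionary that crawl_func is expected to
# return to fetch_fact_check / fetch_save_collection, inferred from how the keys
# are used above. The values shown are placeholders, and any field other than
# 'id' and 'url' inside an info_list entry is an assumption.
_example_return_element = {
    "info_list": [{"id": "<news id>", "url": "<fact-check url>"}],
    "fact_url_list": ["<fact-check url>"],
}
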
def twitter_in_fact_check(news_collection, news_tweet_correlation, driver):
    """Back-fill tweet IDs for fact-checks that originated from Twitter/media posts."""
    tweet_ids = []
    list_kw = ['twitter', 'media']
    refresh_t = 1
    for doc in tqdm(news_collection.find({"ref_source_url": "NONE"})):
        refresh_t += 1
        # Recycle the Selenium driver periodically to avoid stale sessions.
        if refresh_t % 100 == 0:
            driver.close()
            driver = get_selenium_driver()
        originated = doc.get('orginated')  # field name as stored in the collection
        if originated is None:
            continue
        if not any(kw in str(originated).lower() for kw in list_kw):
            continue
        try:
            twitter_urls = extract_twitter_url(fact_site_url=doc['fact_url'], driver=driver)
        except Exception:
            logging.info("Exception in URL: {}".format(doc['fact_url']))
            print("Exception in URL: {}".format(doc['fact_url']))
            continue
        tweet_list = []
        for turl in twitter_urls:
            try:
                # The numeric status ID is the last path component of the tweet URL.
                tweet_list.append(re.findall(r'\d+', turl.split("/")[-1])[0])
            except IndexError:
                continue
        if len(tweet_list) == 0:
            continue
        # Only the first tweet is used as the reference source URL.
        news_collection.find_one_and_update(
            {"news_id": doc['news_id']},
            {"$set": {'ref_source_url':
                      "https://twitter.com/i/web/status/{}".format(tweet_list[0])}})
        news_tweet = {"news_id": doc['news_id'],
                      "tweet_list": [int(tweet) for tweet in tweet_list]}
        news_tweet_correlation.find_one_and_update(
            {'news_id': news_tweet['news_id']},
            {"$set": news_tweet}, upsert=True)
        tweet_ids += tweet_list
    return tweet_ids

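# A hedged end-to-end sketch (not in the original module) of wiring the Twitter
# back-fill to MongoDB. The host, port, database name and the collection name
# "news_tweet_correlation" are assumptions for illustration only.
if __name__ == "__main__":
    from pymongo import MongoClient

    client = MongoClient("localhost", 27017)
    db = client["fake_news_db"]  # assumed database name
    driver = get_selenium_driver()
    collected = twitter_in_fact_check(db["news_collection"],
                                      db["news_tweet_correlation"], driver)
    print("Collected {} tweet ids".format(len(collected)))
    driver.close()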