def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: str,
                  number_of_pages: int) -> List[str]:
    """
    :parsed_listing_page: --> a parsed page that lists claims
    :listing_page_url: --> the URL associated with that page
    :number_of_pages: --> number of listing pages
    :return: --> the list of URLs of all the claims
    """
    urls = []
    # First page:
    page_content = caching.get(listing_page_url, headers=self.headers, timeout=5)
    page = BeautifulSoup(page_content, "lxml")
    if page is not None:
        for ex_url in self.extract_urls(page):
            urls.append(ex_url)
    # Remaining pages:
    for page_number in tqdm(range(2, number_of_pages)):
        if 0 < self.configuration.maxClaims < len(urls):
            break
        url = listing_page_url + "page/" + str(page_number) + "/"
        page_content = caching.get(url, headers=self.headers, timeout=5)
        page = BeautifulSoup(page_content, "lxml")
        if page is not None:
            for ex_url in self.extract_urls(page):
                urls.append(ex_url)
    return urls
def get_all_claims(self):
    claims = []  # type: List[Claim]
    listing_pages = self.retrieve_listing_page_urls()
    for listing_page_url in listing_pages:
        print("Fetching listing pages from " + listing_page_url)
        page = caching.get(listing_page_url, headers=self.headers, timeout=5)
        if not page:
            continue
        parsed_listing_page = BeautifulSoup(page, self.configuration.parser_engine)
        number_of_pages = self.find_page_count(parsed_listing_page)
        if number_of_pages and number_of_pages < 0:
            number_of_pages = None
        urls = self.retrieve_urls(parsed_listing_page, listing_page_url, number_of_pages)
        print("Extracting claims listed in " + listing_page_url)
        for url in tqdm(urls):
            try:
                if "http" in url:
                    review_page = caching.get(url, headers=self.headers, timeout=6)
                    if review_page:
                        parsed_claim_review_page = BeautifulSoup(review_page,
                                                                 self.configuration.parser_engine)
                        claim = get_claim_from_cache(url)
                        if not claim:
                            local_claims = self.extract_claim_and_review(parsed_claim_review_page, url)
                            for claim in local_claims:
                                self._annotate_claim(claim)
                            if len(local_claims) > 1:
                                for claim in local_claims:
                                    claims.append(claim.generate_dictionary())
                            elif len(local_claims) == 1 and local_claims[0]:
                                claims.append(local_claims[0].generate_dictionary())
                                cache_claim(local_claims[0])
                            else:
                                self.failed_log.write(url + "\n")
                                self.failed_log.flush()
                        else:
                            claims.append(claim.generate_dictionary())
            except ConnectionError:
                pass
    self.failed_log.close()
    return pandas.DataFrame(claims)
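# The methods above and below all rely on a shared `caching` module that is not
# shown in this excerpt. The sketch below is only an assumption of what it
# provides: a cached HTTP GET that returns the response body as text and an
# empty string on failure. The cache location, hashing scheme, and error
# handling here are illustrative, not the project's actual implementation.
import hashlib
import os

import requests


def get(url, headers=None, timeout=10):
    """Return the body of `url`, caching it on disk (assumed behaviour of caching.get)."""
    cache_dir = os.path.expanduser("~/.claim_cache")  # hypothetical location
    os.makedirs(cache_dir, exist_ok=True)
    cache_file = os.path.join(cache_dir, hashlib.sha256(url.encode()).hexdigest())
    if os.path.exists(cache_file):
        with open(cache_file, encoding="utf-8") as f:
            return f.read()
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        return ""
    if response.status_code != 200:
        return ""
    with open(cache_file, "w", encoding="utf-8") as f:
        f.write(response.text)
    return response.text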
def find_page_count(self, parsed_listing_page: BeautifulSoup) -> int:
    # Probe successive listing pages until one stops resolving.
    count = 26
    url = "https://www.washingtonpost.com/news/fact-checker/page/" + str(count + 1)
    result = caching.get(url, headers=self.headers, timeout=10)
    if result:
        while result:
            count += 1
            url = "https://www.washingtonpost.com/news/fact-checker/page/" + str(count)
            result = caching.get(url, headers=self.headers, timeout=10)
    else:
        count -= 1
    return count
def retrieve_listing_page_urls(self) -> List[str]:
    links = []
    different_categories_value = ["disinformation-cases"]
    url_begins = [
        "https://euvsdisinfo.eu/",
        "https://euvsdisinfo.eu/ru/",
        "https://euvsdisinfo.eu/it/",
        "https://euvsdisinfo.eu/es/",
        "https://euvsdisinfo.eu/fr/",
        "https://euvsdisinfo.eu/de/"
    ]
    for url in url_begins:
        for value in different_categories_value:
            # The page count is read from the main English listing and reused
            # for every language version.
            data = caching.get("https://euvsdisinfo.eu/disinformation-cases/",
                               headers=self.headers, timeout=15)
            soup = BeautifulSoup(data, 'html.parser')
            nb = self.find_page_count(soup)
            for x in range(0, int(nb / 10)):
                links.append(url + value + '/?offset=' + str(x * 10))
    return links
def retrieve_listing_page_urls(self) -> List[str]:
    data = caching.get('https://euvsdisinfo.eu/disinformation-cases')
    soup = BeautifulSoup(data, 'html.parser')
    nb = self.find_page_count(soup)
    links = []
    for x in range(0, int(nb / 10)):
        links.append('https://euvsdisinfo.eu/disinformation-cases/?offset=' + str(x * 10))
    return links
def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: str,
                  number_of_pages: int) -> List[str]:
    urls = []
    offset = 1
    # Page through the AAP "loadmore" JSON endpoint, 100 posts at a time,
    # until it returns an empty list.
    links = caching.get(
        f"https://loadmore.aap.com.au/category?category=6&postOffset={offset}&perPage=100"
    )
    offset = 100
    while links != "[]":
        parsed_json = json.loads(links)
        for link in parsed_json:
            urls.append(link['link'])
        links = caching.get(
            f"https://loadmore.aap.com.au/category?category=6&postOffset={offset}&perPage=100"
        )
        offset += 100
    return urls
def retrieve_listing_page_urls(self) -> List[str]:
    listings_url = "https://www.politifact.com/truth-o-meter/rulings/"
    page = caching.get(listings_url, headers=self.headers, timeout=5)
    parsed = BeautifulSoup(page, "lxml")
    main_tag = parsed.find("main", {"class": "main"})  # type: BeautifulSoup
    links = main_tag.find_all("a", href=True)
    return ["http://www.politifact.com" + link['href'] for link in links]
def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: str,
                  number_of_pages: int) -> List[str]:
    # Page through the WordPress REST API, 100 posts at a time, until it
    # returns an empty JSON array.
    query_url = "https://www.newtral.es/wp-json/wp/v2/posts?per_page=100&offset={offset}&categories=1" + \
                "&exclude=80729%2C79970%2C78262%2C78455%2C77275%2C77315%2C77161%2C76907%2C76298" + \
                "%2C75434%2C74706%2C74103%2C74062&_locale=user"
    urls = []
    json_output = caching.get(query_url.format(offset=0), headers=self.headers, timeout=5)
    offset = 0
    while json_output.strip() != "[]":
        pages = json.loads(json_output)
        for page in pages:
            urls.append(page['link'])
        offset += 100
        json_output = caching.get(query_url.format(offset=offset), headers=self.headers, timeout=5)
    return urls
def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: str,
                  number_of_pages: int) -> List[str]:
    urls = self.extract_urls(parsed_listing_page)
    for page_number in tqdm(range(2, number_of_pages)):
        url = "http://factscan.ca/page/" + str(page_number) + "/"
        page = caching.get(url, headers=self.headers, timeout=5)
        current_parsed_listing_page = BeautifulSoup(page, "lxml")
        urls += self.extract_urls(current_parsed_listing_page)
    return urls
def find_page_count(self, parsed_listing_page: BeautifulSoup) -> int:
    count = 5
    url = "https://www.polygraph.info/z/20382?p=" + str(count + 1)
    result = caching.get(url, headers=self.headers, timeout=10)
    if result:
        while result:
            count += 1
            url = "https://www.polygraph.info/z/20382?p=" + str(count)
            result = caching.get(url, headers=self.headers, timeout=10)
            if result:
                parsed = BeautifulSoup(result, self.configuration.parser_engine)
                articles = parsed.findAll("li", {"class": "fc__item"})
                if not articles or len(articles) == 0:
                    break
    else:
        count -= 1
    return count - 1
def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: str,
                  number_of_pages: int) -> List[str]:
    # Page 1 is covered by the range below, so the already parsed listing page
    # does not need to be extracted separately.
    urls = []
    for page_number in tqdm(range(1, number_of_pages)):
        url = "https://www.truthorfiction.com/category/fact-checks/page/" + str(page_number) + "/"
        page = caching.get(url, headers=self.headers, timeout=20)
        current_parsed_listing_page = BeautifulSoup(page, "lxml")
        urls += self.extract_urls(current_parsed_listing_page)
    return urls
def find_page_count(self, parsed_listing_page: BeautifulSoup) -> int:
    count = 26
    url = "https://checkyourfact.com/page/" + str(count + 1)
    result = caching.get(url, headers=self.headers, timeout=10)
    if result:
        while result:
            count += 1
            url = "https://checkyourfact.com/page/" + str(count)
            result = caching.get(url, headers=self.headers, timeout=10)
            if result:
                parsed = BeautifulSoup(result, self.configuration.parser_engine)
                articles = parsed.find("articles").findAll("article")
                if not articles or len(articles) == 0:
                    break
    else:
        count -= 1
    return count
def find_page_count(self, parsed_listing_page: BeautifulSoup) -> int:
    page_nav = parsed_listing_page.find("div", {"class": "nav-previous"})
    last_page_link = page_nav.findAll("a")[0]['href']
    page_re = re.compile("https://www.truthorfiction.com/category/fact-checks/page/([0-9]+)/")
    max_page = int(page_re.match(last_page_link).group(1))
    if (max_page >= 2) and ((max_page * 10) <= self.configuration.maxClaims):
        page = caching.get(last_page_link, headers=self.headers, timeout=5)
        if page:
            parsed_listing_page = BeautifulSoup(page, self.configuration.parser_engine)
            max_page = self.find_page_count(parsed_listing_page)
    return max_page
def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: str,
                  number_of_pages: int) -> List[str]:
    urls = self.extract_urls(parsed_listing_page)
    for page_number in trange(1, number_of_pages):
        url = listing_page_url + "?page=" + str(int(page_number))
        page = caching.get(url, headers=self.headers, timeout=20)
        current_parsed_listing_page = BeautifulSoup(page, "lxml")
        urls += self.extract_urls(current_parsed_listing_page)
    return urls
def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: str,
                  number_of_pages: int) -> List[str]:
    urls = self.extract_urls(parsed_listing_page)
    for page_number in tqdm(range(2, number_of_pages)):
        if 0 < self.configuration.maxClaims < len(urls):
            break
        url = listing_page_url + "/page/" + str(page_number)
        page = caching.get(url, headers=self.headers, timeout=5)
        current_parsed_listing_page = BeautifulSoup(page, "lxml")
        urls = urls + self.extract_urls(current_parsed_listing_page)
    return urls
def find_page_count(self, parsed_listing_page: BeautifulSoup) -> int:
    next_link = parsed_listing_page.find("a", {"class": "btn-next btn"})['href']
    next_page_contents = caching.get(next_link, headers=self.headers, timeout=5)
    next_page = BeautifulSoup(next_page_contents, "lxml")
    # Format: u'Fact Checks Archive | Page 2 of 1069 | Snopes.com'
    title_text = next_page.find("title").text
    max_page_pattern = re.compile("Page [0-9]+ of ([0-9]+)")
    result = max_page_pattern.match(title_text.split("|")[1].strip())
    max_page = int(result.group(1))
    return max_page
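# A worked example of the title parsing above, using the sample title quoted in
# the code; this is only an illustration of the regex, not part of the scraper.
import re

title_text = "Fact Checks Archive | Page 2 of 1069 | Snopes.com"
max_page_pattern = re.compile("Page [0-9]+ of ([0-9]+)")
result = max_page_pattern.match(title_text.split("|")[1].strip())
print(int(result.group(1)))  # -> 1069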
def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: str,
                  number_of_pages: int) -> List[str]:
    urls = self.extract_urls(parsed_listing_page)
    page_number = 2
    while True:
        url = listing_page_url + "?page=" + str(page_number)
        page = caching.get(url, headers=self.headers, timeout=5)
        if not page:
            break
        current_parsed_listing_page = BeautifulSoup(page, "lxml")
        urls += self.extract_urls(current_parsed_listing_page)
        page_number += 1
    return urls
def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: str,
                  number_of_pages: int) -> List[str]:
    urls = []
    offset = 1
    links = caching.get(
        f"https://loadmore.aap.com.au/category?category=6&postOffset={offset}&perPage=100"
    )
    offset = 100
    tmp_counter = 0
    while links != "[]" and tmp_counter < self.configuration.maxClaims:
        parsed_json = json.loads(links)
        for link in parsed_json:
            tmp_counter += 1
            urls.append(link['link'])
            if self.configuration.maxClaims <= tmp_counter:
                break
        links = caching.get(
            f"https://loadmore.aap.com.au/category?category=6&postOffset={offset}&perPage=100"
        )
        offset += 100
    return urls
def get(self, url):
    """
    @return the webpage
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    }
    html = caching.get(url, headers=headers)
    soup = BeautifulSoup(html, 'lxml')
    # Remove tags that are of no use for extraction.
    for s in soup.select("script, iframe, head, header, footer, style"):
        s.extract()
    return soup
def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: str,
                  number_of_pages: int) -> List[str]:
    urls = self.extract_urls(parsed_listing_page)
    for page_number in tqdm(range(0, number_of_pages)):
        url = ("https://africacheck.org/fact-checks?field_article_type_value=reports"
               "&field_rated_value=All&field_country_value=All&sort_bef_combine=created_DESC"
               "&sort_by=created&sort_order=DESC&page=" + str(page_number))
        page = caching.get(url, headers=self.headers, timeout=5)
        current_parsed_listing_page = BeautifulSoup(page, "lxml")
        urls += self.extract_urls(current_parsed_listing_page)
    return urls
def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: str,
                  number_of_pages: int) -> List[str]:
    urls = self.extract_urls(parsed_listing_page)
    for page_number in tqdm(range(2, number_of_pages)):
        url = "https://checkyourfact.com/page/" + str(page_number) + "/"
        page = caching.get(url, headers=self.headers, timeout=5)
        if page:
            current_parsed_listing_page = BeautifulSoup(page, "lxml")
            urls += self.extract_urls(current_parsed_listing_page)
        else:
            break
    return urls
def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: str,
                  number_of_pages: int) -> List[str]:
    urls = self.extract_urls(parsed_listing_page)
    for page_number in tqdm(range(0, number_of_pages)):
        # Each listing page holds 9 articles:
        if (page_number * 9) + 18 >= self.configuration.maxClaims:
            break
        # Previously: "https://africacheck.org/latest-reports/page/" + str(page_number) + "/"
        url = ("https://africacheck.org/search?rt_bef_combine=created_DESC"
               "&sort_by=created&sort_order=DESC&page=" + str(page_number))
        page = caching.get(url, headers=self.headers, timeout=5)
        current_parsed_listing_page = BeautifulSoup(page, "lxml")
        urls += self.extract_urls(current_parsed_listing_page)
    return urls
def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: str,
                  number_of_pages: int) -> List[str]:
    # The first listing page has already been parsed; extract its links.
    urls = self.extract_urls(parsed_listing_page)
    # Walk through pages 2 to the last one.
    for page_number in tqdm(range(2, number_of_pages + 1)):
        url = "https://www.polygraph.info/z/20382?p=" + str(page_number)
        # Load from the cache (downloaded on first access).
        page = caching.get(url, headers=self.headers, timeout=5)
        if page:
            # Parse the page with BeautifulSoup.
            current_parsed_listing_page = BeautifulSoup(page, "lxml")
            # Extract the claim links from this page and add them to urls.
            urls += self.extract_urls(current_parsed_listing_page)
        else:
            break
    return urls
def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: str,
                  number_of_pages: int) -> List[str]:
    urls = self.extract_urls(parsed_listing_page)
    page_number = 2
    while (page_number * 30) <= self.configuration.maxClaims:
        url = listing_page_url + "?page=" + str(page_number)
        page = caching.get(url, headers=self.headers, timeout=5)
        if page is not None:
            current_parsed_listing_page = BeautifulSoup(page, "lxml")
        else:
            break
        nav_buttons = current_parsed_listing_page.find_all("section", attrs={'class': 't-row'})
        nav_buttons = nav_buttons[-1].find_all("li", attrs={'class': 'm-list__item'})
        if len(nav_buttons) == 1:
            # Only one pagination button left: this was the last page.
            break
        else:
            urls += self.extract_urls(current_parsed_listing_page)
            page_number += 1
    return urls
def find_last_page(self):
    # Returns the last page that lists articles.
    page = 80
    count = 32
    lim = -1
    # Binary search over the listing pages.
    while count >= 1:
        url = "https://factuel.afp.com/?page=" + str(int(page))
        result = caching.get(url, headers=self.headers, timeout=10)
        parsed = BeautifulSoup(result, self.configuration.parser_engine)
        article = parsed.findAll("article")
        if lim > 0:
            count = count / 2
        if len(article) != 0:
            if count < 1:
                return int(page)
            page = page + count
        else:
            if lim == -1:
                lim = page
                count = count / 2
            elif count < 1:
                return int(page - 1)
            page = page - count
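# A compact, self-contained sketch of the search strategy used in
# find_last_page above: walk upward while listing pages still contain
# articles, and halve the step once an empty page (the upper bound) has been
# seen. `page_has_articles` is a hypothetical stand-in for the
# caching.get + BeautifulSoup check; the start page and step mirror the
# constants used above.
def find_last_page_sketch(page_has_articles, start=80, step=32):
    page = start
    upper_bound_seen = False
    while step >= 1:
        if page_has_articles(page):
            if upper_bound_seen:
                step = step / 2
            if step < 1:
                return int(page)
            page = page + step
        else:
            upper_bound_seen = True
            step = step / 2
            if step < 1:
                return int(page - 1)
            page = page - step


# Example: with a site whose last non-empty listing page is 100, the probe
# sequence is 80, 112, 96, 104, 100, 102, 101 and the sketch returns 100.
print(find_last_page_sketch(lambda p: p <= 100))  # -> 100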