def get_claim_from_cache(url: str) -> Optional[Claim]:
    result = redis.hgetall("___cached___claim___" + url)
    if result:
        return Claim.from_dictionary(result)
    return None
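# For symmetry with get_claim_from_cache, a write-side counterpart is sketched
# below. It is illustrative only: the helper name put_claim_in_cache is not part
# of the existing module, and it assumes Claim.generate_dictionary() (used by the
# scrapers below) returns a flat str -> str mapping that redis-py's
# hset(..., mapping=...) accepts.
def put_claim_in_cache(url: str, claim: Claim) -> None:
    # Store the claim as a Redis hash under the same key prefix read above.
    redis.hset("___cached___claim___" + url, mapping=claim.generate_dictionary())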
def _annotate_claim(self, claim: Claim):
    if self.language == "eng" or self.language == "fra":
        claim.claim_entities = self.annotator.annotate(claim.claim, language=self.language)
        if claim.tags is not None:
            claim.keyword_entities = self.annotator.annotate(claim.tags, language=self.language)
        if claim.body is not None:
            claim.body_entities = self.annotator.annotate(claim.body, language=self.language)
        if claim.author is not None:
            claim.author_entities = self.annotator.annotate(claim.author, language=self.language)
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
    claim = Claim()
    claim.set_url(url)
    claim.set_source("newtral")

    # Title; when it has the form "Author: «claim»", also extract author and claim
    title = parsed_claim_review_page.find("meta", attrs={'property': 'og:title'})['content']
    title = title.strip().split("|")[0]
    claim.set_title(title)

    dospunto = re.search(r'(: «)', title)
    dospunt = re.search(r'(: “)', title)
    if dospunto:
        claim_a = title.split(":")
        claim.author = claim_a[0].strip()
        claim.claim = claim_a[1].strip("« »")
    elif dospunt:
        claim_b = title.split(":")
        claim.author = claim_b[0].strip()
        claim.claim = claim_b[1].strip(": “ ”")

    # Tags
    tags = parsed_claim_review_page.find_all("meta", attrs={'property': 'article:tag'})
    claim.set_tags(",".join(tag['content'] for tag in tags))

    # Publication date
    published = parsed_claim_review_page.find(
        "meta", attrs={'property': 'article:published_time'})['content']
    claim.date_published = published.strip()

    # Rating: found in the intro paragraph, in bold/strong text, or only in an image
    entry_content = parsed_claim_review_page.find("div", attrs={'class': 'entry-content'})
    intro = parsed_claim_review_page.find("div", attrs={'class': 'c-article__intro'})
    if intro is None:
        intro_rating_p = entry_content.find("em")
        if intro_rating_p is None:
            intro_rating_p = entry_content.find("p")
        if intro_rating_p is None:
            intro_rating_p = entry_content.find("div")
    else:
        intro_rating_p = intro.p

    rating_in_image = False
    if intro_rating_p is None:
        # Rating only present in an image
        rating_in_image = True
        rating_text = ""
    else:
        rating_text = intro_rating_p.get_text()

    rating_re_es_falso = regex.compile(
        r"(La afirmación es|La afirmación es una|La declaración es|Es|El dato es"
        r"|La comparación de Colau es)? ?([\p{Lu}| ]+)(\.| –|,| )")
    es_falso_match = rating_re_es_falso.match(rating_text)
    if es_falso_match is not None and es_falso_match.group(2) is not None:
        rating_text = es_falso_match.group(2)
    elif not rating_in_image:
        is_there_b = intro_rating_p.find('b')
        if is_there_b is not None:
            rating_text = is_there_b.text
        else:
            is_there_strong = intro_rating_p.find("strong")
            if is_there_strong is not None:
                rating_text = is_there_strong.text
    claim.rating = rating_text

    # Review author
    author_span = parsed_claim_review_page.find("span", attrs={'class': 'c-article__author'})
    author_a = author_span.find("a")
    claim.author_url = author_a['href']
    claim.review_author = re.sub('Por', '', author_a.text).strip()

    # Article body text
    body = [p.text.strip() for p in entry_content.find_all('p')]
    claim.body = " ".join(body) + "\n"

    # Links referred to in the article body
    claim.referred_links = [link['href'] for link in entry_content.find_all('a', href=True)]

    return [claim]
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
    claim = Claim()
    claim.set_url(url)
    claim.set_source("truthorfiction")

    # title
    title = parsed_claim_review_page.find("meta", {"property": "og:title"})['content']
    claim.set_title(title)

    article = parsed_claim_review_page.find("article")

    # date
    date_ = parsed_claim_review_page.find('meta', {"property": "article:published_time"})['content']
    if date_:
        claim.set_date(date_.split("T")[0])

    # author
    author_ = parsed_claim_review_page.find('meta', {"name": "author"})['content']
    if author_:
        claim.set_author(author_)

    # author link
    author_url = parsed_claim_review_page.find('a', {"class": "url fn n"})['href']
    if author_url:
        claim.author_url = author_url

    # body: the last content tag, skipping a trailing "content-source" block
    content = [tag for tag in article.contents if not isinstance(tag, NavigableString)]
    body = content[-1]  # type: Tag
    if body.has_attr("class") and "content-source" in body['class']:
        body = content[-2]
    claim.set_body(body.text.strip())

    # related links
    related_links = [link['href'] for link in body.findAll('a', href=True)]
    claim.set_refered_links(related_links)

    # claim text and rating
    description = article.find("div", {"class", "claim-description"})
    rating = article.find("div", {"class", "rating-description"})
    if description and rating:
        claim.set_claim(description.text)
        claim.rating = rating.text
    else:
        # fall back to the "<claim> - <rating>" pattern in the headline
        h1 = article.find("h1")
        text = h1.text.replace("–", "-")
        split_text = text.split("-")
        rating_text = split_text[-1]
        claim_text = "".join(split_text[0:-1])
        if len(claim_text) == 0 or "-" not in text:
            return []
        claim.set_rating(rating_text)
        claim.set_claim(claim_text)

    # tags
    tags = []
    for link in parsed_claim_review_page.select('footer > span.tags-links > a'):
        if hasattr(link, 'href'):
            tags.append(link.text)
    claim.set_tags(", ".join(tags))

    return [claim]
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
    claim = Claim()
    claim.set_url(url)
    claim.set_source("polygraph")

    # title
    title = parsed_claim_review_page.find("h1", {"class": "title pg-title"})
    claim.set_title(title.text.replace(";", ","))

    # date
    full_date = parsed_claim_review_page.find("time")['datetime'].split("T")
    claim.set_date(full_date[0])

    # body
    body = parsed_claim_review_page.find("div", {"id": "article-content"})
    claim.set_body(body.get_text())

    # related links
    related_links = [link['href'] for link in body.findAll('a', href=True)]
    claim.set_refered_links(related_links)

    claim.set_claim(claim.title)

    # author
    author = parsed_claim_review_page.find('h4', {"class": "author"})
    claim.set_author(author.text)

    # rating
    rating = parsed_claim_review_page.find('div', {"class": "verdict"}).find_all('span')[1]
    claim.set_rating(rating.text)

    return [claim]
def get_all_claims(criteria):
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}

    # Collect article URLs from the fact-check category pages.
    urls_ = {}
    last_page = []
    for page_number in range(1, 500):
        if criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims:
            break
        url = "https://theferret.scot/category/fact-check/page/" + str(page_number) + "/"
        page = requests.get(url, headers=headers, timeout=5)
        soup = BeautifulSoup(page.text, "lxml")
        soup.prettify()
        links = soup.findAll("h1", {"class": "entry-title"})
        if (len(links) != 0) or (links != last_page):
            for anchor in links:
                anchor = anchor.find('a', {"rel": "bookmark"}, href=True)
                ind_ = str(anchor['href'])
                if ind_ not in list(urls_.keys()):
                    if criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims:
                        break
                    urls_[ind_] = page
                    print("adding " + str(ind_))
            last_page = links
        else:
            print("break!")
            break

    claims = []
    index = 0
    # Visit each article and extract its content.
    for url, conclusion in urls_.items():
        print(str(index) + "/" + str(len(list(urls_.keys()))) + " extracting " + str(url))
        index += 1
        url_complete = str(url)
        try:
            page = requests.get(url_complete, headers=headers, timeout=5)
            soup = BeautifulSoup(page.text, "lxml")
            soup.prettify("utf-8")

            claim_ = Claim()
            claim_.set_url(url_complete)
            claim_.set_source("theferret")
            if criteria.html:
                claim_.setHtml(soup.prettify("utf-8"))

            # title
            title = soup.find("h1", {"class": "cover-title"})
            claim_.set_title(title.text)

            # date
            date_ = soup.find('div', {"class": "widget__content"}).find("p")
            if date_:
                date_str = search_dates(date_.text)[0][1].strftime("%Y-%m-%d")
                claim_.set_date(date_str)

            # body
            body = soup.find("div", {"class": "article__text"})
            claim_.set_body(body.get_text())

            # related links
            div_tag = soup.find("div", {"class": "article__text"})
            related_links = [link['href'] for link in div_tag.findAll('a', href=True)]
            claim_.set_refered_links(related_links)

            claim_.set_claim(soup.find("h1", {"class": "article__title"}).text)
            claim_.setConclusion(conclusion)

            # tags
            tags = []
            for tag in soup.findAll('meta', {"property": "article:tag"}):
                tags.append(tag["content"])
            claim_.set_tags(", ".join(tags))

            claims.append(claim_.generate_dictionary())
        except:
            print("Error ->" + str(url_complete))

    # build a pandas dataframe from the extracted claims
    return pd.DataFrame(claims)
def get_all_claims(criteria):
    # Perform a search for each letter and collect article URLs.
    urls_ = {}
    letters = [
        "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
        "n", "o", "p", "q", "x", "y", "z"
    ]
    letters = ["a"]  # currently restricted to a single letter
    for letter in letters:
        for page_number in range(1, 500):
            if criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims:
                break
            try:
                print("http://www.mimikama.at/page/" + str(page_number) + "/?s=" + letter)
                page = urllib.request.urlopen(
                    "http://www.mimikama.at/page/" + str(page_number) + "/?s=" + letter).read()
            except:
                break
            soup = BeautifulSoup(page, "lxml")
            soup.prettify()
            links = soup.find('div', {"class": "td-ss-main-content"}).findAll(
                'a', {"rel": "bookmark"}, href=True)
            if len(links) != 0:
                for anchor in links:
                    if anchor['href'] not in list(urls_.keys()):
                        urls_[anchor['href']] = letter
                        print("adding " + str(anchor['href']))
                    if criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims:
                        break
            else:
                print("break!")
                break

    claims = []
    index = 0
    # Visit each article and extract its content.
    for url in list(urls_.keys()):
        try:
            print(str(index) + "/" + str(len(list(urls_.keys()))) + " extracting " + str(url))
            index += 1

            claim_ = Claim()
            claim_.set_source("mimikama")
            url_complete = url
            claim_.set_url(url_complete)

            page = urllib.request.urlopen(url_complete, timeout=5).read()
            soup = BeautifulSoup(page, "lxml")
            soup.prettify()

            # title
            title = soup.find("h1", {"class": "entry-title"})
            claim_.set_title(title.text)

            # claim (falls back to the title)
            claim_.set_claim(claim_.title)

            # date
            date = soup.find("time", {"class": "entry-date updated td-module-date"})
            claim_.set_date(search_dates(date.get_text())[0][1].strftime("%Y-%m-%d"))

            # related links
            div_tag = soup.find("div", {"class": "td-post-content"})
            related_links = [link['href'] for link in div_tag.findAll('a', href=True)]
            claim_.set_refered_links(related_links)

            # body
            body = soup.find("div", {"class": "td-post-content"})
            claim_.set_body(body.get_text())

            claims.append(claim_.generate_dictionary())
        except:
            print("Error =>" + url)

    # build a pandas dataframe from the extracted claims
    return pd.DataFrame(claims)
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
    claim = Claim()
    claim.set_url(url)
    claim.set_source("politifact")

    # claim
    title = parsed_claim_review_page.find("div", {"class": "m-statement__quote"})
    claim.set_claim(title.text.strip())

    # title
    title = parsed_claim_review_page.find("h2", {"class": "c-title"})
    claim.set_title(title.text.strip())

    # review date
    date = parsed_claim_review_page.find('span', {"class": "m-author__date"})
    if date:
        date_str = search_dates(date.text)[0][1].strftime("%Y-%m-%d")
        claim.set_date(date_str)

    # rating, taken from the alt text of the rating meter image
    # (e.g. https://static.politifact.com/politifact/rulings/meter-mostly-false.jpg)
    statement_body = parsed_claim_review_page.find("div", {"class", "m-statement__body"})
    statement_detail = statement_body.find("div", {"class", "c-image"})
    statement_detail_image = statement_detail.find("picture")
    statement_detail_image_alt = statement_detail_image.find("img", {"class", "c-image__original"})
    if statement_detail_image_alt:
        if self.translate_rating_value(statement_detail_image_alt['alt']) != "":
            claim.rating = self.translate_rating_value(statement_detail_image_alt['alt'])
        else:
            claim.rating = statement_detail_image_alt['alt']

    # body
    body = parsed_claim_review_page.find("article", {"class": "m-textblock"})
    text = ""
    for child in parsed_claim_review_page.select(
            'main > section > div.t-row__center > article.m-textblock'):
        for element in child.contents:
            valid = True
            if element.name == "div":
                # skip the JS analytics snippet embedded in an "artembed" div
                try:
                    if 'class' in element.attrs and element.attrs['class'][0] == "artembed":
                        if element.text.startswith("\r\nwindow.gciAnalyticsUAID"):
                            valid = False
                except KeyError:
                    print("KeyError: Skip")
            if valid and element:
                if hasattr(element, 'text'):
                    text += " " + str(element.text)
                else:
                    text += " " + str(element)
    claim.body = text.strip()

    # review author
    author_meta = parsed_claim_review_page.find("div", {"class": "m-author__content"})
    if author_meta:
        author = author_meta.find("a").text
        claim.set_author(author)
        author_url = author_meta.find("a")
        if author_url.attrs["href"] != "":
            claim.author_url = "https://www.politifact.com" + author_url.attrs["href"]

    # date the statement was published
    statement_meta = parsed_claim_review_page.find("div", {"class": "m-statement__meta"})
    if statement_meta:
        meta_text = statement_meta.text
        if "on" in meta_text:
            meta_text = meta_text.split(" on ")[1]
        if "in" in meta_text:
            meta_text = meta_text.split(" in ")[0]
        if meta_text:
            date = search_dates(meta_text)
            if date:
                claim.date = date[0][1].strftime("%Y-%m-%d")

    # related links
    related_links = []
    for link in body.find_all('a', href=True):
        if link['href'][0] == "/":
            related_links.append("https://www.politifact.com" + link['href'])
        else:
            related_links.append(link['href'])
    claim.set_refered_links(related_links)

    claim.set_claim(parsed_claim_review_page.find("div", {"class": "m-statement__quote"}).text.strip())

    # tags
    tags = []
    ul_tag = parsed_claim_review_page.find("ul", {"class", "m-list"})
    if ul_tag:
        for li in ul_tag.findAll("li", {"class", "m-list__item"}):
            a_tag = li.find("a", title=True)
            tags.append(a_tag['title'])
    if statement_body:
        topics = statement_body.find("ul", {"class", "m-list"}).find_all("a")
        for link in topics:
            tags.append(link['title'])
    claim.set_tags(",".join(tags))

    return [claim]
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
    claim = Claim()
    claim.set_url(url)
    claim.set_source("factcheck_aap")

    # title
    elements = parsed_claim_review_page.findAll('h1')
    if len(elements) == 1:
        title = elements[0].text
    else:
        title = elements[1].text
    claim.set_title(title.strip())

    # rating: first <strong> in the verdict block (or in the article body)
    body = parsed_claim_review_page.select(".c-article__content")
    verdict_div = body[0].select(".c-article__verdict")
    if len(verdict_div) > 0:
        verdict_strongs = verdict_div[0].find_all("strong")
    else:
        verdict_strongs = body[0].find_all("strong")
    verdict = ""
    for verdict_strong in verdict_strongs:
        if "AAP FactCheck" not in verdict_strong.text and \
                "AAP FactCheck Investigation:" not in verdict_strong.text:
            verdict = verdict_strong.text
            break
    claim.set_rating(verdict)

    if len(verdict_div) > 0:
        verdict_div[0].decompose()

    # body
    claim.set_body(body[0].text)

    # date the article was published
    date_tag = parsed_claim_review_page.find("date", attrs={'class': 'd-none'})
    find_date = dateparser.parse(date_tag.text)
    claim.set_date_published(find_date.strftime("%Y-%m-%d"))

    # referred links
    refs = [elem['href'] for elem in body[0].find_all('a')]
    claim.set_refered_links(refs)

    return [claim]
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
    claim = Claim()
    claim.set_url(url)
    claim.set_source("newtral")

    # Title, claim and claim author: titles of the form "Author: «claim»"
    title = parsed_claim_review_page.find("meta", attrs={'property': 'og:title'})['content']
    title = title.strip().split("|")[0]
    claim.set_title(title)

    entry_content = parsed_claim_review_page.find("div", attrs={'class': 'entry-content'})

    dospunto = re.search(r'(: «)', title)
    dospunt = re.search(r'(: “)', title)
    if dospunto:
        claim_a = title.split(":")
        claim.author = claim_a[0].strip()
        claim.claim = claim_a[1].strip("« »")
    elif dospunt:
        claim_b = title.split(":")
        claim.author = claim_b[0].strip()
        claim.claim = claim_b[1].strip(": “ ”")

    # Multiple titles/claims in <h2> sub-headings
    claim_mult = entry_content.findAll('h2')
    if claim_mult:
        claim_al = [i.text.strip() for i in claim_mult]
        claim_al_text = " ".join(claim_al)
        dospunt = re.search(r'(: “)', claim_al_text)
        dospunto = re.search(r'(: «)', claim_al_text)
        if dospunt:
            claim_b = title.split(":")
            claim.author = claim_b[0].strip()
            claim.claim = claim_b[1].strip(": “ ”")
        elif dospunto:
            claim_a = title.split(":")
            claim.author = claim_a[0].strip()
            claim.claim = claim_a[1].strip("« »")
        else:
            claim.set_title(claim_al_text)

    # tags
    tags = parsed_claim_review_page.find_all("meta", attrs={'property': 'article:tag'})
    claim.set_tags(",".join(tag['content'] for tag in tags))

    # publication date
    published = parsed_claim_review_page.find(
        "meta", attrs={'property': 'article:published_time'})['content']
    claim.date_published = published.strip()

    # review author
    author_span = parsed_claim_review_page.find("span", attrs={'class': 'c-article__author'})
    author_a = author_span.find("a")
    claim.author_url = author_a['href']
    claim.review_author = re.sub('Por', '', author_a.text).strip()

    # article body text
    body = [p.text.strip() for p in entry_content.find_all('p')]
    claim.body = " ".join(body) + "\n"

    # links referred to in the article body
    claim.referred_links = [link['href'] for link in entry_content.find_all('a', href=True)]

    # rating: look for one of the known verdict words in the intro, or else in the body
    intro = parsed_claim_review_page.find("div", attrs={'class': 'c-article__intro'})
    veracities = ["ENGAÑOSA", "ENGAÑOSO", "FALSO", "FALSA", "FALSOS",
                  "VERDADERO", "VERDAD A MEDIAS"]

    def common(a, b):
        return [value for value in a if value in b]

    if intro:
        rating_source = " ".join(str(v) for v in intro).upper()
    else:
        rating_source = " ".join(str(v) for v in body).upper()
    claim.alternate_name = [i.strip() for i in common(veracities, rating_source)]

    return [claim]
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
    claim = Claim()
    claim.set_url(url)
    claim.set_source("factscan")

    json_ = None
    if parsed_claim_review_page.find("script", {"type": "application/ld+json"}):
        json_ = parsed_claim_review_page.find(
            "script", {"type": "application/ld+json"}).get_text()

    def parse_wrong_json(json_, left, right):
        # The embedded JSON-LD is malformed, so values are pulled out by
        # splitting on the surrounding tokens instead of parsing it.
        if json_ and len(json_.split(left)) > 1:
            return json_.split(left)[1].split(right)[0]
        return None

    # summary box
    summary_box = parsed_claim_review_page.find("div", {"class": "summary-box"})

    # title
    title = parsed_claim_review_page.find("meta", {"property": "og:title"})['content']
    claim.set_title(title)

    # claim review date
    date = parsed_claim_review_page.find('meta', {"property": "article:published_time"})
    if date:
        date_str = search_dates(date['content'].split("T")[0])[0][1].strftime("%Y-%m-%d")
        claim.set_date(date_str)

    # creative work date
    summary_text = summary_box.find("p").text
    date_published = ""
    if " on " in summary_text:
        date_published = summary_text.split(" on ")[-1].strip()
    elif " published " in summary_text:
        date_published = summary_text.split(" published ")[-1].strip()
    elif " dated " in summary_text:
        date_published = summary_text.split(" dated ")[-1].strip()
    elif " from " in summary_text:
        date_published = summary_text.split(" from ")[-1].strip()
    elif " sent " in summary_text:
        date_published = summary_text.split(" in ")[-1].strip()
    elif " in " in summary_text:
        date_published = summary_text.split(" in ")[-1].strip()
    if len(date_published) > 0:
        date_published = search_dates(date_published)[0][1].strftime("%Y-%m-%d")
        claim.setDatePublished(date_published)

    # rating
    if json_:
        claim.set_rating_value(parse_wrong_json(json_, '"ratingValue":', ","))
        claim.setWorstRating(parse_wrong_json(json_, '"worstRating":', ","))
        claim.set_best_rating(parse_wrong_json(json_, '"bestRating":', ","))
        claim.set_alternate_name(parse_wrong_json(json_, '"alternateName":', ","))
    else:
        # no JSON-LD: fall back to the alt text of the fact-check icon
        icon = parsed_claim_review_page.find("div", {"class": "fact-check-icon"})
        if icon and icon.find('img'):
            claim_str = icon.find('img')['alt'].split(":")[1]
            claim.alternate_name = claim_str.strip()

    # body
    body = parsed_claim_review_page.find("div", {"class": "entry-content"})
    claim.set_body(body.get_text())

    # author
    author = parsed_claim_review_page.find("div", {"class": "sharethefacts-speaker-name"})
    if not author:
        author = summary_box.find("p").find("strong")
    if author:
        claim.set_author(author.text)

    # same as
    claim.setSameAs(parse_wrong_json(json_, '"sameAs": [', "]"))

    # related links
    div_tag = parsed_claim_review_page.find("div", {"class": "entry-content"})
    related_links = [link['href'] for link in div_tag.findAll('a', href=True)]
    claim.set_refered_links(related_links)

    # claim text
    if parsed_claim_review_page.find("div", {"class": "sharethefacts-statement"}):
        claim.set_claim(parsed_claim_review_page.find(
            "div", {"class": "sharethefacts-statement"}).get_text())
    else:
        claim.set_claim(claim.title)

    # tags
    tags = []
    for tag in parsed_claim_review_page.findAll('meta', {"property": "article:tag"}):
        tags.append(tag["content"])
    if len(tags) == 0:
        for tag in parsed_claim_review_page.findAll("a", {"rel": "category tag"}):
            tags.append(tag.text)
    claim.set_tags(", ".join(tags))

    return [claim]
def get_all_claims(criteria):
    print(criteria.maxClaims)
    # Collect article URLs from the feed pages.
    urls_ = {}
    for page_number in range(1, 500):
        if criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims:
            break
        try:
            page = urllib.request.urlopen(
                "https://g1.globo.com/e-ou-nao-e/index/feed/pagina-"
                + str(page_number) + ".ghtml").read()
        except:
            break
        soup = BeautifulSoup(page, "lxml")
        soup.prettify()
        links = soup.findAll('a', {"class": "feed-post-link"}, href=True)
        if len(links) != 0:
            for anchor in links:
                if anchor['href'] not in list(urls_.keys()):
                    if criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims:
                        break
                    urls_[anchor['href']] = page_number
                    print("adding " + str(anchor['href']))
        else:
            print("break!")
            break

    claims = []
    index = 0
    # Visit each article and extract its content.
    for url, conclusion in urls_.items():
        print(str(index) + "/" + str(len(list(urls_.keys()))) + " extracting " + str(url))
        index += 1
        url_complete = str(url)
        page = urllib.request.urlopen(url_complete).read().decode('utf-8', 'ignore')
        soup = BeautifulSoup(page, "lxml")
        soup.prettify("utf-8")

        claim_ = Claim()
        claim_.set_url(url_complete)
        claim_.set_source("g1")
        if criteria.html:
            claim_.setHtml(soup.prettify("utf-8"))
        try:
            # title
            title = soup.find("h1", {"class": "content-head__title"})
            claim_.set_title(title.text)

            # date
            date_ = soup.find('time', {"itemprop": "datePublished"})
            if date_:
                date_str = date_.get_text().split(" ")[1]
                claim_.set_date(dateparser.parse(
                    date_str, settings={'DATE_ORDER': 'DMY'}).strftime("%Y-%m-%d"))

            # body
            body = soup.find("article")
            claim_.set_body(body.get_text().replace("\n", "").replace(
                "TwitterFacebookE-mailWhatsApp", ""))

            # related links
            div_tag = soup.find("article", {"itemprop": "articleBody"})
            related_links = [link['href'] for link in div_tag.findAll('a', href=True)]
            claim_.set_refered_links(related_links)

            # claim and conclusion: the headline is a "claim? conclusion" question
            claim_conclusion = soup.find("h1", {"class": "content-head__title"}).get_text()
            claim_.set_claim(claim_conclusion.split("?")[0])
            claim_.setConclusion(claim_conclusion.split("?")[1])

            claims.append(claim_.generate_dictionary())
        except:
            print("Error ->" + str(url_complete))

    # build a pandas dataframe from the extracted claims
    return pd.DataFrame(claims)
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
    claim = Claim()
    claim.set_url(url)
    claim.set_source("politifact")

    # claim
    title = parsed_claim_review_page.find("div", {"class": "m-statement__quote"})
    claim.set_claim(title.text)

    # title
    title = parsed_claim_review_page.find("h2", {"class": "c-title"})
    claim.set_title(title.text)

    # review date
    date = parsed_claim_review_page.find('span', {"class": "m-author__date"})
    if date:
        date_str = search_dates(date.text)[0][1].strftime("%Y-%m-%d")
        claim.set_date(date_str)

    # rating, from the alt text of the rating meter image
    statement_body = parsed_claim_review_page.find("div", {"class", "m-statement__body"})
    statement_detail = statement_body.find("div", {"class", "c-image"})
    statement_detail_image = statement_detail.find("picture")
    statement_detail_image_alt = statement_detail_image.find("img", {"class", "c-image__original"})
    if statement_detail_image_alt:
        claim.alternate_name = statement_detail_image_alt['alt']

    # body
    body = parsed_claim_review_page.find("article", {"class": "m-textblock"})
    claim.set_body(body.get_text())

    # claim author
    statement_meta = parsed_claim_review_page.find("div", {"class": "m-statement__meta"})
    if statement_meta:
        author = statement_meta.find("a").text
        claim.set_author(author)

    # date the statement was published
    if statement_meta:
        meta_text = statement_meta.text
        if "on" in meta_text:
            meta_text = meta_text.split(" on ")[1]
        if "in" in meta_text:
            meta_text = meta_text.split(" in ")[0]
        if meta_text:
            date = search_dates(meta_text)
            if date:
                claim.setDatePublished(date[0][1].strftime("%Y-%m-%d"))

    # related links
    related_links = [link['href'] for link in body.find_all('a', href=True)]
    claim.set_refered_links(related_links)

    claim.set_claim(parsed_claim_review_page.find("div", {"class": "m-statement__quote"}).text.strip())

    # tags
    tags = []
    ul_tag = parsed_claim_review_page.find("ul", {"class", "m-list"})
    if ul_tag:
        for li in ul_tag.findAll("li", {"class", "m-list__item"}):
            a_tag = li.find("a", title=True)
            tags.append(a_tag['title'])
    if statement_body:
        topics = statement_body.find("ul", {"class", "m-list"}).find_all("a")
        for link in topics:
            tags.append(link['title'])
    claim.set_tags(",".join(tags))

    return [claim]
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
    self.claim = self.extract_claim(parsed_claim_review_page)
    self.review = self.extract_review(parsed_claim_review_page)

    claim = Claim()
    claim.set_rating_value(self.extract_rating_value(parsed_claim_review_page))
    claim.set_rating(FatabyyanoFactCheckingSiteExtractor.translate_rating_value(
        self.extract_rating_value(parsed_claim_review_page)))
    claim.set_source("fatabyyano")
    claim.set_author("fatabyyano")
    claim.set_date_published(self.extract_date(parsed_claim_review_page))
    claim.set_claim(self.claim)
    claim.set_body(self.review)
    claim.set_refered_links(self.extract_links(parsed_claim_review_page))
    claim.set_title(self.extract_claim(parsed_claim_review_page))
    claim.set_date(self.extract_date(parsed_claim_review_page))
    claim.set_url(url)
    claim.set_tags(self.extract_tags(parsed_claim_review_page))

    return [claim]
def get_all_claims(criteria):
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }

    # Collect article URLs from the fact-checker listing pages.
    urls_ = {}
    for page_number in range(1, 500):
        if 0 < criteria.maxClaims <= len(urls_):
            break
        url = "https://www.washingtonpost.com/news/fact-checker/page/" + str(page_number) + "/"
        if page_number == 1:
            url = "https://www.washingtonpost.com/news/fact-checker/?utm_term=.c0f1538d1850"
        print(url)
        page = requests.get(url, headers=headers, timeout=5)
        soup = BeautifulSoup(page.text, "lxml")
        soup.prettify()
        print(page.text)
        links = soup.findAll("div", {"class": "story-headline"})
        print(links)
        if len(links) == 0:
            break
        for anchor in links:
            anchor = anchor.find("a")
            ind_ = str(anchor['href'])
            if ind_ not in list(urls_.keys()):
                if 0 < criteria.maxClaims <= len(urls_):
                    break
                urls_[ind_] = ind_

    claims = []
    index = 0
    # Visit each article and extract its content.
    for url, conclusion in urls_.items():
        print(str(index) + "/" + str(len(list(urls_.keys()))) + " extracting " + str(url))
        index += 1
        url_complete = str(url)
        try:
            page = requests.get(url_complete, headers=headers, timeout=5)
            soup = BeautifulSoup(page.text, "lxml")
            soup.prettify("utf-8")

            claim_ = Claim()
            claim_.set_url(url_complete)
            claim_.set_source("washingtonpost")
            if criteria.html:
                claim_.setHtml(soup.prettify("utf-8"))

            # title
            title = soup.find("h1", {"class": "article__title"})
            claim_.set_title(title.text)

            # date
            date_ = soup.find('div', {"class": "widget__content"}).find("p")
            if date_:
                date_str = search_dates(date_.text)[0][1].strftime("%Y-%m-%d")
                claim_.set_date(date_str)

            # body
            body = soup.find("div", {"class": "article__text"})
            claim_.set_body(body.get_text())

            # related links
            div_tag = soup.find("div", {"class": "article__text"})
            related_links = [link['href'] for link in div_tag.findAll('a', href=True)]
            claim_.set_refered_links(related_links)

            claim_.set_claim(soup.find("h1", {"class": "article__title"}).text)

            # tags
            tags = []
            for tag in soup.findAll('meta', {"property": "article:tag"}):
                tags.append(tag["content"])
            claim_.set_tags(", ".join(tags))

            claims.append(claim_.generate_dictionary())
        except:
            print("Error ->" + str(url_complete))

    # build a pandas dataframe from the extracted claims
    return pd.DataFrame(claims)
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
    local_claims = []
    claim = Claim()
    claim.set_url(url)
    claim.set_source("africacheck")

    # title
    title = parsed_claim_review_page.find("meta", {"property": "og:title"})
    global_title_text = title['content']
    claim.set_title(global_title_text)

    # date
    date = parsed_claim_review_page.find('span', {"class": "published"})
    global_date_str = ""
    if date:
        date_match = re.search("[0-9]+ [a-zA-Z]+ [0-9]+", date.text)
        global_date_str = date_match.group(0)
        claim.set_date(global_date_str)

    # rating: try the verdict indicator class name first, then the various verdict divs
    global_truth_rating = ""
    if parsed_claim_review_page.find("div", {"class": "article-details__verdict"}):
        global_truth = parsed_claim_review_page.find("div", {"class": "article-details__verdict"})
        div_rating_class = global_truth.find('div')["class"][1]
        div_rating_class_verdict = re.search("[-][a-zA-Z]+", div_rating_class).group(0)
        global_truth_rating = str(re.search("[a-zA-Z]+", div_rating_class_verdict).group(0))
    if parsed_claim_review_page.find("div", {"class": "verdict-stamp"}):
        global_truth_rating = parsed_claim_review_page.find(
            "div", {"class": "verdict-stamp"}).get_text()
    if parsed_claim_review_page.find("div", {"class": "verdict"}):
        global_truth_rating = parsed_claim_review_page.find(
            "div", {"class": "verdict"}).get_text()
    if parsed_claim_review_page.find("div", {"class": "report-verdict indicator"}):
        report_verdict = parsed_claim_review_page.find(
            "div", {"class": "report-verdict indicator"})
        global_truth_rating = report_verdict.get_text()
        if report_verdict.find('span'):
            global_truth_rating = report_verdict.find('span').get_text()
    claim.set_rating(global_truth_rating)

    # author
    author = None
    if parsed_claim_review_page.findAll("div", {"class": "author-details"}):
        for author in parsed_claim_review_page.findAll("div", {"class": "author-details"}):
            claim.set_author(author.find('h4').get_text())
    elif parsed_claim_review_page.find("div", {"class": "author-details"}):
        author = parsed_claim_review_page.find("div", {"class": "author-details"})
        if author:
            claim.set_author(author.find('h4').get_text())

    # tags
    tags = []
    for tag in parsed_claim_review_page.findAll('meta', {"property": "article:tag"}):
        tags.append(tag["content"])
    claim.set_tags(", ".join(tags))

    # claim text: second "grid-x grid-padding-x" block, falling back to the title
    report_claim_div0 = parsed_claim_review_page.findAll("div", {"class": "grid-x grid-padding-x"})
    report_claim_div = report_claim_div0[1]
    if report_claim_div and report_claim_div.find("p") is not None:
        claim.set_claim(report_claim_div.find("p").get_text())
    else:
        claim.set_claim(claim.title)

    inline_ratings = parsed_claim_review_page.findAll("div", {"class", "inline-rating"})
    entry_section = parsed_claim_review_page.find("section", {"class", "cell"})  # type: Tag
    entry_section_full_text = entry_section.text

    if inline_ratings and len(inline_ratings) > 0:
        # Several claims are checked within the page; date, author and tags are shared.
        entry_contents = entry_section.contents  # type: List[Tag]
        current_index = 0
        # First extract the text common to all claims, up to the first sub-section.
        body_text, links, current_index = get_text_and_links_until_next_header(
            entry_contents, current_index)
        claim.set_body(body_text)
        claim.set_refered_links(links)
        while current_index < len(entry_contents):
            current_index = forward_until_inline_rating(entry_contents, current_index)
            inline_rating_div = entry_contents[current_index]
            if isinstance(inline_rating_div, NavigableString):
                break
            claim_text = inline_rating_div.find("p", {"class": "claim-content"}).text
            inline_rating = inline_rating_div.find("div", {"class", "indicator"}).find("span").text
            previous_current_index = current_index
            inline_body_text, inline_links, current_index = get_text_and_links_until_next_header(
                entry_contents, current_index)
            if previous_current_index == current_index:
                current_index += 1
            inline_claim = Claim()
            inline_claim.set_source("africacheck")
            inline_claim.set_claim(claim_text)
            inline_claim.set_rating(inline_rating)
            inline_claim.set_refered_links(",".join(inline_links))
            inline_claim.set_body(inline_body_text)
            inline_claim.set_tags(", ".join(tags))
            inline_claim.set_date(global_date_str)
            inline_claim.set_url(url)
            if author:
                inline_claim.set_author(author.get_text())
            inline_claim.set_title(global_title_text)
            local_claims.append(inline_claim)
    elif "PROMISE:" in entry_section_full_text and "VERDICT:" in entry_section_full_text:
        # Promise-tracking pages: each PROMISE/VERDICT pair becomes its own claim.
        entry_contents = entry_section.contents  # type: List[Tag]
        current_index = 0
        body_text, links, current_index = get_text_and_links_until_next_header(
            entry_contents, current_index)
        claim.set_body(body_text)
        claim.set_refered_links(links)
        while current_index < len(entry_contents):
            inline_rating_div = entry_contents[current_index]
            if isinstance(inline_rating_div, NavigableString):
                break
            claim_text = entry_contents[current_index + 2].span.text
            inline_rating = entry_contents[current_index + 4].span.text
            current_index += 5
            previous_current_index = current_index
            inline_body_text, inline_links, current_index = get_text_and_links_until_next_header(
                entry_contents, current_index)
            if previous_current_index == current_index:
                current_index += 1
            inline_claim = Claim()
            inline_claim.set_source("africacheck")
            inline_claim.set_claim(claim_text)
            inline_claim.set_rating(inline_rating)
            inline_claim.set_refered_links(",".join(inline_links))
            inline_claim.set_body(inline_body_text)
            inline_claim.set_tags(", ".join(tags))
            inline_claim.set_date(global_date_str)
            inline_claim.set_url(url)
            if author:
                inline_claim.set_author(author.get_text())
            inline_claim.set_title(global_title_text)
            local_claims.append(inline_claim)
    else:
        # Single claim: body and referred links come from the main page content block.
        body = parsed_claim_review_page.find("div", {"id": "block-mainpagecontent"})
        claim.set_body(body.get_text())
        div_tag = parsed_claim_review_page.find("div", {"id": "block-mainpagecontent"})
        related_links = [link['href'] for link in div_tag.findAll('a', href=True)]
        claim.set_refered_links(",".join(related_links))
        local_claims.append(claim)

    return local_claims
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
    claim = Claim()
    self.claim = self.extract_claim(parsed_claim_review_page)
    self.review = self.extract_review(parsed_claim_review_page)

    rating_value = self.extract_rating_value(parsed_claim_review_page)
    claim.set_rating_value(rating_value)
    claim.set_alternate_name(self.translate_rating_value(rating_value))
    claim.set_source(self.extract_author(parsed_claim_review_page))  # review author
    claim.set_author(self.extract_claimed_by(parsed_claim_review_page))  # claim author
    claim.set_claim(self.claim)
    claim.set_body(self.review)
    claim.set_refered_links(self.extract_links(parsed_claim_review_page))
    claim.set_title(self.extract_title(parsed_claim_review_page))
    # publication date of the review
    claim.set_date(self.extract_date(parsed_claim_review_page))
    claim.set_url(url)
    claim.set_tags(self.extract_tags(parsed_claim_review_page))

    # extract_entities returns two values: entities for the claim and for the body
    json_claim, json_body = self.extract_entities(self.claim, self.review)
    claim.set_claim_entities(json_claim)
    claim.set_body_entities(json_body)

    return [claim]
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
    claim = Claim()
    claim.set_url(url)
    claim.set_source("checkyourfact")

    # title
    title = parsed_claim_review_page.find('article').find("h1")
    claim.set_title(title.text.replace("FACT CHECK: ", ""))

    # date, taken from the URL path (https://checkyourfact.com/YYYY/MM/DD/...)
    url_date = url.replace("https://checkyourfact.com/", "").replace("/", " ").split(" ")
    claim.set_date(url_date[0] + "-" + url_date[1] + "-" + url_date[2])

    # body
    body = parsed_claim_review_page.find("article")
    claim.set_body(body.get_text())

    # related links
    div_tag = parsed_claim_review_page.find("article")
    related_links = [link['href'] for link in div_tag.findAll('a', href=True)]
    claim.set_refered_links(related_links)

    claim.set_claim(claim.title)

    # rating
    rating = find_by_text(parsed_claim_review_page, "Verdict", "span")
    if rating:
        rating_text = rating[0].text.split(":")[-1].strip()
        claim.set_rating(rating_text)

    # tags
    tags = []
    for tag in parsed_claim_review_page.findAll('meta', {"property": "article:tag"}):
        tags.append(tag["content"])
    claim.set_tags(", ".join(tags))

    if len(claim.rating) == 0:
        return []
    return [claim]
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
    if url in url_blacklist:
        return []

    claim = Claim()
    claim.url = str(url)
    claim.source = "snopes"

    # title
    title = None
    for tmp in parsed_claim_review_page.select('article > header > h1'):
        title = tmp.text.strip()
    claim.title = str(title.strip())

    # author
    author_list = []
    author_links = []
    author_selector = ('article > header > ul.list-unstyled.authors.list-unstyled.'
                       'd-flex.flex-wrap.comma-separated > li > a')
    if parsed_claim_review_page.select(author_selector):
        for author_a in parsed_claim_review_page.select(author_selector):
            if hasattr(author_a, 'href'):
                author_list.append(author_a.text.strip())
                author_links.append(author_a.attrs['href'])
    else:
        print("no author?")
    claim.author = ", ".join(author_list)
    claim.author_url = ", ".join(author_links)

    # dates (published and updated)
    date_ = parsed_claim_review_page.find('ul', {"class": "dates"})
    if date_:
        dates = date_.find('li', {"class": "font-weight-bold text-muted"})
        for date_item in dates.span:
            if date_item == 'Published':
                date_pub = date_item.next.strip()
                date_str = dateparser.parse(date_pub).strftime("%Y-%m-%d")
                claim.date_published = date_str
                claim.date = date_str
            if date_item == 'Updated':
                date_upd = date_item.next.strip()
                claim.date = dateparser.parse(date_upd).strftime("%Y-%m-%d")

    # claim
    claim_text = None
    for p in parsed_claim_review_page.select('article > div > div.claim-text.card-body'):
        if hasattr(p, 'text'):
            claim_text = p.text.strip()
    claim.claim = str(claim_text).strip()

    # rating -> https://www.snopes.com/fact-check-ratings/
    rating = None
    for rating_span in parsed_claim_review_page.select(
            'article > div > div > div > div.media-body > span'):
        rating = rating_span.text.strip()
    claim.rating = str(rating).replace('"', "").strip()

    # "What's true" / "What's false" / "What's undetermined" qualifiers (text, not numeric)
    whats_true = None
    for rating_span_true in parsed_claim_review_page.select(
            'article > div > div > div.whats-true > div > p'):
        whats_true = rating_span_true.text.strip()
    if whats_true:
        whats_true = str(whats_true).replace('"', "")

    whats_false = None
    for rating_span_false in parsed_claim_review_page.select(
            'article > div > div > div.whats-false > div > p'):
        whats_false = rating_span_false.text.strip()
    if whats_false:
        whats_false = str(whats_false).replace('"', "")

    whats_undetermined = None
    for rating_span_undetermined in parsed_claim_review_page.select(
            'article > div > div > div.whats-undetermined > div > p'):
        whats_undetermined = rating_span_undetermined.text.strip()
    if whats_undetermined:
        whats_undetermined = str(whats_undetermined).replace('"', "")

    # body description
    text = ""
    for child in parsed_claim_review_page.select(
            'article > div.single-body.card.card-body.rich-text > p'):
        text += " " + child.text
    claim.body = text.strip()

    # related links
    related_links = []
    for link in parsed_claim_review_page.select('article > div.single-body.card.card-body > p > a'):
        if hasattr(link, 'href'):
            related_links.append(link['href'])
    claim.referred_links = related_links

    # tags
    tags = []
    for tag in parsed_claim_review_page.select('article > footer > div > a > div > div'):
        if hasattr(tag, 'text'):
            tags.append(tag.text.strip())
    claim.tags = ", ".join(tags)

    # skip pages where no rating or no claim text could be found
    if not claim_text or not rating:
        print(url)
        if not rating:
            print("-> Rating cannot be found!")
        if not claim_text:
            print("-> Claim cannot be found!")
        return []

    return [claim]
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
    claim = Claim()
    claim.set_url(url)
    claim.set_source("eufactcheck")

    # title: the page title always starts with the verdict followed by ":" and the article title
    full_title = parsed_claim_review_page.find(
        "div", {"class": "page-title-head hgroup"}).find("h1").get_text().split(":")
    claim.set_title(full_title[1])

    # date
    full_date = parsed_claim_review_page.find(
        "time", {"class": "entry-date updated"})['datetime'].split("T")
    claim.set_date(full_date[0])

    # body
    body = parsed_claim_review_page.find('div', {"class": "entry-content"})
    claim.set_body(body.get_text())

    # related links
    related_links = [link['href'] for link in body.findAll('a', href=True)]
    claim.set_refered_links(related_links)

    claim.set_claim(claim.title)

    # rating: the part of the title before ":"
    rating = full_title[0].strip()
    claim.set_alternate_name(rating)

    return [claim]
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
    claim = Claim()
    claim_txt = self.extract_claim(parsed_claim_review_page)
    review = self.extract_review(parsed_claim_review_page)

    rating_value = self.extract_rating_value(parsed_claim_review_page, url)
    claim.set_rating(rating_value)
    claim.set_source("Vishvanews")
    # review author
    claim.review_author = self.extract_author(parsed_claim_review_page)
    # claim author
    claim.set_author(self.extract_claimed_by(parsed_claim_review_page))
    claim.set_claim(claim_txt)
    claim.set_body(review)
    claim.set_refered_links(self.extract_links(parsed_claim_review_page))
    claim.set_title(self.extract_title(parsed_claim_review_page))
    # publication date of the review
    claim.set_date(self.extract_date(parsed_claim_review_page))
    claim.set_url(url)
    claim.set_tags(self.extract_tags(parsed_claim_review_page))

    # extract_entities returns two values: entities for the claim and for the body
    json_claim, json_body = self.extract_entities(claim_txt, review)
    claim.claim_entities = json_claim
    claim.body_entities = json_body

    if claim.rating != "":
        return [claim]
    return []
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
    if url in url_blacklist:
        return []

    claim = Claim()
    claim.set_url(url)
    claim.set_source("snopes")

    # title
    article = parsed_claim_review_page.find("article", {'class', 'main-post'})
    header = article.find("header")
    title = header.find("h1")
    claim.set_title(title.text)

    card = article.find("div", {"class": "content-wrapper card"})
    card_body = card.find("div", {'class': 'content'})

    # date
    date_str = ""
    rating = None
    claim_text = None
    date_ = parsed_claim_review_page.find('span', {"class": "date date-published"})
    if not date_:
        date_ = parsed_claim_review_page.find('span', {"class": "date date-last-update"})
    if date_:
        date_str = dateparser.parse(date_.text).strftime("%Y-%m-%d")

    # body: strip ad blocks before extracting the text
    ads = card_body.findAll("div")
    for ad in ads:
        ad.decompose()
    ads = card_body.findAll("div", {"class": "snopes-bt"})
    for ad in ads:
        ad.decompose()
    text = ""
    contents = card_body.findChildren()
    for child in contents:
        text += child.text
    body_description = text

    # review author
    author = parsed_claim_review_page.find("a", {"class": "author"})

    # rating: the markup changed several times, so try the known variants in order
    rating_div = None
    if not rating:
        rating = parsed_claim_review_page.find("span", {"class": "rating-name"})
    if not rating:
        rating_div = parsed_claim_review_page.find("div", {"class": "media rating"})
    if not rating and not rating_div:
        rating_div = parsed_claim_review_page.find("div", {"class": "claim-old"})
    if not rating and not rating_div:
        rating_div = parsed_claim_review_page.find("div", {"class": "rating-wrapper card"})
    if rating_div:
        rating = rating_div.find("h5")
        if not rating:
            rating = rating_div.find("span")
    if not rating:
        # oldest page format
        rating = parsed_claim_review_page.find("font", {"class", "status_color"})
        if rating:
            rating = rating.find("b")

    # related links
    related_links = [link['href'] for link in card_body.findAll('a', href=True)]

    # claim text, from the newest to the oldest markup variant
    if not claim_text:
        claim_p = parsed_claim_review_page.find('p', {"class": "claim"})
        if not claim_p:
            claim_div = parsed_claim_review_page.find('div', {"class": "claim"})
            if not claim_div:
                claim_div = parsed_claim_review_page.find('div', {"class": "claim-old"})
            if not claim_div:
                claim_text = ""
            else:
                claim_text = claim_div.find("p").text
        else:
            claim_text = claim_p.text
    else:
        claim_text = claim_text.strip()

    # tags
    tags = []
    for tag in parsed_claim_review_page.findAll('meta', {"property": "article:tag"}):
        tags.append(tag["content"])

    # fall back to the legacy page handler when anything is still missing
    if not date_str or not claim_text or not body_description or not rating:
        claim_text, body_description, date_str, rating = handle_legacy_page_structures(
            card_body, claim_text, body_description, date_str, rating)

    claim.set_date(date_str)
    claim.set_body(body_description)
    claim.set_tags(", ".join(tags))
    claim.set_refered_links(related_links)
    if author:
        claim.review_author = author.text.strip()

    if len(claim_text) > 3 and len(claim_text.split("\n")) < 5:
        claim.set_claim(claim_text)
    elif header:
        h1 = header.find("h1")
        claim_text = h1.text
        if claim_text:
            claim.set_claim(claim_text)
        else:
            print("Claim text cannot be found!")
            return []
    else:
        return []

    if rating:
        claim.set_alternate_name(rating.text)
    else:
        return []

    return [claim]
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
    self.claim = self.extract_claim(parsed_claim_review_page)
    self.review = self.extract_review(parsed_claim_review_page)

    claim = Claim()
    claim.set_rating_value(self.extract_rating_value(parsed_claim_review_page))
    claim.set_alternate_name(FatabyyanoFactCheckingSiteExtractor.translate_rating_value(
        self.extract_rating_value(parsed_claim_review_page)))
    claim.set_source("fatabyyano")
    claim.set_author("fatabyyano")
    claim.setDatePublished(self.extract_date(parsed_claim_review_page))
    claim.set_claim(self.claim)
    claim.set_body(self.review)
    claim.set_refered_links(self.extract_links(parsed_claim_review_page))
    claim.set_title(self.extract_claim(parsed_claim_review_page))
    claim.set_date(self.extract_date(parsed_claim_review_page))
    claim.set_url(url)
    claim.set_tags(self.extract_tags(parsed_claim_review_page))

    # extract_entities returns two values: entities for the claim and for the body
    json_claim, json_body = self.extract_entities(self.claim, self.review)
    claim.set_claim_entities(json_claim)
    claim.set_body_entities(json_body)

    return [claim]
def get_all_claims(criteria):
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }

    # Collect article URLs from the factcheck listing pages.
    urls_ = {}
    last_page = []
    for page_number in range(1, 500):
        if criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims:
            break
        url = "https://www.channel4.com/news/factcheck/page/" + str(page_number)
        try:
            page = requests.get(url, headers=headers, timeout=5)
            soup = BeautifulSoup(page.text, "lxml")
            soup.prettify()
            links = soup.findAll("li", {"class": "feature factcheck"})
            if (len(links) != 0) or (links != last_page):
                for anchor in links:
                    anchor = anchor.find('a', {"class": "permalink"}, href=True)
                    ind_ = str(anchor['href'])
                    if ind_ not in list(urls_.keys()):
                        if criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims:
                            break
                        if ind_ not in criteria.avoid_url:
                            urls_[ind_] = ind_
                            print("adding " + str(ind_))
                last_page = links
            else:
                print("break!")
                break
        except:
            print("error=>" + str(url))

    claims = []
    index = 0
    # Visit each article and extract its content.
    for url, conclusion in urls_.items():
        print(str(index) + "/" + str(len(list(urls_.keys()))) + " extracting " + str(url))
        index += 1
        url_complete = str(url)
        try:
            page = requests.get(url_complete, headers=headers, timeout=5)
            soup = BeautifulSoup(page.text, "lxml")
            soup.prettify("utf-8")

            claim_ = Claim()
            claim_.set_url(url_complete)
            claim_.set_source("channel4")
            if criteria.html:
                claim_.setHtml(soup.prettify("utf-8"))

            # title
            title = soup.find("div", {"class": "factcheck-article-header"}).find("h1").get_text()
            claim_.set_title(title)

            # date
            date_ = soup.find('li', {"class": "pubDateTime"})
            if date_:
                date_str = search_dates(date_['data-time'])[0][1].strftime("%Y-%m-%d")
                claim_.set_date(date_str)

            # body
            body = soup.find("div", {"class": "article-body article-main"})
            claim_.set_body(body.get_text())

            # related links
            div_tag = soup.find("div", {"class": "article-body article-main"})
            related_links = [link['href'] for link in div_tag.findAll('a', href=True)]
            claim_.set_refered_links(related_links)

            claim_.set_claim(title)

            # tags
            tags = []
            for tag in soup.findAll('meta', {"property": "article:tag"}):
                tags.append(tag["content"])
            claim_.set_tags(", ".join(tags))

            claims.append(claim_.generate_dictionary())
        except:
            print("Error ->" + str(url_complete))

    # build a pandas dataframe from the extracted claims
    return pd.DataFrame(claims)
def new_claim(f_link, date, title, tags):
    claim_ = Claim()
    claim_.set_url(f_link)
    claim_.set_title(title)
    claim_.set_tags(tags)
    # The listing date is expected as five tokens (e.g. "12 de maio de 2020");
    # reorder them to "year-month-day" so that dateparser can normalise the string.
    date_ = date.strip().split()
    date_ = "-".join([date_[4], date_[2], date_[0]])
    claim_.set_date(dateparser.parse(date_).strftime("%Y-%m-%d"))
    claim_.set_source("publica")
    claim_.set_body("")
    return claim_

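# Usage sketch (illustrative only; the URL, title and tags are made-up values, not real
# Agência Pública data): new_claim takes the raw listing-page date string, here assumed to be
# a five-token Portuguese date, which is reordered to "2020-maio-12" and parsed to "2020-05-12".
def _example_new_claim() -> Claim:
    return new_claim(
        "https://apublica.org/2020/05/exemplo/",  # hypothetical article link
        "12 de maio de 2020",
        "Título de exemplo",
        "checagem,exemplo")
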
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
    claims = []
    claim = Claim()

    # url
    claim.url = str(url)

    # source
    claim.source = "fullfact"

    # title
    title = None
    if parsed_claim_review_page.select('body > main > div > div > section > article > h1'):
        for tmp in parsed_claim_review_page.select('body > main > div > div > section > article > h1'):
            title = tmp.text.strip()
        claim.title = str(title.strip())

    # author (possibly several)
    author_list = []
    if parsed_claim_review_page.select('article > section.social-media > div > div > ul > li > span > cite'):
        for author_a in parsed_claim_review_page.select('article > section.social-media > div > div > ul > li > span > cite'):
            if hasattr(author_a, 'text'):
                author_list.append(author_a.text.strip())
    else:
        print("no author? https://fullfact.org/about/our-team/")
    claim.author = ", ".join(author_list)

    # date
    datePub = None
    date_str = ""
    if parsed_claim_review_page.select('article > div.published-at'):
        for date_ in parsed_claim_review_page.select('article > div.published-at'):
            if hasattr(date_, 'text'):
                datePub = date_.text.strip()
                if "|" in datePub:
                    split_datePub = datePub.split("|")
                    if len(split_datePub) > 0:
                        datePub = split_datePub[0].strip()
                date_str = dateparser.parse(datePub).strftime("%Y-%m-%d")
                claim.date_published = date_str
                claim.date = date_str
    else:
        print("no date?")

    # body description
    text = ""
    if parsed_claim_review_page.select('article > p'):
        for child in parsed_claim_review_page.select('article > p'):
            text += " " + child.text
    body_description = text.strip()
    claim.body = str(body_description).strip()

    # related links (in page body text <p>)
    related_links = []
    if parsed_claim_review_page.select('article > p > a'):
        for link in parsed_claim_review_page.select('article > p > a'):
            try:
                if hasattr(link, 'href'):
                    if 'http' in link['href']:
                        related_links.append(link['href'])
                    else:
                        related_links.append("https://fullfact.org" + link['href'])
            except KeyError as e:
                print("->KeyError: " + str(e))
                continue
            except IndexError as e:
                print("->IndexError: " + str(e))
                continue

    # related links (in "Related fact checks")
    if parsed_claim_review_page.select('section.related-factchecks > div > ul > li > a'):
        for link in parsed_claim_review_page.select('section.related-factchecks > div > ul > li > a'):
            try:
                if hasattr(link, 'href'):
                    if 'http' in link['href']:
                        related_links.append(link['href'])
                    else:
                        related_links.append("https://fullfact.org" + link['href'])
            except KeyError as e:
                print("->KeyError: " + str(e))
                continue
            except IndexError as e:
                print("->IndexError: " + str(e))
                continue
    if related_links:
        claim.referred_links = related_links

    # Fields that cannot be found on fullfact: tags, author_url, same_as,
    # rating_value, worst_rating, best_rating, review_author.

    # claim and rating: the card body alternates <p> elements between the claim
    # text and its verdict, so the column toggles between "claim" and "verdict".
    claim_text_list = []
    claim_text = None
    claim_verdict_list = []
    claim_verdict = None
    column = "claim"
    if parsed_claim_review_page.select('body > main > div > div > section > article > div > div > div.row.no-gutters.card-body-text > div > div > p'):
        for p in parsed_claim_review_page.select('body > main > div > div > section > article > div > div > div.row.no-gutters.card-body-text > div > div > p'):
            if hasattr(p, 'text'):
                if column == "claim":
                    claim_text_list.append(p.text.strip())
                    if claim_text is None:
                        claim_text = p.text.strip()
                    column = "verdict"
                else:
                    rating_word_list = p.text
                    conclusion_text = self._conclusion_processor.extract_conclusion(rating_word_list)
                    rating = str(conclusion_text).replace('"', "").strip()
                    if "." in rating:
                        split_name = rating.split(".")
                        if len(split_name) > 0:
                            rating = split_name[0]
                    claim_verdict_list.append(rating)
                    if claim_verdict is None:
                        claim_verdict = rating
                    column = "claim"

    # First local claim and rating:
    claim.claim = claim_text
    claim.rating = claim_verdict

    # Create one Claim per claim/verdict pair, copying the shared fields from the main
    # claim (appending the same object repeatedly would leave every entry pointing at
    # the last pair).
    import copy
    for local_claim_text, local_verdict in zip(claim_text_list, claim_verdict_list):
        local_claim = copy.deepcopy(claim)
        local_claim.claim = local_claim_text
        local_claim.rating = local_verdict
        claims.append(local_claim)

    # No rating? No claim?
    if not claim.claim or not claim.rating:
        print(url)
        if not claim.rating:
            print("-> Rating cannot be found!")
        if not claim.claim:
            print("-> Claim cannot be found!")
        return []

    return claims

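# Illustrative only: the card-body <p> elements above are assumed to alternate
# claim, verdict, claim, verdict, ..., so the pairing can equivalently be expressed as:
def _pair_claims_and_verdicts(paragraph_texts):
    local_claims = paragraph_texts[0::2]   # even positions: claim texts
    verdicts = paragraph_texts[1::2]       # odd positions: verdict texts
    return list(zip(local_claims, verdicts))
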
def get_all_claims(criteria):
    print(criteria.maxClaims)

    # Walk the listing pages and collect each article URL into urls_.
    urls_ = {}
    for page_number in range(1, 500):
        if criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims:
            break
        try:
            url = "https://correctiv.org/echtjetzt/artikel/seite/" + str(page_number) + "/"
            page = urllib.request.urlopen(url).read()
        except Exception:
            break
        soup = BeautifulSoup(page, "lxml")
        soup.prettify()
        links = soup.findAll('a', {"class": "entry-list-item__link"}, href=True)
        if len(links) != 0:
            for anchor in links:
                url_to_add = "https://correctiv.org" + str(anchor['href'])
                if url_to_add not in urls_:
                    if criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims:
                        break
                    urls_[url_to_add] = page_number
                    print("adding " + str(url_to_add))
        else:
            print("break!")
            break

    claims = []
    index = 0
    # Visit each collected article URL and extract its content.
    for url, conclusion in urls_.items():
        print(str(index) + "/" + str(len(urls_)) + " extracting " + str(url))
        index += 1
        url_complete = str(url)
        try:
            page = urllib.request.urlopen(url_complete).read().decode('utf-8', 'ignore')
            soup = BeautifulSoup(page, "lxml")
            soup.prettify("utf-8")

            claim_ = Claim()
            claim_.set_url(url_complete)
            claim_.set_source("correctiv")
            if criteria.html:
                claim_.setHtml(soup.prettify("utf-8"))

            # title
            title = soup.find("h1", {"class": "article-header__headline"})
            claim_.set_title(title.text.replace("Faktencheck:", "").replace("\n", ""))

            # date
            date_ = soup.find('time', {"class": "article-body__publishing-date"})
            if date_:
                date_str = search_dates(date_['title'].split("T")[0])[0][1].strftime("%Y-%m-%d")
                claim_.set_date(date_str)

            # body
            body = soup.find("div", {"class": "article-body__main"})
            claim_.set_body(body.get_text())

            # related links
            divTag = soup.find("div", {"class": "article-body__main"})
            related_links = []
            for link in divTag.findAll('a', href=True):
                related_links.append(link['href'])
            claim_.set_refered_links(related_links)

            claim_.set_claim(claim_.title)

            # conclusion ("Unsere Bewertung: " = "Our verdict: ")
            conclusion_div = soup.find('div', {"class": "article-body__claimreview claimreview"})
            if conclusion_div:
                claim_.setConclusion(conclusion_div.text.replace("Unsere Bewertung: ", "").replace("\n", ""))

            # tags
            tags = []
            for tag in soup.findAll('meta', {"property": "article:tag"}):
                tags.append(tag["content"])
            claim_.set_tags(", ".join(tags))

            claims.append(claim_.generate_dictionary())
        except Exception:
            print("Error ->" + str(url_complete))

    # Build a pandas dataframe from the extracted claims.
    pdf = pd.DataFrame(claims)
    return pdf

def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
    claim = Claim()

    # The review metadata is embedded in the page as schema.org JSON-LD.
    data = parsed_claim_review_page.find(string=re.compile("schema.org"))
    data = json.loads(str(data))
    node_zero = data['@graph'][0]

    if node_zero and 'claimReviewed' in node_zero.keys():
        claim_str = node_zero['claimReviewed']
        if claim_str and len(claim_str) > 0:
            claim.set_claim(claim_str)
    else:
        return []

    rating = data['@graph'][0]['reviewRating']
    if rating and 'alternateName' in rating.keys():
        claim.set_rating(rating['alternateName'])
        try:
            claim.set_best_rating(rating['bestRating'])
            claim.set_worst_rating(rating['worstRating'])
            claim.set_rating_value(rating['ratingValue'])
        except Exception:
            pass
    else:
        return []

    if 'author' in data['@graph'][0]['itemReviewed'].keys():
        author = data['@graph'][0]['itemReviewed']['author']
        if author and 'name' in author.keys():
            if len(str(author['name'])) > 0:
                claim.set_author(author['name'])

    claim.set_url(url)
    claim.set_source("factual_afp")

    try:
        title = data['@graph'][0]['name']
        claim.set_title(title)
    except Exception:
        pass

    try:
        claim.set_date(data['@graph'][0]['itemReviewed']['datePublished'])
    except Exception:
        pass

    try:
        date = data['@graph'][0]['datePublished']
        claim.set_date_published(date.split(' ')[0])
    except Exception:
        pass

    body = parsed_claim_review_page.find('div', {'class': 'article-entry clearfix'})
    claim.set_body(body.text)

    # Collect outgoing links from the article body, skipping <aside> blocks.
    links = []
    children = parsed_claim_review_page.find('div', {'class': 'article-entry clearfix'}).children
    for child in children:
        try:
            if child.name == 'aside':
                continue
            elems = child.findAll('a')
            for elem in elems:
                links.append(elem['href'])
        except Exception:
            continue
    claim.set_refered_links(links)

    return [claim]

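# Illustrative only: a minimal sketch of the schema.org JSON-LD shape the extractor above
# expects. The field names are taken from the code; the values below are made up.
_EXAMPLE_AFP_JSONLD = {
    "@graph": [{
        "name": "Example fact-check title",
        "datePublished": "2020-05-12 10:00",           # only the date part is kept
        "claimReviewed": "An example claim text",
        "reviewRating": {
            "alternateName": "Faux",
            "bestRating": 5,
            "worstRating": 1,
            "ratingValue": 1,
        },
        "itemReviewed": {
            "datePublished": "2020-05-10",
            "author": {"name": "Example author"},
        },
    }]
}
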
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
    claims = []

    # Breadcrumb entries are reused as keywords/tags.
    ol = parsed_claim_review_page.find('ol', {'class': 'breadcrumb col-xs-12'})
    elems = ol.findAll('a')
    keywords = []
    for elem in elems:
        keywords.append(elem.text)

    # Extraction of brief claims
    d = parsed_claim_review_page.findAll('div', {"id": "briefClaimConclusion"})
    if len(d) != 0:
        d = d[0].find('div', {"class": "box-panel"})
        divs = d.children
        for div in divs:
            if type(div) == type(d) and div.name == 'div':
                try:
                    claim = Claim()
                    claim.set_url(url)
                    claim.set_source("fullfact")
                    claim_str = div.find('div', {"class": "col-xs-12 col-sm-6 col-left"}).find('p').text
                    conclusion = div.find('div', {"class": "col-xs-12 col-sm-6 col-right"}).find('p').text
                    claim.set_claim(claim_str)
                    claim.set_alternate_name(conclusion)
                    claim.set_tags(','.join(keywords))
                    claims.append(claim)
                except Exception:
                    continue

    # Extraction of quotes
    quotes = parsed_claim_review_page.findAll('blockquote')
    if len(claims) == 0 or len(quotes) == 0:
        return claims
    for quote in quotes:
        claim = Claim()
        claim.set_url(url)
        claim.set_source("fullfact")
        try:
            p = quote.findAll('p')
            if len(p) == 1:
                # A single paragraph means there is no author or date.
                claim.set_claim(p[0].text)
                claim.set_tags(','.join(keywords))
                claims.append(claim)
                continue
            claim_str = ''
            for x in p[:-1]:
                # A quote is sometimes made of two paragraphs or more.
                claim_str = x.text
            if len(claim_str) < 4:
                # Too short to be a claim.
                continue
            p = p[-1]  # the last paragraph always mentions the author and the date
            author = p.text.split(',')[:-1]  # a comma always separates the two
            date = p.text.split(',')[-1]
            while not claim_str[0].isalnum():
                claim_str = claim_str[1:]
            while not claim_str[-1].isalnum():
                claim_str = claim_str[:-1]
            claim.set_claim(claim_str)
            claim.set_author(''.join(author))
        except Exception:
            continue
        try:
            # Usually the date is given inside the link to where the claim was made.
            a = p.find('a')
            d = datetime.strptime(a.text, '%d %B %Y').strftime("%Y-%m-%d")
            claim.set_refered_links(a['href'])
            claim.setDate(d)
        except Exception:
            try:
                d = datetime.strptime(date[1:-1], ' %d %B %Y').strftime("%Y-%m-%d")
                claim.setDate(d)
            except Exception:
                pass
        claim.set_tags(','.join(keywords))
        claims.append(claim)
    return claims

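# Illustrative only: the quote handling above assumes a final paragraph of the form
# "author, date", e.g. "Boris Johnson, 1 March 2019" (a made-up example). Splitting on the
# last comma and parsing the date with the same '%d %B %Y' format used above:
def _example_quote_attribution():
    last_paragraph = "Boris Johnson, 1 March 2019"
    author = ''.join(last_paragraph.split(',')[:-1])  # "Boris Johnson"
    date = last_paragraph.split(',')[-1]              # " 1 March 2019"
    return author, datetime.strptime(date.strip(), '%d %B %Y').strftime("%Y-%m-%d")  # -> "2019-03-01"
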
def get_all_claims(criteria):
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }

    # Walk the listing pages and collect each article URL into urls_.
    urls_ = {}
    last_page = []
    for page_number in range(1, 500):
        if 0 < criteria.maxClaims <= len(urls_):
            break
        url = "https://africacheck.org/latest-reports/page/" + str(page_number) + "/"
        try:
            page = requests.get(url, headers=headers, timeout=5)
            soup = BeautifulSoup(page.text, "lxml")
            soup.prettify()
            links = soup.findAll("div", {"class": "article-content"})
            if (len(links) != 0) or (links != last_page):
                for anchor in links:
                    anchor = anchor.find('a', href=True)
                    ind_ = str(anchor['href'])
                    if ind_ not in urls_:
                        if 0 < criteria.maxClaims <= len(urls_):
                            break
                        if ind_ not in criteria.avoid_url:
                            urls_[ind_] = ind_
                            print("adding " + str(ind_))
                last_page = links
            else:
                print("break!")
                break
        except Exception:
            print("error=>" + str(url))

    claims = []
    index = 0
    # Visit each collected article URL and extract its content.
    for url, conclusion in urls_.items():
        print(str(index) + "/" + str(len(urls_)) + " extracting " + str(url))
        index += 1
        url_complete = str(url)

        page = requests.get(url_complete, headers=headers, timeout=5)
        soup = BeautifulSoup(page.text, "lxml")
        soup.prettify("utf-8")

        claim_ = Claim()
        claim_.set_url(url_complete)
        claim_.set_source("africacheck")

        # title
        title = soup.find("meta", {"property": "og:title"})
        title_content = title['content']
        if "|" in title_content:
            title_content = title_content.split("|")[-1]
        claim_.set_title(title_content)

        # date
        date_ = soup.find('time')
        if date_:
            date_str = search_dates(date_['datetime'].split(" ")[0])[0][1].strftime("%Y-%m-%d")
            claim_.set_date(date_str)

        # rating
        truth_rating = ""
        if soup.find("div", {"class": "verdict-stamp"}):
            truth_rating = soup.find("div", {"class": "verdict-stamp"}).get_text()
        if soup.find("div", {"class": "verdict"}):
            truth_rating = soup.find("div", {"class": "verdict"}).get_text()
        if soup.find("div", {"class": "indicator"}):
            truth_rating = soup.find("div", {"class": "indicator"}).get_text()
            if soup.find("div", {"class": "indicator"}).find('span'):
                truth_rating = soup.find("div", {"class": "indicator"}).find('span').get_text()
        claim_.set_rating(str(re.sub('[^A-Za-z0-9 -]+', '', truth_rating)).lower().strip())

        # date fallback when the datetime attribute is missing
        date_ = soup.find("time", {"class": "datetime"})
        if date_:
            claim_.set_date(date_.get_text())

        # body
        body = soup.find("div", {"id": "main"})
        claim_.set_body(body.get_text())

        # author
        author = soup.find("div", {"class": "sharethefacts-speaker-name"})
        if author:
            claim_.set_author(author.get_text())

        # related links
        divTag = soup.find("div", {"id": "main"})
        related_links = []
        for link in divTag.findAll('a', href=True):
            related_links.append(link['href'])
        claim_.set_refered_links(related_links)

        # claim
        if soup.find("div", {"class": "report-claim"}):
            claim_.set_claim(soup.find("div", {"class": "report-claim"}).find("strong").get_text())
        else:
            claim_.set_claim(claim_.title)

        # tags
        tags = []
        for tag in soup.findAll('meta', {"property": "article:tag"}):
            tags.append(tag["content"])
        claim_.set_tags(", ".join(tags))

        claims.append(claim_.generate_dictionary())

    # Build a pandas dataframe from the extracted claims.
    pdf = pd.DataFrame(claims)
    return pdf