def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
    claim = Claim()
    claim.set_url(url)
    claim.set_source("checkyourfact")

    # title
    title = parsed_claim_review_page.find('article').find("h1")
    claim.set_title(title.text.replace("FACT CHECK: ", ""))

    url_date = url.replace("https://checkyourfact.com/", "").replace("/", " ").split(" ")
    claim.set_date(url_date[0] + "-" + url_date[1] + "-" + url_date[2])

    # author & author_url
    if parsed_claim_review_page.select('detail > article > author'):
        for author in parsed_claim_review_page.select('detail > article > author'):
            if hasattr(author, "data-slug"):
                author_str = author.text.split("|")[0].strip().split("\n")[0]
                claim.author = author_str
                claim.author_url = "https://checkyourfact.com/author/" + author['data-slug']
                break

    # body
    body = parsed_claim_review_page.find("article")
    claim.set_body(body.get_text())

    # related links
    div_tag = parsed_claim_review_page.find("article")
    related_links = []
    for link in div_tag.findAll('a', href=True):
        related_links.append(link['href'])
    claim.set_refered_links(related_links)

    claim.set_claim(claim.title)

    # rating
    rating = find_by_text(parsed_claim_review_page, "Verdict", "span")
    if rating:
        rating_text = rating[0].text.split(":")[-1].strip()
        claim.set_rating(rating_text)

    tags = []
    for tag in parsed_claim_review_page.findAll('meta', {"property": "article:tag"}):
        tags.append(tag["content"])
    claim.set_tags(", ".join(tags))

    if len(claim.rating) == 0:
        return []
    return [claim]
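# NOTE: find_by_text() used above is a shared helper defined elsewhere in the
# repository. A minimal sketch of the behaviour this scraper relies on (an
# assumption, not the actual implementation): return every <tag_name> element
# whose text contains the query string, so rating[0] above would be the first
# "Verdict: ..." span on the page.
def find_by_text(soup, text, tag_name):
    return [tag for tag in soup.find_all(tag_name) if text in tag.get_text()]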
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
    claim = Claim()
    claim.set_url(url)
    claim.set_source("truthorfiction")

    title = parsed_claim_review_page.find("meta", {"property": "og:title"})['content']
    claim.set_title(title)

    article = parsed_claim_review_page.find("article")

    # date
    date_ = parsed_claim_review_page.find('meta', {"property": "article:published_time"})['content']
    if date_:
        date_str = date_.split("T")[0]
        claim.set_date(date_str)

    # author
    author_ = parsed_claim_review_page.find('meta', {"name": "author"})['content']
    if author_:
        claim.set_author(author_)

    # author link
    author_url = parsed_claim_review_page.find('a', {"class": "url fn n"})['href']
    if author_url:
        claim.author_url = author_url

    # body
    content = [tag for tag in article.contents if not isinstance(tag, NavigableString)]
    body = content[-1]  # type: Tag
    if body.has_attr("class") and "content-source" in body['class']:
        body = content[-2]
    claim.set_body(body.text.strip())

    # related links
    related_links = []
    for link in body.findAll('a', href=True):
        related_links.append(link['href'])
    claim.set_refered_links(related_links)

    description = article.find("div", {"class", "claim-description"})
    rating = article.find("div", {"class", "rating-description"})
    if description and rating:
        claim.set_claim(description.text)
        claim.rating = rating.text
    else:
        # fall back to parsing "<claim> - <rating>" out of the headline
        h1 = article.find("h1")
        text = h1.text.replace("–", "-")
        split_text = text.split("-")
        rating_text = split_text[-1]
        claim_text = "".join(split_text[0:-1])
        if len(claim_text) == 0 or "-" not in text:
            return []
        claim.set_rating(rating_text)
        claim.set_claim(claim_text)

    # tags
    tags = []
    if parsed_claim_review_page.select('footer > span.tags-links > a'):
        for link in parsed_claim_review_page.select('footer > span.tags-links > a'):
            if hasattr(link, 'href'):
                # tag_link = link['href']
                tags.append(link.text)
    claim.set_tags(", ".join(tags))

    return [claim]
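# The Claim container used by every extractor is defined elsewhere in the
# repository. A minimal sketch of the interface these methods rely on (an
# assumption about its shape, not the real class): plain string/list fields
# plus thin setters, including the set_refered_links spelling used above.
class Claim:
    def __init__(self):
        self.url = ""
        self.source = ""
        self.title = ""
        self.claim = ""
        self.rating = ""
        self.date = ""
        self.date_published = ""
        self.author = ""
        self.author_url = ""
        self.review_author = ""
        self.body = ""
        self.referred_links = []
        self.tags = ""
        self.alternate_name = ""

    def set_url(self, url): self.url = url
    def set_source(self, source): self.source = source
    def set_title(self, title): self.title = title
    def set_claim(self, claim): self.claim = claim
    def set_rating(self, rating): self.rating = rating
    def set_date(self, date): self.date = date
    def set_author(self, author): self.author = author
    def set_body(self, body): self.body = body
    def set_tags(self, tags): self.tags = tags
    def set_refered_links(self, links): self.referred_links = links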
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
    if url in url_blacklist:
        return []

    claim = Claim()

    # url
    claim.url = str(url)

    # source
    claim.source = "snopes"

    # title
    title = None
    if parsed_claim_review_page.select('article > header > h1'):
        for tmp in parsed_claim_review_page.select('article > header > h1'):
            title = tmp.text.strip()
    # sub_title = parsed_claim_review_page.select('article > header > h2')
    claim.title = str(title.strip())

    # author
    author_list = []
    author_links = []
    author_selector = ('article > header > ul.list-unstyled.authors.list-unstyled'
                       '.d-flex.flex-wrap.comma-separated > li > a')
    if parsed_claim_review_page.select(author_selector):
        for author_a in parsed_claim_review_page.select(author_selector):
            if hasattr(author_a, 'href'):
                author_list.append(author_a.text.strip())
                author_links.append(author_a.attrs['href'])
    else:
        print("no author?")
    claim.author = ", ".join(author_list)
    claim.author_url = ", ".join(author_links)

    # review_author ?
    # -

    # date
    datePub = None
    dateUpd = None
    date_str = ""
    date_ = parsed_claim_review_page.find('ul', {"class": "dates"})
    if date_:
        dates = date_.find('li', {"class": "font-weight-bold text-muted"})
        dateSpans = dates.span
        for dateItems in dateSpans:
            if dateItems == 'Published':
                datePub = dateItems.next.strip()
                date_str = dateparser.parse(datePub).strftime("%Y-%m-%d")
                claim.date_published = date_str
                claim.date = date_str
            if dateItems == 'Updated':
                dateUpd = dateItems.next.strip()
                date_str = dateparser.parse(dateUpd).strftime("%Y-%m-%d")
                claim.date = date_str

    # claim image?
    # -

    # claim
    claim_text = None
    if parsed_claim_review_page.select('article > div > div.claim-text.card-body'):
        for p in parsed_claim_review_page.select('article > div > div.claim-text.card-body'):
            if hasattr(p, 'text'):
                claim_text = p.text.strip()
    claim.claim = str(claim_text).strip()

    # rating -> https://www.snopes.com/fact-check-ratings/
    rating = None
    if parsed_claim_review_page.select('article > div > div > div > div.media-body > span'):
        for rating_span in parsed_claim_review_page.select('article > div > div > div > div.media-body > span'):
            rating = rating_span.text.strip()
    claim.rating = str(rating).replace('"', "").strip()
    # claim.set_rating_value(rating)

    # rating best
    whats_true = None
    if parsed_claim_review_page.select('article > div > div > div.whats-true > div > p'):
        for rating_span_true in parsed_claim_review_page.select('article > div > div > div.whats-true > div > p'):
            whats_true = rating_span_true.text.strip()
    if whats_true:
        whats_true = str(whats_true).replace('"', "")  # Text: (not Numerical value)
        # claim.best_rating = whats_true

    # rating worst
    whats_false = None
    if parsed_claim_review_page.select('article > div > div > div.whats-false > div > p'):
        for rating_span_false in parsed_claim_review_page.select('article > div > div > div.whats-false > div > p'):
            whats_false = rating_span_false.text.strip()
    if whats_false:
        whats_false = str(whats_false).replace('"', "")  # Text: (not Numerical value)
        # claim.worst_rating = whats_false

    # rating undetermined?
    whats_undetermined = False
    if parsed_claim_review_page.select('article > div > div > div.whats-undetermined > div > p'):
        for rating_span_undetermined in parsed_claim_review_page.select('article > div > div > div.whats-undetermined > div > p'):
            whats_undetermined = rating_span_undetermined.text.strip()
    if whats_undetermined:
        whats_undetermined = str(whats_undetermined).replace('"', "")  # Text: (not Numerical value)
        # claim.whats_undetermined = whats_undetermined

    # rating value ?
    # -

    # body description
    text = ""
    if parsed_claim_review_page.select('article > div.single-body.card.card-body.rich-text > p'):
        for child in parsed_claim_review_page.select('article > div.single-body.card.card-body.rich-text > p'):
            text += " " + child.text
    body_description = text.strip()
    claim.body = str(body_description).strip()

    # related links
    related_links = []
    if parsed_claim_review_page.select('article > div.single-body.card.card-body > p > a'):
        for link in parsed_claim_review_page.select('article > div.single-body.card.card-body > p > a'):
            if hasattr(link, 'href'):
                related_links.append(link['href'])
    claim.referred_links = related_links

    # tags
    tags = []
    if parsed_claim_review_page.select('article > footer > div > a > div > div'):
        for tag in parsed_claim_review_page.select('article > footer > div > a > div > div'):
            if hasattr(tag, 'text'):
                tags.append(tag.text.strip())
    claim.tags = ", ".join(tags)

    # same as ?
    # -

    # no rating? no claim?
    if not claim_text or not rating:
        print(url)
        if not rating:
            print("-> Rating cannot be found!")
        if not claim_text:
            print("-> Claim cannot be found!")
        return []

    return [claim]
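# Usage sketch (illustrative only): how a caller might drive one of these
# extractors. The fetching code, the "lxml" parser choice, and the fact that
# the method is called on an extractor instance are assumptions made here for
# demonstration; they are not part of the original extractor code.
import requests
from bs4 import BeautifulSoup


def scrape_one(extractor, url):
    # download the fact-check page, parse it, and hand it to the extractor
    html = requests.get(url).text
    page = BeautifulSoup(html, "lxml")
    return extractor.extract_claim_and_review(page, url)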
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
    claim = Claim()
    claim.set_url(url)
    # print("\r" + url)
    claim.set_source("politifact")

    # claim
    title = parsed_claim_review_page.find("div", {"class": "m-statement__quote"})
    claim.set_claim(title.text.strip())

    # title
    title = parsed_claim_review_page.find("h2", {"class": "c-title"})
    claim.set_title(title.text.strip())

    # date
    date = parsed_claim_review_page.find('span', {"class": "m-author__date"})
    if date:
        date_str = search_dates(date.text)[0][1].strftime("%Y-%m-%d")
        claim.set_date(date_str)

    # rating
    # https://static.politifact.com/politifact/rulings/meter-mostly-false.jpg
    statement_body = parsed_claim_review_page.find("div", {"class", "m-statement__body"})
    statement_detail = statement_body.find("div", {"class", "c-image"})
    statement_detail_image = statement_detail.find("picture")
    statement_detail_image_alt = statement_detail_image.find("img", {"class", "c-image__original"})
    if statement_detail_image_alt:
        # claim.alternate_name = statement_detail_image_alt['src'].split("rulings/")[1].split(".jpg")[0]
        if self.translate_rating_value(statement_detail_image_alt['alt']) != "":
            claim.rating = self.translate_rating_value(statement_detail_image_alt['alt'])
        else:
            claim.rating = statement_detail_image_alt['alt']

    # body
    body = parsed_claim_review_page.find("article", {"class": "m-textblock"})
    # body.find("div", {"class": "artembed"}).decompose()
    # claim.set_body(body.get_text())
    text = ""
    if parsed_claim_review_page.select('main > section > div.t-row__center > article.m-textblock'):
        for child in parsed_claim_review_page.select('main > section > div.t-row__center > article.m-textblock'):
            for element in child.contents:
                if element.name == "div":
                    valid = True
                    # check for illegal JS element in artembed (tag):
                    if hasattr(element, 'class'):
                        try:
                            if 'class' in element.attrs:
                                if element.attrs['class'][0] == "artembed":
                                    if element.text.startswith("\r\nwindow.gciAnalyticsUAID"):
                                        valid = False
                        except KeyError:
                            print("KeyError: Skip")
                else:
                    valid = True
                if hasattr(element, 'text'):
                    # if (element.text == "We rate this claim False."
                    #         and url == "https://www.politifact.com/staff/kelsey-tamakloe/"):
                    if url == "https://www.politifact.com/staff/kelsey-tamakloe/":
                        print("\r" + str(element.text))
                if valid:
                    if element:
                        if hasattr(element, 'text'):
                            text += " " + str(element.text)
                        else:
                            text += " " + str(element)
    body_description = text.strip()
    claim.body = str(body_description).strip()

    # author
    author_meta = parsed_claim_review_page.find("div", {"class": "m-author__content"})
    if author_meta:
        author = author_meta.find("a").text
        claim.set_author(author)
        author_url = author_meta.find("a")
        if author_url.attrs["href"] != "":
            claim.author_url = "https://www.politifact.com" + author_url.attrs["href"]

    # date published
    statement_meta = parsed_claim_review_page.find("div", {"class": "m-statement__meta"})
    if statement_meta:
        meta_text = statement_meta.text
        if "on" in meta_text:
            meta_text = meta_text.split(" on ")[1]
        if "in" in meta_text:
            meta_text = meta_text.split(" in ")[0]
        if meta_text:
            date = search_dates(meta_text)
            if date:
                date = date[0][1].strftime("%Y-%m-%d")
                claim.date = date

    # related links
    div_tag = parsed_claim_review_page.find("article", {"class": "m-textblock"})
    related_links = []
    for link in body.find_all('a', href=True):
        if link['href'][0] == "/":
            related_links.append("https://www.politifact.com" + link['href'])
        else:
            related_links.append(link['href'])
    claim.set_refered_links(related_links)

    claim.set_claim(parsed_claim_review_page.find("div", {"class": "m-statement__quote"}).text.strip())

    # tags
    tags = []
    ul_tag = parsed_claim_review_page.find("ul", {"class", "m-list"})
    if ul_tag:
        ul_tag_contents = ul_tag.findAll("li", {"class", "m-list__item"})
        for a in ul_tag_contents:
            a_tag = a.find("a", title=True)
            a_tag_text = a_tag['title']
            tags.append(a_tag_text)
    if statement_body:
        topics = statement_body.find("ul", {"class", "m-list"}).find_all("a")
        for link in topics:
            text = link['title']
            tags.append(text)
    claim.set_tags(",".join(tags))

    return [claim]
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
    claim = Claim()
    claim.set_url(url)
    claim.set_source("newtral")

    # title, claim, and claim author
    title = parsed_claim_review_page.find("meta", attrs={'property': 'og:title'})['content']
    title = title.strip().split("|")[0]
    claim.set_title(title)

    entry_content = parsed_claim_review_page.find("div", attrs={'class': 'entry-content'})

    # A title of the form "<author>: «<claim>»" or "<author>: “<claim>”" carries
    # both the claim author and the claim text.
    dospunto = re.search(r'(: «)', title)
    dospunt = re.search(r'(: “)', title)
    if dospunto:
        claim_a = title.split(":")
        auteur = claim_a[0].strip()
        claim.author = auteur
        claim_text = claim_a[1].strip("« »")
        claim.claim = claim_text
    elif dospunt:
        claim_b = title.split(":")
        auteur = claim_b[0].strip()
        claim.author = auteur
        claim_text = claim_b[1].strip(": “ ”")
        claim.claim = claim_text

    # multiple titles or claims
    claim_mult = entry_content.findAll('h2')
    if claim_mult:
        claim_al = [i.text.strip() for i in claim_mult]
        # re.search expects a string, not a list, so search the joined headings
        claim_al_text = " ".join(claim_al)
        dospunt = re.search(r'(: “)', claim_al_text)
        dospunto = re.search(r'(: «)', claim_al_text)
        if dospunt:
            claim_b = title.split(":")
            auteur = claim_b[0].strip()
            claim.author = auteur
            claim_text = claim_b[1].strip(": “ ”")
            claim.claim = claim_text
        elif dospunto:
            claim_a = title.split(":")
            auteur = claim_a[0].strip()
            claim.author = auteur
            claim_text = claim_a[1].strip("« »")
            claim.claim = claim_text
        else:
            claim.set_title(claim_al)

    # tags
    tags = parsed_claim_review_page.find_all("meta", attrs={'property': 'article:tag'})
    tag_list = []
    for tag in tags:
        tag_text = tag['content']
        tag_list.append(tag_text)
    claim.set_tags(",".join(tag_list))

    # date published
    published = parsed_claim_review_page.find("meta", attrs={'property': 'article:published_time'})['content']
    claim.date_published = published.strip()

    # article author
    author_span = parsed_claim_review_page.find("span", attrs={'class': 'c-article__author'})
    author_a = author_span.find("a")
    author_url = author_a['href']
    author_text = author_a.text
    author_text = re.sub('Por', '', author_text).strip()
    claim.author_url = author_url
    claim.review_author = author_text

    # retrieve the article body text
    entry_text = ""
    body_t = entry_content.find_all('p')
    body = [text.text.strip() for text in body_t]
    entry_text += " ".join(body) + "\n"
    claim.body = entry_text

    # retrieve the links in the article body
    links = [link['href'] for link in entry_content.find_all('a', href=True)]
    claim.referred_links = links

    # verdict
    intro = parsed_claim_review_page.find("div", attrs={'class': 'c-article__intro'})
    veracities = ["ENGAÑOSA", "ENGAÑOSO", "FALSO", "FALSA", "FALSOS", "VERDADERO", "VERDAD A MEDIAS"]

    def common(a, b):
        return [value for value in a if value in b]

    if intro:
        intro_p = " ".join(str(v) for v in intro)
        rating_text_list = intro_p.upper()
        rating_text = [i.strip() for i in common(veracities, rating_text_list)]
        claim.alternate_name = rating_text
    else:
        body_a = " ".join(str(v) for v in body)
        rating_text_list = body_a.upper()
        rating_text = [i.strip() for i in common(veracities, rating_text_list)]
        claim.alternate_name = rating_text

    return [claim]
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
    claim = Claim()
    claim.set_url(url)
    claim.set_source("newtral")

    title = parsed_claim_review_page.find("meta", attrs={'property': 'og:title'})['content']
    title = title.strip().split("|")[0]
    claim.set_title(title)

    # A title of the form "<author>: «<claim>»" or "<author>: “<claim>”" carries
    # both the claim author and the claim text.
    dospunto = re.search(r'(: «)', title)
    dospunt = re.search(r'(: “)', title)
    if dospunto:
        claim_a = title.split(":")
        auteur = claim_a[0].strip()
        claim.author = auteur
        claim_text = claim_a[1].strip("« »")
        claim.claim = claim_text
    elif dospunt:
        claim_b = title.split(":")
        auteur = claim_b[0].strip()
        claim.author = auteur
        claim_text = claim_b[1].strip(": “ ”")
        claim.claim = claim_text

    # tags
    tags = parsed_claim_review_page.find_all("meta", attrs={'property': 'article:tag'})
    tag_list = []
    for tag in tags:
        tag_text = tag['content']
        tag_list.append(tag_text)
    claim.set_tags(",".join(tag_list))

    # date published
    published = parsed_claim_review_page.find("meta", attrs={'property': 'article:published_time'})['content']
    claim.date_published = published.strip()

    # rating: look for the verdict in the article intro, falling back to the
    # first emphasised paragraph of the entry content
    entry_content = parsed_claim_review_page.find("div", attrs={'class': 'entry-content'})
    intro = parsed_claim_review_page.find("div", attrs={'class': 'c-article__intro'})
    if intro is None:
        intro_rating_p = entry_content.find("em")
        if intro_rating_p is None:
            intro_rating_p = entry_content.find("p")
        if intro_rating_p is None:
            intro_rating_p = entry_content.find("div")
    else:
        intro_rating_p = intro.p

    rating_in_image = False
    if intro_rating_p is None:
        # Rating in image...
        rating_in_image = True
        rating_text = ""
    else:
        rating_text = intro_rating_p.get_text()

    rating_re_es_falso = regex.compile(
        r"(La afirmación es|La afirmación es una|La declaración es|Es|El dato es"
        r"|La comparación de Colau es)? ?([\p{Lu}| ]+)(\.| –|,| )")
    es_falso_match = rating_re_es_falso.match(rating_text)
    if es_falso_match is not None and es_falso_match.group(2) is not None:
        rating_text = es_falso_match.group(2)
    else:
        if not rating_in_image:
            is_there_b = intro_rating_p.find('b')
            if is_there_b is not None:
                rating_text = is_there_b.text
            else:
                is_there_strong = intro_rating_p.find("strong")
                if is_there_strong is not None:
                    rating_text = is_there_strong.text
    claim.rating = rating_text

    # article author
    author_span = parsed_claim_review_page.find("span", attrs={'class': 'c-article__author'})
    author_a = author_span.find("a")
    author_url = author_a['href']
    author_text = author_a.text
    author_text = re.sub('Por', '', author_text).strip()
    claim.author_url = author_url
    claim.review_author = author_text

    # retrieve the article body text
    entry_text = ""
    body_t = entry_content.find_all('p')
    body = [text.text.strip() for text in body_t]
    entry_text += " ".join(body) + "\n"
    claim.body = entry_text

    # retrieve the links in the article body
    links = [link['href'] for link in entry_content.find_all('a', href=True)]
    claim.referred_links = links

    return [claim]
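# Illustrative check of the verdict regex used above. The sample sentence is
# made up for demonstration and is not taken from newtral.es; group(2)
# captures the upper-case verdict that follows one of the lead-in phrases.
import regex

rating_re_es_falso = regex.compile(
    r"(La afirmación es|La afirmación es una|La declaración es|Es|El dato es"
    r"|La comparación de Colau es)? ?([\p{Lu}| ]+)(\.| –|,| )")

match = rating_re_es_falso.match("La afirmación es FALSA. El dato correcto es otro.")
print(match.group(2))  # -> "FALSA"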
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
    local_claims = []
    claim = Claim()
    claim.set_url(url)
    claim.set_source("africacheck")

    # title
    title = parsed_claim_review_page.find("meta", {"property": "og:title"})
    global_title_text = title['content']
    claim.set_title(global_title_text)

    # date
    date = parsed_claim_review_page.find("span", {"class": "published"}).next
    global_date_str = ""
    if date:
        # global_date_str = search_dates(date['datetime'].split(" ")[0])[0][1].strftime("%Y-%m-%d")
        global_date_str = search_dates(date)[0][1].strftime("%Y-%m-%d")
        claim.set_date(global_date_str)

    # author
    author = parsed_claim_review_page.find("div", {"class": "author-details"})
    if author:
        claim.set_author(author.get_text())
    if parsed_claim_review_page.select('div.author-details > a > h4'):
        for child in parsed_claim_review_page.select('div.author-details > a > h4'):
            try:
                claim.author = child.get_text()
                continue
            except KeyError:
                print("KeyError: Skip")
    if parsed_claim_review_page.select('div.author-details > a'):
        for child in parsed_claim_review_page.select('div.author-details > a'):
            try:
                claim.author_url = child['href']
                continue
            except KeyError:
                print("KeyError: Skip")

    # tags
    tags = []
    for tag in parsed_claim_review_page.findAll('meta', {"property": "article:tag"}):
        tags.append(tag["content"])
    claim.set_tags(", ".join(tags))

    # claim
    entry_section = parsed_claim_review_page.find("section", {"class", "cell"})
    verdict_box = parsed_claim_review_page.find("div", {"class", "article-details__verdict"})
    if verdict_box and len(verdict_box) > 0 and "Verdict" in verdict_box.text:
        report_claim_div = parsed_claim_review_page.find("div", {"class": "field--name-field-claims"})
        if report_claim_div:
            claim.set_claim(report_claim_div.get_text())
        else:
            claim.set_claim(claim.title)

        # rating
        inline_ratings = parsed_claim_review_page.findAll("div", {"class", "rating"})
        if inline_ratings:
            if hasattr(inline_ratings[0], 'class'):
                try:
                    if 'class' in inline_ratings[0].attrs:
                        if inline_ratings[0].attrs['class'][1]:
                            rating_tmp = inline_ratings[0].attrs['class'][1]
                            claim.rating = rating_tmp.replace('rating--', '').replace("-", "").capitalize()
                except KeyError:
                    print("KeyError: Skip")
    else:
        # alternative rating (if there is no article--aside box with a verdict)
        global_truth_rating = ""
        if parsed_claim_review_page.find("div", {"class": "verdict-stamp"}):
            global_truth_rating = parsed_claim_review_page.find("div", {"class": "verdict-stamp"}).get_text()
        if parsed_claim_review_page.find("div", {"class": "verdict"}):
            global_truth_rating = parsed_claim_review_page.find("div", {"class": "verdict"}).get_text()
        if parsed_claim_review_page.find("div", {"class": "indicator"}):
            global_truth_rating = parsed_claim_review_page.find("div", {"class": "indicator"}).get_text()
            if parsed_claim_review_page.find("div", {"class": "indicator"}).find('span'):
                global_truth_rating = parsed_claim_review_page.find(
                    "div", {"class": "indicator"}).find('span').get_text()

        # if there is still no rating value, try to extract it from the picture name
        if global_truth_rating == "":
            filename = ""
            if parsed_claim_review_page.select('div.hero__image > picture'):
                for child in parsed_claim_review_page.select('div.hero__image > picture'):
                    # child.contents[1].attrs['srcset']
                    if hasattr(child, 'contents'):
                        try:
                            filename = child.contents[1].attrs['srcset']
                            continue
                        except KeyError:
                            print("KeyError: Skip")
            if filename != "":
                filename_split = filename.split("/")
                filename_split = filename_split[len(filename_split) - 1].split(".png")
                filename_split = filename_split[0].split("_")
                if len(filename_split) == 1:
                    global_truth_rating = filename_split[0]
                else:
                    global_truth_rating = filename_split[len(filename_split) - 1]

        claim.set_rating(
            str(re.sub('[^A-Za-z0-9 -]+', '', global_truth_rating)).lower().strip()
            .replace("pfalse", "false").replace("-", "").capitalize())

        if not self.rating_value_is_valid(claim.rating):
            print("\nURL: " + url)
            print("\n Rating:" + claim.rating)
            claim.rating = ""

    # body
    body = parsed_claim_review_page.find("div", {"class": "article--main"})
    claim.set_body(body.get_text())

    # related links
    related_links = []
    for link in body.findAll('a', href=True):
        related_links.append(link['href'])
    claim.set_refered_links(related_links)

    if claim.rating:
        return [claim]
    return []