import re
from datetime import datetime

import requests
import unidecode
from bs4 import BeautifulSoup

import utils    # project-local helpers: recovery_flux_url_rss, recovery_article
import utilsg4  # project-local variant used by some of the scrapers below


def recovery_information_noob(url_article):
    """
    Arguments:
        - url of one article
    Returns:
        - information about the article
    """
    soup_article = utils.recovery_flux_url_rss(url_article)
    title = soup_article.title.get_text()

    # Retrieval of the publication date
    date_p = ""
    find_date = soup_article.find('time', attrs={"class": "date"})
    if find_date is not None:
        for a in find_date.find_all('a'):
            find_valeur = re.compile(r'[0-9]{4}/[0-9]{2}/[0-9]{2}')
            for valeur in find_valeur.finditer(str(a.get("href"))):
                date_p = valeur.group(0)
                date_p = datetime.strptime(date_p, "%Y/%m/%d")\
                    .strftime("%Y-%m-%d")

    # Retrieval of the author of the article
    author = []
    for div in soup_article.find_all('div'):
        if re.search('author', str(div.get("class"))):
            author.append(div.p.span.get_text())

    # Retrieval of the article theme
    theme = ""
    for nav in soup_article.find_all('nav'):
        if nav.get("class") == ['breadcrumb']:
            for ol in nav.find_all('ol'):
                for a in ol.find_all('a'):
                    theme = a.get_text()

    # Retrieval of the content of the article, blanking out asides,
    # "lire" (read more) links and image captions embedded in the body
    contents = ""
    for div in soup_article.find_all('div'):
        if re.search('body', str(div.get("id"))):
            for aside in div.find_all('aside'):
                for p in aside.find_all('p'):
                    p.string = ""
            for p in div.find_all('p'):
                for a in p.find_all('a'):
                    if a.get("class") == ['lire']:
                        a.string = ""
                for img in p.find_all('img'):
                    p.string = ""
                contents += p.get_text() + " "

    article = utils.recovery_article(title, 'NouvelObservateur', author,
                                     date_p, contents, theme)
    return article

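# Every scraper in this module builds its tree via utils.recovery_flux_url_rss,
# whose real definition lives in the project's utils module. The sketch below
# is only an assumption of its shape, inferred from info_articles() further
# down, which fetches with requests and parses with lxml.
def _recovery_flux_url_rss_sketch(url):
    """Hypothetical stand-in: fetch a page and parse it with BeautifulSoup."""
    req = requests.get(url)
    return BeautifulSoup(req.text, "lxml")
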
def recovery_information_hum(url_article):
    """
    Arguments:
        - url of one article
    Returns:
        - information about the article
    """
    soup_article = utils.recovery_flux_url_rss(url_article)

    # Title, theme and publication date all come from the article meta tags
    title = ""
    theme = ""
    date_p = ""
    for meta in soup_article.find_all('meta'):
        if meta.get("property") == 'og:title':
            title = meta.get("content")
        elif meta.get("property") == 'article:section':
            theme = meta.get("content")
        elif meta.get("property") == 'article:published_time':
            raw_date = meta.get("content")
            date_p = datetime.strptime(raw_date[0:10],
                                       "%Y-%m-%d").strftime("%d/%m/%Y")

    # Authors are the <h2> links whose href mentions "auteur"
    author = []
    for h2 in soup_article.find_all('h2'):
        for a in h2.find_all('a'):
            if re.search('auteur', str(a.get("href"))):
                author.append(a.get_text())

    # The body is split between the "chapo" (lead) div and the main text div
    contents = ""
    for div in soup_article.find_all('div'):
        if div.get("class") in (
                ['field', 'field-name-field-news-chapo',
                 'field-type-text-long', 'field-label-hidden'],
                ['field', 'field-name-field-news-text',
                 'field-type-text-long', 'field-label-hidden']):
            for p in div.find_all('p'):
                contents += p.get_text()

    article = utils.recovery_article(title, 'Humanite', author,
                                     date_p, contents, theme)
    return article

def recovery_information_fem(url_article):
    """
    Arguments:
        - url of one article
    Returns:
        - information about the article
    """
    soup_article = utils.recovery_flux_url_rss(url_article)

    # The page title is "<article title> - <site name>"
    title = soup_article.title.get_text().split(" - ")[0]

    date_p = ""
    for meta in soup_article.find_all('meta'):
        if meta.get("property") == 'article:published_time':
            date_p = meta.get("content")[0:10]

    author = []
    for meta in soup_article.find_all('meta'):
        if meta.get("property") == 'article:author':
            author.append(meta.get("content"))

    # The theme is the first path component of the section index link
    theme = ""
    for link in soup_article.find_all('link'):
        if link.get("rel") == ['Index']:
            part_link = link.get("href").split("/")
            theme = part_link[3]

    # The body is split between the lead ("chapo"), the main content
    # and the slideshow ("diaporama") captions
    contents = ""
    for div in soup_article.find_all('div'):
        if div.get("class") in (['chapo'], ['contenu'], ['diaporama']):
            for p in div.find_all('p'):
                contents += p.get_text() + " "
    contents = re.sub(r"\s\s+", " ", contents)

    article = utils.recovery_article(title, 'Femina', author,
                                     date_p, contents, theme)
    return article

def info_articles(article_link):
    req = requests.get(article_link)
    soup = BeautifulSoup(req.text, "lxml")

    title = unidecode.unidecode(soup.find('title').string)
    newspaper = "Le Monde"

    # Article theme, taken from the breadcrumb when present
    if soup.find("li", class_="ariane z2"):
        theme = soup.find("li", class_="ariane z2").find("a").get_text()
    else:
        theme = 'Forum'

    # Author of the article
    if soup.find("span", class_="auteur"):
        if soup.find("span", class_="auteur").a:
            author = soup.find("span", class_="auteur").find("a").get_text()
        else:
            author = soup.find("span", class_="auteur").get_text()
        author = re.sub(r"\s\s+", " ", author).strip()
    else:
        author = ""

    # Publication date; fall back to today when the <time> tag
    # carries no parsable date
    match = re.search(r"\d{4}-\d{2}-\d{2}", soup.find("time").get("datetime"))
    if match:
        date_p = datetime.strptime(match.group(0),
                                   "%Y-%m-%d").strftime("%d/%m/%Y")
    else:
        date_p = datetime.now().strftime("%d/%m/%Y")

    # Article content
    content = ""
    for div in soup.find_all('div'):
        for p in div.find_all('p'):
            content += p.get_text() + " "
    content = unidecode.unidecode(re.sub(r"\s\s+", " ", content))

    new_article = utilsg4.recovery_article(
        title, newspaper, [author], date_p, content, theme)
    return new_article

def recovery_information_sv(url_article):
    """
    Arguments:
        - url of one article
    Returns:
        - information about the article
    """
    soup_article = utils.recovery_flux_url_rss(url_article)

    # Title
    title = ""
    for h1 in soup_article.find_all("h1"):
        if h1.get("class") == ["like-h1"]:
            title = h1.get_text()

    # Date: turn the leading YYYY-MM-DD of the datetime attribute
    # into DD/MM/YYYY
    t_date = soup_article.find("time")["datetime"][:10].split("-")
    t_date.reverse()
    date_p = "/".join(t_date)

    # Author
    author = []
    for span in soup_article.find_all('span'):
        if span.get("class") == ["author"]:
            author.append(span.span.get_text())

    # Content
    content = ""
    for div in soup_article.find_all('div'):
        if div.get("class") == ['content', 'left']:
            for p in div.find_all('p'):
                content += p.get_text() + " "

    # Theme
    theme = ""
    for meta in soup_article.find_all('meta'):
        if meta.get("property") == 'article:tag':
            theme = meta.get("content")

    article = utils.recovery_article(title, 'Scienceetvie', author,
                                     date_p, content, theme)
    return article

def recovery_information_fusc(url):
    """
    Arguments:
        url : string
    Return:
        article : dictionary
    Retrieves, for each article, the title, newspaper, author,
    date and theme.
    """
    soup = utils.recovery_flux_url_rss(url)

    # Retrieve the title, dropping the "| site" suffix
    title = soup.title.string
    indice = title.find('|')
    if indice != -1:
        title = title[:indice - 1]

    # Retrieve the author
    author = []
    tag_author = soup.find('h3', attrs={'itemprop': 'author'})
    if tag_author is not None:
        author.append(tag_author.get_text())

    # Retrieve the publication date
    publi_date = ''
    regex_date = re.search(r'[0-9]{2}/[0-9]{2}/[0-9]{4}', soup.time.string)
    if regex_date:
        publi_date = regex_date.group(0)

    # Retrieve the content: paragraphs whose last CSS class contains "py0p5"
    content = ''
    for p in soup.find_all('p'):
        if p.get('class') and re.search('py0p5', p.get('class')[-1]):
            content += p.get_text()

    # The theme is the first path component of the URL
    delimiter = url.split('/')
    theme = delimiter[3]

    article = utils.recovery_article(title, 'FuturaSciences', author,
                                     publi_date, content, theme)
    return article

def recovery_information_lg(url):
    """
    Arguments:
        url : string
    Return:
        article : dictionary
    Retrieves, for each article, the title, newspaper, author,
    date and theme.
    """
    soup = utils.recovery_flux_url_rss(url)

    # Retrieve the title, dropping the trailing "— site" segment
    balise_title = soup.title.string
    sep = balise_title.split("—")
    title = unidecode.unidecode("—".join(sep[:-1]))

    # Retrieve the author and the publication date
    author = []
    date_p = ""
    for span in soup.find_all('span'):
        if span.get("class") == ['context']:
            author.append(span.a.get_text())
            for valeur in re.finditer(r'[0-9]{2}/[0-9]{2}/[0-9]{4}',
                                      str(span)):
                date_p = valeur.group(0)

    # Retrieve the theme
    theme = ""
    for ul in soup.find_all('ul'):
        if ul.get("class") == ['post-categories']:
            for li in ul.find_all('li'):
                theme = li.get_text()

    # Retrieve the content of the article
    contents = ""
    for div in soup.find_all('div'):
        if div.get("class") == ['content']:
            for p in div.find_all('p'):
                contents += p.get_text() + " "

    new_article = utils.recovery_article(title, "Le Gorafi", author,
                                         date_p, contents, theme)
    return new_article

def get_article(url):
    """Takes a URL and returns an article in the format
    {
        "title": str,
        "newspaper": str,
        "author": [str],
        "date_publi": str,
        "content": str,
        "theme": str
    }
    """
    from unidecode import unidecode

    soup = utils.recovery_flux_url_rss(url)
    article = soup.find("article")

    # Title of the article
    title = article.find("h1").get_text()

    # Empty list when there is no author, otherwise the list of authors
    author_tag = article.find("header").find("p", class_="authorsign-label")
    if author_tag is None:
        authors = []
    else:
        authors = unidecode(author_tag.get_text()).split(" et ")

    # Publication date of the article, as DD/MM/YYYY
    date_tab = article.find("time").get("datetime")[:10].split("-")
    date_tab.reverse()
    date_pub = "/".join(date_tab)

    # Theme of the article
    theme = article.find("ol", class_="breadcrumb-list")\
        .find_all("li")[1].find("span").get_text()

    # Content of the article
    content = ""
    for p in article.find("div", class_="content").find_all("p"):
        content += p.get_text()

    # Name of the newspaper, taken from the footer copyright notice
    # (has_copyright is a predicate defined elsewhere in this project)
    newspaper = soup.find("footer").find(has_copyright).find("a").get_text()

    # Strip \n, \r and \t from the content
    regex = re.compile(r'[\n\r\t]')
    content = regex.sub("", content)

    return utils.recovery_article(unidecode(title), unidecode(newspaper),
                                  authors, str(date_pub), unidecode(content),
                                  unidecode(theme))

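# Every scraper hands its fields to utils.recovery_article (or the utilsg4
# variant). The docstring above documents the dictionary it returns; below is
# a minimal sketch, assuming the helper only packs its arguments into that
# dictionary. Its real definition lives in the project's utils module.
def _recovery_article_sketch(title, newspaper, author, date_publi, content,
                             theme):
    """Hypothetical stand-in for utils.recovery_article."""
    return {
        "title": title,
        "newspaper": newspaper,
        "author": author,
        "date_publi": date_publi,
        "content": content,
        "theme": theme,
    }
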
def recovery_information_ld(url):
    soup = utilsg4.recovery_flux_url_rss(url)

    # Retrieve the title
    title = ""
    for meta in soup.find_all('meta'):
        if meta.get("property") == 'og:title':
            title = meta.get("content")

    # Retrieve the publication date (DD/MM/YYYY) from the
    # datePublished <time> tag
    date_p = ""
    for time in soup.find_all('time'):
        if time.get("itemprop") == 'datePublished':
            for valeur in re.finditer(r'[0-9]{2}/[0-9]{2}/[0-9]{4}',
                                      str(time)):
                date_p = valeur.group(0)

    # Retrieve the author
    author = []
    for div in soup.find_all('div'):
        if div.get("class") == ['article_author']:
            author.append(div.span.get_text())

    # Retrieve the content
    content = ""
    for div in soup.find_all('div'):
        if div.get("itemprop") == 'articleBody':
            for p in div.find_all('p'):
                content += p.get_text() + " "

    # Retrieve the theme
    theme = ""
    for h2 in soup.find_all('h2'):
        if h2.get("itemprop") == 'about':
            theme = h2.get_text()

    article = utilsg4.recovery_article(title, 'La Depeche', author,
                                       date_p, content, theme)
    return article

def get_article(url):
    """Takes a URL and returns an article as a dictionary."""
    from unidecode import unidecode

    soup = utils.recovery_flux_url_rss(url)
    article = soup.find("article")

    # The og:title meta is "<title> - <theme> - <newspaper>"
    meta = soup.find("meta", property="og:title").get("content")
    tab = meta.split("-")
    n = len(tab)
    newspaper = tab[n - 1]
    theme = tab[n - 2]
    title = "-".join(tab[:n - 2])

    # Authors, with \n, \r and \t stripped
    authors = []
    regex = re.compile(r'[\n\r\t]')
    for span in article.find_all("span", class_="author--name"):
        author = regex.sub("", unidecode(span.get_text()))
        authors.append(author.strip())

    # Publication date, as YYYY/MM/DD
    date_pub = article.find("span", itemprop="datePublished").get(
        "datetime")[:10].replace("-", "/")

    # Content: intro, body and footnotes blocks
    content = ""
    for div in article.find_all(
            "div", class_=["article--intro", "article--wysiwyg",
                           "article--footnotes"]):
        for p in div.find_all("p"):
            content += p.get_text()
    content = regex.sub("", content)

    return utils.recovery_article(
        unidecode(title), unidecode(newspaper), authors, date_pub,
        unidecode(content), unidecode(theme))

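# The functions above are one scraper per site, so a caller has to route each
# URL to the right one. Below is a hypothetical dispatch table: the domain
# keys are assumptions, not taken from the project, and the two scrapers both
# named get_article are left out because the second definition shadows the
# first when they share a module.
SCRAPERS = {
    "nouvelobs.com": recovery_information_noob,
    "humanite.fr": recovery_information_hum,
    "femina.fr": recovery_information_fem,
    "lemonde.fr": info_articles,
    "science-et-vie.com": recovery_information_sv,
    "futura-sciences.com": recovery_information_fusc,
    "legorafi.fr": recovery_information_lg,
    "ladepeche.fr": recovery_information_ld,
}


def scrape(url):
    """Route a URL to the matching per-site scraper, or None if unknown."""
    for domain, scraper in SCRAPERS.items():
        if domain in url:
            return scraper(url)
    return None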