def recup_articles():
    """
    Collects the list of article URLs on lemonde.fr matching the searched
    keywords (cyclone / ouragan).
    Returns:
        list_url_articles: list of URLs (strings)
    """
    list_url_articles = []
    # Pause on every 10th results page to avoid hammering the server
    list_page = list(range(0, 230, 10))
    for i in range(1, 230):
        if i in list_page:
            time.sleep(15)
            urli = ('http://www.lemonde.fr/recherche/?keywords=cyclones+'
                    'cyclone&page_num=' + str(i) + '&operator=or&exclude_'
                    'keywords=&qt=recherche_texte_titre&author=&period='
                    'custom_date&start_day=01&start_month=01&start_year='
                    '2000&end_day=22&end_month=02&end_year=2018&sort=asc')
            soup = utils.recovery_flux_url_rss(urli)
            for h3 in soup.find_all(
                    'div',
                    attrs={'class': 'grid_11 conteneur_fleuve alpha omega'}):
                for a in h3.find_all('a'):
                    if 'http://' in a.get('href'):
                        list_url_articles.append(a.get('href'))
                    else:
                        list_url_articles.append('http://www.lemonde.fr' +
                                                 a.get('href'))
        else:
            urli = ('http://www.lemonde.fr/recherche/?keywords=ouragan&page'
                    '_num=' + str(i) + '&operator=and&exclude_keywords=&qt='
                    'recherche_texte_titre&author=&period=custom_date&start_'
                    'day=01&start_month=01&start_year=2000&end_day=22&'
                    'end_month=02&end_year=2018&sort=asc')
            soup = utils.recovery_flux_url_rss(urli)
            for h3 in soup.find_all(
                    'div',
                    attrs={'class': 'grid_11 conteneur_fleuve alpha omega'}):
                for a in h3.find_all('a'):
                    if 'http://' in a.get('href'):
                        list_url_articles.append(a.get('href'))
                    else:
                        list_url_articles.append('http://www.lemonde.fr' +
                                                 a.get('href'))
    return list_url_articles
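# A minimal usage sketch (assumption: recup_articles above is meant to be
# chained with info_articles, defined at the end of this file, to crawl and
# parse Le Monde articles). demo_lemonde_crawl is an illustrative name only.
def demo_lemonde_crawl():
    lemonde_urls = recup_articles()
    # info_articles is resolved at call time, so its later definition is fine
    return [info_articles(u) for u in lemonde_urls]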
def collect_url_themes():
    """
    Collects the list of article URLs on lefigaro.fr matching the searched
    keyword.
    Returns:
        list_url_articles: list of URLs (strings)
    """
    list_url_articles = []
    liste_url = [
        'http://recherche.lefigaro.fr/recherche/cyclone/?datemin'
        '=01-01-2008&datemax=02-02-2018&page='
    ]
    for url in liste_url:
        for i in range(1, 5):
            urli = url + str(i)
            soup = utils.recovery_flux_url_rss(urli)
            for h3 in soup.find_all('h2',
                                    attrs={'class': 'fig-profil-headline'}):
                for a in h3.find_all('a'):
                    if 'http://' in a.get('href'):
                        list_url_articles.append(a.get('href'))
                    else:
                        list_url_articles.append('http://www.lefigaro.fr' +
                                                 a.get('href'))
    return list_url_articles
def collect_url_articles():
    """
    Collects the list of search-result URLs on liberation.fr matching the
    searched keyword. The article content is not in these HTML pages:
    collect_url_bis must be called afterwards to reach it.
    Returns:
        list_url_articles: list of URLs (strings)
    """
    liste_url = [
        'http://www.liberation.fr/recherche/?sort=-publication_date'
        '_time&q=cyclone&period=custom&period_start_day=1&period_'
        'start_month=1&period_start_year=2000&period_end_day=29&'
        'period_end_month=1&period_end_year=2018&editorial_source'
        '=&paper_channel=460&page='
    ]
    list_url_articles = []
    for url in liste_url:
        for i in range(1, 18):
            time.sleep(15)
            urli = url + str(i)
            soup = utils.recovery_flux_url_rss(urli)
            for h3 in soup.find_all('h3', attrs={'class': 'live-title'}):
                for a in h3.find_all('a'):
                    if 'http://' in a.get('href'):
                        list_url_articles.append(a.get('href'))
                    else:
                        list_url_articles.append('http://www.liberation.fr' +
                                                 a.get('href'))
    return list_url_articles
def collect_url_articles():
    """
    Returns:
        - list of article URLs on lepoint.fr for the chosen category
    list_category = ["inondation", "typhon", "ouragan", "cyclone", "seisme",
                     "tremblement+de+terre"]
    """
    list_url_articles = []
    cat = "seisme"
    url = ("http://www.lepoint.fr/recherche/index.php?query=" + cat +
           "&date_from=01%2F01%2F2000&date_to=31%2F01%2F2018&type=ARTICLE")
    soup_url = utils.recovery_flux_url_rss(url)
    # Number of results pages, read from the pagination block
    derniere_page = 2  # fallback in case the pagination block is missing
    for ol in soup_url.find_all('ol'):
        if ol.get("class") == ['pagination', 'bottom']:
            for li in ol.find_all('li'):
                for a in li.find_all('a'):
                    if not a.get("class"):
                        derniere_page = int(li.get_text())
    # First results page: skip links pointing to journalist profile pages
    for article in soup_url.find_all('article'):
        for div in article.find_all('div'):
            if div.get("class") == ['col', 'pls']:
                for a in div.find_all('a'):
                    debut_url = "http://www.lepoint.fr"
                    if not re.search('journalistes', str(a.get("href"))):
                        list_url_articles.append(debut_url + a.get("href"))
    print(len(list_url_articles))
    # Remaining results pages
    for i in range(2, derniere_page):
        time.sleep(61)
        url = ("http://www.lepoint.fr/recherche/index.php?query=" + cat +
               "&date_from=01%2F01%2F2000&date_to=06%2F02%2F2018&type="
               "ARTICLE&page=" + str(i))
        soup_url = utils.recovery_flux_url_rss(url)
        for article in soup_url.find_all('article'):
            for div in article.find_all('div'):
                if div.get("class") == ['col', 'pls']:
                    for a in div.find_all('a'):
                        debut_url = "http://www.lepoint.fr"
                        new_url = debut_url + a.get("href")
                        list_url_articles.append(new_url)
                        if re.search("journalistes", str(a.get("href"))):
                            list_url_articles.remove(new_url)
                        if re.search("frhttp", str(a.get("href"))):
                            list_url_articles.remove(new_url)
    return list_url_articles
def recovery_link_new_articles_noob_crawler():
    """
    Returns:
        - list of the URLs of every article in every category
    """
    list_category = [
        "inondation", "inondations", "typhon", "typhons", "ouragan",
        "ouragans", "cyclone", "cyclones", "seisme", "seismes",
        "tremblement+de+terre", "tremblements+de+terre"
    ]
    # Sections of the site that are not proper articles
    excluded = [r"/galeries-photos/", r"/cinema/", r"/video/", r"/magazine/",
                r"/qui-a-dit/"]
    article_noob = []
    for cat in list_category:
        url = "https://recherche.nouvelobs.com/?referer=nouvelobs&q=" + cat
        soup = utils.recovery_flux_url_rss(url)
        for h2 in soup.find_all('h2'):
            if h2.get("class") == ['title']:
                for a in h2.find_all('a'):
                    href = str(a.get("href"))
                    if not any(re.search(pat, href) for pat in excluded):
                        if re.search('www', href):
                            article_noob.append(a.get("href"))
        # Results pages 2 to 20 for the same keyword
        for i in range(2, 21):
            url_noob = ("http://recherche.nouvelobs.com/?p=" + str(i) +
                        "&q=" + cat)
            soup_url = utils.recovery_flux_url_rss(url_noob)
            for h2 in soup_url.find_all('h2'):
                if h2.get("class") == ['title']:
                    for a in h2.find_all('a'):
                        href = str(a.get("href"))
                        if not any(re.search(pat, href) for pat in excluded):
                            if re.search('www', href):
                                article_noob.append(a.get("href"))
    return article_noob
def collect_articles():
    """
    Returns:
        - list containing the information of every collected article
    """
    list_url_articles = collect_url_articles()
    list_new_articles = []
    i = 0
    for url_article in list_url_articles:
        i += 1
        if i % 10 == 0:
            time.sleep(61)
        soup = utils.recovery_flux_url_rss(url_article)
        # Article title
        for div in soup.find_all('div'):
            if div.get("class") == ["page-title"]:
                title = div.get_text()
        # Article author(s)
        list_authors = []
        for span in soup.find_all('span'):
            if span.get("rel") == ["author"]:
                list_authors.append(span.get_text())
        # Publication date
        date_publi = ""
        for div in soup.find_all('div'):
            if div.get("class") == ['reset-text', 'art-date-infos', 'mts',
                                    'list-view']:
                for balise_time in div.find_all('time'):
                    date = balise_time.get("datetime")
                    date_publi = date[0:10]
        # Article content (lead paragraph + body)
        content = ""
        for h2 in soup.find_all('h2'):
            if h2.get('class') == ['art-chapeau']:
                content += h2.get_text() + " "
        for div in soup.find_all('div'):
            if div.get('class') == ['art-text']:
                for p in div.find_all('p'):
                    content += p.get_text() + " "
        new_article = utils.recovery_article(title, 'LePoint', list_authors,
                                             date_publi, content, 'seisme')
        if not utils.is_empty(new_article):
            list_new_articles.append(new_article)
    return list_new_articles
def collect_url_bis(list_url_articles):
    """
    For each search-result URL, retrieves the URL of the corresponding AMP
    page, which holds the article content.
    Returns:
        url_final: list of URLs (strings)
    """
    url_final = []
    for url in list_url_articles:
        soup = utils.recovery_flux_url_rss(url)
        for link in soup.find_all('link'):
            href = link.get('href')
            # Keep the <link> that points to the AMP version of this article:
            # both the article path (url[25:]) and "amphtml" must appear
            if href and url[25:] in href and "amphtml" in href:
                url_final.append(href)
    return url_final
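# A minimal usage sketch of the Liberation pipeline. Assumption: the
# Liberation collect_url_articles (and not the Le Point function of the same
# name) is the one in scope, e.g. because the two live in separate modules.
# demo_liberation_crawl is an illustrative name only.
def demo_liberation_crawl():
    search_result_urls = collect_url_articles()    # search-result pages
    return collect_url_bis(search_result_urls)     # AMP article pages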
def recovery_information_noob(url_article):
    """
    Arguments:
        - URL of the article whose useful information will be retrieved
    Returns:
        - article information in JSON format
    """
    soup_article = utils.recovery_flux_url_rss(url_article)
    title = soup_article.title.get_text()
    # Publication date, read from the link inside the <time class="date"> tag
    find_date = soup_article.find('time', attrs={"class": "date"})
    for a in find_date.find_all('a'):
        find_valeur = re.compile(r'[0-9]{4}/[0-9]{2}/[0-9]{2}')
        for valeur in find_valeur.finditer(str(a.get("href"))):
            date_p = valeur.group(0)
            date_p = datetime.strptime(date_p,
                                       "%Y/%m/%d").strftime("%Y-%m-%d")
    # Article author(s)
    author = []
    for div in soup_article.find_all('div'):
        if re.search('author', str(div.get("class"))):
            author.append(div.p.span.get_text())
    # Article theme, taken from the breadcrumb
    theme = ""
    for nav in soup_article.find_all('nav'):
        if nav.get("class") == ['breadcrumb']:
            for ol in nav.find_all('ol'):
                for a in ol.find_all('a'):
                    theme = a.get_text()
    # Article content (asides, "lire" links and images are discarded)
    content = ""
    for div in soup_article.find_all('div'):
        if re.search('body', str(div.get("id"))):
            for aside in div.find_all('aside'):
                for p in aside.find_all('p'):
                    p.string = ""
            for p in div.find_all('p'):
                for a in p.find_all('a'):
                    if a.get("class") == ['lire']:
                        a.string = ""
                for img in p.find_all('img'):
                    p.string = ""
                content += p.get_text() + " "
    article = utils.recovery_article(title, 'NouvelObservateur', author,
                                     date_p, content, theme)
    return article
def info_articles(article_link):
    """
    Retrieves the different elements contained in an article: title, date,
    author and content.
    Arguments:
        article_link: URL of the article
    Returns:
        an article with its different elements
    """
    soup = utils.recovery_flux_url_rss(article_link)
    title = soup.title.string
    newspaper = "Le Monde"
    # Article theme
    if soup.find("li", class_="ariane z2"):
        theme = soup.find("li", class_="ariane z2").find("a").get_text()
    else:
        theme = 'Forum'
    # Author of the article
    if soup.find("span", class_="auteur"):
        if soup.find("span", class_="auteur").a:
            author = soup.find("span", class_="auteur").find("a").get_text()
        else:
            author = soup.find("span", class_="auteur").get_text()
        author = re.sub(r"\s\s+", " ", author)
        author = re.sub(r"^ ", "", author)
    else:
        author = ""
    # Publication date (falls back to today's date if none is found)
    da = re.search(r"\d{4}-\d{2}-\d{2}", soup.find("time").get("datetime"))
    if da:
        date_p = date.datetime.strptime(da[0],
                                        "%Y-%m-%d").strftime("%d/%m/%Y")
    else:
        date_p = str(date.datetime.now().strftime("%d/%m/%Y"))
    # Article content
    content = ""
    for div in soup.find_all(
            'div', attrs={'class': 'contenu_article js_article_body'}):
        for p in div.find_all('p'):
            content += p.get_text() + " "
    # content = unidecode.unidecode(re.sub(r"\s\s+", " ", content))
    new_article = utils.recovery_article(title, newspaper, [author], date_p,
                                         content, theme)
    return new_article
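# A minimal usage sketch of the Nouvel Observateur pipeline, assuming the two
# functions above and the `time` import are available. The periodic sleep
# mirrors the throttling convention used in collect_articles().
# demo_nouvelobs_crawl is an illustrative name only.
def demo_nouvelobs_crawl():
    urls = recovery_link_new_articles_noob_crawler()
    articles = []
    for k, url in enumerate(urls, start=1):
        if k % 10 == 0:
            time.sleep(61)
        articles.append(recovery_information_noob(url))
    return articles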