import re
import time
import datetime as date
from datetime import datetime

import utils_v0 as utils  # shared helpers: recovery_flux_url_rss, recovery_article, create_json, is_empty


def recuperation_info_lmde(file_target="/Users/sofian/Documents/Projet_att/"
                           + str(date.datetime.now().date()) + "/"):
    """Collect article URLs from Le Monde's search engine, then scrape the
    articles and record them as JSON files.

    Arguments:
        file_target {string} -- path where the articles will be recorded
    """
    list_url_articles = []
    # The three searches are identical except for the keywords, the number
    # of result pages and the end date of the search window.
    searches = [
        ('attentat+impact', 16, '18', '02'),
        ('attentat', 600, '31', '01'),
        ('terrorisme', 800, '18', '02'),
    ]
    for keywords, nb_pages, end_day, end_month in searches:
        j = 0
        for i in range(1, nb_pages):
            j = j + 1
            url = ('http://www.lemonde.fr/recherche/?keywords=' + keywords
                   + '&page_num=' + str(i)
                   + '&operator=and&exclude_keywords=&qt=recherche_texte_titre'
                   + '&author=&period=since_1944&start_day=01&start_month=01'
                   + '&start_year=1944&end_day=' + end_day
                   + '&end_month=' + end_month + '&end_year=2018&sort=desc')
            soup = utils.recovery_flux_url_rss(url)
            # Each search result is a <h3> wrapping a relative article link.
            for h3 in soup.find_all('h3'):
                for a in h3.find_all('a'):
                    list_url_articles.append('http://www.lemonde.fr'
                                             + a.get("href"))
            # Pause after every third page so the site does not flag us
            # as a robot.
            if j == 3:
                time.sleep(61)
                j = 0

    list_dictionaries = []
    info_articles(list_dictionaries, list_url_articles)
    utils.create_json(file_target, list_dictionaries, "LeMonde/", "lmde")
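
# For reference, a minimal sketch of what utils.recovery_flux_url_rss is
# assumed to do here: fetch a page and return a parsed BeautifulSoup tree.
# This is a hypothetical stand-in (hence the _sketch suffix); the real
# helper in utils_v0 may add headers, retries or encoding handling.
import requests
from bs4 import BeautifulSoup


def recovery_flux_url_rss_sketch(url):
    """Hypothetical stand-in for utils.recovery_flux_url_rss."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")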

def recovery_new_articles_lpt(file_target="/Users/sofian/Documents/Projet_att/"
                              + str(date.datetime.now().date()) + "/"):
    """Procedure that calls all the other functions and procedures in order
    to collect articles from a newspaper in a file.

    Arguments:
        file_target {string} -- path where the articles will be recorded
    """
    list_url_articles = []
    # Walk pages 1 to 94 of the search results for "attentats".
    for i in range(1, 95):
        url = ('http://www.lepoint.fr/recherche/index.php?query=attentats'
               '&page=' + str(i))
        soup = utils.recovery_flux_url_rss(url)  # HTML of the results page
        # Article URLs sit inside
        # <div class="image-search-wrap"> <figure> </figure> </div> tags.
        for div in soup.find_all('div'):
            if re.search('image-search-wrap', str(div.get("class"))):
                for fig in div.find_all('figure'):
                    url = "http://www.lepoint.fr" + fig.a.get("href")
                    list_url_articles.append(url)
        # Sleep 61 seconds between pages so the site does not detect that
        # we are a robot.
        time.sleep(61)

    list_dictionaries = []
    # collect_articles fills list_dictionaries with one dictionary
    # per article.
    collect_articles(list_dictionaries, list_url_articles)
    # Write one JSON file per article.
    utils.create_json(file_target, list_dictionaries, "LePoint/", "lpt")
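
# A sketch of what utils.create_json is assumed to do, inferred only from
# how it is called above: write each article dictionary as its own JSON file
# under file_target + sub_folder. Hypothetical; the real helper may name or
# group the files differently.
import json
import os


def create_json_sketch(file_target, list_dictionaries, sub_folder, prefix):
    folder = os.path.join(file_target, sub_folder)
    os.makedirs(folder, exist_ok=True)
    for idx, article in enumerate(list_dictionaries):
        path = os.path.join(folder, prefix + "_" + str(idx) + ".json")
        with open(path, "w", encoding="utf-8") as f:
            json.dump(article, f, ensure_ascii=False)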

def collect_articles(list_dictionaries, list_url_articles):
    """Add the articles (dictionaries) from a list of URLs to a list of
    dictionaries.

    Arguments:
        list_dictionaries {list} -- list of dictionaries
        list_url_articles {list} -- list of URLs
    """
    # For each article URL, extract the title, publication date, authors,
    # theme and body of the article.
    for url_article in list_url_articles:
        soup = utils.recovery_flux_url_rss(url_article)  # article HTML

        # The page title is "<article title> - Le Point"; keep the left part.
        balise_title = soup.title.string
        title = balise_title.split(" - Le Point")[0]

        # Authors appear in <span> tags nested in <div class="mbs">.
        list_authors = []
        for div in soup.find_all('div'):
            if div.get('class') == ['mbs']:
                for span in div.find_all('span'):
                    name = span.get_text()
                    name = re.sub('Par', '', name)
                    name = re.sub('\n', '', name)
                    list_authors.append(name)

        # The publication date appears in <time> tags as dd/mm/yyyy; keep
        # the earliest one (original publication rather than last update).
        dates = []
        for balise_time in soup.find_all('time'):
            for valeur in re.finditer(r'[0-9]{2}/[0-9]{2}/[0-9]{4}',
                                      str(balise_time)):
                dates.append(date.datetime.strptime(valeur.group(0),
                                                    '%d/%m/%Y'))
        if not dates:
            continue  # guard added: skip articles without a parsable date
        date_publication = str(min(dates).date())

        # The theme is the first path segment of the article URL, e.g.
        # "http://www.lepoint.fr/sport/..." gives the theme "sport".
        theme = re.search("www.lepoint.fr/(.*)/", url_article)[1]

        # The body is split between <h2 class="art-chapeau"> (the lede) and
        # the paragraphs of <div class="art-text">.
        content = ''
        for h2 in soup.find_all('h2'):
            if h2.get('class') == ['art-chapeau']:
                content += h2.get_text() + " "
        for div in soup.find_all('div'):
            if div.get('class') == ['art-text']:
                for p in div.find_all('p'):
                    content += p.get_text() + " "

        new_article = utils.recovery_article(title, 'LePoint', list_authors,
                                             date_publication, content,
                                             theme)
        if not utils.is_empty(new_article):
            list_dictionaries.append(new_article)

def recovery_new_articles_lpt(file_target="/Users/sofian/Documents/Projet_att/"
                              + str(date.datetime.now().date()) + "/"):
    """Procedure that calls all the other functions and procedures in order
    to collect articles from a newspaper in a file.

    Arguments:
        file_target {string} -- path where the articles will be recorded
    """
    list_url_articles = []
    # One identical crawl per search term, pages 1 to 89 each.
    queries = ['attentats', 'attentat', 'terrorisme', 'terroriste',
               'terroristes']
    for query in queries:
        j = 0
        for i in range(1, 90):
            j = j + 1
            url = ('http://www.lepoint.fr/recherche/index.php?query=' + query
                   + '&sort=pertinence&page=' + str(i))
            soup = utils.recovery_flux_url_rss(url)
            for div in soup.find_all('div'):
                if re.search('image-search-wrap', str(div.get("class"))):
                    for fig in div.find_all('figure'):
                        url = "http://www.lepoint.fr" + fig.a.get("href")
                        list_url_articles.append(url)
            # Pause after every third page so the site does not flag us as a
            # robot. (The original "attentat" pass slept only 5 seconds,
            # which looks like a typo; 61 seconds is used for every pass.)
            if j == 3:
                time.sleep(61)
                j = 0

    list_dictionaries = []
    collect_articles(list_dictionaries, list_url_articles)
    utils.create_json(file_target, list_dictionaries, "LePoint/", "lpt")

def collect_articles(list_dictionaries, list_url_articles):
    """Add the articles (dictionaries) from a list of URLs to a list of
    dictionaries.

    Arguments:
        list_dictionaries {list} -- list of dictionaries
        list_url_articles {list} -- list of URLs
    """
    j = 0
    titre = []  # titles already collected, used for deduplication
    for url_article in list_url_articles:
        # Each article gets its own try so one broken page does not abort
        # the whole crawl (the original try wrapped the entire loop).
        try:
            j = j + 1  # the original never incremented j, so it never slept
            soup = utils.recovery_flux_url_rss(url_article)  # article HTML

            # The page title is "<article title> - Le Point".
            balise_title = soup.title.string
            title = balise_title.split(" - Le Point")[0]

            # Authors appear in <span> tags nested in <div class="mbs">.
            list_authors = []
            for div in soup.find_all('div'):
                if div.get('class') == ['mbs']:
                    for span in div.find_all('span'):
                        name = span.get_text()
                        name = re.sub('Par', '', name)
                        name = re.sub('\n', '', name)
                        list_authors.append(name)

            # The publication date appears in <time> tags as dd/mm/yyyy;
            # keep the earliest occurrence.
            dates = []
            for balise_time in soup.find_all('time'):
                for valeur in re.finditer(r'[0-9]{2}/[0-9]{2}/[0-9]{4}',
                                          str(balise_time)):
                    dates.append(date.datetime.strptime(valeur.group(0),
                                                        '%d/%m/%Y'))
            date_publication = str(min(dates).date())

            # The theme is the first path segment of the article URL.
            theme = re.search("www.lepoint.fr/(.*)/", url_article)[1]

            # Body: lede in <h2 class="art-chapeau">, paragraphs in
            # <div class="art-text">.
            content = ''
            for h2 in soup.find_all('h2'):
                if h2.get('class') == ['art-chapeau']:
                    content += h2.get_text() + " "
            for div in soup.find_all('div'):
                if div.get('class') == ['art-text']:
                    for p in div.find_all('p'):
                        content += p.get_text() + " "

            new_article = utils.recovery_article(title, 'LePoint',
                                                 list_authors,
                                                 date_publication, content,
                                                 theme)
            # Pause after every fifth article to avoid robot detection.
            if j == 5:
                time.sleep(61)
                j = 0
            # Keep the article only if it has real content and its title
            # has not been seen before.
            if len(content) > 10 and title not in titre:
                titre.append(title)
                list_dictionaries.append(new_article)
        except Exception as e:
            print("Problem with", url_article, ":", e)
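
# For context, utils.recovery_article presumably packs the scraped fields
# into one dictionary per article, roughly as sketched below (hypothetical;
# the real helper in utils_v0 may normalize or rename fields).
def recovery_article_sketch(title, newspaper, authors, date_publication,
                            content, theme):
    return {
        "title": title,
        "newspaper": newspaper,
        "authors": authors,
        "date_publication": date_publication,
        "content": content,
        "theme": theme,
    }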

import utils_v0 as utils
import re
from datetime import datetime
import datetime as date

file_target = ("/Users/sofian/Documents/Projet_att/"
               + str(date.datetime.now().date()) + "/")

article_noob = []
# Pages 1 to 29 of the Nouvel Obs search for "attentat" (the long "c="
# parameter is an opaque search token copied from the site).
for i in range(1, 30):
    url_rss_noob = "http://recherche.nouvelobs.com/?p=" + str(i) + "&q=attentat&c=bnJlc3VsdHMlM0QxMCUyNnN0YXJ0JTNEMjgwJTI2bG9naWMlM0RzbHJlZm9udGUtZ2xvYmFsZSUyNnElM0RhdHRlbnRhdCUyQiUyNTI4Tk9UJTJCY29ycG9yYXRlJTI1MkZ0cmVlJTI1M0FUb3AlMjUyRnR5cGUlMjUyRmRlcGVjaGVzJTJCQU5EJTJCTk9UJTJCY29ycG9yYXRlJTI1MkZ0cmVlJTI1M0FUb3AlMjUyRnR5cGUlMjUyRnJlZGlyZWN0aW9uJTI1MjklMkJBTkQlMkIlMjUyOGNvcnBvcmF0ZSUyNTJGdHJlZSUyNTNBVG9wJTI1MkZteXNvdXJjZSUyNTJGbm91dmVsb2JzLmNvbSUyQk9SJTJCY29ycG9yYXRlJTI1MkZ0cmVlJTI1M0FUb3AlMjUyRm15c291cmNlJTI1MkZsZXBsdXMlMkJPUiUyQmNvcnBvcmF0ZSUyNTJGdHJlZSUyNTNBVG9wJTI1MkZteXNvdXJjZSUyNTJGb2JzZXNzaW9uJTJCT1IlMkJjb3Jwb3JhdGUlMjUyRnRyZWUlMjUzQVRvcCUyNTJGbXlzb3VyY2UlMjUyRnRlbGVvYnMuY29tJTJCT1IlMkJjb3Jwb3JhdGUlMjUyRnRyZWUlMjUzQVRvcCUyNTJGbXlzb3VyY2UlMjUyRmJpYmxpb2JzJTI1Mjk%3D"
    soup_url = utils.recovery_flux_url_rss(url_rss_noob)
    # Result links are <h2 class="title"><a href="...">; keep only links
    # pointing to www.nouvelobs.com itself.
    for h2 in soup_url.find_all('h2'):
        if h2.get("class") == ['title']:
            if re.search('www.nouvelobs.com', str(h2.a.get("href"))):
                article_noob.append(h2.a.get("href"))

# Same crawl for the search "terrorisme".
for i in range(1, 30):
    url_rss_noob = "http://recherche.nouvelobs.com/?p=" + str(i) + "&q=terrorisme&c=bnJlc3VsdHMlM0QxMCUyNnN0YXJ0JTNEMjkwJTI2bG9naWMlM0RzbHJlZm9udGUtZ2xvYmFsZSUyNnElM0R0ZXJyb3Jpc21lJTJCJTI1MjhOT1QlMkJjb3Jwb3JhdGUlMjUyRnRyZWUlMjUzQVRvcCUyNTJGdHlwZSUyNTJGZGVwZWNoZXMlMkJBTkQlMkJOT1QlMkJjb3Jwb3JhdGUlMjUyRnRyZWUlMjUzQVRvcCUyNTJGdHlwZSUyNTJGcmVkaXJlY3Rpb24lMjUyOSUyQkFORCUyQiUyNTI4Y29ycG9yYXRlJTI1MkZ0cmVlJTI1M0FUb3AlMjUyRm15c291cmNlJTI1MkZub3V2ZWxvYnMuY29tJTJCT1IlMkJjb3Jwb3JhdGUlMjUyRnRyZWUlMjUzQVRvcCUyNTJGbXlzb3VyY2UlMjUyRmxlcGx1cyUyQk9SJTJCY29ycG9yYXRlJTI1MkZ0cmVlJTI1M0FUb3AlMjUyRm15c291cmNlJTI1MkZvYnNlc3Npb24lMkJPUiUyQmNvcnBvcmF0ZSUyNTJGdHJlZSUyNTNBVG9wJTI1MkZteXNvdXJjZSUyNTJGdGVsZW9icy5jb20lMkJPUiUyQmNvcnBvcmF0ZSUyNTJGdHJlZSUyNTNBVG9wJTI1MkZteXNvdXJjZSUyNTJGYmlibGlvYnMlMjUyOQ%3D%3D"
    soup_url = utils.recovery_flux_url_rss(url_rss_noob)
    for h2 in soup_url.find_all('h2'):
        if h2.get("class") == ['title']:
            if re.search('www.nouvelobs.com', str(h2.a.get("href"))):
                article_noob.append(h2.a.get("href"))

# Analyse each article.
titre = []
for url_article in article_noob:
    try:
        soup_article = utils.recovery_flux_url_rss(url_article)

def info_articles(list_dictionaries, list_url_articles):
    """Scrape each Le Monde article URL and append one dictionary per
    article to list_dictionaries."""
    j = 0
    titre = []  # titles already collected, used for deduplication
    for url_article in list_url_articles:
        # Each article gets its own try so one broken page does not abort
        # the whole crawl (the original try wrapped the entire loop).
        try:
            j = j + 1  # the original never incremented j, so it never slept
            soup = utils.recovery_flux_url_rss(url_article)

            title = soup.find('title').string
            title = title.lower()
            newspaper = "Le Monde"

            # Article theme: the breadcrumb <li class="ariane..."> link.
            theme = ""
            for li in soup.find_all('li'):
                for val in re.finditer('ariane', str(li.get("class"))):
                    theme = li.a.get_text()

            # Author of the article, in <span class="auteur"> (possibly
            # wrapped in a link).
            if soup.find("span", class_="auteur"):
                if soup.find("span", class_="auteur").a:
                    author = soup.find("span",
                                       class_="auteur").find("a").get_text()
                else:
                    author = soup.find("span", class_="auteur").get_text()
                author = re.sub(r"\s\s+", " ", author)
                author = re.sub(r"^ ", "", author)
            else:
                author = ""

            # Publication date, from <time itemprop="datePublished">.
            date_p = ""
            for tim in soup.find_all('time'):
                if tim.get("itemprop") == 'datePublished':
                    date_t = tim.get('datetime')
                    date_p = date_t[0:10]
                    date_p = datetime.strptime(
                        date_p, "%Y-%m-%d").strftime("%d/%m/%Y")

            # Article content: the <div id="articleBody">, with the "read
            # also" teaser paragraphs (<p class="lire">) blanked out before
            # extracting the text.
            content = ""
            for div in soup.find_all('div'):
                if div.get("id") == 'articleBody':
                    for p in div.find_all('p'):
                        if p.get("class") == ['lire']:
                            p.string = ""
                    content += div.get_text() + " "

            new_article = utils.recovery_article(title, newspaper, author,
                                                 date_p, content, theme)
            # Pause after every third article to avoid robot detection.
            if j == 3:
                time.sleep(61)
                j = 0
            # Keep the article only if it has real content and its title
            # has not been seen before.
            if len(content) > 10 and title not in titre:
                titre.append(title)
                list_dictionaries.append(new_article)
        except Exception as e:
            print("Problem with", url_article, ":", e)
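
# Example entry point (a sketch, assuming the collectors above live in one
# module): run the Le Monde and Le Point crawls with their default
# date-stamped target directory. Both write their output via
# utils.create_json.
if __name__ == "__main__":
    recuperation_info_lmde()
    recovery_new_articles_lpt()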