def recovery_new_articles_noob_crawler(file_target="data/clean/robot/" +
                                       str(date.datetime.now().date()) + "/"):
    """
    Returns:
        - creation of a JSON file for each new article
    """
    file_json = []
    i = 0
    article_noob = recovery_link_new_articles_noob_crawler()
    # Each article is analyzed one by one
    for article in article_noob:
        new_article = recovery_information_noob(article)
        if utils.is_empty(new_article) is False:
            file_json.append(new_article)
            i += 1
        if i == 20:
            utils.create_json(file_target, file_json, "NouvelObs_crawler/",
                              "noob")
            i = 0
            file_json = []
    utils.create_json(file_target, file_json, "NouvelObs_crawler/", "noob")
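# The append-and-flush-every-20-articles pattern above recurs in most of the
# crawlers below. A minimal sketch of a shared helper that could factor it out
# (hypothetical; not part of utils, it only reuses utils.create_json exactly as
# the crawlers call it):
def flush_in_batches(articles, file_target, folder, prefix, batch_size=20):
    batch = []
    for article in articles:
        if not utils.is_empty(article):
            batch.append(article)
        if len(batch) == batch_size:
            utils.create_json(file_target, batch, folder, prefix)
            batch = []
    # Write the remaining articles, mirroring the trailing create_json call
    utils.create_json(file_target, batch, folder, prefix)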
def recovery_new_articles_equipe(file_target="data/clean/robot/" +
                                 str(date.datetime.now().date()) + "/"):
    """
    Returns:
        - creation of a JSON file for each new article
    """
    file_json = []
    i = 0
    list_url = recovery_link_new_articles_equipe("https://www.lequipe.fr/rss/")
    for url in list_url:
        soup_url = utils.recovery_flux_url_rss(url)
        items = soup_url.find_all("item")
        article_equipe = []
        # We're picking up every new article in a list
        for item in items:
            article_equipe.append(re.search(r"<link/>(.*)", str(item))[1])
        # Each article is analyzed one by one
        for article in article_equipe:
            new_article = recovery_information_equipe(article)
            if utils.is_empty(new_article) is False:
                # Reuse new_article instead of scraping the page a second time
                file_json.append(new_article)
                i += 1
            if i == 20:
                utils.create_json(file_target, file_json, "Equipe_rss/",
                                  "equi")
                i = 0
                file_json = []
    utils.create_json(file_target, file_json, "Equipe_rss/", "equi")
def recuperation_info_libe(file_target="data/clean/robot/" +
                           str(date.datetime.now().date()) + "/"):
    """
    Main function that gets all article URLs, extracts the information
    and creates a JSON file
    """
    source = "liberation/"
    link_rss = get_rss_infos()
    list_articles = []
    i = 0
    for lr in link_rss:
        i += 1
        if "www.liberation.fr" in lr:
            informations = get_information(lr)
        else:
            informations = None
        if informations:
            # informations already holds the scraped article
            new_article = informations
            if utils.is_empty(new_article) is False:
                list_articles.append(new_article)
        if i > 49:
            i = 0
            utils.create_json(file_target, list_articles, source, "libe")
            list_articles = []
    utils.create_json(file_target, list_articles, source, "libe")
def recovery_old_articles_LD(
        file_target='/var/www/html/projet2018/data/clean/robot/' +
        str(datetime.datetime.now().date())):
    """
    Create a JSON file for each article
    """
    list_category = [
        'grand-sud', 'actu', 'faits-divers', 'economie', 'sports', 'sante',
        'tv-people', 'sorties'
    ]
    links_article = []
    list_articles = []
    for cat in list_category:
        for i in range(1, 100):
            try:
                url = 'https://www.ladepeche.fr/recherche/?p=' + str(i)\
                    + '&c=' + cat + '&plus-infos=1'
                soup = utils.recovery_flux_url_rss(url)
            except Exception:
                # Stop paginating this category as soon as a page fails
                break
            for h2 in soup.find_all('h2'):
                for item in h2.find_all('a'):
                    link = 'https://www.ladepeche.fr' + str(item.get('href'))
                    links_article.append(link)
    for link in links_article:
        new_article = recovery_article_ld(link)
        if not utils.is_empty(new_article):
            list_articles.append(new_article)
    utils.create_json(file_target, list_articles, "Ladepeche", "LD")
def collect_articles(list_dictionaries, list_url_articles, theme):
    """Add the articles (dictionaries) from a list of URL in a list of
    dictionaries

    Arguments:
        list_dictionaries {list} -- list of dictionaries
        list_url_articles {list} -- list of URL
        theme {string} -- theme related to the list of dictionaries
    """
    for url_article in list_url_articles:
        try:
            req = requests.get(url_article)
            data = req.text
            soup = BeautifulSoup(data, "lxml")

            # Title: strip the " - Le Point" suffix from the <title> tag
            balise_title = soup.title.string
            sep = balise_title.split(" - Le Point")
            title = sep[0]

            # Authors
            list_authors = []
            for div in soup.find_all('div'):
                if div.get('class') == ['mbs']:
                    for span in div.find_all('span'):
                        name = span.get_text()
                        name = re.sub('Par', '', name)
                        name = re.sub(r"\s\s+", "", name)
                        list_authors.append(name)

            # Publication date: keep the earliest dd/mm/yyyy found in <time>
            dates = []
            for balise_time in soup.find_all('time'):
                for valeur in re.finditer(r'[0-9]{2}\/[0-9]{2}\/[0-9]{4}',
                                          str(balise_time)):
                    dates.append(
                        date.datetime.strptime(valeur.group(0), '%d/%m/%Y'))
            date_publication = date.datetime.strftime(min(dates), '%d/%m/%Y')
            date_publication = str(
                date.datetime.strptime(date_publication, "%d/%m/%Y").date())

            # Content: standfirst ("chapeau") followed by the article body
            content = ''
            for h2 in soup.find_all('h2'):
                if h2.get('class') == ['art-chapeau']:
                    content += h2.get_text() + " "
            for div in soup.find_all('div'):
                if div.get('class') == ['art-text']:
                    for p in div.find_all('p'):
                        content += p.get_text() + " "

            new_article = utils.recovery_article(title, 'LePoint',
                                                 list_authors,
                                                 date_publication, content,
                                                 theme)
            if not utils.is_empty(new_article):
                list_dictionaries.append(new_article)
        except Exception:
            print("Error while saving the article")
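# Usage sketch for the Le Point collector above. The URL is a placeholder, and
# wrapping the call in a __main__ guard is an assumption, not part of the
# original module:
if __name__ == '__main__':
    collected = []
    collect_articles(collected,
                     ["https://www.lepoint.fr/exemple-article.php"],
                     "politique")
    print(len(collected), "article(s) collected")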
def collect_articles(list_dictionaries, list_url_articles, theme):
    """Add the articles (dictionaries) from a list of URL in a list of
    dictionaries (Le Figaro)
    """
    for url_article in list_url_articles:
        try:
            req = requests.get(url_article)
            data = req.text
            soup = BeautifulSoup(data, 'lxml')

            title = soup.title.string

            # Authors: <a> tags first, then <span> tags as a fallback
            list_authors = []
            for a in soup.find_all('a'):
                if a.get("class") == ['fig-content-metas__author']:
                    name = re.sub(r"\s\s+", "", a.get_text())
                    name = re.sub("\n", "", name)
                    list_authors.append(name)
            if len(list_authors) == 0:
                for span in soup.find_all('span'):
                    if span.get("class") == ['fig-content-metas__author']:
                        name = re.sub(r"\s\s+", "", span.get_text())
                        name = re.sub("\n", "", name)
                        list_authors.append(name)

            # Publication date: last dd/mm/yyyy found in a <time> tag
            date_publication = ""
            for marker_time in soup.find_all('time'):
                for valeur in re.finditer(r'[0-9]{2}\/[0-9]{2}\/[0-9]{4}',
                                          str(marker_time)):
                    date_publication = valeur.group(0)
            date_publication = str(
                date.datetime.strptime(date_publication, "%d/%m/%Y").date())

            # Content: standfirst ("chapo") followed by the article body
            content = ""
            for p in soup.find_all('p'):
                if p.get("class") == ['fig-content__chapo']:
                    content = p.get_text() + " "
            for div in soup.find_all('div'):
                if div.get("class") == ['fig-content__body']:
                    for p in div.find_all('p'):
                        content += p.get_text() + " "

            new_article = utils.recovery_article(title, 'LeFigaro',
                                                 list_authors,
                                                 date_publication, content,
                                                 theme)
            if not utils.is_empty(new_article):
                list_dictionaries.append(new_article)
        except Exception:
            print("Error while saving the article")
def collect_articles(list_dictionaries, list_url_articles, list_titre):
    j = 0
    for url_article in list_url_articles:
        j = j + 1
        soup = utils.recovery_flux_url_rss(url_article)

        # find the title
        for titl in soup.find_all('title'):
            tit = titl.get_text()
            if len(tit.split('-')) == 2:
                title = tit.split('-')[0]

        # find the authors
        authors = []
        for a in soup.find_all('a'):
            if a.get('href') is not None:
                if "dpi-authors" in a.get('href').split('/'):
                    tit = a.get('href').split('/')[-1]
                    authors.append(tit.split('-')[0] + ' ' +
                                   tit.split('-')[1])
        if len(authors) == 0:
            authors.append('')

        # find the publication date
        dates = []
        date_publication = []
        for balise_time in soup.find_all('time'):
            if 'pubdate' in balise_time.get('class'):
                dates.append(balise_time.get('datetime').split('T')[0])
                date_publication.append(
                    balise_time.get('datetime').split('T')[0])

        theme = re.search("www.lesoir.be/(.*)/", url_article)[1]

        content = ''
        for p in soup.find_all('p'):
            if len(p.get_text().split(" ")) >= 2:
                content += p.get_text()

        new_article = utils.recovery_article(title, 'lesoir', authors,
                                             date_publication, content,
                                             theme)
        if (j == 3):
            time.sleep(71)
            j = 0
        if not utils.is_empty(new_article):
            erreur = "non"
            for tit in list_titre:
                if title == tit:
                    erreur = "oui"
            if len(content) > 10 and erreur == "non":
                list_titre.append(title)
                list_dictionaries.append(new_article)
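# The j counter and time.sleep(71) above pause the crawler after every third
# request to lesoir.be. The same idea as a small standalone helper (a sketch
# only; `throttled` is not part of this project's utils):
import time


def throttled(urls, batch=3, pause=71):
    """Yield URLs, sleeping `pause` seconds after every `batch` of them."""
    for k, url in enumerate(urls, start=1):
        yield url
        if k % batch == 0:
            time.sleep(pause)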
def recovery_new_articles_fusc(
        file_target='/var/www/html/projet2018/data/clean/robot/' +
        str(date.datetime.now().date()) + "/"):
    """
    Create a JSON file for each new article
    """
    links = recovery_link_new_articles('https://www.futura-sciences.com/' +
                                       'flux-rss/')
    list_articles = []
    for article in links:
        new_article = recovery_information_fusc(article)
        if not utils.is_empty(new_article):
            list_articles.append(new_article)
    utils.create_json(file_target, list_articles, 'FuturaSciences', 'fusc')
def collect_articles(list_dictionaries, list_url_articles, theme):
    """Add the articles (dictionaries) from a list of URL in a list of
    dictionaries

    Arguments:
        list_dictionaries {list} -- list of dictionaries
        list_url_articles {list} -- list of URL
        theme {string} -- theme related to the list of dictionaries
    """
    for url_article in list_url_articles:
        req = requests.get(url_article)
        data = req.text
        soup = BeautifulSoup(data, "lxml")

        title = soup.title.string

        list_authors = []
        for a in soup.find_all('a'):
            if a.get("class") == ['fig-content-metas__author']:
                name = re.sub(r"\s\s+", "", a.get_text())
                name = re.sub("\n", "", name)
                list_authors.append(name)
        if len(list_authors) == 0:
            for span in soup.find_all('span'):
                if span.get("class") == ['fig-content-metas__author']:
                    name = re.sub(r"\s\s+", "", span.get_text())
                    name = re.sub("\n", "", name)
                    list_authors.append(name)

        date_publication = ''
        for marker_time in soup.find_all('time'):
            for valeur in re.finditer(r'[0-9]{2}\/[0-9]{2}\/[0-9]{4}',
                                      str(marker_time)):
                date_publication = valeur.group(0)
        date_publication = str(
            date.datetime.strptime(date_publication, "%d/%m/%Y").date())

        content = ''
        for p in soup.find_all('p'):
            if p.get("class") == ['fig-content__chapo']:
                content = p.get_text() + " "
        for div in soup.find_all('div'):
            if div.get("class") == ['fig-content__body']:
                for p in div.find_all('p'):
                    content += p.get_text() + " "

        new_article = utils.recovery_article(title, 'LeFigaro', list_authors,
                                             date_publication, content,
                                             theme)
        if not utils.is_empty(new_article):
            list_dictionaries.append(new_article)
def recovery_new_article_lg():
    """
    Retrieve the new articles from the RSS feed and create a JSON file
    for each of them
    """
    file_target = "/var/www/html/projet2018/data/clean/robot/"
    url_rss = "http://www.legorafi.fr/feed/"
    links_article = recovery_link_new_articles_lg(url_rss)
    list_article = []
    for link_article in links_article:
        new_article = recovery_information_lg(link_article)
        if not utils.is_empty(new_article):
            print(new_article)
            list_article.append(new_article)
    utils.create_json(file_target, list_article, 'LeGorafi', 'lg')
def fileJson(article_latribune):
    file_json = []
    for article in article_latribune:
        soup = utils.recovery_flux_url_rss(article)

        # Retrieve the title
        title = soup.title.string

        # Retrieve the theme
        for li in soup.find_all('li'):
            if li.get("itemprop") == 'itemListElement':
                theme = li.a.span.get_text()

        # Retrieve the author
        author = []
        for span in soup.find_all('span'):
            if span.get("class") == ['author-name']:
                author.append(span.a.span.get_text())

        # Retrieve the publication date
        for time_tag in soup.find_all('time'):
            if time_tag.get("itemprop") == 'datePublished':
                date = time_tag.get("itemprop")
                for valeur in re.finditer(r'[0-9]{2}\/[0-9]{2}\/[0-9]{4}',
                                          str(time_tag)):
                    date = valeur.group(0)
                    date = datetime.strptime(date, "%d/%m/%Y")\
                        .strftime("%Y-%m-%d")
                print(date)

        # Retrieve the content
        content = ""
        for div in soup.find_all('div'):
            if div.get("itemprop") == 'articleBody':
                for p in div.find_all('p'):
                    content += p.get_text() + " "

        new_article = {
            "title": title,
            "newspaper": "La tribune",
            "author": author,
            "date_publi": date,
            "content": content,
            "theme": theme
        }
        # add each new article to the "file_json" list
        if utils.is_empty(new_article) is False:
            file_json.append(new_article)
    return (file_json)
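# fileJson only builds the list of article dictionaries; a caller is expected
# to write it out. A minimal sketch of such a caller (the function name, folder
# and prefix are assumptions, not taken from this module):
def save_latribune_articles(article_urls, file_target="data/clean/robot/"):
    file_json = fileJson(article_urls)
    utils.create_json(file_target, file_json, "latribune/", "lt")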
def recovery_old_article_equi(file_target="data/clean/robot/"):
    file_json = []
    url_rss = "https://www.lequipe.fr/"
    links_article = recovery_link_old_articles_equi(url_rss)
    i = 0
    for link in links_article:
        new_article = recovery_information_equi(link)
        if utils.is_empty(new_article) is False:
            file_json.append(new_article)
            i += 1
        if i == 20:
            utils.create_json(file_target, file_json, "Equip_old/", "equi")
            i = 0
            file_json = []
    utils.create_json(file_target, file_json, "Equip_old/", "equi")
def add_articles(file_target="data/clean/robot/" +
                 str(date.datetime.now().date()) + "/"):
    """
    Create a JSON file for each new article
    """
    soup = utils.recovery_flux_url_rss(
        "http://www.20minutes.fr/feeds/rss-actu-france.xml")
    items = soup.find_all("item")
    articles = []
    for item in items:
        # Retrieve the link of each article
        url = re.search(r"<link/>(.*)<pubdate>", str(item)).group(1)
        if is_article(url):
            new_article = get_article(url)
            # Keep the article only if it is not empty
            if not utils.is_empty(new_article):
                articles.append(new_article)
    utils.create_json(file_target, articles, "Minutes/", "min")
def recovery_old_article_equi(
        file_target="/var/www/html/projet2018/data/clean/robot/" +
        str(date.datetime.now().date()) + "/"):
    file_json = []
    url_rss = "https://www.lequipe.fr/"
    links_article = recovery_link_old_articles_equi(url_rss)
    i = 0
    for link in links_article:
        new_article = recovery_information_equi(link)
        if utils.is_empty(new_article) is False:
            file_json.append(new_article)
            i += 1
        if i == 20:
            utils.create_json(file_target, file_json, "Equipe/", "equi")
            i = 0
            file_json = []
    utils.create_json(file_target, file_json, "Equipe/", "equi")
def recovery_old_articles_sv(
        file_target="C:/Users/Laetitia/Desktop/Groupe4_Robot" +
        str(date.datetime.now().date()) + "/"):
    """
    Returns:
        - creation of a JSON file for each new article
    """
    list_category = [
        "corps-et-sante", "nature-et-enviro", "ciel-et-espace",
        "technos-et-futur", "cerveau-et-intelligence", "science-et-culture"
    ]
    file_json = []
    i = 0
    for cat in list_category:
        # We retrieve the URL feeds for each page of article
        # Each HTML-coded article is analyzed with Beautiful Soup
        url_rss_sv = "https://www.science-et-vie.com/" + cat
        soup_url = utils.recovery_flux_url_rss(url_rss_sv)
        article_sv = []
        # We retrieve all the articles for a given page
        for div in soup_url.find_all("div"):
            if div.get("class") == ["title"]:
                for item in div.find_all("a"):
                    links = "https://www.science-et-vie.com/" + \
                        str(item.get("href"))
                    article_sv.append(links)
        # Each article is analyzed one by one
        for article in article_sv:
            new_article = recovery_information_sv(article)
            if utils.is_empty(new_article) is False:
                # Reuse new_article instead of scraping the page a second time
                file_json.append(new_article)
                i += 1
            if i == 20:
                utils.create_json(file_target, file_json,
                                  "ScienceEtVie_crawler/", "sv")
                i = 0
                file_json = []
    utils.create_json(file_target, file_json, "ScienceEtVie_crawler/", "sv")
def recovery_old_article_lg(
        file_target="/var/www/html/projet2018/data/clean/robot/" +
        str(date.datetime.now().date()) + "/"):
    """
    Create a JSON file for each new article
    """
    list_article = []
    ii = 0
    url_rss = 'http://www.legorafi.fr/category/'
    links_article = recovery_link_old_articles_lg(url_rss)
    for link in links_article:
        new_article = recovery_information_lg(link)
        if not utils.is_empty(new_article):
            list_article.append(new_article)
            ii += 1
        if ii == 20:
            utils.create_json(file_target, list_article, 'LeGorafi', 'lg')
            ii = 0
            list_article = []
    utils.create_json(file_target, list_article, 'LeGorafi', 'lg')
def recovery_old_articles_fusc(
        file_target='/var/www/html/projet2018/data/clean/robot/'):
    """
    Create a JSON file for each article
    """
    url = "https://www.futura-sciences.com/sitemap-html/actualites/"
    url_fusc = "https://www.futura-sciences.com"
    for ii in range(1, 202):
        # Each sitemap page lists a batch of article links
        links_article = []
        soup = utils.recovery_flux_url_rss(url + str(ii) + "/")
        for tag_div_link in soup.find_all(
                'div', attrs={"class": "has-divider-bottom latest-item"}):
            links_article.append(url_fusc + tag_div_link.a.get('href'))
        list_articles = []
        for link_article in links_article:
            new_article = recovery_information_fusc(link_article)
            if not utils.is_empty(new_article):
                list_articles.append(new_article)
        utils.create_json(file_target, list_articles, 'FuturaSciences',
                          'fusc')
def recovery_new_articles_fem(file_target="data/clean/robot/" +
                              str(date.datetime.now().date()) + "/"):
    """
    Returns:
        - creation of a JSON file for each new article
    """
    file_json = []
    i = 0
    article_fem = recovery_link_new_articles_fem()
    for article in article_fem:
        new_article = recovery_information_fem(article)
        if utils.is_empty(new_article) is False:
            file_json.append(new_article)
            i += 1
        if i == 20:
            utils.create_json(file_target, file_json, "Femina_crawler/",
                              "fem")
            i = 0
            file_json = []
    utils.create_json(file_target, file_json, "Femina_crawler/", "fem")
def recovery_new_articles_hum_crawler(file_target="data/clean/robot/" +
                                      str(date.datetime.now().date()) + "/"):
    """
    Returns:
        - creation of a JSON file for each new article
    """
    file_json = []
    article_humanite = recovery_link_new_articles_hum_crawler()
    # Each URL is analyzed one by one
    i = 0
    for article in article_humanite:
        new_article = recovery_information_hum(article)
        if utils.is_empty(new_article) is False:
            file_json.append(new_article)
            i += 1
        if i == 20:
            utils.create_json(file_target, file_json, "Humanite/", "hum")
            i = 0
            file_json = []
    utils.create_json(file_target, file_json, "Humanite/", "hum")
def recovery_new_articles_ld(file_target="data/clean/robot/" +
                             str(date.datetime.now().date()) + "/"):
    links = recovery_link_new_articles_ld(
        "https://www.ladepeche.fr/services/flux-rss/")
    list_articles = []
    i = 0
    for article in links:
        new_article = recovery_information_ld(article)
        if utils.is_empty(new_article) is False:
            list_articles.append(new_article)
            i += 1
        if i == 50:
            utils.create_json(file_target, list_articles, "ladepeche/", "LD")
            i = 0
            list_articles = []
    utils.create_json(file_target, list_articles, "ladepeche/", "LD")
def add_articles(
        file_target="/home/etudiant/Documents/ProjetSID/Groupe4_Robot/" +
        "Telerama/Art/" + str(date.datetime.now().date()) + "/"):
    """
    Create a JSON file for each new article
    """
    categories = {
        "cinema": 5,
        "scenes": 5,
        "enfants": 5,
        "idees": 5,
    }
    articles = []
    for category, nbre in categories.items():
        for i in range(0, nbre):
            url = "http://www.telerama.fr/" + \
                category + "/articles?page=" + str(i)
            new_article = get_article_of_category(url)
            if utils.is_empty(new_article) is False:
                articles.append(new_article)
    utils.create_json(file_target, articles, "Telerama/", "tera")
def recovery_old_articles_sv(
        file_target='/var/www/html/projet2018/data/clean/robot/' +
        str(date.datetime.now().date()) + "/"):
    """
    Returns:
        - creation of a JSON file for each new article
    """
    list_category = [
        'corps-et-sante', 'nature-et-enviro', 'ciel-et-espace',
        'technos-et-futur', 'cerveau-et-intelligence', 'science-et-culture'
    ]
    list_articles = []
    i = 0
    for cat in list_category:
        url_rss_sv = 'https://www.science-et-vie.com/' + cat
        soup_url = utils.recovery_flux_url_rss(url_rss_sv)
        article_sv = []
        # We retrieve all the articles for a given page
        for div in soup_url.find_all('div', attrs={'class': 'title'}):
            for item in div.find_all("a"):
                links = 'https://www.science-et-vie.com/' + \
                    str(item.get('href'))
                article_sv.append(links)
        # Each article is analyzed one by one
        for article in article_sv:
            new_article = recovery_information_sv(article)
            if not utils.is_empty(new_article):
                # Reuse new_article instead of scraping the page a second time
                list_articles.append(new_article)
                i += 1
            if i == 20:
                utils.create_json(file_target, list_articles,
                                  'ScienceEtVie/', 'sv')
                i = 0
                list_articles = []
    utils.create_json(file_target, list_articles, 'ScienceEtVie/', 'sv')
def recovery_new_articles_noob_rss(file_target="data/clean/robot/" +
                                   str(date.datetime.now().date()) + "/"):
    """
    Returns:
        - creation of a JSON file for each new article
    """
    file_json = []
    i = 0
    # Each URL is analyzed one by one
    list_url = recovery_link_new_articles_noob_rss("http://www.nouvelobs." +
                                                   "com/rss/")
    for url in list_url:
        soup_url = utils.recovery_flux_url_rss(url)
        items = soup_url.find_all("item")
        article_noob = []
        # We're picking up every new article in a list
        for item in items:
            link_article = re.search(r"<link/>(.*)", str(item))[1]
            link_article = link_article.split("<description>")
            link_article = link_article[0]
            article_noob.append(link_article)
            # Photo galleries are not articles, so they are discarded
            if re.search(r"\/galeries\-photos\/", link_article):
                article_noob.remove(link_article)
        # Each article is analyzed one by one
        for article in article_noob:
            new_article = recovery_information_noob(article)
            if utils.is_empty(new_article) is False:
                file_json.append(new_article)
                i += 1
            if i == 20:
                utils.create_json(file_target, file_json, "NouvelObs_rss/",
                                  "noob")
                i = 0
                file_json = []
    utils.create_json(file_target, file_json, "NouvelObs_rss/", "noob")
def recovery_new_articles_lt(file_target="C:/Users/lea/Desktop/PROJET/" +
                             str(date.datetime.now().date()) + "/"):
    list_category = [
        "actualites/economie/economie", "Entreprises-secteurs",
        "media-telecom-entreprise", "finance-patrimoine-investir",
        "opinions", "regions/economie-en-region"
    ]
    file_json = []
    articles_latribune = []
    # We retrieve the URL feeds for each page of article
    for cat in list_category:
        url_latribune = "https://www.latribune.fr/" + cat + ".html"
        soup_url = utils.recovery_flux_url_rss(url_latribune)
        for ul in soup_url.find_all("ul"):
            if ul.get("class") == ['pagination-archive', 'pages']:
                for li in ul.find_all("li"):
                    for a in li.find_all("a"):
                        link = a.get("href")
                        link2 = "https://www.latribune.fr" + link
                        soup_url = utils.recovery_flux_url_rss(link2)
                        for div in soup_url.find_all("div"):
                            for valeur in re.finditer(
                                    'title-river', str(div.get("class"))):
                                for a in div.find_all('a'):
                                    articles_latribune.append(a.get("href"))
    # Each article is analyzed one by one
    for article in articles_latribune:
        new_article = recovery_information_lt(article)
        if utils.is_empty(new_article) is False:
            file_json.append(new_article)
    utils.create_json(file_target, file_json, "latribune_crawler/", "lt")
if __name__ == '__main__':
    recovery_old_articles_sv()
    # /var/www/html/projet2018/data/clean/robot/