import re
import datetime as date

# Project-local helpers (utils, utilsg4, linkRSS, info_articles, the
# recovery_* link/information functions, is_article, get_article and
# get_article_of_category) are assumed to be imported from the crawler
# modules of the project.


def recuperation_info_lmde(file_target="data/clean/robot/" +
                           str(date.datetime.now().date()) + "/"):
    """Create a JSON file for every batch of 20 new Le Monde articles."""
    source = "lemonde/"
    url_rss_lib = "http://www.lemonde.fr/rss/"
    abbreviation = "lmde"
    url = "http://www.lemonde.fr"
    list_articles = []
    i = 0
    listRSS = linkRSS(url_rss_lib)
    for article_link in listRSS:
        if "/article/" in article_link:
            i += 1
            list_articles.append(info_articles(article_link))
            if i == 20:
                utilsg4.create_json(file_target, list_articles, source,
                                    abbreviation)
                i = 0
                list_articles = []
    # links = recent(url)
    # list_articles.extend(articlesList(links))
    utilsg4.create_json(file_target, list_articles, source, abbreviation)
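
# The 20-article batching above reappears in several crawlers below
# (recovery_new_articles_ld, recovery_new_articles_fem,
# recovery_new_articles_noob_rss). A minimal sketch of how that pattern
# could be factored out; `flush_by_batch` is hypothetical and not part of
# the existing utils/utilsg4 modules.
def flush_by_batch(articles, batch_size, file_target, source, abbreviation,
                   create_json=utilsg4.create_json):
    """Write `articles` to JSON files in batches of `batch_size`."""
    for start in range(0, len(articles), batch_size):
        create_json(file_target, articles[start:start + batch_size],
                    source, abbreviation)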

def recovery_new_articles_fusc(
        file_target='/var/www/html/projet2018/data/clean/robot/'):
    """Create a JSON file for the new Futura-Sciences articles."""
    links = recovery_link_new_articles('https://www.futura-sciences.com/' +
                                       'flux-rss/')
    list_articles = []
    for article in links:
        new_article = recovery_information_fusc(article)
        if not utils.is_empty(new_article):
            list_articles.append(new_article)
    utils.create_json(file_target, list_articles, 'FuturaSciences', 'fusc')

def recovery_new_article_lg(
        file_target="/var/www/html/projet2018/data/clean/robot/"):
    """Retrieve the new articles from the RSS feed and create a JSON file
    for them, skipping the "Magazine" theme."""
    url_rss = "http://www.legorafi.fr/feed/"
    links_article = recovery_link_new_articles_lg(url_rss)
    list_article = []
    for link_article in links_article:
        new_article = recovery_information_lg(link_article)
        if new_article["theme"] != "Magazine":
            list_article.append(new_article)
    utils.create_json(file_target, list_article, "LeGorafi", "lg")

def recovery_new_articles_hum(file_target="data/clean/robot/" +
                              str(date.datetime.now().date()) + "/"):
    """Create a JSON file for the new articles of L'Humanité."""
    file_json = []
    # Each article URL from the feed is analyzed one by one
    article_humanite = recovery_link_new_articles_hum("https://www.humanite" +
                                                      ".fr/rss/actu.rss")
    for article in article_humanite:
        file_json.append(recovery_information_hum(article))
    utils.create_json(file_target, file_json, "Humanite/", "hum")

def add_articles(file_target="data/clean/robot/" +
                 str(date.datetime.now().date()) + "/"):
    """Create a JSON file for the new 20 Minutes articles."""
    soup = utils.recovery_flux_url_rss(
        "http://www.20minutes.fr/feeds/rss-actu-france.xml")
    items = soup.find_all("item")
    articles = []
    for item in items:
        # Retrieve the link of each article
        url = re.search(r"<link/>(.*)<pubdate>", str(item)).group(1)
        if is_article(url):
            articles.append(get_article(url))
    utils.create_json(file_target, articles, "Minutes/", "min")
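
# Extracting the link with a regular expression on str(item) works, but it
# depends on how BeautifulSoup serializes the tag. A sketch of the same
# extraction done on the parsed tree instead; it assumes the feed was parsed
# with an HTML parser, where <link> is treated as a void tag so the URL
# survives as the text node that follows it.
def extract_item_link(item):
    """Return the URL following the <link/> tag of an RSS item, or None."""
    link_tag = item.find("link")
    if link_tag is not None and link_tag.next_sibling is not None:
        return str(link_tag.next_sibling).strip()
    return None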

def recovery_new_articles_ld(file_target="data/clean/robot/" +
                             str(date.datetime.now().date()) + "/"):
    """Create a JSON file for every batch of 50 new La Dépêche articles."""
    links = recovery_link_new_articles_ld(
        "https://www.ladepeche.fr/services/flux-rss/")
    list_articles = []
    i = 0
    for article in links:
        new_article = recovery_information_ld(article)
        list_articles.append(new_article)
        i += 1
        if i == 50:
            utilsg4.create_json(file_target, list_articles, "ladepeche/",
                                "LD")
            i = 0
            list_articles = []
    utilsg4.create_json(file_target, list_articles, "ladepeche/", "LD")

def add_articles(
        file_target="/home/etudiant/Documents/ProjetSID/Groupe4_Robot/"
        "Telerama/Art/" + str(date.datetime.now().date()) + "/"):
    """Create a JSON file for the articles of each Télérama category."""
    # Number of listing pages to crawl per category
    categories = {
        "cinema": 40,
        "scenes": 30,
        "enfants": 3,
        "idees": 30,
    }
    articles = []
    for category, nbre in categories.items():
        for i in range(0, nbre):
            url = ("http://www.telerama.fr/" + category +
                   "/articles?page=" + str(i))
            articles.extend(get_article_of_category(url))
    utils.create_json(file_target, articles, "Telerama/", "tera")

def recovery_old_articles_sv(
        file_target="C:/Users/Laetitia/Desktop/Groupe4_Robot" +
        str(date.datetime.now().date()) + "/"):
    """Create a JSON file for the Science et Vie articles of every
    category."""
    list_category = ["corps-et-sante", "nature-et-enviro", "ciel-et-espace",
                     "technos-et-futur", "cerveau-et-intelligence",
                     "science-et-culture"]
    file_json = []
    for cat in list_category:
        # We retrieve the listing page of each category;
        # the HTML is analyzed with Beautiful Soup
        url_rss_sv = "https://www.science-et-vie.com/" + cat
        soup_url = utils.recovery_flux_url_rss(url_rss_sv)
        article_sv = []
        # We retrieve all the article links of the page
        for div in soup_url.find_all("div"):
            if div.get("class") == ["title"]:
                for item in div.find_all("a"):
                    link = "https://www.science-et-vie.com/" + \
                        str(item.get("href"))
                    article_sv.append(link)
        # Each article is analyzed one by one
        for article in article_sv:
            file_json.append(recovery_information_sv(article))
    utils.create_json(file_target, file_json, "Scienceetvie_crawler/", "sv")
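
# The nested find_all("div") scan above can be written as a CSS selector.
# A sketch of the equivalent link collection, assuming the same page
# structure (anchors inside <div class="title">); note that
# select("div.title a") also matches divs carrying additional classes,
# which the exact comparison div.get("class") == ["title"] does not.
def collect_sv_links(soup_url, base="https://www.science-et-vie.com/"):
    """Return the article URLs found inside the <div class="title"> blocks."""
    return [base + str(a.get("href"))
            for a in soup_url.select("div.title a")]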

def recovery_new_articles_fem(file_target="data/clean/robot/" +
                              str(date.datetime.now().date()) + "/"):
    """Create a JSON file for every batch of 20 new Femina articles."""
    file_json = []
    i = 0
    article_fem = recovery_link_new_articles_fem()
    for article in article_fem:
        new_article = recovery_information_fem(article)
        if not utils.is_empty(new_article):
            file_json.append(new_article)
            i += 1
            if i == 20:
                utils.create_json(file_target, file_json, "Femina_crawler/",
                                  "fem")
                i = 0
                file_json = []
    utils.create_json(file_target, file_json, "Femina_crawler/", "fem")

def recovery_new_articles_noob_rss(file_target="data/clean/robot/" +
                                   str(date.datetime.now().date()) + "/"):
    """Create a JSON file for every batch of 20 new Nouvel Obs articles."""
    file_json = []
    i = 0
    # Each feed URL is analyzed one by one
    list_url = recovery_link_new_articles_noob_rss("http://www.nouvelobs." +
                                                   "com/rss/")
    for url in list_url:
        soup_url = utils.recovery_flux_url_rss(url)
        items = soup_url.find_all("item")
        article_noob = []
        # We pick up every new article in a list, skipping photo galleries
        for item in items:
            link_article = re.search(r"<link/>(.*)", str(item)).group(1)
            link_article = link_article.split("<description>")[0]
            if not re.search(r"/galeries-photos/", link_article):
                article_noob.append(link_article)
        # Each article is analyzed one by one
        for article in article_noob:
            new_article = recovery_information_noob(article)
            if not utils.is_empty(new_article):
                file_json.append(new_article)
                i += 1
                if i == 20:
                    utils.create_json(file_target, file_json,
                                      "NouvelObs_rss/", "noob")
                    i = 0
                    file_json = []
    utils.create_json(file_target, file_json, "NouvelObs_rss/", "noob")
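
# A minimal driver sketch showing how the crawlers above could be run in a
# single pass. The function list and the shared `file_target` default are
# assumptions for illustration, not part of the original modules.
def run_all_crawlers(file_target="data/clean/robot/" +
                     str(date.datetime.now().date()) + "/"):
    """Run each crawler against the same target directory."""
    crawlers = [recuperation_info_lmde, recovery_new_articles_hum,
                recovery_new_articles_ld, recovery_new_articles_fem,
                recovery_new_articles_noob_rss]
    for crawler in crawlers:
        try:
            crawler(file_target)
        except Exception as err:
            # One failing source should not stop the other crawlers
            print("crawler %s failed: %s" % (crawler.__name__, err))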