def recovery_new_articles_lfi(file_target="data/clean/robot/" +
                              str(date.datetime.now().date()) + "/"):
    """Procedure that calls all the others functions and procedures in order to
    collect articles from a newspaper in a file
    Arguments:
        file_target {string} -- path where the articles will be recorded
    """
    list_url_themes = collect_url_themes("http://www.lefigaro.fr/")

    for url_theme in list_url_themes:

        list_dictionaries = []

        theme = re.search("http://www.lefigaro.fr/(.*)", url_theme)[1]
        theme = re.sub("/", "", theme)
        print(theme)

        list_url_sub_themes = collect_url_sub_themes(url_theme)

        list_url_articles = []

        for url_sub_theme in list_url_sub_themes:
            collect_url_articles(list_url_articles, url_sub_theme)

        collect_articles(list_dictionaries, list_url_articles, theme)

        time.sleep(3)

        utils.create_json(file_target, list_dictionaries, 'leFigaro/', 'lfi')
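
# `utils.create_json` is shared by every collector here but its definition is
# not shown in these examples. A minimal sketch of what it might do, assuming
# it writes one timestamped JSON file per batch under file_target/source (the
# signature is taken from the calls above; everything else is a guess):
import json
import os
import time


def create_json(file_target, list_articles, source, prefix):
    """Write a batch of article dictionaries into file_target/source/."""
    directory = os.path.join(file_target, source)
    os.makedirs(directory, exist_ok=True)
    name = prefix + "_" + time.strftime("%Y%m%d_%H%M%S") + ".json"
    with open(os.path.join(directory, name), "w", encoding="utf-8") as out:
        json.dump(list_articles, out, ensure_ascii=False, indent=2)
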
def recuperation_info_libe(file_target="data/clean/robot/" +
                           str(date.datetime.now().date()) + "/"):
    """
    Main function that gets all article URLs,
    extracts the information and creates a JSON file
    """

    source = "liberation/"

    link_rss = get_rss_infos()

    list_articles = []
    i = 0

    for lr in link_rss:
        i += 1
        if "www.liberation.fr" in lr:
            # Fetch the article once and reuse the result
            new_article = get_information(lr)
            if new_article and utils.is_empty(new_article) is False:
                list_articles.append(new_article)
        if i > 49:
            i = 0
            utils.create_json(file_target, list_articles, source, "libe")
            list_articles = []

    utils.create_json(file_target, list_articles, source, "libe")
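
# `utils.is_empty` is likewise external to these snippets. A plausible sketch,
# assuming an article dict counts as empty when it lacks a usable title or
# content (the project's exact rule is not shown here):
def is_empty(article):
    """Return True if the article dictionary has no usable title or content."""
    if not article:
        return True
    title = str(article.get("title", "")).strip()
    content = str(article.get("content", "")).strip()
    return not (title and content)
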
def recovery_new_articles_noob_crawler(file_target="data/clean/robot/" +
                                       str(date.datetime.now().date()) + "/"):
    """
        Creates a JSON file for each batch of new articles.
    """

    file_json = []
    i = 0
    article_noob = recovery_link_new_articles_noob_crawler()

    # Each article is analyzed one by one
    for article in article_noob:
        new_article = recovery_information_noob(article)
        if utils.is_empty(new_article) is False:
            file_json.append(new_article)
            i += 1
        if i == 20:
            utils.create_json(file_target, file_json, "NouvelObs_crawler/",
                              "noob")
            i = 0
            file_json = []

    utils.create_json(file_target, file_json, "NouvelObs_crawler/",
                      "noob")
def recovery_new_articles_equipe(file_target="data/clean/robot/" +
                                 str(date.datetime.now().date()) + "/"):
    """
        Creates a JSON file for each batch of new articles.
    """
    file_json = []
    i = 0
    list_url = recovery_link_new_articles_equipe("https://www.lequipe.fr/rss/")
    for url in list_url:
        soup_url = utils.recovery_flux_url_rss(url)
        items = soup_url.find_all("item")
        article_equipe = []

        # We're picking up every new article in a list
        for item in items:
            article_equipe.append(re.search(r"<link/>(.*)", str(item))[1])
        # Each article is analyzed one by one
        for article in article_equipe:
            new_article = recovery_information_equipe(article)
            if utils.is_empty(new_article) is False:
                file_json.append(new_article)
            i += 1
        if i == 20:
            utils.create_json(file_target, file_json, "Equipe_rss/", "equi")
            i = 0
            file_json = []

    utils.create_json(file_target, file_json, "Equipe_rss/", "equi")
def recovery_old_articles_LD(
        file_target='/var/www/html/projet2018/data/clean/robot/' +
    str(datetime.datetime.now().date())):
    """
        Creates a JSON file for each article.
    """
    list_category = [
        'grand-sud', 'actu', 'faits-divers', 'economie', 'sports', 'sante',
        'tv-people', 'sorties'
    ]
    links_article = []
    list_articles = []
    for cat in list_category:
        for i in range(1, 100):
            try:
                url = 'https://www.ladepeche.fr/recherche/?p=' + str(i)\
                        + '&c=' + cat + '&plus-infos=1'
                soup = utils.recovery_flux_url_rss(url)
            except Exception:
                break

            # Parse each results page as soon as it is fetched
            for h2 in soup.find_all('h2'):
                for item in h2.find_all('a'):
                    link = 'https://www.ladepeche.fr' + str(item.get('href'))
                    links_article.append(link)

        for link in links_article:
            new_article = recovery_article_ld(link)
            if not utils.is_empty(new_article):
                list_articles.append(new_article)
        utils.create_json(file_target, list_articles, "Ladepeche", "LD")
def recovery_new_articles_lpt(file_target="data/clean/robot/" +
                              str(date.datetime.now().date()) + "/"):
    """Procedure that calls all the others functions and procedures in order to
    collect articles from a newspaper in a file
    Arguments:
        file_target {string} -- path where the articles will be recorded
    """
    list_url_themes = collect_url_themes('http://www.lepoint.fr/rss/')

    for url_theme in list_url_themes:

        list_url_articles = []

        list_dictionaries = []

        theme = re.search("http://www.lepoint.fr/(.*)/rss.xml", url_theme)[1]
        print("---------------------------" + theme +
              "------------------------")

        collect_url_articles(list_url_articles, url_theme)
        for index_page in range(2, 10):
            collect_url_articles(
                list_url_articles,
                url_theme + "index_" + str(index_page) + ".php")

        collect_articles(list_dictionaries, list_url_articles, theme)
        time.sleep(3)

        utils.create_json(file_target, list_dictionaries, "LePoint/", "lpt")
def recuperation_info_lt(file_target="data/clean/robot/" +
                         str(date.datetime.now().date()) + "/"):
    url_rss_latribune = "http://www.latribune.fr/rss/rubriques/actualite.html"
    articles = article_lt(url_rss_latribune)
    file_json = fileJson(articles)
    sources = "Latribune/"
    if not os.path.exists(file_target + sources):
        os.makedirs(file_target + sources)
    # Call the create_json function
    utils.create_json(file_target, file_json, sources, "lt")
def recovery_new_articles_fusc(
    file_target='/var/www/html/projet2018/data/clean/robot/' +
    str(date.datetime.now().date()) + "/"):
    """
        Creates a JSON file for each new article.
    """
    links = recovery_link_new_articles('https://www.futura-sciences.com/' +
                                       'flux-rss/')
    list_articles = []
    for article in links:
        new_article = recovery_information_fusc(article)
        if not utils.is_empty(new_article):
            list_articles.append(new_article)
    utils.create_json(file_target, list_articles, 'FuturaSciences', 'fusc')
def recovery_new_article_lg():
    """
         Retrieve new articles from the RSS feed
         and create a JSON file for each article
    """
    file_target = "/var/www/html/projet2018/data/clean/robot/"
    url_rss = "http://www.legorafi.fr/feed/"
    links_article = recovery_link_new_articles_lg(url_rss)
    list_article = []
    for link_article in links_article:
        new_article = recovery_information_lg(link_article)
        if not utils.is_empty(new_article):
            print(new_article)
            list_article.append(new_article)
    utils.create_json(file_target, list_article, 'LeGorafi', 'lg')
def recovery_old_article_equi(file_target="data/clean/robot/"):
    file_json = []
    url_rss = "https://www.lequipe.fr/"
    links_article = recovery_link_old_articles_equi(url_rss)

    i = 0
    for link in links_article:
        new_article = recovery_information_equi(link)
        if utils.is_empty(new_article) is False:
            file_json.append(new_article)
            i += 1
        if i == 20:
            utils.create_json(file_target, file_json, "Equip_old/", "equi")
            i = 0
            file_json = []
    utils.create_json(file_target, file_json, "Equip_old/", "equi")
def add_articles(file_target="data/clean/robot/" +
                 str(date.datetime.now().date()) + "/"):
    """
        Creates a JSON file for each new article.
    """
    soup = utils.recovery_flux_url_rss(
        "http://www.20minutes.fr/feeds/rss-actu-france.xml")
    items = soup.find_all("item")
    articles = []
    for item in items:
        # Retrieve the article link
        url = re.search(r"<link/>(.*)<pubdate>", str(item)).group(1)
        if is_article(url):
            new_article = get_article(url)
            if not utils.is_empty(new_article):
                articles.append(new_article)
    utils.create_json(file_target, articles, "Minutes/", "min")
def add_articles(file_target="/var/www/html/projet2018/data/clean/robot/" +
                 str(date.datetime.now().date()) + "/"):
    """
        Creates a JSON file for each new article.
    """
    categories = {
        "cinema": 40,
        "scenes": 30,
        "enfants": 3,
        "idees": 30,
    }
    articles = []
    for category, nbre in categories.items():
        for i in range(0, nbre):
            url = "http://www.telerama.fr/" + category + \
                "/articles?page=" + str(i)
            articles.extend(get_article_of_category(url))
            utils.create_json(file_target, articles, "Telerama/", "tera")
def recovery_old_article_equi(
    file_target="/var/www/html/projet2018/data/clean/robot/" +
    str(date.datetime.now().date()) + "/"):
    file_json = []
    url_rss = "https://www.lequipe.fr/"
    links_article = recovery_link_old_articles_equi(url_rss)

    i = 0
    for link in links_article:
        new_article = recovery_information_equi(link)
        if utils.is_empty(new_article) is False:
            file_json.append(new_article)
            i += 1
        if i == 20:
            utils.create_json(file_target, file_json, "Equipe/", "equi")
            i = 0
            file_json = []
    utils.create_json(file_target, file_json, "Equipe/", "equi")
def recovery_old_articles_sv(
    file_target="C:/Users/Laetitia/Desktop/Groupe4_Robot" +
    str(date.datetime.now().date()) + "/"):
    """
        Creates a JSON file for each batch of new articles.
    """

    list_category = [
        "corps-et-sante", "nature-et-enviro", "ciel-et-espace",
        "technos-et-futur", "cerveau-et-intelligence", "science-et-culture"
    ]

    file_json = []
    i = 0
    for cat in list_category:
        # We retrieve the URL feeds for each page of article
        # Each HTML-coded article is analyzed with beautiful soup
        url_rss_sv = "https://www.science-et-vie.com/" + cat

        soup_url = utils.recovery_flux_url_rss(url_rss_sv)

        article_sv = []
        # We retrieve all the articles for a given page
        for div in soup_url.find_all("div"):
            if div.get("class") == ["title"]:
                for item in div.find_all("a"):
                    links = "https://www.science-et-vie.com/" + \
                        str(item.get("href"))
                    article_sv.append(links)

        # Each article is analyzed one by one
        for article in article_sv:
            new_article = recovery_information_sv(article)
            if utils.is_empty(new_article) is False:
                file_json.append(new_article)
            i += 1
        if i == 20:
            utils.create_json(file_target, file_json, "ScienceEtVie_crawler/",
                              "sv")
            i = 0
            file_json = []

    utils.create_json(file_target, file_json, "ScienceEtVie_crawler/", "sv")
def recovery_old_articles_fusc(
        file_target='/var/www/html/projet2018/data/clean/robot/'):
    """
        Creates a JSON file for each article.
    """
    url = "https://www.futura-sciences.com/sitemap-html/actualites/"
    url_fusc = "https://www.futura-sciences.com"
    for ii in range(1, 202):
        links_article = []
        soup = utils.recovery_flux_url_rss(url + str(ii) + "/")
        for tag_div_link in soup.find_all(
                'div', attrs={"class": "has-divider-bottom latest-item"}):
            links_article.append(url_fusc + tag_div_link.a.get('href'))
        list_articles = []
        for link_article in links_article:
            new_article = recovery_information_fusc(link_article)
            if not utils.is_empty(new_article):
                list_articles.append(new_article)
        utils.create_json(file_target, list_articles, 'FuturaSciences', 'fusc')
def recovery_old_article_lg(file_target="/var/www/html/projet2018/data/clean/robot/" +
                            str(date.datetime.now().date()) + "/"):
    """
        Creates a JSON file for each new article.
    """
    list_article = []
    ii = 0
    url_rss = 'http://www.legorafi.fr/category/'
    links_article = recovery_link_old_articles_lg(url_rss)
    for link in links_article:
        new_article = recovery_information_lg(link)
        if not utils.is_empty(new_article):
            list_article.append(new_article)
            ii += 1
        if ii == 20:
            utils.create_json(file_target, list_article, 'LeGorafi', 'lg')
            ii = 0
            list_article = []
    utils.create_json(file_target, list_article, 'LeGorafi', 'lg')
def recovery_new_articles_fem(file_target="data/clean/robot/" +
                              str(date.datetime.now().date()) + "/"):
    """
        Creates a JSON file for each batch of new articles.
    """
    file_json = []
    i = 0
    article_fem = recovery_link_new_articles_fem()
    for article in article_fem:
        new_article = recovery_information_fem(article)
        if utils.is_empty(new_article) is False:
            file_json.append(new_article)
            i += 1
        if i == 20:
            utils.create_json(file_target, file_json, "Femina_crawler/", "fem")
            i = 0
            file_json = []

    utils.create_json(file_target, file_json, "Femina_crawler/", "fem")
def add_articles(
    file_target="/home/etudiant/Documents/ProjetSID/Groupe4_Robot/" +
    "Telerama/Art/" + str(date.datetime.now().date()) + "/"):
    """
        Creates a JSON file for each new article.
    """
    categories = {
        "cinema": 5,
        "scenes": 5,
        "enfants": 5,
        "idees": 5,
    }
    articles = []
    for category, nbre in categories.items():
        for i in range(0, nbre):
            url = "http://www.telerama.fr/" + \
                category + "/articles?page=" + str(i)
            new_article = get_article_of_category(url)
            if utils.is_empty(new_article) is False:
                articles.append(new_article)
    utils.create_json(file_target, articles, "Telerama/", "tera")
def recovery_new_articles_ld(file_target="data/clean/robot/" +
                             str(date.datetime.now().date()) + "/"):

    links = recovery_link_new_articles_ld(
        "https://www.ladepeche.fr/services/flux-rss/")

    list_articles = []
    i = 0
    for article in links:
        new_article = recovery_information_ld(article)
        if utils.is_empty(new_article) is False:
            list_articles.append(new_article)
            i += 1
            if i == 50:
                utils.create_json(file_target, list_articles, "ladepeche/",
                                  "LD")

                i = 0
                list_articles = []

    utils.create_json(file_target, list_articles, "ladepeche/", "LD")
def recovery_new_articles_hum_crawler(file_target="data/clean/robot/" +
                                      str(date.datetime.now().date()) + "/"):
    """
        Creates a JSON file for each batch of new articles.
    """
    file_json = []
    article_humanite = recovery_link_new_articles_hum_crawler()
    # Each URL is analyzed one by one
    i = 0
    for article in article_humanite:
        new_article = recovery_information_hum(article)
        if utils.is_empty(new_article) is False:
            file_json.append(new_article)
            i += 1
        if i == 20:
            utils.create_json(file_target, file_json, "Humanite/", "hum")
            i = 0
            file_json = []

    utils.create_json(file_target, file_json, "Humanite/", "hum")
def recovery_old_articles_sv(
    file_target='/var/www/html/projet2018/data/clean/robot/' +
    str(date.datetime.now().date()) + "/"):
    """
        Creates a JSON file for each batch of new articles.
    """
    list_category = [
        'corps-et-sante', 'nature-et-enviro', 'ciel-et-espace',
        'technos-et-futur', 'cerveau-et-intelligence', 'science-et-culture'
    ]

    list_articles = []
    i = 0
    for cat in list_category:
        url_rss_sv = 'https://www.science-et-vie.com/' + cat

        soup_url = utils.recovery_flux_url_rss(url_rss_sv)

        article_sv = []
        # We retrieve all the articles for a given page
        for div in soup_url.find_all('div', attrs={'class': 'title'}):
            for item in div.find_all("a"):
                links = 'https://www.science-et-vie.com/' + \
                    str(item.get('href'))
                article_sv.append(links)

        # Each article is analyzed one by one
        for article in article_sv:
            new_article = recovery_information_sv(article)
            if not utils.is_empty(new_article):
                list_articles.append(new_article)
            i += 1
            if i == 20:
                utils.create_json(file_target, list_articles, 'ScienceEtVie/',
                                  'sv')
                i = 0
                list_articles = []

    utils.create_json(file_target, list_articles, 'ScienceEtVie/', 'sv')
def recovery_old_article_minutes(
    file_target="/var/www/html/projet2018/data/clean/robot/" +
    str(date.datetime.now().date()) + "/"):
    # Path of the articles directory

    source = "Minutes/"

    soup = utils.recovery_flux_url_rss("http://www.20minutes.fr")

    categories = soup.find("nav", class_="header-nav").find_all("li")
    articles = []

    for category in categories:
        url = category.find("a").get("href")
        theme = unidecode(category['data-theme'])
        if theme in [
                "default", "entertainment", "sport", "economy", "hightech",
                "planet"
        ]:
            articles.extend(get_article_of_category(url))

        utils.create_json(file_target, articles, source, "min")
def recovery_new_articles_lpt(
    file_target="C:/Users/cmisid/Documents/TableauDeBord/LESOIR/" +
    str(date.datetime.now().date()) + "/"):

    list_url_articles = []
    j = 0
    for i in range(0, 1650, 10):
        j = j + 1
        url1 = ('http://www.lesoir.be/archives/recherche?datefilter=lastyear'
                '&sort=date+desc&start=' + str(i) + '&word=terrorisme')
        soup1 = utils.recovery_flux_url_rss(url1)

        for a in soup1.find_all('a'):
            tit = a.get('href')
            if '/archive/' in tit.split('d'):
                url = 'http://www.lesoir.be' + tit
                list_url_articles.append(url)


######################
        url2 = ('http://www.lesoir.be/archives/recherche?datefilter=lastyear'
                '&sort=date+desc&start=' + str(i) + '&word=attentat')
        soup2 = utils.recovery_flux_url_rss(url2)

        for a in soup2.find_all('a'):
            tit = a.get('href')
            if '/archive/' in tit.split('d'):
                url = 'http://www.lesoir.be' + tit
                list_url_articles.append(url)

        if (j == 3):
            time.sleep(71)
            j = 0

    list_dictionaries = []
    list_titre = []
    collect_articles(list_dictionaries, list_url_articles, list_titre)
    utils.create_json(file_target, list_dictionaries, "lesoir/", "lsr")
def recovery_new_articles_libe(
    file_target="/var/www/html/projet2018/data/clean/robot/" +
    str(date.datetime.now().date()) + "/"):
    """Procedure that calls all the others functions and procedures in order to
    collect articles from a newspaper in a file
    Arguments:
        file_target {string} -- path where the articles will be recorded
    """
    list_dictionaries = []

    list_url_articles = collect_url_articles('http://www.liberation.fr/')

    number_articles = 0
    for url_article in list_url_articles:
        article = collect_article(url_article)
        if article is not None:
            list_dictionaries.append(article)
            # Buffer
            number_articles += 1
            if number_articles % 50 == 0:
                utils.create_json(file_target, list_dictionaries,
                                  'Liberation/', 'libe')
                list_dictionaries.clear()
    utils.create_json(file_target, list_dictionaries, 'Liberation/', 'libe')
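
# The "flush every N articles" bookkeeping above recurs in most of these
# collectors. A hypothetical refactor (not part of the original project) that
# factors it out, reusing the utils helpers assumed in the sketches above:
def collect_in_batches(urls, extract, file_target, source, prefix,
                       batch_size=50):
    """Extract each URL and write a JSON file every `batch_size` articles."""
    batch = []
    for url in urls:
        article = extract(url)
        if article and not utils.is_empty(article):
            batch.append(article)
            if len(batch) == batch_size:
                utils.create_json(file_target, batch, source, prefix)
                batch = []
    if batch:  # write whatever is left over
        utils.create_json(file_target, batch, source, prefix)

# Example: the Liberation collector above could then be written as
# collect_in_batches(collect_url_articles('http://www.liberation.fr/'),
#                    collect_article, file_target, 'Liberation/', 'libe')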
def recovery_new_articles_noob_rss(file_target="data/clean/robot/" +
                                   str(date.datetime.now().date()) + "/"):
    """
        Creates a JSON file for each batch of new articles.
    """
    file_json = []
    i = 0
    # Each URL is analyzed one by one
    list_url = recovery_link_new_articles_noob_rss("http://www.nouvelobs." +
                                                   "com/rss/")
    for url in list_url:
        soup_url = utils.recovery_flux_url_rss(url)
        items = soup_url.find_all("item")
        article_noob = []

        # We're picking up every new article in a list
        for item in items:
            link_article = re.search(r"<link/>(.*)", str(item))[1]
            link_article = link_article.split("<description>")[0]
            # Skip photo galleries
            if not re.search(r"/galeries-photos/", link_article):
                article_noob.append(link_article)
        # Each article is analyzed one by one
        for article in article_noob:
            new_article = recovery_information_noob(article)
            if utils.is_empty(new_article) is False:
                file_json.append(new_article)
            i += 1
        if i == 20:
            utils.create_json(file_target, file_json, "NouvelObs_rss/", "noob")
            i = 0
            file_json = []

    utils.create_json(file_target, file_json, "NouvelObs/", "noob")
def recovery_new_articles_lt(file_target="C:/Users/lea/Desktop/PROJET/" +
                             str(date.datetime.now().date()) + "/"):

    list_category = [
        "actualites/economie/economie", "Entreprises-secteurs",
        "media-telecom-entreprise", "finance-patrimoine-investir", "opinions",
        "regions/economie-en-region"
    ]
    file_json = []
    articles_latribune = []
    # We retrieve the URL feeds for each page of article
    for cat in list_category:
        url_latribune = "https://www.latribune.fr/" + cat + ".html"
        soup_url = utils.recovery_flux_url_rss(url_latribune)

        for ul in soup_url.find_all("ul"):
            if ul.get("class") == ['pagination-archive', 'pages']:
                for li in ul.find_all("li"):
                    for a in li.find_all("a"):
                        link = a.get("href")
                        link2 = "https://www.latribune.fr" + link
                        soup_url = utils.recovery_flux_url_rss(link2)

                        for div in soup_url.find_all("div"):
                            for valeur in re.finditer('title-river',
                                                      str(div.get("class"))):
                                for a in div.find_all('a'):
                                    articles_latribune.append(a.get("href"))

    # Each article is analyzed one by one
    for article in articles_latribune:
        new_article = recovery_information_lt(article)
        if utils.is_empty(new_article) is False:
            file_json.append(new_article)

    utils.create_json(file_target, file_json, "latribune_crawler/", "lt")


if __name__ == '__main__':
    recovery_old_articles_sv()
    # /var/www/html/projet2018/data/clean/robot/
    # content
    content = ""

    for h2 in soup_article.find_all('h2'):
        if h2.get("class") == ['article-full__header']:
            content = h2.get_text() + " "
    for div in soup_article.find_all('div'):
        if div.get("class") == ['article-full__body-content']:
            for b in div.find_all('b'):
                b.string = ""
            for a in div.find_all('a'):
                a.string = ""
            content += div.get_text() + " "

    data = [{
        "title": title,
        "newspaper": "leparisien",
        "author": author,
        "date_publi": date_p,
        "theme": categorie,
        "content": content
    }]
    # Write the article to JSON, skipping titles that were already seen
    if title not in titre and len(content) > 10:
        titre.append(title)
        utilsg4.create_json(fileTarget, data, "leparisien", "lp")