def recup_articles():
    """
    Collects the list of article URLs on lemonde.fr matching the searched
    keywords (cyclone / ouragan).
    Returns:
        list_url_articles: list of URLs (strings)
    """
    list_url_articles = []
    # Pause on every 10th results page to avoid hammering the server
    list_page = list(range(0, 230, 10))
    for i in range(1, 230):
        if i in list_page:
            time.sleep(15)
            urli = ('http://www.lemonde.fr/recherche/?keywords=cyclones+'
                    'cyclone&page_num=' + str(i) + '&operator=or&exclude_'
                    'keywords=&qt=recherche_texte_titre&author=&period='
                    'custom_date&start_day=01&start_month=01&start_year='
                    '2000&end_day=22&end_month=02&end_year=2018&sort=asc')
            soup = utils.recovery_flux_url_rss(urli)
            for h3 in soup.find_all(
                    'div',
                    attrs={'class': 'grid_11 conteneur_fleuve alpha omega'}):
                for a in h3.find_all('a'):
                    if 'http://' in a.get('href'):
                        list_url_articles.append(a.get('href'))
                    else:
                        list_url_articles.append('http://www.lemonde.fr' +
                                                 a.get('href'))
        else:
            urli = ('http://www.lemonde.fr/recherche/?keywords=ouragan&page'
                    '_num=' + str(i) + '&operator=and&exclude_keywords=&qt='
                    'recherche_texte_titre&author=&period=custom_date&start_'
                    'day=01&start_month=01&start_year=2000&end_day=22&'
                    'end_month=02&end_year=2018&sort=asc')
            soup = utils.recovery_flux_url_rss(urli)
            for h3 in soup.find_all(
                    'div',
                    attrs={'class': 'grid_11 conteneur_fleuve alpha omega'}):
                for a in h3.find_all('a'):
                    if 'http://' in a.get('href'):
                        list_url_articles.append(a.get('href'))
                    else:
                        list_url_articles.append('http://www.lemonde.fr' +
                                                 a.get('href'))
    return list_url_articles
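# A minimal usage sketch (assumption: recup_articles above is meant to be
# chained with info_articles, defined at the end of this file, to crawl and
# parse Le Monde articles). demo_lemonde_crawl is an illustrative name only.
def demo_lemonde_crawl():
    lemonde_urls = recup_articles()
    # info_articles is resolved at call time, so its later definition is fine
    return [info_articles(u) for u in lemonde_urls]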
def collect_url_themes():
    """
    Collects the list of article URLs on lefigaro.fr matching the searched
    keyword.
    Returns:
        list_url_articles: list of URLs (strings)
    """
    list_url_articles = []
    liste_url = [
        'http://recherche.lefigaro.fr/recherche/cyclone/?datemin'
        '=01-01-2008&datemax=02-02-2018&page='
    ]
    for url in liste_url:
        for i in range(1, 5):
            urli = url + str(i)
            soup = utils.recovery_flux_url_rss(urli)
            for h3 in soup.find_all('h2',
                                    attrs={'class': 'fig-profil-headline'}):
                for a in h3.find_all('a'):
                    if 'http://' in a.get('href'):
                        list_url_articles.append(a.get('href'))
                    else:
                        list_url_articles.append('http://www.lefigaro.fr' +
                                                 a.get('href'))
    return list_url_articles
def collect_url_articles():
    """
    Collects the list of search-result URLs on liberation.fr matching the
    searched keyword. The article content is not in these HTML pages:
    collect_url_bis must be called afterwards to reach it.
    Returns:
        list_url_articles: list of URLs (strings)
    """
    liste_url = [
        'http://www.liberation.fr/recherche/?sort=-publication_date'
        '_time&q=cyclone&period=custom&period_start_day=1&period_'
        'start_month=1&period_start_year=2000&period_end_day=29&'
        'period_end_month=1&period_end_year=2018&editorial_source'
        '=&paper_channel=460&page='
    ]
    list_url_articles = []
    for url in liste_url:
        for i in range(1, 18):
            time.sleep(15)
            urli = url + str(i)
            soup = utils.recovery_flux_url_rss(urli)
            for h3 in soup.find_all('h3', attrs={'class': 'live-title'}):
                for a in h3.find_all('a'):
                    if 'http://' in a.get('href'):
                        list_url_articles.append(a.get('href'))
                    else:
                        list_url_articles.append('http://www.liberation.fr' +
                                                 a.get('href'))
    return list_url_articles
def collect_url_articles():
    """
    Returns:
        - list of article URLs on lepoint.fr for the chosen category
    list_category = ["inondation", "typhon", "ouragan", "cyclone", "seisme",
                     "tremblement+de+terre"]
    """
    list_url_articles = []
    cat = "seisme"
    url = ("http://www.lepoint.fr/recherche/index.php?query=" + cat +
           "&date_from=01%2F01%2F2000&date_to=31%2F01%2F2018&type=ARTICLE")
    soup_url = utils.recovery_flux_url_rss(url)
    # Number of results pages, read from the pagination block
    derniere_page = 2  # fallback in case the pagination block is missing
    for ol in soup_url.find_all('ol'):
        if ol.get("class") == ['pagination', 'bottom']:
            for li in ol.find_all('li'):
                for a in li.find_all('a'):
                    if not a.get("class"):
                        derniere_page = int(li.get_text())
    # First results page: skip links pointing to journalist profile pages
    for article in soup_url.find_all('article'):
        for div in article.find_all('div'):
            if div.get("class") == ['col', 'pls']:
                for a in div.find_all('a'):
                    debut_url = "http://www.lepoint.fr"
                    if not re.search('journalistes', str(a.get("href"))):
                        list_url_articles.append(debut_url + a.get("href"))
    print(len(list_url_articles))
    # Remaining results pages
    for i in range(2, derniere_page):
        time.sleep(61)
        url = ("http://www.lepoint.fr/recherche/index.php?query=" + cat +
               "&date_from=01%2F01%2F2000&date_to=06%2F02%2F2018&type="
               "ARTICLE&page=" + str(i))
        soup_url = utils.recovery_flux_url_rss(url)
        for article in soup_url.find_all('article'):
            for div in article.find_all('div'):
                if div.get("class") == ['col', 'pls']:
                    for a in div.find_all('a'):
                        debut_url = "http://www.lepoint.fr"
                        new_url = debut_url + a.get("href")
                        list_url_articles.append(new_url)
                        if re.search("journalistes", str(a.get("href"))):
                            list_url_articles.remove(new_url)
                        if re.search("frhttp", str(a.get("href"))):
                            list_url_articles.remove(new_url)
    return list_url_articles
def recovery_link_new_articles_noob_crawler():
    """
    Returns:
        - list of the URLs of every article in every category
    """
    list_category = [
        "inondation", "inondations", "typhon", "typhons", "ouragan",
        "ouragans", "cyclone", "cyclones", "seisme", "seismes",
        "tremblement+de+terre", "tremblements+de+terre"
    ]
    # Sections of the site that are not proper articles
    excluded = [r"/galeries-photos/", r"/cinema/", r"/video/", r"/magazine/",
                r"/qui-a-dit/"]
    article_noob = []
    for cat in list_category:
        url = "https://recherche.nouvelobs.com/?referer=nouvelobs&q=" + cat
        soup = utils.recovery_flux_url_rss(url)
        for h2 in soup.find_all('h2'):
            if h2.get("class") == ['title']:
                for a in h2.find_all('a'):
                    href = str(a.get("href"))
                    if not any(re.search(pat, href) for pat in excluded):
                        if re.search('www', href):
                            article_noob.append(a.get("href"))
        # Results pages 2 to 20 for the same keyword
        for i in range(2, 21):
            url_noob = ("http://recherche.nouvelobs.com/?p=" + str(i) +
                        "&q=" + cat)
            soup_url = utils.recovery_flux_url_rss(url_noob)
            for h2 in soup_url.find_all('h2'):
                if h2.get("class") == ['title']:
                    for a in h2.find_all('a'):
                        href = str(a.get("href"))
                        if not any(re.search(pat, href) for pat in excluded):
                            if re.search('www', href):
                                article_noob.append(a.get("href"))
    return article_noob
def collect_articles():
    """
    Returns:
        - list containing the information of every collected article
    """
    list_url_articles = collect_url_articles()
    list_new_articles = []
    i = 0
    for url_article in list_url_articles:
        i += 1
        if i % 10 == 0:
            time.sleep(61)
        soup = utils.recovery_flux_url_rss(url_article)
        # Article title
        for div in soup.find_all('div'):
            if div.get("class") == ["page-title"]:
                title = div.get_text()
        # Article author(s)
        list_authors = []
        for span in soup.find_all('span'):
            if span.get("rel") == ["author"]:
                list_authors.append(span.get_text())
        # Publication date
        date_publi = ""
        for div in soup.find_all('div'):
            if div.get("class") == ['reset-text', 'art-date-infos', 'mts',
                                    'list-view']:
                for balise_time in div.find_all('time'):
                    date = balise_time.get("datetime")
                    date_publi = date[0:10]
        # Article content (lead paragraph + body)
        content = ""
        for h2 in soup.find_all('h2'):
            if h2.get('class') == ['art-chapeau']:
                content += h2.get_text() + " "
        for div in soup.find_all('div'):
            if div.get('class') == ['art-text']:
                for p in div.find_all('p'):
                    content += p.get_text() + " "
        new_article = utils.recovery_article(title, 'LePoint', list_authors,
                                             date_publi, content, 'seisme')
        if not utils.is_empty(new_article):
            list_new_articles.append(new_article)
    return list_new_articles
def collect_url_bis(list_url_articles):
    """
    For each search-result URL, retrieves the URL of the corresponding AMP
    page, which holds the article content.
    Returns:
        url_final: list of URLs (strings)
    """
    url_final = []
    for url in list_url_articles:
        soup = utils.recovery_flux_url_rss(url)
        for link in soup.find_all('link'):
            href = link.get('href')
            # Keep the <link> that points to the AMP version of this article:
            # both the article path (url[25:]) and "amphtml" must appear
            if href and url[25:] in href and "amphtml" in href:
                url_final.append(href)
    return url_final
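# A minimal usage sketch of the Liberation pipeline. Assumption: the
# Liberation collect_url_articles (and not the Le Point function of the same
# name) is the one in scope, e.g. because the two live in separate modules.
# demo_liberation_crawl is an illustrative name only.
def demo_liberation_crawl():
    search_result_urls = collect_url_articles()    # search-result pages
    return collect_url_bis(search_result_urls)     # AMP article pages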
def recovery_information_noob(url_article):
    """
    Arguments:
        - URL of the article whose useful information will be retrieved
    Returns:
        - article information in JSON format
    """
    soup_article = utils.recovery_flux_url_rss(url_article)
    title = soup_article.title.get_text()
    # Publication date, read from the link inside the <time class="date"> tag
    find_date = soup_article.find('time', attrs={"class": "date"})
    for a in find_date.find_all('a'):
        find_valeur = re.compile(r'[0-9]{4}/[0-9]{2}/[0-9]{2}')
        for valeur in find_valeur.finditer(str(a.get("href"))):
            date_p = valeur.group(0)
            date_p = datetime.strptime(date_p,
                                       "%Y/%m/%d").strftime("%Y-%m-%d")
    # Article author(s)
    author = []
    for div in soup_article.find_all('div'):
        if re.search('author', str(div.get("class"))):
            author.append(div.p.span.get_text())
    # Article theme, taken from the breadcrumb
    theme = ""
    for nav in soup_article.find_all('nav'):
        if nav.get("class") == ['breadcrumb']:
            for ol in nav.find_all('ol'):
                for a in ol.find_all('a'):
                    theme = a.get_text()
    # Article content (asides, "lire" links and images are discarded)
    content = ""
    for div in soup_article.find_all('div'):
        if re.search('body', str(div.get("id"))):
            for aside in div.find_all('aside'):
                for p in aside.find_all('p'):
                    p.string = ""
            for p in div.find_all('p'):
                for a in p.find_all('a'):
                    if a.get("class") == ['lire']:
                        a.string = ""
                for img in p.find_all('img'):
                    p.string = ""
                content += p.get_text() + " "
    article = utils.recovery_article(title, 'NouvelObservateur', author,
                                     date_p, content, theme)
    return article
def info_articles(article_link):
    """
    Retrieves the different elements contained in an article: title, date,
    author and content.
    Arguments:
        article_link: URL of the article
    Returns:
        an article with its different elements
    """
    soup = utils.recovery_flux_url_rss(article_link)
    title = soup.title.string
    newspaper = "Le Monde"
    # Article theme
    if soup.find("li", class_="ariane z2"):
        theme = soup.find("li", class_="ariane z2").find("a").get_text()
    else:
        theme = 'Forum'
    # Author of the article
    if soup.find("span", class_="auteur"):
        if soup.find("span", class_="auteur").a:
            author = soup.find("span", class_="auteur").find("a").get_text()
        else:
            author = soup.find("span", class_="auteur").get_text()
        author = re.sub(r"\s\s+", " ", author)
        author = re.sub(r"^ ", "", author)
    else:
        author = ""
    # Publication date (falls back to today's date if none is found)
    da = re.search(r"\d{4}-\d{2}-\d{2}", soup.find("time").get("datetime"))
    if da:
        date_p = date.datetime.strptime(da[0],
                                        "%Y-%m-%d").strftime("%d/%m/%Y")
    else:
        date_p = str(date.datetime.now().strftime("%d/%m/%Y"))
    # Article content
    content = ""
    for div in soup.find_all(
            'div', attrs={'class': 'contenu_article js_article_body'}):
        for p in div.find_all('p'):
            content += p.get_text() + " "
    # content = unidecode.unidecode(re.sub(r"\s\s+", " ", content))
    new_article = utils.recovery_article(title, newspaper, [author], date_p,
                                         content, theme)
    return new_article
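# A minimal usage sketch of the Nouvel Observateur pipeline, assuming the two
# functions above and the `time` import are available. The periodic sleep
# mirrors the throttling convention used in collect_articles().
# demo_nouvelobs_crawl is an illustrative name only.
def demo_nouvelobs_crawl():
    urls = recovery_link_new_articles_noob_crawler()
    articles = []
    for k, url in enumerate(urls, start=1):
        if k % 10 == 0:
            time.sleep(61)
        articles.append(recovery_information_noob(url))
    return articles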