Example #1
def recovery_information_noob(url_article):
    """
        Arguments:
            - url of one article
        Returns:
            - informations of the article
    """
    soup_article = utils.recovery_flux_url_rss(url_article)

    title = soup_article.title.get_text()

    # Retrieval of the publication date
    date_p = ""
    find_date = soup_article.find('time', attrs={"class": "date"})
    find_valeur = re.compile(r'[0-9]{4}/[0-9]{2}/[0-9]{2}')
    if find_date is not None:
        for a in find_date.find_all('a'):
            for valeur in find_valeur.finditer(str(a.get("href"))):
                date_p = datetime.strptime(valeur.group(0), "%Y/%m/%d")\
                    .strftime("%Y-%m-%d")

    # Retrieval of the author of the article
    author = []
    for div in soup_article.find_all('div'):
        if re.search('author', str(div.get("class"))):
            author.append(div.p.span.get_text())

    # Retrieval of the article theme
    theme = ""
    for nav in soup_article.find_all('nav'):
        if nav.get("class") == ['breadcrumb']:
            for ol in nav.find_all('ol'):
                for a in ol.find_all('a'):
                    theme = a.get_text()

    # Retrieving the content of the article
    contents = ""
    for div in soup_article.find_all('div'):
        if re.search('body', str(div.get("id"))):
            for aside in div.find_all('aside'):
                for p in aside.find_all('p'):
                    p.string = ""
            for p in div.find_all('p'):
                for a in p.find_all('a'):
                    if a.get("class") == ['lire']:
                        a.string = ""
                for img in p.find_all('img'):
                    p.string = ""
                contents += p.get_text() + " "

    article = utils.recovery_article(title, 'NouvelObservateur', author,
                                     date_p, contents, theme)
    return article
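All of these examples depend on utils.recovery_flux_url_rss, which is not shown. From the way it is used, it apparently downloads the page and returns a parsed BeautifulSoup tree; a minimal sketch under that assumption (not the original helper):

import requests
from bs4 import BeautifulSoup


def recovery_flux_url_rss(url):
    # Assumed reconstruction: fetch the page at `url` and return it
    # parsed with lxml, as every caller above expects.
    response = requests.get(url)
    return BeautifulSoup(response.text, "lxml")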
Example #2
def recovery_information_hum(url_article):
    """
        Arguments:
            - url of one article
        Returns:
            - informations of the article
    """
    soup_article = utils.recovery_flux_url_rss(url_article)

    # Title and theme are read from the page's <meta> properties
    title = ""
    theme = ""
    for meta in soup_article.find_all('meta'):
        if meta.get("property") == 'og:title':
            title = meta.get("content")
        if meta.get("property") == 'article:section':
            theme = meta.get("content")

    # Authors are the <a> links whose href mentions 'auteur'
    author = []
    for h2 in soup_article.find_all('h2'):
        for a in h2.find_all('a'):
            for valeur in re.finditer('auteur', str(a.get("href"))):
                author.append(a.get_text())

    # Publication date, from the article:published_time meta property
    date_p = ""
    for meta in soup_article.find_all('meta'):
        if meta.get("property") == 'article:published_time':
            raw_date = meta.get("content")
            date_p = datetime.strptime(raw_date[0:10],
                                       "%Y-%m-%d").strftime("%d/%m/%Y")

    contents = ""
    for div in soup_article.find_all('div'):
        if div.get("class") == [
            'field',
            'field-name-field-news-chapo',
            'field-type-text-long',
                'field-label-hidden']:
            for p in div.find_all('p'):
                contents += p.get_text()
        if div.get("class") == [
            'field',
            'field-name-field-news-text',
            'field-type-text-long',
                'field-label-hidden']:
            for p in div.find_all('p'):
                contents += p.get_text()

    article = utils.recovery_article(title, 'Humanite',
                                     author, date_p, contents, theme)
    return article
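Example #3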
def recovery_information_fem(url_article):
    """
        Arguments:
            - url of one article
        Returns:
            - informations of the article
    """
    soup_article = utils.recovery_flux_url_rss(url_article)

    title = soup_article.title.get_text()
    title = title.split(" - ")[0]

    date_p = ""
    for meta in soup_article.find_all('meta'):
        if meta.get("property") == 'article:published_time':
            raw_date = meta.get("content")
            date_p = raw_date[0:10]

    author = []
    for meta in soup_article.find_all('meta'):
        if meta.get("property") == 'article:author':
            author.append(meta.get("content"))

    theme = ""
    for link in soup_article.find_all('link'):
        if link.get("rel") == ['Index']:
            link_theme = link.get("href")
            part_link = link_theme.split("/")
            theme = part_link[3]

    contents = ""
    for div in soup_article.find_all('div'):
        if div.get("class") == ['chapo']:
            for p in div.find_all('p'):
                contents += p.get_text() + " "
        if div.get("class") == ['contenu']:
            for p in div.find_all('p'):
                contents += p.get_text() + " "
        if div.get("class") == ['diaporama']:
            for p in div.find_all('p'):
                contents += p.get_text() + " "
    contents = re.sub(r"\s\s+", " ", contents)

    article = utils.recovery_article(title, 'Femina',
                                     author, date_p, contents, theme)
    return article
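Note that the examples disagree on the output date format: recovery_information_hum converts to %d/%m/%Y while recovery_information_fem keeps the raw %Y-%m-%d prefix. If one canonical format is wanted, a hypothetical normalizer (not part of the original code):

from datetime import datetime


def normalize_date(raw_date, out_fmt="%d/%m/%Y"):
    # Keep only the "YYYY-MM-DD" prefix of an ISO timestamp and
    # reformat it; return "" when the value does not parse.
    try:
        return datetime.strptime(raw_date[:10], "%Y-%m-%d").strftime(out_fmt)
    except (TypeError, ValueError):
        return ""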
Example #4
def info_articles(article_link):
    """
        Arguments:
            - url of one article
        Returns:
            - information about the article
    """
    req = requests.get(article_link)
    data = req.text
    soup = BeautifulSoup(data, "lxml")

    title = unidecode.unidecode(soup.find('title').string)

    newspaper = "Le Monde"

    # Article theme, from the breadcrumb when present
    ariane = soup.find("li", class_="ariane z2")
    if ariane:
        theme = ariane.find("a").get_text()
    else:
        theme = 'Forum'

    # Author of the article
    span_author = soup.find("span", class_="auteur")
    if span_author:
        if span_author.a:
            author = span_author.find("a").get_text()
        else:
            author = span_author.get_text()
        author = re.sub(r"\s\s+", " ", author)
        author = re.sub(r"^ ", "", author)
    else:
        author = ""

    # publication date; fall back to today when the page has no date
    match = re.search(r"\d{4}-\d{2}-\d{2}", soup.find("time").get("datetime"))
    if match:
        date_p = date.datetime.strptime(match.group(0),
                                        "%Y-%m-%d").strftime("%d/%m/%Y")
    else:
        date_p = str(date.datetime.now().strftime("%d/%m/%Y"))

    # Article content: every paragraph found under a <div> (a paragraph
    # nested in several divs is appended once per enclosing div)
    content = ""
    for div in soup.find_all('div'):
        for p in div.find_all('p'):
            content += p.get_text() + " "
    content = unidecode.unidecode(re.sub(r"\s\s+", " ", content))

    new_article = utilsg4.recovery_article(
        title, newspaper, [author], date_p, content, theme)

    return new_article
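A sketch of how info_articles would be called; the URL is a placeholder, not a real article:

if __name__ == "__main__":
    article = info_articles("https://www.lemonde.fr/example-article.html")
    print(article)

Example #5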
def recovery_information_sv(url_article):
    """
        Arguments:
            - url of one article
        Returns:
            - informations of the article
    """
    soup_article = utils.recovery_flux_url_rss(url_article)

    # title
    title = ""
    for h1 in soup_article.find_all("h1"):
        if h1.get("class") == ["like-h1"]:
            title = h1.get_text()

    # date (keep only the YYYY-MM-DD prefix of the datetime attribute)
    t_date = soup_article.find("time")["datetime"][:10].split("-")
    t_date.reverse()
    date = "/".join(t_date)

    # author
    author = []
    for span in soup_article.find_all('span'):
        if span.get("class") == ["author"]:
            author.append(span.span.get_text())

    # content
    content = ""
    for div in soup_article.find_all('div'):
        if div.get("class") == ['content', 'left']:
            for p in div.find_all('p'):
                content += p.get_text() + " "

    # theme
    theme = ""
    for meta in soup_article.find_all('meta'):
        if meta.get("property") == 'article:tag':
            theme = meta.get("content")

    article = utils.recovery_article(title, 'Scienceetvie',
                                     author, date, content, theme)
    return article
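The loop over <meta> tags that filters on the property attribute recurs in several of these examples (og:title, article:section, article:published_time, article:tag). A small hypothetical helper that factors out the pattern:

def get_meta_property(soup, prop):
    # Return the content of the first <meta> whose "property"
    # attribute equals `prop`, or "" when the page has none.
    for meta in soup.find_all('meta'):
        if meta.get("property") == prop:
            return meta.get("content")
    return ""

With it, the theme lookup above reduces to get_meta_property(soup_article, 'article:tag').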
Example #6
def recovery_information_fusc(url):
    """
        Arguments:
            url : string
        Return :
            article : dictionary
        It retrieve for each article the title, newspaper, author, date, theme
    """
    soup = utils.recovery_flux_url_rss(url)

    # retrieve the title, dropping the " | site" suffix when present
    title = soup.title.string
    indice = title.find('|')
    if indice != -1:
        title = title[:indice].rstrip()

    # retrieve the author
    author = []
    tag_author = soup.find('h3', attrs={'itemprop': 'author'})
    if tag_author is not None:
        author.append(tag_author.get_text())

    # retrieve the publication date (dd/mm/yyyy in the <time> tag)
    publi_date = ''
    regex_date = re.search(r'[0-9]{2}/[0-9]{2}/[0-9]{4}', soup.time.string)
    if regex_date:
        publi_date = regex_date.group(0)

    # retrieve the content: paragraphs whose last CSS class matches 'py0p5'
    content = ''
    for p in soup.find_all('p'):
        classes = p.get('class')
        if classes and re.search('py0p5', classes[-1]):
            content += p.get_text()

    # retrieve theme
    delimiter = url.split('/')
    theme = delimiter[3]

    article = utils.recovery_article(title, 'FuturaSciences', author,
                                     publi_date, content, theme)
    return article
Example #7
def recovery_information_lg(url):
    """
        Arguments:
            url : string
        Return :
            article : dictionary
        It retrieve for each article the title, newspaper, author, date, theme
    """
    soup = utils.recovery_flux_url_rss(url)

    # Retrieving the title
    balise_title = soup.title.string
    sep = balise_title.split("—")
    title = unidecode.unidecode("—".join(sep[:-1]))

    # Retrieval of the author and the publication date
    author = []
    date_p = ""
    for span in soup.find_all('span'):
        if span.get("class") == ['context']:
            author.append(span.a.get_text())
            for valeur in re.finditer(r'[0-9]{2}/[0-9]{2}/[0-9]{4}',
                                      str(span)):
                date_p = valeur.group(0)

    # Retrieving the theme
    theme = ""
    for ul in soup.find_all('ul'):
        if ul.get("class") == ['post-categories']:
            for li in ul.find_all('li'):
                theme = li.get_text()

    # Retrieving the content of the article
    contents = ""
    for div in soup.find_all('div'):
        if div.get("class") == ['content']:
            for p in div.find_all('p'):
                contents += p.get_text() + " "
    new_article = utils.recovery_article(title, "Le Gorafi", author, date_p,
                                         contents, theme)
    return new_article
Example #8
def get_article(url):
    """Prend en argument une adresse url (url) et retourne une article au format
        {
            "title" : str,
            "newspaper" : str,
            "author" : [str],
            "date_publi" : str,
            "content" : str,
            "theme" : str
            }
    """
    soup = utils.recovery_flux_url_rss(url)
    article = soup.find("article")
    # Title of the article
    title = article.find("h1").get_text()
    # Empty list when there is no author, otherwise the list of author(s)
    # (the byline separates multiple authors with " et ")
    author_label = article.find("header").find("p", class_="authorsign-label")
    authors = [] if author_label is None \
        else unidecode(author_label.get_text()).split(" et ")
    # Publication date of the article
    date_tab = article.find("time").get("datetime")[:10].split("-")
    date_tab.reverse()
    date_pub = "/".join(date_tab)
    # Theme of the article
    theme = article.find("ol", class_="breadcrumb-list")\
        .find_all("li")[1].find("span").get_text()
    # Content of the article
    content = ""
    for p in article.find("div", class_="content").find_all("p"):
        content = content + p.get_text()
    # Name of the newspaper
    newspaper = soup.find("footer").find(has_copyright).find("a").get_text()
    # Remove the \n, \r and \t characters from the content
    regex = re.compile(r'[\n\r\t]')
    content = regex.sub("", content)
    return utils.recovery_article(unidecode(title), unidecode(newspaper),
                                  authors, str(date_pub), unidecode(content),
                                  unidecode(theme))
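has_copyright is used above but not defined in this snippet. Since it is passed to footer.find() it must be a BeautifulSoup tag filter; a plausible reconstruction, assuming the credit line is the element whose text contains a © sign:

def has_copyright(tag):
    # Assumed implementation: match the first tag whose text carries a
    # copyright sign, so that .find("a") then yields the newspaper name.
    return "©" in tag.get_text()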
Example #9
def recovery_information_ld(url):
    """
        Arguments:
            - url of one article
        Returns:
            - information about the article
    """
    soup = utilsg4.recovery_flux_url_rss(url)
    # Retrieve the title
    title = ""
    for meta in soup.find_all('meta'):
        if meta.get("property") == 'og:title':
            title = meta.get("content")

    # Retrieve the publication date (dd/mm/yyyy inside the <time> tag)
    date = ""
    for time in soup.find_all('time'):
        if time.get("itemprop") == 'datePublished':
            for valeur in re.finditer(r'[0-9]{2}/[0-9]{2}/[0-9]{4}',
                                      str(time)):
                date = valeur.group(0)

    # Retrieve the author
    author = []
    for div in soup.find_all('div'):
        if div.get("class") == ['article_author']:
            author.append(div.span.get_text())

    # Retrieve the content
    content = ""
    for div in soup.find_all('div'):
        if div.get("itemprop") == 'articleBody':
            for p in div.find_all('p'):
                content += p.get_text() + " "

    # Retrieve the theme
    theme = ""
    for h2 in soup.find_all('h2'):
        if h2.get("itemprop") == 'about':
            theme = h2.get_text()

    article = utilsg4.recovery_article(title, 'La Depeche', author, date,
                                       content, theme)
    return article
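Example #10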
def get_article(url):
    """
        Prend en argument une adresse url (url) et retourne un dictionnaire
    """
    from unidecode import unidecode
    soup = utils.recovery_flux_url_rss(url)
    article = soup.find("article")
    meta = soup.find("meta", property="og:title").get("content")
    tab = meta.split("-")
    n = len(tab)
    newspaper = tab[n - 1]
    theme = tab[n - 2]
    title = "-".join(tab[:n - 2])
    authors = []
    regex = re.compile(r'[\n\r\t]')
    for span in article.find_all("span", class_="author--name"):
        author = regex.sub("", unidecode(span.get_text()))
        authors.append(author.strip())
    date_pub = article.find("span", itemprop="datePublished").get(
        "datetime")[:10].replace("-", "/")
    content = ""
    for div in article.find_all(
        "div",
        class_=[
            "article--intro",
            "article--wysiwyg",
            "article--footnotes"]):
        for p in div.find_all("p"):
            content = content + p.get_text()
    content = regex.sub("", content)
    return utils.recovery_article(
        unidecode(title),
        unidecode(newspaper),
        authors,
        date_pub,
        unidecode(content),
        unidecode(theme))
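utils.recovery_article (utilsg4.recovery_article in some examples) is the other helper these functions share. Given the dictionary format documented in the get_article docstring of Example #8, it presumably just packs its arguments; a minimal sketch under that assumption:

def recovery_article(title, newspaper, author, date_publi, content, theme):
    # Assumed reconstruction matching the format documented in Example #8.
    return {
        "title": title,
        "newspaper": newspaper,
        "author": author,
        "date_publi": date_publi,
        "content": content,
        "theme": theme,
    }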