Example #1
    def html(self, fileids=None, categories=None):
        """
        Returns the HTML content of each document, cleaning it using
        the readability-lxml library.
        """
        for doc in self.docs(fileids, categories):
            try:
                yield Paper(doc).summary()
            except Unparseable as e:
                print("Could not parse HTML: {}".format(e))
                continue
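
Here `Paper` is an alias for the `Document` class from readability-lxml, and `Unparseable` is its parsing exception. A minimal sketch of the assumed imports and usage; the reader class name and corpus path are illustrative, not from the source:

    # Assumed imports behind the snippet above (readability-lxml).
    from readability.readability import Document as Paper
    from readability.readability import Unparseable

    # Hypothetical reader exposing docs() and the html() method shown above.
    corpus = HTMLCorpusReader('./corpus')
    for clean_html in corpus.html():
        # summary() returns the cleaned HTML markup of the main content block
        print(clean_html[:80])
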
Example #2
    def html(self, fileids=None, categories=None, readability=True):
        """
        Returns the HTML content from each JSON document for every file in
        the corpus, ensuring that it exists. Note that this simply returns
        the HTML strings; it doesn't perform any parsing of the HTML.

        If readability is True, readability-cleaned HTML is returned.
        """
        # Returns a generator of documents.
        html = self.fields('content', fileids, categories)
        if readability:
            for doc in html:
                try:
                    yield Paper(doc).summary()
                except Unparseable as e:
                    print("Could not parse HTML: {}".format(e))
        else:
            for doc in html:
                yield doc
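
This variant pulls the HTML strings out of a JSON field and lets the caller skip the cleaning step. A hedged usage sketch; the `JsonCorpusReader` name is illustrative, and `fields()` is assumed to yield the 'content' value per document:

    # Hypothetical usage: compare raw vs. readability-cleaned output for one document.
    reader = JsonCorpusReader('./corpus')       # illustrative reader with fields()/html()
    raw = next(reader.html(readability=False))  # raw HTML string from the JSON field
    clean = next(reader.html())                 # same document after Paper(...).summary()
    print(len(raw), len(clean))
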
Example #3
    def docs(self, fileids=None, categories=None):
        """
        Returns the events after POS-tagging them.
        """
        # The POS tagger is assumed to be initialized elsewhere
        # (the original loaded spacy's 'fr_core_news_sm' model).

        # Resolve the fileids and the categories
        fileids = self.resolve(fileids, categories)

        # Retrieve events
        for fileid in fileids:
            paras = []
            event = self.get_event(fileid)
            assert event is not None
            # Build paragraphs
            # Limitation: one paragraph for the description
            paras.append(event['name'])
            html = Paper(event['description']).summary()
            soup = bs4.BeautifulSoup(html, 'lxml')
            for element in soup.find_all(self.htmltags):
                paras.append(element.text)
            soup.decompose()

            # Return tags
            yield [[self.tagger.pos_tag(sent) for sent in sent_tokenize(para)]
                   for para in paras]
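
The generator above yields one nested list per document: paragraphs, then sentences, each sentence a list of (token, tag) pairs. `self.tagger.pos_tag` is the reader's own wrapper around its spacy model; a stand-in sketch using NLTK (purely illustrative, and requiring the punkt and averaged_perceptron_tagger data) shows the same nesting:

    # Stand-in for the reader's tagger wrapper, using NLTK instead of spacy.
    from nltk import pos_tag, word_tokenize
    from nltk.tokenize import sent_tokenize

    paras = ["Concert at the park. Free entry.", "Doors open at 8pm."]
    tagged = [[pos_tag(word_tokenize(sent)) for sent in sent_tokenize(para)]
              for para in paras]
    # tagged[paragraph][sentence] is a list of (token, POS) tuples
    print(tagged[0][0][:3])
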
Example #4
def textgetter(url):
    """Scrapes web news and returns the content

    Parameters
    ----------

    url : str
        web address to news report

    Returns
    -------
    answer : dict
        Python dictionary with key/value pairs for:
            text (str) - Full text of article
            url (str) - url to article
            title (str) - extracted title of article
            author (str) - name of extracted author(s)
            base (str) - base url of where article was located
            provider (str) - string of the news provider from url
            published_date (str,isoformat) - extracted date of article
            top_image (str) - extracted url of the top image for article

    """
    global done
    TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'li']

    # regexes for the URL check (raw strings, with the dot after www escaped)
    s = re.compile(r'(http://|https://)([A-Za-z0-9_\.-]+)')
    u = re.compile(r"(http://|https://)(www\.)?(.*)(\.[A-Za-z0-9]{1,4})$")
    if s.search(url):
        site = u.search(s.search(url).group()).group(3)
    else:
        site = None
    answer = {}
    # check that it is a URL
    if s.search(url):
        # serve cached results instead of re-fetching
        if url in done:
            yield done[url]
            return
        try:
            r = requests.get(url, verify=False, timeout=1)
        except Exception:
            # the request failed entirely; fill with empty values and stop
            done[url] = "Unable to reach website."
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider'] = site
            answer['published_date'] = None
            answer['text'] = "Unable to reach website."
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url
            yield answer
            return

        if r.status_code != 200:
            # the site responded, but not successfully; fill and stop
            done[url] = "Unable to reach website."
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider'] = site
            answer['published_date'] = None
            answer['text'] = "Unable to reach website."
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url
            yield answer
            return

        if len(r.content) > 500:
            article = Article(url)
            article.download(input_html=r.content)
            article.parse()
            if len(article.text) >= 200:
                answer['author'] = ", ".join(article.authors)
                answer['base'] = s.search(url).group()
                answer['provider'] = site
                answer['published_date'] = article.publish_date
                if isinstance(article.publish_date, datetime.datetime):
                    answer['published_date'] = article.publish_date.astimezone(
                        pytz.utc).isoformat()

                answer['text'] = article.text
                answer['title'] = article.title
                answer['top_image'] = article.top_image
                answer['url'] = url

            else:
                # newspaper pulled too little text; fall back to readability-lxml
                doc = Paper(r.content)
                data = doc.summary()
                title = doc.title()
                soup = BeautifulSoup(data, 'lxml')
                newstext = " ".join([l.text for l in soup.find_all(TAGS)])

                if len(newstext) > 200:
                    answer['author'] = None
                    answer['base'] = s.search(url).group()
                    answer['provider'] = site
                    answer['published_date'] = None
                    answer['text'] = newstext
                    answer['title'] = title
                    answer['top_image'] = None
                    answer['url'] = url
                else:
                    newstext = " ".join([
                        l.text
                        for l in soup.find_all('div', class_='field-item even')
                    ])
                    done[url] = newstext
                    answer['author'] = None
                    answer['base'] = s.search(url).group()
                    answer['provider'] = site
                    answer['published_date'] = None
                    answer['text'] = newstext
                    answer['title'] = title
                    answer['top_image'] = None
                    answer['url'] = url
        else:
            # the response body was too short to contain an article
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider'] = site
            answer['published_date'] = None
            answer['text'] = 'No text returned'
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url
        yield answer
    else:
        # not a valid URL: s.search(url) is None here, so no base can be extracted
        answer['author'] = None
        answer['base'] = None
        answer['provider'] = site
        answer['published_date'] = None
        answer['text'] = 'This is not a proper url'
        answer['title'] = None
        answer['top_image'] = None
        answer['url'] = url
        yield answer
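
`textgetter` is a generator and relies on a module-level `done` cache; since it calls `requests.get(..., verify=False)`, suppressing the TLS warning is sensible. A minimal driver sketch, with the URL illustrative:

    # Minimal driver for the generator above.
    import urllib3

    urllib3.disable_warnings()  # textgetter disables certificate verification
    done = {}                   # the cache the function expects at module level

    for answer in textgetter("https://example.com/some-article"):
        if isinstance(answer, dict):  # cached entries may be plain strings
            print(answer['provider'], answer['title'])
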
Example #5
def textgetter(url):
    """Scrapes web news and returns the content
    Parameters
    ----------
    url : str
        web address to news report
    Returns 
    -------
    
    answer : dict
        Python dictionary with key/value pairs for:
            text (str) - Full text of article
            url (str) - url to article
            title (str) - extracted title of article
            author (str) - name of extracted author(s)
            base (str) - base url of where article was located
            provider (str) - string of the news provider from url
            published_date (str,isoformat) - extracted date of article
            top_image (str) - extracted url of the top image for article
    """
    global done
    TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'h7', 'p', 'li']

    # regexes for the URL check (raw strings, with the dot after www escaped)
    s = re.compile(r'(http://|https://)([A-Za-z0-9_\.-]+)')
    u = re.compile(r"(http://|https://)(www\.)?(.*)(\.[A-Za-z0-9]{1,4})$")
    if s.search(url):
        site = u.search(s.search(url).group()).group(3)
    else:
        site = None
    answer = {}
    # check that it is a URL
    if s.search(url):
        # serve cached results instead of re-fetching
        if url in done:
            yield done[url]
            return
        try:
            # make a request to the url
            r = requests.get(url, verify=False, timeout=1)
        except Exception:
            # if the url does not return data, set to empty values and stop
            done[url] = "Unable to reach website."
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider'] = site
            answer['published_date'] = None
            answer['text'] = "Unable to reach website."
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url
            answer['keywords'] = None
            answer['summary'] = None
            yield answer
            return
        # if the url does not return successfully, set to empty values and stop
        if r.status_code != 200:
            done[url] = "Unable to reach website."
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider'] = site
            answer['published_date'] = None
            answer['text'] = "Unable to reach website."
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url
            answer['keywords'] = None
            answer['summary'] = None
            yield answer
            return

        # test if the length of the content is greater than 500; if so, fill data
        if len(r.content) > 500:
            # set article url
            article = Article(url)
            # newspaper's download() kwarg differs between Python 3 and Python 2
            if int(platform.python_version_tuple()[0]) == 3:
                article.download(input_html=r.content)
            elif int(platform.python_version_tuple()[0]) == 2:
                article.download(html=r.content)
            # parse the url, then run newspaper's nlp() for keywords/summary
            article.parse()
            article.nlp()
            # if parsing pulled enough text, fill in the data
            if len(article.text) >= 200:
                answer['author'] = ", ".join(article.authors)
                answer['base'] = s.search(url).group()
                answer['provider'] = site
                answer['published_date'] = article.publish_date
                answer['keywords'] = article.keywords
                answer['summary'] = article.summary
                # convert the date to isoformat; fall back for naive dates
                if isinstance(article.publish_date, datetime.datetime):
                    try:
                        answer['published_date'] = article.publish_date.astimezone(
                            pytz.utc).isoformat()
                    except Exception:
                        answer['published_date'] = article.publish_date.isoformat()

                answer['text'] = article.text
                answer['title'] = article.title
                answer['top_image'] = article.top_image
                answer['url'] = url
            # if newspaper didn't pull enough text, fall back to readability-lxml
            else:
                doc = Paper(r.content)
                data = doc.summary()
                title = doc.title()
                soup = BeautifulSoup(data, 'lxml')
                newstext = " ".join([l.text for l in soup.find_all(TAGS)])

                # as we did above, accept the text if it's longer than 200 chars
                if len(newstext) > 200:
                    answer['author'] = None
                    answer['base'] = s.search(url).group()
                    answer['provider'] = site
                    answer['published_date'] = None
                    answer['text'] = newstext
                    answer['title'] = title
                    answer['top_image'] = None
                    answer['url'] = url
                    answer['keywords'] = None
                    answer['summary'] = None
                # as a last resort, scrape a site-specific div with BeautifulSoup
                else:
                    newstext = " ".join([
                        l.text
                        for l in soup.find_all(
                            'div', class_='field-item even')
                    ])
                    done[url] = newstext
                    answer['author'] = None
                    answer['base'] = s.search(url).group()
                    answer['provider'] = site
                    answer['published_date'] = None
                    answer['text'] = newstext
                    answer['title'] = title
                    answer['top_image'] = None
                    answer['url'] = url
                    answer['keywords'] = None
                    answer['summary'] = None
        # if the response body was too short, fill with empty values
        else:
            answer['author'] = None
            answer['base'] = s.search(url).group()
            answer['provider'] = site
            answer['published_date'] = None
            answer['text'] = 'No text returned'
            answer['title'] = None
            answer['top_image'] = None
            answer['url'] = url
            answer['keywords'] = None
            answer['summary'] = None
        yield answer

    # the else clause to catch an invalid url being passed in
    else:
        # s.search(url) is None here, so no base can be extracted
        answer['author'] = None
        answer['base'] = None
        answer['provider'] = site
        answer['published_date'] = None
        answer['text'] = 'This is not a proper url'
        answer['title'] = None
        answer['top_image'] = None
        answer['url'] = url
        answer['keywords'] = None
        answer['summary'] = None
        yield answer
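
Because this version also runs newspaper's `nlp()`, each yielded dict carries `keywords` and `summary` alongside the fields of the previous example. A small batch-driver sketch; the URLs are illustrative:

    # Illustrative batch run over several URLs, collecting a few fields per result.
    done = {}
    rows = []
    for url in ["https://example.com/a", "not-a-url"]:
        for answer in textgetter(url):
            if isinstance(answer, dict):  # cached entries may be plain strings
                rows.append((answer['url'], answer['title'], answer['keywords']))
    print(rows)
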