Пример #1
0
class Document(object):
    """
    Document object represents webpage, article, or other text, to be processed.

    Attributes set by the constructor:
        title, text, domain -- stored exactly as passed in.
        full_text -- title and text joined by a single newline.
        _entities -- the EntityCollection built by entities(), or None
                     until entities() has been called.
    """
    def __init__(self, title, text, domain):
        """
        Init from title, body text, and domain name.
        """
        self.title = title
        self.text = text
        self.domain = domain

        self.full_text = u"{title}\n{text}".format(title=self.title, text=self.text)

        # Must exist before news() runs: news() tests this attribute to decide
        # whether entities() still has to be called. Without this default,
        # calling news() first raised AttributeError.
        self._entities = None

    def entities(self, verbose=False):
        """
        Fetches context info and returns final entities.

        Builds a new EntityCollection from full_text on every call and stores
        it on self._entities. Indices are looked up in the body text only
        (self.text), not in full_text.

        Returns the collection's verbose() result when verbose is true,
        otherwise its output() result.
        """
        self._entities = EntityCollection(self.full_text)
        self._entities.fetch_info()
        self._entities.sort()
        self._entities.find_indices_in_text(self.text)
        if verbose:
            return self._entities.verbose()
        else:
            return self._entities.output()

    def news(self):
        """
        Fetches news articles related to the given document.

        Calls entities() first when no (truthy) entity collection is stored
        yet, then asks the collection to fetch news and returns its ``news``
        attribute.
        """
        # Truthiness test kept from the original: None triggers the lazy
        # build. NOTE(review): if EntityCollection is falsy when empty, an
        # empty collection is rebuilt here as well -- confirm that is intended.
        if not self._entities:
            self.entities()
        self._entities.fetch_news()
        return self._entities.news

    # html for entities now inserted on frontend
    #
    # def html(self):
    #     if not self._entities:
    #         self.entities()
    #     output = self.text
    #     all = []
    #     for i, entity in enumerate(self._entities.output()):
    #         for indices in entity['indices']:
    #             elt = {}
    #             elt['indices'] = indices
    #             elt['i'] = i+1
    #             elt['text'] = entity['text']
    #             all.append(elt)
    #     all.sort(key=lambda e: e['indices'][0])
    #     offset = 0
    #     for elt in all:
    #         start = offset + elt['indices'][0]
    #         end = offset + elt['indices'][1]
    #         original = output[start:end]
    #         sub = u'<span class="entity">{0}<span data-index="{1}" class="index">{1}</span></span>'.format(original, elt['i'])
    #         output = "".join([output[:start],sub,output[end:]])
    #         offset += len(sub) - len(original)
    #     return output
Пример #2
0
 def entities(self, verbose=False):
     """
     Build this document's entity collection and return its entities.

     A new EntityCollection is created from ``self.full_text`` and stored on
     ``self._entities``; it then fetches its info, is sorted, and has its
     indices located in ``self.text``. Returns ``verbose()`` of the
     collection when ``verbose`` is true, ``output()`` otherwise.
     """
     # Store on the instance first so the attribute is set even if a later
     # step raises, exactly as before.
     collection = self._entities = EntityCollection(self.full_text)
     collection.fetch_info()
     collection.sort()
     collection.find_indices_in_text(self.text)
     return collection.verbose() if verbose else collection.output()
Пример #3
0
 def entities(self, verbose=False):
     """
     Create the entity collection for this document and return the result.

     The collection is built from ``self.full_text`` and kept on
     ``self._entities``. After fetching info and sorting, entity indices are
     located in ``self.text``. The non-verbose form (``output()``) is
     returned unless ``verbose`` is true, in which case ``verbose()`` is.
     """
     # The instance attribute is assigned before any processing step runs.
     self._entities = found = EntityCollection(self.full_text)
     found.fetch_info()
     found.sort()
     found.find_indices_in_text(self.text)
     if not verbose:
         return found.output()
     return found.verbose()
Пример #4
0
class Document(object):
    """
    Document object represents webpage, article, or other text, to be processed.

    Attributes set by the constructor:
        title, text, domain -- stored exactly as passed in.
        full_text -- title and text joined by a single newline.
        _entities -- the EntityCollection built by entities(), or None
                     until entities() has been called.
    """
    def __init__(self, title, text, domain):
        """
        Init from title, body text, and domain name.
        """
        self.title = title
        self.text = text
        self.domain = domain

        self.full_text = u"{title}\n{text}".format(title=self.title,
                                                   text=self.text)

        # Must exist before news() runs: news() tests this attribute to decide
        # whether entities() still has to be called. Without this default,
        # calling news() first raised AttributeError.
        self._entities = None

    def entities(self, verbose=False):
        """
        Fetches context info and returns final entities.

        Builds a new EntityCollection from full_text on every call and stores
        it on self._entities. Indices are looked up in the body text only
        (self.text), not in full_text.

        Returns the collection's verbose() result when verbose is true,
        otherwise its output() result.
        """
        self._entities = EntityCollection(self.full_text)
        self._entities.fetch_info()
        self._entities.sort()
        self._entities.find_indices_in_text(self.text)
        if verbose:
            return self._entities.verbose()
        else:
            return self._entities.output()

    def news(self):
        """
        Fetches news articles related to the given document.

        Calls entities() first when no (truthy) entity collection is stored
        yet, then asks the collection to fetch news and returns its ``news``
        attribute.
        """
        # Truthiness test kept from the original: None triggers the lazy
        # build. NOTE(review): if EntityCollection is falsy when empty, an
        # empty collection is rebuilt here as well -- confirm that is intended.
        if not self._entities:
            self.entities()
        self._entities.fetch_news()
        return self._entities.news