Exemplo n.º 1
0
 def pipe(self, key, value):
   """Emit one (word, document copy) pair per distinct word text.

   Args:
     key: Unused by this method.
     value: The document; it is handed to util.IterWords and copied into
       every emitted message via CopyFrom.

   Yields:
     Tuples of (str(word), copy) where copy is a new document_pb2.Document
     populated from `value`.
   """
   distinct_texts = {token.text for token in util.IterWords(value)}
   for text in distinct_texts:
     # A separate message is built per word, so outputs never share state.
     clone = document_pb2.Document()
     clone.CopyFrom(value)
     yield str(text), clone
Exemplo n.º 2
0
  def pipe(self, key, value):
    """Emit one Keyword record per distinct word text in the document.

    Args:
      key: Unused by this method.
      value: The document handed to util.IterWords.

    Yields:
      Tuples of (str(word), keyword) where keyword is a new
      document_pb2.Keyword with `word` set to the word text, `doc_count`
      set to 1 and `total_doc_count` set to self.n.
    """
    distinct_texts = {token.text for token in util.IterWords(value)}
    for text in distinct_texts:
      # Duplicates were removed above, so each word counts this document once.
      record = document_pb2.Keyword(
          word=text,
          doc_count=1,
          total_doc_count=self.n)
      yield str(text), record
Exemplo n.º 3
0
    def pipe(self, key, value):
        """Emit one Keyword per distinct word text, tagged with the doc's label.

        Args:
            key: Unused by this method.
            value: The document; passed to Label(...) and util.IterWords.

        Yields:
            Tuples of (str(word), keyword) where keyword is a new
            document_pb2.Keyword whose `word` is the word text and whose
            `prior` entry for the computed label is 1.
        """
        # The label is computed once, before any words are read.
        doc_label = Label(value, self.label, self.classes)
        distinct_texts = {token.text for token in util.IterWords(value)}
        for text in distinct_texts:
            record = document_pb2.Keyword()
            record.word = text
            record.prior[doc_label] = 1
            yield str(text), record
Exemplo n.º 4
0
  def pipe(self, key, value):
    """Score one keyword against one document and attach it to the document.

    Args:
      key: Unused by this method.
      value: A (doc, keyword) pair. `keyword` is updated in place with
        `term_count` and `tf_idf`, then appended to `doc.keywords`.

    Yields:
      (str(doc.url), doc), but only when doc_count / total_doc_count of the
      keyword is strictly greater than self.min_df. The in-place updates
      above happen whether or not anything is yielded.
    """
    doc, keyword = value

    texts = [token.text for token in util.IterWords(doc)]
    words_in_doc = len(texts)
    occurrences = sum(1 for text in texts if text == keyword.word)

    keyword.term_count = occurrences
    keyword.tf_idf = self.score(
        occurrences,
        words_in_doc,
        keyword.doc_count,
        keyword.total_doc_count)
    doc.keywords.extend([keyword])

    # float() forces true division even where `/` on ints would truncate.
    doc_frequency = float(keyword.doc_count) / keyword.total_doc_count
    if doc_frequency > self.min_df:
      yield str(doc.url), doc
Exemplo n.º 5
0
 def weight(self, doc, word):
     """Return how many items from util.IterWords(doc) have text equal to word.

     Args:
         doc: The document handed to util.IterWords.
         word: The text to compare each item's `.text` against.

     Returns:
         The number of matching items (0 when there are none).
     """
     occurrences = 0
     for token in util.IterWords(doc):
         if token.text == word:
             occurrences += 1
     return occurrences