def pipe(self, key, value):
    """Fan a document out to one record per distinct word it contains.

    Args:
        key: Incoming record key; not used by this step.
        value: The document message to process.

    Yields:
        ``(str(word), copy)`` pairs, where ``copy`` is a freshly created
        ``document_pb2.Document`` populated from ``value`` via ``CopyFrom``.
        Each distinct word text produced by ``util.IterWords`` is emitted
        exactly once.
    """
    source_doc = value
    # Collapse repeated words so every word keys at most one output record.
    distinct_texts = {token.text for token in util.IterWords(source_doc)}
    for text in distinct_texts:
        # Each record gets its own message instance rather than sharing one.
        clone = document_pb2.Document()
        clone.CopyFrom(source_doc)
        yield str(text), clone
def pipe(self, key, value):
    """Emit a per-document ``Keyword`` record for every distinct word.

    Args:
        key: Incoming record key; not used by this step.
        value: The document message to process.

    Yields:
        ``(str(word), keyword)`` pairs. Each ``keyword`` is a new
        ``document_pb2.Keyword`` with ``word`` set to the word text,
        ``doc_count`` set to 1 and ``total_doc_count`` set to ``self.n``.
    """
    source_doc = value
    # De-duplicate so a word contributes a single record for this document.
    distinct_texts = {token.text for token in util.IterWords(source_doc)}
    for text in distinct_texts:
        entry = document_pb2.Keyword()
        entry.word = text
        entry.doc_count = 1
        # self.n is read per record, so a wordless document never touches it.
        entry.total_doc_count = self.n
        yield str(text), entry
def pipe(self, key, value):
    """Emit a ``Keyword`` carrying a label prior for every distinct word.

    Args:
        key: Incoming record key; not used by this step.
        value: The document message to process.

    Yields:
        ``(str(word), keyword)`` pairs. Each ``keyword`` is a new
        ``document_pb2.Keyword`` whose ``word`` is the word text and whose
        ``prior`` entry for the document's label is set to 1. The label is
        whatever ``Label(value, self.label, self.classes)`` returns.
    """
    source_doc = value
    # The label is resolved once, before any words are collected.
    doc_label = Label(source_doc, self.label, self.classes)
    distinct_texts = {token.text for token in util.IterWords(source_doc)}
    for text in distinct_texts:
        entry = document_pb2.Keyword()
        entry.word = text
        entry.prior[doc_label] = 1
        yield str(text), entry
def pipe(self, key, value):
    """Score one keyword against one document and attach it to the document.

    Counts how many words of the document equal ``keyword.word``, stores that
    count and the result of ``self.score`` on the keyword, then appends the
    keyword to ``doc.keywords``.

    Args:
        key: Incoming record key; not used by this step.
        value: A ``(doc, keyword)`` pair.

    Yields:
        ``(str(doc.url), doc)`` at most once, and only when
        ``keyword.doc_count / keyword.total_doc_count`` is strictly greater
        than ``self.min_df``. The keyword and document are mutated either way.
    """
    doc, keyword = value

    matches = 0
    words_seen = 0
    # enumerate(..., 1) leaves words_seen equal to the number of words visited;
    # it stays 0 when the document has no words.
    for words_seen, token in enumerate(util.IterWords(doc), 1):
        if token.text == keyword.word:
            matches += 1

    keyword.term_count = matches
    keyword.tf_idf = self.score(
        matches, words_seen, keyword.doc_count, keyword.total_doc_count)
    doc.keywords.extend([keyword])

    # float() forces true division regardless of the interpreter's int rules.
    doc_frequency = float(keyword.doc_count) / keyword.total_doc_count
    if doc_frequency > self.min_df:
        yield str(doc.url), doc
def weight(self, doc, word):
    """Return how many words of ``doc`` have text equal to ``word``.

    Args:
        doc: The document passed to ``util.IterWords``.
        word: The text to compare each word's ``text`` attribute against.

    Returns:
        The number of matching words as an int; 0 when nothing matches.
    """
    occurrences = 0
    for token in util.IterWords(doc):
        if token.text == word:
            occurrences += 1
    return occurrences