from collections import defaultdict

from zope.index.text.textindex import TextIndex
from zope.index.text.lexicon import Lexicon, Splitter
from zope.index.text.cosineindex import CosineIndex

from dedupe import tfidf  # assumed location of the _createCanopies helper


def tfIdfBlock(self, data, field):
    '''Creates a TF/IDF canopy of a given set of data'''

    # Pipeline element that drops this field's stop words. The class body
    # executes in the enclosing scope, so it can read `self` and `field`.
    class CustomStopWordRemover(object):
        stop_words = self.stop_words[field].copy()

        def process(self, lst):
            return [w for w in lst if w not in self.stop_words]

    # TF/IDF text index scored by cosine similarity.
    index = TextIndex(Lexicon(Splitter(), CustomStopWordRemover()))
    index.index = CosineIndex(index.lexicon)

    # zope.index wants small integer document ids, so keep a mapping
    # back to the real record ids.
    index_to_id = {}
    base_tokens = {}

    for i, (record_id, doc) in enumerate(data, 1):
        index_to_id[i] = record_id
        base_tokens[i] = doc
        index.index_doc(i, doc)

    # One canopy per similarity threshold configured for this field.
    canopies = (tfidf._createCanopies(index, base_tokens, threshold, field)
                for threshold in self.tfidf_fields[field])

    for key, index_canopy in canopies:
        # Translate internal document ids back to record ids.
        id_canopy = {index_to_id[k]: index_to_id[v]
                     for k, v in index_canopy.items()}
        self.canopies[key] = defaultdict(str, id_canopy)
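# Illustrative usage sketch, not part of the original source: the same
# TextIndex/CosineIndex pattern used in tfIdfBlock, run standalone on a
# few hypothetical records to show the docid bookkeeping and the cosine
# scores the canopies are built from. Assumes only zope.index.
def _demo_cosine_index():
    data = [('a', 'west 27th street'),
            ('b', 'west 28th street'),
            ('c', 'e 93rd st')]

    index = TextIndex(Lexicon(Splitter()))
    index.index = CosineIndex(index.lexicon)

    index_to_id = {}
    for i, (record_id, doc) in enumerate(data, 1):
        index_to_id[i] = record_id
        index.index_doc(i, doc)

    # apply() scores every indexed document against the query terms;
    # records 'a' and 'b' match here, 'c' does not.
    for docid, score in index.apply('west street').items():
        print(index_to_id[docid], score)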
def __init__(self, field, stop_words=None):
    self.field = field
    stop_words = stop_words or []  # avoid a mutable default argument

    # Lexicon pipeline: split text into words, drop stop words, then
    # escape tokens the query parser would treat as operators.
    splitter = Splitter()
    stop_word_remover = CustomStopWordRemover(stop_words)
    operator_escaper = OperatorEscaper()
    lexicon = Lexicon(splitter, stop_word_remover, operator_escaper)

    self._index = TextIndex(lexicon)
    self._index.index = CosineIndex(self._index.lexicon)

    # Mapping from internal integer docids back to record ids.
    self._i_to_id = {}
    # Cache the lexicon's term parser for query building.
    self._parseTerms = self._index.lexicon.parseTerms
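# The __init__ above relies on two lexicon pipeline elements not shown in
# this fragment. A minimal sketch of what they might look like, assuming
# only zope.index's pipeline protocol (each element exposes process(),
# taking and returning a list of words); the exact operator set and
# escaping scheme in OperatorEscaper are assumptions.
class CustomStopWordRemover(object):
    def __init__(self, stop_words):
        self.stop_words = set(stop_words)

    def process(self, lst):
        return [w for w in lst if w not in self.stop_words]


class OperatorEscaper(object):
    '''Escape words that zope.index's QueryParser would read as operators.'''
    operators = {'AND', 'OR', 'NOT'}

    def process(self, lst):
        # Applied at both index and query time, so the transformation
        # stays consistent between the two.
        return ['\\' + w if w.upper() in self.operators else w for w in lst]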
def stopWords(data):
    '''Identify words that appear in more than max(1000, 5%) of the documents.'''
    index = TextIndex(Lexicon(Splitter()))

    for i, (_, doc) in enumerate(data, 1):
        index.index_doc(i, doc)

    # Document frequency of every word in the lexicon, most frequent first.
    doc_freq = [(len(index.index._wordinfo[wid]), word)
                for word, wid in index.lexicon.items()]
    doc_freq.sort(reverse=True)

    N = float(index.index.documentCount())
    threshold = int(max(1000, N * 0.05))

    # The list is sorted, so stop at the first word under the threshold.
    stop_words = set()
    for frequency, word in doc_freq:
        if frequency > threshold:
            stop_words.add(word)
        else:
            break

    return stop_words
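# Illustrative usage, not from the original source: the frequency threshold
# is max(1000, 5% of N) documents, so with e.g. 50,000 records a word must
# appear in more than 2,500 of them to qualify, while on a toy corpus the
# 1,000-document floor means no word ever qualifies.
if __name__ == '__main__':
    records = [(1, 'main street apartment 1'),
               (2, 'main street apartment 2'),
               (3, 'oak avenue')]
    print(stopWords(records))  # set(): corpus is far below the floor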