def fill(self):
    # Requires at module level: import zipfile; from itertools import imap;
    # and the grouper() helper (a sketch follows fill()).
    zf = zipfile.ZipFile(self.path)
    # sorted() makes the check independent of archive member order.
    if sorted(zf.namelist()) != ['test.txt', 'train.txt', 'vocab.txt']:
        raise Exception(
            'Collection archive must contain exactly these files: '
            'test.txt, train.txt, vocab.txt, but found %s' % zf.namelist())
    # Documents are stored as pairs of lines: a text id followed by a
    # whitespace-separated list of word ids.
    for text_id, doc in grouper(zf.open('train.txt'), 2):
        self.documents_train.append(
            map(int, doc.decode('utf-8').strip().split()))
    for text_id, doc in grouper(zf.open('test.txt'), 2):
        self.documents_test.append(
            map(int, doc.decode('utf-8').strip().split()))
    # vocab.txt holds one "<word_id> <word>" pair per line; filter(len, ...)
    # drops blank lines.
    vocab_lines = zf.open('vocab.txt').read().decode('utf-8').split('\n')
    self.id_to_words = dict(
        (int(word_id), word)
        for word_id, word in filter(len, imap(unicode.split, vocab_lines)))
    self.words_to_id = dict(
        (word, word_id) for word_id, word in self.id_to_words.iteritems())
    self.num_wrd = len(self.id_to_words)
    print 'Read %s: num doc train = %s, test = %s, num wrd = %s' % \
        (self.path, len(self.documents_train),
         len(self.documents_test), self.num_wrd)
    return self
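# fill() leans on a grouper() helper defined elsewhere in this project.
# Given that it is called with pairs of lines above, it almost certainly
# follows the standard itertools recipe; a minimal sketch under that
# assumption (the fillvalue default is a guess, not confirmed here):

from itertools import izip_longest


def grouper(iterable, n, fillvalue=None):
    # Collect data into fixed-length chunks, padding the last one:
    # grouper('ABCDE', 2) -> ('A', 'B'), ('C', 'D'), ('E', None).
    args = [iter(iterable)] * n
    return izip_longest(fillvalue=fillvalue, *args)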
def train(self, collection):
    # Requires at module level: from collections import defaultdict, Counter;
    # from operator import itemgetter; plus the ngrams(), all_pairs() and
    # dicts_sum() helpers (sketches follow train()).
    bigrams = dict()
    # Score for a candidate pair: the share of its window co-occurrences
    # that are adjacent.  The lambda is evaluated at call time, so it sees
    # the per-document counters bound inside the loop below.
    collocation_measure = lambda coloc: \
        counts_neighbors[coloc] * 1.0 / counts_windows[coloc]
    for i, document in enumerate(collection.documents_train +
                                 collection.documents_test):
        groups_words = grouper(document, 5)
        counts_neighbors = defaultdict(int)
        counts_windows = defaultdict(int)
        for group in groups_words:
            # Adjacent pairs vs. all pairs within a 5-word window.
            count_neighbor = Counter(ngrams(group, 2))
            count_window = Counter(all_pairs(group))
            dicts_sum(counts_neighbors, count_neighbor)
            dicts_sum(counts_windows, count_window)
        buf = []
        for item in counts_neighbors:
            if (collocation_measure(item) > self.sigma and
                    counts_neighbors[item] > self.min_occur and
                    item[0] != item[1]):
                buf.append((item, (collocation_measure(item),
                                   counts_neighbors[item])))
        bigrams[i] = dict(buf)
    # Union of the per-document candidate bigrams.
    collection.bigrams = set()
    for doc_bigrams in bigrams.itervalues():
        collection.bigrams.update(doc_bigrams)
    # Re-count the candidates over the concatenated token stream and keep
    # only the self.top most frequent ones.
    documents = [wrd
                 for document in (collection.documents_train +
                                  collection.documents_test)
                 for wrd in document]
    bigrams = [bigr for bigr in ngrams(documents, 2)
               if bigr in collection.bigrams]
    collection.bigrams = dict(
        sorted(Counter(bigrams).items(),
               key=itemgetter(1), reverse=True)[:self.top])
    # Register every surviving bigram as a new vocabulary entry "<w1>_<w2>".
    max_v = max(collection.words_to_id.values()) + 1
    for bigram in collection.bigrams.keys():
        collection.words_to_id[bigram] = max_v
        collection.id_to_words[max_v] = '%s_%s' % (
            collection.id_to_words[bigram[0]],
            collection.id_to_words[bigram[1]])
        max_v += 1
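# train() additionally assumes three small helpers.  The sketches below are
# plausible minimal implementations consistent with how they are called
# above; the exact project versions may differ.

from itertools import combinations


def ngrams(tokens, n):
    # Sliding windows of length n: ngrams([1, 2, 3], 2) -> [(1, 2), (2, 3)].
    tokens = list(tokens)
    return zip(*[tokens[i:] for i in xrange(n)])


def all_pairs(group):
    # Every position-ordered pair inside a window.  Adjacent pairs are a
    # subset of these, so counts_windows[pair] >= counts_neighbors[pair]
    # and collocation_measure() never divides by zero.
    return combinations(group, 2)


def dicts_sum(acc, other):
    # Accumulate the counts from `other` into `acc` in place.
    for key, value in other.iteritems():
        acc[key] += value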