def fill(self):
    """Load documents and vocabulary from the zip archive at ``self.path``.

    The archive must contain exactly ``test.txt``, ``train.txt`` and
    ``vocab.txt`` (in any order).  Parsed documents are appended to
    ``self.documents_train`` / ``self.documents_test`` as lists of ints,
    and ``self.id_to_words``, ``self.words_to_id`` and ``self.num_wrd``
    are rebuilt from ``vocab.txt``.

    ``grouper`` and ``split`` are helpers expected to be in scope in the
    surrounding module.

    Returns:
        ``self``, so the call can be chained.

    Raises:
        Exception: if the archive members are not exactly the three
            expected files.
    """
    expected = ['test.txt', 'train.txt', 'vocab.txt']

    # The context managers close the archive and every member handle,
    # including when parsing fails half-way.
    with zipfile.ZipFile(self.path) as zf:
        names = zf.namelist()
        # Member order inside an archive depends on how it was created,
        # so compare the sorted names instead of the raw listing.
        if sorted(names) != expected:
            raise Exception(
                'Collection arch must be contain only this files: test.txt, train.txt, vocab.txt'
                ' but find %s' % names)

        for member, documents in (('train.txt', self.documents_train),
                                  ('test.txt', self.documents_test)):
            with zf.open(member) as lines:
                # grouper(lines, 2) presumably yields (id line, document
                # line) pairs -- confirm against its definition.  Only the
                # second element of each pair is parsed.
                for text_id, doc in grouper(lines, 2):
                    documents.append(
                        [int(token)
                         for token in doc.decode('utf-8').strip().split()])

        with zf.open('vocab.txt') as vocab_file:
            vocab_text = vocab_file.read().decode('utf8')

    # Each non-empty result of split(line) has to be a (key, word) pair;
    # empty results (e.g. from blank lines) are dropped.
    entries = [split(line) for line in vocab_text.split('\n')]
    raw_vocab = dict(entry for entry in entries if len(entry))

    self.id_to_words = {int(key): word for key, word in raw_vocab.items()}
    self.words_to_id = {
        word: word_id for word_id, word in self.id_to_words.items()}

    self.num_wrd = len(self.id_to_words)

    # A single parenthesised argument prints the same text whether
    # ``print`` is a statement or a function.
    print('Read %s num doc train = %s test = %s num wrd %s' %
          (self.path, len(self.documents_train), len(self.documents_test),
           self.num_wrd))

    return self
Пример #2
0
    def fill(self):
        """Load documents and vocabulary from the zip archive at ``self.path``.

        The archive must contain exactly ``test.txt``, ``train.txt`` and
        ``vocab.txt`` (in any order).  Parsed documents are appended to
        ``self.documents_train`` / ``self.documents_test`` as lists of
        ints, and ``self.id_to_words``, ``self.words_to_id`` and
        ``self.num_wrd`` are rebuilt from ``vocab.txt``.

        ``grouper`` and ``split`` are helpers expected to be in scope in
        the surrounding module.

        Returns:
            ``self``, so the call can be chained.

        Raises:
            Exception: if the archive members are not exactly the three
                expected files.
        """
        expected = ['test.txt', 'train.txt', 'vocab.txt']

        # The context managers close the archive and every member handle,
        # including when parsing fails half-way.
        with zipfile.ZipFile(self.path) as zf:
            # Member order inside an archive depends on how it was created,
            # so compare the sorted names instead of the raw listing.
            if sorted(zf.namelist()) != expected:
                raise Exception('Collection arch must be contain only this files: test.txt, train.txt, vocab.txt')

            for member, documents in (('train.txt', self.documents_train),
                                      ('test.txt', self.documents_test)):
                with zf.open(member) as lines:
                    # grouper(lines, 2) presumably yields (id line,
                    # document line) pairs -- confirm against its
                    # definition.  Only the second element is parsed.
                    for text_id, doc in grouper(lines, 2):
                        documents.append(
                            [int(token)
                             for token in doc.decode('utf-8').strip().split()])

            with zf.open('vocab.txt') as vocab_file:
                vocab_text = vocab_file.read().decode('utf8')

        # Each non-empty result of split(line) has to be a (key, word)
        # pair; empty results (e.g. from blank lines) are dropped.
        entries = [split(line) for line in vocab_text.split('\n')]
        raw_vocab = dict(entry for entry in entries if len(entry))

        self.id_to_words = {int(key): word for key, word in raw_vocab.items()}
        self.words_to_id = {
            word: word_id for word_id, word in self.id_to_words.items()}

        self.num_wrd = len(self.id_to_words)

        # A single parenthesised argument prints the same text whether
        # ``print`` is a statement or a function.
        print('Read %s num doc train = %s test = %s num wrd %s' %
              (self.path, len(self.documents_train), len(self.documents_test),
               self.num_wrd))

        return self
    def train(self, collection):
        """Pick frequent collocations and add them to the vocabulary.

        Train and test documents are processed together in three steps:

        1. Per document, the pairs produced by ``ngrams(..., 2)`` and by
           ``all_pairs`` are counted over the word groups returned by
           ``grouper(..., 5)``.  A pair of two different words becomes a
           candidate when its ``ngrams`` count divided by its
           ``all_pairs`` count exceeds ``self.sigma`` and its ``ngrams``
           count exceeds ``self.min_occur``.
        2. Candidates are counted over all documents flattened into one
           word sequence; the ``self.top`` most frequent ones are stored,
           with their counts, in ``collection.bigrams``.
        3. Each stored bigram receives a new id following the largest id
           in ``collection.words_to_id``, and ``collection.id_to_words``
           maps that id to the two words joined by ``'_'``.

        ``grouper``, ``ngrams``, ``all_pairs`` and ``dicts_sum`` are
        helpers expected to be in scope in the surrounding module.

        Args:
            collection: object exposing ``documents_train``,
                ``documents_test``, ``words_to_id`` and ``id_to_words``;
                it is updated in place.
        """
        corpus = collection.documents_train + collection.documents_test

        # Step 1: gather candidate pairs, one document at a time.
        candidates = set()
        for document in corpus:
            neighbor_totals = defaultdict(int)
            window_totals = defaultdict(int)

            for chunk in grouper(list(document), 5):
                adjacent = Counter(ngrams(chunk, 2))
                paired = Counter(all_pairs(chunk))
                dicts_sum(neighbor_totals, adjacent)
                dicts_sum(window_totals, paired)

            for pair in neighbor_totals:
                occurrences = neighbor_totals[pair]
                # The ratio is computed before any threshold is tested.
                ratio = occurrences * 1.0 / window_totals[pair]
                if (ratio > self.sigma and occurrences > self.min_occur
                        and pair[0] != pair[1]):
                    candidates.add(pair)

        # collection.bigrams holds the candidate set until the final
        # dictionary below replaces it.
        collection.bigrams = candidates

        # Step 2: rank candidates by their frequency in the whole corpus.
        # The documents are flattened into a single sequence first.
        flat_words = [word for document in corpus for word in document]
        frequencies = Counter(
            pair for pair in ngrams(flat_words, 2) if pair in candidates)
        ranked = sorted(frequencies.items(), key=itemgetter(1), reverse=True)
        collection.bigrams = dict(ranked[:self.top])

        # Step 3: register every kept bigram under a fresh consecutive id.
        first_free_id = max(collection.words_to_id.values()) + 1
        for new_id, bigram in enumerate(collection.bigrams, first_free_id):
            collection.words_to_id[bigram] = new_id
            left = collection.id_to_words[bigram[0]]
            right = collection.id_to_words[bigram[1]]
            collection.id_to_words[new_id] = left + '_' + right