Example #1
 # Lazily yields sparse positive (shifted) PMI tensor batches, one per batch of
 # sentences; symmetric, nonneg, shift, gatherer, and self are captured from the
 # enclosing scope.
 def sparse_tensor_batches(batch_size=1000, symmetric=symmetric):
     batches = batch_generator2(self.model,
                                self.sentences_generator(),
                                batch_size=batch_size)
     for batch in batches:
         sparse_ppmi_tensor_pair = gatherer.create_pmi_tensor(
             batch=batch,
             positive=True,
             debug=False,
             symmetric=symmetric,
             log_info=False,
             neg_sample_percent=0.0 if nonneg else 0.05,
             pmi=True,
             shift=shift,
         )
         yield sparse_ppmi_tensor_pair
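
The snippet above is a closure taken out of its enclosing method, so names like batch_generator2, gatherer, nonneg, and shift are not defined here. The sketch below reproduces only the batching pattern itself: a generator that lazily yields one sparse (indices, values) pair per batch so the full tensor is never materialized. Both mock_* helpers are stand-ins invented for illustration, not part of the original project.

import numpy as np

rng = np.random.default_rng(0)

def mock_batches(num_batches=3, batch_size=1000, vocab_size=5000):
    # Stand-in for batch_generator2: yields arrays of (word_id, context_id) pairs.
    for _ in range(num_batches):
        yield rng.integers(0, vocab_size, size=(batch_size, 2))

def mock_create_pmi_tensor(batch, shift=0.0):
    # Stand-in for gatherer.create_pmi_tensor: returns a sparse (indices, values)
    # pair with values clipped at zero, mimicking positive (shifted) PMI.
    values = np.maximum(rng.normal(loc=1.0, size=len(batch)) - shift, 0.0)
    return batch, values

def sparse_tensor_batches(batch_size=1000, shift=0.0):
    # Same shape as above: one (indices, values) pair per batch, produced lazily.
    for batch in mock_batches(batch_size=batch_size):
        yield mock_create_pmi_tensor(batch, shift=shift)

for indices, values in sparse_tensor_batches(batch_size=4):
    print(indices.shape, values.shape)
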
Example #2
        # Same generator with a GloVe branch: when is_glove is set, the full
        # (indices, values) arrays are built once, log-transformed, then shuffled
        # and re-chunked on every pass; otherwise batches are built lazily as in
        # Example #1.
        def sparse_tensor_batches(batch_size=1000, symmetric=symmetric):
            if is_glove:

                # Yield successive n-sized chunks of an iterable.
                def grouper(n, iterable):
                    it = iter(iterable)
                    while True:
                        chunk = tuple(itertools.islice(it, n))
                        if not chunk:
                            return
                        yield chunk

                (indices, values) = gatherer.create_pmi_tensor(
                    batch=None,
                    positive=True,
                    debug=False,
                    symmetric=True,
                    log_info=True,
                    pmi=False,
                )
                values = np.log(values)  # log of the raw values (pmi=False above), GloVe-style
                for i in range(50):
                    indices_shuffled, values_shuffled = shuffle(
                        indices, values)  # sklearn's shuffle implementation
                    print('GloVe iteration number {}...'.format(i))
                    for sampled_indices, sampled_values in zip(
                            grouper(batch_size, indices_shuffled),
                            grouper(batch_size, values_shuffled)):
                        yield (sampled_indices, sampled_values)
            else:  # not is_glove
                batches = batch_generator2(self.model,
                                           self.sentences_generator(),
                                           batch_size=batch_size)
                for batch in batches:
                    sparse_ppmi_tensor_pair = gatherer.create_pmi_tensor(
                        batch=batch,
                        positive=True,
                        debug=False,
                        symmetric=symmetric,
                        log_info=False,
                        neg_sample_percent=neg_sample_percent,
                        pmi=True,
                        shift=shift,
                    )
                    yield sparse_ppmi_tensor_pair
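
The is_glove branch relies on sklearn.utils.shuffle applying one consistent permutation to both arrays before every pass. Below is a small, self-contained sketch of that shuffle-and-rechunk epoch loop with synthetic data; numpy slicing stands in for the itertools-based grouper, and epoch_batches is a name invented for this sketch.

import numpy as np
from sklearn.utils import shuffle

indices = np.arange(10).reshape(-1, 2)           # pretend co-occurrence indices
values = np.log(np.arange(1, 6, dtype=float))    # log of raw values, as above

def epoch_batches(indices, values, batch_size=2, num_epochs=3):
    for _ in range(num_epochs):
        # One consistent permutation is applied to both arrays.
        idx_shuf, val_shuf = shuffle(indices, values)
        for start in range(0, len(val_shuf), batch_size):
            yield idx_shuf[start:start + batch_size], val_shuf[start:start + batch_size]

for batch_indices, batch_values in epoch_batches(indices, values):
    print(batch_indices.shape, batch_values.shape)
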
Example #3
 # Multi-gatherer variant: for each batch, builds one positive PMI tensor per
 # (shift, gatherer) pair and yields parallel lists of indices and values.
 def sparse_tensor_batches(batch_size=1000):
     batches = batch_generator2(
         self.model,
         self.sentences_generator(num_sents=self.num_sents),
         batch_size=batch_size)
     for batch in batches:
         pairlist = [
             gatherer.create_pmi_tensor(
                 batch=batch,
                 positive=True,
                 debug=False,
                 symmetric=True,
                 log_info=False,
                 neg_sample_percent=0.0,
                 pmi=True,
                 shift=shift,
             ) for (shift, gatherer) in zip(shifts, gatherers)
         ]
         yield ([x[0] for x in pairlist], [x[1] for x in pairlist])
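
The final yield above transposes a list of (indices, values) pairs into a pair of lists. A tiny illustration with placeholder data, showing that zip(*pairlist) is an equivalent idiom:

pairlist = [(('i1', 'i2'), ('v1', 'v2')), (('i3',), ('v3',))]  # placeholder pairs

indices_lists = [x[0] for x in pairlist]   # as written in the example
values_lists = [x[1] for x in pairlist]

indices_zip, values_zip = (list(t) for t in zip(*pairlist))    # equivalent transpose
assert indices_zip == indices_lists and values_zip == values_lists
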
    def get_pmi_gatherer(self, n):
        # Cache-or-build: load a previously pickled PMIGatherer if one exists on
        # disk, otherwise populate a new one from the corpus and pickle it.
        import gc
        fname = 'gatherer_{}_{}_{}.pkl'.format(self.num_articles, self.min_count, n)
        if os.path.exists(fname):
            with open(fname, 'rb') as f:
                t = time.time()
                # Disabling gc speeds up dill.load on objects with many small members.
                gc.disable()
                gatherer = dill.load(f)
                gc.enable()
                print('Loading gatherer took {} secs'.format(time.time() - t))
        else:
            # batch_size doesn't matter here, but higher is probably better
            # (in terms of threading & speed).
            batches = batch_generator2(
                self.model,
                self.sentences_generator(num_articles=self.num_articles),
                batch_size=1000)
            gatherer = PMIGatherer(self.model, n=n)
            if self.num_articles <= 1e4:
                gatherer.populate_counts(batches, huge_vocab=False)
            else:
                gatherer.populate_counts(batches, huge_vocab=True, min_count=5)

            # Cache the populated gatherer so later runs can skip the counting pass.
            with open(fname, 'wb') as f:
                t = time.time()
                gc.disable()
                dill.dump(gatherer, f)
                gc.enable()
                print('Dumping gatherer took {} secs'.format(time.time() - t))
        return gatherer
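
The same cache-or-build pattern, extracted into a standalone helper as a sketch. The cached function and its build_fn argument are hypothetical names, not part of the original code; dill and gc.disable()/gc.enable() are used exactly as above, with try/finally added so the collector is re-enabled even if (de)serialization raises.

import gc
import os
import time

import dill


def cached(path, build_fn):
    # Load a pickled object from path if it exists; otherwise build and pickle it.
    if os.path.exists(path):
        with open(path, 'rb') as f:
            t = time.time()
            gc.disable()
            try:
                obj = dill.load(f)
            finally:
                gc.enable()
            print('Loading {} took {:.1f} secs'.format(path, time.time() - t))
        return obj

    obj = build_fn()
    with open(path, 'wb') as f:
        gc.disable()
        try:
            dill.dump(obj, f)
        finally:
            gc.enable()
    return obj
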