Пример #1
0
 def gen_batches(self, docs, y=None):
     """Yield batches of size ``self.batch_size`` built via ``self.gen_batch``.

     When labels *y* are given, (doc, label) pairs are shuffled first and each
     batch is handed to ``gen_batch`` as separate doc/label sequences; a short
     final batch is padded with items re-sampled (without replacement) from
     the whole pool.
     """
     labelled = y is not None
     pool = list(zip(docs, y)) if labelled else list(docs)
     if labelled:
         get_rng().shuffle(pool)
     batch_size = self.batch_size
     for start in range(0, len(pool), batch_size):
         batch = pool[start:start + batch_size]
         shortfall = batch_size - len(batch)
         if shortfall > 0:
             # pad the trailing partial batch by resampling existing items
             fill = get_rng().choice(len(pool), shortfall, False)
             batch.extend(pool[j] for j in fill)
         if labelled:
             yield self.gen_batch(*zip(*batch))
         else:
             yield self.gen_batch(batch)
Пример #2
0
 def gen_batches(self, docs, y=None):
     """Generate fixed-size batches via ``self.gen_batch``.

     With labels present, docs are zipped with *y*, shuffled, and each batch
     is unpacked into parallel doc/label sequences; a trailing incomplete
     batch is topped up by resampling items without replacement.
     """
     has_labels = y is not None
     items = list(zip(docs, y)) if has_labels else list(docs)
     if has_labels:
         get_rng().shuffle(items)
     total = len(items)
     for offset in range(0, total, self.batch_size):
         chunk = items[offset:offset + self.batch_size]
         missing = self.batch_size - len(chunk)
         if missing > 0:
             # top up the short final chunk with randomly chosen items
             chunk.extend(items[k]
                          for k in get_rng().choice(total, missing, False))
         yield self.gen_batch(*zip(*chunk)) if has_labels \
             else self.gen_batch(chunk)
Пример #3
0
 def fit(self, docs, y):
     """Fit one copy of ``self.joint_model`` per class.

     Builds a shared vocabulary over all docs, records log class-frequency
     scores, then trains a deep copy of the joint model on each class's
     documents for 20 epochs with a 0.9 learning-rate decay per epoch.

     docs: sequence of documents.
     y: sequence of class labels aligned with docs.
     """
     self.joint_model.build_vocab(docs)
     freqs = Counter(y)
     # BUG FIX: y is a sequence of labels (it is zipped with docs and fed to
     # Counter above), so it has no .keys(); derive the class set from the
     # frequency counter instead, matching the sibling implementation.
     classes = sorted(freqs.keys())
     self.class_scores = np.log([freqs[c] for c in classes])
     self.models = [deepcopy(self.joint_model) for _ in classes]
     for class_, model in zip(classes, self.models):
         cur_docs = [doc for doc, c in zip(docs, y) if c == class_]
         for epoch in range(20):
             logger.info('epoch {}'.format(epoch + 1))
             get_rng().shuffle(cur_docs)
             model.train(cur_docs)
             # anneal the learning rate after each pass over the class corpus
             model.alpha *= 0.9
             model.min_alpha = model.alpha
Пример #4
0
 def fit(self, docs, y):
     """Train a per-class copy of the shared model.

     Builds the joint vocabulary, stores log class-frequency scores, then for
     each class trains a deep-copied model for 20 epochs, decaying the
     learning rate by 0.9 after every epoch.
     """
     self.joint_model.build_vocab(docs)
     label_counts = Counter(y)
     class_labels = sorted(label_counts.keys())
     self.class_scores = np.log([label_counts[label] for label in class_labels])
     self.models = [deepcopy(self.joint_model) for _ in class_labels]
     for label, model in zip(class_labels, self.models):
         class_docs = [doc for doc, doc_label in zip(docs, y)
                       if doc_label == label]
         epoch = 0
         while epoch < 20:
             logging.info('epoch {}'.format(epoch + 1))
             get_rng().shuffle(class_docs)
             model.train(class_docs)
             # anneal the learning rate between epochs
             model.alpha *= 0.9
             model.min_alpha = model.alpha
             epoch += 1
Пример #5
0
 def _fit_embedding_word(self, embedding_type, construct_docs, tokenize_, d=None):
     """Build word embeddings and fit them to *construct_docs*.

     Depending on *embedding_type*:
       - 'google': loads pre-trained GoogleNews vectors from a local pickle
         (d is ignored; the pre-trained width is used).
       - 'twitter': trains a skip-gram Word2Vec of size d on this object's
         train/unsup/val/test corpora.
       - anything else: starts from an empty (0, d) matrix, so every word
         gets a random vector from the initializer below.

     Returns the fitted ``Embeddings`` estimator of the final pipeline.
     """
     if embedding_type == 'google':
         embeddings_ = joblib.load('data/google/GoogleNews-vectors-negative300.pickle')
         # normalize to a plain namespace: X is the vector matrix, vocab maps
         # word -> row index into X
         embeddings_ = SimpleNamespace(X=embeddings_.syn0, vocab={w: v.index for w, v in embeddings_.vocab.items()})
     elif embedding_type == 'twitter':
         estimator = Pipeline([
             ('tokenize', MapCorporas(tokenize_)),
             # skip-gram (sg=1) with negative sampling; presumably the fit is
             # cached via self.memory by CachedFitTransform — TODO confirm
             ('word2vec', MergeSliceCorporas(CachedFitTransform(Word2Vec(
                 sg=1, size=d, window=10, hs=0, negative=5, sample=1e-3, min_count=1, iter=20, workers=16
             ), self.memory))),
         ]).fit([self.train_docs, self.unsup_docs[:10**6], self.val_docs, self.test_docs])
         embeddings_ = estimator.named_steps['word2vec'].estimator
         embeddings_ = SimpleNamespace(X=embeddings_.syn0, vocab={w: v.index for w, v in embeddings_.vocab.items()})
     else:
         # no pre-trained vectors: empty matrix, unknown words handled by rand
         embeddings_ = SimpleNamespace(X=np.empty((0, d)), vocab={})
     estimator = Pipeline([
         ('tokenize', MapCorporas(tokenize_)),
         # 0.25 is chosen so the unknown vectors have approximately the same variance as google pre-trained ones
         ('embeddings', MapCorporas(Embeddings(
             embeddings_, rand=lambda shape: get_rng().uniform(-0.25, 0.25, shape).astype('float32'),
             include_zero=True
         ))),
     ])
     estimator.fit(construct_docs)
     return estimator.named_steps['embeddings'].estimator
Пример #6
0
 def transform(self, docs):
     """Randomly corrupt each document with character-level edits.

     For every doc, a geometric number of positions is drawn and each chosen
     position undergoes delete, insert, or substitute (delete only when no
     alphabet is configured). Yields the edited docs one at a time.
     """
     for doc in docs:
         out = '' if isinstance(doc, str) else []
         num_edits = min(len(doc), get_rng().geometric(self.p) - 1)
         positions = sorted(get_rng().choice(len(doc), num_edits, replace=False))
         last = -1
         for pos in positions:
             # 0 = delete, 1 = insert, 2 = substitute
             op = get_rng().choice(3) if self.alphabet else 0
             if op == 1:
                 # keep the char at pos, then append a random char after it
                 out += doc[last + 1:pos + 1] + self.alphabet[get_rng().choice(len(self.alphabet))]
             elif op == 2:
                 # drop the char at pos and append a random char instead
                 out += doc[last + 1:pos] + self.alphabet[get_rng().choice(len(self.alphabet))]
             else:
                 # delete: copy everything up to (not including) pos
                 out += doc[last + 1:pos]
             last = pos
         out += doc[last + 1:]
         yield out
Пример #7
0
 def transform(self, docs):
     """Replace a few words in each tagged doc with WordNet synonyms.

     Each doc is a sequence of (word, tag, confidence) triples; only words
     whose tag is a WordNet POS character (expected 'n', 'v', 'a', 'r' —
     note the membership test also accepts substrings like 'va') are
     candidates. A geometric number of positions is picked, and each picked
     word is replaced by a geometrically-ranked synonym when one exists.
     Yields the (possibly modified) word lists.
     """
     for doc in docs:
         res = [word for word, tag, confidence in doc]
         is_ = get_rng().choice(len(doc), min(len(doc), get_rng().geometric(0.5)), replace=False)
         for i in is_:
             word, tag, confidence = doc[i]
             words = []
             if tag not in 'nvar':
                 continue
             for synset in wn.synsets(word, pos=tag):
                 for lemma in synset.lemma_names():
                     replace_word = lemma.replace('_', ' ')
                     if replace_word.lower() != word.lower():
                         words.append(replace_word)
             # BUG FIX: numpy's geometric() is supported on {1, 2, ...}, so
             # the previous `geometric(0.5)` could never select words[0] (and
             # a single-synonym list was never used). Shift to a 0-based
             # index, matching the `geometric(...) - 1` convention used by
             # the character-noise transform in this file.
             word_i = get_rng().geometric(0.5) - 1
             if word_i < len(words):
                 res[i] = words[word_i]
         yield res
Пример #8
0
 def transform(self, docs):
     """Apply random character-level noise (delete/insert/substitute).

     Draws a geometric number of edit positions per doc; each position gets
     one of three edits chosen uniformly (deletion only, when no alphabet is
     configured). Yields one edited doc per input doc.
     """
     for doc in docs:
         edited = '' if isinstance(doc, str) else []
         n_edits = min(len(doc), get_rng().geometric(self.p) - 1)
         edit_points = sorted(
             get_rng().choice(len(doc), n_edits, replace=False))
         cursor = -1
         for point in edit_points:
             # actions: 0 = delete, 1 = insert, 2 = substitute
             action = get_rng().choice(3) if self.alphabet else 0
             if action == 0:
                 # skip the character at this point
                 edited += doc[cursor + 1:point]
             elif action == 1:
                 # keep the character, then add a random one after it
                 edited += doc[cursor + 1:point + 1] \
                     + self.alphabet[get_rng().choice(len(self.alphabet))]
             else:
                 # replace the character with a random one
                 edited += doc[cursor + 1:point] \
                     + self.alphabet[get_rng().choice(len(self.alphabet))]
             cursor = point
         edited += doc[cursor + 1:]
         yield edited
Пример #9
0
 def _fit_embedding_char(embedding_type, alphabet, d=None):
     """Create character embeddings over *alphabet*.

     'onehot' yields an identity matrix; any other type yields uniform random
     vectors of width d in [-0.25, 0.25). Returns an ``Embeddings`` estimator
     with a reserved zero index.
     """
     size = len(alphabet)
     if embedding_type == 'onehot':
         matrix = np.identity(size, dtype='float32')
     else:
         matrix = get_rng().uniform(-0.25, 0.25, (size, d)).astype('float32')
     lookup = {char: index for index, char in enumerate(alphabet)}
     return Embeddings(SimpleNamespace(vocab=lookup, X=matrix), include_zero=True)