def testMerge(self):
    d = Dictionary(self.texts)
    f = Dictionary(self.texts[:3])
    g = Dictionary(self.texts[3:])

    f.merge_with(g)
    self.assertEqual(sorted(d.token2id.keys()), sorted(f.token2id.keys()))
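# A minimal sketch (not part of the test above; the tokens are illustrative) of the
# merge_with semantics the test relies on: ids already present in the left dictionary
# are kept, new tokens from the right dictionary are appended, and the return value is
# a VocabTransform that remaps documents expressed in the right dictionary's ids.
from gensim.corpora import Dictionary

left = Dictionary([["human", "interface"]])
right = Dictionary([["interface", "computer"]])
transformer = left.merge_with(right)

print(left.token2id)                             # e.g. {'human': 0, 'interface': 1, 'computer': 2}
print(transformer[right.doc2bow(["computer"])])  # 'computer' remapped into the merged id space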
def create_dictionary(path):
    dictionary = Dictionary()
    for year in os.listdir(path):
        for month in os.listdir(os.path.join(path, year)):
            dict_temp = corpora.Dictionary(ReadFilesDir(os.path.join(path, year, month)))
            dictionary.merge_with(dict_temp)
            print(month)
    return dictionary
def load_input(dataset_path):  # training_path, test_path
    xy_train = []
    xy_test = []
    (x_train, y_train), (x_test, y_test) = ([], []), ([], [])

    for tag in filter(lambda x: x[0] != '.', listdir(dataset_path)):
        path = dataset_path + "/" + tag
        num_files = len([f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))])
        k = 0
        for file in filter(lambda x: x[0] != '.', listdir(path)):
            k += 1
            f = open(path + "/" + file, "r")
            if k < num_files * 0.8:
                # last element of collection is the tag
                xy_train.append(np.array((clean_str(f.read()) + " " + tag).split()))
            else:
                # last element of collection is the tag
                xy_test.append(np.array((clean_str(f.read()) + " " + tag).split()))

    vocab_train = Dictionary(xy_train)
    vocab_test = Dictionary(xy_test)
    vocab_train.merge_with(vocab_test)

    for xy in xy_train:
        y = xy[-1]
        y_train.append(vocab_train.token2id[y])
        x = np.delete(xy, -1)
        words = []
        for word in x:
            words.append(vocab_train.token2id[word])
        x_train.append(words)

    for xy in xy_test:
        y = xy[-1]
        y_test.append(vocab_train.token2id[y])
        x = np.delete(xy, -1)
        words = []
        for word in x:
            words.append(vocab_train.token2id[word])
        x_test.append(words)

    return (np.array(x_train), np.array(y_train)), (np.array(x_test), np.array(y_test)), vocab_train
def training():
    train_gs = ["train/STS2012-en-train/STS.gs.MSRpar.txt",
                "train/STS2012-en-train/STS.gs.MSRvid.txt",
                "train/STS2012-en-train/STS.gs.SMTeuroparl.txt"]
    train_input = ["train/STS2012-en-train/STS.input.MSRpar.txt",
                   "train/STS2012-en-train/STS.input.MSRvid.txt",
                   "train/STS2012-en-train/STS.input.SMTeuroparl.txt"]
    train_align = ["trainalign/2012/STS.alignment.MSRpar.txt",
                   "trainalign/2012/STS.alignment.MSRvid.txt",
                   "trainalign/2012/STS.alignment.SMTeuroparl.txt"]

    dictionary = Dictionary([])
    features = []
    labels = []
    aligns = []
    for i in range(len(train_input)):
        sentencesA, sentencesB, idfDict, aligns, NEs_A, NEs_B, spw_A, spw_B = read_sentences(train_input[i], train_align[i])
        features += sentence_vector_similarity(sentencesA, sentencesB, embeddings, idfDict, aligns, NEs_A, NEs_B, spw_A, spw_B)
        dictionary.merge_with(Dictionary(sentencesA + sentencesB))
        # read gold standard
        with open(train_gs[i], "rb") as f:
            labels += map(float, f.read().strip().split())

    corpus_A = []
    corpus_B = []
    for i in range(len(train_input)):
        sentencesA, sentencesB, _, _, _, _, _, _ = read_sentences(train_input[i], train_align[i])
        for doc in sentencesA:
            corpus_A.append(dictionary.doc2bow(doc))
        for doc in sentencesB:
            corpus_B.append(dictionary.doc2bow(doc))

    NUM_TPC = 14
    topicModel = LdaModel(corpus_A + corpus_B, num_topics=NUM_TPC)

    assert len(corpus_A) == len(corpus_B) == len(features) == len(labels)
    for i in xrange(len(corpus_A)):
        vectorA = numpy.zeros(NUM_TPC)
        vectorB = numpy.zeros(NUM_TPC)
        for j, prob in topicModel[corpus_A[i]]:
            vectorA[j] = prob
        for j, prob in topicModel[corpus_B[i]]:
            vectorB[j] = prob
        if numpy.linalg.norm(vectorA) == 0 or numpy.linalg.norm(vectorB) == 0:
            features[i].append(0.)
        else:
            features[i].append(cosine_similarity(vectorA, vectorB))

    # train model
    # model = MLPRegressor(hidden_layer_sizes=(100, 100), max_iter=10000,
    #                      activation='logistic')
    model = Ridge()
    model.fit(features, labels)
    return model, topicModel, dictionary
def get_dict():
    global PAD_token
    global SOS_token
    global EOS_token

    dct = Dictionary([['<PAD>']])
    default_dct = Dictionary([DEFAULT_TOKENS])
    dct.merge_with(default_dct)

    PAD_token = dct.token2id['<PAD>']
    SOS_token = dct.token2id['<SOS>']
    EOS_token = dct.token2id['<EOS>']
    return dct
class Sentences:
    def __init__(self, corpus_file, n_docs=-1):
        self.corpus_file = corpus_file
        self.n_docs = n_docs
        self.tp = TextProcessing(dir='')
        self.dictionary = Dictionary()
        self.tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
        self.en_stop = get_stop_words('en')
        self.p_stemmer = PorterStemmer()

    def __iter__(self):
        logging.info("Loading corpus in file %s" % self.corpus_file)
        i = 0
        for line in open(self.corpus_file, 'r'):
            # clean the line into a list of stemmed tokens
            stemmed_tokens = self.tp.clean_line(line)
            # merge the line into the running dictionary; Dictionary() expects a
            # list of documents, and merge_with updates self.dictionary in place
            # (its return value is a VocabTransform, not a Dictionary)
            d2 = Dictionary([stemmed_tokens])
            self.dictionary.merge_with(d2)
            # yield the tokens so the class can be iterated as a streaming corpus
            yield stemmed_tokens
            # count documents and stop once n_docs is reached
            i += 1
            if self.n_docs != -1 and i >= self.n_docs:
                break
            if i % 1000 == 0:
                logging.debug('Document %s loaded' % i)
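# Sketch (an alternative, not from the class above; the lines are illustrative):
# gensim's Dictionary.add_documents gives the same incremental build without
# creating a temporary Dictionary per line.
from gensim.corpora import Dictionary

dictionary = Dictionary()
for line in ["the quick brown fox", "jumps over the lazy dog"]:
    dictionary.add_documents([line.split()])
print(dictionary.token2id)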
def load_inputTrainingTest(training_path, test_path):
    xy_train = []
    xy_test = []
    (x_train, y_train), (x_test, y_test) = ([], []), ([], [])

    for tag in filter(lambda x: x[0] != '.', listdir(training_path)):
        path = training_path + "/" + tag
        for file in filter(lambda x: x[0] != '.', listdir(path)):
            f = open(path + "/" + file, "r")
            # last element of collection is the tag
            xy_train.append(np.array((clean_str(f.read()) + " " + tag).split()))

    for tag in filter(lambda x: x[0] != '.', listdir(test_path)):
        path = test_path + "/" + tag
        for file in filter(lambda x: x[0] != '.', listdir(path)):
            f = open(path + "/" + file, "r")
            # last element of collection is the tag
            xy_test.append(np.array((clean_str(f.read()) + " " + tag).split()))

    vocab_train = Dictionary(xy_train)
    vocab_test = Dictionary(xy_test)
    vocab_train.merge_with(vocab_test)

    for xy in xy_train:
        y = xy[-1]
        y_train.append(vocab_train.token2id[y])
        x = np.delete(xy, -1)
        words = []
        for word in x:
            words.append(vocab_train.token2id[word])
        x_train.append(words)

    for xy in xy_test:
        y = xy[-1]
        y_test.append(vocab_train.token2id[y])
        x = np.delete(xy, -1)
        words = []
        for word in x:
            words.append(vocab_train.token2id[word])
        x_test.append(words)

    return (np.array(x_train), np.array(y_train)), (np.array(x_test), np.array(y_test)), vocab_train
def compile_gensim_vocab(self, tf_vectorizer, vocabulary_outpath):
    '''
    Extract the vocabulary from a fitted sklearn CountVectorizer and save it
    in Gensim's Dictionary format.
    '''
    print('\nCreate and save to file vocabulary from sklearn CountVectorizer '
          'using Gensim Dictionary')
    start = datetime.now()

    # invert the sklearn mapping (token -> id) into an id -> token mapping
    sklearn_vocab = tf_vectorizer.vocabulary_
    vocabulary_gensim = {}
    for key, val in sklearn_vocab.items():
        vocabulary_gensim[val] = key

    vocabulary = Dictionary()
    vocabulary.merge_with(vocabulary_gensim)
    vocabulary.save(vocabulary_outpath)

    end = datetime.now()
    print("  Time taken: {}".format(end - start))
    return vocabulary
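# Sketch (independent of the method above; the tokens are illustrative): merge_with
# also accepts a plain {id: token} mapping, which is what makes the CountVectorizer
# hand-off above work without building an intermediate corpus.
from gensim.corpora import Dictionary

id2word = {0: "apple", 1: "banana"}   # e.g. inverted from CountVectorizer.vocabulary_
vocabulary = Dictionary()
vocabulary.merge_with(id2word)
print(vocabulary.token2id)            # e.g. {'apple': 0, 'banana': 1}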
validation_extra_features.append(
    [feats + token_feat for token_feat in token_feats])
logging.info('Extra features created')

dictionary = Dictionary([["<OOV>", "<PAD>"]])
x_train = [[remove_duplicates_char(token.lower_) for token in doc]
           for doc in train_docs]
train_dictionary = Dictionary(x_train)
train_selected_dictionary = Dictionary(
    [[remove_duplicates_char(token.lower_) for token in doc]
     for doc in train_selected_docs])
train_dictionary.filter_extremes(no_above=0.6, no_below=10)
dictionary.merge_with(train_selected_dictionary)
dictionary.merge_with(train_dictionary)
dictionary.save(join(stg.MODELS_DIR, 'rnn_spacy_tokens_dict'))

x_train_indexed = [[
    dictionary.token2id.get(remove_duplicates_char(token.lower_), 0)
    for token in doc
] for doc in train_docs]
x_validation_indexed = [[
    dictionary.token2id.get(remove_duplicates_char(token.lower_), 0)
    for token in doc
] for doc in validation_docs]

if ARGS.load_embedding_matrix == 'y':
    embedding_matrix = joblib.load(
        filename=join(stg.MODELS_DIR, 'embedding_matrix'))
class Vocab:
    def __init__(self):
        self.dictionary = Dictionary()
        self.dictionary.token2id['<UNK>'] = -1
        self.dictionary.id2token[-1] = '<UNK>'
        self.dictionary.dfs[-1] = 0

    def set(self, corpus, prune_at=2000000):
        self.dictionary.add_documents(corpus, prune_at)

    def prune(self, **kwargs):
        # it is best if pruning is applied after all the updates;
        # otherwise tokens dropped during pruning but seen in update
        # docs will produce wrong counts
        if self.dictionary.dfs == {}:
            raise ValueError('no vocab to filter; build vocab first')
        no_below = kwargs.get('no_below', 5)
        no_above = kwargs.get('no_above', 0.7)
        keep_n = kwargs.get('keep_n', 100000)
        keep_tokens = kwargs.get('keep_tokens', None)
        if keep_tokens:
            keep_tokens.append('UNK')
        else:
            keep_tokens = ['UNK']
        preprune_count = sum(df for _, df in self.dictionary.dfs.items())
        self.dictionary.filter_extremes(no_below, no_above, keep_n, keep_tokens)
        postprune_count = sum(df for _, df in self.dictionary.dfs.items())
        self.dictionary.dfs[-1] = preprune_count - postprune_count
        # add UNK back (gets pruned due to 0 initial val)
        self.dictionary.token2id['<UNK>'] = -1
        self.dictionary.id2token[-1] = '<UNK>'

    def update(self, docs, prune_at=2000000):
        # delegate to set(); the wrapped dictionary does the adding
        self.set(docs, prune_at)

    def transform(self, docs, transform_to='ids', with_unk=True):
        if transform_to == 'ids':
            for doc in docs:
                yield self.dictionary.doc2idx(doc)
        elif transform_to == 'bow':
            for doc in docs:
                if with_unk:
                    yield self.doc2bow(doc)
                else:
                    yield self.dictionary.doc2bow(doc)
        else:
            raise ValueError('unknown transformation format')

    def fit_transform(self, docs, transform_to='ids', prune_at=2000000,
                      filter_vocab=False, **kwargs):
        self.set(docs, prune_at)
        if filter_vocab:
            self.prune(**kwargs)
        yield from self.transform(docs, transform_to)

    def merge(self, other):
        self.dictionary.merge_with(other)

    def save(self, fname, as_text=False, sort_by_word=False):
        if as_text:
            self.dictionary.save_as_text(fname, sort_by_word)
        else:
            self.dictionary.save(fname)

    def load(self, fname, from_text=False):
        if from_text:
            self.dictionary = Dictionary.load_from_text(fname)
        else:
            self.dictionary = Dictionary.load(fname)

    def __len__(self):
        return len(self.dictionary)

    def __iter__(self):
        return iter(self.dictionary)

    def keys(self):
        return list(self.dictionary.token2id.values())

    def __str__(self):
        return str(self.dictionary)

    def __getitem__(self, tokenid):
        return self.dictionary[tokenid]

    def doc2bow(self, document):
        # note: slight variation on gensim's BoW conversion
        # to allow '<UNK>' tokens (mapped to id -1)
        if isinstance(document, string_types):
            raise TypeError(
                "doc2bow expects an array of unicode tokens on input, not a single string"
            )
        # construct (word, frequency) mapping
        counter = defaultdict(int)
        for w in document:
            if w in self.dictionary.token2id:
                counter[self.dictionary.token2id[w]] += 1
            else:
                counter[-1] += 1
        # return token ids in ascending id order
        counter = sorted(iteritems(counter))
        return counter
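# Minimal usage sketch for the Vocab wrapper above. The corpus contents are
# illustrative assumptions, and the sketch assumes the class and its imports
# (gensim's Dictionary, collections.defaultdict, six's iteritems/string_types)
# are already in scope.
corpus = [["the", "cat", "sat"], ["the", "dog", "barked"]]

vocab = Vocab()
vocab.set(corpus)
ids = list(vocab.transform(corpus, transform_to='ids'))   # token-id sequences
bows = list(vocab.transform(corpus, transform_to='bow'))  # (id, count) pairs; unseen tokens map to -1
print(len(vocab), ids[0], bows[0])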