def test_lookups():
    dictionary = Dictionary(TOKEN_SETS)
    # it can perform token inclusion and position lookups:
    assert dictionary.doc2idx(TOKEN_SETS[0]) == [0, 3, 1, 2]  # ["all", "the", "kings", "men"]
    assert dictionary.doc2idx(NEW_TOKENS) == [0, 1, -1, -1]   # ["all", "kings", "queens", "jacks"]
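# A minimal sketch of fixtures that would make the test above pass. TOKEN_SETS and
# NEW_TOKENS are not shown in the original snippet, so the values below are assumptions
# inferred from the inline comments (gensim assigns ids to new tokens in sorted order
# within each document, so "all"->0, "kings"->1, "men"->2, "the"->3).
from gensim.corpora import Dictionary

TOKEN_SETS = [["all", "the", "kings", "men"]]     # hypothetical training documents
NEW_TOKENS = ["all", "kings", "queens", "jacks"]  # hypothetical lookup tokens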
def generate_walks(G, n_walks, walk_len):
    walks = build_random_walk_corpus(G, n_walks, walk_len)
    # Now we have a Gensim Dictionary to work with
    dictionary = Dictionary(walks)
    # Convert docs to indices in the dictionary
    return dictionary, [
        dictionary.doc2idx(w)
        for w in tqdm(walks, desc='Converting to indices')
    ]
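# build_random_walk_corpus is referenced above but not defined in the snippet; a possible
# implementation, assuming uniform random walks over a networkx graph with node ids
# stringified so they can serve as tokens. Treat it as a sketch, not the original helper.
import random
import networkx as nx

def build_random_walk_corpus(G, n_walks, walk_len):
    """Generate n_walks random walks of length walk_len starting from every node in G."""
    walks = []
    for node in G.nodes():
        for _ in range(n_walks):
            walk = [str(node)]
            current = node
            for _ in range(walk_len - 1):
                neighbors = list(G.neighbors(current))
                if not neighbors:
                    break
                current = random.choice(neighbors)
                walk.append(str(current))
            walks.append(walk)
    return walks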
def load_data(data_file):
    data_file = Path(data_file)
    pairs = []
    with data_file.open() as fin:
        for line in fin:
            a, b = line.strip().split('\t')
            pairs.append((a, b))
    d = Dictionary(pairs)
    pairs = np.asarray([d.doc2idx(pair) for pair in pairs])
    return d, pairs
def build_token_corpus(file_path: str, dictionary: Dictionary) -> list:
    """Build the data corpus: convert each word to its id and append EOS.

    Arguments:
        file_path {str} -- data file path
        dictionary {Dictionary} -- prebuilt word2id dictionary

    Returns:
        list -- data with tokens replaced by dictionary indices
    """
    with open(file_path, mode='r', encoding='utf-8') as fp:
        data = json.load(fp)
    data_size = len(data)
    logging.info("data have {} articles".format(data_size))
    for idx, article in enumerate(data):
        article_content = article[ARTICLE_CONTENT]
        article_content = jieba_tokenize(article_content)
        content_token = dictionary.doc2idx(article_content)
        article_title = article[ARTICLE_TITLE]
        article_title = jieba_tokenize(article_title)
        title_token = dictionary.doc2idx(article_title)
        article[ARTICLE_CONTENT] = content_token
        article[ARTICLE_TITLE] = title_token
        for questions_obj in article[QUESTIONS]:
            question = questions_obj[QUESTION]
            question = jieba_tokenize(question)
            question_token = dictionary.doc2idx(question)
            questions_obj[QUESTION] = question_token
            answer = questions_obj[ANSWER]
            answer = jieba_tokenize(answer)
            answer_token = dictionary.doc2idx(answer)
            questions_obj[ANSWER] = answer_token
        if idx % 100 == 0:
            percent = idx / data_size
            logging.info("finish {}% of data".format(percent * 100))
    return data
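# jieba_tokenize is not defined in the snippet above; a plausible one-line helper,
# assuming the standard jieba segmenter is what was intended.
import jieba

def jieba_tokenize(text: str) -> list:
    """Segment a Chinese string into a list of tokens."""
    return jieba.lcut(text)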
def load_data(filename):
    # Load articles from file
    articles, _ = LoadArticles(filename, verbose=False, split=True)
    # Create vocab dictionary for articles
    dct = Dictionary(articles)
    # dct.filter_extremes(no_below=5, no_above=500, keep_n=100000)
    # note: no_above is a fraction of documents, so 500 effectively disables that filter
    dct.filter_extremes(no_below=5, no_above=500, keep_n=50000)
    # convert words to indices
    # we make UNK the highest index
    vocab_size = len(dct)
    articles = [dct.doc2idx(a, unknown_word_index=vocab_size) for a in articles]
    return articles, dct
def extract_important_words_tfidf(documents, threshold):
    dictionary = Dictionary(documents)
    corpus = [dictionary.doc2bow(text) for text in documents]
    tfidf = TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    new_documents = []
    for i, doc in enumerate(documents):
        important_words_index = get_important_words(corpus_tfidf[i], threshold)
        new_doc = []
        for term in doc:
            if dictionary.doc2idx([term])[0] in important_words_index:
                new_doc.append(term)
        new_documents.append(new_doc)
    return new_documents  # list of list of terms
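# get_important_words is referenced above but not defined in the snippet; a minimal
# sketch under the assumption that it keeps the ids of terms whose tf-idf weight meets
# the given threshold.
def get_important_words(doc_tfidf, threshold):
    """Return the set of term ids in a tf-idf weighted document with weight >= threshold."""
    return {term_id for term_id, weight in doc_tfidf if weight >= threshold}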
def tokenize_files(df_path):
    data = pd.read_csv(df_path, na_filter=False)['language'].values
    cleaned_docs = [tokenize_doc(f) for f in tqdm(data, desc='Tokenizing Docs')]
    # Now we have a Gensim Dictionary to work with
    dictionary = Dictionary(cleaned_docs)
    # Keep tokens that appear in at least 10 documents and in no more than 75% of documents
    dictionary.filter_extremes(no_below=10, no_above=0.75)
    # Convert docs to indices
    indexed_docs = [
        dictionary.doc2idx(d)
        for d in tqdm(cleaned_docs, desc='Converting to indices')
    ]
    # Remove out-of-vocab tokens
    return dictionary, [
        [t for t in d if t != -1]
        for d in tqdm(indexed_docs, desc="Removing out-of-vocab tokens")
    ]
def make_tf_time_series(tweets_time_series, keep_only_common_words=True):
    tweets_time_series = break_up_sentences(tweets_time_series)
    tweets_dict = Dictionary(tweets_time_series)
    bow_time_series = [
        tweets_dict.doc2bow(tweets) for tweets in tweets_time_series
    ]
    tf_time_series = [
        make_term_frequency(time_step) for time_step in bow_time_series
    ]
    tf_time_series = [[(tweets_dict.get(tup[0]), tup[1]) for tup in time_step]
                      for time_step in tf_time_series]
    if keep_only_common_words:
        tweets_dict.filter_extremes(no_below=len(tweets_time_series), no_above=1)
        tf_time_series = [[
            tup for tup in time_step
            if tweets_dict.doc2idx([tup[0]])[0] != -1
        ] for time_step in tf_time_series]
    return tf_time_series
def postagging(self, word_seq):
    N = self.freq_obj.get_hidden_stage()
    T = len(word_seq)
    word_seq = [_.lower() for _ in word_seq]
    dct = Dictionary([word_seq])
    word2idx = dct.token2id
    id2word = {v: k for k, v in word2idx.items()}
    observed = np.array(dct.doc2idx(word_seq))
    emission_prob_matrix = np.zeros((N, T))
    for word in word2idx.keys():
        for tag in self.tag_set:
            if tag != 'START':
                emission_prob_matrix[
                    HMMTagger.get_tagid(tag),
                    word2idx[word]] = self.freq_obj.get_emission_prob(word.lower(), tag)
    path = self.hmm_obj.viterbi(observed,
                                self.freq_obj.get_transition_matrix(),
                                emission_prob_matrix,
                                self.freq_obj.get_initial_distribution())
    return [(id2word[observed[i]], HMMTagger.idx2tag[path[0][i]])
            for i in range(len(observed))]
# First parameter is the replacement, second parameter is your input string
test = regex.sub('', d)  # Out: 'abdE'
if len(test) > 100:
    datagensim += [[i for i in test.split(" ") if len(i) > 3]]

dct = Dictionary(datagensim)
dct.filter_extremes(no_below=2, no_above=0.9)
dct.compactify()

X = np.zeros((len(dct.keys()), len(datagensim)), int)
i = 0
bow = []
datagensimClean = []
for d in datagensim:
    idx = dct.doc2idx(d)
    dC = [d[i] for i in range(len(d)) if idx[i] > -1]
    tmp = dct.doc2bow(dC)
    datagensimClean += [dC]
    bow += [tmp]
    for key, value in tmp:
        X[key, i] = value
    i += 1
datagensim = datagensimClean

#%% Test data
#with open(r'C:\Users\Matteo\Documents\Git\aLDA\data\wikitext-2-raw\wiki.test.raw', encoding="utf8") as file:
#    dataRaw = file.read()
#
#dataRaw = stem(dataRaw)
#data = dataRaw.replace('= = = = ','+ + + +')
n_hidden = 128

df = pd.read_csv(data_file, names=('question', 'answer'), dtype='object')

q_maxlen = df['question'].map(len).max()
a_maxlen = df['answer'].map(len).max()

rpad_blank = lambda size: (lambda s: s.ljust(size, ' '))
que = df['question'].map(rpad_blank(q_maxlen))
ans = df['answer'].map(rpad_blank(a_maxlen))

dic = Dictionary([list(' '.join(df.values.flatten()))])
dic.save_as_text(f'{data_file}.dic')

one_hot = lambda s: np.eye(len(dic))[dic.doc2idx(list(s))]

x = np.array([one_hot(q) for q in que])
y = np.array([one_hot(a) for a in ans])

model = Sequential()
# encoder
model.add(LSTM(n_hidden, input_shape=(q_maxlen, len(dic))))
# decoder
model.add(RepeatVector(a_maxlen))
model.add(LSTM(n_hidden, return_sequences=True))
model.add(TimeDistributed(Dense(len(dic))))
model.add(Activation('softmax'))
class SkipGramDataset(Dataset):
    def __init__(self, args):
        self.args = args
        self.dictionary = None
        self.examples = []
        self.name = ''

    def __getitem__(self, index):
        return self._example_to_tensor(*self.examples[index])

    def __len__(self):
        return len(self.examples)

    def save(self, examples_path, dict_path):
        print('Saving Dataset Examples...')
        torch.save({'examples': self.examples}, examples_path)
        print('Saving Dataset Dictionary...')
        self.dictionary.save(dict_path)
        print('Saved Dataset!')

    def load(self, examples_path, dict_path):
        print('Loading Dataset Examples...')
        self.examples = torch.load(examples_path)['examples']
        print('Loading Dataset Dictionary...')
        self.dictionary = Dictionary().load(dict_path)
        print('Loaded Saved Dataset!')

    def generate_examples_serial(self):
        """
        Generates examples with no multiprocessing - straight through!
        :return: None - updates class properties
        """
        # Now we have a Gensim Dictionary to work with
        self._build_dictionary()
        # Keep tokens in at least 10 documents and no more than 75% of documents
        self.dictionary.filter_extremes(no_below=10, no_above=0.75)
        self.examples = []
        for file in tqdm(self.load_files(), desc="Generating Examples (serial)"):
            file = self.dictionary.doc2idx(file)
            self.examples.extend(self._generate_examples_from_file(file))

    def load_files(self):
        """
        Sets self.files as a list of tokenized documents!
        :returns: List of files
        """
        # Needs to be implemented by child class
        raise NotImplementedError

    def _build_dictionary(self):
        """
        Creates a Gensim Dictionary
        :return: None - modifies self.dictionary
        """
        print("Building Dictionary...")
        self.dictionary = Dictionary(self.load_files())

    def _generate_examples_from_file(self, file):
        """
        Generate all examples from a file within window size
        :param file: File from self.files
        :returns: List of examples
        """
        examples = []
        for i, token in enumerate(file):
            if token == -1:
                # Out of dictionary token
                continue
            # Generate context tokens for the current token
            context_words = self._generate_contexts(i, file)
            # Form Examples:
            # center, context - follows form: (input, target)
            new_examples = [(token, ctxt) for ctxt in context_words if ctxt != -1]
            # Add to class
            examples.extend(new_examples)
        return examples

    def _generate_contexts(self, token_idx, tokenized_doc):
        """
        Generate Token's Context Words

        Generates all the context words within the window size defined during
        initialization around token.

        :param token_idx: Index at which center token is found in tokenized_doc
        :param tokenized_doc: List - Document broken into tokens
        :returns: List of context words
        """
        contexts = []
        # Iterate over each position in window
        for w in range(-self.args.window_size, self.args.window_size + 1):
            context_pos = token_idx + w
            # Make sure current center and context are valid
            is_outside_doc = context_pos < 0 or context_pos >= len(tokenized_doc)
            center_is_context = token_idx == context_pos
            if is_outside_doc or center_is_context:
                # Not valid - skip to next window position
                continue
            contexts.append(tokenized_doc[context_pos])
        return contexts

    def _example_to_tensor(self, center, target):
        """
        Takes a raw example and turns it into tensor values

        :params center: Center word token id
        :params target: Context word token id
        :returns: A tuple of tensors
        """
        center, target = torch.tensor([int(center)]), torch.tensor([int(target)])
        return center, target
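# A minimal sketch of a concrete subclass, assuming whitespace-tokenized plain-text files
# in a directory; the data_dir argument and the tokenization are illustrative additions,
# not part of the original dataset class.
import os

class TextFileSkipGramDataset(SkipGramDataset):
    def __init__(self, args, data_dir):
        super().__init__(args)
        self.data_dir = data_dir  # hypothetical directory of .txt files

    def load_files(self):
        # Return every file as a list of lowercase whitespace tokens
        docs = []
        for fname in os.listdir(self.data_dir):
            with open(os.path.join(self.data_dir, fname), encoding='utf-8') as f:
                docs.append(f.read().lower().split())
        return docs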
n_hidden = int(sys.argv[5])

BOS = '\t'
EOS = '\n'

df = pd.read_csv(data_file, names=('question', 'answer'), dtype='object')

a_maxlen = df['answer'].map(len).max() + 2
ans = df['answer'].map(lambda a: f'{BOS}{a}{EOS}')

dic = Dictionary([list(BOS + EOS + ' '.join(df.values.flatten()))])
dic.save(f'{data_file}.dic')

padding_one_hot = lambda d, size: np.vstack(
    (np.eye(len(dic))[dic.doc2idx(list(d))],
     np.zeros((size - len(d), len(dic)))))
one_hot = lambda s: np.eye(len(dic))[dic.doc2idx(list(s))]
sum_one_hot = lambda s: np.add.reduce(one_hot(s))

x1 = np.array([sum_one_hot(q) for q in df['question']])
x2 = np.array([padding_one_hot(a, a_maxlen) for a in ans])
y = np.array([np.vstack((d[1:], np.zeros((1, len(dic))))) for d in x2])

enc_inputs = Input(shape=(len(dic), ))
enc_outputs = Dense(n_hidden, activation='relu')(enc_inputs)
enc_states = [enc_outputs]

decoder = GRU(n_hidden, return_sequences=True, return_state=True)
limit = int('9' * digit) + 1

input_format = lambda f: f.ljust(input_digit, ' ')
output_format = lambda f: f.ljust(output_digit, ' ')
input_formula = lambda d: input_format(f"{d[0]}+{d[1]}")

ds = np.random.randint(0, limit, size=(n, 2))

data = [input_formula(d) for d in ds]
labels = [output_format(f"{sum(d)}") for d in ds]

dic = Dictionary([list('0123456789+ ')])

one_hot = lambda ds: np.array([np.eye(len(dic))[dic.doc2idx(list(d))] for d in ds])

x = one_hot(data)
y = one_hot(labels)

model = Sequential()
# encoder
model.add(LSTM(128, input_shape=(input_digit, len(dic))))
# decoder
model.add(RepeatVector(output_digit))
model.add(LSTM(128, return_sequences=True))
model.add(TimeDistributed(Dense(len(dic))))
remove_text = 'https?'
vocab = vocab.str.replace(remove_text, '')
vocab = vocab.str.findall('[a-zA-Z]+')

stopword = stopwords.words('english')
vocab = vocab.apply(lambda x: [w.lower() for w in x if w.lower() not in stopword])
vocab = vocab.apply(lambda x: [stemmer.stem(w) for w in x])
vocab[:10]

dictionary = Dictionary(vocab)

train.text = vocab.iloc[:7613]
test.text = vocab[7613:].reset_index().text
train.text = train.text.apply(lambda x: dictionary.doc2idx(x))

train_text = pad_sequences(train.text)
train_target = train.target.values

from gensim.models.word2vec import Word2Vec
import inspect

word2vec = Word2Vec(vocab, size=32)
#inspect.signature(Word2Vec)
vocab[:10]
word_vectors = word2vec.wv.vectors
word_vectors.shape
class BilingualPreprocessor:
    def __init__(self, is_training=False):
        self.ja_dictionary = Dictionary([['<PAD>'], ['<BeginOfEncode>'], ['<BOS>'], ['<EOS>'], ['<UNK>']])
        self.en_dictionary = Dictionary([['<PAD>'], ['<BeginOfEncode>'], ['<BOS>'], ['<EOS>'], ['<UNK>']])
        self.is_training = is_training

    def register_ja_texts(self, texts: List[List[str]]):
        if self.is_training:
            self.ja_dictionary.add_documents(texts)

    def register_en_texts(self, texts: List[List[str]]):
        if self.is_training:
            self.en_dictionary.add_documents(texts)

    @property
    def ja_eos_index(self):
        return self.ja_dictionary.token2id['<EOS>']

    @property
    def en_eos_index(self):
        return self.en_dictionary.token2id['<EOS>']

    @property
    def ja_unknown_word_index(self):
        return self.ja_dictionary.token2id['<UNK>']

    @property
    def en_unknown_word_index(self):
        return self.en_dictionary.token2id['<UNK>']

    @property
    def ja_begin_of_encode_index(self):
        return self.ja_dictionary.token2id['<BeginOfEncode>']

    @property
    def en_begin_of_encode_index(self):
        return self.en_dictionary.token2id['<BeginOfEncode>']

    @property
    def ja_vocab_count(self):
        return len(self.ja_dictionary)

    @property
    def en_vocab_count(self):
        return len(self.en_dictionary)

    def doc2idx_ja(self, texts):
        return self.ja_dictionary.doc2idx(texts, unknown_word_index=self.ja_unknown_word_index)

    def doc2idx_en(self, texts):
        return self.en_dictionary.doc2idx(texts, unknown_word_index=self.en_unknown_word_index)

    def save(self, file_path):
        with open(file_path, 'wb') as f:
            dill.dump(self, f)

    @classmethod
    def load(cls, file_path):
        with open(file_path, 'rb') as f:
            preprocessor = dill.load(f)
        assert isinstance(preprocessor, cls), 'Load a class different from {}'.format(cls)
        return preprocessor
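# A brief usage sketch of the preprocessor above; the sample sentences are purely
# illustrative and not from the original code.
preprocessor = BilingualPreprocessor(is_training=True)
preprocessor.register_en_texts([['hello', 'world', '<EOS>']])
preprocessor.register_ja_texts([['こんにちは', '世界', '<EOS>']])
# Unknown words map to the <UNK> index instead of -1
print(preprocessor.doc2idx_en(['hello', 'unseen-token', '<EOS>']))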
with open('reviews_using_dataset.csv') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    for i, row in enumerate(readCSV):
        if i == 0:
            continue
        if i < 100:
            reviewTexts_train.append(row[0])
            sentiment_train.append(int(row[1]))
        else:
            break
        reviewTexts_test.append(row[0])
        sentiment_test.append(int(row[1]))

from gensim.corpora import Dictionary

dct = Dictionary([sent.strip().split() for sent in reviewTexts_train])
doc_idx = [
    dct.doc2idx(reviewTexts_train[i].strip().split())
    for i in range(len(reviewTexts_train))
]
print(doc_idx[0])
#train_bow = dct.doc2bow([sent.strip().split() for sent in reviewTexts_train])
#print (len(train_bow))

from keras.preprocessing import sequence

max_words = 500
X_train = sequence.pad_sequences(doc_idx, maxlen=max_words)
#X_test = sequence.pad_sequences(X_test, maxlen=max_words)
class Dataset(object):
    '''
    Create dataset for training supervised model
    '''

    def __init__(self, config):
        self.config = config
        self.train_data = None
        self.test_data = None
        self.val_data = None
        self.vocab = None
        self.word_embeddings = None

    def get_pandas_df(self, filename):
        '''
        Load the data into a pandas.DataFrame object
        This will be used to convert data to torchtext object
        '''
        with open(filename, 'r', encoding='utf-8') as datafile:
            data = [line.strip().split(' ', maxsplit=1) for line in datafile]
            data_text = list(map(lambda x: x[1], data))
            data_label = list(map(lambda x: x[0], data))
        full_df = pd.DataFrame({"text": data_text, "label": data_label})
        return full_df

    def load_data(self, train_file, test_file, dataname, embed_file=None, val_file=None):
        '''
        Loads the data from files
        Sets up iterators for training, validation and test data
        Also creates the vocabulary and word embeddings based on the data

        Inputs:
            embed_file (String): absolute path to file containing word embeddings (GloVe/Word2Vec)
            train_file (String): absolute path to training file
            test_file (String): absolute path to test file
            val_file (String): absolute path to validation file
        '''
        voc_file = dataname + '_vocab.txt'
        new_embed = dataname + '_embed.pkl'
        train_X, train_Y = read_labeled(train_file)
        test_X, test_Y = read_labeled(test_file)
        val_X = None
        val_Y = None
        if val_file:
            val_X, val_Y = read_labeled(val_file)
        else:
            sp = int(len(train_X) * 0.8)
            train_X, val_X = (train_X[:sp], train_X[sp:])
            train_Y, val_Y = (train_Y[:sp], train_Y[sp:])
        train_X = [doc_padding(x, self.config.max_sen_len) for x in train_X]
        test_X = [doc_padding(x, self.config.max_sen_len) for x in test_X]
        val_X = [doc_padding(x, self.config.max_sen_len) for x in val_X]
        # build vocab
        if os.path.isfile(voc_file):
            self.vocab = Dictionary.load_from_text(voc_file)
        else:
            self.vocab = Dictionary(train_X)
            special_tokens = {'<pad>': 0, '<unk>': 1}
            self.vocab.patch_with_special_tokens(special_tokens)
            self.vocab.save_as_text(voc_file)
        # transform words to indices
        train_X = [self.vocab.doc2idx(x, 1) for x in train_X]
        test_X = [self.vocab.doc2idx(x, 1) for x in test_X]
        val_X = [self.vocab.doc2idx(x, 1) for x in val_X]
        # load embeddings
        if os.path.isfile(new_embed):
            self.word_embeddings = torch.load(new_embed)
        else:
            embeds = Vectors(embed_file,
                             unk_init=lambda x: torch.Tensor(
                                 np.random.normal(scale=0.6, size=(x.size()))))
            self.word_embeddings = weight_matrix(self.vocab, embeds)
            torch.save(self.word_embeddings, new_embed)
        self.train_data = (train_X, train_Y)
        self.test_data = (test_X, test_Y)
        self.val_data = (val_X, val_Y)
        print("Loaded {} training examples".format(len(train_X)))
        print("Loaded {} test examples".format(len(test_X)))
        print("Loaded {} validation examples".format(len(val_X)))

    def train_iterator(self):
        return batch_iter(*self.train_data, self.config.batch_size)

    def test_iterator(self):
        return batch_iter(*self.test_data, self.config.batch_size, False)

    def val_iterator(self):
        return batch_iter(*self.val_data, self.config.batch_size, False)
def extract_and_save_biterm(fname, embed_size=300, min_count=5, max_percent=0.5, iteration=200):
    '''
    Simple preprocessing for biterms.
    A biterm is an unordered word pair.
    Biterms are drawn from individual documents, not from the whole corpus.
    '''
    docs = read_corpus(fname, labeled=False, tokens_only=True)
    docs = [[token for token in doc if not token.isnumeric()] for doc in docs]
    # Remove words that are only one character, and remove stop words
    docs = [[token for token in doc if len(token) > 1 and token not in STOP_WORDS]
            for doc in docs]
    lemmatizer = WordNetLemmatizer()
    docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]
    bigram = Phrases(docs, min_count=20)
    for idx in range(len(docs)):
        for token in bigram[docs[idx]]:
            if '_' in token:
                # Token is a bigram, add to document.
                docs[idx].append(token)
    dictionary = Dictionary(docs)
    dictionary.filter_extremes(no_below=min_count, no_above=max_percent)
    dictionary.compactify()
    # encode
    docs = [[token for token in doc if token in dictionary.token2id] for doc in docs]
    # Remove docs that contain fewer than 2 unique words
    docs = [doc for doc in docs if len(set(doc)) > 1]
    model = gensim.models.Word2Vec(docs, workers=4, size=embed_size, iter=100, min_count=2)
    docs = [dictionary.doc2idx(doc) for doc in docs]
    biterms = {}
    i = 0
    doc_bitems = []
    for doc in docs:
        d_bi = {}
        doc = sorted(doc)
        for x in range(len(doc) - 1):
            for y in range(x + 1, len(doc)):
                if doc[x] == doc[y]:
                    continue
                biterm = (doc[x], doc[y])
                idx = 0
                if biterm not in biterms:
                    biterms[biterm] = i
                    idx = i
                    i += 1
                else:
                    idx = biterms[biterm]
                if idx in d_bi:
                    d_bi[idx] += 1
                else:
                    d_bi[idx] = 1
        doc_bitems.append(d_bi)
    fname = os.path.basename(fname)
    fname = fname.split('.')[0]
    dirc = os.path.join(os.getcwd(), 'Data', 'unsupervised')
    if not os.path.exists(dirc):
        os.makedirs(dirc)
    embeddings = {}
    for key, token in dictionary.iteritems():
        embeddings[key] = model.wv[token]
    dictionary.save(os.path.join(dirc, fname + '_dic.pkl'))
    biterms = dict([key, val] for val, key in biterms.items())
    with open(os.path.join(dirc, fname + '_bit.pkl'), 'wb') as f:
        pickle.dump(biterms, f)
    with open(os.path.join(dirc, fname + '_doc_bit.pkl'), 'wb') as f:
        pickle.dump(doc_bitems, f)
    with open(os.path.join(dirc, fname + '_emb.pkl'), 'wb') as f:
        pickle.dump(embeddings, f)
    with open(os.path.join(dirc, fname + '_doc.pkl'), 'wb') as f:
        pickle.dump(docs, f)
from gensim.models import word2vec
from sklearn import decomposition

data_file = sys.argv[1]
pca_num = int(sys.argv[2])
limit_value = 0.1

sentences = [s for s in word2vec.LineSentence(data_file) if len(s) >= 2]

dic = Dictionary(sentences)

x = np.zeros((len(sentences), len(dic)))
for i, d in enumerate(sentences):
    x[np.ix_([i], dic.doc2idx(d))] = 1

pca = decomposition.PCA(n_components=pca_num, random_state=1)
nx = pca.fit_transform(x)

print(sum(pca.explained_variance_ratio_))
print(nx)

for i, pc in enumerate(pca.components_):
    ids = np.where((pc >= limit_value) | (pc <= -limit_value))
    items = [dic[id] for id in ids[0]]
    for r in sorted(zip(items, pc[ids]), key=lambda x: -x[1]):
        print(r)  # the loop body is truncated in the original; printing is an assumed minimal completion
class Index(object):
    """define an index instance along with its associated methods"""

    def __init__(self, stops, minsize=3):
        """initialize index variables"""
        self.ix = None
        self.tokenizer = StandardAnalyzer(stoplist=stops, minsize=minsize)
        self.umls = umls.UMLSLookup()
        self.term_dict = {}
        self.token2cuis = {}
        self.concept_dict = {"__NULL__": 0}
        self.synsets = {}

    def get_doc_ids(self, corpus_path, corpus_name):
        """get doc ids from corpus"""
        if "OHSUMED" in corpus_name:
            docs = safir_utils.gen_trec_doc(corpus_path)
        elif "TREC_CDS" in corpus_name:
            docs = safir_utils.gen_cds_doc(corpus_path)
        return [docno for docno, doc in docs]

    def only_digits(self, token):
        """check whether input token contains only digits and/or punctuation"""
        return all(char.isdigit() or char in string.punctuation for char in token)

    def preprocess_text(self, text, tags=False, remove_digits=True):
        """preprocess text: tokenize docs, lowerize text, remove words with length < min_size,
        remove tags, remove only-digits tokens and remove stopwords"""
        if tags:
            # remove tags
            text = strip_tags(text)
        if remove_digits:
            # tokenize and remove digits-only tokens
            text = [token.text for token in self.tokenizer(text)
                    if not self.only_digits(token.text)]
        else:
            # tokenize and keep digits-only tokens
            text = [token.text for token in self.tokenizer(text)]
        # return preprocessed doc
        return text

    def preprocess_corpus(self, corpus_path, corpus_name, out_corpus, out_ids):
        """preprocess corpus: apply preprocess_text to each doc within corpus"""
        if "OHSUMED" in corpus_name:
            docs = safir_utils.gen_trec_doc(corpus_path)
        elif "TREC_CDS" in corpus_name:
            docs = safir_utils.gen_cds_doc(corpus_path)
        # tokenize docs
        print("pre processing docs...")
        #pproc_corpus = [self.preprocess_text(doc) for docno, doc in docs]
        pproc_corpus = []
        doc_ids = []
        # iterate over docs and store pre processed docs and docnos
        for docno, doc in docs:
            pproc_corpus.append(self.preprocess_text(doc))
            doc_ids.append(docno)
        print("pre processing finished!")
        # store pproc_corpus
        print("store pre processed corpus in {}".format(out_corpus))
        with open(out_corpus, 'w') as outf:
            json.dump(pproc_corpus, outf)
        # store docnos
        print("store doc_ids in {}".format(out_ids))
        with open(out_ids, 'w') as outf:
            json.dump(doc_ids, outf)
        # return pproc_corpus and doc_ids
        return pproc_corpus, doc_ids

    def load_pproc_corpus(self, fname):
        """load stored pre processed corpus"""
        with open(fname, 'r') as inf:
            pproc_corpus = json.load(inf)
        return pproc_corpus

    def load_doc_ids(self, fname):
        """load stored doc ids"""
        with open(fname, 'r') as inf:
            doc_ids = json.load(inf)
        return doc_ids

    def index_corpus(self, pproc_corpus, fname):
        """index pre processed corpus using gensim dictionary - fast doc2bow, doc2idx conversion"""
        self.ix = Dictionary(pproc_corpus)
        self.ix.save_as_text(fname)
        return True

    def load_index(self, fname):
        """load stored index"""
        self.ix = Dictionary.load_from_text(fname)
        return True

    def build_term_dict(self, pproc_corpus, fname, dict_size=131072, remove_digits=True, min_df=2, max_df=0.5):
        """create term dictionary"""
        ttf = {}
        # filter terms with df lower than 2 and greater than 0.5 (in %) and store their ttf
        for doc in tqdm(pproc_corpus):
            # get doc in bow format
            bow = self.ix.doc2bow(doc)
            for idx, tf in bow:
                if self.ix.dfs[idx] >= 2 and self.ix.dfs[idx] / self.ix.num_docs <= 0.5:
                    if idx in ttf:
                        ttf[idx] += tf
                    else:
                        ttf[idx] = tf
        # convert ttf dict into counter and keep dict_size most frequent terms
        count = Counter(ttf).most_common(dict_size)
        # create term dict - two-level encoding (i.e. self.term_dict[self.ix.token2id[token]])
        for idx, ttf in count:
            self.term_dict[idx] = len(self.term_dict)
        # store term dictionary
        with open(fname, 'w') as outf:
            json.dump(self.term_dict, outf)
        return True

    def load_term_dict(self, fname):
        """load term dictionary"""
        with open(fname, 'r') as inf:
            self.term_dict = json.load(inf)
        # convert keys from str back to int - json stores dict keys as str
        self.term_dict = {int(ix_term): dict_term for ix_term, dict_term in self.term_dict.items()}
        return True

    def get_pos2token(self, text):
        """split text into tokens and return {pos: [token, ["__NULL__"]]}"""
        pos2token = {}
        tokens = text.split()  # split on whitespaces as text has been already pre processed
        # set text index
        index = text.index
        running_offset = 0
        # loop over tokens
        for token in tokens:
            token_offset = index(token, running_offset)
            token_len = len(token)
            # update running offset
            running_offset = token_offset + token_len
            pos2token[token_offset] = [self.ix.token2id[token], ["__NULL__"]]  # note: ["__NULL__"] is for later use
        return pos2token

    def associate_token2cuis(self, pos2token, terms_candidate_cuis):
        """return list of (token, [cui1, cui2, ...]) pairs given token position and candidate concepts"""
        for term_cuis in terms_candidate_cuis:
            # get positional information
            start = term_cuis[0]['start']
            # check whether 'start' matches with any pos2token key
            if start in pos2token:
                # update ["__NULL__"] with candidate cuis
                pos2token[start][1] = [concept['cui'] for concept in term_cuis]
        # return pos2token values only - i.e. (term, [cui1, cui2, ...]) pairs
        return list(pos2token.values())

    def map_token2cuis(self, fname, threshold=1.0, stypes_fname=None):
        """map candidate cuis to each token in the index"""
        terms_str = ' '.join(list(self.ix.token2id.keys()))
        # split term_str into substrings of length <= 999999 - max length allowed by scipy parser
        substrs = wrap(terms_str, width=999999, break_long_words=False, break_on_hyphens=False)
        if stypes_fname is not None:
            # load user-specified UMLS semantic types
            print("user-specified UMLS semantic types for QuickUMLS enabled")
            semtypes = ','.join(safir_utils.load_semtypes(stypes_fname))
        else:
            # keep default QuickUMLS semantic types
            semtypes = None
        # initialize QuickUMLS server
        server = QuickUMLS(window=1, threshold=threshold, semtypes=semtypes)
        server.launch_quickumls()
        # initialize concept matcher
        matcher = get_quickumls_client()
        token2cuis = []
        # extract concepts from substrs
        for substr in substrs:
            terms_candidate_cuis = matcher.match(substr)
            # get position dict: {pos: [token, ["__NULL__"]]} given substr
            pos2token = self.get_pos2token(substr)
            # associate each token with its candidate concepts
            token2cuis += self.associate_token2cuis(pos2token, terms_candidate_cuis)
        # close connection with QuickUMLS server
        server.close_quickumls()
        # store token2cuis as dict
        self.token2cuis = dict(token2cuis)
        # store token2cuis
        with open(fname, 'w') as outf:
            json.dump(self.token2cuis, outf)
        return True

    def load_token2cuis(self, fname):
        """load token2cuis"""
        with open(fname, 'r') as inf:
            self.token2cuis = json.load(inf)
        # convert keys from str back to int - json stores dict keys as str
        self.token2cuis = {int(token): cuis for token, cuis in self.token2cuis.items()}
        return True

    def update_concept_dict(self, cui):
        """update concept dictionary"""
        if cui in self.concept_dict:
            return True
        else:
            self.concept_dict[cui] = len(self.concept_dict)
            return True

    def load_concept_dict(self, fname):
        """load concept dictionary"""
        with open(fname, 'r') as inf:
            self.concept_dict = json.load(inf)
        return True

    def update_synsets(self, cui, idx):
        """update synonyms set"""
        if self.concept_dict[cui] in self.synsets:
            # add term to set of synonyms for the given cui
            self.synsets[self.concept_dict[cui]].add(self.term_dict[idx])
            return True
        elif self.concept_dict[cui] != self.concept_dict["__NULL__"]:
            # initialize set of synsets for given cui
            self.synsets[self.concept_dict[cui]] = {self.term_dict[idx]}
            return True
        else:
            # do not update synsets
            return False

    def load_synsets(self, fname):
        """load synsets"""
        with open(fname, 'r') as inf:
            self.synsets = json.load(inf)
        # convert keys from str back to int - json stores dict keys as str
        self.synsets = {int(cui): syns for cui, syns in self.synsets.items()}
        return True

    def get_sense_pairs(self):
        """return senses as (term, cui) 2-dim np array"""
        syns = [list(itertools.product(self.synsets[cui], [cui])) for cui in self.synsets]
        synp = [list(itertools.combinations(syn, 2)) for syn in syns]
        return np.array(list(itertools.chain.from_iterable(synp)))

    def s_wsd(self, doc, table_name, query=False):
        """shallow word-sense disambiguation: disambiguate polysemous terms based on
        shallow word-concept connectivity within UMLS"""
        doc_cuis = {}
        # convert doc into doc2idx format
        doc2idx = self.ix.doc2idx(doc)
        # get cuis from doc tokens
        for idx in doc2idx:
            if idx in self.token2cuis and self.token2cuis[idx] != ["__NULL__"]:
                for cui in self.token2cuis[idx]:
                    if cui in doc_cuis:
                        # increase cui count
                        doc_cuis[cui] += 1
                    else:
                        # initialize cui count
                        doc_cuis[cui] = 1
        # perform shallow word-sense disambiguation
        enc_doc = []
        for idx in doc2idx:
            if idx in self.term_dict:
                # disambiguate only for terms contained within self.term_dict
                max_edges = 0  # relative maximum connections (edges)
                if len(self.token2cuis[idx]) == 1:
                    # monosemous term
                    ref_cui = self.token2cuis[idx][0]
                    if not query:
                        # update concept dict and synsets
                        self.update_concept_dict(ref_cui)
                        self.update_synsets(ref_cui, idx)
                    # encode (term, cui) pair
                    enc_doc.append([self.term_dict[idx], self.concept_dict[ref_cui]])
                else:
                    # polysemous term
                    candidates = []
                    # loop over candidate concepts
                    for subj_cui in self.token2cuis[idx]:
                        num_edges = 0  # number of edges
                        if doc_cuis[subj_cui] == 1:
                            # subj_cui is only associated with current term (idx)
                            obj_cuis = list(set(doc_cuis.keys()).difference({subj_cui}))
                        else:
                            # subj_cui is associated with other terms in the doc too
                            obj_cuis = list(doc_cuis.keys())
                        num_edges += self.umls.compute_num_edges(subj_cui, obj_cuis, table_name)
                        # verify connectivity
                        if num_edges > max_edges:
                            # set candidates to subj_cui
                            candidates = [subj_cui]
                            # update max_edges
                            max_edges = num_edges
                        else:
                            # append subj_cui to candidates
                            candidates.append(subj_cui)
                    # keep head candidate - when disambiguation is not complete, it allows to get
                    # the most likely concept based on QuickUMLS ordering
                    ref_cui = candidates[0]
                    if not query:
                        # update concept dict and synsets
                        self.update_concept_dict(ref_cui)
                        self.update_synsets(ref_cui, idx)
                    # encode (term, cui) pair
                    enc_doc.append([self.term_dict[idx], self.concept_dict[ref_cui]])
            else:
                # term oov
                continue
        return enc_doc

    def encode_corpus(self, pproc_corpus, corpus_name, ecorpus_fname, t2c_fname, cdict_fname,
                      syn_fname, threshold=0.7, stypes_fname=None):
        """perform semantic indexing and encode corpus"""
        print("map UMLS concepts to (indexed) tokens")
        self.map_token2cuis(t2c_fname, threshold=threshold, stypes_fname=stypes_fname)
        # get UMLS concepts mapped to (indexed) tokens
        ix_concepts = {cui for cuis in self.token2cuis.values() for cui in cuis if cui != "__NULL__"}
        # create sql table to store relations between concepts associated to indexed tokens -
        # allows for fast accessing compared to MRREL table
        print("create table to store UMLS relations between concepts associated to (indexed) tokens - "
              "fast access is enabled by indexes")
        self.umls.restrict_to_ix_concepts(ix_concepts, corpus_name)
        # create indexes to speed up requests
        self.umls.create_index("CUI1_" + corpus_name, ["CUI1"], corpus_name)  # index for subject column
        self.umls.create_index("CUI2_" + corpus_name, ["CUI2"], corpus_name)  # index for object column
        self.umls.create_index("CUI1_CUI2_" + corpus_name, ["CUI1", "CUI2"], corpus_name)  # multicolumn index (subj, obj)
        # encode corpus
        print("disambiguate polysemous tokens and encode corpus")
        enc_corpus = [self.s_wsd(doc, corpus_name, query=False) for doc in tqdm(pproc_corpus)]
        # store synsets as dict of lists - enables json encoding
        self.synsets = {cui: list(syns) for cui, syns in self.synsets.items()}
        # store semantic data and encoded corpus
        with open(ecorpus_fname, 'w') as outf:
            json.dump(enc_corpus, outf)
        with open(cdict_fname, 'w') as outf:
            json.dump(self.concept_dict, outf)
        with open(syn_fname, 'w') as outf:
            json.dump(self.synsets, outf)
        # return encoded corpus
        return enc_corpus

    def load_enc_corpus(self, fname):
        """load encoded corpus"""
        with open(fname, 'r') as inf:
            enc_corpus = json.load(inf)
        return enc_corpus

    def preprocess_query(self, query):
        """pre process query"""
        pproc_query = self.preprocess_text(query)
        return pproc_query

    def encode_query(self, pproc_query, corpus_name):
        """disambiguate polysemous terms and encode query"""
        enc_query = self.s_wsd(pproc_query, corpus_name, query=True)
        if not enc_query:
            print("query does not contain known terms")
            return None
        else:
            return np.array(enc_query)

    def project_query(self, query, corpus_name, word_embs, proj_weights, concept_embs=None):
        """project encoded query into dense vector of size [1, doc_embs]"""
        enc_query = self.encode_query(self.preprocess_query(query), corpus_name)
        if enc_query is None:
            return None
        else:
            if concept_embs is None:
                # only terms are considered
                return np.matmul(proj_weights, np.mean(word_embs[enc_query[:, 0]], axis=0))
            else:
                # terms + concepts are considered (i.e. senses)
                return np.matmul(proj_weights,
                                 np.mean(np.add(word_embs[enc_query[:, 0]],
                                                concept_embs[enc_query[:, 1]]), axis=0))

    def semantic_search(self, doc_ids, docs, query_ids, queries, ranking_folder, ranking_name):
        """perform search over queries using neural semantic models and return ranking"""
        doc_ids = np.array(doc_ids)
        print("compute similarities between docs and queries")
        similarities = cosine_similarity(docs, queries)
        out = open(ranking_folder + '/' + ranking_name + '.txt', 'w')
        for i in tqdm(range(similarities.shape[1])):
            rank = np.argsort(-similarities[:, i])[:1000]
            docs_rank = doc_ids[rank]
            qid = query_ids[i]
            if qid.isdigit():
                # cast to integer - this operation avoids storing topic ids as '0##' instead of '##'
                qid = str(int(qid))  # convert to int and then back to str
            for j in range(len(docs_rank)):
                out.write('%s %s %s %d %f %s\n' % (qid, 'Q0', docs_rank[j], j,
                                                   similarities[rank[j]][i], ranking_name))
        out.close()
        return True
mecab = MeCab.Tagger("-Owakati")

# Words to exclude from the dictionary
words_blacklist = [
    ">>",          # chat annotation
    "some_agent",
    "\u3000",      # full-width space
    "。",
    "、",
]

dct = Dictionary()

# Read the csv file
df = pd.read_csv(filepath, delimiter=",", names=["talker", "words", "type"])

# Tokenize each sentence -> split on single-byte spaces -> drop the last element (the newline)
wakati_df = df["words"].map(lambda x: mecab.parse(x).split(" ")[:-1])

# Add the documents to the dictionary
dct.add_documents(wakati_df)

# Get the ids of the blacklisted words within the dictionary
words_blacklist_id = dct.doc2idx(words_blacklist)

# Remove them from the dictionary
dct.filter_tokens(bad_ids=words_blacklist_id)
#dct.filter_n_most_frequent(600)

# Save the dictionary
dct.save(os.path.join(filedir, ".".join([filename, "dict"])))

# Show the dictionary contents and vocabulary size
print(dct.token2id)
print(len(dct.token2id))
tokens = list()
for text in texts:
    tokens.append(simple_preprocess(text))

# Vectorize the text samples into a 2D integer tensor.
MAX_NUM_WORDS = 10000  # 2 words reserved: 0=pad, 1=oov
MAX_SEQUENCE_LENGTH = 1000

dictionary = Dictionary(tokens)
dictionary.filter_extremes(no_below=0, no_above=1.0, keep_n=MAX_NUM_WORDS - 2)
word_index = dictionary.token2id
print('Found %s unique tokens.' % len(word_index))

data = [dictionary.doc2idx(t) for t in tokens]

# Truncate and pad sequences.
data = [i[:MAX_SEQUENCE_LENGTH] for i in data]
data = np.array([
    np.pad(i, (0, MAX_SEQUENCE_LENGTH - len(i)),
           mode='constant', constant_values=-2) for i in data
], dtype=int)
data = data + 2

print('Shape of data tensor:', data.shape)
print('Length of label vector:', len(labels))
# Read the stop words
file = codecs.open('stopwords.dic', 'r', 'utf-8')
stoplist = [line.strip() for line in file]

# Read the dataset
file = codecs.open('data.dat', 'r', 'utf-8')
doc_set = [document.strip() for document in file]

texts = []
for i in doc_set:
    raw = i.lower().strip()
    tokens = jieba.cut(raw)
    stemmed_tokens = [word.strip() for word in tokens]
    stopped_tokens = [word for word in stemmed_tokens
                      if word not in stoplist and len(word) > 1 and not re.search('[0-9]', word)]
    texts.append(stopped_tokens)

dictionary = Dictionary(texts)
corpus = [dictionary.doc2idx(text) for text in texts]
corpus1 = sequence.pad_sequences(corpus, maxlen=77)
trainset, testset = cross_validation.train_test_split(corpus1, test_size=0.2, random_state=0)

n_topics = 10
random_state = 0
n_iter = 10

class AMC:
    def __init__(self, n_topics, n_iter, alpha=0.1, eta=0.01, random_state=None, refresh=10):
        self.n_topics = n_topics
        self.n_iter = n_iter
        self.alpha = alpha
        self.eta = eta
        # if random_state is None, check_random_state(None) does nothing
        # other than return the current numpy RandomState
from collections import defaultdict

from gensim.corpora import Dictionary
from six import iteritems, string_types


class Vocab:
    def __init__(self):
        self.dictionary = Dictionary()
        self.dictionary.token2id['<UNK>'] = -1
        self.dictionary.id2token[-1] = '<UNK>'
        self.dictionary.dfs[-1] = 0

    def set(self, corpus, prune_at=2000000):
        self.dictionary.add_documents(corpus, prune_at)

    def prune(self, **kwargs):
        # it is best if pruning is applied after all the updates
        # otherwise dropped tokens during pruning, seen in update
        # docs will produce wrong counts
        if self.dictionary.dfs == {}:
            raise ValueError('no vocab to filter; build vocab first')
        no_below = kwargs.get('no_below', 5)
        no_above = kwargs.get('no_above', 0.7)
        keep_n = kwargs.get('keep_n', 100000)
        keep_tokens = kwargs.get('keep_tokens', None)
        if keep_tokens:
            keep_tokens.append('<UNK>')
        else:
            keep_tokens = ['<UNK>']
        preprune_count = sum([df for _, df in self.dictionary.dfs.items()])
        self.dictionary.filter_extremes(no_below, no_above, keep_n, keep_tokens)
        postprune_count = sum([df for _, df in self.dictionary.dfs.items()])
        self.dictionary.dfs[-1] = preprune_count - postprune_count
        # add UNK back (gets pruned due to 0 initial val)
        self.dictionary.token2id['<UNK>'] = -1
        self.dictionary.id2token[-1] = '<UNK>'

    def update(self, docs, prune_at=2000000):
        self.set(docs, prune_at)

    def transform(self, docs, transform_to='ids', with_unk=True):
        if transform_to == 'ids':
            for doc in docs:
                yield self.dictionary.doc2idx(doc)
        elif transform_to == 'bow':
            for doc in docs:
                if with_unk:
                    yield self.doc2bow(doc)
                else:
                    yield self.dictionary.doc2bow(doc)
        else:
            raise ValueError('unknown transformation format')

    def fit_transform(self, docs, transform_to='ids', prune_at=2000000, filter_vocab=False, **kwargs):
        self.set(docs, prune_at)
        if filter_vocab:
            self.prune(**kwargs)
        yield from self.transform(docs, transform_to)

    def merge(self, other):
        self.dictionary.merge_with(other)

    def save(self, fname, as_text=False, sort_by_word=False):
        if as_text:
            self.dictionary.save_as_text(fname, sort_by_word)
        else:
            self.dictionary.save(fname)

    def load(self, fname, from_text=False):
        if from_text:
            self.dictionary = Dictionary.load_from_text(fname)
        else:
            self.dictionary = Dictionary.load(fname)

    def __len__(self):
        return len(self.dictionary)

    def __iter__(self):
        return iter(self.dictionary)

    def keys(self):
        return list(self.dictionary.token2id.values())

    def __str__(self):
        return str(self.dictionary)

    def __getitem__(self, tokenid):
        return self.dictionary[tokenid]

    def doc2bow(self, document):
        # note: slight variation to BoW format conversion from gensim
        # to allow '<UNK>' tokens
        if isinstance(document, string_types):
            raise TypeError(
                "doc2bow expects an array of unicode tokens on input, not a single string"
            )
        # Construct (word, frequency) mapping.
        counter = defaultdict(int)
        for w in document:
            if w in self.dictionary.token2id:
                counter[self.dictionary.token2id[w]] += 1
            else:
                counter[-1] += 1
        # return tokenids, in ascending id order
        counter = sorted(iteritems(counter))
        return counter
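# A short usage sketch of the Vocab wrapper above; the toy corpus is illustrative only.
corpus = [['the', 'cat', 'sat'], ['the', 'dog', 'barked']]
vocab = Vocab()
ids = list(vocab.fit_transform(corpus, transform_to='ids'))
# An unseen token is counted under the <UNK> id (-1) in the BoW output
bows = list(vocab.transform([['the', 'platypus']], transform_to='bow'))
print(ids)
print(bows)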
BOS = '\t'
EOS = '\n'

df = pd.read_csv(data_file, names=('question', 'answer'), dtype='object')

q_maxlen = df['question'].map(len).max()
a_maxlen = df['answer'].map(len).max() + 2

ans = df['answer'].map(lambda a: f'{BOS}{a}{EOS}')

dic = Dictionary([list(BOS + EOS + ' '.join(df.values.flatten()))])
dic.save(f'{data_file}.dic')

padding_one_hot = lambda d, size: np.vstack(
    (np.eye(len(dic))[dic.doc2idx(list(d))],
     np.zeros((size - len(d), len(dic)))))

x1 = np.array([padding_one_hot(q, q_maxlen) for q in df['question']])
x2 = np.array([padding_one_hot(a, a_maxlen) for a in ans])
y = np.array([np.vstack((d[1:], np.zeros((1, len(dic))))) for d in x2])

enc_inputs = Input(batch_shape=(None, q_maxlen, len(dic)))
enc_outputs = Dense(n_hidden)(Flatten()(Dense(n_hidden)(enc_inputs)))
enc_states = [enc_outputs]

decoder = GRU(n_hidden, return_sequences=True, return_state=True)
dec_inputs = Input(shape=(None, len(dic)))
dec_outputs, _ = decoder(dec_inputs, initial_state=enc_states)
'''
token2id : dict of (str, int) - token -> tokenId
id2token : dict of (int, str)
dfs      : dict of (int, int)
'''
dct.token2id   # inspect the token ids (0 1 2 3 4 5 6)
dct.dfs        # number of documents each token appears in
dct.num_pos    # number of tokens processed so far
dct.num_nnz    # similar to num_pos

dct.add_documents([['cat', 'bird', 'cute'], ['动物', '植物', 'panda']])  # add new entries
dct.token2id   # inspect the token ids again

dct.doc2idx(['this', 'cat', 'is', 'cute'])  # look up each token's id; tokens not in the dictionary return -1

# Convert to a BoW sparse vector
dct.doc2bow(
    ['this', 'is', 'a', 'cute', 'cat'],  # 8 (cute) and 9 (cat) each appear once
    return_missing=True,   # also return words not in the vocabulary
    allow_update=True      # add out-of-vocabulary words, updating the dictionary in place
)
dct.token2id
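# The walkthrough above assumes an existing Dictionary named dct; a minimal sketch of how
# such a dictionary could be built (the example documents are assumptions, not from the
# original notes).
from gensim.corpora import Dictionary

dct = Dictionary([['this', 'is', 'a', 'cute', 'dog'], ['how', 'cute']])
print(dct.token2id)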
                 names=('keyword', 'sentence'), dtype='object')

keywords = [k.split(' ') for k in df['keyword'].values]
sentences = [[BOS] + s.split(' ') + [EOS] for s in df['sentence'].values]

q_maxlen = np.max([len(q) for q in keywords])
a_maxlen = np.max([len(a) for a in sentences])

print(f'question max size: {q_maxlen}, answer max size: {a_maxlen}')

dic = Dictionary(keywords + sentences)
dic.save(f'{dest_file_prefix}.dic')

padding_one_hot = lambda d, size: np.vstack(
    (np.eye(len(dic))[dic.doc2idx(d)],
     np.zeros((size - len(d), len(dic)))))

x1 = np.array([padding_one_hot(q, q_maxlen) for q in keywords])
x2 = np.array([padding_one_hot(a, a_maxlen) for a in sentences])
y = np.array([np.vstack((d[1:], np.zeros((1, len(dic))))) for d in x2])

encoder = GRU(n_hidden, return_state=True)
enc_inputs = Input(shape=(None, len(dic)))
enc_outputs, enc_h = encoder(enc_inputs)
enc_states = [enc_h]

decoder = GRU(n_hidden, return_sequences=True, return_state=True)
dec_inputs = Input(shape=(None, len(dic)))
dec_outputs, _ = decoder(dec_inputs, initial_state=enc_states)