def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('train_path', type=str, help='path to the train corpus file')
    parser.add_argument('test_path', type=str, help='path to the test corpus file')
    parser.add_argument('train_label', type=str, help='path to the train label file')
    parser.add_argument('test_label', type=str, help='path to the test label file')
    parser.add_argument('out_dir', type=str, help='path to the output dir')
    parser.add_argument('-nv', '--n_val', type=int, default=1000, help='validation set size')
    args = parser.parse_args()

    docs = load_corpus(args.train_path)['docs'].items()
    test_docs = load_corpus(args.test_path)['docs']

    np.random.seed(0)
    np.random.shuffle(docs)
    n_val = args.n_val
    train_docs = dict(docs[:-n_val])
    val_docs = dict(docs[-n_val:])

    # doc_labels = load_json(args.train_label)
    # test_labels = load_json(args.test_label)
    doc_labels = None
    test_labels = None

    train = corpus2libsvm(train_docs, doc_labels, os.path.join(args.out_dir, 'train.libsvm'))
    val = corpus2libsvm(val_docs, doc_labels, os.path.join(args.out_dir, 'val.libsvm'))
    test = corpus2libsvm(test_docs, test_labels, os.path.join(args.out_dir, 'test.libsvm'))

    import pdb; pdb.set_trace()
def train(args):
    corpus = load_corpus(args.input)
    n_vocab, docs = len(corpus['vocab']), corpus['docs']
    corpus.clear()  # save memory

    X_docs = []
    for k in docs.keys():
        X_docs.append(vecnorm(doc2vec(docs[k], n_vocab), 'logmax1', 0))
        del docs[k]

    np.random.seed(0)
    np.random.shuffle(X_docs)
    # X_docs_noisy = corrupted_matrix(np.r_[X_docs], 0.1)

    n_val = args.n_val
    # X_train = np.r_[X_docs[:-n_val]]
    # X_val = np.r_[X_docs[-n_val:]]
    X_train = np.r_[X_docs[:-n_val]]
    del X_docs[:-n_val]
    X_val = np.r_[X_docs]
    del X_docs

    start = timeit.default_timer()
    vae = VarAutoEncoder(n_vocab, args.n_dim, comp_topk=args.comp_topk, ctype=args.ctype, save_model=args.save_model)
    vae.fit([X_train, X_train], [X_val, X_val], nb_epoch=args.n_epoch, batch_size=args.batch_size)
    print 'runtime: %ss' % (timeit.default_timer() - start)
def extract_dict(args):
    corpus = load_corpus(args.input)
    vocab = corpus['vocab']
    with io.open(os.path.join(args.output_dir, 'dict.corpus'), 'w', encoding='utf-8') as f:
        f.write(json.dumps(vocab, ensure_ascii=False))
    print 'Generated the dictionary!'
def test(args):
    corpus = load_corpus(args.corpus[0])
    docs, vocab_dict = corpus['docs'], corpus['vocab']
    doc_codes = doc_word2vec(docs, revdict(vocab_dict), args.load_model, args.output, avg=True)
def test(args):
    corpus = load_corpus(args.input)
    vocab, docs = corpus['vocab'], corpus['docs']
    n_vocab = len(vocab)

    doc_keys = docs.keys()
    X_docs = []
    for k in doc_keys:
        X_docs.append(vecnorm(doc2vec(docs[k], n_vocab), 'logmax1', 0))
        del docs[k]
    X_docs = np.r_[X_docs]

    ae = load_ae_model(args.load_model)
    doc_codes = ae.predict(X_docs)
    dump_json(dict(zip(doc_keys, doc_codes.tolist())), args.output)
    print 'Saved doc codes file to %s' % args.output

    if args.save_topics:
        topics_strength = get_topics_strength(ae, revdict(vocab), topn=50)
        print_topics(topics_strength)
        # save_topics_strength(topics_strength, args.save_topics)
        save_chinese_topics_strength(topics_strength, args.save_topics)
        # topics = get_topics(ae, revdict(vocab), topn=10)
        # write_file(topics, args.save_topics)
        print 'Saved topics file to %s' % args.save_topics

    if args.word_clouds:
        queries = ['interest', 'trust', 'cash', 'payment', 'rate', 'price', 'stock', 'share',
                   'award', 'risk', 'security', 'bank', 'company', 'service', 'grant',
                   'agreement', 'proxy', 'loan', 'capital', 'asset', 'bonus', 'shareholder',
                   'income', 'financial', 'net', 'purchase', 'position', 'management', 'loss',
                   'salary', 'stockholder', 'due', 'business', 'transaction', 'govern',
                   'trading', 'tax', 'march', 'april', 'june', 'july']
        weights = ae.get_weights()[0]
        weights = unitmatrix(weights)  # normalize
        word_cloud(weights, vocab, queries, save_file=args.word_clouds)
        print 'Saved word clouds file to %s' % args.word_clouds

    if args.sample_words:
        revocab = revdict(vocab)
        queries = ['weapon', 'christian', 'compani', 'israel', 'law', 'hockey', 'comput', 'space']
        words = []
        for each in queries:
            if each in vocab:
                words.append(get_similar_words(ae, vocab[each], revocab, topn=11))
        write_file(words, args.sample_words)
        print 'Saved sample words file to %s' % args.sample_words

    if args.translate_words:
        revocab = revdict(vocab)
        queries = [['father', 'man', 'woman'], ['mother', 'woman', 'man']]
        for each in queries:
            print each
            print translate_words(ae, each, vocab, revocab, topn=10)

    if args.calc_distinct:
        # mean, std = calc_pairwise_cosine(ae)
        # print 'Average pairwise angle (pi): %s (%s)' % (mean / math.pi, std / math.pi)
        sd = calc_pairwise_dev(ae)
        print 'Average squared deviation from 0 (90 degree): %s' % sd
def get_words(args):
    corpus = load_corpus(args.input_corpus)
    filename_corpus_dict = corpus['docs']
    vocab_dict = corpus['vocab']
    # we have to reverse the dict (index -> word)
    dictionary = dict((v, k) for k, v in vocab_dict.iteritems())
    filename_label_dict = load_json(args.input_label)
    print 'Finished loading data'

    # start counting words per label
    label_vocab_dict = {}
    for filename in filename_corpus_dict:
        vocab_num_dict = filename_corpus_dict[filename]
        label = filename_label_dict[filename]
        try:
            label_vocab_dict[label]
        except:
            label_vocab_dict[label] = {}
        for vocab in vocab_num_dict:
            num = vocab_num_dict[vocab]
            # print 'Is num an int? : ', isinstance(num, int)
            try:
                label_vocab_dict[label][vocab] += num
            except:
                label_vocab_dict[label][vocab] = num
    print 'Finished counting word frequency'

    label_topword_dict = {}
    label_num = len(label_vocab_dict)  # number of distinct labels
    print 'Label num is ', label_num
    topn = args.topn
    for label in label_vocab_dict:
        vocab_num_dict = label_vocab_dict[label]
        label_topword_dict[label] = sorted(vocab_num_dict, key=vocab_num_dict.__getitem__, reverse=True)[:topn]
    print 'Finished sorting the top n words'

    dump_json(label_topword_dict, args.output_json)
    print 'Finished writing the json file'

    for label in label_topword_dict:
        filename_o = args.output_dir + 'label-' + str(label) + '.txt'
        print 'filename =', filename_o
        file_o = open(filename_o, 'w')
        for word_index in label_topword_dict[label]:
            # print 'Is word_index an int:', isinstance(word_index, int)
            text = dictionary[int(word_index)]
            text += '\n'
            file_o.write(text.encode('utf-8'))
        file_o.close()
    print 'Finished writing files!'
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('corpus', type=str, help='path to the corpus file')
    parser.add_argument('labels', type=str, help='path to the labels file')
    parser.add_argument('-bs', '--batch_size', type=int, default=100, help='batch size (default 100)')
    parser.add_argument('out_dir', type=str, help='path to the output dir')
    args = parser.parse_args()

    corpus = load_corpus(args.corpus)
    doc_labels = load_json(args.labels)
    vocab, docs = corpus['vocab'], corpus['docs']
    n_vocab = len(vocab)
    doc_names = docs.keys()
    X_docs = [doc2vec(x, n_vocab) for x in docs.values()]
    out_dir = args.out_dir

    # attributes (vocab sorted by word index)
    attrs = zip(*sorted(vocab.items(), key=lambda d: d[1]))[0]
    dump_pickle(attrs, os.path.join(out_dir, 'attributes.p'))

    # batches
    bs = args.batch_size
    batches = [bs * (x + 1) for x in range(int(len(docs) / bs) - 1)]
    batches.append(len(docs))
    dump_pickle(batches, os.path.join(out_dir, 'batches.p'))

    # bow_batch_x
    for i in range(len(batches)):
        dump_pickle(X_docs[batches[i - 1] if i > 0 else 0:batches[i]],
                    os.path.join(out_dir, 'bow_batch_%s.p' % batches[i]))

    # # docs_names_batch_x
    # for i in range(len(batches)):
    #     dump_pickle(doc_names[batches[i - 1] if i > 0 else 0:batches[i]],
    #                 os.path.join(out_dir, 'docs_names_batch_%s.p' % batches[i]))

    # class_indices_batch_x
    for i in range(len(batches)):
        data = [doc_labels[doc_names[idx]] for idx in range(batches[i - 1] if i > 0 else 0, batches[i])]
        dump_pickle(data, os.path.join(out_dir, 'class_indices_batch_%s.p' % batches[i]))

    import pdb; pdb.set_trace()
def test(args):
    corpus = load_corpus(args.input)
    vocab, docs = corpus['vocab'], corpus['docs']
    n_vocab = len(vocab)

    doc_keys = docs.keys()
    X_docs = []
    for k in doc_keys:
        X_docs.append(vecnorm(doc2vec(docs[k], n_vocab), 'logmax1', 0))
        del docs[k]
    X_docs = np.r_[X_docs]

    model = AutoEncoder
    # model = DeepAutoEncoder
    ae = load_model(model, args.load_arch, args.load_weights)

    doc_codes = ae.encoder.predict(X_docs)
    dump_json(dict(zip(doc_keys, doc_codes.tolist())), args.output)
    print('Saved doc codes file to %s' % args.output)

    if args.save_topics:
        topics_strength = get_topics_strength(ae, revdict(vocab), topn=10)
        save_topics_strength(topics_strength, args.save_topics)
        # topics = get_topics(ae, revdict(vocab), topn=10)
        # write_file(topics, args.save_topics)
        print('Saved topics file to %s' % args.save_topics)

    if args.sample_words:
        revocab = revdict(vocab)
        queries = ['weapon', 'christian', 'compani', 'israel', 'law', 'hockey', 'comput', 'space']
        words = []
        for each in queries:
            words.append(get_similar_words(ae, vocab[each], revocab, topn=11))
        write_file(words, args.sample_words)
        print('Saved sample words file to %s' % args.sample_words)

    if args.translate_words:
        revocab = revdict(vocab)
        queries = [['father', 'man', 'woman'], ['mother', 'woman', 'man']]
        for each in queries:
            print(each)
            print(translate_words(ae, each, vocab, revocab, topn=10))

    if args.calc_distinct:
        # mean, std = calc_pairwise_cosine(ae)
        # print('Average pairwise angle (pi): %s (%s)' % (mean / math.pi, std / math.pi))
        sd = calc_pairwise_dev(ae)
        print('Average squared deviation from 0 (90 degree): %s' % sd)
def gen_docs(args):
    # corpus = load_corpus("./data/20news/output/test.corpus")
    corpus = load_corpus(args.input_corpus)
    vocab, docs = corpus['vocab'], corpus['docs']
    n_vocab = len(vocab)

    # new docs: keep only documents whose key starts with the given prefix
    new_docs = {}
    for doc_key in docs.keys():
        if doc_key.startswith(args.startswith):
            new_docs[doc_key] = docs[doc_key]
    print("{}: {} documents in total".format(args.startswith, len(new_docs)))
    dump_json({"vocab": vocab, "docs": new_docs}, args.output)
def test(args):
    corpus = load_corpus(args.corpus)
    vocab, docs = corpus['vocab'], corpus['docs']

    doc_bow = {}
    for k in docs.keys():
        bows = []
        for idx, count in docs[k].iteritems():
            bows.append((int(idx), count))
        doc_bow[k] = bows
        del docs[k]

    lda = load_model(args.load_model)
    generate_doc_codes(lda, doc_bow, args.output)
    print 'Saved doc codes file to %s' % args.output

    if args.word_clouds:
        queries = ['interest', 'trust', 'cash', 'payment', 'rate', 'price', 'stock', 'share',
                   'award', 'risk', 'security', 'bank', 'company', 'service', 'grant',
                   'agreement', 'proxy', 'loan', 'capital', 'asset', 'bonus', 'shareholder',
                   'income', 'financial', 'net', 'purchase', 'position', 'management', 'loss',
                   'salary', 'stockholder', 'due', 'business', 'transaction', 'govern',
                   'trading', 'tax', 'march', 'june']
        # queries = ['interest', 'trust', 'cash', 'payment', 'rate', 'price', 'stock', 'share',
        #            'award', 'risk', 'security', 'bank', 'company', 'service', 'grant', 'agreement',
        #            'proxy', 'loan', 'capital', 'asset', 'bonus', 'shareholder', 'income', 'financial',
        #            'net', 'purchase', 'position', 'management', 'loss', 'salary', 'stockholder', 'due',
        #            'business', 'transaction', 'govern', 'trading', 'tax', 'three', 'four', 'five',
        #            'eleven', 'thirteen', 'fifteen', 'eighteen', 'twenty']
        weights = lda.state.get_lambda()
        weights = np.apply_along_axis(lambda x: x / x.sum(), 1, weights)  # get dist.
        # weights = unitmatrix(weights, axis=1)  # normalize
        word_cloud(weights.T, vocab, queries, save_file=args.word_clouds)
        print 'Saved word clouds file to %s' % args.word_clouds

    if args.save_topics:
        topics_prob = show_topics_prob(lda)
        save_topics_prob(topics_prob, args.save_topics)
        # topics = show_topics(lda)
        # write_file(topics, args.save_topics)
        print 'Saved topics file to %s' % args.save_topics

    if args.calc_distinct:
        # mean, std = calc_pairwise_cosine(lda)
        # print 'Average pairwise angle (pi): %s (%s)' % (mean / math.pi, std / math.pi)
        sd = calc_pairwise_dev(lda)
        print 'Average squared deviation from 0 (90 degree): %s' % sd
def test(args):
    corpus = load_corpus(args.input)
    vocab, docs = corpus['vocab'], corpus['docs']
    n_vocab = len(vocab)

    doc_keys = docs.keys()
    X_docs = []
    for k in doc_keys:
        X_docs.append(vecnorm(doc2vec(docs[k], n_vocab), 'logmax1', 0))
        del docs[k]
    X_docs = np.r_[X_docs]

    vae = load_vae_model(args.load_model)
    doc_codes = vae.predict(X_docs)
    dump_json(dict(zip(doc_keys, doc_codes.tolist())), args.output)
    print 'Saved doc codes file to %s' % args.output
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--corpus', required=True, type=str, help='path to the corpus file')
    parser.add_argument('-mf', '--mod_file', required=True, type=str, help='path to the word2vec mod file')
    parser.add_argument('-sw', '--sample_words', type=str, help='path to the output sample words file')
    parser.add_argument('-o', '--output', type=str, help='path to the output doc codes file')
    args = parser.parse_args()

    corpus = load_corpus(args.corpus)
    docs, vocab_dict = corpus['docs'], corpus['vocab']
    w2v = load_w2v(args.mod_file)
    # doc_codes = doc_word2vec(w2v, docs, revdict(vocab_dict), args.output, avg=True)

    if args.sample_words:
        queries = ['weapon', 'christian', 'compani', 'israel', 'law', 'hockey', 'comput', 'space']
        words = []
        for each in queries:
            words.append(get_similar_words(w2v, each, topn=5))
        write_file(words, args.sample_words)
        print('Saved sample words file to %s' % args.sample_words)

    import pdb; pdb.set_trace()
def train(args):
    corpus = load_corpus(args.corpus)
    docs, vocab_dict = corpus['docs'], corpus['vocab']

    doc_bow = []
    doc_keys = docs.keys()
    for k in doc_keys:
        bows = []
        for idx, count in docs[k].iteritems():
            bows.append((int(idx), count))
        doc_bow.append(bows)
        del docs[k]
    vocab_dict = dict([(int(y), x) for x, y in vocab_dict.iteritems()])

    n_samples = len(doc_bow)
    doc_bow = np.array(doc_bow)
    np.random.seed(0)
    val_idx = np.random.choice(range(n_samples), args.n_val, replace=False)
    train_idx = list(set(range(n_samples)) - set(val_idx))
    dbow_train = doc_bow[train_idx].tolist()
    dbow_val = doc_bow[val_idx].tolist()
    del doc_bow

    start = timeit.default_timer()
    lda = train_lda(dbow_train, vocab_dict, args.n_topics, args.n_iter, args.save_model)
    print 'runtime: %ss' % (timeit.default_timer() - start)

    if args.output:
        doc_keys = np.array(doc_keys)
        generate_doc_codes(lda, dict(zip(doc_keys[train_idx].tolist(), dbow_train)), args.output + '.train')
        generate_doc_codes(lda, dict(zip(doc_keys[val_idx].tolist(), dbow_val)), args.output + '.val')
        print 'Saved doc codes file to %s and %s' % (args.output + '.train', args.output + '.val')
def train(args):
    corpus = load_corpus(args.input)
    n_vocab, docs = len(corpus['vocab']), corpus['docs']
    corpus.clear()  # save memory

    doc_keys = docs.keys()
    X_docs = []
    for k in doc_keys:
        X_docs.append(vecnorm(doc2vec(docs[k], n_vocab), 'logmax1', 0))
        del docs[k]
    X_docs = np.r_[X_docs]

    if args.noise == 'gs':
        X_docs_noisy = add_gaussian_noise(X_docs, 0.1)
    elif args.noise == 'sp':
        X_docs_noisy = add_salt_pepper_noise(X_docs, 0.1)
    elif args.noise == 'mn':
        X_docs_noisy = add_masking_noise(X_docs, 0.01)
    else:
        pass

    n_samples = X_docs.shape[0]
    np.random.seed(0)
    val_idx = np.random.choice(range(n_samples), args.n_val, replace=False)
    train_idx = list(set(range(n_samples)) - set(val_idx))
    X_train = X_docs[train_idx]
    X_val = X_docs[val_idx]
    del X_docs

    if args.noise:
        # X_train_noisy = X_docs_noisy[:-n_val]
        # X_val_noisy = X_docs_noisy[-n_val:]
        X_train_noisy = X_docs_noisy[train_idx]
        X_val_noisy = X_docs_noisy[val_idx]
        print 'added %s noise' % args.noise
    else:
        X_train_noisy = X_train
        X_val_noisy = X_val

    start = timeit.default_timer()
    ae = AutoEncoder(n_vocab, args.n_dim, comp_topk=args.comp_topk, ctype=args.ctype, save_model=args.save_model)
    ae.fit([X_train_noisy, X_train], [X_val_noisy, X_val], nb_epoch=args.n_epoch,
           batch_size=args.batch_size, contractive=args.contractive)
    print 'runtime: %ss' % (timeit.default_timer() - start)

    if args.output:
        train_doc_codes = ae.encoder.predict(X_train)
        val_doc_codes = ae.encoder.predict(X_val)
        doc_keys = np.array(doc_keys)
        dump_json(dict(zip(doc_keys[train_idx].tolist(), train_doc_codes.tolist())), args.output + '.train')
        dump_json(dict(zip(doc_keys[val_idx].tolist(), val_doc_codes.tolist())), args.output + '.val')
        print 'Saved doc codes file to %s and %s' % (args.output + '.train', args.output + '.val')
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('train_doc_codes', type=str, help='path to the train doc codes file')
    parser.add_argument('train_doc_labels', type=str, help='path to the train doc labels file')
    parser.add_argument('test_doc_codes', type=str, help='path to the test doc codes file')
    parser.add_argument('test_doc_labels', type=str, help='path to the test doc labels file')
    parser.add_argument('-nv', '--n_val', type=int, default=1000, help='size of validation set (default 1000)')
    parser.add_argument('-qi', '--query_info', type=str, help='path to the query corpus (for getting doc length info)')
    parser.add_argument('-ml', '--multilabel', action='store_true', help='multilabel flag')
    args = parser.parse_args()

    # autoencoder
    train_doc_codes = load_json(args.train_doc_codes)
    train_doc_labels = load_json(args.train_doc_labels)
    test_doc_codes = load_json(args.test_doc_codes)
    test_doc_labels = load_json(args.test_doc_labels)
    X_train = np.r_[train_doc_codes.values()]
    Y_train = np.array([train_doc_labels[i] for i in train_doc_codes])
    X_test = np.r_[test_doc_codes.values()]
    Y_test = np.array([test_doc_labels[i] for i in test_doc_codes])

    # # DocNADE
    # train_doc_codes = load_json(args.train_doc_codes)
    # train_doc_labels = load_json(args.train_doc_labels)
    # test_doc_codes = load_json(args.test_doc_codes)
    # test_doc_labels = load_json(args.test_doc_labels)
    # X_train = []
    # for each in train_doc_codes.values():
    #     X_train.append([float(x) for x in each])
    # X_test = []
    # for each in test_doc_codes.values():
    #     X_test.append([float(x) for x in each])
    # X_train = np.r_[X_train]
    # Y_train = np.array([train_doc_labels[i] for i in train_doc_codes])
    # X_test = np.r_[X_test]
    # Y_test = np.array([test_doc_labels[i] for i in test_doc_codes])

    # # DBN
    # X_train = np.array(load_marshal(args.train_doc_codes))
    # Y_train = np.array(load_marshal(args.train_doc_labels))
    # X_test = np.array(load_marshal(args.test_doc_codes))
    # Y_test = np.array(load_marshal(args.test_doc_labels))

    seed = 7
    np.random.seed(seed)
    val_idx = np.random.choice(range(X_train.shape[0]), args.n_val, replace=False)
    train_idx = list(set(range(X_train.shape[0])) - set(val_idx))
    X_new_train = X_train[train_idx]
    Y_new_train = Y_train[train_idx]
    X_new_val = X_train[val_idx]
    Y_new_val = Y_train[val_idx]
    print 'train: %s, val: %s, test: %s' % (X_new_train.shape[0], X_new_val.shape[0], X_test.shape[0])

    results = retrieval(X_new_train, Y_new_train, X_new_val, Y_new_val,
                        fractions=[0.001], multilabel=args.multilabel)
    print 'precision on val set: %s' % results

    if not args.query_info:
        results = retrieval(X_train, Y_train, X_test, Y_test,
                            fractions=[0.001, 0.002, 0.005, 0.01, 0.02, 0.05, 0.1, 0.2, 0.5, 1.0],
                            multilabel=args.multilabel)
    else:
        query_docs = load_corpus(args.query_info)['docs']
        len_test = [sum(query_docs[i].values()) for i in test_doc_codes]
        results = retrieval_by_doclength(X_train, Y_train, X_test, Y_test, len_test,
                                         fraction=0.001, multilabel=args.multilabel)
    print 'precision on test set: %s' % results

    import pdb; pdb.set_trace()
def train(args):
    corpus = load_corpus(args.input)
    n_vocab, docs = len(corpus['vocab']), corpus['docs']
    # vocab = corpus['vocab']
    corpus.clear()  # save memory

    doc_keys = docs.keys()
    X_docs = []
    for k in doc_keys:
        X_docs.append(vecnorm(doc2vec(docs[k], n_vocab), 'logmax1', 0))
        del docs[k]
    X_docs = np.r_[X_docs]

    if args.noise == 'gs':
        X_docs_noisy = add_gaussian_noise(X_docs, 0.1)
    elif args.noise == 'sp':
        X_docs_noisy = add_salt_pepper_noise(X_docs, 0.1)
    elif args.noise == 'mn':
        X_docs_noisy = add_masking_noise(X_docs, 0.01)
    else:
        pass

    n_samples = X_docs.shape[0]
    np.random.seed(0)
    val_idx = np.random.choice(range(n_samples), args.n_val, replace=False)
    train_idx = list(set(range(n_samples)) - set(val_idx))
    X_train = X_docs[train_idx]
    X_val = X_docs[val_idx]
    del X_docs

    if args.noise:
        # X_train_noisy = X_docs_noisy[:-n_val]
        # X_val_noisy = X_docs_noisy[-n_val:]
        X_train_noisy = X_docs_noisy[train_idx]
        X_val_noisy = X_docs_noisy[val_idx]
        print 'added %s noise' % args.noise
    else:
        X_train_noisy = X_train
        X_val_noisy = X_val

    start = timeit.default_timer()
    ae = AutoEncoder(n_vocab, args.n_dim, comp_topk=args.comp_topk, ctype=args.ctype, save_model=args.save_model)
    ae.fit([X_train_noisy, X_train], [X_val_noisy, X_val], nb_epoch=args.n_epoch,
           batch_size=args.batch_size, contractive=args.contractive)
    print 'runtime: %ss' % (timeit.default_timer() - start)

    if args.output:
        train_doc_codes = ae.encoder.predict(X_train)
        val_doc_codes = ae.encoder.predict(X_val)
        doc_keys = np.array(doc_keys)
        dump_json(dict(zip(doc_keys[train_idx].tolist(), train_doc_codes.tolist())), args.output + '.train')
        dump_json(dict(zip(doc_keys[val_idx].tolist(), val_doc_codes.tolist())), args.output + '.val')
        print 'Saved doc codes file to %s and %s' % (args.output + '.train', args.output + '.val')


def unitmatrix(matrix, norm='l2', axis=1):
    if norm == 'l1':
        maxtrixlen = np.sum(np.abs(matrix), axis=axis)
    if norm == 'l2':
        maxtrixlen = np.linalg.norm(matrix, axis=axis)

    if np.any(maxtrixlen <= 0):
        return matrix
    else:
        maxtrixlen = maxtrixlen.reshape(1, len(maxtrixlen)) if axis == 0 else maxtrixlen.reshape(len(maxtrixlen), 1)
        return matrix / maxtrixlen


def calc_pairwise_dev(weights):
    # the average squared deviation from 0 (90 degree)
    weights = unitmatrix(weights, axis=0)  # normalize
    n = weights.shape[1]
    score = 0.
    for i in range(n):
        for j in range(i + 1, n):
            score += (weights[:, i].dot(weights[:, j]))**2
    return np.sqrt(2. * score / n / (n - 1))


from keras.models import load_model
def kmeans2(args):
    sentense_vec_dic = load_corpus(args.input)
    vec_name_u = load_corpus(args.question_name)
    print("if sentense_vec is a dict:")
    print(isinstance(sentense_vec_dic, dict))
    print("if vec_name is a list:")
    print(isinstance(vec_name_u, list))

    vec = []
    vec_name = []
    for key in vec_name_u:
        filename = key.encode('utf-8')
        if filename in sentense_vec_dic.keys():
            vec.append(sentense_vec_dic[filename])
            vec_name.append(filename)
    print "file number is ", len(vec_name)

    sentense_vec_X = np.array(vec)
    print "doing k-means...."
    kmeans = KMeans(n_clusters=args.cluster_num, random_state=0).fit(sentense_vec_X)
    print "generate label"
    label_ls = kmeans.labels_
    filename_label_dic = {}
    filesize = len(vec_name)
    for i in range(filesize):
        filename_label_dic[vec_name[i]] = label_ls[i]

    text_filename = args.text_file
    filename_text_dict = {}
    try:
        fp = open(text_filename, 'r')
        count_doc = 0
        while 1:
            lines = fp.readlines()
            if not lines:
                break
            for sentense in lines:
                # print(sentense)
                text = sentense.decode('utf-8').strip('\r\n')
                count_doc += 1
                doc_name = 'line-' + str(count_doc)
                filename_text_dict[doc_name] = text
    except Exception as e:
        raise e

    label_text_ls = []
    for i in range(args.cluster_num):
        ls = []
        label_text_ls.append(ls)
    for key in filename_label_dic:
        label = filename_label_dic[key]
        content = filename_text_dict[key]
        # print 'content of ', content, 'and the label is [', label, ']'
        label_text_ls[label].append(content)

    file_dict = {}
    for i in range(args.cluster_num):
        filename_o = args.output_dir + 'label-' + str(i) + '.txt'
        print 'filename =', filename_o
        file_o = open(filename_o, 'w')
        for text in label_text_ls[i]:
            text += '\n'
            file_o.write(text.encode('utf-8'))
        file_o.close()
def get_word_relationship(args):
    corpus = load_corpus(args.input_corpus)
    doc_vec_dict = corpus['docs']
    vocab_dict = corpus['vocab']
    print 'Loaded corpus'
    # we have to reverse the dict (index -> word)
    dictionary = dict((v, k) for k, v in vocab_dict.iteritems())

    # Here the input top words path is the json file of the label-topwords_ls.
    # It should be a dict; each key is a label and its value is the list of top words.
    top_words_path = args.input_topwords
    label_topwordls = load_json(top_words_path)
    print 'Loaded top words of each label'

    label_topwords_vocabnum_dict = {}
    label_topwordindexls_dict = {}
    for label in label_topwordls:
        label_topwords_vocabnum_dict[label] = {}
        topwords_index_ls = []
        for word in label_topwordls[label]:
            topwords_index_ls.append(word)
            label_topwords_vocabnum_dict[label][word] = {}
        label_topwordindexls_dict[label] = topwords_index_ls
    print 'Finished changing words into indices'

    # In order to save memory and speed things up, only calculate the word-word
    # frequency for words in the top word list.
    for label in label_topwordindexls_dict:
        print 'Doing label', str(label)
        topwords_idx_set = set(label_topwordindexls_dict[label])
        for filename in doc_vec_dict:
            word_vec_dict = doc_vec_dict[filename]
            result_word_ls = get_word_list(word_vec_dict, topwords_idx_set)
            for word in result_word_ls:
                for doc_word in word_vec_dict:
                    try:
                        label_topwords_vocabnum_dict[label][word][doc_word] += word_vec_dict[doc_word]
                    except:
                        label_topwords_vocabnum_dict[label][word][doc_word] = word_vec_dict[doc_word]
    print 'Finished building the dict of label-topwords-words-num!'

    # now we should get the top n related words
    topn = args.topn
    # it is a dict-dict-ls ({label: {word: [top relative words]}})
    label_topwords_relativewords = {}
    for label in label_topwords_vocabnum_dict:
        label_topwords_relativewords[label] = {}
        for word in label_topwords_vocabnum_dict[label]:
            vocab_num_dict = label_topwords_vocabnum_dict[label][word]
            label_topwords_relativewords[label][word] = sorted(vocab_num_dict, key=vocab_num_dict.__getitem__, reverse=True)[:topn]
    print 'Finished sorting the top n words'

    dump_json(label_topwords_relativewords, args.output_json)
    print 'Finished writing the json file'

    for label in label_topwords_relativewords:
        filename_o = args.output_dir + 'label-' + str(label) + '.txt'
        print 'filename =', filename_o
        file_o = open(filename_o, 'w')
        for word_index in label_topwords_relativewords[label]:
            # print 'Is word_index an int:', isinstance(word_index, int)
            text = dictionary[int(word_index)]
            text += ': '
            for top_relative_wordidx in label_topwords_relativewords[label][word_index]:
                text += dictionary[int(top_relative_wordidx)]
                text += ', '
            text += '\n'
            file_o.write(text.encode('utf-8'))
        file_o.close()
    print 'Finished writing files!'
def learn_embedding(self, graph=None, edge_f=None, is_weighted=False, no_python=False, path_output="", dataset=""):
    n_dim = self._d
    method = "sdne"
    input = path_output + '/train.corpus'
    path_graph_embedding = path_source + "embedding/" + dataset + "/embedding_gem_sdne_" + dataset + "_" + str(n_dim) + ".txt"
    path_graph_embedding_id = path_source + "embedding/" + dataset + "/id_gem_" + method + "_" + dataset + "_" + str(n_dim) + ".txt"
    save_model = 'model'
    optimizer = "adadelta"
    val_split = 0.0214
    batch_size = self._batch_size
    comp_topk = self._comp_topk
    optimizer = self._optimizer
    lr = self._lr
    alpha = self._alpha
    kfactor = self._kfactor
    gamma = self._gamma
    select_diff = self._select_diff
    select_loss = self._select_loss
    select_graph_np_diff = self._select_graph_np_diff
    contractive = None
    ctype = "kcomp"
    n_dim = 128
    nb_epoch = 1000
    save_model = 'model'

    if not graph and not edge_f:
        raise Exception('graph/edge_f needed')
    if not graph:
        graph = graph_util.loadGraphFromEdgeListTxt(edge_f)

    num_nodes = graph.number_of_nodes()
    graph3 = nx.DiGraph()
    graph3.add_nodes_from(range(0, num_nodes))
    f1 = csv.reader(open(edge_f, "r"), delimiter=' ')
    for x, y in f1:
        # print(x, y)
        graph3.add_edge(int(x), int(y))

    S = nx.to_scipy_sparse_matrix(graph, nodelist=sorted(graph.nodes()))
    t1 = time()
    S = (S + S.T) / 2
    node_num = graph.number_of_nodes()
    edges_num = graph.number_of_edges()
    dict_nodes = {k: v for v, k in enumerate(sorted(graph.nodes()))}

    ## Load Graph Embeddings
    if path_graph_embedding.endswith(".txt"):
        print("Loading SDNE embeddings")
        graph_embeddings = np.loadtxt(path_graph_embedding, delimiter=',')
        with open(path_graph_embedding_id) as temp_file:
            graph_embedding_id = [line.rstrip('\n') for line in temp_file]
        dict_graph = {k: v for v, k in enumerate(graph_embedding_id)}
    else:
        raise Exception('sdne embeddings do not exist')
        graph_embeddings = pickle.load(open(path_graph_embedding, "rb"))

    ## Load text data
    print("Loading textual corpus")
    corpus = load_corpus(input)
    n_vocab = len(corpus['vocab'])
    docs = corpus['docs']
    corpus.clear()  # save memory
    doc_keys = np.array(list(docs))
    dict_doc = {int(k): v for v, k in enumerate(doc_keys)}
    X_docs = []
    for k in list(docs):
        X_docs.append(vecnorm(doc2vec(docs[k], n_vocab), 'logmax1', 0))
        del docs[k]
    X_docs = np.r_[X_docs]
    # dump_json(dict(zip(doc_keys.tolist(), X_docs.tolist())), path_source + 'embedding\\' + dataset + '\\bow.txt')

    text_vector = self.get_node_representation(graph, X_docs, dict_doc)
    graph_vector = self.get_node_representation(graph, graph_embeddings, dict_nodes)
    # return S, node_num, edges_num, graph_embeddings, X_docs, n_vocab, doc_keys, text_vector, graph_vector

    train_data = [text_vector, text_vector, graph_vector]
    result, _Y, model = fit_quadruple_hyperas(n_vocab, n_dim, comp_topk=comp_topk, ctype=ctype, save_model=save_model,
                                              kfactor=kfactor, alpha=alpha, gamma=gamma,
                                              num_nodes=node_num, num_edges=edges_num,
                                              train_data=train_data, test_data=X_docs,
                                              val_split=val_split, nb_epoch=nb_epoch,
                                              batch_size=batch_size, contractive=contractive,
                                              optimizer=optimizer, lr=lr,
                                              select_diff=select_diff, select_loss=select_loss,
                                              select_graph_np_diff=select_graph_np_diff)

    dump_json(dict(zip(doc_keys.tolist(), _Y.tolist())),
              path_source + 'embedding\\' + dataset + '\\predicted_cage_embedding.txt')
    print('Saved doc codes file')

    self._Y = _Y
    self._node_num = node_num
    self._X = X_docs
    _Y_id = doc_keys.tolist()
    return _Y, _Y_id, len(result.history["loss"]), t1
def kmeans(args):
    sentense_vec_dic = load_corpus(args.input)
    print("if sentense_vec is a dict:")
    print(isinstance(sentense_vec_dic, dict))

    vec = []
    vec_name = []
    for key in sentense_vec_dic:
        vec.append(sentense_vec_dic[key])
        vec_name.append(key)
    print "dict size is ", len(sentense_vec_dic)

    sentense_vec_X = np.array(vec)
    print "doing k-means...."
    if args.is_large_set:
        print "Do it on a large data set"
        kmeans = MiniBatchKMeans(n_clusters=args.cluster_num, random_state=0).fit(sentense_vec_X)
    else:
        print "Do it on a small data set"
        kmeans = KMeans(n_clusters=args.cluster_num, random_state=0).fit(sentense_vec_X)
    print "generate label"
    label_ls = kmeans.labels_
    filename_label_dic = {}
    filesize = len(sentense_vec_dic)
    for i in range(filesize):
        filename_label_dic[vec_name[i]] = int(label_ls[i])

    if args.output_json:
        print 'Write the labels to the json file'
        dump_json(filename_label_dic, args.output_json)
        # with io.open(args.output_json, 'w', encoding='utf-8') as f:
        #     f.write(json.dumps(filename_label_dic, ensure_ascii=False))
        print 'Finished writing filename_label dict to file'

    text_filename = args.text_file
    filename_text_dict = {}
    try:
        fp = open(text_filename, 'r')
        count_doc = 0
        while 1:
            lines = fp.readlines()
            if not lines:
                break
            for sentense in lines:
                # print(sentense)
                text = sentense.decode('utf-8').strip('\r\n')
                count_doc += 1
                doc_name = 'line-' + str(count_doc)
                filename_text_dict[doc_name] = text
    except Exception as e:
        raise e

    label_text_ls = []
    for i in range(args.cluster_num):
        ls = []
        label_text_ls.append(ls)
    for key in filename_label_dic:
        label = filename_label_dic[key]
        content = filename_text_dict[key]
        # print 'content of ', content, 'and the label is [', label, ']'
        label_text_ls[label].append(content)

    file_dict = {}
    for i in range(args.cluster_num):
        filename_o = args.output_dir + 'label-' + str(i) + '.txt'
        print 'filename =', filename_o
        file_o = open(filename_o, 'w')
        for text in label_text_ls[i]:
            text += '\n'
            file_o.write(text.encode('utf-8'))
        file_o.close()
def test(args):
    corpus = load_corpus(args.input)
    vocab, docs = corpus['vocab'], corpus['docs']
    n_vocab = len(vocab)

    doc_keys = list(docs.keys())
    X_docs = []
    for k in doc_keys:
        X_docs.append(vecnorm(doc2vec(docs[k], n_vocab), 'logmax1', 0))
        del docs[k]
    X_docs = np.r_[X_docs]

    ae = load_ae_model(args.load_model)
    doc_codes = ae.predict(X_docs)
    dump_json(dict(zip(doc_keys, doc_codes.tolist())), args.output)
    print('Saved doc codes file to %s' % args.output)

    if args.save_topics:
        topics_strength = get_topics_strength(ae, revdict(vocab), topn=10)
        save_topics_strength(topics_strength, args.save_topics)
        # topics = get_topics(ae, revdict(vocab), topn=10)
        # write_file(topics, args.save_topics)
        print('Saved topics file to %s' % args.save_topics)

    if args.word_clouds:
        queries = ['interest', 'trust', 'cash', 'payment', 'rate', 'price', 'stock', 'share',
                   'award', 'risk', 'security', 'bank', 'company', 'service', 'grant',
                   'agreement', 'proxy', 'loan', 'capital', 'asset', 'bonus', 'shareholder',
                   'income', 'financial', 'net', 'purchase', 'position', 'management', 'loss',
                   'salary', 'stockholder', 'due', 'business', 'transaction', 'govern',
                   'trading', 'tax', 'march', 'april', 'june', 'july']
        weights = ae.get_weights()[0]
        weights = unitmatrix(weights)  # normalize
        word_cloud(weights, vocab, queries, save_file=args.word_clouds)
        print('Saved word clouds file to %s' % args.word_clouds)

    if args.sample_words:
        revocab = revdict(vocab)
        while True:
            print("----------------------------\n? ", end='')
            sys.stdout.flush()
            query = sys.stdin.readline()
            query = re.sub(r'[^\w\s-]', ' ', query)  # remove punctuations except hyphen
            query_words = []
            for word in query.lower().split():  # convert to lowercase
                if word not in stopwords.words('english'):  # remove stop words
                    query_words.append(word)
            # ===== make the query length to be (32) = times_steps size
            """long_enough = False
            while not long_enough:
                for word in query_words:
                    query_vectors.append(word2vec_map[word])
                    if len(query_vectors) == 32:
                        long_enough = True
                        break"""
            words = []
            for each in query_words:
                words.append(get_similar_words(ae, vocab[each], revocab, topn=11))
            write_file(words, args.sample_words)
            print('Saved sample words file to %s' % args.sample_words)

    if args.translate_words:
        revocab = revdict(vocab)
        queries = [['father', 'man', 'woman'], ['mother', 'woman', 'man']]
        for each in queries:
            print(each)
            print(translate_words(ae, each, vocab, revocab, topn=10))

    if args.calc_distinct:
        # mean, std = calc_pairwise_cosine(ae)
        # print 'Average pairwise angle (pi): %s (%s)' % (mean / math.pi, std / math.pi)
        sd = calc_pairwise_dev(ae)
        print('Average squared deviation from 0 (90 degree): %s' % sd)