def build_graph(self, phrase_max_size=1, composition_function='RNN', dim=100,
                batch_size=1, neg=1, learning_rate=0.2, id_word=[], freq_table=[],
                init_word_data=[], init_context_data=[], epoch_num=1,
                embedding_train=False):
    if init_word_data:
        init_word_matrix = utils.read_embedding(init_word_data, id_word)
    else:
        init_word_matrix = []
    if init_context_data:
        init_context_matrix = utils.read_embedding(init_context_data, id_word)
    else:
        init_context_matrix = []
    holder, composed, context_id, true_logit, negative_logit = self.forward(
        phrase_max_size, composition_function, dim, batch_size, neg, freq_table,
        init_word_matrix, init_context_matrix, embedding_train)
    self.holder = holder
    self.composed = composed
    self.context_id = context_id
    self.true_logit = true_logit
    self.negative_logit = negative_logit
    loss = self.nce_loss(true_logit, negative_logit, batch_size, embedding_train)
    self.loss = loss
    for length in loss:
        tf.scalar_summary('%s_SGNS_loss' % length, loss[length])
    processed_num, lr, optimize_op = self.optimizer(
        loss, sum(freq_table), epoch_num, initial_learning_rate=learning_rate)
    self.processed_num = processed_num
    self.optimize_op = optimize_op
    self.lr = lr
    # initialize all values
    tf.initialize_all_variables().run()
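# Hypothetical call site for build_graph (the owner class and argument values are
# assumptions, shown only to illustrate the expected setup):
#
#   with tf.Graph().as_default(), tf.Session() as sess:
#       model = PhraseModel(...)  # hypothetical class that defines build_graph
#       with sess.as_default():
#           model.build_graph(phrase_max_size=2, dim=100, batch_size=128, neg=5,
#                             id_word=id_word, freq_table=freq_table)
#
# Because build_graph ends with tf.initialize_all_variables().run(), it must be
# called while a default session is active (TF 0.x API).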
def train(self, sess, saver):
    self.logger.info("training model")
    best_accuracy = 0
    patient_passes = 0
    self.logger.info("init summary")
    self.add_summary(sess.graph)
    self.logger.info("init session")
    self.attach_session(sess)
    embedding, voc_size = read_embedding(self.flags.embedding_data)
    _ = sess.run(self.embedding_init,
                 {self.embedding_placeholder: embedding})
    for epoch in range(self.flags.epoch):
        self.logger.info("Running epoch {} of {}".format(
            epoch + 1, self.flags.epoch))
        accuracy, loss = self.train_epoch(epoch, embedding, saver)
        self.logger.info(
            'accuracy on dev {} loss on dev {}'.format(accuracy, loss))
        if accuracy <= best_accuracy:
            patient_passes += 1
            if patient_passes == self.flags.patient_passes:
                self.logger.info(
                    ' - {} epochs without improvement, training stopped'.format(
                        patient_passes))
                break
        else:
            self.logger.info('- New best accuracy {}'.format(accuracy))
            best_accuracy = accuracy
            patient_passes = 0
            saver.save(sess, os.path.join(self.model_dir, "model"),
                       global_step=self.step)
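# The embedding_placeholder / embedding_init pair used above is the usual
# "feed pretrained vectors once" pattern. A minimal sketch of how those ops are
# typically defined (an assumption about this model's graph, not its actual code):
#
#   self.embedding_placeholder = tf.placeholder(tf.float32, [voc_size, dim])
#   word_embedding = tf.Variable(tf.zeros([voc_size, dim]), trainable=False)
#   self.embedding_init = word_embedding.assign(self.embedding_placeholder)
#
# so that sess.run(self.embedding_init, {self.embedding_placeholder: embedding})
# copies the numpy matrix into the variable without baking it into the graph.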
def preprocess(self, train, test, mode):
    '''
    Preprocess data and do feature extraction

    Args:
        train (dataframe): train data in dataframe format
        test (dataframe): validation data in dataframe format
        mode (str): whether it is "training", "eval", or "prediction"

    Returns:
        X: train data with shape=[num_examples, max_len, embedding_dim]
        Y: train labels with shape=[num_examples,]
        test_X: validation data with shape=[num_examples, max_len, embedding_dim]
        test_Y: validation labels with shape=[num_examples,]
        num_words: number of words in the embedding vocabulary
        embedding_dim: dimensionality of the word embeddings
    '''
    embeddings_dict, embedding_dim = utils.read_embedding(self.embedding_model)
    num_words = len(embeddings_dict)
    X, Y, test_X, test_Y = utils.create_embedding_features(
        mode, train, test, self.input_col, self.target_col, embeddings_dict,
        embedding_dim, stopwords_file=self.stopwords_file, max_len=self.max_len,
        limit=self.limit)
    return X, Y, test_X, test_Y, num_words, embedding_dim
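# Hypothetical usage of preprocess (the dataframe names are assumptions):
#
#   X, Y, test_X, test_Y, num_words, embedding_dim = model.preprocess(
#       train_df, dev_df, mode="training")
#
# X and test_X feed the network; (num_words, embedding_dim) size the embedding layer.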
def income_different_size_embedding_scenario():
    # names = np.array(
    #     [['ridge without emd', 'RF without emd'],
    #      ['ridge with emd', 'RF with emd'],
    #      ['ridge just emd', 'RF just emd']])
    # names = [['ridge', 'RF']]
    names = [['ridge']]
    y_path = '../local_resources/Socio_economic_classification_data/income_dataset/y_thresh10.p'
    target = utils.read_target(y_path)
    y = np.array(target['mean_income'])
    n_folds = 10
    sizes = [16, 32, 64, 128]
    for size in sizes:
        print 'running embeddings of size ', size
        emd_path = '../local_resources/Socio_economic_classification_data/income_dataset/thresh10_{0}.emd'.format(
            size)
        x = utils.read_embedding(emd_path, target)
        results = run_all_datasets([x], y, names, regressors, n_folds)
        # all_results = utils.merge_results(results)
        all_results = pd.concat([x for x in results])
        all_results.rename(columns={n_folds: 'train'}, inplace=True)
        results, tests = t_tests(all_results)
        print results
        path = '../results/income/thresh10_' + str(size) + '_' + utils.get_timestamp() + '.csv'
        results.to_csv(path, index=True)
def karate_scenario():
    deepwalk_path = 'local_resources/zachary_karate/size8_walks1_len10.emd'
    y_path = 'local_resources/zachary_karate/y.p'
    x_path = 'local_resources/zachary_karate/X.p'
    target = utils.read_target(y_path)
    x, y = utils.read_data(x_path, y_path, threshold=0)
    names = [['logistic'], ['deepwalk']]
    x_deepwalk = utils.read_embedding(deepwalk_path, target)
    # all_features = np.concatenate((x.toarray(), x_deepwalk), axis=1)
    X = [x_deepwalk, normalize(x, axis=0)]
    n_folds = 2
    results = run_all_datasets(X, y, names, classifiers, n_folds)
    all_results = utils.merge_results(results)
    results, tests = utils.stats_test(all_results)
    tests[0].to_csv('results/karate/deepwalk_macro_pvalues' + utils.get_timestamp() + '.csv')
    tests[1].to_csv('results/karate/deepwalk_micro_pvalues' + utils.get_timestamp() + '.csv')
    print 'macro', results[0]
    print 'micro', results[1]
    macro_path = 'results/karate/deepwalk_macro' + utils.get_timestamp() + '.csv'
    micro_path = 'results/karate/deepwalk_micro' + utils.get_timestamp() + '.csv'
    results[0].to_csv(macro_path, index=True)
    results[1].to_csv(micro_path, index=True)
def _build_vocab(self, counter, embedding_config):
    """
    :param counter: counter of words in dataset
    :param embedding_config: word embedding config: (root, word_type, dim)
    :return: itos, stoi, vectors
    """
    wv_dict, wv_vectors, wv_size = read_embedding(embedding_config)

    # embedding size = glove vector size
    embed_size = wv_vectors.size(1)
    print("word embedding size: %d" % embed_size)

    # build itos and stoi
    # words_in_dataset = sorted(counter.keys(), key=lambda x: counter[x], reverse=True)
    words_in_dataset = counter.keys()
    itos = self.specials[:]
    stoi = defaultdict(self.get_unk)
    itos.extend(words_in_dataset)
    for idx, word in enumerate(itos):
        stoi[word] = idx

    # build vectors
    vectors = torch.zeros([len(itos), embed_size])
    for word, idx in stoi.items():
        idx_in_pretrained_array = wv_dict.get(word, None)
        if idx_in_pretrained_array is not None:
            vectors[idx, :wv_size].copy_(wv_vectors[idx_in_pretrained_array])
    return itos, stoi, vectors
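# Hypothetical usage of _build_vocab (the config tuple and owner object are
# assumptions; read_embedding is expected to return a word->row dict, a tensor of
# pretrained vectors, and their dimensionality):
#
#   counter = Counter(tok for example in dataset for tok in example.tokens)
#   itos, stoi, vectors = vocab_builder._build_vocab(counter, ("data/glove", "6B", 100))
#   embedding_layer = torch.nn.Embedding.from_pretrained(vectors, freeze=False)
#
# Words absent from the pretrained vectors keep the zero rows allocated above.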
def change_index(emd_path, target):
    """
    Change the embedding index to be Twitter IDs
    :param emd_path: the path to the embedding file
    :param target: a pandas DataFrame containing the labels indexed by Twitter IDs
    :return: None
    """
    x = utils.read_embedding(emd_path, target)
    df = pd.DataFrame(data=x, index=target.index)
    try:
        del df.index.name
    except AttributeError:
        pass
    df.to_csv(emd_path)
def reindex_embeddings():
    """
    Changes the first column of embeddings from an index to a Twitter ID
    :return: None
    """
    y_path = '../../local_resources/income_dataset/y_thresh10.p'
    target = utils.read_target(y_path)
    sizes = [16, 32, 64, 128]
    for size in sizes:
        print 'running embeddings of size ', size
        emd_path = '../../local_results/income_dataset/thresh10_{0}.emd'.format(
            size)
        x = utils.read_embedding(emd_path, target)
        df = pd.DataFrame(data=x, index=target.index)
        try:
            del df.index.name
        except AttributeError:
            pass
        df.to_csv(emd_path)
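# After reindex_embeddings() runs, each .emd file is a comma-separated table whose
# first column holds Twitter IDs. A hedged sketch of reading one back and aligning it
# with the label frame (read_back_emd is a hypothetical helper, not part of utils):

import pandas as pd

def read_back_emd(emd_path, target):
    """Return embedding rows ordered to match target.index (assumed CSV layout)."""
    df = pd.read_csv(emd_path, index_col=0)
    return df.loc[target.index].values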
def income_scenario():
    # names = np.array(
    #     [['ridge without emd', 'RF without emd'],
    #      ['ridge with emd', 'RF with emd'],
    #      ['ridge just emd', 'RF just emd']])
    # names = [['ridge', 'RF']]
    names = [['ridge']]
    y_path = '../local_resources/Socio_economic_classification_data/income_dataset/y_thresh10.p'
    emd_path = '../local_resources/Socio_economic_classification_data/income_dataset/thresh10_64.emd'
    target = utils.read_target(y_path)
    x = utils.read_embedding(emd_path, target)
    y = np.array(target['mean_income'])
    n_folds = 10
    # x, y = utils.read_data(x_path, y_path, threshold=1)
    results = run_all_datasets([x], y, names, regressors, n_folds)
    # all_results = utils.merge_results(results)
    all_results = pd.concat([x for x in results])
    all_results.rename(columns={n_folds: 'train'}, inplace=True)
    results, tests = t_tests(all_results)
    print results
    path = '../results/income/thresh10_' + utils.get_timestamp() + '.csv'
    results.to_csv(path, index=True)
from tqdm import tqdm

if __name__ == '__main__':
    config_file = sys.argv[1]
    configure = json.load(open(config_file))
    config = configure["main_configuration"]
    config_data = config["data_sets"]
    config_model = config["model"]
    config_model_param = config_model["parameters"]
    config_model_train = config_model["train"]
    config_model_test = config_model["test"]
    print("Data extraction\nConfiguration: ")
    print(json.dumps(config, indent=2), end='\n')

    print("Read embeddings ...")
    embed_tensor = convert_embed_2_numpy(read_embedding(config_data["embed"]),
                                         config_data["vocab_size"])

    print("Create a model...")
    query = Input(name="in_query", shape=(config_data['query_maxlen'],),
                  dtype='int32')  # ex: query vector of 10 words
    doc = Input(name="in_doc", shape=(config_data['doc_maxlen'],), dtype='int32')
    embedding = Embedding(config_data['vocab_size'], config_data['embed_size'],
                          weights=[embed_tensor],
                          trainable=config_model_train['train_embed'],
def main():
    config = """
    {
        "query_maxlen": 10,
        "doc_maxlen": 1000,
        "hist_size": 200,
        "embed_path": "../../data/support/embed_query_100w_50.txt",
        "input": "../../runtime_data/fulltext/real_train_HL_solrex",
        "output": "../../runtime_data/fulltext/real_train_HL_solrex_drmm"
    }
    """
    config = json.loads(config)
    embed_dict, vocab_size, embed_size, word_dict, idf_dict = read_embedding(
        config['embed_path'])
    embed = np.float32(np.random.uniform(-9, 9, [vocab_size, embed_size]))
    embed_dict = convert_embed_2_numpy('embed', embed_dict=embed_dict,
                                       embed=embed, normalize=True)
    if not os.path.exists(config['output']):
        os.mkdir(config['output'])
    for dirpath, dirnames, filenames in os.walk(config['input']):
        for fn in filenames:
            if not fn.endswith('.txt'):
                continue
            print os.path.join(dirpath, fn)
            with open(os.path.join(dirpath, fn)) as file:
                qid, query, uid, doc, label = file.readline().split('\t')
                output = open(os.path.join(config['output'], qid + '.txt'), 'w')
                query = query.strip().split()[:min(len(query), config['query_maxlen'])]
                query = convert_term2id(query, word_dict)
                query_embed = np.zeros([config['query_maxlen'], embed_size],
                                       dtype=np.float32)
                for i, wid in enumerate(query):
                    query_embed[i] = embed_dict[wid]
                file.seek(0)
                for line in file:
                    qid, query, uid, doc, label = line.split('\t')
                    uid = uid.strip()
                    qid = qid.strip()
                    query = query.strip()
                    label = label.strip()
                    doc = doc.strip().split()[:min(len(doc), config['doc_maxlen'])]
                    doc = convert_term2id(doc, word_dict)
                    doc_embed = np.zeros([config['doc_maxlen'], embed_size],
                                         dtype=np.float32)
                    for i, wid in enumerate(doc):
                        doc_embed[i] = embed_dict[wid]
                    hist = cal_hist(query_embed, doc_embed, config)
                    hist = ' '.join(map(str, np.reshape(hist, [-1])))
                    output.write(qid + '\t' + query + '\t' + uid + '\t' + hist +
                                 '\t' + label + '\n')
                output.close()
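# convert_term2id above is assumed to look tokens up in the word_dict returned by
# read_embedding; a minimal sketch of that behaviour (the OOV policy is an assumption):

def convert_term2id_sketch(terms, word_dict, oov_id=0):
    """Map each token to its integer id, falling back to oov_id for unknown tokens."""
    return [word_dict.get(term, oov_id) for term in terms]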
self.logger.info("Running epoch {} of {}".format( epoch + 1, self.flags.epoch)) accuracy, loss = self.train_epoch(epoch, embedding, saver) self.logger.info( 'accuracy accuracy on dev {} loss on dev {}'.format( accuracy, loss)) if accuracy <= best_accuracy: patient_passes += 1 if patient_passes == self.flags.patient_passes: self.logger.info( ' - {} epochs without improvement, training stopped'. format(patient_passes)) break else: self.logger.info('- New best accuracy {}'.format(accuracy)) best_accuracy = accuracy patient_passes = 0 saver.save(sess, os.path.join(self.model_dir, "model"), global_step=self.step) # def restore(self): # self.logger.info("loading model") # self.attach_session(sess) # saver.restore(sess, self.flags.model_output) if __name__ == "__main__": eb, i = read_embedding('./data/embedding.pkl') print(eb[4])
""" ''' config = """ { "query_maxlen": 10, "doc_maxlen": 1000, "hist_size": 200, "embed_path": "/home/zhenfan/lab/K-NRM/data/embed_query_50w_50.txt", "input": "/home/zhenfan/lab/K-NRM/data/real_train_HL_solrex", "output": "/home/zhenfan/lab/K-NRM/data/real_train_HL_solrex_drmm" } """ ''' config = json.loads(config) embed_dict, vocab_size, embed_size, word_dict, idf_dict = read_embedding( config['embed_path']) embed = np.float32(np.random.uniform(-9, 9, [vocab_size, embed_size])) embed_dict = convert_embed_2_numpy('embed', embed_dict=embed_dict, embed=embed, normalize=True) def cal_hist(query_embed, doc_embed, config): hist = np.zeros([config['query_maxlen'], config['hist_size']], dtype=np.int32) hist[:] = 1 mm = np.zeros([config['query_maxlen'], config['doc_maxlen']], dtype=np.float32) for i in range(config['query_maxlen']): for j in range(config['doc_maxlen']):
def __init__(self, flags):
    """Init class."""
    self.flags = flags
    self.embedding, self.embedding_size = read_embedding(
        self.flags.model_dir + self.flags.embedding_path)
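# Here read_embedding is assumed to unpickle a pair (embedding matrix, size), matching
# the two-value unpacking in the __main__ block above. A sketch under that assumption
# (not the project's actual loader):

import pickle

def read_embedding_sketch(path):
    """Load a pickled (embedding matrix, size) tuple."""
    with open(path, "rb") as f:
        embedding, size = pickle.load(f)
    return embedding, size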