def __init__(self):
    """Populate the instance with embeddings, vocabularies, models and scores.

    All resources come from the ``dp`` helpers and from the two private
    ``__load_*_model`` methods. The ``index2*`` tables are the inverses of
    the corresponding ``*2index`` dictionaries.
    """
    # Pre-trained embeddings plus the word -> index vocabulary.
    embeddings, word_vocab = dp.load_embeddings()
    self.__embeddings = embeddings
    self.word2index = word_vocab

    # Dataset vocabularies, built against the word vocabulary loaded above.
    dataset_parts = dp.load_dataset(self.word2index)
    (
        self.lemma2index,
        self.pos2index,
        self.synset2index,
        self.ambiguous_words,
    ) = dataset_parts

    # Models are loaded only after the vocabularies are in place.
    self.multitask_model = self.__load_multitask_model()
    self.disambiguation_model = self.__load_disambiguation_model()

    # Reverse lookups: index -> lemma and index -> synset.
    self.index2lemma = {
        index: lemma for lemma, index in self.lemma2index.items()
    }
    self.index2synset = {
        index: synset for synset, index in self.synset2index.items()
    }

    self.synset2score = dp.load_sentibabelnet()
def train_cluster_model():
    """Run the clustering pipeline, reporting progress on stdout.

    Steps, each bracketed by a "start"/"end" message:

    1. ``dp.load_dataset()`` -> ``(nlp_data_clean, df_bug_nonnlp)``
    2. ``train.merge_all_features(...)`` -> first element kept as
       ``train_data_all``
    3. ``model_clustering.clustering(train_data_all, nlp_data_clean)``
    4. ``model_clustering.get_nlp_features()``
    """
    print("Preprocessing data...")

    # Step 1: load the two data frames.
    print("start load_dataset")
    loaded = dp.load_dataset()
    nlp_data_clean, df_bug_nonnlp = loaded
    print("end load_dataset")

    # Step 2: merge every feature set; the second returned value is unused.
    print("start make traing_all_features")
    merged = train.merge_all_features(nlp_data_clean, df_bug_nonnlp)
    train_data_all, _ = merged
    print("end make traing_all_features")

    # (disabled) train_data_all.set_index(nlp_data_clean['bf_bugid'], inplace=True)

    # Step 3: run the clustering step on the merged features.
    print("start make cluster model")
    model_clustering.clustering(train_data_all, nlp_data_clean)
    print("end make cluster model")

    # Step 4: fetch the NLP features from the clustering module.
    print("start make df_nlp_features")
    df_nlp_feature = model_clustering.get_nlp_features()
    print("end make df_nlp_features")
LEARNING_RATE = 0.1 # learning rate to optimize """ Directories """ TRAIN_DIR = "dataset/DATA/TRAIN" VALID_DIR = "dataset/DATA/DEV" TMP_DIR = "/tmp/" ANALOGIES_FILE = "dataset/eval/questions-words.txt" """ Read data and write them to disk, only run for a first time. Comment the line below if you want to load them instead. """ dp.write_dataset(domain_limit=DOMAIN_LIMIT, skip_window=WINDOW_SIZE) """ Load data from disk. """ data, counter, W2I, I2W, unigram_table = dp.load_dataset() VOCABULARY_SIZE = len(W2I) # The most N word to consider in the dictionary valid_size = 16 # Random set of words to evaluate similarity on. valid_window = 100 # Only pick dev samples in the head of the distribution. valid_examples = np.random.choice(valid_window, valid_size, replace=False) questions = dp.read_analogies(ANALOGIES_FILE, W2I) print("Load data done") print("Data size: %d samples with %d unique words" % (data.shape[0], len(W2I))) """ Skip gram model Below is a long sequence of graph nodes for the skip gram model. The flow is basically from the (inputs, labels) to the positive entropy loss (posEnt) and negative entropy loss (negEnt). To calculate posEnt, one just needs to calculate posLog = inputs*pos_outputs and then forward it to sigmoid cross entropy with positive labels. Similarly, to negEnt, one first calculates negLog = inputs*neg_outputs ==> sigmoid cross entropy with negative labels. While pos_outputs are lookups of vector 'labels' on embedding tables, neg_outputs are lookups of vector 'sampled_indices' which have to be determined by a candidate sampling function.
# Locations of the serialized dictionary and dataset.
# DATASET_DIR and DATASET_ID are defined earlier in the file.
DICTIONARY_PATH = os.path.join(DATASET_DIR, DATASET_ID + '_words.ls')
DATASET_PATH = os.path.join(DATASET_DIR, DATASET_ID + '_dataset_1.ls')
TMP_DIR = r'C:\Users\Sergio\Documents\WIR\tmp'

# When resuming training: common prefix of the previously saved weight files.
# Set to None to start from freshly initialized weights.
WEIGHTS_BASE_PATH = r'C:\Users\Sergio\Documents\WIR\tmp\tweets_all_978d3bfaa50a4e028d4c740ba39578df\978d3bfaa50a4e028d4c740ba39578df'
# Identity test, not `!= None`: `None` is a singleton and `!=` can be
# overridden by the operand's type.
WEIGHTS_PRESENT = WEIGHTS_BASE_PATH is not None

# Derive the weight-file paths only when a base path exists. Concatenating
# onto None would raise TypeError before WEIGHTS_PRESENT could be consulted.
if WEIGHTS_PRESENT:
    WORD_EMBEDDINGS_PATH = WEIGHTS_BASE_PATH + '_word-embeddings.ls'
    CONTEXT_EMBEDDINGS_PATH = WEIGHTS_BASE_PATH + '_context-embeddings.ls'
    BIASES_PATH = WEIGHTS_BASE_PATH + '_biases.ls'
else:
    WORD_EMBEDDINGS_PATH = None
    CONTEXT_EMBEDDINGS_PATH = None
    BIASES_PATH = None

# dataset loading
dictionary, direct_map, inverse_map = load_dictionary(DICTIONARY_PATH)
DICTIONARY_SIZE = len(dictionary)  # size of the loaded dictionary
print('>> {} loaded words'.format(DICTIONARY_SIZE))
dataset = load_dataset(DATASET_PATH, SAMPLE_SIZE)
print('>> {} loaded sentences'.format(len(dataset)))

# model definition
graph = tf.Graph()
with graph.as_default():

    # input tensors
    with tf.name_scope('inputs'):
        train_inputs = tf.placeholder(tf.int32, shape=[BATCH_SIZE])
        # outputs for every input word
        train_labels = tf.placeholder(tf.int32, shape=[BATCH_SIZE, NUM_TRUE])

    # variable tensors
    with tf.name_scope('variables'):
        if not WEIGHTS_PRESENT:
            # No saved weights: initialize uniformly in [-1.0, 1.0).
            word_embeddings = tf.Variable(
                tf.random_uniform([DICTIONARY_SIZE, EMBEDDING_SIZE], -1.0, 1.0),
                name='word_embeddings')
### HYPERPARAMETERS ### BATCH_SIZE = 10 # How many sentences i take HIDDEN_SIZE = 128 LEARNING_RATE = 0.001 EPOCHS = 1 PRINT_RESULTS = True GENERATE_ANSWER = False GLOBAL_EPOCH = dp.global_epoch(TMP_DIR + 'epoch.txt') ### LOAD GLOVE EMBEDDINGS AND TRAIN SET ### embeddings, word2index = dp.load_embeddings('WSD/Data/glove.6B.100d.txt') sentences = dp.get_sentences(DATA_DIR + 'CoNLL2009-ST-English-train.txt') training_data, role2index, pos2index = dp.load_dataset(sentences, word2index) del sentences #print('ROLES DICTIONARY SIZE:', len(role2index)) #print('POS DICTIONARY SIZE', len(pos2index)) ### MODEL ### graph = tf.Graph() with graph.as_default(): with tf.name_scope('input'): # shape = (batch_size, max length of sentence in batch) word_ids = tf.placeholder(tf.int32, shape=[None, None]) # shape = (batch_size, max length of sentence in batch)
def train(self, epochs=1):
    """Train the siamese network and periodically evaluate it.

    The loop runs ``ITERATIONS * epochs`` steps. Every 500 steps (excluding
    step 0) and on the final step, the test set is evaluated: a prediction
    whose first element equals 0 counts as correct. Loss and accuracy are
    printed and appended to ``<TMP_DIR>/log.txt``. On the final step the
    run metadata is written, the model is checkpointed, the global
    iteration counter is updated and the plots are regenerated.

    Args:
        epochs: Multiplier applied to the configured number of iterations.
    """
    # Load the data lazily; reuse it when both sets are already populated.
    if not (self.__train_set and self.__test_set):
        self.__train_set, self.__test_set = dp.load_dataset(
            self.__TMP_DIR, self.__DATA_DIR)

    total_steps = self.__ITERATIONS * epochs
    # The final step is relative to the whole run. Comparing against
    # ITERATIONS - 1 would checkpoint after the first epoch only and
    # discard whatever is trained in the remaining epochs.
    last_step = total_steps - 1

    # Open a writer to write summaries.
    self.__writer = tf.summary.FileWriter(self.__TMP_DIR,
                                          self.__session.graph)
    try:
        loss_sum = 0
        for step in tqdm.tqdm(range(total_steps),
                              desc='Training Siamese Network'):
            batch, label = dp.get_batch(self.__train_set, self.__BATCH_SIZE)
            pair_1 = np.array([b[0] for b in batch])
            pair_2 = np.array([b[1] for b in batch])

            # Define metadata variable.
            run_metadata = tf.RunMetadata()
            _, l = self.__session.run(
                [self.__train_op, self.__loss],
                feed_dict={
                    self.__img_1: pair_1,
                    self.__img_2: pair_2,
                    self.__flags: label
                },
                run_metadata=run_metadata)
            loss_sum += l

            # Print loss and accuracy on the test set every 500 steps and
            # on the final step.
            if (step % 500 == 0 and step > 0) or step == last_step:
                correct = 0
                k = len(self.__test_set)
                for _ in range(k):
                    test, label = dp.get_one_shot_test(self.__test_set)
                    pair_1 = np.array([b[0] for b in test])
                    pair_2 = np.array([b[1] for b in test])
                    run_metadata = tf.RunMetadata()
                    pred = self.__session.run(
                        self.__prediction,
                        feed_dict={
                            self.__img_1: pair_1,
                            self.__img_2: pair_2
                        },
                        run_metadata=run_metadata)
                    if pred[0] == 0:
                        correct += 1

                # step + 1 losses have been accumulated so far. Dividing by
                # `step` is off by one and fails when the run has one step.
                mean_loss = loss_sum / (step + 1)
                # An empty test set reports 0.0 instead of dividing by zero.
                accuracy = correct / k if k else 0.0

                print('Loss:', str(mean_loss), '\tAccuracy:', accuracy)
                with open(self.__TMP_DIR + '/log.txt', 'a',
                          encoding='utf8') as f:
                    f.write(str(accuracy) + ' ' + str(mean_loss) + '\n')

            if step == last_step:
                self.__writer.add_run_metadata(
                    run_metadata,
                    'step%d' % step,
                    global_step=self.__GLOBAL_ITER + step + 1)
                self.__saver.save(
                    self.__session,
                    os.path.join(self.__TMP_DIR, 'model.ckpt'))
                dp.global_iteration(self.__TMP_DIR + '/iteration.txt',
                                    update=self.__GLOBAL_ITER + step + 1)
                pg.generate_accuracy_plot(self.__TMP_DIR + '/')
                pg.generate_loss_plot(self.__TMP_DIR + '/')
    finally:
        # Close the summary writer even if training raises.
        self.__writer.close()