def load_dataset():
    # The bug data is stored as Parquet and read via pyarrow.parquet (pq).
    print("Reading Parquet file...")
    bug_fields = config.bug_fields
    nlp_fields = config.nlp_fields
    nonnlp_fields = config.nonnlp_fields
    table = pq.read_table(config.bug_data, columns=bug_fields)
    df = table.to_pandas()
    df.drop_duplicates(inplace=True)
    df_bug_nlp, df_bug_nonnlp = clean_dataset(df)
    utils.create_vocabulary(df_bug_nlp['text'].tolist())
    print("nlp feature clean_shape", df_bug_nlp.shape)
    df_bug_nlp.to_csv(config.nlp_data_clean, index=True)
    df_bug_nonnlp.to_csv(config.nonnlp_data_clean, index=True)
    return df_bug_nlp, df_bug_nonnlp

def __init__(self, sent_size):
    super(Attention_Net, self).__init__()
    # Embeds are of dimension n * (1 x (embedding_size * no_of_kmers)).
    self.embeds = nn.Embedding(
        len(utils.create_vocabulary(Config.window_size)),
        Config.embedding_size)  # Embeds[125, 5]
    self.embeds_size = Config.embedding_size * sent_size
    # Experimenting: set this if the embeddings should be fed as vectors.
    # self.embeds_size = sent_size
    self.attn_weights = nn.Parameter(
        torch.randn(self.embeds_size, self.embeds_size))
    self.attention = MultiHeadAttention(
        Config.n_head, Config.d_model, Config.d_k, Config.d_v,
        dropout=Config.attn_dropout)
    self.tanh = torch.tanh
    self.fc1 = nn.Linear(self.embeds_size, Config.hidden_layer_size)
    # Experimenting alternative:
    # self.fc1 = nn.Linear(Config.embedding_size, Config.hidden_layer_size)
    self.relu = F.relu
    self.sigmoid = nn.Sigmoid()
    self.fc2 = nn.Linear(Config.hidden_layer_size, 1)
    self.threshold = F.threshold
    self.dropout = nn.Dropout(Config.dropout)

def pre_process_data(raw_data, tokenizer, config, logger):
    '''raw_data: a directory or a specific file.'''
    vocab_file = os.path.join(config.tokenized_data_dir, 'vocab.txt')
    sample_file = os.path.join(config.tokenized_data_dir, 'samples.txt')
    if os.path.isfile(vocab_file) and os.path.isfile(sample_file):
        logger.info("Vocab file and sample file already exist.")
        return Data(vocab_file, sample_file, config, logger)
    else:
        logger.info("Generating vocabulary and tokenized samples.")
        if os.path.isfile(raw_data):
            raw_data = [raw_data]
        else:
            raw_data = glob.glob(os.path.join(raw_data, '*'))
        samples = set()
        for file in raw_data:
            for qa in parse_raw_file(file):
                q, a = qa[0], qa[1]
                tokenized_q = tokenize_one_line(
                    sentence=q,
                    cut_fun=tokenizer.tokenize,
                    specical_symbol=config.special_symbol,
                    mode=config.source_language_type,
                    lower=config.source_language_lower)
                tokenized_a = tokenize_one_line(
                    sentence=a,
                    cut_fun=tokenizer.tokenize,
                    specical_symbol=config.special_symbol,
                    mode=config.target_language_type,
                    lower=config.target_language_lower)
                samples.add(tokenized_q + "\t" + tokenized_a)
        logger.info('sample size: {}'.format(len(samples)))
        logger.info("save samples in '{}'".format(sample_file))
        write_lines(sample_file, samples)
        source_vocab, target_vocab, special_vocab = create_vocabulary(
            samples, config.special_symbol)
        # Convert all three to sets so that discard() and the union below work.
        source_vocab = set(source_vocab.keys())
        target_vocab = set(target_vocab)
        special_vocab = set(special_vocab)
        for s_symbol in config.vocab_remains:
            source_vocab.discard(s_symbol)
            target_vocab.discard(s_symbol)
            special_vocab.discard(s_symbol)
        logger.info('vocab size: {}'.format(
            len(source_vocab) + len(target_vocab) + len(special_vocab)
            + len(config.vocab_remains)))
        logger.info('save vocabulary in "{}"'.format(vocab_file))
        with open(vocab_file, 'w', encoding='utf8') as f:
            for line in config.vocab_remains:
                f.write(line + '\n')
            for line in special_vocab:
                f.write(line + '\n')
            for line in source_vocab | target_vocab:
                f.write(line + '\n')
        return Data(vocab_file, sample_file, config, logger)

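# A hedged sketch of the create_vocabulary(samples, special_symbol) contract
# assumed by pre_process_data above: each sample is "tokenized_q<TAB>tokenized_a"
# and the helper returns per-side token collections plus any special symbols it
# encountered. The exact return types and the special-symbol handling are
# assumptions for illustration, not the project's actual implementation.
from collections import Counter

def create_vocabulary_sketch(samples, special_symbols):
    source_vocab, target_vocab, special_vocab = Counter(), Counter(), set()
    for sample in samples:
        q, a = sample.split("\t", 1)
        for token in q.split():
            if token in special_symbols:
                special_vocab.add(token)
            else:
                source_vocab[token] += 1
        for token in a.split():
            if token in special_symbols:
                special_vocab.add(token)
            else:
                target_vocab[token] += 1
    return source_vocab, target_vocab, special_vocab
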
def __init__(self, sent_size):
    super(Lstm_Net, self).__init__()
    self.sent_size = sent_size
    self.embeds = nn.Embedding(
        len(utils.create_vocabulary(Config.window_size)),
        Config.embedding_size)
    self.embeds_size = Config.embedding_size * sent_size
    self.lstm = nn.LSTM(self.embeds_size, Config.hidden_layer_size,
                        Config.num_layers)
    self.tanh = torch.tanh
    # self.fc1 = nn.Linear(self.embeds_size, Config.hidden_layer_size)
    self.fc1 = nn.Linear(Config.hidden_layer_size, Config.hidden_layer_size)
    self.dropout = nn.Dropout(Config.dropout_rate)
    self.relu = F.relu
    self.sigmoid = nn.Sigmoid()
    self.fc2 = nn.Linear(Config.hidden_layer_size, 1)

def train_epoch(model, inputs, labels, optimizer, criterion):
    model.train()
    losses = []
    vocabulary = utils.create_vocabulary(Config.window_size)
    j = 0
    correct, wrong = 0, 0
    for data in inputs.itertuples():
        gene = data.Gene
        # Encode the gene sequence as indices of its overlapping k-mers.
        input_ = torch.tensor(
            [vocabulary[gene[i:i + Config.window_size]]
             for i in range(0, len(gene) - Config.window_size + 1)],
            dtype=torch.long)
        label = autograd.Variable(labels[j])
        j += 1
        optimizer.zero_grad()
        # (1) Forward pass.
        label_hat = model(input_)
        # (2) Compute the loss.
        loss = criterion(label_hat, label)
        # (3) Compute gradients.
        losses.append(loss.data.numpy())
        loss.backward(retain_graph=False)
        # (4) Update weights.
        optimizer.step()
        correct, wrong = utils.get_train_accuracy(label_hat, j - 1, len(labels),
                                                  correct, wrong)
    loss = sum(losses) / len(losses)
    return loss, (correct, wrong)

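# A minimal sketch of the k-mer vocabulary assumed by Attention_Net, Lstm_Net
# and train_epoch above: utils.create_vocabulary(Config.window_size) is only
# used as a dict from a fixed-length gene substring ("k-mer") to an integer
# index, and its length sizes nn.Embedding. The "Embeds[125, 5]" comment hints
# at 5**3 = 125 entries, i.e. a five-symbol alphabet (perhaps A, C, G, T, N)
# with window_size 3 -- that alphabet is an assumption here, not confirmed.
from itertools import product

def create_vocabulary_sketch(window_size, alphabet="ACGTN"):
    """Map every possible k-mer of length `window_size` to an integer id."""
    kmers = ("".join(p) for p in product(alphabet, repeat=window_size))
    return {kmer: idx for idx, kmer in enumerate(kmers)}

# Example: len(create_vocabulary_sketch(3)) == 125, matching nn.Embedding(125, 5).
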
def main(_):
    assert FLAGS.source_train_path, "--source_train_path is required."
    assert FLAGS.target_train_path, "--target_train_path is required."

    # Create vocabularies.
    source_vocab_path = os.path.join(os.path.dirname(FLAGS.source_train_path),
                                     "vocabulary.source")
    target_vocab_path = os.path.join(os.path.dirname(FLAGS.source_train_path),
                                     "vocabulary.target")
    utils.create_vocabulary(source_vocab_path, FLAGS.source_train_path,
                            FLAGS.source_vocab_size)
    utils.create_vocabulary(target_vocab_path, FLAGS.target_train_path,
                            FLAGS.target_vocab_size)

    # Read vocabularies.
    source_vocab, rev_source_vocab = utils.initialize_vocabulary(source_vocab_path)
    target_vocab, rev_target_vocab = utils.initialize_vocabulary(target_vocab_path)

    # Read parallel sentences.
    parallel_data = utils.read_data(FLAGS.source_train_path, FLAGS.target_train_path,
                                    source_vocab, target_vocab)

    # Read validation data set.
    if FLAGS.source_valid_path and FLAGS.target_valid_path:
        valid_data = utils.read_data(FLAGS.source_valid_path, FLAGS.target_valid_path,
                                     source_vocab, target_vocab)

    # Initialize BiRNN.
    config = Config(len(source_vocab), len(target_vocab),
                    FLAGS.embedding_size, FLAGS.state_size,
                    FLAGS.hidden_size, FLAGS.num_layers,
                    FLAGS.learning_rate, FLAGS.max_gradient_norm,
                    FLAGS.use_lstm, FLAGS.use_mean_pooling,
                    FLAGS.use_max_pooling, FLAGS.source_embeddings_path,
                    FLAGS.target_embeddings_path, FLAGS.fix_pretrained)
    model = BiRNN(config)

    # Build graph.
    model.build_graph()

    # Train model.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())

        train_iterator = utils.TrainingIteratorRandom(parallel_data,
                                                      FLAGS.num_negative)
        train_summary_writer = tf.summary.FileWriter(
            os.path.join(FLAGS.checkpoint_dir, "train"), sess.graph)

        if FLAGS.source_valid_path and FLAGS.target_valid_path:
            valid_iterator = utils.EvalIterator(valid_data)
            valid_summary_writer = tf.summary.FileWriter(
                os.path.join(FLAGS.checkpoint_dir, "valid"), sess.graph)

        epoch_loss = 0
        epoch_completed = 0
        batch_completed = 0

        num_iter = int(np.ceil(train_iterator.size / FLAGS.batch_size * FLAGS.num_epochs))
        start_time = time.time()
        print("Training model on {} sentence pairs per epoch.".format(train_iterator.size))

        for step in xrange(num_iter):
            source, target, label = train_iterator.next_batch(FLAGS.batch_size)
            source_len = utils.sequence_length(source)
            target_len = utils.sequence_length(target)
            feed_dict = {model.x_source: source,
                         model.x_target: target,
                         model.labels: label,
                         model.source_seq_length: source_len,
                         model.target_seq_length: target_len,
                         model.input_dropout: FLAGS.keep_prob_input,
                         model.output_dropout: FLAGS.keep_prob_output,
                         model.decision_threshold: FLAGS.decision_threshold}

            _, loss_value, epoch_accuracy, \
                epoch_precision, epoch_recall = sess.run([model.train_op,
                                                          model.mean_loss,
                                                          model.accuracy[1],
                                                          model.precision[1],
                                                          model.recall[1]],
                                                         feed_dict=feed_dict)
            epoch_loss += loss_value
            batch_completed += 1

            # Write the model's training summaries.
            if step % FLAGS.steps_per_checkpoint == 0:
                summary = sess.run(model.summaries, feed_dict=feed_dict)
                train_summary_writer.add_summary(summary, global_step=step)

            # End of current epoch.
            if train_iterator.epoch_completed > epoch_completed:
                epoch_time = time.time() - start_time
                epoch_loss /= batch_completed
                epoch_f1 = utils.f1_score(epoch_precision, epoch_recall)
                epoch_completed += 1
                print("Epoch {} in {:.0f} sec\n"
                      "  Training: Loss = {:.6f}, Accuracy = {:.4f}, "
                      "Precision = {:.4f}, Recall = {:.4f}, F1 = {:.4f}"
                      .format(epoch_completed, epoch_time, epoch_loss,
                              epoch_accuracy, epoch_precision, epoch_recall,
                              epoch_f1))

                # Save a model checkpoint.
                checkpoint_path = os.path.join(FLAGS.checkpoint_dir, "model.ckpt")
                model.saver.save(sess, checkpoint_path, global_step=step)

                # Evaluate model on the validation set.
                if FLAGS.source_valid_path and FLAGS.target_valid_path:
                    eval_epoch(sess, model, valid_iterator, valid_summary_writer)

                # Initialize local variables for the new epoch.
                batch_completed = 0
                epoch_loss = 0
                sess.run(tf.local_variables_initializer())
                start_time = time.time()

        print("Training done with {} steps.".format(num_iter))
        train_summary_writer.close()
        if FLAGS.source_valid_path and FLAGS.target_valid_path:
            valid_summary_writer.close()

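# A hedged sketch of the file-based helper pair used in the BiRNN script above.
# The real utils.create_vocabulary / utils.initialize_vocabulary may differ
# (special tokens, tokenization, digit normalization); this only illustrates the
# assumed contract: write the most frequent tokens to disk, one per line, then
# read them back as a (token -> id dict, id -> token list) pair.
from collections import Counter

def create_vocabulary_sketch(vocab_path, data_path, max_vocab_size):
    counts = Counter()
    with open(data_path, encoding="utf-8") as f:
        for line in f:
            counts.update(line.split())
    with open(vocab_path, "w", encoding="utf-8") as f:
        for token, _ in counts.most_common(max_vocab_size):
            f.write(token + "\n")

def initialize_vocabulary_sketch(vocab_path):
    with open(vocab_path, encoding="utf-8") as f:
        rev_vocab = [line.strip() for line in f]
    return {token: idx for idx, token in enumerate(rev_vocab)}, rev_vocab
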
if sys.version_info[0] < 3:
    raise Exception("Must be using Python 3")

print('Preparing data')
train_x, train_y, trial_x, trial_y, test_x, test_y = utils.load_dataset(
    'data/train.csv', 'data/trial.csv', 'data/trial.labels',
    'data/test-text-labels.csv')

print('Preprocessing data')
train_x, trial_x, test_x, max_string_length = preprocessing.preprocessing_pipeline(
    train_x, trial_x, test_x)

print('Words to index')
vocab_length, words_to_index, index_to_words = utils.create_vocabulary(
    train_x, trial_x, test_x)

train_y_oh = utils.labels_to_indices(train_y, config.labels_to_index, config.classes)
trial_y_oh = utils.labels_to_indices(trial_y, config.labels_to_index, config.classes)

train_x_indices = utils.sentences_to_indices(train_x, words_to_index,
                                             max_len=max_string_length)
trial_x_indices = utils.sentences_to_indices(trial_x, words_to_index,
                                             max_len=max_string_length)
test_x_indices = utils.sentences_to_indices(test_x, words_to_index,
                                            max_len=max_string_length)

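# A minimal sketch of the (vocab_length, words_to_index, index_to_words)
# contract assumed by the script above. It assumes the inputs are iterables of
# whitespace-separable sentence strings and that index 0 is reserved for
# padding; the real utils.create_vocabulary may differ on both points.
def create_vocabulary_sketch(*sentence_sets):
    words = sorted({w for sentences in sentence_sets
                    for sentence in sentences
                    for w in sentence.split()})
    words_to_index = {w: i + 1 for i, w in enumerate(words)}  # 0 kept for padding (assumed)
    index_to_words = {i: w for w, i in words_to_index.items()}
    return len(words_to_index) + 1, words_to_index, index_to_words
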
                          split=False)
if 'dev.txt' not in os.listdir(train_flag['data_dir']):
    dev_data = read_dataset(join(train_flag['data_dir'], 'valid.txt'),
                            parameters['maximum_L'], split=False)
else:
    dev_data = read_dataset(join(train_flag['data_dir'], 'dev.txt'),
                            parameters['maximum_L'], split=False)
test_data = read_dataset(join(train_flag['data_dir'], 'test.txt'),
                         parameters['maximum_L'], split=False)

tag_vocabulary, i2t = create_vocabulary(train_data)
parameters['labels_num'] = len(tag_vocabulary.keys())  # number of labels
parameters['tag_emb_dim'] = len(tag_vocabulary.keys())


def train(generator, param, flags):
    with tf.Session() as sess:
        # Create the model.
        model = NEF(param, tag_vocabulary, i2t)
        # Print the model config path.
        print(model.path)
        sess.run(tf.global_variables_initializer())
        # Start learning.

def main(train_path, val_path, labels_path, embedding_vectors_path,
         embedding_word2idx_path, categories_def_path, uncertainty_output_path,
         batch_size, model_snapshot_prefix, pretrained_model_path,
         model_snapshot_interval):
    embedding_vectors = bcolz.open(embedding_vectors_path)[:]
    embedding_dim = len(embedding_vectors[0])
    embedding_word2idx = pickle.load(open(embedding_word2idx_path, 'rb'))

    # Maps words to embedding vectors. These are all embeddings available to us.
    embeddings = {
        w: embedding_vectors[embedding_word2idx[w]]
        for w in embedding_word2idx
    }

    # Build vocabulary using the training set. Maps words to indices.
    vocab = create_vocabulary(train_path)
    vocab_size = len(vocab)
    print(f'Vocabulary size: {vocab_size}\nBatch size: {batch_size}')

    # TODO: take advantage of the multiple annotations.
    labels = load_existing_annotations(labels_path, load_first_annotation_only=True)

    if model_snapshot_interval:
        print(f'Taking model snapshot every {model_snapshot_interval} epochs')
    else:
        print('Taking model snapshot ONLY at the end of training')

    humor_types = load_sentences_or_categories(categories_def_path)
    # Map label IDs to indices so cross entropy is not computed on raw label IDs.
    label_id_to_idx = {
        label_id: idx for idx, label_id in enumerate(humor_types)
    }

    word_weight_matrix = create_weight_matrix(vocab, embeddings, device)

    # Stores indexes of sentences provided in the original dataset.
    train_labeled_idx, train_labeled_data_unpadded, train_labels, \
        train_unlabeled_idx, train_unlabeled_data_unpadded, \
        longest_sentence_length = load_unpadded_train_val_data(
            train_path, vocab, labels, label_id_to_idx)
    val_labeled_idx, val_labeled_data_unpadded, val_labels, \
        val_unlabeled_idx, val_unlabeled_data_unpadded, \
        _ = load_unpadded_train_val_data(val_path, vocab, labels, label_id_to_idx)

    # Create padded train and val datasets.
    # TODO: do not use the longest length to pad input; find mean and std.
    train_labeled_data = create_padded_data(train_labeled_data_unpadded,
                                            longest_sentence_length)
    val_labeled_data = create_padded_data(val_labeled_data_unpadded,
                                          longest_sentence_length)
    print(f'Num of labeled training data: {train_labeled_data.shape[0]}, '
          f'labeled val: {val_labeled_data.shape[0]}')

    num_iterations = train_labeled_data.shape[0] // batch_size
    textCNN = DataParallel(
        TextCNN(word_weight_matrix, NUM_FILTERS, WINDOW_SIZES,
                len(humor_types))).to(device)
    if pretrained_model_path:
        textCNN.module.initialize_from_pretrained(pretrained_model_path)
    optimizer = torch.optim.Adam(textCNN.parameters(), lr=LR, eps=OPTIM_EPS)

    for i in range(NUM_EPOCHS):
        print(f'Epoch {i}')
        train_one_epoch(
            textCNN,
            create_batch_iterable(train_labeled_data, train_labels, batch_size,
                                  device),
            optimizer, val_labeled_data, val_labels, num_iterations)
        if model_snapshot_prefix:
            if (not model_snapshot_interval and i + 1 == NUM_EPOCHS) or \
                    (model_snapshot_interval and (i + 1) % model_snapshot_interval == 0):
                print('\nSaving model snapshot...')
                torch.save(textCNN.state_dict(),
                           f'{model_snapshot_prefix}_epoch{i}.mdl')
                print('Saved\n')

    if uncertainty_output_path:
        train_unlabeled_data = create_padded_data(
            train_unlabeled_data_unpadded, longest_sentence_length)
        rank_unlabeled_train(
            textCNN,
            torch.tensor(train_unlabeled_data, dtype=torch.long, device=device),
            train_unlabeled_idx, uncertainty_output_path)

# Full path: ./data/ + dataset + train/test/valid
if arg.dataset is None:
    print("name of dataset cannot be None")
elif arg.dataset == "snip":
    print("use snip dataset")
elif arg.dataset == "atis":
    print("use atis dataset")
else:
    print("use own dataset: ", arg.dataset)

full_train_path = os.path.join("./data", arg.dataset, arg.train_data_path)
full_test_path = os.path.join("./data", arg.dataset, arg.test_data_path)
full_valid_path = os.path.join("./data", arg.dataset, arg.valid_data_path)

create_vocabulary(os.path.join(full_train_path, arg.input_file),
                  os.path.join(arg.vocab_path, "in_vocab"))
create_vocabulary(os.path.join(full_train_path, arg.slot_file),
                  os.path.join(arg.vocab_path, "slot_vocab"))
create_vocabulary(os.path.join(full_train_path, arg.intent_file),
                  os.path.join(arg.vocab_path, "intent_vocab"))

# Each vocabulary is {word to id, words list}.
in_vocab = load_vocabulary(os.path.join(arg.vocab_path, "in_vocab"))
slot_vocab = load_vocabulary(os.path.join(arg.vocab_path, "slot_vocab"))
intent_vocab = load_vocabulary(os.path.join(arg.vocab_path, "intent_vocab"))


def create_model(input_data, input_size, sequence_length, slot_size,
# RNN PARAMETERS
tf.app.flags.DEFINE_integer("BATCH_SIZE", 64, "batch size")
tf.app.flags.DEFINE_integer("NUM_EPOCHS", 1, "number of epochs for training")
tf.app.flags.DEFINE_float("LEARNING_RATE", 0.001, "learning rate for rnn")
tf.app.flags.DEFINE_float("MAX_GRAD_NORM", 5.0, "max. norm for gradient clipping")
tf.app.flags.DEFINE_string('f', '', 'tensorflow bug')

FLAGS = tf.app.flags.FLAGS
if FLAGS.EXPERIMENT == "C":
    FLAGS.STATE_DIM = 1024

tf_utils.print_flags(FLAGS, logger)

# ---------------------------------------------------------------------------
# PREPROCESSING
logger.append("PREPROCESSING STARTING.")

vocabulary, word_to_idx, idx_to_word = utils.create_vocabulary(
    FLAGS.DATA_DIR + FLAGS.SENTENCES_TRAIN_FILE, FLAGS.VOCABULARY_SIZE)

X_train = utils.create_dataset(FLAGS.DATA_DIR + FLAGS.SENTENCES_TRAIN_FILE, word_to_idx)
logger.append("X_train CREATED.")
X_test = utils.create_dataset(FLAGS.DATA_DIR + FLAGS.SENTENCES_TEST_FILE, word_to_idx)
logger.append("X_test CREATED.")
X_eval = utils.create_dataset(FLAGS.DATA_DIR + FLAGS.SENTENCES_EVAL_FILE, word_to_idx)
logger.append("X_eval CREATED.")
X_cont = utils.load_continuation(FLAGS.DATA_DIR + FLAGS.SENTENCES_CONTINUATION_FILE, word_to_idx)
logger.append("X_cont CREATED.")

with open(FLAGS.RESULTS_DIR + "vocabulary.pkl", "wb") as f:
    pickle.dump((vocabulary, word_to_idx, idx_to_word), f)

with open(FLAGS.RESULTS_DIR + "X_train.ids", "w") as f:
    for i in range(X_train.shape[0]):
        f.write(" ".join([str(x) for x in X_train[i, :]]) + "\n")