def wordvec_to_dict(filename, word_filter):
    vec_dict = {}
    # progress bar overhead
    bar = logger.get_progress_bar("Reading GloVe vectors", level=2, limit=20)
    total_size = os.path.getsize(filename)
    found_counter = 0
    size_counter = 0
    last_milestone = 0
    with open(filename, "r", encoding="utf8") as file:
        for raw_line in file:
            # progress bar overhead
            size_counter += len(raw_line) + 1
            if int((size_counter / total_size) * 20) > last_milestone:
                bar.next()
                last_milestone += 1
            # process line
            line = raw_line.split()
            if line[0] not in word_filter:
                continue
            found_counter += 1
            num_line = [float(x) for x in line[1:]]
            vec_dict[line[0]] = num_line
    bar.finish()
    # Observation: most of the unmatched words are typos, compounds or really uncommon.
    logger.info("Found vectors for " + str(found_counter) + " words out of " +
                str(len(word_filter)) + ". Elapsed time: " + str(bar.elapsed) + " s",
                level=2)
    return vec_dict

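# sentence_to_words() and get_used_words() are referenced by the functions in
# this module but are not shown here. The sketch below is a hypothetical,
# minimal version (lowercased alphanumeric tokens, matching the uncased GloVe
# vocabulary that wordvec_to_dict() filters against); the project's actual
# tokenization may differ.
import re

def sentence_to_words(sentence):
    # Lowercase the sentence and keep alphanumeric tokens only.
    return re.findall(r"[a-z0-9]+", sentence.lower())

def get_used_words(dataset, vocabulary):
    # Collect every token that appears in either sentence of each pair.
    for sentence_pair in dataset:
        vocabulary.update(sentence_to_words(sentence_pair["sentence1"]))
        vocabulary.update(sentence_to_words(sentence_pair["sentence2"]))
    return vocabulary
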
def download(url, archive_name, label=""):
    if os.path.exists(SAVE_DIR + "/" + archive_name + ".zip") or is_unpacked(archive_name):
        logger.info(label.capitalize() + " already downloaded.")
        return
    if not os.path.exists(SAVE_DIR):
        os.makedirs(SAVE_DIR)
    logger.info("Downloading the " + label + " archive from " + url)
    wget.download(url, SAVE_DIR + "/" + archive_name + ".zip")
    logger.success("Download completed.")

def unpack(archive_name, label=""):
    if is_unpacked(archive_name):
        logger.info(label.capitalize() + " already unpacked.")
        return
    if not os.path.exists(SAVE_DIR + "/" + archive_name + ".zip"):
        logger.error("No " + label + " zipfile to unpack")
        return
    logger.info("Unpacking " + label)
    os.makedirs(SAVE_DIR + "/" + archive_name)
    with zipfile.ZipFile(SAVE_DIR + "/" + archive_name + ".zip", "r") as file:
        file.extractall(SAVE_DIR + "/" + archive_name)
    logger.success("Unpacking complete.")
    os.remove(SAVE_DIR + "/" + archive_name + ".zip")

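# is_unpacked() and check_all_unpacked() are assumed helpers; the sketch below
# is only consistent with how download() and unpack() use them. ARCHIVE_NAMES
# is a hypothetical constant standing in for the project's actual archive
# list, and a real version might also validate the extracted contents.
def is_unpacked(archive_name):
    # An archive counts as unpacked once its extraction directory exists.
    return os.path.isdir(SAVE_DIR + "/" + archive_name)

def check_all_unpacked():
    # Every expected archive (datasets and word vectors) must be extracted.
    return all(is_unpacked(name) for name in ARCHIVE_NAMES)
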
def input_data_to_matrices(dataset, word_id_mapping, label_dict):
    premise_matrix = []
    hypothesis_matrix = []
    label_counter = len(label_dict)
    labels = []
    # sentence1 denotes the premise, sentence2 the hypothesis
    for sentence_pair in dataset:
        label = sentence_pair["gold_label"]
        if label == '-':
            continue
        if label not in label_dict:
            label_dict[label] = label_counter
            label_counter += 1
        premise_row = []
        hypothesis_row = []
        scrap = False
        for item in sentence_to_words(sentence_pair["sentence1"]):
            if item in word_id_mapping:
                premise_row.append(word_id_mapping[item])
            else:
                scrap = True
                break
        for item in sentence_to_words(sentence_pair["sentence2"]):
            if item in word_id_mapping:
                hypothesis_row.append(word_id_mapping[item])
            else:
                scrap = True
                break
        # skip the sentence pair if it contains an unknown word
        if scrap:
            continue
        premise_matrix.append(premise_row)
        hypothesis_matrix.append(hypothesis_row)
        labels.append(label_dict[label])
    logger.info("Number of distinct labels: " + str(label_counter), level=2)
    logger.info("Length of cleaned dataset: " + str(len(labels)), level=2)
    return premise_matrix, hypothesis_matrix, labels

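# json_to_array() is used by run() below but not shown here. SNLI is
# distributed as JSON Lines (one JSON object per line), so a minimal sketch;
# the actual implementation may differ:
def json_to_array(filename):
    # Parse each line of a .jsonl file into a dict.
    with open(filename, "r", encoding="utf8") as file:
        return [json.loads(line) for line in file]
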
def run(force_recompute=True):
    logger.header("Running preprocessor module.")
    if not check_all_unpacked():
        logger.error("Unpacked datasets or word vectors are missing. "
                     "Please run downloader prior to preprocessor.")
        return

    time_start = time.time()
    logger.info("Loading datasets into memory")
    try:
        train_dataset = json_to_array(unpacked_dataset_path() + "/snli_1.0_train.jsonl")
        test_dataset = json_to_array(unpacked_dataset_path() + "/snli_1.0_test.jsonl")
    except FileNotFoundError as error:
        logger.error("File: " + error.filename + " not found")
        return
    time_end = time.time()
    logger.success("Datasets loaded. Elapsed time: " +
                   "{0:.2f}".format(time_end - time_start) + " s")

    embeddings_changed = False
    time_start = time.time()
    if os.path.exists(PRECOMPUTED_GLOVE_PATH) and not force_recompute:
        logger.info("Precomputed word vectors found, loading into memory.")
        with open(PRECOMPUTED_GLOVE_PATH, 'r') as infile:
            word_vectors = json.load(infile)
    else:
        logger.info("Loading word vectors into memory")
        # Get the set of words used in the datasets, so we don't store useless word vectors.
        vocabulary = set()
        vocabulary = get_used_words(train_dataset, vocabulary)
        vocabulary = get_used_words(test_dataset, vocabulary)
        # Load the needed part of the word vectors. Might induce large memory costs.
        try:
            word_vectors = wordvec_to_dict(
                unpacked_glove_path() + "/glove.42B.300d.txt", vocabulary)
        except FileNotFoundError as error:
            logger.error("File: " + error.filename + " not found")
            return
        logger.info("Storing loaded vectors for future use.", level=2)
        with open(PRECOMPUTED_GLOVE_PATH, 'w') as outfile:
            json.dump(word_vectors, outfile)
        embeddings_changed = True
    time_end = time.time()
    logger.success("Word vectors loaded. Elapsed time: " +
                   "{0:.2f}".format(time_end - time_start) + " s")

    id_mapping = generate_dictionary_ids(word_vectors)
    if not os.path.exists(PRECOMPUTED_EMB_MATRIX_PATH) or force_recompute or embeddings_changed:
        logger.info("Generating initial embedding matrix.")
        embedding_matrix = generate_embedding_matrix(word_vectors, id_mapping)
        logger.info("Storing embedding matrix for future use.", level=2)
        with open(PRECOMPUTED_EMB_MATRIX_PATH, 'w') as outfile:
            json.dump(embedding_matrix.tolist(), outfile)
        logger.success("Embedding matrix created.")
    else:
        logger.info("Embedding matrix found, skipping its computation.")

    label_dict = {}
    if not os.path.exists(PRECOMPUTED_TRAIN_PREMISES_PATH) \
            or not os.path.exists(PRECOMPUTED_TRAIN_HYPOTHESES_PATH) \
            or not os.path.exists(PRECOMPUTED_TRAIN_LABELS_PATH) \
            or force_recompute or embeddings_changed:
        logger.info("Creating train matrix and labels")
        train_premise_matrix, train_hypothesis_matrix, train_labels = input_data_to_matrices(
            train_dataset, id_mapping, label_dict)
        logger.info("Storing matrix for future use.", level=2)
        with open(PRECOMPUTED_TRAIN_PREMISES_PATH, 'w') as outfile:
            json.dump(train_premise_matrix, outfile)
        with open(PRECOMPUTED_TRAIN_HYPOTHESES_PATH, 'w') as outfile:
            json.dump(train_hypothesis_matrix, outfile)
        with open(PRECOMPUTED_TRAIN_LABELS_PATH, 'w') as outfile:
            json.dump(train_labels, outfile)
        logger.success("Matrix stored")
    else:
        logger.info("Train matrix found, skipping its computation.")

    if not os.path.exists(PRECOMPUTED_TEST_PREMISES_PATH) \
            or not os.path.exists(PRECOMPUTED_TEST_HYPOTHESES_PATH) \
            or not os.path.exists(PRECOMPUTED_TEST_LABELS_PATH) \
            or force_recompute or embeddings_changed:
        logger.info("Creating test matrix and labels")
        test_premise_matrix, test_hypothesis_matrix, test_labels = input_data_to_matrices(
            test_dataset, id_mapping, label_dict)
        logger.info("Storing matrix for future use.", level=2)
        with open(PRECOMPUTED_TEST_PREMISES_PATH, 'w') as outfile:
            json.dump(test_premise_matrix, outfile)
        with open(PRECOMPUTED_TEST_HYPOTHESES_PATH, 'w') as outfile:
            json.dump(test_hypothesis_matrix, outfile)
        with open(PRECOMPUTED_TEST_LABELS_PATH, 'w') as outfile:
            json.dump(test_labels, outfile)
        logger.success("Matrix stored")
    else:
        logger.info("Test matrix found, skipping its computation.")

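# generate_dictionary_ids() and generate_embedding_matrix() are assumed
# helpers; the sketch below only matches how run() uses them (a word -> id
# dict, then a numpy matrix whose row i is the vector of the word with id i;
# run() calls .tolist() on the result, so a numpy array is implied). A real
# version might reserve id 0 for padding or add an out-of-vocabulary row.
import numpy as np

def generate_dictionary_ids(word_vectors):
    # Assign a dense integer id to every word that has a vector.
    return {word: idx for idx, word in enumerate(word_vectors)}

def generate_embedding_matrix(word_vectors, id_mapping):
    # Row i of the matrix holds the vector of the word whose id is i.
    dimension = len(next(iter(word_vectors.values())))
    matrix = np.zeros((len(id_mapping), dimension), dtype=np.float32)
    for word, idx in id_mapping.items():
        matrix[idx] = word_vectors[word]
    return matrix
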
def run():
    logger.header("Running trainer module.")
    logger.info("Loading embedding matrix into tensorflow model.")
    embedding_matrix = load_embedding_matrix()
    logger.success("Matrix loaded.")
    logger.info("Loading training data matrices.")
    train_premise_matrix, train_hypothesis_matrix, train_labels = load_train_matrices()
    test_premise_matrix, test_hypothesis_matrix, test_labels = load_test_matrices()
    logger.success("Matrices loaded.")

    logger.info("Building Tensorflow model.")
    # Placeholders for the feed dict
    premise_ph = tf.placeholder(tf.int32, [None, None])
    hypothesis_ph = tf.placeholder(tf.int32, [None, None])
    labels_ph = tf.placeholder(tf.float32, [None, train_labels.shape[1]])
    keep_rate_ph = tf.placeholder(tf.float32)

    # The model is returned as the optimizer's minimize operation.
    model, loss, error = build_model(premise_ph, hypothesis_ph, labels_ph,
                                     embedding_matrix, keep_rate_ph)

    # Create batch producers for both training and testing.
    num_batches = min(BATCH_CEILING, train_labels.shape[0] // BATCH_SIZE)
    num_test_batches = min(BATCH_CEILING, test_labels.shape[0] // BATCH_SIZE)
    train_batch_queue = tf.train.range_input_producer(limit=num_batches, shuffle=True)
    test_batch_queue = tf.train.range_input_producer(limit=num_test_batches, shuffle=False)
    premise_tf, hypothesis_tf, label_tf = produce_batch(
        train_premise_matrix, train_hypothesis_matrix, train_labels, train_batch_queue)
    premise_ts, hypothesis_ts, label_ts = produce_batch(
        test_premise_matrix, test_hypothesis_matrix, test_labels, test_batch_queue)
    logger.success("Model built. Number of variables: " + str(get_model_variable_count()))

    logger.info("Running Tensorflow session. Good luck.")
    with tf.Session() as session:
        # The range_input_producer queues yield nothing until their queue
        # runners are started under a coordinator.
        input_coord = tf.train.Coordinator()
        input_threads = tf.train.start_queue_runners(session, coord=input_coord)
        session.run(tf.global_variables_initializer())
        train_stats = []
        test_stats = []
        for epoch in range(1, EPOCH_COUNT + 1):
            logger.info("Epoch " + str(epoch) + " startup...", level=2)
            # Run training on all batches (optimizer on).
            sum_loss = 0
            sum_err = 0
            for batch in range(1, num_batches + 1):
                premise_batch, hypothesis_batch, labels_batch = session.run(
                    [premise_tf, hypothesis_tf, label_tf])
                _, curr_loss, curr_err = session.run(
                    [model, loss, error], {
                        premise_ph: premise_batch,
                        hypothesis_ph: hypothesis_batch,
                        labels_ph: labels_batch,
                        # DROPOUT_RATE is used as the keep probability here.
                        keep_rate_ph: DROPOUT_RATE
                    })
                sum_loss += curr_loss
                sum_err += curr_err
                if batch % 100 == 0:
                    logger.info("Batch " + str(batch) + ", loss: " +
                                str(sum_loss / batch) + " acc.: " +
                                str((1 - sum_err / batch) * 100), level=3)
            train_stats.append(
                [sum_loss / num_batches, (1 - sum_err / num_batches) * 100])
            # Run testing on all batches (optimizer off).
            test_loss = 0
            test_err = 0
            for test_batch in range(1, num_test_batches + 1):
                premise_batch_t, hypothesis_batch_t, labels_batch_t = session.run(
                    [premise_ts, hypothesis_ts, label_ts])
                curr_loss, curr_err = session.run(
                    [loss, error], {
                        premise_ph: premise_batch_t,
                        hypothesis_ph: hypothesis_batch_t,
                        labels_ph: labels_batch_t,
                        keep_rate_ph: 1.0
                    })
                test_loss += curr_loss
                test_err += curr_err
            test_loss /= num_test_batches
            test_err /= num_test_batches
            logger.info("Epoch " + str(epoch) + " done. Test loss: " +
                        str(test_loss) + " Test acc: " +
                        str((1 - test_err) * 100), level=2)
            test_stats.append([test_loss, (1 - test_err) * 100])
        print(train_stats)
        print(test_stats)
        input_coord.request_stop()
        input_coord.join(input_threads)
    logger.success("Session run complete")
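
# get_model_variable_count() is an assumed helper; a minimal TF1-style sketch
# that sums the number of scalar parameters over all trainable variables
# (shapes of trained variables are static, so as_list() returns plain ints):
def get_model_variable_count():
    total = 0
    for variable in tf.trainable_variables():
        count = 1
        for dim in variable.get_shape().as_list():
            count *= dim
        total += count
    return total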