def _load_data(self, data_type):
    source_labels = UTIL.load_embeddings(
        os.path.join(
            self.data_dir,
            self.params.files[data_type][
                self.KN_FILE_NAMES['KN_SOURCE_LABELS']])).astype(int)
    source_indx = UTIL.load_embeddings(
        os.path.join(
            self.data_dir,
            self.params.files[data_type][
                self.KN_FILE_NAMES['KN_SOURCE_IDX']])).astype(int)
    source_embeddings = UTIL.load_embeddings(
        os.path.join(
            self.data_dir,
            self.params.files[data_type][
                self.KN_FILE_NAMES['KN_SOURCE_EMBEDDINGS']]))
    target_embeddings = UTIL.load_embeddings(
        os.path.join(
            self.data_dir,
            self.params.files[data_type][
                self.KN_FILE_NAMES['KN_TARGET_EMBEDDINGS']]))
    source_padded = None
    source_length = None
    try:
        all_target_embeddings = UTIL.load_embeddings(
            os.path.join(
                self.data_dir,
                self.params.files[data_type][
                    "all_" + self.KN_FILE_NAMES['KN_TARGET_EMBEDDINGS']]))
    except (KeyError, IOError, OSError):
        # Not every data_type ships an "all targets" file; fall back to None.
        print('{} is not found for {} type'.format(
            "all_" + self.KN_FILE_NAMES['KN_TARGET_EMBEDDINGS'], data_type))
        all_target_embeddings = None
    return (source_labels, source_indx, source_embeddings, target_embeddings,
            all_target_embeddings, source_padded, source_length)
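# Hypothetical usage sketch (not part of the original module): how the tuple
# returned by _load_data is typically unpacked. Names below are illustrative.
#
#   (labels, indx, src_emb, tgt_emb,
#    all_tgt_emb, src_padded, src_len) = self._load_data('train_loss')
#   # labels/indx are int arrays; src_padded and src_len are always None here
#   # and only get populated later for the 'conv' model type.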
def _load_test_recall_data(self, load_with_file_path=False):
    if load_with_file_path:
        # Load file paths; the actual arrays stay on disk until needed.
        (self._test_recall_source_labels, self._test_recall_source_indx,
         self._test_recall_source_embeddings,
         self._test_recall_target_embeddings,
         self._test_recall_all_target_embeddings,
         self._test_recall_source_padded,
         self._test_recall_source_length) = self._load_data_path(
             'test_subset_recall')
        self._temp_test_recall_source_labels = UTIL.load_embeddings(
            self._test_recall_source_labels)
    else:
        # Load the actual data into memory.
        (self._test_recall_source_labels, self._test_recall_source_indx,
         self._test_recall_source_embeddings,
         self._test_recall_target_embeddings,
         self._test_recall_all_target_embeddings,
         self._test_recall_source_padded,
         self._test_recall_source_length) = self._load_data(
             'test_subset_recall')
        self._temp_test_recall_source_labels = self._test_recall_source_labels
    self._test_recall_baseline_source_embeddings = self._test_recall_source_embeddings
    if self.params.model['model_type'].lower() == 'conv':
        tokenized_documents = self._obtain_tokenized_documents(
            self._test_recall_source_indx)
        (self._test_recall_source_embeddings,
         self._test_recall_source_embeddings_lengths) = self._pad_documents(
             tokenized_documents)
        if load_with_file_path:
            # Persist the padded embeddings and lengths so they can be
            # reloaded from file.
            UTIL.dump_embeddings(self._test_recall_source_embeddings,
                                 self._test_recall_source_padded)
            UTIL.dump_embeddings(self._test_recall_source_embeddings_lengths,
                                 self._test_recall_source_length,
                                 dtype="int32")
            (self._test_recall_source_embeddings,
             self._test_recall_source_embeddings_lengths) = (
                 self._test_recall_source_padded,
                 self._test_recall_source_length)
    else:
        self._test_recall_source_embeddings_lengths = np.zeros(
            [self._temp_test_recall_source_labels.shape[0], 1])
        if load_with_file_path:
            UTIL.dump_embeddings(self._test_recall_source_embeddings_lengths,
                                 self._test_recall_source_length,
                                 dtype="int32")
            self._test_recall_source_embeddings_lengths = self._test_recall_source_length
def _load_predict_data(self, load_with_file_path=False):
    if load_with_file_path:
        self._source_embeddings = os.path.join(
            self.base_path,
            self.params.files['prediction']['source_embeddings'])
        self._source_padded = os.path.join(
            self.base_path, self.params.files['prediction']['source_padded'])
        self._source_length = os.path.join(
            self.base_path, self.params.files['prediction']['source_length'])
        self._baseline_source_embeddings = self._source_embeddings
        self._temp_source_embeddings = UTIL.load_embeddings(
            self._source_embeddings)
    else:
        self._source_embeddings = UTIL.load_embeddings(
            os.path.join(
                self.base_path,
                self.params.files['prediction']['source_embeddings']))
        self._source_padded = None
        self._source_length = None
        self._baseline_source_embeddings = self._source_embeddings
        self._temp_source_embeddings = self._source_embeddings
    if self.KN_FILE_NAMES['DIR'].lower().startswith('qu'):
        tokenized_documents = self._tokenized_questions
    else:
        tokenized_documents = self._tokenized_paragraphs
    if self.params.model['model_type'].lower() == 'conv':
        self._source_embeddings, self._source_embeddings_lengths = \
            self._pad_documents(tokenized_documents)
        if load_with_file_path:
            UTIL.dump_embeddings(self._source_embeddings,
                                 self._source_padded)
            UTIL.dump_embeddings(self._source_embeddings_lengths,
                                 self._source_length,
                                 dtype="int32")
            self._source_embeddings, self._source_embeddings_lengths = \
                self._source_padded, self._source_length
    else:
        self._source_embeddings_lengths = np.zeros(
            [self._temp_source_embeddings.shape[0], 1])
        if load_with_file_path:
            UTIL.dump_embeddings(self._source_embeddings_lengths,
                                 self._source_length,
                                 dtype="int32")
            self._source_embeddings_lengths = self._source_length
def _obtain_tokenized_documents(self, source_indx):
    documents = []
    if self.load_with_file_path:
        source_indx = UTIL.load_embeddings(source_indx).astype(int)
    for indx in tqdm(source_indx):
        if self.KN_FILE_NAMES['DIR'].lower().startswith('qu'):
            document = self._questions_nontokenized[indx]
        else:
            document = self._paragraphs_nontokenized[indx]
        documents.append(document)
    tokenized_documents = [document.split(' ') for document in documents]
    return tokenized_documents
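# Worked example (hypothetical data): with source_indx = [2, 0] and question
# documents ["what is x", "why y", "who wrote z"], the method returns
# [['who', 'wrote', 'z'], ['what', 'is', 'x']] -- one whitespace-split token
# list per requested index, in the order the indices were given.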
def eval_metrics_fn(base_data_path, params, before_model_embeddings,
                    after_model_embeddings, KN_FILE_NAMES):
    """Evaluate the model's recall on the test set.

    With ~5k questions and ~20k paragraphs, the evaluation cannot be handled
    in the Estimator API (the number of questions and the number of
    paragraphs are not the same), so to accelerate the calculation recall is
    computed here directly.
    """
    all_targets = tf.constant(
        load_embeddings(
            os.path.join(
                base_data_path,
                params.files[params.executor["recall_calculation_for"] +
                             "_subset_recall"][
                    "all_" + KN_FILE_NAMES["KN_TARGET_EMBEDDINGS"]])))
    normalized_all_targets = tf.nn.l2_normalize(
        all_targets, name='normalized_all_targets_embeddings', axis=1)
    subset_labels = tf.constant(
        load_embeddings(
            os.path.join(
                base_data_path,
                params.files[params.executor["recall_calculation_for"] +
                             "_subset_recall"][
                    KN_FILE_NAMES["KN_SOURCE_LABELS"]])))
    subset_labels = tf.reshape(subset_labels, [-1, 1])
    # AVG RECALLS FOR ALL RECALL_TOPS
    eval_metrics_after = evaluation_metrics(
        after_model_embeddings,
        normalized_all_targets,
        subset_labels,
        params,
        distance_type=params.executor["distance_type"])
    eval_metrics_before = None
    if params.executor["is_debug_mode"]:
        eval_metrics_before = evaluation_metrics(
            before_model_embeddings,
            normalized_all_targets,
            subset_labels,
            params,
            distance_type=params.executor["distance_type"])
    return eval_metrics_after, eval_metrics_before
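# A minimal sketch (assumed, not the repo's evaluation_metrics implementation)
# of what recall@k over cosine similarity looks like in numpy; `sources`,
# `targets`, and `labels` are hypothetical arrays:
#
#   import numpy as np
#   def recall_at_k(sources, targets, labels, k):
#       # cosine similarity via L2-normalized dot products
#       s = sources / np.linalg.norm(sources, axis=1, keepdims=True)
#       t = targets / np.linalg.norm(targets, axis=1, keepdims=True)
#       sims = s @ t.T                            # [n_sources, n_targets]
#       top_k = np.argsort(-sims, axis=1)[:, :k]  # best k targets per source
#       hits = (top_k == labels.reshape(-1, 1)).any(axis=1)
#       return hits.mean()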
def _load_train_data(self, load_with_file_path=False):
    if load_with_file_path:
        ## LOAD WITH FILE PATHS
        (self._train_source_labels, self._train_source_indx,
         self._train_source_embeddings, self._train_target_embeddings,
         self._train_all_target_embeddings, self._train_source_padded,
         self._train_source_length) = self._load_data_path('train_loss')
        self._temp_train_source_labels = UTIL.load_embeddings(
            self._train_source_labels)
    else:
        ## LOAD WITH ACTUAL DATA
        (self._train_source_labels, self._train_source_indx,
         self._train_source_embeddings, self._train_target_embeddings,
         self._train_all_target_embeddings, self._train_source_padded,
         self._train_source_length) = self._load_data('train_loss')
        self._temp_train_source_labels = self._train_source_labels
    self._train_baseline_source_embeddings = self._train_source_embeddings
    if self.params.model['model_type'].lower() == 'conv':
        tokenized_documents = self._obtain_tokenized_documents(
            self._train_source_indx)
        (self._train_source_embeddings,
         self._train_source_embeddings_lengths) = self._pad_documents(
             tokenized_documents)
        if load_with_file_path:
            ## SAVE self._train_source_embeddings AND
            ## self._train_source_embeddings_lengths SO THAT THEY CAN BE
            ## RELOADED FROM FILE
            UTIL.dump_embeddings(self._train_source_embeddings,
                                 self._train_source_padded)
            UTIL.dump_embeddings(self._train_source_embeddings_lengths,
                                 self._train_source_length,
                                 dtype="int32")
            (self._train_source_embeddings,
             self._train_source_embeddings_lengths) = (
                 self._train_source_padded, self._train_source_length)
    else:
        self._train_source_embeddings_lengths = np.zeros(
            [self._temp_train_source_labels.shape[0], 1])
        if load_with_file_path:
            ## SAVE self._train_source_embeddings_lengths SO THAT IT CAN BE
            ## RELOADED FROM FILE
            UTIL.dump_embeddings(self._train_source_embeddings_lengths,
                                 self._train_source_length,
                                 dtype="int32")
            self._train_source_embeddings_lengths = self._train_source_length
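# _pad_documents is defined elsewhere in the repo; a minimal sketch of the
# assumed behavior (pad the token sequences used by the 'conv' model to a
# fixed length and record the true lengths), with hypothetical names
# (emb_dim, embed):
#
#   def _pad_documents(self, tokenized_documents, max_len=None):
#       max_len = max_len or max(len(d) for d in tokenized_documents)
#       lengths = np.array([[len(d)] for d in tokenized_documents])
#       padded = np.zeros([len(tokenized_documents), max_len, emb_dim])
#       for i, doc in enumerate(tokenized_documents):
#           padded[i, :len(doc), :] = embed(doc)  # hypothetical embed()
#       return padded, lengths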
def dump_splitted_train_test(question_embeddings, paragraph_embeddings,
                             labels, prefix, path, partition_size):
    UTIL.dump_embeddings(labels['q'],
                         os.path.join(path, prefix + "_question_idx.hdf5"))
    UTIL.dump_embeddings(labels['p'],
                         os.path.join(path, prefix + "_question_labels.hdf5"),
                         dtype='int32')
    range_size = math.ceil(question_embeddings.shape[0] / partition_size)
    for part in range(0, range_size):
        pair_paragraph_embeddings = None
        start = part * partition_size
        end = start + partition_size
        for q_indx, q_embed in tqdm(
                enumerate(question_embeddings[start:end])):
            # q_indx is relative to the slice; offset it by start so the
            # label lookup stays aligned with the global question index.
            paragraph_embedding = paragraph_embeddings[
                labels['p'][start + q_indx]]
            if pair_paragraph_embeddings is None:
                pair_paragraph_embeddings = paragraph_embedding
            else:
                pair_paragraph_embeddings = np.vstack(
                    (pair_paragraph_embeddings, paragraph_embedding))
        UTIL.dump_embeddings(
            pair_paragraph_embeddings,
            os.path.join(
                path,
                prefix +
                "_paired_paragraph_embeddings_part_{}.hdf5".format(part)))
    # Merge the per-partition files back into a single embedding file.
    pair_paragraph_embeddings = None
    for part in range(0, range_size):
        embeddings = UTIL.load_embeddings(
            os.path.join(
                path,
                prefix +
                "_paired_paragraph_embeddings_part_{}.hdf5".format(part)))
        if pair_paragraph_embeddings is None:
            pair_paragraph_embeddings = embeddings
        else:
            pair_paragraph_embeddings = np.vstack(
                (pair_paragraph_embeddings, embeddings))
    UTIL.dump_embeddings(
        pair_paragraph_embeddings,
        os.path.join(path, prefix + "_paired_paragraph_embeddings.hdf5"))
    for part in range(0, range_size):
        os.remove(
            os.path.join(
                path,
                prefix +
                "_paired_paragraph_embeddings_part_{}.hdf5".format(part)))
    UTIL.dump_embeddings(
        question_embeddings,
        os.path.join(path, prefix + '_question_embeddings.hdf5'))
    UTIL.dump_embeddings(
        paragraph_embeddings,
        os.path.join(path, prefix + "_all_paragraph_embeddings.hdf5"))
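# Hypothetical usage sketch: pair each question embedding with the embedding
# of its labeled paragraph, writing intermediate results in memory-friendly
# chunks. The partition size of 5000 is illustrative.
#
#   dump_splitted_train_test(q_embeddings, p_embeddings,
#                            {'q': q_indices, 'p': p_labels},
#                            prefix='train', path=out_dir,
#                            partition_size=5000)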
init_op = tf.global_variables_initializer()

# Merge all summary information.
summary_op = tf.summary.merge_all()
summaries_dir = os.path.join(params.executor["model_dir"], "low_level_log",
                             'non_est_' + params.model["active_model"])

# Start the session.
with tf.Session() as sess:
    sess.run(init_op)
    train_writer = tf.summary.FileWriter(summaries_dir + '/train',
                                         sess.graph)
    test_writer = tf.summary.FileWriter(summaries_dir + '/test')

    # TRAINING DATA
    training_question_embeddings = load_embeddings(
        os.path.join(base_data_path,
                     params.files['train_loss']['question_embeddings']))
    training_paragraph_embeddings = load_embeddings(
        os.path.join(base_data_path,
                     params.files['train_loss']['paragraph_embeddings']))
    training_labels = load_embeddings(
        os.path.join(base_data_path,
                     params.files['train_loss']['question_labels']))
    training_labels = np.reshape(training_labels, [-1, 1])

    # TESTING DATA
    testing_question_embeddings = load_embeddings(
        os.path.join(base_data_path,
                     params.files['test_subset_loss']['question_embeddings']))
    testing_paragraph_embeddings = load_embeddings(
        os.path.join(
            base_data_path,
            params.files['test_subset_loss']['paragraph_embeddings']))
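    # A minimal sketch (assumed; the actual loop lives elsewhere in this
    # script) of how the writers above are typically used in a TF1 training
    # loop; train_op, loss, and train_feed are hypothetical names:
    #
    #   for step in range(max_steps):
    #       _, loss_val, summary = sess.run([train_op, loss, summary_op],
    #                                       feed_dict=train_feed)
    #       train_writer.add_summary(summary, step)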
" ".join(context) for context in tokenized_questions ] print('# of Tokenized Questions in {} : {}'.format( dataset_type, len(tokenized_questions))) # # if is_dump_during_execution: # UTIL.dump_tokenized_contexts(tokenized_paragraphs, paragraphs_file.format(dataset_type)) # UTIL.dump_tokenized_contexts(tokenized_questions, questions_file.format(dataset_type)) # UTIL.dump_mapping_data(q_to_ps, mapping_file.format(dataset_type)) end = datetime.datetime.now() print('Parsing Ended in {} minutes'.format((end - start).seconds / 60)) print(100 * '*') """ ****************************************************************************************************************** ****************************************************************************************************************** END: PARSING FILE ****************************************************************************************************************** ****************************************************************************************************************** """ tokenized_questions, tokenized_paragraphs = UTIL.fixing_the_token_problem( tokenized_questions, tokenized_paragraphs) else: s_to_ts = UTIL.load_embeddings(labels_path).astype(int) target_embeddings = UTIL.load_embeddings(target_embeddings_path) source_embeddings = UTIL.load_embeddings(source_embedddings_path) calculate_similarity_and_dump(target_embeddings, source_embeddings, s_to_ts, len(source_embeddings), recalls_path)
def main(args):
    # ################ CONFIGURATIONS #################
    squad_formatted_file = os.path.join(args.data_path,
                                        args.squad_formatted_file)
    bert_extension = ".json"
    file_name_splitter = '_'
    document_embeddings = None
    questions_folder_path = os.path.join(args.data_path, 'questions')
    paragraphs_folder_path = os.path.join(args.data_path, 'paragraphs')
    new_question_tokens_path = os.path.join(args.data_path,
                                            'questions_tokens@@.pkl')
    new_paragraph_tokens_path = os.path.join(args.data_path,
                                             'paragraphs_tokens@@.pkl')
    calculated_token_embeddings_file_path = os.path.join(
        args.data_path,
        'contextualized_document_embeddings_with_token_##_@@.hdf5')
    vocab_path = os.path.join(args.data_path, 'wordpiece_vocab.txt')
    ind_layer = None
    conc_layers = None
    test_size = None
    if args.test_size is not None:
        test_size = [int(x) for x in args.test_size.split(",")]
    # Validate the layer configuration before it is used below: exactly one
    # of ind_layer and conc_layers must be set.
    if args.ind_layer is None and args.conc_layers is None:
        raise Exception('A layer configuration (ind_layer or conc_layers) '
                        'must be provided!')
    if args.ind_layer is not None and args.conc_layers is not None:
        raise Exception('Only one layer configuration (ind_layer or '
                        'conc_layers) may be provided!')
    if args.ind_layer is not None:
        ind_layer = int(args.ind_layer)
        contextualized_questions_with_token_file_path = os.path.join(
            args.data_path,
            "contextualized_questions_embeddings_with_tokens_{}_layers_@@.hdf5"
            .format(args.ind_layer))
        contextualized_paragraphs_with_token_file_path = os.path.join(
            args.data_path,
            "contextualized_paragraphs_embeddings_with_tokens_{}_layers_@@.hdf5"
            .format(args.ind_layer))
        contextualized_document_embeddings_with_token_path = os.path.join(
            args.data_path,
            "contextualized_document_embeddings_with_token_{}_layers.hdf5"
            .format(args.ind_layer))
        final_questions_file_path = os.path.join(
            args.data_path,
            "question_document_embeddings_{}_layers_@@.hdf5".format(
                args.ind_layer))
        final_paragraphs_file_path = os.path.join(
            args.data_path,
            "paragraph_document_embeddings_{}_layers_@@.hdf5".format(
                args.ind_layer))
    else:
        conc_layers = [int(x) for x in args.conc_layers.split(",")]
        contextualized_questions_with_token_file_path = os.path.join(
            args.data_path,
            "contextualized_questions_embeddings_with_tokens_{}_layers_@@.hdf5"
            .format(conc_layers))
        contextualized_paragraphs_with_token_file_path = os.path.join(
            args.data_path,
            "contextualized_paragraphs_embeddings_with_tokens_{}_layers_@@.hdf5"
            .format(conc_layers))
        contextualized_document_embeddings_with_token_path = os.path.join(
            args.data_path,
            "contextualized_document_embeddings_with_token_{}_layers.hdf5"
            .format(conc_layers))
        final_questions_file_path = os.path.join(
            args.data_path,
            "question_document_embeddings_{}_layers_@@.hdf5".format(
                conc_layers))
        final_paragraphs_file_path = os.path.join(
            args.data_path,
            "paragraph_document_embeddings_{}_layers_@@.hdf5".format(
                conc_layers))
    # ################ CONFIGURATIONS #################
    """
    ******************************************************************************************************************
    ******************************************************************************************************************
    START: PARSING FILE
    ******************************************************************************************************************
    ******************************************************************************************************************
    """
    tokenized_questions, tokenized_paragraphs, questions_nontokenized, \
        paragraphs_nontokenized = UTIL.prepare_squad_objects(
            squad_formatted_file, args.squad_formatted_file)
    """
    ******************************************************************************************************************
    ******************************************************************************************************************
    END: PARSING FILE
    ******************************************************************************************************************
    ******************************************************************************************************************
    """
    """
    ******************************************************************************************************************
    ******************************************************************************************************************
    START: LOAD EMBEDDINGS
    ******************************************************************************************************************
    ******************************************************************************************************************
    """
    new_question_tokens = []
    is_questions_already_processed = False
    if os.path.exists(
            contextualized_questions_with_token_file_path.replace('@@', '')):
        is_questions_already_processed = True
    else:
        file_names = get_file_names(questions_folder_path, file_name_splitter,
                                    bert_extension)
        tokenized_questions_size = test_size[
            0] if test_size is not None else len(tokenized_questions)
        checkpoint = None
        jsons = None
        if args.is_parititioned is True:
            partition_counter = 0
            for _p_counter in tqdm(
                    range(0, tokenized_questions_size,
                          args.document_partition_size)):
                print("Partition {} is running for writing questions".format(
                    partition_counter))
                if not os.path.exists(
                        contextualized_questions_with_token_file_path.replace(
                            '@@', str(partition_counter))):
                    # TOKEN DEBUGGING
                    # tokens_size_before_partition = sum(
                    #     [len(sentence) for sentence in new_question_tokens])
                    jsons, checkpoint, partition_shape = process_documents(
                        _p_counter, args.document_partition_size, checkpoint,
                        jsons, tokenized_questions_size, file_names,
                        questions_folder_path, ind_layer, conc_layers,
                        new_question_tokens,
                        contextualized_questions_with_token_file_path.replace(
                            '@@', str(partition_counter)),
                        new_question_tokens_path.replace(
                            '@@', str(partition_counter)))
                    # TOKEN DEBUGGING
                    # tokens_size_after_partition = sum(
                    #     [len(sentence) for sentence in new_question_tokens])
                    # if (tokens_size_after_partition -
                    #         tokens_size_before_partition) != partition_shape[0]:
                    #     print("*" * 25)
                    #     print("Tokens problem in partition {}, before: {}, after: {}, partition_shape: {}".format(
                    #         _p_counter, tokens_size_before_partition,
                    #         tokens_size_after_partition, partition_shape[0]))
                    #     print("*" * 25)
                else:
                    new_question_tokens.extend(
                        UTIL.load_from_pickle(
                            new_question_tokens_path.replace(
                                '@@', str(partition_counter))))
                partition_counter += 1
            question_embeddings = None
            for _p_counter in tqdm(range(0, partition_counter)):
                # Report the partition currently being read, not the total.
                print("Partition {} is running for reading questions".format(
                    _p_counter))
                temp_question_embeddings = UTIL.load_embeddings(
                    contextualized_questions_with_token_file_path.replace(
                        "@@", str(_p_counter)))
                if question_embeddings is None:
                    question_embeddings = temp_question_embeddings
                else:
                    question_embeddings = np.vstack(
                        (question_embeddings, temp_question_embeddings))
                print('MAIN embeddings shape: {}'.format(
                    question_embeddings.shape))
            UTIL.dump_embeddings(
                question_embeddings,
                contextualized_questions_with_token_file_path.replace(
                    '@@', ''))
            print('MAIN embeddings are dumped')
        else:
            print("It is running for writing questions")
            jsons, checkpoint, partition_shape = process_documents(
                0, None, checkpoint, jsons, tokenized_questions_size,
                file_names, questions_folder_path, ind_layer, conc_layers,
                new_question_tokens,
                contextualized_questions_with_token_file_path.replace(
                    '@@', ''))
            UTIL.save_as_pickle(new_question_tokens,
                                new_question_tokens_path.replace('@@', ''))
    ## ***************************************************************************************************************
    ## ***************************************************************************************************************
    ## ***************************************************************************************************************
    new_paragraph_tokens = []
    is_paragraphs_already_processed = False
    if os.path.exists(
            contextualized_paragraphs_with_token_file_path.replace('@@', '')):
        is_paragraphs_already_processed = True
    else:
        file_names = get_file_names(paragraphs_folder_path,
                                    file_name_splitter, bert_extension)
        tokenized_paragraphs_size = test_size[
            1] if test_size is not None else len(tokenized_paragraphs)
        checkpoint = None
        jsons = None
        if args.is_parititioned is True:
            partition_counter = 0
            for _p_counter in tqdm(
                    range(0, tokenized_paragraphs_size,
                          args.document_partition_size)):
                print("Partition {} is running for writing paragraphs".format(
                    partition_counter))
                if not os.path.exists(
                        contextualized_paragraphs_with_token_file_path.replace(
                            '@@', str(partition_counter))):
                    # tokens_size_before_partition = sum(
                    #     [len(sentence) for sentence in new_paragraph_tokens])
                    jsons, checkpoint, partition_shape = process_documents(
                        _p_counter, args.document_partition_size, checkpoint,
                        jsons, tokenized_paragraphs_size, file_names,
                        paragraphs_folder_path, ind_layer, conc_layers,
                        new_paragraph_tokens,
                        contextualized_paragraphs_with_token_file_path.replace(
                            '@@', str(partition_counter)),
                        new_paragraph_tokens_path.replace(
                            '@@', str(partition_counter)))
                else:
                    new_paragraph_tokens.extend(
                        UTIL.load_from_pickle(
                            new_paragraph_tokens_path.replace(
                                '@@', str(partition_counter))))
                partition_counter += 1
                # TOKEN DEBUGGING
                # tokens_size_after_partition = sum(
                #     [len(sentence) for sentence in new_paragraph_tokens])
                # if (tokens_size_after_partition -
                #         tokens_size_before_partition) != partition_shape[0]:
                #     print("*" * 25)
                #     print("Tokens problem in partition {}, before: {}, after: {}, partition_shape: {}".format(
                #         _p_counter, tokens_size_before_partition,
                #         tokens_size_after_partition, partition_shape[0]))
                #     print("*" * 25)
            paragraph_embeddings = None
            for _p_counter in tqdm(range(0, partition_counter)):
                # Report the partition currently being read, not the total.
                print("Partition {} is running for reading paragraphs".format(
                    _p_counter))
                temp_paragraph_embeddings = UTIL.load_embeddings(
                    contextualized_paragraphs_with_token_file_path.replace(
                        "@@", str(_p_counter)))
                if paragraph_embeddings is None:
                    paragraph_embeddings = temp_paragraph_embeddings
                else:
                    paragraph_embeddings = np.vstack(
                        (paragraph_embeddings, temp_paragraph_embeddings))
                print('MAIN embeddings shape: {}'.format(
                    paragraph_embeddings.shape))
            UTIL.dump_embeddings(
                paragraph_embeddings,
                contextualized_paragraphs_with_token_file_path.replace(
                    '@@', ''))
            print('MAIN embeddings are dumped')
        else:
            print("It is running for writing paragraphs")
            jsons, checkpoint, partition_shape = process_documents(
                0, None, checkpoint, jsons, tokenized_paragraphs_size,
                file_names, paragraphs_folder_path, ind_layer, conc_layers,
                new_paragraph_tokens,
                contextualized_paragraphs_with_token_file_path.replace(
                    '@@', ''))
            UTIL.save_as_pickle(new_paragraph_tokens,
                                new_paragraph_tokens_path.replace('@@', ''))
    if is_questions_already_processed:
        question_embeddings = UTIL.load_embeddings(
            contextualized_questions_with_token_file_path.replace('@@', ''))
        new_question_tokens = UTIL.load_from_pickle(
            new_question_tokens_path.replace('@@', ''))
    if is_paragraphs_already_processed:
        paragraph_embeddings = UTIL.load_embeddings(
            contextualized_paragraphs_with_token_file_path.replace('@@', ''))
        new_paragraph_tokens = UTIL.load_from_pickle(
            new_paragraph_tokens_path.replace('@@', ''))
    if os.path.exists(contextualized_document_embeddings_with_token_path):
        if args.is_parititioned is not True:
            document_embeddings = UTIL.load_embeddings(
                contextualized_document_embeddings_with_token_path)
    else:
        document_embeddings = np.vstack(
            (question_embeddings, paragraph_embeddings))
        UTIL.dump_embeddings(
            document_embeddings,
            contextualized_document_embeddings_with_token_path)
        del question_embeddings
        del paragraph_embeddings
        print('All Documents are dumped')
    """
    ******************************************************************************************************************
    ******************************************************************************************************************
    END: LOAD EMBEDDINGS
    ******************************************************************************************************************
    ******************************************************************************************************************
    """
    document_embedding_guideline, corpus_as_tokens = \
        UTIL.generate_document_embedding_guideline(new_question_tokens,
                                                   new_paragraph_tokens)
    paragraphs_nontokenized = [
        " ".join(context) for context in new_paragraph_tokens
    ]
    questions_nontokenized = [
        " ".join(context) for context in new_question_tokens
    ]
    """
    ******************************************************************************************************************
    ******************************************************************************************************************
    START: IDF
    ******************************************************************************************************************
    ******************************************************************************************************************
    """
    if args.is_inject_idf:
        print('IDF is going to be calculated')
        # vocab = []
        # for sentence in new_question_tokens + new_paragraph_tokens:
        #     for word in sentence:
        #         vocab.append(word)
        # vocab = set(vocab)
        # UTIL.dump_vocab(vocab_path, vocab)
        # tokenize = wordpiece.FullTokenizer(vocab_file=vocab_path,
        #                                    do_lower_case=False)
        nlp = spacy.blank("en")
        tokenize = lambda doc: [token.text for token in nlp(doc)]
        start = datetime.datetime.now()
        token2idfweight, idf_vec = UTIL.transform_to_idf_weigths(
            new_question_tokens, new_paragraph_tokens, tokenize,
            questions_nontokenized, paragraphs_nontokenized)
        if args.is_parititioned is True:
            with h5py.File(contextualized_document_embeddings_with_token_path,
                           'r') as fin:
                partition_counter = 0
                for partition in range(0, idf_vec.shape[0],
                                       args.token_partition_size):
                    partition_counter += 1
                    temp_doc_embeddings = fin['embeddings'][
                        partition:partition + args.token_partition_size, :]
                    temp_idf_vec = idf_vec[
                        partition:partition +
                        args.token_partition_size, :].reshape(-1, 1)
                    # temp_doc_embeddings = temp_doc_embeddings[:, 0, :]
                    # temp_doc_embeddings = preprocessing.normalize(
                    #     temp_doc_embeddings, norm='l2')
                    temp_weighted_token_embeddings = np.multiply(
                        temp_idf_vec, temp_doc_embeddings)
                    UTIL.dump_embeddings(
                        temp_weighted_token_embeddings,
                        calculated_token_embeddings_file_path.replace(
                            '@@', str(partition_counter)).replace(
                                '##', 'idf'))
                    print("Partition {} is completed and processed {} - {} "
                          "tokens".format(partition_counter, partition,
                                          partition +
                                          args.token_partition_size))
        else:
            idf_vec = idf_vec.reshape(-1, 1)
            weighted_token_embeddings = np.multiply(idf_vec,
                                                    document_embeddings)
        del idf_vec
        del token2idfweight
        end = datetime.datetime.now()
        print('IDF calculation ended in {} minutes'.format(
            (end - start).seconds / 60))
    else:
        print('IDF is skipped')
        _type = 'only'
        if args.is_parititioned is True:
            with h5py.File(contextualized_document_embeddings_with_token_path,
                           'r') as fin:
                partition_counter = 0
                for partition in range(0, len(corpus_as_tokens),
                                       args.token_partition_size):
                    partition_counter += 1
                    temp_doc_embeddings = fin['embeddings'][
                        partition:partition + args.token_partition_size, :]
                    # temp_doc_embeddings = temp_doc_embeddings[:, 0, :]
                    # temp_doc_embeddings = preprocessing.normalize(
                    #     temp_doc_embeddings, norm='l2')
                    UTIL.dump_embeddings(
                        temp_doc_embeddings,
                        calculated_token_embeddings_file_path.replace(
                            '@@', str(partition_counter)).replace('##', ''))
                    print("Partition {} is completed and processed {} - {} "
                          "tokens".format(partition_counter, partition,
                                          partition +
                                          args.token_partition_size))
        else:
            weighted_token_embeddings = document_embeddings
    """
    ******************************************************************************************************************
    ******************************************************************************************************************
    END: IDF
    ******************************************************************************************************************
    ******************************************************************************************************************
    """
    """
    ******************************************************************************************************************
    ******************************************************************************************************************
    START: WEIGHTS ARE APPLIED TO TOKEN EMBEDDINGS
    ******************************************************************************************************************
    ******************************************************************************************************************
    """
    del document_embeddings
    # LOAD PARTIAL FILES AFTER CLEANING UP THE DOCUMENT EMBEDDINGS.
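    # A minimal sketch (assumed, not the repo's UTIL implementation) of the
    # IDF weighting applied above: each token embedding is scaled by its
    # token's IDF weight, and document embeddings are later formed by pooling
    # each document's token rows.
    #
    #   idf_vec:          [n_tokens, 1]    one IDF weight per corpus token
    #   token_embeddings: [n_tokens, dim]  contextualized token vectors
    #   weighted = idf_vec * token_embeddings         # broadcasts over dim
    #   doc_embedding = weighted[doc_start:doc_end].sum(axis=0)  # per doc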
    if args.is_parititioned is True:
        weighted_token_embeddings = None
        for partition in range(1, partition_counter + 1):
            temp_weighted_token_embeddings = UTIL.load_embeddings(
                calculated_token_embeddings_file_path.replace(
                    '@@', str(partition)).replace(
                        '##', 'idf' if args.is_inject_idf else ''))
            if weighted_token_embeddings is None:
                weighted_token_embeddings = temp_weighted_token_embeddings
            else:
                weighted_token_embeddings = np.vstack(
                    (weighted_token_embeddings,
                     temp_weighted_token_embeddings))
            print("Partition {} is loaded".format(partition))
    WM = None
    # WM = np.array(args['weights_arguments']).reshape(
    #     (1, len(args['weights_arguments']), 1))
    questions_embeddings, paragraphs_embeddings = \
        UTIL.token_to_document_embeddings(new_question_tokens,
                                          new_paragraph_tokens,
                                          weighted_token_embeddings,
                                          document_embedding_guideline, WM)
    if args.is_inject_idf:
        questions_elmo_embeddings = np.reshape(
            questions_embeddings,
            (questions_embeddings.shape[0], questions_embeddings.shape[1]))
        UTIL.dump_embeddings(
            questions_elmo_embeddings,
            final_questions_file_path.replace('@@', 'with_idf'))
        paragraphs_elmo_embeddings = np.reshape(
            paragraphs_embeddings,
            (paragraphs_embeddings.shape[0], paragraphs_embeddings.shape[1]))
        UTIL.dump_embeddings(
            paragraphs_elmo_embeddings,
            final_paragraphs_file_path.replace('@@', 'with_idf'))
    else:
        questions_elmo_embeddings = np.reshape(
            questions_embeddings,
            (questions_embeddings.shape[0], questions_embeddings.shape[1]))
        UTIL.dump_embeddings(questions_elmo_embeddings,
                             final_questions_file_path.replace('@@', ''))
        paragraphs_elmo_embeddings = np.reshape(
            paragraphs_embeddings,
            (paragraphs_embeddings.shape[0], paragraphs_embeddings.shape[1]))
        UTIL.dump_embeddings(paragraphs_elmo_embeddings,
                             final_paragraphs_file_path.replace('@@', ''))
    print('Weights are applied')
    """
def load_data(embedding_path, label_path, prefix):
    question_embeddings = UTIL.load_embeddings(
        os.path.join(embedding_path, prefix + '_question_embeddings.hdf5'))
    paragraph_embeddings = UTIL.load_embeddings(
        os.path.join(embedding_path, prefix + '_paragraph_embeddings.hdf5'))
    labels = pd.read_csv(
        os.path.join(label_path, prefix + '_question_labels.csv'))
    return question_embeddings, paragraph_embeddings, labels
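# Hypothetical usage sketch; 'train' matches the prefix convention used by
# dump_splitted_train_test above, and embedding_dir / label_dir are
# illustrative names:
#
#   q_emb, p_emb, labels = load_data(embedding_dir, label_dir, 'train')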
******************************************************************************************************************
START: LOAD EMBEDDINGS
******************************************************************************************************************
******************************************************************************************************************
"""
root_folder_path = os.path.join(datadir, args["root_path"])
document_embeddings = None
questions_folder_path = root_folder_path if args[
    "embedding_questions_path"] is None else os.path.join(
        root_folder_path, args["embedding_questions_path"])
question_embeddings = None
if os.path.exists(
        os.path.join(root_folder_path,
                     args['contextualized_questions_embeddings_with_token'])):
    question_embeddings = UTIL.load_embeddings(
        os.path.join(root_folder_path,
                     args['contextualized_questions_embeddings_with_token']))
else:
    for question_indx in range(len(tokenized_questions)):
        q_file_path = os.path.join(
            questions_folder_path,
            args['embedding_questions_file_pattern'].replace(
                '@@', str(question_indx)))
        question_embedding = UTIL.load_embeddings(q_file_path)
        if args['change_shape']:
            question_embedding = np.expand_dims(question_embedding, axis=1)
        if question_embeddings is None:
            question_embeddings = question_embedding
        else:
            question_embeddings = np.vstack(
                (question_embeddings, question_embedding))
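# Note: repeated np.vstack re-copies the accumulated array on every
# iteration, which is quadratic in total size. A sketch of the usual
# linear-time alternative (collect chunks, then stack once);
# q_file_path_for is a hypothetical helper standing in for the path logic
# above:
#
#   chunks = []
#   for question_indx in range(len(tokenized_questions)):
#       chunks.append(UTIL.load_embeddings(q_file_path_for(question_indx)))
#   question_embeddings = np.vstack(chunks)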