def __init__(self, data_file):
    data_processor = DataProcessor(data_file, seperator=',,,')
    self.data, self.labels = data_processor.get_training_data(raw_text=True)
    # self.data, self.labels = twenty_train.data, twenty_train.target
    self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
        self.data, self.labels, test_size=0.33, random_state=42)

    # print('Running Naive Bayes...')
    # pipeline, parameters = self.get_naive_bayes_model()

    @use_named_args(space)
    def objective(**params):
        print(params)  # max_df, ngram_range, max_features, loss, C
        pipeline = Pipeline([
            ('tfidf', TfidfVectorizer(max_df=params['max_df'],
                                      ngram_range=(1, params['ngram_range']),
                                      max_features=params['max_features'])),
            ('clf', LinearSVC(loss=params['loss'], C=params['C'], max_iter=1000)),
        ])
        pipeline.fit(self.X_train, self.y_train)
        accuracy = accuracy_score(self.y_test, pipeline.predict(self.X_test))
        print('Accuracy {}'.format(accuracy))
        return -accuracy

    res_gp = gp_minimize(objective, space, n_calls=100, random_state=10)
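# The objective above closes over a `space` list that this snippet does not
# show. A minimal sketch of what that scikit-optimize search space could look
# like; the bounds and priors are illustrative assumptions, only the dimension
# names are taken from the params used in the pipeline.
from skopt.space import Real, Integer, Categorical

space = [
    Real(0.5, 1.0, name='max_df'),
    Integer(1, 3, name='ngram_range'),
    Integer(10000, 50000, name='max_features'),
    Categorical(['hinge', 'squared_hinge'], name='loss'),
    Real(1e-3, 1e2, prior='log-uniform', name='C'),
]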
def test_on_guild_join(gid, obj_guild):
    # Make sure guild is removed from guilds table
    DataConnector.run_query("DELETE FROM {}.guilds WHERE guild_id='{}'".format(
        SCHEMA_NAME, gid))
    df = DataConnector.read_data(
        "SELECT * FROM {}.guilds WHERE guild_id='{}'".format(SCHEMA_NAME, gid))
    assert df.shape[0] == 0

    # Make sure guild is removed from days table
    DataConnector.run_query("DELETE FROM {}.days WHERE guild_id='{}'".format(
        SCHEMA_NAME, gid))
    df = DataConnector.read_data(
        "SELECT * FROM {}.days WHERE guild_id='{}'".format(SCHEMA_NAME, gid))
    assert df.shape[0] == 0

    # Check on guild join
    DataProcessor._on_guild_join(obj_guild)
    df = DataConnector.read_data(
        "SELECT * FROM {}.guilds WHERE guild_id='{}'".format(SCHEMA_NAME, gid))
    assert df.shape[0] == 1

    # Also verify the days table
    df = DataConnector.read_data(
        "SELECT * FROM {}.days WHERE guild_id='{}'".format(SCHEMA_NAME, gid))
    assert df.shape[0] == 1

    # Check that the size doesn't change on the same guild
    DataProcessor._on_guild_join(obj_guild)
    df = DataConnector.read_data(
        "SELECT * FROM {}.guilds WHERE guild_id='{}'".format(SCHEMA_NAME, gid))
    assert df.shape[0] == 1
    df = DataConnector.read_data(
        "SELECT * FROM {}.days WHERE guild_id='{}'".format(SCHEMA_NAME, gid))
    assert df.shape[0] == 1
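# Hypothetical pytest fixtures supplying the gid / obj_guild arguments used by
# the tests in this module; the real project presumably defines equivalents in
# a conftest.py. The guilds() stub is the same one used in test_on_ready below.
import pytest

@pytest.fixture
def gid():
    return '12345'

@pytest.fixture
def obj_guild(gid):
    return guilds(gid)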
def valid(full_path, in_vocab):
    data_processor_valid = DataProcessor(full_path, in_vocab)
    pred_scores = []
    true_scores = []
    eval_loss = 0
    num_loss = 0
    while True:
        a_ids_data, a_context_ids_data, a_keyword_index, a_len_data, p_ids_data, p_context_ids_data, \
            p_keyword_index, p_len_data, y_data = \
            data_processor_valid.get_batch_siamese(arg)
        if len(a_ids_data) != 0:
            feed_dict = {
                model.input_a.name: a_ids_data,
                model.input_a_context.name: a_context_ids_data,
                model.input_a_keyword_index.name: a_keyword_index,
                model.input_a_len.name: a_len_data,
                model.input_n.name: p_ids_data,
                model.input_n_context.name: p_context_ids_data,
                model.input_n_keyword_index.name: p_keyword_index,
                model.input_n_len.name: p_len_data,
                model.input_y: y_data
            }
            ret = sess.run(inference_outputs, feed_dict)
            eval_loss += np.mean(ret[1])
            num_loss += 1
            pred_scores.append(ret[0])
            true_scores.append(y_data)
        if data_processor_valid.end == 1:
            break
    pred_scores = np.concatenate(pred_scores)
    true_scores = np.concatenate(true_scores)

    from sklearn import metrics
    fpr, tpr, thresholds = metrics.roc_curve(true_scores, pred_scores, pos_label=1)
    auc = metrics.auc(fpr, tpr)
    map_score = average_precision_score(true_scores, pred_scores, average='micro')
    df = pd.DataFrame({
        'model': 'siamese',
        'score': pred_scores,
        'class': true_scores
    })
    logging.info('Loss: ' + str(eval_loss / num_loss))
    logging.info('AUC: ' + str(auc))
    logging.info('MAP: ' + str(map_score))
    data_processor_valid.close()
    return (eval_loss / num_loss), auc, df
def test_on_ready(gid, obj_guild):
    # New guild test
    DataConnector.run_query("DELETE FROM {}.guilds WHERE guild_id='{}'".format(
        SCHEMA_NAME, gid))
    df = DataConnector.read_data(
        "SELECT * FROM {}.guilds WHERE guild_id='{}'".format(SCHEMA_NAME, gid))
    assert df.shape[0] == 0
    lst_guilds = []
    lst_guilds.append(obj_guild)
    DataProcessor._on_ready(lst_guilds)
    df = DataConnector.read_data(
        "SELECT * FROM {}.guilds WHERE guild_id='{}'".format(SCHEMA_NAME, gid))
    assert df.shape[0] == 1

    # Same guild list test
    DataProcessor._on_ready(lst_guilds)
    df = DataConnector.read_data(
        "SELECT * FROM {}.guilds WHERE guild_id='{}'".format(SCHEMA_NAME, gid))
    assert df.shape[0] == 1

    # Additional guild test
    new_gid = '54321'
    DataConnector.run_query("DELETE FROM {}.guilds WHERE guild_id='{}'".format(
        SCHEMA_NAME, new_gid))
    new_obj_guild = guilds(new_gid)
    lst_guilds.append(new_obj_guild)
    DataProcessor._on_ready(lst_guilds)
    df = DataConnector.read_data("""SELECT * FROM {}.guilds
                                    WHERE guild_id='{}' or guild_id='{}'
                                 """.format(SCHEMA_NAME, gid, new_gid))
    assert df.shape[0] == 2

    # Removed guilds test
    empty_lst_guilds = []
    DataProcessor._on_ready(empty_lst_guilds)
    df = DataConnector.read_data("""SELECT * FROM {}.guilds
                                    WHERE guild_id='{}' or guild_id='{}'
                                 """.format(SCHEMA_NAME, gid, new_gid))
    assert df.shape[0] == 0

    # One last sanity check
    lst_guilds.remove(new_obj_guild)
    DataProcessor._on_ready(lst_guilds)
    df = DataConnector.read_data("""SELECT * FROM {}.guilds
                                    WHERE guild_id='{}' or guild_id='{}'
                                 """.format(SCHEMA_NAME, gid, new_gid))
    assert df.shape[0] == 1
def inference(full_path, full_inference_label_file, in_vocab):
    data_processor_valid = DataProcessor(full_path, in_vocab)
    pred_scores = []
    while True:
        a_ids_data, a_context_ids_data, a_keyword_index, a_len_data, p_ids_data, p_context_ids_data, \
            p_keyword_index, p_len_data, n_ids_data, n_context_ids_data, n_keyword_index, n_len_data = \
            data_processor_valid.get_batch_triple(arg)
        if len(a_ids_data) != 0:
            feed_dict = {
                model.input_a.name: a_ids_data,
                model.input_a_context.name: a_context_ids_data,
                model.input_a_keyword_index.name: a_keyword_index,
                model.input_a_len.name: a_len_data,
                model.input_n.name: n_ids_data,
                model.input_n_context.name: n_context_ids_data,
                model.input_n_keyword_index.name: n_keyword_index,
                model.input_n_len.name: n_len_data,
                model.input_p.name: p_ids_data,
                model.input_p_context.name: p_context_ids_data,
                model.input_p_keyword_index.name: p_keyword_index,
                model.input_p_len.name: p_len_data,
            }
            ret = sess.run(inference_outputs, feed_dict)
            pred_scores.append(ret[0][1])
        if data_processor_valid.end == 1:
            break
    pred_scores = np.concatenate(pred_scores, axis=0)

    import pandas as pd
    df = pd.read_csv(full_inference_label_file, sep='\t', header=None)
    true_scores = df.iloc[:, 2]
    chunks = df.groupby(df.iloc[:, 0]).groups
    for k_num in [1, 5, 10, 20]:
        topkp, topkr, topkf1 = evaluateTopN(pred_scores, true_scores, chunks, k_num)
    data_processor_valid.close()
    return
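# Illustrative sketch of a top-N evaluation in the spirit of the evaluateTopN()
# call above; the project's actual helper may differ in signature, averaging,
# and tie handling, and this version assumes the dataframe has a default
# RangeIndex so group indices double as row positions.
import numpy as np

def evaluate_top_n_sketch(pred_scores, true_scores, chunks, k):
    pred = np.asarray(pred_scores)
    truth = np.asarray(true_scores)
    precisions, recalls = [], []
    for _, row_idx in chunks.items():
        idx = np.asarray(row_idx)
        top_k = idx[np.argsort(-pred[idx])][:k]   # k highest-scored candidates per query
        hits = truth[top_k].sum()
        precisions.append(hits / float(k))
        recalls.append(hits / max(float(truth[idx].sum()), 1.0))
    p, r = float(np.mean(precisions)), float(np.mean(recalls))
    f1 = 2 * p * r / max(p + r, 1e-12)
    return p, r, f1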
def test_on_message_watch_channel(gid, obj_guild):
    DataProcessor._on_guild_join(obj_guild)
    cid = '1234567890'
    DataProcessor._on_message_watch_channel(gid, cid)
    df = DataConnector.read_data(
        "SELECT channel_id FROM {}.guilds WHERE guild_id='{}'".format(
            SCHEMA_NAME, gid))
    assert df['channel_id'][0] == cid
    DataProcessor._on_guild_join(obj_guild)
def test_on_message_watch_message(gid, obj_guild):
    DataProcessor._on_guild_join(obj_guild)
    mid = '1234567890'
    DataProcessor._on_message_watch_message(gid, mid)
    df = DataConnector.read_data(
        "SELECT message_id FROM {}.guilds WHERE guild_id='{}'".format(
            SCHEMA_NAME, gid))
    assert df['message_id'][0] == mid
    df = DataConnector.read_data(
        "SELECT COUNT(*) FROM {}.guilds WHERE guild_id='{}'".format(
            SCHEMA_NAME, gid))
    assert df['count'][0] == 1
    DataProcessor._on_guild_join(obj_guild)
def calculate(i):
    global last_sum_volume, last_price
    # response = binance_api.send_public_request('/api/v3/trades', payload={'symbol': symbol, "limit": 1000})
    response = binance_api.send_public_request('/fapi/v1/trades',
                                               payload={'symbol': symbol, "limit": 1000})
    df = DataProcessor.convert_trade_response_to_dataframe(response)
    # df["volume_buy"] = df[ df["isBuyerMaker"] == False ]["qty"].sum()
    # df["volume_sell"] = df[ df["isBuyerMaker"] == True ]["qty"].sum()
    # df["count_buy"] = df[ df["isBuyerMaker"] == False ].shape[0]
    # df["count_sell"] = df[ df["isBuyerMaker"] == True ].shape[0]
    # df["isBuyerMaker"][ df["isBuyerMaker"] == False ] = "buy"
    # df["isBuyerMaker"][ df["isBuyerMaker"] == True ] = "sell"
    # df.to_csv("current_trade_logs.csv")

    # Net taker volume: buy-side quantity minus sell-side quantity
    sum_volume = (df[df["isBuyerMaker"] == False]["qty"].sum()
                  - df[df["isBuyerMaker"] == True]["qty"].sum())
    sum_volume_deque.append(sum_volume)

    diff_price = df["price"].iloc[-1] - last_price
    last_price = df["price"].iloc[-1]
    diff_price_deque.append(diff_price)
    zero_deque.append(0)
    if i < max_lenth:
        x_deque.append(i)

    diff_volume = sum_volume - last_sum_volume
    last_sum_volume = sum_volume
    last_sum_volume_deque.append(str(int(diff_volume)))

    ax.set_title(int(sum_volume), loc="left", fontsize=20)
    ax.set_title(int(diff_volume), loc="right", fontsize=20)
    ax.xaxis.set_label_position('top')
    ax.xaxis.tick_top()
    ax.set_xlabel(" ".join(last_sum_volume_deque))
    line1.set_data(x_deque, sum_volume_deque)
    line2.set_data(x_deque, diff_price_deque)
    return [line1, line2]
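# A minimal sketch of how the globals used by calculate() might be wired up
# and how the function could be driven by matplotlib's FuncAnimation. The
# window size, deque setup, and figure layout are assumptions, not taken from
# the project; the variable names mirror the snippet above.
from collections import deque
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation

max_lenth = 120                            # window size (name kept from the snippet)
x_deque = deque(maxlen=max_lenth)
sum_volume_deque = deque(maxlen=max_lenth)
diff_price_deque = deque(maxlen=max_lenth)
zero_deque = deque(maxlen=max_lenth)
last_sum_volume_deque = deque(maxlen=20)
last_sum_volume, last_price = 0.0, 0.0

fig, ax = plt.subplots()
line1, = ax.plot([], [], label="net taker volume")
line2, = ax.plot([], [], label="price change")

# Call calculate(i) once per second with an increasing frame counter.
ani = FuncAnimation(fig, calculate, interval=1000, blit=False)
plt.show()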
def test_on_guild_remove(gid, obj_guild):
    DataProcessor._on_guild_join(obj_guild)

    # Ensure that the size is 1
    df = DataConnector.read_data(
        "SELECT * FROM {}.guilds WHERE guild_id='{}'".format(SCHEMA_NAME, gid))
    assert df.shape[0] == 1
    df = DataConnector.read_data(
        "SELECT * FROM {}.days WHERE guild_id='{}'".format(SCHEMA_NAME, gid))
    assert df.shape[0] == 1

    # Remove the guild and verify none in guilds and days tables
    DataProcessor._on_guild_remove(obj_guild)
    df = DataConnector.read_data(
        "SELECT * FROM {}.guilds WHERE guild_id='{}'".format(SCHEMA_NAME, gid))
    assert df.shape[0] == 0
    df = DataConnector.read_data(
        "SELECT * FROM {}.days WHERE guild_id='{}'".format(SCHEMA_NAME, gid))
    assert df.shape[0] == 0

    DataProcessor._on_guild_join(obj_guild)
def valid(in_path, slot_path, intent_path): data_processor_valid = DataProcessor(in_path, slot_path, intent_path, in_vocab, slot_vocab, intent_vocab) pred_intents = [] correct_intents = [] slot_outputs = [] correct_slots = [] input_words = [] gate_seq = [] while True: in_data, slot_data, slot_weight, length, intents, in_seq, slot_seq, intent_seq = data_processor_valid.get_batch( arg.batch_size) if len(in_data) <= 0: break feed_dict = { input_data.name: in_data, sequence_length.name: length } ret = sess.run(inference_outputs, feed_dict) for i in ret[0]: pred_intents.append(np.argmax(i)) for i in intents: correct_intents.append(i) pred_slots = ret[1].reshape( (slot_data.shape[0], slot_data.shape[1], -1)) for p, t, i, l in zip(pred_slots, slot_data, in_data, length): if arg.use_crf: p = p.reshape([-1]) else: p = np.argmax(p, 1) tmp_pred = [] tmp_correct = [] tmp_input = [] for j in range(l): tmp_pred.append(slot_vocab['rev'][p[j]]) tmp_correct.append(slot_vocab['rev'][t[j]]) tmp_input.append(in_vocab['rev'][i[j]]) slot_outputs.append(tmp_pred) correct_slots.append(tmp_correct) input_words.append(tmp_input) if data_processor_valid.end == 1: break pred_intents = np.array(pred_intents) correct_intents = np.array(correct_intents) accuracy = (pred_intents == correct_intents) semantic_acc = accuracy accuracy = accuracy.astype(float) accuracy = np.mean(accuracy) * 100.0 index = 0 for t, p in zip(correct_slots, slot_outputs): # Process Semantic Error if len(t) != len(p): raise ValueError('Error!!') for j in range(len(t)): if p[j] != t[j]: semantic_acc[index] = False break index += 1 semantic_acc = semantic_acc.astype(float) semantic_acc = np.mean(semantic_acc) * 100.0 f1, precision, recall = computeF1Score(correct_slots, slot_outputs) if "test" in in_path: print("save result_intent.out") with open(str(epochs) + "intent.out", "w") as outfile: for true, pred in zip(correct_intents, pred_intents): outfile.write("{} {}\n".format(true, pred)) print("save slot.out") with open(in_path) as infile: data = infile.readlines() lines = [line.split() for line in data] with open(str(epochs) + "-slot.out", "w") as outfile: print(len(lines), len(correct_slots), len(slot_outputs)) for i in range(len(lines)): for w, true, pred in zip( lines[i], correct_slots[i], slot_outputs[i]): outfile.write("{} {} {}\n".format( w, true, pred)) outfile.write("\n") logging.info('slot f1: ' + str(f1)) logging.info('intent accuracy: ' + str(accuracy)) logging.info('semantic Acc(intent, slots are all correct): ' + str(semantic_acc)) data_processor_valid.close() return f1, accuracy, semantic_acc, pred_intents, correct_intents, slot_outputs, correct_slots, input_words, gate_seq
num_loss = 0
step = 0
no_improve = 0
valid_slot = 0
test_slot = 0
valid_intent = 0
test_intent = 0
valid_err = 0
test_err = 0
best_epoch_num = 0
while True:
    if data_processor is None:
        data_processor = DataProcessor(
            os.path.join(full_train_path, arg.input_file),
            os.path.join(full_train_path, arg.slot_file),
            os.path.join(full_train_path, arg.intent_file), in_vocab,
            slot_vocab, intent_vocab)
    in_data, slot_data, slot_weight, length, intents, _, _, _ = data_processor.get_batch(
        arg.batch_size)
    feed_dict = {
        input_data.name: in_data,
        slots.name: slot_data,
        slot_weights.name: slot_weight,
        sequence_length.name: length,
        intent.name: intents
    }
    ret = sess.run(training_outputs, feed_dict)
    # print(feed_dict)
    loss += np.mean(ret[1])
###########################################
# Brake discs located on the right and left sides of a train
df['ComponentParentLocation'].unique()
df[df['ComponentParentLocation'] == 2]
df[df['ComponentParentLocation'].isna()]

mask = (df['ComponentParentLocation'] == 1) & (df['BrskChangeDate1'].notnull())
col_names = ['BrskThickness1', 'TotalPerformanceSnapshot']
thick_1 = df.loc[mask, col_names]

df_sort = df.sort_values('ComponentParentLocation')
cols = ['BrskThickness1', 'BrskThickness2', 'BrskThickness3', 'BrskThickness4']

data_prep = DataProcessor(df)
df_1 = data_prep.string_to_numeric(df, cols=cols)


def mu_sd(df, col_1):
    return df.groupby('ComponentParentLocation').agg({col_1: ['mean', 'std']})


df_1.groupby('ComponentParentLocation').agg({'BrskThickness1': ['mean', 'std']})
df_1.groupby('ComponentParentLocation')[cols].describe()

##########################################
columns_to_remove = ['PostID', 'BrskLatheDate1', 'BrskLatheDate2',
                     'BrskLatheDate3', 'BrskLatheDate4',
                     'ReportingDateTime', 'DataSavedInDBDateTime']
df = data_loader.remove_col(df, column_name_list=columns_to_remove)

# Create a new column for brake discs 1 and 2 thickness
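# Rough pandas equivalent of the string_to_numeric() call above; the actual
# DataProcessor helper is assumed to do something similar (coercing
# unparseable values to NaN).
import pandas as pd

def string_to_numeric_sketch(frame, cols):
    out = frame.copy()
    for c in cols:
        out[c] = pd.to_numeric(out[c], errors='coerce')
    return out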
line = 0
num_loss = 0
step = 0
no_improve = 0
# variables to store highest values among epochs, only use 'valid_err' for now
valid_slot = 0
test_slot = 0
valid_intent = 0
test_intent = 0
valid_err = 0
test_err = 0
while True:
    if data_processor is None:
        data_processor = DataProcessor(
            os.path.join(full_train_path, arg.input_file),
            os.path.join(full_train_path, arg.slot_file),
            os.path.join(full_train_path, arg.intent_file), in_vocab,
            slot_vocab, intent_vocab)
    in_data, slot_data, slot_weight, length, intents, _, _, _ = data_processor.get_batch(
        arg.batch_size)
    feed_dict = {
        input_data.name: in_data,
        slots.name: slot_data,
        slot_weights.name: slot_weight,
        sequence_length.name: length,
        intent.name: intents
    }
    ret = sess.run(training_outputs, feed_dict)
    loss += np.mean(ret[1])

    line += arg.batch_size
    step = ret[0]
    num_loss += 1

    if data_processor.end == 1:
        line = 0
        data_processor.close()
        data_processor = None
        epochs += 1
        logging.info('Step: ' + str(step))
epochs = 0
eval_loss = 0.0
data_processor = None
line = 0
num_loss = 0
step = 0
no_improve = 0
valid_err = 1
test_err = 1
while True:
    if data_processor is None:
        data_processor = DataProcessor(full_train_path, in_vocab, shuffle=True)
    a_ids_data, a_context_ids_data, a_keyword_index, a_len_data, n_ids_data, \
        n_context_ids_data, n_keyword_index, n_len_data, y = \
        data_processor.get_batch_siamese(arg)
    if len(a_ids_data) != 0:
        feed_dict = {
            model.input_a.name: a_ids_data,
            model.input_a_context.name: a_context_ids_data,
            model.input_a_keyword_index.name: a_keyword_index,
            model.input_a_len.name: a_len_data,
            model.input_n.name: n_ids_data,
            model.input_n_context.name: n_context_ids_data,
            model.input_n_keyword_index.name: n_keyword_index,
            model.input_n_len.name: n_len_data,
            model.input_y.name: y
        }
def main(): parser = argparse.ArgumentParser() # Required parameters parser.add_argument( "--bert_model", default='bert-base-uncased', type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument( '--task', type=str, default=None, required=True, help="Task code in {hotpot_open, hotpot_distractor, squad, nq}") # Other parameters parser.add_argument( "--max_seq_length", default=378, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=1, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=5, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam. (def: 5e-5)") parser.add_argument("--num_train_epochs", default=5.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." 
) parser.add_argument('--local_rank', default=-1, type=int) # RNN graph retriever-specific parameters parser.add_argument("--example_limit", default=None, type=int) parser.add_argument("--max_para_num", default=10, type=int) parser.add_argument( "--neg_chunk", default=8, type=int, help="The chunk size of negative examples during training (to " "reduce GPU memory consumption with negative sampling)") parser.add_argument( "--eval_chunk", default=100000, type=int, help= "The chunk size of evaluation examples (to reduce RAM consumption during evaluation)" ) parser.add_argument( "--split_chunk", default=300, type=int, help= "The chunk size of BERT encoding during inference (to reduce GPU memory consumption)" ) parser.add_argument('--train_file_path', type=str, default=None, help="File path to the training data") parser.add_argument('--dev_file_path', type=str, default=None, help="File path to the eval data") parser.add_argument('--beam', type=int, default=1, help="Beam size") parser.add_argument('--min_select_num', type=int, default=1, help="Minimum number of selected paragraphs") parser.add_argument('--max_select_num', type=int, default=3, help="Maximum number of selected paragraphs") parser.add_argument( "--use_redundant", action='store_true', help="Whether to use simulated seqs (only for training)") parser.add_argument( "--use_multiple_redundant", action='store_true', help="Whether to use multiple simulated seqs (only for training)") parser.add_argument( '--max_redundant_num', type=int, default=100000, help= "Whether to limit the number of the initial TF-IDF pool (only for open-domain eval)" ) parser.add_argument( "--no_links", action='store_true', help= "Whether to omit any links (or in other words, only use TF-IDF-based paragraphs)" ) parser.add_argument("--pruning_by_links", action='store_true', help="Whether to do pruning by links (and top 1)") parser.add_argument( "--expand_links", action='store_true', help= "Whether to expand links with paragraphs in the same article (for NQ)") parser.add_argument( '--tfidf_limit', type=int, default=None, help= "Whether to limit the number of the initial TF-IDF pool (only for open-domain eval)" ) parser.add_argument("--pred_file", default=None, type=str, help="File name to write paragraph selection results") parser.add_argument("--tagme", action='store_true', help="Whether to use tagme at inference") parser.add_argument( '--topk', type=int, default=2, help="Whether to use how many paragraphs from the previous steps") parser.add_argument( "--model_suffix", default=None, type=str, help="Suffix to load a model file ('pytorch_model_' + suffix +'.bin')") parser.add_argument("--db_save_path", default=None, type=str, help="File path to DB") parser.add_argument("--fp16", default=False, action='store_true') parser.add_argument("--fp16_opt_level", default="O1", type=str) parser.add_argument("--do_label", default=False, action='store_true', help="For pre-processing features only.") parser.add_argument("--oss_cache_dir", default=None, type=str) parser.add_argument("--cache_dir", default=None, type=str) parser.add_argument("--dist", default=False, action='store_true', help='use distributed training.') parser.add_argument("--save_steps", default=5000, type=int) parser.add_argument("--resume", default=None, type=int) parser.add_argument("--oss_pretrain", default=None, type=str) parser.add_argument("--model_version", default='v1', type=str) parser.add_argument("--disable_rnn_layer_norm", default=False, action='store_true') args = parser.parse_args() if args.dist: 
dist.init_process_group(backend='nccl') print(f"local rank: {args.local_rank}") print(f"global rank: {dist.get_rank()}") print(f"world size: {dist.get_world_size()}") if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of synchronizing nodes/GPUs dist.init_process_group(backend='nccl') if args.dist: global_rank = dist.get_rank() world_size = dist.get_world_size() if world_size > 1: args.local_rank = global_rank if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if args.train_file_path is not None: do_train = True if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.". format(args.output_dir)) if args.local_rank in [-1, 0]: os.makedirs(args.output_dir, exist_ok=True) elif args.dev_file_path is not None: do_train = False else: raise ValueError( 'One of train_file_path: {} or dev_file_path: {} must be non-None'. format(args.train_file_path, args.dev_file_path)) processor = DataProcessor() # Configurations of the graph retriever graph_retriever_config = GraphRetrieverConfig( example_limit=args.example_limit, task=args.task, max_seq_length=args.max_seq_length, max_select_num=args.max_select_num, max_para_num=args.max_para_num, tfidf_limit=args.tfidf_limit, train_file_path=args.train_file_path, use_redundant=args.use_redundant, use_multiple_redundant=args.use_multiple_redundant, max_redundant_num=args.max_redundant_num, dev_file_path=args.dev_file_path, beam=args.beam, min_select_num=args.min_select_num, no_links=args.no_links, pruning_by_links=args.pruning_by_links, expand_links=args.expand_links, eval_chunk=args.eval_chunk, tagme=args.tagme, topk=args.topk, db_save_path=args.db_save_path, disable_rnn_layer_norm=args.disable_rnn_layer_norm) logger.info(graph_retriever_config) logger.info(args) tokenizer = AutoTokenizer.from_pretrained(args.bert_model) if args.model_version == 'roberta': from modeling_graph_retriever_roberta import RobertaForGraphRetriever elif args.model_version == 'v3': from modeling_graph_retriever_roberta import RobertaForGraphRetrieverIterV3 as RobertaForGraphRetriever else: raise RuntimeError() ############################## # Training # ############################## if do_train: _model_state_dict = None if args.oss_pretrain is not None: _model_state_dict = torch.load(load_pretrain_from_oss( args.oss_pretrain), map_location='cpu') logger.info(f"Loaded pretrained model from {args.oss_pretrain}") if args.resume is not None: _model_state_dict = torch.load(load_buffer_from_oss( os.path.join(args.oss_cache_dir, f"pytorch_model_{args.resume}.bin")), map_location='cpu') model = RobertaForGraphRetriever.from_pretrained( args.bert_model, graph_retriever_config=graph_retriever_config, state_dict=_model_state_dict) model.to(device) global_step = 0 POSITIVE = 1.0 NEGATIVE = 0.0 _cache_file_name = f"cache_roberta_train_{args.max_seq_length}_{args.max_para_num}" _examples_cache_file_name = 
f"examples_{_cache_file_name}" _features_cache_file_name = f"features_{_cache_file_name}" # Load training examples logger.info(f"Loading training examples and features.") try: if args.cache_dir is not None and os.path.exists( os.path.join(args.cache_dir, _features_cache_file_name)): logger.info( f"Loading pre-processed features from {os.path.join(args.cache_dir, _features_cache_file_name)}" ) train_features = torch.load( os.path.join(args.cache_dir, _features_cache_file_name)) else: # train_examples = torch.load(load_buffer_from_oss(os.path.join(oss_features_cache_dir, # _examples_cache_file_name))) train_features = torch.load( load_buffer_from_oss( os.path.join(oss_features_cache_dir, _features_cache_file_name))) logger.info( f"Pre-processed features are loaded from oss: " f"{os.path.join(oss_features_cache_dir, _features_cache_file_name)}" ) except: train_examples = processor.get_train_examples( graph_retriever_config) train_features = convert_examples_to_features( train_examples, args.max_seq_length, args.max_para_num, graph_retriever_config, tokenizer, train=True) logger.info( f"Saving pre-processed features into oss: {oss_features_cache_dir}" ) torch_save_to_oss( train_examples, os.path.join(oss_features_cache_dir, _examples_cache_file_name)) torch_save_to_oss( train_features, os.path.join(oss_features_cache_dir, _features_cache_file_name)) if args.do_label: logger.info("Finished.") return # len(train_examples) and len(train_features) can be different, depending on the redundant setting num_train_steps = int( len(train_features) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight', 'layer_norm'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] t_total = num_train_steps if args.local_rank != -1: t_total = t_total // dist.get_world_size() optimizer = AdamW(optimizer_grouped_parameters, betas=(0.9, 0.98), lr=args.learning_rate) scheduler = get_linear_schedule_with_warmup( optimizer, int(t_total * args.warmup_proportion), t_total) logger.info(optimizer) if args.fp16: from apex import amp amp.register_half_function(torch, "einsum") model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) if args.local_rank != -1: if args.fp16_opt_level == 'O2': try: import apex model = apex.parallel.DistributedDataParallel( model, delay_allreduce=True) except ImportError: model = torch.nn.parallel.DistributedDataParallel( model, find_unused_parameters=True) else: model = torch.nn.parallel.DistributedDataParallel( model, find_unused_parameters=True) if n_gpu > 1: model = torch.nn.DataParallel(model) if args.resume is not None: _amp_state_dict = os.path.join(args.oss_cache_dir, f"amp_{args.resume}.bin") _optimizer_state_dict = os.path.join( args.oss_cache_dir, f"optimizer_{args.resume}.pt") _scheduler_state_dict = os.path.join( args.oss_cache_dir, f"scheduler_{args.resume}.pt") amp.load_state_dict( torch.load(load_buffer_from_oss(_amp_state_dict))) optimizer.load_state_dict( torch.load(load_buffer_from_oss(_optimizer_state_dict))) scheduler.load_state_dict( torch.load(load_buffer_from_oss(_scheduler_state_dict))) logger.info(f"Loaded resumed state dict of step {args.resume}") logger.info("***** Running training *****") logger.info(" Num 
examples = %d", len(train_features)) logger.info(" Instantaneous batch size per GPU = %d", args.train_batch_size) logger.info( " Total train batch size (w. parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (dist.get_world_size() if args.local_rank != -1 else 1), ) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) model.train() epc = 0 # test if args.local_rank in [-1, 0]: if args.fp16: amp_file = os.path.join(args.oss_cache_dir, f"amp_{global_step}.bin") torch_save_to_oss(amp.state_dict(), amp_file) optimizer_file = os.path.join(args.oss_cache_dir, f"optimizer_{global_step}.pt") torch_save_to_oss(optimizer.state_dict(), optimizer_file) scheduler_file = os.path.join(args.oss_cache_dir, f"scheduler_{global_step}.pt") torch_save_to_oss(scheduler.state_dict(), scheduler_file) tr_loss = 0 for _ in range(int(args.num_train_epochs)): logger.info('Epoch ' + str(epc + 1)) TOTAL_NUM = len(train_features) train_start_index = 0 CHUNK_NUM = 8 train_chunk = TOTAL_NUM // CHUNK_NUM chunk_index = 0 random.shuffle(train_features) save_retry = False while train_start_index < TOTAL_NUM: train_end_index = min(train_start_index + train_chunk - 1, TOTAL_NUM - 1) chunk_len = train_end_index - train_start_index + 1 if args.resume is not None and global_step < args.resume: _chunk_steps = int( math.ceil(chunk_len * 1.0 / args.train_batch_size / (1 if args.local_rank == -1 else dist.get_world_size()))) _chunk_steps = _chunk_steps // args.gradient_accumulation_steps if global_step + _chunk_steps <= args.resume: global_step += _chunk_steps train_start_index = train_end_index + 1 continue train_features_ = train_features[ train_start_index:train_start_index + chunk_len] all_input_ids = torch.tensor( [f.input_ids for f in train_features_], dtype=torch.long) all_input_masks = torch.tensor( [f.input_masks for f in train_features_], dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in train_features_], dtype=torch.long) all_output_masks = torch.tensor( [f.output_masks for f in train_features_], dtype=torch.float) all_num_paragraphs = torch.tensor( [f.num_paragraphs for f in train_features_], dtype=torch.long) all_num_steps = torch.tensor( [f.num_steps for f in train_features_], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_masks, all_segment_ids, all_output_masks, all_num_paragraphs, all_num_steps) if args.local_rank != -1: train_sampler = torch.utils.data.DistributedSampler( train_data) else: train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size, pin_memory=True, num_workers=4) if args.local_rank != -1: train_dataloader.sampler.set_epoch(epc) logger.info('Examples from ' + str(train_start_index) + ' to ' + str(train_end_index)) for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])): if args.resume is not None and global_step < args.resume: if (step + 1) % args.gradient_accumulation_steps == 0: global_step += 1 continue input_masks = batch[1] batch_max_len = input_masks.sum(dim=2).max().item() num_paragraphs = batch[4] batch_max_para_num = num_paragraphs.max().item() num_steps = batch[5] batch_max_steps = num_steps.max().item() # output_masks_cpu = (batch[3])[:, :batch_max_steps, :batch_max_para_num + 1] batch = tuple(t.to(device) for t in batch) input_ids, input_masks, segment_ids, output_masks, _, 
_ = batch B = input_ids.size(0) input_ids = input_ids[:, :batch_max_para_num, : batch_max_len] input_masks = input_masks[:, :batch_max_para_num, : batch_max_len] segment_ids = segment_ids[:, :batch_max_para_num, : batch_max_len] output_masks = output_masks[:, :batch_max_steps, : batch_max_para_num + 1] # 1 for EOE target = torch.zeros(output_masks.size()).fill_( NEGATIVE) # (B, NUM_STEPS, |P|+1) <- 1 for EOE for i in range(B): output_masks[i, :num_steps[i], -1] = 1.0 # for EOE for j in range(num_steps[i].item() - 1): target[i, j, j].fill_(POSITIVE) target[i, num_steps[i] - 1, -1].fill_(POSITIVE) target = target.to(device) neg_start = batch_max_steps - 1 while neg_start < batch_max_para_num: neg_end = min(neg_start + args.neg_chunk - 1, batch_max_para_num - 1) neg_len = (neg_end - neg_start + 1) input_ids_ = torch.cat( (input_ids[:, :batch_max_steps - 1, :], input_ids[:, neg_start:neg_start + neg_len, :]), dim=1) input_masks_ = torch.cat( (input_masks[:, :batch_max_steps - 1, :], input_masks[:, neg_start:neg_start + neg_len, :]), dim=1) segment_ids_ = torch.cat( (segment_ids[:, :batch_max_steps - 1, :], segment_ids[:, neg_start:neg_start + neg_len, :]), dim=1) output_masks_ = torch.cat( (output_masks[:, :, :batch_max_steps - 1], output_masks[:, :, neg_start:neg_start + neg_len], output_masks[:, :, batch_max_para_num: batch_max_para_num + 1]), dim=2) target_ = torch.cat( (target[:, :, :batch_max_steps - 1], target[:, :, neg_start:neg_start + neg_len], target[:, :, batch_max_para_num:batch_max_para_num + 1]), dim=2) if neg_start != batch_max_steps - 1: output_masks_[:, :, :batch_max_steps - 1] = 0.0 output_masks_[:, :, -1] = 0.0 loss = model(input_ids_, segment_ids_, input_masks_, output_masks_, target_, batch_max_steps) if n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() tr_loss += loss.item() neg_start = neg_end + 1 # del input_ids_ # del input_masks_ # del segment_ids_ # del output_masks_ # del target_ if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), 1.0) else: torch.nn.utils.clip_grad_norm_( model.parameters(), 1.0) optimizer.step() scheduler.step() # optimizer.zero_grad() model.zero_grad() global_step += 1 if global_step % 50 == 0: _cur_steps = global_step if args.resume is None else global_step - args.resume logger.info( f"Training loss: {tr_loss / _cur_steps}\t" f"Learning rate: {scheduler.get_lr()[0]}\t" f"Global step: {global_step}") if global_step % args.save_steps == 0: if args.local_rank in [-1, 0]: model_to_save = model.module if hasattr( model, 'module') else model output_model_file = os.path.join( args.oss_cache_dir, f"pytorch_model_{global_step}.bin") torch_save_to_oss(model_to_save.state_dict(), output_model_file) _suffix = "" if args.local_rank == -1 else f"_{args.local_rank}" if args.fp16: amp_file = os.path.join( args.oss_cache_dir, f"amp_{global_step}{_suffix}.bin") torch_save_to_oss(amp.state_dict(), amp_file) optimizer_file = os.path.join( args.oss_cache_dir, f"optimizer_{global_step}{_suffix}.pt") torch_save_to_oss(optimizer.state_dict(), optimizer_file) scheduler_file = os.path.join( args.oss_cache_dir, f"scheduler_{global_step}{_suffix}.pt") torch_save_to_oss(scheduler.state_dict(), scheduler_file) logger.info( f"checkpoint of step {global_step} is saved to oss." 
) # del input_ids # del input_masks # del segment_ids # del output_masks # del target # del batch chunk_index += 1 train_start_index = train_end_index + 1 # Save the model at the half of the epoch if (chunk_index == CHUNK_NUM // 2 or save_retry) and args.local_rank in [-1, 0]: status = save(model, args.output_dir, str(epc + 0.5)) save_retry = (not status) del train_features_ del all_input_ids del all_input_masks del all_segment_ids del all_output_masks del all_num_paragraphs del all_num_steps del train_data del train_sampler del train_dataloader gc.collect() # Save the model at the end of the epoch if args.local_rank in [-1, 0]: save(model, args.output_dir, str(epc + 1)) # model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self # output_model_file = os.path.join(args.oss_cache_dir, "pytorch_model_" + str(epc + 1) + ".bin") # torch_save_to_oss(model_to_save.state_dict(), output_model_file) epc += 1 if do_train: return ############################## # Evaluation # ############################## assert args.model_suffix is not None if graph_retriever_config.db_save_path is not None: import sys sys.path.append('../') from pipeline.tfidf_retriever import TfidfRetriever tfidf_retriever = TfidfRetriever(graph_retriever_config.db_save_path, None) else: tfidf_retriever = None if args.oss_cache_dir is not None: file_name = 'pytorch_model_' + args.model_suffix + '.bin' model_state_dict = torch.load( load_buffer_from_oss(os.path.join(args.oss_cache_dir, file_name))) else: model_state_dict = load(args.output_dir, args.model_suffix) model = RobertaForGraphRetriever.from_pretrained( args.bert_model, state_dict=model_state_dict, graph_retriever_config=graph_retriever_config) model.to(device) model.eval() if args.pred_file is not None: pred_output = [] eval_examples = processor.get_dev_examples(graph_retriever_config) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) TOTAL_NUM = len(eval_examples) eval_start_index = 0 while eval_start_index < TOTAL_NUM: eval_end_index = min( eval_start_index + graph_retriever_config.eval_chunk - 1, TOTAL_NUM - 1) chunk_len = eval_end_index - eval_start_index + 1 eval_features = convert_examples_to_features( eval_examples[eval_start_index:eval_start_index + chunk_len], args.max_seq_length, args.max_para_num, graph_retriever_config, tokenizer) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_masks = torch.tensor([f.input_masks for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_output_masks = torch.tensor( [f.output_masks for f in eval_features], dtype=torch.float) all_num_paragraphs = torch.tensor( [f.num_paragraphs for f in eval_features], dtype=torch.long) all_num_steps = torch.tensor([f.num_steps for f in eval_features], dtype=torch.long) all_ex_indices = torch.tensor([f.ex_index for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_masks, all_segment_ids, all_output_masks, all_num_paragraphs, all_num_steps, all_ex_indices) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) for input_ids, input_masks, segment_ids, output_masks, num_paragraphs, num_steps, ex_indices in tqdm( eval_dataloader, desc="Evaluating"): batch_max_len = 
input_masks.sum(dim=2).max().item() batch_max_para_num = num_paragraphs.max().item() batch_max_steps = num_steps.max().item() input_ids = input_ids[:, :batch_max_para_num, :batch_max_len] input_masks = input_masks[:, :batch_max_para_num, :batch_max_len] segment_ids = segment_ids[:, :batch_max_para_num, :batch_max_len] output_masks = output_masks[:, :batch_max_para_num + 2, :batch_max_para_num + 1] output_masks[:, 1:, -1] = 1.0 # Ignore EOE in the first step input_ids = input_ids.to(device) input_masks = input_masks.to(device) segment_ids = segment_ids.to(device) output_masks = output_masks.to(device) examples = [ eval_examples[eval_start_index + ex_indices[i].item()] for i in range(input_ids.size(0)) ] with torch.no_grad(): pred, prob, topk_pred, topk_prob = model.beam_search( input_ids, segment_ids, input_masks, examples=examples, tokenizer=tokenizer, retriever=tfidf_retriever, split_chunk=args.split_chunk) for i in range(len(pred)): e = examples[i] titles = [e.title_order[p] for p in pred[i]] # Output predictions to a file if args.pred_file is not None: pred_output.append({}) pred_output[-1]['q_id'] = e.guid pred_output[-1]['titles'] = titles pred_output[-1]['probs'] = [] for prob_ in prob[i]: entry = {'EOE': prob_[-1]} for j in range(len(e.title_order)): entry[e.title_order[j]] = prob_[j] pred_output[-1]['probs'].append(entry) topk_titles = [[e.title_order[p] for p in topk_pred[i][j]] for j in range(len(topk_pred[i]))] pred_output[-1]['topk_titles'] = topk_titles topk_probs = [] for k in range(len(topk_prob[i])): topk_probs.append([]) for prob_ in topk_prob[i][k]: entry = {'EOE': prob_[-1]} for j in range(len(e.title_order)): entry[e.title_order[j]] = prob_[j] topk_probs[-1].append(entry) pred_output[-1]['topk_probs'] = topk_probs # Output the selected paragraphs context = {} for ts in topk_titles: for t in ts: context[t] = e.all_paras[t] pred_output[-1]['context'] = context eval_start_index = eval_end_index + 1 del eval_features del all_input_ids del all_input_masks del all_segment_ids del all_output_masks del all_num_paragraphs del all_num_steps del all_ex_indices del eval_data if args.pred_file is not None: json.dump(pred_output, open(args.pred_file, 'w'))
def __init__(
    self,
    args: DataTrainingArguments,
    op_args: GeneralArguments,
    tokenizer: PreTrainedTokenizer,
    limit_length: Optional[int] = None,
    mode: Split = Split.train,
    cache_dir: Optional[str] = None,
):
    self.args = args
    self.processor = DataProcessor()
    self.output_mode = op_args.output_mode
    self.processor.set_labels(op_args._labels())
    if isinstance(mode, str):
        try:
            mode = Split[mode]
        except KeyError:
            raise KeyError("mode is not a valid split name")
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        cache_dir if cache_dir is not None else args.data_dir,
        "cached_{}_{}_{}_{}".format(
            mode.value,
            tokenizer.__class__.__name__,
            str(args.max_seq_length),
            args.task_name,
        ),
    )
    label_list = self.processor.get_labels()
    if args.task_name in ["mnli", "mnli-mm"] and tokenizer.__class__ in (
        RobertaTokenizer,
        RobertaTokenizerFast,
        XLMRobertaTokenizer,
        BartTokenizer,
        BartTokenizerFast,
    ):
        # HACK(label indices are swapped in RoBERTa pretrained model)
        label_list[1], label_list[2] = label_list[2], label_list[1]
    self.label_list = label_list

    # Make sure only the first process in distributed training processes the dataset,
    # and the others will use the cache.
    lock_path = cached_features_file + ".lock"
    with FileLock(lock_path):
        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            start = time.time()
            self.features = torch.load(cached_features_file)
            logger.info(
                f"Loading features from cached file {cached_features_file} [took %.3f s]",
                time.time() - start,
            )
        else:
            logger.info(f"Creating features from dataset file at {args.data_dir}")
            label_list = self.processor.get_labels()
            if mode.value == 'train':
                examples = self.processor.get_train_examples(args.data_dir)
            elif mode.value == 'dev':
                # Note: no separate dev split is exposed by the processor here,
                # so dev evaluation reuses the test examples.
                examples = self.processor.get_test_examples(args.data_dir)
            elif mode.value == 'test':
                examples = self.processor.get_test_examples(args.data_dir)
            else:
                examples = None
            if limit_length is not None:
                examples = examples[:limit_length]
            self.features = convert_examples_to_features(
                examples,
                tokenizer,
                max_length=args.max_seq_length,
                task=args.task_name,
                label_list=label_list,
                output_mode=self.output_mode,
            )
            start = time.time()
            torch.save(self.features, cached_features_file)
            # ^ This seems to take a lot of time so I want to investigate why and how we can improve.
            logger.info(
                "Saving features into cached file %s [took %.3f s]",
                cached_features_file,
                time.time() - start,
            )
def valid(in_path, slot_path, intent_path): data_processor_valid = DataProcessor(in_path, slot_path, intent_path, in_vocab, slot_vocab, intent_vocab, use_bert=arg.use_bert) pred_intents = [] correct_intents = [] slot_outputs = [] correct_slots = [] input_words = [] while True: in_data, slot_data, slot_weight, length, intents, in_seq, slot_seq, intent_seq = data_processor_valid.get_batch( arg.batch_size) input_seq_embeddings = np.empty( shape=[0, 0, arg.embed_dim]) if arg.use_bert: input_seq_embeddings = get_bert_embeddings(in_seq) feed_dict = { input_data.name: in_data, sequence_length.name: length, input_sequence_embeddings.name: input_seq_embeddings } if len(in_data) != 0: ret = sess.run(inference_outputs, feed_dict) for i in ret[2]: pred_intents.append(np.argmax(i)) for i in intents: correct_intents.append(i) pred_slots = ret[3][-1, :, :, :].reshape( (slot_data.shape[0], slot_data.shape[1], -1)) for p, t, i, l, s in zip(pred_slots, slot_data, in_data, length, slot_seq): p = np.argmax(p, 1) tmp_pred = [] tmp_correct = [] tmp_input = [] for j in range(l): tmp_pred.append(slot_vocab['rev'][p[j]]) tmp_correct.append(slot_vocab['rev'][t[j]]) tmp_input.append(in_vocab['rev'][i[j]]) slot_outputs.append(tmp_pred) correct_slots.append(tmp_correct) input_words.append(tmp_input) if data_processor_valid.end == 1: break pred_intents = np.array(pred_intents) correct_intents = np.array(correct_intents) from sklearn.metrics import classification_report logging.info( classification_report(y_true=correct_intents, y_pred=pred_intents, digits=4)) accuracy = (pred_intents == correct_intents) semantic_error = accuracy accuracy = accuracy.astype(float) accuracy = np.mean(accuracy) * 100.0 index = 0 for t, p in zip(correct_slots, slot_outputs): # Process Semantic Error if len(t) != len(p): raise ValueError('Error!!') for j in range(len(t)): if p[j] != t[j]: semantic_error[index] = False break index += 1 semantic_error = semantic_error.astype(float) semantic_error = np.mean(semantic_error) * 100.0 f1, precision, recall = computeF1Score(correct_slots, slot_outputs) logging.info('slot f1: ' + str(f1)) logging.info('intent accuracy: ' + str(accuracy)) logging.info( 'semantic error(intent, slots are all correct): ' + str(semantic_error)) return f1, accuracy, semantic_error, pred_intents, correct_intents, slot_outputs, correct_slots, input_words
# Unk
# For unk purposes
if arg.use_unk:
    unker = UNKer(os.path.join(full_train_path, arg.input_file),
                  os.path.join(full_train_path,
                               arg.input_file + ".unk." + arg.unk_priority),
                  os.path.join(full_train_path, arg.slot_file),
                  ratio=arg.unk_ratio,
                  threshold=arg.unk_threshold,
                  priority=arg.unk_priority)
    data_processor = DataProcessor(
        os.path.join(full_train_path,
                     arg.input_file + ".unk." + arg.unk_priority),
        os.path.join(full_train_path, arg.slot_file),
        os.path.join(full_train_path, arg.intent_file), in_vocab, slot_vocab,
        intent_vocab, shuffle=True, use_bert=arg.use_bert)
else:
    data_processor = DataProcessor(
        os.path.join(full_train_path, arg.input_file),
        os.path.join(full_train_path, arg.slot_file),
        os.path.join(full_train_path, arg.intent_file), in_vocab, slot_vocab,
        intent_vocab, shuffle=True, use_bert=arg.use_bert)
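# Illustrative sketch of what a UNKer-style pass could do: replace a fraction
# of low-frequency tokens with an _UNK placeholder before training. The real
# UNKer class (file I/O, slot-aware priority handling) is assumed to differ in
# detail; the ratio/threshold parameters only mirror its keyword names.
from collections import Counter
import random

def unk_lines_sketch(lines, ratio=0.2, threshold=2, unk_token='_UNK'):
    counts = Counter(tok for line in lines for tok in line.split())
    unked = []
    for line in lines:
        toks = [unk_token
                if counts[t] <= threshold and random.random() < ratio else t
                for t in line.split()]
        unked.append(' '.join(toks))
    return unked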
def valid(in_path, slot_path, intent_path): data_processor_valid = DataProcessor(in_path, slot_path, intent_path, in_vocab, slot_vocab, intent_vocab) pred_intents = [] correct_intents = [] slot_outputs = [] correct_slots = [] input_words = [] #used to gate gate_seq = [] while True: in_data, slot_data, slot_weight, length, intents, in_seq, slot_seq, intent_seq = data_processor_valid.get_batch(arg.batch_size) feed_dict = {input_data.name: in_data, sequence_length.name: length} ret = sess.run(inference_outputs, feed_dict) for i in ret[0]: pred_intents.append(np.argmax(i)) for i in intents: correct_intents.append(i) pred_slots = ret[1].reshape((slot_data.shape[0], slot_data.shape[1], -1)) for p, t, i, l in zip(pred_slots, slot_data, in_data, length): p = np.argmax(p, 1) tmp_pred = [] tmp_correct = [] tmp_input = [] for j in range(l): tmp_pred.append(slot_vocab['rev'][p[j]]) tmp_correct.append(slot_vocab['rev'][t[j]]) tmp_input.append(in_vocab['rev'][i[j]]) slot_outputs.append(tmp_pred) correct_slots.append(tmp_correct) input_words.append(tmp_input) if data_processor_valid.end == 1: break pred_intents = np.array(pred_intents) correct_intents = np.array(correct_intents) accuracy = (pred_intents==correct_intents) semantic_error = accuracy accuracy = accuracy.astype(float) accuracy = np.mean(accuracy)*100.0 index = 0 for t, p in zip(correct_slots, slot_outputs): # Process Semantic Error if len(t) != len(p): raise ValueError('Error!!') for j in range(len(t)): if p[j] != t[j]: semantic_error[index] = False break index += 1 semantic_error = semantic_error.astype(float) semantic_error = np.mean(semantic_error)*100.0 f1, precision, recall = computeF1Score(correct_slots, slot_outputs) logging.info('slot f1: ' + str(f1)) logging.info('intent accuracy: ' + str(accuracy)) logging.info('semantic error(intent, slots are all correct): ' + str(semantic_error)) data_processor_valid.close() return f1,accuracy,semantic_error,pred_intents,correct_intents,slot_outputs,correct_slots,input_words,gate_seq
data_loader.correlation_numeric_col(df, corr_method="spearman")

cols_list = ["TotalPerformanceSnapshot", "BrskThickness1", "BrskThickness2",
             "BrskThickness3", "BrskThickness4"]
feature_selector = FeatureSelect(data=df[cols_list])
feature_selector.corr_standarised_num_col(corr_method="pearson")
feature_selector.heat_map()

# wide dataframe to long (melt)
import pandas as pd
df_changedate = pd.melt(df, id_vars=['PostID'],
                        value_vars=['BrskChangeDate1', 'BrskChangeDate2',
                                    'BrskChangeDate3', 'BrskChangeDate4'],
                        var_name='Brakes', value_name='ChangeDate')
df.join(df_changedate, on='PostID', how='right')

dat_prep = DataProcessor(df)
dat_prep.eda()

one_hot_enc = DataProcessor(df)
one_hotify_these_categorical = ["VehicleOperatorName", "Littera"]
one_hot_enc.one_hot_encoding(one_hotify_these_categorical)
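# Plain-pandas equivalent of the one_hot_encoding() call above; the
# DataProcessor method is assumed to wrap something along these lines.
import pandas as pd

df_encoded = pd.get_dummies(df, columns=["VehicleOperatorName", "Littera"])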
def __init__(self, data_file, seperator=','):
    data_processor = DataProcessor(data_file, seperator=seperator, raw_data=True)
    self.data, self.labels = data_processor.get_training_data(raw_text=True)
    self.X_train = self.data
    self.y_train = self.labels
    test_data, test_labels = data_processor.process_test_file(
        '../data/imdb/test.csv', contains_label=True, header=0)

    print('Running Naive Bayes...')
    pipeline, parameters = self.get_naive_bayes_model()
    grid_search_tune = GridSearchCV(pipeline, parameters, cv=2, n_jobs=4, verbose=10)
    grid_search_tune.fit(self.X_train, self.y_train)
    print("Best parameters set:")
    self.best_estimator_ = grid_search_tune.best_estimator_
    print(grid_search_tune.best_score_)
    self.calculate_metric(test_data, test_labels)
    print('#' * 80)

    print('Running Linear SVM...')
    pipeline, parameters = self.get_linear_svm_model()
    grid_search_tune = GridSearchCV(pipeline, parameters, cv=2, n_jobs=4, verbose=10)
    grid_search_tune.fit(self.X_train, self.y_train)
    print("Best parameters set:")
    self.best_estimator_ = grid_search_tune.best_estimator_
    print(grid_search_tune.best_score_)
    self.calculate_metric(test_data, test_labels)
    print('#' * 80)

    print('Running Non Linear SVM...')
    pipeline, parameters = self.get_non_linear_svm_model()
    grid_search_tune = GridSearchCV(pipeline, parameters, cv=2, n_jobs=4, verbose=10)
    grid_search_tune.fit(self.X_train, self.y_train)
    print("Best parameters set:")
    self.best_estimator_ = grid_search_tune.best_estimator_
    print(grid_search_tune.best_score_)
    # Was called with no arguments; pass the test split like the other models
    self.calculate_metric(test_data, test_labels)
    print('#' * 80)

    print('Running Naive Bayes SVM...')
    pipeline, parameters = self.get_nbsvm_model()
    grid_search_tune = GridSearchCV(pipeline, parameters, cv=2, n_jobs=6, verbose=10)
    grid_search_tune.fit(self.X_train, self.y_train)
    print("Best parameters set:")
    self.best_estimator_ = grid_search_tune.best_estimator_
    print(grid_search_tune.best_score_)
    self.calculate_metric(test_data, test_labels)
    print('#' * 80)
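# Hypothetical shape of get_naive_bayes_model(); the real method's pipeline
# and parameter grid are not shown in this snippet, so the vectorizer options
# and alpha values below are illustrative guesses.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

def get_naive_bayes_model():
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', MultinomialNB()),
    ])
    parameters = {
        'tfidf__max_df': (0.5, 0.75, 1.0),
        'tfidf__ngram_range': ((1, 1), (1, 2)),
        'clf__alpha': (0.1, 0.5, 1.0),
    }
    return pipeline, parameters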
no_improve = 0
# variables to store highest values among epochs, only use 'valid_err' for now
valid_slot = 0
test_slot = 0
valid_intent = 0
test_intent = 0
valid_err = 0
test_err = 0
while True:
    if data_processor is None:
        data_processor = DataProcessor(
            os.path.join(full_train_path, arg.input_file),
            os.path.join(full_train_path, arg.slot_file),
            os.path.join(full_train_path, arg.intent_file),
            in_vocab,
            slot_vocab,
            intent_vocab,
        )
    (
        in_data,
        slot_data,
        slot_weight,
        length,
        intents,
        _,
        _,
        _,
    ) = data_processor.get_batch(arg.batch_size)
    feed_dict = {
        input_data.name: in_data,
import config
from api.BinanceApi import BinanceApi
from utils import DataProcessor, DataDownloader

binance_api = BinanceApi(api_type="future")

# symbol = "BNBUSDT"
symbol = "DOGEUSDT"
response = binance_api.send_public_request('/fapi/v1/trades',
                                           payload={'symbol': symbol, "limit": 1000})
df = DataProcessor.convert_trade_response_to_dataframe(response)
print(df)

DataDownloader.download_trade_id('/fapi/v1/historicalTrades', symbol=symbol,
                                 last_id=363853000, n_trade=10000000,
                                 delay_time=2.0)

# df = DataDownloader.download_with_number_trade('/fapi/v1/historicalTrades', symbol=symbol, last_id=174000000, n_trade=100000, delay_time=2)
# df.to_csv(f"{config.trade_logs_binance_data_dir}{symbol}.csv")
# DataDownloader.download_period('/api/v3/historicalTrades', symbol=symbol, id=224114000)
# DataDownloader.download_period('/fapi/v1/historicalTrades', symbol=symbol, last_id=174000000, until_date=datetime(2021, 4, 1), delay_time=1.2)
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument( "--bert_model", default='bert-base-uncased', type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, " "bert-base-multilingual-cased, bert-base-chinese.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument( '--task', type=str, default=None, required=True, help="Task code in {hotpot_open, hotpot_distractor, squad, nq, ambigqa}" ) ## Other parameters parser.add_argument( "--max_seq_length", default=378, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument( "--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=1, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=5, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam. (def: 5e-5)") parser.add_argument("--num_train_epochs", default=5.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." 
) # RNN graph retriever-specific parameters parser.add_argument("--example_limit", default=None, type=int) parser.add_argument("--max_para_num", default=10, type=int) parser.add_argument( "--neg_chunk", default=8, type=int, help= "The chunk size of negative examples during training (to reduce GPU memory consumption with negative sampling)" ) parser.add_argument( "--eval_chunk", default=100000, type=int, help= "The chunk size of evaluation examples (to reduce RAM consumption during evaluation)" ) parser.add_argument( "--split_chunk", default=300, type=int, help= "The chunk size of BERT encoding during inference (to reduce GPU memory consumption)" ) parser.add_argument('--train_file_path', type=str, default=None, help="File path to the training data") parser.add_argument('--dev_file_path', type=str, default=None, help="File path to the eval data") parser.add_argument('--beam', type=int, default=1, help="Beam size") parser.add_argument('--min_select_num', type=int, default=1, help="Minimum number of selected paragraphs") parser.add_argument('--max_select_num', type=int, default=3, help="Maximum number of selected paragraphs") parser.add_argument( "--use_redundant", action='store_true', help="Whether to use simulated seqs (only for training)") parser.add_argument( "--use_multiple_redundant", action='store_true', help="Whether to use multiple simulated seqs (only for training)") parser.add_argument( '--max_redundant_num', type=int, default=100000, help= "Whether to limit the number of the initial TF-IDF pool (only for open-domain eval)" ) parser.add_argument( "--no_links", action='store_true', help= "Whether to omit any links (or in other words, only use TF-IDF-based paragraphs)" ) parser.add_argument("--pruning_by_links", action='store_true', help="Whether to do pruning by links (and top 1)") parser.add_argument( "--expand_links", action='store_true', help= "Whether to expand links with paragraphs in the same article (for NQ)") parser.add_argument( '--tfidf_limit', type=int, default=None, help= "Whether to limit the number of the initial TF-IDF pool (only for open-domain eval)" ) parser.add_argument("--pred_file", default=None, type=str, help="File name to write paragraph selection results") parser.add_argument("--tagme", action='store_true', help="Whether to use tagme at inference") parser.add_argument( '--topk', type=int, default=2, help="Whether to use how many paragraphs from the previous steps") parser.add_argument( "--model_suffix", default=None, type=str, help="Suffix to load a model file ('pytorch_model_' + suffix +'.bin')") parser.add_argument("--db_save_path", default=None, type=str, help="File path to DB") args = parser.parse_args() cpu = torch.device('cpu') device = torch.device( "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if args.train_file_path is not None: do_train = True if os.path.exists(args.output_dir) and os.listdir(args.output_dir): raise ValueError( "Output directory ({}) already exists and is not empty.". 
format(args.output_dir)) os.makedirs(args.output_dir, exist_ok=True) elif args.dev_file_path is not None: do_train = False else: raise ValueError( 'One of train_file_path: {} or dev_file_path: {} must be non-None'. format(args.train_file_path, args.dev_file_path)) processor = DataProcessor() # Configurations of the graph retriever graph_retriever_config = GraphRetrieverConfig( example_limit=args.example_limit, task=args.task, max_seq_length=args.max_seq_length, max_select_num=args.max_select_num, max_para_num=args.max_para_num, tfidf_limit=args.tfidf_limit, train_file_path=args.train_file_path, use_redundant=args.use_redundant, use_multiple_redundant=args.use_multiple_redundant, max_redundant_num=args.max_redundant_num, dev_file_path=args.dev_file_path, beam=args.beam, min_select_num=args.min_select_num, no_links=args.no_links, pruning_by_links=args.pruning_by_links, expand_links=args.expand_links, eval_chunk=args.eval_chunk, tagme=args.tagme, topk=args.topk, db_save_path=args.db_save_path) logger.info(graph_retriever_config) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) ############################## # Training # ############################## if do_train: model = BertForGraphRetriever.from_pretrained( args.bert_model, cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(-1), graph_retriever_config=graph_retriever_config) model.to(device) if n_gpu > 1: print("Parallel Training.") model = torch.nn.DataParallel(model) global_step = 0 nb_tr_steps = 0 tr_loss = 0 POSITIVE = 1.0 NEGATIVE = 0.0 # Load training examples train_examples = None num_train_steps = None train_examples = processor.get_train_examples(graph_retriever_config) train_features = convert_examples_to_features(train_examples, args.max_seq_length, args.max_para_num, graph_retriever_config, tokenizer, train=True) # len(train_examples) and len(train_features) can be different, depedning on the redundant setting num_train_steps = int( len(train_features) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] t_total = num_train_steps optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=t_total, max_grad_norm=1.0) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_features)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) model.train() epc = 0 for _ in range(int(args.num_train_epochs)): logger.info('Epoch ' + str(epc + 1)) TOTAL_NUM = len(train_features) train_start_index = 0 CHUNK_NUM = 4 # this doesn't matter for performance train_chunk = TOTAL_NUM // CHUNK_NUM chunk_index = 0 random.shuffle(train_features) save_retry = False while train_start_index < TOTAL_NUM: train_end_index = min(train_start_index + train_chunk - 1, TOTAL_NUM - 1) chunk_len = train_end_index - train_start_index + 1 train_features_ = train_features[ train_start_index:train_start_index + chunk_len] all_input_ids = torch.tensor( [f.input_ids for f in train_features_], dtype=torch.long) all_input_masks = torch.tensor( [f.input_masks for f in train_features_], 
dtype=torch.long) all_segment_ids = torch.tensor( [f.segment_ids for f in train_features_], dtype=torch.long) all_output_masks = torch.tensor( [f.output_masks for f in train_features_], dtype=torch.float) all_num_paragraphs = torch.tensor( [f.num_paragraphs for f in train_features_], dtype=torch.long) all_num_steps = torch.tensor( [f.num_steps for f in train_features_], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_masks, all_segment_ids, all_output_masks, all_num_paragraphs, all_num_steps) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 logger.info('Examples from ' + str(train_start_index) + ' to ' + str(train_end_index)) for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): input_masks = batch[1] batch_max_len = input_masks.sum(dim=2).max().item() num_paragraphs = batch[4] batch_max_para_num = num_paragraphs.max().item() num_steps = batch[5] batch_max_steps = num_steps.max().item() output_masks_cpu = ( batch[3])[:, :batch_max_steps, :batch_max_para_num + 1] batch = tuple(t.to(device) for t in batch) input_ids, input_masks, segment_ids, output_masks, _, __ = batch B = input_ids.size(0) input_ids = input_ids[:, :batch_max_para_num, : batch_max_len] input_masks = input_masks[:, :batch_max_para_num, : batch_max_len] segment_ids = segment_ids[:, :batch_max_para_num, : batch_max_len] output_masks = output_masks[:, :batch_max_steps, : batch_max_para_num + 1] # 1 for EOE target = torch.FloatTensor(output_masks.size()).fill_( NEGATIVE) # (B, NUM_STEPS, |P|+1) <- 1 for EOE for i in range(B): output_masks[i, :num_steps[i], -1] = 1.0 # for EOE for j in range(num_steps[i].item() - 1): target[i, j, j].fill_( POSITIVE ) # positive paragraphs are stored in order of the right path target[i, num_steps[i] - 1, -1].fill_(POSITIVE) # EOE target = target.to(device) neg_start = batch_max_steps - 1 while neg_start < batch_max_para_num: neg_end = min(neg_start + args.neg_chunk - 1, batch_max_para_num - 1) neg_len = (neg_end - neg_start + 1) input_ids_ = torch.cat( (input_ids[:, :batch_max_steps - 1, :], input_ids[:, neg_start:neg_start + neg_len, :]), dim=1) input_masks_ = torch.cat( (input_masks[:, :batch_max_steps - 1, :], input_masks[:, neg_start:neg_start + neg_len, :]), dim=1) segment_ids_ = torch.cat( (segment_ids[:, :batch_max_steps - 1, :], segment_ids[:, neg_start:neg_start + neg_len, :]), dim=1) output_masks_ = torch.cat( (output_masks[:, :, :batch_max_steps - 1], output_masks[:, :, neg_start:neg_start + neg_len], output_masks[:, :, batch_max_para_num: batch_max_para_num + 1]), dim=2) target_ = torch.cat( (target[:, :, :batch_max_steps - 1], target[:, :, neg_start:neg_start + neg_len], target[:, :, batch_max_para_num:batch_max_para_num + 1]), dim=2) if neg_start != batch_max_steps - 1: output_masks_[:, :, :batch_max_steps - 1] = 0.0 output_masks_[:, :, -1] = 0.0 loss = model(input_ids_, segment_ids_, input_masks_, output_masks_, target_, batch_max_steps) if n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps loss.backward() tr_loss += loss.item() neg_start = neg_end + 1 nb_tr_examples += B nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: # modify learning rate with special warm up BERT uses lr_this_step = args.learning_rate * warmup_linear( global_step / t_total, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 chunk_index += 1 train_start_index = train_end_index + 1 # Save the model at the half of the epoch if (chunk_index == CHUNK_NUM // 2 or save_retry): status = save(model, args.output_dir, str(epc + 0.5)) save_retry = (not status) del all_input_ids del all_input_masks del all_segment_ids del all_output_masks del all_num_paragraphs del all_num_steps del train_data # Save the model at the end of the epoch save(model, args.output_dir, str(epc + 1)) epc += 1 if do_train: return ############################## # Evaluation # ############################## assert args.model_suffix is not None if graph_retriever_config.db_save_path is not None: import sys sys.path.append('../') from pipeline.tfidf_retriever import TfidfRetriever tfidf_retriever = TfidfRetriever(graph_retriever_config.db_save_path, None) else: tfidf_retriever = None model_state_dict = load(args.output_dir, args.model_suffix) model = BertForGraphRetriever.from_pretrained( args.bert_model, state_dict=model_state_dict, graph_retriever_config=graph_retriever_config) model.to(device) model.eval() if args.pred_file is not None: pred_output = [] eval_examples = processor.get_dev_examples(graph_retriever_config) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) TOTAL_NUM = len(eval_examples) eval_start_index = 0 while eval_start_index < TOTAL_NUM: eval_end_index = min( eval_start_index + graph_retriever_config.eval_chunk - 1, TOTAL_NUM - 1) chunk_len = eval_end_index - eval_start_index + 1 eval_features = convert_examples_to_features( eval_examples[eval_start_index:eval_start_index + chunk_len], args.max_seq_length, args.max_para_num, graph_retriever_config, tokenizer) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_masks = torch.tensor([f.input_masks for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_output_masks = torch.tensor( [f.output_masks for f in eval_features], dtype=torch.float) all_num_paragraphs = torch.tensor( [f.num_paragraphs for f in eval_features], dtype=torch.long) all_num_steps = torch.tensor([f.num_steps for f in eval_features], dtype=torch.long) all_ex_indices = torch.tensor([f.ex_index for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_masks, all_segment_ids, all_output_masks, all_num_paragraphs, all_num_steps, all_ex_indices) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) for input_ids, input_masks, segment_ids, output_masks, num_paragraphs, num_steps, ex_indices in tqdm( eval_dataloader, desc="Evaluating"): batch_max_len = input_masks.sum(dim=2).max().item() batch_max_para_num = num_paragraphs.max().item() batch_max_steps = num_steps.max().item() input_ids = input_ids[:, :batch_max_para_num, :batch_max_len] 
input_masks = input_masks[:, :batch_max_para_num, :batch_max_len] segment_ids = segment_ids[:, :batch_max_para_num, :batch_max_len] output_masks = output_masks[:, :batch_max_para_num + 2, :batch_max_para_num + 1] output_masks[:, 1:, -1] = 1.0 # Ignore EOE in the first step input_ids = input_ids.to(device) input_masks = input_masks.to(device) segment_ids = segment_ids.to(device) output_masks = output_masks.to(device) examples = [ eval_examples[eval_start_index + ex_indices[i].item()] for i in range(input_ids.size(0)) ] with torch.no_grad(): pred, prob, topk_pred, topk_prob = model.beam_search( input_ids, segment_ids, input_masks, examples=examples, tokenizer=tokenizer, retriever=tfidf_retriever, split_chunk=args.split_chunk) for i in range(len(pred)): e = examples[i] titles = [e.title_order[p] for p in pred[i]] # Output predictions to a file if args.pred_file is not None: pred_output.append({}) pred_output[-1]['q_id'] = e.guid pred_output[-1]['titles'] = titles pred_output[-1]['probs'] = [] for prob_ in prob[i]: entry = {'EOE': prob_[-1]} for j in range(len(e.title_order)): entry[e.title_order[j]] = prob_[j] pred_output[-1]['probs'].append(entry) topk_titles = [[e.title_order[p] for p in topk_pred[i][j]] for j in range(len(topk_pred[i]))] pred_output[-1]['topk_titles'] = topk_titles topk_probs = [] for k in range(len(topk_prob[i])): topk_probs.append([]) for prob_ in topk_prob[i][k]: entry = {'EOE': prob_[-1]} for j in range(len(e.title_order)): entry[e.title_order[j]] = prob_[j] topk_probs[-1].append(entry) pred_output[-1]['topk_probs'] = topk_probs # Output the selected paragraphs context = {} for ts in topk_titles: for t in ts: context[t] = e.all_paras[t] pred_output[-1]['context'] = context eval_start_index = eval_end_index + 1 del eval_features del all_input_ids del all_input_masks del all_segment_ids del all_output_masks del all_num_paragraphs del all_num_steps del all_ex_indices del eval_data if args.pred_file is not None: json.dump(pred_output, open(args.pred_file, 'w'))
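# A minimal inspection sketch (not part of the original script): the evaluation loop
# above dumps pred_output to --pred_file as JSON, one dict per question with the keys
# 'q_id', 'titles', 'probs', 'topk_titles', 'topk_probs' and 'context'. Assuming that
# format, the selected reasoning paths can be eyeballed like this; 'predictions.json'
# is a hypothetical path standing in for whatever was passed to --pred_file.
import json

with open('predictions.json') as f:
    pred_output = json.load(f)

for entry in pred_output[:5]:
    print(entry['q_id'])
    print('  selected titles :', entry['titles'])
    print('  top-k candidates:', entry['topk_titles'])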
# 'num_steps': 300,
# 'embed_size': 300,
# 'hidden_size': 300,
# 'keep_prob': 0.5,
# 'batch_size': 32,
# 'num_classes': 2,
# 'vocab_size': 40000,
# 'combine_mode': 'last',
# 'weight_decay': 1e-8,
# 'save_path': 'checkpoint/imdb/'
# }

if len(sys.argv) == 2 and sys.argv[1] == 'std':
    x_train, x_test, y_train, y_test = imdb_for_library(
        seq_len=config['num_steps'], max_features=config['vocab_size'])
else:
    data_path = 'data/imdb/train.csv'
    data_processor = DataProcessor(data_path, vocab_size=config['vocab_size'],
                                   seperator=',', max_seq_len=config['num_steps'],
                                   header=0, reverse=True)
    x_train, y_train = data_processor.get_training_data()
    x_test, y_test = data_processor.process_test_file(
        'data/imdb/test.csv', contains_label=True, header=0)

# embedding = data_processor.get_embedding(config['embed_size'])
# print('Embedding Shape', embedding.shape)

trainer = Trainer(config, x_train, y_train, embedding=None)
trainer.train()
trainer.load_best_model()
pred = trainer.predict(x_test)
print(classification_report(y_true=y_test, y_pred=pred))
from trainer import train_model, save_png
import warnings
warnings.filterwarnings('ignore')

# ===============
# Settings
# ===============
parser = argparse.ArgumentParser()
parser.add_argument('--common', default='../configs/common/default.yml')
parser.add_argument('--notify', default='../configs/common/notify.yml')
parser.add_argument('-m', '--model')
parser.add_argument('-c', '--comment')
options = parser.parse_args()

dp = DataProcessor()
config = dp.load(options.common)
config.update(dp.load(f'../configs/exp/{options.model}.yml'))

# ===============
# Constants
# ===============
comment = options.comment
now = datetime.datetime.now()
model_name = options.model
run_name = f'{model_name}_{now:%Y%m%d%H%M%S}'

compe_params = config.compe
data_params = config.data
train_params = config.train_params
setting_params = config.settings
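# Hedged sketch of an assumption in the snippet above: dp.load() has to return a
# dict-like object that supports both .update() and attribute access (config.compe,
# config.data, ...). A minimal wrapper along these lines would satisfy that contract;
# the real DataProcessor may rely on easydict or a similar helper instead.
import yaml


class AttrDict(dict):
    __getattr__ = dict.__getitem__
    __setattr__ = dict.__setitem__


def load_config(path):
    with open(path) as f:
        return AttrDict(yaml.safe_load(f))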
def valid(in_path, slot_path, intent_path):
    data_processor_valid = DataProcessor(in_path, slot_path, intent_path,
                                         in_vocab, slot_vocab, intent_vocab)
    pred_intents = []
    correct_intents = []
    slot_outputs = []
    correct_slots = []
    input_words = []
    # gate values (left empty in this evaluation loop)
    gate_seq = []
    while True:
        (
            in_data,
            slot_data,
            slot_weight,
            length,
            intents,
            in_seq,
            slot_seq,
            intent_seq,
        ) = data_processor_valid.get_batch(arg.batch_size)
        feed_dict = {
            input_data.name: in_data,
            sequence_length.name: length
        }
        ret = sess.run(inference_outputs, feed_dict)
        for i in ret[0]:
            pred_intents.append(np.argmax(i))
        for i in intents:
            correct_intents.append(i)
        pred_slots = ret[1].reshape(
            (slot_data.shape[0], slot_data.shape[1], -1))
        for p, t, i, l in zip(pred_slots, slot_data, in_data, length):
            p = np.argmax(p, 1)
            tmp_pred = []
            tmp_correct = []
            tmp_input = []
            for j in range(l):
                tmp_pred.append(slot_vocab["rev"][p[j]])
                tmp_correct.append(slot_vocab["rev"][t[j]])
                tmp_input.append(in_vocab["rev"][i[j]])
            slot_outputs.append(tmp_pred)
            correct_slots.append(tmp_correct)
            input_words.append(tmp_input)
        if data_processor_valid.end == 1:
            break

    pred_intents = np.array(pred_intents)
    correct_intents = np.array(correct_intents)
    accuracy = pred_intents == correct_intents
    semantic_error = accuracy
    accuracy = accuracy.astype(float)
    accuracy = np.mean(accuracy) * 100.0

    index = 0
    for t, p in zip(correct_slots, slot_outputs):
        # Whole-frame check: a sample counts only if intent and every slot are correct
        if len(t) != len(p):
            raise ValueError("Slot sequence length mismatch between gold and prediction")
        for j in range(len(t)):
            if p[j] != t[j]:
                semantic_error[index] = False
                break
        index += 1
    semantic_error = semantic_error.astype(float)
    semantic_error = np.mean(semantic_error) * 100.0

    f1, precision, recall = computeF1Score(correct_slots, slot_outputs)
    logging.info("slot f1: " + str(f1))
    logging.info("intent accuracy: " + str(accuracy))
    logging.info(
        "semantic frame accuracy (intent and all slots correct): " + str(semantic_error))
    data_processor_valid.close()
    return (
        f1,
        accuracy,
        semantic_error,
        pred_intents,
        correct_intents,
        slot_outputs,
        correct_slots,
        input_words,
        gate_seq,
    )
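# Hedged sanity check (not in the original function): computeF1Score is assumed to be an
# entity-level, CoNLL-style F1 over the BIO slot tags. A token-level micro F1 computed
# with scikit-learn on the flattened tag sequences, e.g. just before valid() returns, is
# a quick cross-check; the two numbers will generally differ because entity-level
# scoring is stricter.
from sklearn.metrics import f1_score

flat_true = [tag for sent in correct_slots for tag in sent]
flat_pred = [tag for sent in slot_outputs for tag in sent]
print('token-level slot f1:', f1_score(flat_true, flat_pred, average='micro'))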
"/txts.embedded.npy", # text encoded by pre-trained bert, shape=[N, max_len_of_word_seqs, dim_of_bert_output] paths["embedded"] + "/txts.embeddedG.npy", # vectors of [CLS] encoded by a pre-trained bert, shape=[N, dim_of_bert_output] paths["embedded"] + "/cids_of_imgs", # indexes to find image encoded vector paths["embedded"] + "/imgs.embedded.npy", # image encoded by pre-trained resnet, shape=[N, image_region_num, dim_of_resnet_output] paths["embedded"] + "/imgs.embeddedG.npy" # image encoded by pre-trained resnet, shape=[N, dim_of_resnet_output] ) # data_processor: utils for data processing(load data, get batch samples, etc.) data_processor_train = DataProcessor(paths["train_data"] + "/indexs", paths["train_data"] + "/input.seq", paths["train_data"] + "/output.seq", paths["train_data"] + "/output.label", w2i_word, w2i_bio, w2i_label, shuffling=True) data_processor_valid = DataProcessor(paths["valid_data"] + "/indexs", paths["valid_data"] + "/input.seq", paths["valid_data"] + "/output.seq", paths["valid_data"] + "/output.label", w2i_word, w2i_bio, w2i_label, shuffling=False) data_processor_test = DataProcessor(paths["test_data"] + "/indexs", paths["test_data"] + "/input.seq",
chlr.setFormatter(formatter)
fhlr = logging.FileHandler(log_file_path)
fhlr.setFormatter(formatter)
logger.addHandler(chlr)
logger.addHandler(fhlr)

logger.info("loading vocab...")

w2i_char, i2w_char = load_vocabulary("./data/vocab_char.txt")
w2i_bio, i2w_bio = load_vocabulary("./data/vocab_bioattr.txt")

logger.info("loading data...")

data_processor_train = DataProcessor("./data/train/input.seq.char",
                                     "./data/train/output.seq.bioattr",
                                     w2i_char,
                                     w2i_bio,
                                     shuffling=True)
data_processor_valid = DataProcessor("./data/test/input.seq.char",
                                     "./data/test/output.seq.bioattr",
                                     w2i_char,
                                     w2i_bio,
                                     shuffling=True)

logger.info("building model...")

model = MyModel(embedding_dim=300,
                hidden_dim=300,
                vocab_size_char=len(w2i_char),
                vocab_size_bio=len(w2i_bio),
"image_vector": "./data/image_fc_vectors.npy" } use_image = False print("load data...") w2i_word, i2w_word = load_vocabulary(paths["vocab_word"]) w2i_attr, i2w_attr = load_vocabulary(paths["vocab_attr"]) w2i_value, i2w_value = load_vocabulary(paths["vocab_value"]) data_processor = DataProcessor( paths["test_data"] + "/input.seq", paths["test_data"] + "/input.imageindex", paths["test_data"] + "/input.attr", paths["test_data"] + "/output.value", w2i_word, w2i_attr, w2i_value, shuffling=False ) if use_image: image_vector_container = VectorContainer(paths["image_vector"], img_embedding_size) else: image_vector_container = VectorContainer_ZERO(img_embedding_size) print("loading checkpoint from", paths["ckpt"], "...") tf_config = tf.ConfigProto(allow_soft_placement=True) tf_config.gpu_options.allow_growth = True sess = tf.Session(config=tf_config)
from utils.uiUtils import yesNoPrompt
from utils import uiUtils
# Custom dataset processor
from utils import DataProcessor
import json
import cv2
import numpy as np

# Prompt user to select dataset
print("Which dataset would you like to visualize?")
dataset = uiUtils.listOptionsPrompt(['train', 'validate', 'test', 'exit'])
while (dataset != 'exit'):
    pp = DataProcessor.DataPreProcessor(args.data_directory,
                                        1,
                                        dataset,
                                        args,
                                        debug=True)
    for i, (inputs, labels, meta) in enumerate(pp):
        title = 'Image #' + str(i)
        title += ', ' + str(meta[0]).replace(args.data_directory, '')
        print(title)
        print('LABELS')
        print(labels[0])
        fgDisplay = cv2.resize(np.reshape(inputs['input_4'], (25, 25)),
                               (224, 224))
        fgDisplay = np.stack((fgDisplay, fgDisplay, fgDisplay), axis=2)
        # Place 3 images side by side to display
        output = np.concatenate((inputs['input_3'][0], fgDisplay,
    'num_layers': 1,
    'num_steps': 50,
    'embed_size': 300,
    'hidden_size': 600,
    'keep_prob': 0.7,
    'batch_size': 16,
    'num_classes': 2,
    'vocab_size': 40000,
    'combine_mode': 'last',
    'weight_decay': 3e-6,
    'save_path': 'checkpoint/cove/'
}

data_path = 'data/imdb/train.csv'
data_processor = DataProcessor(data_path, vocab_size=config['vocab_size'],
                               seperator=',', max_seq_len=config['num_steps'],
                               header=0, reverse=True)
data, labels = data_processor.get_training_data()
print('Train Data Shape', data.shape, labels.shape)

embedding = data_processor.get_embedding(config['embed_size'])
print('Embedding Shape', embedding.shape)

test_data, test_labels = data_processor.process_test_file(
    'data/imdb/test.csv', contains_label=True, header=0)

trainer = Trainer(config, data, labels, embedding)
# trainer.X_test = test_data
# trainer.y_test = test_labels
trainer.train()
trainer.load_best_model()
pred = trainer.predict(test_data)
print(classification_report(y_true=test_labels, y_pred=pred))
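# Hedged extra check (not in the original snippet): alongside the classification report,
# a confusion matrix makes the balance of false positives and false negatives for the
# two IMDB classes easier to see; it reuses the test_labels and pred arrays from above.
from sklearn.metrics import confusion_matrix

print(confusion_matrix(y_true=test_labels, y_pred=pred))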