def predict_on_test_files(self, estimator, csv_files_path):
    failed_csvs = []
    out_dir = estimator.model_dir + "/predictions/"
    check_n_makedirs(out_dir)
    for csv_file in tqdm(os.listdir(csv_files_path), desc="predicting"):
        csv_file = os.path.join(csv_files_path, csv_file)
        if not csv_file.endswith(".csv"):
            continue
        try:
            print_info("processing ====> {}".format(csv_file))
            df = pd.read_csv(csv_file).fillna(UNKNOWN_WORD)
            df = self.predict_on_test_file(estimator, df)
            df.to_csv(out_dir + ntpath.basename(csv_file), index=False)
        except Exception:
            print_error(traceback.format_exc())
            failed_csvs.append(csv_file)
            print_warn("Failed processing ====> {}".format(csv_file))
    if failed_csvs:
        print_error("Failed files: {}".format(failed_csvs))
    return out_dir
def predict_on_csv_files(self, estimator, csv_files_path):
    # Despite the parameter name, this routine handles a single CSV file path.
    df = None
    csv_file = csv_files_path
    if csv_file.endswith(".csv"):
        print_info("processing ====> {}".format(csv_file))
        df = pd.read_csv(csv_file).fillna(UNKNOWN_WORD)
        # df = io_2_iob(df, entity_col, entity_iob_col)  # removed since we use the preprocessed test folder; TODO chain IOB
        sentence = " ".join(df[self.preprocessed_data_info.TEXT_COL].values)
        char_ids = [[self.preprocessed_data_info.char_2_id_map.get(c, 0) for c in word]
                    for word in sentence.split(" ")]
        char_ids, char_ids_length = self._pad_sequences([char_ids], pad_tok=0, nlevels=2)

        # TODO add batch support
        predicted_tags, confidence, \
            pred_1, pred_1_confidence, \
            pred_2, pred_2_confidence, \
            pred_3, pred_3_confidence = self.get_tags(estimator,
                                                      sentence,
                                                      char_ids,
                                                      self.preprocessed_data_info.ENTITY_VOCAB_FILE)

        df["predictions"] = predicted_tags
        df["confidence"] = confidence
        df["pred_1"] = pred_1
        df["pred_1_confidence"] = pred_1_confidence
        df["pred_2"] = pred_2
        df["pred_2_confidence"] = pred_2_confidence
        df["pred_3"] = pred_3
        df["pred_3_confidence"] = pred_3_confidence

        out_dir = estimator.model_dir + "/predictions/"
        check_n_makedirs(out_dir)
        df.to_csv(out_dir + ntpath.basename(csv_file), index=False)
    return df
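# Stand-alone sketch of the word-level half of `_pad_sequences(..., nlevels=2)`
# used above: every word's character-id list is padded with `pad_tok` to the
# longest word in the sentence, and the original lengths are kept so the model
# can mask the padding. Illustrative only; the repo's implementation also pads
# at the sentence level and may handle max lengths differently.
def pad_char_ids(char_ids, pad_tok=0):
    max_word_len = max((len(word) for word in char_ids), default=0)
    padded = [word + [pad_tok] * (max_word_len - len(word)) for word in char_ids]
    lengths = [len(word) for word in char_ids]
    return padded, lengths

# Example: pad_char_ids([[3, 4, 7], [5]]) -> ([[3, 4, 7], [5, 0, 0]], [3, 1])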
def __init__(self, name, experiment_dir, batch_size):
    '''
    Data iterators with different feature types are expected to implement this
    interface, exposing the input functions and their initialization hooks.
    :param name: Name of the data iterator
    :param experiment_dir: Root directory of the experiment
    :param batch_size: Batch size used by the input functions
    '''
    self.NAME = name
    self.EXPERIMENT_ROOT_DIR = experiment_dir
    self.OUT_DIR = self.EXPERIMENT_ROOT_DIR + "/" + self.NAME + "/"
    self._load_ini()

    # self.preprocessed_data_info = PreprocessedDataInfo.load(experiment_dir)
    # This layout is assumed to be correct when the previous stage implements IPreprocessorInterface
    self.PREPROCESSED_DATA_DIR = self.EXPERIMENT_ROOT_DIR + "/" + self.config.get_item(
        "OutputDirectories", "preprocessed_data_dir")
    self.TRAIN_FILES_IN_PATH = self.PREPROCESSED_DATA_DIR + "/train/"
    self.VAL_FILES_IN_PATH = self.PREPROCESSED_DATA_DIR + "/val/"
    self.TEST_FILES_IN_PATH = self.PREPROCESSED_DATA_DIR + "/test/"

    self.TEXT_COL = self.config.get_item("Schema", "text_column")
    self.ENTITY_COL = self.config.get_item("Schema", "entity_column")

    self.WORDS_VOCAB_FILE = self.OUT_DIR + "/" + self.TEXT_COL + "_" + "vocab.tsv"
    self.CHARS_VOCAB_FILE = self.OUT_DIR + "/" + self.TEXT_COL + "_" + "chars_vocab.tsv"
    self.ENTITY_VOCAB_FILE = self.OUT_DIR + "/" + self.ENTITY_COL + "_vocab.tsv"

    check_n_makedirs(self.OUT_DIR)

    self.BATCH_SIZE = batch_size
    self.NUM_TAGS = None
    self.VOCAB_SIZE = None
    self.CHAR_VOCAB_SIZE = None

    self._train_data_input_fn = None
    self._train_data_init_hook = None
    self._val_data_input_fn = None
    self._val_data_init_hook = None
    self._test_data_input_fn = None
    self._test_data_init_hook = None
def predict_on_test_files(self, estimator, csv_files_path):
    out_dir = estimator.model_dir + "/predictions/"
    check_n_makedirs(out_dir)

    files = [file for file in os.listdir(csv_files_path) if file.endswith('.csv')]

    batch_size = 12
    index = 0
    remaining = len(files)
    progress_bar = tqdm(total=len(files), desc="predicting")

    while remaining > 0:
        batch = min(remaining, batch_size)
        dfs = []
        for csv_file in files[index:index + batch]:
            df = pd.read_csv(os.path.join(csv_files_path, csv_file)).fillna(UNKNOWN_WORD)
            # Attach the source file name so it can be recovered after prediction
            df.file_name = csv_file
            dfs.append(df)

        dfs = self.predict_on_dataframes(estimator, dfs)

        for predicted_df in dfs:
            print_info(predicted_df.file_name)
            predicted_df.to_csv(out_dir + ntpath.basename(predicted_df.file_name), index=False)

        index += batch
        remaining -= batch
        progress_bar.update(batch)  # tqdm expects an increment, not an absolute count

    progress_bar.close()
    return out_dir
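# Stand-alone sketch of the fixed-size batching loop used above: walk the file
# list in slices of `batch_size` and advance the tqdm bar by the slice size
# (tqdm.update expects an increment, not an absolute position). Illustrative
# only; not the repo's code.
from tqdm import tqdm

def iterate_in_batches(items, batch_size=12):
    index, remaining = 0, len(items)
    progress_bar = tqdm(total=len(items))
    while remaining > 0:
        batch = min(remaining, batch_size)
        yield items[index:index + batch]
        index += batch
        remaining -= batch
        progress_bar.update(batch)
    progress_bar.close()

# Example: list(iterate_in_batches(list(range(30)))) yields chunks of 12, 12 and 6 items.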
def create_target_directories(self):
    if os.path.exists(self.OUT_DIR):
        if self.OVER_WRITE == "yes":
            print_info("Deleting data folder: {}".format(self.OUT_DIR))
            shutil.rmtree(self.OUT_DIR)
            print_info("Recreating data folder: {}".format(self.OUT_DIR))
            os.makedirs(self.OUT_DIR)
            check_n_makedirs(self.TRAIN_CSV_INTERMEDIATE_PATH)
            check_n_makedirs(self.VAL_CSV_INTERMEDIATE_PATH)
            check_n_makedirs(self.TEST_CSV_INTERMEDIATE_PATH)
        else:
            print_info("Skipping preprocessing step, since the data is already available")
            return "skip"
    else:
        print_info("Creating data folder: {}".format(self.OUT_DIR))
        os.makedirs(self.OUT_DIR)
        check_n_makedirs(self.TRAIN_CSV_INTERMEDIATE_PATH)
        check_n_makedirs(self.VAL_CSV_INTERMEDIATE_PATH)
        check_n_makedirs(self.TEST_CSV_INTERMEDIATE_PATH)
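# Minimal stand-in for `check_n_makedirs`, assuming (from its name and how it is
# called above) that it creates a directory only when it is missing. The repo's
# actual helper may do more, e.g. logging; this sketch just makes the snippets
# runnable in isolation.
import os

def check_n_makedirs(path):
    if not os.path.isdir(path):
        os.makedirs(path)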
def with_user_hyperparamaters(experiment_root_dir, data_dir):
    preprocessed_data_info = PreprocessedDataInfo.load(data_dir)

    use_crf = "y"  # TODO
    use_char_embedding = False
    char_level_lstm_hidden_size = 128  # default
    char_emd_size = 128  # default

    if use_crf == 'y':
        use_crf = True
    else:
        use_crf = False

    use_char_embedding_option = input("use_char_embedding (y/n): ") or "y"
    learning_rate = float(input("learning_rate (0.001): ") or 0.001)
    num_lstm_layers = int(input("num_word_lstm_layers (2): ") or 2)

    if use_char_embedding_option == 'y':
        use_char_embedding = True
        char_level_lstm_hidden_size = int(input("char_level_lstm_hidden_size (32): ") or 32)
        char_emd_size = int(input("char_emd_size (32): ") or 32)
    else:
        use_char_embedding = False

    word_level_lstm_hidden_size = int(input("word_level_lstm_hidden_size (32): ") or 32)
    word_emd_size = int(input("word_emd_size (128): ") or 128)
    out_keep_propability = float(input("out_keep_propability (0.5): ") or 0.5)

    # Does this sound logical? review please
    model_dir = experiment_root_dir + "/bilstm_crf_v1/" + \
                "charembd_{}_lr_{}_lstmsize_{}-{}-{}_wemb_{}_cemb_{}_outprob_{}".format(
                    str(use_char_embedding),
                    learning_rate,
                    num_lstm_layers,
                    word_level_lstm_hidden_size,
                    char_level_lstm_hidden_size,
                    word_emd_size,
                    char_emd_size,
                    out_keep_propability)

    model_config = BiLSTMCRFConfigV1(
        model_dir=model_dir,
        vocab_size=preprocessed_data_info.VOCAB_SIZE,
        char_vocab_size=preprocessed_data_info.CHAR_VOCAB_SIZE,
        number_tags=preprocessed_data_info.NUM_TAGS,
        unknown_word=UNKNOWN_WORD,
        pad_word=PAD_WORD,
        tags_vocab_file=preprocessed_data_info.ENTITY_VOCAB_FILE,
        words_vocab_file=preprocessed_data_info.WORDS_VOCAB_FILE,
        chars_vocab_file=preprocessed_data_info.WORDS_VOCAB_FILE,
        # hyper parameters
        use_char_embedding=use_char_embedding,
        learning_rate=learning_rate,
        word_level_lstm_hidden_size=word_level_lstm_hidden_size,
        char_level_lstm_hidden_size=char_level_lstm_hidden_size,
        word_emd_size=word_emd_size,
        char_emd_size=char_emd_size,
        num_lstm_layers=num_lstm_layers,
        out_keep_propability=out_keep_propability,
        use_crf=use_crf)

    check_n_makedirs(model_dir)
    IModelConfig.save(model_dir=model_dir, config=model_config)
    return model_config
def with_user_hyperparamaters(experiment_root_dir, data_iterator):
    use_crf = "y"  # TODO
    use_char_embedding = False
    char_level_lstm_hidden_size = 32  # default
    char_emd_size = 32  # default

    if use_crf == 'y':
        use_crf = True
    else:
        use_crf = False

    use_char_embedding_option = input("use_char_embedding (y/n): ") or "y"

    learning_rate = input("learning_rate (0.001): ") or 0.001
    learning_rate = float(learning_rate)

    num_lstm_layers = input("num_word_lstm_layers (2): ") or 2
    num_lstm_layers = int(num_lstm_layers)

    if use_char_embedding_option == 'y':
        use_char_embedding = True
        char_level_lstm_hidden_size = input("char_level_lstm_hidden_size (32): ") or 32
        char_level_lstm_hidden_size = int(char_level_lstm_hidden_size)
        char_emd_size = input("char_emd_size (32): ") or 32
        char_emd_size = int(char_emd_size)
    else:
        use_char_embedding = False

    word_level_lstm_hidden_size = input("word_level_lstm_hidden_size (48): ") or 48
    word_level_lstm_hidden_size = int(word_level_lstm_hidden_size)

    word_emd_size = input("word_emd_size (48): ") or 48
    word_emd_size = int(word_emd_size)

    out_keep_propability = input("out_keep_propability (0.5): ") or 0.5
    out_keep_propability = float(out_keep_propability)

    # Does this sound logical? review please
    # Directory layout:
    #   experiment_root_dir/
    #       data_iterator/
    #           model_name/
    #               user_hyper_params/
    model_dir = experiment_root_dir + "/" + data_iterator.NAME + "/bilstm_crf_v0/" + \
                "charembd_{}_lr_{}_lstmsize_{}-{}-{}_wemb_{}_cemb_{}_outprob_{}".format(
                    str(use_char_embedding),
                    learning_rate,
                    num_lstm_layers,
                    word_level_lstm_hidden_size,
                    char_level_lstm_hidden_size,
                    word_emd_size,
                    char_emd_size,
                    out_keep_propability)

    model_config = BiLSTMCRFConfigV0(
        model_dir=model_dir,
        vocab_size=data_iterator.VOCAB_SIZE,
        char_vocab_size=data_iterator.CHAR_VOCAB_SIZE,
        number_tags=data_iterator.NUM_TAGS,
        unknown_word=UNKNOWN_WORD,
        pad_word=PAD_WORD,
        tags_vocab_file=data_iterator.ENTITY_VOCAB_FILE,
        words_vocab_file=data_iterator.WORDS_VOCAB_FILE,
        chars_vocab_file=data_iterator.CHARS_VOCAB_FILE,
        # hyper parameters
        use_char_embedding=use_char_embedding,
        learning_rate=learning_rate,
        word_level_lstm_hidden_size=word_level_lstm_hidden_size,
        char_level_lstm_hidden_size=char_level_lstm_hidden_size,
        word_emd_size=word_emd_size,
        char_emd_size=char_emd_size,
        num_lstm_layers=num_lstm_layers,
        out_keep_propability=out_keep_propability,
        use_crf=use_crf)

    check_n_makedirs(model_dir)
    IModelConfig.save(model_dir=model_dir, config=model_config)
    return model_config
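# Stand-alone sketch of the prompt-or-default pattern both config builders use.
# Note that the cast must wrap the `or` expression: `float(input(...) or default)`
# falls back cleanly on an empty answer, whereas `float(input(...)) or default`
# raises ValueError when the user just presses Enter. `prompt_with_default` is
# an illustrative helper, not part of this repo.
def prompt_with_default(prompt, default, cast=str):
    answer = input("{} ({}): ".format(prompt, default)) or default
    return cast(answer)

# Example:
#   learning_rate = prompt_with_default("learning_rate", 0.001, cast=float)
#   num_lstm_layers = prompt_with_default("num_word_lstm_layers", 2, cast=int)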