def permute(self, folder_path, tagging=0, padding=0, csv_name='meta.csv'):
    print("Start Permutation...")
    self.tagging = tagging
    self.padding = padding
    check_folder(folder_path)
    meta_path = folder_path + '/' + csv_name
    with open(meta_path, 'w') as outfile:
        self.__print_header(outfile)
        for row_index, row in tqdm(self.templates.iterrows(),
                                   total=self.templates.shape[0]):
            language = row['language']
            entity_holder = re.findall(r'{\S+}', row['text'])
            if entity_holder:
                if len(entity_holder) <= self.max_holder_amount:
                    entities_combo_list = self.__combine_entities(
                        entity_holder, language=language)
                    for entities_combo in entities_combo_list:
                        self.__print_pad(outfile, language=language, value='<sos>')
                        self.__print_combo(outfile, row, entity_holder,
                                           entities_combo)
                        self.__print_pad(outfile, language=language, value='<eos>')
                        self.sentence_num += 1
            else:
                self.__print_pad(outfile, language=language, value='<sos>')
                self.__print_plain(outfile, row)
                self.__print_pad(outfile, language=language, value='<eos>')
                self.sentence_num += 1
    return meta_path
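# For reference, the brace placeholders that `permute` expands are matched
# with the same regex used above; a minimal, self-contained illustration
# (the template text here is hypothetical):
import re

template_text = "Meet me at {time} on {date}"
entity_holder = re.findall(r'{\S+}', template_text)
print(entity_holder)  # ['{time}', '{date}']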
def __init__(self,
             pipeline_dir,
             prepared_dir,
             classifier_dir,
             pretrained,
             normalizer_dir,
             model_yaml_path,
             encoder_level='char',
             decoder_level='char',
             onmt_dir='./OpenNMT-py',
             language='en'):
    """
    pretrained = None to disable the classifier
    model_yaml_path = None to use the rule-based normalizer
    """
    self.pipeline_dir = pipeline_dir
    self.prepared_dir = prepared_dir
    self.classifier_dir = classifier_dir
    self.pretrained = pretrained
    self.normalizer_dir = normalizer_dir
    self.encoder_level = encoder_level
    self.decoder_level = decoder_level
    self.onmt_dir = onmt_dir
    check_folder(self.pipeline_dir)
    check_folder(self.pipeline_dir + '/tmp')
    self.Classifier = Classifier(pretrained, prepared_dir, classifier_dir)
    self.Normalizer = Normalizer(model_yaml_path,
                                 prepared_dir,
                                 normalizer_dir,
                                 norm_only=not pretrained,
                                 onmt_dir=onmt_dir,
                                 encoder_level=encoder_level,
                                 decoder_level=decoder_level,
                                 language=language)
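# A minimal construction sketch. `Pipeline` is an assumed name for the
# enclosing class (not shown in this excerpt), and the argument values
# mirror the argparse defaults that appear later in this section.
pipeline = Pipeline(
    pipeline_dir='./output/pipeline/distilbert-base_LSTM',
    prepared_dir='./output',
    classifier_dir='./output/classifier/distilbert-base-uncased',
    pretrained='distilbert-base-uncased',    # None disables the classifier
    normalizer_dir='./output/normalizer/LSTM',
    model_yaml_path='./config/dummy.yaml',   # None selects the rule-based normalizer
)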
def __init__(self, pretrained, prepared_dir, classifier_dir):
    """
    pretrained = None disables the classifier
    """
    self.pretrained = pretrained
    self.classifier_dir = classifier_dir
    self.prepared_dir = prepared_dir
    self.datasets = DatasetDict({
        'train': read_dataset_from_csv(prepared_dir + '/train.csv'),
        'test': read_dataset_from_csv(prepared_dir + '/test.csv'),
        'validation': read_dataset_from_csv(prepared_dir + '/validation.csv')
    })
    self.metric = load_metric("seqeval")
    self.label_list = self.datasets["train"].features["tag"].feature.names
    check_folder(self.classifier_dir)
    if pretrained:
        self.model = AutoModelForTokenClassification.from_pretrained(
            self.pretrained, num_labels=len(self.label_list))
        self.tokenizer = AutoTokenizer.from_pretrained(self.pretrained)
        self.data_collator = DataCollatorForTokenClassification(self.tokenizer)
def load(self):
    try:
        project_dir = Path(__file__).resolve().parents[2]
        models_dir = str(project_dir) + '/models/' + self.name + '/'
        utils.check_folder(models_dir)
        self.model.load_weights(models_dir + self.name + ".h5")
        print("Loaded " + self.name + " model from disk")
    # load_weights raises OSError when the .h5 file is missing, so catch
    # it alongside ValueError (which covers architecture mismatches)
    except (OSError, ValueError):
        print("No saved model found. Check the file name or train from scratch.")
def save(self): """ Keras saving methods such as model.save() or model.save_weights() are not suitable, since Keras won"t serialize tf.Tensor objects which get included into saving process as arguments of Lambda layers. """ project_dir = Path(__file__).resolve().parents[2] models_dir = str(project_dir) + '/models/' + self.name + '/' utils.check_folder(models_dir) self.saver.save(self.sess, models_dir + self.name) print("Saved model to disk")
def load(self):
    try:
        # Build the model graph before restoring the checkpoint
        self.build_model()
        project_dir = Path(__file__).resolve().parents[2]
        models_dir = str(project_dir) + '/models/' + self.name + '/'
        utils.check_folder(models_dir)
        self.saver.restore(self.sess, models_dir + self.name)
        self.initialized = True
        print("Loaded " + self.name + " model from disk")
    except ValueError:
        print("No saved model found. Check the file name or train from scratch.")
def __init__(self,
             model_yaml_path,
             prepared_dir,
             normalizer_dir,
             onmt_dir='./OpenNMT-py',
             norm_only=True,
             encoder_level='char',
             decoder_level='char',
             language='en'):
    self.normalizer_dir = normalizer_dir
    self.onmt_dir = onmt_dir
    self.no_classifier = norm_only
    self.encoder_level = encoder_level
    self.decoder_level = decoder_level
    self.prepared_dir = prepared_dir
    self.yaml_path = model_yaml_path
    self.language = language
    if self.yaml_path:
        self.new_yaml_path = '{}/{}'.format(normalizer_dir,
                                            model_yaml_path.split('/')[-1])
    check_folder(normalizer_dir + '/checkpoints')
    check_folder(normalizer_dir + '/data')
    check_folder(normalizer_dir + '/tmp')
def save(self):
    project_dir = Path(__file__).resolve().parents[2]
    models_dir = str(project_dir) + '/models/' + self.name + '/'
    utils.check_folder(models_dir)
    self.model.save_weights(models_dir + self.name + ".h5")
    print("Saved model to disk")
df = pd.read_csv(meta_path, converters={'before': str, 'after': str})
df = df.dropna()
df['after'] = df['after'].str.lower()
df['before'] = df['before'].str.lower()
# Drop rows whose spoken form is empty or punctuation-only
df = df[~df.after.str.contains(r'^\W*$')]
df = df[df['class'] != 'PUNCT']
# Drop whole sentences that contain an over-long spoken token
filter_id = df[df.after.apply(len) > len_thresh]['sentence_id'].unique()
df = df[~df['sentence_id'].isin(filter_id)]
df.columns = ['sentence_id', 'token_id', 'tag', 'written', 'spoken']
df['tag'] = df['tag'].replace({"PLAIN": "O", "PUNCT": "O"})
df['token'] = df['spoken'].str.split()
df['tag'] = df.apply(tag2bio, axis=1)
df = df[(df['written'] != '') & (df['spoken'] != '')]

# save to prepared dir
check_folder('./TNChallenge')
meta_path = './TNChallenge/meta.csv'
with open(meta_path, 'w+') as outfile:
    columns = ['sentence_id', 'token_id', 'written', 'spoken', 'token', 'tag']
    outfile.write('\t'.join(columns) + '\n')

    def write_meta(row):
        d = dict(row)
        values = []
        for col in columns[:-2]:
            values.append(str(d[col]))
        # one output line per (token, tag) pair in this row
        for i in range(len(row['tag'])):
            r_values = values.copy()
            r_values.append(str(row['token'][i]))
            r_values.append(str(row['tag'][i]))
            outfile.write('\t'.join(r_values) + '\n')
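# The step above applies a `tag2bio` helper that is not shown in this
# excerpt. A minimal sketch of what such a helper could look like, under
# the assumption that it expands a row's single class tag into one BIO
# label per spoken token (hypothetical implementation, not the
# repository's own):
def tag2bio(row):
    tokens = row['token']              # spoken form, already split above
    if row['tag'] == 'O':              # PLAIN/PUNCT were mapped to 'O'
        return ['O'] * len(tokens)
    # the first token opens the span (B-), the rest continue it (I-)
    return ['B-' + row['tag']] + ['I-' + row['tag']] * (len(tokens) - 1)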
def main():
    parser = argparse.ArgumentParser()

    # pipeline args
    parser.add_argument("--pipeline_dir",
                        default='./output/pipeline/distilbert-base_LSTM',
                        type=str,
                        required=False,
                        help="Directory to save pipeline data")
    parser.add_argument(
        "--prepared_dir",
        default='./output',
        type=str,
        required=False,
        help="The prepared dataset location (containing test.csv, train.csv, validation.csv)")

    # classifier args
    parser.add_argument("--classifier_dir",
                        default='./output/classifier/distilbert-base-uncased',
                        type=str,
                        required=False,
                        help="Directory to save classifier model and data")
    parser.add_argument(
        "--pretrained",
        default='distilbert-base-uncased',
        type=str,
        required=False,
        help="Load a Hugging Face or local pretrained checkpoint; set to None to disable the classifier")

    # normalizer args
    parser.add_argument("--normalizer_dir",
                        default='./output/normalizer/LSTM',
                        type=str,
                        required=False,
                        help="Directory to save normalizer model and data")
    parser.add_argument("--model_yaml",
                        default='./config/dummy.yaml',
                        type=str,
                        required=False,
                        help="Load normalizer model from an OpenNMT yaml file")
    parser.add_argument("--encoder_level",
                        default='char',
                        type=str,
                        required=False,
                        help="char or token")
    parser.add_argument("--decoder_level",
                        default='char',
                        type=str,
                        required=False,
                        help="char or token")
    parser.add_argument(
        "--language",
        default='en',
        type=str,
        required=False,
        help="Language of the dataset (used only for the rule-based normalizer)")
    parser.add_argument("--onmt_dir",
                        default='./OpenNMT-py',
                        type=str,
                        required=False,
                        help="OpenNMT package location")

    # load config values
    args = parser.parse_args()

    # save args
    check_folder(args.pipeline_dir)
    with open(args.pipeline_dir + '/pipeline_args.txt', 'w') as f:
        json.dump(args.__dict__, f, indent=2)
    print("Pipeline args saved to:", args.pipeline_dir + '/pipeline_args.txt')
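# The arguments are dumped as plain JSON, so they can be reloaded the same
# way; a small sketch (the path assumes the default --pipeline_dir):
import json

with open('./output/pipeline/distilbert-base_LSTM/pipeline_args.txt') as f:
    saved_args = json.load(f)
print(saved_args['pretrained'])  # 'distilbert-base-uncased'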
def make_dataset(dataset="MNIST"):
    """
    Fetches the raw dataset, splits it into train, test and validation
    sets, does the necessary processing and saves the results to the
    data directory (../data).
    """
    project_dir = Path(__file__).resolve().parents[2]
    data_dir = str(project_dir) + "/data/"
    utils.check_folder(data_dir)

    if dataset == "MNIST":
        # See if directory exists
        data_dir_mnist = data_dir + dataset
        utils.check_folder(data_dir_mnist)

        # Download the MNIST dataset from source
        mnist = tf.keras.datasets.mnist
        (x_train, y_train), (x_test, y_test) = mnist.load_data()

        # Hold out last 10000 training samples for validation
        x_valid, y_valid = x_train[-10000:], y_train[-10000:]
        x_train, y_train = x_train[:-10000], y_train[:-10000]

        # Retrieve the number of classes from the training labels
        n_classes = np.unique(y_train).shape[0]

        # Convert labels to 1-hot vectors
        y_train_one_hot = tf.keras.utils.to_categorical(y_train, n_classes)
        y_valid_one_hot = tf.keras.utils.to_categorical(y_valid, n_classes)
        y_test_one_hot = tf.keras.utils.to_categorical(y_test, n_classes)

        # Normalize inputs and cast to float
        x_train = (x_train / np.max(x_train)).astype(np.float32)
        x_valid = (x_valid / np.max(x_valid)).astype(np.float32)
        x_test = (x_test / np.max(x_test)).astype(np.float32)

        # Flatten to 1D vectors
        x_train_flat = x_train.reshape((x_train.shape[0], -1))
        x_valid_flat = x_valid.reshape((x_valid.shape[0], -1))
        x_test_flat = x_test.reshape((x_test.shape[0], -1))

        # Save all data
        np.save(data_dir_mnist + "/x_train.npy", x_train)
        np.save(data_dir_mnist + "/x_valid.npy", x_valid)
        np.save(data_dir_mnist + "/x_test.npy", x_test)
        np.save(data_dir_mnist + "/x_train_flat.npy", x_train_flat)
        np.save(data_dir_mnist + "/x_valid_flat.npy", x_valid_flat)
        np.save(data_dir_mnist + "/x_test_flat.npy", x_test_flat)
        np.save(data_dir_mnist + "/y_train.npy", y_train)
        np.save(data_dir_mnist + "/y_valid.npy", y_valid)
        np.save(data_dir_mnist + "/y_test.npy", y_test)
        np.save(data_dir_mnist + "/y_train_one_hot.npy", y_train_one_hot)
        np.save(data_dir_mnist + "/y_valid_one_hot.npy", y_valid_one_hot)
        np.save(data_dir_mnist + "/y_test_one_hot.npy", y_test_one_hot)

    elif dataset == "FashionMNIST":
        # See if directory exists
        data_dir_fashion_mnist = data_dir + dataset
        utils.check_folder(data_dir_fashion_mnist)

        # Download the Fashion MNIST dataset from source
        fashion_mnist = tf.keras.datasets.fashion_mnist
        (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

        # Hold out last 10000 training samples for validation
        x_valid, y_valid = x_train[-10000:], y_train[-10000:]
        x_train, y_train = x_train[:-10000], y_train[:-10000]

        # Retrieve the number of classes from the training labels
        n_classes = np.unique(y_train).shape[0]

        # Convert labels to 1-hot vectors
        y_train_one_hot = tf.keras.utils.to_categorical(y_train, n_classes)
        y_valid_one_hot = tf.keras.utils.to_categorical(y_valid, n_classes)
        y_test_one_hot = tf.keras.utils.to_categorical(y_test, n_classes)

        # Normalize inputs and cast to float
        x_train = (x_train / np.max(x_train)).astype(np.float32)
        x_valid = (x_valid / np.max(x_valid)).astype(np.float32)
        x_test = (x_test / np.max(x_test)).astype(np.float32)

        # Flatten to 1D vectors
        x_train_flat = x_train.reshape((x_train.shape[0], -1))
        x_valid_flat = x_valid.reshape((x_valid.shape[0], -1))
        x_test_flat = x_test.reshape((x_test.shape[0], -1))

        # Save all data
        np.save(data_dir_fashion_mnist + "/x_train.npy", x_train)
        np.save(data_dir_fashion_mnist + "/x_valid.npy", x_valid)
        np.save(data_dir_fashion_mnist + "/x_test.npy", x_test)
        np.save(data_dir_fashion_mnist + "/x_train_flat.npy", x_train_flat)
        np.save(data_dir_fashion_mnist + "/x_valid_flat.npy", x_valid_flat)
        np.save(data_dir_fashion_mnist + "/x_test_flat.npy", x_test_flat)
        np.save(data_dir_fashion_mnist + "/y_train.npy", y_train)
        np.save(data_dir_fashion_mnist + "/y_valid.npy", y_valid)
        np.save(data_dir_fashion_mnist + "/y_test.npy", y_test)
        np.save(data_dir_fashion_mnist + "/y_train_one_hot.npy", y_train_one_hot)
        np.save(data_dir_fashion_mnist + "/y_valid_one_hot.npy", y_valid_one_hot)
        np.save(data_dir_fashion_mnist + "/y_test_one_hot.npy", y_test_one_hot)

    elif dataset == "EMNIST_Letter":
        # See if directory exists
        data_dir_emnist = data_dir + dataset
        utils.check_folder(data_dir_emnist)

        # Download the EMNIST Letters dataset from source
        x_train, y_train = extract_training_samples("letters")
        x_test, y_test = extract_test_samples("letters")

        # Shift labels from [1:26] to [0:25], to make use of
        # tf.keras.utils.to_categorical
        y_train = y_train - 1
        y_test = y_test - 1

        # Hold out last 20800 training samples for validation
        x_valid, y_valid = x_train[-20800:], y_train[-20800:]
        x_train, y_train = x_train[:-20800], y_train[:-20800]

        # Retrieve the number of classes from the training labels
        n_classes = np.unique(y_train).shape[0]

        # Convert labels to 1-hot vectors
        y_train_one_hot = tf.keras.utils.to_categorical(y_train, n_classes)
        y_valid_one_hot = tf.keras.utils.to_categorical(y_valid, n_classes)
        y_test_one_hot = tf.keras.utils.to_categorical(y_test, n_classes)

        # Normalize inputs and cast to float
        x_train = (x_train / np.max(x_train)).astype(np.float32)
        x_valid = (x_valid / np.max(x_valid)).astype(np.float32)
        x_test = (x_test / np.max(x_test)).astype(np.float32)

        # Flatten to 1D vectors
        x_train_flat = x_train.reshape((x_train.shape[0], -1))
        x_valid_flat = x_valid.reshape((x_valid.shape[0], -1))
        x_test_flat = x_test.reshape((x_test.shape[0], -1))

        # Save all data
        np.save(data_dir_emnist + "/x_train.npy", x_train)
        np.save(data_dir_emnist + "/x_valid.npy", x_valid)
        np.save(data_dir_emnist + "/x_test.npy", x_test)
        np.save(data_dir_emnist + "/x_train_flat.npy", x_train_flat)
        np.save(data_dir_emnist + "/x_valid_flat.npy", x_valid_flat)
        np.save(data_dir_emnist + "/x_test_flat.npy", x_test_flat)
        np.save(data_dir_emnist + "/y_train.npy", y_train)
        np.save(data_dir_emnist + "/y_valid.npy", y_valid)
        np.save(data_dir_emnist + "/y_test.npy", y_test)
        np.save(data_dir_emnist + "/y_train_one_hot.npy", y_train_one_hot)
        np.save(data_dir_emnist + "/y_valid_one_hot.npy", y_valid_one_hot)
        np.save(data_dir_emnist + "/y_test_one_hot.npy", y_test_one_hot)

    elif dataset == "EMNIST_Letter_Uppercase":
        # See if directory exists
        data_dir_emnist_uppercase = data_dir + dataset
        utils.check_folder(data_dir_emnist_uppercase)

        # Download the EMNIST ByClass dataset from source
        x_train, y_train = extract_training_samples("byclass")
        x_test, y_test = extract_test_samples("byclass")

        # Extract uppercase letters (ByClass labels 10 to 35)
        ix_train = []
        for i, x in enumerate(y_train):
            if 9 < x < 36:
                ix_train.append(i)
        x_train = x_train[ix_train]
        y_train = y_train[ix_train]
        ix_test = []
        for i, x in enumerate(y_test):
            if 9 < x < 36:
                ix_test.append(i)
        x_test = x_test[ix_test]
        y_test = y_test[ix_test]

        # Shift labels from [10:35] to [0:25], to make use of
        # tf.keras.utils.to_categorical
        y_train = y_train - 10
        y_test = y_test - 10

        # Flatten datasets
        x_train_flat = x_train.reshape((x_train.shape[0], -1))
        x_test_flat = x_test.reshape((x_test.shape[0], -1))

        # Create class-balanced subsamples
        x_train_flat, y_train, indices = utils.balanced_sample_maker(
            x_train_flat, y_train, 54100, random_seed=1234)
        x_test_flat, y_test, indices = utils.balanced_sample_maker(
            x_test_flat, y_test, 9880, random_seed=1234)

        # Hold out last 9880 training samples for validation
        x_valid_flat, y_valid = x_train_flat[-9880:], y_train[-9880:]
        x_train_flat, y_train = x_train_flat[:-9880], y_train[:-9880]

        # Retrieve the number of classes from the training labels
        n_classes = np.unique(y_train).shape[0]

        # Convert labels to 1-hot vectors
        y_train_one_hot = tf.keras.utils.to_categorical(y_train, n_classes)
        y_valid_one_hot = tf.keras.utils.to_categorical(y_valid, n_classes)
        y_test_one_hot = tf.keras.utils.to_categorical(y_test, n_classes)

        # Normalize inputs and cast to float
        x_train_flat = (x_train_flat / np.max(x_train_flat)).astype(np.float32)
        x_valid_flat = (x_valid_flat / np.max(x_valid_flat)).astype(np.float32)
        x_test_flat = (x_test_flat / np.max(x_test_flat)).astype(np.float32)

        # Recreate 28x28 images from the flat vectors
        x_train = np.array([
            np.reshape(x_train_flat[i], [28, 28])
            for i in range(len(x_train_flat))
        ])
        x_valid = np.array([
            np.reshape(x_valid_flat[i], [28, 28])
            for i in range(len(x_valid_flat))
        ])
        x_test = np.array([
            np.reshape(x_test_flat[i], [28, 28])
            for i in range(len(x_test_flat))
        ])

        # Save all data
        np.save(data_dir_emnist_uppercase + "/x_train.npy", x_train)
        np.save(data_dir_emnist_uppercase + "/x_valid.npy", x_valid)
        np.save(data_dir_emnist_uppercase + "/x_test.npy", x_test)
        np.save(data_dir_emnist_uppercase + "/x_train_flat.npy", x_train_flat)
        np.save(data_dir_emnist_uppercase + "/x_valid_flat.npy", x_valid_flat)
        np.save(data_dir_emnist_uppercase + "/x_test_flat.npy", x_test_flat)
        np.save(data_dir_emnist_uppercase + "/y_train.npy", y_train)
        np.save(data_dir_emnist_uppercase + "/y_valid.npy", y_valid)
        np.save(data_dir_emnist_uppercase + "/y_test.npy", y_test)
        np.save(data_dir_emnist_uppercase + "/y_train_one_hot.npy",
                y_train_one_hot)
        np.save(data_dir_emnist_uppercase + "/y_valid_one_hot.npy",
                y_valid_one_hot)
        np.save(data_dir_emnist_uppercase + "/y_test_one_hot.npy",
                y_test_one_hot)

    else:
        print("Please choose either: 'MNIST', 'FashionMNIST', "
              "'EMNIST_Letter' or 'EMNIST_Letter_Uppercase'.")
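# A short usage sketch, following the same path convention the function
# itself uses; the printed shapes follow from the 10000-sample validation
# split above:
from pathlib import Path
import numpy as np

make_dataset("MNIST")

project_dir = Path(__file__).resolve().parents[2]
x_train = np.load(str(project_dir) + "/data/MNIST/x_train.npy")
y_train_one_hot = np.load(str(project_dir) + "/data/MNIST/y_train_one_hot.npy")
print(x_train.shape, y_train_one_hot.shape)  # (50000, 28, 28) (50000, 10)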