Example #1
 def permute(self, folder_path, tagging=0, padding=0, csv_name='meta.csv'):
     print("Start Permutation...")
     self.tagging = tagging
     self.padding = padding
     check_folder(folder_path)
     meta_path = folder_path + '/' + csv_name
     with open(meta_path, 'w') as outfile:
         self.__print_header(outfile)
         for row_index, row in tqdm(self.templates.iterrows(),
                                    total=self.templates.shape[0]):
             language = row['language']
             entity_holder = re.findall(r'{\S+}', row['text'])
             if entity_holder:
                 if len(entity_holder) <= self.max_holder_amount:
                     entities_combo_list = self.__combine_entities(
                         entity_holder, language=language)
                     for entities_combo in entities_combo_list:
                         self.__print_pad(outfile,
                                          language=language,
                                          value='<sos>')
                         self.__print_combo(outfile, row, entity_holder,
                                            entities_combo)
                         self.__print_pad(outfile,
                                          language=language,
                                          value='<eos>')
                         self.sentence_num += 1
             else:
                 self.__print_pad(outfile, language=language, value='<sos>')
                 self.__print_plain(outfile, row)
                 self.__print_pad(outfile, language=language, value='<eos>')
                 self.sentence_num += 1
     return meta_path
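
All of the examples on this page call a check_folder helper. Its implementation is not shown in any of the excerpts; the sketch below is only an assumption of the minimal behaviour the call sites rely on (create the directory, including parents, if it does not already exist).

import os

def check_folder(folder_path):
    # Assumed behaviour: create the folder (and any parents) if missing; no-op otherwise.
    os.makedirs(folder_path, exist_ok=True)
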
Example #2
    def __init__(self,
                 pipeline_dir,
                 prepared_dir,
                 classifier_dir,
                 pretrained,
                 normalizer_dir,
                 model_yaml_path,
                 encoder_level='char',
                 decoder_level='char',
                 onmt_dir='./OpenNMT-py',
                 language='en'):
        """
        Set pretrained=None to disable the classifier.
        Set model_yaml_path=None to use the rule-based normalizer.
        """
        self.pipeline_dir = pipeline_dir
        self.prepared_dir = prepared_dir
        self.classifier_dir = classifier_dir
        self.pretrained = pretrained
        self.normalizer_dir = normalizer_dir
        self.encoder_level = encoder_level
        self.decoder_level = decoder_level
        self.onmt_dir = onmt_dir

        check_folder(self.pipeline_dir)
        check_folder(self.pipeline_dir + '/tmp')
        self.Classifier = Classifier(pretrained, prepared_dir, classifier_dir)
        self.Normalizer = Normalizer(model_yaml_path,
                                     prepared_dir,
                                     normalizer_dir,
                                     norm_only=not pretrained,
                                     onmt_dir=onmt_dir,
                                     encoder_level=encoder_level,
                                     decoder_level=decoder_level,
                                     language=language)
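
A hypothetical instantiation of this constructor: the enclosing class name is not shown in the excerpt, so Pipeline below is an assumed name, and the argument values simply mirror the argparse defaults from Example #10.

# 'Pipeline' is an assumed class name; the excerpt does not show the class definition.
pipeline = Pipeline(
    pipeline_dir='./output/pipeline/distilbert-base_LSTM',
    prepared_dir='./output',
    classifier_dir='./output/classifier/distilbert-base-uncased',
    pretrained='distilbert-base-uncased',
    normalizer_dir='./output/normalizer/LSTM',
    model_yaml_path='./config/dummy.yaml',
    encoder_level='char',
    decoder_level='char',
    onmt_dir='./OpenNMT-py',
    language='en')
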
Example #3
    def __init__(self, pretrained, prepared_dir, classifier_dir):
        """
        Setting pretrained to None disables the classifier.
        """
        self.pretrained = pretrained
        self.classifier_dir = classifier_dir
        self.prepared_dir = prepared_dir
        self.datasets = DatasetDict({
            'train':
            read_dataset_from_csv(prepared_dir + '/train.csv'),
            'test':
            read_dataset_from_csv(prepared_dir + '/test.csv'),
            'validation':
            read_dataset_from_csv(prepared_dir + '/validation.csv')
        })
        self.metric = load_metric("seqeval")
        self.label_list = self.datasets["train"].features["tag"].feature.names
        check_folder(self.classifier_dir)

        if pretrained:
            self.model = AutoModelForTokenClassification.from_pretrained(
                self.pretrained, num_labels=len(self.label_list))
            self.tokenizer = AutoTokenizer.from_pretrained(self.pretrained)
            self.data_collator = DataCollatorForTokenClassification(
                self.tokenizer)
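
A hypothetical usage of the Classifier constructor above, assuming prepared_dir already contains the train.csv, test.csv and validation.csv produced by the preparation step (see Example #9), and that a Hugging Face checkpoint name is passed as pretrained.

# Hypothetical values; any directory layout matching the constructor's expectations works.
clf = Classifier(pretrained='distilbert-base-uncased',
                 prepared_dir='./output',
                 classifier_dir='./output/classifier/distilbert-base-uncased')
print(clf.label_list)  # tag names read from the training split's 'tag' feature
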
Example #4
    def load(self):
        try:
            project_dir = Path(__file__).resolve().parents[2]
            models_dir = str(project_dir) + '/models/' + self.name + '/'
            utils.check_folder(models_dir)
            self.model.load_weights(models_dir + self.name + ".h5")
            print("Loaded " + self.name + " model from disk")

        except ValueError:
            print(
                "No saved model found. Check the file name or train from scratch.")
Example #5
 def save(self):
     """
     Keras saving methods such as model.save() or model.save_weights()
     are not suitable, since Keras won't serialize tf.Tensor objects that are
     included in the saving process as arguments of Lambda layers.
     """
     project_dir = Path(__file__).resolve().parents[2]
     models_dir = str(project_dir) + '/models/' + self.name + '/'
     utils.check_folder(models_dir)
     self.saver.save(self.sess, models_dir + self.name)
     print("Saved model to disk")
Example #6
    def load(self):
        try:
            # Build the model
            self.build_model()

            project_dir = Path(__file__).resolve().parents[2]
            models_dir = str(project_dir) + '/models/' + self.name + '/'
            utils.check_folder(models_dir)
            self.saver.restore(self.sess, models_dir + self.name)
            self.initialized = True
            print("Loaded " + self.name + " model from disk")

        except ValueError:
            print(
                "No saved model found. Check the file name or train from scratch.")
Example #7
 def __init__(self, model_yaml_path, prepared_dir, normalizer_dir, onmt_dir='./OpenNMT-py', norm_only=True,
              encoder_level='char', decoder_level='char', language='en'):
     self.normalizer_dir = normalizer_dir
     self.onmt_dir = onmt_dir
     self.no_classifier = norm_only
     self.encoder_level = encoder_level
     self.decoder_level = decoder_level
     self.prepared_dir = prepared_dir
     self.yaml_path = model_yaml_path
     self.language = language
     if self.yaml_path:
         self.new_yaml_path = '{}/{}'.format(normalizer_dir, model_yaml_path.split('/')[-1])
     check_folder(normalizer_dir + '/checkpoints')
     check_folder(normalizer_dir + '/data')
     check_folder(normalizer_dir + '/tmp')
Example #8
 def save(self):
     project_dir = Path(__file__).resolve().parents[2]
     models_dir = str(project_dir) + '/models/' + self.name + '/'
     utils.check_folder(models_dir)
     self.model.save_weights(models_dir + self.name + ".h5")
     print("Saved model to disk")
Example #9
df = pd.read_csv(meta_path, converters={'before': str, 'after': str})
df = df.dropna()
df['after'] = df['after'].str.lower()
df['before'] = df['before'].str.lower()
df = df[~df.after.str.contains(r'^\W*$')]
df = df[df['class'] != 'PUNCT']
filter_id = df[df.after.apply(len) > len_thresh]['sentence_id'].unique()
df = df[~df['sentence_id'].isin(filter_id)]
df.columns = ['sentence_id', 'token_id', 'tag', 'written', 'spoken']
df["tag"].replace({"PLAIN": "O", "PUNCT": "O"}, inplace=True)
df['token'] = df['spoken'].str.split()
df['tag'] = df.apply(tag2bio, axis=1)
df = df[(df['written'] != '') & (df['spoken'] != '')]

# save to prepared dir
check_folder('./TNChallenge')
meta_path = './TNChallenge/meta.csv'
with open(meta_path, 'w+') as outfile:
    columns = ['sentence_id', 'token_id', 'written', 'spoken', 'token', 'tag']
    outfile.write('\t'.join(columns) + '\n')

    def write_meta(row):
        d = dict(row)
        values = []
        for col in columns[:-2]:
            values.append(str(d[col]))

        for i in range(len(row['tag'])):
            r_values = values.copy()
            r_values.append(str(row['token'][i]))
            r_values.append(str(row['tag'][i]))
            # assumed continuation (the excerpt is cut off here): one tab-separated line per token
            outfile.write('\t'.join(r_values) + '\n')

    df.apply(write_meta, axis=1)
Example #10
def main():
    parser = argparse.ArgumentParser()
    # pipeline args
    parser.add_argument("--pipeline_dir",
                        default='./output/pipeline/distilbert-base_LSTM',
                        type=str,
                        required=False,
                        help="Directory to save pipeline data")
    parser.add_argument(
        "--prepared_dir",
        default='./output',
        type=str,
        required=False,
        help=
        "The prepared dataset location (containing test.csv, train.csv, validation.csv)"
    )

    # classifier args
    parser.add_argument("--classifier_dir",
                        default='./output/classifier/distilbert-base-uncased',
                        type=str,
                        required=False,
                        help="Directory to save classifier model and data")
    parser.add_argument(
        "--pretrained",
        default='distilbert-base-uncased',
        type=str,
        required=False,
        help=
        "Load the model from a Hugging Face or local pretrained checkpoint; set to None to disable the classifier"
    )

    # normalizer args
    parser.add_argument("--normalizer_dir",
                        default='./output/normalizer/LSTM',
                        type=str,
                        required=False,
                        help="Directory to save normalizer model and data")
    parser.add_argument("--model_yaml",
                        default='./config/dummy.yaml',
                        type=str,
                        required=False,
                        help="Load normalizer model from OpenNMT yaml file")
    parser.add_argument("--encoder_level",
                        default='char',
                        type=str,
                        required=False,
                        help="char or token")
    parser.add_argument("--decoder_level",
                        default='char',
                        type=str,
                        required=False,
                        help="char or token")
    parser.add_argument(
        "--language",
        default='en',
        type=str,
        required=False,
        help="language of the dataset (used only for Rule-based normalizer)")
    parser.add_argument("--onmt_dir",
                        default='./OpenNMT-py',
                        type=str,
                        required=False,
                        help="OpenNMT package location")

    # load config values
    args = parser.parse_args()

    # save args
    check_folder(args.pipeline_dir)
    with open(args.pipeline_dir + '/pipeline_args.txt', 'w') as f:
        json.dump(args.__dict__, f, indent=2)
    print("Pipeline args saved to: ", args.pipeline_dir + '/pipeline_args.txt')
Example #11
def make_dataset(dataset="MNIST"):
    """
    Fetches the raw dataset, splits it into train, test and validation sets,
    does the necessary processing and saves the results to data (../data).
    """

    project_dir = Path(__file__).resolve().parents[2]

    data_dir = str(project_dir) + "/data/"
    utils.check_folder(data_dir)

    if dataset == "MNIST":
        # See if directory exists
        data_dir_mnist = data_dir + dataset
        utils.check_folder(data_dir_mnist)

        # Download the MNIST dataset from source
        mnist = tf.keras.datasets.mnist
        (x_train, y_train), (x_test, y_test) = mnist.load_data()

        # Hold out last 10000 training samples for validation
        x_valid, y_valid = x_train[-10000:], y_train[-10000:]
        x_train, y_train = x_train[:-10000], y_train[:-10000]

        # Count the number of classes in the training labels
        n_classes = np.unique(y_train).shape[0]

        # Convert labels to 1-hot vectors
        y_train_one_hot = tf.keras.utils.to_categorical(y_train, n_classes)
        y_valid_one_hot = tf.keras.utils.to_categorical(y_valid, n_classes)
        y_test_one_hot = tf.keras.utils.to_categorical(y_test, n_classes)

        # Normalize inputs and cast to float
        x_train = (x_train / np.max(x_train)).astype(np.float32)
        x_valid = (x_valid / np.max(x_valid)).astype(np.float32)
        x_test = (x_test / np.max(x_test)).astype(np.float32)

        x_train_flat = x_train.reshape((x_train.shape[0], -1))
        x_valid_flat = x_valid.reshape((x_valid.shape[0], -1))
        x_test_flat = x_test.reshape((x_test.shape[0], -1))

        # Save all data
        np.save(data_dir_mnist + "/x_train.npy", x_train)
        np.save(data_dir_mnist + "/x_valid.npy", x_valid)
        np.save(data_dir_mnist + "/x_test.npy", x_test)

        np.save(data_dir_mnist + "/x_train_flat.npy", x_train_flat)
        np.save(data_dir_mnist + "/x_valid_flat.npy", x_valid_flat)
        np.save(data_dir_mnist + "/x_test_flat.npy", x_test_flat)

        np.save(data_dir_mnist + "/y_train.npy", y_train)
        np.save(data_dir_mnist + "/y_valid.npy", y_valid)
        np.save(data_dir_mnist + "/y_test.npy", y_test)

        np.save(data_dir_mnist + "/y_train_one_hot.npy", y_train_one_hot)
        np.save(data_dir_mnist + "/y_valid_one_hot.npy", y_valid_one_hot)
        np.save(data_dir_mnist + "/y_test_one_hot.npy", y_test_one_hot)

    elif dataset == "FashionMNIST":
        # See if directory exists
        data_dir_fashion_mnist = data_dir + dataset
        utils.check_folder(data_dir_fashion_mnist)

        # Download the Fashion MNIST dataset from source
        fashion_mnist = tf.keras.datasets.fashion_mnist
        (x_train, y_train), (x_test, y_test) = fashion_mnist.load_data()

        # Hold out last 10000 training samples for validation
        x_valid, y_valid = x_train[-10000:], y_train[-10000:]
        x_train, y_train = x_train[:-10000], y_train[:-10000]

        # Count the number of classes in the training labels
        n_classes = np.unique(y_train).shape[0]

        # Convert labels to 1-hot vectors
        y_train_one_hot = tf.keras.utils.to_categorical(y_train, n_classes)
        y_valid_one_hot = tf.keras.utils.to_categorical(y_valid, n_classes)
        y_test_one_hot = tf.keras.utils.to_categorical(y_test, n_classes)

        # Normalize inputs and cast to float
        x_train = (x_train / np.max(x_train)).astype(np.float32)
        x_valid = (x_valid / np.max(x_valid)).astype(np.float32)
        x_test = (x_test / np.max(x_test)).astype(np.float32)

        x_train_flat = x_train.reshape((x_train.shape[0], -1))
        x_valid_flat = x_valid.reshape((x_valid.shape[0], -1))
        x_test_flat = x_test.reshape((x_test.shape[0], -1))

        # Save all data
        np.save(data_dir_fashion_mnist + "/x_train.npy", x_train)
        np.save(data_dir_fashion_mnist + "/x_valid.npy", x_valid)
        np.save(data_dir_fashion_mnist + "/x_test.npy", x_test)

        np.save(data_dir_fashion_mnist + "/x_train_flat.npy", x_train_flat)
        np.save(data_dir_fashion_mnist + "/x_valid_flat.npy", x_valid_flat)
        np.save(data_dir_fashion_mnist + "/x_test_flat.npy", x_test_flat)

        np.save(data_dir_fashion_mnist + "/y_train.npy", y_train)
        np.save(data_dir_fashion_mnist + "/y_valid.npy", y_valid)
        np.save(data_dir_fashion_mnist + "/y_test.npy", y_test)

        np.save(data_dir_fashion_mnist + "/y_train_one_hot.npy",
                y_train_one_hot)
        np.save(data_dir_fashion_mnist + "/y_valid_one_hot.npy",
                y_valid_one_hot)
        np.save(data_dir_fashion_mnist + "/y_test_one_hot.npy", y_test_one_hot)

    elif dataset == "EMNIST_Letter":
        # See if directory exists
        data_dir_emnist = data_dir + dataset
        utils.check_folder(data_dir_emnist)

        # Download the EMNIST Letters dataset from source
        x_train, y_train = extract_training_samples("letters")
        x_test, y_test = extract_test_samples("letters")

        # Shift labels from [1:26] to [0:25], to make use of tf.keras.utils.to_categorical
        y_train = y_train - 1
        y_test = y_test - 1

        # Hold out the last 20800 training samples for validation
        x_valid, y_valid = x_train[-20800:], y_train[-20800:]
        x_train, y_train = x_train[:-20800], y_train[:-20800]

        # Count the number of classes in the training labels
        n_classes = np.unique(y_train).shape[0]

        # Convert labels to 1-hot vectors
        y_train_one_hot = tf.keras.utils.to_categorical(y_train, n_classes)
        y_valid_one_hot = tf.keras.utils.to_categorical(y_valid, n_classes)
        y_test_one_hot = tf.keras.utils.to_categorical(y_test, n_classes)

        # Normalize inputs and cast to float
        x_train = (x_train / np.max(x_train)).astype(np.float32)
        x_valid = (x_valid / np.max(x_valid)).astype(np.float32)
        x_test = (x_test / np.max(x_test)).astype(np.float32)

        # Flatten to 1D vectors
        x_train_flat = x_train.reshape((x_train.shape[0], -1))
        x_valid_flat = x_valid.reshape((x_valid.shape[0], -1))
        x_test_flat = x_test.reshape((x_test.shape[0], -1))

        # Save all data
        np.save(data_dir_emnist + "/x_train.npy", x_train)
        np.save(data_dir_emnist + "/x_valid.npy", x_valid)
        np.save(data_dir_emnist + "/x_test.npy", x_test)

        np.save(data_dir_emnist + "/x_train_flat.npy", x_train_flat)
        np.save(data_dir_emnist + "/x_valid_flat.npy", x_valid_flat)
        np.save(data_dir_emnist + "/x_test_flat.npy", x_test_flat)

        np.save(data_dir_emnist + "/y_train.npy", y_train)
        np.save(data_dir_emnist + "/y_valid.npy", y_valid)
        np.save(data_dir_emnist + "/y_test.npy", y_test)

        np.save(data_dir_emnist + "/y_train_one_hot.npy", y_train_one_hot)
        np.save(data_dir_emnist + "/y_valid_one_hot.npy", y_valid_one_hot)
        np.save(data_dir_emnist + "/y_test_one_hot.npy", y_test_one_hot)

    elif dataset == "EMNIST_Letter_Uppercase":
        # See if directory exists
        data_dir_emnist_uppercase = data_dir + dataset
        utils.check_folder(data_dir_emnist_uppercase)

        # Download the EMNIST ByClass dataset from source
        x_train, y_train = extract_training_samples("byclass")
        x_test, y_test = extract_test_samples("byclass")

        # Extract uppercase data
        ix_train = []
        for i, x in enumerate(y_train):
            if 9 < x < 36:
                ix_train.append(i)

        x_train = x_train[ix_train]
        y_train = y_train[ix_train]

        ix_test = []
        for i, x in enumerate(y_test):
            if 9 < x < 36:
                ix_test.append(i)

        x_test = x_test[ix_test]
        y_test = y_test[ix_test]

        # Shift labels from [10:35] to [0:25], to make use of tf.keras.utils.to_categorical
        y_train = y_train - 10
        y_test = y_test - 10

        # Flatten datasets
        x_train_flat = x_train.reshape((x_train.shape[0], -1))
        x_test_flat = x_test.reshape((x_test.shape[0], -1))

        # Create balanced dataset
        x_train_flat, y_train, indices = utils.balanced_sample_maker(
            x_train_flat, y_train, 54100, random_seed=1234)
        x_test_flat, y_test, indices = utils.balanced_sample_maker(
            x_test_flat, y_test, 9880, random_seed=1234)

        # Hold out the last 9880 training samples for validation
        x_valid_flat, y_valid = x_train_flat[-9880:], y_train[-9880:]
        x_train_flat, y_train = x_train_flat[:-9880], y_train[:-9880]

        # Count the number of classes in the training labels
        n_classes = np.unique(y_train).shape[0]

        # Convert labels to 1-hot vectors
        y_train_one_hot = tf.keras.utils.to_categorical(y_train, n_classes)
        y_valid_one_hot = tf.keras.utils.to_categorical(y_valid, n_classes)
        y_test_one_hot = tf.keras.utils.to_categorical(y_test, n_classes)

        # Normalize inputs and cast to float
        x_train_flat = (x_train_flat / np.max(x_train_flat)).astype(np.float32)
        x_valid_flat = (x_valid_flat / np.max(x_valid_flat)).astype(np.float32)
        x_test_flat = (x_test_flat / np.max(x_test_flat)).astype(np.float32)

        # Recreate images
        x_train = np.array([
            np.reshape(x_train_flat[i], [28, 28])
            for i in range(len(x_train_flat))
        ])
        x_valid = np.array([
            np.reshape(x_valid_flat[i], [28, 28])
            for i in range(len(x_valid_flat))
        ])
        x_test = np.array([
            np.reshape(x_test_flat[i], [28, 28])
            for i in range(len(x_test_flat))
        ])

        # Save all data
        np.save(data_dir_emnist_uppercase + "/x_train.npy", x_train)
        np.save(data_dir_emnist_uppercase + "/x_valid.npy", x_valid)
        np.save(data_dir_emnist_uppercase + "/x_test.npy", x_test)

        np.save(data_dir_emnist_uppercase + "/x_train_flat.npy", x_train_flat)
        np.save(data_dir_emnist_uppercase + "/x_valid_flat.npy", x_valid_flat)
        np.save(data_dir_emnist_uppercase + "/x_test_flat.npy", x_test_flat)

        np.save(data_dir_emnist_uppercase + "/y_train.npy", y_train)
        np.save(data_dir_emnist_uppercase + "/y_valid.npy", y_valid)
        np.save(data_dir_emnist_uppercase + "/y_test.npy", y_test)

        np.save(data_dir_emnist_uppercase + "/y_train_one_hot.npy",
                y_train_one_hot)
        np.save(data_dir_emnist_uppercase + "/y_valid_one_hot.npy",
                y_valid_one_hot)
        np.save(data_dir_emnist_uppercase + "/y_test_one_hot.npy",
                y_test_one_hot)

    else:
        print(
            "Please choose one of: 'MNIST', 'FashionMNIST', 'EMNIST_Letter' or 'EMNIST_Letter_Uppercase'."
        )
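
A minimal usage sketch: build one of the splits and reload a saved array. The path convention follows the function above (a data/ directory two levels above this file); the printed shape assumes the standard 60000-image MNIST training set minus the 10000 held-out validation samples.

import numpy as np
from pathlib import Path

make_dataset("MNIST")

# Same project-root convention used inside make_dataset.
project_dir = Path(__file__).resolve().parents[2]
x_train = np.load(str(project_dir) + "/data/MNIST/x_train.npy")
print(x_train.shape)  # expected: (50000, 28, 28)
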