Пример #1
0
    def text_classification(self,
                            num_classes: int = None,
                            multi_label: bool = False,
                            **kwargs) -> ak.TextClassifier:
        """Build an AutoKERAS text classifier from this object's settings.

        Args:
            num_classes (int, optional): Number of classes. Defaults to None.
            multi_label (bool, optional): Whether the target is multi-labeled.
                Defaults to False.
            **kwargs: Extra keyword arguments forwarded unchanged to
                ``ak.TextClassifier``.

        Returns:
            ak.TextClassifier: AutoKERAS text classification class.
        """
        # Settings read from the instance and shared by every call.
        instance_settings = dict(
            loss=self.loss,
            metrics=self.metrics,
            project_name=self.project_name,
            max_trials=self.max_trials,
            directory=self.directory,
            objective=self.objective,
            tuner=self.tuner,
            overwrite=self.overwrite,
            seed=self.seed,
            max_model_size=self.max_model_size,
        )
        # Both mappings are splatted (not merged), so a key repeated in
        # ``kwargs`` still raises TypeError instead of silently overriding.
        return ak.TextClassifier(
            num_classes=num_classes,
            multi_label=multi_label,
            **instance_settings,
            **kwargs,
        )
Пример #2
0
    def experiment02(self):
        """Fit a default ``ak.TextClassifier`` and predict on ``self.c_x_test``.

        Calls ``self.load_trainingset(False)`` first, then fits on
        ``self.c_x_train`` / ``self.c_y_train``. The predictions are neither
        returned nor stored.
        """
        self.load_trainingset(False)

        classifier = ak.TextClassifier()
        classifier.fit(self.c_x_train, self.c_y_train)
        predictions = classifier.predict(self.c_x_test)
Пример #3
0
def main():
    """Fit, final-fit, evaluate and predict with an AutoKeras text classifier.

    Every stage prints a banner before it starts and its wall-clock duration
    once it has finished.

    NOTE(review): ``x_train``, ``y_train``, ``x_test`` and ``y_test`` are never
    assigned in this function (dataset loading is still a FIXME), so they are
    looked up as globals -- confirm where they are meant to come from.
    """
    # Loads dataset.
    # FIXME [implement] >>
    # Disabled placeholder loader, kept for reference only:
    #   (x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
    #   x_train = x_train.reshape(x_train.shape + (1,))
    #   x_test = x_test.reshape(x_test.shape + (1,))

    #--------------------
    elapsed_message = '\tElapsed time = {}'
    classifier = ak.TextClassifier(verbose=True)

    print('Fitting...')
    began_at = time.time()
    classifier.fit(x_train, y_train, time_limit=12 * 60 * 60)  # time_limit in secs.
    print(elapsed_message.format(time.time() - began_at))

    print('Final Fitting...')
    began_at = time.time()
    classifier.final_fit(x_train, y_train, x_test, y_test, retrain=True)
    print(elapsed_message.format(time.time() - began_at))

    print('Evaluating...')
    began_at = time.time()
    accuracy = classifier.evaluate(x_test, y_test)
    print(elapsed_message.format(time.time() - began_at))

    print('Accuracy =', accuracy * 100)

    print('Predicting...')
    began_at = time.time()
    predictions = classifier.predict(x_test)
    print(elapsed_message.format(time.time() - began_at))

    print('Predictions =', predictions)
def train_model(df):
    """Search for, train and evaluate an AutoKeras text classifier on reviews.

    The reviews are tokenized with a Keras ``Tokenizer``, decoded back into
    space-joined word strings, split 80/20 into a train and a test part, and
    fed to ``ak.TextClassifier``. The evaluation result on the test part is
    printed.

    Args:
        df: Table with a ``'review'`` column of strings and a ``'label'``
            column holding the targets.
    """
    # NOTE(review): the embeddings index is never used below; the call is kept
    # only in case get_embeddings_index() has side effects -- confirm and drop.
    get_embeddings_index()

    reviews = df['review']
    labels = df['label']

    # NOTE(review): the vocabulary cap handed to the tokenizer is the word
    # count of the longest review (originally computed "for padding"). Kept
    # unchanged; confirm that a real vocabulary size was not intended.
    longest_review = max(len(sentence.split()) for sentence in reviews)
    tokenizer = Tokenizer(num_words=longest_review)
    tokenizer.fit_on_texts(reviews)
    sequences = tokenizer.texts_to_sequences(reviews)
    targets = np.asarray(labels)

    # Invert the vocabulary without mutating the tokenizer. The previous code
    # wrote "<PAD>", "<START>" and "<UNK>" into word_index at ids 0, 1 and 2;
    # Keras word indices start at 1, so the two most frequent words were
    # decoded as "<START>" and "<UNK>".
    id_to_word = {index: word for word, index in tokenizer.word_index.items()}

    def decode(sequence):
        """Turn one sequence of word ids back into a space-joined sentence."""
        return ' '.join(id_to_word[i] for i in sequence)

    test_fraction = 0.2
    seq_train, seq_test, y_train, y_test = train_test_split(
        sequences, targets, test_size=test_fraction, random_state=1)

    # dtype=str: np.str was only an alias of the builtin and has been removed
    # from NumPy.
    x_train = np.array([decode(sequence) for sequence in seq_train], dtype=str)
    x_test = np.array([decode(sequence) for sequence in seq_test], dtype=str)

    import autokeras as ak
    text_classifier = ak.TextClassifier(max_trials=10)

    text_classifier.fit(x_train, y_train, epochs=5)
    # Predictions are computed but not used further in this function.
    predicted_y = text_classifier.predict(x_test)
    # Evaluate the best model with testing data.
    print(text_classifier.evaluate(x_test, y_test))
Пример #5
0
def test_text_classifier(tmp_path):
    """Run a 2-trial, 2-epoch search and check export, predictions and epochs."""
    train_split, test_split = utils.imdb_raw()
    train_x, train_y = train_split
    test_x, test_y = test_split

    classifier = ak.TextClassifier(
        directory=tmp_path,
        max_trials=2,
        seed=utils.SEED,
        metrics=['accuracy'],
        objective='accuracy',
    )
    classifier.fit(train_x, train_y, epochs=2, validation_data=(test_x, test_y))
    classifier.export_model()

    # One prediction column per test sample.
    predictions = classifier.predict(test_x)
    assert predictions.shape == (len(test_x), 1)
    assert classifier.tuner._get_best_trial_epochs() == 2
Пример #6
0
def test_txt_clf_init_hp0_equals_hp_of_a_model(tmp_path):
    """Building with ``TEXT_CLASSIFIER[0]`` must register exactly its keys."""
    classifier = ak.TextClassifier(directory=tmp_path)
    # Fix the shapes by hand so the hypermodel can be built without data.
    text_input = classifier.inputs[0]
    text_input.shape = (1, )
    classifier.outputs[0].in_blocks[0].output_shape = (10, )

    expected_hp = task_specific.TEXT_CLASSIFIER[0]
    hyperparameters = kerastuner.HyperParameters()
    hyperparameters.values = copy.copy(expected_hp)

    classifier.tuner.hypermodel.build(hyperparameters)

    registered_names = set(hyperparameters._hps.keys())
    assert set(expected_hp.keys()) == registered_names
Пример #7
0
def main():
    """Time an AutoKeras text-classifier search on the ``imdb_raw`` data.

    Prints the second value returned by ``evaluate`` as a percentage and the
    wall-clock duration of ``fit`` in seconds, both rounded to two decimals.
    """
    train_split, test_split = imdb_raw()
    x_train, y_train = train_split
    x_test, y_test = test_split
    classifier = ak.TextClassifier(
        max_trials=10, directory='tmp_dir', overwrite=True)

    fit_started = timeit.default_timer()
    classifier.fit(x_train, y_train)
    fit_seconds = timeit.default_timer() - fit_started

    accuracy = classifier.evaluate(x_test, y_test)[1]
    accuracy_percent = round(accuracy * 100, 2)
    print('Accuracy: {accuracy}%'.format(accuracy=accuracy_percent))
    print('Total time: {time} seconds.'.format(time=round(fit_seconds, 2)))
Пример #8
0
def genre_prediction():
    """Fit a multi-label AutoKeras text classifier on film subtitles.

    Data frames come from ``GenrePredictionModel.load_data`` on
    ``data/film_data_lots.xlsx``; the ``"Subtitles"`` column is the input text
    and the model's ``training_labels`` / ``validation_labels`` are the
    targets.
    """
    # Named differently from the function so the local no longer shadows it.
    genre_model = GenrePredictionModel(vectorizer=MultiVectorizer())

    training_data_df, validation_data_df = genre_model.load_data(
        "data/film_data_lots.xlsx", no_sentences=True)

    clf = ak.TextClassifier(max_trials=4, multi_label=True)

    X_train = np.array(training_data_df["Subtitles"].tolist())
    y_train = genre_model.training_labels

    validation_set = (
        np.array(validation_data_df["Subtitles"].tolist()),
        genre_model.validation_labels,
    )

    clf.fit(X_train, y_train, validation_data=validation_set)
Пример #9
0
def test_txt_clf_init_hp2_equals_hp_of_a_model(tmp_path):
    """Building with ``TEXT_CLASSIFIER[2]`` must register exactly its keys."""
    classifier = ak.TextClassifier(directory=tmp_path)

    # Fill in by hand the attributes that would normally come from the data.
    text_input = classifier.inputs[0]
    text_input.shape = (1, )
    text_input.batch_size = 6
    text_input.num_samples = 1000
    classifier.outputs[0].in_blocks[0].shape = (10, )
    classifier.tuner.hypermodel.hypermodel.epochs = 1000
    classifier.tuner.hypermodel.hypermodel.num_samples = 20000

    expected_hp = task_specific.TEXT_CLASSIFIER[2]
    hyperparameters = keras_tuner.HyperParameters()
    hyperparameters.values = copy.copy(expected_hp)

    classifier.tuner.hypermodel.build(hyperparameters)

    registered_names = set(hyperparameters._hps.keys())
    assert set(expected_hp.keys()) == registered_names
Пример #10
0
def run_auto_keras():
    """Search for, train and save an AutoKeras text classifier.

    Reads ``./data/02/emotions_full.csv``, uses the ``lemma`` column as input
    text and the ``sentiment`` column as target, splits off 20% of the rows,
    runs the search with balanced class weights and early stopping, and saves
    the exported Keras model under ``./models``.

    NOTE(review): the held-out ``X_test`` / ``y_test`` part is never used for
    evaluation here.
    """
    df = pd.read_csv("./data/02/emotions_full.csv", index_col=0)

    y = np.array(df["sentiment"].astype("str"))
    X = np.array(df["lemma"].astype("str"))

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=.2,
                                                        random_state=1)

    # ``classes`` and ``y`` are passed by keyword: scikit-learn 1.0+ rejects
    # them as positional arguments.
    weights = class_weight.compute_class_weight('balanced',
                                                classes=np.unique(y_train),
                                                y=y_train)
    # Weight i belongs to the i-th entry of np.unique(y_train).
    class_weights = dict(enumerate(weights))

    # Initialize the text classifier.
    model = ak.TextClassifier(overwrite=True,
                              max_trials=4,
                              metrics="accuracy",
                              objective=kt.Objective("accuracy",
                                                     direction="max"),
                              loss="categorical_crossentropy")

    early_stopping = tf.keras.callbacks.EarlyStopping(
        min_delta=0.001,  # minimum amount of change to count as an improvement
        patience=3,  # how many epochs to wait before stopping
        restore_best_weights=True,
    )

    model.fit(X_train,
              y_train,
              epochs=100,
              class_weight=class_weights,
              callbacks=[early_stopping])

    # Export as a Keras Model. The returned object is what gets saved; the
    # previous code discarded it and called save() on the AutoKeras task.
    keras_model = model.export_model()

    print(
        type(keras_model)
    )  # <class 'tensorflow.python.keras.engine.training.Model'>

    # Best effort: try the TensorFlow SavedModel format first and fall back to
    # a single HDF5 file if that fails.
    try:
        keras_model.save("./models/model_autokeras", save_format="tf")
    except Exception:
        keras_model.save("./models/model_autokeras.h5")
Пример #11
0
def test_text_classifier(tmp_path):
    """Run a tiny search on generated text and check predictions and epochs.

    The same 320 generated samples are used for training and validation.
    """
    train_x = utils.generate_text_data(num_instances=320)
    train_y = np.random.randint(0, 2, 320)
    validation_pair = (train_x, train_y)

    classifier = ak.TextClassifier(
        directory=tmp_path,
        max_trials=2,
        seed=utils.SEED,
        metrics=["accuracy"],
        objective="accuracy",
    )
    classifier.fit(
        train_x,
        train_y,
        epochs=2,
        validation_data=validation_pair,
        batch_size=6,
    )
    classifier.export_model()

    predictions = classifier.predict(train_x)
    assert predictions.shape == (len(train_x), 1)
    assert classifier.tuner._get_best_trial_epochs() <= 2
def main(
    input_filepath: str = typer.Argument(
        ..., help="Filepath to the TSV-formatted train dataset."),
    output_directory: str = typer.Argument(
        "./",
        help=
        ("Directory to save the output generated during the search. The best model will be saved"
         " as 'output_directory/model_autokeras' or 'output_directory/model_autokeras.h5'"
         ),
    ),
    max_trials: int = typer.Option(
        1000,
        help=("The maximum number of different Keras Models to try."
              " The search may finish before reaching the max_trials."),
    ),
):
    # Fits an AutoKeras text classifier on a two-column TSV file (text, label;
    # no header row) and saves the exported best model in `output_directory`.
    # Documented with comments rather than a docstring so that any
    # docstring-derived CLI help stays exactly as it was.
    frame = pd.read_csv(
        input_filepath,
        sep="\t",
        header=None,
        names=["text", "labels"],
    )
    texts = frame["text"].values.astype(str)
    targets = frame["labels"].values

    output_directory = Path(output_directory)
    output_directory.mkdir(parents=True, exist_ok=True)

    classifier = ak.TextClassifier(
        max_trials=max_trials,
        directory=output_directory,
        seed=RANDOM_STATE,
    )
    classifier.fit(texts, targets)

    model = classifier.export_model()
    # Prefer the TensorFlow SavedModel format; fall back to one HDF5 file when
    # saving raises ImportError.
    try:
        output_filepath = output_directory / "model_autokeras"
        model.save(output_filepath, save_format="tf")
    except ImportError:
        output_filepath = output_directory / "model_autokeras.h5"
        model.save(output_filepath)
    typer.secho(
        f"Best model saved to {output_filepath.absolute()}.",
        bold=True,
    )
Пример #13
0
    def __init__(self,
                 model_pars=None,
                 data_pars=None,
                 compute_pars=None,
                 out_pars=None):
        """Create the AutoKeras task selected by ``model_pars["model_name"]``.

        Args:
            model_pars: Mapping with ``"model_name"`` (``"text"``, ``"vision"``
                or ``"tabular_classifier"``) and ``"max_trials"``. When None,
                ``self.model`` is set to None and nothing else happens.
            data_pars: Not used in this block.
            compute_pars: Not used in this block.
            out_pars: Not used in this block.

        NOTE(review): for any other ``model_name`` the ``model`` attribute is
        left unset, exactly as before -- confirm whether that is intended.
        """
        ### Model Structure        ################################

        if model_pars is None:
            self.model = None
            # __init__ must return None; the previous ``return self`` raised
            # TypeError whenever this branch was taken.
            return

        # Initialize the requested task.
        # It tries n different models.
        if model_pars["model_name"] == "text":
            # Initialize the TextClassifier
            self.model = ak.TextClassifier(max_trials=model_pars['max_trials'])
        elif model_pars["model_name"] == "vision":
            # Initialize the ImageClassifier.
            self.model = ak.ImageClassifier(
                max_trials=model_pars['max_trials'])
        elif model_pars["model_name"] == "tabular_classifier":
            # Initialize the classifier.
            self.model = ak.StructuredDataClassifier(
                max_trials=model_pars['max_trials'])
Пример #14
0
    map(lambda sentence: ' '.join(id_to_word[i] for i in sentence), x_train))
# Decode every test sample from word ids back into a space-joined sentence.
x_test = [' '.join(id_to_word[i] for i in sentence) for sentence in x_test]
# dtype=str: np.str was only an alias of the builtin and has been removed
# from NumPy.
x_train = np.array(x_train, dtype=str)
x_test = np.array(x_test, dtype=str)
print(x_train.shape)  # (25000,)
print(y_train.shape)  # (25000, 1)
print(x_train[0][:50])  # <START> this film was just brilliant casting <UNK>
"""
The second step is to run the [TextClassifier](/text_classifier).
"""

import autokeras as ak

# Initialize the text classifier.
clf = ak.TextClassifier(max_trials=1)  # It only tries 1 model as a quick demo.
# Feed the text classifier with training data.
clf.fit(x_train, y_train, epochs=2)
# Predict with the best model.
predicted_y = clf.predict(x_test)
# Evaluate the best model with testing data.
print(clf.evaluate(x_test, y_test))
"""
## Validation Data
By default, AutoKeras use the last 20% of training data as validation data.
As shown in the example below, you can use `validation_split` to specify the percentage.
"""

clf.fit(
    x_train,
    y_train,
Пример #15
0
# Convert the test split to NumPy arrays (`test_data` is defined earlier in
# the script, outside this excerpt).
x_test = np.array(test_data.data)
y_test = np.array(test_data.target)

# Sanity check of the prepared data; the trailing comments show the values
# the tutorial expects to see.
print(x_train.shape)  # (25000,)
print(y_train.shape)  # (25000, 1)
print(x_train[0][:50])  # this film was just brilliant casting
"""
The second step is to run the [TextClassifier](/text_classifier).
As a quick demo, we set epochs to 2.
You can also leave the epochs unspecified for an adaptive number of epochs.
"""

import autokeras as ak

# Initialize the text classifier.
clf = ak.TextClassifier(overwrite=True,
                        max_trials=1)  # It only tries 1 model as a quick demo.
# Feed the text classifier with training data.
clf.fit(x_train, y_train, epochs=2)
# Predict with the best model.
predicted_y = clf.predict(x_test)
# Evaluate the best model with testing data.
print(clf.evaluate(x_test, y_test))
"""
## Validation Data
By default, AutoKeras use the last 20% of training data as validation data.
As shown in the example below, you can use `validation_split` to specify the percentage.
"""

clf.fit(
    x_train,
    y_train,
Пример #16
0
# Separate labels and features
X_train = df_train['Sentence']
y_train = df_train['Polarity']

# Load Testing data
print('Reading test set...', end='')
df_test = pd.read_csv("data/sentiment_test.csv")
print('Done.')

# Separate labels and features
X_test = df_test['Sentence']
y_test = df_test['Polarity']

# Instantiate classifier object
classifier = ak.TextClassifier(max_trials=30, seed=42)

# Clean up sentence data using custom tokenizer, convert datatypes for autokeras.
# dtype=str: np.str was only an alias of the builtin and has been removed
# from NumPy.
X_train_clean = np.array(X_train.apply(spacy_tokenizer_string), dtype=str)
X_test_clean = np.array(X_test.apply(spacy_tokenizer_string), dtype=str)

# Convert datatypes for compatibility with autokeras
y_train_clean = np.array(y_train)
y_test_clean = np.array(y_test)

# Fit the autokeras classifier
classifier.fit(X_train_clean, y_train_clean, epochs=5)

# Extract the best model from search function.
# Note that due to some bugs in autokeras, the best model needs to be extracted by pausing execution using debug mode
# and recording the model layers and hyperparameters.
Пример #17
0
 def get_auto_model(self):
     """Return a 10-trial ``ak.TextClassifier`` that overwrites ``self.tmp_dir``."""
     search_options = dict(
         max_trials=10,
         directory=self.tmp_dir,
         overwrite=True,
     )
     return ak.TextClassifier(**search_options)
Пример #18
0
    map(lambda sentence: ' '.join(id_to_word[i] for i in sentence), x_train))
# Decode every test sample from word ids back into a space-joined sentence.
x_test = [' '.join(id_to_word[i] for i in sentence) for sentence in x_test]
# dtype=str: np.str was only an alias of the builtin and has been removed
# from NumPy.
x_train = np.array(x_train, dtype=str)
x_test = np.array(x_test, dtype=str)
print(x_train.shape)  # (25000,)
print(y_train.shape)  # (25000, 1)
print(x_train[0][:50])  # <START> this film was just brilliant casting <UNK>
"""
The second step is to run the [TextClassifier](/text_classifier).
"""

import autokeras as ak

# Initialize the text classifier.
clf = ak.TextClassifier(overwrite=True,
                        max_trials=1)  # It only tries 1 model as a quick demo.
# Feed the text classifier with training data.
clf.fit(x_train, y_train, epochs=2)
# Predict with the best model.
predicted_y = clf.predict(x_test)
# Evaluate the best model with testing data.
print(clf.evaluate(x_test, y_test))
"""
## Validation Data
By default, AutoKeras use the last 20% of training data as validation data.
As shown in the example below, you can use `validation_split` to specify the percentage.
"""

clf.fit(
    x_train,
    y_train,
Пример #19
0
    word_to_id = tf.keras.datasets.imdb.get_word_index()
    word_to_id = {k: (v + index_offset) for k, v in word_to_id.items()}
    word_to_id["<PAD>"] = 0
    word_to_id["<START>"] = 1
    word_to_id["<UNK>"] = 2

    id_to_word = {value: key for key, value in word_to_id.items()}
    x_train = list(
        map(lambda sentence: " ".join(id_to_word[i] for i in sentence),
            x_train))
    x_test = list(
        map(lambda sentence: " ".join(id_to_word[i] for i in sentence),
            x_test))
    x_train = np.array(x_train, dtype=np.str)
    x_test = np.array(x_test, dtype=np.str)
    return (x_train, y_train), (x_test, y_test)


# Prepare the data.
(x_train, y_train), (x_test, y_test) = imdb_raw()
# Sanity check of the prepared data; the trailing comments show the values
# the example expects to see.
print(x_train.shape)  # (25000,)
print(y_train.shape)  # (25000, 1)
print(x_train[0][:50])  # <START> this film was just brilliant casting <UNK>

# Initialize the TextClassifier
clf = ak.TextClassifier(max_trials=3)
# Search for the best model.
clf.fit(x_train, y_train, epochs=2)
# Evaluate on the testing data.
# NOTE(review): the whole return value of evaluate() is printed under the
# label "Accuracy" -- confirm whether it is a scalar or a list of metrics.
print("Accuracy: {accuracy}".format(accuracy=clf.evaluate(x_test, y_test)))
Пример #20
0
def test_txt_clf_fit_call_auto_model_fit(fit, tmp_path):
    """``TextClassifier.fit`` must delegate to the patched ``fit``.

    Args:
        fit: Replacement for the underlying fit method.
            NOTE(review): assumed to be a ``unittest.mock`` mock injected by a
            patch decorator outside this excerpt -- confirm.
        tmp_path: Temporary directory used as the search directory.
    """
    auto_model = ak.TextClassifier(directory=tmp_path, seed=utils.SEED)

    auto_model.fit(x=np.array(["a b c", "b b c"]), y=np.array([1, 2]))

    # ``called`` is the real Mock flag. ``is_called`` is not part of the Mock
    # API: on a mock it is an auto-created attribute that is always truthy, so
    # the previous assertion could never fail.
    assert fit.called
Пример #21
0
def test_imdb_accuracy_over_84(tmp_path):
    """A 2-trial, 2-epoch search must score at least 0.84 on the test split.

    The score checked is the second value returned by ``evaluate``.
    """
    train_split, test_split = utils.imdb_raw(num_instances=None)
    x_train, y_train = train_split
    x_test, y_test = test_split

    classifier = ak.TextClassifier(max_trials=2, directory=tmp_path)
    classifier.fit(x_train, y_train, epochs=2)

    scores = classifier.evaluate(x_test, y_test)
    assert scores[1] >= 0.84
Пример #22
0
def test_imdb_accuracy_over_92(tmp_path):
    """A 3-trial, 1-epoch search must score at least 0.92 on the test split.

    The score checked is the second value returned by ``evaluate``.
    """
    train_split, test_split = imdb_raw(num_instances=None)
    x_train, y_train = train_split
    x_test, y_test = test_split

    classifier = ak.TextClassifier(max_trials=3, directory=tmp_path)
    classifier.fit(x_train, y_train, batch_size=6, epochs=1)

    scores = classifier.evaluate(x_test, y_test)
    assert scores[1] >= 0.92
Пример #23
0
def task_api():
    """Run a 3-trial text-classifier search and return its evaluation result.

    Training uses the ``imdb_raw`` train split with a 20% validation split;
    the returned value is whatever ``evaluate`` yields on the test split.
    """
    train_split, test_split = imdb_raw()
    x_train, y_train = train_split
    x_test, y_test = test_split

    classifier = ak.TextClassifier(max_trials=3, seed=5)
    classifier.fit(x_train, y_train, validation_split=0.2)

    evaluation = classifier.evaluate(x_test, y_test)
    return evaluation
Пример #24
0
def test_text_classifier(tmp_dir):
    """Predictions of a 2-trial search must have one column per test sample."""
    train_split, test_split = imdb_raw()
    train_x, train_y = train_split
    # The test labels are not needed by this test.
    test_x, test_y = test_split

    classifier = ak.TextClassifier(directory=tmp_dir, max_trials=2)
    classifier.fit(train_x, train_y, epochs=2, validation_split=0.2)

    predictions = classifier.predict(test_x)
    assert predictions.shape == (len(test_x), 1)
Пример #25
0
    x_test = list(
        map(lambda sentence: " ".join(id_to_word[i] for i in sentence), x_test)
    )
    x_train = np.array(x_train, dtype=np.str)
    x_test = np.array(x_test, dtype=np.str)
    return (x_train, y_train), (x_test, y_test)


# Prepare the data.
(x_train, y_train), (x_test, y_test) = reuters_raw()
print(x_train.shape)  # (8982,)
print(y_train.shape)  # (8982, 1)
print(x_train[0][:50])  # <START> <UNK> <UNK> said as a result of its decemb

# Initialize the TextClassifier
clf = ak.TextClassifier(
    max_trials=5,
    overwrite=True,
)

# Callback to avoid overfitting with the EarlyStopping.
cbs = [
    tf.keras.callbacks.EarlyStopping(patience=3),
]

# Search for the best model. The keyword is ``callbacks`` (plural); the
# previous ``callback=cbs`` did not match it, so the early-stopping callback
# was not passed as intended.
clf.fit(x_train, y_train, epochs=10, callbacks=cbs)

# Evaluate on the testing data.
print("Accuracy: {accuracy}".format(accuracy=clf.evaluate(x_test, y_test)))
Пример #26
0
def test_text_classifier(tmp_path):
    """Run a 2-trial, 1-epoch search, export it, and check prediction shape."""
    train_split, test_split = utils.imdb_raw()
    train_x, train_y = train_split
    test_x, test_y = test_split

    classifier = ak.TextClassifier(
        directory=tmp_path,
        max_trials=2,
        seed=utils.SEED,
    )
    classifier.fit(train_x, train_y, epochs=1, validation_data=(test_x, test_y))
    classifier.export_model()

    predictions = classifier.predict(test_x)
    assert predictions.shape == (len(test_x), 1)
Пример #27
0
def test_text_classifier(tmp_dir):
    """Run a 2-trial, 1-epoch search and check the prediction shape."""
    train_split, test_split = common.imdb_raw()
    train_x, train_y = train_split
    test_x, test_y = test_split

    classifier = ak.TextClassifier(
        directory=tmp_dir,
        max_trials=2,
        seed=common.SEED,
    )
    classifier.fit(train_x, train_y, epochs=1, validation_data=(test_x, test_y))

    predictions = classifier.predict(test_x)
    assert predictions.shape == (len(test_x), 1)
Пример #28
0
# batch_size=batch_size)

# for x, y in train_data:
# for i, a in enumerate(x.numpy()):
# for j, b in enumerate(record_x):
# if a == b:
# print('*')
# assert record_y[j] == y.numpy()[i]

# import numpy as np
# x_train = []
# y_train = []
# for x, y in train_data:
# for a in x.numpy():
# x_train.append(a)
# for a in y.numpy():
# y_train.append(a)

# x_train = np.array(x_train)
# y_train = np.array(y_train)

# train_data = train_data.shuffle(1000, seed=123, reshuffle_each_iteration=False)

# Two-trial search; `train_data`, `val_data` and `test_data` are defined
# earlier in the script, outside this excerpt.
clf = ak.TextClassifier(overwrite=True, max_trials=2)
# Alternative fit calls kept from earlier experiments (disabled):
# clf.fit(train_data, validation_data=test_data)
# clf.fit(train_data, validation_data=train_data)
# Active variant: train on `train_data`, validate on `val_data`.
clf.fit(train_data, validation_data=val_data)
# clf.fit(x_train, y_train)
# clf.fit(train_data)
# Print the evaluation result on the test data.
print(clf.evaluate(test_data))