Example #1
def prepare_data(featurizer, dim):
    if featurizer not in ('ngram', 'glove'):
        print("Please choose featurizer: 'ngram' or 'glove'.")
        return

    # Load and preprocess the data.
    train_data = load_train_data()
    train_data = process_train_data(train_data)
    test_data = load_test_data_a()
    test_data = process_test_data(test_data)

    # Get training X, y, and testing X, y
    if featurizer == 'ngram':
        train_set = build_ngrams_dataset(train_data)
        vectorizer = train_set['vectorizer']
        test_set = build_ngrams_dataset(test_data, vectorizer=vectorizer)
    else:
        train_set = build_glove_featurized_dataset(train_data, dim)
        test_set = build_glove_featurized_dataset(test_data, dim)
    train_X = train_set['X']
    train_y = train_set['y']
    print("Shape of train_X: {}".format(train_X.shape))
    test_X = test_set['X']
    test_y = test_set['y']
    print("Shape of test_X: {}".format(test_X.shape))
    return {'train_X': train_X,
            'train_y': train_y,
            'test_X': test_X,
            'test_y': test_y}
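
A hypothetical usage sketch: the dim argument is only consumed by the GloVe branch, so it can be anything for the n-gram featurizer. The 300 below is an assumed GloVe dimension, not a value fixed by the code above.

# Hypothetical calls; 300 is an assumed GloVe embedding dimension.
ngram_splits = prepare_data('ngram', None)
glove_splits = prepare_data('glove', 300)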
Example #2
def main(argv):

    # Load the fitted model and preprocessing helpers saved during training.
    helpers = joblib.load('titanic.pkl')
    # argv[1] is expected to be the path of the test CSV.
    data = pd.read_csv(argv[1])

    # Keep PassengerId for the output file, then featurize and predict.
    result = pd.DataFrame()
    result['PassengerId'] = data['PassengerId']
    X = pp.process_test_data(data, helpers)
    result['Survived'] = helpers['model'].predict(X)
    result.to_csv('result.csv', index=False)

    print("Prediction saved to file result.csv")
def BERT_model(max_set_length=128,
               max_iter=2,
               batch_size=32,
               eta=2e-5,
               eps=1e-8):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True)
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",  # the 12-layer BERT model with an uncased vocab
        num_labels=2,  # number of output labels; 2 for binary classification,
                       # increase for multi-class tasks
        output_attentions=False,  # whether the model returns attention weights
        output_hidden_states=False,  # whether the model returns all hidden states
    )

    train_data = load_train_data()
    train_data = process_train_data(train_data)
    X_train, y_train = list(train_data['tweet']), list(train_data['subtask_a'])

    bert_classifier = TorchBertClassifier(tokenizer=tokenizer,
                                          model=model,
                                          optimizer=AdamW,
                                          max_set_length=max_set_length,
                                          max_iter=max_iter,
                                          batch_size=batch_size,
                                          eta=eta,
                                          eps=eps)
    print(bert_classifier)

    bert_classifier.fit(X_train, y_train)

    test_data = load_test_data_a()
    test_data = process_test_data(test_data)
    X_test, y_test = list(test_data['tweet']), list(test_data['subtask_a'])

    predictions = bert_classifier.predict(X_test)
    test_data['prediction'] = np.array(predictions)
    if not os.path.exists(RESULT_FOLDER):
        os.makedirs(RESULT_FOLDER)
    output_file_path = os.path.join(
        RESULT_FOLDER, "BERT_Iter_{}_prediction.csv".format(max_iter))
    test_data.to_csv(output_file_path, index=False)

    print("\nClassification report:")
    print(classification_report(y_test, predictions))

    F1_score = f1_score(change_to_binary(y_test),
                        change_to_binary(predictions),
                        average='macro')
    print("f1 score: {}".format(F1_score))
    return F1_score
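
The change_to_binary helper is not defined in these examples. Given the 'subtask_a' column, the data looks like OffensEval-style OFF/NOT labels, so a plausible sketch is the following; the label names are an assumption.

# Hypothetical sketch of change_to_binary; assumes OffensEval-style
# 'OFF'/'NOT' string labels, which is not confirmed by the source.
def change_to_binary(labels):
    return [1 if label == 'OFF' else 0 for label in labels]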
Example #4
def BiLSTM_CNN_model(embed_dim=50,
                     batch_size=1024,
                     max_iter=10,
                     hidden_dim=50,
                     bidirectional=True,
                     out_channels=30,
                     kernel_sizes=[3, 4, 5],
                     dropout_prob=0.1):
    start_time = time.time()
    vocab, embedding = generate_glove_embedding(embed_dim)

    train_data = load_train_data()
    train_data = process_train_data(train_data)
    X_train, y_train = build_LSTM_dataset(train_data, 128)

    mod = TorchLSTM_CNNClassifier(vocab=vocab,
                                  embedding=embedding,
                                  embed_dim=embed_dim,
                                  max_iter=max_iter,
                                  bidirectional=bidirectional,
                                  hidden_dim=hidden_dim,
                                  out_channels=out_channels,
                                  kernel_sizes=kernel_sizes,
                                  dropout_prob=dropout_prob,
                                  batch_size=batch_size)

    mod.fit(X_train, y_train)

    test_data = load_test_data_a()
    test_data = process_test_data(test_data)
    X_test, y_test = build_LSTM_dataset(test_data, 128)

    predictions = mod.predict(X_test)
    test_data['prediction'] = np.array(predictions)
    if not os.path.exists(RESULT_FOLDER):
        os.makedirs(RESULT_FOLDER)
    output_file_path = os.path.join(
        RESULT_FOLDER,
        "BiLSTM_CNN_{}-embedding_{}-batchsize_{}-hidden_{}-filters_{}-iter_prediction.csv"
        .format(embed_dim, batch_size, hidden_dim, out_channels, max_iter))
    test_data.to_csv(output_file_path, index=False)

    print("\nClassification report:")
    print(classification_report(y_test, predictions))

    f1_macro = f1_score(change_to_binary(y_test),
                        change_to_binary(predictions),
                        average='macro')
    print("BiLSTM+CNN embedding dim: {}, batch size: {}, hiddend dim: {}, out channels: {}, max_iter: {}, dropout: {}, macro f1 score: {}" \
        .format(embed_dim, batch_size, hidden_dim, out_channels, max_iter, dropout_prob, f1_macro))

    end_time = time.time()
    print("Finish BiLSTM+CNN in {} mins.".format((end_time - start_time) / 60))
    return f1_macro
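
A hypothetical usage sketch showing how the returned macro F1 could be compared across filter counts; the candidate values are assumptions, not settings from the source.

# Hypothetical hyperparameter sweep over out_channels.
for channels in (30, 60, 100):
    score = BiLSTM_CNN_model(out_channels=channels)
    print("out_channels={}: macro F1 {:.4f}".format(channels, score))
Example #5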
def LSTM_model(embed_dim=300,
               max_iter=100,
               batch_size=32,
               hidden_dim=50,
               eta=0.001,
               bidirectional=False):
    vocab, embedding = generate_glove_embedding(embed_dim)

    train_data = load_train_data()
    train_data = process_train_data(train_data)
    X_train, y_train = build_LSTM_dataset(train_data, 128)

    mod = TorchLSTMClassifier(vocab=vocab,
                              embedding=embedding,
                              embed_dim=embed_dim,
                              max_iter=max_iter,
                              batch_size=batch_size,
                              eta=eta,
                              bidirectional=bidirectional,
                              hidden_dim=hidden_dim)

    print(mod)

    mod.fit(X_train, y_train)

    test_data = load_test_data_a()
    test_data = process_test_data(test_data)
    X_test, y_test = build_LSTM_dataset(test_data, 128)

    predictions = mod.predict(X_test)
    test_data['prediction'] = np.array(predictions)
    if not os.path.exists(RESULT_FOLDER):
        os.makedirs(RESULT_FOLDER)
    output_file_path = os.path.join(
        RESULT_FOLDER, "LSTM_{}-embedding_{}-hidden_prediction.csv".format(
            embed_dim, hidden_dim))
    test_data.to_csv(output_file_path, index=False)

    print("\nClassification report:")
    print(classification_report(y_test, predictions))

    F1_score = f1_score(change_to_binary(y_test),
                        change_to_binary(predictions),
                        average='macro')

    print("LSTM embedding dim: {}, f1 score: {}".format(embed_dim, F1_score))
    return F1_score
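
A short usage sketch comparing the unidirectional and bidirectional settings; this only illustrates the API and implies no results.

# Hypothetical comparison run; no results are implied.
f1_uni = LSTM_model(bidirectional=False)
f1_bi = LSTM_model(bidirectional=True)
print("BiLSTM gain over LSTM: {:.4f}".format(f1_bi - f1_uni))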
Example #6
def get_prediction():

    helpers = joblib.load('titanic.pkl')
    model = helpers['model']

    passenger = {}

    passenger['Name'] = request.args.get('n')
    passenger['Sex'] = request.args.get('s')
    passenger['Age'] = int(float(request.args.get('a')))
    passenger['Fare'] = float(request.args.get('f'))
    passenger['Pclass'] = int(float(request.args.get('c')))
    passenger['SibSp'] = int(float(request.args.get('si')))
    passenger['Parch'] = int(float(request.args.get('p')))
    passenger['Embarked'] = request.args.get('e')
    passenger['Cabin'] = request.args.get('ca')

    data = pd.DataFrame(passenger, index=[0])

    X = pp.process_test_data(data, helpers)
    survived = model.predict(X)[0]  # predict returns an array; take the single prediction

    survived = 'yes' if survived else 'no'
    return jsonify({'survived': survived})
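
get_prediction reads request.args and returns jsonify, so it is presumably a Flask view. A minimal wiring sketch follows; the app object, route path, and example query string are assumptions.

# Hypothetical Flask wiring; the route path is an assumption.
from flask import Flask
app = Flask(__name__)
app.add_url_rule('/predict', view_func=get_prediction)
# e.g. GET /predict?n=John&s=male&a=30&f=7.25&c=3&si=0&p=0&e=S&ca=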
Example #7
IMG_SIZE = 100  # resize image to this height and width
lr = 0.0001  # learning rate
epochs = 25  # number of times model sees full data

MODEL_NAME = 'dogsvscats-{}-{}.model'.format(lr, 'integrated')

# Ask the user whether to load or process.
# The first run must process the data; subsequent runs can load it,
# UNLESS IMG_SIZE has changed.
decision1 = input(
    'Load pre-existing preprocessed data for training (L) or preprocess data (P)? '
)
if decision1 == 'P':
    train_data = pp.create_train_data(TRAIN_DIR=TRAIN_DIR, IMG_SIZE=IMG_SIZE)
    test_data = pp.process_test_data(TEST_DIR=TEST_DIR, IMG_SIZE=IMG_SIZE)
elif decision1 == 'L':
    if os.path.exists('train_data.npy'):
        # allow_pickle=True is required on recent NumPy versions because the
        # saved arrays hold [image, label] object pairs.
        train_data = np.load('train_data.npy', allow_pickle=True)
        test_data = np.load('test_data.npy', allow_pickle=True)
    else:
        raise Exception(
            'No preprocessed data exists in path, please preprocess some.')
else:
    raise Exception('Please retry and type L or P')
'''
The 25000 training images are now IMG_SIZE x IMG_SIZE grayscale arrays, each
attached to a one-hot class label ([1, 0] for cat, [0, 1] for dog), in random
order.
'''

# Split data into training (24500) and validation (500) sets
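
A plausible sketch of the split the comment describes, following the 24500/500 counts above; the variable names and the reshape for a grayscale channel are assumptions.

# Hypothetical split; keeps the last 500 examples for validation.
train = train_data[:-500]
validation = train_data[-500:]
X_train = np.array([item[0] for item in train]).reshape(-1, IMG_SIZE, IMG_SIZE, 1)
y_train = np.array([item[1] for item in train])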