def prepare_data(featurizer, dim):
    if featurizer not in ('ngram', 'glove'):
        print("Please choose featurizer: 'ngram' or 'glove'.")
        return

    # Load and preprocess the data.
    train_data = load_train_data()
    train_data = process_train_data(train_data)
    test_data = load_test_data_a()
    test_data = process_test_data(test_data)

    # Build training X, y and testing X, y.
    if featurizer == 'ngram':
        train_set = build_ngrams_dataset(train_data)
        # Reuse the vectorizer fitted on the training set so the test
        # features live in the same vocabulary space.
        vectorizer = train_set['vectorizer']
        test_set = build_ngrams_dataset(test_data, vectorizer=vectorizer)
    else:
        train_set = build_glove_featurized_dataset(train_data, dim)
        test_set = build_glove_featurized_dataset(test_data, dim)

    train_X = train_set['X']
    train_y = train_set['y']
    print("Shape of train_X: {}".format(train_X.shape))
    test_X = test_set['X']
    test_y = test_set['y']
    print("Shape of test_X: {}".format(test_X.shape))
    return {'train_X': train_X,
            'train_y': train_y,
            'test_X': test_X,
            'test_y': test_y}
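# A minimal usage sketch: the returned dict plugs directly into any
# scikit-learn estimator. LogisticRegression is illustrative here, not
# necessarily the model used elsewhere in this project.
from sklearn.linear_model import LogisticRegression

data = prepare_data('ngram', dim=None)  # dim is only used by the 'glove' featurizer
if data is not None:
    clf = LogisticRegression(max_iter=1000)
    clf.fit(data['train_X'], data['train_y'])
    print("Test accuracy: {:.3f}".format(clf.score(data['test_X'], data['test_y'])))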
def main(argv):
    helpers = joblib.load('titanic.pkl')
    data = pd.read_csv(argv[1])

    result = pd.DataFrame()
    result['PassengerId'] = data['PassengerId']
    X = pp.process_test_data(data, helpers)
    result['Survived'] = helpers['model'].predict(X)
    result.to_csv('result.csv', index=False)
    print("Prediction saved to file result.csv")
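# Example invocation (assumes this lives in a script, e.g. predict.py, and
# that titanic.pkl was produced beforehand by the training script):
#
#     python predict.py test.csv
#
import sys

if __name__ == '__main__':
    main(sys.argv)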
def BERT_model(max_set_length=128, max_iter=2, batch_size=32, eta=2e-5, eps=1e-8):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",         # Use the 12-layer BERT model, with an uncased vocab.
        num_labels=2,                # The number of output labels -- 2 for binary
                                     # classification. Increase this for multi-class tasks.
        output_attentions=False,     # Whether the model returns attention weights.
        output_hidden_states=False,  # Whether the model returns all hidden states.
    )

    train_data = load_train_data()
    train_data = process_train_data(train_data)
    X_train, y_train = list(train_data['tweet']), list(train_data['subtask_a'])

    BertClassifier = TorchBertClassifier(tokenizer=tokenizer,
                                         model=model,
                                         optimizer=AdamW,
                                         max_set_length=max_set_length,
                                         max_iter=max_iter,
                                         batch_size=batch_size,
                                         eta=eta,
                                         eps=eps)
    print(BertClassifier)
    BertClassifier.fit(X_train, y_train)

    test_data = load_test_data_a()
    test_data = process_test_data(test_data)
    X_test, y_test = list(test_data['tweet']), list(test_data['subtask_a'])
    predictions = BertClassifier.predict(X_test)
    test_data['prediction'] = np.array(predictions)

    if not os.path.exists(RESULT_FOLDER):
        os.makedirs(RESULT_FOLDER)
    output_file_path = os.path.join(
        RESULT_FOLDER, "BERT_Iter_{}_prediction.csv".format(max_iter))
    test_data.to_csv(output_file_path, index=False)

    print("\nClassification report:")
    print(classification_report(y_test, predictions))
    F1_score = f1_score(change_to_binary(y_test),
                        change_to_binary(predictions),
                        average='macro')
    print("f1 score: {}".format(F1_score))
    return F1_score
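# A sketch of a small sweep over fine-tuning epochs; two to four epochs is the
# range recommended for BERT fine-tuning in Devlin et al. (2019), so larger
# values mainly risk overfitting. This loop is illustrative, not part of the
# original experiment setup.
for n_epochs in (2, 3, 4):
    BERT_model(max_iter=n_epochs)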
def BiLSTM_CNN_model(embed_dim=50, batch_size=1024, max_iter=10, hidden_dim=50,
                     bidirectional=True, out_channels=30, kernel_sizes=[3, 4, 5],
                     dropout_prob=0.1):
    start_time = time.time()
    vocab, embedding = generate_glove_embedding(embed_dim)

    train_data = load_train_data()
    train_data = process_train_data(train_data)
    X_train, y_train = build_LSTM_dataset(train_data, 128)

    mod = TorchLSTM_CNNClassifier(vocab=vocab,
                                  embedding=embedding,
                                  embed_dim=embed_dim,
                                  max_iter=max_iter,
                                  bidirectional=bidirectional,
                                  hidden_dim=hidden_dim,
                                  out_channels=out_channels,
                                  kernel_sizes=kernel_sizes,
                                  dropout_prob=dropout_prob,
                                  batch_size=batch_size)
    mod.fit(X_train, y_train)

    test_data = load_test_data_a()
    test_data = process_test_data(test_data)
    X_test, y_test = build_LSTM_dataset(test_data, 128)
    predictions = mod.predict(X_test)
    test_data['prediction'] = np.array(predictions)

    if not os.path.exists(RESULT_FOLDER):
        os.makedirs(RESULT_FOLDER)
    output_file_path = os.path.join(
        RESULT_FOLDER,
        "BiLSTM_CNN_{}-embedding_{}-batchsize_{}-hidden_{}-filters_{}-iter_prediction.csv"
        .format(embed_dim, batch_size, hidden_dim, out_channels, max_iter))
    test_data.to_csv(output_file_path, index=False)

    print("\nClassification report:")
    print(classification_report(y_test, predictions))
    f1_macro = f1_score(change_to_binary(y_test),
                        change_to_binary(predictions),
                        average='macro')
    print("BiLSTM+CNN embedding dim: {}, batch size: {}, hidden dim: {}, "
          "out channels: {}, max_iter: {}, dropout: {}, macro f1 score: {}"
          .format(embed_dim, batch_size, hidden_dim, out_channels, max_iter,
                  dropout_prob, f1_macro))

    end_time = time.time()
    print("Finished BiLSTM+CNN in {:.1f} mins.".format((end_time - start_time) / 60))
    return f1_macro
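# Hypothetical sweep over the CNN-specific knobs; kernel sizes (3, 4, 5)
# follow Kim (2014)-style sentence CNNs, and the dropout values below are
# illustrative rather than values reported by this project.
for dropout in (0.1, 0.3, 0.5):
    BiLSTM_CNN_model(dropout_prob=dropout)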
def LSTM_model(embed_dim=300, max_iter=100, batch_size=32, hidden_dim=50,
               eta=0.001, bidirectional=False):
    vocab, embedding = generate_glove_embedding(embed_dim)

    train_data = load_train_data()
    train_data = process_train_data(train_data)
    X_train, y_train = build_LSTM_dataset(train_data, 128)

    mod = TorchLSTMClassifier(vocab=vocab,
                              embedding=embedding,
                              embed_dim=embed_dim,
                              max_iter=max_iter,
                              batch_size=batch_size,
                              eta=eta,
                              bidirectional=bidirectional,
                              hidden_dim=hidden_dim)
    print(mod)
    mod.fit(X_train, y_train)

    test_data = load_test_data_a()
    test_data = process_test_data(test_data)
    X_test, y_test = build_LSTM_dataset(test_data, 128)
    predictions = mod.predict(X_test)
    test_data['prediction'] = np.array(predictions)

    if not os.path.exists(RESULT_FOLDER):
        os.makedirs(RESULT_FOLDER)
    output_file_path = os.path.join(
        RESULT_FOLDER,
        "LSTM_{}-embedding_{}-hidden_prediction.csv".format(embed_dim, hidden_dim))
    test_data.to_csv(output_file_path, index=False)

    print("\nClassification report:")
    print(classification_report(y_test, predictions))
    F1_score = f1_score(change_to_binary(y_test),
                        change_to_binary(predictions),
                        average='macro')
    print("LSTM embedding dim: {}, f1 score: {}".format(embed_dim, F1_score))
    return F1_score
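# A sketch comparing GloVe embedding sizes; assumes the corresponding
# 50/100/200/300-dimensional GloVe files are available to
# generate_glove_embedding.
for d in (50, 100, 200, 300):
    LSTM_model(embed_dim=d)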
def get_prediction():
    helpers = joblib.load('titanic.pkl')
    model = helpers['model']

    # Collect the passenger's attributes from the query string.
    passenger = {}
    passenger['Name'] = request.args.get('n')
    passenger['Sex'] = request.args.get('s')
    passenger['Age'] = int(float(request.args.get('a')))
    passenger['Fare'] = float(request.args.get('f'))
    passenger['Pclass'] = int(float(request.args.get('c')))
    passenger['SibSp'] = int(float(request.args.get('si')))
    passenger['Parch'] = int(float(request.args.get('p')))
    passenger['Embarked'] = request.args.get('e')
    passenger['Cabin'] = request.args.get('ca')

    data = pd.DataFrame(passenger, index=[0])
    X = pp.process_test_data(data, helpers)
    survived = model.predict(X)[0]  # predict returns an array; take the single row
    survived = 'yes' if survived else 'no'
    return jsonify({'survived': survived})
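# A minimal sketch of how this view might be wired up; the Flask app object
# and the /predict route are assumptions, not taken from the source.
from flask import Flask, request, jsonify

app = Flask(__name__)
app.add_url_rule('/predict', view_func=get_prediction)

# Example query (all parameters are passed as query-string arguments):
#   curl 'http://localhost:5000/predict?n=John&s=male&a=30&f=7.25&c=3&si=0&p=0&e=S&ca='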
IMG_SIZE = 100  # resize images to this height and width
lr = 0.0001     # learning rate
epochs = 25     # number of times the model sees the full data
MODEL_NAME = 'dogsvscats-{}-{}.model'.format(lr, 'integrated')

# Ask the user whether to load or preprocess the data.
# The first run must preprocess; subsequent runs can load the cached arrays,
# UNLESS IMG_SIZE has changed.
print('Load pre-existing preprocessed data for training (L) or preprocess data (P)?')
decision1 = input().strip().upper()
if decision1 == 'P':
    train_data = pp.create_train_data(TRAIN_DIR=TRAIN_DIR, IMG_SIZE=IMG_SIZE)
    test_data = pp.process_test_data(TEST_DIR=TEST_DIR, IMG_SIZE=IMG_SIZE)
elif decision1 == 'L':
    if os.path.exists('train_data.npy') and os.path.exists('test_data.npy'):
        # allow_pickle=True is required on NumPy >= 1.16.3 because the cached
        # arrays hold (image, label) pairs as Python objects.
        train_data = np.load('train_data.npy', allow_pickle=True)
        test_data = np.load('test_data.npy', allow_pickle=True)
    else:
        raise Exception(
            'No preprocessed data exists in path, please preprocess some.')
else:
    raise Exception('Please retry and type L or P.')

'''
The 25000 training images are now IMG_SIZE x IMG_SIZE grayscale arrays,
each attached to a one-hot class label indicating cat [1, 0] or dog [0, 1],
and shuffled into random order.
'''

# Split the data into train (24500) and validation (500) sets.
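# A minimal sketch of that split, assuming train_data is a sequence of
# (image, one-hot label) pairs as produced by pp.create_train_data; the last
# 500 examples are held out for validation, and the trailing channel axis
# matches a Keras/TFLearn-style grayscale input.
train = train_data[:-500]
val = train_data[-500:]

X_train = np.array([i[0] for i in train]).reshape(-1, IMG_SIZE, IMG_SIZE, 1)
y_train = np.array([i[1] for i in train])
X_val = np.array([i[0] for i in val]).reshape(-1, IMG_SIZE, IMG_SIZE, 1)
y_val = np.array([i[1] for i in val])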