def main(preprocess_params, classifier_params):
    dataset_loader = DatasetLoader()
    X_train, y_train = dataset_loader.load_train()
    X_test, y_test = dataset_loader.load_test()

    clf = Pipeline([
        ("preprocessing", TfidfVectorizer(**preprocess_params)),
        ("classifier", RandomForestClassifier(n_jobs=-1, **classifier_params)),
    ])
    clf.fit(X_train, y_train)

    result_storage = ResultStorage(ex, clf)
    result_storage.store_experiment_data(X_test, y_test)
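# Note: the `main(...)` entry points in these scripts receive their parameters from an
# experiment configuration, and the `ex` object handed to ResultStorage (together with
# the injected `_run` argument in a later script) suggests the Sacred experiment
# framework. That setup is not part of this section; the sketch below only illustrates
# what it could look like -- the experiment name and the config defaults are invented
# for illustration and are not the project's code.
from sacred import Experiment

ex = Experiment("text_classification")  # hypothetical experiment name


@ex.config
def config():
    preprocess_params = {"max_features": 50000}   # hypothetical defaults
    classifier_params = {"n_estimators": 100}


# The `main` function above would then be registered with `@ex.automain`, so running the
# script executes it with this configuration (overridable from the command line).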
def main(classifier_params):
    dataset_loader = DatasetLoader()
    x_train, y_train = dataset_loader.load_train()

    w2v_model = Word2Vec.load("word2vec_models/word2vec.model")
    X_train = np.array(
        [get_sentence_embedding(w2v_model, sentence) for sentence in x_train])

    x_test, y_test = dataset_loader.load_test()
    X_test = np.array(
        [get_sentence_embedding(w2v_model, sentence) for sentence in x_test])

    clf = SVC(verbose=2, max_iter=10000, **classifier_params)
    clf.fit(X_train, y_train)

    result_storage = ResultStorage(ex, clf)
    result_storage.store_experiment_data(X_test, y_test)
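# `get_sentence_embedding` is defined elsewhere in the project and is not shown in this
# section. A minimal sketch of such a helper, assuming it simply averages the word2vec
# vectors of the tokens that appear in the model's vocabulary (the name is taken from the
# scripts above, but the behaviour here is an assumption, not the project's code):
import numpy as np
from gensim.models import Word2Vec


def get_sentence_embedding(w2v_model: Word2Vec, sentence: str) -> np.ndarray:
    """Average the word vectors of all in-vocabulary tokens in `sentence`."""
    tokens = sentence.split()
    vectors = [w2v_model.wv[token] for token in tokens if token in w2v_model.wv]
    if not vectors:
        # No known tokens: fall back to a zero vector of the embedding size.
        return np.zeros(w2v_model.vector_size)
    return np.mean(vectors, axis=0)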
def main(max_df):
    dataset_loader = DatasetLoader()
    X, y = dataset_loader.load_train()

    # TF-IDF features, capped at 50000 terms to match the network's input layer.
    vectorizer = TfidfVectorizer(strip_accents='ascii', max_df=max_df, max_features=50000)
    X_train = vectorizer.fit_transform(X).todense()
    X_train, X_val, y_train, y_val = train_test_split(X_train, y)

    model = keras.models.Sequential([
        keras.layers.Dense(128, input_shape=(50000,), activation='relu'),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dense(len(np.unique(y)), activation='softmax'),
    ])
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    history = model.fit(X_train, y_train, epochs=15, validation_data=(X_val, y_val))

    X_test, y_test = dataset_loader.load_test()
    # Densify the test features the same way as the training features before evaluating.
    X_test = vectorizer.transform(X_test).todense()
    score = model.evaluate(X_test, y_test)
    print(score)

    # Summarize history for accuracy.
    plt.plot(history.history['acc'])
    plt.plot(history.history['val_acc'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

    # Summarize history for loss.
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()
def main(model_conf, _run):
    dataset_loader = DatasetLoader()
    X, y = dataset_loader.load_train()

    # Embed every training sentence with the pre-trained word2vec model (3000-dimensional).
    w2v_model = Word2Vec.load("word2vec_models/word2vec.model")
    x_w2v = np.array([get_sentence_embedding(w2v_model, sentence) for sentence in X])
    X_train, X_val, y_train, y_val = train_test_split(x_w2v, y)

    model = keras.models.Sequential([
        keras.layers.Dense(128, input_shape=(3000,), activation='relu'),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(32, activation='relu'),
        keras.layers.Dense(len(np.unique(y)), activation='softmax'),
    ])
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    history = model.fit(X_train, y_train,
                        epochs=20,
                        batch_size=1024,
                        validation_data=(X_val, y_val))

    X_test, y_test = dataset_loader.load_test()
    w_test = np.array([get_sentence_embedding(w2v_model, sentence) for sentence in X_test])
    score = model.evaluate(w_test, y_test)
    print(score)

    # Summarize history for accuracy.
    plt.plot(history.history['acc'])
    plt.plot(history.history['val_acc'])
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

    # Summarize history for loss.
    plt.plot(history.history['loss'])
    plt.plot(history.history['val_loss'])
    plt.title('model loss')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()
def main(clf_params):
    dataset_loader = DatasetLoader()
    w2v_model = Word2Vec.load("word2vec.model")

    X_train, y_train = dataset_loader.load_train()
    X_train = preprocess_x(X_train, w2v_model)
    X_test, y_test = dataset_loader.load_test()
    X_test = preprocess_x(X_test, w2v_model)

    # Exhaustive search over the SVC parameter grid with 5-fold cross-validation,
    # refitting the best estimator on the full training set.
    grid = GridSearchCV(SVC(),
                        clf_params,
                        n_jobs=-1,
                        cv=5,
                        verbose=2,
                        return_train_score=True,
                        refit=True)
    grid.fit(X_train, y_train)

    result_storage = ResultStorage(ex, grid)
    result_storage.store_experiment_data(X_test, y_test)
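# `preprocess_x` is defined elsewhere in the project and is not shown here. Given how the
# other scripts build their feature matrices, a plausible sketch is that it maps each
# document to its word2vec sentence embedding; the implementation below is an assumption
# for illustration (it reuses the `get_sentence_embedding` sketch given earlier), not the
# project's actual code.
import numpy as np
from gensim.models import Word2Vec


def preprocess_x(texts, w2v_model: Word2Vec) -> np.ndarray:
    # Stack one embedding vector per document.
    return np.array([get_sentence_embedding(w2v_model, sentence) for sentence in texts])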
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from utils import DatasetLoader

# This example finds the most "similar" review (in terms of words used) to a user-provided
# review. This is done by creating a so-called bag-of-words model. Each unique word in the
# dataset is given an index in a vector. Each review is in turn transformed into a vector,
# where the value at each index represents how many times a specific word is present. For
# instance, the word "dress" may have index=159; if a review has the value 3 at index 159,
# "dress" is mentioned 3 times in that review. (In this script the counts are additionally
# TF-IDF weighted, so frequent but uninformative words contribute less.) The vector
# representation is a format that machine learning algorithms can take as input.
#
# In this task, we calculate the cosine similarity of two vectors to obtain a metric of how
# similar they are. Note that this is only in terms of which words are used -- the vectors
# have no understanding of how the different words relate.

# Load dataset.
dataset = DatasetLoader.load_reviews()

# Transform each text in the dataset to its corresponding vector.
print("Vectorizing dataset")
vectorizer = TfidfVectorizer()
texts = [row.full_text() for row in dataset]
vectorized_dataset = vectorizer.fit_transform(texts)


# Find the review in the dataset most similar to `query`.
def find_most_similar_review(query: str) -> str:
    # Find the vector of the query.
    vectorized_query = vectorizer.transform([query])
    # Compute the cosine similarity between each review's vector and the query's vector;
    # this measures how similar the vectors are.
    similarities = cosine_similarity(vectorized_dataset, vectorized_query)
    # Return the review whose vector is most similar to the query's.
    return texts[similarities.argmax()]
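# Example usage of the helper above; the query string is made up for illustration and is
# not part of the dataset.
if __name__ == "__main__":
    query = "Lovely summer dress, fits true to size"
    print("Most similar review in the dataset:")
    print(find_most_similar_review(query))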
# Imports added for completeness: `DataLoader` is assumed to be torch.utils.data.DataLoader
# given the call signature below, while `DatasetLoader` is the project's own dataset class
# (its import is not shown in this fragment).
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader

#
# Settings.
#
torch.cuda.set_device(4)
learning_rate = 0.001
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#
# Load datasets.
#
batch_size = 32
bert_dim = 300

train_data = DatasetLoader('mr', set_name="train")
vocab = train_data.vocab
test_data = DatasetLoader('mr', set_name="test")
max_seq_len = train_data.nnodes

train_data_loader = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=True)
test_data_loader = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=True)

nhid = 300
vote_dim = 100
nclass = train_data.nclass()
input_cols = ['node_embeddings', 'dependency_graph', 'polarity']

#
        'real_A': real_A,
        'real_B': real_B,
        'fake_A': fake_A,
        'fake_B': fake_B
    })

    loss_dict['loss_D'] = loss_D
    loss_dict['loss_G'] = loss_G
    loss_dict['loss_G_GAN'] = loss_G_GAN
    loss_dict['loss_G_identity'] = loss_G_identity
    loss_dict['loss_G_cycle'] = loss_G_cycle

    # Update learning rates
    client.lr_update()

    return loss_dict


if __name__ == '__main__':
    clear.clear_records(if_clients=True, if_servers=True, if_logs=True)

    clients, server, config = init_federated()

    datasetLoader = DatasetLoader()
    datasetLoader.load_dataset_default()
    for client in clients:
        client.load_dataset_from_dir("clients/" + str(client.id) + "/dataset/")

    train_federated(config, clients, server)
from utils import DatasetLoader
import gensim
from gensim.models import Word2Vec

dataset_loader = DatasetLoader()
X, y = dataset_loader.load_train()
print("loaded data")

w2v_model = Word2Vec(min_count=20,
                     window=2,
                     size=3000,
                     sample=6e-5,
                     alpha=0.03,
                     min_alpha=0.0007,
                     negative=20)

cleaned_text = [x.split() for x in X]
w2v_model.build_vocab(cleaned_text)

print("start training")
w2v_model.train(cleaned_text, total_examples=w2v_model.corpus_count, epochs=100)

w2v_model.save("word2vec_models/word2vec.model")
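# The saved file is what the experiment scripts above load via
# Word2Vec.load("word2vec_models/word2vec.model"). A quick optional sanity check of the
# trained embeddings; the probe word "dress" is only an example and may not be in the
# vocabulary of every dataset:
from gensim.models import Word2Vec

w2v_model = Word2Vec.load("word2vec_models/word2vec.model")
print(w2v_model.wv.most_similar("dress", topn=5))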