def classify(texts, output_format):
    """Classify *texts* for toxicity with the pre-trained GRU model.

    Loads the saved 'toxic' classifier, runs prediction, prints the elapsed
    wall-clock time, and returns the prediction result in *output_format*.
    """
    classifier = textClassification.Classifier('toxic', "gru", list_classes=list_classes)
    classifier.load()

    t0 = time.time()
    predictions = classifier.predict(texts, output_format)
    elapsed = round(time.time() - t0, 3)
    print("runtime: %s seconds " % (elapsed))

    return predictions
def test():
    """Run the pre-trained toxic GRU classifier over the held-out test CSV.

    Loads the saved model and the test dataset, predicts in CSV output
    format, prints the elapsed wall-clock time, and returns the result.
    """
    classifier = textClassification.Classifier('toxic', "gru", list_classes=list_classes)
    classifier.load()

    print('loading test dataset...')
    test_texts = load_texts_pandas("data/textClassification/toxic/test.csv")
    print('number of texts to classify:', len(test_texts))

    t0 = time.time()
    predictions = classifier.predict(test_texts, output_format="csv")
    print("runtime: %s seconds " % (round(time.time() - t0, 3)))

    return predictions
def train(embeddings_name, fold_count):
    """Train the toxic-comment GRU classifier and save it to disk.

    Trains a single model when *fold_count* is 1, otherwise runs n-fold
    training with *fold_count* folds.
    """
    classifier = textClassification.Classifier(
        'toxic', "gru",
        list_classes=list_classes,
        max_epoch=30,
        fold_number=fold_count,
        embeddings_name=embeddings_name,
    )

    print('loading train dataset...')
    texts, labels = load_texts_and_classes_pandas("data/textClassification/toxic/train.csv")

    if fold_count == 1:
        classifier.train(texts, labels)
    else:
        classifier.train_nfold(texts, labels)

    # persist the trained model
    classifier.save()
def classify(texts, output_format):
    """Classify citation sentiment of *texts* with the pre-trained GRU model.

    Loads the saved 'citations' classifier and runs prediction. When
    *output_format* is 'json' the measured runtime (seconds) is embedded in
    the returned result under the "runtime" key; otherwise it is printed.
    """
    # load model
    model = textClassification.Classifier('citations', "gru", list_classes=list_classes)
    model.load()

    start_time = time.time()
    result = model.predict(texts, output_format)
    runtime = round(time.time() - start_time, 3)

    # FIX: the original used `output_format is 'json'` — identity comparison
    # against a string literal only works by CPython interning accident and
    # emits a SyntaxWarning on modern interpreters; equality is intended.
    if output_format == 'json':
        result["runtime"] = runtime
    else:
        print("runtime: %s seconds " % (runtime))
    return result
def train(embeddings_name, fold_count):
    """Train the citation-sentiment GRU classifier and save it to disk.

    Uses ROC AUC as the selection metric. Trains a single model when
    *fold_count* is 1, otherwise runs n-fold training.
    """
    classifier = textClassification.Classifier(
        'citations', "gru",
        list_classes=list_classes,
        max_epoch=70,
        fold_number=fold_count,
        use_roc_auc=True,
        embeddings_name=embeddings_name,
    )

    print('loading citation sentiment corpus...')
    texts, labels = load_citation_sentiment_corpus(
        "data/textClassification/citations/citation_sentiment_corpus.txt")

    if fold_count == 1:
        classifier.train(texts, labels)
    else:
        classifier.train_nfold(texts, labels)

    # persist the trained model
    classifier.save()
def train_and_eval(embeddings_name, training_data, fold_count):
    """Train the suomi24 GRU classifier on *training_data*, evaluate, and save.

    The dataset is split 90/10 into train and evaluation sets. Trains a
    single model when *fold_count* is 1, otherwise runs n-fold training.
    """
    classifier = textClassification.Classifier(
        'suomi24', "gru",
        list_classes=list_classes,
        max_epoch=1,
        fold_number=fold_count,
        use_roc_auc=True,
        embeddings_name=embeddings_name,
    )

    print('loading train dataset...')
    texts, labels = load_texts_and_classes_pandas(training_data)

    # hold out 10% of the data for evaluation
    x_train, y_train, x_test, y_test = split_data_and_labels(texts, labels, 0.9)

    if fold_count == 1:
        classifier.train(x_train, y_train)
    else:
        classifier.train_nfold(x_train, y_train)

    classifier.eval(x_test, y_test)

    # persist the trained model
    classifier.save()
def train_and_eval(embeddings_name, fold_count):
    """Train the citation-sentiment GRU classifier, evaluate it, and save.

    The corpus is split 90/10 into train and evaluation sets; ROC AUC is
    used as the selection metric. Trains a single model when *fold_count*
    is 1, otherwise runs n-fold training.
    """
    classifier = textClassification.Classifier(
        'citations', "gru",
        list_classes=list_classes,
        max_epoch=70,
        fold_number=fold_count,
        use_roc_auc=True,
        embeddings_name=embeddings_name,
    )

    print('loading citation sentiment corpus...')
    texts, labels = load_citation_sentiment_corpus(
        "data/textClassification/citations/citation_sentiment_corpus.txt")

    # hold out 10% of the data for evaluation
    x_train, y_train, x_test, y_test = split_data_and_labels(texts, labels, 0.9)

    if fold_count == 1:
        classifier.train(x_train, y_train)
    else:
        classifier.train_nfold(x_train, y_train)

    classifier.eval(x_test, y_test)

    # persist the trained model
    classifier.save()
# Trailing arguments of an iterator-construction call whose opening is outside
# this view (presumably a torchtext BucketIterator.splits call — TODO confirm
# against the preceding lines of the file).
batch_size = BATCH_SIZE,
sort_key = lambda x: len(x.text),  # bucket examples of similar length together
sort_within_batch=True
)

# Model hyper-parameters.
size_of_vocab = len(TEXT.vocab)
embedding_dim = 100
num_hidden_nodes = 128
num_layers = 2
N_EPOCHS = 20
best_valid_loss = float('inf')  # NOTE(review): initialised but never updated in the visible code

# Build the classifier and copy the pre-trained word vectors (loaded into the
# TEXT vocabulary) into its embedding layer.
model=textModel.Classifier(size_of_vocab,embedding_dim,num_hidden_nodes,num_layers)
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

optimizer = torch.optim.Adam(model.parameters())
# BCELoss expects probabilities in [0, 1] — presumably the model's forward
# ends in a sigmoid; verify in textModel.Classifier.
criterion = torch.nn.BCELoss()

# Training loop: one train pass and one validation pass per epoch.
for epoch in range(N_EPOCHS):
    #train the model
    train_loss, train_acc = textModel.train(model, train_iterator, optimizer, criterion)

    #evaluate the model
    valid_loss, valid_acc = textModel.evaluate(model, valid_iterator, criterion)