def classify(texts, output_format):
    """Run the pre-trained 'toxic' GRU classifier over *texts*.

    Returns the predictions in the requested output_format and prints
    the wall-clock prediction time.
    """
    # load the persisted model
    classifier = Classifier('toxic', "gru", list_classes=list_classes)
    classifier.load()

    t0 = time.time()
    predictions = classifier.predict(texts, output_format)
    elapsed = round(time.time() - t0, 3)
    print("runtime: %s seconds " % (elapsed))
    return predictions
def classify(texts, output_format, architecture="gru", transformer=None):
    """Classify *texts* with the toxicity model trained for *architecture*.

    NOTE(review): the `transformer` parameter is accepted but not used in
    this body — presumably kept for signature parity with siblings; confirm.
    """
    # load the persisted model named after the chosen architecture
    toxic_model = Classifier('toxic_' + architecture)
    toxic_model.load()

    begin = time.time()
    predictions = toxic_model.predict(texts, output_format)
    print("runtime: %s seconds " % (round(time.time() - begin, 3)))
    return predictions
def test():
    """Evaluate the 'toxic' GRU model on the bundled test CSV.

    Loads the test texts, predicts in CSV output format, prints the
    prediction time and returns the raw prediction result.
    """
    # load the persisted model
    classifier = Classifier('toxic', "gru", list_classes=list_classes)
    classifier.load()

    print('loading test dataset...')
    xte = load_texts_pandas("data/textClassification/toxic/test.csv")
    print('number of texts to classify:', len(xte))

    t0 = time.time()
    predictions = classifier.predict(xte, output_format="csv")
    print("runtime: %s seconds " % (round(time.time() - t0, 3)))
    return predictions
def classify(texts, output_format):
    """Classify a list of texts with the 'citations' GRU model.

    If output_format is 'json', the measured runtime (seconds) is embedded
    into the result under the "runtime" key; otherwise it is printed.
    """
    # load model
    model = Classifier('citations', "gru", list_classes=list_classes)
    model.load()
    start_time = time.time()
    result = model.predict(texts, output_format)
    runtime = round(time.time() - start_time, 3)
    # Fix: compare strings with '==', not 'is' — identity of str literals is
    # an interning artifact and fails for equal-but-distinct string objects
    # (CPython 3.8+ emits a SyntaxWarning for `is` with a literal).
    if output_format == 'json':
        result["runtime"] = runtime
    else:
        print("runtime: %s seconds " % (runtime))
    return result
def classify(texts, output_format, architecture="gru"):
    """Classify a list of texts with the 'software_use' model.

    If output_format is 'json', the measured runtime (seconds) is embedded
    into the result under the "runtime" key; otherwise it is printed.
    """
    # load model
    model = Classifier('software_use', model_type=architecture, list_classes=list_classes)
    model.load()
    start_time = time.time()
    result = model.predict(texts, output_format)
    runtime = round(time.time() - start_time, 3)
    # Fix: compare strings with '==', not 'is' — identity of str literals is
    # an interning artifact and fails for equal-but-distinct string objects.
    if output_format == 'json':
        result["runtime"] = runtime
    else:
        print("runtime: %s seconds " % (runtime))
    return result
def classify(texts, output_format, architecture="gru", cascaded=False):
    '''
    Classify a list of texts with an existing model.

    If output_format is 'json', the measured runtime (seconds) is embedded
    into the result under the "runtime" key; otherwise it is printed.
    NOTE(review): the `cascaded` parameter is accepted but unused in this
    body — presumably handled by a caller or a later revision; confirm.
    '''
    # load model
    model = Classifier('dataseer', model_type=architecture)
    model.load()
    start_time = time.time()
    result = model.predict(texts, output_format)
    runtime = round(time.time() - start_time, 3)
    # Fix: compare strings with '==', not 'is' — identity of str literals is
    # an interning artifact and fails for equal-but-distinct string objects.
    if output_format == 'json':
        result["runtime"] = runtime
    else:
        print("runtime: %s seconds " % (runtime))
    return result
def classify(texts, output_format, embeddings_name=None, architecture="gru", transformer=None):
    """Classify *texts* with the software-context model for *architecture*.

    The measured runtime (seconds) is attached under the "runtime" key for
    'json' output, and printed for any other output format.
    NOTE(review): `embeddings_name` and `transformer` are accepted but not
    used in this body — presumably kept for signature parity; confirm.
    """
    # load the persisted model named after the chosen architecture
    ctx_model = Classifier('software_context_' + architecture)
    ctx_model.load()

    begin = time.time()
    predictions = ctx_model.predict(texts, output_format)
    elapsed = round(time.time() - begin, 3)

    if output_format == 'json':
        predictions["runtime"] = elapsed
    else:
        print("runtime: %s seconds " % (elapsed))
    return predictions
def train_eval_cascaded(embeddings_name, fold_count, use_ELMo=False, use_BERT=False, architecture="gru"): # general setting of parameters class_weights = None batch_size = 256 maxlen = 300 if use_ELMo: batch_size = 20 elif use_BERT: batch_size = 50 # default bert model parameters if architecture.find("bert") != -1: batch_size = 32 maxlen = 100 # first binary classifier: dataset or no_dataset xtr, y, _, _, list_classes, _, _ = load_dataseer_corpus_csv( "data/textClassification/dataseer/all-binary.csv") print(list_classes) model_binary = Classifier('dataseer-binary', model_type=architecture, list_classes=list_classes, max_epoch=100, fold_number=fold_count, patience=10, use_roc_auc=True, embeddings_name=embeddings_name, use_ELMo=use_ELMo, use_BERT=use_BERT, batch_size=batch_size, maxlen=maxlen, class_weights=class_weights) # segment train and eval sets x_train, y_train, x_test, y_test = split_data_and_labels(xtr, y, 0.9) if fold_count == 1: model_binary.train(x_train, y_train) else: model_binary.train_nfold(x_train, y_train) model_binary.eval(x_test, y_test) x_test_binary = x_test y_test_binary = y_test # second, the first level datatype taxonomy for sentences classified as dataset xtr, y_classes, y_subclasses, y_leafclasses, list_classes, list_subclasses, list_leaf_classes = load_dataseer_corpus_csv( "data/textClassification/dataseer/all-1.csv") # ignore the no_dataset, ignore the first eval set, build first level classifier ind = list_classes.index('no_dataset') to_remove = vectorizer(ind, len(list_classes)) x_train, y_train = filter_exclude_class(xtr, y_classes, to_remove) y_train2 = np.zeros(shape=(len(y_train), len(list_classes) - 1)) for i in range(0, len(y_train)): y_train2[i] = np.delete(y_train[i], ind) y_train = y_train2 list_classes.remove('no_dataset') model_first = Classifier('dataseer-first', model_type=architecture, list_classes=list_classes, max_epoch=100, fold_number=fold_count, patience=10, use_roc_auc=True, embeddings_name=embeddings_name, 
use_ELMo=use_ELMo, use_BERT=use_BERT, batch_size=batch_size, maxlen=maxlen, class_weights=class_weights) if fold_count == 1: model_first.train(x_train, y_train) else: model_first.train_nfold(x_train, y_train) model_first.eval(x_test, y_test) # eval by cascading result_binary = model_binary.predict(x_test_binary, output_format='default') result_first = model_first.predict(x_test, output_format='default') # select sequences classified as dataset result_intermediate = np.asarray( [np.argmax(line) for line in result_binary]) def vectorize(index, size): result = np.zeros(size) if index < size: result[index] = 1 return result result_binary = np.array( [vectorize(xi, len(list_classes)) for xi in result_intermediate])