示例#1
0
def classify(texts, output_format):
    """Classify a list of texts with the pre-trained 'toxic' GRU model.

    Args:
        texts: list of text strings to classify.
        output_format: format passed through to the model's predict().

    Returns:
        The model's predictions in the requested format.
    """
    # load model
    model = Classifier('toxic', "gru", list_classes=list_classes)
    model.load()
    started = time.time()
    predictions = model.predict(texts, output_format)
    print("runtime: %s seconds " % (round(time.time() - started, 3)))
    return predictions
示例#2
0
def classify(texts, output_format, architecture="gru", transformer=None):
    """Classify a list of texts with the toxic model for *architecture*.

    Args:
        texts: list of text strings to classify.
        output_format: format passed through to the model's predict().
        architecture: model architecture suffix used to select the model.
        transformer: unused here; kept for interface compatibility.

    Returns:
        The model's predictions in the requested format.
    """
    # load model
    model = Classifier('toxic_' + architecture)
    model.load()
    started = time.time()
    predictions = model.predict(texts, output_format)
    print("runtime: %s seconds " % (round(time.time() - started, 3)))
    return predictions
示例#3
0
def test():
    """Run the 'toxic' GRU classifier over the bundled test CSV and time it.

    Returns:
        The model's predictions for the test set in CSV format.
    """
    # load model
    model = Classifier('toxic', "gru", list_classes=list_classes)
    model.load()

    print('loading test dataset...')
    test_texts = load_texts_pandas("data/textClassification/toxic/test.csv")
    print('number of texts to classify:', len(test_texts))
    t0 = time.time()
    predictions = model.predict(test_texts, output_format="csv")
    print("runtime: %s seconds " % (round(time.time() - t0, 3)))
    return predictions
示例#4
0
def classify(texts, output_format):
    """Classify a list of texts with the 'citations' GRU model.

    Args:
        texts: list of text strings to classify.
        output_format: 'json' to receive a dict with the runtime embedded
            under the "runtime" key; any other value prints the runtime.

    Returns:
        The model's predictions in the requested format.
    """
    # load model
    model = Classifier('citations', "gru", list_classes=list_classes)
    model.load()
    start_time = time.time()
    result = model.predict(texts, output_format)
    runtime = round(time.time() - start_time, 3)
    # Fixed: use equality, not identity. `is 'json'` relies on CPython
    # string interning, is not guaranteed to be True, and raises a
    # SyntaxWarning since Python 3.8.
    if output_format == 'json':
        result["runtime"] = runtime
    else:
        print("runtime: %s seconds " % (runtime))
    return result
示例#5
0
def classify(texts, output_format, architecture="gru"):
    """Classify a list of texts with the 'software_use' model.

    Args:
        texts: list of text strings to classify.
        output_format: 'json' to receive a dict with the runtime embedded
            under the "runtime" key; any other value prints the runtime.
        architecture: model architecture passed to the Classifier.

    Returns:
        The model's predictions in the requested format.
    """
    # load model
    model = Classifier('software_use',
                       model_type=architecture,
                       list_classes=list_classes)
    model.load()
    start_time = time.time()
    result = model.predict(texts, output_format)
    runtime = round(time.time() - start_time, 3)
    # Fixed: use equality, not identity. `is 'json'` relies on CPython
    # string interning, is not guaranteed to be True, and raises a
    # SyntaxWarning since Python 3.8.
    if output_format == 'json':
        result["runtime"] = runtime
    else:
        print("runtime: %s seconds " % (runtime))
    return result
示例#6
0
def classify(texts, output_format, architecture="gru", cascaded=False):
    '''
        Classify a list of texts with an existing model

        Args:
            texts: list of text strings to classify.
            output_format: 'json' to receive a dict with the runtime
                embedded under the "runtime" key; any other value prints
                the runtime.
            architecture: model architecture passed to the Classifier.
            cascaded: unused here; kept for interface compatibility.

        Returns:
            The model's predictions in the requested format.
    '''
    # load model
    model = Classifier('dataseer', model_type=architecture)
    model.load()
    start_time = time.time()
    result = model.predict(texts, output_format)
    runtime = round(time.time() - start_time, 3)
    # Fixed: use equality, not identity. `is 'json'` relies on CPython
    # string interning, is not guaranteed to be True, and raises a
    # SyntaxWarning since Python 3.8.
    if output_format == 'json':
        result["runtime"] = runtime
    else:
        print("runtime: %s seconds " % (runtime))
    return result
示例#7
0
def classify(texts,
             output_format,
             embeddings_name=None,
             architecture="gru",
             transformer=None):
    """Classify a list of texts with the software-context model.

    Args:
        texts: list of text strings to classify.
        output_format: 'json' to receive a dict with the runtime embedded
            under the "runtime" key; any other value prints the runtime.
        embeddings_name: unused here; kept for interface compatibility.
        architecture: model architecture suffix used to select the model.
        transformer: unused here; kept for interface compatibility.

    Returns:
        The model's predictions in the requested format.
    """
    # load model
    model = Classifier('software_context_' + architecture)
    model.load()
    t0 = time.time()
    predictions = model.predict(texts, output_format)
    elapsed = round(time.time() - t0, 3)
    if output_format == 'json':
        predictions["runtime"] = elapsed
    else:
        print("runtime: %s seconds " % (elapsed))
    return predictions
示例#8
0
def train_eval_cascaded(embeddings_name,
                        fold_count,
                        use_ELMo=False,
                        use_BERT=False,
                        architecture="gru"):
    # general setting of parameters
    class_weights = None
    batch_size = 256
    maxlen = 300
    if use_ELMo:
        batch_size = 20
    elif use_BERT:
        batch_size = 50

    # default bert model parameters
    if architecture.find("bert") != -1:
        batch_size = 32
        maxlen = 100

    # first binary classifier: dataset or no_dataset
    xtr, y, _, _, list_classes, _, _ = load_dataseer_corpus_csv(
        "data/textClassification/dataseer/all-binary.csv")

    print(list_classes)

    model_binary = Classifier('dataseer-binary',
                              model_type=architecture,
                              list_classes=list_classes,
                              max_epoch=100,
                              fold_number=fold_count,
                              patience=10,
                              use_roc_auc=True,
                              embeddings_name=embeddings_name,
                              use_ELMo=use_ELMo,
                              use_BERT=use_BERT,
                              batch_size=batch_size,
                              maxlen=maxlen,
                              class_weights=class_weights)

    # segment train and eval sets
    x_train, y_train, x_test, y_test = split_data_and_labels(xtr, y, 0.9)

    if fold_count == 1:
        model_binary.train(x_train, y_train)
    else:
        model_binary.train_nfold(x_train, y_train)
    model_binary.eval(x_test, y_test)

    x_test_binary = x_test
    y_test_binary = y_test

    # second, the first level datatype taxonomy for sentences classified as dataset
    xtr, y_classes, y_subclasses, y_leafclasses, list_classes, list_subclasses, list_leaf_classes = load_dataseer_corpus_csv(
        "data/textClassification/dataseer/all-1.csv")
    # ignore the no_dataset, ignore the first eval set, build first level classifier

    ind = list_classes.index('no_dataset')
    to_remove = vectorizer(ind, len(list_classes))

    x_train, y_train = filter_exclude_class(xtr, y_classes, to_remove)
    y_train2 = np.zeros(shape=(len(y_train), len(list_classes) - 1))
    for i in range(0, len(y_train)):
        y_train2[i] = np.delete(y_train[i], ind)
    y_train = y_train2

    list_classes.remove('no_dataset')

    model_first = Classifier('dataseer-first',
                             model_type=architecture,
                             list_classes=list_classes,
                             max_epoch=100,
                             fold_number=fold_count,
                             patience=10,
                             use_roc_auc=True,
                             embeddings_name=embeddings_name,
                             use_ELMo=use_ELMo,
                             use_BERT=use_BERT,
                             batch_size=batch_size,
                             maxlen=maxlen,
                             class_weights=class_weights)

    if fold_count == 1:
        model_first.train(x_train, y_train)
    else:
        model_first.train_nfold(x_train, y_train)
    model_first.eval(x_test, y_test)

    # eval by cascading
    result_binary = model_binary.predict(x_test_binary,
                                         output_format='default')
    result_first = model_first.predict(x_test, output_format='default')

    # select sequences classified as dataset
    result_intermediate = np.asarray(
        [np.argmax(line) for line in result_binary])

    def vectorize(index, size):
        result = np.zeros(size)
        if index < size:
            result[index] = 1
        return result

    result_binary = np.array(
        [vectorize(xi, len(list_classes)) for xi in result_intermediate])