Example #1
def collect_gcnn(name, most_similar_name, phi, ind):
    gcnn_results = []

    data = dataTools.AuthorshipOneVsOne(name, most_similar_name, ratioTrain,
                                        ratioValid, dataPath)
    logging.info('Training GCNN on {0} vs. {1}'.format(
        name, most_similar_name))
    h_params = ClusterUtils.load_best_hyperparams(name)

    for split_n in range(NO_OF_SPLITS):
        data.get_split(name, most_similar_name, ratioTrain, ratioValid)

        data.reduce_dim(ind)

        start = time.perf_counter()

        gcnn = train_helper.train_net(data, h_params, phi=phi)

        end = time.perf_counter()

        gcnn_eval = evaluate_gcnn(gcnn, data)
        gcnn_eval['time'] = end - start  # elapsed training time in seconds
        gcnn_results.append(gcnn_eval)

        logging.info(
            'SPLIT {0}: GCNN results successfully collected: {1}'.format(
                split_n, gcnn_results[split_n]))

    return gcnn_results
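# A minimal usage sketch (hypothetical caller, not part of the original
# code): aggregating the per-split metrics that collect_gcnn returns.
def summarize_results(gcnn_results):
    # Average each metric (acc, f1, auc, prec, time) across the splits.
    keys = gcnn_results[0].keys()
    return {k: sum(r[k] for r in gcnn_results) / len(gcnn_results)
            for k in keys}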
Example #2
def collect_gcnn(name, most_similar_name, perc):
    gcnn_results = []

    data = dataTools.Authorship(name, ratioTrain, ratioValid, dataPath)
    logging.info('Training GCNN on {0}'.format(name))
    h_params = ClusterUtils.load_best_hyperparams(name)

    phi, indices = load_phi(name, data, percentage=perc)

    if indices.shape[0] < 2:
        return [{'acc': 0, 'f1': 0, 'auc': 0, 'prec': 0, 'time': 0}]

    for split_n in range(NO_OF_SPLITS):
        data.get_split(name, ratioTrain, ratioValid)

        data.reduce_dim(indices)

        start = time.perf_counter()

        gcnn = train_helper.train_net(data, h_params, phi=phi)

        end = time.perf_counter()

        gcnn_eval = evaluate_gcnn(gcnn, data)
        gcnn_eval['time'] = end - start  # elapsed training time in seconds
        gcnn_results.append(gcnn_eval)

        logging.info(
            'SPLIT {0}: GCNN results successfully collected: {1}'.format(
                split_n, gcnn_results[split_n]))

    return gcnn_results
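# A minimal caller-side sketch (hypothetical, not part of the original code):
# recognizing the all-zero sentinel that collect_gcnn returns when load_phi
# yields fewer than two indices.
def is_degenerate(gcnn_results):
    return (len(gcnn_results) == 1
            and all(v == 0 for v in gcnn_results[0].values()))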
Example #3
def collect_gcnn(name, most_similar_name):
    gcnn_results = []

    data = dataTools.AuthorshipOneVsOne(name, most_similar_name, ratioTrain, ratioValid, dataPath)
    logging.info('Training GCNN on {0} vs. {1}'.format(name, most_similar_name))
    h_params = ClusterUtils.load_best_hyperparams(name)

    for split_n in range(NO_OF_SPLITS):
        data.get_split(name, most_similar_name, ratioTrain, ratioValid)
        gcnn = train_helper.train_net(data, h_params)

        gcnn_results.append(evaluate_gcnn(gcnn, data))

        logging.info('SPLIT {0}: GCNN results successfully collected: {1}'.format(split_n, gcnn_results[split_n]))

    return gcnn_results
Example #4
# Possible authors: (just use the names in ' ')
# jacob 'abbott', robert louis 'stevenson', louisa may 'alcott',
# horatio 'alger', james 'allen', jane 'austen', emily 'bronte', james 'cooper',
# charles 'dickens', hamlin 'garland', nathaniel 'hawthorne', henry 'james',
# herman 'melville', 'page', henry 'thoreau', mark 'twain',
# arthur conan 'doyle', washington 'irving', edgar allan 'poe',
# sarah orne 'jewett', edith 'wharton'

try:
    atexit.register(delete_active_author, authorName, ACTIVE_AUTHORS_FILE,
                    None, None)

    file_name = "{0}{1}.txt".format(BASE_FILE_NAME, authorName)

    # load best performing hyperparameters
    nFeatures, nShifts = ClusterUtils.load_best_hyperparams(
        authorName, HYPER_PARAM_FILE)  # nShifts (K): number of shift taps

    if doPrint:
        print('Author: {0}, Combination: {1}'.format(authorName,
                                                     str((nFeatures,
                                                          nShifts))))

    # set training params
    nClasses = 1  # Either authorName or not
    ratioTrain = 0.6  # Ratio of training samples
    ratioValid = 0.2  # Ratio of validation samples (out of the total training
    # samples)
    # Final split is:
    #   nValidation = round(ratioValid * ratioTrain * nTotal)
    #   nTrain = round((1 - ratioValid) * ratioTrain * nTotal)
    #   nTest = nTotal - nTrain - nValidation
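    # A worked example of the split arithmetic above (illustrative nTotal,
    # not from the original code): with nTotal = 100,
    #   nValidation = round(0.2 * 0.6 * 100) = 12
    #   nTrain = round((1 - 0.2) * 0.6 * 100) = 48
    #   nTest = 100 - 48 - 12 = 40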
Example #5
    file_name = "{0}{1}.txt".format(BASE_FILE_NAME, authorName)

    # create empty files so that other jobs would skip this author
    with open(file_name, mode='w+') as f:
        pass

    # Possible authors: (just use the names in ' ')
    # jacob 'abbott', robert louis 'stevenson', louisa may 'alcott',
    # horatio 'alger', james 'allen', jane 'austen', emily 'bronte', james 'cooper',
    # charles 'dickens', hamlin 'garland', nathaniel 'hawthorne', henry 'james',
    # herman 'melville', 'page', henry 'thoreau', mark 'twain',
    # arthur conan 'doyle', washington 'irving', edgar allan 'poe',
    # sarah orne 'jewett', edith 'wharton'

    nFeatures, nShifts = ClusterUtils.load_best_hyperparams(authorName)

    if doPrint:
        print('Author: {0}, Combination: {1}'.format(authorName, str((nFeatures, nShifts))))

    # set training params
    nClasses = 1  # Either authorName or not
    ratioTrain = 0.6  # Ratio of training samples
    ratioValid = 0.2  # Ratio of validation samples (out of the total training
    # samples)
    # Final split is:
    #   nValidation = round(ratioValid * ratioTrain * nTotal)
    #   nTrain = round((1 - ratioValid) * ratioTrain * nTotal)
    #   nTest = nTotal - nTrain - nValidation

    nDataSplits = 7  # Number of data realizations