예제 #1
0
def _prepare_lp_entity_inputs(scope_name):
    """Load one scope's pickled HIN and derive the 'lp+entity' inputs.

    Reads ``data/local/<scope_name>.dmp``, then asks ``GraphGenerator`` for
    the new labels and for a cosine neighbour graph built with both the
    'word' and 'entity' flags enabled.

    Returns:
        A ``(newLabels, graph, newIds)`` tuple exactly as produced by
        ``GraphGenerator.getNewLabels`` and
        ``GraphGenerator.generateCosineNeighborGraph``.
    """
    # Pickle data is a byte stream, so the file must be opened in binary
    # mode; text mode can corrupt it on Windows and fails on Python 3.
    with open('data/local/' + scope_name + '.dmp', 'rb') as f:
        hin = pk.load(f)
    newLabels = GraphGenerator.getNewLabels(hin)

    # A fresh dict per call, in case the generator mutates its argument.
    tf_param = {'word': True, 'entity': True, 'we_weight': 0.112}
    # The positional 10 is presumably the neighbour count (kNeighbors)
    # -- TODO confirm against GraphGenerator's signature.
    graph, newIds = GraphGenerator.generateCosineNeighborGraph(
        hin, 10, tf_param)
    return newLabels, graph, newIds


def run_lp():
    """Run the 'lp+entity' experiment on the first two 20NG and GCAT scopes.

    For every scope, ``lp_experiment`` is called and its return value is
    stored in column 4 of the module-level ``result``: rows 0-1 hold the
    20NG scopes and rows 2-3 hold the GCAT scopes.

    The 20NG scopes use the cosine neighbour graph built from the pickled
    HIN.  The GCAT scopes instead use a graph built from the pickled ``X``
    found at ``data/local/laplacian/<scope_name>.x``, while still taking
    the labels and ids from the HIN.
    """
    # 20NG
    for i in range(2):
        scope_name = ng20_scope_names[i]
        scope = ng20_scopes[i]
        count = ng20_counts[i]
        newLabels, graph, newIds = _prepare_lp_entity_inputs(scope_name)

        # Single-argument parenthesised form prints identically on
        # Python 2 and also parses on Python 3.
        print(scope_name + ' lp+entity')
        result[i, 4] = lp_experiment(scope, scope_name, count, graph,
                                     newLabels, newIds)

    # GCAT
    for i in range(2):
        scope_name = gcat_scope_names[i]
        scope = gcat_scopes[i]
        count = gcat_counts[i]
        # NOTE(review): the HIN-based graph is discarded for GCAT; the call
        # is kept only because it is what yields newIds.
        newLabels, _, newIds = _prepare_lp_entity_inputs(scope_name)

        with open('data/local/laplacian/' + scope_name + '.x', 'rb') as f:
            X = pk.load(f)
        graph = GraphGenerator.generateCosineNeighborGraphfromX(X)

        print(scope_name + ' lp+entity')
        # GCAT rows follow the two 20NG rows, hence the offset of 2.
        result[i + 2, 4] = lp_experiment(scope, scope_name, count, graph,
                                         newLabels, newIds)
예제 #2
0
def _generate_splits_for_scope(scope_name, scope, count, repeat_times,
                               lp_candidate):
    """Run the repeated SSL experiment for one scope and save its output.

    Ensures ``data/local/split/<scope_name>`` exists, loads the pickled HIN
    from ``data/local/<scope_name>.dmp``, builds a cosine neighbour graph
    with ``kNeighbors=10``, and for every value in ``lp_candidate`` runs
    ``SSLClassifier.repeatedExperiment`` with the save prefix
    ``data/local/split/<scope_name>/lb<NNN>_`` (``NNN`` is the value
    zero-padded to three digits).

    Args:
        scope_name: Name used to locate the dump file and output directory.
        scope: Scope object forwarded to ``SSLClassifier``.
        count: Forwarded to ``SSLClassifier`` as ``classCount``.
        repeat_times: Forwarded to ``SSLClassifier`` as ``repeatTimes``.
        lp_candidate: Iterable of values forwarded as ``trainNumbers``.
    """
    split_dir = 'data/local/split/' + scope_name
    if not os.path.exists(split_dir):
        os.makedirs(split_dir)
    experiment_path = split_dir + '/'

    # Pickle data is a byte stream, so the file must be opened in binary
    # mode; text mode can corrupt it on Windows and fails on Python 3.
    with open('data/local/' + scope_name + '.dmp', 'rb') as f:
        hin = pk.load(f)

    tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}
    lp_param = {'alpha': 0.99, 'normalization_factor': 0.01}
    # The second return value (the new ids) is not needed here.
    graph, _ = GraphGenerator.generateCosineNeighborGraph(
        hin, kNeighbors=10, tf_param=tf_param)
    new_label = GraphGenerator.getNewLabels(hin)

    for lp in lp_candidate:
        ssl = SSLClassifier(graph,
                            new_label,
                            scope,
                            lp_param,
                            repeatTimes=repeat_times,
                            trainNumbers=lp,
                            classCount=count)
        ssl.repeatedExperiment(savePathPrefix=experiment_path + 'lb' +
                               str(lp).zfill(3) + '_')


def generate_train_test_split():
    """Generate random train-test splits for 2 data sets * 2 scopes.

    Processes the first two 20NG scopes and then the first two GCAT scopes,
    each with 50 repetitions and a single ``trainNumbers`` candidate of 5.
    Output goes under ``data/local/split/<scope_name>/``.
    """
    repeat_times = 50

    lp_candidate = [5]

    # 20ng
    for i in range(2):
        _generate_splits_for_scope(ng20_scope_names[i], ng20_scopes[i],
                                   ng20_counts[i], repeat_times,
                                   lp_candidate)

    # gcat
    for i in range(2):
        _generate_splits_for_scope(gcat_scope_names[i], gcat_scopes[i],
                                   gcat_counts[i], repeat_times,
                                   lp_candidate)