Example #1
# Imports shared by the examples on this page. The project-local helpers
# (GraphGenerator, SSLClassifier, generate_laplacian_score,
# generate_laplacian_score_vector, semihin_experiment, svm_experiment, the
# *TypeList / scope globals and the result array) come from the surrounding
# repository, so their import paths depend on its layout. Files are opened
# in binary mode ('rb'/'wb') so that pickle works on Python 3 as well.
import os
import pickle as pk

import numpy as np
from scipy import sparse
from sklearn.metrics.pairwise import cosine_similarity


def knowsim_experiment(scope,
                       scope_name,
                       type_list,
                       count,
                       newLabels,
                       tau=1,
                       kNeighbors=10,
                       label_num=5):
    split_path = 'data/local/split/' + scope_name + '/'
    with open('data/local/' + scope_name + '.dmp', 'rb') as f:
        hin = pk.load(f)

    repeats = 50
    # word-only TF features: one row per document
    tf_param = {'word': True, 'entity': False, 'we_weight': 0.1}
    X_word, newIds, entityIds = GraphGenerator.getTFVectorX(hin, tf_param)
    n = X_word.shape[0]

    # accumulate the per-meta-path similarity graphs into one document graph
    knowsim = sparse.lil_matrix((n, n))
    for t in type_list:
        tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}
        X_typed, newIds, entityIds = GraphGenerator.getTFVectorX(
            hin, tf_param, t)

        # build a symmetric kNN graph from pairwise cosine similarities
        cosX = cosine_similarity(X_typed)
        graph = sparse.lil_matrix((n, n))
        for i in range(n):
            for j in np.argpartition(cosX[i], -kNeighbors)[-kNeighbors:]:
                if j == i:
                    continue
                graph[i, j] = cosX[i, j]  # alternative kernel: np.exp(-(1 - cosX[i, j]) / 0.03)
                graph[j, i] = cosX[i, j]  # keep the graph symmetric

        # calculate laplacian scores
        row_sum = graph.sum(axis=1)
        laplacian_score = generate_laplacian_score(row_sum, X_word, kNeighbors)

        # weight this meta-path's graph by its Laplacian score and
        # accumulate it into the combined KnowSim graph
        knowsim = knowsim + np.exp(-tau * laplacian_score) * graph

    knowsim = knowsim.tocsr()
    print('running lp')
    lp_param = {'alpha': 0.98, 'normalization_factor': 5}

    ssl = SSLClassifier(knowsim,
                        newLabels,
                        scope,
                        lp_param,
                        repeatTimes=repeats,
                        trainNumbers=label_num,
                        classCount=count)
    ssl.repeatedFixedExperimentwithNewIds(pathPrefix=split_path + 'lb' +
                                          str(label_num).zfill(3) + '_',
                                          newIds=newIds)
    return ssl.get_mean()
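
generate_laplacian_score is defined elsewhere in the repository and is not
shown on this page. For orientation only, below is a minimal, self-contained
sketch of the standard Laplacian Score of He, Cai & Niyogi (2005), which rates
each feature by how well it respects an affinity graph (smaller is better);
the repository's helper takes (row_sum, X_word, kNeighbors) and may differ in
detail, so treat the function name and signature here as illustrative.

import numpy as np
from scipy import sparse

def laplacian_score_sketch(X, W):
    # X: (n, d) dense feature matrix; W: (n, n) sparse affinity graph.
    d_vec = np.asarray(W.sum(axis=1)).ravel()   # node degrees
    L = sparse.diags(d_vec) - W                 # unnormalized graph Laplacian
    scores = np.empty(X.shape[1])
    for r in range(X.shape[1]):
        f = X[:, r]
        # center f against the degree-weighted constant vector
        f_tilde = f - f.dot(d_vec) / d_vec.sum()
        num = f_tilde.dot(L.dot(f_tilde))       # smoothness over the graph
        den = f_tilde.dot(d_vec * f_tilde)      # degree-weighted variance
        scores[r] = num / den if den > 0 else 0.0
    return scores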
Example #2
def run_semihin():
    # 20NG
    for i in range(2):
        scope_name = ng20_scope_names[i]
        scope = ng20_scopes[i]
        count = ng20_counts[i]

        with open('data/local/' + scope_name + '.dmp', 'rb') as f:
            hin = pk.load(f)
        newLabels = GraphGenerator.getNewLabels(hin)

        tf_param = {'word': True, 'entity': False, 'we_weight': 0.112}
        X, newIds, entity_new_ids = GraphGenerator.getTFVectorX(
            hin, param=tf_param, entity_types=None)
        print(scope_name + ' semihin')
        result[i, 7] = semihin_experiment(scope, scope_name, count, X, newIds)

        tf_param = {'word': True, 'entity': True, 'we_weight': 0.112}
        X, newIds, entity_new_ids = GraphGenerator.getTFVectorX(
            hin, param=tf_param, entity_types=None)
        print(scope_name + ' semihin+entity')
        result[i, 8] = semihin_experiment(scope, scope_name, count, X, newIds)

    # GCAT
    for i in range(2):
        scope_name = gcat_scope_names[i]
        scope = gcat_scopes[i]
        count = gcat_counts[i]
        with open('data/local/' + scope_name + '.dmp', 'rb') as f:
            hin = pk.load(f)
        newLabels = GraphGenerator.getNewLabels(hin)

        tf_param = {'word': True, 'entity': False, 'we_weight': 0.112}
        X, newIds, entity_new_ids = GraphGenerator.getTFVectorX(
            hin, param=tf_param, entity_types=None)
        print(scope_name + ' semihin')
        result[i + 2, 6] = semihin_experiment(scope, scope_name, count, X,
                                              newIds)

        # precomputed, reweighted feature matrix (generated elsewhere in the repository)
        with open('data/local/laplacian/' + scope_name + '.x', 'rb') as f:
            X = pk.load(f)
        print(scope_name + ' semihin+entity')
        result[i + 2, 7] = semihin_experiment(scope, scope_name, count, X,
                                              newIds)
Example #3
def run_svm():
    # 20NG
    for i in range(2):
        scope_name = ng20_scope_names[i]
        scope = ng20_scopes[i]
        with open('data/local/' + scope_name + '.dmp', 'rb') as f:
            hin = pk.load(f)

        print(scope_name + ' svm')
        tf_param = {'word': True, 'entity': False, 'we_weight': 0.1}
        X, doc_new_ids, entity_new_ids = GraphGenerator.getTFVectorX(
            hin, param=tf_param, entity_types=None)
        y = GraphGenerator.gety(hin)
        result[i, 2] = svm_experiment(scope_name, X, y)

        print(scope_name + ' svm+entity')
        tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}
        X, doc_new_ids, entity_new_ids = GraphGenerator.getTFVectorX(
            hin, param=tf_param, entity_types=None)
        y = GraphGenerator.gety(hin)
        result[i, 3] = svm_experiment(scope_name, X, y)

    # GCAT
    for i in range(2):
        scope_name = gcat_scope_names[i]
        scope = gcat_scopes[i]
        with open('data/local/' + scope_name + '.dmp', 'rb') as f:
            hin = pk.load(f)

        print(scope_name + ' svm')
        tf_param = {'word': True, 'entity': False, 'we_weight': 0.1}
        X, doc_new_ids, entity_new_ids = GraphGenerator.getTFVectorX(
            hin, param=tf_param, entity_types=None)
        y = GraphGenerator.gety(hin)
        result[i + 2, 2] = svm_experiment(scope_name, X, y)

        print(scope_name + ' svm+entity')
        with open('data/local/laplacian/' + scope_name + '.x', 'rb') as f:
            X = pk.load(f)
        y = GraphGenerator.gety(hin)
        result[i + 2, 3] = svm_experiment(scope_name, X, y)
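
svm_experiment is likewise defined elsewhere in the repository. As a rough,
hypothetical stand-in only: the sketch below assumes a linear SVM scored by
mean accuracy over repeated stratified splits, whereas the real function
presumably reads the fixed splits under data/local/split/; every name and
parameter here is illustrative.

import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

def svm_experiment_sketch(scope_name, X, y, labels_per_class=5, repeats=10):
    # Train on a small stratified sample, test on the rest, average accuracy.
    n_classes = len(np.unique(y))
    accs = []
    for seed in range(repeats):
        X_tr, X_te, y_tr, y_te = train_test_split(
            X, y, train_size=labels_per_class * n_classes,
            stratify=y, random_state=seed)
        accs.append(LinearSVC().fit(X_tr, y_tr).score(X_te, y_te))
    return float(np.mean(accs))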
Example #4
def run_generate_laplacian_score():
    print('generate laplacian score for feature reweighting')
    # 20NG
    for i in range(2):
        scope_name = ng20_scope_names[i]
        with open('data/local/' + scope_name + '.dmp', 'rb') as f:
            hin = pk.load(f)
        tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}
        X_word, newIds, entityIds = GraphGenerator.getTFVectorX(hin, tf_param)
        for t in NG20TypeList:
            tf_param = {'word': True, 'entity': True, 'we_weight': 0.112}
            X_typed, newIds, entityIds = GraphGenerator.getTFVectorX(
                hin, tf_param, t)
            laplacian_score = generate_laplacian_score_vector(
                X_typed, X_word, 100)
            if not os.path.exists('data/local/laplacian/' + scope_name):
                os.makedirs('data/local/laplacian/' + scope_name)
            with open(
                    'data/local/laplacian/' + scope_name + '/' + str(t) +
                    '_scores', 'wb') as f:
                pk.dump(laplacian_score, f)
    # GCAT
    for i in range(2):
        scope_name = gcat_scope_names[i]
        type_list = GCATTypeList[i]
        with open('data/local/' + scope_name + '.dmp', 'rb') as f:
            hin = pk.load(f)
        tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}
        X_word, newIds, entityIds = GraphGenerator.getTFVectorX(hin, tf_param)
        for t in type_list:
            tf_param = {'word': True, 'entity': True, 'we_weight': 0.112}
            X_typed, newIds, entityIds = GraphGenerator.getTFVectorX(
                hin, tf_param, t)
            laplacian_score = generate_laplacian_score_vector(
                X_typed, X_word, 100)
            if not os.path.exists('data/local/laplacian/' + scope_name):
                os.makedirs('data/local/laplacian/' + scope_name)
            with open(
                    'data/local/laplacian/' + scope_name + '/' + str(t) +
                    '_scores', 'wb') as f:
                pk.dump(laplacian_score, f)
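
Examples #5 and #6 below consume these dumped scores with the same transform:
each feature column is rescaled by 20 * exp(-0.01 * score) through a sparse
diagonal matrix, so features with low (i.e. good) Laplacian scores receive
the largest weights. In isolation:

import numpy as np
from scipy import sparse

def reweight_features(X, laplacian_score):
    # Low Laplacian score -> locality-preserving feature -> large weight.
    weights = 20 * np.exp(-0.01 * laplacian_score)
    return X * sparse.diags(weights)   # rescales the columns of X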
Example #5
def generate_meta_graph(scope, scope_name, type_list, count):
    split_path = 'data/local/split/' + scope_name + '/'
    pred_path = 'data/local/metagraph/' + scope_name + '/'
    if not os.path.exists('data/local/metagraph/' + scope_name + '/'):
        os.makedirs('data/local/metagraph/' + scope_name + '/')

    with open('data/local/' + scope_name + '.dmp', 'rb') as f:
        hin = pk.load(f)
    tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}

    for t in type_list:
        X, newIds, entitynewIds = GraphGenerator.getTFVectorX(hin, tf_param, t)
        n = X.shape[0]
        e = X.shape[1]
        with open('data/local/laplacian/' + scope_name + '/' + str(t) +
                  '_scores', 'rb') as f:
            laplacian_score = pk.load(f)
        # rescale each feature column by its Laplacian score (cf. Example #4)
        laplacian_score = 20 * np.exp(-laplacian_score * 0.01)
        D = sparse.diags(laplacian_score)
        X = X * D
        X = X.toarray()
        # bipartite document-entity graph: documents first, entities after
        graph = np.zeros((n + e, n + e))
        graph[0:n, n:n + e] = X
        graph[n:n + e, 0:n] = X.transpose()
        graph = sparse.csc_matrix(graph)

        newLabel = GraphGenerator.getNewLabels(hin)
        lp_param = {'alpha': 0.98, 'normalization_factor': 5}
        # candidate numbers of labeled training examples per class
        lp_candidate = [5]
        for lp in lp_candidate:
            ssl = SSLClassifier(graph,
                                newLabel,
                                scope,
                                lp_param,
                                repeatTimes=50,
                                trainNumbers=lp,
                                classCount=count)
            if not os.path.exists(pred_path + str(t) + '/'):
                os.makedirs(pred_path + str(t) + '/')
            ssl.repeatedFixedExperimentwithNewIds(
                pathPrefix=split_path + 'lb' + str(lp).zfill(3) + '_',
                newIds=newIds,
                saveProb=True,
                savePathPrefix=pred_path + str(t) + '/' + 'lb' +
                str(lp).zfill(3))
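
The dense (n + e) x (n + e) allocation above can dominate memory for large
vocabularies. The same bipartite document-entity graph can be assembled
directly in sparse form; a minimal alternative sketch (not what the
repository code does) using scipy.sparse.bmat:

from scipy import sparse

def bipartite_graph(X):
    # X: sparse (n_docs, n_entities) weight matrix. Returns the symmetric
    # (n + e) x (n + e) graph with zero diagonal blocks, documents first.
    return sparse.bmat([[None, X], [X.transpose(), None]], format='csc')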
Example #6
def ensemble_cotrain_experiment(scope,
                                scope_name,
                                type_list,
                                threshold,
                                weight,
                                count,
                                label_num=5):
    """Alternate between per-meta-path label propagation (step 1) and
    weighted, confidence-thresholded pseudo-label ensembling (step 2)."""

    pred_path = 'data/local/cotrain/' + scope_name + '/'
    split_path = 'data/local/split/' + scope_name + '/'
    if not os.path.exists(pred_path):
        os.makedirs(pred_path)

    with open('data/local/' + scope_name + '.dmp', 'rb') as f:
        hin = pk.load(f)

    tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}
    c = len(scope)
    lb_cand = [label_num]
    repeats = 50

    # rounds for alternating optimization
    rounds = 2
    best_res = 0
    X_s = {}

    tf_param = {'word': True, 'entity': False, 'we_weight': 0.112}
    X_word, newIds, entity_new_ids = GraphGenerator.getTFVectorX(
        hin, param=tf_param, entity_types=None)

    for t in type_list:
        if not os.path.exists(pred_path + str(t) + '/'):
            os.makedirs(pred_path + str(t) + '/')

        with open('data/local/laplacian/' + scope_name + '/' + str(t) +
                  '_scores', 'rb') as f:
            laplacian_score = pk.load(f)
        tf_param = {'word': True, 'entity': True, 'we_weight': 0.112}
        X_typed, newIds, entityIds = GraphGenerator.getTFVectorX(
            hin, tf_param, t)
        laplacian_score = 20 * np.exp(-laplacian_score * 0.01)
        # laplacian_score = laplacian_score / np.sum(laplacian_score) * laplacian_score.shape[0]
        D = sparse.diags(laplacian_score)
        X_typed = X_typed * D
        X_s[str(t)] = X_typed

    for rd in range(rounds):
        round_best_res = 0
        round_best_t = ''

        # step 1:
        # generate output of each meta-path
        for t in type_list:

            X = X_s[str(t)].toarray()
            n = X.shape[0]
            e = X.shape[1]
            graph = np.zeros((n + e, n + e))
            graph[0:n, n:n + e] = X
            graph[n:n + e, 0:n] = X.transpose()
            graph = sparse.csc_matrix(graph)

            newLabel = GraphGenerator.getNewLabels(hin)
            lp_param = {
                'alpha': 0.98,
                'normalization_factor': 5,
                'method': 'variant'
            }

            lb = label_num
            ssl = SSLClassifier(graph,
                                newLabel,
                                scope,
                                lp_param,
                                repeatTimes=repeats,
                                trainNumbers=lb,
                                classCount=count)
            if rd == 0:
                ssl.repeatedFixedExperimentwithNewIds(
                    pathPrefix=split_path + 'lb' + str(lb).zfill(3) + '_',
                    newIds=newIds,
                    saveProb=True,
                    savePathPrefix=pred_path + str(t) + '/lb' +
                    str(lb).zfill(3))
            else:
                inputPredPath = 'data/local/cotrain/' + scope_name + '/lb' + str(
                    lb).zfill(3) + '_pred_rd_' + str(rd - 1).zfill(3)
                ssl.repeatedFixedExpeimentwithInput(
                    pathPrefix=split_path + 'lb' + str(lb).zfill(3) + '_',
                    newIds=newIds,
                    saveProb=True,
                    savePathPrefix=pred_path + 'lb' + str(lb).zfill(3) + '_' +
                    str(t),
                    inputPredPath=inputPredPath)
            res = ssl.get_mean()
            if res > best_res:
                best_res = res
                best_t = t
            if res > round_best_res:
                round_best_res = res
                round_best_t = t
        print('Round %d\t%.4f\t%s' % (rd, round_best_res, str(round_best_t)))

        # step 2:
        # ensemble the per-path predictions into pseudo-labels for the next round
        for lb in lb_cand:
            results = []
            for r in range(repeats):
                with open('data/local/split/' + scope_name + '/lb' +
                          str(lb).zfill(3) + '_' + str(r).zfill(3) +
                          '_train', 'rb') as f:
                    trainLabel = pk.load(f)
                with open('data/local/split/' + scope_name + '/lb' +
                          str(lb).zfill(3) + '_' + str(r).zfill(3) +
                          '_test', 'rb') as f:
                    testLabel = pk.load(f)

                numTrain = len(trainLabel)
                numTest = len(testLabel)
                n = numTrain + numTest

                # write output probability
                outPred = np.zeros((n, c))
                for t in type_list:
                    typePred = np.zeros((n, c))
                    with open(pred_path + str(t) + '/lb' + str(lb).zfill(3) +
                              '_' + str(r).zfill(3) + '_train', 'rb') as f:
                        trainPred = pk.load(f)
                        for i, k in enumerate(trainLabel.keys()):
                            typePred[k, :] = trainPred[i, :]

                    with open(pred_path + str(t) + '/lb' + str(lb).zfill(3) +
                              '_' + str(r).zfill(3) + '_test', 'rb') as f:
                        testPred = pk.load(f)
                        for i, k in enumerate(testLabel.keys()):
                            # keep only 'confident' test predictions: skip
                            # points whose maximum class probability falls
                            # below this meta-path's threshold
                            max_prob = np.max(testPred[i, :])
                            if max_prob > threshold[str(t)]:
                                typePred[k, :] = testPred[i, :]
                    # add meta-path probability to global probability
                    outPred += typePred * weight[str(t)]

                with open(
                        'data/local/cotrain/' + scope_name + '/lb' +
                        str(lb).zfill(3) + '_pred_rd_' + str(rd).zfill(3) +
                        '_' + str(r).zfill(3), 'wb') as f:
                    pk.dump(outPred, f)
    return best_res
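
The gating in step 2 can be read in isolation as follows. This simplified
sketch (hypothetical names) applies the confidence threshold to every row,
whereas the loop above always keeps training rows and gates only test rows.

import numpy as np

def ensemble_predictions(preds_by_path, thresholds, weights):
    # preds_by_path: {path: (n, c) probability array}. Returns the weighted
    # sum of per-path predictions, zeroing rows that miss the path threshold.
    paths = list(preds_by_path)
    out = np.zeros_like(preds_by_path[paths[0]])
    for t in paths:
        pred = preds_by_path[t]
        confident = pred.max(axis=1) > thresholds[t]   # per-row gate
        out += np.where(confident[:, None], pred, 0.0) * weights[t]
    return out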