def sensitive_3():
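    # Sensitivity test: compute the baseline weighted-HITS ranking of the top-50
    # CS schools, then for each school rebuild the graph, remove one significant
    # edge for it, re-rank, and record how many positions the school moves.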
    top_50 = []
    f = open("../data/univ_top_50_cs.txt", "r")
    for line in f:
        line = line.strip().lower()
        top_50.append(line)
    f.close()

    fo = open(
        "../result/result_top50_cs_newdata_apr09/sensitivity/all/sensitivity_diff_hits_weighted-inedge1.csv",
        "w")
    node_list, edge_list = dp.read_data("../data/data_top50_cs_apr09.csv",
                                        self_edge=False)
    G = dp.construct_graph(node_list, edge_list)
    hits = algo.weighted_HITS(G, max_iterations=100, min_delta=0.00001)
    result = sorted(hits.iteritems(), key=lambda asd: asd[1], reverse=True)
    G.clear()

    rank = []
    for e in result:
        if e[0] in top_50:
            rank.append(e[0])

    original_r = []
    for e in result:
        if e[0] in top_50:
            original_r.append([e[0]])

    for k in range(len(original_r)):
        #         if not original_r[k][0] == "mit":
        node_list, edge_list = dp.read_data("../data/data_top50_cs_apr09.csv",
                                            self_edge=False)
        G = dp.construct_graph(node_list, edge_list)
        G = remove_significant_edge(
            G, original_r[k][0],
            rank=rank)  ### remove a significant edge for <node>
        hits = algo.weighted_HITS(G, max_iterations=100, min_delta=0.00001)
        result = sorted(hits.iteritems(), key=lambda asd: asd[1], reverse=True)
        #result = sorted(hits.iteritems(), key = lambda asd:asd[1], reverse = True)
        G.clear()
        res1 = []
        for e in result:
            if e[0] in top_50:
                res1.append(e[0])
        kr = 0
        for i in range(len(res1)):
            if res1[i] == original_r[k][0]:
                kr = i
        original_r[k].append(k - kr)
    print original_r
    fo.write("univ,diff+mit1\n")
    for r in original_r:
        for i in range(len(r)):
            if i == 0:
                fo.write(str(r[i]))
            else:
                fo.write("," + str(r[i]))
        fo.write("\n")
    fo.close()
def sensitive_add_edge(filename1, filename2, outputfilename, type = "hits_weighted", add_node = "mit"):
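    # Same sensitivity test as sensitive_3, but instead of removing an edge it
    # adds one edge from `add_node` (default "mit") to each top-50 school and
    # re-ranks with the algorithm selected by `type`.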
    top_50 = []
    f = open(filename2,"r")
    for line in f:
        line = line.strip().lower()
        top_50.append(line)
    f.close()
    
    fo = open(outputfilename,"w")
    node_list, edge_list = dp.read_data(filename1, filename2, self_edge = False, extended = True)
    G = dp.construct_graph(node_list, edge_list)
    #r = algo.weighted_HITS(G, max_iterations = 100, min_delta = 0.00001)
    r = choose_algorithm(G, type = type)
    result = sorted(r.iteritems(), key = lambda asd:asd[1], reverse = True)
    G.clear()
    
    rank = []
    for e in result:
        if e[0] in top_50:
            rank.append(e[0])

    original_r = []
    for e in result:
        if e[0] in top_50:
            original_r.append([e[0]])

    for k in range(len(original_r)):
#         if not original_r[k][0] == "mit":
            node_list, edge_list = dp.read_data(filename1, filename2, self_edge = False, extended = True)
            G = dp.construct_graph(node_list, edge_list)
            G = add_non_existing_edges(G, original_r[k][0], add_node, weight = 1) ### add one edge from <add_node> to <node>
            #r = algo.weighted_HITS(G, max_iterations = 100, min_delta = 0.00001)
            r = choose_algorithm(G, type = type)
            result = sorted(r.iteritems(), key = lambda asd:asd[1], reverse = True)
            #result = sorted(hits.iteritems(), key = lambda asd:asd[1], reverse = True)
            G.clear()
            res1 = []
            for e in result:
                if e[0] in top_50:
                    res1.append(e[0])
            kr = 0
            for i in range(len(res1)):
                if res1[i] == original_r[k][0]:
                    kr = i
            original_r[k].append(k-kr)
    print original_r
    fo.write("univ,diff+%s1\n" %(add_node))
    for r in original_r:
        for i in range(len(r)):
            if i == 0:
                fo.write(str(r[i]))
            else:
                fo.write(","+str(r[i]))
        fo.write("\n")
    fo.close()
Example #4
def main():
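    # Embed each 268-node connectivity matrix with a skip-gram model trained on
    # random-walk sequences over the average training matrix, then predict age
    # from the flattened embeddings with coordinate-descent lasso.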
    dimension = 32
    X, y = data_processing.read_data('Data/conmat_240.mat', 'Data/age_240.mat')

    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8)

    # average matrix over train data
    avg_matrix = X_train.mean(axis=0)

    # generate random walks
    walk = random_walk(avg_matrix, steps=1000)
    seq = np.zeros((len(walk), 268))
    for i, pos in enumerate(walk):
        seq[i, :] = avg_matrix[pos]
    print(seq.shape)

    skipgram = Skip_Gram(268, dimension, 2, 0.1)
    skipgram.train_from_feature_seq(seq, epochs=200)

    embedded_train_matrix = np.zeros((len(X_train), 268 * dimension))
    for i in range(len(X_train)):
        embedding_train = skipgram.encode(X_train[i])
        embedded_train_matrix[i] = np.ndarray.flatten(embedding_train)

    embedded_test_matrix = np.zeros((len(X_test), 268 * dimension))
    for i in range(len(X_test)):
        embedding_test = skipgram.encode(X_test[i])
        embedded_test_matrix[i] = np.ndarray.flatten(embedding_test)

    lasso = Lasso(100, .01)

    lasso.train_coordinate_descent(embedded_train_matrix, y_train)

    predicted = lasso.predict(embedded_test_matrix)
    print(mean_squared_error(y_test, predicted))
Example #5
def main():
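    # Text-classification pipeline: clean and tokenize the training CSV, pad or
    # trim every token sequence to MAX_SEQUENCE_LENGTH, then fit an RBF-kernel
    # SVM and report test accuracy.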
    TRAIN_CSV_PATH = "./data/train.csv"
    MAX_SEQUENCE_LENGTH = 20

    data = read_data(TRAIN_CSV_PATH)

    clean(data)
    word_to_ix = tokenizer(data)
    label_to_ix = one_hot_encoding(data)

    def trim_zero_padding(x):
        arr = x[:MAX_SEQUENCE_LENGTH]
        arr = arr + [0] * (MAX_SEQUENCE_LENGTH - len(arr))
        return arr

    data['text_token'] = data.loc[:, 'text_token'].apply(trim_zero_padding)
    X = list(data['text_token'])
    y = list(data['label_one_hot'])

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=1,
                                                        stratify=y)

    svm = SVC(kernel="rbf", random_state=1, gamma=0.2, C=1.0)
    svm.fit(X_train, y_train)

    y_pred = svm.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(acc)
Example #6
def main():
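    # Compare two autoencoders on the connectivity matrices: a fully-connected
    # (TimeDistributed Dense) encoder/decoder and a Transformer-based one, each
    # trained with MSE and visualized with generate_embedding_vis.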
    X, y = data_processing.read_data('maps_conmat.mat', 'maps_age.mat')
    Xm = X.mean(axis = 0)

    EMBEDDING_DIM = 8
    ACTIVATION = 'tanh'
    HEADS = 16

    #Fully-Connected AutoEncoder
    e_x = tf.keras.layers.Input((None, X.shape[-1]))
    e_o = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(EMBEDDING_DIM, activation=ACTIVATION))(e_x)
    e = tf.keras.Model(e_x, e_o)

    d_x = tf.keras.layers.Input((None, EMBEDDING_DIM))
    d_o = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(X.shape[-1], activation='linear'))(d_x)
    d = tf.keras.Model(d_x, d_o)

    model = AutoEncoder(e, d)
    model.train(X, epochs = 50, learning_rate = 0.001, loss = 'mse')
    generate_embedding_vis(Xm, model.encode(Xm), embedding_name='Neural Autoencoder')

    #Transformer AutoEncoder
    et_x = tf.keras.layers.Input((X.shape[1], X.shape[2]))
    et_o = Transformer(EMBEDDING_DIM, heads=HEADS, activation=ACTIVATION)(et_x)
    et = tf.keras.Model(et_x, et_o)

    dt_x = tf.keras.layers.Input((X.shape[1], EMBEDDING_DIM))
    dt_o = Transformer(X.shape[2], heads=HEADS, activation='linear')(dt_x)
    dt = tf.keras.Model(dt_x, dt_o)

    modelt = AutoEncoder(et, dt)
    modelt.train(X, epochs = 100, learning_rate = 0.001, loss = 'mse')
    generate_embedding_vis(Xm, modelt.encode(Xm), embedding_name='Neural Transformer')
Example #7
def main():
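    # Train a dense autoencoder plus skip-gram and CBOW models (the latter two
    # on random-walk sequences over the average matrix) and plot their distance
    # structure next to the original distances in a 2x2 grid.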
    X, y = data_processing.read_data('maps_conmat.mat', 'maps_age.mat')
    #X = data_processing.adjacency_matrix(X)

    avg_matrix = X.mean(axis = 0)
    print(avg_matrix.shape)

    model = AutoEncoder(X.shape[-1], 64, activation = 'relu')
    model.train(X, epochs = 200, learning_rate = 0.001, loss = 'mse')
    #generate_embedding_vis(avg_matrix, model.encode(avg_matrix), embedding_name='Neural Autoencoder')

    walk = random_walk(avg_matrix, steps = 1000)
    seq = np.zeros((len(walk), 268))
    for i, pos in enumerate(walk):
        seq[i, :] = avg_matrix[pos]
    print(seq.shape)

    skipgram = Skip_Gram(268, 64, 2, 0.1)
    skipgram.train_from_feature_seq(seq, epochs = 200)
    #generate_embedding_vis(avg_matrix, skipgram.encode(avg_matrix), embedding_name='SkipGram')

    cbow = CBOW(268, 64, 2, 0.1)
    cbow.train_from_feature_seq(seq, epochs = 200)
    #generate_embedding_vis(avg_matrix, cbow.encode(avg_matrix), embedding_name='CBOW')

    distances = [[avg_matrix, model.encode(avg_matrix)], [skipgram.encode(avg_matrix), cbow.encode(avg_matrix)]]
    names = [['Original Distances', 'Autoencoder Distances'], ['SkipGram Distances', 'CBOW Distances']]
    generate_embedding_vis_array(distances, names)
def run_li():
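    # Sentiment-classification demo: load positive/negative document vectors,
    # train an SGD logistic-regression classifier with an L1 penalty, print the
    # test accuracy, and plot the ROC curve.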
    # Read in the data
    # pos_file_path = '/Users/li/Kunyan/MyRepository/DeepNaturalLanguageProcessing/DeepNLP/data/test3.txt'
    # neg_file_path = '/Users/li/Kunyan/MyRepository/DeepNaturalLanguageProcessing/DeepNLP/data/test2.txt'

    pos_file_path = globe.file_pos
    neg_file_path = globe.file_neg

    tmp = data_processing.read_data(pos_file_path, neg_file_path)
    res = data_processing.data_split(tmp[0], tmp[1])
    train_vecs = res[0]
    test_vecs = res[1]
    label_train = res[2]
    label_test = res[3]

    # Train the classifier
    lr = SGDClassifier(loss='log', penalty='l1')
    lr.fit(train_vecs, label_train)

    print('Test Accuracy: %.2f' % lr.score(test_vecs, label_test))

    pred_probas = lr.predict_proba(test_vecs)[:, 1]

    fpr, tpr, _ = roc_curve(label_test, pred_probas)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='area = %.2f' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.legend(loc='lower right')

    plt.show()
def word2vec_test():
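    # Train a word2vec model on the cleaned positive/negative corpus and print
    # the words most similar to "批次".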
    # Read in the data
    pos_file_path = globe.file_pos
    neg_file_path = globe.file_neg

    tmp = data_processing.read_data(pos_file_path, neg_file_path)
    res = data_processing.data_split(tmp[0], tmp[1])
    x_train = res[0]
    x_train = data_processing.text_clean(x_train)

    for i in x_train:
        for j in i:
            print j,
    n_dim = 200
    min_count = 2

    # model = gensim.models.Word2Vec(x_train, min_count=0, size=200, workers=4)

    model = word2vec_model(x_train, n_dim, min_count)

    # res = w2c_model.most_similar(positive=['纤维', '批次'], negative=['成分'], topn=1)
    #
    # w2c_model.doesnt_match("我 爱 中国".split())
    #
    # var = w2c_model.similarity('纤维', '批次')
    # print var
    # res = w2c_model.most_similar("纤维")
    # for i in res:
    #     print i[0],

    dd = model.most_similar("批次")
    for i in dd:
        print i[0],
Example #10
def main():
    X, y = data_processing.read_data('maps_conmat.mat', 'maps_age.mat')
    Xm = X.mean(axis = 0)

    factorization = MatrixFactorization(Xm, 2)
    factorization.fit(200, 0.00001)

    generate_embedding_vis(Xm, factorization.factor, embedding_name="Matrix Factorization")
Example #11
def test(args):
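    # Evaluation entry point: load the test utterances and labels (from cached
    # variables or the entangled_*.json file), build the data loader and the
    # ensemble model, and run SupervisedTrainer.test on the given checkpoint.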
    if args.load_var:
        test_utterances, test_labels, word_dict = read_data(
            load_var=args.load_var, input_=None, mode='test')
    else:
        test_utterances, test_labels, word_dict = read_data(load_var=args.load_var, \
                input_=os.path.join(constant.data_path, "entangled_{}.json".format(args.mode)), mode='test')

    if args.save_input:
        utils.save_or_read_input(os.path.join(constant.save_input_path, "{}_utterances.pk".format(args.mode)), \
                                    rw='w', input_obj=test_utterances)
        utils.save_or_read_input(os.path.join(constant.save_input_path, "{}_labels.pk".format(args.mode)), \
                                    rw='w', input_obj=test_labels)

    current_time = re.findall('.*model_(.+?)/.*', args.model_path)[0]
    step_cnt = re.findall('.step_(.+?)\.pkl', args.model_path)[0]

    test_dataloader = TrainDataLoader(test_utterances,
                                      test_labels,
                                      word_dict,
                                      name='test',
                                      batch_size=4)

    ensemble_model = EnsembleModel(word_dict,
                                   word_emb=None,
                                   bidirectional=False)
    if torch.cuda.is_available():
        ensemble_model.cuda()

    supervised_trainer = SupervisedTrainer(args,
                                           ensemble_model,
                                           current_time=current_time)

    supervised_trainer.test(test_dataloader,
                            args.model_path,
                            step_cnt=step_cnt)
Example #12
def main():
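    # Train Skip-Gram and CBOW embeddings from a random walk over the mean
    # connectivity matrix and visualize each embedding.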
    X, y = data_processing.read_data('maps_conmat.mat', 'maps_age.mat')
    Xm = X.mean(axis=0)

    walk = random_walk(Xm, steps=1000)
    one_hot = np.zeros((len(walk), 268))
    for i, pos in enumerate(walk):
        one_hot[i, :] = Xm[pos]

    #Skip-Gram
    model = Skip_Gram(268, 64, 2, 0.1)
    model.train_from_feature_seq(one_hot, epochs=200)
    generate_embedding_vis(Xm, model.encode(Xm), embedding_name="Skip-Gram")

    #CBOW
    model = CBOW(268, 64, 2, 0.1)
    model.train_from_feature_seq(one_hot, epochs=200)
    generate_embedding_vis(Xm, model.encode(Xm), embedding_name="CBOW")
Example #13
def main():
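    # Compare matrix factorization of the mean connectivity matrix with tensor
    # factorization of the full data, visualizing both embeddings.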
    X, y = data_processing.read_data('Data/conmat_240.mat', 'Data/age_240.mat')
    Xm = X.mean(axis=0)

    factorization = MatrixFactorization(Xm, 8)
    factorization.fit(200, 0.0001)

    #generate_embedding_vis(Xm, factorization.factor, embedding_name="Matrix Factorization")
    generate_embedding_vis(X,
                           factorization.encode(X),
                           embedding_name="Matrix Factorization")

    factorization = TensorFactorization(X, 8)
    factorization.fit(50)

    #generate_embedding_vis(Xm, factorization.matrix_factor, embedding_name="Tensor Factorization")
    generate_embedding_vis(X,
                           factorization.encode(X),
                           embedding_name='Tensor Factorization')
Example #14
def main():
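    # Graph-neural-network regression: build identity node features and
    # connectivity-based edge features, standardize the target, train graph_nn
    # with an MSE loss, and report test MSE and correlation.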
    X, y = data_processing.read_data('Data/conmat_240.mat',
                                     'Data/task_240.mat',
                                     target_variable='mean_rxn')
    #X, y = data_processing.read_data('Data/conmat_240.mat', 'Data/age_240.mat', target_variable='age')
    # target_variable: 'mean_rxn' for the task data, 'age' for the age data

    indices = ~np.isnan(X).any(axis=(1, 2))
    X, y = X[indices], y[indices]

    permutation = np.random.permutation(len(X))
    X, y = X[permutation], y[permutation]

    #y = (y - y.min()) / (y.max() - y.min())
    y = (y - y.mean()) / y.std()

    node_features = np.eye(268)[np.newaxis, ...]
    node_features = np.repeat(node_features, len(X), axis=0)

    edge_features = X + node_features
    edge_features = edge_features[:, np.newaxis, ...]

    model = graph_nn(268)

    X_train, y_train = [node_features[:200], edge_features[:200]], y[:200]
    X_test, y_test = [node_features[200:], edge_features[200:]], y[200:]

    model.compile(loss='mse',
                  optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5))
    model.fit(X_train,
              y_train,
              validation_data=(X_test, y_test),
              epochs=4000,
              batch_size=16)

    predictions = model.predict(X_test)

    print(predictions)
    print(y_test)

    print('MSE:', ((predictions - y_test)**2).mean())
    print('Corr:', np.corrcoef(predictions[:, 0], y_test[:, 0])[0, 1])
def _data_read(pos_file_path, neg_file_path, w2c_model_path):
    """read data and word2vec model from file path,
    Args:
        pos_file_path: Positive file path.
        neg_file_path: Negative file path.
        w2c_model_path: word2vec model path
    Returns:
        A list contains train and test data with labels.
    Raises:
        IOError: An error occurred accessing the bigtable.Table object.
    """

    tmp = data_processing.read_data(pos_file_path, neg_file_path)
    res = data_processing.data_split(tmp[0], tmp[1])
    (train_data, test_data, train_labels, test_labels) = (res[0], res[1],
                                                          res[2], res[3])

    # print train_labels[0]
    train_data = data_processing.text_clean(train_data)
    test_data = data_processing.text_clean(test_data)

    # Dimension of the word vectors
    n_dim = globe.n_dim
    doc_vecs = []
    try:
        # load word2vec model from model path
        word2vec_model = Word2Vec.load(w2c_model_path)

        doc_vecs = word2vec_gensim_train.text_vecs(train_data, test_data,
                                                   n_dim, word2vec_model)
    except IOError:
        pass

    # Generated document vectors
    train_data_vecs = doc_vecs[0]
    # print train_data_vecs.shape
    test_data_vecs = doc_vecs[1]
    # print test_data_vecs.shape

    return train_data_vecs, train_labels, test_data_vecs, test_labels
Example #16
def add_pro_feature(data, flag='train'):
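    # Target-mean encoding: replace each categorical column (click hour,
    # positionID, connectionType, telecomsOperator, creativeID) with the mean
    # label of that category; for the test split the means come from train.csv.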
    train_data = None
    if flag == 'train':
        data = data.drop(['conversionTime', 'userID'], axis=1)

        data['clickHour'] = pd.Series([str(x)[2:4] for x in data.clickTime])
        hourDict = data.groupby(['clickHour'])['label'].mean()
        data['clickTimePro'] = 0.0
        for i in hourDict.index:
            data.loc[data.clickHour == i, 'clickTimePro'] = hourDict[i]
        data = data.drop(['clickHour', 'clickTime'], axis=1)
        print('clickTime to clickTimePro finished!')
        # data['conversionHour']=pd.Series([str(x)[2:4] for x in data.conversionTime if pd.isnull(x)==True])
        # hourDict=data.groupby(['conversionHour'])['label'].mean()
        # data['conversionTimePro']=0.0
        # for i in hourDict.index:
        # 	data.loc[data.conversionHour==i,'conversionTimePro']=hourDict[i]
        # data=data.drop(['conversionHour','conversionTime'],axis=1)

        positionIDDict = data.groupby(['positionID'])['label'].mean()
        data['positionIDPro'] = 0.0
        for i in positionIDDict.index:
            data.loc[data.positionID == i, 'positionIDPro'] = positionIDDict[i]
        data = data.drop(['positionID'], axis=1)
        print('positionID to positionIDPro finished!')

        connectionTypeDict = data.groupby(['connectionType'])['label'].mean()
        data['connectionTypePro'] = 0.0
        for i in connectionTypeDict.index:
            data.loc[data.connectionType == i,
                     'connectionTypePro'] = connectionTypeDict[i]
        data = data.drop(['connectionType'], axis=1)
        print('connectionType to connectionTypePro finished!')

        telecomsOperatorDict = data.groupby(['telecomsOperator'
                                             ])['label'].mean()
        data['telecomsOperatorPro'] = 0.0
        for i in telecomsOperatorDict.index:
            data.loc[data.telecomsOperator == i,
                     'telecomsOperatorPro'] = telecomsOperatorDict[i]
        data = data.drop(['telecomsOperator'], axis=1)
        print('telecomsOperator to telecomsOperatorPro finished!')

        creativeIDDict = data.groupby(['creativeID'])['label'].mean()
        data['creativeIDPro'] = 0.0
        for i in creativeIDDict.index:
            data.loc[data.creativeID == i, 'creativeIDPro'] = creativeIDDict[i]
        data = data.drop(['creativeID'], axis=1)
        print('creativeID to creativeIDPro finished!')

    # userIDDict=data.groupby(['userID'])['label'].mean()
    # data['userIDPro']=0.0
    # for i in userIDDict.index:
    # 	data.loc[data.userID==i,'userIDPro']=userIDDict[i]
    # data=data.drop(['userID'],axis=1)
    else:
        train_data = dp.read_data('train.csv')
        data = data.drop(['userID'], axis=1)

        train_data['clickHour'] = pd.Series(
            [str(x)[2:4] for x in train_data.clickTime])
        data['clickHour'] = pd.Series([str(x)[2:4] for x in data.clickTime])
        hourDict = train_data.groupby(['clickHour'])['label'].mean()
        data['clickTimePro'] = 0.0
        for i in hourDict.index:
            data.loc[data.clickHour == i, 'clickTimePro'] = hourDict[i]
        data = data.drop(['clickHour', 'clickTime'], axis=1)
        print('clickTime to clickTimePro finished!')
        # data['conversionHour']=pd.Series([str(x)[2:4] for x in data.conversionTime if pd.isnull(x)==True])
        # hourDict=data.groupby(['conversionHour'])['label'].mean()
        # data['conversionTimePro']=0.0
        # for i in hourDict.index:
        # 	data.loc[data.conversionHour==i,'conversionTimePro']=hourDict[i]
        # data=data.drop(['conversionHour','conversionTime'],axis=1)

        positionIDDict = train_data.groupby(['positionID'])['label'].mean()
        data['positionIDPro'] = 0.0
        for i in positionIDDict.index:
            data.loc[data.positionID == i, 'positionIDPro'] = positionIDDict[i]
        data = data.drop(['positionID'], axis=1)
        print('positionID to positionIDPro finished!')

        connectionTypeDict = train_data.groupby(['connectionType'
                                                 ])['label'].mean()
        data['connectionTypePro'] = 0.0
        for i in connectionTypeDict.index:
            data.loc[data.connectionType == i,
                     'connectionTypePro'] = connectionTypeDict[i]
        data = data.drop(['connectionType'], axis=1)
        print('connectionType to connectionTypePro finished!')

        telecomsOperatorDict = train_data.groupby(['telecomsOperator'
                                                   ])['label'].mean()
        data['telecomsOperatorPro'] = 0.0
        for i in telecomsOperatorDict.index:
            data.loc[data.telecomsOperator == i,
                     'telecomsOperatorPro'] = telecomsOperatorDict[i]
        data = data.drop(['telecomsOperator'], axis=1)
        print('telecomsOperator to telecomsOperatorPro finished!')

        creativeIDDict = train_data.groupby(['creativeID'])['label'].mean()
        data['creativeIDPro'] = 0.0
        for i in creativeIDDict.index:
            data.loc[data.creativeID == i, 'creativeIDPro'] = creativeIDDict[i]
        data = data.drop(['creativeID'], axis=1)
        print('creativeID to creativeIDPro finished!')

    # userIDDict=data.groupby(['userID'])['label'].mean()
    # data['userIDPro']=0.0
    # for i in userIDDict.index:
    # 	data.loc[data.userID==i,'userIDPro']=userIDDict[i]
    # data=data.drop(['userID'],axis=1)

    data.to_csv('new_' + flag + '.csv')  # train takes ~13 min; test ~1 min
    return data


if __name__ == "__main__":
    word2vec_test()
    pos_file_path = globe.file_pos
    neg_file_path = globe.file_neg
    tmp = data_processing.read_data(pos_file_path, neg_file_path)
    res = data_processing.data_split(tmp[0], tmp[1])
    x_train = res[0]
    x_train = data_processing.text_clean(x_train)

    n_dim = 200
    min_count = 2
    model_path = globe.model_path
    mymodel = word2vec_model(x_train, n_dim, min_count)
    mymodel.save(model_path)
Example #18
def main():
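    # Benchmark six embedding methods (dense and Transformer autoencoders,
    # matrix and tensor factorization, Skip-Gram, CBOW) by comparing the
    # pairwise-distance matrices of their encodings with the original distances.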
    X, y = data_processing.read_data('Data/conmat_240.mat', 'Data/age_240.mat')
    Xm = X.mean(axis=0)

    EMBEDDING_DIM = 16

    #Fully-Connected AutoEncoder
    e_x = tf.keras.layers.Input((None, X.shape[-1]))
    e_o = tf.keras.layers.TimeDistributed(
        tf.keras.layers.Dense(EMBEDDING_DIM, activation='tanh'))(e_x)
    e = tf.keras.Model(e_x, e_o)

    d_x = tf.keras.layers.Input((None, EMBEDDING_DIM))
    d_o = tf.keras.layers.TimeDistributed(
        tf.keras.layers.Dense(X.shape[-1], activation='linear'))(d_x)
    d = tf.keras.Model(d_x, d_o)

    ae_model = AutoEncoder(e, d)
    ae_model.train(X, epochs=50, learning_rate=0.001, loss='mse')

    #Transformer AutoEncoder
    et_x = tf.keras.layers.Input((X.shape[1], X.shape[2]))
    et_o = Transformer(EMBEDDING_DIM, heads=8, activation='tanh')(et_x)
    et = tf.keras.Model(et_x, et_o)

    dt_x = tf.keras.layers.Input((X.shape[1], EMBEDDING_DIM))
    dt_o = Transformer(X.shape[2], heads=8, activation='linear')(dt_x)
    dt = tf.keras.Model(dt_x, dt_o)

    ae_modelt = AutoEncoder(et, dt)
    ae_modelt.train(X, epochs=100, learning_rate=0.001, loss='mse')

    #Matrix Factorization
    mat_factorization = MatrixFactorization(Xm, EMBEDDING_DIM)
    mat_factorization.fit(200, 0.0001)

    #Tensor Factorization
    tens_factorization = TensorFactorization(X, EMBEDDING_DIM)
    tens_factorization.fit(50)

    walk = random_walk(Xm, steps=1000)
    one_hot = np.zeros((len(walk), 268))
    for i, pos in enumerate(walk):
        one_hot[i, :] = Xm[pos]

    #Skip-Gram
    skipgram = Skip_Gram(268, EMBEDDING_DIM, 3, 0.1)
    skipgram.train_from_feature_seq(one_hot, epochs=200)

    #CBOW
    cbow = CBOW(268, EMBEDDING_DIM, 3, 0.1)
    cbow.train_from_feature_seq(one_hot, epochs=200)

    og_distances = calculate_distance_matrix(X.reshape((len(X), -1)))

    models = {
        'AutoEncoder': ae_model,
        'Transformer': ae_modelt,
        'Matrix Factorization': mat_factorization,
        'Tensor Factorization': tens_factorization,
        'Skip-Gram': skipgram,
        'CBOW': cbow
    }

    model_distances = {}

    for key, mod in models.items():
        x_embed = mod.encode(X)
        model_distances[key] = calculate_distance_matrix(
            x_embed.reshape((len(x_embed), -1)))

    #plot distances
    plt.matshow(og_distances, cmap='Blues', vmin=0)
    plt.title('Original Distances')
    plt.savefig('images/og_distance_matrix.png')

    fig, axes = plt.subplots(2, 3)
    i = 0
    for embedding_name, embedding_distances in model_distances.items():
        r, c = i // 3, i % 3
        axes[r, c].matshow(embedding_distances, cmap='Blues', vmin=0)
        axes[r, c].set_title(embedding_name)
        i += 1
    fig.savefig('images/embedding_distances_matrix.png')
Example #19
def gradcam_on_dataset(data_conf,
                       model_path,
                       layer_name,
                       custom_objects=None,
                       cache_dir=None,
                       images_dir=None,
                       vectorized_dir=None,
                       output_dir=None,
                       predict_two_output=True):
    """
    Applies GradCAM to a set of images.

    :param data_conf: dict with the featurized image directories
        ('data_dir_luad', 'data_dir_lusc') and the slide list CSV ('csv_path').
    :param model_path: path to trained model.
    :param layer_name: name of convolutional layer used to compute GradCAM.
    :param custom_objects: used to load the model.
    :param cache_dir: folder to store compressed images temporarily.
    :param images_dir: folder with the whole-slide (.mrxs) images used for crops.
    :param vectorized_dir: folder with the vectorized image-shape arrays.
    :param output_dir: destination folder; defaults to <model dir>/gradcam.
    :param predict_two_output: if True, compute GradCAM for both output units.
    :return: nothing
    """

    # Featurized directories
    data_dir_luad = data_conf['data_dir_luad']
    data_dir_lusc = data_conf['data_dir_lusc']
    csv_test = data_conf['csv_path']

    # Output dir
    output_dir = join(dirname(model_path),
                      'gradcam') if output_dir is None else output_dir
    if not exists(output_dir):
        os.makedirs(output_dir)

    print('GradCAM in directory: {d} with content {c}'.format(
        d=output_dir, c=os.system("ls " + output_dir)),
          flush=True)

    # List features
    data_config = {
        'data_dir_luad': data_dir_luad,
        'data_dir_lusc': data_dir_lusc,
        'csv_path': csv_test
    }
    image_ids, paths, dm_paths, labels, features_ids = read_data(
        data_config)  #, custom_augmentations=[('none', 0)])

    # Load model and gradient function
    K.set_learning_phase(
        0
    )  # required to avoid bug "You must feed a value for placeholder tensor 'batch_normalization_1/keras_learning_phase' with dtype bool"
    model = keras.models.load_model(model_path, custom_objects=custom_objects)
    gradient_function_0 = grad_cam_fn(model, 0, layer_name)
    if predict_two_output:
        gradient_function_1 = grad_cam_fn(model, 1, layer_name)
    else:
        gradient_function_1 = None

    # Analyze features
    for i, (image_id, path, dm_path, label, features_id,
            batch_id) in enumerate(
                zip(image_ids, paths, dm_paths, labels, features_ids,
                    batch_ids)):

        try:
            print('Computing GradCAM on {filename} ... {i}/{n}'.format(
                filename=features_id, i=i + 1, n=len(image_ids)),
                  flush=True)

            output_npy_path0, output_png_path0 = gradcam_on_features(
                features_path=cache_file(path, cache_dir, overwrite=False),
                distance_map_path=cache_file(dm_path,
                                             cache_dir,
                                             overwrite=False),
                gradient_function=gradient_function_0,
                output_npy_path=join(
                    output_dir,
                    features_id + '_{unit}_{preds}_gradcam.npy'.format(
                        unit=0, preds='{preds:0.3f}')),
                output_png_path=join(
                    output_dir,
                    features_id + '_{unit}_{preds}_gradcam.png'.format(
                        unit=0, preds='{preds:0.3f}')),
            )

            if predict_two_output:
                output_npy_path1, output_png_path1 = gradcam_on_features(
                    features_path=cache_file(path, cache_dir, overwrite=False),
                    distance_map_path=cache_file(dm_path,
                                                 cache_dir,
                                                 overwrite=False),
                    gradient_function=gradient_function_1,
                    output_npy_path=join(
                        output_dir,
                        features_id + '_{unit}_{preds}_gradcam.npy'.format(
                            unit=1, preds='{preds:0.3f}')),
                    output_png_path=join(
                        output_dir,
                        features_id + '_{unit}_{preds}_gradcam.png'.format(
                            unit=1, preds='{preds:0.3f}')),
                )

            if (images_dir is not None) and (vectorized_dir is not None):
                image_crop_from_wsi(
                    wsi_path=join(images_dir, batch_id, image_id + '.mrxs'),
                    vectorized_im_shape_path=join(vectorized_dir,
                                                  image_id + '_im_shape.npy'),
                    distance_map_path=cache_file(dm_path,
                                                 cache_dir,
                                                 overwrite=False),
                    output_npy_path=join(output_dir,
                                         features_id + '_image.npy'),
                    output_png_path=join(output_dir,
                                         features_id + '_image.png'),
                    crop_size=400)

                overlay_gradcam_heatmap(
                    gradcam_npy_path=output_npy_path0,
                    image_npy_path=join(output_dir,
                                        features_id + '_image.npy'),
                    output_png_path=join(
                        output_dir,
                        features_id + '_{unit}_heatmap.png'.format(unit=0)))

                if predict_two_output:
                    overlay_gradcam_heatmap(
                        gradcam_npy_path=output_npy_path1,
                        image_npy_path=join(output_dir,
                                            features_id + '_image.npy'),
                        output_png_path=join(
                            output_dir, features_id +
                            '_{unit}_heatmap.png'.format(unit=1)))

                    overlay_gradcam_heatmap_bicolor(
                        gradcam_npy_path1=output_npy_path0,
                        gradcam_npy_path2=output_npy_path1,
                        image_npy_path=join(output_dir,
                                            features_id + '_image.npy'),
                        output_png_path=join(output_dir, features_id +
                                             '_both_heatmap.png'))
        except Exception as e:
            print('Failed to compute GradCAM on {f}. Exception: {e}'.format(
                f=path, e=e),
                  flush=True)
Example #21
def train(args):
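    # Training entry point: load train/dev utterances, build the GloVe-based
    # word embedding matrix, set up data loaders and logging, optionally build
    # a bidirectional teacher model ('T'/'TS' modes), and run SupervisedTrainer.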
    utils.make_all_dirs(current_time)
    if args.load_var:
        all_utterances, labels, word_dict = read_data(load_var=args.load_var,
                                                      input_=None,
                                                      mode='train')
        dev_utterances, dev_labels, _ = read_data(load_var=args.load_var,
                                                  input_=None,
                                                  mode='dev')
    else:
        all_utterances, labels, word_dict = read_data(load_var=args.load_var, \
                input_=os.path.join(constant.data_path, "entangled_train.json"), mode='train')
        dev_utterances, dev_labels, _ = read_data(load_var=args.load_var, \
                input_=os.path.join(constant.data_path, "entangled_dev.json"), mode='dev')

    word_emb = build_embedding_matrix(word_dict, glove_loc=args.glove_loc, \
                    emb_loc=os.path.join(constant.save_input_path, "word_emb.pk"), load_emb=False)

    if args.save_input:
        utils.save_or_read_input(os.path.join(constant.save_input_path, "train_utterances.pk"), \
                                    rw='w', input_obj=all_utterances)
        utils.save_or_read_input(os.path.join(constant.save_input_path, "train_labels.pk"), \
                                    rw='w', input_obj=labels)
        utils.save_or_read_input(os.path.join(constant.save_input_path, "word_dict.pk"), \
                                    rw='w', input_obj=word_dict)
        utils.save_or_read_input(os.path.join(constant.save_input_path, "word_emb.pk"), \
                                    rw='w', input_obj=word_emb)
        utils.save_or_read_input(os.path.join(constant.save_input_path, "dev_utterances.pk"), \
                                    rw='w', input_obj=dev_utterances)
        utils.save_or_read_input(os.path.join(constant.save_input_path, "dev_labels.pk"), \
                                    rw='w', input_obj=dev_labels)

    train_dataloader = TrainDataLoader(all_utterances, labels, word_dict)
    if args.add_noise:
        noise_train_dataloader = TrainDataLoader(all_utterances,
                                                 labels,
                                                 word_dict,
                                                 add_noise=True)
    else:
        noise_train_dataloader = None
    dev_dataloader = TrainDataLoader(dev_utterances,
                                     dev_labels,
                                     word_dict,
                                     name='dev')

    logger_name = os.path.join(constant.log_path,
                               "{}.txt".format(current_time))
    LOG_FORMAT = '%(asctime)s %(name)-12s %(levelname)-8s %(message)s'
    logging.basicConfig(format=LOG_FORMAT,
                        level=logging.INFO,
                        filename=logger_name,
                        filemode='w')
    logger = logging.getLogger()
    global log_head
    log_head = log_head + "Training Model: {}; ".format(args.model)
    if args.add_noise:
        log_head += "Add Noise: True; "
    logger.info(log_head)

    if args.model == 'T':
        ensemble_model_bidirectional = EnsembleModel(word_dict,
                                                     word_emb=word_emb,
                                                     bidirectional=True)
    elif args.model == 'TS':
        ensemble_model_bidirectional = EnsembleModel(word_dict,
                                                     word_emb=None,
                                                     bidirectional=True)
    else:
        ensemble_model_bidirectional = None
    if args.model == 'TS':
        ensemble_model_bidirectional.load_state_dict(
            torch.load(args.model_path))
    ensemble_model = EnsembleModel(word_dict,
                                   word_emb=word_emb,
                                   bidirectional=False)

    if torch.cuda.is_available():
        ensemble_model.cuda()
        if args.model == 'T' or args.model == 'TS':
            ensemble_model_bidirectional.cuda()

    supervised_trainer = SupervisedTrainer(args, ensemble_model, teacher_model=ensemble_model_bidirectional, \
                                                logger=logger, current_time=current_time)

    supervised_trainer.train(train_dataloader, noise_train_dataloader,
                             dev_dataloader)
Example #22
def main():
    X, y = data_processing.read_data('maps_conmat.mat', 'maps_age.mat')
    #X = data_processing.adjacency_matrix(X)

    print(random_walk(X[0], steps=1000))
Example #23
def main():
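    # Reads the EE top-50 data, prints the node and edge lists, and exits early;
    # the code below exit(0) is currently unreachable but would rank the schools
    # by in-degree, two weighted-PageRank variants, weighted HITS, and
    # hub-averaged HITS, writing each ranking to a CSV.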
    
#     bucket = {}
#     f = open("../result/result_top50_cs_newdata_apr09/year_statistical_from1995_to2015.csv","r")
#     f.readline()
#     for line in f:
#         lines = line.split(",")
#         try:
#             bucket.update({lines[0].strip() : int(lines[2].strip())})
#         except:
#             pass
#     f.close()
#     
#     node_list, edge_list = dp.read_data_in_range("../data/data_may28_new/data_top50_ee.csv", 
#                                                  "../data/data_may28_new/top50_ee_2015.txt",
#                                                  start_year = 1992, end_year = 2013, self_edge = True)
    
    node_list, edge_list = dp.read_data("../data/data_may28_new/data_top50_ee.csv", 
                                        "../data/data_may28_new/top50_ee_2015.txt", 
                                        self_edge = False, extended = False)
    print len(node_list), node_list
    print len(edge_list), edge_list
    
    exit(0)
    
    G = dp.construct_graph(node_list, edge_list)
    
    top_50 = []
    f = open("../data/data_may28_new/top50_ee_2015.txt","r")
    for line in f:
        line = line.strip().lower()
        top_50.append(line)
    f.close()
    
    print len(G.edges())
    print len(G.nodes())

    nodes = dp.rank_univ(G, t = "in_degree")
    f = open("../result/result_may28/ee/comparison/ee_1951-1991_indegree.csv","w")
    for node in nodes:
        if node[0] in top_50:
            f.write("%s;%d\n" %(node[0], node[1]))
    f.close()

    weighted_pagerank = algo.weighted_PR_wnorm(G, damping_factor = 0.85, max_iterations = 100, min_delta = 0.00001)
    result = sorted(weighted_pagerank.iteritems(), key = lambda asd:asd[1], reverse = True)
    f = open("../result/result_may28/ee/comparison/ee_1992-2013_weightedPR_w_norm.csv","w")
    for r in result:
        if r[0] in top_50:
            f.write("%s;%.5f\n" %(r[0], r[1]))
    f.close()
    
    weighted_pagerank = algo.weighted_PR_wonorm(G, damping_factor = 0.85, max_iterations = 100, min_delta = 0.00001)
    s = sum(weighted_pagerank.values())
    for rank in weighted_pagerank:
        weighted_pagerank[rank] = weighted_pagerank[rank]*50.0/s
    result = sorted(weighted_pagerank.iteritems(), key = lambda asd:asd[1], reverse = True)
    f = open("../result/result_may28/ee/comparison/ee_1992-2013_weightedPR_wo_norm.csv","w")
    for r in result:
        if r[0] in top_50:
            f.write("%s;%.5f\n" %(r[0], r[1]))
    f.close()
#    
#     hits = algo.HITS(G, max_iterations = 100, min_delta = 0.00001)
#     result = sorted(hits.iteritems(), key = lambda asd:asd[1], reverse = True)
#     f = open("../result/result_may28/me/extendedGwselfedges/cs_hits.csv","w")
#     for r in result:
#         if r[0] in top_50:
#             f.write("%s;%.5f\n" %(r[0], r[1]))
#     f.close()
       
    hits = algo.weighted_HITS(G, max_iterations = 100, min_delta = 0.00001)
    result = sorted(hits.iteritems(), key = lambda asd:asd[1], reverse = True)
    f = open("../result/result_may28/ee/comparison/ee_1992-2013_hits_weighted.csv","w")
    for r in result:
        if r[0] in top_50:
            f.write("%s;%.5f\n" %(r[0], r[1]))
    f.close()
    
    hubavg = algo.hubavg_HITS(G, max_iterations = 100, min_delta = 0.00001)
    result = sorted(hubavg.iteritems(), key = lambda asd:asd[1], reverse = True)
    f = open("../result/result_may28/ee/comparison/ee_1992-2013_hits_hubavg.csv","w")
    for r in result:
        if r[0] in top_50:
            f.write("%s;%.5f\n" %(r[0], r[1]))
    f.close()

#     salsa = algo.SALSA(G)
#     result = sorted(salsa.iteritems(), key = lambda asd:asd[1], reverse = True)
#     f = open("../result/result_top50_cs_newdata_apr09/result_top50_cs/univ_top50_cs_from2000_salsa.csv","w")
#     for r in result:
#         f.write("%s;%.5f\n" %(r[0], r[1]))
#     f.close()
#       
#     salsa = algo.modified_SALSA(G)
#     result = sorted(salsa.iteritems(), key = lambda asd:asd[1], reverse = True)
#     f = open("../result/result_top50_cs_extended/entire/univ_top40_me_from1946_to1990_salsa_modified.csv","w")
#     for r in result:
#         if r[0] in top_50:
#             f.write("%s;%.5f\n" %(r[0], r[1]))
#     f.close()
#  
#     credit = algo.CreditPropagation(G, original_rank = hits, cr = 0.8, max_iterations = 10000, min_delta = 0.00001)
#     result = sorted(credit.iteritems(), key = lambda asd:asd[1], reverse = True)
#     f = open("../result/result_top50_cs_newdata_apr09/result_top50_cs_subtracted_woselfedge/univ_top50_cs_wo_selfedges_CreditProp_hits.csv","w")
#     for r in result:
#         if r[0] in top_50:
#             f.write("%s;%.5f\n" %(r[0], r[1]))
#     f.close()


    """ new experiments on authavg and weightedHITS_normalized @ May 13th """
Example #24
def main():
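    # Sweep autoencoder embedding dimensions, encode the train/test connectivity
    # matrices at each dimension, fit Ridge/SVR/MLP regressors on the flattened
    # embeddings, and plot train and test MSE per dimension.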
    # dimensions to test
    DIMENSIONS = [64, 32, 16, 8, 4, 2, 1]

    X, y = data_processing.read_data('Data/conmat_240.mat', 'Data/age_240.mat')

    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8)

    # train embeddings for each dimension
    encoders = list()
    for dimension in DIMENSIONS:

        print(str(dimension) + "-D Embedding Training")

        e_x = tf.keras.layers.Input((None, 268))
        e_o = tf.keras.layers.TimeDistributed(
            tf.keras.layers.Dense(dimension, activation='tanh'))(e_x)
        e = tf.keras.Model(e_x, e_o)

        d_x = tf.keras.layers.Input((None, dimension))
        d_o = tf.keras.layers.TimeDistributed(
            tf.keras.layers.Dense(268, activation='linear'))(d_x)
        d = tf.keras.Model(d_x, d_o)

        model = AutoEncoder(e, d)
        model.train(X_train, epochs=100, learning_rate=0.001, loss='mse')

        encoders.append((model, dimension))

    # encode train and test data using embeddings, then flatten for prediction
    embedded_train_list = list()
    embedded_test_list = list()
    for model, dim in encoders:
        embedded_train_matrix = np.zeros((len(X_train), 268 * dim))
        for i in range(len(X_train)):
            embedding_train = model.encode(X_train[i])
            embedded_train_matrix[i] = np.ndarray.flatten(embedding_train)
        embedded_train_list.append(embedded_train_matrix)
        embedded_test_matrix = np.zeros((len(X_test), 268 * dim))
        for i in range(len(X_test)):
            embedding_test = model.encode(X_test[i])
            embedded_test_matrix[i] = np.ndarray.flatten(embedding_test)
        embedded_test_list.append(embedded_test_matrix)

    # train prediction models on encoded train data, then test on encoded test data and calculate Mean Squared Error
    lr_error_list = list()
    svr_error_list = list()
    mlp_error_list = list()
    lr_error_list_train = list()
    svr_error_list_train = list()
    mlp_error_list_train = list()
    for i in range(len(embedded_train_list)):
        #savemat(f'Data/neural_{DIMENSIONS[i]}.mat', {'train':embedded_train_list[i] ,'test':embedded_test_list[i]})
        lr = Ridge(alpha=2).fit(embedded_train_list[i], y_train)
        svr = SVR().fit(embedded_train_list[i], np.reshape(y_train, -1))
        mlp = MLPRegressor(hidden_layer_sizes=(64, 32, 16, 8),
                           learning_rate_init=0.001,
                           max_iter=1000).fit(embedded_train_list[i],
                                              np.reshape(y_train, -1))
        predictedLR = lr.predict(embedded_train_list[i])
        predictedSV = svr.predict(embedded_train_list[i])
        predictedMLP = mlp.predict(embedded_train_list[i])
        lr_error = mean_squared_error(predictedLR, y_train)
        svr_error = mean_squared_error(predictedSV, y_train)
        mlp_error = mean_squared_error(predictedMLP, y_train)
        lr_error_list_train.append(lr_error)
        svr_error_list_train.append(svr_error)
        mlp_error_list_train.append(mlp_error)
        predictedLR = lr.predict(embedded_test_list[i])
        predictedSV = svr.predict(embedded_test_list[i])
        predictedMLP = mlp.predict(embedded_test_list[i])
        print(str(embedded_test_list[i].shape[-1] // 268) + "-D Predicted")
        lr_error = mean_squared_error(predictedLR, y_test)
        svr_error = mean_squared_error(predictedSV, y_test)
        mlp_error = mean_squared_error(predictedMLP, y_test)
        lr_error_list.append(lr_error)
        svr_error_list.append(svr_error)
        mlp_error_list.append(mlp_error)

    # plot MSE for different embedding dims and prediction methods
    width = 0.35
    plt.bar(np.arange(len(lr_error_list_train)),
            lr_error_list_train,
            width,
            label="LinReg")
    plt.bar(np.arange(len(svr_error_list_train)) + width,
            svr_error_list_train,
            width,
            label="SVR")
    plt.bar(np.arange(len(mlp_error_list_train)) + 2 * width,
            mlp_error_list_train,
            width,
            label="MLP")
    plt.ylabel("MSE")
    plt.xlabel("Dimensions")
    plt.title("Autoencoder Mean Squared Error by Embedding Dimension - Train")
    plt.xticks(np.arange(len(svr_error_list)) + width, list(DIMENSIONS))
    plt.legend(loc="best")
    plt.savefig('images/autoencoder_train')
    plt.show()

    width = 0.35
    plt.bar(np.arange(len(lr_error_list)),
            lr_error_list,
            width,
            label="LinReg")
    plt.bar(np.arange(len(svr_error_list)) + width,
            svr_error_list,
            width,
            label="SVR")
    plt.bar(np.arange(len(mlp_error_list)) + 2 * width,
            mlp_error_list,
            width,
            label="MLP")
    plt.ylabel("MSE")
    plt.xlabel("Dimensions")
    plt.title("Autoencoder Mean Squared Error by Embedding Dimension - test")
    plt.xticks(np.arange(len(svr_error_list)) + width, list(DIMENSIONS))
    plt.legend(loc="best")
    plt.savefig('images/autoencoder_test')
    plt.show()
def main():
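    # Same dimension sweep but with CBOW embeddings trained from random-walk
    # sequences (the variables are named skipgram, yet the model is CBOW),
    # comparing Ridge/SVR/MLP test MSE per dimension.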
    # dimensions to test
    DIMENSIONS = [64, 32, 16, 8, 4, 2]

    X, y = data_processing.read_data('Data/conmat_240.mat', 'Data/age_240.mat')

    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.8)

    # average matrix over train data
    avg_matrix = X_train.mean(axis=0)

    # generate random walks
    walk = random_walk(avg_matrix, steps=1000)
    seq = np.zeros((len(walk), 268))
    for i, pos in enumerate(walk):
        seq[i, :] = avg_matrix[pos]
    print(seq.shape)

    # train embeddings for each dimension
    skipgrams = list()
    for dimension in DIMENSIONS:

        print(str(dimension) + "-D Embedding Training")
        skipgram = CBOW(268, dimension, 2, 0.1)
        skipgram.train_from_feature_seq(seq, epochs=300)

        skipgrams.append((skipgram, dimension))

    # encode train and test data using embeddings, then flatten for prediction
    embedded_train_list = list()
    embedded_test_list = list()
    for skipgram in skipgrams:
        embedded_train_matrix = np.zeros((len(X_train), 268 * skipgram[1]))
        for i in range(len(X_train)):
            embedding_train = skipgram[0].encode(X_train[i])
            embedded_train_matrix[i] = np.ndarray.flatten(embedding_train)
        embedded_train_list.append(embedded_train_matrix)
        embedded_test_matrix = np.zeros((len(X_test), 268 * skipgram[1]))
        for i in range(len(X_test)):
            embedding_test = skipgram[0].encode(X_test[i])
            embedded_test_matrix[i] = np.ndarray.flatten(embedding_test)
        embedded_test_list.append(embedded_test_matrix)

    # train prediction models on encoded train data, then test on encoded test data and calculate Mean Squared Error
    lr_error_list = list()
    svr_error_list = list()
    mlp_error_list = list()
    for i in range(len(embedded_train_list)):
        #savemat(f'Data/cbow_{DIMENSIONS[i]}.mat', {'train':embedded_train_list[i] ,'test':embedded_test_list[i]})
        lr = Ridge().fit(embedded_train_list[i], y_train)
        svr = SVR().fit(embedded_train_list[i], np.reshape(y_train, -1))
        mlp = MLPRegressor(hidden_layer_sizes=(100,)).fit(embedded_train_list[i], np.reshape(y_train, -1))
        print(mlp.loss_)
        predictedLR = lr.predict(embedded_test_list[i])
        predictedSV = svr.predict(embedded_test_list[i])
        predictedMLP = mlp.predict(embedded_test_list[i])
        print(str(embedded_test_list[i].shape[-1] // 268) + "-D Predicted")
        lr_error = mean_squared_error(predictedLR, y_test)
        svr_error = mean_squared_error(predictedSV, y_test)
        mlp_error = mean_squared_error(predictedMLP, y_test)
        lr_error_list.append(lr_error)
        svr_error_list.append(svr_error)
        mlp_error_list.append(mlp_error)

    # plot MSE for different embedding dims and prediction methods
    width = 0.35
    plt.bar(np.arange(len(lr_error_list)), lr_error_list, width, label="LinReg")
    plt.bar(np.arange(len(svr_error_list)) + width, svr_error_list, width, label="SVR")
    plt.bar(np.arange(len(mlp_error_list)) + 2 * width, mlp_error_list, width, label="MLP")
    plt.ylabel("MSE")
    plt.xlabel("Dimensions")
    plt.title("SkipGram Mean Squared Error by Embedding Dimension")
    plt.xticks(np.arange(len(svr_error_list)) + width, list(DIMENSIONS))
    plt.legend(loc="best")
    plt.show()
Example #26
"""
@description: this piece of code exclusively examines how the cr parameter impacts the result of credit propagation

@author: Bolun
"""
import data_processing as dp
import algorithms as algo
import networkx as nx
import ranking_evaluation as reval

list1 = []
f = open("../data/univ_top_50_cs.csv","r")
for line in f:
    list1.append(line.strip())
f.close()

node_list, edge_list = dp.read_data("../data/data_top50_cs.csv")
G = dp.construct_graph(node_list, edge_list)

# orank = algo.weighted_PR_wonorm(G, damping_factor = 0.85, max_iterations = 100, min_delta = 0.00001)
# s = sum(orank.values()) 
# for rank in orank:
#     orank[rank] = orank[rank]*50.0/s
# result = sorted(orank.iteritems(), key = lambda asd:asd[1], reverse = True)

orank = algo.HITS(G, max_iterations = 100, min_delta = 0.00001)
result = sorted(orank.iteritems(), key = lambda asd:asd[1], reverse = True)
print result

f = open("../result/result_top50_cs/CreditPropagation_hits_evaluation.csv","w")
f.write("cr;dist\n")
i = 0.0
Example #28
def main():
    X, y = data_processing.read_data('Data/conmat_240.mat', 'Data/age_240.mat')
    #X = data_processing.adjacency_matrix(X)

    print(random_walk(X[0], steps=1000))