Example #1
def dump_feature_id_to_file():
    """
    transform each publication into sets of author and word IDs with their IDF weights, and dump them to disk
    """
    model = EmbeddingModel.Instance()
    author_emb_model = model.load_author_name_emb()
    author_emb_file = "author_emb.array"
    word_emb_model = model.load_word_name_emb()
    word_emb_file = "word_emb.array"
    dump_emb_array(author_emb_model, author_emb_file)
    dump_emb_array(word_emb_model, word_emb_file)

    features = data_utils.load_data('Essential_Embeddings/', "pub.features")
    author_idfs = data_utils.load_data('Essential_Embeddings/global/', 'author_feature_idf.pkl')
    word_idfs = data_utils.load_data('Essential_Embeddings/global/', 'word_feature_idf.pkl')
    index = 0
    feature_dict = {}
    for pub_index in range(len(features)):
        pub_features = features[pub_index]
        if pub_features is None:
            continue
        for author_index in range(len(pub_features)):
            aid, author_features, word_features = pub_features[author_index]
            if index % 100000 == 0:
                print(index, author_features, word_features)
            index += 1
            author_id_list, author_idf_list = get_feature_ids_idfs_for_one_pub(author_features, author_emb_model, author_idfs)
            word_id_list, word_idf_list = get_feature_ids_idfs_for_one_pub(word_features, word_emb_model, word_idfs)

            if author_id_list is not None or word_id_list is not None:
                feature_dict[aid] = (author_id_list, author_idf_list, word_id_list, word_idf_list)
    data_utils.dump_data(feature_dict, 'Essential_Embeddings/emb/', "pub_feature.ids")
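The helper `get_feature_ids_idfs_for_one_pub` used above is not shown on this page. A minimal sketch of what it might do, assuming `emb_model` is a gensim Word2Vec model (gensim >= 4, so `wv.key_to_index` exists) and that the `*_idfs` dicts map feature strings to IDF weights; this is a guess, not the project's actual helper:

# Hypothetical sketch; the real helper is not shown on this page.
def get_feature_ids_idfs_for_one_pub(features, emb_model, idfs, default_idf=1.0):
    """Map raw feature strings to embedding-vocabulary indices plus IDF weights."""
    if not features:
        return None, None
    id_list, idf_list = [], []
    for f in features:
        if f in emb_model.wv:  # keep only features present in the embedding vocabulary
            id_list.append(emb_model.wv.key_to_index[f])
            idf_list.append(idfs.get(f, default_idf))
    if not id_list:
        return None, None
    return id_list, idf_list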
Example #2
    def load_vectors(self, role, fold):
        src_vectors_fname = '{}-titles-doc2vec-{}-{}.pkl'.format('src', role, fold)
        src_vectors = data_utils.load_data(self.vectors_dir, src_vectors_fname)
        dst_vectors_fname = '{}-titles-doc2vec-{}-{}.pkl'.format('dst', role, fold)
        dst_vectors = data_utils.load_data(self.vectors_dir, dst_vectors_fname)
        print('loaded')
        return src_vectors, dst_vectors
Example #3
    def load_batch_triplets(self, f_idx, role='train'):  # load the triplets from batch file number f_idx
        if role == 'train':
            cur_dir = self.train_triplets_dir  # set the directory
        # else:
        #     cur_dir = self.test_triplets_dir  # note: with this branch commented out, cur_dir is undefined for role != 'train'
        X1 = data_utils.load_data(cur_dir, 'anchor_embs_{}_{}.pkl'.format(role, f_idx))  # load the data
        X2 = data_utils.load_data(cur_dir, 'pos_embs_{}_{}.pkl'.format(role, f_idx))
        X3 = data_utils.load_data(cur_dir, 'neg_embs_{}_{}.pkl'.format(role, f_idx))
        return X1, X2, X3  # return the data
Example #4
    def load_batch_triplets(self, f_idx, role='train'):
        if role == 'train':
            cur_dir = self.train_triplets_dir
        else:
            cur_dir = self.test_triplets_dir
        X1 = data_utils.load_data(cur_dir, 'anchor_embs_{}_{}.pkl'.format(role, f_idx))
        X2 = data_utils.load_data(cur_dir, 'pos_embs_{}_{}.pkl'.format(role, f_idx))
        X3 = data_utils.load_data(cur_dir, 'neg_embs_{}_{}.pkl'.format(role, f_idx))
        return X1, X2, X3
Example #5
    def _prepare_data(self):
        # self.author_emb = data_utils.load_data('/root/zhangjing/NA/emb/', "author_emb.array")
        self.author_emb = data_utils.load_data(settings.EMB_DATA_DIR, "author_emb.array")
        print_logging("Loaded author embeddings")
        # self.word_emb = data_utils.load_data('/root/zhangjing/NA/emb/', "title_emb.array")
        self.word_emb = data_utils.load_data(settings.EMB_DATA_DIR, "word_emb.array")
        print_logging("Author emb shape = %s, word emb shape = %s" % (str(self.author_emb.shape), str(self.word_emb.shape)))
        self.author_num = len(self.author_emb)
        print_logging("#author = %d" % self.author_num)
        self.word_num = len(self.word_emb)
        print_logging("#Word = %d" % self.word_num)
Example #6
File: main.py Project: yuzhiw/ALFramework
def load_stats(path):
    if path and os.path.exists(path) and os.path.isfile(path):
        stats_df = data_utils.load_data(path)
    else:
        stats_df = None

    return stats_df
Example #7
def train():
    dataset_path = ""
    sess = tf.InteractiveSession()
    model = net.AlexNet()
    stride = 100
    X_train, y_train = data.load_data(dataset_path,"TRAIN", stride)
    model.train(X_train, y_train, sess)
Example #8
    def _init_model(self):
        # Determine the current round from the stats dataframe.
        self._round = self._stats_df['round'].iloc[
            -1] + 1 if self._stats_df.shape[0] else 0

        if self._round > 0:
            # Restore data structure and continue the active learning process.
            self.data_container = data_utils.load_data(
                os.path.join(os.path.dirname(self.stats_path),
                             'data_container.np'))
        else:
            if self.pre_train:
                # Train from scratch.
                logger.info('Start pre-training.')
                eval_res = self._train_and_eval()
            else:
                logger.info('Start pre-eval.')
                eval_res = self._train_and_eval(train=False)

            self._stats_df = add_stats(
                self._stats_df,
                0,
                num_new_labeled=None,
                num_pseudo_labeled=None,
                num_total_labeled=self.data_container.labeled_data.shape[0],
                eval_res=eval_res,
                pseudo_acc=None,
                **self.al_args)
            self._round = 1
Example #9
def dump_author_embs():
    """
    dump author embedding to lmdb
    author embedding is calculated by weighted-average of word vectors with IDF
    """
    emb_model = EmbeddingModel.Instance()
    idf = data_utils.load_data(settings.GLOBAL_DATA_DIR, 'feature_idf.pkl')
    print('idf loaded')
    LMDB_NAME_FEATURE = 'pub_authors.feature'
    lc_feature = LMDBClient(LMDB_NAME_FEATURE)
    LMDB_NAME_EMB = "author_100.emb.weighted"
    lc_emb = LMDBClient(LMDB_NAME_EMB)
    cnt = 0
    with lc_feature.db.begin() as txn:
        for k in txn.cursor():
            if cnt % 1000 == 0:
                print('cnt', cnt, datetime.now() - start_time)
            cnt += 1
            pid_order = k[0].decode('utf-8')
            # print ("pid_order: ", pid_order)
            features = data_utils.deserialize_embedding(k[1])
            cur_emb = emb_model.project_embedding(features, idf)
            if cur_emb is not None:
                # print ("pid_order: is not none", pid_order)
                lc_emb.set(pid_order, cur_emb)
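Per the docstring, the author embedding is an IDF-weighted average of word vectors. `EmbeddingModel.project_embedding` itself is not shown here; a minimal sketch of that computation, assuming `word_vectors` is a dict-like mapping (or gensim KeyedVectors) and `idf` maps features to weights:

# Hypothetical sketch of an IDF-weighted average; not the actual EmbeddingModel.project_embedding.
import numpy as np

def project_embedding(features, idf, word_vectors, default_idf=1.0):
    vecs, weights = [], []
    for f in features:
        if f in word_vectors:          # skip out-of-vocabulary features
            vecs.append(word_vectors[f])
            weights.append(idf.get(f, default_idf))
    if not vecs:
        return None                    # mirrors the `cur_emb is not None` check above
    weights = np.asarray(weights, dtype=np.float32)
    return np.average(np.stack(vecs), axis=0, weights=weights)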
Example #10
def dump_author_embs():  # dump author embeddings into lmdb; the author embedding is the IDF-weighted average of word vectors
    """
    dump author embedding to lmdb
    author embedding is calculated by weighted-average of word vectors with IDF
    """
    emb_model = EmbeddingModel.Instance()
    idf = data_utils.load_data(
        settings.GLOBAL_DATA_DIR,
        'feature_idf.pkl')  # load the idf values computed by the previous function: {feature: idf}
    print('idf loaded')
    LMDB_NAME_FEATURE = 'pub_authors.feature'  # (pid-j, author_feature)
    lc_feature = LMDBClient(LMDB_NAME_FEATURE)  # connect to the author-feature lmdb
    LMDB_NAME_EMB = "author_100.emb.weighted"  # (pid-j, x^-)
    lc_emb = LMDBClient(LMDB_NAME_EMB)  # connect to the author-embedding lmdb
    cnt = 0
    with lc_feature.db.begin() as txn:
        for k in txn.cursor():  # iterate over the features
            if cnt % 1000 == 0:
                print('cnt', cnt, datetime.now() - start_time)
            cnt += 1
            pid_order = k[0].decode('utf-8')  # decode to get the paper id
            features = data_utils.deserialize_embedding(
                k[1])  # deserialize to get the corresponding author-feature object
            cur_emb = emb_model.project_embedding(
                features, idf)  # compute the IDF-weighted average embedding x^-
            if cur_emb is not None:
                lc_emb.set(
                    pid_order, cur_emb
                )  # store the result (pid-j, x^-) in the author-embedding lmdb author_100.emb.weighted
            else:
                print(pid_order)
Example #11
    def vocab(self):
        train, valid, train_y, valid_y, corpus = load_data(
            './rsc/data/chatbot_korean.csv')
        vocab = Vocabulary(corpus=corpus)
        vocab.build_vocab()

        return vocab
Example #12
    def train(self):
        features = data_utils.load_data('human_labeled_data/', "pub.features")
        index = 0
        author_data = []
        word_data = []
        for pub_index in range(len(features)):
            pub_features = features[pub_index]
            if pub_features is None:
                continue
            for author_index in range(len(pub_features)):
                aid, author_features, word_features = pub_features[author_index]

                if index % 100000 == 0:
                    print(index, author_features, word_features)
                index += 1

                random.shuffle(author_features)
                author_data.append(author_features)
                random.shuffle(word_features)
                word_data.append(word_features)

        self.author_model = Word2Vec(
            author_data, size=settings.EMB_DIM, window=5, min_count=5, workers=20,
        )
        self.author_model.save(join('Essential_Embeddings/emb/', 'author_name.emb'))
        self.word_model = Word2Vec(
            word_data, size=settings.EMB_DIM, window=5, min_count=5, workers=20,
        )
        self.word_model.save(join('Essential_Embeddings/emb/', 'word.emb'))
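Note that `size=` in the Word2Vec calls above is the gensim 3.x keyword; gensim 4.x renamed it. A hedged equivalent for gensim >= 4.0, reusing `author_data` and `settings.EMB_DIM` from the example above:

# Same training call under gensim >= 4.0, where `size` was renamed to `vector_size`.
from gensim.models import Word2Vec

author_model = Word2Vec(
    author_data, vector_size=settings.EMB_DIM, window=5, min_count=5, workers=20,
)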
Example #13
def test_dpf(task='nav01',
             data_path='../data/100s',
             model_path='../models/tmp'):

    # load test data
    test_data = load_data(data_path=data_path, filename=task + '_test')
    noisy_test_data = noisyfy_data(test_data)
    test_batch_iterator = make_batch_iterator(noisy_test_data, seq_len=50)

    # reset tensorflow graph
    tf.reset_default_graph()

    # instantiate method
    hyperparams = get_default_hyperparams()
    method = DPF(**hyperparams['global'])

    with tf.Session() as session:
        # load method and apply to new data
        method.load(session, model_path)
        for i in range(10):
            test_batch = next(test_batch_iterator)
            test_batch_input = remove_state(test_batch,
                                            provide_initial_state=False)
            result = method.predict(session, test_batch_input,
                                    **hyperparams['test'])
Example #14
    def __init__(self, path_to_data, actions, input_n=10, output_n=10, split=0, sample_rate=2, data_mean=0,
                 data_std=0):
        """
        read h36m data to get the dct coefficients.
        :param path_to_data:
        :param actions: actions to read
        :param input_n: past frame length
        :param output_n: future frame length
        :param split: 0 train, 1 test, 2 validation
        :param sample_rate: 2
        :param data_mean: mean of expmap
        :param data_std: standard deviation of expmap
        """

        self.path_to_data = path_to_data
        self.split = split
        subs = np.array([[1, 6, 7, 8, 9], [5], [11]])

        acts = data_utils.define_actions(actions)

        # subs = np.array([[1], [5], [11]])
        # acts = ['walking']

        subjs = subs[split]
        all_seqs, dim_ignore, dim_use, data_mean, data_std = data_utils.load_data(path_to_data, subjs, acts,
                                                                                  sample_rate,
                                                                                  input_n + output_n,
                                                                                  data_mean=data_mean,
                                                                                  data_std=data_std,
                                                                                  input_n=input_n)

        self.data_mean = data_mean
        self.data_std = data_std

        # first 6 elements are global translation and global rotation
        dim_used = dim_use[6:]
        self.all_seqs = all_seqs
        self.dim_used = dim_used

        all_seqs = all_seqs[:, :, dim_used]
        all_seqs = all_seqs.transpose(0, 2, 1)
        all_seqs = all_seqs.reshape(-1, input_n + output_n)
        all_seqs = all_seqs.transpose()


        # padding the observed sequence so that it has the same length as observed + future sequence
        pad_idx = np.repeat([input_n - 1], output_n)
        i_idx = np.append(np.arange(0, input_n), pad_idx)

        input_chebyshev_coef = data_utils.get_chebyshev_coef(input_n + output_n, all_seqs[i_idx, :])
        input_chebyshev_coef = input_chebyshev_coef.transpose().reshape([-1, len(dim_used), input_n + output_n])

        target_chebyshev_coef = data_utils.get_chebyshev_coef(input_n + output_n, all_seqs)
        target_chebyshev_coef = target_chebyshev_coef.transpose().reshape([-1, len(dim_used), input_n + output_n])

        self.input_chebyshev_coef = input_chebyshev_coef
        #self.output_chebyshev_coef = target_chebyshev_coef
        self.output_chebyshev_coef = target_chebyshev_coef-input_chebyshev_coef
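The padding comment above relies on a small indexing trick: the last observed frame is repeated `output_n` times so the conditioning sequence has length `input_n + output_n`. A tiny standalone illustration with made-up sizes:

# Illustration of the padding indices used above, with small made-up sizes.
import numpy as np

input_n, output_n = 3, 2
pad_idx = np.repeat([input_n - 1], output_n)       # array([2, 2])
i_idx = np.append(np.arange(0, input_n), pad_idx)  # array([0, 1, 2, 2, 2])
print(i_idx)  # the last observed frame (index 2) is repeated output_n times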
Example #15
    def __init__(self,
                 path_to_data,
                 actions,
                 input_n=10,
                 output_n=10,
                 dct_n=20,
                 split=0,
                 sample_rate=2,
                 data_mean=0,
                 data_std=0):
        """
        read h36m data to get the dct coefficients.
        :param path_to_data:
        :param actions: actions to read
        :param input_n: past frame length
        :param output_n: future frame length
        :param dct_n: number of dct coeff. used
        :param split: 0 train, 1 test, 2 validation
        :param sample_rate: 2
        :param data_mean: mean of expmap
        :param data_std: standard deviation of expmap
        """

        self.path_to_data = path_to_data
        self.split = split
        subs = np.array([[1, 6, 7, 8, 9], [5], [11]])

        acts = data_utils.define_actions(actions)

        # subs = np.array([[1], [5], [11]])
        # acts = ['walking']

        subjs = subs[split]
        all_seqs, dim_ignore, dim_use, data_mean, data_std = data_utils.load_data(
            path_to_data,
            subjs,
            acts,
            sample_rate,
            1,
            data_mean=data_mean,
            data_std=data_std,
            input_n=1)

        self.data_mean = data_mean
        self.data_std = data_std

        self.max = np.max(all_seqs)
        self.min = np.min(all_seqs)

        # first 6 elements are global translation and global rotation
        dim_used = dim_use[6:]
        self.all_seqs = all_seqs
        self.dim_used = dim_used

        m = self.all_seqs.shape[0]
        n = self.all_seqs.shape[-1]
        self.all_seqs = self.all_seqs.reshape((m, n))
        #print("all seq_shape ", self.all_seqs.shape)
        self.all_seqs = self.all_seqs[:, dim_used]
Example #16
def gen_local_data(idf_threshold, global_vec, basic_net):
    """
    generate local data (including paper features and paper network) for each associated name
    :param idf_threshold: threshold for determining whether there exists an edge between two papers (for this demo we set 29)
    """
    # We should read ASSIGNMENT_JSON here, but because the data provided by the competition organizers is incomplete, we fall back to the clustering structure BASIC_CLUSTER
    # name_to_pubs_test = data_utils.load_json(settings.ASSIGNMENT_JSON)
    name_to_pubs_test = data_utils.load_json(settings.ASSIGNMENT_JSON)
    pid_dict = data_utils.load_data(settings.PID_INDEX)
    lc_inter = LMDBClient(global_vec)
    pos_pairs = data_utils.load_data(basic_net)
    graph_dir = join(settings.DATA_DIR, 'local',
                     'graph-{}'.format(idf_threshold))
    os.makedirs(graph_dir, exist_ok=True)
    for i, name in enumerate(name_to_pubs_test):
        cur_person_dict = name_to_pubs_test[name]
        pids = []
        pids2label = {}

        # generate content
        wf_content = open(join(graph_dir, '{}_pubs_content.txt'.format(name)),
                          'w')
        for aid, items in enumerate(cur_person_dict):
            for pid in items:
                pids2label[pid] = aid
                pids.append(pid)
        shuffle(pids)
        for pid in pids:
            cur_pub_emb = lc_inter.get(pid)
            if cur_pub_emb is not None:
                cur_pub_emb = list(map(str, cur_pub_emb))
                wf_content.write('{}\t'.format(pid))
                wf_content.write('\t'.join(cur_pub_emb))
                wf_content.write('\t{}\n'.format(pids2label[pid]))
        wf_content.close()

        pairs = pos_pairs[name]
        with open(join(graph_dir, '{}_pubs_network.txt'.format(name)),
                  'w') as f:
            pid_index = pid_dict[name]
            for i, j in pairs:
                pid1 = pid_index[i]
                pid2 = pid_index[j]
                if pid1 in pids and pid2 in pids:
                    f.write(pid1 + '\t' + pid2 + '\n')
        print('prepare local data', name, 'done')
Example #17
def plot_models():
    task = 'nav01'
    data_path = '../data/100s'
    test_data = load_data(data_path=data_path, filename=task + '_test')
    noisy_test_data = noisyfy_data(reduce_data(test_data, 10))
    num_examples = 10
    # same seqlen and batchsize needed here!
    # test_batch_iterator = make_batch_iterator(noisy_test_data, seq_len=50, batch_size=50)
    test_batch_iterator = make_batch_iterator(noisy_test_data,
                                              seq_len=50,
                                              batch_size=num_examples)
    batch = next(test_batch_iterator)

    # for i in range(num_examples):
    #     plot_observation(batch, i=0, t=i)

    predictions = dict()

    for variant, file_name in {
            'ind_e2e': '2017-12-23_03:32:47_compute-0-9_nav01_pf_ind_e2e_1000',
            # 'ind_e2e': '2017-12-22_18:30:30_compute-0-1_nav02_pf_ind_e2e_1000',
            # 'lstm': '2017-12-24_13:25:53_compute-0-1_nav01_lstm_1000',
            # 'lstm': '2017-12-22_18:29:21_compute-1-2_nav02_lstm_1000',
            # 'ind': '2017-12-23_00:48:08_compute-0-74_nav01_pf_ind_500',
            # 'e2e': '2017-12-22_18:29:49_compute-0-15_nav01_pf_e2e_500',
    }.items():

        with open('../log/lc/' + file_name, 'rb') as f:
            log = pickle.load(f)
        hyper_params = log['hyper_params'][0]
        model_path = '../models/' + log['exp_params'][0]['model_path'].split(
            '/models/')[-1]  # ['exp_params']['model_path]

        # reset tensorflow graph
        tf.reset_default_graph()

        # instantiate method
        if 'lstm' in variant:
            method = RNN(**hyper_params['global'])
        else:
            method = DPF(**hyper_params['global'])

        with tf.Session() as session:
            # load method and apply to new data
            statistics = method.load(session, model_path)
            # print('predicting now')
            # predictions[variant] = method.predict(session, batch, num_particles=1000, return_particles=False)
            # print('prediction done')
            # plot_measurement_model(session, method, statistics, batch, task, num_examples, variant)
            # plot_proposer(session, method, statistics, batch, task, num_examples, variant)
            # plot_motion_model(session, method, statistics, batch, task, 10, 50, variant)
            plot_particle_filter(session, method, statistics, batch, task,
                                 num_examples, 1000, variant)

    print(predictions.keys())
    # plot_prediction(predictions['ind_e2e'], predictions['lstm'], statistics, batch, task, num_examples, variant)

    plt.pause(10000.0)
Example #18
def gen_local_data(pids,
                   labels,
                   idf_threshold=10
                   ):  # for each author name, generate local data (document features and the document network); the parameter is the threshold, i.e. how much idf-weighted similarity two documents need before an edge is added
    """
    generate local data (including paper features and paper network) for each associated name
    :param idf_threshold: threshold for determining whether there exists an edge between two papers (for this demo we set 29)
    """
    idf = data_utils.load_data(settings.GLOBAL_DATA_DIR,
                               'feature_idf.pkl')  # load the feature idf values {word: idf}
    INTER_LMDB_NAME = 'author_triplets.emb'  # internal author embeddings obtained after triplet training (pid-j, y)
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    LMDB_AUTHOR_FEATURE = "pub_authors.feature"  # raw author features (pid-j, author_feature)
    lc_feature = LMDBClient(LMDB_AUTHOR_FEATURE)
    graph_dir = join(settings.DATA_DIR, 'local',
                     'graph-{}'.format(idf_threshold))  # create the directory where the local model is saved
    os.makedirs(graph_dir, exist_ok=True)

    name = "Name"

    wf_content = open(join(graph_dir, '{}_pubs_content.txt'.format(name)), 'w')
    shuffle(pids)  # shuffle

    for i, pid in enumerate(pids):
        cur_pub_emb = lc_inter.get(pid)  # get the document embedding y
        if cur_pub_emb is not None:
            cur_pub_emb = list(map(str, cur_pub_emb))  # convert cur_pub_emb to a string representation
            wf_content.write('{}\t'.format(pid))  # document id
            wf_content.write('\t'.join(cur_pub_emb))  # embedding y
            wf_content.write('\t{}\n'.format(pid))  # pid
        else:
            print("ERROR: not found embedding y for pid:%s\n" % (pid))

    wf_content.close()  # pid-j, y, aid

    # generate network
    n_pubs = len(pids)
    wf_network = open(join(graph_dir, '{}_pubs_network.txt'.format(name)),
                      'w')  # network file for this author name (pid-j, pid-j)
    edges_num = 0
    for i in range(n_pubs - 1):  # enumerate document i
        author_feature1 = set(lc_feature.get(
            pids[i]))  # raw features of document i (pid-j, author_feature)
        for j in range(i + 1, n_pubs):  # enumerate every later document j
            author_feature2 = set(lc_feature.get(pids[j]))  # raw features of document j
            common_features = author_feature1.intersection(
                author_feature2)  # extract the common features
            idf_sum = 0
            for f in common_features:  # for each common feature f
                idf_sum += idf.get(f, idf_threshold)  # accumulate the idf sum
                # print(f, idf.get(f, idf_threshold))
            if idf_sum >= idf_threshold:  # the sum exceeds the threshold
                wf_network.write('{}\t{}\n'.format(
                    pids[i], pids[j]))  # add an edge and write it to the network file (pid-j, pid-j)
                edges_num = edges_num + 1
    print('n_edges', edges_num)
    wf_network.close()
Example #19
def test(idf_threshold):
    name_to_pubs_test = data_utils.load_json(settings.GLOBAL_DATA_DIR,
                                             'name_to_pubs_test_100.json')
    idf = data_utils.load_data(settings.GLOBAL_DATA_DIR, 'feature_idf.pkl')
    INTER_LMDB_NAME = 'triplete_loss_lc_attention_network_embedding'
    lc_inter = LMDBClient(INTER_LMDB_NAME)
    LMDB_AUTHOR_FEATURE = "pub_authors.feature"
    lc_feature = LMDBClient(LMDB_AUTHOR_FEATURE)
    graph_dir = join(settings.DATA_DIR, 'local',
                     'graph-{}'.format(idf_threshold))
    os.makedirs(graph_dir, exist_ok=True)
    for i, name in enumerate(name_to_pubs_test):
        print(i, name)
        cur_person_dict = name_to_pubs_test[name]
        pids_set = set()
        pids = []
        pids2label = {}

        # 286 hongbin_li_pubs_content.txt
        # generate content
        for i, aid in enumerate(cur_person_dict):
            items = cur_person_dict[aid]
            if len(items) < 5:
                continue
            for pid in items:
                pids2label[pid] = aid
                pids.append(pid)
        shuffle(pids)

        for pid in pids:
            cur_pub_emb = lc_inter.get(pid)
            if cur_pub_emb is not None:
                pids_set.add(pid)

        # generate network1
        all_idf_sum = 0
        pathCnt = 0
        pids_filter = list(pids_set)
        n_pubs = len(pids_filter)
        for i in range(n_pubs - 1):
            author_feature1 = set(lc_feature.get(pids_filter[i]))
            for j in range(i + 1, n_pubs):
                author_feature2 = set(lc_feature.get(pids_filter[j]))
                # print('author_feature2: ', author_feature2)
                common_features = author_feature1.intersection(author_feature2)
                idf_sum = 0
                for f in common_features:
                    idf_sum += idf.get(f, idf_threshold)
                all_idf_sum += idf_sum
                if idf_sum >= idf_threshold:
                    pathCnt = pathCnt + 1

        if name == "kexin_xu":
            print("all_idf_sum: ", all_idf_sum)
            print("pathCnt: ", pathCnt)
Example #20
    def evaluate_precision_at_topK(self):
        fold = 5
        topK = [1, 5, 10]
        acc = np.zeros((fold, len(topK)), dtype=np.double)
        for i in range(fold):
            sorted_indices = data_utils.load_data(self.ranking_dir, 'ranking-indices-{}-{}.pkl'.format('test', i))
            for t, k in enumerate(topK):
                acc[i, t] = eval_utils.prec_at_top(sorted_indices, k)
            print(acc[i])
        print(acc.mean(axis=0))
Example #21
    def _load_pose(self, dataset_path):
        """
        Uses the data_utils to load all of the 3D poses in Human3.6m, in 'world coordinates'. We also keep around the
        meta data for each of the poses, which are tuples (subject, action, sequence_id).

        We flatten all of the poses to be a 2D array with shape (total frames, 96).
        Meta data contains the subject number, action, sequence id and frame index (of the video) to recover all
        necessary information.

        :param dataset_path: The directory for which the dataset is stored.
        :return: train_poses, train_meta, val_poses, val_meta
        """
        # Load 3d data
        rtrain_set = data_utils.load_data(dataset_path, data_utils.TRAIN_SUBJECTS, self.actions, dim=3)
        rval_set = data_utils.load_data(dataset_path, data_utils.TEST_SUBJECTS, self.actions, dim=3)

        # Convert into friendly indexed training and test set
        train_set, train_set_meta = self._flatten_poses_set(rtrain_set)
        val_set, val_set_meta = self._flatten_poses_set(rval_set)

        return train_set, train_set_meta, val_set, val_set_meta
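`_flatten_poses_set` is referenced but not shown in this example. A rough sketch based only on the docstring above (poses keyed by (subject, action, sequence_id), flattened to a (total frames, 96) array plus per-frame meta tuples); the structure of `poses_set` is an assumption:

# Hypothetical helper matching the docstring; not taken from the actual project.
import numpy as np

def _flatten_poses_set(poses_set):
    """Flatten {(subject, action, seq_id): (n_frames, 96) array} into one 2D array plus meta tuples."""
    poses, meta = [], []
    for (subject, action, seq_id), seq in poses_set.items():
        poses.append(seq)
        # One meta tuple per frame so each row of the flattened array can be traced back.
        meta.extend((subject, action, seq_id, frame_idx) for frame_idx in range(len(seq)))
    return np.concatenate(poses, axis=0), meta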
Example #22
    def load_dataset(self, dataset, sparse_features, datapath):
        """Loads citation dataset."""
        dataset = load_data(dataset, datapath)
        adj_true = dataset[0] + sp.eye(dataset[0].shape[0])
        # adj_true to compute link prediction metrics
        self.data['adj_true'] = adj_true.todense()
        if sparse_features:
            self.data['features'] = sparse_to_tuple(dataset[1])
        else:
            self.data['features'] = dataset[1]
        self.data['node_labels'] = dataset[2]
        self.data['train']['node_mask'] = dataset[3]
        self.data['val']['node_mask'] = dataset[4]
        self.data['test']['node_mask'] = dataset[5]
Example #23
def train(args):
    np.random.seed(args.seed)

    logging.getLogger().setLevel(logging.INFO)
    if args.save:
        if not args.save_dir:
            dt = datetime.datetime.now()
            date = f"{dt.year}_{dt.month}_{dt.day}"
            if args.node_cluster == 1:
                task = 'nc'
            else:
                task = 'lp'
            models_dir = os.path.join(os.environ['LOG_DIR'], task, date)
            save_dir = get_dir_name(models_dir)
        else:
            save_dir = args.save_dir
        logging.basicConfig(level=logging.INFO,
                            handlers=[
                                logging.FileHandler(
                                    os.path.join(save_dir, 'log.txt')),
                                logging.StreamHandler()
                            ])
        logging.info(f"Logging model in {save_dir}")
        args.save_dir = save_dir

    if args.node_cluster == 1:
        ### NOTE: node clustering uses the full edge set
        args.val_prop = 0.0
        args.test_prop = 0.0

    import pprint
    args_info_pprint = pprint.pformat(vars(args))

    logging.info(args_info_pprint)

    # Load data
    logging.info("Loading Data : {}".format(args.dataset))
    t_load = time.time()
    data = load_data(args, os.path.join(os.environ['DATAPATH'], args.dataset))
    st0 = np.random.get_state()
    args.np_seed = st0

    t_load = time.time() - t_load
    logging.info(data['info'])
    logging.info('Loading data took time: {:.4f}s'.format(t_load))

    sol = Solver(args, data)
    sol.fit()
    sol.eval()
Example #24
def cal_feature_idf():
    """
    calculate word IDF (Inverse document frequency) using publication data
    """
    features = data_utils.load_data('Essential_Embeddings/', "pub.features")
    feature_dir = join('Essential_Embeddings/', 'global')
    index = 0
    author_counter = dd(int)
    author_cnt = 0
    word_counter = dd(int)
    word_cnt = 0
    none_count = 0
    for pub_index in range(len(features)):
        pub_features = features[pub_index]
        # print(pub_features)
        if pub_features is None:
            none_count += 1
            continue
        for author_index in range(len(pub_features)):
            aid, author_features, word_features = pub_features[author_index]

            if index % 100000 == 0:
                print(index, aid)
            index += 1
            
            for af in author_features:
                author_cnt += 1
                author_counter[af] += 1

            for wf in word_features:
                word_cnt +=1
                word_counter[wf] +=1        

    author_idf = {}
    for k in author_counter:
        author_idf[k] = math.log(author_cnt / author_counter[k])

    word_idf = {}
    for k in word_counter:
        word_idf[k] = math.log(word_cnt / word_counter[k])

    data_utils.dump_data(dict(author_idf), feature_dir, "author_feature_idf.pkl")
    data_utils.dump_data(dict(word_idf), feature_dir, "word_feature_idf.pkl")
    print("None count: ", none_count)
def plot_statistics():
    task = 'nav02'
    data_path = '../data/100s'
    test_data = load_data(data_path=data_path, filename=task + '_test')
    noisy_test_data = noisyfy_data(test_data)
    # noisy_test_data = noisyfy_data(test_data)
    batch_size = 32
    test_batch_iterator = make_batch_iterator(noisy_test_data,
                                              seq_len=2,
                                              batch_size=batch_size)

    filenames = {
        'ind_e2e': '2017-12-22_18:30:30_compute-0-1_nav02_pf_ind_e2e_1000',
        'ind': '2017-12-23_06:56:07_compute-0-26_nav02_pf_ind_1000',
        'e2e': '2017-12-24_00:51:18_compute-1-0_nav02_pf_e2e_1000',
    }

    for variant in ['ind', 'e2e', 'ind_e2e']:
        file_name = filenames[variant]

        with open('../log/lc/' + file_name, 'rb') as f:
            log = pickle.load(f)
        hyper_params = log['hyper_params'][0]
        model_path = '../models/' + log['exp_params'][0]['model_path'].split(
            '/models/')[-1]  # ['exp_params']['model_path]

        # reset tensorflow graph
        tf.reset_default_graph()

        # instantiate method
        method = DPF(**hyper_params['global'])

        with tf.Session() as session:
            # load method and apply to new data
            statistics = method.load(session, model_path)
            plot_measurement_statistics(session, method, statistics,
                                        test_batch_iterator, batch_size,
                                        variant)
            plot_motion_statistics(session, method, statistics,
                                   test_batch_iterator, task, variant)

    plt.pause(10000.0)
Example #26
File: main.py Project: suhridbuddha/ntsa
def main(config):
    if config.mode == "test" or config.mode == "predict":
        mode = config.mode
        if config.restore_path is not None:
            config.__dict__.update(**Logger.load(config.restore_path))
            config.mode = mode

    Model = select_model(config.model)
    logger = Logger(config=config.__dict__.copy())

    df = load_data(config.dataset_path)
    dataset, testset, test_df = build_train_test_datasets(df, config)

    model = Model(dataset.shape, config=config)

    trainer = Trainer(model, path=logger.main_path)
    trainer.init_sess()

    if config.restore_path is not None:
        trainer.restore(config.restore_path)

    if "gan" in config.model:
        runner = AdversarialRunner(steps=config.steps)
    else:
        runner = Runner(steps=config.steps)

    if config.mode == "predict":
        runner.predict(trainer, dataset, logger)
        log.info(f"Prediction saved at {logger.paths['report']}")
    elif config.mode == "train" or config.mode == "test":
        if config.mode == "train":
            try:
                runner.train(trainer, dataset, testset, logger)
            except KeyboardInterrupt:
                trainer.save()
        else:
            df, score = runner.test(trainer, testset, logger)
            log.log(logging.INFO, f"Test metrics: {dict_to_str(score)}")
    else:
        logging.error("Selected Mode does not exist.")
        raise NotImplementedError("Selected mode does not exist.")
Example #27
def gen_test(name_to_pubs_test, k=300, flatten=False):
    pid_dict = data_utils.load_data(settings.PID_INDEX)
    xs, ys = [], []
    names = []
    for name in name_to_pubs_test:
        pid_index = pid_dict[name]
        names.append(name)
        num_clusters = len(name_to_pubs_test[name])
        x = []
        items = list(chain.from_iterable(name_to_pubs_test[name]))
        while len(x) < k:
            p = np.random.choice(items)
            emb = lc.get(p)
            if emb is not None:
                x.append(emb)
        if flatten:
            xs.append(np.sum(x, axis=0))
        else:
            xs.append(np.stack(x))
        ys.append(num_clusters)
    xs = np.stack(xs)
    ys = np.stack(ys)
    return names, xs, ys
Example #28
def train_dpf(task='nav01',
              data_path='../data/100s',
              model_path='../models/tmp',
              plot=False):

    # load training data and add noise
    train_data = load_data(data_path=data_path, filename=task + '_train')
    noisy_train_data = noisyfy_data(train_data)

    # reset tensorflow graph
    tf.reset_default_graph()

    # instantiate method
    hyperparams = get_default_hyperparams()
    method = DPF(**hyperparams['global'])

    with tf.Session() as session:
        # train method and save result in model_path
        method.fit(session,
                   noisy_train_data,
                   model_path,
                   **hyperparams['train'],
                   plot_task=task,
                   plot=plot)
Example #29
def train(args):
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if int(args.double_precision):
        torch.set_default_dtype(torch.float64)
    if int(args.cuda) >= 0:
        torch.cuda.manual_seed(args.seed)
    args.device = 'cuda:' + str(args.cuda) if int(args.cuda) >= 0 else 'cpu'
    args.patience = args.epochs if not args.patience else int(args.patience)
    logging.getLogger().setLevel(logging.INFO)
    if args.save:
        if not args.save_dir:
            dt = datetime.datetime.now()
            date = f"{dt.year}_{dt.month}_{dt.day}"
            models_dir = os.path.join(os.environ['LOG_DIR'], args.task, date)
            save_dir = get_dir_name(models_dir)
        else:
            save_dir = args.save_dir
        logging.basicConfig(level=logging.INFO,
                            handlers=[
                                logging.FileHandler(
                                    os.path.join(save_dir, 'log.txt')),
                                logging.StreamHandler()
                            ])

    logging.info(f'Using: {args.device}')
    logging.info("Using seed {}.".format(args.seed))

    # Load data
    data = load_data(args, os.path.join(os.environ['DATAPATH'], args.dataset))
    args.n_nodes, args.feat_dim = data['features'].shape

    if args.task == 'nc':
        Model = NCModel
        args.n_classes = int(data['labels'].max() + 1)
        logging.info(f'Num classes: {args.n_classes}')
    else:
        args.nb_false_edges = len(data['train_edges_false'])
        args.nb_edges = len(data['train_edges'])
        if args.task == 'lp':
            Model = LPModel

    if not args.lr_reduce_freq:
        args.lr_reduce_freq = args.epochs

    # Model and optimizer
    model = Model(args)
    logging.info(str(model))
    optimizer = getattr(optimizers,
                        args.optimizer)(params=model.parameters(),
                                        lr=args.lr,
                                        weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=int(
                                                       args.lr_reduce_freq),
                                                   gamma=float(args.gamma))
    tot_params = sum([np.prod(p.size()) for p in model.parameters()])
    logging.info(f"Total number of parameters: {tot_params}")
    if args.cuda is not None and int(args.cuda) >= 0:
        os.environ['CUDA_VISIBLE_DEVICES'] = str(args.cuda)
        model = model.to(args.device)
        for x, val in data.items():
            if torch.is_tensor(data[x]):
                data[x] = data[x].to(args.device)

    # Train model
    t_total = time.time()
    counter = 0
    best_val_metrics = model.init_metric_dict()
    best_test_metrics = None
    best_emb = None
    for epoch in range(args.epochs):
        t = time.time()
        model.train()
        optimizer.zero_grad()
        embeddings = model.encode(data['features'], data['adj_train_norm'])
        train_metrics = model.compute_metrics(embeddings, data, 'train', args)
        train_metrics['loss'].backward()

        if args.grad_clip is not None:
            max_norm = float(args.grad_clip)
            all_params = list(model.parameters())
            for param in all_params:
                torch.nn.utils.clip_grad_norm_(param, max_norm)
        optimizer.step()
        lr_scheduler.step()
        if (epoch + 1) % args.log_freq == 0:
            logging.info(" ".join([
                'Epoch: {:04d}'.format(epoch + 1),
                'lr: {}'.format(lr_scheduler.get_lr()[0]),
                format_metrics(train_metrics, 'train'),
                'time: {:.4f}s'.format(time.time() - t)
            ]))
        if (epoch + 1) % args.eval_freq == 0:
            model.eval()
            embeddings = model.encode(data['features'], data['adj_train_norm'])
            val_metrics = model.compute_metrics(embeddings, data, 'val', args)
            if (epoch + 1) % args.log_freq == 0:
                logging.info(" ".join([
                    'Epoch: {:04d}'.format(epoch + 1),
                    format_metrics(val_metrics, 'val')
                ]))
            if model.has_improved(best_val_metrics, val_metrics):
                best_test_metrics = model.compute_metrics(
                    embeddings, data, 'test', args)
                if isinstance(embeddings, tuple):
                    best_emb = torch.cat(
                        (pmath.logmap0(embeddings[0], c=1.0), embeddings[1]),
                        dim=1).cpu()
                else:
                    best_emb = embeddings.cpu()
                if args.save:
                    np.save(os.path.join(save_dir, 'embeddings.npy'),
                            best_emb.detach().numpy())

                best_val_metrics = val_metrics
                counter = 0
            else:
                counter += 1
                if counter == args.patience and epoch > args.min_epochs:
                    logging.info("Early stopping")
                    break

    logging.info("Optimization Finished!")
    logging.info("Total time elapsed: {:.4f}s".format(time.time() - t_total))
    if not best_test_metrics:
        model.eval()
        best_emb = model.encode(data['features'], data['adj_train_norm'])
        best_test_metrics = model.compute_metrics(best_emb, data, 'test', args)
    logging.info(" ".join(
        ["Val set results:",
         format_metrics(best_val_metrics, 'val')]))
    logging.info(" ".join(
        ["Test set results:",
         format_metrics(best_test_metrics, 'test')]))

    if args.save:
        if isinstance(best_emb, tuple):
            best_emb = torch.cat(
                (pmath.logmap0(best_emb[0], c=1.0), best_emb[1]), dim=1).cpu()
        else:
            best_emb = best_emb.cpu()
        np.save(os.path.join(save_dir, 'embeddings.npy'),
                best_emb.detach().numpy())
        if hasattr(model.encoder, 'att_adj'):
            filename = os.path.join(save_dir, args.dataset + '_att_adj.p')
            pickle.dump(model.encoder.att_adj.cpu().to_dense(),
                        open(filename, 'wb'))
            print('Dumped attention adj: ' + filename)

        json.dump(vars(args), open(os.path.join(save_dir, 'config.json'), 'w'))
        torch.save(model.state_dict(), os.path.join(save_dir, 'model.pth'))
        logging.info(f"Saved model in {save_dir}")
Example #30
import numpy as np
import matplotlib.pyplot as plt
import pylab

from utils.data_utils import gen_noise, load_data


try:
    data_load = np.load("../data.npy", allow_pickle=True)
except FileNotFoundError:
    load_data()
    data_load = np.load("../data.npy", allow_pickle=True)

intensity = data_load[835][0]
labels = data_load[835][1]
input_size = len(intensity)
num_labels = len(labels)


def to_matrix(vector):
    pixels = np.zeros((35, 35))
    for i in range(35):
        pixels[i][0:34] = vector[i * 34: (i + 1) * 34]

    pixels = np.delete(pixels, 0, 0)
    pixels = np.delete(pixels, -1, axis=1)
    return pixels


signal_shot, signal_temp = gen_noise(
    signal=data_load[0][0],