예제 #1
0
class MyUser:
    """Bundle a user profile dict with that user's array of item dicts.

    The items in ``twarr`` are presumably tweets (each one is indexed with
    ``tk.created_at``).  Note that the class body itself lists the files under
    the three ``./twitter`` sub-directories once, when the class is created.
    """

    # Class-level lookup table shared by every instance; starts out empty.
    lut = {}

    # Base directories of the on-disk data.
    user_pkl_base = './twitter/pkls'
    user_json_base = './twitter/users'
    user_label_base = './twitter/labels'

    # Directory listings, evaluated a single time while the class body runs.
    user_pkl_files = iu.list_children(
        user_pkl_base, pattern='.pkl', full_path=True)
    user_json_files = iu.list_children(
        user_json_base, pattern='.txt', full_path=True)
    label_files = iu.list_children(
        user_label_base, pattern='.txt', full_path=True)

    def __init__(self, profile, twarr):
        """Keep ``profile`` and ``twarr``; expose ``profile[tk.id_str]`` as ``uid``."""
        self.twarr: List[dict] = twarr
        self.profile: dict = profile
        self.uid: str = profile[tk.id_str]

    def sort_twarr_by_time(self):
        """Rebind ``self.twarr`` to a new list ordered by creation timestamp."""

        def created_timestamp(tw):
            return tmu.timestamp_of_created_at(tw[tk.created_at])

        # ``sorted`` builds a fresh list, so the previous list object is
        # left untouched for anyone still holding a reference to it.
        self.twarr = sorted(self.twarr, key=created_timestamp)

    def get_created_at_list(self):
        """Return the ``created_at`` value of every item, in current order."""
        created = []
        for tw in self.twarr:
            created.append(tw[tk.created_at])
        return created

    def reindex_twarr(self, index_arr: List):
        """Reorder ``self.twarr`` so that position ``k`` holds old item ``index_arr[k]``.

        ``index_arr`` has to be a permutation of ``range(len(self.twarr))``;
        both conditions are enforced with ``assert``.
        """
        size = len(index_arr)
        assert size == len(self.twarr)
        assert set(index_arr) == set(range(size))
        previous = self.twarr
        self.twarr = [previous[pos] for pos in index_arr]
예제 #2
0
 def get_log_path(self):
     """Pick a log directory from the working directory or from ``./logs``.

     Directories whose name matches the pattern below are searched for in
     ``./`` first; ``./logs`` is only consulted when that search is empty.
     With ``self.args.c`` set the choice is delegated to ``iu.choose_from``,
     otherwise ``iu.most_recent`` decides.
     """
     name_pattern = r'^log\d'
     candidates = iu.list_children('./', iu.DIR, name_pattern, full_path=True)
     if len(candidates) == 0:
         # Nothing next to the script: fall back to the ./logs folder.
         candidates = iu.list_children(
             './logs', iu.DIR, name_pattern, full_path=True)
     if self.args.c:
         return iu.choose_from(candidates)
     return iu.most_recent(candidates)
예제 #3
0
def plot_max_c_probs():
    """Run ``per_store_path`` on every ``gid=67`` store of the latest log.

    The log directory is the one ``iu.most_recent`` selects among the
    directories in ``./`` matching ``^log``.  Each store directory inside it
    whose name matches ``gid=67`` is processed independently: a failure is
    printed and the loop moves on to the next store.
    """
    paths = iu.list_children('./', iu.DIR, '^log', True)
    log_path = iu.most_recent(paths)
    store_paths = iu.list_children(log_path,
                                   iu.DIR,
                                   pattern='gid=67',
                                   full_path=True)
    for store_path in store_paths:
        try:
            per_store_path(store_path)
        except Exception as e:
            # Deliberately best-effort: report the problem, keep going.
            print(e)
        finally:
            # Reset even after a failure, so a partially built default graph
            # cannot leak into the processing of the next store.
            tf.reset_default_graph()
예제 #4
0
 def filter_into_temp(self):
     """Extract single-topic articles from the raw files into ``self.temp_file``.

     Every file under ``self.orgn_file`` is parsed with BeautifulSoup and its
     ``reuters`` elements are scanned.  An article is kept only when it has
     exactly one topic child and a ``text`` element with a ``body``.  Title
     and body are tokenized with ``pu.tokenize`` and joined as
     ``"<title>, <body>"``.  The resulting ``(index, topic, text)`` triples are
     turned into a docarr and dumped; a few statistics are printed on the way.
     """
     from bs4 import BeautifulSoup
     files = iu.list_children(self.orgn_file, full_path=True)
     array = list()
     for fidx, file in enumerate(files):
         print(fidx)
         tree = BeautifulSoup(''.join(iu.read_lines(file)), "html.parser")
         for article in tree.find_all("reuters"):
             topics = list(article.topics.children)
             if len(topics) != 1:
                 continue
             # Encode-then-decode drops unencodable characters.  Wrapping the
             # bytes in str() instead would yield the "b'...'" repr on
             # Python 3 and pollute every topic / title / body with it.
             topic = topics[0].text.encode('ascii', 'ignore').decode('ascii')
             text = article.find('text')
             if text is None or text.body is None:
                 continue
             if text.title is not None:
                 title = text.title.text.encode('utf-8', 'ignore').decode('utf-8')
             else:
                 title = ''
             title = ' '.join(pu.tokenize(title, pu.tokenize_pattern))
             body = text.body.text.encode('utf-8', 'ignore').decode('utf-8')
             body = ' '.join(pu.tokenize(body, pu.tokenize_pattern))
             array.append((topic, '{}, {}'.format(title, body)))
     docarr = du.make_docarr(
         [(idx, topic, body) for idx, (topic, body) in enumerate(array)])
     doc_topics = [d.topic for d in docarr]
     print(len(docarr))
     print(Counter(doc_topics))
     # Number of distinct topics.
     print(len(set(doc_topics)))
     du.dump_docarr(self.temp_file, docarr)
예제 #5
0
 def filter_into_temp(self):
     """Flatten the per-file arrays under ``self.orgn_file`` into one docarr.

     Each file is loaded with ``iu.load_array``; the position of the file in
     the listing serves as the topic id of all its items.  Every item yields
     ``(str(item['id']), topic_id, text-without-'#')``, and the collected
     docarr is dumped to ``self.temp_file``.
     """
     source_files = iu.list_children(self.orgn_file, full_path=True)
     # Load everything up front, then build the triples in a single pass.
     twarr_list = [iu.load_array(path) for path in source_files]
     doclist = [
         (str(tw['id']), topic_id, tw['text'].replace('#', ''))
         for topic_id, twarr in enumerate(twarr_list)
         for tw in twarr
     ]
     docarr = du.make_docarr(doclist)
     du.dump_docarr(self.temp_file, docarr)
예제 #6
0
def get_log_path(str_list, make_new: bool):
    """Return the log directory to use, creating a fresh one on request.

    :param str_list: strings joined with ``'+'`` to form the directory suffix
        (only used when ``make_new`` is true).
    :param make_new: when true, create ``./log_<date>_<suffix>`` via
        ``iu.mkdir(..., rm_prev=True)``; otherwise let ``iu.choose_from``
        pick among the directories in ``./`` matching ``'log'``.
    :return: the chosen or newly created path; it is also printed.
    """
    if not make_new:
        existing = iu.list_children('./', iu.DIR, 'log', full_path=True)
        log_path = iu.choose_from(existing)
    else:
        # The first two characters of the formatted date are dropped.
        date_tag = tmu.format_date()[2:]
        suffix = '+'.join(str_list)
        log_path = './log_{}_{}'.format(date_tag, suffix)
        iu.mkdir(log_path, rm_prev=True)
    print('log path:', log_path)
    return log_path
예제 #7
0
def extract_tweets():
    """Fan ``filter_tw_from_files`` out over the input files starting with 'E'.

    The files found in the hard-coded input directory are split with
    ``au.split_multi_process(files, 20)``; each part is handed, together with
    the json and pickle output directories, to ``mu.multi_process``.
    """
    in_path = '/home/cdong/works/uclu/data/userClustering_origin/data'
    out_path_json = '/home/cdong/works/uclu/data/twitter/users/'
    out_path_pkl = '/home/cdong/works/uclu/data/twitter/pkls/'
    source_files = iu.list_children(
        in_path, ctype=iu.FILE, pattern='^E', full_path=True)
    print('total files', len(source_files))
    # One argument tuple per part of the split file list.
    args_list = []
    for part in au.split_multi_process(source_files, 20):
        args_list.append((part, out_path_json, out_path_pkl))
    mu.multi_process(filter_tw_from_files, args_list)
예제 #8
0
def rename_ground_truth():
    """Reformat the ground-truth files and store them under normalized names.

    For every file in the input directory whose name starts with a digit:
    stand-alone single digits in the name are zero-padded, the name is split
    on ``'-'`` into at most four fields, and the third field is moved to the
    front (for illustration, ``1-2-2015-x`` becomes ``2015-01-02-x``).  The
    content returned by ``reformat_ground_truth`` is then dumped under the
    new name in the output directory.
    """
    in_path = '/home/cdong/works/uclu/data/userClustering_origin/groud-truth-clusters'
    out_path = '/home/cdong/works/uclu/data/twitter/labels'
    sources = iu.list_children(
        in_path, ctype=iu.FILE, full_path=True, pattern=r'^\d')
    for src in sources:
        old_name = iu.get_name(src)
        padded = re.sub(r'\b(\d)\b', '0\\1', old_name)
        fields = padded.split('-', maxsplit=3)
        # Rotate the third field to the front; fails if there are fewer
        # than three fields, exactly like a pop(2) on a short list.
        third = fields.pop(2)
        new_name = '-'.join([third] + fields)
        print(old_name, '=>', new_name, '\n')

        formatted = reformat_ground_truth(src)
        iu.dump_array(iu.join(out_path, new_name), formatted)
예제 #9
0
    def main(self):
        """Summarise the score logs found in the selected log directory.

        Steps performed by the reachable part of the method:

        1. Resolve the log directory with ``self.get_log_path()`` and list the
           ``gid*.txt`` files in it (see the pattern below).
        2. For each file, parse the score lines, average the last (up to)
           three entries that carry test metrics, and remember them together
           with the parameters decoded from the file name.
        3. Assemble everything into one table, sort it by the sum of the
           ``t_NDCG`` / ``t_MAP`` / ``t_MRR`` columns and, when
           ``self.args.s`` is set, write ``summary.csv`` into the log dir.
        4. For each ``K.vs`` group, print the rows and their per-``K.dn`` mean
           and write that mean to ``<value>.csv`` in the working directory.

        The method returns ``None`` right after step 4; everything below the
        bare ``return`` is currently unreachable.
        """
        log_path = self.get_log_path()
        print('log path:', log_path)
        log_files = iu.list_children(log_path,
                                     pattern=r'^gid.+\.txt$',
                                     full_path=True)
        # One (params, averaged scores) pair per usable log file.
        best_list = list()
        for file in log_files:
            # Decode the file name into entries, ignoring ``self.exclude``.
            # presumably (key, value) pairs, since dict(entries) is built
            # from them below -- TODO confirm against au.name2entries
            entries = au.name2entries(name=iu.get_name(file),
                                      postfix='.txt',
                                      exclude=self.exclude)
            # Only lines that start with '{' and mention 'v_NDCG' are parsed.
            scores = [
                iu.loads(l) for l in iu.read_lines(file)
                if (l.startswith('{') and 'v_NDCG' in l)
            ]
            # Of those, keep the entries that also contain 't_NDCG'.
            scores_with_test = [s for s in scores if 't_NDCG' in s]
            if len(scores) == 0 or len(scores_with_test) == 0:
                print(au.entries2name(entries), 'lacks test info')
                continue
            # Average over the last (at most) three entries with test info.
            best_scores = scores_with_test[-3:]
            name2score = pd.DataFrame()
            for idx, rvs2scores in enumerate(best_scores):
                # 'brk_cnt' is removed so it does not take part in the mean;
                # assuming a dict, this raises KeyError when the key is absent.
                rvs2scores.pop('brk_cnt')
                for title, value in rvs2scores.items():
                    name2score.loc[idx, title] = value
                # for rvs, score in rvs2scores.items():
                #     for name, value in score.items():
                #         title = '{}_{}'.format(rvs[0], name)
            # Column-wise mean over the selected entries, rounded to 4 digits.
            name2score = name2score.mean(axis=0).round(4)
            # 'ep' records how many score lines the file contained in total.
            name2score['ep'] = len(scores)
            best_list.append((dict(entries), name2score.to_dict()))

        # One table row per log file: parameter columns plus score columns.
        table = pd.DataFrame()
        for i, (name2param, name2score) in enumerate(best_list):
            for k, v in list(name2param.items()) + list(name2score.items()):
                table.loc[i, k] = v
        # NOTE(review): missing cells become the string '-'; if that ever
        # happens in one of the three columns summed below, the addition
        # would mix strings with numbers -- confirm this cannot occur.
        table.fillna('-', inplace=True)
        # Temporary sort key: sum of the three test ('t') metrics.
        temp = 'mmm'
        pre = 't'
        table[temp] = table['%s_NDCG' % pre] + table['%s_MAP' %
                                                     pre] + table['%s_MRR' %
                                                                  pre]
        table = table.sort_values(by=temp)
        # Drop the sort key and the K.lr / K.reg columns from the output.
        table.drop([temp, K.lr, K.reg], axis=1, inplace=True)
        # table = table.query('dpt=="1"')
        if self.args.s:
            table.to_csv(iu.join(log_path, 'summary.csv'))

        # print(table.columns)
        # print(table)
        # group_col = [K.dn, K.mix, K.act, K.dpt]

        # Per K.vs value: show the rows (without the K.ep column) and their
        # mean per K.dn, and write that mean to '<value>.csv' in the cwd.
        for value, df in table.groupby(K.vs):
            df.pop(K.ep)
            print(value)
            print(df)
            mean = df.groupby(K.dn).mean()
            print(mean)
            mean.to_csv('%s.csv' % value)
        return

        # NOTE(review): everything below is unreachable because of the bare
        # ``return`` above; it looks like an alternative summary that was
        # switched off -- confirm whether it should be kept.
        group_col = [K.dn]
        grouped = table.groupby(group_col)
        kv_df_list = list()
        summ = pd.DataFrame()
        import numpy as np
        # NOTE(review): the loop variable ``table`` shadows the outer table.
        for idx, (values, table) in enumerate(grouped):
            # print(list(zip(group_col, values)))
            kv = dict(zip(group_col, values))
            # Mean of the three validation metrics, rounded to 3 digits.
            kv['final'] = np.mean(table['v_NDCG'] + table['v_MAP'] +
                                  table['v_MRR']) / 3
            kv['final'] = kv['final'].round(3)
            kv_df_list.append([kv, table])
            columns = [
                '%s_%s' % (a, b) for a in ['v', 't']
                for b in ['NDCG', 'MAP', 'MRR']
            ]
            s = table[columns].mean(0)
            print(dict(s))
            # print(s.index)
            # print(s[s.index])
            # print(list(s.data))
            # summ.loc[idx, 'data'] = values
            # summ.loc[idx, columns] = list(s.data)
            # NOTE(review): the return value of append is discarded, so
            # ``summ`` presumably stays empty -- verify against the pandas
            # DataFrame.append documentation.
            summ.append(dict(s), ignore_index=True)
            # print(table, '\n')
        print(summ)
예제 #10
0
    #     a = list(_sampler.trick_generate(64, 16))
    # exit()

    # _d = Data20ng()
    # docarr = _d.load_docarr()
    # print(_d.name, np.mean([len(d.tokenids[:500]) for d in docarr]))
    # exit()

    # summary_datasets()
    # exit()

    # to_btm()
    # exit()
    from clu.data.remote_transfer import transfer_files, OUT, Nodes

    _files = iu.list_children('./', pattern='btm', full_path=True)
    print(_files)
    input('continue?')
    transfer_files(_files, _files, OUT, Nodes.alias_gpu)
    exit()
    # for _o in object_list:
    #     tf, topics = _o.get_matrix_topics(using='tf')
    #     iu.dump_pickle('{}_tf.pkl'.format(_d.name), [tf, topics])
    #     tfidf, topics = _o.get_matrix_topics_for_vade()
    #     iu.dump_pickle('{}_tfidf.pkl'.format(_d.name), [tfidf, topics])
    # exit()

    for _d in [Data20ng]:
        # for _d in object_list:
        _d = _d()
        print(_d.name)