示例#1
0
def analyze_glda_result():
    """Print NMI and ARI for every GaussianLDA assignment file found on disk.

    Walks every child directory of the hard-coded GaussianLDA root that
    matches ``^output``, then every parameter sub-directory inside it, and
    scores each ``table_assignments`` file found there.

    * The dataset name is the text following ``output_`` in the directory
      name; it is looked up in ``name2object`` to obtain the reference topic
      list (presumably one label per document -- confirm against
      ``get_topics``).
    * An assignment file is scored only if it has exactly as many lines as
      the topic list; other files are skipped silently.
    * The predicted cluster of a document is the most frequent integer on
      its line (the first one seen wins ties).

    Everything is reported with ``print``; nothing is returned.

    Raises:
        KeyError: if a dataset name is not present in ``name2object``.
        IndexError: if an assignment line contains no integers.
    """
    from collections import Counter
    import utils.array_utils as au
    import re

    glda_base = '/home/cdong/works/research/clu/baselinee/GaussianLDA'

    def fn(f):
        # Last component of a '/'-separated path.
        return f[f.rfind('/') + 1:]

    for out_dir in iu.list_children(glda_base,
                                    ctype=iu.DIR,
                                    pattern='^output',
                                    full_path=True):
        matched = re.search('output_(.+)$', out_dir)
        if matched is None:
            # A directory matched by '^output' may lack the '_<dataset>'
            # suffix; there is no dataset to score it against.
            continue
        dname = matched.group(1)
        print(dname)
        topic_list = name2object[dname].get_topics()

        print(out_dir)
        for param in iu.list_children(out_dir, ctype=iu.DIR, full_path=True):
            print(fn(param))
            for assign in iu.list_children(param,
                                           pattern='table_assignments',
                                           full_path=True):
                lines = iu.read_lines(assign)
                if len(lines) != len(topic_list):
                    continue
                print(fn(assign), len(lines))
                # Sample line for eyeballing the format; short files have none.
                if len(lines) > 100:
                    print(lines[100])
                cluster_list = [
                    Counter(map(int, line.split())).most_common(1)[0][0]
                    for line in lines
                ]
                print(au.score(topic_list, cluster_list, 'nmi'))
                print(au.score(topic_list, cluster_list, 'ari'))
def test(test_file, model_file):
    """Print the AUC of a saved model on the first 20 lines of a labelled file.

    Each non-blank line of ``test_file`` must look like ``<label> <text>``
    (label and text separated by the first space).  Labels are mapped to
    numeric values through ``label2value`` before scoring.

    Args:
        test_file: path of the labelled text file.
        model_file: path handed to ``ftu.load_model``.

    Raises:
        ValueError: if a non-blank line has no space after the label.
        KeyError: if a label is missing from ``label2value``.
    """
    from itertools import islice

    textarr, labelarr = list(), list()
    with open(test_file) as testfp:
        # Only the first 20 lines are evaluated, so stop reading there
        # instead of loading the whole file.
        lines = list(islice(testfp, 20))
    for line in lines:
        line = line.strip()
        if not line:
            # Blank lines carry no sample.
            continue
        label, text = line.split(' ', 1)
        textarr.append(text)
        labelarr.append(label)
    pred_value_arr = predict(textarr, ftu.load_model(model_file))
    true_values = [label2value[name] for name in labelarr]
    print(au.score(true_values, pred_value_arr, 'auc'))
示例#3
0
 def LECM_twarr_with_label(twarr, tw_cluster_label):
     """Run LECM over a hyperparameter grid and keep the best run by NMI.

     Every (alpha, eta, beta, lambd) combination is passed to
     ``LECMClusterer.LECM_twarr`` and the predicted clusters are scored
     against ``tw_cluster_label`` with NMI.  One line is printed per
     combination.

     Args:
         twarr: the items to cluster, forwarded unchanged to the clusterer.
         tw_cluster_label: reference cluster labels used for scoring.

     Returns:
         ``(tw_topic_arr, tw_cluster_pred)`` of the combination with the
         highest NMI; the earliest combination wins ties.
     """
     from itertools import product

     # Currently best hyperparam 1, 0.1, 0.1, 1
     alpha_range, eta_range, beta_range, lambd_range = [1], [0.1], [0.1], [1]

     best_topic_arr = best_cluster_pred = best_nmi = None
     for alpha, eta, beta, lambd in product(alpha_range, eta_range,
                                            beta_range, lambd_range):
         # NOTE(review): the trailing 20 and 70 are passed through as-is;
         # their meaning is defined by LECM_twarr -- confirm there.
         tw_topic_arr_, tw_cluster_pred_ = LECMClusterer.LECM_twarr(twarr, alpha, eta, beta, lambd, 20, 70)
         nmi_ = au.score(tw_cluster_pred_, tw_cluster_label, 'nmi')
         print('alpha {:<5}, eta {:<5}, beta {:<5}, lambd {:<5}, NMI{:<8}'.
               format(alpha, eta, beta, lambd, nmi_))
         # The first run is always accepted, so a run scoring 0 still yields
         # real arrays instead of a placeholder.
         if best_nmi is None or best_nmi < nmi_:
             best_topic_arr, best_cluster_pred = tw_topic_arr_, tw_cluster_pred_
             best_nmi = nmi_
     return best_topic_arr, best_cluster_pred
示例#4
0
def test(test_file, model_file):
    """Evaluate ``predict`` on a labelled text file and print diagnostics.

    Each non-blank line of ``test_file`` must look like ``<label> <text>``.
    The function prints, in order:

    1. for each threshold 0.2, 0.3, ..., 1.0, how many scores fall above or
       at/below it;
    2. the AUC of the predictions against the mapped labels;
    3. the misclassified samples among indices 1000-1099 (fewer if the file
       is shorter).

    Args:
        test_file: path of the labelled text file.
        model_file: accepted for interface compatibility; this function does
            not use it.

    Raises:
        ValueError: if a non-blank line has no space after the label.
        KeyError: if a label is missing from ``label2value``.
    """
    textarr, labelarr = list(), list()
    with open(test_file) as testfp:
        for line in testfp:
            line = line.strip()
            if not line:
                # Blank lines carry no sample.
                continue
            label, text = line.split(' ', 1)
            textarr.append(text)
            labelarr.append(label)
    preds, scores = predict(textarr, threshold=0.2)
    assert len(preds) == len(textarr)

    for thres in [i/10 for i in range(2, 11)]:
        print(thres, Counter([1 if s > thres else 0 for s in scores]))

    true_values = [label2value[name] for name in labelarr]
    print(au.score(true_values, preds, 'auc'))
    # Inspect a fixed window of samples, clamped so short files do not
    # run past the end of the predictions.
    for idx in range(1000, min(1100, len(preds))):
        pred, lb, text = preds[idx], true_values[idx], textarr[idx]
        if pred != lb:
            print(pred, lb, text)
示例#5
0
def analyze_refine_mean_and_stderr(result_file, mean_std_file):
    """Summarise clustering scores of several runs as ``mean ± std`` lines.

    ``result_file`` is loaded with ``iu.load_array`` and must yield
    ``(kwargs, topics, clusters)`` triples.  Each triple is scored with every
    metric in ``using_scores``; the per-run table (run arguments followed by
    scores, best NMI first) is printed, and one ``name: mean ± std`` line per
    metric is written to ``mean_std_file``.

    Despite the function name, the spread written out is the sample standard
    deviation (``ddof=1``), not a standard error.
    """
    # NOTE(review): 'h**o' looks like a masked metric name -- confirm the
    # key that au.score actually expects.
    using_scores = ['nmi', 'h**o', 'cmplt', 'ari']

    rows = []
    for kwargs, topics, clusters in iu.load_array(result_file):
        # Run arguments come first; a score replaces the value of a
        # same-named argument, which keeps its position.
        row = Od(kwargs.items())
        for metric in using_scores:
            row[metric] = au.score(topics, clusters, metric)
        rows.append(row)
    rows.sort(key=lambda row: row['nmi'], reverse=True)

    df = pd.DataFrame(data=rows)
    print(df)

    score_array = df[using_scores].values
    means = np.mean(score_array, axis=0)
    stds = np.std(score_array, axis=0, ddof=1)

    lines = []
    for name, mean, std in zip(using_scores, means, stds):
        lines.append('{}: {} ± {}'.format(name, round(mean, 4), round(std, 4)))
    iu.write_lines(mean_std_file, lines)
示例#6
0
 def input_twarr_with_label(twarr, label):
     """Grid-search GSDMM hyperparameters and plot NMI per iteration.

     Every (alpha, beta, K) triple from the grids below is run through
     ``GSDMM.GSDMM_twarr`` via ``utils.multiprocess_utils.multi_process``,
     ``process_num`` triples per batch.  Each result is treated as a sequence
     of cluster predictions (one per iteration) and scored against ``label``
     with NMI.  One figure is saved per (alpha, K) pair, with one curve per
     beta, under ``getcfg().dc_test + 'GSDMM/'``.

     Args:
         twarr: the items to cluster, forwarded unchanged to the clusterer.
         label: reference cluster labels used for NMI scoring.

     Returns:
         None.  The hyperparameter table and progress are printed; figures
         are written to disk and closed.
     """
     alpha_range = beta_range = [i / 100 for i in range(1, 10, 4)
                                 ] + [i / 10 for i in range(1, 10, 4)]
     K_range = [30, 40, 50]
     iter_num = 100
     process_num = 20
     hyperparams = [(a, b, K) for a in alpha_range for b in beta_range
                    for K in K_range]

     # Cluster with every hyperparameter triple, process_num triples at a time.
     res_list = list()
     for start in range(0, len(hyperparams), process_num):
         batch = hyperparams[start:start + process_num]
         param_list = [(twarr, *param, iter_num) for param in batch]
         res_list += utils.multiprocess_utils.multi_process(
             GSDMM.GSDMM_twarr, param_list)
         # Cap the counter so the last, possibly partial, batch does not
         # report more params than exist.
         print('{:<4} /'.format(min(start + process_num, len(hyperparams))),
               len(hyperparams), 'params processed')

     # Tabulate the triples; row i describes res_list[i].
     frame = pd.DataFrame(index=np.arange(0, len(hyperparams)),
                          columns=['alpha', 'beta', 'K'])
     for i, param in enumerate(hyperparams):
         frame.loc[i] = param
     print('\n', frame, '\n')

     # One figure per (alpha, K); one NMI curve per beta.
     for (alpha, K), indices in frame.groupby(['alpha',
                                               'K']).groups.items():
         fig = plt.figure()
         fig.set_figheight(8)
         fig.set_figwidth(8)
         final_nmi = list()
         for i in indices:
             beta = frame.loc[i, 'beta']
             nmi_y = [au.score(label, pred, 'nmi') for pred in res_list[i]]
             final_nmi.append(nmi_y[-1])
             plt.plot(range(len(nmi_y)),
                      nmi_y,
                      '-',
                      lw=1.5,
                      label='beta={}'.format(round(beta, 2)))
         plt.xlabel('iteration')
         plt.ylabel('NMI')
         plt.ylim(0.0, 0.75)
         plt.title('K=' + str(K))
         plt.legend(loc='lower right')
         # The line style must be a keyword: the second positional argument
         # of plt.grid is `which`, for which '-' is not a valid value.
         plt.grid(True, linestyle='-', color='#333333', lw=0.8)
         plt.text(iter_num - 40,
                  0.70,
                  'final nmi: ' + str(round(max(final_nmi), 6)),
                  fontsize=14,
                  verticalalignment='bottom',
                  horizontalalignment='left')
         plt.savefig(getcfg().dc_test + 'GSDMM/' +
                     'alpha={},K={}.png'.format(round(alpha, 2), K))
         # Release the figure; pyplot keeps every open figure alive otherwise.
         plt.close(fig)
示例#7
0
    # Three rounds of: train, embed with the penultimate layer, cluster, score.
    # `model` is not re-created inside the loop, so each round presumably
    # continues training from the previous round's weights -- confirm where
    # `model` is built.
    for i in range(3):
        # Train for 100 epochs with 10% of the data held out for validation.
        # The returned `hist` is not used in the visible code.
        hist = model.fit(X,
                         B,
                         validation_split=0.1,
                         epochs=100,
                         batch_size=batch_size,
                         verbose=0,
                         shuffle=True)

        # Build a model that maps the network input to the output of the
        # second-to-last layer.
        inputs = model.layers[0].input
        outputs = model.layers[-2].output
        model_penultimate = Model(inputs, outputs)
        # Embed every sample with the penultimate layer.
        H = model_penultimate.predict(X)
        # L1-normalise the embeddings before clustering (presumably per
        # sample, if `normalize` is sklearn's -- confirm the import).
        V = normalize(H, norm='l1')
        print("Sample shape: {}".format(H.shape))

        # n_clusters = len(np.unique(y))
        # print("Number of classes: %d" % n_clusters)
        # Cluster the normalised embeddings into `clu_num` groups.
        km = KMeans(n_clusters=clu_num, n_jobs=4, max_iter=200)
        km.fit(V)
        pred = km.labels_
        # nmi = cluster_quality(y, pred)

        # Score the cluster assignments against the reference labels `y`.
        d = dict([(s, au.score(y, pred, s)) for s in ['nmi', 'ari']])
        print(d)
        # logger.info(entries2name(d, inter=' ', intra=':', postfix=''))
        # np.save("pred.npy", pred)
        # model.save_weights("model.plk")