Example #1
def get_semantic_tokens_multi(file_path):
    pos_type_info = {
        ark.prop_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_prop_dict_file},
        ark.comm_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_comm_dict_file},
        ark.verb_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_verb_dict_file},
        ark.hstg_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_hstg_dict_file},
    }
    total_doc_num = 0
    file_path = fi.add_sep_if_needed(file_path)
    # subfiles = au.random_array_items(fi.listchildren(file_path, children_type=fi.TYPE_FILE), 40)
    subfiles = fi.listchildren(file_path, children_type=fi.TYPE_FILE)
    file_list_block = mu.split_multi_format([(file_path + subfile) for subfile in subfiles], process_num=20)
    res_list = mu.multi_process(get_semantic_tokens, [(file_list,) for file_list in file_list_block])
    for res_type_info, doc_num in res_list:
        total_doc_num += doc_num
        for label in res_type_info.keys():
            pos_type_info[label][K_IFD].merge_freq_from(res_type_info[label][K_IFD])
    print('total_doc_num', total_doc_num)
    for label in pos_type_info.keys():
        ifd, file_name = pos_type_info[label][K_IFD], pos_type_info[label][K_FILE]
        ifd.drop_words_by_condition(3)
        if label != ark.hstg_label:
            # hashtag tokens belong only in the hashtag vocabulary; drop them from the others
            ifd.drop_words_by_condition(lambda word, _: word.startswith('#'))
        ifd.dump_dict(file_name)
        print('{}; vocab size:{}'.format(file_name, ifd.vocabulary_size()))
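All of these examples lean on the helpers in mu (utils.multiprocess_utils), which are not shown on this page. Below is a minimal sketch of the assumed behavior, built on the standard multiprocessing module: split_multi_format partitions a list of work items into contiguous blocks, and multi_process runs a worker function once per argument tuple in its own process and returns the results in order. The names and signatures mirror the calls above, but this is an assumption, not the actual implementation.

import multiprocessing as mp

# Hypothetical stand-ins for mu.split_multi_format / mu.multi_process; the real
# utils.multiprocess_utils module may differ in details.

def split_multi_format(items, process_num):
    # partition `items` into at most `process_num` contiguous, roughly equal blocks
    block_size = (len(items) + process_num - 1) // process_num
    return [items[i:i + block_size] for i in range(0, len(items), block_size)]

def multi_process(func, args_list, kwargs_list=None):
    # run func(*args, **kwargs) once per entry of args_list in worker processes
    # and return the results in submission order
    kwargs_list = kwargs_list if kwargs_list is not None else [{}] * len(args_list)
    with mp.Pool(processes=max(1, min(len(args_list), mp.cpu_count()))) as pool:
        async_results = [pool.apply_async(func, args, kwargs)
                         for args, kwargs in zip(args_list, kwargs_list)]
        return [res.get() for res in async_results]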
Example #2
def tsne_data_multi(using):
    assert using in {'avg', 'tfidf'}
    kw_arg = dict(early_exaggeration=12,
                  n_iter=800,
                  n_iter_without_progress=100)
    args_list = [(data_class, using, kw_arg) for data_class in object_list]
    tsne_list = mu.multi_process(tsne_data, args_list)
    print(len(tsne_list), len(tsne_list[0]))
    np.save('tsne_{}.npy'.format(using), np.array(tsne_list, dtype=object))
Example #3
def _load_word2vec_from_glovec_file(glovec_file, process_num):
    lines_parts = mu.split_multi(iu.read_lines(glovec_file), process_num)
    print('lines read & partition over')
    word2vec_parts = mu.multi_process(_word2vec_from_lines,
                                      [(lines, ) for lines in lines_parts])
    word2vec = dict()
    for word2vec_part in word2vec_parts:
        word2vec.update(word2vec_part)
    return word2vec
Example #4
def query_from_files_multi(file_list, query, n_process=10):
    """ as there may be many files, we handle them through processes """
    file_blocks = mu.split_multi_format(file_list, n_process)
    res_list = mu.multi_process(query_from_files,
                                args_list=[(block, query)
                                           for block in file_blocks])
    twarr = au.merge_array(res_list)
    print('len(res_list):{}, len(twarr):{}'.format(len(res_list), len(twarr)),
          end=', ')
    return twarr
Example #5
def clustering_multi(func, params, process_num=20):
    param_num = len(params)
    res_list = list()
    for i in range(int(math.ceil(param_num / process_num))):
        res_list += mu.multi_process(
            func, params[i * process_num:(i + 1) * process_num])
        print('{:<4} / {} params processed'.format(
            min((i + 1) * process_num, param_num), param_num))
    if len(res_list) != len(params):
        raise ValueError('Error occurred in clustering')
    return res_list
Example #6
def pairwise_score_multi(array, process_num, pair_func):
    total = len(array)
    pairs = [(i, j) for i in range(total - 1) for j in range(i + 1, total)]
    if process_num <= 1 or total < 40:
        idx_pair_score_list = pairwise_score(array, pairs, pair_func)
    else:
        pair_blocks = mu.split_multi_format(pairs, process_num)
        arg_list = [(array, idx_pairs, pair_func) for idx_pairs in pair_blocks]
        score_pairs_blocks = mu.multi_process(pairwise_score, arg_list)
        idx_pair_score_list = merge_array(score_pairs_blocks)
    return idx_pair_score_list
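The pairwise_score worker is not shown; judging from the call above, it scores every (i, j) index pair in its block with pair_func and keeps the indices so that results from different processes can be merged. A hypothetical sketch:

def pairwise_score(array, idx_pairs, pair_func):
    # assumed worker: score each index pair, keeping (i, j) so merged blocks stay attributable
    return [(i, j, pair_func(array[i], array[j])) for i, j in idx_pairs]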
Example #7
def get_tokens_multi(file_path):
    file_path = fi.add_sep_if_needed(file_path)
    # subfiles = au.random_array_items(fi.listchildren(file_path, children_type=fi.TYPE_FILE), 20)
    subfiles = fi.listchildren(file_path, children_type=fi.TYPE_FILE)
    file_list_block = mu.split_multi_format([(file_path + subfile) for subfile in subfiles], process_num=20)
    res_list = mu.multi_process(get_tokens, [(file_list,) for file_list in file_list_block])
    id_freq_dict, total_doc_num = IdFreqDict(), 0
    for ifd, doc_num in res_list:
        total_doc_num += doc_num
        id_freq_dict.merge_freq_from(ifd)
    print('total_doc_num', total_doc_num, 'total vocabulary_size', id_freq_dict.vocabulary_size())
    id_freq_dict.drop_words_by_condition(3)
    id_freq_dict.dump_dict(getcfg().post_dict_file)
Example #8
def make_neg_event_bad_text_2016():
    files = fi.listchildren("/home/nfs/cdong/tw/origin/",
                            fi.TYPE_FILE,
                            concat=True)
    files_blocks = mu.split_multi_format(files, 4)
    output_file = neg_event_pattern.format("neg_2016_bad_text_{}.json")
    args_list = [(block, output_file.format(idx))
                 for idx, block in enumerate(files_blocks)]
    res_list = mu.multi_process(extract_bad_tweets_into, args_list)
    n_num_list, tw_num_list = zip(*res_list)
    total_n, total_tw = sum(n_num_list), sum(tw_num_list)
    print(n_num_list, tw_num_list, total_n, total_tw,
          round(total_n / total_tw, 6))
Example #9
def twarr_dist_pairs_multi(twarr):
    for tw in twarr:
        tw['nouse'] = tw['text'].lower()
    total = len(twarr) - 1
    process_num = 16
    # distribute point indices 0..total-1 round-robin across the processes
    point_lists = [[i + process_num * j
                    for j in range(int(total / process_num) + 1)
                    if (i + process_num * j) < total]
                   for i in range(process_num)]
    pairs_blocks = multi_process(dist_pairs,
                                 [(twarr, point) for point in point_lists])
    for tw in twarr:
        del tw['nouse']
    return merge_array(pairs_blocks)
Example #10
def tsne_multi():
    class_list = [DataTREC, DataGoogle, DataEvent, DataReuters, Data20ng]
    doc_avg_list = mu.multi_process(train_embedding, [[c] for c in class_list])
    for m in doc_avg_list:
        print(m.shape)
    kw_arg = dict(early_exaggeration=12,
                  n_iter=800,
                  n_iter_without_progress=100)
    tsne_point_list = ru.fit_multi(ru.fit_tsne, doc_avg_list,
                                   [kw_arg] * len(doc_avg_list))
    for m in tsne_point_list:
        print(m.shape)
    name_point_list = list(zip([c.name for c in class_list], tsne_point_list))
    return name_point_list
Example #11
def summary_files_in_path(from_path, into_path=None):
    """ Read all .json under file_path, extract tweets from them into a file under summary_path. """
    # [-13:]--hour [-13:-3]--day [-13:-5]--month,ymdh refers to the short of "year-month-date-hour"
    from_path = fi.add_sep_if_needed(from_path)
    file_ymdh_arr = pu.split_digit_arr(fi.get_parent_path(from_path)[-13:])
    if not is_target_ymdh(file_ymdh_arr):
        return
    
    into_file = '{}{}'.format(fi.add_sep_if_needed(into_path), '_'.join(file_ymdh_arr) + '.sum')
    fi.remove_file(into_file)
    subfiles = fi.listchildren(from_path, children_type=fi.TYPE_FILE)
    file_block = mu.split_multi_format([(from_path + subfile) for subfile in subfiles], process_num=20)
    twarr_blocks = mu.multi_process(sum_files, [(file_list, tflt.FILTER_LEVEL_LOW) for file_list in file_block])
    twarr = au.merge_array(twarr_blocks)
    if twarr:
        fu.dump_array(into_file, twarr, overwrite=True)
Example #12
def fit_multi(func, xs, kwargs_list=None):
    import utils.multiprocess_utils as mu
    res_list = mu.multi_process(func, [(x,) for x in xs], kwargs_list)
    return res_list
Example #13
def matrix2str_list_multi(matrix, delimeter, ndigits, process_num):
    matrix_parts = mu.split_multi(matrix, process_num)
    arg_list = [(matrix_part, delimeter, ndigits)
                for matrix_part in matrix_parts]
    return au.merge(mu.multi_process(matrix2str_list, arg_list))
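The matrix2str_list worker is likewise not shown; presumably it renders each row of its block as one delimiter-joined string with values rounded to ndigits. A hypothetical sketch, keeping the parameter names used above:

def matrix2str_list(matrix_part, delimeter, ndigits):
    # assumed worker: one output string per row, values rounded and joined by the delimiter
    return [delimeter.join(str(round(value, ndigits)) for value in row)
            for row in matrix_part]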