Example #1
def make_train_test():
    p_file = ft_data_pattern.format("pos_2016.txt")
    n_bad_files = fi.listchildren(ft_data_pattern.format(''),
                                  fi.TYPE_FILE,
                                  concat=True,
                                  pattern='2016_bad')
    n_2017_files = fi.listchildren(ft_data_pattern.format(''),
                                   fi.TYPE_FILE,
                                   concat=True,
                                   pattern='2017')
    # n_2012_fulls = fi.listchildren(ft_data_pattern.format(''), fi.TYPE_FILE, concat=True, pattern='2012_full')[:12]
    n_2012_fulls = fi.listchildren(ft_data_pattern.format(''),
                                   fi.TYPE_FILE,
                                   concat=True,
                                   pattern='2012_full')
    n_2016_files = fi.listchildren(ft_data_pattern.format(''),
                                   fi.TYPE_FILE,
                                   concat=True,
                                   pattern='2016_queried')
    print(len(n_bad_files), len(n_2017_files), len(n_2012_fulls),
          len(n_2016_files))

    n_files = n_bad_files + n_2017_files + n_2012_fulls + n_2016_files

    p_txtarr = fu.read_lines(p_file)
    p_prefix_txtarr = prefix_textarr(label_t, p_txtarr)
    n_txtarr_blocks = [fu.read_lines(file) for file in n_files]
    n_prefix_txtarr_blocks = [
        prefix_textarr(label_f, txtarr) for txtarr in n_txtarr_blocks
    ]

    train_test = list()
    bad = len(n_bad_files)
    bad_blocks, n_blocks = n_prefix_txtarr_blocks[:bad], n_prefix_txtarr_blocks[bad:]
    train_test.append(split_train_test(p_prefix_txtarr))
    train_test.extend([split_train_test(block) for block in n_blocks])
    print("len(train_test)", len(train_test))
    train_list, test_list = zip(*train_test)
    train_list = list(train_list) + bad_blocks

    train_txtarr = au.merge_array(train_list)
    test_txtarr = au.merge_array(test_list)
    fu.write_lines(fasttext_train, train_txtarr)
    fu.write_lines(fasttext_test, test_txtarr)
    print("len(train_list)", len(train_list), "len(train_txtarr)",
          len(train_txtarr), "len(test_txtarr)", len(test_txtarr))
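The helpers prefix_textarr and split_train_test used above are not shown on this page. A minimal sketch of what they plausibly do, assuming fastText-style label prefixes and a simple shuffled split (the names, labels and ratio below are assumptions, not the repository's actual code):

import random

# Hypothetical stand-ins for the helpers used in make_train_test above.
label_t, label_f = "__label__1", "__label__0"   # fastText-style labels (assumed)

def prefix_textarr(label, txtarr):
    """Prepend a label token to every line, as fastText supervised input expects."""
    return ["{} {}".format(label, text) for text in txtarr]

def split_train_test(txtarr, train_ratio=0.8, seed=0):
    """Shuffle the lines and split them into (train, test) portions."""
    lines = list(txtarr)
    random.Random(seed).shuffle(lines)
    cut = int(len(lines) * train_ratio)
    return lines[:cut], lines[cut:]

train, test = split_train_test(prefix_textarr(label_t, ["some text", "more text", "again"]))
print(len(train), len(test))   # 2 1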
Example #2
def merge_cic_list2cluid_twarr_list(cic_list):
    """
    分析输入的 cic_list 中每个对象所指示的频次最高的地点信息,按照这一信息进行聚类合并
    :param cic_list: list,每个元素为 ClusterInfoCarrier
    :return: list,每个元素为tuple,
        见 clustering.gsdpmm.gsdpmm_stream_ifd_dynamic.GSDPMMStreamIFDDynamic#get_cluid_twarr_list
    """
    geo2group_id, cluid2group = dict(), dict()
    for cic in cic_list:
        cluid, clu_geo_table = cic.cluid, cic.geo_table
        s_geo_table = cic.s_geo_table
        if len(s_geo_table) == 0:
            cluid2group[cluid] = [cic]
        else:
            clu_top_geo = s_geo_table[0]['address']
            if clu_top_geo not in geo2group_id:
                group_id = cluid
                geo2group_id[clu_top_geo] = group_id
                cluid2group[group_id] = [cic]
            else:
                group_id = geo2group_id[clu_top_geo]
                cluid2group[group_id].append(cic)
    new_cluid_twarr_list = list()
    for group_id, group_cic_list in cluid2group.items():
        new_cluid = group_cic_list[0].cluid
        new_twarr = au.merge_array([cic.twarr for cic in group_cic_list])
        new_cluid_twarr_list.append((new_cluid, new_twarr))
    return new_cluid_twarr_list
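Nearly every example on this page funnels nested lists through au.merge_array. Its definition is not shown here, but from its usage it behaves like a one-level flatten; a minimal sketch under that assumption:

from itertools import chain

def merge_array(array_of_arrays):
    """One-level flatten: [[a, b], [c], []] -> [a, b, c] (assumed behaviour of au.merge_array)."""
    return list(chain.from_iterable(array_of_arrays))

assert merge_array([[1, 2], [3], []]) == [1, 2, 3]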
Example #3
def make_text_files():
    for idx, file in enumerate(neg_2012_full_files):
        twarr = fu.load_array(file)
        txtarr = list()
        for tw in twarr:
            text = pu.text_normalization(tw[tk.key_text])
            if pu.is_empty_string(text) or len(text) < 20:
                continue
            txtarr.append(text)
        print('len delta', len(twarr) - len(txtarr))
        path = Path(file)
        out_file_name = '_'.join([path.parent.name,
                                  path.name]).replace('json', 'txt')
        out_file = ft_data_pattern.format(out_file_name)
        print(out_file)
        fu.write_lines(out_file, txtarr)
    return
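    # NOTE: the early return above makes everything below unreachable as written.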
    p_twarr_blocks = map(fu.load_array, pos_files)
    p_txtarr_blocks = map(twarr2textarr, p_twarr_blocks)
    p_txtarr = au.merge_array(list(p_txtarr_blocks))
    p_out_file = ft_data_pattern.format('pos_2016.txt')
    fu.write_lines(p_out_file, p_txtarr)

    for f in neg_files:
        in_file = neg_event_pattern.format(f)
        out_file = ft_data_pattern.format(f.replace("json", "txt"))
        twarr = fu.load_array(in_file)
        txtarr = twarr2textarr(twarr)
        print(len(twarr), '->', len(txtarr), len(twarr) - len(txtarr))
        fu.write_lines(out_file, txtarr)
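twarr2textarr, used in the block after the early return, is not defined on this page; judging from the first loop it normalizes each tweet's text and drops empty or very short results. A self-contained sketch under that assumption, with a toy normalizer standing in for pu.text_normalization:

import re

def normalize_text(text):
    """Toy stand-in for pu.text_normalization (assumed: lowercase, strip URLs and extra whitespace)."""
    text = re.sub(r"https?://\S+", " ", text.lower())
    return re.sub(r"\s+", " ", text).strip()

def twarr2textarr(twarr, text_key="text", min_len=20):
    """Normalize each tweet's text and keep only non-trivial lines, mirroring the loop above."""
    txtarr = []
    for tw in twarr:
        text = normalize_text(tw[text_key])
        if len(text) < min_len:
            continue
        txtarr.append(text)
    return txtarr

print(twarr2textarr([{"text": "Visit https://example.com now"},
                     {"text": "A reasonably long normalized tweet text here"}]))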
Example #4
def exec_pre_test(test_data_path):
    subfiles = fi.listchildren(test_data_path, children_type='file')
    # file_list = fu.split_multi_format(
    #     [(test_data_path + file) for file in subfiles if file.endswith('.json')], process_num=6)
    # twarr_blocks = fu.multi_process(fi.summary_unzipped_tweets_multi,
    #                                 [(file_list_slice,) for file_list_slice in file_list])
    twarr_blocks = filter_twarr(
        [fu.load_array(file) for file in subfiles if file.endswith('.json')])
    twarr = au.merge_array(twarr_blocks)

    tu.start_ner_service(pool_size=16)
    tu.twarr_ner(twarr)
    tu.end_ner_service()

    all_ids = set(fu.load_array(test_data_path + 'test_ids_all.csv'))
    pos_ids = set(fu.load_array(test_data_path + 'test_ids_pos.csv'))
    non_pos_ids = all_ids.difference(pos_ids)

    pos_twarr = list()
    non_pos_twarr = list()
    for tw in twarr:
        twid = tw[tk.key_id]
        if twid in pos_ids:
            pos_twarr.append(tw)
        elif twid in non_pos_ids:
            non_pos_twarr.append(tw)

    fu.dump_array(getcfg().pos_data_file, pos_twarr)
    fu.dump_array(getcfg().non_pos_data_file, non_pos_twarr)
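The partition step above hinges on plain set membership: IDs listed in test_ids_pos.csv go to the positive split, the remaining IDs from test_ids_all.csv go to the non-positive split, and tweets outside both sets are dropped. A toy, self-contained illustration of that logic (all names below are hypothetical):

# Toy illustration of the ID-based partition above.
all_ids = {1, 2, 3, 4}
pos_ids = {2, 4}
non_pos_ids = all_ids - pos_ids                  # same as all_ids.difference(pos_ids)

tweets = [{"id": i, "text": "tw%d" % i} for i in range(1, 6)]
pos = [tw for tw in tweets if tw["id"] in pos_ids]
non_pos = [tw for tw in tweets if tw["id"] in non_pos_ids]
print(len(pos), len(non_pos))                    # 2 2; id 5 is in neither set and is dropped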
Example #5
def get_semantic_tokens(file_list):
    pos_type_info = {
        ark.prop_label: {K_IFD: IdFreqDict()},
        ark.comm_label: {K_IFD: IdFreqDict()},
        ark.verb_label: {K_IFD: IdFreqDict()},
        ark.hstg_label: {K_IFD: IdFreqDict()},
    }
    total_doc_num = 0
    for file in file_list:
        twarr = ark.twarr_ark(fu.load_array(file))
        total_doc_num += len(twarr)
        pos_tokens = au.merge_array([tw[tk.key_ark] for tw in twarr])
        for pos_token in pos_tokens:
            word = pos_token[0].strip().lower()
            if len(word) <= 2 or not pu.is_valid_keyword(word):
                continue
            real_label = ark.pos_token2semantic_label(pos_token)
            if real_label:
                pos_type_info[real_label][K_IFD].count_word(word)
    return pos_type_info, total_doc_num
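IdFreqDict and the ark labels are repository-specific. Only count_word is exercised above, so a per-label word counter is enough to mirror the structure; a minimal stand-in built on collections.Counter (the label strings below are placeholders):

from collections import Counter

class IdFreqDict:
    """Minimal stand-in for the repository's IdFreqDict; only the count_word
    interface used above is assumed."""
    def __init__(self):
        self._counter = Counter()

    def count_word(self, word):
        self._counter[word] += 1

    def most_common(self, n=None):
        return self._counter.most_common(n)

K_IFD = "ifd"
pos_type_info = {label: {K_IFD: IdFreqDict()} for label in ("prop", "comm", "verb", "hstg")}
pos_type_info["verb"][K_IFD].count_word("explode")
print(pos_type_info["verb"][K_IFD].most_common())   # [('explode', 1)]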
Example #6
def filter2cluster(remain_workload=None):
    """
    从过滤&分类模块读取其返回的推特列表,合并后输入聚类模块
    :param remain_workload: int/None,控制从过滤&分类模块读取结果的行为
            若为None,则调用 bflt.try_get_unread_batch_output 获取返回结果;
            否则以其为参数调用 bflt.wait_get_unread_batch_output 获取返回结果
    :return:
    """
    if remain_workload is None:
        batches_of_batches = bflt.try_get_unread_batch_output()
    else:
        batches_of_batches = bflt.wait_get_unread_batch_output(remain_workload)
    if not batches_of_batches:
        return
    filtered_batches = au.merge_array(batches_of_batches)
    filtered_twarr = au.merge_array(filtered_batches)
    bclu.input_twarr(filtered_twarr)
Example #7
def identify_korea():
    file = '/home/nfs/cdong/tw/seeding/NorthKorea/korea.json'
    twarr_blocks = fu.load_array(file)
    twarr = au.merge_array(twarr_blocks)
    for tw in twarr:
        text = tw[tk.key_text]
        if not re.search('korea', text, flags=re.I):
            print(text)
Example #8
 def set_batches(self, tw_batches):
     self.tw_batches.clear()
     for batch_idx in range(len(tw_batches)):
         self.tw_batches.append(
             GSDPMMStreamIFDDynamic.pre_process_twarr(
                 tw_batches[batch_idx]))
     cluid_set = set(
         [tw[tk.key_event_cluid] for tw in au.merge_array(tw_batches)])
     self.cludict = dict([(cluid, ClusterHolder(cluid))
                          for cluid in cluid_set])
Example #9
def filter2cluster():
    """
    Read the output of the filter module and pass it to the clustering module
    :return:
    """
    filtered_batches = bflt.get_batch_output()
    filtered_twarr = au.merge_array(filtered_batches)
    print(len(filtered_twarr))
    bclu.input_twarr_batch(filtered_twarr)
    print('input to cluster over')
Example #10
def query_from_files_multi(file_list, query, n_process=10):
    """ as there may be many files, we handle them through processes """
    file_blocks = mu.split_multi_format(file_list, n_process)
    res_list = mu.multi_process(query_from_files,
                                args_list=[(block, query)
                                           for block in file_blocks])
    twarr = au.merge_array(res_list)
    print('len(res_list):{}, len(twarr):{}'.format(len(res_list), len(twarr)),
          end=', ')
    return twarr
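The definitions of mu.split_multi_format and mu.multi_process are not shown; the pattern above is "split the file list into roughly equal blocks, run the query over each block in its own process, then flatten the per-process results". A hedged sketch of the same pattern using only the standard library (query_block below is a placeholder for query_from_files):

from multiprocessing import Pool

def split_blocks(items, n_blocks):
    """Split items into roughly equal chunks (stand-in for mu.split_multi_format)."""
    n_blocks = max(1, min(n_blocks, len(items)))
    size, rem = divmod(len(items), n_blocks)
    blocks, start = [], 0
    for i in range(n_blocks):
        end = start + size + (1 if i < rem else 0)
        blocks.append(items[start:end])
        start = end
    return blocks

def query_block(args):
    block, query = args
    # Placeholder per-block work; the real code would call query_from_files here.
    return [item for item in block if query in item]

if __name__ == "__main__":
    files = ["2016_a.json", "2016_b.json", "2017_a.json", "2017_b.json"]
    with Pool(processes=2) as pool:
        res_list = pool.map(query_block, [(block, "2016") for block in split_blocks(files, 2)])
    print([f for block in res_list for f in block])   # flattened, like au.merge_array(res_list)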
Example #11
    def order_twarr_through_time(self):
        print("data source : normal")
        event_blocks = fu.load_array("./data/events2016.txt")
        false_event_twarr = fu.load_array("./data/false_pos_events.txt")
        event_blocks.append(false_event_twarr)
        for block_idx, block in enumerate(event_blocks):
            for tw in block:
                tw[tk.key_event_label] = block_idx
        twarr = au.merge_array(event_blocks)
        tflt.filter_twarr_dup_id(twarr)

        def random_idx_for_item(item_arr, dest_item):
            from numpy import random

            def sample(prob):
                return random.rand() < prob

            non_dest_item_idx = [
                idx for idx in range(len(item_arr))
                if item_arr[idx] not in dest_item
            ]
            dest_item_idx = [
                idx for idx in range(len(item_arr))
                if item_arr[idx] in dest_item
            ]
            non_dest_cnt = dest_cnt = 0
            res = list()
            while len(non_dest_item_idx) > non_dest_cnt and len(
                    dest_item_idx) > dest_cnt:
                if sample((len(dest_item_idx) - dest_cnt) /
                          (len(dest_item_idx) - dest_cnt +
                           len(non_dest_item_idx) - non_dest_cnt)):
                    res.append(dest_item_idx[dest_cnt])
                    dest_cnt += 1
                else:
                    res.append(non_dest_item_idx[non_dest_cnt])
                    non_dest_cnt += 1
            while len(non_dest_item_idx) > non_dest_cnt:
                res.append(non_dest_item_idx[non_dest_cnt])
                non_dest_cnt += 1
            while len(dest_item_idx) > dest_cnt:
                res.append(dest_item_idx[dest_cnt])
                dest_cnt += 1
            return res

        idx_time_order = tu.rearrange_idx_by_time(twarr)
        twarr = [twarr[idx] for idx in idx_time_order]
        lbarr = self.lbarr_of_twarr(twarr)
        idx_random_item = random_idx_for_item(lbarr, {max(lbarr)})
        twarr = [twarr[idx] for idx in idx_random_item]
        return twarr
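The nested random_idx_for_item draws the next index from the destination-label pool with probability proportional to how many destination items remain, which amounts to scattering those indices uniformly among the others while preserving each group's internal order. A shorter sketch of the same interleaving idea, assuming that uniform scattering is the intent (the function and names are illustrative, not the repository's code):

import random

def random_interleave_idx(labels, dest_labels, seed=None):
    """Return an index order that keeps each group's internal order while placing the
    dest-label indices into uniformly random positions among all slots."""
    rng = random.Random(seed)
    dest_idx = [i for i, lb in enumerate(labels) if lb in dest_labels]
    other_idx = [i for i, lb in enumerate(labels) if lb not in dest_labels]
    slot_set = set(rng.sample(range(len(labels)), len(dest_idx)))   # positions reserved for dest items
    d, o, res = iter(dest_idx), iter(other_idx), []
    for pos in range(len(labels)):
        res.append(next(d) if pos in slot_set else next(o))
    return res

labels = [0, 0, 0, 0, 1, 1]
order = random_interleave_idx(labels, {1}, seed=42)
print([labels[i] for i in order])   # e.g. [0, 1, 0, 0, 1, 0]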
Example #12
 def load_tw_batches(self, load_cluid_arr):
     tw_batches = fu.load_array(self.labelled_batch_file)
     tu.twarr_nlp(au.merge_array(tw_batches))
     print("twarr nlp over")
     if load_cluid_arr:
         cluid_batches = fu.load_array(self.cluid_batch_file)
         assert len(tw_batches) == len(cluid_batches)
         for b_idx in range(len(tw_batches)):
             tw_batch, cluid_batch = tw_batches[b_idx], cluid_batches[b_idx]
             assert len(tw_batch) == len(cluid_batch)
             for idx in range(len(tw_batch)):
                 tw, cluid = tw_batch[idx], cluid_batch[idx]
                 tw[tk.key_event_cluid] = cluid
     return tw_batches
Example #13
 def group_similar_tweets(twarr, process_num=0):
     """
     按照文本相似程度,调整推特列表中各推特的顺序,使得相近的文本被安排到相近的位置上;
     对一组文本进行两两比较操作复杂度O(n^2),文本超过1000条就已十分耗时,分进程并行操作
     :param twarr: list,推特列表
     :param process_num: 使用的子进程的数量
     :return: list,排序后的推特列表
     """
     txtarr = [tw[tk.key_text] for tw in twarr]
     idx_g, txt_g = au.group_similar_items(txtarr,
                                           score_thres=0.3,
                                           process_num=process_num)
     tw_groups = [[twarr[idx] for idx in g] for g in idx_g]
     return au.merge_array(tw_groups)
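au.group_similar_items is not shown. From its use (a score threshold, returning index groups plus text groups) it plausibly performs greedy grouping by pairwise text similarity; a minimal single-process sketch under that assumption, using Jaccard similarity over token sets:

def jaccard(a, b):
    ta, tb = set(a.lower().split()), set(b.lower().split())
    return len(ta & tb) / len(ta | tb) if (ta | tb) else 0.0

def group_similar_items(txtarr, score_thres=0.3):
    """Greedy single-pass grouping: each text joins the first group whose representative
    is similar enough, otherwise it starts a new group (assumed behaviour; the real
    au.group_similar_items may differ, e.g. it also takes process_num)."""
    idx_groups, reps = [], []
    for idx, text in enumerate(txtarr):
        for group, rep in zip(idx_groups, reps):
            if jaccard(text, rep) >= score_thres:
                group.append(idx)
                break
        else:
            idx_groups.append([idx])
            reps.append(text)
    txt_groups = [[txtarr[i] for i in g] for g in idx_groups]
    return idx_groups, txt_groups

idx_g, txt_g = group_similar_items(["flood in paris", "paris flood warning", "earthquake hits chile"])
print(idx_g)   # [[0, 1], [2]]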
Example #14
def twarr_dist_pairs_multi(twarr):
    # dist_pairs (Example #16) compares twarr[i]['temp'], so the lowercased text
    # is cached under the 'temp' key for the duration of the computation.
    for tw in twarr:
        tw['temp'] = tw['text'].lower()
    total = len(twarr) - 1
    process_num = 16
    point_lists = [[
        i + process_num * j for j in range(int(total / process_num) + 1)
        if (i + process_num * j) < total
    ] for i in range(process_num)]
    pairs_blocks = multi_process(dist_pairs,
                                 [(twarr, point) for point in point_lists])
    for tw in twarr:
        del tw['temp']
    return merge_array(pairs_blocks)
Example #15
def summary_files_in_path(from_path, into_path=None):
    """ Read all .json under file_path, extract tweets from them into a file under summary_path. """
    # [-13:]--hour [-13:-3]--day [-13:-5]--month,ymdh refers to the short of "year-month-date-hour"
    from_path = fi.add_sep_if_needed(from_path)
    file_ymdh_arr = pu.split_digit_arr(fi.get_parent_path(from_path)[-13:])
    if not is_target_ymdh(file_ymdh_arr):
        return
    
    into_file = '{}{}'.format(fi.add_sep_if_needed(into_path), '_'.join(file_ymdh_arr) + '.sum')
    fi.remove_file(into_file)
    subfiles = fi.listchildren(from_path, children_type=fi.TYPE_FILE)
    file_block = mu.split_multi_format([(from_path + subfile) for subfile in subfiles], process_num=20)
    twarr_blocks = mu.multi_process(sum_files, [(file_list, tflt.FILTER_LEVEL_LOW) for file_list in file_block])
    twarr = au.merge_array(twarr_blocks)
    if twarr:
        fu.dump_array(into_file, twarr, overwrite=True)
Example #16
def dist_pairs(twarr, points):
    return merge_array([[(i, j,
                          text_dist_less_than(twarr[i]['temp'],
                                              twarr[j]['temp']))
                         for j in range(i + 1, len(twarr))] for i in points])
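text_dist_less_than is not defined here. Together with Example #14, dist_pairs splits the O(n^2) upper-triangle comparison by handing each worker a subset of row indices i and computing only the pairs (i, j) with j > i. A small sketch of that index scheme, with a hypothetical Jaccard-based predicate standing in for text_dist_less_than:

def toy_dist_less_than(a, b, thres=0.6):
    """Hypothetical stand-in for text_dist_less_than: Jaccard distance under a threshold."""
    ta, tb = set(a.split()), set(b.split())
    return (1 - len(ta & tb) / len(ta | tb)) < thres

def pairs_for_rows(texts, rows):
    """Upper-triangle pairs (i, j) with j > i, restricted to the given row indices --
    the same work split that point_lists/dist_pairs implement above."""
    return [(i, j, toy_dist_less_than(texts[i], texts[j]))
            for i in rows for j in range(i + 1, len(texts))]

texts = ["a b c", "a b d", "x y z"]
worker_rows = [[0], [1]]          # row indices dealt out to two hypothetical workers
all_pairs = [p for rows in worker_rows for p in pairs_for_rows(texts, rows)]
print(all_pairs)                  # [(0, 1, True), (0, 2, False), (1, 2, False)]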
Example #17
    _cic_list = bext.get_batch_output()
    print('get cic outputs, type:{}'.format(type(_cic_list)))
    for cic in _cic_list:
        twnum = len(cic.twarr)
        _geo_list = [
            geo['address'] for geo in cic.od['geo_infer']
            if geo['quality'] == 'locality'
        ]
        print('cluid:{}, twarr len:{}'.format(cic.cluid, twnum))
        print(cic.od['summary']['keywords'])
        print(_geo_list)
        print('\n')

        if len(_geo_list) == 0:
            _top_geo = 'NOGPE'
        else:
            _top_geo = '`'.join(_geo_list)
        _out_file = '/home/nfs/cdong/tw/src/calling/tmp/id{}_tw{}_{}.txt'.format(
            cic.cluid, twnum, _top_geo)
        _txtarr = [tw[tk.key_text] for tw in cic.twarr]
        _idx_g, _txt_g = au.group_similar_items(_txtarr,
                                                score_thres=0.3,
                                                process_num=20)
        _txt_g = [
            sorted(g, key=lambda t: len(t), reverse=True) for g in _txt_g
        ]
        _txtarr = au.merge_array(_txt_g)
        fu.write_lines(_out_file, _txtarr)

    tmu.check_time()
Example #18
 def get_current_twharr(self):
     return au.merge_array(self.twh_batches)
Example #19
     #         print(twarr[idx][tk.key_text])
     # post_twarr = [tw for idx, tw in enumerate(twarr) if probarr[idx] >= 0.4]
     # post_total_len += len(post_twarr)
     # print(len(post_twarr) / len(twarr), '\n\n\n')
 tmu.check_time()
 lblarr = [1 for _ in range(len(pos_probarr))] + [0 for _ in range(len(neg_probarr))]
 prbarr = pos_probarr + neg_probarr
 fu.dump_array("prb_lbl_arr.txt", (lblarr, prbarr))
 lblarr, prbarr = fu.load_array("prb_lbl_arr.txt")
 au.precision_recall_threshold(lblarr, prbarr)
 # print('total portion = {} / {} = {}'.format(post_total_len, pre_total_len, post_total_len / pre_total_len))
 tmu.check_time()
 exit()
 
 sub_files = fi.listchildren('/home/nfs/cdong/tw/origin/', fi.TYPE_FILE, concat=True)[18:19]
 twarr = au.merge_array([fu.load_array(file) for file in sub_files])
 print(len(twarr))
 tmu.check_time(print_func=None)
 for idx, tw in enumerate(twarr[14000:15000]):
     if (idx + 1) % 1000 == 0:
         print(idx)
     try:
         my_filter.get_features(tw)
     except:
         # print(tw[tk.key_text])
         # print(tw[tk.key_orgntext])
         print('-', pu.text_normalization(tw[tk.key_orgntext]))
 tmu.check_time(print_func=lambda dt: print('pos filter time elapsed {}s'.format(dt)))
 
 exit()
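au.precision_recall_threshold is not shown; given labels and scores it presumably reports precision and recall as the decision threshold varies. A hedged sketch of that kind of sweep using scikit-learn's precision_recall_curve (an assumption about intent, not the repository's implementation):

from sklearn.metrics import precision_recall_curve

lblarr = [1, 1, 0, 0, 1, 0]                 # toy labels
prbarr = [0.9, 0.8, 0.7, 0.4, 0.35, 0.1]    # toy scores
precision, recall, thresholds = precision_recall_curve(lblarr, prbarr)
for p, r, t in zip(precision, recall, thresholds):
    print("thres={:.2f}  precision={:.2f}  recall={:.2f}".format(t, p, r))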
 
Example #20
 def make_tw_batches(self, batch_size):
     ordered_twarr = self.order_twarr_through_time()
     tw_batches = split_array_into_batches(ordered_twarr, batch_size)
     self.twarr_info(au.merge_array(tw_batches))
     fu.dump_array(self.labelled_batch_file, tw_batches)
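split_array_into_batches is not defined on this page; from its use it likely chunks a list into consecutive fixed-size batches, which au.merge_array later undoes. A minimal sketch under that assumption:

def split_array_into_batches(arr, batch_size):
    """Chunk arr into consecutive batches of at most batch_size items (assumed behaviour)."""
    return [arr[i:i + batch_size] for i in range(0, len(arr), batch_size)]

batches = split_array_into_batches(list(range(7)), 3)
print(batches)                                            # [[0, 1, 2], [3, 4, 5], [6]]
print([x for b in batches for x in b] == list(range(7)))  # flattening the batches restores the order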