Example #1
 def filter_into_temp(self):
     twarr = iu.load_array(self.orgn_file)
     print(len(twarr), type(twarr[0]))
     docarr = du.make_docarr(
         [[tw[k] for k in ('tweetId', 'clusterNo', 'textCleaned')]
          for tw in twarr])
     du.dump_docarr(self.temp_file, docarr)
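
Across these examples, du.make_docarr appears to accept (docId, topic, text) triples and du.dump_docarr to serialize the result to a file. A minimal sketch of that assumed contract; Doc, make_docarr, and dump_docarr below are stand-ins, not the project's actual du module:

 from collections import namedtuple

 # Hypothetical stand-in for the assumed (docId, topic, text) document record.
 Doc = namedtuple('Doc', ['docId', 'topic', 'text'])

 def make_docarr(triples):
     # Each item is a (docId, topic, text) triple; field order matters.
     return [Doc(*t) for t in triples]

 def dump_docarr(path, docarr):
     # One tab-separated line per document (the real serialization format is assumed).
     with open(path, 'w') as fout:
         for d in docarr:
             fout.write('{}\t{}\t{}\n'.format(d.docId, d.topic, d.text))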
Example #2
 def filter_into_temp(self):
     from collections import Counter
     from bs4 import BeautifulSoup
     files = iu.list_children(self.orgn_file, full_path=True)
     array = list()
     for fidx, file in enumerate(files):
         print(fidx)
         tree = BeautifulSoup(''.join(iu.read_lines(file)), "html.parser")
         for article in tree.find_all("reuters"):
             topics = list(article.topics.children)
             if len(topics) != 1:
                 continue
             # strip non-ASCII, then decode back to str; str(bytes) would yield "b'...'" literals
             topic = topics[0].text.encode('ascii', 'ignore').decode('ascii')
             text = article.find('text')
             if text is None or text.body is None:
                 continue
             # decode after encode; wrapping the bytes in str() would embed the b'' prefix
             title = (text.title.text.encode('utf-8', 'ignore').decode('utf-8')
                      if text.title is not None else '')
             title = ' '.join(pu.tokenize(title, pu.tokenize_pattern))
             body = text.body.text.encode('utf-8', 'ignore').decode('utf-8')
             body = ' '.join(pu.tokenize(body, pu.tokenize_pattern))
             array.append((topic, '{}, {}'.format(title, body)))
     docarr = du.make_docarr([(idx, topic, body) for idx, (topic, body) in enumerate(array)])
     print(len(docarr))
     print(Counter([d.topic for d in docarr]))
     print(len(sorted(set([d.topic for d in docarr]))))
     du.dump_docarr(self.temp_file, docarr)
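
The decode() calls above replace the original str(...encode(...)) pattern: on Python 3, str() of a bytes object produces a literal like "b'title'" rather than the text itself. A quick illustration of the pitfall:

 s = 'title'
 # str() of bytes embeds the b'' wrapper into the resulting string
 assert str(s.encode('ascii', 'ignore')) == "b'title'"
 # an encode/decode round-trip returns a clean str
 assert s.encode('ascii', 'ignore').decode('ascii') == 'title'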
Example #3
    def filter_from_temp(self):
        c = self.__class__
        topic_flt_func, wf_flt_func, doc_flt_func = c.topic_flt_func, c.wf_flt_func, c.doc_flt_func
        docarr = du.load_docarr(self.temp_file)
        docarr = du.filter_duplicate_docid(docarr)
        docarr = du.tokenize_docarr(docarr, stemming=self.stem)
        print('data preparation (duplicate-id filtering, tokenization) done')
        acc_topics, rej_topics_1, docarr = du.filter_docarr_by_topic(
            docarr, topic_flt_func)
        docarr, ifd = du.docarr_bootstrap_ifd(docarr, wf_flt_func)
        # docarr = du.filter_docarr(docarr, doc_flt_func)
        # docarr, ifd = du.docarr_bootstrap_ifd(docarr, wf_flt_func)
        # acc_topics, rej_topics_2, docarr = du.filter_docarr_by_topic(docarr, topic_flt_func)
        # docarr, ifd = du.docarr_bootstrap_ifd(docarr, wf_flt_func)
        # docarr = du.filter_docarr(docarr, doc_flt_func)

        # rej_topics = rej_topics_1 + rej_topics_2
        rej_topics = rej_topics_1
        print('got {} docs\n'.format(len(docarr)))
        print('{} sufficient topics: {}\n'.format(len(acc_topics),
                                                  acc_topics.most_common()))
        print('{} insufficient topics: {}'.format(len(rej_topics),
                                                  rej_topics.most_common()[:20]))
        docarr = sorted(docarr, key=lambda d: d.topic)
        ifd.dump_dict(self.dict_file)
        du.dump_docarr(self.docarr_file, docarr)
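
The class attributes topic_flt_func, wf_flt_func, and doc_flt_func are defined elsewhere in the class hierarchy; judging by the "sufficient/insufficient topic" output, they are threshold predicates. A hypothetical sketch of what they might look like (the thresholds, signatures, and the doc.tokens attribute are all assumptions, not the project's actual definitions):

    class _FilterFuncsSketch:
        # Hypothetical thresholds; the real values live in the project's classes.
        @staticmethod
        def topic_flt_func(doc_count):
            # accept a topic only if it has enough documents
            return doc_count >= 50

        @staticmethod
        def wf_flt_func(word_freq):
            # keep a word in the vocabulary only if it occurs often enough
            return word_freq >= 5

        @staticmethod
        def doc_flt_func(doc):
            # keep a document only if enough tokens survived the vocabulary cut
            return len(doc.tokens) >= 10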
Example #4
 def filter_into_temp(self):
     file_list = iu.list_children(self.orgn_file, full_path=True)
     twarr_list = [iu.load_array(file) for file in file_list]
     doclist = list()
     for topic_id, twarr in enumerate(twarr_list):
         for tw in twarr:
             doclist.append((str(tw['id']), topic_id, tw['text'].replace('#', '')))
     docarr = du.make_docarr(doclist)
     du.dump_docarr(self.temp_file, docarr)
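
Here each file under orgn_file holds the tweets of one topic, and the topic label is simply the file's position in the listing. iu.list_children is presumably a thin wrapper over os.listdir; a stand-in sketch under that assumption:

 import os

 # Hypothetical stand-in for iu.list_children (assumed semantics).
 def list_children(path, full_path=False):
     names = sorted(os.listdir(path))
     return [os.path.join(path, n) for n in names] if full_path else names

Sorting the names keeps the enumerate-derived topic ids stable across runs.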
Example #5
 def filter_into_temp(self):
     json_list = iu.load_array(self.orgn_file)
     item_list = list()
     for i, o in enumerate(json_list):
         # cap each document at its first 1200 tokens
         text = ' '.join(pu.tokenize(o['text'], pu.tokenize_pattern)[:1200])
         # text = ' '.join(pu.tokenize(o['text'], pu.tokenize_pattern)[:3000])
         # text = o['text']
         item_list.append((i, o['cluster'], text))
     docarr = du.make_docarr(item_list)
     du.dump_docarr(self.temp_file, docarr)
Example #6
    def filter_into_temp(self):
        from os.path import join
        did_to_cat = dict()
        cat_list = ['CCAT', 'GCAT', 'MCAT', 'ECAT']
        data_dir = _data_base.format(self.name, self.special[0])
        with open(join(data_dir, 'rcv1-v2.topics.qrels')) as fin:
            for line in fin.readlines():
                line = line.strip().split(' ')
                cat = line[0]
                did = int(line[1])
                if cat in cat_list:
                    did_to_cat[did] = did_to_cat.get(did, []) + [cat]
            # did_to_cat = {k: did_to_cat[k] for k in list(did_to_cat.keys()) if len(did_to_cat[k]) > 1}
            # keep only documents that belong to exactly one category
            for did in list(did_to_cat.keys()):
                if len(did_to_cat[did]) > 1:
                    del did_to_cat[did]

        dat_list = [
            'lyrl2004_tokens_test_pt0.dat', 'lyrl2004_tokens_test_pt1.dat',
            'lyrl2004_tokens_test_pt2.dat', 'lyrl2004_tokens_test_pt3.dat',
            'lyrl2004_tokens_train.dat'
        ]
        data = list()
        target = list()
        cat_to_cid = {'CCAT': 0, 'GCAT': 1, 'MCAT': 2, 'ECAT': 3}
        docarr = list()

        # remove 'did' left over from the qrels loop so the locals() guard below starts clean
        del did
        for dat in dat_list:
            with open(join(data_dir, dat)) as fin:
                for line in fin.readlines():
                    if line.startswith('.I'):  # a new document begins
                        if 'did' in locals():  # flush the previous document first
                            assert doc != ''
                            if did in did_to_cat:
                                data.append(doc)
                                target.append(cat_to_cid[did_to_cat[did][0]])
                                docarr.append(
                                    (did, cat_to_cid[did_to_cat[did][0]], doc))
                        did = int(line.strip().split(' ')[1])
                        doc = ''
                    elif line.startswith('.W'):  # the text starts on the next line
                        assert doc == ''
                    else:
                        doc += line

        print(len(data), 'and', len(did_to_cat))
        print(data[0])
        assert len(data) == len(did_to_cat)
        print(len(docarr))

        du.dump_docarr(self.temp_file, du.make_docarr(docarr[:20000]))
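
The .dat files follow the LYRL2004 token-file layout: a '.I <docid>' line opens a document, '.W' marks the start of its text, and the body runs until the next '.I'. Note that the loop above never flushes the document that ends the final file. A standalone sketch of the same state machine with an explicit final flush (simplified, without the category bookkeeping):

    def parse_lyrl2004_tokens(path):
        docs, did, doc = {}, None, ''
        with open(path) as fin:
            for line in fin:
                if line.startswith('.I'):
                    if did is not None:  # flush the previous document
                        docs[did] = doc
                    did = int(line.split()[1])
                    doc = ''
                elif line.startswith('.W'):
                    continue  # the text begins on the following line
                else:
                    doc += line
        if did is not None:  # flush the final document
            docs[did] = doc
        return docs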
Example #7
 def filter_into_temp(self):
     from collections import Counter
     twarr = iu.load_array(self.orgn_file)
     outrows = list()
     for idx, tw in enumerate(twarr):
         if tw['relevance'] > 1:
             continue
         docid, topic, text = tw['tweetId'], tw['clusterNo'], tw['text']
         # skip tweets whose tokenized text is 10 characters or fewer
         if len(' '.join(pu.tokenize(text, pu.tokenize_pattern))) <= 10:
             continue
         outrows.append([docid, topic, text])
     topics = Counter([r[1] for r in outrows])
     print('got {} rows'.format(len(outrows)))
     print('{} topics, {}'.format(len(topics), topics))
     du.dump_docarr(self.temp_file, du.make_docarr(outrows))