Example #1
DataGoogle reads an array of tweets and keeps the tweetId, clusterNo and textCleaned fields as (docid, topic, text) rows.
class DataGoogle(Data):
    name = 'Google'
    orgn = ['News.txt']
    seq_len = 10
    topic_num = 152
    w_verify_func = du.word_verify(None, None, 0.0, None)
    wf_flt_func = lambda word, freq: freq >= 0                # keep every word
    doc_flt_func = lambda d: len(d.tokens) >= 3 and d.topic is not None
    topic_flt_func = lambda rank, freq: True                  # keep every topic

    def filter_into_temp(self):
        twarr = iu.load_array(self.orgn_file)
        print(len(twarr), type(twarr[0]))
        docarr = du.make_docarr(
            [[tw[k] for k in ('tweetId', 'clusterNo', 'textCleaned')] for tw in twarr])
        du.dump_docarr(self.temp_file, docarr)
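All five examples subclass a Data base class that is not shown here and lean on external helpers (du, iu, pu). Below is a minimal sketch of the contract the examples appear to rely on; the directory layout and every name in it are assumptions, not the repo's actual code:

import os

class Data:
    name = 'Base'
    orgn = []      # raw input file or directory name(s)
    seq_len = 0    # presumably the padded token-sequence length
    topic_num = 0  # number of ground-truth topics

    def __init__(self, base_dir='./data'):
        # hypothetical layout: <base_dir>/<name>/<orgn[0]> holds the raw corpus,
        # <base_dir>/<name>/temp.txt receives the filtered (docid, topic, text) rows
        self.orgn_file = os.path.join(base_dir, self.name, self.orgn[0])
        self.temp_file = os.path.join(base_dir, self.name, 'temp.txt')

    def filter_into_temp(self):
        # each subclass normalizes its raw corpus into (docid, topic, text) rows
        raise NotImplementedError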
Example #2
DataEvent walks the files under 'Terrorist', one file per event, and uses each file's index as the topic label.
class DataEvent(Data):
    name = 'Event'
    orgn = ['Terrorist']
    seq_len = 14
    topic_num = 69
    # word_verify args (presumably): min length, max length, alpha ratio, stop words
    w_verify_func = du.word_verify(2, 16, 0.8, pu.nltk_stop_words)
    wf_flt_func = lambda word, freq: freq >= 3
    doc_flt_func = lambda d: len(d.tokens) >= 3 and d.topic is not None
    topic_flt_func = lambda rank, freq: True

    def filter_into_temp(self):
        # one file per event; the file's position in the listing is its topic id
        file_list = iu.list_children(self.orgn_file, full_path=True)
        twarr_list = [iu.load_array(file) for file in file_list]
        doclist = list()
        for topic_id, twarr in enumerate(twarr_list):
            for tw in twarr:
                # drop '#' so hashtags survive tokenization as plain words
                doclist.append((str(tw['id']), topic_id, tw['text'].replace('#', '')))
        docarr = du.make_docarr(doclist)
        du.dump_docarr(self.temp_file, docarr)
Example #3
DataTREC filters tweets by their relevance judgement and a minimum text length before dumping the rows.
class DataTREC(Data):
    name = 'TREC'
    orgn = ['Tweets.txt']
    topic_num = 128
    w_verify_func = du.word_verify(3, 14, 0.8, pu.my_stop_words)
    wf_flt_func = lambda word, freq: freq >= 3
    doc_flt_func = lambda d: len(d.tokens) >= 5 and d.topic is not None
    topic_flt_func = lambda rank, freq: freq >= 10

    def filter_into_temp(self):
        twarr = iu.load_array(self.orgn_file)
        outrows = list()
        for tw in twarr:
            # skip tweets whose relevance label is greater than 1
            if tw['relevance'] > 1:
                continue
            docid, topic, text = tw['tweetId'], tw['clusterNo'], tw['text']
            # skip texts that are 10 characters or fewer once tokenized and re-joined
            if len(' '.join(pu.tokenize(text, pu.tokenize_pattern))) <= 10:
                continue
            outrows.append([docid, topic, text])
        topics = Counter([r[1] for r in outrows])
        print('got {} rows'.format(len(outrows)))
        print('{} topics, {}'.format(len(topics), topics))
        du.dump_docarr(self.temp_file, du.make_docarr(outrows))
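Every example funnels its rows through du.make_docarr and du.dump_docarr. Those helpers are external; here is a plausible stand-in, assuming a doc is just a (docid, topic, text) record and the temp file stores one tab-separated row per doc:

from collections import namedtuple

Doc = namedtuple('Doc', ['docid', 'topic', 'text'])

def make_docarr(rows):
    # rows: iterable of (docid, topic, text) triples
    return [Doc(str(docid), topic, text) for docid, topic, text in rows]

def dump_docarr(path, docarr):
    with open(path, 'w', encoding='utf8') as fout:
        for d in docarr:
            fout.write('{}\t{}\t{}\n'.format(d.docid, d.topic, d.text))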
Example #4
DataReuters parses the Reuters SGML files with BeautifulSoup and keeps single-topic articles as 'title, body' documents.
class DataReuters(Data):
    name = 'Reuters'
    orgn = ['segments']
    seq_len = 100
    topic_num = 31
    w_verify_func = du.word_verify(3, 16, 0.8, pu.nltk_stop_words)
    wf_flt_func = lambda word, freq: freq >= 3
    doc_flt_func = lambda d: len(d.tokens) >= 3 and d.topic is not None
    topic_flt_func = lambda rank, freq: freq >= 20

    def filter_into_temp(self):
        from bs4 import BeautifulSoup
        files = iu.list_children(self.orgn_file, full_path=True)
        array = list()
        for fidx, file in enumerate(files):
            print(fidx)
            # html.parser lower-cases SGML tag names, hence 'reuters' below
            tree = BeautifulSoup(''.join(iu.read_lines(file)), "html.parser")
            for article in tree.find_all("reuters"):
                # keep only articles tagged with exactly one topic
                topics = list(article.topics.children)
                if len(topics) != 1:
                    continue
                topic = topics[0].text.encode('ascii', 'ignore').decode('ascii')
                text = article.find('text')
                if text is None or text.body is None:
                    continue
                title = text.title.text if text.title is not None else ''
                title = ' '.join(pu.tokenize(title, pu.tokenize_pattern))
                body = ' '.join(pu.tokenize(text.body.text, pu.tokenize_pattern))
                array.append((topic, '{}, {}'.format(title, body)))
        docarr = du.make_docarr([(idx, topic, body) for idx, (topic, body) in enumerate(array)])
        print(len(docarr))
        print(Counter([d.topic for d in docarr]))
        print(len(set(d.topic for d in docarr)))
        du.dump_docarr(self.temp_file, docarr)
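Both this example and the previous one normalize text through pu.tokenize(text, pu.tokenize_pattern). A minimal stand-in, assuming tokenize_pattern is a regular expression consumed with re.findall (the pattern itself is an assumption, not the repo's actual one):

import re

tokenize_pattern = r"[A-Za-z0-9']+"  # assumed pattern

def tokenize(text, pattern):
    # return the substrings of text that match the pattern, in order
    return re.findall(pattern, text)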
Example #5
DataR10K builds the RCV1-v2 subset: it maps document ids to the four top-level categories, keeps single-label documents, and then reads the token files.
class DataR10K(Data):
    name = 'R10K'
    orgn = ['data']
    topic_num = 4
    w_verify_func = du.word_verify(3, 16, 0.6, None)
    wf_flt_func = lambda word, freq: freq >= 5
    doc_flt_func = lambda d: len(d.tokens) >= 10
    topic_flt_func = lambda rank, freq: True

    def filter_into_temp(self):
        from os.path import join
        did_to_cat = dict()
        cat_list = ['CCAT', 'GCAT', 'MCAT', 'ECAT']
        data_dir = _data_base.format(self.name, self.special[0])
        # map each document id to its top-level categories
        with open(join(data_dir, 'rcv1-v2.topics.qrels')) as fin:
            for line in fin:
                cat, did = line.strip().split(' ')[:2]
                if cat in cat_list:
                    did_to_cat.setdefault(int(did), []).append(cat)
        # keep only documents labeled with exactly one top-level category
        for did in list(did_to_cat.keys()):
            if len(did_to_cat[did]) > 1:
                del did_to_cat[did]

        dat_list = [
            'lyrl2004_tokens_test_pt0.dat', 'lyrl2004_tokens_test_pt1.dat',
            'lyrl2004_tokens_test_pt2.dat', 'lyrl2004_tokens_test_pt3.dat',
            'lyrl2004_tokens_train.dat'
        ]
        data = list()
        target = list()
        cat_to_cid = {'CCAT': 0, 'GCAT': 1, 'MCAT': 2, 'ECAT': 3}
        docarr = list()

        did, doc = None, ''
        for dat in dat_list:
            with open(join(data_dir, dat)) as fin:
                for line in fin:
                    if line.startswith('.I'):
                        # a new document starts; flush the previous one if it
                        # carries exactly one of the four categories
                        if did is not None:
                            assert doc != ''
                            if did in did_to_cat:
                                cid = cat_to_cid[did_to_cat[did][0]]
                                data.append(doc)
                                target.append(cid)
                                docarr.append((did, cid, doc))
                        did = int(line.strip().split(' ')[1])
                        doc = ''
                    elif line.startswith('.W'):
                        assert doc == ''
                    else:
                        doc += line
        # flush the final document, which no further '.I' line follows
        if did is not None and did in did_to_cat:
            cid = cat_to_cid[did_to_cat[did][0]]
            data.append(doc)
            target.append(cid)
            docarr.append((did, cid, doc))

        print(len(data), 'and', len(did_to_cat))
        print(data[0])
        assert len(data) == len(did_to_cat)
        print(len(docarr))

        du.dump_docarr(self.temp_file, du.make_docarr(docarr[:20000]))
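A hypothetical driver tying the five examples together; it assumes the classes are importable and need no constructor arguments:

if __name__ == '__main__':
    # materialize each corpus's temp file before downstream processing
    for data_cls in (DataGoogle, DataEvent, DataTREC, DataReuters, DataR10K):
        d = data_cls()
        print('filtering', d.name, '->', d.temp_file)
        d.filter_into_temp()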