def filter_into_temp(self):
    twarr = iu.load_array(self.orgn_file)
    print(len(twarr), type(twarr[0]))
    docarr = du.make_docarr(
        [[tw[k] for k in ('tweetId', 'clusterNo', 'textCleaned')] for tw in twarr])
    du.dump_docarr(self.temp_file, docarr)
def filter_into_temp(self):
    from bs4 import BeautifulSoup
    files = iu.list_children(self.orgn_file, full_path=True)
    array = list()
    for fidx, file in enumerate(files):
        print(fidx)
        tree = BeautifulSoup(''.join(iu.read_lines(file)), "html.parser")
        for article in tree.find_all("reuters"):
            # keep only articles that carry exactly one topic label
            topics = list(article.topics.children)
            if len(topics) != 1:
                continue
            # encode/decode round-trips drop unencodable characters and avoid the
            # b'...' wrapper that str(bytes) would produce under Python 3
            topic = topics[0].text.encode('ascii', 'ignore').decode('ascii')
            text = article.find('text')
            if text is None or text.body is None:
                continue
            title = text.title.text.encode('utf-8', 'ignore').decode('utf-8') if text.title is not None else ''
            title = ' '.join(pu.tokenize(title, pu.tokenize_pattern))
            body = text.body.text.encode('utf-8', 'ignore').decode('utf-8')
            body = ' '.join(pu.tokenize(body, pu.tokenize_pattern))
            array.append((topic, '{}, {}'.format(title, body)))
    docarr = du.make_docarr([(idx, topic, body) for idx, (topic, body) in enumerate(array)])
    print(len(docarr))
    print(Counter([d.topic for d in docarr]))
    print(len(sorted(set([d.topic for d in docarr]))))
    du.dump_docarr(self.temp_file, docarr)
def filter_from_temp(self):
    c = self.__class__
    topic_flt_func, wf_flt_func, doc_flt_func = c.topic_flt_func, c.wf_flt_func, c.doc_flt_func
    # load the uniform temp file, drop duplicate doc ids, tokenize (optionally with stemming)
    docarr = du.load_docarr(self.temp_file)
    docarr = du.filter_duplicate_docid(docarr)
    docarr = du.tokenize_docarr(docarr, stemming=self.stem)
    print('data prepare (filter duplicate id, tokenize) over')
    # keep only topics accepted by topic_flt_func, then build the word-index dict under wf_flt_func
    acc_topics, rej_topics_1, docarr = du.filter_docarr_by_topic(docarr, topic_flt_func)
    docarr, ifd = du.docarr_bootstrap_ifd(docarr, wf_flt_func)
    # docarr = du.filter_docarr(docarr, doc_flt_func)
    # docarr, ifd = du.docarr_bootstrap_ifd(docarr, wf_flt_func)
    # acc_topics, rej_topics_2, docarr = du.filter_docarr_by_topic(docarr, topic_flt_func)
    # docarr, ifd = du.docarr_bootstrap_ifd(docarr, wf_flt_func)
    # docarr = du.filter_docarr(docarr, doc_flt_func)
    # rej_topics = rej_topics_1 + rej_topics_2
    rej_topics = rej_topics_1
    print('get {} docs\n'.format(len(docarr)))
    print('{} suff topic:{}\n'.format(len(acc_topics), acc_topics.most_common()))
    print('{} insuff topic:{}'.format(len(rej_topics), rej_topics.most_common()[:20]))
    docarr = sorted(docarr, key=lambda d: d.topic)
    ifd.dump_dict(self.dict_file)
    du.dump_docarr(self.docarr_file, docarr)
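# A minimal sketch of how the two stages are typically chained, assuming a dataset
# class that exposes orgn_file/temp_file/docarr_file/dict_file plus the topic/word/doc
# filter functions (the class name `SomeData` is hypothetical):
#
#     d = SomeData()
#     d.filter_into_temp()   # raw corpus -> uniform (docid, topic, text) temp file
#     d.filter_from_temp()   # temp file -> tokenized/filtered docarr + word dict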
def filter_into_temp(self):
    file_list = iu.list_children(self.orgn_file, full_path=True)
    twarr_list = [iu.load_array(file) for file in file_list]
    doclist = list()
    for topic_id, twarr in enumerate(twarr_list):
        for tw in twarr:
            doclist.append((str(tw['id']), topic_id, tw['text'].replace('#', '')))
    docarr = du.make_docarr(doclist)
    du.dump_docarr(self.temp_file, docarr)
def filter_into_temp(self):
    json_list = iu.load_array(self.orgn_file)
    item_list = list()
    for i, o in enumerate(json_list):
        text = ' '.join(pu.tokenize(o['text'], pu.tokenize_pattern)[:1200])
        # text = ' '.join(pu.tokenize(o['text'], pu.tokenize_pattern)[:3000])
        # text = o['text']
        item_list.append((i, o['cluster'], text))
    docarr = du.make_docarr(item_list)
    du.dump_docarr(self.temp_file, docarr)
def filter_into_temp(self):
    from os.path import join
    did_to_cat = dict()
    cat_list = ['CCAT', 'GCAT', 'MCAT', 'ECAT']
    data_dir = _data_base.format(self.name, self.special[0])
    # map each document id to its top-level RCV1 categories
    with open(join(data_dir, 'rcv1-v2.topics.qrels')) as fin:
        for line in fin.readlines():
            line = line.strip().split(' ')
            cat = line[0]
            did = int(line[1])
            if cat in cat_list:
                did_to_cat[did] = did_to_cat.get(did, []) + [cat]
    # did_to_cat = {k: did_to_cat[k] for k in list(did_to_cat.keys()) if len(did_to_cat[k]) > 1}
    # discard documents that carry more than one category
    for did in list(did_to_cat.keys()):
        if len(did_to_cat[did]) > 1:
            del did_to_cat[did]
    dat_list = ['lyrl2004_tokens_test_pt0.dat', 'lyrl2004_tokens_test_pt1.dat',
                'lyrl2004_tokens_test_pt2.dat', 'lyrl2004_tokens_test_pt3.dat',
                'lyrl2004_tokens_train.dat']
    data = list()
    target = list()
    cat_to_cid = {'CCAT': 0, 'GCAT': 1, 'MCAT': 2, 'ECAT': 3}
    docarr = list()
    # make sure the 'did' bound in the qrels loop does not leak into the parsing below
    del did
    for dat in dat_list:
        with open(join(data_dir, dat)) as fin:
            for line in fin.readlines():
                if line.startswith('.I'):
                    # a new document starts; flush the previous one if it kept a single category
                    if 'did' in locals():
                        assert doc != ''
                        if did in did_to_cat:
                            data.append(doc)
                            target.append(cat_to_cid[did_to_cat[did][0]])
                            docarr.append((did, cat_to_cid[did_to_cat[did][0]], doc))
                    did = int(line.strip().split(' ')[1])
                    doc = ''
                elif line.startswith('.W'):
                    assert doc == ''
                else:
                    doc += line
    print((len(data), 'and', len(did_to_cat)))
    print(data[0])
    assert len(data) == len(did_to_cat)
    print(len(docarr))
    du.dump_docarr(self.temp_file, du.make_docarr(docarr[:20000]))
def filter_into_temp(self):
    twarr = iu.load_array(self.orgn_file)
    outrows = list()
    for idx, tw in enumerate(twarr):
        # skip tweets marked with relevance greater than 1
        if tw['relevance'] > 1:
            continue
        docid, topic, text = tw['tweetId'], tw['clusterNo'], tw['text']
        # skip tweets whose tokenized text is 10 characters or fewer
        if len(' '.join(pu.tokenize(text, pu.tokenize_pattern))) <= 10:
            continue
        outrows.append([docid, topic, text])
    topics = Counter([r[1] for r in outrows])
    print('get {} rows'.format(len(outrows)))
    print('{} topics, {}'.format(len(topics), topics))
    du.dump_docarr(self.temp_file, du.make_docarr(outrows))