def run(self):
    """Attach example sentence pairs to every SPG record.

    Two passes:
      1. Read the parallel 'en'/'ch' inputs in lockstep and build an
         inverted index from English-side bigrams to the set of sentence
         numbers containing them.
      2. For each JSON line in the 'spg' input, intersect the index sets
         for the bigrams of each spg's ``en_phrase`` to get candidate
         sentences, keep only pairs where the English side contains
         ``en_phrase`` and the Chinese side contains ``ch_phrase``, store
         them under ``spg['sents']``, and write the augmented record out
         as one JSON line.
    """
    en_ch_sents = []
    sents_index = defaultdict(set)
    print('building sents index...')
    with self.input()['en'].open('r') as enf, \
            self.input()['ch'].open('r') as chf:
        for sent_no, (en_sent, ch_sent) in enumerate(zip(enf, chf)):
            en_ch_sents.append((en_sent.strip(), ch_sent.strip()))
            bigrams = tools.ngrams(en_sent.strip().split(), 2)
            for bigram in bigrams:
                sents_index[bigram].add(sent_no)
    print('finding spg sents...')
    with self.input()['spg'].open('r') as spgf, \
            self.output().open('w') as outputf:
        for spgs_of_en_pattern_json in spgf:
            spgs_of_en_pattern = json.loads(spgs_of_en_pattern_json)
            for spg in spgs_of_en_pattern['ch_patterns']:
                bigrams = tools.ngrams(spg['en_phrase'].split(), 2)
                # Use .get() rather than defaultdict indexing: a lookup
                # of an unseen bigram must not insert an empty set into
                # the index.
                sents_nos_sets = [sents_index.get(bigram, set())
                                  for bigram in bigrams]
                # BUG FIX: the original reduce() over this sequence with
                # no initial value raised TypeError whenever en_phrase had
                # fewer than two words (no bigrams). Treat that case as
                # "no candidate sentences".
                sents_nos = (set.intersection(*sents_nos_sets)
                             if sents_nos_sets else set())
                sents = (en_ch_sents[sent_no] for sent_no in sents_nos)
                spg['sents'] = [
                    (en_sent, ch_sent)
                    for en_sent, ch_sent in sents
                    if spg['en_phrase'] in en_sent
                    and spg['ch_phrase'] in ch_sent
                ]
            print(json.dumps(spgs_of_en_pattern,
                             ensure_ascii=False,
                             check_circular=False),
                  file=outputf)
def selectFeatures(data, tags, stoplist, p):
    """Compute n-gram features for each record and drop featureless ones.

    For every record ``r`` in *data*, builds ``r['feature']`` from the
    n-grams (n = 1 .. len(p)) of the fields named in *tags*, keeping only
    n-grams that pass the high-pass filter ``csfilter`` with threshold
    ``p[n - 1]``. Records whose feature list ends up empty are removed
    from *data* in place.

    Args:
        data: list of dict-like records; mutated in place.
        tags: field names whose values are fed to ``ngrams``; missing
            fields are skipped (KeyError is ignored).
        stoplist: stopword collection forwarded to ``ngrams``.
        p: per-length thresholds; ``p[i]`` applies to (i+1)-grams.
    """
    for r in data:
        r['feature'] = []
        for num in range(1, len(p) + 1):  # n-gram lengths
            stat = Counter()
            for tag in tags:
                try:
                    stat += ngrams(r[tag], num, stoplist)
                except KeyError:
                    # Record lacks this field; just skip it.
                    continue
            nc = csfilter(stat, p[num - 1], 0)
            r['feature'] += [k for k, v in nc]
    # PERF FIX: the original collected featureless records in a side list
    # and called data.remove(r) per record — O(n) per removal (quadratic
    # overall) and dependent on dict equality. An in-place slice
    # assignment keeps the same list object and removes the same records
    # in one pass.
    data[:] = [r for r in data if r['feature']]
def convert2pytables(phrasetable_path, lexe2f_path, lexf2e_path, h5_path,
                     reverse=False):
    """Convert a Moses gzip'd phrase table plus lexical-weight files into
    an HDF5 (PyTables) file indexed by English-side bigrams.

    Args:
        phrasetable_path: gzip'd phrase table, fields separated by ' ||| '.
        lexe2f_path: lexical en->ch probability file, one entry per line.
        lexf2e_path: lexical ch->en probability file, one entry per line.
        h5_path: destination HDF5 file (overwritten).
        reverse: if True, swap source/target sides everywhere (phrases,
            lexical tables, scores, and alignment directions).
    """
    class PTable(tb.IsDescription):
        bigram = tb.StringCol(30)
        en = tb.StringCol(200)
        ch = tb.StringCol(200)
        aligns = tb.StringCol(100)
        scores = tb.Float64Col(shape=4)

    # Lexical en->ch table: "<en> <ch...> <prob>"; the middle field may
    # contain spaces, so split off the first and last tokens explicitly.
    lexe2f = []
    with open(lexe2f_path) as lexe2f_f:
        for line in lexe2f_f:
            en, ch_prob = line.strip().split(' ', 1)
            ch, prob = ch_prob.rsplit(' ', 1)
            lexe2f.append((en, ch, prob))

    lexf2e = []
    with open(lexf2e_path) as lexf2e_f:
        for line in lexf2e_f:
            # BUG FIX: the original did
            #     ch, en, prob = line.strip().split(' ', 1)
            # which always raises ValueError (maxsplit=1 yields two
            # fields, unpacked into three names). Parse the same way as
            # the lexe2f file above.
            ch, en_prob = line.strip().split(' ', 1)
            en, prob = en_prob.rsplit(' ', 1)
            lexf2e.append((ch, en, prob))

    if reverse:
        lexe2f, lexf2e = lexf2e, lexe2f

    with tb.open_file(h5_path, mode='w', title='PhraseTable') as h5file, \
            gzip.open(phrasetable_path, 'rt') as ptfile:
        filters = tb.Filters(complevel=9, complib='blosc')
        h5file.create_array('/', 'lexe2f', lexe2f, 'lex en to ch prob')
        h5file.create_array('/', 'lexf2e', lexf2e, 'lex ch to en prob')
        table = h5file.create_table(
            '/', 'phrasetable',
            description=PTable,
            title='Phrase Table',
            filters=filters,
        )
        print(h5file)
        table_row = table.row
        for line in ptfile:
            en, ch, scores, aligns, cnt = line.strip().split(' ||| ')
            if reverse:
                en, ch = ch, en
            en, ch = en.strip(), ch.strip()
            inv_phrase_prob, inv_lex_w, dir_phrase_prob, dir_lex_w, _ = map(
                float, scores.strip().split())
            if reverse:
                # Swapping sides also swaps which direction is
                # "inverse" vs "direct".
                inv_phrase_prob, inv_lex_w, dir_phrase_prob, dir_lex_w = \
                    dir_phrase_prob, dir_lex_w, inv_phrase_prob, inv_lex_w
            aligns = (map(int, align.split('-'))
                      for align in aligns.strip().split())
            if reverse:
                aligns = ((en_pos, ch_pos) for ch_pos, en_pos in aligns)
            aligns_ddict = defaultdict(list)
            for en_pos, ch_pos in aligns:
                aligns_ddict[en_pos].append(ch_pos)
            aligns = dict(aligns_ddict)
            # One table row per English bigram so the bigram column can
            # be used as a completely-sorted index for lookups.
            bigrams = tools.ngrams(en.split(), 2)
            for bigram in bigrams:
                table_row['bigram'] = ' '.join(bigram).encode('utf8')
                table_row['en'] = en.encode('utf8')
                table_row['ch'] = ch.encode('utf8')
                table_row['aligns'] = json.dumps(aligns).encode('utf8')
                table_row['scores'] = (inv_phrase_prob, inv_lex_w,
                                       dir_phrase_prob, dir_lex_w)
                table_row.append()
        table.flush()
        table.cols.bigram.create_csindex(filters=filters)