예제 #1
0
        def run(self):
            """Attach example sentence pairs to every pattern pair and write them out.

            Reads the parallel ``'en'`` and ``'ch'`` inputs line by line and builds
            an inverted index from each English bigram to the numbers of the
            sentences containing it.  Then, for every JSON line of the ``'spg'``
            input, each entry of its ``'ch_patterns'`` list gets a ``'sents'`` key
            holding the ``(en_sent, ch_sent)`` pairs whose English side contains
            ``spg['en_phrase']`` and whose Chinese side contains
            ``spg['ch_phrase']``.  One JSON line is written to the output per
            input line.
            """
            en_ch_sents = []
            sents_index = defaultdict(set)
            print('building sents index...')
            with self.input()['en'].open('r') as enf, \
                 self.input()['ch'].open('r') as chf:
                for sent_no, (en_line, ch_line) in enumerate(zip(enf, chf)):
                    en_sent = en_line.strip()
                    en_ch_sents.append((en_sent, ch_line.strip()))
                    for bigram in tools.ngrams(en_sent.split(), 2):
                        sents_index[bigram].add(sent_no)

            print('finding spg sents...')
            # Shared stand-in for "bigram never seen".  Using .get() instead of
            # indexing keeps the defaultdict from growing an empty set for every
            # unseen bigram that is merely looked up.
            no_hits = frozenset()
            with self.input()['spg'].open('r') as spgf, \
                 self.output().open('w') as outputf:

                for spgs_of_en_pattern_json in spgf:
                    spgs_of_en_pattern = json.loads(spgs_of_en_pattern_json)
                    for spg in spgs_of_en_pattern['ch_patterns']:
                        en_phrase = spg['en_phrase']
                        hits_per_bigram = [
                            sents_index.get(bigram, no_hits)
                            for bigram in tools.ngrams(en_phrase.split(), 2)]

                        if hits_per_bigram:
                            # Only sentences containing every bigram of the
                            # phrase can contain the phrase itself.
                            candidate_nos = reduce(lambda x, y: x & y,
                                                   hits_per_bigram)
                        else:
                            # No bigrams (presumably a one-word phrase): the
                            # index cannot narrow the search and reduce() would
                            # raise TypeError on the empty sequence, so check
                            # every sentence instead.
                            candidate_nos = range(len(en_ch_sents))

                        # Corpus order keeps the output reproducible; set
                        # iteration order is an implementation detail.
                        candidates = (en_ch_sents[sent_no]
                                      for sent_no in sorted(candidate_nos))
                        # The bigram index is only a pre-filter, so confirm the
                        # exact phrases by substring match.
                        spg['sents'] = [(en_sent, ch_sent)
                                        for en_sent, ch_sent in candidates
                                        if en_phrase in en_sent and
                                        spg['ch_phrase'] in ch_sent]

                    print(json.dumps(spgs_of_en_pattern,
                                     ensure_ascii=False,
                                     check_circular=False),
                          file=outputf)
예제 #2
0
파일: preprocess.py 프로젝트: s1van/cse5243
def selectFeatures(data, tags, stoplist, p):	# p[i] for i-gram passing high pass filter
	"""Compute n-gram features for every record and drop records that have none.

	For each record and each n-gram length ``n`` in ``1..len(p)``, the n-gram
	counts of all fields named in ``tags`` are summed, passed through
	``csfilter`` together with ``p[n - 1]``, and the keys of the surviving
	pairs are appended to ``r['feature']``.

	Args:
		data: mutable list of dict-like records; modified in place.
		tags: names of the record fields to extract n-grams from.  Fields
			missing from a record are skipped.
		stoplist: forwarded unchanged to ``ngrams``.
		p: indexable sequence with one filter parameter per n-gram length.

	Returns:
		None.  Every record gains a ``'feature'`` list, and records whose
		list ends up empty are removed from ``data``.
	"""
	for r in data:
		r['feature'] = []
		for num in range(1, len(p) + 1):	# n-gram lengths
			stat = Counter()
			for tag in tags:
				# Guard only the lookup, so that a KeyError raised inside
				# ngrams() itself is not silently swallowed.
				try:
					text = r[tag]
				except KeyError:
					continue
				stat += ngrams(text, num, stoplist)
			nc = csfilter(stat, p[num - 1], 0)
			r['feature'].extend(k for k, v in nc)

	# One in-place pass instead of repeated list.remove(), which rescans the
	# list and compares whole records for every record dropped.
	data[:] = [r for r in data if r['feature']]
예제 #3
0
    def convert2pytables(phrasetable_path, lexe2f_path, lexf2e_path, h5_path,
                         reverse=False):
        """Convert a gzipped phrase table plus two lexical tables into one HDF5 file.

        The HDF5 file receives two arrays, ``/lexe2f`` and ``/lexf2e``, and a
        table ``/phrasetable`` with one row per bigram of the English phrase,
        indexed on the ``bigram`` column.

        Args:
            phrasetable_path: gzip-compressed text file whose lines hold five
                `` ||| ``-separated fields: en, ch, scores, aligns, count.
                ``scores`` must contain exactly five numbers (the fifth is
                discarded); ``aligns`` is a whitespace-separated list of
                ``i-j`` position pairs.
            lexe2f_path: text file with ``<word> <word> <prob>`` lines.
            lexf2e_path: text file in the same format.
            h5_path: output file; overwritten if it exists.
            reverse: if true, swap the roles of the two languages: the two
                lexical tables, the phrase columns, the inverse/direct score
                pairs and the alignment positions are all exchanged.

        Raises:
            ValueError: if a line does not have the expected number of fields.
        """
        class PTable(tb.IsDescription):
            bigram = tb.StringCol(30)
            en = tb.StringCol(200)
            ch = tb.StringCol(200)
            aligns = tb.StringCol(100)
            scores = tb.Float64Col(shape=4)

        def read_lex_table(path):
            # Parse "<word> <word> <prob>" lines into (word, word, prob) string
            # triples.  Splitting once from the left and once from the right
            # keeps any extra spaces inside the middle field.
            entries = []
            with open(path) as lex_file:
                for line in lex_file:
                    first, rest = line.strip().split(' ', 1)
                    second, prob = rest.rsplit(' ', 1)
                    entries.append((first, second, prob))
            return entries

        # The previous lexf2e parser unpacked three names from split(' ', 1),
        # which yields at most two items and so failed on every line.
        lexe2f = read_lex_table(lexe2f_path)
        lexf2e = read_lex_table(lexf2e_path)

        if reverse:
            lexe2f, lexf2e = lexf2e, lexe2f

        with tb.open_file(h5_path, mode='w', title='PhraseTable') as h5file, \
             gzip.open(phrasetable_path, 'rt') as ptfile:
            filters = tb.Filters(complevel=9, complib='blosc')
            h5file.create_array('/', 'lexe2f', lexe2f, 'lex en to ch prob')
            h5file.create_array('/', 'lexf2e', lexf2e, 'lex ch to en prob')

            table = h5file.create_table(
                '/', 'phrasetable',
                description=PTable,
                title='Phrase Table',
                filters=filters,
            )
            print(h5file)
            table_row = table.row
            for line in ptfile:
                en, ch, scores, aligns, _cnt = line.strip().split(' ||| ')
                if reverse:
                    en, ch = ch, en
                en, ch = en.strip(), ch.strip()

                inv_phrase_prob, inv_lex_w, dir_phrase_prob, dir_lex_w, _ = map(
                    float, scores.strip().split())
                if reverse:
                    inv_phrase_prob, dir_phrase_prob = (dir_phrase_prob,
                                                        inv_phrase_prob)
                    inv_lex_w, dir_lex_w = dir_lex_w, inv_lex_w

                # Group alignment points by English position:
                # {en_pos: [ch_pos, ...]}.
                aligns_by_en = defaultdict(list)
                for align in aligns.strip().split():
                    en_pos, ch_pos = map(int, align.split('-'))
                    if reverse:
                        en_pos, ch_pos = ch_pos, en_pos
                    aligns_by_en[en_pos].append(ch_pos)

                # These values are identical for every bigram of the phrase,
                # so encode them once per line rather than once per row.
                en_bytes = en.encode('utf8')
                ch_bytes = ch.encode('utf8')
                aligns_bytes = json.dumps(dict(aligns_by_en)).encode('utf8')
                row_scores = (inv_phrase_prob, inv_lex_w,
                              dir_phrase_prob, dir_lex_w)

                for bigram in tools.ngrams(en.split(), 2):
                    table_row['bigram'] = ' '.join(bigram).encode('utf8')
                    table_row['en'] = en_bytes
                    table_row['ch'] = ch_bytes
                    table_row['aligns'] = aligns_bytes
                    table_row['scores'] = row_scores
                    table_row.append()
            table.flush()
            table.cols.bigram.create_csindex(filters=filters)