Example #1
def slice_lines_grouped_by_n(inf, outf, *, n, s):
    # From each group of n lines, write the line(s) selected by s.
    for lines in tools.group_n_lines(inf, n=n):
        if isinstance(s, slice):
            outf.write(''.join(lines[s]))
        elif isinstance(s, int):
            outf.write(lines[s])
        else:
            raise TypeError('s must be a slice or an int')
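
Every example on this page consumes tools.group_n_lines, whose source is not shown here. A minimal sketch consistent with how the examples use it (yielding consecutive n-tuples of lines); the zip_longest-based grouping is an assumption, not the project's actual code:

from itertools import zip_longest

def group_n_lines(iterable, *, n):
    # Hypothetical reconstruction: yield successive groups of n lines
    # as tuples, padding a short final group with empty strings.
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue='')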
Example #2
def run(self):
    # Split each 3-line group (source, target, separator) into two files.
    with self.input().open('r') as inf, \
            self.output()['src'].open('w') as outf1, \
            self.output()['tgt'].open('w') as outf2:
        for src, tgt, _ in tools.group_n_lines(inf, n=3):
            outf1.write(src)
            outf2.write(tgt)
Example #3
def ngram_pairs_from_lines(lines):
    # Lines come in groups of three: sentence, tags, separator.
    for i, (zh, tag, _) in enumerate(tools.group_n_lines(lines, n=3), 1):
        zh, tag = zh.split(), tag.split()
        if i % 100000 == 0:
            print('{:,}'.format(i))  # show progress
        yield from task.ngram_pairs(zh, tag)
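
task.ngram_pairs is likewise not shown. As a rough, assumed sketch only, it might pair each token n-gram with the tag n-gram at the same positions:

def ngram_pairs(zh, tag, n=2):
    # Hypothetical sketch: yield each token n-gram together with the
    # tag n-gram covering the same positions.
    for i in range(len(zh) - n + 1):
        yield tuple(zh[i:i + n]), tuple(tag[i:i + n])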
Example #4
def run(self):
    with self.input().open('r') as inf, self.output().open('w') as outf:
        for zh, en, _ in tools.group_n_lines(inf, n=3):
            zh, en = zh.strip(), en.strip()
            zh = tools.zhsent_preprocess(zh)
            # Route print() into the output file to rebuild the 3-line format.
            with contextlib.redirect_stdout(outf):
                print(zh)
                print(en)
                print()
Example #5
def run(self):
    from geniatagger import GeniaTaggerClient
    gtagger = GeniaTaggerClient()
    with self.input().open('r') as input_file, \
            self.output()['np'].open('w') as np_out, \
            self.output()['vp'].open('w') as vp_out, \
            self.output()['pure_np'].open('w') as pure_np_out:
        for en, ch in tools.group_n_lines(input_file, n=2):
            en, ch = en.strip(), ch.strip()
            en_tag_info = gtagger.parse(en)
            # wdata[3] is the chunk tag; route the pair by whether the
            # English side starts with, lacks, or merely contains a B-VP chunk.
            if en_tag_info[0][3] == 'B-VP':
                outfile = vp_out
            elif 'B-VP' not in (wdata[3] for wdata in en_tag_info):
                outfile = pure_np_out
            else:
                outfile = np_out

            print(en, file=outfile)
            print(ch, file=outfile)
            print(*('/'.join(wdata) for wdata in en_tag_info), file=outfile)
            print(file=outfile)
Example #6
def run(self):
    import sys
    from nltk.tokenize import sent_tokenize, RegexpTokenizer
    # The inner group must be non-capturing: RegexpTokenizer uses
    # re.findall, and a capturing group would make it return only the
    # quoted 「…」 fragments instead of whole sentences.
    ch_sent_tokenize = RegexpTokenizer(
        '(?:[^。「」!?]*(?:「[^」]*」)?[^。「」!?]*)+[。!?;]?').tokenize

    with self.input().open('r') as input_file, self.output().open('w') as output_file:
        for en, ch in tools.group_n_lines(input_file, n=2):
            en, ch = en.strip(), ch.strip()
            ens = sent_tokenize(en)
            chs = [sub_ch for sub_ch in ch_sent_tokenize(ch) if sub_ch != '']

            if len(ens) != len(chs):
                print('Unmatched sentences length:', ens, chs, file=sys.stderr)
                continue

            # Average per-sentence translation score, normalized by the
            # token length of the whole English line.
            score = sum(translate_score(en, ch)
                        for en, ch in zip(ens, chs)) / len(en.split())

            for en, ch in zip(ens, chs):
                print(score, en, ch, sep='\t', file=output_file)
Example #7
def transformat_line2slash(inf, outf):
    # Convert 3-line groups (sentence, tags, separator) into a single
    # line of word/tag pairs.
    for zh, tag, _ in tools.group_n_lines(inf, n=3):
        zh, tag = zh.strip().split(), tag.strip().split()
        print(*('{}/{}'.format(z, t) for z, t in zip(zh, tag)), file=outf)
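
For illustration, a quick way to exercise Example #7 in isolation, assuming the three-line input format (sentence, tags, blank separator) used throughout this page:

import io

inf = io.StringIO('我 爱 你\nN V N\n\n')
outf = io.StringIO()
transformat_line2slash(inf, outf)
print(outf.getvalue())  # 我/N 爱/V 你/N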