Example #1
def slice_lines_grouped_by_n(inf, outf, *, n, s):
    # From each group of n lines, write the line(s) selected by s.
    for lines in tools.group_n_lines(inf, n=n):
        if isinstance(s, slice):
            outf.write(''.join(lines[s]))
        elif isinstance(s, int):
            outf.write(lines[s])
        else:
            raise TypeError('s must be a slice or an int')
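
Every example on this page consumes tools.group_n_lines, whose source is not shown here. A minimal sketch consistent with how the examples use it (yielding consecutive n-tuples of lines); the zip_longest-based grouping is an assumption, not the project's actual code:

from itertools import zip_longest

def group_n_lines(iterable, *, n):
    # Hypothetical reconstruction: yield successive groups of n lines
    # as tuples, padding a short final group with empty strings.
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue='')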
Example #2
def run(self):
    # Split each 3-line group (source, target, separator) into two files.
    with self.input().open('r') as inf, \
            self.output()['src'].open('w') as outf1, \
            self.output()['tgt'].open('w') as outf2:
        for src, tgt, _ in tools.group_n_lines(inf, n=3):
            outf1.write(src)
            outf2.write(tgt)
Example #3
def ngram_pairs_from_lines(lines):
    # Lines come in groups of three: sentence, tags, separator.
    for i, (zh, tag, _) in enumerate(tools.group_n_lines(lines, n=3), 1):
        zh, tag = zh.split(), tag.split()
        if i % 100000 == 0:
            print('{:,}'.format(i))  # show progress
        yield from task.ngram_pairs(zh, tag)
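
task.ngram_pairs is likewise not shown. As a rough, assumed sketch only, it might pair each token n-gram with the tag n-gram at the same positions:

def ngram_pairs(zh, tag, n=2):
    # Hypothetical sketch: yield each token n-gram together with the
    # tag n-gram covering the same positions.
    for i in range(len(zh) - n + 1):
        yield tuple(zh[i:i + n]), tuple(tag[i:i + n])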
Example #4
def run(self):
    with self.input().open('r') as inf, self.output().open('w') as outf:
        for zh, en, _ in tools.group_n_lines(inf, n=3):
            zh, en = zh.strip(), en.strip()
            zh = tools.zhsent_preprocess(zh)
            # Route print() into the output file to rebuild the 3-line format.
            with contextlib.redirect_stdout(outf):
                print(zh)
                print(en)
                print()
Example #5
def run(self):
    from geniatagger import GeniaTaggerClient
    gtagger = GeniaTaggerClient()
    with self.input().open('r') as input_file, \
            self.output()['np'].open('w') as np_out, \
            self.output()['vp'].open('w') as vp_out, \
            self.output()['pure_np'].open('w') as pure_np_out:
        for en, ch in tools.group_n_lines(input_file, n=2):
            en, ch = en.strip(), ch.strip()
            en_tag_info = gtagger.parse(en)
            # wdata[3] is the chunk tag; route the pair by whether the
            # English side starts with, lacks, or merely contains a B-VP chunk.
            if en_tag_info[0][3] == 'B-VP':
                outfile = vp_out
            elif 'B-VP' not in (wdata[3] for wdata in en_tag_info):
                outfile = pure_np_out
            else:
                outfile = np_out

            print(en, file=outfile)
            print(ch, file=outfile)
            print(*('/'.join(wdata) for wdata in en_tag_info), file=outfile)
            print(file=outfile)
Example #6
def run(self):
    import sys
    from nltk.tokenize import sent_tokenize, RegexpTokenizer
    # The inner group must be non-capturing: RegexpTokenizer uses
    # re.findall, and a capturing group would make it return only the
    # quoted 「…」 fragments instead of whole sentences.
    ch_sent_tokenize = RegexpTokenizer(
        '(?:[^。「」!?]*(?:「[^」]*」)?[^。「」!?]*)+[。!?;]?').tokenize

    with self.input().open('r') as input_file, self.output().open('w') as output_file:
        for en, ch in tools.group_n_lines(input_file, n=2):
            en, ch = en.strip(), ch.strip()
            ens = sent_tokenize(en)
            chs = [sub_ch for sub_ch in ch_sent_tokenize(ch) if sub_ch != '']

            if len(ens) != len(chs):
                print('Unmatched sentences length:', ens, chs, file=sys.stderr)
                continue

            # Average per-sentence translation score, normalized by the
            # token length of the whole English line.
            score = sum(translate_score(en, ch)
                        for en, ch in zip(ens, chs)) / len(en.split())

            for en, ch in zip(ens, chs):
                print(score, en, ch, sep='\t', file=output_file)
Example #7
def transformat_line2slash(inf, outf):
    # Convert 3-line groups (sentence, tags, separator) into a single
    # line of word/tag pairs.
    for zh, tag, _ in tools.group_n_lines(inf, n=3):
        zh, tag = zh.strip().split(), tag.strip().split()
        print(*('{}/{}'.format(z, t) for z, t in zip(zh, tag)), file=outf)
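
For illustration, a quick way to exercise Example #7 in isolation, assuming the three-line input format (sentence, tags, blank separator) used throughout this page:

import io

inf = io.StringIO('我 爱 你\nN V N\n\n')
outf = io.StringIO()
transformat_line2slash(inf, outf)
print(outf.getvalue())  # 我/N 爱/V 你/N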