示例#1
0
    def _merge_alignment_parts(self, model_prefix: Path, output_file_path: Path) -> None:
        """Merge split alignment part files into one corpus file, ordered by segment index.

        Each part file consists of 3-line records; the first line of a record
        carries the segment index in parentheses, and all three lines together
        form the alignment text for that segment.
        """
        pattern = model_prefix.name + f".A{self.file_suffix}.part*"
        collected: List[Tuple[int, str]] = []
        for part_path in model_prefix.parent.glob(pattern):
            with open(part_path, "r", encoding="utf-8") as part_file:
                record_lines: List[str] = []
                seg_id = 0
                for row, text in enumerate(part_file):
                    record_lines.append(text)
                    position = row % 3
                    if position == 0:
                        # First line of the record: segment index between "(" and ")".
                        open_paren = text.index("(")
                        close_paren = text.index(")")
                        seg_id = int(text[open_paren + 1 : close_paren])
                    elif position == 2:
                        # Third line closes the record: store it and start a new one.
                        collected.append((seg_id, "".join(record_lines).strip()))
                        record_lines = []

        write_corpus(
            output_file_path,
            (pair[1] for pair in sorted(collected, key=lambda pair: pair[0])),
        )
示例#2
0
# -*- coding: utf-8 -*-

'''
Script to convert CoNLL formatted files to plain text.
Also works with token_tag formatted files.
Tokens are separated by whitespace. All tags are lost.
'''

import argparse
from nlpnet.pos.pos_reader import POSReader

import utils

if __name__ == '__main__':
    # Command-line entry point: read a CoNLL/token_tag corpus, emit plain text.
    arg_parser = argparse.ArgumentParser(description=__doc__)
    arg_parser.add_argument('input', help='Input CoNLL formatted (or token_tag) file')
    arg_parser.add_argument('output', help='Output file')
    options = arg_parser.parse_args()

    reader = POSReader(filename=options.input, load_dictionaries=False)
    # Third argument False matches the original call — presumably a flag on
    # utils.write_corpus; its meaning is not visible from here.
    utils.write_corpus(reader.sentences, options.output, False)




示例#3
0
'''
Read a POS tagged corpus and extract sentences until a given number
of tokens is achieved. The extracted sentences are saved in a new file.
'''

import argparse
from nlpnet.pos.pos_reader import POSReader

import utils

if __name__ == '__main__':
    # Command-line entry point: copy sentences from the input corpus until the
    # requested token budget is reached, then save them to the output file.
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument('input', help='Corpus file')
    cli.add_argument('output', help='Output file')
    cli.add_argument('num_tokens', help='Minimum number of tokens', type=int)
    opts = cli.parse_args()

    reader = POSReader(filename=opts.input, load_dictionaries=False)
    kept = []
    token_count = 0

    for sentence in reader.sentences:
        # Stop once the budget is met; the last appended sentence may push the
        # total past num_tokens (hence "minimum" in the help text).
        if token_count >= opts.num_tokens:
            break
        kept.append(sentence)
        token_count += len(sentence)

    utils.write_corpus(kept, opts.output)
示例#4
0
    orig.close()

    while True:
        context_freq = context_stats(inc, f=args.f)
        scores_rule = scores(context_freq, best_rules, f=args.f)
        #ss = scores_rule[0]
        best_rule = scores_rule[0]

        for r in best_rule.keys():
            best_rules.append(r)
        best_score = scores_rule[1]
        applied = scores_rule[2]

        if best_score <= 0:
            output = open('%s.final' % name, 'w')
            write_corpus(inc, output)
            output.close()
            out.close()
            break
        best_rule = reversed(sorted(best_rule.items(), key=lambda t: t[1]))

        for r, a in best_rule:
            inc = list(apply_rule(r, inc, f=args.f))
            try:
                out.write(r.display())
            except:
                out.write(r.display().encode('utf-8'))
            out.write('score=%s applied=%s\n' % (str(best_score), a))

        if args.p:
            for r in best_rules[:-1]:
示例#5
0
    orig.close()

    while True:
        context_freq = context_stats(inc, f=args.f)
        scores_rule = scores(context_freq, best_rules, f=args.f)
        #ss = scores_rule[0]
        best_rule = scores_rule[0]

        for r in best_rule.keys():
            best_rules.append(r)
        best_score = scores_rule[1]
        applied = scores_rule[2]

        if best_score <= 0:
            output = open('%s.final' % name, 'w')
            write_corpus(inc, output)
            output.close()
            out.close()
            break
        best_rule = reversed(sorted(best_rule.items(), key=lambda t: t[1]))

        for r, a in best_rule:
            inc = list(apply_rule(r, inc, f=args.f))
            try:
                out.write(r.display())
            except:
                out.write(r.display().encode('utf-8'))
            out.write('score=%s applied=%s\n' % (str(best_score), a))

        if args.p:
            for r in best_rules[:-1]: