def _merge_alignment_parts(self, model_prefix: Path, output_file_path: Path) -> None:
    """Merge per-part GIZA alignment output files into one ordered corpus file.

    Each ``<model_prefix>.A<suffix>.part*`` file holds 3-line records; the
    first line of a record carries the segment index in parentheses.  All
    records are collected, sorted by segment index, and written out via
    ``write_corpus``.
    """
    alignments: List[Tuple[int, str]] = []
    part_pattern = model_prefix.name + f".A{self.file_suffix}.part*"
    for part_path in model_prefix.parent.glob(part_pattern):
        with open(part_path, "r", encoding="utf-8") as part_file:
            record = ""
            seg_id = 0
            for row_number, row in enumerate(part_file):
                record += row
                position = row_number % 3
                if position == 0:
                    # Header row: the segment index sits between the parentheses.
                    open_paren = row.index("(")
                    close_paren = row.index(")")
                    seg_id = int(row[open_paren + 1 : close_paren])
                elif position == 2:
                    # Third row closes the record; store it keyed by segment index.
                    alignments.append((seg_id, record.strip()))
                    record = ""
    write_corpus(
        output_file_path,
        (str(pair[1]) for pair in sorted(alignments, key=lambda pair: pair[0])),
    )
# -*- coding: utf-8 -*-

'''
Script to convert CoNLL formatted files to plain text.

Also works with token_tag formatted files. Tokens are separated by
whitespace. All tags are lost.
'''

import argparse

from nlpnet.pos.pos_reader import POSReader

import utils


if __name__ == '__main__':
    # The module docstring doubles as the --help description.
    arg_parser = argparse.ArgumentParser(description=__doc__)
    arg_parser.add_argument('input', help='Input CoNLL formatted (or token_tag) file')
    arg_parser.add_argument('output', help='Output file')
    options = arg_parser.parse_args()

    # load_dictionaries=False: we only need the raw sentences, no tag vocab.
    reader = POSReader(filename=options.input, load_dictionaries=False)
    utils.write_corpus(reader.sentences, options.output, False)
'''
Read a POS tagged corpus and extract sentences until a given number of
tokens is achieved. The extracted sentences are saved in a new file.
'''

import argparse

from nlpnet.pos.pos_reader import POSReader

import utils


if __name__ == '__main__':
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument('input', help='Corpus file')
    cli.add_argument('output', help='Output file')
    cli.add_argument('num_tokens', help='Minimum number of tokens', type=int)
    opts = cli.parse_args()

    reader = POSReader(filename=opts.input, load_dictionaries=False)

    # Keep whole sentences until the running token total reaches the
    # requested minimum (so the result may slightly exceed num_tokens).
    selected = []
    token_count = 0
    for sentence in reader.sentences:
        if token_count >= opts.num_tokens:
            break
        selected.append(sentence)
        token_count += len(sentence)

    utils.write_corpus(selected, opts.output)
orig.close() while True: context_freq = context_stats(inc, f=args.f) scores_rule = scores(context_freq, best_rules, f=args.f) #ss = scores_rule[0] best_rule = scores_rule[0] for r in best_rule.keys(): best_rules.append(r) best_score = scores_rule[1] applied = scores_rule[2] if best_score <= 0: output = open('%s.final' % name, 'w') write_corpus(inc, output) output.close() out.close() break best_rule = reversed(sorted(best_rule.items(), key=lambda t: t[1])) for r, a in best_rule: inc = list(apply_rule(r, inc, f=args.f)) try: out.write(r.display()) except: out.write(r.display().encode('utf-8')) out.write('score=%s applied=%s\n' % (str(best_score), a)) if args.p: for r in best_rules[:-1]: