Example #1
import operator
from itertools import groupby

# read_file_to_lines, parse_line_to_entry, LexEntry, flatten and
# write_lines_to_file are helpers from the surrounding project.

def main(args):
    lines = read_file_to_lines(args.lex_table, args.unicode_escape)
    # Keep only lines that parsed successfully (drop falsy results).
    entries = filter(lambda x: x, [
        parse_line_to_entry(line, delimiter=args.delimiter) for line in lines
    ])
    # Group identical (src, tgt) pairs so duplicates can be merged.
    raw_lexicon = groupby(sorted(entries), key=(lambda e: (e.src, e.tgt)))

    def merge_duplicate_entry(entries):
        # Collapse duplicate entries by summing their probabilities.
        entries = list(entries)
        e = entries[0]
        return LexEntry(e.src, e.tgt, sum(e.prob for e in entries))

    def map_entries(entries):
        # Keep the three most probable translations per source word,
        # renormalize against the best one, and prune near-zero entries.
        entries = sorted(entries, key=lambda e: -e.prob)[:3]
        max_prob = entries[0].prob
        entries = [
            LexEntry(e.src, e.tgt, e.prob / max_prob) for e in entries
        ]
        return filter(lambda e: e.prob > 1e-5, entries)

    merged_lexicon = map(merge_duplicate_entry,
                         map(operator.itemgetter(1), raw_lexicon))
    lexicon = groupby(sorted(merged_lexicon), key=(lambda e: e.src))
    pruned_lexicon = flatten(
        map(map_entries, map(operator.itemgetter(1), lexicon)))
    out_lines = map(lambda e: f"{e.src} {e.prob} {e.tgt}", pruned_lexicon)
    write_lines_to_file(args.lexicon_path, out_lines)
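For reference, a minimal self-contained sketch of the same merge-then-renormalize idea; Entry is a stand-in for the project's LexEntry, and the toy data is made up:

from collections import namedtuple
from itertools import groupby

Entry = namedtuple("Entry", ["src", "tgt", "prob"])  # stand-in for LexEntry

entries = [Entry("kin", "today", 0.4), Entry("kin", "today", 0.2),
           Entry("kin", "gold", 0.3)]

# Merge duplicates by summing probabilities per (src, tgt) pair.
merged = [Entry(k[0], k[1], sum(e.prob for e in g))
          for k, g in groupby(sorted(entries), key=lambda e: (e.src, e.tgt))]

# Renormalize each source word's candidates against its best probability.
for src, group in groupby(sorted(merged), key=lambda e: e.src):
    group = sorted(group, key=lambda e: -e.prob)
    best = group[0].prob
    print([(e.tgt, e.prob / best) for e in group])
# [('today', 1.0), ('gold', 0.5)]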
Example #2
@classmethod
def from_moses(cls, moses_path, unicode_escape):
    lines = read_file_to_lines(moses_path, unicode_escape)
    # Moses phrase-table fields are separated by "|||"; a raw string keeps
    # the escaped pipes intact as a regex delimiter.
    entries = filter(lambda x: x, [
        MosesHelper.parse_line_to_entry(line, delimiter=r'\|\|\|')
        for line in lines
    ])
    return cls(list(entries))
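For context: Moses phrase tables use "|||" as the field separator (source ||| target ||| scores), which is why the delimiter is an escaped-pipe regex. A small standalone illustration with a made-up line:

import re

line = "今仔日 ||| kin-á-ji̍t ||| 0.5 0.3"
src, tgt, scores = [f.strip() for f in re.split(r'\|\|\|', line)]
print(src, tgt, scores.split())  # 今仔日 kin-á-ji̍t ['0.5', '0.3']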
Example #3
import numpy as np

@classmethod
def from_kaldi(cls, lexicon_path: str, with_prob: bool = False, sum_dup_pron_probs: bool = True):
    lines = read_file_to_lines(lexicon_path)

    def parse_line(line):
        cols = line.split()
        if with_prob:
            # Kaldi lexiconp format: word, pronunciation probability, phones.
            return LexiconEntry(cols[0], np.log(float(cols[1])), " ".join(cols[2:]))
        else:
            # Plain lexicon format: word then phones; log-prob 0.0 means prob 1.0.
            return LexiconEntry(cols[0], 0.0, " ".join(cols[1:]))

    return cls(map(parse_line, lines), sum_dup_pron_probs)
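The two line formats this expects follow Kaldi's lexicon.txt (word phone ...) and lexiconp.txt (word prob phone ...) conventions; note that np.log maps a pronunciation probability of 1.0 to 0.0. A made-up example:

import numpy as np

# lexicon.txt:   word phone phone ...
# lexiconp.txt:  word prob phone phone ...
line = "today 1.0 t o d a y"
cols = line.split()
print(cols[0], np.log(float(cols[1])), " ".join(cols[2:]))  # today 0.0 t o d a y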
Example #4
#        chinese = converter.convert(chinese)
#        new_sent = sent[:start] + chinese + sent[end:]
#        return convert_arabic_number_to_chinese(new_sent)
#    else:
#        return sent

import argparse
import re

import cn2an

# parse_lexicon, read_file_to_lines, dict_seg and converter are defined
# elsewhere in this script.

parser = argparse.ArgumentParser()
parser.add_argument('--lexicon-path')
parser.add_argument('--input-path')
parser.add_argument('--output-path')
parser.add_argument('--filtered-output-path')
args = parser.parse_args()

lexicon = parse_lexicon(args.lexicon_path)

lines = read_file_to_lines(args.input_path)
# Drop the first two whitespace-separated columns and keep the text.
texts = [" ".join(line.split()[2:]) for line in lines]

cnt = 0
cutted_sents = []
valid_line_nums = []
for idx, text in enumerate(texts):
    sents = re.split(r"\s+", text)
    words = []
    segmentable = True
    for sent in sents:
        # Keep only CJK ideographs, Latin letters and digits.
        sent = re.sub("[^\u4e00-\u9fa5A-Za-z0-9]", "", sent)
        # Rewrite Arabic numerals as Chinese numerals.
        sent = cn2an.transform(sent, "an2cn")
        sent = converter.convert(sent)
        maybe_words = dict_seg(sent, lexicon)
        if not maybe_words:
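The loop above depends on dict_seg returning a falsy value when the cleaned sentence cannot be covered by the lexicon. A minimal sketch of one way such a segmenter could work (greedy longest-match; the project's actual dict_seg may differ):

def greedy_dict_seg(sent, lexicon, max_len=4):
    """Split sent into lexicon words, longest match first; None on failure."""
    words, i = [], 0
    while i < len(sent):
        for n in range(min(max_len, len(sent) - i), 0, -1):
            if sent[i:i + n] in lexicon:
                words.append(sent[i:i + n])
                i += n
                break
        else:
            return None  # some span is not covered by the lexicon
    return words

print(greedy_dict_seg("今仔日好天", {"今仔日", "好天"}))  # ['今仔日', '好天']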
Example #5
    # Fragment: MosesConfig, MosesClient, math and tqdm come from the
    # enclosing module.
    moses_config = MosesConfig(True, True, args.n_best)
    moses_client = MosesClient(port=args.mosesserver_port, config=moses_config)
    # Word segmenter; constructed below only when the dict model is enabled.
    cutter = None
    if "dict" in model_types:
        from tsm.ckip_wrapper import CKIPWordSegWrapper
        cutter = CKIPWordSegWrapper(args.ckip_path, dict_lexicon,
                                    not args.recommend_dictionary)

    seq2seq_translator = None
    if 'seq2seq' in model_types or 'seq2seq' in args.unk_consult_order:
        seq2seq_translator = AllennlpClient()
    unk_translator = UnkTranslator(prob_lexicon, dict_lexicon, taibun_lexicon,
                                   args.unk_consult_order, seq2seq_translator)
    maybe_process_unk = maybe_process_unk_factory(unk_translator)

    lines = read_file_to_lines(args.src_path)
    outf = open(args.dest_path, 'w')

    oovs = []
    for line in tqdm.tqdm(lines):
        utt_id = None
        if args.has_utt_id:
            fields = line.split()
            utt_id = fields[0]
            line = " ".join(fields[1:])
        src_sent = Sentence.parse_mixed_text(line, remove_punct=True)
        all_entries = []
        if "dict" in model_types:
            maybe_sents = cutter.cut("".join(src_sent))
            n_best = math.ceil(
                min(args.n_best, math.exp(math.log(1000) / len(maybe_sents))))
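The n_best formula caps the total number of hypotheses across all candidate segmentations at roughly 1000: exp(log(1000)/k) is the k-th root of 1000, so each of k segmentations gets about 1000**(1/k) translations. A quick check:

import math

for k in (1, 2, 3):
    per_candidate = math.exp(math.log(1000) / k)
    print(k, math.ceil(per_candidate))  # 1 -> 1000, 2 -> 32, 3 -> 10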
Example #6
import argparse
import re

# read_file_to_lines is a helper from the surrounding project.

parser = argparse.ArgumentParser()
parser.add_argument('input_file')
parser.add_argument('map_file')
parser.add_argument('output_file')
parser.add_argument('--col', type=int, help="column at which the syllables start")
parser.add_argument('--delimiter', default=r"\s+")

args = parser.parse_args()


def line2word_syls(line):
    # Split a lexicon line: the first `col` columns form the word part,
    # the rest (with empty fields dropped) are its syllables.
    cols = re.split(args.delimiter, line)
    return cols[:args.col], list(filter(lambda col: col, cols[args.col:]))


syl_lines = read_file_to_lines(args.input_file)
syl_lexicon = list(map(line2word_syls, syl_lines))


def line2syl_phn(line):
    # Split a map line at the first space: syllable on the left,
    # its phone sequence on the right.
    idx = line.index(" ")
    return line[:idx], line[idx + 1:]


map_lines = read_file_to_lines(args.map_file)
mapping = dict(map(line2syl_phn, map_lines))


def map_syltone(syl):
    # A syllable such as "ba2" ends in a tone digit: strip it, then look up
    # the toneless syllable's phone sequence.
    tone = int(syl[-1])
    phns = mapping[syl[:-1]]
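To illustrate the expected map_file format (one "syllable phone phone ..." entry per line, split at the first space) and how map_syltone uses it; the syllables below are made up:

lines = ["ba b a", "tsiu ts i u"]
mapping = dict((ln[:ln.index(" ")], ln[ln.index(" ") + 1:]) for ln in lines)
syl = "ba2"
print(mapping[syl[:-1]], syl[-1])  # b a 2  (phones plus the tone digit)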