Exemplo n.º 1
0
def filter_lines(args):
    """Split aligned parallel lines into accepted and rejected sets.

    Reads the aligned source/target files for (args.src_lang, args.tgt_lang)
    under args.output_dir/args.model, applies every sentence-pair filter,
    and writes passing pairs via the 'train' writer and failing pairs via
    the 'rejected' writer.
    """
    fpath = os.path.join(args.output_dir, args.model)

    accepted = ParallelWriter(fpath, fname='train', unique=True)
    rejected = ParallelWriter(fpath, fname='rejected')

    # Aligned files live in a '<lang1>-<lang2>' directory keyed by the
    # lexicographically sorted language pair.
    dirname = '{}-{}'.format(*sorted([args.src_lang, args.tgt_lang]))
    src_path = os.path.join(fpath, dirname, 'aligned.{}'.format(args.src_lang))
    tgt_path = os.path.join(fpath, dirname, 'aligned.{}'.format(args.tgt_lang))

    # Engine loaded only for its tokenizer, so tokenization stays
    # consistent with the exported model.
    engine = from_pretrained(tag=args.model, use_cuda=False)
    filters = [
        EvalLang(args.src_lang, args.tgt_lang),
        LengthRatioFilter(engine.tokenizer,
                          args.src_lang,
                          args.tgt_lang,
                          min_length=2,
                          lower_bound=0.5,
                          upper_bound=2.0)
    ]

    # Fix: context managers guarantee both aligned files are closed
    # (the originals were opened and never closed).
    with open(src_path, 'r') as src_aligned, \
            open(tgt_path, 'r') as tgt_aligned:
        for src_line, tgt_line in zip(src_aligned, tgt_aligned):
            src_line = src_line.rstrip('\n')
            tgt_line = tgt_line.rstrip('\n')

            # A pair is accepted only if every filter passes. The list
            # comprehension still invokes each filter (no short-circuit),
            # matching the original evaluation order.
            allowed_in = all([_filter(src_line, tgt_line)
                              for _filter in filters])

            if allowed_in:
                accepted.write(args.src_lang, args.tgt_lang,
                               src_line, tgt_line)
            else:
                rejected.write(args.src_lang, args.tgt_lang,
                               src_line, tgt_line)
Exemplo n.º 2
0
def store_retrieved(model, pivot_lang, langs, force_redo=False, resume_from=0):
    """Retrieve and persist nearest neighbours for translated queries.

    For each translated pivot-language query of `model` (entries limited
    to `langs`), looks up the top retrieval result and upserts a
    Retrieval row, committing once per processed query. The first
    `resume_from` queries are skipped; existing Retrieval rows are left
    untouched unless `force_redo` is set.
    """
    op_model = from_pretrained(tag=model, use_cuda=True)
    queries = (db.session.query(Translation, Entry).join(Entry).filter(
        and_(Translation.model == model, Translation.lang == pivot_lang,
             Entry.lang.in_(langs))).all())

    for index, (query, _) in enumerate(tqdm(queries)):
        # Guard clauses: honour the resume offset, ignore untranslated
        # queries, and respect existing rows unless a redo was forced.
        if index < resume_from:
            continue
        if not query.translated:
            continue

        existing = (Retrieval.query.filter(
            and_(Retrieval.query_id == query.parent_id,
                 Retrieval.model == model)).first())
        if existing and not force_redo:
            continue

        neighbours = retrieve_neighbours(query.parent_id,
                                         pivot_lang,
                                         op_model.tokenizer,
                                         model=model)
        if not neighbours:
            continue

        # Only the best-scoring neighbour is stored.
        retrieved_id, score = neighbours[0]
        if existing:
            existing.retrieved_id = retrieved_id
            existing.score = score
            row = existing
        else:
            row = Retrieval(query_id=query.parent_id,
                            retrieved_id=retrieved_id,
                            score=score,
                            model=model)

        db.session.add(row)
        db.session.commit()
Exemplo n.º 3
0
 def op_model():
     """Load and return the pretrained 'mm-to-en-iter3' translator on GPU."""
     from ilmulti.translator import from_pretrained
     translator = from_pretrained(tag='mm-to-en-iter3', use_cuda=True)
     return translator
Exemplo n.º 4
0
    # CLI options for the translation run. `parser` is created above
    # this fragment.
    parser.add_argument('--model',
                        help='model used to translate',
                        required=True)
    parser.add_argument('--tgt-lang',
                        help='target lang to translate to',
                        required=True)
    parser.add_argument('--force-rebuild',
                        help='restore the translation items',
                        action='store_true')
    parser.add_argument('--start-over',
                        help='delete existing translations',
                        action='store_true')
    # Fix: help text was copy-pasted from --start-over ("delete existing
    # translations"), which is wrong for a resume flag.
    # NOTE(review): the name suggests an integer offset, but it is kept
    # as a boolean store_true flag for CLI backward compatibility.
    parser.add_argument('--resume-from',
                        help='resume from a previous partial run',
                        action='store_true')
    parser.add_argument('--use-cuda',
                        help='use available GPUs',
                        action='store_true')

    args = parser.parse_args()

    if args.start_over:
        # Deletion intentionally disabled for now.
        # delete_existing_translations(args.model, args.tgt_lang)
        pass

    engine = from_pretrained(tag=args.model, use_cuda=args.use_cuda)
    langs = ['hi', 'ta', 'te', 'ml', 'bn', 'gu', 'mr', 'pa', 'or', 'ur']

    # NOTE(review): args.max_tokens implies a '--max-tokens' option is
    # defined on the parser above this visible fragment — confirm.
    translate(engine, args.max_tokens, args.model, langs, args.tgt_lang,
              args.force_rebuild)
Exemplo n.º 5
0
            # NOTE(review): the enclosing function/loop starts before this
            # visible chunk; `entry`, `link`, `articles`, `date_match`,
            # `title_match`, `content_length` and `retrieved` are presumed
            # initialized there — confirm against the full file.
            if check_pair_title(entry, link, lang):
                title_match += 1 
                # Length-based pair check is currently disabled.
                # if check_pair_length(entry, link):
                #    content_length += 1
                # Retrieval check only runs for title-matched pairs.
                if check_retrieval(entry, link, model):
                    retrieved += 1

    # Summary counters for this model's sanity run.
    print('Total articles', articles)
    print('date_match', date_match)
    print('title_match', title_match)
    print('content_length', content_length)
    print('retrieved', retrieved, model, '\n')

if __name__ == '__main__':
    '''
        Sanity checks on a gold dataset determined
        by dates, title and content based matches.
    '''
    parser = ArgumentParser()
    parser.add_argument('lang', help='language for sanity checks')
    args = parser.parse_args()
    lang = args.lang

    # Engine is loaded once up front; keep the segmenter handle as in
    # the original run setup.
    engine = from_pretrained(tag='mm-to-en-iter1', use_cuda=False)
    segmenter = engine.segmenter

    # Model tags whose retrieval output gets sanity-checked.
    models = ['mm_all_iter0', 'mm-to-en-iter1']
    for model in models:
        sanity_check(lang, model)