def extract_translations(dump_file, target_lang, ref_file_list,
                         output_to_console):
    """Collect per-segment translations from a middleware dump and output them.

    For every row of the dump and every segment in that row's final
    translations, three tables are filled, keyed first by
    '<doc_name>:<line_id>' and then by '<username>:<interface>':

    * the final translation text,
    * the segment time taken from the row's log,
    * the row's validity flag (converted with str2bool()).

    The initial translation extracted from the row's log is stored in the
    same tables under the pseudo-user 'MT:mt' with time 0.0 and validity
    True. When the log holds no initial translation for a segment, a
    warning is emitted through stderr() instead.

    Args:
        dump_file: Passed to imt_utils.load_middleware_dump().
        target_lang: Passed to imt_utils.load_middleware_dump().
        ref_file_list: Passed to both load_references() and load_sources().
        output_to_console: If true, console_dump() is called in addition to
            output_system_files().

    Returns:
        None. Results are handed to output_system_files() and, optionally,
        console_dump().
    """
    stderr('Loading references...')
    doc_to_ref = load_references(ref_file_list)
    doc_to_src = load_sources(ref_file_list)

    stderr('Loading database dump...')
    doc_to_user_txt = defaultdict(dict)
    doc_to_user_time = defaultdict(dict)
    doc_to_user_valid = defaultdict(dict)
    username_set = set()
    # Every row shares the same pseudo-user key for the MT output.
    mt_id = 'MT:mt'

    for row in imt_utils.load_middleware_dump(dump_file, target_lang):
        username_set.add(row.username)
        segment_to_tgt_txt = imt_utils.final_translations_from_dict(
            json.loads(row.text))
        doc_name = url2doc(row.src_doc)
        log = json.loads(row.log)
        segment_to_time = segment_times_from_log(log)
        # The log is parsed by a different helper depending on the interface
        # recorded for the row.
        mt_from_log = (initial_translations_from_imt_log
                       if row.interface == 'imt'
                       else initial_translations_from_pe_log)
        segment_to_mt = mt_from_log(log)
        # Constant for the whole row, so build it once.
        user_id = row.username + ':' + row.interface

        for line_id in sorted(segment_to_tgt_txt):
            doc_id = '%s:%d' % (doc_name, line_id)
            doc_to_user_txt[doc_id][user_id] = segment_to_tgt_txt[line_id]
            doc_to_user_time[doc_id][user_id] = segment_to_time[line_id]
            doc_to_user_valid[doc_id][user_id] = str2bool(row.valid)
            if line_id in segment_to_mt:
                doc_to_user_txt[doc_id][mt_id] = segment_to_mt[line_id]
                doc_to_user_time[doc_id][mt_id] = 0.0
                doc_to_user_valid[doc_id][mt_id] = True
            else:
                stderr('WARNING: No MT for %s %s %d' %
                       (row.username, doc_id, line_id))

    # Output the results
    output_system_files(doc_to_ref, doc_to_src, doc_to_user_txt,
                        doc_to_user_valid, username_set)
    if output_to_console:
        console_dump(doc_to_ref, doc_to_src, doc_to_user_txt, doc_to_user_time)
def extract_translations(dump_file,
                         target_lang,
                         ref_file_list,
                         output_to_console):
    """Build per-segment text/time/validity tables from a dump and emit them.

    Each dump row yields one entry per segment of its final translations.
    Entries are stored under the segment key '<doc_name>:<line_id>' and the
    user key '<username>:<interface>'. The initial translation read from the
    row's log, when present, is stored under the fixed key 'MT:mt' with a
    time of 0.0 and a validity of True; when absent, a warning is written
    with stderr().

    Args:
        dump_file: Forwarded to imt_utils.load_middleware_dump().
        target_lang: Forwarded to imt_utils.load_middleware_dump().
        ref_file_list: Forwarded to load_references() and load_sources().
        output_to_console: When true, console_dump() is called after
            output_system_files().

    Returns:
        None.
    """
    stderr('Loading references...')
    doc_to_ref = load_references(ref_file_list)
    doc_to_src = load_sources(ref_file_list)

    stderr('Loading database dump...')
    doc_to_user_txt = defaultdict(dict)
    doc_to_user_time = defaultdict(dict)
    doc_to_user_valid = defaultdict(dict)
    dump_row_list = imt_utils.load_middleware_dump(dump_file, target_lang)
    username_set = set()
    for row in dump_row_list:
        username_set.add(row.username)
        text_dict = json.loads(row.text)
        segment_to_tgt_txt = imt_utils.final_translations_from_dict(text_dict)
        doc_name = url2doc(row.src_doc)
        log = json.loads(row.log)
        segment_to_time = segment_times_from_log(log)
        # Which helper reads the initial translations out of the log depends
        # on the interface recorded for this row.
        if row.interface == 'imt':
            segment_to_mt = initial_translations_from_imt_log(log)
        else:
            segment_to_mt = initial_translations_from_pe_log(log)

        for line_id in sorted(segment_to_tgt_txt.keys()):
            doc_id = '%s:%d' % (doc_name, line_id)
            user_id = row.username + ':' + row.interface
            doc_to_user_txt[doc_id][user_id] = segment_to_tgt_txt[line_id]
            doc_to_user_time[doc_id][user_id] = segment_to_time[line_id]
            doc_to_user_valid[doc_id][user_id] = str2bool(row.valid)

            if line_id not in segment_to_mt:
                # The user's entry is still recorded above; only the MT
                # entry is missing for this segment.
                stderr('WARNING: No MT for %s %s %d' % (row.username,
                                                        doc_id,
                                                        line_id))
                continue

            mt_id = 'MT:mt'
            doc_to_user_txt[doc_id][mt_id] = segment_to_mt[line_id]
            doc_to_user_time[doc_id][mt_id] = 0.0
            doc_to_user_valid[doc_id][mt_id] = True

    # Output the results
    output_system_files(doc_to_ref, doc_to_src, doc_to_user_txt,
                        doc_to_user_valid, username_set)
    if output_to_console:
        console_dump(doc_to_ref, doc_to_src, doc_to_user_txt, doc_to_user_time)
예제 #3
0
# Load and process the database dump
session_order = 0
condition_order = 0
last_user = None
last_condition = None
for i, row in enumerate(dump_row_list):
    if i > 0 and i % 10 == 0:
        sys.stdout.write('.')
        if i % 800 == 0:
            print
    tgt_text_dict = json.loads(row.text)
    segment_to_tgt_txt = imt_utils.final_translations_from_dict(tgt_text_dict)
    doc_name = imt_utils.url2doc(row.src_doc)
    log = json.loads(row.log)
    segment_to_time = imt_utils.segment_times_from_log(log)
    segment_to_mt = imt_utils.initial_translations_from_imt_log(
        log
    ) if row.interface == 'imt' else imt_utils.initial_translations_from_pe_log(
        log)
    segment_to_src_txt = imt_utils.source_segments_from_log(log)
    doc_name = imt_utils.url2doc(row.src_doc)
    doc_genre = imt_utils.genre_from_url(row.src_doc)

    for line_id in sorted(segment_to_tgt_txt.keys()):
        # TODO: hack for a user with bad logs
        edist = 0
        if line_id in segment_to_mt:
            mt_tgt_txt = segment_to_mt[line_id]
            user_tgt_txt = segment_to_tgt_txt[line_id]
            edist = edit_distance.dameraulevenshtein(mt_tgt_txt, user_tgt_txt,
# Load and process the database dump
# NOTE(review): dump_row_list, sys, json, imt_utils and edit_distance are not
# defined in this fragment; presumably they are set up earlier in the script.
# The four counters/trackers below are only initialised here; nothing in the
# visible part of the loop reads or updates them.
session_order = 0
condition_order = 0
last_user = None
last_condition = None
for i,row in enumerate(dump_row_list):
    # Progress indicator: write a dot on every 10th row (skipping row 0).
    if i > 0 and i % 10 == 0:
        sys.stdout.write('.')
        if i % 800 == 0:
            # Bare `print` is the Python 2 statement form and emits a newline,
            # breaking the row of dots every 800 rows.
            # NOTE(review): under Python 3 this line is a no-op expression.
            print
    # row.text and row.log are JSON strings; decode both before use.
    tgt_text_dict = json.loads(row.text)
    segment_to_tgt_txt = imt_utils.final_translations_from_dict(tgt_text_dict)
    doc_name = imt_utils.url2doc(row.src_doc)
    log = json.loads(row.log)
    segment_to_time = imt_utils.segment_times_from_log(log)
    # The helper used to read the initial translations from the log depends on
    # the interface recorded for the row ('imt' vs. anything else).
    segment_to_mt = imt_utils.initial_translations_from_imt_log(log) if row.interface == 'imt' else imt_utils.initial_translations_from_pe_log(log)
    segment_to_src_txt = imt_utils.source_segments_from_log(log)
    # NOTE(review): doc_name was already computed above with the identical
    # call; this second assignment looks redundant.
    doc_name = imt_utils.url2doc(row.src_doc)
    doc_genre = imt_utils.genre_from_url(row.src_doc)

    # Visit the segments of this row in sorted line_id order.
    for line_id in sorted(segment_to_tgt_txt.keys()):
        # TODO: hack for a user with bad logs
        # edist stays 0 when segment_to_mt has no entry for this segment.
        edist = 0
        if line_id in segment_to_mt:
            mt_tgt_txt = segment_to_mt[line_id]
            user_tgt_txt = segment_to_tgt_txt[line_id]
            # Presumably the Damerau-Levenshtein distance between the two
            # texts; the meaning of the third argument (True) is not visible
            # here -- TODO confirm against edit_distance.
            edist = edit_distance.dameraulevenshtein(mt_tgt_txt,
                                                     user_tgt_txt,
                                                     True)
        # Segment key in the form '<doc_name>:<line_id>'.
        segment_id = '%s:%d' % (doc_name, line_id)