def extract_translations(dump_file, target_lang, ref_file_list, output_to_console):
    """Gather per-segment translations from a middleware dump and write them out.

    References and sources are loaded from ``ref_file_list``; the dump rows
    are loaded from ``dump_file`` for ``target_lang``. For every row, the
    final text of each segment is recorded under the key
    ``"<username>:<interface>"`` together with its time and the row's
    validity flag. When the log holds an initial translation for the
    segment, it is recorded under the pseudo-user ``"MT:mt"`` with time 0.0
    and validity True; otherwise a warning is written through ``stderr``.

    Args:
        dump_file: Dump location handed to ``imt_utils.load_middleware_dump``.
        target_lang: Target language handed to the same loader.
        ref_file_list: Passed to both ``load_references`` and ``load_sources``.
        output_to_console: When truthy, ``console_dump`` is also called with
            the collected references, sources, texts and times.

    Returns:
        None. Results are emitted by ``output_system_files`` (and optionally
        ``console_dump``).
    """
    stderr('Loading references...')
    doc_to_ref = load_references(ref_file_list)
    doc_to_src = load_sources(ref_file_list)

    stderr('Loading database dump...')
    # Each mapping is keyed by "<doc_name>:<line_id>" and then by user key.
    doc_to_user_txt = defaultdict(dict)
    doc_to_user_time = defaultdict(dict)
    doc_to_user_valid = defaultdict(dict)
    dump_row_list = imt_utils.load_middleware_dump(dump_file, target_lang)
    username_set = set()

    # Key under which the initial machine translation is stored.
    mt_id = 'MT:mt'

    for row in dump_row_list:
        username_set.add(row.username)
        text_dict = json.loads(row.text)
        segment_to_tgt_txt = imt_utils.final_translations_from_dict(text_dict)
        doc_name = url2doc(row.src_doc)
        log = json.loads(row.log)
        segment_to_time = segment_times_from_log(log)
        # The initial translations are read by a different helper per
        # interface: 'imt' rows use the imt-log reader, all others the pe one.
        if row.interface == 'imt':
            segment_to_mt = initial_translations_from_imt_log(log)
        else:
            segment_to_mt = initial_translations_from_pe_log(log)

        # Constant for the whole row, so build it once instead of per segment.
        user_id = row.username + ':' + row.interface

        for line_id in sorted(segment_to_tgt_txt):
            doc_id = '%s:%d' % (doc_name, line_id)
            doc_to_user_txt[doc_id][user_id] = segment_to_tgt_txt[line_id]
            doc_to_user_time[doc_id][user_id] = segment_to_time[line_id]
            doc_to_user_valid[doc_id][user_id] = str2bool(row.valid)
            if line_id in segment_to_mt:
                # Later rows overwrite the MT entry for the same segment.
                doc_to_user_txt[doc_id][mt_id] = segment_to_mt[line_id]
                doc_to_user_time[doc_id][mt_id] = 0.0
                doc_to_user_valid[doc_id][mt_id] = True
            else:
                stderr('WARNING: No MT for %s %s %d' % (row.username,
                                                        doc_id,
                                                        line_id))

    # Output the results
    output_system_files(doc_to_ref, doc_to_src, doc_to_user_txt,
                        doc_to_user_valid, username_set)
    if output_to_console:
        console_dump(doc_to_ref, doc_to_src, doc_to_user_txt,
                     doc_to_user_time)
def extract_translations(dump_file, target_lang, ref_file_list, output_to_console):
    """Gather per-segment translations from a middleware dump and write them out.

    References and sources are loaded from ``ref_file_list``; the dump rows
    are loaded from ``dump_file`` for ``target_lang``. For every row, the
    final text of each segment is recorded under the key
    ``"<username>:<interface>"`` together with its time and the row's
    validity flag. When the log holds an initial translation for the
    segment, it is recorded under the pseudo-user ``"MT:mt"`` with time 0.0
    and validity True; otherwise a warning is written through ``stderr``.

    Args:
        dump_file: Dump location handed to ``imt_utils.load_middleware_dump``.
        target_lang: Target language handed to the same loader.
        ref_file_list: Passed to both ``load_references`` and ``load_sources``.
        output_to_console: When truthy, ``console_dump`` is also called with
            the collected references, sources, texts and times.

    Returns:
        None. Results are emitted by ``output_system_files`` (and optionally
        ``console_dump``).
    """
    stderr('Loading references...')
    doc_to_ref = load_references(ref_file_list)
    doc_to_src = load_sources(ref_file_list)

    stderr('Loading database dump...')
    # Each mapping is keyed by "<doc_name>:<line_id>" and then by user key.
    doc_to_user_txt = defaultdict(dict)
    doc_to_user_time = defaultdict(dict)
    doc_to_user_valid = defaultdict(dict)
    dump_row_list = imt_utils.load_middleware_dump(dump_file, target_lang)
    username_set = set()

    # Key under which the initial machine translation is stored.
    mt_id = 'MT:mt'

    for row in dump_row_list:
        username_set.add(row.username)
        text_dict = json.loads(row.text)
        segment_to_tgt_txt = imt_utils.final_translations_from_dict(text_dict)
        doc_name = url2doc(row.src_doc)
        log = json.loads(row.log)
        segment_to_time = segment_times_from_log(log)
        # The initial translations are read by a different helper per
        # interface: 'imt' rows use the imt-log reader, all others the pe one.
        if row.interface == 'imt':
            segment_to_mt = initial_translations_from_imt_log(log)
        else:
            segment_to_mt = initial_translations_from_pe_log(log)

        # Constant for the whole row, so build it once instead of per segment.
        user_id = row.username + ':' + row.interface

        for line_id in sorted(segment_to_tgt_txt):
            doc_id = '%s:%d' % (doc_name, line_id)
            doc_to_user_txt[doc_id][user_id] = segment_to_tgt_txt[line_id]
            doc_to_user_time[doc_id][user_id] = segment_to_time[line_id]
            doc_to_user_valid[doc_id][user_id] = str2bool(row.valid)
            if line_id in segment_to_mt:
                # Later rows overwrite the MT entry for the same segment.
                doc_to_user_txt[doc_id][mt_id] = segment_to_mt[line_id]
                doc_to_user_time[doc_id][mt_id] = 0.0
                doc_to_user_valid[doc_id][mt_id] = True
            else:
                stderr('WARNING: No MT for %s %s %d' % (row.username,
                                                        doc_id,
                                                        line_id))

    # Output the results
    output_system_files(doc_to_ref, doc_to_src, doc_to_user_txt,
                        doc_to_user_valid, username_set)
    if output_to_console:
        console_dump(doc_to_ref, doc_to_src, doc_to_user_txt,
                     doc_to_user_time)
# Load and process the database dump session_order = 0 condition_order = 0 last_user = None last_condition = None for i, row in enumerate(dump_row_list): if i > 0 and i % 10 == 0: sys.stdout.write('.') if i % 800 == 0: print tgt_text_dict = json.loads(row.text) segment_to_tgt_txt = imt_utils.final_translations_from_dict(tgt_text_dict) doc_name = imt_utils.url2doc(row.src_doc) log = json.loads(row.log) segment_to_time = imt_utils.segment_times_from_log(log) segment_to_mt = imt_utils.initial_translations_from_imt_log( log ) if row.interface == 'imt' else imt_utils.initial_translations_from_pe_log( log) segment_to_src_txt = imt_utils.source_segments_from_log(log) doc_name = imt_utils.url2doc(row.src_doc) doc_genre = imt_utils.genre_from_url(row.src_doc) for line_id in sorted(segment_to_tgt_txt.keys()): # TODO: hack for a user with bad logs edist = 0 if line_id in segment_to_mt: mt_tgt_txt = segment_to_mt[line_id] user_tgt_txt = segment_to_tgt_txt[line_id] edist = edit_distance.dameraulevenshtein(mt_tgt_txt, user_tgt_txt,
# Load and process the database dump session_order = 0 condition_order = 0 last_user = None last_condition = None for i,row in enumerate(dump_row_list): if i > 0 and i % 10 == 0: sys.stdout.write('.') if i % 800 == 0: print tgt_text_dict = json.loads(row.text) segment_to_tgt_txt = imt_utils.final_translations_from_dict(tgt_text_dict) doc_name = imt_utils.url2doc(row.src_doc) log = json.loads(row.log) segment_to_time = imt_utils.segment_times_from_log(log) segment_to_mt = imt_utils.initial_translations_from_imt_log(log) if row.interface == 'imt' else imt_utils.initial_translations_from_pe_log(log) segment_to_src_txt = imt_utils.source_segments_from_log(log) doc_name = imt_utils.url2doc(row.src_doc) doc_genre = imt_utils.genre_from_url(row.src_doc) for line_id in sorted(segment_to_tgt_txt.keys()): # TODO: hack for a user with bad logs edist = 0 if line_id in segment_to_mt: mt_tgt_txt = segment_to_mt[line_id] user_tgt_txt = segment_to_tgt_txt[line_id] edist = edit_distance.dameraulevenshtein(mt_tgt_txt, user_tgt_txt, True) segment_id = '%s:%d' % (doc_name, line_id)