def get_edit_distances(tgt_segments, ref_segments):
    """Score each target segment against its positionally paired reference.

    Args:
      tgt_segments: Iterable of target segments.
      ref_segments: Iterable of reference segments, paired with
        ``tgt_segments`` by position.

    Returns:
      A list with the result of ``dameraulevenshtein(ref, tgt, True)`` for
      every pair, in input order. Pairing is done with ``zip``, so surplus
      items in the longer input are ignored.

    Raises:
      Nothing directly; whatever ``dameraulevenshtein`` raises propagates.
    """
    return [
        dameraulevenshtein(reference, target, True)
        for target, reference in zip(tgt_segments, ref_segments)
    ]
def get_edit_distances(tgt_segments, ref_segments):
    """Return the edit distance of every (target, reference) segment pair.

    Args:
      tgt_segments: Iterable of target segments.
      ref_segments: Iterable of reference segments; the n-th reference is
        compared with the n-th target.

    Returns:
      A list of ``dameraulevenshtein(ref, tgt, True)`` results, one per pair
      and in the same order as the inputs. ``zip`` stops at the shorter
      input, so unmatched trailing segments are not scored.

    Raises:
      Nothing directly; errors from ``dameraulevenshtein`` are not caught.
    """
    # Each pair is (target, reference); the reference is passed first.
    paired = zip(tgt_segments, ref_segments)
    return [dameraulevenshtein(pair[1], pair[0], True) for pair in paired]
def run_test(ref_file, tgt_list, tgt_meta_list):
    """Accumulate per-segment edit distances for two UI conditions and test them.

    Every target file is compared segment by segment with the reference,
    using both ``levenshtein`` and ``dameraulevenshtein``. Distances are
    summed per segment index, separately for segments whose UI id is 1
    (the ``*_a`` counters) and 2 (the ``*_b`` counters). The totals for each
    metric are then passed to ``diff_test``.

    Args:
      ref_file: Reference file, read with ``load_segments``.
      tgt_list: Target files, each read with ``load_segments``.
      tgt_meta_list: Meta files paired positionally with ``tgt_list``; each
        is read with ``ids_from_meta_file`` and indexed by segment position
        to obtain that segment's UI id.

    Returns:
      None. A heading is printed before each ``diff_test`` call.

    Raises:
      RuntimeError: If a segment's UI id is neither 1 nor 2.
    """
    ref_segments = load_segments(ref_file)

    # Sufficient statistics: counts_* hold the summed distance per segment
    # index, nums_* hold how many observations went into each sum.
    counts_lev_a = Counter()
    nums_lev_a = Counter()
    counts_lev_b = Counter()
    nums_lev_b = Counter()
    counts_dlev_a = Counter()
    nums_dlev_a = Counter()
    counts_dlev_b = Counter()
    nums_dlev_b = Counter()

    for tgt_file, meta_file in zip(tgt_list, tgt_meta_list):
        tgt_segments = load_segments(tgt_file)
        ui_ids = ids_from_meta_file(meta_file)
        for i, tgt_txt in enumerate(tgt_segments):
            # NOTE(review): assumes the reference and the meta ids have at
            # least as many entries as the target file -- confirm.
            ref_txt = ref_segments[i]
            lev_dist = levenshtein(ref_txt, tgt_txt, True)
            dlev_dist = dameraulevenshtein(ref_txt, tgt_txt, True)
            ui_id = ui_ids[i]
            if ui_id == 1:
                counts_lev_a[i] += lev_dist
                counts_dlev_a[i] += dlev_dist
                nums_lev_a[i] += 1
                nums_dlev_a[i] += 1
            elif ui_id == 2:
                counts_lev_b[i] += lev_dist
                counts_dlev_b[i] += dlev_dist
                nums_lev_b[i] += 1
                nums_dlev_b[i] += 1
            else:
                # Same exception type as before, now saying what was wrong.
                raise RuntimeError(
                    'Unexpected UI id %r for segment %r of %r (meta file %r)'
                    % (ui_id, i, tgt_file, meta_file))

    # Single-argument print(...) gives the same output as the Python 2
    # print statement and is also valid on Python 3.
    print('Levenshtein distance')
    diff_test(counts_lev_a, nums_lev_a, counts_lev_b, nums_lev_b)
    print('')
    print('Damerau-Levenshtein distance')
    diff_test(counts_dlev_a, nums_dlev_a, counts_dlev_b, nums_dlev_b)
def run_test(ref_file, tgt_list, tgt_meta_list):
    """Collect edit-distance statistics per UI condition and run ``diff_test``.

    Each target file is scored against the reference one segment at a time
    with ``levenshtein`` and ``dameraulevenshtein``. For every segment index
    the distances and observation counts are accumulated under condition
    ``'a'`` (UI id 1) or ``'b'`` (UI id 2). ``diff_test`` is then called once
    per metric with the a/b sums and counts.

    Args:
      ref_file: Reference file, read with ``load_segments``.
      tgt_list: Target files, each read with ``load_segments``.
      tgt_meta_list: Meta files matched by position to ``tgt_list``; each is
        read with ``ids_from_meta_file`` to get one UI id per segment index.

    Returns:
      None. A heading is printed ahead of each ``diff_test`` call.

    Raises:
      RuntimeError: If a segment's UI id is neither 1 nor 2.
    """
    ref_segments = load_segments(ref_file)

    # Sufficient statistics, keyed by condition and then by segment index:
    # *_sums hold summed distances, *_nums the number of observations.
    lev_sums = {'a': Counter(), 'b': Counter()}
    lev_nums = {'a': Counter(), 'b': Counter()}
    dlev_sums = {'a': Counter(), 'b': Counter()}
    dlev_nums = {'a': Counter(), 'b': Counter()}

    for tgt_file, meta_file in zip(tgt_list, tgt_meta_list):
        tgt_segments = load_segments(tgt_file)
        ui_ids = ids_from_meta_file(meta_file)
        for i, tgt_txt in enumerate(tgt_segments):
            # NOTE(review): indexing assumes the reference and the meta ids
            # cover every target segment -- confirm against the data files.
            ref_txt = ref_segments[i]
            lev_dist = levenshtein(ref_txt, tgt_txt, True)
            dlev_dist = dameraulevenshtein(ref_txt, tgt_txt, True)
            ui_id = ui_ids[i]

            if ui_id == 1:
                cond = 'a'
            elif ui_id == 2:
                cond = 'b'
            else:
                # Still a RuntimeError, but with enough context to find the
                # offending entry.
                raise RuntimeError(
                    'Unexpected UI id %r for segment %r of %r (meta file %r)'
                    % (ui_id, i, tgt_file, meta_file))

            lev_sums[cond][i] += lev_dist
            dlev_sums[cond][i] += dlev_dist
            lev_nums[cond][i] += 1
            dlev_nums[cond][i] += 1

    # print('...') with one argument behaves the same under the Python 2
    # print statement and the Python 3 print function.
    print('Levenshtein distance')
    diff_test(lev_sums['a'], lev_nums['a'], lev_sums['b'], lev_nums['b'])
    print('')
    print('Damerau-Levenshtein distance')
    diff_test(dlev_sums['a'], dlev_nums['a'], dlev_sums['b'], dlev_nums['b'])
segment_to_time = imt_utils.segment_times_from_log(log) segment_to_mt = imt_utils.initial_translations_from_imt_log( log ) if row.interface == 'imt' else imt_utils.initial_translations_from_pe_log( log) segment_to_src_txt = imt_utils.source_segments_from_log(log) doc_name = imt_utils.url2doc(row.src_doc) doc_genre = imt_utils.genre_from_url(row.src_doc) for line_id in sorted(segment_to_tgt_txt.keys()): # TODO: hack for a user with bad logs edist = 0 if line_id in segment_to_mt: mt_tgt_txt = segment_to_mt[line_id] user_tgt_txt = segment_to_tgt_txt[line_id] edist = edit_distance.dameraulevenshtein(mt_tgt_txt, user_tgt_txt, True) segment_id = '%s:%d' % (doc_name, line_id) time = segment_to_time[line_id] total_translation_time[row.username][row.interface] += time total_translation_time[row.username][row.interface + '_nseg'] += 1 order = int(row.order) if not (last_user or last_condition): last_user = row.username last_condition = row.interface if row.username != last_user: session_order = 0 condition_order = 0 elif last_condition != row.interface: condition_order = 0 time_key = '%s:%d' % (row.username, order) user_order_to_time[time_key].append(time)
doc_name = imt_utils.url2doc(row.src_doc) log = json.loads(row.log) segment_to_time = imt_utils.segment_times_from_log(log) segment_to_mt = imt_utils.initial_translations_from_imt_log(log) if row.interface == 'imt' else imt_utils.initial_translations_from_pe_log(log) segment_to_src_txt = imt_utils.source_segments_from_log(log) doc_name = imt_utils.url2doc(row.src_doc) doc_genre = imt_utils.genre_from_url(row.src_doc) for line_id in sorted(segment_to_tgt_txt.keys()): # TODO: hack for a user with bad logs edist = 0 if line_id in segment_to_mt: mt_tgt_txt = segment_to_mt[line_id] user_tgt_txt = segment_to_tgt_txt[line_id] edist = edit_distance.dameraulevenshtein(mt_tgt_txt, user_tgt_txt, True) segment_id = '%s:%d' % (doc_name, line_id) time = segment_to_time[line_id] total_translation_time[row.username][row.interface] += time total_translation_time[row.username][row.interface+'_nseg'] += 1 order = int(row.order) if not (last_user or last_condition): last_user = row.username last_condition = row.interface if row.username != last_user: session_order = 0 condition_order = 0 elif last_condition != row.interface: condition_order = 0 time_key = '%s:%d' % (row.username,order)