def run_network_on_validation(epoch_num):
    val_pos_prec, val_class_prec, val_rough_pos_prec = 0.0, 0.0, 0.0
    val_pos_items, val_class_items = 0, 0
    # iterate
    num_dafs_to_save = 6
    dafs_to_save = []
    for idaf, daf in enumerate(val_data):
        class_prec, pos_prec, tagged_daf, rough_pos_prec = CalculateLossForDaf(daf, fValidation=True)
        # increment and continue
        if pos_prec is not None:
            val_pos_prec += pos_prec
            val_rough_pos_prec += rough_pos_prec
            val_pos_items += 1
        if class_prec is not None:
            val_class_prec += class_prec
            val_class_items += 1
        if epoch_num >= 0 and idaf % round(1.0 * len(val_data) / num_dafs_to_save) == 0:
            objStr = json.dumps(tagged_daf, indent=4, ensure_ascii=False)
            util.make_folder_if_need_be('{}/epoch_{}'.format(model_root, epoch_num))
            with open("{}/epoch_{}/{}_tagged.json".format(model_root, epoch_num, tagged_daf["file"]), "w") as f:
                f.write(objStr.encode('utf-8'))
    # divide
    val_pos_prec = val_pos_prec / val_pos_items * 100 if val_pos_items > 0 else 0.0
    val_rough_pos_prec = val_rough_pos_prec / val_pos_items * 100 if val_pos_items > 0 else 0.0
    val_class_prec = val_class_prec / val_class_items * 100 if val_class_items > 0 else 0.0
    # print the results
    log_message('Validation: pos_prec: ' + str(val_pos_prec) + ', class_prec: ' + str(val_class_prec) +
                ', rough pos prec: ' + str(val_rough_pos_prec))
    return val_pos_prec, val_class_prec, val_rough_pos_prec
def run_network_on_validation(epoch_num):
    val_lang_prec = 0.0
    val_lang_items = 0
    # iterate
    num_words_to_save = 1000
    words_to_save = []
    for idaf, word in enumerate(val_data):
        lang_prec, tagged_word = CalculateLossForWord(word, fValidation=True)
        # increment and continue
        val_lang_prec += lang_prec
        val_lang_items += 1
        if epoch_num >= 0 and idaf % round(1.0 * len(val_data) / num_words_to_save) == 0:
            words_to_save.append(tagged_word)
    # divide
    val_lang_prec = val_lang_prec / val_lang_items * 100 if val_lang_items > 0 else 0.0
    # print the results
    log_message('Validation: lang_prec: ' + str(val_lang_prec))
    objStr = json.dumps(words_to_save, indent=4, ensure_ascii=False)
    util.make_folder_if_need_be('{}/epoch_{}'.format(model_root, epoch_num))
    with open("{}/epoch_{}/tagged.json".format(model_root, epoch_num), "w") as f:
        f.write(objStr.encode('utf-8'))
    return val_lang_prec
def dilate_lang():
    lang_tagged_path = 'data/3_lang_tagged'
    lang_tagged_dilated_path = 'data/4_lang_tagged_dilated'
    mesechtot_names = ['Berakhot', 'Shabbat', 'Eruvin', 'Pesachim', 'Bava Kamma', 'Bava Metzia', 'Bava Batra']
    for mesechta in mesechtot_names:
        util.make_folder_if_need_be('{}/json/{}'.format(lang_tagged_path, mesechta))
        mesechta_path = '{}/json/{}'.format(lang_tagged_path, mesechta)

        def sortdaf(fname):
            # sort dafs numerically, with amud b sorting after amud a
            daf = fname.split('/')[-1].split('.json')[0]
            daf_int = int(daf[:-1])
            amud_int = 1 if daf[-1] == 'b' else 0
            return daf_int * 2 + amud_int

        files = [f for f in listdir(mesechta_path) if isfile(join(mesechta_path, f))]
        files.sort(key=sortdaf)
        html_out = OrderedDict()
        for i_f, f_name in enumerate(files):
            lang_out = []
            lang_in = json.load(codecs.open('{}/{}'.format(mesechta_path, f_name), "rb", encoding="utf-8"))
            for i_w, w in enumerate(lang_in):
                # gather the confidence vectors of the neighboring words
                if 0 < i_w < len(lang_in) - 1:
                    neigh = [lang_in[i_w - 1]['confidence'], lang_in[i_w + 1]['confidence']]
                elif i_w < len(lang_in) - 1:
                    neigh = [lang_in[i_w + 1]['confidence']]
                else:
                    neigh = [lang_in[i_w - 1]['confidence']]
                neigh_conf = [sum([c[0] for c in neigh]) / 2, sum([c[1] for c in neigh]) / 2]
                # the word's own confidence counts slightly more than its neighbors'
                weight = 1.1
                new_conf = [neigh_conf[0] + weight * w['confidence'][0], neigh_conf[1] + weight * w['confidence'][1]]
                new_lang = 'aramaic' if new_conf[0] > new_conf[1] else 'mishnaic'
                lang_out.append({'word': w['word'], 'lang': new_lang, 'confidence': new_conf})
            util.make_folder_if_need_be("{}/json/{}".format(lang_tagged_dilated_path, mesechta))
            fp = codecs.open("{}/json/{}/{}".format(lang_tagged_dilated_path, mesechta, f_name), "wb", encoding='utf-8')
            json.dump(lang_out, fp, indent=4, encoding='utf-8', ensure_ascii=False)
            fp.close()
            daf = f_name.split('/')[-1].split('.json')[0]
            html_out[daf] = lang_out
            if i_f % 10 == 0:
                print '{}/{}'.format(mesechta, f_name)
                html = print_tagged_corpus_to_html_table(html_out)
                util.make_folder_if_need_be("{}/html/{}".format(lang_tagged_dilated_path, mesechta))
                fp = codecs.open("{}/html/{}/{}.html".format(lang_tagged_dilated_path, mesechta, daf), "wb", encoding='utf-8')
                fp.write(html)
                fp.close()
                html_out = OrderedDict()
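
# Illustrative sketch (not part of the original pipeline): the dilation rule used in
# dilate_lang(), factored out for a single word so the arithmetic is easy to follow.
# Each confidence is assumed to be a pair [aramaic_score, mishnaic_score]; the helper
# name and signature are hypothetical.
def _dilate_one_word(prev_conf, curr_conf, next_conf, weight=1.1):
    # average the available neighbors' confidences, then add the word's own
    # confidence scaled by a slightly larger weight
    neigh = [c for c in (prev_conf, next_conf) if c is not None]
    neigh_conf = [sum(c[0] for c in neigh) / 2.0, sum(c[1] for c in neigh) / 2.0]
    new_conf = [neigh_conf[0] + weight * curr_conf[0], neigh_conf[1] + weight * curr_conf[1]]
    return ('aramaic' if new_conf[0] > new_conf[1] else 'mishnaic'), new_conf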
def print_tagged_corpus_to_html_table(text_name, ref_list, num_daf_per_doc):
    cal_dh_root = "data/2_matched_sefaria"
    iref = 0
    while iref < len(ref_list):
        html_str = u"<html><head><style>h1{text-align:center;background:grey}td{text-align:center}table{margin-top:20px;margin-bottom:20px;margin-right:auto;margin-left:auto;width:1200px}.missed{color:white;background:red}.b{color:green}.m{color:blue}.sef{color:black}.cal{color:grey}.good-cal{color:red}.good-jba{background:#eee;color:red}.POS{color:orange}</style><meta charset='utf-8'></head><body>"
        start_daf = ""
        end_daf = ""
        for idaf in xrange(num_daf_per_doc):
            if iref >= len(ref_list):
                break
            ref = ref_list[iref]
            daf = ref.__str__().replace("{} ".format(text_name), "").encode('utf8')
            html_str += u"<h1>DAF {}</h1>".format(daf)
            html_str += u"<table>"
            if idaf == 0:
                start_daf = daf
            if idaf == num_daf_per_doc - 1:
                end_daf = daf
            try:
                util.make_folder_if_need_be('{}/json/{}'.format(cal_dh_root, text_name))
                test_set = json.load(codecs.open("{}/json/{}/{}.json".format(cal_dh_root, text_name, daf), "r", encoding="utf-8"))
            except IOError:
                continue  # this daf apparently didn't exist in cal dataset but does in sefaria
            word_list = test_set["words"]
            missed_word_list = test_set["missed_words"]
            missed_dic = {wo["index"]: wo["word"] for wo in missed_word_list}
            sef_count = 0
            cal_count = 0
            while sef_count < len(word_list):
                row_obj = word_list[sef_count:sef_count + 10]
                row_sef = u"<tr class='sef'><td>{}</td>".format(u"</td><td>".join([wo["word"] for wo in reversed(row_obj)]))
                row_sef += u"<td>({}-{})</td></tr>".format(sef_count, sef_count + len(row_obj) - 1)
                row_cal = u"<tr class='cal'>"
                start_cal_count = cal_count
                for wo in reversed(row_obj):
                    while cal_count in missed_dic:
                        cal_count += 1
                    if "cal_word" in wo:
                        cal_count += 1
                        row_cal += u"<td class='good-cal'>{} <span class='POS'>({})</span></td>".format(wo["cal_word"], wo["POS"])
                    elif "jba_word" in wo:
                        row_cal += u"<td class='good-jba'>{} <span class='POS'>({})</span><br>{}</td>".format(wo["jba_word"], wo["POS"], wo["head_word"])
                    else:
                        row_cal += u"<td class='{}'>{}</td>".format(wo["class"][0], wo["class"][0:3].upper())
                row_cal += u"<td>({}-{})</td>".format(start_cal_count, cal_count - 1)
                row_cal += u"</tr>"
                html_str += row_sef
                html_str += row_cal
                sef_count += 10
            html_str += u"</table>"
            html_str += u"<table>"
            count = 0
            while count < len(missed_word_list):
                row_obj = missed_word_list[count:count + 10]
                word_str = [u"{}:{}".format(wo["word"], wo["index"]) for wo in reversed(row_obj)]
                row_missed = u"<tr class='missed'><td>{}</td></tr>".format(u"</td><td>".join(word_str))
                html_str += row_missed
                count += 10
            html_str += u"</table>"
            iref += 1
        html_str += u"</body></html>"
        util.make_folder_if_need_be('{}/html/{}'.format(cal_dh_root, text_name))
        fp = codecs.open("{}/html/{}/{}-{}.html".format(cal_dh_root, text_name, start_daf, end_daf), 'w', encoding='utf-8')
        fp.write(html_str)
        fp.close()
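
# Illustrative sketch (hypothetical helper, not in the original source): the row-building
# pattern print_tagged_corpus_to_html_table() relies on -- split a word list into rows of
# ten and reverse each row so the Hebrew reads right-to-left across the HTML table.
def _chunk_rtl(words, row_len=10):
    return [list(reversed(words[i:i + row_len])) for i in xrange(0, len(words), row_len)]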
            if items_seen % breakpoint == 0 or idaf == len(train_data) - 1:
                last_loss = total_loss / breakpoint
                last_pos_prec = total_pos_prec / total_pos_items * 100
                last_rough_pos_prec = total_rough_pos_prec / total_pos_items * 100
                last_class_prec = total_class_prec / total_class_items * 100
                log_message("Segments processed: " + str(items_seen) + ", loss: " + str(last_loss) +
                            ', pos_prec: ' + str(last_pos_prec) + ', class_prec: ' + str(last_class_prec) +
                            ', rough pos prec: ' + str(last_rough_pos_prec))
                # reset the running totals for the next reporting window
                total_loss, total_pos_prec, total_class_prec, total_rough_pos_prec = 0.0, 0.0, 0.0, 0.0
                total_pos_items = 0
                total_class_items = 0
        log_message('Finished epoch ' + str(epoch))
        val_pos_prec, val_class_prec, val_rough_pos_prec = run_network_on_validation(epoch)
        util.make_folder_if_need_be('{}/epoch_{}'.format(model_root, epoch))
        filename_to_save = '{}/epoch_{}/postagger_model_embdim{}_hiddim{}_lyr{}_e{}_trainloss{}_trainprec{}_valprec{}.model'.format(
            model_root, epoch, EMBED_DIM, HIDDEN_DIM, sLAYERS, epoch, last_loss, last_pos_prec, val_pos_prec)
        model.save(filename_to_save)
        f = open("{}/epoch_{}/conf_matrix_e{}.html".format(model_root, epoch, epoch), 'w')
        f.write(pos_conf_matrix.to_html())
        f.close()
        pos_conf_matrix.clear()
else:
    # tag all of shas!
    mesechtot_names = ['Berakhot', 'Shabbat', 'Eruvin', 'Pesachim', 'Bava Kamma', 'Bava Metzia', 'Bava Batra']
    for mesechta in mesechtot_names:
        mesechta_path = 'data/5_pos_tagged/json/{}'.format(mesechta)
        util.make_folder_if_need_be(mesechta_path)
            breakpoint = 5000
            if items_seen % breakpoint == 0:
                last_loss = total_loss / breakpoint
                last_lang_prec = total_lang_prec / total_lang_items * 100
                log_message("Words processed: " + str(items_seen) + ", loss: " + str(last_loss) +
                            ', lang_prec: ' + str(last_lang_prec))
                # reset the running totals for the next reporting window
                total_loss, total_lang_prec = 0.0, 0.0
                total_lang_items = 0
        log_message('Finished epoch ' + str(epoch))
        val_lang_prec = run_network_on_validation(epoch)
        util.make_folder_if_need_be('{}/epoch_{}'.format(model_root, epoch))
        filename_to_save = '{}/epoch_{}/postagger_model_embdim{}_hiddim{}_lyr{}_e{}_trainloss{}_trainprec.model'.format(
            model_root, epoch, EMBED_DIM, HIDDEN_DIM, sLAYERS, epoch, last_loss)
        model.save(filename_to_save)
        f = open("{}/epoch_{}/conf_matrix_e{}.html".format(model_root, epoch, epoch), 'w')
        f.write(lang_conf_matrix.to_html())
        f.close()
        lang_conf_matrix.clear()
else:
    # tag all of shas!
    lang_tagged_path = 'data/3_lang_tagged'
    mesechtot_names = ['Berakhot', 'Shabbat', 'Eruvin', 'Pesachim', 'Bava Kamma', 'Bava Metzia', 'Bava Batra']
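
# Illustrative sketch (hypothetical helper, not in the original source): the running-average
# reporting pattern used in both training loops above. Dividing by the item count raises
# ZeroDivisionError if a reporting window happens to contain no scored items, so a guard
# like the one run_network_on_validation() applies is worth factoring out.
def _avg_pct(total, count):
    return total / count * 100 if count > 0 else 0.0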
def match_cal_segments(mesechta):
    def merge_cal_word_objs(s, e, word_obj_list):
        obj_list = word_obj_list[s:e]
        m_word = u" ".join([o["word"] for o in obj_list])
        m_head_word = u" ".join([o["head_word"] for o in obj_list])
        m_pos_list = [o["POS"] for o in obj_list]
        m_pos = max(set(m_pos_list), key=m_pos_list.count)
        new_obj = obj_list[0].copy()
        new_obj["word"] = m_word
        new_obj["head_word"] = m_head_word
        new_obj["POS"] = m_pos
        # returns a single-element array which will replace a range s:e in the original array
        return [new_obj]

    cal_lines = json.load(open("data/1_cal_input/cal_lines_{}.json".format(mesechta), "r"), encoding="utf8")
    # cal_pos_hashtable = json.load(open("cal_pos_hashtable.json","r"),encoding='utf8')
    dafs = cal_lines["dafs"]
    lines_by_daf = cal_lines["lines"]

    super_base_ref = Ref(mesechta)
    subrefs = super_base_ref.all_subrefs()
    ical = 0
    num_sef_words = 0
    num_cal_words = 0
    num_words_matched = 0
    for curr_sef_ref in subrefs:
        if curr_sef_ref.is_empty():
            continue
        if ical >= len(dafs):
            break

        daf = dafs[ical]
        print "-----{} DAF {} ({}/{})-----".format(mesechta, daf, ical, len(dafs))

        base_tc = TextChunk(curr_sef_ref, "he", "William Davidson Edition - Aramaic")
        bas_word_list = []  # re.split(r"\s+"," ".join(base_text.text))
        for segment in base_tc.text:
            bas_word_list += util.tokenize_words(segment)
        temp_out = [{"word": w, "class": "unknown"} for w in bas_word_list]

        lines = [[word_obj["word"] for word_obj in temp_line] for temp_line in lines_by_daf[ical]]
        word_obj_list = [word_obj for temp_line in lines_by_daf[ical] for word_obj in temp_line]
        lines_by_str = [u' '.join(line_array) for line_array in lines]

        curr_cal_ref = Ref("{} {}".format(mesechta, daf))
        out = []
        word_for_word_se = []
        cal_words = []
        missed_words = []
        global_offset = 0
        if curr_sef_ref == curr_cal_ref:
            matched = dibur_hamatchil_matcher.match_text(bas_word_list, lines_by_str, verbose=True,
                                                         word_threshold=0.27, char_threshold=0.6,
                                                         with_abbrev_matches=True, with_num_abbrevs=False)
            start_end_map = matched["matches"]
            abbrev_matches = matched["abbrevs"]
            abbrev_ranges = [[am.rashiRange for am in am_list] for am_list in abbrev_matches]
            print u' --- '.join([unicode(am) for am_list in abbrev_matches for am in am_list])
            abbrev_count = 0
            for ar in abbrev_ranges:
                abbrev_count += len(ar)
            # if abbrev_count > 0:
            #     print "GRESATLJL THNA DZEOR", abbrev_ranges
            for iline, se in enumerate(start_end_map):
                curr_cal_line = lines[iline]
                # if there is an expanded abbrev, concat those words into one element
                if len(abbrev_ranges[iline]) > 0:
                    offset = 0  # account for the fact that you're losing elements in the array as you merge them
                    abbrev_ranges[iline].sort(key=lambda x: x[0])
                    for ar in abbrev_ranges[iline]:
                        if ar[1] - ar[0] <= 0:
                            continue  # TODO there's an issue with the abbrev func, but i'm too lazy to fix now. sometimes they're zero length
                        # redefine ar by how many actual words are in the range, not just how many elements
                        start_ar = ar[0]
                        i_abbrev = start_ar
                        num_words = 0
                        while i_abbrev < len(curr_cal_line):
                            temp_w = curr_cal_line[i_abbrev]
                            num_words += len(re.split(ur'\s+', temp_w))
                            if num_words >= (ar[1] - ar[0] + 1):
                                break
                            i_abbrev += 1
                        end_ar = i_abbrev
                        ar = (start_ar, end_ar)
                        if len(curr_cal_line[ar[0] - offset:ar[1] + 1 - offset]) != len(
                                word_obj_list[ar[0] - offset + len(cal_words):ar[1] + 1 - offset + len(cal_words)]):
                            # something's wrong, not sure what, but best to ignore this
                            continue
                        print u"ABBREV RANGE {} --- OFFSET {}".format(ar, offset)
                        print u"CURR CAL LINE BEFORE {}".format(u','.join(curr_cal_line[ar[0] - offset:ar[1] + 1 - offset]))
                        curr_cal_line[ar[0] - offset:ar[1] + 1 - offset] = [
                            u' '.join(curr_cal_line[ar[0] - offset:ar[1] + 1 - offset])
                        ]
                        print u"CURR CAL LINE AFTER {}".format(curr_cal_line[ar[0] - offset])
                        print u"WORD OBJ LIST BEFORE {}".format(u','.join([
                            u'({})'.format(obj['word']) for obj in merge_cal_word_objs(
                                ar[0] - offset + len(cal_words), ar[1] + 1 - offset + len(cal_words), word_obj_list)
                        ]))
                        word_obj_list[ar[0] - offset + len(cal_words):ar[1] + 1 - offset + len(cal_words)] = \
                            merge_cal_word_objs(ar[0] - offset + len(cal_words),
                                                ar[1] + 1 - offset + len(cal_words), word_obj_list)
                        print u"WORD OBJ LIST AFTER {}".format(word_obj_list[ar[0] - offset + len(cal_words)]['word'])
                        offset += ar[1] - ar[0]
                    global_offset += offset
                cal_words += curr_cal_line
                if se[0] == -1:
                    word_for_word_se += [(-1, -1) for i in range(len(curr_cal_line))]
                    continue
                # matched_cal_objs_indexes = language_tools.match_segments_without_order(lines[iline],bas_word_list[se[0]:se[1]+1],2.0)
                curr_bas_line = bas_word_list[se[0]:se[1] + 1]
                # print u'base line', u' '.join(curr_bas_line)
                matched_obj_words_base = dibur_hamatchil_matcher.match_text(curr_bas_line, curr_cal_line,
                                                                            char_threshold=0.35, verbose=False,
                                                                            with_num_abbrevs=False)
                matched_words_base = matched_obj_words_base["matches"]
                word_for_word_se += [(tse[0] + se[0], tse[1] + se[0]) if tse[0] != -1 else tse
                                     for tse in matched_words_base]
            matched_word_for_word_obj = dibur_hamatchil_matcher.match_text(bas_word_list, cal_words,
                                                                           char_threshold=0.35,
                                                                           prev_matched_results=word_for_word_se,
                                                                           boundaryFlexibility=2,
                                                                           with_num_abbrevs=False)
            matched_word_for_word = matched_word_for_word_obj["matches"]
            cal_len = len(matched_word_for_word)
            bad_word_offset = 0
            for ical_word, temp_se in enumerate(matched_word_for_word):
                if temp_se[0] == -1:
                    missed_words.append({"word": word_obj_list[ical_word]["word"], "index": ical_word})
                    continue
                # dictionary juggling...
                for i in xrange(temp_se[0], temp_se[1] + 1):
                    # in case cal_words and word_obj_list aren't the same length bc a word got split up
                    """
                    if cal_words[ical_word] != word_obj_list[ical_word-bad_word_offset]["word"]:
                        if ical_word+1 < len(cal_words) and cal_words[ical_word+1] != word_obj_list[ical_word-bad_word_offset+1]["word"]:
                            bad_word_offset += 1
                        continue
                    """
                    cal_word_obj = word_obj_list[ical_word].copy()
                    cal_word_obj["cal_word"] = cal_word_obj["word"]
                    temp_sef_word = temp_out[i]["word"]
                    temp_out[i] = cal_word_obj
                    temp_out[i]["class"] = "talmud"
                    temp_out[i]["word"] = temp_sef_word

            print u"\n-----\nFOUND {}/{} ({}%)".format(cal_len - len(missed_words), cal_len,
                                                       (1 - round(1.0 * len(missed_words) / cal_len, 4)) * 100)
            # print u"MISSED: {}".format(u" ,".join([u"{}:{}".format(wo["word"], wo["index"]) for wo in missed_words]))
            ical += 1
            num_cal_words += cal_len
            num_words_matched += (cal_len - len(missed_words))
        """
        # tag 1 pos words if still untagged
        for iwo, word_obj in enumerate(temp_out):
            word = word_obj["word"]
            if word in cal_pos_hashtable:
                if len(cal_pos_hashtable[word]) == 1:
                    temp_out[iwo] = {"word": word, "cal_word": word, "class": "talmud", "POS": cal_pos_hashtable[word][0]}
        """
        num_sef_words += len(temp_out)
        out += temp_out

        sef_daf = curr_sef_ref.__str__().replace("{} ".format(mesechta), "").encode('utf8')
        doc = {"words": out, "missed_words": missed_words}
        util.make_folder_if_need_be("data/2_matched_sefaria/json/{}".format(mesechta))
        fp = codecs.open("data/2_matched_sefaria/json/{}/{}.json".format(mesechta, sef_daf), "w", encoding='utf-8')
        json.dump(doc, fp, indent=4, encoding='utf-8', ensure_ascii=False)
        fp.close()
    return num_sef_words, num_cal_words, num_words_matched
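
# Hypothetical driver (not in the original source): run the CAL/Sefaria matcher over the
# same tractate list used elsewhere in this code and report aggregate coverage.
# match_cal_segments() returns (sefaria words, cal words, matched words).
if __name__ == '__main__':
    total_sef, total_cal, total_matched = 0, 0, 0
    for mes in ['Berakhot', 'Shabbat', 'Eruvin', 'Pesachim', 'Bava Kamma', 'Bava Metzia', 'Bava Batra']:
        n_sef, n_cal, n_matched = match_cal_segments(mes)
        total_sef += n_sef
        total_cal += n_cal
        total_matched += n_matched
    print "TOTAL: {} sefaria words, {} cal words, {} matched ({}%)".format(
        total_sef, total_cal, total_matched, round(100.0 * total_matched / max(total_cal, 1), 2))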