Example #1
    def run_network_on_validation(epoch_num):
        val_pos_prec, val_class_prec, val_rough_pos_prec = 0.0, 0.0, 0.0
        val_pos_items, val_class_items = 0, 0
        # iterate over the validation set
        num_dafs_to_save = 6
        dafs_to_save = []

        for idaf, daf in enumerate(val_data):
            class_prec, pos_prec, tagged_daf, rough_pos_prec = CalculateLossForDaf(daf, fValidation=True)
            # accumulate precision totals
            if pos_prec is not None:
                val_pos_prec += pos_prec
                val_rough_pos_prec += rough_pos_prec
                val_pos_items += 1
            if class_prec is not None:
                val_class_prec += class_prec
                val_class_items += 1

            if epoch_num >= 0 and idaf % round(1.0 * len(val_data) / num_dafs_to_save) == 0:
                objStr = json.dumps(tagged_daf, indent=4, ensure_ascii=False)
                util.make_folder_if_need_be('{}/epoch_{}'.format(model_root,epoch_num))
                with open("{}/epoch_{}/{}_tagged.json".format(model_root,epoch_num, tagged_daf["file"]), "w") as f:
                    f.write(objStr.encode('utf-8'))

        # average the precisions and convert to percentages
        val_pos_prec = val_pos_prec / val_pos_items * 100 if val_pos_items > 0 else 0.0
        val_rough_pos_prec = val_rough_pos_prec / val_pos_items * 100 if val_pos_items > 0 else 0.0
        val_class_prec = val_class_prec / val_class_items * 100 if val_class_items > 0 else 0.0
        # log the results
        log_message('Validation: pos_prec: ' + str(val_pos_prec) + ', class_prec: ' + str(val_class_prec) + ', rough pos prec: ' + str(val_rough_pos_prec))

        return val_pos_prec, val_class_prec, val_rough_pos_prec
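
Example #1 (and every example below) calls util.make_folder_if_need_be(...) before writing output, but the helper itself is never shown. A minimal sketch of what such a helper would typically do (an assumption for illustration, not the project's actual code):

import os

def make_folder_if_need_be(path):
    # Hypothetical sketch: create the directory (and any missing parents) only
    # if it does not exist yet, so repeated calls inside loops are harmless.
    if not os.path.isdir(path):
        os.makedirs(path)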
Example #2
def run_network_on_validation(epoch_num):
    val_lang_prec = 0.0
    val_lang_items = 0
    # iterate over the validation set
    num_words_to_save = 1000
    words_to_save = []

    for idaf, word in enumerate(val_data):
        lang_prec, tagged_word = CalculateLossForWord(word, fValidation=True)
        # accumulate precision totals
        val_lang_prec += lang_prec
        val_lang_items += 1
        if epoch_num >= 0 and idaf % round(
                1.0 * len(val_data) / num_words_to_save) == 0:
            words_to_save.append(tagged_word)

    # average the precision and convert to a percentage
    val_lang_prec = val_lang_prec / val_lang_items * 100 if val_lang_items > 0 else 0.0
    # log the results
    log_message('Validation: lang_prec: ' + str(val_lang_prec))

    objStr = json.dumps(words_to_save, indent=4, ensure_ascii=False)
    util.make_folder_if_need_be('{}/epoch_{}'.format(model_root, epoch_num))
    with open("{}/epoch_{}/tagged.json".format(model_root, epoch_num),
              "w") as f:
        f.write(objStr.encode('utf-8'))
    return val_lang_prec
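
Examples #1 and #2 write their JSON by encoding the dumped unicode string by hand before f.write. The later examples do the same job with codecs.open; a minimal equivalent of Example #2's output step, reusing its model_root and epoch_num names and the same util helper (a sketch, not code from the project):

import codecs
import json

def save_tagged_words(words_to_save, model_root, epoch_num):
    # Hypothetical variant of Example #2's output step: codecs.open handles the
    # utf-8 encoding instead of encoding the dumped string manually.
    util.make_folder_if_need_be('{}/epoch_{}'.format(model_root, epoch_num))
    out_path = "{}/epoch_{}/tagged.json".format(model_root, epoch_num)
    with codecs.open(out_path, "w", encoding='utf-8') as f:
        json.dump(words_to_save, f, indent=4, ensure_ascii=False)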
Example #3
def dilate_lang():
    lang_tagged_path = 'data/3_lang_tagged'
    lang_tagged_dilated_path = 'data/4_lang_tagged_dilated'
    mesechtot_names = ['Berakhot','Shabbat','Eruvin','Pesachim','Bava Kamma','Bava Metzia','Bava Batra']
    for mesechta in mesechtot_names:
        util.make_folder_if_need_be('{}/json/{}'.format(lang_tagged_path, mesechta))
        mesechta_path = '{}/json/{}'.format(lang_tagged_path, mesechta)

        def sortdaf(fname):
            daf = fname.split('/')[-1].split('.json')[0]
            daf_int = int(daf[:-1])
            amud_int = 1 if daf[-1] == 'b' else 0
            return daf_int*2 + amud_int

        files = [f for f in listdir(mesechta_path) if isfile(join(mesechta_path, f))]
        files.sort(key=sortdaf)
        html_out = OrderedDict()
        for i_f,f_name in enumerate(files):
            lang_out = []
            lang_in = json.load(codecs.open('{}/{}'.format(mesechta_path,f_name), "rb", encoding="utf-8"))
            for i_w,w in enumerate(lang_in):
                if 0 < i_w < len(lang_in) - 1:  # word has both a previous and a next neighbour
                    neigh = [lang_in[i_w-1]['confidence'], lang_in[i_w+1]['confidence']]
                elif i_w < len(lang_in) - 1:    # first word: only a next neighbour
                    neigh = [lang_in[i_w+1]['confidence']]
                else:                           # last word: only a previous neighbour
                    neigh = [lang_in[i_w-1]['confidence']]
                # average the neighbouring confidences (a lone edge neighbour is still divided by 2)
                neigh_conf = [sum([c[0] for c in neigh])/2, sum([c[1] for c in neigh])/2]

                weight = 1.1
                new_conf = [sum([neigh_conf[0],weight*w['confidence'][0]]),sum([neigh_conf[1],weight*w['confidence'][1]])]
                new_lang = 'aramaic' if new_conf[0] > new_conf[1] else 'mishnaic'
                lang_out.append({'word':w['word'],'lang':new_lang,'confidence':new_conf})

            util.make_folder_if_need_be("{}/json/{}".format(lang_tagged_dilated_path,mesechta))
            fp = codecs.open("{}/json/{}/{}".format(lang_tagged_dilated_path,mesechta,f_name), "wb", encoding='utf-8')
            json.dump(lang_out, fp, indent=4, encoding='utf-8', ensure_ascii=False)
            fp.close()

            daf = f_name.split('/')[-1].split('.json')[0]
            html_out[daf] = lang_out
            if i_f % 10 == 0:
                print '{}/{}'.format(mesechta,f_name)
                html = print_tagged_corpus_to_html_table(html_out)
                util.make_folder_if_need_be("{}/html/{}".format(lang_tagged_dilated_path, mesechta))
                fp = codecs.open("{}/html/{}/{}.html".format(lang_tagged_dilated_path, mesechta, daf), "wb",
                                 encoding='utf-8')
                fp.write(html)
                fp.close()
                html_out = OrderedDict()
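
To make the smoothing arithmetic in Example #3 concrete, here is a small worked example with illustrative numbers (confidence order [aramaic, mishnaic], as implied by the new_lang test above):

# Middle-of-line word with confidence [0.2, 0.8]; neighbours [0.9, 0.1] and [0.7, 0.3].
neigh_conf = [(0.9 + 0.7) / 2, (0.1 + 0.3) / 2]      # [0.8, 0.2]
new_conf = [neigh_conf[0] + 1.1 * 0.2,               # 0.8 + 0.22 = 1.02
            neigh_conf[1] + 1.1 * 0.8]               # 0.2 + 0.88 = 1.08
new_lang = 'aramaic' if new_conf[0] > new_conf[1] else 'mishnaic'   # -> 'mishnaic'
# Here the 1.1 weight on the word's own confidence lets it keep its label even
# though both neighbours lean aramaic.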
Example #4
def print_tagged_corpus_to_html_table(text_name, ref_list, num_daf_per_doc):
    cal_dh_root = "data/2_matched_sefaria"

    iref = 0
    while iref < len(ref_list):
        html_str = u"<html><head><style>h1{text-align:center;background:grey}td{text-align:center}table{margin-top:20px;margin-bottom:20px;margin-right:auto;margin-left:auto;width:1200px}.missed{color:white;background:red}.b{color:green}.m{color:blue}.sef{color:black}.cal{color:grey}.good-cal{color:red}.good-jba{background:#eee;color:red}.POS{color:orange}</style><meta charset='utf-8'></head><body>"

        start_daf = ""
        end_daf = ""
        for idaf in xrange(num_daf_per_doc):
            if iref >= len(ref_list): break
            ref = ref_list[iref]
            daf = ref.__str__().replace("{} ".format(text_name),
                                        "").encode('utf8')
            html_str += u"<h1>DAF {}</h1>".format(daf)
            html_str += u"<table>"
            if idaf == 0: start_daf = daf
            if idaf == num_daf_per_doc - 1: end_daf = daf

            try:
                util.make_folder_if_need_be('{}/json/{}'.format(
                    cal_dh_root, text_name))
                test_set = json.load(
                    codecs.open("{}/json/{}/{}.json".format(
                        cal_dh_root, text_name, daf),
                                "r",
                                encoding="utf-8"))
            except IOError:
                continue  # this daf apparently didn't exist in cal dataset but does in sefaria
            word_list = test_set["words"]
            missed_word_list = test_set["missed_words"]
            missed_dic = {wo["index"]: wo["word"] for wo in missed_word_list}

            sef_count = 0
            cal_count = 0
            while sef_count < len(word_list):
                row_obj = word_list[sef_count:sef_count + 10]
                row_sef = u"<tr class='sef'><td>{}</td>".format(
                    u"</td><td>".join([wo["word"]
                                       for wo in reversed(row_obj)]))
                row_sef += u"<td>({}-{})</td></tr>".format(
                    sef_count, sef_count + len(row_obj) - 1)

                row_cal = u"<tr class='cal'>"
                start_cal_count = cal_count
                for wo in reversed(row_obj):
                    while cal_count in missed_dic:
                        cal_count += 1
                    if "cal_word" in wo:
                        cal_count += 1
                        row_cal += u"<td class='good-cal'>{} <span class='POS'>({})</span></td>".format(
                            wo["cal_word"], wo["POS"])
                    elif "jba_word" in wo:
                        row_cal += u"<td class='good-jba'>{} <span class='POS'>({})</span><br>{}</td>".format(
                            wo["jba_word"], wo["POS"], wo["head_word"])
                    else:
                        row_cal += u"<td class='{}'>{}</td>".format(
                            wo["class"][0], wo["class"][0:3].upper())
                row_cal += u"<td>({}-{})</td>".format(start_cal_count,
                                                      cal_count - 1)
                row_cal += u"</tr>"

                html_str += row_sef
                html_str += row_cal
                sef_count += 10
            html_str += u"</table>"

            html_str += u"<table>"
            count = 0
            while count < len(missed_word_list):
                row_obj = missed_word_list[count:count + 10]
                word_str = [
                    u"{}:{}".format(wo["word"], wo["index"])
                    for wo in reversed(row_obj)
                ]
                row_missed = u"<tr class='missed'><td>{}</td></tr>".format(
                    u"</td><td>".join(word_str))
                html_str += row_missed
                count += 10
            html_str += u"</table>"
            iref += 1
        html_str += u"</body></html>"
        util.make_folder_if_need_be('{}/html/{}'.format(
            cal_dh_root, text_name))
        fp = codecs.open("{}/html/{}/{}-{}.html".format(
            cal_dh_root, text_name, start_daf, end_daf),
                         'w',
                         encoding='utf-8')
        fp.write(html_str)
        fp.close()
Example #5
                if items_seen % breakpoint == 0 or idaf == len(train_data) - 1:
                    last_loss = total_loss / breakpoint
                    last_pos_prec = total_pos_prec / total_pos_items * 100
                    last_rough_pos_prec = total_rough_pos_prec / total_pos_items * 100
                    last_class_prec = total_class_prec / total_class_items * 100

                    log_message("Segments processed: " + str(items_seen) + ", loss: " + str(last_loss) + ', pos_prec: ' + str(
                        last_pos_prec) + ', class_prec: ' + str(last_class_prec) + ', rough pos prec: ' + str(last_rough_pos_prec))

                    total_loss, total_pos_prec, total_class_prec, total_rough_pos_prec = 0.0, 0.0, 0.0, 0.0
                    total_pos_items = 0
                    total_class_items = 0

            log_message('Finished epoch ' + str(epoch))
            val_pos_prec, val_class_prec, val_rough_pos_prec = run_network_on_validation(epoch)  # matches the function's return order
            util.make_folder_if_need_be('{}/epoch_{}'.format(model_root,epoch))

            filename_to_save = '{}/epoch_{}/postagger_model_embdim{}_hiddim{}_lyr{}_e{}_trainloss{}_trainprec{}_valprec{}.model'.format(model_root,epoch,EMBED_DIM,HIDDEN_DIM,sLAYERS,epoch,last_loss,last_pos_prec,val_pos_prec)
            model.save(filename_to_save)

            f = open("{}/epoch_{}/conf_matrix_e{}.html".format(model_root,epoch, epoch), 'w')
            f.write(pos_conf_matrix.to_html())
            f.close()
            pos_conf_matrix.clear()
    else:
        #tag all of shas!
        mesechtot_names = ['Berakhot','Shabbat','Eruvin','Pesachim','Bava Kamma','Bava Metzia','Bava Batra']
        for mesechta in mesechtot_names:
            mesechta_path = 'data/5_pos_tagged/json/{}'.format(mesechta)
            util.make_folder_if_need_be(mesechta_path)
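
Example #5 (and Example #6 below) calls .to_html() and .clear() on a confusion-matrix object whose class is not shown. A minimal sketch of such an object, inferred only from the two methods used here (an assumption, not the project's implementation):

from collections import defaultdict

class ConfMatrix(object):
    # Hypothetical stand-in for pos_conf_matrix / lang_conf_matrix: counts
    # (gold, predicted) pairs and renders them as an HTML table.
    def __init__(self):
        self.counts = defaultdict(int)

    def add(self, gold, predicted):
        self.counts[(gold, predicted)] += 1

    def clear(self):
        self.counts = defaultdict(int)

    def to_html(self):
        labels = sorted(set([g for g, p in self.counts] + [p for g, p in self.counts]))
        rows = [u"<tr><td></td>" + u"".join(u"<td>{}</td>".format(p) for p in labels) + u"</tr>"]
        for g in labels:
            cells = u"".join(u"<td>{}</td>".format(self.counts[(g, p)]) for p in labels)
            rows.append(u"<tr><td>{}</td>".format(g) + cells + u"</tr>")
        return u"<table>" + u"".join(rows) + u"</table>"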
Example #6
            breakpoint = 5000
            if items_seen % breakpoint == 0:
                last_loss = total_loss / breakpoint
                last_lang_prec = total_lang_prec / total_lang_items * 100

                log_message("Words processed: " + str(items_seen) +
                            ", loss: " + str(last_loss) + ', lang_prec: ' +
                            str(last_lang_prec))

                total_loss, total_lang_prec = 0.0, 0.0
                total_lang_items = 0

        log_message('Finished epoch ' + str(epoch))
        val_lang_prec = run_network_on_validation(epoch)

        util.make_folder_if_need_be('{}/epoch_{}'.format(model_root, epoch))
        filename_to_save = '{}/epoch_{}/postagger_model_embdim{}_hiddim{}_lyr{}_e{}_trainloss{}_trainprec.model'.format(
            model_root, epoch, EMBED_DIM, HIDDEN_DIM, sLAYERS, epoch,
            last_loss)
        model.save(filename_to_save)

        f = open(
            "{}/epoch_{}/conf_matrix_e{}.html".format(model_root, epoch,
                                                      epoch), 'w')
        f.write(lang_conf_matrix.to_html())
        f.close()
        lang_conf_matrix.clear()
else:
    #tag all of shas!
    lang_tagged_path = 'data/3_lang_tagged'
    mesechtot_names = [
Example #7
def match_cal_segments(mesechta):
    def merge_cal_word_objs(s, e, word_obj_list):
        obj_list = word_obj_list[s:e]
        m_word = u" ".join([o["word"] for o in obj_list])
        m_head_word = u" ".join([o["head_word"] for o in obj_list])
        m_pos_list = [o["POS"] for o in obj_list]
        m_pos = max(set(m_pos_list), key=m_pos_list.count)
        new_obj = obj_list[0].copy()
        new_obj["word"] = m_word
        new_obj["head_word"] = m_head_word
        new_obj["POS"] = m_pos
        # returns a single-element list which will replace the range s:e in the original array
        return [new_obj]

    cal_lines = json.load(open(
        "data/1_cal_input/cal_lines_{}.json".format(mesechta), "r"),
                          encoding="utf8")
    #cal_pos_hashtable = json.load(open("cal_pos_hashtable.json","r"),encoding='utf8')
    dafs = cal_lines["dafs"]
    lines_by_daf = cal_lines["lines"]

    super_base_ref = Ref(mesechta)
    subrefs = super_base_ref.all_subrefs()
    ical = 0

    num_sef_words = 0
    num_cal_words = 0
    num_words_matched = 0

    for curr_sef_ref in subrefs:
        if curr_sef_ref.is_empty(): continue
        if ical >= len(dafs): break

        daf = dafs[ical]
        print "-----{} DAF {}  ({}/{})-----".format(mesechta, daf, ical,
                                                    len(dafs))

        base_tc = TextChunk(curr_sef_ref, "he",
                            "William Davidson Edition - Aramaic")
        bas_word_list = []  # re.split(r"\s+"," ".join(base_text.text))
        for segment in base_tc.text:
            bas_word_list += util.tokenize_words(segment)

        temp_out = [{"word": w, "class": "unknown"} for w in bas_word_list]

        lines = [[word_obj["word"] for word_obj in temp_line]
                 for temp_line in lines_by_daf[ical]]
        word_obj_list = [
            word_obj for temp_line in lines_by_daf[ical]
            for word_obj in temp_line
        ]
        lines_by_str = [u' '.join(line_array) for line_array in lines]

        curr_cal_ref = Ref("{} {}".format(mesechta, daf))

        out = []
        word_for_word_se = []
        cal_words = []
        missed_words = []

        global_offset = 0
        if curr_sef_ref == curr_cal_ref:
            matched = dibur_hamatchil_matcher.match_text(
                bas_word_list,
                lines_by_str,
                verbose=True,
                word_threshold=0.27,
                char_threshold=0.6,
                with_abbrev_matches=True,
                with_num_abbrevs=False)
            start_end_map = matched["matches"]
            abbrev_matches = matched["abbrevs"]
            abbrev_ranges = [[am.rashiRange for am in am_list]
                             for am_list in abbrev_matches]
            print u' --- '.join(
                [unicode(am) for am_list in abbrev_matches for am in am_list])
            abbrev_count = 0
            for ar in abbrev_ranges:
                abbrev_count += len(ar)
            #if abbrev_count > 0:
            #    print "GRESATLJL THNA DZEOR", abbrev_ranges
            for iline, se in enumerate(start_end_map):

                curr_cal_line = lines[iline]
                # if there is an expanded abbrev, concat those words into one element
                if len(abbrev_ranges[iline]) > 0:
                    offset = 0  # account for the fact that you're losing elements in the array as you merge them
                    abbrev_ranges[iline].sort(key=lambda x: x[0])
                    for ar in abbrev_ranges[iline]:
                        if ar[1] - ar[0] <= 0:
                            continue  #TODO there's an issue with the abbrev func, but i'm too lazy to fix now. sometimes they're zero length

                        #redefine ar by how many actual words are in the range, not just how many elements
                        start_ar = ar[0]
                        i_abbrev = start_ar
                        num_words = 0
                        while i_abbrev < len(curr_cal_line):
                            temp_w = curr_cal_line[i_abbrev]
                            num_words += len(re.split(ur'\s+', temp_w))
                            if num_words >= (ar[1] - ar[0] + 1):
                                break
                            i_abbrev += 1
                        end_ar = i_abbrev

                        ar = (start_ar, end_ar)
                        if len(curr_cal_line[ar[0] - offset:ar[1] + 1 - offset]
                               ) != len(
                                   word_obj_list[ar[0] - offset +
                                                 len(cal_words):ar[1] + 1 -
                                                 offset + len(cal_words)]):
                            #something's wrong. not sure what, but best to ignore this
                            continue
                        print u"ABBREV RANGE {} --- OFFSET {}".format(
                            ar, offset)
                        print u"CURR CAL LINE BEFORE {}".format(u','.join(
                            curr_cal_line[ar[0] - offset:ar[1] + 1 - offset]))
                        curr_cal_line[ar[0] - offset:ar[1] + 1 - offset] = [
                            u' '.join(curr_cal_line[ar[0] - offset:ar[1] + 1 -
                                                    offset])
                        ]
                        print u"CURR CAL LINE AFTER {}".format(
                            curr_cal_line[ar[0] - offset])
                        print u"WORD OBJ LIST BEFORE {}".format(u','.join([
                            u'({})'.format(obj['word'])
                            for obj in merge_cal_word_objs(
                                ar[0] - offset + len(cal_words), ar[1] + 1 -
                                offset + len(cal_words), word_obj_list)
                        ]))
                        word_obj_list[ar[0] - offset + len(cal_words):ar[1] +
                                      1 - offset +
                                      len(cal_words)] = merge_cal_word_objs(
                                          ar[0] - offset + len(cal_words),
                                          ar[1] + 1 - offset + len(cal_words),
                                          word_obj_list)
                        print u"WORD OBJ LIST AFTER {}".format(
                            word_obj_list[ar[0] - offset +
                                          len(cal_words)]['word'])
                        offset += ar[1] - ar[0]
                        global_offset += offset

                cal_words += curr_cal_line
                if se[0] == -1:
                    word_for_word_se += [(-1, -1)
                                         for i in range(len(curr_cal_line))]
                    continue
                # matched_cal_objs_indexes = language_tools.match_segments_without_order(lines[iline],bas_word_list[se[0]:se[1]+1],2.0)
                curr_bas_line = bas_word_list[se[0]:se[1] + 1]
                #print u'base line',u' '.join(curr_bas_line)
                matched_obj_words_base = dibur_hamatchil_matcher.match_text(
                    curr_bas_line,
                    curr_cal_line,
                    char_threshold=0.35,
                    verbose=False,
                    with_num_abbrevs=False)
                matched_words_base = matched_obj_words_base["matches"]
                word_for_word_se += [(tse[0] + se[0],
                                      tse[1] + se[0]) if tse[0] != -1 else tse
                                     for tse in matched_words_base]

            matched_word_for_word_obj = dibur_hamatchil_matcher.match_text(
                bas_word_list,
                cal_words,
                char_threshold=0.35,
                prev_matched_results=word_for_word_se,
                boundaryFlexibility=2,
                with_num_abbrevs=False)
            matched_word_for_word = matched_word_for_word_obj["matches"]
            cal_len = len(matched_word_for_word)
            bad_word_offset = 0
            for ical_word, temp_se in enumerate(matched_word_for_word):
                if temp_se[0] == -1:
                    missed_words.append({
                        "word":
                        word_obj_list[ical_word]["word"],
                        "index":
                        ical_word
                    })
                    continue

                #dictionary juggling...
                for i in xrange(temp_se[0], temp_se[1] + 1):
                    #in case a cal_words and word_obj_list aren't the same length bc a word got split up
                    """
                    if cal_words[ical_word] != word_obj_list[ical_word-bad_word_offset]["word"]:
                        if ical_word+1 < len(cal_words) and cal_words[ical_word+1] != word_obj_list[ical_word-bad_word_offset+1]["word"]:
                            bad_word_offset += 1
                        continue
                    """
                    cal_word_obj = word_obj_list[ical_word].copy()
                    cal_word_obj["cal_word"] = cal_word_obj["word"]
                    temp_sef_word = temp_out[i]["word"]
                    temp_out[i] = cal_word_obj
                    temp_out[i]["class"] = "talmud"
                    temp_out[i]["word"] = temp_sef_word

            print u"\n-----\nFOUND {}/{} ({}%)".format(
                cal_len - len(missed_words), cal_len,
                (1 - round(1.0 * len(missed_words) / cal_len, 4)) * 100)
            #print u"MISSED: {}".format(u" ,".join([u"{}:{}".format(wo["word"], wo["index"]) for wo in missed_words]))
            ical += 1
            num_cal_words += cal_len
            num_words_matched += (cal_len - len(missed_words))
        """
        #tag 1 pos words if still untagged
        for iwo,word_obj in enumerate(temp_out):
            word = word_obj["word"]
            if word in cal_pos_hashtable:
                if len(cal_pos_hashtable[word]) == 1:
                    temp_out[iwo] = {"word":word,"cal_word":word,"class":"talmud","POS":cal_pos_hashtable[word][0]}
        """

        num_sef_words += len(temp_out)

        out += temp_out

        sef_daf = curr_sef_ref.__str__().replace("{} ".format(mesechta),
                                                 "").encode('utf8')
        doc = {"words": out, "missed_words": missed_words}
        util.make_folder_if_need_be(
            "data/2_matched_sefaria/json/{}".format(mesechta))
        fp = codecs.open("data/2_matched_sefaria/json/{}/{}.json".format(
            mesechta, sef_daf),
                         "w",
                         encoding='utf-8')
        json.dump(doc, fp, indent=4, encoding='utf-8', ensure_ascii=False)
        fp.close()

    return num_sef_words, num_cal_words, num_words_matched
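
For clarity, a small illustration of what the inner merge_cal_word_objs helper in Example #7 does, using hypothetical (transliterated) CAL word objects:

word_obj_list = [
    {"word": u"bera", "head_word": u"bar", "POS": "noun"},
    {"word": u"d",    "head_word": u"d",   "POS": "conj"},
    {"word": u"rav",  "head_word": u"rav", "POS": "noun"},
]
# merge_cal_word_objs(0, 3, word_obj_list) returns a one-element list: a copy of
# the first object with the words and head_words space-joined and the most
# common POS in the range kept:
# [{"word": u"bera d rav", "head_word": u"bar d rav", "POS": "noun"}]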