def get_rabbi_char_loc(context, seg_text):
    matches = match_text(seg_text.split(), [context.replace('~', '')], with_num_abbrevs=False,
                         place_all=True, place_consecutively=True, verbose=False)
    if matches["matches"][0][0] == -1:
        return None, None
    matched = matches["match_text"][0][0]
    count = seg_text.count(matched)
    if count == 0:
        return None, None
    if count > 1:
        # print(f"Context\n{context}\nappears {count} times!")
        return None, None
    rabbi = context.split('~')[1]
    rabbi_len = len(rabbi)
    context_start = seg_text.find(matched)
    if matched != context.replace('~', ''):
        # can't assume rabbi_start_rel is the same as it was in `context`
        rcount = matched.count(rabbi)
        if rcount == 0:
            print("NO_RABBI")
            return None, None
        if rcount > 1:
            print("TON_O_RABANAN")
            return None, None
        rabbi_start_rel = matched.find(rabbi)
    else:
        rabbi_start_rel = context.find('~')
    start = context_start + rabbi_start_rel
    end = start + rabbi_len
    return start, end
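# Illustrative usage (added sketch; the Hebrew sample strings are hypothetical). The `context`
# string wraps the rabbi's name in '~' delimiters, and the function returns the character span of
# that name inside `seg_text`, or (None, None) when the context can't be placed uniquely.
def _example_get_rabbi_char_loc():
    seg_text = u"אמר רבי יוחנן משום רבי שמעון בן יוחי"
    context = u"אמר ~רבי יוחנן~ משום"
    start, end = get_rabbi_char_loc(context, seg_text)
    if start is not None:
        print(seg_text[start:end])  # expected: רבי יוחנן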
def get_rabbi_char_loc_list(context_list, seg_text, norm_regex=None, repl=None, **match_text_kwargs):
    orig_seg_text = seg_text
    if norm_regex is not None:
        seg_text = re.sub(norm_regex, repl, seg_text)
    matches = match_text(seg_text.split(), [context.replace('~', '') for context in context_list],
                         with_num_abbrevs=False, place_all=True, place_consecutively=True,
                         verbose=False, max_overlap_percent=1.1, **match_text_kwargs)
    rabbi_span_list = []
    for match_span, matched_text, context in zip(matches["matches"], matches["match_text"], context_list):
        rabbi_span_list += [get_rabbi_char_loc(match_span, matched_text, context, seg_text, orig_seg_text, norm_regex, repl)]
    return rabbi_span_list
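# Usage sketch for the list variant (illustrative; the sample strings and the punctuation regex are
# assumptions, not taken from the original callers). When `norm_regex`/`repl` are given, matching is
# done against a normalized copy of `seg_text`, and the spans returned by the seven-argument
# get_rabbi_char_loc below appear to be mapped back onto the original, unnormalized text.
def _example_get_rabbi_char_loc_list():
    seg_text = u"אמר רבי יוחנן: משום רבי שמעון בן יוחי"
    contexts = [u"אמר ~רבי יוחנן~ משום"]
    for start, end in get_rabbi_char_loc_list(contexts, seg_text, norm_regex=r"[:,]", repl=u""):
        if start is not None:
            print(seg_text[start:end])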
def create_section(oref, dicta_text, dicta_vtitle):
    with_nikkud, without_nikkud = dicta_text.split(), strip_nikkud(dicta_text).split()
    sefaria_text = prepare_sefaria_text(oref)
    dh_match = match_text(without_nikkud, sefaria_text)
    matches = dh_match['matches']
    segments = oref.all_segment_refs()
    assert len(segments) == len(matches)
    for segment, match in zip(segments, matches):
        tc = segment.text('he', dicta_vtitle)
        new_segment_text = u' '.join(with_nikkud[match[0]:match[1] + 1])
        if not new_segment_text:
            new_segment_text = segment.text('he', davidson_vtitle).text
        tc.text = new_segment_text
        tc.save()
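# Hypothetical invocation (the ref and version title are placeholders): create_section realigns a
# nikkud text from Dicta against the existing segmentation of `oref` and saves each re-segmented
# chunk under `dicta_vtitle`, falling back to the `davidson_vtitle` text for unmatched segments.
# create_section(Ref("Berakhot 2a"), dicta_text, "Some Dicta Version Title")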
def get_rabbi_char_loc(match_span, matched_text, context, seg_text, orig_seg_text, norm_regex, repl):
    from sefaria.helper.normalization import FunctionNormalizer
    from research.knowledge_graph.named_entity_recognition.ner_tagger import NormalizerTools
    if match_span[0] == -1:
        return None, None
    matched = matched_text[0]
    pre_rabbi, rabbi, _ = context.split('~')
    context_start = list(re.finditer(r'\S+', seg_text))[match_span[0]].start()
    if matched != context.replace('~', ''):
        # can't assume rabbi_start_rel is the same as it was in `context`
        word_b4_rabbi = pre_rabbi.split()[-1] + ' ' if len(pre_rabbi.strip()) > 0 else ''
        rabbi_matches = match_text(matched.split(), [word_b4_rabbi + rabbi], with_num_abbrevs=False,
                                   place_all=True, place_consecutively=True, verbose=False)
        rabbi_matched = rabbi_matches["match_text"][0][0]
        if rabbi_matched == '':
            return None, None
        rcount = matched.count(rabbi_matched)
        if rcount > 1:
            print("TON_O_RABANAN")
            return None, None
        rabbi_start_rel = matched.find(rabbi_matched)
        if len(word_b4_rabbi) > 0 and rabbi_matched.startswith(word_b4_rabbi):
            # first word is not part of the rabbi abbreviation (unlike א"ר, where the first word
            # should be considered part of the rabbi)
            # wait until now to remove word_b4_rabbi to reduce the rabbi's ambiguity in `matched`
            rabbi_matched = rabbi_matched[len(word_b4_rabbi):]
            rabbi_start_rel += len(word_b4_rabbi)
        rabbi_len = len(rabbi_matched)
    else:
        rabbi_start_rel = context.find('~')
        rabbi_len = len(rabbi)
    start = context_start + rabbi_start_rel
    end = start + rabbi_len
    if norm_regex is not None:
        def find_text_to_remove(s, **kwargs):
            return [(m, repl) for m in re.finditer(norm_regex, s)]
        normalizer = FunctionNormalizer(find_text_to_remove)
        norm_map = normalizer.get_mapping_after_normalization(orig_seg_text)
        mention_indices = normalizer.convert_normalized_indices_to_unnormalized_indices([(start, end)], norm_map)
        start, end = NormalizerTools.include_trailing_nikkud(mention_indices[0], orig_seg_text)
    return start, end
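# Sketch of the normalized-to-unnormalized index round trip used above (for orientation only; the
# regex and sample string are made up). A span found in a normalized copy of the text is mapped
# back onto the original, nikkud-bearing string via FunctionNormalizer.
def _example_norm_round_trip():
    import re
    from sefaria.helper.normalization import FunctionNormalizer
    orig = u"אָמַר רַבִּי יוֹחָנָן"
    norm_regex, repl = r"[\u0591-\u05c7]", u""  # strip nikkud / cantillation

    def find_text_to_remove(s, **kwargs):
        return [(m, repl) for m in re.finditer(norm_regex, s)]

    normalizer = FunctionNormalizer(find_text_to_remove)
    norm_map = normalizer.get_mapping_after_normalization(orig)
    norm_span = (4, 13)  # span of 'רבי יוחנן' in the normalized string 'אמר רבי יוחנן'
    print(normalizer.convert_normalized_indices_to_unnormalized_indices([norm_span], norm_map)[0])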
def test_empty_comment(self):
    daftext = u'אע״ג שאמרו ככה בלה בלה בלה'.split()
    rashi = [u'', u'אף על גב שאמרו']
    matched = dhm.match_text(daftext, rashi, verbose=True)
def match_cal_segments(mesechta):
    def tokenize_words(str):
        str = str.replace(u"־", " ")
        str = re.sub(r"</?.+>", "", str)  # get rid of html tags
        str = re.sub(r"\([^\(\)]+\)", "", str)  # get rid of refs
        #str = str.replace("'", '"')
        word_list = filter(bool, re.split(r"[\s\:\-\,\.\;\(\)\[\]\{\}]", str))
        return word_list

    def merge_cal_word_objs(s, e, word_obj_list):
        obj_list = word_obj_list[s:e]
        m_word = u" ".join([o["word"] for o in obj_list])
        m_head_word = u" ".join([o["head_word"] for o in obj_list])
        m_pos_list = [o["POS"] for o in obj_list]
        m_pos = max(set(m_pos_list), key=m_pos_list.count)
        new_obj = obj_list[0].copy()
        new_obj["word"] = m_word
        new_obj["head_word"] = m_head_word
        new_obj["POS"] = m_pos
        return [new_obj]  # returns a single element array which will replace a range s:e in the original array

    cal_lines = json.load(open("cal_lines_{}.json".format(mesechta), "r"), encoding="utf8")
    cal_pos_hashtable = json.load(open("cal_pos_hashtable.json", "r"), encoding='utf8')
    dafs = cal_lines["dafs"]
    lines_by_daf = cal_lines["lines"]
    super_base_ref = Ref(mesechta)
    subrefs = super_base_ref.all_subrefs()
    ical = 0
    num_sef_words = 0
    num_cal_words = 0
    num_words_matched = 0
    for curr_sef_ref in subrefs:
        if curr_sef_ref.is_empty():
            continue
        if ical >= len(dafs):
            break
        daf = dafs[ical]
        print "-----{} DAF {} ({}/{})-----".format(mesechta, daf, ical, len(dafs))
        base_tc = TextChunk(curr_sef_ref, "he")
        bas_word_list = []  # re.split(r"\s+"," ".join(base_text.text))
        for segment in base_tc.text:
            bas_word_list += tokenize_words(segment)
        temp_out = [{"word": w, "class": "unknown"} for w in bas_word_list]
        lines = [[word_obj["word"] for word_obj in temp_line] for temp_line in lines_by_daf[ical]]
        word_obj_list = [word_obj for temp_line in lines_by_daf[ical] for word_obj in temp_line]
        lines_by_str = [u' '.join(line_array) for line_array in lines]
        curr_cal_ref = Ref("{} {}".format(mesechta, daf))
        out = []
        word_for_word_se = []
        cal_words = []
        missed_words = []
        global_offset = 0
        if curr_sef_ref == curr_cal_ref:
            matched = dibur_hamatchil_matcher.match_text(bas_word_list, lines_by_str, verbose=True,
                                                         word_threshold=0.27, char_threshold=0.6,
                                                         with_abbrev_matches=True, with_num_abbrevs=False)
            start_end_map = matched["matches"]
            abbrev_matches = matched["abbrevs"]
            abbrev_ranges = [[am.rashiRange for am in am_list] for am_list in abbrev_matches]
            print u' --- '.join([unicode(am) for am_list in abbrev_matches for am in am_list])
            abbrev_count = 0
            for ar in abbrev_ranges:
                abbrev_count += len(ar)
            #if abbrev_count > 0:
            #    print "GRESATLJL THNA DZEOR", abbrev_ranges
            for iline, se in enumerate(start_end_map):
                curr_cal_line = lines[iline]
                # if there is an expanded abbrev, concat those words into one element
                if len(abbrev_ranges[iline]) > 0:
                    offset = 0  # account for the fact that you're losing elements in the array as you merge them
                    abbrev_ranges[iline].sort(key=lambda x: x[0])
                    for ar in abbrev_ranges[iline]:
                        if ar[1] - ar[0] <= 0:
                            continue  # TODO there's an issue with the abbrev func, but i'm too lazy to fix now. sometimes they're zero length
                        # redefine ar by how many actual words are in the range, not just how many elements
                        start_ar = ar[0]
                        i_abbrev = start_ar
                        num_words = 0
                        while i_abbrev < len(curr_cal_line):
                            temp_w = curr_cal_line[i_abbrev]
                            num_words += len(re.split(ur'\s+', temp_w))
                            if num_words >= (ar[1] - ar[0] + 1):
                                break
                            i_abbrev += 1
                        end_ar = i_abbrev
                        ar = (start_ar, end_ar)
                        if len(curr_cal_line[ar[0] - offset:ar[1] + 1 - offset]) != len(
                                word_obj_list[ar[0] - offset + len(cal_words):ar[1] + 1 - offset + len(cal_words)]):
                            # something's wrong. not sure what, but best to ignore this
                            continue
                        print u"ABBREV RANGE {} --- OFFSET {}".format(ar, offset)
                        print u"CURR CAL LINE BEFORE {}".format(u','.join(curr_cal_line[ar[0] - offset:ar[1] + 1 - offset]))
                        curr_cal_line[ar[0] - offset:ar[1] + 1 - offset] = [u' '.join(curr_cal_line[ar[0] - offset:ar[1] + 1 - offset])]
                        print u"CURR CAL LINE AFTER {}".format(curr_cal_line[ar[0] - offset])
                        print u"WORD OBJ LIST BEFORE {}".format(u','.join([u'({})'.format(obj['word']) for obj in merge_cal_word_objs(ar[0] - offset + len(cal_words), ar[1] + 1 - offset + len(cal_words), word_obj_list)]))
                        word_obj_list[ar[0] - offset + len(cal_words):ar[1] + 1 - offset + len(cal_words)] = merge_cal_word_objs(ar[0] - offset + len(cal_words), ar[1] + 1 - offset + len(cal_words), word_obj_list)
                        print u"WORD OBJ LIST AFTER {}".format(word_obj_list[ar[0] - offset + len(cal_words)]['word'])
                        offset += ar[1] - ar[0]
                    global_offset += offset
                cal_words += curr_cal_line
                if se[0] == -1:
                    word_for_word_se += [(-1, -1) for i in range(len(curr_cal_line))]
                    continue
                # matched_cal_objs_indexes = language_tools.match_segments_without_order(lines[iline],bas_word_list[se[0]:se[1]+1],2.0)
                curr_bas_line = bas_word_list[se[0]:se[1] + 1]
                #print u'base line', u' '.join(curr_bas_line)
                matched_obj_words_base = dibur_hamatchil_matcher.match_text(curr_bas_line, curr_cal_line,
                                                                            char_threshold=0.35, verbose=False,
                                                                            with_num_abbrevs=False)
                matched_words_base = matched_obj_words_base["matches"]
                word_for_word_se += [(tse[0] + se[0], tse[1] + se[0]) if tse[0] != -1 else tse for tse in matched_words_base]
            matched_word_for_word_obj = dibur_hamatchil_matcher.match_text(bas_word_list, cal_words,
                                                                           char_threshold=0.35,
                                                                           prev_matched_results=word_for_word_se,
                                                                           boundaryFlexibility=2,
                                                                           with_num_abbrevs=False)
            matched_word_for_word = matched_word_for_word_obj["matches"]
            cal_len = len(matched_word_for_word)
            bad_word_offset = 0
            for ical_word, temp_se in enumerate(matched_word_for_word):
                if temp_se[0] == -1:
                    missed_words.append({"word": word_obj_list[ical_word]["word"], "index": ical_word})
                    continue
                #dictionary juggling...
                for i in xrange(temp_se[0], temp_se[1] + 1):
                    #in case cal_words and word_obj_list aren't the same length bc a word got split up
                    """
                    if cal_words[ical_word] != word_obj_list[ical_word-bad_word_offset]["word"]:
                        if ical_word+1 < len(cal_words) and cal_words[ical_word+1] != word_obj_list[ical_word-bad_word_offset+1]["word"]:
                            bad_word_offset += 1
                        continue
                    """
                    cal_word_obj = word_obj_list[ical_word].copy()
                    cal_word_obj["cal_word"] = cal_word_obj["word"]
                    temp_sef_word = temp_out[i]["word"]
                    temp_out[i] = cal_word_obj
                    temp_out[i]["class"] = "talmud"
                    temp_out[i]["word"] = temp_sef_word
            print u"\n-----\nFOUND {}/{} ({}%)".format(cal_len - len(missed_words), cal_len, (1 - round(1.0 * len(missed_words) / cal_len, 4)) * 100)
            #print u"MISSED: {}".format(u" ,".join([u"{}:{}".format(wo["word"], wo["index"]) for wo in missed_words]))
            ical += 1
            num_cal_words += cal_len
            num_words_matched += (cal_len - len(missed_words))
        """
        #tag 1 pos words if still untagged
        for iwo,word_obj in enumerate(temp_out):
            word = word_obj["word"]
            if word in cal_pos_hashtable:
                if len(cal_pos_hashtable[word]) == 1:
                    temp_out[iwo] = {"word":word,"cal_word":word,"class":"talmud","POS":cal_pos_hashtable[word][0]}
        """
        num_sef_words += len(temp_out)
        out += temp_out
        sef_daf = curr_sef_ref.__str__().replace("{} ".format(mesechta), "").encode('utf8')
        doc = {"words": out, "missed_words": missed_words}
        fp = codecs.open("cal_matcher_output/{}/lang_naive_talmud/lang_naive_talmud_{}.json".format(mesechta, sef_daf), "w", encoding='utf-8')
        json.dump(doc, fp, indent=4, encoding='utf-8', ensure_ascii=False)
        fp.close()
    return num_sef_words, num_cal_words, num_words_matched
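# Example driver (added for illustration; the tractate name is a placeholder). This version of
# match_cal_segments returns aggregate counts, so a caller can report overall coverage. Like the
# function above, the sketch is Python 2.
def _example_match_cal_segments():
    num_sef, num_cal, num_matched = match_cal_segments("Berakhot")
    print "matched {}/{} CAL words across {} Sefaria words".format(num_matched, num_cal, num_sef)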
def match_cal_segments(mesechta):
    def tokenize_words(str):
        str = str.replace(u"־", " ")
        str = re.sub(r"</?.+>", "", str)  # get rid of html tags
        str = re.sub(r"\([^\(\)]+\)", "", str)  # get rid of refs
        str = str.replace("'", '"')
        word_list = filter(bool, re.split(r"[\s\:\-\,\.\;\(\)\[\]\{\}]", str))
        return word_list

    def merge_cal_word_objs(s, e, word_obj_list):
        obj_list = word_obj_list[s:e]
        m_word = u" ".join([o["word"] for o in obj_list])
        m_head_word = u" ".join([o["head_word"] for o in obj_list])
        m_pos_list = [o["POS"] for o in obj_list]
        m_pos = max(set(m_pos_list), key=m_pos_list.count)
        new_obj = obj_list[0].copy()
        new_obj["word"] = m_word
        new_obj["head_word"] = m_head_word
        new_obj["POS"] = m_pos
        return [new_obj]  # returns a single element array which will replace a range s:e in the original array

    cal_lines = json.load(open("cal_lines_{}.json".format(mesechta), "r"), encoding="utf8")
    cal_pos_hashtable = json.load(open("cal_pos_hashtable.json", "r"), encoding='utf8')
    dafs = cal_lines["dafs"]
    lines_by_daf = cal_lines["lines"]
    super_base_ref = Ref(mesechta)
    subrefs = super_base_ref.all_subrefs()
    ical = 0
    for curr_sef_ref in subrefs:
        if curr_sef_ref.is_empty():
            continue
        if ical >= len(dafs):
            break
        daf = dafs[ical]
        print "-----{} DAF {} ({}/{})-----".format(mesechta, daf, ical, len(dafs))
        base_tc = TextChunk(curr_sef_ref, "he")
        bas_word_list = []  # re.split(r"\s+"," ".join(base_text.text))
        for segment in base_tc.text:
            bas_word_list += tokenize_words(segment)
        temp_out = [{"word": w, "class": "unknown"} for w in bas_word_list]
        lines = [[word_obj["word"] for word_obj in temp_line] for temp_line in lines_by_daf[ical]]
        word_obj_list = [word_obj for temp_line in lines_by_daf[ical] for word_obj in temp_line]
        lines_by_str = [u' '.join(line_array) for line_array in lines]
        curr_cal_ref = Ref("{} {}".format(mesechta, daf))
        out = []
        word_for_word_se = []
        cal_words = []
        missed_words = []
        global_offset = 0
        if curr_sef_ref == curr_cal_ref:
            start_end_map, abbrev_matches = dibur_hamatchil_matcher.match_text(bas_word_list, lines_by_str,
                                                                               verbose=True, word_threshold=0.27,
                                                                               char_threshold=1.5,
                                                                               with_abbrev_matches=True)
            abbrev_ranges = [[am.rashiRange for am in am_list] for am_list in abbrev_matches]
            print u' --- '.join([unicode(am) for am_list in abbrev_matches for am in am_list])
            abbrev_count = 0
            for ar in abbrev_ranges:
                abbrev_count += len(ar)
            #if abbrev_count > 0:
            #    print "GRESATLJL THNA DZEOR", abbrev_ranges
            for iline, se in enumerate(start_end_map):
                curr_cal_line = lines[iline]
                # if there is an expanded abbrev, concat those words into one element
                if len(abbrev_ranges[iline]) > 0:
                    offset = 0  # account for the fact that you're losing elements in the array as you merge them
                    abbrev_ranges[iline].sort(key=lambda x: x[0])
                    for ar in abbrev_ranges[iline]:
                        if ar[1] - ar[0] <= 0:
                            continue  # TODO there's an issue with the abbrev func, but i'm too lazy to fix now. sometimes they're zero length
                        # redefine ar by how many actual words are in the range, not just how many elements
                        start_ar = ar[0]
                        i_abbrev = start_ar
                        num_words = 0
                        while i_abbrev < len(curr_cal_line):
                            temp_w = curr_cal_line[i_abbrev]
                            num_words += len(re.split(ur'\s+', temp_w))
                            if num_words >= (ar[1] - ar[0] + 1):
                                break
                            i_abbrev += 1
                        end_ar = i_abbrev
                        ar = (start_ar, end_ar)
                        if len(curr_cal_line[ar[0] - offset:ar[1] + 1 - offset]) != len(
                                word_obj_list[ar[0] - offset + len(cal_words):ar[1] + 1 - offset + len(cal_words)]):
                            # something's wrong. not sure what, but best to ignore this
                            continue
                        print u"ABBREV RANGE {} --- OFFSET {}".format(ar, offset)
                        print u"CURR CAL LINE BEFORE {}".format(u','.join(curr_cal_line[ar[0] - offset:ar[1] + 1 - offset]))
                        curr_cal_line[ar[0] - offset:ar[1] + 1 - offset] = [u' '.join(curr_cal_line[ar[0] - offset:ar[1] + 1 - offset])]
                        print u"CURR CAL LINE AFTER {}".format(curr_cal_line[ar[0] - offset])
                        print u"WORD OBJ LIST BEFORE {}".format(u','.join([u'({})'.format(obj['word']) for obj in merge_cal_word_objs(ar[0] - offset + len(cal_words), ar[1] + 1 - offset + len(cal_words), word_obj_list)]))
                        word_obj_list[ar[0] - offset + len(cal_words):ar[1] + 1 - offset + len(cal_words)] = merge_cal_word_objs(ar[0] - offset + len(cal_words), ar[1] + 1 - offset + len(cal_words), word_obj_list)
                        print u"WORD OBJ LIST AFTER {}".format(word_obj_list[ar[0] - offset + len(cal_words)]['word'])
                        offset += ar[1] - ar[0]
                    global_offset += offset
                cal_words += curr_cal_line
                if se[0] == -1:
                    word_for_word_se += [(-1, -1) for i in range(len(curr_cal_line))]
                    continue
                # matched_cal_objs_indexes = language_tools.match_segments_without_order(lines[iline],bas_word_list[se[0]:se[1]+1],2.0)
                curr_bas_line = bas_word_list[se[0]:se[1] + 1]
                #print u'base line', u' '.join(curr_bas_line)
                matched_words_base = dibur_hamatchil_matcher.match_text(curr_bas_line, curr_cal_line, char_threshold=0.3, verbose=False)
                word_for_word_se += [(tse[0] + se[0], tse[1] + se[0]) if tse[0] != -1 else tse for tse in matched_words_base]
            matched_word_for_word = dibur_hamatchil_matcher.match_text(bas_word_list, cal_words, char_threshold=0.3,
                                                                       prev_matched_results=word_for_word_se,
                                                                       boundaryFlexibility=2)
            bad_word_offset = 0
            for ical_word, temp_se in enumerate(matched_word_for_word):
                if temp_se[0] == -1:
                    missed_words.append({"word": word_obj_list[ical_word]["word"], "index": ical_word})
                    continue
                #dictionary juggling...
                for i in xrange(temp_se[0], temp_se[1] + 1):
                    #in case cal_words and word_obj_list aren't the same length bc a word got split up
                    """
                    if cal_words[ical_word] != word_obj_list[ical_word-bad_word_offset]["word"]:
                        if ical_word+1 < len(cal_words) and cal_words[ical_word+1] != word_obj_list[ical_word-bad_word_offset+1]["word"]:
                            bad_word_offset += 1
                        continue
                    """
                    cal_word_obj = word_obj_list[ical_word].copy()
                    cal_word_obj["cal_word"] = cal_word_obj["word"]
                    temp_sef_word = temp_out[i]["word"]
                    temp_out[i] = cal_word_obj
                    temp_out[i]["class"] = "talmud"
                    temp_out[i]["word"] = temp_sef_word
            cal_len = len(matched_word_for_word)
            print u"\n-----\nFOUND {}/{} ({}%)".format(cal_len - len(missed_words), cal_len, (1 - round(1.0 * len(missed_words) / cal_len, 4)) * 100)
            #print u"MISSED: {}".format(u" ,".join([u"{}:{}".format(wo["word"], wo["index"]) for wo in missed_words]))
            ical += 1
        """
        #tag 1 pos words if still untagged
        for iwo,word_obj in enumerate(temp_out):
            word = word_obj["word"]
            if word in cal_pos_hashtable:
                if len(cal_pos_hashtable[word]) == 1:
                    temp_out[iwo] = {"word":word,"cal_word":word,"class":"talmud","POS":cal_pos_hashtable[word][0]}
        """
        out += temp_out
        sef_daf = curr_sef_ref.__str__().replace("{} ".format(mesechta), "").encode('utf8')
        doc = {"words": out, "missed_words": missed_words}
        fp = codecs.open("cal_matcher_output/{}/lang_naive_talmud/lang_naive_talmud_{}.json".format(mesechta, sef_daf), "w", encoding='utf-8')
        json.dump(doc, fp, indent=4, encoding='utf-8', ensure_ascii=False)
        fp.close()
def find_changes_between_wiki_and_will():
    import math, bisect
    from data_utilities import dibur_hamatchil_matcher
    from tqdm import tqdm
    out = {}
    total = 0
    missed = 0
    for title in tqdm(library.get_indexes_in_category("Bavli")):
        wiki = Version().load({"title": title, "versionTitle": "Wikisource Talmud Bavli", "language": "he"})
        will = Version().load({"title": title, "versionTitle": "William Davidson Edition - Aramaic", "language": "he"})
        for isec, (wiki_section, will_section) in enumerate(zip(wiki.chapter, will.chapter)):
            for iseg, (wiki_segment, will_segment) in enumerate(zip(wiki_section, will_section)):
                daf = math.ceil((isec + 1) / 2)
                amud = 'b' if (isec + 1) % 2 == 0 else 'a'
                tref = f"{title} {daf}{amud}:{iseg+1}"
                wiki_removed_indices, wiki_original_words, wiki_removed_chars = get_words_removed(wiki_segment, will_segment)
                wiki_tokenized_words = base_tokenizer(wiki_segment, will_segment)
                will_removed_indices, will_original_words, will_removed_chars = get_words_removed(will_segment, will_segment)
                will_tokenized_words = base_tokenizer(will_segment, will_segment)
                matched = dibur_hamatchil_matcher.match_text(wiki_tokenized_words, [" ".join(will_tokenized_words)],
                                                             verbose=False, strict_boundaries=True, place_all=True,
                                                             with_abbrev_matches=True)
                total += 1
                if matched['matches'][0][0] == -1:
                    # no match
                    missed += 1
                    continue
                for abbrev_list in matched['abbrevs']:
                    for abbrev in abbrev_list:
                        # print('orig', abbrev.gemaraRange, abbrev.rashiRange)
                        wikiWordRange = [x + bisect.bisect_right(wiki_removed_indices, x) for x in abbrev.gemaraRange]
                        willWordRange = [x + bisect.bisect_right(will_removed_indices, x) for x in abbrev.rashiRange]
                        wiki_start_char = len(" ".join(wiki_original_words[:wikiWordRange[0]]))
                        if wiki_start_char > 0:
                            # account for space after initial words
                            wiki_start_char += 1
                        wiki_end_char = len(" ".join(wiki_original_words[:wikiWordRange[1] + 1]))
                        wikiCharRange = [wiki_start_char, wiki_end_char]
                        will_start_char = len(" ".join(will_original_words[:willWordRange[0]]))
                        if will_start_char > 0:
                            will_start_char += 1
                        will_end_char = len(" ".join(will_original_words[:willWordRange[1] + 1]))
                        willCharRange = [will_start_char, will_end_char]
                        wiki_removed_chars += [(tuple(wikiCharRange), will_segment[willCharRange[0]:willCharRange[1]])]
                        # print(f"~{wiki_segment[wikiCharRange[0]:wikiCharRange[1]]}~")
                        # print(f"~{will_segment[willCharRange[0]:willCharRange[1]]}~")
                        # print('after', wikiWordRange, willWordRange)
                        # print(wiki_original_words[wikiWordRange[0]:wikiWordRange[1]+1])
                        # print(will_original_words[willWordRange[0]:willWordRange[1]+1])
                out[tref] = {'wiki': wiki_removed_chars, 'will': will_removed_chars}
    print('Total', total)
    print('Missed', missed)
    print('Perc', missed / total)
    with open(f"{DATASET_LOC}/wiki_will_changes.json", 'w') as fout:
        json.dump(out, fout)
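# Hypothetical invocation: the function takes no arguments and writes its output to
# DATASET_LOC/wiki_will_changes.json, which convert_mentions_for_alt_version (below) can consume
# via its manual_changes_file parameter.
# find_changes_between_wiki_and_will()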
def match_cal_segments(mesechta):
    def tokenize_words(str):
        str = str.replace(u"־", " ")
        str = re.sub(r"</?[a-z]+>", "", str)  # get rid of html tags
        str = re.sub(r"\([^\(\)]+\)", "", str)  # get rid of refs
        str = str.replace("'", '"')
        word_list = filter(bool, re.split(r"[\s\:\-\,\.\;\(\)\[\]\{\}]", str))
        return word_list

    def merge_cal_word_objs(s, e, word_obj_list):
        obj_list = word_obj_list[s:e]
        m_word = u" ".join([o["word"] for o in obj_list])
        m_head_word = u" ".join([o["head_word"] for o in obj_list])
        m_pos_list = [o["POS"] for o in obj_list]
        m_pos = max(set(m_pos_list), key=m_pos_list.count)
        new_obj = obj_list[0].copy()
        new_obj["word"] = m_word
        new_obj["head_word"] = m_head_word
        new_obj["POS"] = m_pos
        return [new_obj]  # returns a single element array which will replace a range s:e in the original array

    cal_lines = json.load(open("cal_lines_{}.json".format(mesechta), "r"), encoding="utf8")
    dafs = cal_lines["dafs"]
    lines_by_daf = cal_lines["lines"]
    super_base_ref = Ref(mesechta)
    subrefs = super_base_ref.all_subrefs()
    ical = 0
    for curr_sef_ref in subrefs:
        if curr_sef_ref.is_empty():
            continue
        if ical >= len(dafs):
            break
        daf = dafs[ical]
        print "----- DAF {} ({}/{})-----".format(daf, ical, len(dafs))
        base_tc = TextChunk(curr_sef_ref, "he")
        bas_word_list = []  # re.split(r"\s+"," ".join(base_text.text))
        for segment in base_tc.text:
            bas_word_list += tokenize_words(segment)
        temp_out = [{"word": w, "class": "unknown"} for w in bas_word_list]
        lines = [[word_obj["word"] for word_obj in temp_line] for temp_line in lines_by_daf[ical]]
        word_obj_list = [word_obj for temp_line in lines_by_daf[ical] for word_obj in temp_line]
        lines_by_str = [u' '.join(line_array) for line_array in lines]
        curr_cal_ref = Ref("{} {}".format(mesechta, daf))
        out = []
        word_for_word_se = []
        cal_words = []
        missed_words = []
        global_offset = 0
        if curr_sef_ref == curr_cal_ref:
            start_end_map, abbrev_ranges = dibur_hamatchil_matcher.match_text(bas_word_list, lines_by_str,
                                                                              verbose=True, word_threshold=0.5,
                                                                              with_abbrev_ranges=True)
            for iline, se in enumerate(start_end_map):
                curr_cal_line = lines[iline]
                # if there is an expanded abbrev, concat those words into one element
                if len(abbrev_ranges[iline]) > 0:
                    offset = 0  # account for the fact that you're losing elements in the array as you merge them
                    for ar in abbrev_ranges[iline]:
                        if ar[1] - ar[0] <= 0:
                            continue  # TODO there's an issue with the abbrev func, but i'm too lazy to fix now. sometimes they're zero length
                        curr_cal_line[ar[0] - offset:ar[1] + 1 - offset] = [u' '.join(curr_cal_line[ar[0] - offset:ar[1] + 1 - offset])]
                        word_obj_list[ar[0] - offset + len(cal_words):ar[1] + 1 - offset + len(cal_words)] = merge_cal_word_objs(ar[0] - offset + len(cal_words), ar[1] + 1 - offset + len(cal_words), word_obj_list)
                        offset += ar[1] - ar[0]
                    global_offset += offset
                    print offset
                cal_words += curr_cal_line
                if se[0] == -1:
                    word_for_word_se += [(-1, -1) for i in range(len(curr_cal_line))]
                    continue
                # matched_cal_objs_indexes = language_tools.match_segments_without_order(lines[iline],bas_word_list[se[0]:se[1]+1],2.0)
                curr_bas_line = bas_word_list[se[0]:se[1] + 1]
                matched_words_base = dibur_hamatchil_matcher.match_text(curr_bas_line, curr_cal_line, char_threshold=0.4)
                word_for_word_se += [(tse[0] + se[0], tse[1] + se[0]) if tse[0] != -1 else tse for tse in matched_words_base]
            matched_word_for_word = dibur_hamatchil_matcher.match_text(bas_word_list, cal_words, char_threshold=0.4,
                                                                       prev_matched_results=word_for_word_se)
            for ical_word, temp_se in enumerate(matched_word_for_word):
                if temp_se[0] == -1:
                    missed_words.append({"word": word_obj_list[ical_word]["word"], "index": ical_word})
                    continue
                #dictionary juggling...
                for i in xrange(temp_se[0], temp_se[1] + 1):
                    cal_word_obj = word_obj_list[ical_word].copy()
                    cal_word_obj["cal_word"] = cal_word_obj["word"]
                    temp_sef_word = temp_out[i]["word"]
                    temp_out[i] = cal_word_obj
                    temp_out[i]["class"] = "talmud"
                    temp_out[i]["word"] = temp_sef_word
            cal_len = len(matched_word_for_word)
            print u"\n-----\nFOUND {}/{} ({}%)".format(cal_len - len(missed_words), cal_len, (1 - round(1.0 * len(missed_words) / cal_len, 4)) * 100)
            print u"MISSED: {}".format(u" ,".join([u"{}:{}".format(wo["word"], wo["index"]) for wo in missed_words]))
            ical += 1
        out += temp_out
        sef_daf = curr_sef_ref.__str__().replace("{} ".format(mesechta), "").encode('utf8')
        doc = {"words": out, "missed_words": missed_words}
        fp = codecs.open("cal_matcher_output/{}/lang_naive_talmud/lang_naive_talmud_{}.json".format(mesechta, sef_daf), "w", encoding='utf-8')
        json.dump(doc, fp, indent=4, encoding='utf-8', ensure_ascii=False)
        fp.close()
def test_duplicate_comment(self):
    daftext = 'אע״ג שאמרו ככה בלה בלה בלה'.split()
    rashi = ['בלה', 'בלה']
    matched = dhm.match_text(daftext, rashi, verbose=True)
base_chapter = base_text[chapter_index - 1]
print 'fixing chapter {}'.format(chapter_index)
book_text = [bleach.clean(segment, tags=[], strip=True) for segment in base_chapter]
seg_indices = first_word_indices(book_text)
word_list = u' '.join(book_text).split()
dh_list = [re.sub(ur' (\.|:)', ur'\1', p.dh.get_valueOf_()) for p in comment_chapter.get_phrase()]
matches = match_text(word_list, dh_list, dh_extract_method=cleaner, place_all=False,
                     strict_boundaries=True, char_threshold=0.4)
locations.append([bisect.bisect_right(seg_indices, match[0]) if match[0] >= 0 else match[0]
                  for match in matches['matches']])
commentary.set_verses(locations)
commentary.correct_phrase_verses()
if overwrite:
    outfile = filename
else:
    outfile = '{}_test'.format(filename)
# with codecs.open('XML/{}.xml'.format(outfile), 'w', 'utf-8') as out:
    return indices


root = DCXMLsubs.parse("XML/tractate-avot_drabi_natan-xml.xml", silence=True)
base_text = root.getBaseTextArray()[0]
base_text = [bleach.clean(segment, tags=[], strip=True) for segment in base_text]
seg_indices = first_word_indices(base_text)
word_list = u' '.join(base_text).split()
c = root.body.commentaries.commentary[6].chapter[0]
dh_list = [p.dh.get_valueOf_() for p in c.get_phrase()]


def cleaner(input_string):
    assert isinstance(input_string, basestring)
    pattern = u'\u05d5?(\u05db|\u05d2)\u05d5\u05f3?'
    match = re.search(pattern, input_string)
    if match is None:
        return input_string
    if match.start() > 6 and (match.start() > len(input_string) / 2):
        return re.sub(u'\u05d5?(\u05db|\u05d2)\u05d5\u05f3?.*', u'', input_string)
    elif match.start() > 6 and (match.start() < len(input_string) / 2):
        return re.sub(u'.*?{}'.format(pattern), u'', input_string)
    else:
        return re.sub(pattern, u'', input_string)


matches = match_text(word_list, dh_list, dh_extract_method=cleaner, place_all=False)
locations = [bisect.bisect_right(seg_indices, match[0]) if match[0] >= 0 else match[0] for match in matches['matches']]
c.set_verses(locations)
with codecs.open('text.xml', 'w', 'utf-8') as outfile:
    c.export(outfile, level=1)
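# The body of first_word_indices is not shown above (only its trailing `return indices` survives),
# so this is a plausible reconstruction, added for orientation: record the index of each segment's
# first word within the concatenated word list, so that bisect.bisect_right(seg_indices, word_index)
# maps a matched word index back to a 1-based segment number.
def _first_word_indices_sketch(segments):
    indices, running_total = [], 0
    for segment in segments:
        indices.append(running_total)
        running_total += len(segment.split())
    return indices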
def convert_mentions_for_alt_version(nikkud_vtitle, mentions_output, manual_changes_file=None, limit=None):
    import json
    from research.knowledge_graph.named_entity_recognition.ner_tagger import Mention
    from data_utilities.dibur_hamatchil_matcher import match_text
    from sefaria.helper.normalization import FunctionNormalizer
    if manual_changes_file is not None:
        changes = srsly.read_json(manual_changes_file)
    with open("sperling_mentions.json", "r") as fin:
        j = json.load(fin)
    # add mentions in db b/c sperling_mentions only includes bonayich-only mentions
    for tl in RefTopicLinkSet({"class": "refTopic", "linkType": "mention",
                               "charLevelData.versionTitle": "William Davidson Edition - Aramaic"}):
        j += [{
            "start": tl.charLevelData['startChar'],
            "end": tl.charLevelData['endChar'],
            "ref": tl.ref,
            "mention": tl.charLevelData['text'],
            "id_matches": [tl.toTopic]
        }]
    mentions_by_seg = defaultdict(list)
    print("TOTAL MENTIONS", len(j))
    for mention in j:
        mentions_by_seg[mention['ref']] += [Mention().add_metadata(**mention)]
    indexes = library.get_indexes_in_category("Bavli") if limit is None else limit

    def get_norm_pos(start, end, s):
        num_to_remove = s.count(':', 0, start)
        return start - num_to_remove, end - num_to_remove

    replace_reg_parens = r"(?:[\u0591-\u05bd\u05bf-\u05c5\u05c7,.:!?״()]+|\s—|\s…)"
    replace_reg = r"(?:[\u0591-\u05bd\u05bf-\u05c5\u05c7,.:!?״]+|\s—|\s…)"

    def get_find_text_to_remove(remove_parens=True):
        return lambda s: [(m, '') for m in re.finditer(replace_reg_parens if remove_parens else replace_reg, s)]

    new_mentions = []
    num_failed = 0
    for mas in tqdm(indexes):
        if Version().load({"title": mas, "versionTitle": nikkud_vtitle, "language": "he"}) is None:
            continue
        index = library.get_index(mas)
        for seg in index.all_segment_refs():
            temp_mentions = mentions_by_seg[seg.normal()]
            if len(temp_mentions) == 0:
                continue
            text = TextChunk(seg, lang='he', vtitle='William Davidson Edition - Aramaic').text
            norm_text = re.sub(':', '', text)
            text_nikkud = TextChunk(seg, lang='he', vtitle=nikkud_vtitle).text
            remove_parens = True
            if re.sub(replace_reg_parens, '', text_nikkud) != text:
                remove_parens = False
            normalizer = FunctionNormalizer(get_find_text_to_remove(remove_parens))
            norm_text_nikkud = re.sub(replace_reg_parens if remove_parens else replace_reg, '', text_nikkud)
            if len(text_nikkud) == 0:
                continue
            mention_indices = [get_norm_pos(mention.start, mention.end, text) for mention in temp_mentions]
            if manual_changes_file is None:
                norm_map = normalizer.get_mapping_after_normalization(text_nikkud)
            else:
                temp_wiki_changes = changes.get(seg.normal(), {}).get('wiki', [])
                temp_will_changes = changes.get(seg.normal(), {}).get('will', [])
                temp_wiki_changes = list(filter(lambda x: x not in temp_will_changes, temp_wiki_changes))
                temp_wiki_changes.sort(key=lambda x: x[0][0])
                for tc in temp_wiki_changes:
                    tc[0][0] += 1
                    tc[0][1] += 1
                norm_map = normalizer.get_mapping_after_normalization(text_nikkud, removal_list=temp_wiki_changes)
            mention_indices = normalizer.convert_normalized_indices_to_unnormalized_indices(mention_indices, norm_map)
            temp_new_mentions = []
            for mention, (unnorm_start, unnorm_end) in zip(temp_mentions, mention_indices):
                if manual_changes_file is None:
                    new_mention = re.sub(replace_reg_parens if remove_parens else replace_reg, '', text_nikkud[unnorm_start:unnorm_end])
                else:
                    new_mention = text_nikkud[unnorm_start:unnorm_end]
                try:
                    if len(new_mention) == 0:
                        print("ZERO LENGTH MENTION", mention.mention, seg.normal())
                    assert len(new_mention) > 0
                    if manual_changes_file is None:
                        assert new_mention == mention.mention, f"'{new_mention} != {mention.mention}' {unnorm_start} {unnorm_end}"
                    else:
                        for offset in [0, -1, 1, -2, 2]:
                            new_mention = text_nikkud[unnorm_start+offset:unnorm_end+offset]
                            # likely to be abbreviations in new_mention. use dh matcher to see if they're 'equivalent'
                            old_mention_comparison = mention.mention
                            if new_mention.startswith('א"ר'):
                                old_mention_comparison = "אמר " + old_mention_comparison
                            if new_mention.startswith('"'):
                                # middle of abbrev
                                new_mention_comparison = new_mention[1:2] + "'" + new_mention[2:]
                            else:
                                new_mention_comparison = new_mention
                            new_words = new_mention_comparison.split()
                            matched = match_text(new_words, [old_mention_comparison], with_abbrev_matches=True,
                                                 daf_skips=0, rashi_skips=0, overall=0)
                            if matched['matches'][0][0] != -1:
                                # need to look at the actual match and figure out if any words are missing.
                                # recalculate unnorm_start and unnorm_end to leave out these words. Test case: Arakhin 5a:18
                                istart_word, iend_word = matched['matches'][0]
                                start_text = " ".join(new_words[:istart_word])
                                start_offset = len(start_text) + (1 if len(start_text) > 0 else 0)  # add 1 to account for space right after start_text
                                end_text = " ".join(new_words[iend_word+1:])
                                end_offset = len(end_text) + (1 if len(end_text) > 0 else 0)
                                unnorm_start += offset + start_offset
                                unnorm_end += offset - end_offset
                                break
                        # move unnorm_start and end to nearest word break
                        if unnorm_end == len(text_nikkud) + 1:
                            # one too big
                            unnorm_end -= 1
                        if unnorm_end > len(text_nikkud):
                            # too big. give up
                            # print("UPDATE END TOO BIG. GIVE UP...", mention.mention, seg.normal())
                            assert False
                        if text_nikkud[unnorm_start] in {' ', ':'}:
                            # move forward by one
                            unnorm_start += 1
                        if text_nikkud[unnorm_end-1] in {' ', ':'}:
                            unnorm_end -= 1
                        start_nearest_break = max(text_nikkud.rfind(' ', 0, unnorm_start), text_nikkud.rfind(':', 0, unnorm_start))
                        end_nearest_break_match = re.search(r'[\s:]', text_nikkud[unnorm_end:])
                        end_nearest_break = (end_nearest_break_match.start() + unnorm_end) if end_nearest_break_match is not None else -1
                        if start_nearest_break != -1:
                            unnorm_start = start_nearest_break + 1
                        elif unnorm_start != 0:
                            # if couldn't find space before, must be at beginning
                            # print("UPDATE START", mention.mention, seg.normal())
                            unnorm_start = 0
                        if end_nearest_break != -1:
                            unnorm_end = end_nearest_break
                        elif unnorm_end != len(text_nikkud):
                            # print("UPDATE END", mention.mention, seg.normal())
                            unnorm_end = len(text_nikkud)
                        assert matched['matches'][0][0] != -1
                    mention.add_metadata(start=unnorm_start, end=unnorm_end, mention=text_nikkud[unnorm_start:unnorm_end])
                    temp_new_mentions += [mention]
                except AssertionError:
                    norm_start, norm_end = get_norm_pos(mention.start, mention.end, text)
                    snip_size = 10
                    start_snip_naive = norm_start - snip_size if norm_start >= snip_size else 0
                    start_snip = norm_text.rfind(" ", 0, start_snip_naive)
                    if start_snip == -1:
                        start_snip = start_snip_naive
                    end_snip_naive = norm_end + snip_size if norm_end + snip_size <= len(norm_text) else len(norm_text)
                    end_snip = norm_text.find(" ", end_snip_naive)
                    if end_snip == -1:
                        end_snip = end_snip_naive
                    snippet = f"{norm_text[start_snip:norm_start]}~{norm_text[norm_start:norm_end]}~{norm_text[norm_end:end_snip]}"
                    new_norm_start, new_norm_end = get_rabbi_char_loc_list([snippet], norm_text_nikkud)[0]
                    if new_norm_start is None:
                        # print("new_norm_start is None")
                        num_failed += 1
                        continue
                    new_start, new_end = normalizer.convert_normalized_indices_to_unnormalized_indices([(new_norm_start, new_norm_end)], norm_map)[0]
                    new_mention = re.sub(replace_reg_parens if remove_parens else replace_reg, '', text_nikkud[new_start:new_end])
                    try:
                        assert new_mention == mention.mention, f"'{new_mention} != {mention.mention}' {unnorm_start} {unnorm_end}"
                        mention.add_metadata(start=new_start, end=new_end, mention=text_nikkud[new_start:new_end])
                        temp_new_mentions += [mention]
                    except AssertionError:
                        num_failed += 1
                        # get_rabbi_char_pos using context and text_nikkud
                        # get_unnormalized pos
            new_mentions += temp_new_mentions
    out = [m.serialize(delete_keys=['versionTitle', 'language']) for m in new_mentions]
    with open(f"{mentions_output}", "w") as fout:
        json.dump(out, fout, ensure_ascii=False, indent=2)
    print("NUM FAILED", num_failed)
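# Hypothetical invocation (the version title, output path, and tractate limit are placeholders):
# convert_mentions_for_alt_version(
#     "Some Nikkud Version Title",
#     "sperling_mentions_nikkud.json",
#     manual_changes_file=f"{DATASET_LOC}/wiki_will_changes.json",
#     limit=["Berakhot"],
# )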