def replace_with_base64(s, ref):
    """Replace emoji-mapped words in pasuk `s` (at `ref`) with inline base64
    <img> tags, keeping any grammatical prefix letter as "prefix-" before the
    image. Returns the rebuilt pasuk, guaranteed to end with sof pasuk (׃)."""
    words_to_replace = []
    # Cleaned tokens; indices must align with the raw tokenization done below.
    cleaned_pasuk = tokenizer(s, True)
    for iw, word in enumerate(cleaned_pasuk):
        prefix = ''
        shoresh = lookup_shoresh(word, ref)
        if shoresh:
            shoresh = shoresh[0]  # take the first headword candidate
        else:
            # NOTE(review): lookup miss is only logged; `shoresh` stays falsy
            # (list/None) for the comparison below — confirm this is intended.
            print(word)
        if any(word_to_emoji == shoresh for word_to_emoji in words_to_emojis):
            # Compare consonant-only forms to detect a prefix letter in front
            # of the shoresh ([:-1] drops the final char of the stripped form).
            nikudless_word = strip_cantillation(word, True)[:-1]
            nikudless_shoresh = strip_cantillation(shoresh, True)[:-1]
            if len(nikudless_shoresh) > len(nikudless_word):
                nikudless_shoresh = nikudless_shoresh[:len(nikudless_word)]
            if nikudless_word != nikudless_shoresh:
                prefix_index = nikudless_word.find(nikudless_shoresh)
                if prefix_index != -1 and any(p == nikudless_word[:prefix_index] for p in prefixes):
                    # Map the consonantal prefix boundary back into the vocalized word.
                    nikud_prefix_index = word.find(shoresh[0], prefix_index)
                    prefix = word[:nikud_prefix_index]
            words_to_replace += [{"name": word, "shoresh": shoresh, "prefix": prefix, "word_num": iw}]
    # Raw tokens to splice the replacement HTML into.
    tokenized_pasuk = tokenizer(s, False)
    for to_replace in words_to_replace:
        p = to_replace["prefix"]
        tokenized_pasuk[to_replace[
            "word_num"]] = u'<span class="purim-emoji">' \
            u'{}<img src="data:image/png;base64,{}" /> </span>'.format(
            u"{}-".format(p) if len(p) > 0 else u"",
            emoji_map[to_replace["shoresh"]])
    new_pasuk = rebuild_tokenized_text(tokenized_pasuk)
    if new_pasuk[-1] != u'׃':
        new_pasuk += u'׃'  # ensure the verse ends with sof pasuk
    return new_pasuk
def _single_lookup(cls, input_word, lookup_key='form', **kwargs):
    """Look up a single word form and return its headword query dicts.

    :param input_word: word to look up; Hebrew input is stripped of cantillation.
    :param lookup_key: word-form field to query ('form' by default).
    :param kwargs: may contain 'lookup_ref' to restrict matches to a Ref.
    :return: list of {'headword': ...} dicts (empty when nothing matches).
    """
    from sefaria.utils.hebrew import is_hebrew, strip_cantillation, has_cantillation
    from sefaria.model import Ref
    lookup_ref = kwargs.get("lookup_ref", None)
    wform_pkey = lookup_key
    if is_hebrew(input_word):
        input_word = strip_cantillation(input_word)
        # Fully consonantal input is matched against the consonantal-form key.
        if not has_cantillation(input_word, detect_vowels=True):
            wform_pkey = 'c_form'
    query_obj = {wform_pkey: input_word}
    if lookup_ref:
        nref = Ref(lookup_ref).normal()
        query_obj["refs"] = {'$regex': '^{}'.format(nref)}
    forms = WordFormSet(query_obj)
    if lookup_ref and forms.count() == 0:
        # Fall back to an unrestricted query when nothing matched within the ref.
        del query_obj["refs"]
        forms = WordFormSet(query_obj)
    if forms.count() == 0:
        return []
    # TODO: if we want the 'lookups' in wf to be a dict we can pass as is to the lexiconentry, we need to change the key 'lexicon' to 'parent_lexicon' in word forms
    # (Removed an unused `result = []` local from the original.)
    return [{'headword': lookup['headword']} for form in forms for lookup in form.lookups]
def make_parsed_source(chapter_name, chapter_num, topic_name, topic_num, source_num, prev_rows):
    """Assemble a parsed source record from `prev_rows`.

    Concatenates the midrashic text/commentary of each row, pulls the trailing
    parenthesized reference list off the source text, and parses each ref.
    Updates the global TOTAL_REFS / PARSED_REFS counters.

    :return: dict describing the source, or None when no trailing "(refs)" is found.
    """
    global TOTAL_REFS, PARSED_REFS
    # Single pattern constant: the original repeated this literal in search and sub.
    trailing_refs_pattern = r"\(([^)]+)\)\s*\.?\s*\$?\s*$"
    source, commentary = "", ""
    for row in prev_rows:
        s, c = get_midrashic_text(row["source"])
        source += s
        commentary += c
    m = re.search(trailing_refs_pattern, source)
    if m is None:
        print("OH NO -- {} {} {}: {}".format(chapter_num, topic_num, source_num, topic_name))
        print(strip_cantillation(source, strip_vowels=True)[-20:])
        return None
    source = re.sub(trailing_refs_pattern, "", source)
    ref_list = [
        parse_ref(raw_ref, source)
        for raw_ref in re.split(r"[:;]", m.group(1))
    ]
    # Filter once; the original built this comprehension twice.
    parsed_refs = [r for r in ref_list if r]
    TOTAL_REFS += len(ref_list)
    PARSED_REFS += len(parsed_refs)
    return {
        "chapter_name": chapter_name,
        "chapter_num": chapter_num,
        "topic_name": topic_name,
        "topic_num": topic_num,
        "source_num": source_num,
        "source": source,
        "commentary": commentary,
        "ref_list": parsed_refs
    }
def _single_lookup(cls, input_word, lookup_key='form', **kwargs):
    """Query word forms for `input_word` and return headword query dicts.

    Hebrew input is stripped of cantillation; purely consonantal input is
    matched against 'c_form'. A 'lookup_ref' kwarg restricts the query, with a
    fallback to an unrestricted query when that yields nothing.
    """
    from sefaria.utils.hebrew import is_hebrew, strip_cantillation, has_cantillation
    from sefaria.model import Ref
    lookup_ref = kwargs.get("lookup_ref", None)
    form_key = lookup_key
    if is_hebrew(input_word):
        input_word = strip_cantillation(input_word)
        if not has_cantillation(input_word, detect_vowels=True):
            form_key = 'c_form'
    query = {form_key: input_word}
    if lookup_ref:
        query["refs"] = {'$regex': '^{}'.format(Ref(lookup_ref).normal())}
    forms = WordFormSet(query)
    if lookup_ref and len(forms) == 0:
        # Retry without the ref restriction.
        del query["refs"]
        forms = WordFormSet(query)
    if len(forms) > 0:
        # TODO: if we want the 'lookups' in wf to be a dict we can pass as is to the lexiconentry, we need to change the key 'lexicon' to 'parent_lexicon' in word forms
        matches = []
        for form in forms:
            matches.extend({'headword': lookup['headword']} for lookup in form.lookups)
        return matches
    else:
        return []
def tokenize_words(self, base_str):
    """Normalize `base_str`: trim, strip vowels/cantillation, drop HTML, and
    remove short (<= 5 word) parentheticals that contain a known title.
    NOTE(review): no return statement is visible here — the method appears
    truncated, so `base_str` mutations are lost; confirm against the full file."""
    base_str = base_str.strip()
    base_str = strip_cantillation(base_str, strip_vowels=True)
    base_str = bleach.clean(base_str, tags=[], strip=True)
    for match in re.finditer(ur'\(.*?\)', base_str):
        # Only remove the parenthetical when it looks like a citation of a known title.
        if library.get_titles_in_string(match.group()) and len(match.group().split()) <= 5:
            base_str = base_str.replace(match.group(), u"")
def tokenize_words(self, base_str):
    """Normalize `base_str`: trim, strip vowels/cantillation, drop HTML, and
    remove short (<= 5 word) parentheticals containing a known title.
    NOTE(review): no return statement is visible — likely truncated; the
    normalized string is otherwise discarded."""
    base_str = base_str.strip()
    base_str = strip_cantillation(base_str, strip_vowels=True)
    base_str = bleach.clean(base_str, tags=[], strip=True)
    for match in re.finditer(ur'\(.*?\)', base_str):
        # Citation-looking parentheticals only: contains a known title and is short.
        if library.get_titles_in_string(
                match.group()) and len(match.group().split()) <= 5:
            base_str = base_str.replace(match.group(), u"")
def tokenizer(base_str, clean=False):
    """Split `base_str` on whitespace into a word list.

    When `clean` is True, first normalize the text: replace paseq with a
    placeholder, strip HTML and cantillation (vowels kept), and pad maqaf with
    ' *־* ' so it tokenizes separately."""
    base_str = base_str.strip()
    if clean:
        # Presumably a placeholder protecting paseq through cleaning — TODO confirm.
        base_str = base_str.replace(u"׀", u"$$$")
        base_str = bleach.clean(base_str, tags=[], strip=True)
        base_str = strip_cantillation(base_str, strip_vowels=False)
        # Isolate maqaf as its own (marked) token.
        base_str = re.sub(ur'־', u' *־* ', base_str)
    word_list = re.split(ur"\s+", base_str)
    return word_list
def make_text_index_document(cls, tref, heTref, version, lang, version_priority, content, categories): """ Create a document for indexing from the text specified by ref/version/lang """ # Don't bother indexing if there's no content if not content: return False content_wo_cant = strip_cantillation(content, strip_vowels=False).strip() content_wo_cant = re.sub(r'<[^>]+>', '', content_wo_cant) content_wo_cant = re.sub(r'\([^)]+\)', '', content_wo_cant) # remove all parens if len(content_wo_cant) == 0: return False oref = Ref(tref) toc_tree = library.get_toc_tree() cats = oref.index.categories indexed_categories = categories # the default # get the full path of every cat along the way. # starting w/ the longest, # check if they're root swapped. paths = [cats[:i] for i in range(len(cats), 0, -1)] for path in paths: cnode = toc_tree.lookup(path) if getattr(cnode, "searchRoot", None) is not None: # Use the specified searchRoot, with the rest of the category path appended. indexed_categories = [cnode.searchRoot] + cats[len(path) - 1:] break tp = cls.best_time_period if tp is not None: comp_start_date = int(tp.start) else: comp_start_date = 3000 # far in the future ref_data = RefData().load({"ref": tref}) pagesheetrank = ref_data.pagesheetrank if ref_data is not None else RefData.DEFAULT_PAGESHEETRANK return { "ref": tref, "heRef": heTref, "version": version, "lang": lang, "version_priority": version_priority if version_priority is not None else 1000, "titleVariants": oref.index_node.all_tree_titles("en"), "categories": indexed_categories, "order": oref.order_id(), "path": "/".join(indexed_categories + [cls.curr_index.title]), "pagesheetrank": pagesheetrank, "comp_date": comp_start_date, #"hebmorph_semi_exact": content_wo_cant, "exact": content_wo_cant, "naive_lemmatizer": content_wo_cant, }
def clean(self, s):
    """Normalize a Hebrew string for matching.

    NFD-normalizes, strips vowels/cantillation, canonicalizes the common
    divine-name abbreviation (ה' / ה׳) to the tetragrammaton, removes
    punctuation and parentheticals, strips HTML, and collapses whitespace.
    """
    s = unicodedata.normalize("NFD", s)
    s = strip_cantillation(s, strip_vowels=True)
    # BUG FIX: the replacement was the non-raw string u"\1…\2", in which \1 and
    # \2 are the control characters \x01/\x02 — NOT backreferences. Raw strings
    # keep them as group references (re also interprets \uXXXX inside patterns).
    s = re.sub(r"(^|\s)(?:\u05d4['\u05f3])($|\s)", r"\1יהוה\2", s)
    s = re.sub(r"[,'\":?.!;־״׳]", u" ", s)
    s = re.sub(r"\([^)]+\)", u" ", s)
    # s = re.sub(ur"\((?:\d{1,3}|[\u05d0-\u05ea]{1,3})\)", u" ", s) # sefaria automatically adds pasuk markers. remove them
    s = bleach.clean(s, strip=True, tags=()).strip()
    s = u" ".join(s.split())
    return s
def base_tokenizer(str):
    """Tokenize a Hebrew string: drop parentheticals, normalize double
    apostrophes to gershayim, remove HTML tags and vowels/cantillation, split
    on whitespace, and strip trailing punctuation / empty tokens.
    NOTE(review): the parameter shadows the builtin `str`."""
    punc_pat = re.compile(ur"(\.|,|:|;)$")
    str = re.sub(ur"\([^\(\)]+\)", u"", str)
    str = re.sub(ur"''",ur'"',str) # looks like double apostrophe in shulchan arukh is meant to be a quote
    str = re.sub(r"</?[a-z]+>", "", str)  # get rid of html tags
    str = hebrew.strip_cantillation(str, strip_vowels=True)
    word_list = re.split(ur"\s+", str)
    word_list = [re.sub(punc_pat,u"",w).strip() for w in word_list if len(re.sub(punc_pat,u"",w).strip()) > 0]  # remove empty strings and punctuation at the end of a word
    return word_list
def extract_steinsaltz_possibilities(base_text, stein_text, word_punct_pairs):
    """
    strip off intro
    any punct found in bold is extracted separately
    any punct at end of bold or in commentary is combined and any subset is possible (accounting for order that punct appeared)

    Walks the Talmud/Steinsaltz alignment maps and, for each aligned Talmud
    word, accumulates punctuation possibilities and quote/dash flags into the
    corresponding entry of `word_punct_pairs`. Returns `word_punct_pairs`.
    """
    # algorithm works better when bold tags are consolidated
    stein_text = re.sub(r'</b>(\s*)<b>', r'\g<1>', stein_text)
    maps = build_maps(base_text, stein_text)
    talmud_word_index = 0
    for ts_map in maps.suite:
        # Skip map entries with no actual Talmud text (intros / connectors).
        if not ts_map.actually_has_talmud() and (
                isinstance(ts_map.talmud_steinsaltz, SteinsaltzIntro)
                or isinstance(ts_map.talmud_steinsaltz, ConnectedTalmud)):
            continue
        talmud_words = split_by_type(ts_map.reg_talmud, 'words')
        talmud_poss_dict = get_talmud_punct_possibilities(
            ts_map.talmud_steinsaltz.talmud)
        # all punct in talmud portion of stein can theoretically be on any word in talmud (since we don't have a word-to-word mapping)
        for i, (tw1, pair_dict) in enumerate(
                zip(
                    talmud_words,
                    word_punct_pairs[talmud_word_index:talmud_word_index + len(talmud_words)])):
            tw2 = pair_dict['Word']
            # Sanity check: the aligned pair must be the same word (modulo vowels).
            assert tw1 == strip_cantillation(
                tw2, strip_vowels=True), f"{tw1}--{tw2}"
            word_punct_pairs[talmud_word_index + i]['Punct Possibilities'] += talmud_poss_dict['Punct Possibilities']
            word_punct_pairs[talmud_word_index + i]['Pre-quote?'] |= talmud_poss_dict['Pre-quote?']
            word_punct_pairs[talmud_word_index + i]['Post-quote?'] |= talmud_poss_dict['Post-quote?']
            word_punct_pairs[talmud_word_index + i]['Dash?'] |= talmud_poss_dict['Dash?']
        # last word can have punctuation on it + any combo of punctuation in stein
        stein_poss_dict = get_talmud_punct_possibilities(
            ts_map.talmud_steinsaltz.steinsaltz)
        # NOTE(review): `stein_poss_dict` is computed but never used — the
        # last-word updates below reuse `talmud_poss_dict`. Possibly a bug;
        # confirm whether these should read from `stein_poss_dict`.
        word_punct_pairs[talmud_word_index + len(talmud_words) - 1]['Punct Possibilities'] += talmud_poss_dict['Punct Possibilities']
        word_punct_pairs[talmud_word_index + len(talmud_words) - 1]['Pre-quote?'] |= talmud_poss_dict['Pre-quote?']
        word_punct_pairs[talmud_word_index + len(talmud_words) - 1]['Post-quote?'] |= talmud_poss_dict['Post-quote?']
        word_punct_pairs[talmud_word_index + len(talmud_words) - 1]['Dash?'] |= talmud_poss_dict['Dash?']
        talmud_word_index += len(talmud_words)
    return word_punct_pairs
def base_tokenizer(str):
    """Tokenize a Hebrew string: drop parentheticals and HTML tags, strip
    vowels/cantillation, split on whitespace, and remove trailing punctuation
    and empty tokens.
    NOTE(review): the parameter shadows the builtin `str`."""
    punc_pat = re.compile(ur"(\.|,|:)$")
    str = re.sub(ur"\([^\(\)]+\)", u"", str)
    str = re.sub(r"</?[a-z]+>", "", str)  # get rid of html tags
    str = hebrew.strip_cantillation(str, strip_vowels=True)
    word_list = re.split(ur"\s+", str)
    word_list = [
        re.sub(punc_pat, u"", w).strip() for w in word_list
        if len(re.sub(punc_pat, u"", w).strip()) > 0
    ]  # remove empty strings and punctuation at the end of a word
    return word_list
def lookup_shoresh(w, ref):
    """Return the "BDB Augmented Strong" headwords for word `w` at `ref`,
    or None when no matching WordForm is found."""
    # in both - cant
    # only second - cant
    # only first - nikud
    #remove all non-Hebrew non-nikud characters (including cantillation and sof-pasuk)
    w = strip_cantillation(w, strip_vowels=False)
    w = re.sub(ur"[A-Za-z׃׀־]", u"", w)
    lexicon = "BDB Augmented Strong"
    # Anchor the ref regex so only an exact ref match qualifies.
    wf = WordForm().load({"form": w, "refs": re.compile("^" + ref + "$")})
    if wf:
        # Only lookups from the chosen lexicon; returns their headwords.
        return map(lambda x: x["headword"], filter(lambda x: x["lexicon"] == lexicon, wf.lookups))
def body(self):
    """Browser test: load Job 1, verify the Hebrew/English text of the first
    section, then cycle the language setting (Hebrew → English → bilingual)
    and assert the displayed content matches each mode."""
    self.load_ref("Job 1")
    expected_heb = 'אִ֛ישׁ הָיָ֥ה בְאֶֽרֶץ־ע֖וּץ אִיּ֣וֹב שְׁמ֑וֹ וְהָיָ֣ה ׀ הָאִ֣ישׁ הַה֗וּא תָּ֧ם וְיָשָׁ֛ר וִירֵ֥א אֱלֹהִ֖ים וְסָ֥ר מֵרָֽע׃'
    expected_eng_closed = 'There was a man in the land of Uz named Job. That man was blameless and upright; he feared God and shunned evil.'
    expected_eng_open = 'THERE was a man in the land of Uz, whose name was Job; and that man was whole-hearted and upright, and one that feared God, and shunned evil.'
    sgmnt_eng = self.get_nth_section_english(1)
    sgmnt_heb = self.get_nth_section_hebrew(1)
    str_eng = sgmnt_eng.text.strip()
    str_heb = sgmnt_heb.text.strip()
    # not sure why, but he strings aren't equal unless vowels are stripped
    expected_heb_stripped = strip_cantillation(expected_heb, strip_vowels=True)
    str_heb_stripped = strip_cantillation(str_heb, strip_vowels=True)
    assert expected_heb_stripped == str_heb_stripped, "'{}' does not equal '{}'".format(
        expected_heb_stripped, str_heb_stripped)
    # The English may come from either an open- or closed-license version.
    assert str_eng in [expected_eng_open, expected_eng_closed
                       ], "'{}' does not equal '{}' or '{}'".format(
                           str_eng, expected_eng_closed, expected_eng_open)
    # Hebrew-only mode.
    self.toggle_on_text_settings()
    self.toggle_language_hebrew()
    assert 'hebrew' in self.get_content_language()
    assert 'english' not in self.get_content_language()
    assert 'bilingual' not in self.get_content_language()
    assert self.has_hebrew_text() == True
    assert self.has_english_text() == False
    # English-only mode.
    self.toggle_on_text_settings()
    self.toggle_language_english()
    assert 'hebrew' not in self.get_content_language()
    assert 'english' in self.get_content_language()
    assert 'bilingual' not in self.get_content_language()
    assert self.has_hebrew_text() == False
    assert self.has_english_text() == True
    # Bilingual mode.
    self.toggle_on_text_settings()
    self.toggle_language_bilingual()
    assert 'hebrew' not in self.get_content_language()
    assert 'english' not in self.get_content_language()
    assert 'bilingual' in self.get_content_language()
    assert self.has_hebrew_text() == True
    assert self.has_english_text() == True
    # NOTE(review): trailing call's result is unused — possibly leftover debugging.
    self.get_content_language()
def make_text_index_document(cls, tref, heTref, version, lang, version_priority, content, categories):
    """
    Create a document for indexing from the text specified by ref/version/lang

    :return: dict of index fields, or False when the segment should be skipped.
    """
    # Don't bother indexing if there's no content. Checked before constructing
    # Ref/TextFamily (the original built them first and wasted the work).
    if not content:
        return False
    oref = Ref(tref)
    text = TextFamily(oref, context=0, commentary=False, version=version, lang=lang).contents()
    content_wo_cant = strip_cantillation(content, strip_vowels=False).strip()
    content_wo_cant = re.sub(r'<[^>]+>', '', content_wo_cant)
    content_wo_cant = re.sub(r'\([^)]+\)', '', content_wo_cant)  # remove all parens
    if len(content_wo_cant) == 0:
        return False
    if getattr(cls.curr_index, "dependence", None) == 'Commentary' and "Commentary" in text["categories"]:  # uch, special casing
        temp_categories = text["categories"][:]
        temp_categories.remove('Commentary')
        temp_categories[0] += " Commentaries"  # this will create an additional bucket for each top level category's commentary
    else:
        temp_categories = categories
    tp = cls.best_time_period
    if tp is not None:
        comp_start_date = int(tp.start)
    else:
        comp_start_date = 3000  # far in the future
    # section_ref = tref[:tref.rfind(u":")] if u":" in tref else (tref[:re.search(ur" \d+$", tref).start()] if re.search(ur" \d+$", tref) is not None else tref)
    ref_data = RefData().load({"ref": tref})
    pagesheetrank = ref_data.pagesheetrank if ref_data is not None else RefData.DEFAULT_PAGERANK * RefData.DEFAULT_SHEETRANK
    return {
        "ref": tref,
        "heRef": heTref,
        "version": version,
        "lang": lang,
        "version_priority": version_priority if version_priority is not None else 1000,
        "titleVariants": text["titleVariants"],
        "categories": temp_categories,
        "order": oref.order_id(),
        "path": "/".join(temp_categories + [cls.curr_index.title]),
        "pagesheetrank": pagesheetrank,
        "comp_date": comp_start_date,
        #"hebmorph_semi_exact": content_wo_cant,
        "content": content_wo_cant if cls.merged else "",  # backwards compat for android
        "exact": content_wo_cant,
        "naive_lemmatizer": content_wo_cant,
    }
def make_text_index_document(cls, tref, heTref, version, lang, version_priority, content, categories):
    """
    Create a document for indexing from the text specified by ref/version/lang

    :return: dict of index fields, or False when the segment should be skipped.
    """
    oref = Ref(tref)
    text = TextFamily(oref, context=0, commentary=False, version=version, lang=lang).contents()
    if not content:
        # Don't bother indexing if there's no content
        return False
    # Keep vowels but drop cantillation, then strip markup and parentheticals.
    content_wo_cant = strip_cantillation(content, strip_vowels=False).strip()
    content_wo_cant = re.sub(ur'<[^>]+>', u'', content_wo_cant)
    content_wo_cant = re.sub(ur'\([^)]+\)', u'', content_wo_cant)  # remove all parens
    if len(content_wo_cant) == 0:
        return False
    if getattr(cls.curr_index, "dependence", None) == 'Commentary' and "Commentary" in text["categories"]:  # uch, special casing
        temp_categories = text["categories"][:]
        temp_categories.remove('Commentary')
        temp_categories[0] += " Commentaries"  # this will create an additional bucket for each top level category's commentary
    else:
        temp_categories = categories
    tp = cls.best_time_period
    if not tp is None:
        comp_start_date = int(tp.start)
    else:
        comp_start_date = 3000  # far in the future
    # section_ref = tref[:tref.rfind(u":")] if u":" in tref else (tref[:re.search(ur" \d+$", tref).start()] if re.search(ur" \d+$", tref) is not None else tref)
    ref_data = RefData().load({"ref": tref})
    pagesheetrank = ref_data.pagesheetrank if ref_data is not None else RefData.DEFAULT_PAGERANK * RefData.DEFAULT_SHEETRANK
    return {
        "ref": tref,
        "heRef": heTref,
        "version": version,
        "lang": lang,
        "version_priority": version_priority if version_priority is not None else 1000,
        "titleVariants": text["titleVariants"],
        "categories": temp_categories,
        "order": oref.order_id(),
        "path": "/".join(temp_categories + [cls.curr_index.title]),
        "pagesheetrank": pagesheetrank,
        "comp_date": comp_start_date,
        #"hebmorph_semi_exact": content_wo_cant,
        "content": content_wo_cant if cls.merged else u"",  # backwards compat for android
        "exact": content_wo_cant,
        "naive_lemmatizer": content_wo_cant,
    }
def lookup_shoresh(w, ref):
    """Return the "BDB Augmented Strong" headwords for word `w` at `ref`.
    Returns None when the WordForm load fails or nothing matches."""
    # in both - cant
    # only second - cant
    # only first - nikud
    #remove all non-Hebrew non-nikud characters (including cantillation and sof-pasuk)
    w = strip_cantillation(w, strip_vowels=False)
    w = re.sub(ur"[A-Za-z׃׀־]", u"", w)
    lexicon = "BDB Augmented Strong"
    try:
        wf = WordForm().load({"form": w, "refs": re.compile("^" + ref + "$")})
    except Exception:
        # NOTE(review): broad best-effort catch — any load failure is treated
        # the same as "no match". Confirm a narrower exception isn't available.
        return None
    if wf:
        return map(lambda x: x["headword"], filter(lambda x: x["lexicon"] == lexicon, wf.lookups))
def get_snippet_by_seg_ref(source, found):
    """
    based off of library.get_wrapped_refs_string
    :param source: segment whose Hebrew text cites `found`
    :param found: the Ref being looked for inside `source`
    :return: list of ~200-char snippets surrounding each citation of `found`;
        falls back to the whole source text when no snippet was extracted
    """
    found_title = found.index.get_title("he")
    found_node = library.get_schema_node(found_title, "he")
    title_nodes = {t: found_node for t in found.index.all_titles("he")}
    all_reg = library.get_multi_title_regex_string(
        set(found.index.all_titles("he")), "he")
    reg = regex.compile(all_reg, regex.VERBOSE)
    source_text = strip_cantillation(source.text("he").text, strip_vowels=True)
    linkified = library._wrap_all_refs_in_string(title_nodes, reg, source_text, "he")
    snippets = []
    for match in re.finditer(u"(<a [^>]+>)([^<]+)(</a>)", linkified):
        ref = Ref(match.group(2))
        # Only citations of the sought segment (or its section) count.
        if ref.normal() == found.section_ref().normal() or ref.normal(
        ) == found.normal():
            # Expand ~100 chars to each side, snapping outward to a word boundary.
            start_snip_naive = match.start(2) - 100 if match.start(
                0) >= 100 else 0
            start_snip = linkified.rfind(u" ", 0, start_snip_naive)
            if start_snip == -1:
                start_snip = start_snip_naive
            end_snip_naive = match.end(2) + 100 if match.end(0) + 100 <= len(
                linkified) else len(linkified)
            end_snip = linkified.find(u" ", end_snip_naive)
            if end_snip == -1:
                end_snip = end_snip_naive
            snippets += [
                bleach.clean(linkified[start_snip:end_snip], tags=[], strip=True)
            ]
    if len(snippets) == 0:
        print "zero"
        print found
        # NOTE(review): this recomputation's result is unused — it appears to
        # exist only as a debugging hook before the fallback below.
        linkified = library._wrap_all_refs_in_string(title_nodes, reg, source_text, "he")
    if len(snippets) == 0:
        return [source_text]
    return snippets
def make_text_index_document(tref, version, lang):
    """
    Create a document for indexing from the text specified by ref/version/lang

    :return: dict of search-index fields, or False when there is no content.
    """
    oref = Ref(tref)
    text = TextFamily(oref, context=0, commentary=False, version=version, lang=lang).contents()
    content = text["he"] if lang == 'he' else text["text"]
    if not content:
        # Don't bother indexing if there's no content
        return False
    if isinstance(content, list):
        content = " ".join(content)
    content = bleach.clean(content, strip=True, tags=())
    content = strip_cantillation(content, strip_vowels=True)
    index = oref.index
    tp = index.best_time_period()
    if tp is not None:
        comp_start_date = int(tp.start)
    else:
        comp_start_date = 3000
    # Hoisted: the original recomputed oref.normal() five times and performed
    # a contains-check plus lookup on pagerank_dict for each rank field.
    nref = oref.normal()
    pr = pagerank_dict.get(nref)
    return {
        "ref": nref,
        "ref_order": oref.order_id(),
        "comp_date_int": comp_date_curve(comp_start_date),
        "pagerank": math.log(pr) + 20 if pr is not None else 1.0,
        "pagerank-original": pr if pr is not None else 1E-8,
        "version": version,
        "lang": lang,
        "hebmorph-standard": content,
        "hebmorph-exact": content,
        "hebmorph-standard-no-norm": content,
        "hebmorph-exact-no-norm": content,
        "ngram": content,
        "infreq": content,
        "aggresive-ngram": content,
        "naive-lemmatizer": content,
        "comp-date": comp_start_date,
        "original": content
    }
def clean(s):
    """Normalize a Hebrew string: NFD-normalize, strip vowels/cantillation,
    canonicalize common divine-name abbreviations to the tetragrammaton, and
    remove punctuation, parentheticals and HTML. Empty input passes through."""
    if len(s) == 0:
        return s
    s = unicodedata.normalize("NFD", s)
    s = strip_cantillation(s, strip_vowels=True)
    # please forgive me...
    # replace common hashem replacements with the tetragrammaton
    # (ur-string: \1/\2 stay as backreferences while \uXXXX is still decoded)
    s = re.sub(ur"(^|\s)([\u05de\u05e9\u05d5\u05db\u05dc\u05d1]?)(?:\u05d4['\u05f3]|\u05d9\u05d9)($|\s)", ur"\1\2\u05d9\u05d4\u05d5\u05d4\3", s)
    s = re.sub(ur"[,'\":?!;־״׳]", u" ", s)  # purposefully leave out period so we can replace ... later on
    s = re.sub(ur"\([^)]+\)", u" ", s)
    s = re.sub(ur"<[^>]+>", u"", s)
    s = u" ".join(s.split())
    return s
def lookup_shoresh(w, ref):
    """Return the "BDB Augmented Strong" headwords for word `w` at `ref`.

    :param w: the word; cantillation (not vowels) and non-Hebrew chars are stripped.
    :param ref: normalized ref string; matched exactly against WordForm refs.
    :return: list of headwords, or None when the load fails or nothing matches.
    """
    # in both - cant; only second - cant; only first - nikud
    # remove all non-Hebrew non-nikud characters (including cantillation and sof-pasuk)
    w = strip_cantillation(w, strip_vowels=False)
    w = re.sub(r"[A-Za-z׃׀־]", "", w)
    lexicon = "BDB Augmented Strong"
    try:
        wf = WordForm().load({"form": w, "refs": re.compile("^" + ref + "$")})
    except Exception:
        # Deliberate best-effort: any load failure behaves like "no match".
        return None
    if wf:
        # Flattened: the original used a needless nested list comprehension.
        return [x["headword"] for x in wf.lookups if x["lexicon"] == lexicon]
def make_mishnaic_training_context():
    """Build phrase-level 'mishnaic' training data from every Mishnah (and
    Torah) section, splitting each section's vowel-stripped text on '. '.
    Prints corpus stats and returns a list of {'language', 'phrase'} dicts."""
    training = []
    mishnah_indexes = [library.get_index(ind) for ind in library.get_indexes_in_category("Mishnah")]
    # NOTE(review): Torah sections are also labeled 'mishnaic' here — confirm intended.
    mishnah_indexes += [library.get_index(ind) for ind in library.get_indexes_in_category("Torah")]
    for ind in mishnah_indexes:
        mishna_segs = ind.all_section_refs()
        for seg in mishna_segs:
            first_sec_str = hebrew.strip_cantillation(seg.text('he').as_string(), strip_vowels=True)
            training += [{'language':'mishnaic', 'phrase': util.tokenize_words(p)} for p in first_sec_str.split(u'. ')]
    total_words = 0
    total_phrases = len(training)
    for p in training:
        total_words += len(p['phrase'])
    print 'NUM PHRASES: {} AVG WORDS PER PHRASE: {}'.format(total_phrases,total_words/total_phrases)
    return training
def get_word_form_objects(cls, input_word, lookup_key='form', **kwargs):
    """Return the WordFormSet matching `input_word`.

    Hebrew input is stripped of cantillation; fully consonantal input is
    queried against 'c_form'. A 'lookup_ref' kwarg restricts the query, with
    a fallback to an unrestricted query when the restricted one is empty.
    """
    from sefaria.utils.hebrew import is_hebrew, strip_cantillation, has_cantillation
    from sefaria.model import Ref
    ref_filter = kwargs.get("lookup_ref", None)
    primary_key = lookup_key
    if is_hebrew(input_word):
        input_word = strip_cantillation(input_word)
        if not has_cantillation(input_word, detect_vowels=True):
            primary_key = 'c_form'
    query = {primary_key: input_word}
    if ref_filter:
        query["refs"] = {'$regex': '^{}'.format(Ref(ref_filter).normal())}
    forms = WordFormSet(query)
    # Retry without the ref restriction when it produced no hits.
    if ref_filter and len(forms) == 0:
        del query["refs"]
        forms = WordFormSet(query)
    return forms
def make_mishnaic_training_context():
    """Build phrase-level 'mishnaic' training data from every Mishnah (and
    Torah) section, splitting each section's vowel-stripped text on '. '.
    Prints corpus stats and returns a list of {'language', 'phrase'} dicts."""
    training = []
    mishnah_indexes = [library.get_index(ind) for ind in library.get_indexes_in_category("Mishnah")]
    # NOTE(review): Torah sections are also labeled 'mishnaic' here — confirm intended.
    mishnah_indexes += [library.get_index(ind) for ind in library.get_indexes_in_category("Torah")]
    for ind in mishnah_indexes:
        mishna_segs = ind.all_section_refs()
        for seg in mishna_segs:
            first_sec_str = hebrew.strip_cantillation(seg.text('he').as_string(), strip_vowels=True)
            training += [{'language':'mishnaic', 'phrase':tokenize_words(p)} for p in first_sec_str.split(u'. ')]
    total_words = 0
    total_phrases = len(training)
    for p in training:
        total_words += len(p['phrase'])
    print 'NUM PHRASES: {} AVG WORDS PER PHRASE: {}'.format(total_phrases,total_words/total_phrases)
    return training
def get_word_form_objects(cls, input_word, lookup_key='form', **kwargs):
    """Fetch WordFormSet candidates for `input_word`, preferring a
    ref-restricted query and falling back to an unrestricted one."""
    from sefaria.model import Ref
    ref_str = kwargs.get("lookup_ref", None)
    key = lookup_key
    if is_hebrew(input_word):
        # This step technically used to happen in the lookup main method
        # `lexicon_lookup` if there were no initial results, but in case where
        # a consonantal form was supplied in the first place, this optimizes queries.
        input_word = strip_cantillation(input_word)
        if not has_cantillation(input_word, detect_vowels=True):
            key = 'c_form'
    query = {key: input_word}
    if ref_str:
        query["refs"] = {'$regex': '^{}'.format(Ref(ref_str).normal())}
    candidates = WordFormSet(query)
    if ref_str and len(candidates) == 0:
        # Nothing within the ref — retry globally.
        del query["refs"]
        candidates = WordFormSet(query)
    return candidates
def lexicon_lookup(cls, input_str, **kwargs):
    """Look up `input_str` in the lexicon.

    Tries an exact form lookup, then a consonantal lookup, then an n-gram
    lookup, governed by the 'always_consonants' / 'never_split' /
    'always_split' kwargs.

    :return: LexiconEntrySet for all matches, or None when nothing was found.
    """
    input_str = unicodedata.normalize("NFC", input_str)
    results = cls._single_lookup(input_str, **kwargs)
    if not results or kwargs.get('always_consonants', False):
        results += cls._single_lookup(strip_cantillation(input_str, True), lookup_key='c_form', **kwargs)
    if not kwargs.get('never_split', None) and (not results or kwargs.get("always_split", None)):
        results += cls._ngram_lookup(input_str, **kwargs)
    if results:
        primary_tuples = set()
        # (Removed an unused `query = set()` local from the original.)
        # TODO: optimize number of word form lookups? there can be a lot of duplicates... is it needed?
        for r in results:
            # extract the lookups with "primary" field so it can be used for sorting lookup in the LexicinEntrySet,
            # but also delete it, because its not part of the query obj
            if "primary" in r:
                if r["primary"] is True:
                    primary_tuples.add((r["headword"], r["parent_lexicon"]))
                del r["primary"]
        return LexiconEntrySet({"$or": results}, primary_tuples=primary_tuples)
    else:
        return None
def make_text_index_document(tref, version, lang):
    # Builds a search-index document for one segment; returns False when the
    # segment should be skipped.
    # NOTE(review): this block appears truncated — no final return of the
    # document is visible after the parenthesis check.
    from sefaria.utils.hebrew import strip_cantillation
    """
    Create a document for indexing from the text specified by ref/version/lang
    """
    oref = Ref(tref)
    text = TextFamily(oref, context=0, commentary=False, version=version, lang=lang).contents()
    content = text["he"] if lang == 'he' else text["text"]
    if not content:
        # Don't bother indexing if there's no content
        return False
    if isinstance(content, list):
        content = flatten_list(content)  # deal with mutli-dimensional lists as well
        content = " ".join(content)
    content = bleach.clean(content, strip=True, tags=())
    content_wo_cant = strip_cantillation(content, strip_vowels=False)
    if re.match(ur'^\s*[\(\[].+[\)\]]\s*$', content):
        return False  #don't bother indexing. this segment is surrounded by parens
def make_mishnaic_training():
    """Build word-level 'mishnaic' training data from Mishnah (and Torah)
    texts, randomly skipping ~half of already-seen words to limit duplicates.
    Returns a list of {'word', 'tag'} dicts."""
    training = []
    num_mishnah_per_mesechta = 30000  # effectively all mishnah
    mishnah_indexes = [library.get_index(ind) for ind in library.get_indexes_in_category("Mishnah")]
    mishnah_indexes += [library.get_index(ind) for ind in library.get_indexes_in_category("Torah")]
    mish_set = set()
    num_removed = 0
    for ind in mishnah_indexes:
        mishna_segs = ind.all_section_refs()
        if len(mishna_segs) >= num_mishnah_per_mesechta:
            mishna_segs = mishna_segs[:num_mishnah_per_mesechta]
        for seg in mishna_segs:
            first_sec_str = hebrew.strip_cantillation(seg.text('he').as_string(), strip_vowels=True)
            word_list = tokenize_words(first_sec_str)
            for word in word_list:
                # Randomly drop ~50% of repeat occurrences to keep the set diverse.
                if random.random() > 0.5 and word in mish_set:
                    num_removed += 1
                    continue
                training.append({'word':word,'tag':'mishnaic'})
                mish_set.add(word)
    print "Num Mishna removed: {}".format(num_removed)
    return training
def tokenize_words_for_tfidf(text, stopwords):
    """Tokenize Hebrew `text` for TF-IDF.

    Strips itags/HTML/cantillation, removes short parentheticals and k'ri
    brackets, normalizes divine-name abbreviations, keeps only Hebrew letters,
    drops a leading vav and any word in `stopwords`.

    :return: list of normalized word tokens (possibly empty).
    """
    from sefaria.utils.hebrew import strip_cantillation
    try:
        text = TextChunk._strip_itags(text)
    except AttributeError:
        # NOTE(review): a missing TextChunk name would raise NameError, which
        # this does not catch — confirm TextChunk is always importable here.
        pass
    text = strip_cantillation(text, strip_vowels=True)
    text = re.sub(r'<[^>]+>', ' ', text)
    for match in re.finditer(r'\(.*?\)', text):
        if len(match.group().split()) <= 5:
            text = text.replace(match.group(), " ")
    text = re.sub(r'־', ' ', text)
    text = re.sub(
        r'\[[^\[\]]{1,7}\]', '', text
    )  # remove kri but dont remove too much to avoid messing with brackets in talmud
    text = re.sub(r'[A-Za-z.,"?!״:׃]', '', text)
    # replace common hashem replacements with the tetragrammaton
    # BUG FIX: these two patterns were non-raw strings, leaving "\s" as an
    # invalid escape (a SyntaxWarning on modern Python). Raw strings fix that;
    # `re` itself interprets the \uXXXX escapes inside the pattern.
    text = re.sub(
        r"(^|\s)([\u05de\u05e9\u05d5\u05db\u05dc\u05d1]?)(?:\u05d4['\u05f3]|\u05d9\u05d9)($|\s)",
        "\\1\\2\u05d9\u05d4\u05d5\u05d4\\3", text)
    # replace common elokim replacement with elokim
    text = re.sub(
        r"(^|\s)([\u05de\u05e9\u05d5\u05db\u05dc\u05d1]?)(?:\u05d0\u05dc\u05e7\u05d9\u05dd)($|\s)",
        "\\1\\2\u05d0\u05dc\u05d4\u05d9\u05dd\\3", text)
    words = []
    if len(text) != 0:
        # text = requests.post('https://prefix.dicta.org.il/api', data=json.dumps({'data': text})).text
        # text = re.sub(r'(?<=\s|"|\(|\[|-)[\u05d0-\u05ea]+\|', '', ' ' + text)  # remove prefixes
        text = re.sub('[^\u05d0-\u05ea"]', ' ', text)
        # Strip gershayim and a leading vav, then drop stopwords.
        stripped = (re.sub('^\u05d5', '', w.replace('"', '')) for w in text.split())
        words = [w for w in stripped if w not in stopwords]
    return words
def make_mishnaic_training():
    """Build word-level 'mishnaic' training data from Mishnah (and Torah)
    texts, randomly skipping ~55% of already-seen words to limit duplicates.
    Returns a list of {'word', 'tag'} dicts."""
    training = []
    num_mishnah_per_mesechta = 30000  # effectively all mishnah
    mishnah_indexes = [library.get_index(ind) for ind in library.get_indexes_in_category("Mishnah")]
    mishnah_indexes += [library.get_index(ind) for ind in library.get_indexes_in_category("Torah")]
    mish_set = set()
    num_removed = 0
    for ind in mishnah_indexes:
        mishna_segs = ind.all_section_refs()
        if len(mishna_segs) >= num_mishnah_per_mesechta:
            mishna_segs = mishna_segs[:num_mishnah_per_mesechta]
        for seg in mishna_segs:
            first_sec_str = hebrew.strip_cantillation(seg.text('he').as_string(), strip_vowels=True)
            word_list = util.tokenize_words(first_sec_str)
            for word in word_list:
                # Randomly drop repeat occurrences (threshold 0.45) to keep the set diverse.
                if random.random() > 0.45 and word in mish_set:
                    num_removed += 1
                    continue
                training.append({'word':word,'tag':'mishnaic'})
                mish_set.add(word)
    print "Num Mishna removed: {}".format(num_removed)
    return training
def tag_testing_naive(text_name, bib_links, seg_len_list, word_list_in, ref_list, test_set_name="test"):
    """Naively tag each word of the given dafs as biblical / mishnaic /
    talmudic / unknown and write one JSON test-set file per daf.

    Biblical spans come from matching Tanakh links; mishna/talmud state flips
    on <big><strong>...</strong></big> header markers; talmudic words reuse
    pre-tagged CAL data when available."""
    cal_dh_root = "../../dibur_hamatchil/dh_source_scripts/cal_matcher_output"
    jba_count = 0
    curr_state = ""  # state should be retained, even b/w dafs
    #caldb_words = json.load(codecs.open("caldb_words_{}.json".format(text_name), "r", encoding="utf-8"))
    for iref, ref in enumerate(ref_list):
        curr_seg_len_list = seg_len_list[iref]
        curr_bib_links = bib_links[iref]
        curr_word_list_in = word_list_in[iref]
        daf = ref.__str__().replace("{} ".format(text_name), "").encode('utf8')
        try:
            cal_pre_tagged_words = \
                json.load(codecs.open("{}/{}/lang_naive_talmud/lang_naive_talmud_{}.json".format(cal_dh_root, text_name, daf), "r", encoding="utf8"))
        except IOError:
            # No pre-tagged CAL file for this daf; tag without it.
            cal_pre_tagged_words = None
        jbaforms = json.load(codecs.open("JBAHashtable.json", "rb", encoding='utf8'))
        word_list_out = []
        count = 0
        main_i = 0
        while main_i < len(curr_seg_len_list):
            seg_len = curr_seg_len_list[main_i]
            bib_linkset = curr_bib_links[main_i]
            seg = curr_word_list_in[count:count+seg_len]
            count += seg_len
            b_start = -1; b_end = -1
            if len(bib_linkset) > 0:
                for bib_link in bib_linkset:
                    #there is an assumption here that the links to Tanakh are always 1
                    try:
                        bib_seg = tokenize_words(hebrew.strip_cantillation(Ref(bib_link.refs[1]).text('he').as_string(), strip_vowels=True), strip_html=True)
                        b_start, b_end = match_segments(seg, bib_seg)
                    except InputError:
                        continue
            for i, word in enumerate(seg):
                # A bold header marker toggles mishnaic <-> talmudic state.
                state_switch_pat = re.compile(r"\<big\>\<strong\>[^\<\>]+\</strong\>\</big\>")
                if re.match(state_switch_pat, word):
                    if curr_state == "mishnaic":
                        curr_state = "talmudic"
                    elif curr_state == "talmudic" or curr_state == "":
                        curr_state = "mishnaic"
                cal_obj = None
                if b_start != -1 and b_end != -1 and i in xrange(b_start, b_end):
                    lang = "biblical"
                elif curr_state == "talmudic":
                    #lang = cal_pre_tagged_words[count-seg_len+i]["class"]
                    # NOTE(review): when this branch neither sets `lang` nor
                    # `cal_obj` (e.g. cal file present but class != handled),
                    # `lang` may be stale/undefined below — confirm.
                    if not cal_pre_tagged_words is None:
                        try:
                            cal_obj = cal_pre_tagged_words["words"][count-seg_len+i]
                            if cal_obj["class"] == "unknown":
                                # NOTE(review): `and False` disables this JBA
                                # substitution entirely — presumably left off on purpose.
                                if word in jbaforms and len(jbaforms[word]) == 1 and False:
                                    temp_cal_obj = jbaforms[word][0].copy()
                                    if temp_cal_obj["word"][-1] != "'" and temp_cal_obj["head_word"][-1] != "_":
                                        cal_obj = temp_cal_obj
                                        cal_obj["jba_word"] = cal_obj["word"]
                                        cal_obj["word"] = word
                                        cal_obj["class"] = "talmud"
                                        jba_count += 1
                        except IndexError:
                            break
                elif curr_state == "mishnaic":
                    lang = "mishnaic"
                else:
                    lang = "unknown"
                if cal_obj:
                    word_list_out.append(cal_obj)
                else:
                    word_list_out.append({"word": word, "class": lang})
            main_i += 1
        missed_words = [] if cal_pre_tagged_words is None else cal_pre_tagged_words["missed_words"]
        doc = {"words": word_list_out, "missed_words": missed_words}
        fp = codecs.open("{}/{}/test_set/{}_naive_{}.json".format(cal_dh_root, text_name, test_set_name, daf), "w", encoding='utf-8')
        json.dump(doc, fp, indent=4, encoding='utf-8', ensure_ascii=False)
    print "NUM JBA WORDS: {}".format(jba_count)
def make_training_sets(type): if type is "biblical": tanach_indexes = [library.get_index(ind) for ind in library.get_indexes_in_category("Tanakh") if not ind in ("Daniel","Ezra","Nehemia")] tanach_dict = {} for ind in tanach_indexes: all_secs = ind.all_section_refs() for sec in all_secs: sec_str = hebrew.strip_cantillation(sec.text('he').as_string(),strip_vowels=True) word_list = tokenize_words(sec_str) for word in word_list: if word: two_letter = get_two_letter_word(word) if two_letter: temp_list = set(tanach_dict[two_letter]) if two_letter in tanach_dict else set() temp_list.add(word) tanach_dict[two_letter] = list(temp_list) fp = codecs.open("biblical_2_letters_training.json","w",encoding='utf-8') json.dump(tanach_dict, fp,indent=4, encoding='utf-8', ensure_ascii=False) elif type is "mishnaic": num_mishnah_per_mesechta = 30000 #effectively all mishnah mishnah_indexes = [library.get_index(ind) for ind in library.get_indexes_in_category("Mishnah")] mishnah_dict = {} for ind in mishnah_indexes: mishna_segs = ind.all_section_refs() if len(mishna_segs) >= num_mishnah_per_mesechta: mishna_segs = mishna_segs[:num_mishnah_per_mesechta] for seg in mishna_segs: if len(seg.linkset().filter("Tanakh")) > 0: #avoid mishnahs that quote tanakh to not mix languages continue first_sec_str = hebrew.strip_cantillation(seg.text('he').as_string(),strip_vowels=True) word_list = tokenize_words(first_sec_str) for word in word_list: if word: two_letter = get_two_letter_word(word) if two_letter: temp_list = set(mishnah_dict[two_letter]) if two_letter in mishnah_dict else set() temp_list.add(word) mishnah_dict[two_letter] = list(temp_list) fp = codecs.open("mishnaic_2_letters_training.json","w",encoding='utf-8') json.dump(mishnah_dict, fp,indent=4, encoding='utf-8', ensure_ascii=False) elif type is "talmudic": talmud_dict = {} talmud_dbs = { ("caldb.txt",cal_tools.parseCalLine), ("jbaforms.txt",cal_tools.parseJBALine) } for db in talmud_dbs: with open(db[0],"r") as caldb: for line in 
caldb: line_obj = db[1](line,True,False) try: word = line_obj["word"] except KeyError: print "continuing" continue if word: two_letter = get_two_letter_word(word) if two_letter: temp_list = set(talmud_dict[two_letter]) if two_letter in talmud_dict else set() temp_list.add(word) talmud_dict[two_letter] = list(temp_list) head_word = line_obj["head_word"] if head_word: two_letter = get_two_letter_word(head_word) if two_letter: temp_list = set(talmud_dict[two_letter]) if two_letter in talmud_dict else set() temp_list.add(head_word) talmud_dict[two_letter] = list(temp_list) fp = codecs.open("talmudic_2_letters_training.json", "w", encoding='utf-8') json.dump(talmud_dict, fp, indent=4, encoding='utf-8', ensure_ascii=False)
def tag_testing_naive(text_name,bib_links,seg_len_list,word_list_in,test_set_name="test"): curr_state = "" caldb_words = json.load(codecs.open("caldb_words_{}.json".format(text_name), "r", encoding="utf-8")) cal_words = caldb_words["words"] cal_head_words = caldb_words["head_words"] word_list_out = [] count = 0 main_i = 0 while main_i < len(seg_len_list): seg_len = seg_len_list[main_i] bib_linkset = bib_links[main_i] seg = word_list_in[count:count+seg_len] count += seg_len b_start = -1; b_end = -1 if len(bib_linkset) > 0: for bib_link in bib_linkset: #there is an assumption here that the links to Tanakh are always 1 try: bib_seg = tokenize_words(hebrew.strip_cantillation(Ref(bib_link.refs[1]).text('he').as_string(),strip_vowels=True),strip_html=True) b_start,b_end = match_segments(seg, bib_seg) except InputError: continue for i,word in enumerate(seg): state_switch_pat = re.compile(r"\<big\>\<strong\>[^\<\>]+\</strong\>\</big\>") if re.match(state_switch_pat,word): if curr_state == "mishnaic": curr_state = "talmudic" elif curr_state == "talmudic" or curr_state == "": curr_state = "mishnaic" if b_start != -1 and b_end != -1 and i in xrange(b_start,b_end): lang = "biblical" elif curr_state == "talmudic": lang = "unknown" elif curr_state == "mishnaic": lang = "mishnaic" else: lang = "unknown" word_list_out.append({"word":word,"class":lang}) main_i += 1 curr_state = "" cal_count = 0 main_i = 0 num_rounds_without_matches = 0 last_match_i = 0 in_backtrack = False while main_i < len(word_list_out) and cal_count + 4 < len(cal_words): if num_rounds_without_matches > 15: if in_backtrack: cal_count -= 4 in_backtrack = False else: main_i = last_match_i cal_count += 4 in_backtrack = True num_rounds_without_matches = 0 print "back track!" 
yo = 34 if cal_count == 12: yo += 3432443 temp_tal_words = word_list_out[main_i:main_i+10] cal_ngram_list = [] for inner_cal_count in range(4): temp_ngram = Cal_ngram(cal_words[cal_count+inner_cal_count:cal_count+inner_cal_count+4],cal_head_words[cal_count+inner_cal_count:cal_count+inner_cal_count+4],temp_tal_words,main_i,skip_penalty=min(inner_cal_count,1)) temp_ngram.find() cal_ngram_list.append(temp_ngram) best_ngram_score = -1 best_ngram = None best_ngram_index = -1 for i,cng in enumerate(cal_ngram_list): if cng.score < best_ngram_score or best_ngram_score == -1: best_ngram_score = cng.score best_ngram = cng best_ngram_index = i if len(best_ngram.matched_indexes) == 0: temp_tags = best_ngram.curr_tagged_words start_match_pos = best_ngram.start_pos num_rounds_without_matches += 1 else: in_backtrack = False num_rounds_without_matches = 0 last_match_i = main_i+len(temp_tags) cal_count += (best_ngram_index+1) start_match_pos = best_ngram.start_pos temp_tags = best_ngram.curr_tagged_words[:best_ngram.matched_indexes[0]-start_match_pos+1] word_list_out[start_match_pos:start_match_pos+len(temp_tags)] = temp_tags main_i += len(temp_tags) doc = {} doc["words"] = word_list_out fp = codecs.open("{}_naive.json".format(test_set_name), "w", encoding='utf-8') json.dump(doc, fp, indent=4, encoding='utf-8', ensure_ascii=False)
def get_snippet_by_seg_ref(source_tc, found, must_find_snippet=False, snip_size=100, use_indicator_words=False, return_matches=False): """ based off of library.get_wrapped_refs_string :param source: :param found: :param must_find_snippet: bool, True if you only want to return a str if you found a snippet :param snip_size int number of chars in snippet on each side :param use_indicator_words bool, True if you want to use hard-coded indicator words to determine which side of the ref the quote is on :return: """ after_indicators = [u"דכתיב", u"ודכתיב", u"וכתיב", u"וכתוב", u"שכתוב", u"כשכתוב", u"כדכתיב", u"זל", u"ז״ל", u"ז''ל", u"ז\"ל", u"אומרם", u"כאמור", u"ואומר", u"אמר", u"שנאמר", u"בגמ'", u"בגמ׳", u"בפסוק", u"לעיל", u"ולעיל", u"לקמן", u"ולקמן", u"בירושלמי", u"בבבלי", u"שדרשו", u"ששנינו", u"שנינו", u"ושנינו", u"דשנינו", u"כמש״כ", u"כמש\"כ", u"כמ״ש", u"כמ\"ש", u"וכמש״כ", u"וכמ\"ש", u"וכמ״ש", u"וכמש\"כ", u"ע״ה", u"ע\"ה", u"מבואר", u"כמבואר", u"במתני׳", u"במתנ\'", u"דתנן", u"זכרונם לברכה", u"זכר לברכה"] after_reg = ur"(?:^|\s)(?:{})\s*[(\[]?$".format(u"|".join(after_indicators)) after_indicators_far = [u"דבפרק", u"בפרק", u"שבפרק", u"פרק"] after_far_reg = ur"(?:^|\s)(?{}:)(?=\s|$)".format(u"|".join(after_indicators_far)) after_indicators_after = [u"בד״ה", u"בד\"ה", u"ד״ה", u"ד\"ה"] after_after_reg = ur"^\s*(?:{})\s".format(u"|".join(after_indicators_after)) punctuation = [u",", u".", u":", u"?", u"!", u"׃"] punctuation_after_reg = ur"^\s*(?:{})\s".format(u"|".join(punctuation)) punctuation_before_reg = ur"(?:{})\s*$".format(u"|".join(punctuation)) after_indicators_after_far = [u"וגו׳", u"וגו'", u"וגו", u"וכו׳", u"וכו'", u"וכו"] after_after_far_reg = ur"(?:^|\s)(?{}:)(?=\s|$)".format(u"|".join(after_indicators_after_far)) found_title = found.index.get_title("he") found_node = library.get_schema_node(found_title, "he") title_nodes = {t: found_node for t in found.index.all_titles("he")} all_reg = library.get_multi_title_regex_string(set(found.index.all_titles("he")), "he") 
reg = regex.compile(all_reg, regex.VERBOSE) source_text = re.sub(ur"<[^>]+>", u"", strip_cantillation(source_tc.text, strip_vowels=True)) linkified = library._wrap_all_refs_in_string(title_nodes, reg, source_text, "he") snippets = [] found_normal = found.normal() found_section_normal = re.match(ur"^[^:]+", found_normal).group() for match in re.finditer(u"(<a [^>]+>)([^<]+)(</a>)", linkified): ref = get_tc(match.group(2), True) if ref.normal() == found_section_normal or ref.normal() == found_normal: if return_matches: snippets += [match] else: start_snip_naive = match.start(1) - snip_size if match.start(1) >= snip_size else 0 start_snip_space = linkified.rfind(u" ", 0, start_snip_naive) start_snip_link = linkified.rfind(u"</a>", 0, match.start(1)) start_snip = max(start_snip_space, start_snip_link) if start_snip == -1: start_snip = start_snip_naive end_snip_naive = match.end(3) + snip_size if match.end(3) + snip_size <= len(linkified) else len(linkified) end_snip_space = linkified.find(u" ", end_snip_naive) end_snip_link = linkified.find(u"<a ", match.end(3)) end_snip = min(end_snip_space, end_snip_link) if end_snip == -1: end_snip = end_snip_naive if use_indicator_words: before_snippet = linkified[start_snip:match.start(1)] if u"ירושלמי" in before_snippet[-20:] and (len(ref.index.categories) < 2 or ref.index.categories[1] != u'Yerushalmi'): # this guys not a yerushalmi but very likely should be continue after_snippet = linkified[match.end(3):end_snip] if re.search(after_reg, before_snippet) is not None: temp_snip = after_snippet # print before_snippet else: temp_snip = linkified[start_snip:end_snip] else: temp_snip = linkified[start_snip:end_snip] snippets += [re.sub(ur"<[^>]+>", u"", temp_snip)] if len(snippets) == 0: if must_find_snippet: return None return [source_text] return snippets