def find_refs(self, text):
    """
    Print every maximal run of consecutive Hebrew characters found in `text`.

    :param text: string to scan; each character is classified with `is_hebrew`
    :return: always True
    """
    begin = -1  # start index of the run currently being scanned; -1 means "not inside a run"
    for i, letter in enumerate(text):
        if is_hebrew(letter):
            if begin == -1:
                begin = i
        elif begin != -1:
            # First non-Hebrew character after a run: emit the run and reset.
            print(text[begin:i])
            begin = -1
    if begin != -1:
        # The text ended while still inside a run. Without this flush the
        # final run was silently dropped.
        print(text[begin:])
    return True
def is_hebrew(self):
    """
    Returns True if this sheet appears to be in Hebrew according to its title.

    The title (`self.title`, with markup removed by `strip_tags`) counts as Hebrew
    when the `is_hebrew` utility accepts it and it contains no Latin letters.
    """
    from sefaria.utils.hebrew import is_hebrew
    import regex
    title = strip_tags(self.title)
    # Consider a sheet Hebrew if its title contains Hebrew character but no English characters.
    # The character class must not contain "|": inside [] it is a literal pipe, which made
    # any title containing "|" count as English.
    return is_hebrew(title) and not regex.search(u"[a-zA-Z]", title)
def add_more_mishnah_titles():
    """
    Add the missing titles listed in the Mishnah corrections CSV to their topics.

    Only rows whose error type is 'title' are used. Rows are skipped when the title is
    the 'TYPO' marker, the slug is empty or starts with 'BONAYICH', no topic exists
    for the slug, or the title is empty once cantillation and vowels are stripped.
    """
    from sefaria.utils.hebrew import is_hebrew, strip_cantillation
    csv_path = "/home/nss/sefaria/datasets/ner/sefaria/temp/Rabbis in Mishnah Corrections - cross_validated_by_language.csv"
    with open(csv_path, "r") as fin:
        for row in csv.DictReader(fin):
            # TODO deal with mistakes
            error_type = row['Error Type (rabbi, title, mistake, correct, skip)']
            if error_type != 'title':
                continue
            new_title = strip_cantillation(row['Missing Title'], strip_vowels=True)
            if new_title == 'TYPO':
                continue
            slug = row['Missing Title Slug']
            if len(slug) == 0:
                print('NO MISSING TITLE SLUG', row)
                continue
            if slug.startswith('BONAYICH'):
                continue
            topic = Topic.init(slug)
            if topic is None:
                print("NO TOPIC FOR SLUG", slug, row)
                continue
            if len(new_title) == 0:
                print("ZERO LEN NEW TITLE", row)
                continue
            title_entry = {
                "text": new_title,
                "lang": "he" if is_hebrew(new_title) else "en"
            }
            topic.titles += [title_entry]
            topic.save()
def bulktext_api(request, refs):
    """
    Used by the linker.

    :param request: the HTTP request; reads the optional GET parameters `callback`
        and `useTextFamily`
    :param refs: textual refs separated by "|"
    :return: jsonResponse mapping each ref either to its texts and metadata or to
        {"error": 1} when the ref could not be processed
    """
    if request.method != "GET":
        # Previously a non-GET request fell through and returned None.
        return jsonResponse({"error": "Unsupported HTTP method."})

    cb = request.GET.get("callback", None)
    useTextFamily = request.GET.get("useTextFamily", None)
    refs = set(refs.split("|"))  # de-duplicate the requested refs
    res = {}
    for tref in refs:
        try:
            oref = model.Ref(tref)
            lang = "he" if is_hebrew(tref) else "en"
            if useTextFamily:
                text_fam = model.TextFamily(oref, commentary=0, context=0, pad=False)
                he = text_fam.he
                en = text_fam.text
                res[tref] = {
                    'he': he,
                    'en': en,
                    'lang': lang,
                    'ref': oref.normal(),
                    'primary_category': text_fam.contents()['primary_category'],
                    'heRef': oref.he_normal(),
                    'url': oref.url()
                }
            else:
                he = model.TextChunk(oref, "he").text
                en = model.TextChunk(oref, "en").text
                res[tref] = {
                    # these could be flattened on the client, if need be.
                    'he': he if isinstance(he, basestring) else JaggedTextArray(he).flatten_to_string(),
                    'en': en if isinstance(en, basestring) else JaggedTextArray(en).flatten_to_string(),
                    'lang': lang,
                    'ref': oref.normal(),
                    'heRef': oref.he_normal(),
                    'url': oref.url()
                }
        except (InputError, ValueError, AttributeError, KeyError) as e:
            # referer = request.META.get("HTTP_REFERER", "unknown page")
            # This chatter fills up the logs. todo: put in it's own file
            # logger.warning(u"Linker failed to parse {} from {} : {}".format(tref, referer, e))
            res[tref] = {"error": 1}
    resp = jsonResponse(res, cb)
    return resp
def _do_search(self):
    """
    Search `self._haystack` for a citation of the title `self._needle`.

    The pattern comes from `m.library.get_regex_string` and is compiled in verbose mode.
    :return: the result of `search` on the haystack (a match object or None)
    """
    needle_lang = "he" if is_hebrew(self._needle) else "en"
    pattern_source = m.library.get_regex_string(
        self._needle,
        needle_lang,
        for_js=True,
        anchored=False,
        capture_title=False,
        parentheses=self._with_parenthesis,
    )
    return re.compile(pattern_source, re.VERBOSE).search(self._haystack)
def _single_lookup(cls, input_word, lookup_key='form', **kwargs):
    """
    Build the headword queries for every word form matching `input_word`.

    :param input_word: the word to look up; Hebrew input has its cantillation stripped
    :param lookup_key: word-form field to query; replaced by 'c_form' for Hebrew
        input on which `has_cantillation(..., detect_vowels=True)` finds nothing
    :param kwargs: optional `lookup_ref` narrows the query to forms whose refs start with that ref
    :return: list of {'headword': ...} dicts, empty when no word form matches
    """
    from sefaria.utils.hebrew import is_hebrew, strip_cantillation, has_cantillation
    from sefaria.model import Ref

    lookup_ref = kwargs.get("lookup_ref", None)
    wform_pkey = lookup_key
    if is_hebrew(input_word):
        input_word = strip_cantillation(input_word)
        if not has_cantillation(input_word, detect_vowels=True):
            wform_pkey = 'c_form'

    query_obj = {wform_pkey: input_word}
    if lookup_ref:
        nref = Ref(lookup_ref).normal()
        query_obj["refs"] = {'$regex': '^{}'.format(nref)}

    forms = WordFormSet(query_obj)
    if lookup_ref and forms.count() == 0:
        # Nothing found under the ref restriction: retry without it.
        del query_obj["refs"]
        forms = WordFormSet(query_obj)

    if forms.count() > 0:
        # TODO: if we want the 'lookups' in wf to be a dict we can pass as is to the lexiconentry, we need to change the key 'lexicon' to 'parent_lexicon' in word forms
        return [
            {'headword': lookup['headword']}
            for form in forms
            for lookup in form.lookups
        ]
    return []
def title_regex_api(request, titles):
    """
    Return, for each requested title, the JS-flavoured regex string that finds its citations.

    :param request: the HTTP request; reads the optional GET parameters `callback`
        and `parentheses` (an integer flag)
    :param titles: titles separated by "|"
    :return: jsonResponse mapping title -> regex string, plus an "error" list for
        titles that failed; an error response for any method other than GET
    """
    if request.method != "GET":
        return jsonResponse({"error": "Unsupported HTTP method."})

    cb = request.GET.get("callback", None)
    parentheses = bool(int(request.GET.get("parentheses", False)))
    res = {}
    errors = []
    for title in set(titles.split("|")):
        lang = "he" if is_hebrew(title) else "en"
        try:
            res[title] = model.library.get_regex_string(
                title, lang, anchored=False, for_js=True, parentheses=parentheses)
        except (AttributeError, AssertionError) as e:
            # There are normal errors here, when a title matches a schema node, the chatter fills up the logs.
            # logger.warning(u"Library._build_ref_from_string() failed to create regex for: {}. {}".format(title, e))
            errors.append("{} : {}".format(title, e))
    if len(errors):
        res["error"] = errors
    return jsonResponse(res, cb)
def test_regex_string_he_in_parentheses_only(self):
    """
    With parentheses=True, the citation inside parentheses should resolve to a live page
    (HTTP 200) while the same citation without parentheses should not (HTTP 404).
    """
    st1 = '(ובויקרא כ"ה)'
    st2 = 'ובויקרא כ"ה'
    title = 'ויקרא'
    lang = "he" if is_hebrew(title) else "en"
    res = m.library.get_regex_string(title, lang, for_js=True, anchored=False, capture_title=False, parentheses=True)
    # Drop the verbose-mode comments and whitespace so the pattern can be used without re.VERBOSE
    res_no_comments = re.sub('\s+', '', re.sub('\s*?#.*?\n', '', res))

    # (text to search, path used when nothing matches, expected HTTP status)
    cases = [(st1, '', 200), (st2, 'no match', 404)]
    for haystack, fallback, expected_status in cases:
        match = re.search(res_no_comments, haystack)
        if not match:
            match_string = fallback
        else:
            match_string = match.group().replace(match.group(1), '')
        resp = requests.get("https://www.sefaria.org.il/{}".format(match_string))
        assert resp.status_code == expected_status
def check_rabi_rav_results():
    """
    Check that the titles recorded in the Rabi/Rav corrections CSV appear in their snippets.

    For 'title' and 'rabbi' rows the recorded title is searched for in the normalized
    snippet. A 'rabbi' row gets a second attempt with 'Missing Rabbi Title in Text'
    when that column is filled in. Every title still not found is printed as MISSED.
    """
    from research.knowledge_graph.named_entity_recognition.ner_tagger import TextNormalizer
    from sefaria.utils.hebrew import is_hebrew

    def find_title(candidate, text):
        # Search `text` for `candidate` using the rabbi regex built from its transliteration
        pattern = TextNormalizer.get_rabbi_regex(TextNormalizer.myunidecode(candidate.strip()))
        return re.search(pattern, text)

    with open(f"{DATASET_LOC}/Fix Rabi and Rav Errors - rav_rabbi_errors.csv", "r") as fin:
        rows = list(csv.DictReader(fin))

    # check titles appear in text
    for row in rows:
        typ = row['Error Type (rabbi, title, mistake, correct)']
        is_heb = is_hebrew(row['Snippet'])
        text = TextNormalizer.normalize_text('he' if is_heb else 'en', row['Snippet'].replace('~', ''))
        if typ == 'title':
            title = row['Missing Title']
        elif typ == 'rabbi':
            title = row[f"Missing Rabbi {'Hebrew' if is_heb else 'English'}"]
        else:
            continue

        found = find_title(title, text)
        if found:
            continue
        if typ == 'rabbi' and len(row['Missing Rabbi Title in Text']) > 0:
            # Second chance: the form of the name as it appears in the text
            title = row['Missing Rabbi Title in Text']
            found = find_title(title, text)
        if not found:
            print(f"MISSED '{title}':", text, row['Ref'])
def bulktext_api(request, refs):
    """
    Used by the linker.

    :param request: the HTTP request; reads the optional GET parameter `callback`
    :param refs: textual refs separated by "|"
    :return: jsonResponse (with a permissive Access-Control-Allow-Origin header) mapping
        each ref either to its texts and metadata or to {"error": 1}
    """
    if request.method != "GET":
        # Previously a non-GET request fell through and returned None.
        resp = jsonResponse({"error": "Unsupported HTTP method."})
        resp['Access-Control-Allow-Origin'] = '*'
        return resp

    cb = request.GET.get("callback", None)
    refs = set(refs.split("|"))  # de-duplicate the requested refs
    res = {}
    for tref in refs:
        try:
            oref = model.Ref(tref)
            lang = "he" if is_hebrew(tref) else "en"
            he = model.TextChunk(oref, "he").text
            en = model.TextChunk(oref, "en").text
            res[tref] = {
                # these could be flattened on the client, if need be.
                'he': he if isinstance(he, basestring) else JaggedTextArray(he).flatten_to_string(),
                'en': en if isinstance(en, basestring) else JaggedTextArray(en).flatten_to_string(),
                'lang': lang,
                'ref': oref.normal(),
                'heRef': oref.he_normal(),
                'url': oref.url()
            }
        except (InputError, ValueError, AttributeError) as e:
            referer = request.META.get("HTTP_REFERER", "unknown page")
            logger.warning(u"Linker failed to parse {} from {} : {}".format(tref, referer, e))
            res[tref] = {"error": 1}
    resp = jsonResponse(res, cb)
    resp['Access-Control-Allow-Origin'] = '*'
    return resp
def extract_form_tuples(csv_row):
    """
    Turn one CSV row into a list of (form, language) tuples.

    Columns 0 and 1 hold a single 'eng' and a single 'heb' form and are always emitted.
    Columns 2, 3 and 4 hold comma-separated lists: column 2 is 'eng', column 3 is 'heb',
    and each entry of column 4 is classified with `is_hebrew`.

    :param csv_row: indexable row with at least five string columns
    :return: list of (stripped form, 'eng' | 'heb') tuples
    """
    forms = [(csv_row[0].strip(), 'eng'), (csv_row[1].strip(), 'heb')]
    # A fixed language of None means "detect per entry"
    for column, fixed_lang in ((2, 'eng'), (3, 'heb'), (4, None)):
        for raw in csv_row[column].split(","):
            form = raw.strip()
            if not form:
                # Strip before filtering: entries such as "a, ,b" used to yield an empty form
                continue
            lang = fixed_lang or ('heb' if is_hebrew(form) else 'eng')
            forms.append((form, lang))
    return forms
def add_langs_to_topics(topic_list: list, use_as_typed=True, backwards_compat_lang_fields: dict = None) -> list:
    """
    adds primary en and he to each topic in topic_list and returns new topic_list

    Topics whose slug is not found by the `TopicSet` query are dropped from the result.

    :param list topic_list: list of topics where each item is dict of form {'slug': required, 'asTyped': optional}
    :param dict backwards_compat_lang_fields: of shape {'en': str, 'he': str}. Defines lang fields for backwards compatibility. If None, ignore.
    :param bool use_as_typed: when True and a topic has 'asTyped', that string fills the field of
        its own language and only the other language comes from the topic's primary title
    :return: new list of copied topic dicts with 'en' and 'he' filled in
    """
    new_topic_list = []
    if len(topic_list) > 0:
        topic_set = {topic.slug: topic for topic in TopicSet({'$or': [{'slug': topic['slug']} for topic in topic_list]})}
        for topic in topic_list:
            topic_obj = topic_set.get(topic['slug'], None)
            if topic_obj is None:
                continue
            new_topic = topic.copy()
            # Language already supplied by 'asTyped', or None when both titles come from the topic.
            # 'asTyped' is optional, so a topic without it is handled like use_as_typed=False
            # instead of raising KeyError.
            tag_lang = None
            if use_as_typed and 'asTyped' in new_topic:
                tag_lang = 'he' if is_hebrew(new_topic['asTyped']) else 'en'
                new_topic[tag_lang] = new_topic['asTyped']
            if tag_lang != 'he':
                new_topic['he'] = topic_obj.get_primary_title('he')
            if tag_lang != 'en':
                new_topic['en'] = topic_obj.get_primary_title('en')

            if backwards_compat_lang_fields is not None:
                for lang in ('en', 'he'):
                    new_topic[backwards_compat_lang_fields[lang]] = new_topic[lang]
            new_topic_list += [new_topic]
    return new_topic_list
def add_langs_to_topics(topic_list: list, use_as_typed=True, backwards_compat_lang_fields: dict = None) -> list:
    """
    adds primary en and he to each topic in topic_list and returns new topic_list

    :param list topic_list: list of topics where each item is dict of form {'slug': required, 'asTyped': optional}
    :param dict backwards_compat_lang_fields: of shape {'en': str, 'he': str}. Defines lang fields for backwards compatibility. If None, ignore.
    :param bool use_as_typed: when True and a topic has 'asTyped', that string fills the field of
        its own language and only the other language comes from the topic mapping
    :return: new list of copied topic dicts with 'en' and 'he' filled in
    """
    new_topic_list = []
    from sefaria.model import library
    topic_map = library.get_topic_mapping()
    if len(topic_list) > 0:
        for topic in topic_list:
            # 'asTyped' is optional, so only rely on it when it is actually present
            # (indexing it unconditionally raised KeyError).
            has_as_typed = use_as_typed and 'asTyped' in topic
            # Fall back on `asTyped` if no data is in mapping yet. If neither `asTyped` nor mapping data is available fail safe by reconstructing a title from a slug (HACK currently affecting trending topics if a new topic isn't in cache yet)
            default_title = topic['asTyped'] if has_as_typed else topic['slug'].replace("-", " ").title()
            topic_titles = topic_map.get(topic['slug'], {"en": default_title, "he": default_title})
            new_topic = topic.copy()
            # Language already supplied by 'asTyped', or None when both titles come from the mapping
            tag_lang = None
            if has_as_typed:
                tag_lang = 'he' if is_hebrew(new_topic['asTyped']) else 'en'
                new_topic[tag_lang] = new_topic['asTyped']
            if tag_lang != 'he':
                new_topic['he'] = topic_titles["he"]
            if tag_lang != 'en':
                new_topic['en'] = topic_titles["en"]

            if backwards_compat_lang_fields is not None:
                for lang in ('en', 'he'):
                    new_topic[backwards_compat_lang_fields[lang]] = new_topic[lang]
            new_topic_list += [new_topic]
    return new_topic_list
def _single_lookup(cls, input_word, lookup_key='form', **kwargs):
    """
    Build the headword queries for the single word form matching `input_word`.

    :param input_word: the word to look up; Hebrew input has its cantillation stripped
    :param lookup_key: word-form field to query
    :param kwargs: optional `lookup_ref` narrows the query to forms whose refs start with that ref
    :return: list of {'headword': ...} dicts, empty when no word form is loaded
    """
    from sefaria.utils.hebrew import is_hebrew, strip_cantillation, has_cantillation
    from sefaria.model import Ref

    lookup_ref = kwargs.get("lookup_ref", None)
    wform_pkey = lookup_key
    if is_hebrew(input_word):
        input_word = strip_cantillation(input_word)
        # Disabled in the original (it was wrapped in a bare string literal):
        # if not has_cantillation(input_word, detect_vowels=True):
        #     wform_pkey = 'c_form'

    query_obj = {wform_pkey: input_word}
    if lookup_ref:
        nref = Ref(lookup_ref).normal()
        query_obj["refs"] = {'$regex': '^{}'.format(nref)}

    form = WordForm().load(query_obj)
    if not form and lookup_ref:
        # Nothing found under the ref restriction: retry without it.
        del query_obj["refs"]
        form = WordForm().load(query_obj)

    if not form:
        return []
    # TODO: if we want the 'lookups' in wf to be a dict we can pass as is to the lexiconentry, we need to change the key 'lexicon' to 'parent_lexicon' in word forms
    return [{'headword': lookup['headword']} for lookup in form.lookups]
def get_sheet_language(sheet):
    """
    Guess the language `sheet` is written in from its title.

    The title has its tags stripped, any "(Copy)" marker removed and newlines turned
    into spaces before being checked with `is_hebrew(..., heb_only=True)`.
    :return: "hebrew" or "english"
    """
    raw_title = sheet.get("title", "")
    title = strip_tags(raw_title).replace("(Copy)", "").replace("\n", " ")
    if is_hebrew(title, heb_only=True):
        return "hebrew"
    return "english"
def create_topic_from_title(title):
    """
    Create and save a Topic whose only title is `title`.

    The slug is `Topic.normalize_slug(title)`; the title is marked primary and tagged
    "he" or "en" according to `is_hebrew`.
    :return: the saved Topic
    """
    slug = Topic.normalize_slug(title)
    primary_title = {
        "text": title,
        "lang": "he" if is_hebrew(title) else "en",
        "primary": True,
    }
    topic = Topic({"slug": slug, "titles": [primary_title]})
    topic.save()
    return topic
def import_rabi_rav_rabbis_into_topics():
    """
    Re-import the rabbis from new_rabbis.json as topics, then add the missing titles from the errors CSV.

    Step 1 deletes every topic that has an `alt_ids.rav_rabi` field, saves one topic per
    JSON entry and links it "is-a" to "mishnaic-people" (type "tanna") or "talmudic-people".
    Step 2 handles the 'title' rows of the CSV: the missing title is added to every listed
    topic that does not already carry it.
    """
    from sefaria.utils.hebrew import is_hebrew

    # --- Step 1: rebuild the rabbi topics ---
    with open(f"{DATASETS_NAMED_ENTITY_LOC}/new_rabbis.json", "r") as fin:
        rabbis = json.load(fin)
    TopicSet({'alt_ids.rav_rabi': {"$exists": True}}).delete()
    for _, rabbi in rabbis.items():
        rabbi['alt_ids'] = {"rav_rabi": True}
        typ = rabbi.pop('type')  # 'type' is not part of the saved topic data
        topic = Topic(rabbi)
        topic.save()
        toTopic = "mishnaic-people" if typ == "tanna" else "talmudic-people"
        link = IntraTopicLink({
            "class": "intraTopic",
            "fromTopic": topic.slug,
            "toTopic": toTopic,
            "linkType": "is-a",
            "dataSource": "sperling-bonayich"
        })
        try:
            link.save()
        except sefaria.system.exceptions.DuplicateRecordError:
            print("Duplicate", topic.slug, toTopic)

    # --- Step 2: add the missing titles ---
    with open(f"{DATASETS_NAMED_ENTITY_LOC}/Fix Rabi and Rav Errors - rav_rabbi_errors.csv", "r") as fin:
        rows = list(csv.DictReader(fin))
    for row in rows:
        typ = row['Error Type (rabbi, title, mistake, correct)']
        is_heb = is_hebrew(row['Snippet'])
        if typ != 'title':
            continue
        slug_list = [row['Missing Title Slug']]
        other_slugs = row['Additional Missing Title Slugs']
        if len(other_slugs) > 0:
            slug_list += other_slugs.split(', ')
        topic_list = [Topic.init(slug.lower()) for slug in slug_list]
        for topic, slug in zip(topic_list, slug_list):
            if not topic:
                print("NO TOPIC", slug)
                continue
            if any(existing['text'] == row['Missing Title'] for existing in topic.titles):
                continue
            topic.add_title(row['Missing Title'], 'he' if is_heb else 'en')
            topic.save()
def finds_multiple(self, result):
    """
    Check whether the citation found for `self._needle` normalizes to one of `result`.

    The haystack is scanned for title matches in the needle's language; the search is
    then run on the haystack starting from the title match.

    :param result: container of acceptable normalized refs (tested with `in`)
    :return: True when the found ref normalizes to a member of `result`; False when
        nothing is found or the ref is not in `result`. Falls off the end (None) when the
        titles regex finds no title in the haystack.
    """
    lang = "he" if is_hebrew(self._needle) else "en"
    for title_match in m.library.all_titles_regex(lang, citing_only=False).finditer(self._haystack):
        match = self._do_search(self._needle, self._haystack[title_match.start():])
        if not match:
            return False
        if m.Ref(match.group(1)).normal() in result:
            return True
        else:
            print("Mismatched. Found: {}, which normalizes to: {}, which is not in {}".format(match.group(1), m.Ref(match.group(1)).normal(), result))
            # NOTE(review): the flattened source does not show whether this return belongs
            # to the `else` or follows the loop. As placed, every path through the loop
            # body returns, so only the first title match is examined - confirm.
            return False
def test_regex_string_he_js_with_prefix(self):
    """The JS-flavoured regex for a Hebrew title finds the citation in a string where it carries a prefix."""
    st = 'ובויקרא כ"ה'
    title = 'ויקרא'
    lang = "he" if is_hebrew(title) else "en"
    pattern = re.compile(
        m.library.get_regex_string(title, lang, for_js=True, anchored=False, capture_title=False),
        re.VERBOSE,
    )
    found = pattern.search(st)
    normalized = m.Ref(found.group(1)).normal()
    assert normalized == "Leviticus 25"
def test_regex_string_en_js(self):
    """The JS-flavoured regex for an English title finds a space-separated citation."""
    st = 'Ruth 1 1'
    title = 'Ruth'
    lang = "he" if is_hebrew(title) else "en"
    pattern = re.compile(
        m.library.get_regex_string(title, lang, for_js=True, anchored=False, capture_title=False),
        re.VERBOSE,
    )
    found = pattern.search(st)
    normalized = m.Ref(found.group(1)).normal()
    assert normalized == "Ruth 1:1"
def bundle_many_texts(refs, useTextFamily=False, as_sized_string=False, min_char=None, max_char=None):
    """
    Collect the Hebrew and English text plus metadata for each ref in `refs`.

    :param refs: iterable of textual refs
    :param useTextFamily: when truthy, take the texts from a `TextFamily` and include
        'primary_category' in the entry
    :param as_sized_string: when truthy (and not using a TextFamily), use
        `TextChunk.as_sized_string`; otherwise non-string text is flattened to one string
    :param min_char: passed to `as_sized_string` when truthy
    :param max_char: passed to `as_sized_string` when truthy
    :return: dict mapping each ref to its entry, or to {"error": 1} when processing the ref
        raised one of the handled exceptions
    """
    res = {}
    for tref in refs:
        try:
            oref = model.Ref(tref)
            lang = "he" if is_hebrew(tref) else "en"
            if useTextFamily:
                text_fam = model.TextFamily(oref, commentary=0, context=0, pad=False)
                he = text_fam.he
                en = text_fam.text
                res[tref] = {
                    'he': he,
                    'en': en,
                    'lang': lang,
                    'ref': oref.normal(),
                    'primary_category': text_fam.contents()['primary_category'],
                    'heRef': oref.he_normal(),
                    'url': oref.url()
                }
                continue

            he_tc = model.TextChunk(oref, "he")
            en_tc = model.TextChunk(oref, "en")
            if as_sized_string:
                # Only forward the size limits that were actually supplied
                size_limits = (('min_char', min_char), ('max_char', max_char))
                kwargs = {name: value for name, value in size_limits if value}
                he_text = he_tc.as_sized_string(**kwargs)
                en_text = en_tc.as_sized_string(**kwargs)
            else:
                he = he_tc.text
                en = en_tc.text
                # these could be flattened on the client, if need be.
                he_text = he if isinstance(he, str) else JaggedTextArray(he).flatten_to_string()
                en_text = en if isinstance(en, str) else JaggedTextArray(en).flatten_to_string()
            res[tref] = {
                'he': he_text,
                'en': en_text,
                'lang': lang,
                'ref': oref.normal(),
                'heRef': oref.he_normal(),
                'url': oref.url()
            }
        except (InputError, ValueError, AttributeError, KeyError) as e:
            # referer = request.META.get("HTTP_REFERER", "unknown page")
            # This chatter fills up the logs. todo: put in it's own file
            # logger.warning(u"Linker failed to parse {} from {} : {}".format(tref, referer, e))
            res[tref] = {"error": 1}
    return res
def add_titles():
    """
    Add the 'New title 1' column of the new-titles CSV to the topic named in 'Slug'.

    Rows whose slug contains 'BONAYICH' are skipped; slugs without a topic are reported.
    """
    with open(f'{ROOT}/{new_titles}', 'r') as fin:
        for row in csv.DictReader(fin):
            slug = row['Slug']
            if 'BONAYICH' in slug:
                continue
            topic = Topic.init(slug)
            if topic is None:
                print('Slug is None', slug)
                continue
            title = normalize_title(row['New title 1'])
            title_lang = 'he' if is_hebrew(title) else 'en'
            topic.add_title(title, title_lang)
            topic.save()
def test_regex_string_en_js(self):
    """The text matched by the English JS regex should be the path of a live page (HTTP 200)."""
    st = 'Ruth 1 1'
    title = 'Ruth'
    lang = "he" if is_hebrew(title) else "en"
    res = m.library.get_regex_string(title, lang, for_js=True, anchored=False, capture_title=False)
    found = re.search(res, st)
    match_string = found.group()  # 'no match' if not match else match.group()
    url = "https://www.sefaria.org.il/{}".format(match_string)
    resp = requests.get(url)
    assert resp.status_code == 200
def bulktext_api(request, refs):
    """
    Used by the linker.

    :param request: the HTTP request; reads the optional GET parameters `callback`
        and `useTextFamily`
    :param refs: textual refs separated by "|"
    :return: jsonResponse mapping each ref either to its texts and metadata or to
        {"error": 1} when the ref could not be processed
    """
    if request.method != "GET":
        # Previously a non-GET request fell through and returned None.
        return jsonResponse({"error": "Unsupported HTTP method."})

    cb = request.GET.get("callback", None)
    useTextFamily = request.GET.get("useTextFamily", None)
    refs = set(refs.split("|"))  # de-duplicate the requested refs
    res = {}
    for tref in refs:
        try:
            oref = model.Ref(tref)
            lang = "he" if is_hebrew(tref) else "en"
            if useTextFamily:
                text_fam = model.TextFamily(oref, commentary=0, context=0, pad=False)
                he = text_fam.he
                en = text_fam.text
                res[tref] = {
                    'he': he,
                    'en': en,
                    'lang': lang,
                    'ref': oref.normal(),
                    'primary_category': text_fam.contents()['primary_category'],
                    'heRef': oref.he_normal(),
                    'url': oref.url()
                }
            else:
                he = model.TextChunk(oref, "he").text
                en = model.TextChunk(oref, "en").text
                res[tref] = {
                    # these could be flattened on the client, if need be.
                    'he': he if isinstance(he, basestring) else JaggedTextArray(he).flatten_to_string(),
                    'en': en if isinstance(en, basestring) else JaggedTextArray(en).flatten_to_string(),
                    'lang': lang,
                    'ref': oref.normal(),
                    'heRef': oref.he_normal(),
                    'url': oref.url()
                }
        except (InputError, ValueError, AttributeError, KeyError) as e:
            # referer = request.META.get("HTTP_REFERER", "unknown page")
            # This chatter fills up the logs. todo: put in it's own file
            # logger.warning(u"Linker failed to parse {} from {} : {}".format(tref, referer, e))
            res[tref] = {"error": 1}
    resp = jsonResponse(res, cb)
    return resp
def test_regex_string_he_in_parentheses_only(self):
    """
    With parentheses=True the citation must be found inside parentheses (st1)
    and must not be found in the same text without them (st2).
    """
    st1 = '(ובויקרא כ"ה)'
    st2 = 'ובויקרא כ"ה'
    title = 'ויקרא'
    lang = "he" if is_hebrew(title) else "en"
    reg_str = m.library.get_regex_string(title, lang, for_js=True, anchored=False, capture_title=False, parentheses=True)
    reg = re.compile(reg_str, re.VERBOSE)

    match = reg.search(st1)
    assert m.Ref(match.group(1)).normal() == "Leviticus 25"

    # The second check used to repeat the st1 search and left st2 unused, so the
    # "parentheses only" half of the test was never exercised.
    assert reg.search(st2) is None
def parse_titles(self, element):
    """
    Split the "text" of `element` into primary and alternate Hebrew/English titles.

    After `self.comment_strip_re` matches are removed, the text is split on
    `self.title_lang_delim`. Only a two-part split is interpreted: the Hebrew part is the
    second one if `is_hebrew` accepts it, otherwise the first. Each part is split on
    `self.alt_title_delim` into a primary title and a list of alternates.

    :return: dict with "hePrim", "heAltList", "enPrim", "enAltList"; empty dict when the
        text does not split into exactly two parts
    """
    raw = element.get("text")
    # print title
    #title = re.sub(ur"</b>|<b>|#.*#|'", u"", title)
    parts = self.comment_strip_re.sub(u"", raw).split(self.title_lang_delim)
    titles = {}
    if len(parts) != 2:
        return titles

    he_pos = 1 if is_hebrew(parts[1]) else 0
    en_pos = 1 - he_pos
    he_names = parts[he_pos].split(self.alt_title_delim)
    en_names = parts[en_pos].split(self.alt_title_delim)
    titles["hePrim"] = he_names[0].strip()
    titles["heAltList"] = [alt.strip() for alt in he_names[1:]]
    titles["enPrim"] = en_names[0].strip()
    titles["enAltList"] = [alt.strip() for alt in en_names[1:]]
    # print node.attrib
    return titles
def test_regex_string_he_in_parentheses(self):
    """Inside parentheses, the first title is cited and resolves; the second must not match at all."""
    st3 = '(בדברים לב ובספרות ג ב)'
    titles = ['דברים', 'רות']
    for title in titles:
        lang = "he" if is_hebrew(title) else "en"
        pattern = re.compile(
            m.library.get_regex_string(title, lang, for_js=True, anchored=False, capture_title=False, parentheses=True),
            re.VERBOSE,
        )
        found = pattern.search(st3)
        if title != 'דברים':
            assert found is None
        else:
            assert m.Ref(found.group(1)).normal() == "Deuteronomy 32"
def title_regex_api(request, titles):
    """
    Return, for each requested title, the JS-flavoured regex string that finds its citations.

    :param request: the HTTP request; reads the optional GET parameter `callback`
    :param titles: titles separated by "|"
    :return: jsonResponse (with a permissive Access-Control-Allow-Origin header) mapping
        title -> regex string, plus an "error" list for titles that failed; an error
        response for any method other than GET
    """
    if request.method != "GET":
        # Previously a non-GET request fell through and returned None.
        resp = jsonResponse({"error": "Unsupported HTTP method."})
        resp['Access-Control-Allow-Origin'] = '*'
        return resp

    cb = request.GET.get("callback", None)
    titles = set(titles.split("|"))  # de-duplicate the requested titles
    res = {}
    errors = []
    for title in titles:
        lang = "he" if is_hebrew(title) else "en"
        try:
            re_string = model.library.get_regex_string(title, lang, for_js=True)
            res[title] = re_string
        except (AttributeError, AssertionError) as e:
            logger.warning(u"Library._build_ref_from_string() failed to create regex for: {}. {}".format(title, e))
            errors.append(u"{} : {}".format(title, e))
    if len(errors):
        res["error"] = errors
    resp = jsonResponse(res, cb)
    resp['Access-Control-Allow-Origin'] = '*'
    return resp
def test_regex_string_he_in_parentheses_3(self):
    """Both titles cited inside square brackets are found and resolve to the expected refs."""
    st3 = '<p>[שיר השירים א ירושלמי כתובות (דף כח:) בשורות א]'
    titles = ['ירושלמי כתובות', 'שיר השירים']
    for title in titles:
        lang = "he" if is_hebrew(title) else "en"
        pattern = re.compile(
            m.library.get_regex_string(title, lang, for_js=True, anchored=False, capture_title=False, parentheses=True),
            re.VERBOSE,
        )
        found = pattern.search(st3)
        expected = "Jerusalem Talmud Ketubot 28b" if title == "ירושלמי כתובות" else "Song of Songs 1"
        assert m.Ref(found.group(1)).normal() == expected
def title_regex_api(request, titles):
    """
    Return, for each requested title, the unanchored JS-flavoured regex string that finds its citations.

    :param request: the HTTP request; reads the optional GET parameter `callback`
    :param titles: titles separated by "|"
    :return: jsonResponse mapping title -> regex string, plus an "error" list for titles
        that failed; an error response for any method other than GET
    """
    if request.method != "GET":
        # Previously a non-GET request fell through and returned None.
        return jsonResponse({"error": "Unsupported HTTP method."})

    cb = request.GET.get("callback", None)
    titles = set(titles.split("|"))  # de-duplicate the requested titles
    res = {}
    errors = []
    for title in titles:
        lang = "he" if is_hebrew(title) else "en"
        try:
            re_string = model.library.get_regex_string(title, lang, anchored=False, for_js=True)
            res[title] = re_string
        except (AttributeError, AssertionError) as e:
            # There are normal errors here, when a title matches a schema node, the chatter fills up the logs.
            # logger.warning(u"Library._build_ref_from_string() failed to create regex for: {}. {}".format(title, e))
            errors.append(u"{} : {}".format(title, e))
    if len(errors):
        res["error"] = errors
    resp = jsonResponse(res, cb)
    return resp
def get_word_form_objects(cls, input_word, lookup_key='form', **kwargs):
    """
    Query the word forms matching `input_word`.

    :param input_word: the word to look up; Hebrew input has its cantillation stripped
    :param lookup_key: word-form field to query; replaced by 'c_form' for Hebrew
        input on which `has_cantillation(..., detect_vowels=True)` finds nothing
    :param kwargs: optional `lookup_ref` narrows the query to forms whose refs start with
        that ref; the restriction is dropped when it yields no forms
    :return: the resulting WordFormSet
    """
    from sefaria.utils.hebrew import is_hebrew, strip_cantillation, has_cantillation
    from sefaria.model import Ref

    lookup_ref = kwargs.get("lookup_ref", None)
    wform_pkey = lookup_key
    if is_hebrew(input_word):
        input_word = strip_cantillation(input_word)
        if not has_cantillation(input_word, detect_vowels=True):
            wform_pkey = 'c_form'

    query_obj = {wform_pkey: input_word}
    if lookup_ref:
        normal_ref = Ref(lookup_ref).normal()
        query_obj["refs"] = {'$regex': '^{}'.format(normal_ref)}

    forms = WordFormSet(query_obj)
    if lookup_ref and len(forms) == 0:
        # Nothing found under the ref restriction: retry without it.
        del query_obj["refs"]
        forms = WordFormSet(query_obj)
    return forms
def parse_titles(self, element):
    """
    Split the "text" of `element` into primary and alternate Hebrew/English titles.

    Text containing '**default**' yields None. Otherwise `self.comment_strip_re` matches
    are removed and the text is split on `self.title_lang_delim`. Only a two-part split is
    interpreted: the Hebrew part is the second one if `is_hebrew` accepts it, otherwise
    the first. Each part is split on `self.alt_title_delim` into a primary title and a
    list of alternates.

    :return: None for a default node; otherwise a dict with "hePrim", "heAltList",
        "enPrim", "enAltList", which is empty when the text does not split into exactly two parts
    """
    raw = element.get("text")
    if '**default**' in raw:
        return None
    # print title
    #title = re.sub(ur"</b>|<b>|#.*#|'", u"", title)
    parts = self.comment_strip_re.sub("", raw).split(self.title_lang_delim)
    titles = {}
    if len(parts) != 2:
        return titles

    he_pos = 1 if is_hebrew(parts[1]) else 0
    en_pos = 1 - he_pos
    he_names = parts[he_pos].split(self.alt_title_delim)
    en_names = parts[en_pos].split(self.alt_title_delim)
    titles["hePrim"] = he_names[0].strip()
    titles["heAltList"] = [alt.strip() for alt in he_names[1:]]
    titles["enPrim"] = en_names[0].strip()
    titles["enAltList"] = [alt.strip() for alt in en_names[1:]]
    # print node.attrib
    return titles
def test_regex_string_he_in_parentheses_3(self):
    """
    The text matched for each title inside square brackets should be the path of a live
    page, and the request should end at that title's canonical URL.
    """
    st3 = '<p>[שיר השירים א ירושלמי כתובות (דף כח:) בשורות א]'
    titles = ['ירושלמי כתובות', 'שיר השירים']
    expected_urls = {
        'שיר השירים': 'https://www.sefaria.org.il/Song_of_Songs.1',
        'ירושלמי כתובות': 'https://www.sefaria.org.il/Jerusalem_Talmud_Ketubot.28b',
    }
    for title in titles:
        lang = "he" if is_hebrew(title) else "en"
        res = m.library.get_regex_string(title, lang, for_js=True, anchored=False, capture_title=False, parentheses=True)
        res_no_comments = re.compile(res, re.VERBOSE)
        match = res_no_comments.search(st3)
        match_string = 'no match' if not match else match.group()
        resp = requests.get("https://www.sefaria.org.il/{}".format(match_string))
        assert resp.status_code == 200
        print(resp.url)
        # The original `assert resp.url == A if cond else B ...` parsed as
        # `assert (resp.url == A) if cond else B`, so for the second title it only asserted
        # a non-empty string and could never fail. Compare against the expected URL explicitly.
        assert resp.url == expected_urls.get(title, '')
def get_word_form_objects(cls, input_word, lookup_key='form', **kwargs):
    """
    Query the word forms matching `input_word`.

    :param input_word: the word to look up; Hebrew input has its cantillation stripped
    :param lookup_key: word-form field to query; replaced by 'c_form' for Hebrew
        input on which `has_cantillation(..., detect_vowels=True)` finds nothing
    :param kwargs: optional `lookup_ref` narrows the query to forms whose refs start with
        that ref; the restriction is dropped when it yields no forms
    :return: the resulting WordFormSet
    """
    from sefaria.model import Ref

    lookup_ref = kwargs.get("lookup_ref", None)
    wform_pkey = lookup_key
    if is_hebrew(input_word):
        # This step technically used to happen in the lookup main method `lexicon_lookup` if there were no initial results, but in case where a
        # consonantal form was supplied in the first place, this optimizes queries.
        input_word = strip_cantillation(input_word)
        if not has_cantillation(input_word, detect_vowels=True):
            wform_pkey = 'c_form'

    query_obj = {wform_pkey: input_word}
    if lookup_ref:
        normal_ref = Ref(lookup_ref).normal()
        query_obj["refs"] = {'$regex': '^{}'.format(normal_ref)}

    forms = WordFormSet(query_obj)
    if lookup_ref and len(forms) == 0:
        # Nothing found under the ref restriction: retry without it.
        del query_obj["refs"]
        forms = WordFormSet(query_obj)
    return forms
def test_regex_string_he_in_parentheses_1(self):
    """
    Inside parentheses, the first title should match and lead to a live page; the second
    should not match, so the placeholder path 'no match' is requested and should give 404.
    """
    st3 = '(בדברים לב ובספרות ג ב)'
    titles = ['דברים', 'רות']
    for title in titles:
        lang = "he" if is_hebrew(title) else "en"
        res = m.library.get_regex_string(title, lang, for_js=True, anchored=False, capture_title=False, parentheses=True)
        # Drop the verbose-mode comments and whitespace so the pattern can be used without
        # re.VERBOSE. Backslashes are doubled to avoid invalid-escape warnings; the pattern
        # strings are unchanged.
        res_no_comments = re.sub('\\s+', '', re.sub('\\s*?#.*?\n', '', res))
        match = re.search(res_no_comments, st3)
        match_string = 'no match' if not match else match.group()
        resp = requests.get("https://www.sefaria.org.il/{}".format(match_string))

        # The original `assert x == a if cond else b` parsed as `assert (x == a) if cond else b`,
        # so for the non-matching title it only asserted a truthy constant and could never
        # fail. Compute the expectation first, then compare.
        title_is_cited = title == 'דברים'
        expected_status = 200 if title_is_cited else 404
        assert resp.status_code == expected_status
        print(resp.url)
        if title_is_cited:
            expected_url = 'https://www.sefaria.org.il/Deuteronomy.32'
        else:
            expected_url = 'https://www.sefaria.org.il/no%20match'
        assert resp.url == expected_url
def primary_name(self, lang):
    """
    Return `[self.name]` when the name's language matches `lang`, else an empty list.

    The name counts as Hebrew when `hebrew.is_hebrew` accepts it; `lang` counts as
    Hebrew only when it equals "he".
    """
    name_is_hebrew = hebrew.is_hebrew(self.name)
    wants_hebrew = lang == "he"
    if name_is_hebrew == wants_hebrew:
        return [self.name]
    return []
"""
Ensure that Hebrew and English Title variants are in the correct field.

For every index, the entries of `titleVariants` and `heTitleVariants` are pooled and
re-sorted with `is_hebrew`: Hebrew variants go to `heTitleVariants`, all others to
`titleVariants`. Indexes whose English variants change are printed.
"""
from sefaria.model import *
from sefaria.utils.hebrew import is_hebrew

indices = IndexSet({})
for index in indices:
    en = []
    he = []
    # heTitleVariants may be absent on an index, hence the getattr default
    variants = index.titleVariants + getattr(index, "heTitleVariants", [])
    for variant in variants:
        if is_hebrew(variant):
            he.append(variant)
        else:
            en.append(variant)
    if set(index.titleVariants) != set(en):
        # Report the index title, its current variants, and the corrected English list
        print index.title
        print index.titleVariants
        print en
        # NOTE(review): the flattened source does not show whether the three statements
        # below sit inside this `if` or run for every index; they are kept inside so that
        # only changed indexes are rewritten - confirm.
        # Passing through set() de-duplicates the variants and does not preserve their order.
        index.titleVariants = list(set(en))
        index.heTitleVariants = list(set(he))
        index.save()
def get_refs_in_string(st): """ Returns a list of valid refs found within text. """ lang = 'he' if is_hebrew(st) else 'en' titles = model.get_titles_in_string(st, lang) if not titles: return [] if lang == "en": reg = "\\b(?P<ref>" reg += "(" + "|".join([re.escape(title) for title in titles]) + ")" reg += " \d+([ab])?([ .:]\d+)?([ .:]\d+)?(-\d+([ab])?([ .:]\d+)?)?" + ")\\b" reg = re.compile(reg) elif lang == "he": title_string = "|".join([re.escape(t) for t in titles]) #Hebrew Unicode page: http://www.unicode.org/charts/PDF/U0590.pdf #todo: handle Ayin before Resh cases. #todo: This doesn't do ranges. Do we see those in the wild? #todo: verify that open and closing parens are of the same type, so as not to fooled by (} or {) reg = ur"""(?<= # look behind for opening brace [({{] # literal '(', brace, [^}})]* # anything but a closing ) or brace ) (?P<ref> # Capture the whole match as 'ref' ({0}) # Any one book title, (Inserted with format(), below) \s+ # a space (\u05d3[\u05e3\u05e4\u05f3']\s+)? # Daf, spelled with peh, peh sofit, geresh, or single quote (?:\u05e4(?:"|\u05f4|'')?)? # Peh (for 'perek') maybe followed by a quote of some sort (?P<num1> # the first number (1 of 3 styles, below) (?=\p{{Hebrew}}+(?:"|\u05f4|'')\p{{Hebrew}}) # (2: ") Lookahead: At least one letter, followed by double-quote, two single quotes, or gershayim, followed by one letter \u05ea*(?:"|\u05f4|'')? # Many Tavs (400), maybe dbl quote [\u05e7-\u05ea]?(?:"|\u05f4|'')? # One or zero kuf-tav (100-400), maybe dbl quote [\u05d8-\u05e6]?(?:"|\u05f4|'')? # One or zero tet-tzaddi (9-90), maybe dbl quote [\u05d0-\u05d8]? # One or zero alef-tet (1-9) # |(?=\p{{Hebrew}}) # (3: no punc) Lookahead: at least one Hebrew letter \u05ea* # Many Tavs (400) [\u05e7-\u05ea]? # One or zero kuf-tav (100-400) [\u05d8-\u05e6]? # One or zero tet-tzaddi (9-90) [\u05d0-\u05d8]? 
# One or zero alef-tet (1-9) |\p{{Hebrew}}['\u05f3] # (1: ') single letter, followed by a single quote or geresh )\s* # end of the num1 group, maybe space [.:]? # maybe a . for gemara refs or a : for tanach or gemara refs [,\s]* # maybe a comma, maybe a space, maybe both (?: (?:\u05de\u05e9\u05e0\u05d4\s) # Mishna spelled out, with a space after |(?:\u05de(?:"|\u05f4|'')?) # or Mem (for 'mishna') maybe followed by a quote of some sort )? (?P<num2> # second number - optional (?=\p{{Hebrew}}+(?:"|\u05f4|'')\p{{Hebrew}}) # (2: ") Lookahead: At least one letter, followed by double-quote, two single quotes, or gershayim, followed by one letter \u05ea*(?:"|\u05f4|'')? # Many Tavs (400), maybe dbl quote [\u05e7-\u05ea]?(?:"|\u05f4|'')? # One or zero kuf-tav (100-400), maybe dbl quote [\u05d8-\u05e6]?(?:"|\u05f4|'')? # One or zero tet-tzaddi (9-90), maybe dbl quote [\u05d0-\u05d8]? # One or zero alef-tet (1-9) # |(?=\p{{Hebrew}}) # (3: no punc) Lookahead: at least one Hebrew letter \u05ea* # Many Tavs (400) [\u05e7-\u05ea]? # One or zero kuf-tav (100-400) [\u05d8-\u05e6]? # One or zero tet-tzaddi (9-90) [\u05d0-\u05d8]? # One or zero alef-tet (1-9) |\p{{Hebrew}}['\u05f3] # (1: ') single letter, followed by a single quote or geresh )?[.:]? # end of the num2 group, maybe a . or : for gemara refs ) # end of ref capture (?= # look ahead for closing brace [^({{]* # match of anything but an opening '(' or brace [)}}] # zero-width: literal ')' or brace ) """.format(title_string) reg = regex.compile(reg, regex.VERBOSE) matches = reg.findall(st) refs = [match[0] for match in matches] if len(refs) > 0: for ref in refs: logger.debug("get_refs_in_text: " + ref) return refs
def test_is_hebrew(self):
    """A single Hebrew letter is recognised as Hebrew."""
    hebrew_letter = u"ג"
    assert h.is_hebrew(hebrew_letter)