def create_bomberg():
    """Load the Bomberg printing into the manuscript-pages collection.

    Reads ``Bomberg_map.csv`` (columns: Tractate, Number, FirstDaf, lastDaf),
    wipes any existing pages for the Bomberg slug, then creates one
    ManuscriptPage per daf pointing at the corresponding image file under
    the sefaria-manuscripts bucket. Afterwards prints tractates whose daf
    count didn't match the map and the first few missing image files.
    """
    manuscript_data = MANUSCRIPT_DATA['Bomberg Pressing']
    manuscript_title = manuscript_data['title']
    create_manuscript(manuscript_data)

    missing_files, tractates_to_check = [], []
    with open('Bomberg_map.csv') as fp:
        map_rows = list(csv.DictReader(fp))

    url_prefix = 'https://storage.googleapis.com/sefaria-manuscripts/bomberg'
    slug = manuscript.ManuscriptPage.get_slug_for_title(manuscript_title)
    ms = manuscript.ManuscriptPageSet({'manuscript_slug': slug})
    ms.delete()  # start clean: drop any previously loaded pages for this slug

    for row in map_rows:
        try:
            current, first, last = 0, int(row['FirstDaf']), int(row['lastDaf'])
        except ValueError:
            # Row without numeric daf bounds (e.g. header/placeholder row).
            continue
        for section in Ref(row['Tractate']).all_subrefs()[2:]:  # subrefs will add daf 1a and 2b
            page_id = section.normal()
            filename = f'masekhet_{row["Number"].zfill(2)}_{str(first + current).zfill(4)}.jpg'
            if not os.path.exists(os.path.join('./Bomberg/bomberg_original', filename)):
                missing_files.append((page_id, filename))
                current += 1
                continue
            data = {'manuscript_slug': slug, 'page_id': page_id}
            page_obj = manuscript.ManuscriptPage().load(data)
            if page_obj is None:
                page_obj = manuscript.ManuscriptPage().load_from_dict(data)
            # BUG FIX: image_url previously pointed at the literal string
            # '(unknown)'; it must reference the actual image file, matching
            # the thumbnail_url pattern on the next line.
            page_obj.image_url = f'{url_prefix}/{filename}'
            page_obj.thumbnail_url = f'{url_prefix}/{filename.replace(".jpg", "_thumbnail.jpg")}'
            if hasattr(page_obj, 'contained_refs'):
                # Reset stale refs on a previously loaded page before re-adding.
                # NOTE(review): grouping reconstructed from flattened source —
                # confirm set_expanded_refs() belongs inside this guard.
                page_obj.contained_refs = []
                page_obj.set_expanded_refs()
            page_obj.add_ref(page_id)
            page_obj.save()
            current += 1
        if current + first - 1 != last:
            # Page counter didn't land on the mapped last daf; flag for review.
            tractates_to_check.append(row['Tractate'])

    print(f'number of weird tractates is {len(tractates_to_check)}')
    for t in tractates_to_check:
        print(t)
    print(f'number of missing files is {len(missing_files)}')
    for m in missing_files[:10]:
        print(m)
def create_dummy_munich():
    """Return the placeholder Munich-95 page, creating it if it doesn't exist.

    The dummy page carries throwaway 'foo' URLs; it exists only so a record
    with page_id 'dummy_manuscript' is present for the munich-manuscript-95 slug.
    """
    query = {
        'manuscript_slug': "munich-manuscript-95",
        "page_id": "dummy_manuscript",
    }
    existing = manuscript.ManuscriptPage().load(query)
    if existing is not None:
        return existing

    dummy = manuscript.ManuscriptPage()
    dummy.manuscript_slug = "munich-manuscript-95"
    dummy.page_id = "dummy_manuscript"
    dummy.image_url = "foo"
    dummy.thumbnail_url = 'foo'
    dummy.save()
    return dummy
def check_missing(title_regex):
    """Print Bavli segments that have no page in the manuscript matching *title_regex*.

    Walks every tractate in canonical order, probes the ManuscriptPage
    collection by expanded ref, then prints the missing segments collapsed
    into ranges plus summary counts.
    """
    manu_meta = manuscript.Manuscript().load({'title': {'$regex': title_regex}})
    tractates = library.get_indexes_in_category('Bavli')
    tractates.sort(key=lambda t: Ref(t).index.order)

    missing, total = [], 0
    for tractate in tractates:
        print(tractate)
        segments = Ref(tractate).all_segment_refs()
        total += len(segments)
        for segment in segments:
            tref = segment.normal()
            page = manuscript.ManuscriptPage().load({
                'manuscript_slug': manu_meta.slug,
                'expanded_refs': tref,
            })
            if not page:
                missing.append(tref)

    as_ranges = find_ranges(missing)
    print(f'{len(as_ranges)} ranges of missing refs')
    print(f'{len(missing)} total missing segments')
    print(f'{total} segments analyzed')
    print(*as_ranges, sep='\n')
def create_vilna():
    """Load the Vilna (Romm) printing from local jpg files named by ref.

    Each file in the source directory is named ``<tref with underscores>.jpg``;
    the tref is recovered from the filename and a ManuscriptPage is saved per
    image. Existing pages for the slug are wiped first.
    """
    manuscript_data = MANUSCRIPT_DATA['Vilna Pressing']
    manuscript_title = manuscript_data['title']
    create_manuscript(manuscript_data)

    url_prefix = 'https://storage.googleapis.com/sefaria-manuscripts/vilna-romm'
    slug = manuscript.ManuscriptPage.get_slug_for_title(manuscript_title)
    ms = manuscript.ManuscriptPageSet({'manuscript_slug': slug})
    ms.delete()  # start clean before re-importing

    # NOTE: hard-coded to a specific local checkout of Sefaria-Data.
    file_directory = '/home/jonathan/sefaria/Sefaria-Data/sources/NLI/Romm/full_size'
    filenames = [f for f in os.listdir(file_directory) if f.endswith('.jpg')]

    for idx, fname in enumerate(filenames, 1):
        if idx % 100 == 0:
            print(f'{idx}/{len(filenames)}')
        tref = fname.replace('.jpg', '').replace('_', ' ')
        if not Ref.is_ref(tref):
            print(f'bad filename for {fname}')
            continue
        page_data = {
            'manuscript_slug': slug,
            'page_id': Ref(tref).normal(),
            'image_url': f'{url_prefix}/{fname}',
            'thumbnail_url': f'{url_prefix}/{fname.replace(".jpg", "_thumbnail.jpg")}',
        }
        page_obj = manuscript.ManuscriptPage().load_from_dict(page_data)
        page_obj.add_ref(tref)
        page_obj.save()
def create_leningrad():
    """Populate ManuscriptPage records for the Leningrad Codex.

    ``Leningrad_map.json`` maps an original page filename (containing a folio
    number plus an r/v side marker) to a '; '-separated list of ref ranges.
    Filenames are translated to the BIB_LENCDX naming scheme used in the
    storage bucket; filenames that don't match the expected pattern are
    reported and skipped.
    """
    title = 'Leningrad Codex'
    create_manuscript(MANUSCRIPT_DATA[title])

    with open('Leningrad_map.json') as fp:
        page_map = json.load(fp)
    url_prefix = 'https://storage.googleapis.com/sefaria-manuscripts/leningrad-color'

    for count, (page, ref_range) in enumerate(page_map.items()):
        if count % 25 == 0:
            print(f'{count} / {len(page_map)}')
        page_data = {
            'manuscript_slug': manuscript.ManuscriptPage.get_slug_for_title(title),
            'page_id': page.replace(".jpg", ""),
        }
        page_obj = manuscript.ManuscriptPage().load(page_data)
        if not page_obj:
            page_obj = manuscript.ManuscriptPage().load_from_dict(page_data)
        # Reset refs so re-runs don't accumulate duplicates.
        page_obj.contained_refs = []
        page_obj.set_expanded_refs()

        match = re.search(r'_([0-9]+)([vr])', page)
        if not match:
            print(f'weird filename: {page}')
            continue
        number, side = match.group(1), match.group(2)
        # verso -> B, recto -> A in the bucket's naming scheme
        new_filename = f'BIB_LENCDX_F{number.zfill(3)}{"B" if side == "v" else "A"}.jpg'
        page_obj.image_url = f'{url_prefix}/{new_filename}'
        page_obj.thumbnail_url = f'{url_prefix}/{new_filename}'.replace(
            '.jpg', '_thumbnail.jpg')
        for tref in ref_range.split('; '):
            page_obj.add_ref(tref)
        page_obj.save()
def create_kaufmann_page(page_json):
    """Create and save a single Kaufmann Manuscript page from its JSON record.

    Expects *page_json* to carry 'image_content' (``<page_id>-large.jpg``),
    'image_url', and a list of 'expanded_refs'; contained refs are derived by
    collapsing the expanded refs into ranges.
    """
    base_id = re.match(r'^(.*)-large\.jpg', page_json['image_content']).group(1)
    thumbnail = re.sub(r'large\.jpg$', 'large_thumbnail.jpg', page_json['image_url'])
    contained = [r.normal() for r in find_ranges(page_json['expanded_refs'])]
    record = {
        'manuscript_slug':
        manuscript.ManuscriptPage.get_slug_for_title('Kaufmann Manuscript'),
        'page_id': base_id,
        'image_url': page_json['image_url'],
        'thumbnail_url': thumbnail,
        'contained_refs': contained,
        'expanded_refs': page_json['expanded_refs'],
    }
    page = manuscript.ManuscriptPage().load_from_dict(record)
    page.save()
def create_munich():
    """
    keep logic for url derivation
    convert munich_filemap from a list to dict with <image_file> as key
    html_map = get_rows_from_db()
    get tref, url_list from html_map
    for each url, derive manuscript, image_number & filename from filename,
    load the page data from the json
    :return:
    """
    def get_page_for_manuscript(number, page_url):
        # Manuscript 6 encodes the page in the 'seite' query parameter;
        # the other manuscripts end their url with image_<n>.
        if number == 6:
            parsed = parse.urlparse(page_url)
            query = parse.parse_qs(parsed.query)
            return query['seite'][0]
        return re.search('image_([0-9]+)$', page_url).group(1)

    with open('munich_filemap.json') as fp:
        munich_data_list = json.load(fp)
    url_prefix = 'https://storage.googleapis.com/sefaria-manuscripts/munich-manuscript'
    munich_filemap = {page['image_file']: page for page in munich_data_list}
    print(next(iter(munich_filemap.keys())))

    # Maps the digitale-sammlungen collection id to the Cod. hebr. number.
    manuscript_mapper = {
        6569: 6,
        3409: 95,
        6568: 140,
        6547: 141,
    }
    manuscript_json = MANUSCRIPT_DATA['Munich Manuscript']
    for n in manuscript_mapper.values():
        manuscript_json['title'] = f'Munich Manuscript {n}'
        manuscript_json['he_title'] = f'כתב יד מינכן {n}'
        create_manuscript(f'Munich Manuscript {n}', manuscript_json)

    bizzarre = []
    for item, (tref, url_list) in enumerate(get_rows_from_db().items()):
        if item % 200 == 0:
            print(item)
        for i, url in enumerate(url_list):
            # Repeated images for the same tref are disambiguated with (I), (II), ...
            if i == 0:
                filename = f'munich_images/{tref}.jpg'
            else:
                filename = f'munich_images/{tref}({"I"*i}).jpg'
            manuscript_hint = re.match(
                r'^http://daten.digitale-sammlungen.de/([^/]+)/', url).group(1)
            manuscript_number = int(
                re.search(r'0*([0-9]+)$', manuscript_hint).group(1))
            manuscript_number = manuscript_mapper[manuscript_number]
            slug = manuscript.ManuscriptPage.get_slug_for_title(
                f'Munich Manuscript {manuscript_number}')
            storage_url = filename.replace("munich_images", url_prefix)
            try:
                page_json = munich_filemap[filename]
            except KeyError:
                # No filemap entry for this derived filename; record and move on.
                bizzarre.append(filename)
                continue
            record = {
                'manuscript_slug': slug,
                'page_id': f'Cod. hebr. {manuscript_number} pg. {get_page_for_manuscript(manuscript_number, url)}',
                'image_url': storage_url,
                'thumbnail_url': storage_url.replace('.jpg', '_thumbnail.jpg'),
            }
            page_obj = manuscript.ManuscriptPage().load_from_dict(record)
            page_obj.add_ref(page_json['full_ref'])
            try:
                page_obj.save()
            except DuplicateKeyError as e:
                # Partial import is useless; wipe everything Munich and abort.
                print(e, 'cleaning up', sep='\n')
                manuscript.ManuscriptPageSet({
                    'manuscript_slug': {
                        '$regex': 'munich.*'
                    }
                }).delete()
                raise e
    print(f'number of weird cases is: {len(bizzarre)}', *bizzarre, sep='\n')
thumbnail_directory) normal_sec = section.normal() data = { 'manuscript_slug': bomberg_man.slug, 'page_id': normal_sec, 'image_url': get_url_for_file(filename, 'bomberg'), 'thumbnail_url': get_url_for_file(filename.replace('.jpg', '_thumbnail.jpg'), 'bomberg'), 'contained_refs': [normal_sec] } mpage = manuscript.ManuscriptPage().load_from_dict(data) mpage.set_expanded_refs() mpage.save() """ The following image is actually the last image in Berkahot: https://storage.googleapis.com/sefaria-manuscripts/munich-manuscript/Eruvin 72b-75a.jpg manuscript attrs: - title: - he_title - source - description - he description page attrs: - manuscript_slug