예제 #1
0
def create_bomberg():
    """
    Rebuild the manuscript pages of the Bomberg pressing from ``Bomberg_map.csv``.

    Creates the manuscript record from ``MANUSCRIPT_DATA['Bomberg Pressing']``,
    deletes every page currently stored under its slug, then walks the rows of
    the csv map. Each row supplies a ``Tractate``, a ``Number`` and the first
    and last image numbers (``FirstDaf`` / ``lastDaf``); consecutive image
    numbers are assigned to the tractate's subrefs, starting at ``FirstDaf``.

    Images that are not found under ``./Bomberg/bomberg_original`` are recorded
    and skipped. A tractate whose final image number does not equal ``lastDaf``
    is recorded as well. Both lists are summarised on stdout at the end.

    :return: None
    """
    manuscript_data = MANUSCRIPT_DATA['Bomberg Pressing']
    manuscript_title = manuscript_data['title']
    create_manuscript(manuscript_data)
    missing_files, tractates_to_check = [], []

    # The csv module asks for newline='' so that it can do its own newline
    # handling (e.g. for newlines embedded in quoted fields).
    with open('Bomberg_map.csv', newline='') as fp:
        map_rows = list(csv.DictReader(fp))

    url_prefix = 'https://storage.googleapis.com/sefaria-manuscripts/bomberg'
    slug = manuscript.ManuscriptPage.get_slug_for_title(manuscript_title)
    # Start from a clean slate: drop every page already stored for this slug.
    manuscript.ManuscriptPageSet({'manuscript_slug': slug}).delete()

    for row in map_rows:
        try:
            first, last = int(row['FirstDaf']), int(row['lastDaf'])
        except ValueError:
            # Rows without numeric bounds cannot be mapped to images.
            continue

        # [2:] skips the first two subrefs (original note: "subrefs will add
        # daf 1a and 2b").
        sections = Ref(row['Tractate']).all_subrefs()[2:]
        for offset, section in enumerate(sections):
            page_id = section.normal()
            filename = f'masekhet_{row["Number"].zfill(2)}_{str(first + offset).zfill(4)}.jpg'
            if not os.path.exists(
                    os.path.join('./Bomberg/bomberg_original', filename)):
                # The image number is still consumed even though no page is saved.
                missing_files.append((page_id, filename))
                continue

            data = {'manuscript_slug': slug, 'page_id': page_id}
            page_obj = manuscript.ManuscriptPage().load(data)
            if page_obj is None:
                page_obj = manuscript.ManuscriptPage().load_from_dict(data)
            page_obj.image_url = f'{url_prefix}/{filename}'
            page_obj.thumbnail_url = f'{url_prefix}/{filename.replace(".jpg", "_thumbnail.jpg")}'
            if hasattr(page_obj, 'contained_refs'):
                # Reset the refs of a pre-existing page before adding the new one.
                page_obj.contained_refs = []
                page_obj.set_expanded_refs()
            page_obj.add_ref(page_id)
            page_obj.save()

        # One image number is consumed per section, so the last number used is
        # first + len(sections) - 1; it should line up with the csv's lastDaf.
        if first + len(sections) - 1 != last:
            tractates_to_check.append(row['Tractate'])

    print(f'number of weird tractates is {len(tractates_to_check)}')
    for t in tractates_to_check:
        print(t)
    print(f'number of missing files is {len(missing_files)}')
    for m in missing_files[:10]:
        print(m)
예제 #2
0
def create_dummy_munich():
    """
    Return the placeholder page of ``munich-manuscript-95``, creating it if needed.

    Looks up the page whose id is ``dummy_manuscript``. When the lookup yields
    a page it is returned untouched; otherwise a new page with placeholder
    image urls is saved and returned.

    :return: the existing or newly saved ManuscriptPage
    """
    slug = "munich-manuscript-95"
    page_id = "dummy_manuscript"

    existing = manuscript.ManuscriptPage().load({
        'manuscript_slug': slug,
        "page_id": page_id,
    })
    if existing is None:
        placeholder = manuscript.ManuscriptPage()
        placeholder.manuscript_slug = slug
        placeholder.page_id = page_id
        # The urls are dummies; only the record itself is needed.
        placeholder.image_url = "foo"
        placeholder.thumbnail_url = 'foo'
        placeholder.save()
        return placeholder
    return existing
예제 #3
0
def check_missing(title_regex):
    """
    Report the Bavli segments that have no page in a given manuscript.

    The manuscript is looked up by a ``$regex`` query on its title. Every
    segment ref of every index in the 'Bavli' category (visited in
    ``index.order``) is then checked: a segment counts as missing when no
    ManuscriptPage of that manuscript lists it in ``expanded_refs``. The
    missing refs are collapsed with ``find_ranges`` and a summary is printed.

    :param title_regex: regular expression matched against the manuscript title
    :return: None
    """
    manu_meta = manuscript.Manuscript().load({'title': {'$regex': title_regex}})

    tractates = library.get_indexes_in_category('Bavli')
    tractates.sort(key=lambda name: Ref(name).index.order)

    absent_refs = []
    segment_count = 0
    for tractate in tractates:
        print(tractate)
        segment_refs = Ref(tractate).all_segment_refs()
        segment_count += len(segment_refs)
        for segment in segment_refs:
            normal_ref = segment.normal()
            query = {
                'manuscript_slug': manu_meta.slug,
                'expanded_refs': normal_ref,
            }
            # One lookup per segment: a falsy result means no page covers it.
            if not manuscript.ManuscriptPage().load(query):
                absent_refs.append(normal_ref)

    as_ranges = find_ranges(absent_refs)
    print(f'{len(as_ranges)} ranges of missing refs')
    print(f'{len(absent_refs)} total missing segments')
    print(f'{segment_count} segments analyzed')
    print(*as_ranges, sep='\n')
예제 #4
0
def create_vilna(file_directory='/home/jonathan/sefaria/Sefaria-Data/sources/NLI/Romm/full_size'):
    """
    Rebuild the manuscript pages of the Vilna pressing from a directory of images.

    Creates the manuscript record from ``MANUSCRIPT_DATA['Vilna Pressing']``,
    deletes every page currently stored under its slug, then saves one
    ManuscriptPage per ``.jpg`` file found in ``file_directory``. A filename
    with its extension removed and underscores turned into spaces must be a
    valid ref; files for which it is not are reported on stdout and skipped.

    :param file_directory: directory scanned for ``.jpg`` images. Defaults to
        the path that was previously hard-coded, so existing calls behave the
        same.
    :return: None
    """
    manuscript_data = MANUSCRIPT_DATA['Vilna Pressing']
    manuscript_title = manuscript_data['title']
    create_manuscript(manuscript_data)

    url_prefix = 'https://storage.googleapis.com/sefaria-manuscripts/vilna-romm'
    slug = manuscript.ManuscriptPage.get_slug_for_title(manuscript_title)
    # Start from a clean slate: drop every page already stored for this slug.
    manuscript.ManuscriptPageSet({'manuscript_slug': slug}).delete()

    filenames = [
        name for name in os.listdir(file_directory) if name.endswith('.jpg')
    ]
    for i, filename in enumerate(filenames, 1):
        if i % 100 == 0:
            # Progress report every 100 files.
            print(f'{i}/{len(filenames)}')

        tref = filename.replace('.jpg', '').replace('_', ' ')
        if not Ref.is_ref(tref):
            print(f'bad filename for {filename}')
            continue

        data = {
            'manuscript_slug': slug,
            'page_id': Ref(tref).normal(),
            'image_url': f'{url_prefix}/{filename}',
            'thumbnail_url':
            f'{url_prefix}/{filename.replace(".jpg", "_thumbnail.jpg")}',
        }
        page_obj = manuscript.ManuscriptPage().load_from_dict(data)
        page_obj.add_ref(tref)
        page_obj.save()
예제 #5
0
def create_leningrad():
    """
    Create or refresh the manuscript pages of the Leningrad Codex.

    Creates the manuscript record from ``MANUSCRIPT_DATA['Leningrad Codex']``
    and reads ``Leningrad_map.json``, a mapping of image filename to a
    ``'; '``-separated list of refs. For each entry the matching page is
    loaded (or built when the lookup finds nothing), its refs are reset and
    replaced with the refs from the map, its image urls are pointed at the
    renamed ``BIB_LENCDX_F<number><A|B>.jpg`` file, and it is saved.

    Filenames that do not contain ``_<digits><v|r>`` are reported on stdout
    and skipped.

    :return: None
    """
    title = 'Leningrad Codex'
    create_manuscript(MANUSCRIPT_DATA[title])

    with open('Leningrad_map.json', encoding='utf-8') as fp:
        leningrad = json.load(fp)

    url_prefix = 'https://storage.googleapis.com/sefaria-manuscripts/leningrad-color'
    # Loop invariants: the slug and the filename pattern never change per page.
    slug = manuscript.ManuscriptPage.get_slug_for_title(title)
    filename_pattern = re.compile(r'_([0-9]+)([vr])')

    for i, (page, ref_range) in enumerate(leningrad.items()):
        if i % 25 == 0:
            # Progress report every 25 pages.
            print(f'{i} / {len(leningrad)}')

        # Validate the filename first so an unusable entry costs no page lookup.
        file_conversion_data = filename_pattern.search(page)
        if not file_conversion_data:
            print(f'weird filename: {page}')
            continue
        number, side = file_conversion_data.group(1), file_conversion_data.group(2)

        data = {
            'manuscript_slug': slug,
            'page_id': page.replace(".jpg", ""),
        }
        page_obj = manuscript.ManuscriptPage().load(data)
        if not page_obj:
            page_obj = manuscript.ManuscriptPage().load_from_dict(data)
        # Drop whatever refs the page carried before re-adding them below.
        page_obj.contained_refs = []
        page_obj.set_expanded_refs()

        # 'v' maps to the 'B' suffix and anything else ('r') to 'A'.
        new_filename = f'BIB_LENCDX_F{number.zfill(3)}{"B" if side == "v" else "A"}.jpg'
        page_obj.image_url = f'{url_prefix}/{new_filename}'
        page_obj.thumbnail_url = page_obj.image_url.replace(
            '.jpg', '_thumbnail.jpg')

        for tref in ref_range.split('; '):
            page_obj.add_ref(tref)
        page_obj.save()
예제 #6
0
def create_kaufmann_page(page_json):
    """
    Build and save one page of the Kaufmann Manuscript.

    :param page_json: mapping that provides ``image_content`` (must match
        ``<page id>-large.jpg``), ``image_url`` and ``expanded_refs``
    :return: None
    """
    slug = manuscript.ManuscriptPage.get_slug_for_title('Kaufmann Manuscript')

    # The page id is the image name up to the "-large.jpg" suffix.
    page_id = re.match(r'^(.*)-large\.jpg', page_json['image_content']).group(1)

    image_url = page_json['image_url']
    thumbnail_url = re.sub(r'large\.jpg$', 'large_thumbnail.jpg', image_url)

    # contained_refs is the range form of the expanded refs.
    contained_refs = []
    for ref_range in find_ranges(page_json['expanded_refs']):
        contained_refs.append(ref_range.normal())

    page = manuscript.ManuscriptPage().load_from_dict({
        'manuscript_slug': slug,
        'page_id': page_id,
        'image_url': image_url,
        'thumbnail_url': thumbnail_url,
        'contained_refs': contained_refs,
        'expanded_refs': page_json['expanded_refs'],
    })
    page.save()
예제 #7
0
def create_munich():
    """
    Create the Munich manuscripts and one ManuscriptPage per mapped image.

    Reads ``munich_filemap.json`` (a list of page records, re-keyed here by
    each record's ``image_file``) and walks the ``tref -> url_list`` mapping
    returned by ``get_rows_from_db()``. For each source url the manuscript
    number, the page number and the storage filename are derived; the page's
    ref comes from the filemap record's ``full_ref``.

    Filenames that are absent from the filemap are collected and printed at
    the end rather than aborting the run.

    :return: None
    :raises DuplicateKeyError: re-raised from ``save()`` after every page whose
        slug matches ``munich.*`` has been deleted.
    """
    def get_page_for_manuscript(number, page_url):
        """Pull the page number out of a source url."""
        if number == 6:
            # Manuscript 6 carries its page number in the 'seite' query parameter.
            query = parse.parse_qs(parse.urlparse(page_url).query)
            return query['seite'][0]
        return re.search(r'image_([0-9]+)$', page_url).group(1)

    with open('munich_filemap.json', encoding='utf-8') as fp:
        munich_data_list = json.load(fp)
    url_prefix = 'https://storage.googleapis.com/sefaria-manuscripts/munich-manuscript'
    munich_filemap = {page['image_file']: page for page in munich_data_list}
    # Debug aid: show one sample key (None when the filemap is empty).
    print(next(iter(munich_filemap), None))

    # Number embedded in the source url -> Munich manuscript number.
    manuscript_mapper = {
        6569: 6,
        3409: 95,
        6568: 140,
        6547: 141,
    }
    base_manuscript_json = MANUSCRIPT_DATA['Munich Manuscript']
    for n in manuscript_mapper.values():
        manuscript_title = f'Munich Manuscript {n}'
        # Work on a copy so the shared MANUSCRIPT_DATA entry is not mutated.
        manuscript_json = dict(
            base_manuscript_json,
            title=manuscript_title,
            he_title=f'כתב יד מינכן {n}',
        )
        # NOTE(review): other callers in this file pass only the data dict to
        # create_manuscript - confirm which signature is current.
        create_manuscript(manuscript_title, manuscript_json)

    # Compiled once; the dots of the host name are escaped so they match literally.
    host_pattern = re.compile(
        r'^http://daten\.digitale-sammlungen\.de/([^/]+)/')
    trailing_number_pattern = re.compile(r'0*([0-9]+)$')

    unmapped_files = []
    for item, (tref, url_list) in enumerate(get_rows_from_db().items()):
        if item % 200 == 0:
            # Progress report every 200 refs.
            print(item)
        for i, url in enumerate(url_list):
            # The first image of a ref has a plain name; later ones get an
            # "(I)", "(II)", ... suffix.
            filename = f'munich_images/{tref}.jpg' if i == 0 else f'munich_images/{tref}({"I"*i}).jpg'
            manuscript_hint = host_pattern.match(url).group(1)
            manuscript_number = manuscript_mapper[int(
                trailing_number_pattern.search(manuscript_hint).group(1))]
            slug = manuscript.ManuscriptPage.get_slug_for_title(
                f'Munich Manuscript {manuscript_number}')
            storage_url = filename.replace("munich_images", url_prefix)
            try:
                page_json = munich_filemap[filename]
            except KeyError:
                unmapped_files.append(filename)
                continue
            data = {
                'manuscript_slug': slug,
                'page_id':
                f'Cod. hebr. {manuscript_number} pg. {get_page_for_manuscript(manuscript_number, url)}',
                'image_url': storage_url,
                'thumbnail_url': storage_url.replace('.jpg', '_thumbnail.jpg'),
            }
            page_obj = manuscript.ManuscriptPage().load_from_dict(data)
            page_obj.add_ref(page_json['full_ref'])
            try:
                page_obj.save()
            except DuplicateKeyError as e:
                # A duplicate leaves the import half done: remove every Munich
                # page before propagating the error.
                print(e, 'cleaning up', sep='\n')
                manuscript.ManuscriptPageSet({
                    'manuscript_slug': {
                        '$regex': 'munich.*'
                    }
                }).delete()
                raise
    print(f'number of weird cases is: {len(unmapped_files)}', *unmapped_files, sep='\n')
예제 #8
0
                         thumbnail_directory)
        normal_sec = section.normal()

        data = {
            'manuscript_slug':
            bomberg_man.slug,
            'page_id':
            normal_sec,
            'image_url':
            get_url_for_file(filename, 'bomberg'),
            'thumbnail_url':
            get_url_for_file(filename.replace('.jpg', '_thumbnail.jpg'),
                             'bomberg'),
            'contained_refs': [normal_sec]
        }
        mpage = manuscript.ManuscriptPage().load_from_dict(data)
        mpage.set_expanded_refs()
        mpage.save()
"""
The following image is actually the last image in Berakhot:
https://storage.googleapis.com/sefaria-manuscripts/munich-manuscript/Eruvin 72b-75a.jpg

manuscript attrs:
- title:
- he_title
- source
- description
- he description

page attrs:
 - manuscript_slug