Example #1
    def get_mat_pub_dates(self, bib_object):
        # MARC 008/06 holds the type of date; 008/07-10 and 008/11-14 hold Date 1 and Date 2.
        # Placeholder characters ('u', blanks, 'X') are zero-filled before the int conversion.
        v_008 = get_values_by_field(bib_object, '008')[0]
        v_008_06 = v_008[6]
        v_008_0710 = v_008[7:11].replace('u', '0').replace(' ', '0').replace('X', '0')

        if v_008_06 in ['r', 's', 'p', 't']:
            try:
                self.mat_pub_date_single = int(v_008_0710)
            except ValueError:
                pass
        else:
            v_008_1114 = v_008[11:15].replace('u', '0').replace(' ', '0').replace('X', '0')
            try:
                self.mat_pub_date_from = int(v_008_0710)
                self.mat_pub_date_to = int(v_008_1114)
            except ValueError:
                pass
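The 008/07-10 and 008/11-14 date ranges in MARC records frequently contain placeholder characters ('u', blanks, 'X'); the method above zero-fills them before casting to int. A minimal standalone sketch of that normalization (the helper name is illustrative, not taken from the project):

def normalize_marc_year(raw_year):
    # Replace MARC date placeholders with '0' before converting, e.g. '198u' -> 1980.
    cleaned = raw_year.replace('u', '0').replace(' ', '0').replace('X', '0')
    try:
        return int(cleaned)
    except ValueError:
        return None

# normalize_marc_year('198u') -> 1980; normalize_marc_year('19uu') -> 1900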
Example #2
    def __init__(self, bib_object, work, manifestation, expression, buffer):

        # attributes for item_es_index
        self.mock_es_id = str(
            esid.POLONA_ITEM_PREFIX +
            to_single_value(get_values_by_field(bib_object, '001'))[1:])
        self.expression_ids = [str(expression.mock_es_id)]
        self.item_count = 1
        self.item_local_bib_id = str(
            to_single_value(get_values_by_field(bib_object, '001')))
        self.item_local_id = str(
            to_single_value(
                get_values_by_field_and_subfield(bib_object, ('856', ['u']))))
        self.item_mat_id = int(manifestation.mock_es_id)
        self.item_url = str(
            to_single_value(
                get_values_by_field_and_subfield(bib_object, ('856', ['u']))))
        self.item_work_id = int(work.mock_es_id)
        self.library = {
            'digital': True,
            'name': 'Polona.pl',
            'id': 10945
        }  # hardcoded - always the same
        self.metadata_original = str(uuid4())  # some random fake uuid
        self.metadata_source = 'REFERENCE'
        self.modification_time = '2019-10-11T17:45:21.527'  # fake time
        self.phrase_suggest = ['-']
        self.suggest = ['-']
        self.work_ids = [str(work.mock_es_id)]
        self.write_to_dump_file(buffer)
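The esid module used here and in the other examples appears to hold per-entity string prefixes that get prepended to the MARC 001 control number (minus its leading character) to build mock Elasticsearch ids. A sketch of what such a module might contain; the prefix values below are hypothetical placeholders, kept numeric so the int() casts elsewhere keep working:

# esid.py (hypothetical placeholder values - the real prefixes are not shown in these examples)
EXPRESSION_PREFIX = '2'
MANIFESTATION_PREFIX = '3'
BN_ITEM_PREFIX = '4'
POLONA_ITEM_PREFIX = '5'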
Example #3
def get_data_for_matching(manifestation):
    ldr_67 = manifestation.leader[6:8]
    val_008_0614 = get_values_by_field(manifestation,
                                       '008')[0][6:15].replace('+', ' ')
    isbn_020_az = get_values_by_field_and_subfield(manifestation,
                                                   ('020', ['a', 'z']))
    title_245 = get_values_by_field_and_subfield(manifestation,
                                                 ('245', ['a', 'b']))[0]
    field_245 = manifestation.get_fields('245')[0]
    title_245_no_offset = ' '.join(field_245.get_subfields('a', 'b'))[:25]
    title_245_with_offset = ' '.join(
        field_245.get_subfields('a', 'b'))[int(field_245.indicators[1]):25]
    titles_490 = get_values_by_field_and_subfield(manifestation,
                                                  ('490', ['a']))

    numbers_from_title_245 = ''.join(re.findall(r'\d', title_245))
    place_pub_260_a_first_word = get_values_by_field_and_subfield(
        manifestation, ('260', ['a']))[0].split()[0]
    num_of_pages_300_a = max(
        int(gr) for gr in re.findall(
            r'\d+',
            get_values_by_field_and_subfield(manifestation, ('300', ['a']))[0]))
    b_format = int(
        re.search(
            r'\d+',
            get_values_by_field_and_subfield(manifestation, ('300', ['c']))[0])[0])
    edition = postprocess(normalize_edition_for_matching,
                          get_values_by_field(manifestation, '250'))

    return ManifMatchData(
        ldr_67=ldr_67,
        val_008_0614=val_008_0614,
        isbn_020_az=isbn_020_az,
        title_245=title_245,
        title_245_no_offset=title_245_no_offset,
        title_245_with_offset=title_245_with_offset,
        titles_490=titles_490,
        numbers_from_title_245=numbers_from_title_245,
        place_pub_260_a_first_word=place_pub_260_a_first_word,
        num_of_pages_300_a=num_of_pages_300_a,
        b_format=b_format,
        edition=edition)
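ManifMatchData itself is not shown in these examples; a plausible minimal definition consistent with the keyword arguments above is a NamedTuple (an assumption - the real project may define it differently):

from typing import NamedTuple

class ManifMatchData(NamedTuple):
    # Container for the manifestation attributes consumed by the MAK+ matcher.
    ldr_67: str
    val_008_0614: str
    isbn_020_az: list
    title_245: str
    title_245_no_offset: str
    title_245_with_offset: str
    titles_490: list
    numbers_from_title_245: str
    place_pub_260_a_first_word: str
    num_of_pages_300_a: int
    b_format: int
    edition: list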
Example #4
def is_single_work(pymarc_object):
    # each and every record MUST have these fields; if it doesn't, it should be treated as invalid and skipped
    try:
        val_245a_last_char = get_values_by_field_and_subfield(
            pymarc_object, ('245', ['a']))[0][-1]
        val_245a = get_values_by_field_and_subfield(pymarc_object,
                                                    ('245', ['a']))[0]
        val_245c = get_values_by_field_and_subfield(pymarc_object,
                                                    ('245', ['c']))[0]
    except IndexError:
        logging.debug('Invalid record.')
        return False

    list_val_245b = get_values_by_field_and_subfield(pymarc_object,
                                                     ('245', ['b']))
    val_245b = list_val_245b[0] if list_val_245b else ''

    list_val_730 = get_values_by_field(pymarc_object, '730')
    list_val_501 = get_values_by_field(pymarc_object, '501')
    list_val_505 = get_values_by_field(pymarc_object, '505')
    list_val_740 = get_values_by_field(pymarc_object, '740')
    list_val_700t = get_values_by_field_and_subfield(pymarc_object,
                                                     ('700', ['t']))
    list_val_710t = get_values_by_field_and_subfield(pymarc_object,
                                                     ('710', ['t']))
    list_val_711t = get_values_by_field_and_subfield(pymarc_object,
                                                     ('711', ['t']))
    list_val_246i = get_values_by_field_and_subfield(pymarc_object,
                                                     ('246', ['i']))

    is_2_1_1_1 = (val_245a_last_char != ';' and ' ; ' not in val_245a
                  and ' ; ' not in val_245b and ' / ' not in val_245c)
    is_2_1_1_2 = not list_val_730 or (len(list_val_730) == 1
                                      and 'Katalog wystawy' in list_val_730[0])
    is_2_1_1_3 = not list_val_501 and not list_val_505 and not list_val_740
    is_2_1_1_4 = not list_val_700t and not list_val_710t and not list_val_711t
    is_2_1_1_5 = len([
        x for x in list_val_246i if 'Tyt. oryg.' in x or 'Tytuł oryginału' in x
    ]) < 2

    return is_2_1_1_1 and is_2_1_1_2 and is_2_1_1_3 and is_2_1_1_4 and is_2_1_1_5
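The helpers get_values_by_field and get_values_by_field_and_subfield are used throughout these examples but never shown; they look like thin wrappers around pymarc's field and subfield accessors. A hedged sketch of how they might be implemented (the project's real versions may differ):

def get_values_by_field(pymarc_object, field_tag):
    # Full value of every occurrence of the given field (works for control and data fields).
    return [field.value() for field in pymarc_object.get_fields(field_tag)]

def get_values_by_field_and_subfield(pymarc_object, field_and_subfields):
    # field_and_subfields is a tuple like ('245', ['a', 'b']).
    field_tag, subfield_codes = field_and_subfields
    values = []
    for field in pymarc_object.get_fields(field_tag):
        subfield_values = field.get_subfields(*subfield_codes)
        if subfield_values:
            values.append(' '.join(subfield_values))
    return values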
Example #5
    def instantiate_bn_items(self, bib_object, work, expression, buffer):
        list_852_fields = bib_object.get_fields('852')
        if list_852_fields:
            i_mock_es_id = str(
                esid.BN_ITEM_PREFIX +
                to_single_value(get_values_by_field(bib_object, '001'))[1:])
            i = BnItem(bib_object, work, self, expression, buffer)
            self.item_ids.append(int(i_mock_es_id))
            self.stat_item_count += i.item_count
            return i
Example #6
    def get_pub_country(self, bib_object, code_val_index):
        pub_008 = get_values_by_field(bib_object, '008')[0][15:18]
        pub_008 = pub_008[:-1] if pub_008[-1] == ' ' else pub_008
        pub_044_a = get_values_by_field_and_subfield(bib_object,
                                                     ('044', ['a']))

        country_codes = set()

        country_codes.add(pub_008)
        country_codes.update(pub_044_a)

        self.mat_pub_country.extend(
            resolve_code_and_serialize(list(country_codes), 'country_dict',
                                       code_val_index))
Example #7
    def __init__(self, bib_object, work, manifestation, expression, buffer):

        # attributes for item_es_index
        self.mock_es_id = str(
            esid.BN_ITEM_PREFIX +
            to_single_value(get_values_by_field(bib_object, '001'))[1:])
        self.expression_ids = [str(expression.mock_es_id)]
        self.item_call_number = get_values_by_field_and_subfield(
            bib_object, ('852', ['h']))
        self.item_count = len(get_values_by_field(bib_object, '852'))
        self.item_deleted_id = []
        self.item_local_bib_id = str(
            to_single_value(get_values_by_field(bib_object, '001')))
        self.item_local_id = postprocess(
            str, get_values_by_field_and_subfield(bib_object, ('852', ['8'])))
        self.item_location = str(
            to_single_value(
                get_values_by_field_and_subfield(bib_object, ('852', ['c']))))
        self.item_mat_id = int(manifestation.mock_es_id)
        self.item_source = 'DATABN'  # fake source
        self.item_status = 'false'  # fake status
        self.item_url = f'https://katalogi.bn.org.pl/discovery/fulldisplay?docid=alma' \
                        f'{str(to_single_value(get_values_by_field(bib_object, "009")))}' \
                        f'&context=L&vid=48OMNIS_NLOP:48OMNIS_NLOP'
        self.item_work_id = int(work.mock_es_id)
        self.library = {
            'digital': False,
            'name': 'Biblioteka Narodowa',
            'id': 10947
        }  # hardcoded - always the same
        self.metadata_original = str(uuid4())  # some random fake uuid
        self.metadata_source = 'REFERENCE'
        self.modification_time = '2019-10-11T17:45:21.527'  # fake time
        self.phrase_suggest = ['-']
        self.suggest = ['-']
        self.work_ids = [str(work.mock_es_id)]
        self.write_to_dump_file(buffer)
Example #8
    def instantiate_polona_items(self, bib_object, work, expression, buffer):
        list_856_uz = get_values_by_field_and_subfield(bib_object,
                                                       ('856', ['u', 'z']))
        if list_856_uz and 'Polonie' in to_single_value(list_856_uz):
            i_mock_es_id = str(
                esid.POLONA_ITEM_PREFIX +
                to_single_value(get_values_by_field(bib_object, '001'))[1:])
            i = PolonaItem(bib_object, work, self, expression, buffer)
            self.item_ids.append(int(i_mock_es_id))
            self.stat_item_count += i.item_count
            self.stat_digital_library_count = 1
            self.stat_digital = True
            self.stat_public_domain = True
            print('Instantiated polona item!')
            return i
Example #9
    def add(self, bib_object, work, buffer, descr_index, code_val_index):
        if not self.mock_es_id:
            self.mock_es_id = str(
                esid.EXPRESSION_PREFIX +
                get_values_by_field(bib_object, '001')[0][1:])
        if not self.expr_form:
            self.expr_form = serialize_to_jsonl_descr(
                resolve_field_value(
                    get_values_by_field_and_subfield(bib_object,
                                                     ('380', ['a'])),
                    descr_index))
        if not self.expr_lang:
            self.expr_lang = [get_values_by_field(bib_object, '008')[0][35:38]]
            self.expr_lang = resolve_code_and_serialize(
                self.expr_lang, 'language_dict', code_val_index)
        if not self.expr_leader_type:
            self.expr_leader_type = bib_object.leader[6]
        if not self.expr_title:
            self.expr_title = postprocess(
                truncate_title_proper,
                get_values_by_field_and_subfield(bib_object,
                                                 ('245', ['a', 'b'])))[0]
        if not self.work_ids:
            self.work_ids = [int(work.mock_es_id)]
        if not self.expr_work:
            self.expr_work = {
                'id': int(work.mock_es_id),
                'type': 'work',
                'value': str(work.mock_es_id)
            }

        self.materialization_ids.append(
            int(esid.MANIFESTATION_PREFIX +
                get_values_by_field(bib_object, '001')[0][1:]))
        self.instantiate_manifestation(bib_object, work, buffer, descr_index,
                                       code_val_index)
Example #10
def has_items(pymarc_object):
    return bool(get_values_by_field(pymarc_object, '852'))
Example #11
def main_loop(configuration: dict):
    indexed_works_by_uuid = {}
    indexed_works_by_titles = {}
    indexed_works_by_mat_nlp_id = {}

    indexed_manifestations_bn_by_nlp_id = {}
    indexed_manifestations_bn_by_titles_245 = {}
    indexed_manifestations_bn_by_titles_490 = {}

    # prepare indexes
    logging.info('Indexing institutions...')
    indexed_libs_by_mak_id, indexed_libs_by_es_id = create_lib_indexes(
        configuration['inst_file_in'])
    logging.info('DONE!')

    logging.info('Indexing codes and values...')
    indexed_code_values = code_value_indexer(configuration['code_val_file_in'])
    logging.info('DONE!')

    logging.info('Indexing descriptors...')
    indexed_descriptors = index_descriptors(configuration['descr_files_in'])
    logging.info('DONE!')

    # start main loop - iterate through all bib records (only books) from BN
    logging.info('Starting main loop...')
    logging.info('FRBRrization step one in progress (first loop)...')

    # used for limit and stats
    counter = 0

    for bib in tqdm(read_marc_from_file(configuration['bn_file_in'])):
        if is_book_ebook_audiobook(bib) and is_single_work(bib) and has_items(
                bib) and is_245_indicator_2_valid(bib):
            if counter > configuration['limit']:
                break

            try:
                bib = resolve_record(bib, indexed_descriptors)
            except DescriptorNotResolved as error:
                logging.debug(error)
                continue

            # create a stub work and pull from the manifestation the data needed for work matching
            work = Work()
            work.get_manifestation_bn_id(bib)
            work.get_main_creator(bib, indexed_descriptors)
            work.get_other_creator(bib, indexed_descriptors)
            work.get_titles(bib)

            counter += 1

            # try to match with an existing work (if there is a match, merge into one work and index by all titles);
            # if there is no match, index the new work by titles and by uuid
            work.match_with_existing_work_and_index(indexed_works_by_uuid,
                                                    indexed_works_by_titles)

            # index original bib record by bn_id - fast lookup for conversion and manifestation matching
            indexed_manifestations_bn_by_nlp_id.setdefault(
                get_values_by_field(bib, '001')[0], bib.as_marc())

            # index manifestation for matching with mak+ by 245 titles and 490 titles
            titles_for_manif_match = get_titles_for_manifestation_matching(bib)

            for title in titles_for_manif_match.get('titles_245'):
                indexed_manifestations_bn_by_titles_245.setdefault(
                    title, set()).add(get_values_by_field(bib, '001')[0])
            for title in titles_for_manif_match.get('titles_490'):
                indexed_manifestations_bn_by_titles_490.setdefault(
                    title, set()).add(get_values_by_field(bib, '001')[0])

    logging.info('DONE!')

    if configuration['frbr_step_two']:

        logging.info(
            'FRBRrization step two - trying to merge works using broader context (second loop)...'
        )

        for work_uuid, indexed_work in tqdm(indexed_works_by_uuid.items()):
            # check if the work still exists - it could have been set to None earlier when more than one work was merged at a time
            if indexed_work:
                result = indexed_work.try_to_merge_possible_duplicates_using_broader_context(
                    indexed_works_by_uuid, indexed_works_by_titles)
                if result:
                    indexed_works_by_uuid[work_uuid] = None

        logging.info('DONE!')

    logging.info('Conversion in progress...')

    for work_uuid, indexed_work in tqdm(indexed_works_by_uuid.items()):
        # do conversion, upsert expressions and instantiate manifestations and BN items
        if indexed_work:
            print(indexed_work.titles245)
            indexed_work.convert_to_work(indexed_manifestations_bn_by_nlp_id,
                                         configuration['buffer'],
                                         indexed_descriptors,
                                         indexed_code_values)

            logging.debug(f'\n{indexed_work.mock_es_id}')

            for expression in indexed_work.expressions_dict.values():
                logging.debug(f'    {expression}')

                for manifestation in expression.manifestations:
                    # index works by manifestations nlp id for inserting MAK+ items
                    indexed_works_by_mat_nlp_id.setdefault(
                        manifestation.mat_nlp_id, indexed_work.uuid)

                    logging.debug(f'        {manifestation}')
                    for i in manifestation.bn_items:
                        logging.debug(f'            {i}')

    logging.info('DONE!')

    if configuration['run_manif_matcher']:

        logging.info('MAK+ manifestation matching in progress...')
        list_of_files = os.listdir(configuration['mak_files_in'])

        # iterate through marcxml MAK+ files
        for file_num, filename in enumerate(list_of_files, start=1):
            if file_num > configuration['limit_mak']:
                break
            else:
                path_file = os.sep.join(
                    [configuration['mak_files_in'], filename])
                logging.info(
                    f'Parsing MAK+ file nr {file_num} - {filename}...')
                parsed_xml = parse_xml_to_array(path_file)

                # iterate through parsed records (pymarc Record objects)
                for r in parsed_xml:
                    # check if it is not None - there are some problems with parsing
                    if r:
                        # try to match with BN manifestation
                        try:
                            match = match_manifestation(
                                r,
                                index_245=indexed_manifestations_bn_by_titles_245,
                                index_490=indexed_manifestations_bn_by_titles_490,
                                index_id=indexed_manifestations_bn_by_nlp_id)
                        except (IndexError, ValueError, TypeError) as error:
                            # print(error)
                            continue

                        if match:
                            list_ava = r.get_fields('AVA')

                            w_uuid = indexed_works_by_mat_nlp_id.get(match)
                            ref_to_work = indexed_works_by_uuid.get(w_uuid)

                            # this is definitely not the best way to do it
                            if ref_to_work:
                                for e in ref_to_work.expressions_dict.values():
                                    for m in e.manifestations:
                                        if m.mat_nlp_id == match:
                                            logging.debug(
                                                'Adding mak_items...')
                                            item_counter = 0
                                            item_add_counter = 0
                                            for num, ava in enumerate(list_ava,
                                                                      start=1):
                                                try:
                                                    it_to_add = MakItem(
                                                        ava,
                                                        indexed_libs_by_mak_id,
                                                        ref_to_work, e, m,
                                                        buff, num)
                                                    if it_to_add.item_local_bib_id not in m.mak_items:
                                                        logging.debug(
                                                            f'Added new mak_item - {num}'
                                                        )
                                                        m.mak_items.setdefault(
                                                            it_to_add.
                                                            item_local_bib_id,
                                                            it_to_add)
                                                        item_counter += 1
                                                    else:
                                                        existing_it = m.mak_items.get(
                                                            it_to_add.
                                                            item_local_bib_id)
                                                        existing_it.add(
                                                            it_to_add)
                                                        logging.debug(
                                                            f'Increased item_count in existing mak_item - {num}.'
                                                        )
                                                        item_add_counter += 1
                                                except AttributeError as error:
                                                    logging.debug(error)
                                                    continue
                                            logging.debug(
                                                f'Added {item_counter} new mak_items, increased count {item_add_counter} times.'
                                            )
        logging.info('DONE!')

    # loop for:
    # - adding mak items mock_es_ids
    # - serializing and writing mak items to json file
    # - getting libraries for manifestation
    # - getting mak item ids and count for manifestation
    # - serializing and writing manifestations to json file
    # - getting mak item ids and count for expression
    # - serializing and writing expressions to json file
    # - getting mak item ids and count, manifestation ids and count, expression ids and count for work
    # - serializing and writing works to json file

    for indexed_work in tqdm(indexed_works_by_uuid.values()):
        if indexed_work:
            logging.debug(f'\n{indexed_work.mock_es_id}')

            for expression in indexed_work.expressions_dict.values():
                logging.debug(f'    {expression}')

                for manifestation in expression.manifestations:

                    for num, item in enumerate(
                            manifestation.mak_items.values(), start=1):
                        item.mock_es_id = f'{str(num)}{str(manifestation.mock_es_id)}'
                        item.write_to_dump_file(buff)

                    manifestation.get_resolve_and_serialize_libraries(
                        indexed_libs_by_es_id)
                    manifestation.get_mak_item_ids()
                    manifestation.write_to_dump_file(buff)
                    logging.debug(f'        {manifestation}')

                    #for i in manif.bn_items:
                    #print(f'            BN - {i}')
                    #for im in manif.mak_items.values():
                    #print(f'            MAK - {im}')

                expression.get_item_ids_item_count_and_libraries()
                expression.write_to_dump_file(buff)

            indexed_work.get_expr_manif_item_ids_and_counts()
            indexed_work.write_to_dump_file(buff)

    logging.debug(indexed_works_by_uuid)
    logging.debug(indexed_works_by_titles)
    logging.debug(indexed_manifestations_bn_by_nlp_id)
    logging.debug(indexed_manifestations_bn_by_titles_245)
    logging.debug(indexed_manifestations_bn_by_titles_490)
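The two title indexes built in the first loop map a normalized title to the set of BN record ids (001 values) that carry it. A minimal sketch of how a matcher could collect candidate ids from such indexes (illustrative only; the real match_manifestation also checks the ManifMatchData fields before declaring a match):

def candidate_nlp_ids(title_245, title_490, index_245, index_490):
    # Union of the BN record ids indexed under the MAK+ record's 245 and 490 titles.
    candidates = set()
    candidates.update(index_245.get(title_245, set()))
    candidates.update(index_490.get(title_490, set()))
    return candidates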
Example #12
    def __init__(self, bib_object, work, expression, buffer, descr_index,
                 code_val_index):
        # attributes for manifestation_es_index
        self.mock_es_id = str(
            esid.MANIFESTATION_PREFIX +
            to_single_value(get_values_by_field(bib_object, '001'))[1:])

        self.eForm = only_values(
            resolve_field_value(
                get_values_by_field_and_subfield(bib_object, ('380', ['a'])),
                descr_index))
        self.expression_ids = [int(expression.mock_es_id)]
        self.item_ids = []  # populated after instantiating all the manifestations and mak+ matching
        self.libraries = []  # populated after instantiating all the manifestations and mak+ matching
        self.mat_carrier_type = resolve_code_and_serialize(
            get_values_by_field_and_subfield(bib_object, ('338', ['b'])),
            'carrier_type_dict', code_val_index)
        self.mat_contributor = []
        self.mat_digital = False
        self.mat_edition = get_values_by_field(bib_object, '250')
        self.mat_external_id = get_values_by_field_and_subfield(
            bib_object, ('035', ['a']))
        self.mat_isbn = get_values_by_field_and_subfield(
            bib_object, ('020', ['a']))
        self.mat_matching_title = ''  # todo
        self.mat_material_type = ''  # todo
        self.mat_media_type = resolve_code_and_serialize(
            get_values_by_field_and_subfield(bib_object, ('337', ['b'])),
            'media_type_dict', code_val_index)
        self.mat_nat_bib = []  # todo
        self.mat_nlp_id = to_single_value(
            get_values_by_field(bib_object, '001'))
        self.mat_note = []  # todo
        self.mat_number_of_pages = to_single_value(
            get_values_by_field_and_subfield(bib_object, ('300', ['a'])))
        self.mat_physical_info = get_values_by_field(bib_object, '300')
        self.mat_pub_city = get_values_by_field_and_subfield(
            bib_object, ('260', ['a']))
        self.mat_pub_country = []
        self.get_pub_country(bib_object, code_val_index)
        self.mat_pub_date_from = None
        self.mat_pub_date_single = None
        self.mat_pub_date_to = None
        self.get_mat_pub_dates(bib_object)
        self.mat_pub_info = get_values_by_field(bib_object, '260')
        self.mat_publisher = []
        self.get_publishers_all(bib_object)
        self.mat_publisher_uniform = []
        self.get_uniform_publishers(bib_object, descr_index)
        self.mat_title_and_resp = get_values_by_field(bib_object, '245')
        self.mat_title_other_info = []  # todo
        self.mat_title_proper = to_single_value(
            postprocess(
                truncate_title_proper,
                get_values_by_field_and_subfield(bib_object,
                                                 ('245', ['a', 'b']))))
        self.mat_title_variant = get_values_by_field_and_subfield(
            bib_object, ('246', ['a', 'b']))
        self.metadata_original = str(uuid4())
        self.metadata_source = 'REFERENCE'
        self.modificationTime = "2019-10-01T13:34:23.580"
        self.phrase_suggest = [self.mat_title_proper]  # todo
        self.popularity_join = "owner"
        self.stat_digital = False
        self.stat_digital_library_count = 0
        self.stat_item_count = 0
        self.stat_library_count = 0
        self.stat_public_domain = False
        self.suggest = [self.mat_title_proper]  # todo
        self.work_creator = []
        self.work_creators = []
        self.get_work_creators(work)
        self.get_mat_contributors(bib_object, code_val_index, descr_index)
        self.work_ids = [int(work.mock_es_id)]

        self.bn_items = [
            self.instantiate_bn_items(bib_object, work, expression, buffer)
        ]
        self.polona_items = [
            self.instantiate_polona_items(bib_object, work, expression, buffer)
        ]
        self.mak_items = {}
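Two small helpers recur throughout these constructors: to_single_value and postprocess. Plausible minimal versions consistent with how they are called here (assumptions, not the project's actual implementations):

def to_single_value(values):
    # Collapse the single-element list returned by the field getters to its only value.
    return values[0]

def postprocess(function, values):
    # Apply a transformation (e.g. truncate_title_proper or str) to every extracted value.
    return [function(value) for value in values]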