Example #1
    def _release_group_artist_relationship_generator(dump_path):
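        """Yield (release group GID, artist GID) pairs by joining the
        release_group, artist_credit_name, and artist tables of the
        MusicBrainz dump on artist credit IDs."""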
        release_group_path = os.path.join(dump_path, 'mbdump', 'release_group')

        artist_credit_release = defaultdict(list)

        with open(release_group_path, 'r') as releasefile:
            n_rows = count_num_lines_in_file(releasefile)
            release_reader = DictReader(
                releasefile,
                delimiter='\t',
                fieldnames=['id', 'gid', 'label', 'artist_credit', 'type_id'],
            )
            for row in tqdm(release_reader, total=n_rows):
                artist_credit_release[row['artist_credit']].append(row['gid'])

        artist_credit_name_path = os.path.join(dump_path, 'mbdump',
                                               'artist_credit_name')

        artist_id_release = defaultdict(list)
        with open(artist_credit_name_path) as artistcreditfile:
            artist_credit_reader = DictReader(
                artistcreditfile,
                delimiter='\t',
                fieldnames=['id', 'nd', 'artist_id', 'artist_name'],
            )

            n_rows = count_num_lines_in_file(artistcreditfile)
            for row in tqdm(artist_credit_reader, total=n_rows):
                # An artist may appear in several credits: accumulate
                artist_id_release[row['artist_id']].extend(
                    artist_credit_release[row['id']])
                # Free the consumed entry to save memory
                del artist_credit_release[row['id']]

        artist_path = os.path.join(dump_path, 'mbdump', 'artist')
        with open(artist_path, 'r') as artistfile:

            n_rows = count_num_lines_in_file(artistfile)
            artist_link_reader = DictReader(
                artistfile,
                delimiter='\t',
                fieldnames=[
                    'id',
                    'gid',
                    'label',
                    'sort_label',
                    'b_year',
                    'b_month',
                    'b_day',
                    'd_year',
                    'd_month',
                    'd_day',
                    'type_id',
                ],
            )

            for artist in tqdm(artist_link_reader, total=n_rows):
                for release_id in artist_id_release[artist['id']]:
                    yield (release_id, artist['gid'])
                # Free the consumed entry to save memory
                del artist_id_release[artist['id']]
Example #2
    def _get_urls_for_entity_id(dump_path: str, l_path: str,
                                resolve: bool) -> dict:
        """given a l_{something}_url relationship file, return a dict of
        somethingid-[urls]"""

        LOGGER.info(f"Loading %s relationships", l_path)

        urlid_entityid_relationship = {}

        with open(l_path, "r") as tsvfile:
            url_relationships = DictReader(tsvfile,
                                           delimiter='\t',
                                           fieldnames=list(range(6)))

            for relationship in tqdm(url_relationships,
                                     total=count_num_lines_in_file(tsvfile)):
                # URL ID matched with its entity ID
                if relationship[3] in urlid_entityid_relationship:
                    LOGGER.warning(
                        'Url with ID %s has multiple entities, only one will '
                        'be stored',
                        relationship[3],
                    )
                else:
                    urlid_entityid_relationship[
                        relationship[3]] = relationship[2]

        url_path = os.path.join(dump_path, 'mbdump', 'url')
        url_entityid = {}

        LOGGER.info('Checking URLs related to entity')

        # Translates URL IDs to the corresponding URLs
        with open(url_path, "r") as tsvfile:

            urls = DictReader(tsvfile,
                              delimiter='\t',
                              fieldnames=list(range(5)))

            for url_record in tqdm(urls,
                                   total=count_num_lines_in_file(tsvfile)):

                urlid = url_record[0]
                if urlid in urlid_entityid_relationship:
                    # Pop the mapping once up front: the entity ID may be
                    # shared by several candidate URLs of the same record
                    entityid = urlid_entityid_relationship.pop(urlid)
                    for candidate_url in url_utils.clean(url_record[2]):
                        if not url_utils.validate(candidate_url):
                            continue
                        if resolve and not url_utils.resolve(candidate_url):
                            continue
                        url_entityid[candidate_url] = entityid

        entityid_url = defaultdict(list)
        # Inverts dictionary
        for url, entityid in url_entityid.items():
            entityid_url[entityid].append(url)

        return entityid_url
Example #3
def _similar_tokens_linker(
    wd_dataset: TextIO,
    target_db_entity: Union[BaseEntity, BaseLinkEntity],
    fields: Tuple[str, str],
    catalog_pid: str,
    compare_dates: bool,
    tokenize: Callable[[str], Set[str]],
) -> Iterable[Tuple[str, str, str]]:
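    """Yield (QID, catalog PID, catalog ID) triples via full-text token
    search: a target matches when its tokenized name is a superset, an
    exact match, or a multi-token subset of a Wikidata name's token set,
    optionally requiring birth/death dates to match as well."""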
    wd_field, target_field = fields
    to_exclude = set()

    for row in tqdm(wd_dataset, total=count_num_lines_in_file(wd_dataset)):
        wd_item = json.loads(row)
        qid = wd_item[keys.QID]

        for wd_name in wd_item[wd_field]:
            if not wd_name:
                continue

            to_exclude.clear()

            wd_tokens = tokenize(wd_name)

            if len(wd_tokens) <= 1:
                continue

            try:
                # Check if target token sets are equal or larger
                for target in data_gathering.tokens_fulltext_search(
                        target_db_entity, True, wd_tokens):
                    if not compare_dates or _birth_death_date_match(
                            wd_item, target):
                        yield qid, catalog_pid, target.catalog_id
                        to_exclude.add(target.catalog_id)

                # Check if target token sets are smaller
                where_clause = target_db_entity.catalog_id.notin_(to_exclude)
                for target in data_gathering.tokens_fulltext_search(
                        target_db_entity,
                        False,
                        wd_tokens,
                        where_clause=where_clause,
                ):
                    target_tokens = set(getattr(target, target_field).split())

                    if len(target_tokens) > 1 and target_tokens.issubset(
                            wd_tokens):
                        if not compare_dates or _birth_death_date_match(
                                wd_item, target):
                            yield qid, catalog_pid, target.catalog_id
            except SQLAlchemyError as error:
                LOGGER.warning(
                    "Skipping failed full-text search query due to %s. "
                    "You can enable the debug log with the CLI option "
                    "'-l soweego.linker.baseline DEBUG' for more details",
                    error.__class__.__name__,
                )
                LOGGER.debug(error)
                continue
Example #4
    def _artist_link_generator(self, dump_path: str, resolve: bool):
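        """Yield MusicbrainzArtistLinkEntity and MusicbrainzBandLinkEntity
        objects, one per valid URL related to an artist in the dump."""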
        l_artist_url_path = os.path.join(dump_path, 'mbdump', 'l_artist_url')

        # Loads all the relationships between URL and ARTIST ID
        artistid_url = self._get_urls_for_entity_id(dump_path,
                                                    l_artist_url_path, resolve)

        LOGGER.info('Adding link entities to DB')
        # Translates artist IDs to the corresponding artists
        artist_path = os.path.join(dump_path, 'mbdump', 'artist')
        with open(artist_path, 'r') as artistfile:

            n_rows = count_num_lines_in_file(artistfile)
            artist_link_reader = DictReader(
                artistfile,
                delimiter='\t',
                fieldnames=[
                    'id',
                    'gid',
                    'label',
                    'sort_label',
                    'b_year',
                    'b_month',
                    'b_day',
                    'd_year',
                    'd_month',
                    'd_day',
                    'type_id',
                ],
            )

            for artist in tqdm(artist_link_reader, total=n_rows):

                if artist['id'] in artistid_url:
                    for link in artistid_url[artist['id']]:
                        if self._check_person(artist['type_id']):
                            current_entity = MusicbrainzArtistLinkEntity()
                            self._fill_link_entity(current_entity,
                                                   artist['gid'], link)
                            yield current_entity
                        if self._check_band(artist['type_id']):
                            current_entity = MusicbrainzBandLinkEntity()
                            self._fill_link_entity(current_entity,
                                                   artist['gid'], link)
                            yield current_entity
Example #5
    def _release_group_link_generator(self, dump_path: str, resolve: bool):
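        """Yield MusicbrainzReleaseGroupLinkEntity objects, one per valid
        URL related to a release group in the dump."""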
        l_release_group_url_path = os.path.join(dump_path, 'mbdump',
                                                'l_release_group_url')

        release_group_id_urls = self._get_urls_for_entity_id(
            dump_path, l_release_group_url_path, resolve)

        release_group_path = os.path.join(dump_path, 'mbdump', 'release_group')
        with open(release_group_path) as rfile:
            n_rows = count_num_lines_in_file(rfile)
            releases = DictReader(rfile,
                                  delimiter='\t',
                                  fieldnames=['id', 'gid', 'label'])

            for release in tqdm(releases, total=n_rows):
                if release['id'] in release_group_id_urls:
                    for link in release_group_id_urls[release['id']]:
                        entity = MusicbrainzReleaseGroupLinkEntity()
                        self._fill_link_entity(entity, release['gid'], link)
                        yield entity
Example #6
    def _release_group_generator(self, dump_path):
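        """Yield MusicbrainzReleaseGroupEntity objects from the
        release_group table, with dates and their precision when known."""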
        release_group_datesprec = self._retrieve_release_group_dates(dump_path)
        release_group_path = os.path.join(dump_path, 'mbdump', 'release_group')

        with open(release_group_path, 'r') as releasefile:
            release_reader = DictReader(
                releasefile,
                delimiter='\t',
                fieldnames=['id', 'gid', 'label', 'artist_credit', 'type_id'],
            )

            for row in tqdm(release_reader,
                            total=count_num_lines_in_file(releasefile)):
                entity = MusicbrainzReleaseGroupEntity()
                self._fill_entity(entity, row, None)
                if row['id'] in release_group_datesprec:
                    dateprec = release_group_datesprec[row['id']]
                    if dateprec[1] != 0:
                        entity.born_precision = dateprec[1]
                        entity.born = dateprec[0]
                yield entity
Example #7
def _perfect_names_linker(
    wd_dataset: TextIO,
    target_db_entity: BaseEntity,
    catalog_pid: str,
    compare_dates: bool,
) -> Iterable[Tuple[str, str, str]]:
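    """Yield (QID, catalog PID, catalog ID) triples for Wikidata names with
    a case-insensitive exact match in the target catalog, querying the
    database in buckets of 100 items."""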
    bucket, bucket_names, bucket_size = [], set(), 100
    total = count_num_lines_in_file(wd_dataset)
    missing = total

    for row in tqdm(wd_dataset, total=total):
        wd_item = json.loads(row)
        bucket_names.update(wd_item[keys.NAME])
        bucket.append(wd_item)

        # Build a bucket of `bucket_size` Wikidata items
        if len(bucket) >= bucket_size or missing < bucket_size:
            missing -= len(bucket)

            # Look the names up in the target database
            for target in data_gathering.perfect_name_search_bucket(
                    target_db_entity, bucket_names):
                # Run an n^2 comparison and yield matches
                for wd in bucket:
                    # Wikidata items have lists of names
                    for wd_name in wd[keys.NAME]:
                        if not wd_name:
                            continue

                        if wd_name.lower() == target.name.lower():
                            if not compare_dates or _birth_death_date_match(
                                    wd, target):
                                yield wd[
                                    keys.QID], catalog_pid, target.catalog_id

            bucket.clear()
            bucket_names.clear()
Example #8
    def _artist_generator(self, dump_path):
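        """Yield MusicbrainzArtistEntity and MusicbrainzBandEntity objects
        from the artist table, plus one entity per available alias."""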
        artist_alias_path = os.path.join(dump_path, 'mbdump', 'artist_alias')
        artist_path = os.path.join(dump_path, 'mbdump', 'artist')
        area_path = os.path.join(dump_path, 'mbdump', 'area')

        aliases = defaultdict(list)
        areas = {}

        LOGGER.info('Getting artist aliases')

        # Key is the entity ID, value is its list of aliases
        with open(artist_alias_path, 'r') as aliasesfile:
            for alias in DictReader(
                    aliasesfile,
                    delimiter='\t',
                    fieldnames=['id', 'parent_id', 'label'],
            ):
                aliases[alias['parent_id']].append(alias['label'])

        LOGGER.info('Getting area IDs and related names')

        # Key is the area internal id, value is the name
        with open(area_path, 'r') as areafile:
            for area in DictReader(areafile,
                                   delimiter='\t',
                                   fieldnames=['id', 'gid', 'name']):
                areas[area['id']] = area['name'].lower()

        LOGGER.info('Importing artist entities into DB')

        with open(artist_path, 'r') as artistfile:

            n_rows = count_num_lines_in_file(artistfile)

            artist_reader = DictReader(
                artistfile,
                delimiter='\t',
                fieldnames=[
                    'id',
                    'gid',
                    'label',
                    'sort_label',
                    'b_year',
                    'b_month',
                    'b_day',
                    'd_year',
                    'd_month',
                    'd_day',
                    'type_id',
                    'area',
                    'gender',
                    'ND1',
                    'ND2',
                    'ND3',
                    'ND4',
                    'b_place',
                    'd_place',
                ],
            )

            for artist in tqdm(artist_reader, total=n_rows):
                if self._check_person(artist['type_id']):
                    current_entity = MusicbrainzArtistEntity()

                    try:
                        self._fill_entity(current_entity, artist, areas)
                        current_entity.gender = self._artist_gender(
                            artist['gender'])
                    except KeyError:
                        LOGGER.error('Wrong gender code: %s', artist)
                        continue

                    # Create an entity for each available alias
                    for alias in self._alias_entities(
                            current_entity,
                            MusicbrainzArtistEntity,
                            aliases[artist['id']],
                    ):
                        alias.gender = current_entity.gender
                        yield alias

                    yield current_entity

                if self._check_band(artist['type_id']):
                    current_entity = MusicbrainzBandEntity()

                    try:
                        self._fill_entity(current_entity, artist, areas)
                    except ValueError:
                        LOGGER.error('Wrong date: %s', artist)
                        continue

                    # Create an entity for each available alias
                    for alias in self._alias_entities(
                            current_entity,
                            MusicbrainzBandEntity,
                            aliases[artist['id']],
                    ):
                        yield alias

                    yield current_entity
Example #9
    def _isni_link_generator(self, dump_path: str, resolve: bool):
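        """Yield artist and band link entities pointing to ISNI URLs, built
        from the artist_isni table and the P213 (ISNI) URL formatter."""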
        isni_file_path = os.path.join(dump_path, 'mbdump', 'artist_isni')

        artist_link = {}

        done = False
        for result in external_id_pids_and_urls():
            if done:
                break
            for pid, formatter in result.items():
                if pid != 'P213':
                    continue
                for url_formatter, _ in formatter.items():
                    with open(isni_file_path, 'r') as artistfile:
                        for artistid_isni in DictReader(
                                artistfile,
                                delimiter='\t',
                                fieldnames=['id', 'isni'],
                        ):
                            # Build a URL from the ISNI and keep it if valid
                            artistid = artistid_isni['id']
                            isni = artistid_isni['isni']

                            link = url_formatter.replace('$1', isni)
                            for candidate_url in url_utils.clean(link):
                                if not url_utils.validate(candidate_url):
                                    continue
                                if resolve and not url_utils.resolve(
                                        candidate_url):
                                    continue
                                artist_link[artistid] = candidate_url
                done = True

        artist_path = os.path.join(dump_path, 'mbdump', 'artist')
        with open(artist_path, 'r') as artistfile:

            n_rows = count_num_lines_in_file(artistfile)

            artist_isni_reader = DictReader(
                artistfile,
                delimiter='\t',
                fieldnames=[
                    'id',
                    'gid',
                    'label',
                    'sort_label',
                    'b_year',
                    'b_month',
                    'b_day',
                    'd_year',
                    'd_month',
                    'd_day',
                    'type_id',
                ],
            )

            for artist in tqdm(artist_isni_reader, total=n_rows):
                try:
                    # Raises KeyError if the artist has no ISNI link
                    link = artist_link[artist['id']]
                    if self._check_person(artist['type_id']):
                        current_entity = MusicbrainzArtistLinkEntity()
                        self._fill_link_entity(current_entity, artist['gid'],
                                               link)
                        yield current_entity
                    if self._check_band(artist['type_id']):
                        current_entity = MusicbrainzBandLinkEntity()
                        self._fill_link_entity(current_entity, artist['gid'],
                                               link)
                        yield current_entity
                except KeyError:
                    continue