Example #1
def handle(file_path: str, mappings: dict, entity_type, link_type):
    """Generic .nt file dump handler for the dump import into the DB.
    Assumptions: each entity must be represented in a compact block of ''adjacent'' lines
    """
    db_manager = DBManager(
        get_path('soweego.importer.resources', 'db_credentials.json'))
    db_manager.drop(link_type)
    db_manager.drop(entity_type)
    session = db_manager.new_session()

    if not os.path.isfile(file_path):
        LOGGER.warning("File not found: %s", file_path)
        return

    with open(file_path) as file:
        current_key = None
        current_entity = entity_type()

        for row in file:
            try:
                # split triples
                row_chunks = []
                split_regex = r'(?<=["\<])[^<>"]*(?=["\>])'
                for match in re.finditer(split_regex, row, re.I):
                    row_chunks.append(match.group(0))

                if len(row_chunks) != 3:
                    raise Exception(loc.MALFORMED_ROW)
                if row_chunks[1] in mappings:
                    current_field = mappings[row_chunks[1]]
                else:
                    raise Exception(loc.FIELD_NOT_MAPPED % row_chunks[1])

                if current_field == 'url':
                    link = link_type()
                    link.url = row_chunks[2]
                    link.tokens = row_chunks[2].replace("/", " ")  # TODO
                    link.catalog_id = current_entity.catalog_id
                    session.add(link)

                if current_key is not None and current_key != row_chunks[0]:
                    session.add(current_entity)
                    current_entity = entity_type()

                current_key = row_chunks[0]
                current_value = row_chunks[2]

                setattr(current_entity, current_field, current_value)

            except Exception as e:
                LOGGER.warning('Error at row %s\n%s', row, str(e))
        session.add(current_entity)
    try:
        db_manager.create(entity_type)
        db_manager.create(link_type)
        session.commit()
    except Exception as e:
        LOGGER.warning(loc.WRONG_MAPPINGS, str(e))
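A minimal usage sketch of the handler above, assuming hypothetical ORM classes (PersonEntity, PersonLinkEntity) that expose the attributes handle() sets (catalog_id, url, tokens); the real classes and predicate mappings live under soweego.importer.models:

# Hypothetical mapping from .nt predicates to entity columns
mappings = {
    'http://schema.org/name': 'name',
    'http://schema.org/birthDate': 'born',
    'http://schema.org/sameAs': 'url',
}
handle('/path/to/dump.nt', mappings, PersonEntity, PersonLinkEntity)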
Example #2
def build_target(
    goal: str, catalog: str, entity: str, identifiers: Set[str]
) -> Iterator[pd.DataFrame]:
    """Build a target catalog dataset for training or classification purposes:
    workflow step 1.

    Data is gathered by querying the ``s51434__mixnmatch_large_catalogs_p``
    database. This is where the :mod:`importer` inserts processed catalog dumps.

    The database is located in
    `ToolsDB <https://wikitech.wikimedia.org/wiki/Help:Toolforge/Database#User_databases>`_
    under the Wikimedia
    `Toolforge <https://wikitech.wikimedia.org/wiki/Portal:Toolforge>`_ infrastructure.
    See `how to connect <https://wikitech.wikimedia.org/wiki/Help:Toolforge/Database#Connecting_to_the_database_replicas>`_.

    :param goal: ``{'training', 'classification'}``.
      Whether to build a dataset for training or classification
    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param identifiers: a set of catalog IDs to gather data for
    :return: the generator yielding :class:`pandas.DataFrame` chunks
    """
    utils.check_goal_value(goal)

    LOGGER.info('Building target %s set for %s %s ...', goal, catalog, entity)

    # Target catalog ORM entities/DB tables
    base, link, nlp = (
        target_database.get_main_entity(catalog, entity),
        target_database.get_link_entity(catalog, entity),
        target_database.get_nlp_entity(catalog, entity),
    )
    tables = [table for table in (base, link, nlp) if table]

    # Initial query with all non-null tables
    query = Query(tables)
    # Remove `base` to avoid outer join with itself
    tables.remove(base)
    # Outer joins
    for table in tables:
        query = query.outerjoin(table, base.catalog_id == table.catalog_id)
    # Condition
    query = query.filter(base.catalog_id.in_(identifiers)).enable_eagerloads(
        False
    )

    sql = query.statement
    LOGGER.debug('SQL query to be fired: %s', sql)

    # Avoid loading query result in memory
    db_engine = DBManager().get_engine().execution_options(stream_results=True)

    return read_sql(sql, db_engine, chunksize=1000)
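Since build_target returns a generator of DataFrame chunks rather than one big frame, callers typically stream it; a sketch, with an illustrative set of catalog IDs:

for chunk in build_target('training', 'discogs', 'musician', {'123', '456'}):
    # Each chunk is a pandas.DataFrame holding at most 1000 rows
    print(chunk.shape)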
Example #3
def add_catalog(catalog: str, entity: str) -> Optional[int]:
    """Add or update a catalog.

    :param catalog: ``{'discogs', 'imdb', 'musicbrainz', 'twitter'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :return: the catalog *id* field of the *catalog* table
      in the *s51434__mixnmatch_p* Toolforge database, or ``None`` on failure
    """
    name_field = f'{catalog.title()} {entity}'

    session = DBManager(MNM_DB).new_session()
    try:
        existing = (
            session.query(mix_n_match.MnMCatalog)
            .filter_by(name=name_field)
            .first()
        )
        if existing is None:
            LOGGER.info(
                "Adding %s %s catalog to the mix'n'match DB ... ",
                catalog,
                entity,
            )
            db_entity = mix_n_match.MnMCatalog()
            _set_catalog_fields(db_entity, name_field, catalog, entity)
            session.add(db_entity)
            session.commit()
            catalog_id = db_entity.id
        else:
            LOGGER.info('Updating %s %s catalog ... ', catalog, entity)
            catalog_id = existing.id
            _set_catalog_fields(existing, name_field, catalog, entity)
            session.add(existing)
            session.commit()
    except SQLAlchemyError as error:
        LOGGER.error(
            "Failed catalog addition/update due to %s. "
            "You can enable the debug log with the CLI option "
            "'-l soweego.ingester DEBUG' for more details",
            error.__class__.__name__,
        )
        LOGGER.debug(error)
        session.rollback()
        return None
    finally:
        session.close()

    LOGGER.info(
        'Catalog addition/update went fine. Internal ID: %d', catalog_id
    )
    return catalog_id
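Usage is a single call; a sketch showing how the returned internal ID (or None on failure) might be handled:

catalog_id = add_catalog('discogs', 'musician')
if catalog_id is None:
    raise RuntimeError('Catalog addition/update failed, see the debug log')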
Example #4
def _gather_target_data(
    catalog,
    entity,
    total_queries,
    works_buckets,
    works_inverted,
    people_buckets,
    people_inverted,
):
    claim_pid = vocabulary.WORKS_BY_PEOPLE_MAPPING[catalog][entity]
    db_entity = target_database.get_relationship_entity(catalog, entity)
    session = DBManager().connect_to_db()

    # Leverage works-people relationships
    try:
        for works, people in tqdm(product(works_buckets, people_buckets),
                                  total=total_queries):
            works_to_people = session.query(db_entity).filter(
                and_(
                    db_entity.from_catalog_id.in_(works),
                    db_entity.to_catalog_id.in_(people),
                ))

            for result in works_to_people:
                yield (
                    works_inverted[result.from_catalog_id],
                    claim_pid,
                    people_inverted[result.to_catalog_id],
                    result.to_catalog_id,
                )
    except SQLAlchemyError as error:
        LOGGER.error(
            "Failed query of works-people relationships due to %s. "
            "You can enable the debug log with the CLI option "
            "'-l soweego.validator DEBUG' for more details",
            error.__class__.__name__,
        )
        LOGGER.debug(error)

        session.rollback()
        return None
    finally:
        session.close()
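The generator yields statement-shaped 4-tuples: a work QID, the claim PID, a person QID, and the person's catalog ID. A consumer sketch, with hypothetical bucket and inverted-index arguments:

for work_qid, pid, person_qid, person_tid in _gather_target_data(
        'discogs', 'musician', total_queries,
        works_buckets, works_inverted, people_buckets, people_inverted):
    # e.g., emit a claim: work_qid --pid--> person_qid (sourced by person_tid)
    print(work_qid, pid, person_qid, person_tid)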
Example #5
    def extract_and_populate(self, dump_file_path):
        dump_path = os.path.join(os.path.dirname(
            os.path.abspath(dump_file_path)), "%s_%s" % (os.path.basename(dump_file_path), 'extracted'))

        if not os.path.isdir(dump_path):
            with tarfile.open(dump_file_path, "r:bz2") as tar:
                tar.extractall(dump_path)

        tables = [MusicbrainzArtistEntity,
                  MusicbrainzBandEntity]

        db_manager = DBManager()
        db_manager.drop(tables)
        db_manager.create(tables)

        artist_count = 0
        for artist in self._artist_generator(dump_path):
            artist_count += 1
            session = db_manager.new_session()
            session.add(artist)
            session.commit()

        LOGGER.debug("Added %s artist records" % artist_count)

        db_manager.drop([MusicbrainzArtistLinkEntity,
                         MusicbrainzBandLinkEntity])
        db_manager.create([MusicbrainzArtistLinkEntity,
                           MusicbrainzBandLinkEntity])

        link_count = 0
        for link in self._link_generator(dump_path):
            link_count += 1
            session = db_manager.new_session()
            session.add(link)
            session.commit()

        LOGGER.debug("Added %s link records" % link_count)

        isni_link_count = 0
        for link in self._isni_link_generator(dump_path):
            isni_link_count += 1
            session = db_manager.new_session()
            session.add(link)
            session.commit()

        LOGGER.debug("Added %s ISNI link records" % isni_link_count)

        db_manager.drop([MusicBrainzArtistBandRelationship])
        db_manager.create([MusicBrainzArtistBandRelationship])

        relationships_count = 0
        relationships_total = 0
        for relationship in self._artist_band_relationship_generator(dump_path):
            try:
                relationships_total += 1
                session = db_manager.new_session()
                session.add(relationship)
                session.commit()
                relationships_count += 1
            except IntegrityError as i:
                LOGGER.warning(str(i))

        LOGGER.debug("Added %s/%s relationships records" %
                     (relationships_count, relationships_total))
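Note that this importer opens a new session and commits once per record, which is simple but slow on large dumps. Later examples (#7 and #8) batch inserts with bulk_save_objects; a sketch of how the artist loop could do the same, with an assumed batch size:

session = db_manager.new_session()
batch = []
for artist in self._artist_generator(dump_path):
    batch.append(artist)
    if len(batch) >= 10000:  # assumed batch size, tune as needed
        session.bulk_save_objects(batch)
        session.commit()
        batch.clear()
session.bulk_save_objects(batch)  # remaining entities, if any
session.commit()
session.close()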
Example #6
def add_matches(
    file_path: str,
    catalog_id: int,
    catalog: str,
    entity: str,
    confidence_range: Tuple[float, float],
) -> None:
    """Add or update matches to an existing catalog.
    Curated matches found in the catalog are kept as is.

    :param file_path: path to a file with matches
    :param catalog_id: the catalog *id* field of the *catalog* table
      in the *s51434__mixnmatch_p* Toolforge database
    :param catalog: ``{'discogs', 'imdb', 'musicbrainz', 'twitter'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param confidence_range: a pair of floats indicating
      the minimum and maximum confidence scores of matches
      that will be added/updated.
    """
    success = True  # Flag to log that everything went fine
    class_qid, url_prefix = _handle_metadata(catalog, entity)
    matches = _handle_matches(file_path, confidence_range)

    LOGGER.info(
        "Starting import of %s %s matches (catalog ID: %d) into the mix'n'match DB ...",
        catalog,
        entity,
        catalog_id,
    )

    start = datetime.now()
    session = DBManager(MNM_DB).new_session()

    # Note that the session is kept open after this operation
    curated, success = _sync_matches(session, catalog_id, success)

    # Filter curated matches:
    # rows with tids that are NOT (~) in curated tids
    matches = matches[~matches[keys.TID].isin(curated)]

    n_matches = len(matches)
    matches_reader = matches.itertuples(index=False, name=None)
    batch = []

    try:
        _import_matches(
            batch,
            catalog,
            catalog_id,
            class_qid,
            entity,
            matches_reader,
            n_matches,
            session,
            url_prefix,
        )

        LOGGER.info(
            'Adding last batch of %d %s %s matches, this may take a while ...',
            len(batch),
            catalog,
            entity,
        )
        # Commit remaining entities
        session.bulk_save_objects(batch)
        session.commit()

    except SQLAlchemyError as error:
        LOGGER.error(
            "Failed addition/update due to %s. "
            "You can enable the debug log with the CLI option "
            "'-l soweego.ingester DEBUG' for more details",
            error.__class__.__name__,
        )
        LOGGER.debug(error)
        session.rollback()
        success = False

    finally:
        session.close()

    if success:
        end = datetime.now()
        LOGGER.info(
            'Import of %s %s matches (catalog ID: %d) completed in %s. '
            'Total matches: %d',
            catalog,
            entity,
            catalog_id,
            end - start,
            n_matches,
        )
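A usage sketch with illustrative argument values; the catalog ID would come from add_catalog (Example #3):

add_matches(
    '/path/to/discogs_musician_matches.csv',
    catalog_id=42,  # illustrative value
    catalog='discogs',
    entity='musician',
    confidence_range=(0.7, 1.0),  # keep only fairly confident matches
)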
Example #7
    def _process_masters_dump(self, dump_file_path):
        LOGGER.info("Starting import of masters from Discogs dump '%s'",
                    dump_file_path)
        start = datetime.now()
        tables = [DiscogsMasterEntity, DiscogsMasterArtistRelationship]
        db_manager = DBManager()
        LOGGER.info('Connected to database: %s', db_manager.get_engine().url)
        db_manager.drop(tables)
        db_manager.create(tables)
        LOGGER.info(
            'SQL tables dropped and re-created: %s',
            [table.__tablename__ for table in tables],
        )
        extracted_path = '.'.join(dump_file_path.split('.')[:-1])
        # Extract dump file if it has not yet been extracted
        if not os.path.exists(extracted_path):
            LOGGER.info('Extracting dump file')

            with gzip.open(dump_file_path, 'rb') as f_in:
                with open(extracted_path, 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)

        # count number of entries
        n_rows = sum(
            1 for _ in self._g_process_et_items(extracted_path, 'master'))
        session = db_manager.new_session()
        entity_array = []  # array to which we'll add the entities
        relationships_set = set()
        self.total_entities = 0
        for _, node in tqdm(self._g_process_et_items(extracted_path, 'master'),
                            total=n_rows):

            if node.tag != 'master':
                continue

            self.total_entities += 1
            entity = self._extract_from_master_node(node, relationships_set)
            entity_array.append(entity)
            # commit in batches of `self._sqlalchemy_commit_every`
            if len(entity_array) >= self._sqlalchemy_commit_every:
                LOGGER.info('Adding batch of entities to the database, '
                            'this will take a while. '
                            'Progress will resume soon.')

                insert_start_time = datetime.now()

                session.bulk_save_objects(entity_array)
                session.commit()
                session.expunge_all()  # clear session

                entity_array.clear()  # clear entity array

                LOGGER.debug(
                    'It took %s to add %s entities to the database',
                    datetime.now() - insert_start_time,
                    self._sqlalchemy_commit_every,
                )
        # finally commit remaining entities in session
        # (if any), and close session
        session.bulk_save_objects(entity_array)
        session.bulk_save_objects([
            DiscogsMasterArtistRelationship(id1, id2)
            for id1, id2 in relationships_set
        ])
        session.commit()
        session.close()

        end = datetime.now()
        LOGGER.info(
            'Import completed in %s. Total entities: %d. '
            'Total relationships: %d.',
            end - start,
            self.total_entities,
            len(relationships_set),
        )
        # once the import process is complete,
        # we can safely delete the extracted discogs dump
        os.remove(extracted_path)
Example #8
    def _process_artists_dump(self, dump_file_path, resolve):
        LOGGER.info(
            "Starting import of musicians and bands from Discogs dump '%s'",
            dump_file_path,
        )
        start = datetime.now()
        tables = [
            DiscogsMusicianEntity,
            DiscogsMusicianNlpEntity,
            DiscogsMusicianLinkEntity,
            DiscogsGroupEntity,
            DiscogsGroupNlpEntity,
            DiscogsGroupLinkEntity,
        ]
        db_manager = DBManager()
        LOGGER.info('Connected to database: %s', db_manager.get_engine().url)
        db_manager.drop(tables)
        db_manager.create(tables)
        LOGGER.info(
            'SQL tables dropped and re-created: %s',
            [table.__tablename__ for table in tables],
        )
        extracted_path = '.'.join(dump_file_path.split('.')[:-1])
        # Extract dump file if it has not yet been extracted
        if not os.path.exists(extracted_path):
            LOGGER.info('Extracting dump file')

            with gzip.open(dump_file_path, 'rb') as f_in:
                with open(extracted_path, 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)

        # count number of entries
        n_rows = sum(
            1 for _ in self._g_process_et_items(extracted_path, 'artist'))
        session = db_manager.new_session()
        entity_array = []  # array to which we'll add the entities
        for _, node in tqdm(self._g_process_et_items(extracted_path, 'artist'),
                            total=n_rows):

            if node.tag != 'artist':
                continue

            infos = self._extract_from_artist_node(node, resolve)

            if infos is None:
                continue

            # Musician
            if 'groups' in infos:
                entity = DiscogsMusicianEntity()
                self._populate_musician(entity_array, entity, infos)
            # Band
            elif 'members' in infos:
                entity = DiscogsGroupEntity()
                self._populate_band(entity_array, entity, infos)

            # commit in batches of `self._sqlalchemy_commit_every`
            if len(entity_array) >= self._sqlalchemy_commit_every:
                LOGGER.info('Adding batch of entities to the database, '
                            'this will take a while. '
                            'Progress will resume soon.')

                insert_start_time = datetime.now()

                session.bulk_save_objects(entity_array)
                session.commit()
                session.expunge_all()  # clear session

                entity_array.clear()  # clear entity array

                LOGGER.debug(
                    'It took %s to add %s entities to the database',
                    datetime.now() - insert_start_time,
                    self._sqlalchemy_commit_every,
                )
        # finally commit remaining entities in session
        # (if any), and close session
        session.bulk_save_objects(entity_array)
        session.commit()
        session.close()
        end = datetime.now()
        LOGGER.info(
            'Import completed in %s. '
            'Total entities: %d - %d musicians with %d links - %d bands'
            ' with %d links - %d discarded dead links.',
            end - start,
            self.total_entities,
            self.musicians,
            self.musician_links,
            self.bands,
            self.band_links,
            self.dead_links,
        )
        # once the import process is complete,
        # we can safely delete the extracted discogs dump
        os.remove(extracted_path)
Example #9
    def extract_and_populate(self, dump_file_paths: List[str],
                             resolve: bool):
        dump_file_path = dump_file_paths[0]
        dump_path = os.path.join(
            os.path.dirname(os.path.abspath(dump_file_path)),
            f"{os.path.basename(dump_file_path)}_extracted",
        )

        if not os.path.isdir(dump_path):
            with tarfile.open(dump_file_path, "r:bz2") as tar:
                LOGGER.info("Extracting dump %s in %s", dump_file_path,
                            dump_path)
                tar.extractall(dump_path)
                LOGGER.info("Extracted dump %s in %s", dump_file_path,
                            dump_path)

        db_manager = DBManager()

        tables = [MusicbrainzReleaseGroupEntity]
        db_manager.drop(tables)
        db_manager.create(tables)

        LOGGER.info(
            "Dropped and created tables %s",
            [table.__tablename__ for table in tables],
        )
        LOGGER.info("Importing release groups")

        release_groups_count = self._add_entities_from_generator(
            db_manager, self._release_group_generator, dump_path)

        LOGGER.debug("Added %s/%s release group records",
                     *release_groups_count)

        def release_artist_relationships_uniqueness_filter():
            """Remove duplicates from
            _release_group_artist_relationship_generator """
            yield from [
                MusicBrainzReleaseGroupArtistRelationship(item[0], item[1])
                for item in set(
                    self._release_group_artist_relationship_generator(
                        dump_path))
            ]

        tables = [MusicBrainzReleaseGroupArtistRelationship]
        db_manager.drop(tables)
        db_manager.create(tables)
        LOGGER.info(
            "Dropped and created tables %s",
            [table.__tablename__ for table in tables],
        )
        LOGGER.info("Importing relationships release-artist/band")

        relationships_count = self._add_entities_from_generator(
            db_manager, release_artist_relationships_uniqueness_filter)

        LOGGER.debug("Added %s/%s relationships records", *relationships_count)

        tables = [MusicbrainzReleaseGroupLinkEntity]
        db_manager.drop(tables)
        db_manager.create(tables)
        LOGGER.info(
            "Dropped and created tables %s",
            [table.__tablename__ for table in tables],
        )
        LOGGER.info("Importing release groups links")

        link_count = self._add_entities_from_generator(
            db_manager, self._release_group_link_generator, dump_path, resolve)

        LOGGER.debug("Added %s/%s release group link records", *link_count)

        tables = [MusicbrainzArtistEntity, MusicbrainzBandEntity]
        db_manager.drop(tables)
        db_manager.create(tables)

        LOGGER.info(
            "Dropped and created tables %s",
            [table.__tablename__ for table in tables],
        )
        LOGGER.info("Importing artists and bands")

        artist_count = self._add_entities_from_generator(
            db_manager, self._artist_generator, dump_path)

        LOGGER.debug("Added %s/%s artist records", *artist_count)

        tables = [MusicbrainzArtistLinkEntity, MusicbrainzBandLinkEntity]
        db_manager.drop(tables)
        db_manager.create(tables)

        LOGGER.info(
            "Dropped and created tables %s",
            [table.__tablename__ for table in tables],
        )
        LOGGER.info("Importing links")

        link_count = self._add_entities_from_generator(
            db_manager, self._artist_link_generator, dump_path, resolve)

        LOGGER.debug("Added %s/%s link records", *link_count)
        LOGGER.info("Importing ISNIs")

        isni_link_count = self._add_entities_from_generator(
            db_manager, self._isni_link_generator, dump_path, resolve)

        LOGGER.debug("Added %s/%s ISNI link records", *isni_link_count)

        tables = [MusicBrainzArtistBandRelationship]
        db_manager.drop(tables)
        db_manager.create(tables)
        LOGGER.info(
            "Dropped and created tables %s",
            [table.__tablename__ for table in tables],
        )
        LOGGER.info("Importing relationships artist-band")

        def artist_band_relationships_uniqueness_filter():
            yield from [
                MusicBrainzArtistBandRelationship(item[0], item[1])
                for item in set(
                    self._artist_band_relationship_generator(dump_path))
            ]

        relationships_count = self._add_entities_from_generator(
            db_manager, artist_band_relationships_uniqueness_filter)

        LOGGER.debug("Added %s/%s relationships records", *relationships_count)
        shutil.rmtree(dump_path, ignore_errors=True)
Example #10
    def extract_and_populate(self, dump_file_paths: List[str],
                             resolve: bool) -> None:
        """
        Extracts the data in the dumps (person and movie) and processes them.
        It then proceeds to add the appropriate data to the database.

        See
        :ref:`soweego.importer.models.imdb_entity` module to see the SQLAlchemy
        definition of the entities we use to save IMDB data.

        :param dump_file_paths: the absolute paths of the already downloaded
        dump files.
        """

        # the order of these files is specified in `self.get_dump_download_urls`
        person_file_path = dump_file_paths[0]
        movies_file_path = dump_file_paths[1]

        LOGGER.debug('Path to movie info dump: %s', movies_file_path)
        LOGGER.debug('Path to person info dump: %s', person_file_path)

        start = datetime.datetime.now()

        tables = [
            imdb_entity.ImdbActorEntity,
            imdb_entity.ImdbDirectorEntity,
            imdb_entity.ImdbMovieEntity,
            imdb_entity.ImdbMusicianEntity,
            imdb_entity.ImdbProducerEntity,
            imdb_entity.ImdbWriterEntity,
            imdb_entity.ImdbMoviePersonRelationship,
        ]

        db_manager = DBManager()
        LOGGER.info('Connected to database: %s', db_manager.get_engine().url)

        db_manager.drop(tables)
        db_manager.create(tables)

        LOGGER.info(
            'SQL tables dropped and re-created: %s',
            [table.__tablename__ for table in tables],
        )

        LOGGER.info('Starting import of movies ...')

        # Here we open the movie dump file, and add everything to the DB
        for movie_info, entity_array in self._loop_through_entities(
                movies_file_path):

            # create the movie SQLAlchemy entity and populate it
            movie_entity = imdb_entity.ImdbMovieEntity()
            movie_entity.catalog_id = movie_info.get('tconst')
            movie_entity.title_type = movie_info.get('titleType')
            if movie_info.get('primaryTitle') is not None:
                movie_entity.name = movie_info.get('primaryTitle')
                movie_entity.name_tokens = ' '.join(
                    text_utils.tokenize(movie_info.get('primaryTitle')))
            movie_entity.is_adult = movie_info.get('isAdult') == '1'
            try:
                movie_entity.born = datetime.date(
                    year=int(movie_info.get('startYear')), month=1, day=1
                )
                movie_entity.born_precision = 9
            except (KeyError, TypeError):
                LOGGER.debug('No start year value for %s', movie_entity)
            try:
                movie_entity.died = datetime.date(
                    year=int(movie_info.get('endYear')), month=1, day=1
                )
                movie_entity.died_precision = 9
            except (KeyError, TypeError):
                LOGGER.debug('No end year value for %s', movie_entity)
            movie_entity.runtime_minutes = movie_info.get('runtimeMinutes')

            if movie_info.get('genres'):  # if movie has a genre specified
                movie_entity.genres = ' '.join(
                    text_utils.tokenize(movie_info.get('genres')))

            # Create an alias entity when the original title differs
            alias = movie_info.get('originalTitle')
            if alias is not None and movie_entity.name != alias:
                alias_entity = copy.deepcopy(movie_entity)
                alias_entity.name = alias
                alias_entity.name_tokens = ' '.join(text_utils.tokenize(alias))
                entity_array.append(alias_entity)

            entity_array.append(movie_entity)

            self.n_movies += 1

        # mark end for movie import process
        end = datetime.datetime.now()
        LOGGER.info(
            'Movie import completed in %s. '
            'Total movies imported: %d',
            end - start,
            self.n_movies,
        )

        LOGGER.info('Starting import of people ...')

        # reset timer for persons import
        start = datetime.datetime.now()

        for person_info, entity_array in self._loop_through_entities(
                person_file_path):

            # IMDb saves the list of professions as a comma separated
            # string
            professions = person_info.get('primaryProfession')

            # if person has no professions then ignore it
            if not professions:
                LOGGER.debug('Person %s has no professions',
                             person_info.get('nconst'))
                continue

            professions = professions.split(',')

            # each person can be added to multiple tables in the DB,
            # each table stands for one of the main professions
            types_of_entities = []

            if 'actor' in professions or 'actress' in professions:
                self.n_actors += 1
                types_of_entities.append(imdb_entity.ImdbActorEntity())

            if 'director' in professions:
                self.n_directors += 1
                types_of_entities.append(imdb_entity.ImdbDirectorEntity())

            if 'producer' in professions:
                self.n_producers += 1
                types_of_entities.append(imdb_entity.ImdbProducerEntity())

            if any(prof in [
                    'sound_department',
                    'composer',
                    'music_department',
                    'soundtrack',
            ] for prof in professions):
                self.n_musicians += 1
                types_of_entities.append(imdb_entity.ImdbMusicianEntity())

            if 'writer' in professions:
                self.n_writers += 1
                types_of_entities.append(imdb_entity.ImdbWriterEntity())

            # if the only profession a person has is `miscellaneous` then we
            # add it to all tables
            if professions == ['miscellaneous']:
                self.n_misc += 1
                types_of_entities = [
                    imdb_entity.ImdbActorEntity(),
                    imdb_entity.ImdbDirectorEntity(),
                    imdb_entity.ImdbMusicianEntity(),
                    imdb_entity.ImdbProducerEntity(),
                    imdb_entity.ImdbWriterEntity(),
                ]

            # add person to every matching table
            for etype in types_of_entities:
                self._populate_person(etype, person_info, entity_array)

            # if person is known for any movies then add these to the
            # database as well
            if person_info.get('knownForTitles'):
                self.n_person_movie_links += 1
                self._populate_person_movie_relations(person_info,
                                                      entity_array)

            self.n_persons += 1

        # mark the end time for the person import process
        end = datetime.datetime.now()
        LOGGER.info(
            'Person import completed in %s. '
            'Total people imported: %d - '
            'Actors: %d - Directors: %d - Musicians: %d - '
            'Producers: %d - Writers: %d - Misc: %d',
            end - start,
            self.n_persons,
            self.n_actors,
            self.n_directors,
            self.n_musicians,
            self.n_producers,
            self.n_writers,
            self.n_misc,
        )
Example #11
    def _loop_through_entities(
            self, file_path: str) -> Generator[Tuple[Dict, List], None, None]:
        """
        Generator that given an IMDb dump file (which
        should be ".tsv.gz" format) it loops through every
        entry and yields it.

        :return: a generator which yields a Tuple[entity_info, entity_array]
        the consumer of this generator will take `entity_info`, create an
        SQLAlchemy entity, and append this to the `entity_array`
        """
        db_manager = DBManager()

        with gzip.open(file_path, 'rt') as ddump:
            session = db_manager.new_session()

            # count the number of rows for tqdm, so we can display how
            # much remains to complete the process. Then go back to the
            # start of the file with `.seek(0)`
            n_rows = sum(1 for line in ddump)
            ddump.seek(0)

            entity_array = []
            LOGGER.debug('Dump "%s" has %d entries', file_path, n_rows)

            reader = csv.DictReader(ddump, delimiter='\t')

            # for every entry in the file..
            for entity_info in tqdm(reader, total=n_rows):
                # clean the entry
                self._normalize_null(entity_info)

                # yield the cleaned dict
                yield entity_info, entity_array

                # every `_sqlalchemy_commit_every` loops we commit the
                # session to the DB. This is more efficient than committing
                # every loop, and is not as hard on memory as adding
                # everything to the session and committing once the for
                # loop is done
                if len(entity_array) >= self._sqlalchemy_commit_every:
                    LOGGER.info(
                        'Adding batch of entities to the database, '
                        'this will take a while. Progress will resume soon.')

                    insert_start_time = datetime.datetime.now()

                    session.bulk_save_objects(entity_array)
                    session.commit()
                    session.expunge_all()  # clear session

                    entity_array.clear()  # clear entity array

                    LOGGER.debug(
                        'It took %s to add %s entities to the database',
                        datetime.datetime.now() - insert_start_time,
                        self._sqlalchemy_commit_every,
                    )

            # commit remaining entities
            session.bulk_save_objects(entity_array)
            session.commit()

            # clear the list, since a reference to it might still be held
            # in the scope where this generator was consumed
            entity_array.clear()
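The yielded tuple carries a shared entity_array: the consumer appends freshly built entities to it, and the generator flushes them to the DB in batches behind the scenes. A consumer sketch matching the loop in Example #10:

for person_info, entity_array in self._loop_through_entities(file_path):
    entity = imdb_entity.ImdbActorEntity()  # illustrative entity type
    entity.catalog_id = person_info.get('nconst')
    entity_array.append(entity)  # the generator commits these in batches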
Example #12
    def extract_and_populate(self, dump_file_paths: List[str], resolve: bool):
        """Extract relevant data from the *artist* (people) and *release group*
        (works) MusicBrainz dumps, preprocess them, populate
        `SQLAlchemy <https://www.sqlalchemy.org/>`_ ORM entities, and persist
        them to a database instance.

        See :mod:`~soweego.importer.models.musicbrainz_entity`
        for the ORM definitions.

        :param dump_file_paths: paths to downloaded catalog dumps
        :param resolve: whether to resolve URLs found in catalog dumps or not
        """
        dump_file_path = dump_file_paths[0]
        dump_path = os.path.join(
            os.path.dirname(os.path.abspath(dump_file_path)),
            f"{os.path.basename(dump_file_path)}_extracted",
        )

        if not os.path.isdir(dump_path):
            with tarfile.open(dump_file_path, "r:bz2") as tar:
                LOGGER.info("Extracting dump %s in %s", dump_file_path,
                            dump_path)
                tar.extractall(dump_path)
                LOGGER.info("Extracted dump %s in %s", dump_file_path,
                            dump_path)

        db_manager = DBManager()

        tables = [MusicBrainzReleaseGroupEntity]
        db_manager.drop(tables)
        db_manager.create(tables)

        LOGGER.info(
            "Dropped and created tables %s",
            [table.__tablename__ for table in tables],
        )
        LOGGER.info("Importing release groups")

        release_groups_count = self._add_entities_from_generator(
            db_manager, self._release_group_generator, dump_path)

        LOGGER.debug("Added %s/%s release group records",
                     *release_groups_count)

        def release_artist_relationships_uniqueness_filter():
            """Remove duplicates from
            _release_group_artist_relationship_generator """
            yield from [
                MusicBrainzReleaseGroupArtistRelationship(item[0], item[1])
                for item in set(
                    self._release_group_artist_relationship_generator(
                        dump_path))
            ]

        tables = [MusicBrainzReleaseGroupArtistRelationship]
        db_manager.drop(tables)
        db_manager.create(tables)
        LOGGER.info(
            "Dropped and created tables %s",
            [table.__tablename__ for table in tables],
        )
        LOGGER.info("Importing relationships release-artist/band")

        relationships_count = self._add_entities_from_generator(
            db_manager, release_artist_relationships_uniqueness_filter)

        LOGGER.debug("Added %s/%s relationships records", *relationships_count)

        tables = [MusicBrainzReleaseGroupLinkEntity]
        db_manager.drop(tables)
        db_manager.create(tables)
        LOGGER.info(
            "Dropped and created tables %s",
            [table.__tablename__ for table in tables],
        )
        LOGGER.info("Importing release groups links")

        link_count = self._add_entities_from_generator(
            db_manager, self._release_group_link_generator, dump_path, resolve)

        LOGGER.debug("Added %s/%s release group link records", *link_count)

        tables = [MusicBrainzArtistEntity, MusicBrainzBandEntity]
        db_manager.drop(tables)
        db_manager.create(tables)

        LOGGER.info(
            "Dropped and created tables %s",
            [table.__tablename__ for table in tables],
        )
        LOGGER.info("Importing artists and bands")

        artist_count = self._add_entities_from_generator(
            db_manager, self._artist_generator, dump_path)

        LOGGER.debug("Added %s/%s artist records", *artist_count)

        tables = [MusicBrainzArtistLinkEntity, MusicBrainzBandLinkEntity]
        db_manager.drop(tables)
        db_manager.create(tables)

        LOGGER.info(
            "Dropped and created tables %s",
            [table.__tablename__ for table in tables],
        )
        LOGGER.info("Importing links")

        link_count = self._add_entities_from_generator(
            db_manager, self._artist_link_generator, dump_path, resolve)

        LOGGER.debug("Added %s/%s link records", *link_count)
        LOGGER.info("Importing ISNIs")

        isni_link_count = self._add_entities_from_generator(
            db_manager, self._isni_link_generator, dump_path, resolve)

        LOGGER.debug("Added %s/%s ISNI link records", *isni_link_count)

        tables = [MusicBrainzArtistBandRelationship]
        db_manager.drop(tables)
        db_manager.create(tables)
        LOGGER.info(
            "Dropped and created tables %s",
            [table.__tablename__ for table in tables],
        )
        LOGGER.info("Importing relationships artist-band")

        def artist_band_relationships_uniqueness_filter():
            yield from [
                MusicBrainzArtistBandRelationship(item[0], item[1])
                for item in set(
                    self._artist_band_relationship_generator(dump_path))
            ]

        relationships_count = self._add_entities_from_generator(
            db_manager, artist_band_relationships_uniqueness_filter)

        LOGGER.debug("Added %s/%s relationships records", *relationships_count)
        shutil.rmtree(dump_path, ignore_errors=True)
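Both MusicBrainz importers deduplicate relationships by draining the generator into a set before building ORM objects; duplicates would otherwise hit the table's unique constraint and raise IntegrityError, which Example #5 handles row by row instead. A generic sketch of the pattern, with hypothetical names:

def deduplicated(pair_generator, relationship_cls):
    # Eagerly drains the generator: costs O(n) memory, but avoids
    # per-row IntegrityError handling at insert time
    for first, second in set(pair_generator):
        yield relationship_cls(first, second)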