def _classification_set_generator(
    catalog: str, entity: str, dir_io: str
) -> Iterator[Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]]:
    """Yield *(Wikidata chunk, target chunk, feature vectors)* triples
    ready for classification."""
    goal = 'classification'

    # Wikidata side
    wd_reader = workflow.build_wikidata(goal, catalog, entity, dir_io)
    wd_generator = workflow.preprocess_wikidata(goal, wd_reader)

    for i, wd_chunk in enumerate(wd_generator, 1):
        # Collect samples via queries to the target DB
        samples = blocking.find_samples(
            goal,
            catalog,
            wd_chunk[keys.NAME_TOKENS],
            i,
            target_database.get_main_entity(catalog, entity),
            dir_io,
        )

        # Build target chunk from samples
        target_reader = workflow.build_target(
            goal, catalog, entity, set(samples.get_level_values(keys.TID))
        )
        # Preprocess target chunk
        target_chunk = workflow.preprocess_target(goal, target_reader)

        # Extract features
        features_path = os.path.join(
            dir_io, constants.FEATURES.format(catalog, entity, goal, i)
        )
        feature_vectors = workflow.extract_features(
            samples, wd_chunk, target_chunk, features_path
        )

        yield wd_chunk, target_chunk, feature_vectors

        LOGGER.info('Chunk %d classified', i)
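
# A minimal consumption sketch for the generator above, with hypothetical
# arguments ('discogs', 'musician', '/tmp/soweego'): each yielded triple
# pairs a preprocessed Wikidata chunk with its target chunk and the feature
# vectors computed over their candidate sample pairs.
#
#   for wd_chunk, target_chunk, feature_vectors in _classification_set_generator(
#       'discogs', 'musician', '/tmp/soweego'
#   ):
#       print(wd_chunk.shape, target_chunk.shape, feature_vectors.shape)
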
def build_training_set(
    catalog: str, entity: str, dir_io: str
) -> Tuple[pd.DataFrame, pd.MultiIndex]:
    """Build a training set.

    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param dir_io: input/output directory where working files
      will be read/written
    :return: the feature vectors and positive samples pair.
      Features are computed by comparing *(QID, catalog ID)* pairs.
      Positive samples are catalog IDs available in Wikidata
    """
    goal = 'training'

    # Wikidata side
    wd_reader = workflow.build_wikidata(goal, catalog, entity, dir_io)
    wd_generator = workflow.preprocess_wikidata(goal, wd_reader)

    positive_samples, feature_vectors = None, None

    for i, wd_chunk in enumerate(wd_generator, 1):
        # Positive samples come from Wikidata
        if positive_samples is None:
            positive_samples = wd_chunk[keys.TID]
        else:
            # We concatenate the current chunk
            # and reset `positive_samples` at each iteration,
            # instead of appending each chunk to a list,
            # then concatenating the list at the end of the loop.
            # Reason: keeping multiple yet small pandas objects
            # is less memory-efficient
            positive_samples = pd.concat(
                [positive_samples, wd_chunk[keys.TID]]
            )

        # All samples come from queries to the target DB
        # and include negative ones
        all_samples = blocking.find_samples(
            goal,
            catalog,
            wd_chunk[keys.NAME_TOKENS],
            i,
            target_database.get_main_entity(catalog, entity),
            dir_io,
        )

        # Build target chunk from all samples
        target_reader = workflow.build_target(
            goal, catalog, entity, set(all_samples.get_level_values(keys.TID))
        )
        # Preprocess target chunk
        target_chunk = workflow.preprocess_target(goal, target_reader)

        features_path = os.path.join(
            dir_io, constants.FEATURES.format(catalog, entity, goal, i)
        )

        # Extract features from all samples
        chunk_fv = workflow.extract_features(
            all_samples, wd_chunk, target_chunk, features_path
        )

        if feature_vectors is None:
            feature_vectors = chunk_fv
        else:
            feature_vectors = pd.concat([feature_vectors, chunk_fv], sort=False)

    # Final positive samples index
    positive_samples_index = pd.MultiIndex.from_tuples(
        zip(positive_samples.index, positive_samples),
        names=[keys.QID, keys.TID],
    )

    LOGGER.info('Built positive samples index from Wikidata')

    feature_vectors = feature_vectors.fillna(constants.FEATURE_MISSING_VALUE)

    return feature_vectors, positive_samples_index
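
# A minimal usage sketch with hypothetical arguments: feed the output of
# `build_training_set` to a supervised `recordlinkage` classifier.
# `rl.NaiveBayesClassifier` is one classifier the library offers; the
# actual training entry point of this codebase may differ.
#
#   import recordlinkage as rl
#
#   feature_vectors, positive_index = build_training_set(
#       'discogs', 'musician', '/tmp/soweego'
#   )
#   classifier = rl.NaiveBayesClassifier()
#   # Fit on the feature vectors, using Wikidata pairs as positive samples
#   classifier.fit(feature_vectors, positive_index)
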
def execute(
    model_path: str,
    catalog: str,
    entity: str,
    threshold: float,
    name_rule: bool,
    dir_io: str,
) -> Iterator[pd.Series]:
    """Run a supervised linker.

    1. Build the classification set relevant to the given catalog and entity
    2. Generate links between Wikidata items and catalog identifiers

    :param model_path: path to a trained model file
    :param catalog: ``{'discogs', 'imdb', 'musicbrainz'}``.
      A supported catalog
    :param entity: ``{'actor', 'band', 'director', 'musician', 'producer',
      'writer', 'audiovisual_work', 'musical_work'}``.
      A supported entity
    :param threshold: minimum confidence score for generated links.
      Links below this value are discarded.
      Must be a float between 0 and 1
    :param name_rule: whether to enable the full names rule:
      if *True*, links with different full names
      are discarded after classification
    :param dir_io: input/output directory where working files
      will be read/written
    :return: the generator yielding chunks of links
    """
    goal = 'classification'
    classifier = joblib.load(model_path)

    # Wikidata side
    wd_reader = workflow.build_wikidata(goal, catalog, entity, dir_io)
    wd_generator = workflow.preprocess_wikidata(goal, wd_reader)

    for i, wd_chunk in enumerate(wd_generator, 1):
        # Collect samples via queries to the target DB
        samples = blocking.find_samples(
            goal,
            catalog,
            wd_chunk[keys.NAME_TOKENS],
            i,
            target_database.get_main_entity(catalog, entity),
            dir_io,
        )

        # Build target chunk from samples
        target_reader = workflow.build_target(
            goal, catalog, entity, set(samples.get_level_values(keys.TID))
        )
        # Preprocess target chunk
        target_chunk = workflow.preprocess_target(goal, target_reader)

        # Extract features
        features_path = os.path.join(
            dir_io, constants.FEATURES.format(catalog, entity, goal, i)
        )
        feature_vectors = workflow.extract_features(
            samples, wd_chunk, target_chunk, features_path
        )

        # The classification set must have the same feature space
        # as the training one
        _add_missing_feature_columns(classifier, feature_vectors)

        predictions = (
            # LSVM doesn't support probability scores
            classifier.predict(feature_vectors)
            if isinstance(classifier, rl.SVMClassifier)
            else classifier.prob(feature_vectors)
        )

        # Full names rule: if names differ, it's not a link
        if name_rule:
            LOGGER.info('Applying full names rule ...')
            predictions = pd.DataFrame(predictions).apply(
                _zero_when_different_names,
                axis=1,
                args=(wd_chunk, target_chunk),
            )

        # Wikidata URL rule: if the target ID has a Wikidata URL, it's a link
        if target_chunk.get(keys.URL) is not None:
            predictions = pd.DataFrame(predictions).apply(
                _one_when_wikidata_link_correct, axis=1, args=(target_chunk,)
            )

        LOGGER.info('Chunk %d classified', i)

        # Filter by threshold
        above_threshold = predictions[predictions >= threshold]

        # Remove duplicates
        yield above_threshold[~above_threshold.index.duplicated()]
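
# A minimal usage sketch with hypothetical paths and threshold: consume the
# linker generator and append each chunk of links (a pandas Series of
# confidence scores indexed by *(QID, TID)* pairs) to a TSV file.
#
#   for chunk in execute(
#       '/tmp/soweego/classifier.pkl',
#       'discogs',
#       'musician',
#       threshold=0.5,
#       name_rule=True,
#       dir_io='/tmp/soweego',
#   ):
#       chunk.to_csv('links.tsv', sep='\t', mode='a', header=False)
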