Example #1
    def get_candidates(self,
                       htid,
                       n=300,
                       min_count=2,
                       max_dist=.25,
                       min_prop_match=False,
                       search_k=-1,
                       force=False,
                       save=False,
                       ann_path=None,
                       ann_dims=None,
                       prefault=False):
        '''
        force: Force a recrunch even if the file already exists. Otherwise, the existing file is loaded.
        save: Save results to data_dir using the stubbytree file structure.
        '''
        outpath = os.path.join(
            self.data_dir, utils.id_to_stubbytree(htid, format='ann.parquet'))

        if not force and os.path.exists(outpath):
            logging.debug('File already found: {}'.format(outpath))
            try:
                results = pd.read_parquet(outpath)

                # Post-cache filtering of rows
                results = results[results['count'] >= min_count]
                results = results[results['mean'] <= max_dist]
                if min_prop_match:
                    results = results[results['prop_match'] >= min_prop_match]
                return results

            except OSError:
                logging.warning("Issue loading ANN Candidates. Recrunching.")

        mtannoy = self.mtannoy(ann_path, ann_dims, prefault, force=False)
        results = mtannoy.doc_match_stats(htid,
                                          n=n,
                                          min_count=min_count,
                                          max_dist=max_dist,
                                          search_k=search_k)
        if save:
            os.makedirs(os.path.split(outpath)[0],
                        exist_ok=True)  # Create directories if needed
            results.to_parquet(outpath, compression='snappy')

        if min_prop_match:
            results = results[results['prop_match'] >= min_prop_match]

        return results
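Below is a minimal usage sketch, not part of the original source: it assumes these methods live on the Saddler class constructed in Example #3, and the data_dir and htid values are placeholders.

saddler = Saddler(data_dir='/data/saddl/full/')             # placeholder data_dir
candidates = saddler.get_candidates('mdp.39015012345678',   # placeholder htid
                                    n=300,
                                    min_count=2,
                                    max_dist=.25,
                                    save=True)
# Results are already filtered on 'count' and 'mean' as shown above
print(candidates.head())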
Example #2
    def get_model_predictions(self,
                              htid,
                              candidates,
                              model_path=None,
                              metadb_path=None,
                              save=False,
                              force=False):
        '''
        Take left-right candidates and run them through the TensorFlow model.
        '''
        outpath = os.path.join(
            self.data_dir,
            utils.id_to_stubbytree(htid, format='predictions.parquet'))

        if not force and os.path.exists(outpath):
            logging.debug('Predictions already found: {}'.format(outpath))
            predictions = pd.read_parquet(outpath)
            return predictions

        placeholder = pd.DataFrame([],
                                   columns=[
                                       'SWSM', 'SWDE', 'WP_DV', 'PARTOF',
                                       'CONTAINS', 'OVERLAPS', 'AUTHOR',
                                       'SIMDIFF', 'GRSIM', 'RANDDIFF', 'htid',
                                       'guess', 'title', 'description',
                                       'author', 'rights_date_used',
                                       'oclc_num', 'isbn', 'relatedness'
                                   ])
        if len(candidates) <= 1:
            predictions = placeholder
        else:
            rightindex, inputs = self._get_simmats_from_candidates(
                candidates, reshape=(150, 150, 1))
            if rightindex is None:
                return None  # No data - different from empty predictions
            else:
                predictions = self._predict_from_simmat(
                    rightindex, inputs, model_path, metadb_path)

        if save:
            os.makedirs(os.path.split(outpath)[0],
                        exist_ok=True)  # Create directories if needed
            if predictions.empty:
                # Add a dummy row so the parquet file isn't empty
                predictions.loc[1] = [0] * 10 + [pd.NA] * 8 + [0]
            predictions.to_parquet(outpath, compression='snappy')

        return predictions
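A hedged sketch of chaining candidates into model predictions, again assuming the Saddler class from Example #3; the htid is a placeholder, and metadata candidates are just one plausible source of the candidate dataframe.

saddler = Saddler(data_dir='/data/saddl/full/')   # placeholder data_dir
htid = 'mdp.39015012345678'                       # placeholder htid
candidates = saddler.get_meta_candidates(htid, save=True)   # see Example #5
predictions = saddler.get_model_predictions(htid, candidates, save=True)
if predictions is None:
    print('No similarity-matrix data available for', htid)
else:
    print(predictions[['guess', 'relatedness']].head())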
Example #3
def main():
    import argparse
    logFormatter = logging.Formatter(
        "%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s]  %(message)s")
    rootLogger = logging.getLogger()

    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(dest='command')
    parser.add_argument("--data-root",
                        type=str,
                        default='/data/saddl/full/',
                        help="Location to save stubbytree data file outputs")
    parser.add_argument("--limit-workers",
                        type=int,
                        default=4,
                        help="Limit number of workers for Dask")
    parser.add_argument("--log-path",
                        type=str,
                        default='/tmp/',
                        help="Location for log files.")
    parser.add_argument("-v",
                        "--verbose",
                        help="increase output verbosity",
                        action="store_true")

    ann_parser = subparsers.add_parser(
        "Candidates", help="Save candidate relationships from ANN")
    meta_parser = subparsers.add_parser(
        "Meta_Candidates", help="Save candidate relationships from metadata")
    prediction_parser = subparsers.add_parser(
        "Predictions",
        help="Run candidates through SaDDL model to get predicted relationship."
    )
    inventory_parser = subparsers.add_parser(
        "Inventory",
        help="Take an inventory of which htids have been crunched, per step.")

    inventory_parser.add_argument('--targets', type=str, default=None,
                            help="Optional file with target htids. If provided, inventory will also save output " \
                                  "files of remaining htids")
    inventory_parser.add_argument('--prefix', type=str, default='/tmp/inventory',
                            help="Prefix for inventory files. e.g. the default /tmp/inventory will write " \
                                  "/tmp/inventory-processedann.gz, /tmp/inventory-processeddata.gz, etc.")

    # Args for prediction parser
    prediction_parser.add_argument('--model-path', type=str, default=None,
                            help="Location of SaDDL model. Default is None, which tries to fall " \
                            "back on what's in the config file")
    prediction_parser.add_argument(
        "--force-candidates",
        action="store_true",
        help=
        "Reprocess and overwrite candidate raising process if they already exist"
    )
    prediction_parser.add_argument(
        "--force-predictions",
        action="store_true",
        help=
        "Reprocess and overwrite model inference if it's already been saved")
    prediction_parser.add_argument(
        "--force-json",
        action="store_true",
        help=
        "Reprocess and overwrite final JSON files formatting if it's already been done"
    )
    prediction_parser.add_argument("--skip-json-output", action="store_true",
                            help="Just do model inference and save the raw data in parquet, without formatting for " \
                                   "dataset output.")

    # Configure for the MTAnnoy candidate retrieval
    for subparser in [ann_parser, prediction_parser]:
        subparser.add_argument('--ann-path', type=str, default=None,
                                help="Location of MTAnnoy index. Default is None, which tries to fall " \
                                "back on what's in the config file")
        subparser.add_argument(
            '--ann-dims',
            type=int,
            default=50,
            help='Number of dimensions for the MTAnnoy index.')
        subparser.add_argument(
            "--results-per-chunk",
            "-n",
            type=int,
            default=300,
            help="Number of ANN results to return per chunk")
        subparser.add_argument(
            "--min-count",
            type=int,
            default=2,
            help="Min number of matching chunks between books.")
        subparser.add_argument("--min-prop-match",
                               type=float,
                               default=.03,
                               help="Min proportion of match seen in target.")
        subparser.add_argument(
            "--max-dist",
            type=float,
            default=.18,
            help="Maximum distance between matching chunks.")
        subparser.add_argument('--prefault',
                               action='store_true',
                               help='Load ANN into memory.')

    for subparser in [meta_parser, prediction_parser]:
        subparser.add_argument('--title-ann-path', type=str, default=None,
                                    help="Location of Annoy index for book titles. Default is None, which tries to fall " \
                                    "back on what's in the config file")

    for subparser in [meta_parser, ann_parser, prediction_parser]:
        subparser.add_argument("--search-k",
                               type=int,
                               default=-1,
                               help="ANN search k parameter.")
        subparser.add_argument(
            "--htid-in",
            type=argparse.FileType('r'),
            default=None,
            help=
            'File of HTIDs to process. If set, htids args provided on the command line are ignored.'
        )
        subparser.add_argument(
            "htids",
            nargs='*',
            help='HTIDs to process. Alternately, provide --htid-in')

    for subparser in [meta_parser, ann_parser]:
        subparser.add_argument(
            "--overwrite",
            action="store_true",
            help=
            "Overwrite files if they already exist. Otherwise, they're skipped"
        )

    args = parser.parse_args()

    if not args.command:
        parser.print_help()
        return

    if args.limit_workers:
        import dask
        dask.config.set(num_workers=args.limit_workers)

    np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)

    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    if args.log_path:
        fileHandler = logging.FileHandler("{0}/saddl-{1}.log".format(
            args.log_path, time.time()))
        fileHandler.setFormatter(logFormatter)
        rootLogger.addHandler(fileHandler)

    consoleHandler = logging.StreamHandler()
    consoleHandler.setFormatter(logFormatter)
    rootLogger.addHandler(consoleHandler)

    saddlr = Saddler(data_dir=args.data_root)

    starttime = time.time()
    skipped = 0
    errors = 0

    if args.command == 'Inventory':
        saddlr.inventory_files(prefix=args.prefix, target_list=args.targets)
    else:
        if args.htid_in:
            htids = [htid.strip() for htid in args.htid_in]
        else:
            htids = args.htids

    if args.command == 'Meta_Candidates':
        for i, htid in enumerate(htids):
            try:
                outpath = os.path.join(
                    args.data_root,
                    utils.id_to_stubbytree(htid, format='meta.parquet'))
                if not args.overwrite and os.path.exists(outpath):
                    logging.debug('File already found: {}'.format(outpath))
                    skipped += 1
                    continue

                results = saddlr.get_meta_candidates(htid,
                                                     args.title_ann_path,
                                                     save=True,
                                                     search_k=args.search_k,
                                                     force=args.overwrite)

                print_progress(starttime,
                               i + 1 - skipped,
                               len(htids) - skipped,
                               print_every=100)

            except KeyboardInterrupt:
                raise

            except KeyError:
                logging.warning(
                    f"Metadata key error with {htid} (not in Hathifiles or in Title Index)"
                )

            except:
                errors += 1
                logging.exception(
                    "Undiagnosed issue with {} (#{}; total errors: #{})".
                    format(htid, i, errors))

    if args.command == 'Candidates':
        # Pre-load MTAnnoy. Unnecessary, but more readable below
        saddlr.mtannoy(ann_dims=args.ann_dims,
                       ann_path=args.ann_path,
                       prefault=args.prefault)

        for i, htid in enumerate(htids):
            try:
                outpath = os.path.join(
                    args.data_root,
                    utils.id_to_stubbytree(htid, format='ann.parquet'))
                if not args.overwrite and os.path.exists(outpath):
                    logging.debug('File already found: {}'.format(outpath))
                    skipped += 1
                    continue

                results = saddlr.get_candidates(
                    htid,
                    n=args.results_per_chunk,
                    min_count=args.min_count,
                    max_dist=args.max_dist,
                    min_prop_match=args.min_prop_match,
                    search_k=args.search_k,
                    force=args.overwrite,
                    save=True)

                print_progress(starttime,
                               i + 1 - skipped,
                               len(htids) - skipped,
                               print_every=2)

            except KeyboardInterrupt:
                raise

            except KeyError:
                logging.warning(f"Key error with {htid}")

            except:
                logging.exception("Issue with {}".format(htid))

    elif args.command == "Predictions":
        # Pre-load TF Model, for readability
        saddlr.tf_model(args.model_path)

        for i, htid in enumerate(htids):
            if args.skip_json_output:
                outpath = os.path.join(
                    saddlr.data_dir,
                    utils.id_to_stubbytree(htid, format='predictions.parquet'))
                force = args.force_predictions
            else:
                outpath = os.path.join(
                    saddlr.data_dir,
                    utils.id_to_stubbytree(htid, format='saddl.json'))
                force = args.force_json

            if not force and os.path.exists(outpath):
                logging.debug('File already found: {}'.format(outpath))
                skipped += 1
                continue

            try:
                saddlr.get_predictions(
                    htid,
                    save_all=True,
                    title_ann_path=args.title_ann_path,
                    force_candidates=args.force_candidates,
                    force_predictions=args.force_predictions,
                    force_output=args.force_json,
                    skip_json_output=args.skip_json_output,
                    ann_args=dict(n=args.results_per_chunk,
                                  min_count=args.min_count,
                                  max_dist=args.max_dist,
                                  search_k=args.search_k,
                                  ann_path=args.ann_path,
                                  ann_dims=args.ann_dims,
                                  min_prop_match=args.min_prop_match,
                                  prefault=args.prefault))

                print_progress(starttime,
                               i + 1 - skipped,
                               len(htids) - skipped,
                               print_every=10)

            except KeyboardInterrupt:
                raise

            except:
                logging.exception("Undiagnosed issue with {}".format(htid))
Example #4
    def export_structured_data(self,
                               htid,
                               predictions,
                               target=None,
                               save=False,
                               force=False):
        '''
        target: Series of metadata for the target volume. If not supplied, it is *loaded* in this
            method, so only pass it if you already have it in memory and want to skip the lookup.
        '''
        if predictions is None:
            # Don't save a dataset if no predictions were given.
            # This is different from the predictions dataframe being empty.
            return None
        # Redundant, since predictions shouldn't be saved with NAs; kept for backward compatibility with older files
        predictions = predictions.dropna(subset=judgment_labels)

        outpath = os.path.join(
            self.data_dir, utils.id_to_stubbytree(htid, format='saddl.json'))
        if not force and os.path.exists(outpath):
            logging.debug('Dataset already found: {}'.format(outpath))
            try:
                with open(outpath, mode='r') as f:
                    data_entry = json.load(f)
                    return data_entry
            except json.JSONDecodeError:
                logging.error("loading error. Will ignore loading")

        if target is None:
            target = dd.read_parquet(self.config['metadb_path'],
                                     engine='pyarrow-dataset',
                                     filters=[
                                         ('htid', '==', htid)
                                     ]).reset_index().compute().iloc[0]

        base_meta = [
            'htid', 'title', 'author', 'description', 'rights_date_used',
            'oclc_num', 'isbn'
        ]
        data_entry = dict(volume=target[base_meta].to_dict())
        data_entry['volume'][
            'link'] = "http://hdl.handle.net/2027/" + target['htid']
        data_entry['related_metadata'] = dict()
        data_entry['relationships'] = dict()
        data_entry['recommendations'] = dict()

        aut_prints = predictions.author.apply(alpha_fingerprint)
        target_print = alpha_fingerprint(target.author)
        by_author = predictions[aut_prints == target_print]

        # Add Collected Metadata
        def unique_nontarget_values(field,
                                    limit=['SWSM', 'SWDE'],
                                    df=by_author):
            diff = (df[field] != target[field]) if target[field] else True
            uniq = list(df[df.guess.isin(limit) & diff][field].unique())
            return uniq if len(uniq) else []

        data_entry['related_metadata'][
            'other years'] = unique_nontarget_values('rights_date_used')
        data_entry['related_metadata'][
            'other titles'] = unique_nontarget_values('title')
        data_entry['related_metadata'][
            'other OCLC numbers'] = unique_nontarget_values('oclc_num')
        data_entry['related_metadata'][
            'other enumchron values'] = unique_nontarget_values('description')
        data_entry['related_metadata'][
            'titles within this work'] = unique_nontarget_values(
                'title', ["CONTAINS"])
        data_entry['related_metadata'][
            'titles of works that contain this work'] = unique_nontarget_values(
                'title', ["PARTOF"])

        # Add Same Work Info
        def get_dict_by_guess(guess):
            a = by_author[by_author.guess == guess].sort_values(
                guess, ascending=False)
            if a.empty:
                return []
            a = a[base_meta + [guess]]
            a = a.rename(columns={
                'rights_date_used': 'year',
                guess: "confidence"
            })
            a['confidence'] = a['confidence'].multiply(100).astype(int)
            return a.to_dict(orient='records')

        data_entry['relationships']['identical works'] = get_dict_by_guess(
            "SWSM")
        data_entry['relationships'][
            'different expressions'] = get_dict_by_guess("SWDE")
        data_entry['relationships'][
            'other volumes of the larger work'] = get_dict_by_guess("WP_DV")
        data_entry['relationships']['this work contains'] = get_dict_by_guess(
            "CONTAINS")
        data_entry['relationships'][
            'this work is a part of'] = get_dict_by_guess("PARTOF")

        other_works = predictions[~predictions.guess.isin(
            ['SWSM', 'SWDE', 'WP_DV', 'CONTAINS', 'PARTOF'])]
        recs = other_works[other_works.relatedness > 0.05].sort_values(
            'relatedness').head(20)
        data_entry['recommendations'][
            'related authors'] = unique_nontarget_values('author',
                                                         judgment_labels,
                                                         df=recs)
        data_entry['recommendations']['similar books'] = recs[
            base_meta].rename(columns={
                'rights_date_used': 'year'
            }).to_dict(orient='records')

        if save:
            os.makedirs(os.path.split(outpath)[0],
                        exist_ok=True)  # Create directories if needed
            with open(outpath, mode='w') as f:
                json.dump(data_entry, f, cls=NumpyEncoder)

        return data_entry
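A sketch of producing the structured JSON entry from predictions, assuming the Saddler class and the methods from Examples #1 and #2; the data_dir and htid are placeholders.

saddler = Saddler(data_dir='/data/saddl/full/')   # placeholder data_dir
htid = 'mdp.39015012345678'                       # placeholder htid
candidates = saddler.get_candidates(htid, save=True)
predictions = saddler.get_model_predictions(htid, candidates, save=True)
entry = saddler.export_structured_data(htid, predictions, save=True)
if entry is not None:
    print(entry['volume']['link'])
    print(list(entry['relationships'].keys()))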
Example #5
    def get_meta_candidates(self,
                            htid,
                            title_ann_path=None,
                            sim_titles=True,
                            same_authors=True,
                            max_dist=.35,
                            search_k=-1,
                            max_author_results=100,
                            max_title_results=300,
                            raw_output=False,
                            save=False,
                            force=False):
        '''
        Get metadata-based candidates based on approximate title match (sim_titles=True) and same author
        match (same_authors=True).
        
        raw_output: Return underlying data as a dataframe or a tuple of two dataframes. Otherwise,
            results are returned as a single match/target/note dataframe
        
        max_author_results, max_title_results: Cap for how many candidates to return. A failsafe
            in case of edge cases that have a great deal of matches (e.g. U.S. Gov't, books named 'Works')
        '''
        assert sim_titles or same_authors
        outpath = os.path.join(
            self.data_dir, utils.id_to_stubbytree(htid, format='meta.parquet'))

        if not force and os.path.exists(outpath):
            logging.debug('Meta candidates already found: {}'.format(outpath))
            candidates = pd.read_parquet(outpath)
            return candidates

        titleann = self.titleann(title_ann_path)

        if sim_titles:
            idnum = titleann.htid2id[htid]
            results = titleann.u.get_nns_by_item(idnum,
                                                 n=25,
                                                 include_distances=True,
                                                 search_k=search_k)
            # If even the 25th-nearest title is still very close, widen the
            # search to 100 neighbors
            if results[1][-1] < .3:
                results = titleann.u.get_nns_by_item(idnum,
                                                     n=100,
                                                     include_distances=True,
                                                     search_k=search_k)

            results = dict(zip(*results))
            result_htids = {
                titleann.id2htid[id]: dist
                for id, dist in results.items()
            }

            if htid not in result_htids:
                result_htids[htid] = 0

            meta = dd.read_parquet(self.config['metadb_path'],
                                   engine='pyarrow-dataset',
                                   columns=['title', 'author'],
                                   filters=[('htid', 'in',
                                             tuple(result_htids.keys()))
                                            ]).compute()

            meta = meta.loc[[
                htid for htid in result_htids.keys() if htid in meta.index
            ]]
            meta['distance'] = [result_htids[htid] for htid in meta.index]
            # Trim to just 'pretty similar'
            meta = meta[meta.distance <= max_dist]
            meta_candidates = meta.index.tolist()
        else:
            meta = dd.read_parquet(self.config['metadb_path'],
                                   engine='pyarrow-dataset',
                                   columns=['title', 'author'],
                                   filters=[('htid', '==', htid)]).compute()
            meta_candidates = []  # no title-based candidates in this mode
        author = meta.loc[htid, 'author']
        if not author:
            same_authors = False

        if len(meta) > max_title_results:
            # Failsafe cap on the number of title-based candidates
            meta = meta.head(max_title_results)
            meta_candidates = meta.index.tolist()

        if same_authors:
            same_aut = dd.read_parquet(self.config['metadb_path'],
                                       engine='pyarrow-dataset',
                                       columns=['title', 'author'],
                                       filters=[('author', '==', author)
                                                ]).compute()

            if len(same_aut) > max_author_results:
                same_aut = same_aut.sample(max_author_results)

            author_candidates = same_aut.index.tolist()
        else:
            author_candidates = []

        if not raw_output or save:
            # If saving results with raw_output flag, the saved results will still be formatted
            out = pd.DataFrame(author_candidates + meta_candidates,
                               columns=['match'])
            # Why include htid if it's in the filename? For easier aggregate parquet reading later
            out['target'] = htid
            out['note'] = ['author'] * len(author_candidates) + ['meta'] * len(
                meta_candidates)
            out = out[out.match != htid]

        if save:
            os.makedirs(os.path.split(outpath)[0],
                        exist_ok=True)  # Create directories if needed
            out.to_parquet(outpath, compression='snappy')

        if raw_output and same_authors:
            return meta, same_aut
        elif raw_output:
            return meta
        else:
            return out
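Finally, a sketch of the metadata-candidate lookup on its own; the data_dir and htid are placeholders, and with title_ann_path=None the method falls back to the configured title index.

saddler = Saddler(data_dir='/data/saddl/full/')                       # placeholder data_dir
meta_candidates = saddler.get_meta_candidates('mdp.39015012345678',   # placeholder htid
                                              title_ann_path=None,
                                              max_dist=.35,
                                              save=True)
print(meta_candidates[['match', 'target', 'note']].head())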