Example #1
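All of the snippets below are excerpted from fatcat's command-line tools, with their module headers omitted. A hedged sketch of the imports they rely on (the fatcat_tools path is an assumption inferred from the calls used; individual examples additionally use csv, sentry_sdk, and dotenv):

import argparse
import os
import sys

# assumed import path for the API helper used throughout these examples;
# the worker/pusher classes come from the same package in the upstream codebase
from fatcat_tools import authenticated_api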
def main() -> None:
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--host-url",
                        default="http://localhost:9411/v0",
                        help="connect to this host/port")
    parser.add_argument("--batch-size",
                        help="size of batch to send",
                        default=50,
                        type=int)
    parser.set_defaults(auth_var="FATCAT_AUTH_WORKER_CLEANUP")
    parser.add_argument(
        "idents_file",
        help="File with release identifier to try updating",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )

    args = parser.parse_args()
    api = authenticated_api(
        args.host_url,
        # token is an optional kwarg (can be empty string, None, etc)
        token=os.environ.get(args.auth_var),
    )

    rldc = ReleaseLowercaseDoiCleanup(
        api,
        edit_batch_size=args.batch_size,
    )
    LinePusher(rldc, args.idents_file).run()
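Examples 1 and 3 hand a cleanup worker to a pusher that feeds it one record per input line. As a rough, hypothetical sketch of that pattern (not fatcat's actual implementation; push_record and finish are assumed method names on the worker):

class MinimalLinePusher:
    """Toy stand-in for LinePusher: streams stripped input lines into a worker."""

    def __init__(self, worker, source_file):
        self.worker = worker
        self.source_file = source_file

    def run(self):
        for line in self.source_file:
            line = line.strip()
            if line:
                self.worker.push_record(line)  # assumed worker interface
        return self.worker.finish()  # assumed: flushes any pending edit batch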
Example #2
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--api-host-url',
                        default="http://localhost:9411/v0",
                        help="connect to this host/port")
    parser.add_argument('--batch-size',
                        help="size of batch to send",
                        default=50,
                        type=int)
    parser.add_argument('domain_issnl_tsv_file',
                        help="domain/ISSNL mapping TSV file",
                        type=argparse.FileType('r'))
    parser.add_argument('insertable_tsv_file',
                        help="dumpgrobidmetainsertable TSV file to work over",
                        default=sys.stdin,
                        type=argparse.FileType('r'))

    auth_var = "FATCAT_AUTH_SANDCRAWLER"

    args = parser.parse_args()

    args.api = authenticated_api(
        args.api_host_url,
        # token is an optional kwarg (can be empty string, None, etc)
        token=os.environ.get(auth_var))
    run_fixup(args)
Example #3
def main() -> None:
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--host-url",
                        default="http://localhost:9411/v0",
                        help="connect to this host/port")
    parser.add_argument("--batch-size",
                        help="size of batch to send",
                        default=50,
                        type=int)
    parser.set_defaults(auth_var="FATCAT_AUTH_WORKER_CLEANUP")
    parser.add_argument(
        "json_file",
        help="File with jsonlines with cleanup context",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )

    args = parser.parse_args()
    api = authenticated_api(
        args.host_url,
        # token is an optional kwarg (can be empty string, None, etc)
        token=os.environ.get(args.auth_var),
    )

    frbc = FileReleaseBugfix(
        api,
        edit_batch_size=args.batch_size,
    )
    JsonLinePusher(frbc, args.json_file).run()
Example #4
def main():
    api = authenticated_api(
        API_ENDPOINT,
        # token is an optional kwarg (can be empty string, None, etc)
        token=os.environ.get("FATCAT_API_AUTH_TOKEN"))

    path = sys.argv[1]
    # read the TSV under a context manager so the file handle is closed
    with open(path) as tsv_file:
        reader = csv.DictReader(tsv_file, delimiter='\t')
        run(api, reader)
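Unlike the other examples, this one takes the TSV path as a bare positional argument via sys.argv rather than argparse, and reads its token directly from FATCAT_API_AUTH_TOKEN; API_ENDPOINT is presumably a module-level constant.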
Example #5
def main() -> None:
    """
    Invoke like:

        python3 -m fatcat_tools.mergers.files [options]
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--host-url",
                        default="http://localhost:9411/v0",
                        help="connect to this host/port")
    parser.add_argument("--batch-size",
                        help="size of batch to send",
                        default=50,
                        type=int)
    parser.add_argument(
        "--editgroup-description-override",
        help="editgroup description override",
        default=None,
        type=str,
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="don't actually commit merges, just count what would have been",
    )
    parser.set_defaults(auth_var="FATCAT_API_AUTH_TOKEN")
    subparsers = parser.add_subparsers()

    sub_merge_files = subparsers.add_parser("merge-files")
    sub_merge_files.set_defaults(func=run_merge_files)
    sub_merge_files.add_argument(
        "json_file",
        help="source of merge lines to process (or stdin)",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )

    args = parser.parse_args()
    if not args.__dict__.get("func"):
        print("tell me what to do!")
        sys.exit(-1)

    # allow editgroup description override via env variable (but CLI arg takes
    # precedence)
    if not args.editgroup_description_override and os.environ.get(
            "FATCAT_EDITGROUP_DESCRIPTION"):
        args.editgroup_description_override = os.environ.get(
            "FATCAT_EDITGROUP_DESCRIPTION")

    args.api = authenticated_api(
        args.host_url,
        # token is an optional kwarg (can be empty string, None, etc)
        token=os.environ.get(args.auth_var),
    )
    args.func(args)
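The CLI-over-environment precedence for the editgroup description recurs in Examples 5, 6, 7, 10, and 13. A small hypothetical helper expressing the same rule:

from typing import Optional

def editgroup_description(args) -> Optional[str]:
    """CLI flag wins; otherwise fall back to FATCAT_EDITGROUP_DESCRIPTION."""
    return (args.editgroup_description_override
            or os.environ.get("FATCAT_EDITGROUP_DESCRIPTION"))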
Example #6
def main() -> None:
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--fatcat-api-url",
                        default="http://localhost:9411/v0",
                        help="connect to this host/port")
    parser.add_argument("--batch-size",
                        help="size of batch to send",
                        default=50,
                        type=int)
    parser.add_argument(
        "--editgroup-description-override",
        help="editgroup description override",
        default=None,
        type=str,
    )
    parser.add_argument("--dry-run",
                        help="dry-run mode (don't actually update)",
                        default=False,
                        type=bool)
    subparsers = parser.add_subparsers()

    sub_files = subparsers.add_parser(
        "files", help="attempt metadata cleanups over a list of file entities")
    sub_files.set_defaults(
        func=run_files,
        auth_var="FATCAT_AUTH_WORKER_CLEANUP",
    )
    sub_files.add_argument(
        "json_file",
        help="files JSON file to import from",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )

    args = parser.parse_args()
    if not args.__dict__.get("func"):
        print("tell me what to do!")
        sys.exit(-1)

    # allow editgroup description override via env variable (but CLI arg takes
    # precedence)
    if not args.editgroup_description_override and os.environ.get(
            "FATCAT_EDITGROUP_DESCRIPTION"):
        args.editgroup_description_override = os.environ.get(
            "FATCAT_EDITGROUP_DESCRIPTION")

    args.api = authenticated_api(
        args.fatcat_api_url,
        # token is an optional kwarg (can be empty string, None, etc)
        token=os.environ.get(args.auth_var),
    )
    sentry_sdk.init(environment=args.env)
    args.func(args)
Example #7
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--host-url',
                        default="http://localhost:9411/v0",
                        help="connect to this host/port")
    parser.add_argument('--batch-size',
                        help="size of batch to send",
                        default=50,
                        type=int)
    parser.add_argument('--editgroup-description-override',
                        help="editgroup description override",
                        default=None,
                        type=str)
    # store_true instead of type=bool: argparse's bool() conversion treats any
    # non-empty string (including "False") as true
    parser.add_argument('--dry-run',
                        action='store_true',
                        help="dry-run mode (don't actually update)")
    subparsers = parser.add_subparsers()

    sub_files = subparsers.add_parser('files')
    sub_files.set_defaults(
        func=run_files,
        auth_var="FATCAT_AUTH_WORKER_CLEANUP",
    )
    sub_files.add_argument('json_file',
                           help="files JSON file to import from",
                           default=sys.stdin,
                           type=argparse.FileType('r'))

    args = parser.parse_args()
    if not args.__dict__.get("func"):
        print("tell me what to do!")
        sys.exit(-1)

    # allow editgroup description override via env variable (but CLI arg takes
    # precedence)
    if not args.editgroup_description_override \
            and os.environ.get('FATCAT_EDITGROUP_DESCRIPTION'):
        args.editgroup_description_override = os.environ.get(
            'FATCAT_EDITGROUP_DESCRIPTION')

    args.api = authenticated_api(
        args.host_url,
        # token is an optional kwarg (can be empty string, None, etc)
        token=os.environ.get(args.auth_var))
    args.func(args)
Example #8
def main() -> None:
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--verbose", action="store_true", help="enable verbose output")
    parser.add_argument(
        "--fatcat-api-url",
        default="http://localhost:9411/v0",
        help="fatcat API host/port to use",
    )
    parser.add_argument(
        "--poll-interval",
        help="how long to wait between polling (seconds)",
        default=10.0,
        type=float,
    )
    # sentry_sdk.init() below reads args.env, which was never defined here;
    # the flag name and default are assumed from parallel fatcat scripts
    parser.add_argument(
        "--env",
        default="dev",
        help="deployment environment (eg, qa, prod)",
    )
    subparsers = parser.add_subparsers()

    sub_dummy = subparsers.add_parser("dummy", help="example/demonstration review bot")
    sub_dummy.set_defaults(func=run_dummy)
    sub_dummy.add_argument(
        "--continuous",
        action="store_true",
        help="run forever, polling for new reviewable editgroups",
    )
    sub_dummy.add_argument("--editgroup", help="single editgroup ID to review")
    sub_dummy.add_argument(
        "--annotate",
        action="store_true",
        help="for single editgroups, pushes result as annotation",
    )

    args = parser.parse_args()
    if not args.__dict__.get("func"):
        print("tell me what to do!")
        sys.exit(-1)
    if (args.editgroup and args.continuous) or not (args.editgroup or args.continuous):
        print("need to run on a single editgroup, or continuous")
        sys.exit(-1)

    args.api = authenticated_api(args.fatcat_api_url)
    sentry_sdk.init(environment=args.env)
    args.func(args)
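The hand-rolled either/or check on --editgroup and --continuous above could instead lean on argparse's built-in mutual exclusion; a sketch, assuming no other flags depend on the pair:

# let argparse enforce "exactly one of" instead of checking by hand
group = sub_dummy.add_mutually_exclusive_group(required=True)
group.add_argument("--continuous", action="store_true",
                   help="run forever, polling for new reviewable editgroups")
group.add_argument("--editgroup", help="single editgroup ID to review")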
Example #9
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--fatcat-api-url',
                        default="http://localhost:9411/v0",
                        help="connect to this host/port")
    subparsers = parser.add_subparsers()

    sub_uuid2fcid = subparsers.add_parser(
        'uuid2fcid',
        help="convert a standard UUID (as string) to fatcat ident format")
    sub_uuid2fcid.set_defaults(func=run_uuid2fcid)
    sub_uuid2fcid.add_argument('uuid', help="UUID to transform")

    sub_fcid2uuid = subparsers.add_parser(
        'fcid2uuid',
        help="convert a fatcat ident string to standard UUID format")
    sub_fcid2uuid.set_defaults(func=run_fcid2uuid)
    sub_fcid2uuid.add_argument('fcid', help="FCID to transform (into UUID)")

    sub_editgroup_accept = subparsers.add_parser(
        'editgroup-accept', help="accept an editgroup (by ident)")
    sub_editgroup_accept.set_defaults(func=run_editgroup_accept)
    sub_editgroup_accept.add_argument('editgroup_id',
                                      help="editgroup to accept")

    sub_editgroup_submit = subparsers.add_parser(
        'editgroup-submit', help="submit an editgroup for review (by ident)")
    sub_editgroup_submit.set_defaults(func=run_editgroup_submit)
    sub_editgroup_submit.add_argument('editgroup_id',
                                      help="editgroup to submit")

    args = parser.parse_args()
    if not args.__dict__.get("func"):
        print("tell me what to do!")
        sys.exit(-1)

    args.api = authenticated_api(args.fatcat_api_url)
    args.func(args)
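fatcat idents (like the 26-character editor_id seen in Example 14) read as base32-flavored UUIDs. A hypothetical sketch of the two conversions these subcommands wrap, assuming the scheme is lower-case, unpadded base32 over the 16 raw UUID bytes:

import base64
import uuid

def uuid2fcid(s: str) -> str:
    # assumed scheme: base32-encode the raw UUID bytes, strip padding, lower-case
    return base64.b32encode(uuid.UUID(s).bytes).decode().rstrip("=").lower()

def fcid2uuid(fcid: str) -> str:
    # 16 bytes encode to 26 base32 chars; restore the six '=' padding chars
    return str(uuid.UUID(bytes=base64.b32decode(fcid.upper() + "======")))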
Example #10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--debug',
        action='store_true',
        help="enable debugging interface")
    parser.add_argument('--host-url',
        default="http://localhost:9411/v0",
        help="connect to this host/port")
    parser.add_argument('--kafka-hosts',
        default="localhost:9092",
        help="list of Kafka brokers (host/port) to use")
    parser.add_argument('--kafka-env',
        default="dev",
        help="Kafka topic namespace to use (eg, prod, qa)")
    parser.add_argument('--batch-size',
        help="size of batch to send",
        default=50, type=int)
    parser.add_argument('--editgroup-description-override',
        help="editgroup description override",
        default=None, type=str)
    subparsers = parser.add_subparsers()

    sub_crossref = subparsers.add_parser('crossref')
    sub_crossref.set_defaults(
        func=run_crossref,
        auth_var="FATCAT_AUTH_WORKER_CROSSREF",
    )
    sub_crossref.add_argument('json_file',
        help="crossref JSON file to import from",
        default=sys.stdin, type=argparse.FileType('r'))
    sub_crossref.add_argument('issn_map_file',
        help="ISSN to ISSN-L mapping file",
        default=None, type=argparse.FileType('r'))
    sub_crossref.add_argument('--extid-map-file',
        help="DOI-to-other-identifiers sqlite3 database",
        default=None, type=str)
    sub_crossref.add_argument('--no-lookup-refs',
        action='store_true',
        help="skip lookup of references (PMID or DOI)")
    sub_crossref.add_argument('--kafka-mode',
        action='store_true',
        help="consume from kafka topic (not stdin)")
    sub_crossref.add_argument('--bezerk-mode',
        action='store_true',
        help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)")

    sub_jalc = subparsers.add_parser('jalc')
    sub_jalc.set_defaults(
        func=run_jalc,
        auth_var="FATCAT_AUTH_WORKER_JALC",
    )
    sub_jalc.add_argument('xml_file',
        help="Jalc RDF XML file (record-per-line) to import from",
        default=sys.stdin, type=argparse.FileType('r'))
    sub_jalc.add_argument('issn_map_file',
        help="ISSN to ISSN-L mapping file",
        default=None, type=argparse.FileType('r'))
    sub_jalc.add_argument('--extid-map-file',
        help="DOI-to-other-identifiers sqlite3 database",
        default=None, type=str)

    sub_arxiv = subparsers.add_parser('arxiv')
    sub_arxiv.set_defaults(
        func=run_arxiv,
        auth_var="FATCAT_AUTH_WORKER_ARXIV",
    )
    sub_arxiv.add_argument('xml_file',
        help="arXivRaw XML file to import from",
        default=sys.stdin, type=argparse.FileType('r'))
    sub_arxiv.add_argument('--kafka-mode',
        action='store_true',
        help="consume from kafka topic (not stdin)")

    sub_pubmed = subparsers.add_parser('pubmed')
    sub_pubmed.set_defaults(
        func=run_pubmed,
        auth_var="FATCAT_AUTH_WORKER_PUBMED",
    )
    sub_pubmed.add_argument('xml_file',
        help="Pubmed XML file to import from",
        default=sys.stdin, type=argparse.FileType('r'))
    sub_pubmed.add_argument('issn_map_file',
        help="ISSN to ISSN-L mapping file",
        default=None, type=argparse.FileType('r'))
    sub_pubmed.add_argument('--no-lookup-refs',
        action='store_true',
        help="skip lookup of references (PMID or DOI)")
    sub_pubmed.add_argument('--kafka-mode',
        action='store_true',
        help="consume from kafka topic (not stdin)")

    sub_jstor = subparsers.add_parser('jstor')
    sub_jstor.set_defaults(
        func=run_jstor,
        auth_var="FATCAT_AUTH_WORKER_JSTOR",
    )
    sub_jstor.add_argument('list_file',
        help="List of JSTOR XML file paths to import from",
        default=sys.stdin, type=argparse.FileType('r'))
    sub_jstor.add_argument('issn_map_file',
        help="ISSN to ISSN-L mapping file",
        default=None, type=argparse.FileType('r'))

    sub_orcid = subparsers.add_parser('orcid')
    sub_orcid.set_defaults(
        func=run_orcid,
        auth_var="FATCAT_AUTH_WORKER_ORCID"
    )
    sub_orcid.add_argument('json_file',
        help="orcid JSON file to import from (or stdin)",
        default=sys.stdin, type=argparse.FileType('r'))

    sub_journal_metadata = subparsers.add_parser('journal-metadata')
    sub_journal_metadata.set_defaults(
        func=run_journal_metadata,
        auth_var="FATCAT_AUTH_WORKER_JOURNAL_METADATA",
    )
    sub_journal_metadata.add_argument('json_file',
        help="Journal JSON metadata file to import from (or stdin)",
        default=sys.stdin, type=argparse.FileType('r'))

    sub_chocula = subparsers.add_parser('chocula')
    sub_chocula.set_defaults(
        func=run_chocula,
        auth_var="FATCAT_AUTH_WORKER_JOURNAL_METADATA",
    )
    sub_chocula.add_argument('json_file',
        help="chocula JSON entities file (or stdin)",
        default=sys.stdin, type=argparse.FileType('r'))

    sub_matched = subparsers.add_parser('matched')
    sub_matched.set_defaults(
        func=run_matched,
        auth_var="FATCAT_API_AUTH_TOKEN",
    )
    sub_matched.add_argument('json_file',
        help="JSON file to import from (or stdin)",
        default=sys.stdin, type=argparse.FileType('r'))
    sub_matched.add_argument('--default-mimetype',
        default=None,
        help="default mimetype for imported files (if not specified per-file)")
    sub_matched.add_argument('--bezerk-mode',
        action='store_true',
        help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)")
    sub_matched.add_argument('--default-link-rel',
        default="web",
        help="default URL rel for matches (eg, 'publisher', 'web')")

    sub_arabesque_match = subparsers.add_parser('arabesque')
    sub_arabesque_match.set_defaults(
        func=run_arabesque_match,
        auth_var="FATCAT_AUTH_WORKER_CRAWL",
    )
    sub_arabesque_match.add_argument('--sqlite-file',
        help="sqlite database file to import from")
    sub_arabesque_match.add_argument('--json-file',
        help="JSON file to import from (or stdin)",
        type=argparse.FileType('r'))
    sub_arabesque_match.add_argument('--do-updates',
        action='store_true',
        help="update pre-existing file entities if new match (instead of skipping)")
    sub_arabesque_match.add_argument('--no-require-grobid',
        action='store_true',
        help="whether postproc_status column must be '200'")
    sub_arabesque_match.add_argument('--extid-type',
        default="doi",
        help="identifer type in the database (eg, 'doi', 'pmcid'")
    sub_arabesque_match.add_argument('--crawl-id',
        help="crawl ID (optionally included in editgroup metadata)")
    sub_arabesque_match.add_argument('--default-link-rel',
        default="web",
        help="default URL rel for matches (eg, 'publisher', 'web')")

    sub_grobid_metadata = subparsers.add_parser('grobid-metadata')
    sub_grobid_metadata.set_defaults(
        func=run_grobid_metadata,
        auth_var="FATCAT_API_AUTH_TOKEN",
    )
    sub_grobid_metadata.add_argument('tsv_file',
        help="TSV file to import from (or stdin)",
        default=sys.stdin, type=argparse.FileType('r'))
    sub_grobid_metadata.add_argument('--group-size',
        help="editgroup group size to use",
        default=75, type=int)
    sub_grobid_metadata.add_argument('--longtail-oa',
        action='store_true',
        help="if this is an import of longtail OA content (sets an 'extra' flag)")
    sub_grobid_metadata.add_argument('--bezerk-mode',
        action='store_true',
        help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)")

    sub_wayback_static = subparsers.add_parser('wayback-static')
    sub_wayback_static.set_defaults(
        func=run_wayback_static,
        auth_var="FATCAT_API_AUTH_TOKEN",
    )
    sub_wayback_static.add_argument('wayback_url',
        type=str,
        help="URL of wayback capture to extract from")
    sub_wayback_static.add_argument('--extid',
        type=str,
        help="external identifier for release lookup")
    sub_wayback_static.add_argument('--release-id',
        type=str,
        help="release entity identifier")
    sub_wayback_static.add_argument('--editgroup-id',
        type=str,
        help="use existing editgroup (instead of creating a new one)")

    sub_cdl_dash_dat = subparsers.add_parser('cdl-dash-dat')
    sub_cdl_dash_dat.set_defaults(
        func=run_cdl_dash_dat,
        auth_var="FATCAT_API_AUTH_TOKEN",
    )
    sub_cdl_dash_dat.add_argument('dat_path',
        type=str,
        help="local path dat to import (must be the dat discovery key)")
    sub_cdl_dash_dat.add_argument('--release-id',
        type=str,
        help="release entity identifier")
    sub_cdl_dash_dat.add_argument('--editgroup-id',
        type=str,
        help="use existing editgroup (instead of creating a new one)")

    args = parser.parse_args()
    if not args.__dict__.get("func"):
        print("tell me what to do!")
        sys.exit(-1)

    # allow editgroup description override via env variable (but CLI arg takes
    # precedence)
    if not args.editgroup_description_override \
            and os.environ.get('FATCAT_EDITGROUP_DESCRIPTION'):
        args.editgroup_description_override = os.environ.get('FATCAT_EDITGROUP_DESCRIPTION')

    args.api = authenticated_api(
        args.host_url,
        # token is an optional kwarg (can be empty string, None, etc)
        token=os.environ.get(args.auth_var))
    args.func(args)
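Every multi-command example here uses the same dispatch idiom: each subparser binds its handler (and auth variable) via set_defaults, and main() finishes with args.func(args). Stripped to a runnable skeleton (run_hello is a hypothetical handler):

import argparse

def run_hello(args):
    print("hello,", args.name)

parser = argparse.ArgumentParser()
subparsers = parser.add_subparsers()
sub_hello = subparsers.add_parser("hello")
sub_hello.set_defaults(func=run_hello)
sub_hello.add_argument("name")

args = parser.parse_args()
if not args.__dict__.get("func"):  # no subcommand selected
    parser.print_help()
    raise SystemExit(1)
args.func(args)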
Example #11
def test_authenticated_api():
    api = authenticated_api("http://localhost:9411/v0")
    api.get_changelog()
    api.auth_check()
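Note: this is an integration-style test; it assumes a fatcat API instance is already listening on localhost:9411, and simply exercises a read call (get_changelog) plus the auth_check endpoint against it.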
Example #12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--debug',
                        action='store_true',
                        help="enable debugging interface")
    parser.add_argument('--host-url',
                        default="http://localhost:9411/v0",
                        help="connect to this host/port")
    parser.add_argument('--kafka-hosts',
                        default="localhost:9092",
                        help="list of Kafka brokers (host/port) to use")
    parser.add_argument('--kafka-env',
                        default="qa",
                        help="Kafka topic namespace to use (eg, prod, qa)")
    parser.add_argument('--batch-size',
                        help="size of batch to send",
                        default=50,
                        type=int)
    subparsers = parser.add_subparsers()

    sub_crossref = subparsers.add_parser('crossref')
    sub_crossref.set_defaults(
        func=run_crossref,
        auth_var="FATCAT_AUTH_WORKER_CROSSREF",
    )
    sub_crossref.add_argument('json_file',
                              help="crossref JSON file to import from",
                              default=sys.stdin,
                              type=argparse.FileType('r'))
    sub_crossref.add_argument('issn_map_file',
                              help="ISSN to ISSN-L mapping file",
                              default=None,
                              type=argparse.FileType('r'))
    sub_crossref.add_argument('--extid-map-file',
                              help="DOI-to-other-identifiers sqlite3 database",
                              default=None,
                              type=str)
    sub_crossref.add_argument('--kafka-mode',
                              action='store_true',
                              help="consume from kafka topic (not stdin)")
    sub_crossref.add_argument(
        '--bezerk-mode',
        action='store_true',
        help=
        "don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)"
    )

    sub_orcid = subparsers.add_parser('orcid')
    sub_orcid.set_defaults(func=run_orcid, auth_var="FATCAT_AUTH_WORKER_ORCID")
    sub_orcid.add_argument('json_file',
                           help="orcid JSON file to import from (or stdin)",
                           default=sys.stdin,
                           type=argparse.FileType('r'))

    sub_journal_metadata = subparsers.add_parser('journal-metadata')
    sub_journal_metadata.set_defaults(
        func=run_journal_metadata,
        auth_var="FATCAT_AUTH_WORKER_JOURNAL_METADATA",
    )
    sub_journal_metadata.add_argument(
        'json_file',
        help="Journal JSON metadata file to import from (or stdin)",
        default=sys.stdin,
        type=argparse.FileType('r'))

    sub_matched = subparsers.add_parser('matched')
    sub_matched.set_defaults(
        func=run_matched,
        auth_var="FATCAT_API_AUTH_TOKEN",
    )
    sub_matched.add_argument('json_file',
                             help="JSON file to import from (or stdin)",
                             default=sys.stdin,
                             type=argparse.FileType('r'))
    sub_matched.add_argument(
        '--bezerk-mode',
        action='store_true',
        help=
        "don't lookup existing files, just insert (clobbers; only for fast bootstrap)"
    )

    sub_grobid_metadata = subparsers.add_parser('grobid-metadata')
    sub_grobid_metadata.set_defaults(
        func=run_grobid_metadata,
        auth_var="FATCAT_API_AUTH_TOKEN",
    )
    sub_grobid_metadata.add_argument('tsv_file',
                                     help="TSV file to import from (or stdin)",
                                     default=sys.stdin,
                                     type=argparse.FileType('r'))
    sub_grobid_metadata.add_argument('--group-size',
                                     help="editgroup group size to use",
                                     default=75,
                                     type=int)
    sub_grobid_metadata.add_argument(
        '--longtail-oa',
        action='store_true',
        help=
        "if this is an import of longtail OA content (sets an 'extra' flag)")
    sub_grobid_metadata.add_argument(
        '--bezerk-mode',
        action='store_true',
        help=
        "don't lookup existing files, just insert (clobbers; only for fast bootstrap)"
    )

    args = parser.parse_args()
    if not args.__dict__.get("func"):
        print("tell me what to do!")
        sys.exit(-1)

    args.api = authenticated_api(
        args.host_url,
        # token is an optional kwarg (can be empty string, None, etc)
        token=os.environ.get(args.auth_var))
    args.func(args)
Example #13
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--host-url',
                        default="http://localhost:9411/v0",
                        help="connect to this host/port")
    parser.add_argument('--kafka-hosts',
                        default="localhost:9092",
                        help="list of Kafka brokers (host/port) to use")
    parser.add_argument('--kafka-env',
                        default="dev",
                        help="Kafka topic namespace to use (eg, prod, qa)")
    parser.add_argument('--batch-size',
                        help="size of batch to send",
                        default=50,
                        type=int)
    parser.add_argument('--editgroup-description-override',
                        help="editgroup description override",
                        default=None,
                        type=str)
    subparsers = parser.add_subparsers()

    sub_crossref = subparsers.add_parser(
        'crossref', help="import Crossref API metadata format (JSON)")
    sub_crossref.set_defaults(
        func=run_crossref,
        auth_var="FATCAT_AUTH_WORKER_CROSSREF",
    )
    sub_crossref.add_argument('json_file',
                              help="crossref JSON file to import from",
                              default=sys.stdin,
                              type=argparse.FileType('r'))
    sub_crossref.add_argument('issn_map_file',
                              help="ISSN to ISSN-L mapping file",
                              default=None,
                              type=argparse.FileType('r'))
    sub_crossref.add_argument('--extid-map-file',
                              help="DOI-to-other-identifiers sqlite3 database",
                              default=None,
                              type=str)
    sub_crossref.add_argument('--no-lookup-refs',
                              action='store_true',
                              help="skip lookup of references (PMID or DOI)")
    sub_crossref.add_argument('--kafka-mode',
                              action='store_true',
                              help="consume from kafka topic (not stdin)")
    sub_crossref.add_argument(
        '--bezerk-mode',
        action='store_true',
        help=
        "don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)"
    )

    sub_jalc = subparsers.add_parser(
        'jalc', help="import JALC DOI metadata from XML dump")
    sub_jalc.set_defaults(
        func=run_jalc,
        auth_var="FATCAT_AUTH_WORKER_JALC",
    )
    sub_jalc.add_argument(
        'xml_file',
        help="Jalc RDF XML file (record-per-line) to import from",
        default=sys.stdin,
        type=argparse.FileType('r'))
    sub_jalc.add_argument('issn_map_file',
                          help="ISSN to ISSN-L mapping file",
                          default=None,
                          type=argparse.FileType('r'))
    sub_jalc.add_argument('--extid-map-file',
                          help="DOI-to-other-identifiers sqlite3 database",
                          default=None,
                          type=str)

    sub_arxiv = subparsers.add_parser(
        'arxiv', help="import arxiv.org metadata from XML files")
    sub_arxiv.set_defaults(
        func=run_arxiv,
        auth_var="FATCAT_AUTH_WORKER_ARXIV",
    )
    sub_arxiv.add_argument('xml_file',
                           nargs='?',
                           help="arXivRaw XML file to import from",
                           default=sys.stdin,
                           type=argparse.FileType('r'))
    sub_arxiv.add_argument('--kafka-mode',
                           action='store_true',
                           help="consume from kafka topic (not stdin)")

    sub_pubmed = subparsers.add_parser(
        'pubmed', help="import MEDLINE/PubMed work-level metadata (XML)")
    sub_pubmed.set_defaults(
        func=run_pubmed,
        auth_var="FATCAT_AUTH_WORKER_PUBMED",
    )
    sub_pubmed.add_argument('xml_file',
                            nargs='?',
                            help="Pubmed XML file to import from",
                            default=sys.stdin,
                            type=argparse.FileType('r'))
    sub_pubmed.add_argument('issn_map_file',
                            help="ISSN to ISSN-L mapping file",
                            default=None,
                            type=argparse.FileType('r'))
    sub_pubmed.add_argument('--no-lookup-refs',
                            action='store_true',
                            help="skip lookup of references (PMID or DOI)")
    sub_pubmed.add_argument('--do-updates',
                            action='store_true',
                            help="update pre-existing release entities")
    sub_pubmed.add_argument('--kafka-mode',
                            action='store_true',
                            help="consume from kafka topic (not stdin)")

    sub_jstor = subparsers.add_parser(
        'jstor', help="import JSTOR work-level metadata from XML dump")
    sub_jstor.set_defaults(
        func=run_jstor,
        auth_var="FATCAT_AUTH_WORKER_JSTOR",
    )
    sub_jstor.add_argument('list_file',
                           help="List of JSTOR XML file paths to import from",
                           default=sys.stdin,
                           type=argparse.FileType('r'))
    sub_jstor.add_argument('issn_map_file',
                           help="ISSN to ISSN-L mapping file",
                           default=None,
                           type=argparse.FileType('r'))

    sub_orcid = subparsers.add_parser(
        'orcid', help="import creator entities from ORCID XML dump")
    sub_orcid.set_defaults(func=run_orcid, auth_var="FATCAT_AUTH_WORKER_ORCID")
    sub_orcid.add_argument('json_file',
                           help="orcid JSON file to import from (or stdin)",
                           default=sys.stdin,
                           type=argparse.FileType('r'))

    sub_journal_metadata = subparsers.add_parser(
        'journal-metadata',
        help="import/update container metadata from old manual munging format")
    sub_journal_metadata.set_defaults(
        func=run_journal_metadata,
        auth_var="FATCAT_AUTH_WORKER_JOURNAL_METADATA",
    )
    sub_journal_metadata.add_argument(
        'json_file',
        help="Journal JSON metadata file to import from (or stdin)",
        default=sys.stdin,
        type=argparse.FileType('r'))

    sub_chocula = subparsers.add_parser(
        'chocula',
        help="import/update container metadata from chocula JSON export")
    sub_chocula.set_defaults(
        func=run_chocula,
        auth_var="FATCAT_AUTH_WORKER_JOURNAL_METADATA",
    )
    sub_chocula.add_argument('json_file',
                             help="chocula JSON entities file (or stdin)",
                             default=sys.stdin,
                             type=argparse.FileType('r'))
    sub_chocula.add_argument('--do-updates',
                             action='store_true',
                             help="update pre-existing container entities")

    sub_matched = subparsers.add_parser(
        'matched',
        help=
        "add file entities matched against existing releases; custom JSON format"
    )
    sub_matched.set_defaults(
        func=run_matched,
        auth_var="FATCAT_API_AUTH_TOKEN",
    )
    sub_matched.add_argument('json_file',
                             help="JSON file to import from (or stdin)",
                             default=sys.stdin,
                             type=argparse.FileType('r'))
    sub_matched.add_argument(
        '--default-mimetype',
        default=None,
        help="default mimetype for imported files (if not specified per-file)")
    sub_matched.add_argument(
        '--bezerk-mode',
        action='store_true',
        help=
        "don't lookup existing files, just insert (clobbers; only for fast bootstrap)"
    )
    sub_matched.add_argument(
        '--default-link-rel',
        default="web",
        help="default URL rel for matches (eg, 'publisher', 'web')")

    sub_arabesque_match = subparsers.add_parser(
        'arabesque',
        help="add file entities matched to releases from crawl log analysis")
    sub_arabesque_match.set_defaults(
        func=run_arabesque_match,
        auth_var="FATCAT_AUTH_WORKER_CRAWL",
    )
    sub_arabesque_match.add_argument(
        '--sqlite-file', help="sqlite database file to import from")
    sub_arabesque_match.add_argument(
        '--json-file',
        help="JSON file to import from (or stdin)",
        type=argparse.FileType('r'))
    sub_arabesque_match.add_argument(
        '--do-updates',
        action='store_true',
        help=
        "update pre-existing file entities if new match (instead of skipping)")
    sub_arabesque_match.add_argument(
        '--no-require-grobid',
        action='store_true',
        help="whether postproc_status column must be '200'")
    sub_arabesque_match.add_argument(
        '--extid-type',
        default="doi",
        help="identifier type in the database (eg, 'doi', 'pmcid'")
    sub_arabesque_match.add_argument(
        '--crawl-id',
        help="crawl ID (optionally included in editgroup metadata)")
    sub_arabesque_match.add_argument(
        '--default-link-rel',
        default="web",
        help="default URL rel for matches (eg, 'publisher', 'web')")

    sub_ingest_file = subparsers.add_parser(
        'ingest-file-results',
        help=
        "add/update file entities linked to releases based on sandcrawler ingest results"
    )
    sub_ingest_file.set_defaults(
        func=run_ingest_file,
        auth_var="FATCAT_AUTH_WORKER_CRAWL",
    )
    sub_ingest_file.add_argument('json_file',
                                 help="ingest_file JSON file to import from",
                                 default=sys.stdin,
                                 type=argparse.FileType('r'))
    sub_ingest_file.add_argument(
        '--skip-source-whitelist',
        action='store_true',
        help="don't filter import based on request source whitelist")
    sub_ingest_file.add_argument('--kafka-mode',
                                 action='store_true',
                                 help="consume from kafka topic (not stdin)")
    sub_ingest_file.add_argument(
        '--do-updates',
        action='store_true',
        help=
        "update pre-existing file entities if new match (instead of skipping)")
    sub_ingest_file.add_argument(
        '--no-require-grobid',
        action='store_true',
        help="whether postproc_status column must be '200'")
    sub_ingest_file.add_argument(
        '--default-link-rel',
        default="web",
        help="default URL rel for matches (eg, 'publisher', 'web')")

    sub_savepapernow_file = subparsers.add_parser(
        'savepapernow-file-results',
        help="add file entities crawled due to async Save Paper Now request")
    sub_savepapernow_file.set_defaults(
        func=run_savepapernow_file,
        auth_var="FATCAT_AUTH_WORKER_SAVEPAPERNOW",
    )
    sub_savepapernow_file.add_argument(
        'json_file',
        help="ingest-file JSON file to import from",
        default=sys.stdin,
        type=argparse.FileType('r'))
    sub_savepapernow_file.add_argument(
        '--kafka-mode',
        action='store_true',
        help="consume from kafka topic (not stdin)")

    sub_grobid_metadata = subparsers.add_parser(
        'grobid-metadata',
        help=
        "create release and file entities based on GROBID PDF metadata extraction"
    )
    sub_grobid_metadata.set_defaults(
        func=run_grobid_metadata,
        auth_var="FATCAT_API_AUTH_TOKEN",
    )
    sub_grobid_metadata.add_argument('tsv_file',
                                     help="TSV file to import from (or stdin)",
                                     default=sys.stdin,
                                     type=argparse.FileType('r'))
    sub_grobid_metadata.add_argument('--group-size',
                                     help="editgroup group size to use",
                                     default=75,
                                     type=int)
    sub_grobid_metadata.add_argument(
        '--longtail-oa',
        action='store_true',
        help=
        "if this is an import of longtail OA content (sets an 'extra' flag)")
    sub_grobid_metadata.add_argument(
        '--bezerk-mode',
        action='store_true',
        help=
        "don't lookup existing files, just insert (clobbers; only for fast bootstrap)"
    )

    sub_shadow_lib = subparsers.add_parser(
        'shadow-lib',
        help="import file entities from shadow library matches (JSON)")
    sub_shadow_lib.set_defaults(
        func=run_shadow_lib,
        auth_var="FATCAT_AUTH_WORKER_SHADOW",
    )
    sub_shadow_lib.add_argument('json_file',
                                help="JSON file to import from (or stdin)",
                                default=sys.stdin,
                                type=argparse.FileType('r'))

    sub_wayback_static = subparsers.add_parser(
        'wayback-static',
        help="crude crawl+ingest tool for single-page HTML docs from wayback")
    sub_wayback_static.set_defaults(
        func=run_wayback_static,
        auth_var="FATCAT_API_AUTH_TOKEN",
    )
    sub_wayback_static.add_argument(
        'wayback_url', type=str, help="URL of wayback capture to extract from")
    sub_wayback_static.add_argument(
        '--extid', type=str, help="external identifier for release lookup")
    sub_wayback_static.add_argument('--release-id',
                                    type=str,
                                    help="release entity identifier")
    sub_wayback_static.add_argument(
        '--editgroup-id',
        type=str,
        help="use existing editgroup (instead of creating a new one)")

    sub_cdl_dash_dat = subparsers.add_parser(
        'cdl-dash-dat',
        help="crude helper to import datasets from Dat/CDL mirror pilot project"
    )
    sub_cdl_dash_dat.set_defaults(
        func=run_cdl_dash_dat,
        auth_var="FATCAT_API_AUTH_TOKEN",
    )
    sub_cdl_dash_dat.add_argument(
        'dat_path',
        type=str,
        help="local path dat to import (must be the dat discovery key)")
    sub_cdl_dash_dat.add_argument('--release-id',
                                  type=str,
                                  help="release entity identifier")
    sub_cdl_dash_dat.add_argument(
        '--editgroup-id',
        type=str,
        help="use existing editgroup (instead of creating a new one)")

    sub_datacite = subparsers.add_parser('datacite',
                                         help="import datacite.org metadata")
    sub_datacite.add_argument(
        'json_file',
        help="File with jsonlines from datacite.org v2 API to import from",
        default=sys.stdin,
        type=argparse.FileType('r'))
    sub_datacite.add_argument('issn_map_file',
                              help="ISSN to ISSN-L mapping file",
                              default=None,
                              type=argparse.FileType('r'))
    sub_datacite.add_argument('--extid-map-file',
                              help="DOI-to-other-identifiers sqlite3 database",
                              default=None,
                              type=str)
    sub_datacite.add_argument('--kafka-mode',
                              action='store_true',
                              help="consume from kafka topic (not stdin)")
    sub_datacite.add_argument(
        '--bezerk-mode',
        action='store_true',
        help=
        "don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)"
    )
    sub_datacite.add_argument('--debug',
                              action='store_true',
                              help="write converted JSON to stdout")
    sub_datacite.add_argument(
        '--insert-log-file',
        default='',
        type=str,
        help="write inserted documents into file (for debugging)")
    sub_datacite.set_defaults(
        func=run_datacite,
        auth_var="FATCAT_AUTH_WORKER_DATACITE",
    )

    sub_file_meta = subparsers.add_parser(
        'file-meta', help="simple update-only importer for file metadata")
    sub_file_meta.set_defaults(
        func=run_file_meta,
        auth_var="FATCAT_API_AUTH_TOKEN",
    )
    sub_file_meta.add_argument(
        'json_file',
        help="File with jsonlines from file_meta schema to import from",
        default=sys.stdin,
        type=argparse.FileType('r'))

    args = parser.parse_args()
    if not args.__dict__.get("func"):
        print("tell me what to do!")
        sys.exit(-1)

    # allow editgroup description override via env variable (but CLI arg takes
    # precedence)
    if not args.editgroup_description_override \
            and os.environ.get('FATCAT_EDITGROUP_DESCRIPTION'):
        args.editgroup_description_override = os.environ.get(
            'FATCAT_EDITGROUP_DESCRIPTION')

    args.api = authenticated_api(
        args.host_url,
        # token is an optional kwarg (can be empty string, None, etc)
        token=os.environ.get(args.auth_var))
    args.func(args)
Example #14
def api():
    load_dotenv(dotenv_path="./example.env")
    api_client = authenticated_api("http://localhost:9411/v0")
    api_client.editor_id = "aaaaaaaaaaaabkvkaaaaaaaaae"
    return api_client
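This reads as a pytest fixture: load_dotenv (from python-dotenv) pulls the auth token into the environment from example.env, and editor_id is pinned so edits made during tests are attributed to a known editor.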
Example #15
def main() -> None:
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--host-url",
                        default="http://localhost:9411/v0",
                        help="connect to this host/port")
    parser.add_argument(
        "--kafka-hosts",
        default="localhost:9092",
        help="list of Kafka brokers (host/port) to use",
    )
    parser.add_argument("--kafka-env",
                        default="dev",
                        help="Kafka topic namespace to use (eg, prod, qa)")
    parser.add_argument("--batch-size",
                        help="size of batch to send",
                        default=50,
                        type=int)
    parser.add_argument(
        "--editgroup-description-override",
        help="editgroup description override",
        default=None,
        type=str,
    )
    subparsers = parser.add_subparsers()

    sub_crossref = subparsers.add_parser(
        "crossref", help="import Crossref API metadata format (JSON)")
    sub_crossref.set_defaults(
        func=run_crossref,
        auth_var="FATCAT_AUTH_WORKER_CROSSREF",
    )
    sub_crossref.add_argument(
        "json_file",
        help="crossref JSON file to import from",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )
    sub_crossref.add_argument(
        "issn_map_file",
        help="ISSN to ISSN-L mapping file",
        default=None,
        type=argparse.FileType("r"),
    )
    sub_crossref.add_argument("--no-lookup-refs",
                              action="store_true",
                              help="skip lookup of references (PMID or DOI)")
    sub_crossref.add_argument("--kafka-mode",
                              action="store_true",
                              help="consume from kafka topic (not stdin)")
    sub_crossref.add_argument(
        "--bezerk-mode",
        action="store_true",
        help=
        "don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)",
    )

    sub_jalc = subparsers.add_parser(
        "jalc", help="import JALC DOI metadata from XML dump")
    sub_jalc.set_defaults(
        func=run_jalc,
        auth_var="FATCAT_AUTH_WORKER_JALC",
    )
    sub_jalc.add_argument(
        "xml_file",
        help="Jalc RDF XML file (record-per-line) to import from",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )
    sub_jalc.add_argument(
        "issn_map_file",
        help="ISSN to ISSN-L mapping file",
        default=None,
        type=argparse.FileType("r"),
    )

    sub_arxiv = subparsers.add_parser(
        "arxiv", help="import arxiv.org metadata from XML files")
    sub_arxiv.set_defaults(
        func=run_arxiv,
        auth_var="FATCAT_AUTH_WORKER_ARXIV",
    )
    sub_arxiv.add_argument(
        "xml_file",
        nargs="?",
        help="arXivRaw XML file to import from",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )
    sub_arxiv.add_argument("--kafka-mode",
                           action="store_true",
                           help="consume from kafka topic (not stdin)")

    sub_pubmed = subparsers.add_parser(
        "pubmed", help="import MEDLINE/PubMed work-level metadata (XML)")
    sub_pubmed.set_defaults(
        func=run_pubmed,
        auth_var="FATCAT_AUTH_WORKER_PUBMED",
    )
    sub_pubmed.add_argument(
        "xml_file",
        nargs="?",
        help="Pubmed XML file to import from",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )
    sub_pubmed.add_argument(
        "issn_map_file",
        help="ISSN to ISSN-L mapping file",
        default=None,
        type=argparse.FileType("r"),
    )
    sub_pubmed.add_argument("--no-lookup-refs",
                            action="store_true",
                            help="skip lookup of references (PMID or DOI)")
    sub_pubmed.add_argument("--do-updates",
                            action="store_true",
                            help="update pre-existing release entities")
    sub_pubmed.add_argument("--kafka-mode",
                            action="store_true",
                            help="consume from kafka topic (not stdin)")

    sub_jstor = subparsers.add_parser(
        "jstor", help="import JSTOR work-level metadata from XML dump")
    sub_jstor.set_defaults(
        func=run_jstor,
        auth_var="FATCAT_AUTH_WORKER_JSTOR",
    )
    sub_jstor.add_argument(
        "list_file",
        help="List of JSTOR XML file paths to import from",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )
    sub_jstor.add_argument(
        "issn_map_file",
        help="ISSN to ISSN-L mapping file",
        default=None,
        type=argparse.FileType("r"),
    )

    sub_orcid = subparsers.add_parser(
        "orcid", help="import creator entities from ORCID XML dump")
    sub_orcid.set_defaults(func=run_orcid, auth_var="FATCAT_AUTH_WORKER_ORCID")
    sub_orcid.add_argument(
        "json_file",
        help="orcid JSON file to import from (or stdin)",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )

    sub_journal_metadata = subparsers.add_parser(
        "journal-metadata",
        help="import/update container metadata from old manual munging format",
    )
    sub_journal_metadata.set_defaults(
        func=run_journal_metadata,
        auth_var="FATCAT_AUTH_WORKER_JOURNAL_METADATA",
    )
    sub_journal_metadata.add_argument(
        "json_file",
        help="Journal JSON metadata file to import from (or stdin)",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )

    sub_chocula = subparsers.add_parser(
        "chocula",
        help="import/update container metadata from chocula JSON export")
    sub_chocula.set_defaults(
        func=run_chocula,
        auth_var="FATCAT_AUTH_WORKER_JOURNAL_METADATA",
    )
    sub_chocula.add_argument(
        "json_file",
        help="chocula JSON entities file (or stdin)",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )
    sub_chocula.add_argument("--do-updates",
                             action="store_true",
                             help="update pre-existing container entities")

    sub_matched = subparsers.add_parser(
        "matched",
        help=
        "add file entities matched against existing releases; custom JSON format",
    )
    sub_matched.set_defaults(
        func=run_matched,
        auth_var="FATCAT_API_AUTH_TOKEN",
    )
    sub_matched.add_argument(
        "json_file",
        help="JSON file to import from (or stdin)",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )
    sub_matched.add_argument(
        "--default-mimetype",
        default=None,
        help="default mimetype for imported files (if not specified per-file)",
    )
    sub_matched.add_argument(
        "--bezerk-mode",
        action="store_true",
        help=
        "don't lookup existing files, just insert (clobbers; only for fast bootstrap)",
    )
    sub_matched.add_argument(
        "--default-link-rel",
        default="web",
        help="default URL rel for matches (eg, 'publisher', 'web')",
    )

    sub_arabesque_match = subparsers.add_parser(
        "arabesque",
        help="add file entities matched to releases from crawl log analysis")
    sub_arabesque_match.set_defaults(
        func=run_arabesque_match,
        auth_var="FATCAT_AUTH_WORKER_CRAWL",
    )
    sub_arabesque_match.add_argument(
        "--sqlite-file", help="sqlite database file to import from")
    sub_arabesque_match.add_argument(
        "--json-file",
        help="JSON file to import from (or stdin)",
        type=argparse.FileType("r"))
    sub_arabesque_match.add_argument(
        "--do-updates",
        action="store_true",
        help=
        "update pre-existing file entities if new match (instead of skipping)",
    )
    sub_arabesque_match.add_argument(
        "--no-require-grobid",
        action="store_true",
        help="whether postproc_status column must be '200'",
    )
    sub_arabesque_match.add_argument(
        "--extid-type",
        default="doi",
        help="identifier type in the database (eg, 'doi', 'pmcid'",
    )
    sub_arabesque_match.add_argument(
        "--crawl-id",
        help="crawl ID (optionally included in editgroup metadata)")
    sub_arabesque_match.add_argument(
        "--default-link-rel",
        default="web",
        help="default URL rel for matches (eg, 'publisher', 'web')",
    )

    sub_ingest_file = subparsers.add_parser(
        "ingest-file-results",
        help=
        "add/update file entities linked to releases based on sandcrawler ingest results",
    )
    sub_ingest_file.set_defaults(
        func=run_ingest_file,
        auth_var="FATCAT_AUTH_WORKER_CRAWL",
    )
    sub_ingest_file.add_argument(
        "json_file",
        help="ingest_file JSON file to import from",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )
    sub_ingest_file.add_argument(
        "--skip-source-allowlist",
        action="store_true",
        help="don't filter import based on request source allowlist",
    )
    sub_ingest_file.add_argument("--kafka-mode",
                                 action="store_true",
                                 help="consume from kafka topic (not stdin)")
    sub_ingest_file.add_argument(
        "--do-updates",
        action="store_true",
        help=
        "update pre-existing file entities if new match (instead of skipping)",
    )
    sub_ingest_file.add_argument(
        "--no-require-grobid",
        action="store_true",
        help="whether postproc_status column must be '200'",
    )
    sub_ingest_file.add_argument(
        "--default-link-rel",
        default="web",
        help="default URL rel for matches (eg, 'publisher', 'web')",
    )

    sub_ingest_web = subparsers.add_parser(
        "ingest-web-results",
        help=
        "add/update web entities linked to releases based on sandcrawler ingest results",
    )
    sub_ingest_web.set_defaults(
        func=run_ingest_web,
        auth_var="FATCAT_AUTH_WORKER_CRAWL",
    )
    sub_ingest_web.add_argument(
        "json_file",
        help="ingest_web JSON file to import from",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )
    sub_ingest_web.add_argument(
        "--skip-source-allowlist",
        action="store_true",
        help="don't filter import based on request source allowlist",
    )
    sub_ingest_web.add_argument("--kafka-mode",
                                action="store_true",
                                help="consume from kafka topic (not stdin)")
    sub_ingest_web.add_argument(
        "--do-updates",
        action="store_true",
        help=
        "update pre-existing web entities if new match (instead of skipping)",
    )
    sub_ingest_web.add_argument(
        "--default-link-rel",
        default="web",
        help="default URL rel for matches (eg, 'publisher', 'web')",
    )

    sub_ingest_fileset = subparsers.add_parser(
        "ingest-fileset-results",
        help=
        "add/update fileset entities linked to releases based on sandcrawler ingest results",
    )
    sub_ingest_fileset.set_defaults(
        func=run_ingest_fileset,
        auth_var="FATCAT_AUTH_WORKER_CRAWL",
    )
    sub_ingest_fileset.add_argument(
        "json_file",
        help="ingest_fileset JSON file to import from",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )
    sub_ingest_fileset.add_argument(
        "--skip-source-allowlist",
        action="store_true",
        help="don't filter import based on request source allowlist",
    )
    sub_ingest_fileset.add_argument(
        "--kafka-mode",
        action="store_true",
        help="consume from kafka topic (not stdin)")
    sub_ingest_fileset.add_argument(
        "--do-updates",
        action="store_true",
        help=
        "update pre-existing fileset entities if new match (instead of skipping)",
    )
    sub_ingest_fileset.add_argument(
        "--default-link-rel",
        default="fileset",
        help="default URL rel for matches (eg, 'publisher', 'web')",
    )

    sub_ingest_fileset_file = subparsers.add_parser(
        "ingest-fileset-file-results",
        help=
        "add/update file entities linked to releases based on sandcrawler dataset/fileset ingest results",
    )
    sub_ingest_fileset_file.set_defaults(
        func=run_ingest_fileset_file,
        auth_var="FATCAT_AUTH_WORKER_CRAWL",
    )
    sub_ingest_fileset_file.add_argument(
        "json_file",
        help="ingest_fileset JSON file to import from",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )
    sub_ingest_fileset_file.add_argument(
        "--skip-source-allowlist",
        action="store_true",
        help="don't filter import based on request source allowlist",
    )
    sub_ingest_fileset_file.add_argument(
        "--kafka-mode",
        action="store_true",
        help="consume from kafka topic (not stdin)",
    )
    sub_ingest_fileset_file.add_argument(
        "--do-updates",
        action="store_true",
        help="update pre-existing file entities if new match (instead of skipping)",
    )
    sub_ingest_fileset_file.add_argument(
        "--default-link-rel",
        default="fileset",
        help="default URL rel for matches (eg, 'publisher', 'web')",
    )

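    # Save Paper Now importers: file, webcapture, and fileset results from
    # asynchronous SPN crawl requests.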
    sub_savepapernow_file = subparsers.add_parser(
        "savepapernow-file-results",
        help="add file entities crawled due to async Save Paper Now request",
    )
    sub_savepapernow_file.set_defaults(
        func=run_savepapernow_file,
        auth_var="FATCAT_AUTH_WORKER_SAVEPAPERNOW",
    )
    sub_savepapernow_file.add_argument(
        "json_file",
        help="ingest-file JSON file to import from",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )
    sub_savepapernow_file.add_argument(
        "--kafka-mode",
        action="store_true",
        help="consume from kafka topic (not stdin)")

    sub_savepapernow_web = subparsers.add_parser(
        "savepapernow-web-results",
        help="add webcapture entities crawled due to async Save Paper Now request",
    )
    sub_savepapernow_web.set_defaults(
        func=run_savepapernow_web,
        auth_var="FATCAT_AUTH_WORKER_SAVEPAPERNOW",
    )
    sub_savepapernow_web.add_argument(
        "json_file",
        help="ingest JSON file to import from",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )
    sub_savepapernow_web.add_argument(
        "--kafka-mode",
        action="store_true",
        help="consume from kafka topic (not stdin)")

    sub_savepapernow_fileset = subparsers.add_parser(
        "savepapernow-fileset-results",
        help="add fileset entities crawled due to async Save Paper Now request",
    )
    sub_savepapernow_fileset.set_defaults(
        func=run_savepapernow_fileset,
        auth_var="FATCAT_AUTH_WORKER_SAVEPAPERNOW",
    )
    sub_savepapernow_fileset.add_argument(
        "json_file",
        help="ingest JSON file to import from",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )
    sub_savepapernow_fileset.add_argument(
        "--kafka-mode",
        action="store_true",
        help="consume from kafka topic (not stdin)")

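    # Bulk entity-creation importers: GROBID PDF metadata extraction and
    # "shadow library" metadata.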
    sub_grobid_metadata = subparsers.add_parser(
        "grobid-metadata",
        help="create release and file entities based on GROBID PDF metadata extraction",
    )
    sub_grobid_metadata.set_defaults(
        func=run_grobid_metadata,
        auth_var="FATCAT_API_AUTH_TOKEN",
    )
    sub_grobid_metadata.add_argument(
        "tsv_file",
        help="TSV file to import from (or stdin)",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )
    sub_grobid_metadata.add_argument("--group-size",
                                     help="editgroup group size to use",
                                     default=75,
                                     type=int)
    sub_grobid_metadata.add_argument(
        "--longtail-oa",
        action="store_true",
        help="if this is an import of longtail OA content (sets an 'extra' flag)",
    )
    sub_grobid_metadata.add_argument(
        "--bezerk-mode",
        action="store_true",
        help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)",
    )

    sub_shadow_lib = subparsers.add_parser(
        "shadow-lib",
        help="create file entities (linked to releases) from 'shadow library' metadata",
    )
    sub_shadow_lib.set_defaults(
        func=run_shadow_lib,
        auth_var="FATCAT_AUTH_WORKER_SHADOW",
    )
    sub_shadow_lib.add_argument(
        "json_file",
        help="JSON file to import from (or stdin)",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )

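    # Upstream metadata source importers: Datacite, DOAJ, and dblp.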
    sub_datacite = subparsers.add_parser("datacite",
                                         help="import datacite.org metadata")
    sub_datacite.add_argument(
        "json_file",
        help="File with jsonlines from datacite.org v2 API to import from",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )
    sub_datacite.add_argument(
        "issn_map_file",
        help="ISSN to ISSN-L mapping file",
        default=None,
        type=argparse.FileType("r"),
    )
    sub_datacite.add_argument("--kafka-mode",
                              action="store_true",
                              help="consume from kafka topic (not stdin)")
    sub_datacite.add_argument(
        "--bezerk-mode",
        action="store_true",
        help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)",
    )
    sub_datacite.add_argument("--debug",
                              action="store_true",
                              help="write converted JSON to stdout")
    sub_datacite.add_argument(
        "--insert-log-file",
        default="",
        type=str,
        help="write inserted documents into file (for debugging)",
    )
    sub_datacite.set_defaults(
        func=run_datacite,
        auth_var="FATCAT_AUTH_WORKER_DATACITE",
    )

    sub_doaj_article = subparsers.add_parser(
        "doaj-article", help="import doaj.org article metadata")
    sub_doaj_article.add_argument(
        "json_file",
        help="File with JSON lines from DOAJ API (or bulk dump) to import from",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )
    sub_doaj_article.add_argument(
        "--issn-map-file",
        help="ISSN to ISSN-L mapping file",
        default=None,
        type=argparse.FileType("r"),
    )
    sub_doaj_article.add_argument("--kafka-mode",
                                  action="store_true",
                                  help="consume from kafka topic (not stdin)")
    sub_doaj_article.add_argument(
        "--do-updates",
        action="store_true",
        help="update any pre-existing release entities")
    sub_doaj_article.set_defaults(
        func=run_doaj_article,
        auth_var="FATCAT_AUTH_WORKER_DOAJ",
    )

    sub_dblp_release = subparsers.add_parser(
        "dblp-release", help="import dblp release metadata")
    sub_dblp_release.add_argument(
        "xml_file",
        help="File with DBLP XML to import from",
        default=sys.stdin,
        type=argparse.FileType("rb"),
    )
    sub_dblp_release.add_argument(
        "--dblp-container-map-file",
        help="TSV file mapping dblp prefixes to container_ids",
        default=None,
        type=argparse.FileType("r"),
    )
    sub_dblp_release.add_argument(
        "--do-updates",
        action="store_true",
        help="update any pre-existing release entities")
    sub_dblp_release.add_argument(
        "--dump-json-mode",
        action="store_true",
        help="print release entities to stdout instead of importing",
    )
    sub_dblp_release.set_defaults(
        func=run_dblp_release,
        auth_var="FATCAT_AUTH_WORKER_DBLP",
    )

    sub_dblp_container = subparsers.add_parser(
        "dblp-container", help="import dblp container metadata")
    sub_dblp_container.add_argument(
        "json_file",
        help="File with DBLP container JSON to import from (see extra/dblp/)",
        default=sys.stdin,
        type=argparse.FileType("rb"),
    )
    sub_dblp_container.add_argument(
        "--dblp-container-map-file",
        help="TSV file with the pre-existing mapping of dblp prefixes to container_ids",
        default=None,
        type=argparse.FileType("r"),
    )
    sub_dblp_container.add_argument(
        "--dblp-container-map-output",
        help="file to write the new/updated dblp container map TSV to",
        default=None,
        type=argparse.FileType("w"),
    )
    sub_dblp_container.add_argument(
        "--issn-map-file",
        help="ISSN to ISSN-L mapping file",
        default=None,
        type=argparse.FileType("r"),
    )
    sub_dblp_container.add_argument(
        "--do-updates",
        action="store_true",
        help="update any pre-existing container entities")
    sub_dblp_container.set_defaults(
        func=run_dblp_container,
        auth_var="FATCAT_AUTH_WORKER_DBLP",
    )

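    # Low-level importers that take entity-shaped JSON directly: file metadata
    # updates and generic fileset entities.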
    sub_file_meta = subparsers.add_parser(
        "file-meta", help="simple update-only importer for file metadata")
    sub_file_meta.set_defaults(
        func=run_file_meta,
        auth_var="FATCAT_API_AUTH_TOKEN",
    )
    sub_file_meta.add_argument(
        "json_file",
        help="File with jsonlines from file_meta schema to import from",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )

    sub_fileset = subparsers.add_parser("fileset",
                                        help="generic fileset importer")
    sub_fileset.set_defaults(
        func=run_fileset,
        auth_var="FATCAT_API_AUTH_TOKEN",
    )
    sub_fileset.add_argument(
        "json_file",
        help="File with jsonlines of fileset entities to import",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )
    sub_fileset.add_argument(
        "--skip-release-fileset-check",
        action="store_true",
        help="create without checking if releases already have related filesets",
    )

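    # Each subparser registered its handler and auth env var via
    # set_defaults(func=..., auth_var=...); bail out if no subcommand given.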
    args = parser.parse_args()
    if not args.__dict__.get("func"):
        print("tell me what to do!")
        sys.exit(-1)

    # allow editgroup description override via env variable (but CLI arg takes
    # precedence)
    if not args.editgroup_description_override and os.environ.get(
            "FATCAT_EDITGROUP_DESCRIPTION"):
        args.editgroup_description_override = os.environ.get(
            "FATCAT_EDITGROUP_DESCRIPTION")

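    # the auth token env var differs per subcommand (args.auth_var, set above)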
    args.api = authenticated_api(
        args.host_url,
        # token is an optional kwarg (can be empty string, None, etc)
        token=os.environ.get(args.auth_var),
    )
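    # with no arguments, sentry_sdk.init() picks up configuration (eg,
    # SENTRY_DSN) from the environment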
    sentry_sdk.init()
    args.func(args)
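
# Example invocations (a sketch; the script name "fatcat_import.py" and the
# input file paths are assumptions, not confirmed by this listing):
#
#   export FATCAT_AUTH_WORKER_CRAWL="..."
#   ./fatcat_import.py ingest-web-results sandcrawler_ingest_web.json
#
#   # consume from Kafka instead of a file ("-" satisfies the positional arg):
#   ./fatcat_import.py ingest-web-results --kafka-mode -
#
#   export FATCAT_AUTH_WORKER_DBLP="..."
#   ./fatcat_import.py dblp-release --dump-json-mode dblp.xml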