def main() -> None:
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--host-url", default="http://localhost:9411/v0", help="connect to this host/port")
    parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int)
    parser.set_defaults(auth_var="FATCAT_AUTH_WORKER_CLEANUP")
    parser.add_argument(
        "idents_file",
        help="file with release identifiers to try updating",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )
    args = parser.parse_args()

    api = authenticated_api(
        args.host_url,
        # token is an optional kwarg (can be empty string, None, etc)
        token=os.environ.get(args.auth_var),
    )
    rldc = ReleaseLowercaseDoiCleanup(api, edit_batch_size=args.batch_size)
    LinePusher(rldc, args.idents_file).run()

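# Invocation sketch for the cleanup worker above (script name and input file
# name are assumptions, not taken from this code): it reads one release ident
# per line, with the auth token taken from the env var named by auth_var.
#
#   FATCAT_AUTH_WORKER_CLEANUP=<token> ./fatcat_cleanup.py release_idents.tsv
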
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--api-host-url', default="http://localhost:9411/v0", help="connect to this host/port")
    parser.add_argument('--batch-size', help="size of batch to send", default=50, type=int)
    parser.add_argument('domain_issnl_tsv_file', help="domain/ISSNL mapping TSV file", type=argparse.FileType('r'))
    parser.add_argument('insertable_tsv_file', help="dumpgrobidmetainsertable TSV file to work over", default=sys.stdin, type=argparse.FileType('r'))
    auth_var = "FATCAT_AUTH_SANDCRAWLER"
    args = parser.parse_args()

    args.api = authenticated_api(
        args.api_host_url,
        # token is an optional kwarg (can be empty string, None, etc)
        token=os.environ.get(auth_var))
    run_fixup(args)

def main() -> None:
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--host-url", default="http://localhost:9411/v0", help="connect to this host/port")
    parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int)
    parser.set_defaults(auth_var="FATCAT_AUTH_WORKER_CLEANUP")
    parser.add_argument(
        "json_file",
        help="file with JSON lines of cleanup context",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )
    args = parser.parse_args()

    api = authenticated_api(
        args.host_url,
        # token is an optional kwarg (can be empty string, None, etc)
        token=os.environ.get(args.auth_var),
    )
    frbc = FileReleaseBugfix(api, edit_batch_size=args.batch_size)
    JsonLinePusher(frbc, args.json_file).run()

def main():
    api = authenticated_api(
        API_ENDPOINT,
        # token is an optional kwarg (can be empty string, None, etc)
        token=os.environ.get("FATCAT_API_AUTH_TOKEN"))
    path = sys.argv[1]
    reader = csv.DictReader(open(path), delimiter='\t')
    run(api, reader)

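# Input sketch for the reader above: csv.DictReader with delimiter='\t'
# expects a header row naming the columns, so the TSV at sys.argv[1] is
# assumed to look something like this (the column names are hypothetical,
# whatever run() actually consumes):
#
#   release_id	doi
#   aaaaaaaaaaaabkvkaaaaaaaaae	10.123/abc
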
def main() -> None: """ Invoke like: python3 -m fatcat_tools.mergers.files [options] """ parser = argparse.ArgumentParser() parser.add_argument("--host-url", default="http://localhost:9411/v0", help="connect to this host/port") parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int) parser.add_argument( "--editgroup-description-override", help="editgroup description override", default=None, type=str, ) parser.add_argument( "--dry-run", action="store_true", help="don't actually commit merges, just count what would have been", ) parser.set_defaults(auth_var="FATCAT_API_AUTH_TOKEN", ) subparsers = parser.add_subparsers() sub_merge_files = subparsers.add_parser("merge-files") sub_merge_files.set_defaults(func=run_merge_files) sub_merge_files.add_argument( "json_file", help="source of merge lines to process (or stdin)", default=sys.stdin, type=argparse.FileType("r"), ) args = parser.parse_args() if not args.__dict__.get("func"): print("tell me what to do!") sys.exit(-1) # allow editgroup description override via env variable (but CLI arg takes # precedence) if not args.editgroup_description_override and os.environ.get( "FATCAT_EDITGROUP_DESCRIPTION"): args.editgroup_description_override = os.environ.get( "FATCAT_EDITGROUP_DESCRIPTION") args.api = authenticated_api( args.host_url, # token is an optional kwarg (can be empty string, None, etc) token=os.environ.get(args.auth_var), ) args.func(args)
def main() -> None:
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--fatcat-api-url", default="http://localhost:9411/v0", help="connect to this host/port")
    # deployment environment, passed through to the sentry_sdk.init() call below
    parser.add_argument("--env", default="dev", help="deployment environment (eg, 'qa', 'prod')")
    parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int)
    parser.add_argument("--editgroup-description-override", help="editgroup description override", default=None, type=str)
    # store_true rather than type=bool: argparse would treat any supplied
    # string, even "false", as truthy
    parser.add_argument("--dry-run", action="store_true", help="dry-run mode (don't actually update)")
    subparsers = parser.add_subparsers()

    sub_files = subparsers.add_parser("files", help="attempt metadata cleanups over a list of file entities")
    sub_files.set_defaults(
        func=run_files,
        auth_var="FATCAT_AUTH_WORKER_CLEANUP",
    )
    sub_files.add_argument("json_file", help="files JSON file to import from", default=sys.stdin, type=argparse.FileType("r"))

    args = parser.parse_args()
    if not args.__dict__.get("func"):
        print("tell me what to do!")
        sys.exit(-1)

    # allow editgroup description override via env variable (but CLI arg takes
    # precedence)
    if not args.editgroup_description_override and os.environ.get("FATCAT_EDITGROUP_DESCRIPTION"):
        args.editgroup_description_override = os.environ.get("FATCAT_EDITGROUP_DESCRIPTION")

    args.api = authenticated_api(
        args.fatcat_api_url,
        # token is an optional kwarg (can be empty string, None, etc)
        token=os.environ.get(args.auth_var),
    )
    sentry_sdk.init(environment=args.env)
    args.func(args)

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--host-url', default="http://localhost:9411/v0", help="connect to this host/port")
    parser.add_argument('--batch-size', help="size of batch to send", default=50, type=int)
    parser.add_argument('--editgroup-description-override', help="editgroup description override", default=None, type=str)
    # store_true rather than type=bool: argparse would treat any supplied
    # string, even "false", as truthy
    parser.add_argument('--dry-run', action='store_true', help="dry-run mode (don't actually update)")
    subparsers = parser.add_subparsers()

    sub_files = subparsers.add_parser('files')
    sub_files.set_defaults(
        func=run_files,
        auth_var="FATCAT_AUTH_WORKER_CLEANUP",
    )
    sub_files.add_argument('json_file', help="files JSON file to import from", default=sys.stdin, type=argparse.FileType('r'))

    args = parser.parse_args()
    if not args.__dict__.get("func"):
        print("tell me what to do!")
        sys.exit(-1)

    # allow editgroup description override via env variable (but CLI arg takes
    # precedence)
    if not args.editgroup_description_override \
            and os.environ.get('FATCAT_EDITGROUP_DESCRIPTION'):
        args.editgroup_description_override = os.environ.get('FATCAT_EDITGROUP_DESCRIPTION')

    args.api = authenticated_api(
        args.host_url,
        # token is an optional kwarg (can be empty string, None, etc)
        token=os.environ.get(args.auth_var))
    args.func(args)

def main() -> None:
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--verbose", action="store_true", help="enable verbose output")
    parser.add_argument("--fatcat-api-url", default="http://localhost:9411/v0", help="fatcat API host/port to use")
    # deployment environment, passed through to the sentry_sdk.init() call below
    parser.add_argument("--env", default="dev", help="deployment environment (eg, 'qa', 'prod')")
    parser.add_argument("--poll-interval", help="how long to wait between polling (seconds)", default=10.0, type=float)
    subparsers = parser.add_subparsers()

    sub_dummy = subparsers.add_parser("dummy", help="example/demonstration review bot")
    sub_dummy.set_defaults(func=run_dummy)
    sub_dummy.add_argument("--continuous", action="store_true", help="run forever, polling for new reviewable editgroups")
    sub_dummy.add_argument("--editgroup", help="single editgroup ID to review")
    sub_dummy.add_argument("--annotate", action="store_true", help="for single editgroups, pushes result as annotation")

    args = parser.parse_args()
    if not args.__dict__.get("func"):
        print("tell me what to do!")
        sys.exit(-1)
    # exactly one of --editgroup or --continuous must be given
    if (args.editgroup and args.continuous) or not (args.editgroup or args.continuous):
        print("need to run on a single editgroup, or continuous")
        sys.exit(-1)

    args.api = authenticated_api(args.fatcat_api_url)
    sentry_sdk.init(environment=args.env)
    args.func(args)

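# Invocation sketches for the review bot above (the script name is an
# assumption); exactly one of --continuous or --editgroup is required:
#
#   ./fatcat_review.py dummy --continuous
#   ./fatcat_review.py dummy --editgroup <editgroup_id> --annotate
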
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--fatcat-api-url', default="http://localhost:9411/v0", help="connect to this host/port")
    subparsers = parser.add_subparsers()

    sub_uuid2fcid = subparsers.add_parser('uuid2fcid', help="convert a standard UUID (as string) to fatcat ident format")
    sub_uuid2fcid.set_defaults(func=run_uuid2fcid)
    sub_uuid2fcid.add_argument('uuid', help="UUID to transform")

    sub_fcid2uuid = subparsers.add_parser('fcid2uuid', help="convert a fatcat ident string to standard UUID format")
    sub_fcid2uuid.set_defaults(func=run_fcid2uuid)
    sub_fcid2uuid.add_argument('fcid', help="FCID to transform (into UUID)")

    sub_editgroup_accept = subparsers.add_parser('editgroup-accept', help="accept an editgroup (by ident)")
    sub_editgroup_accept.set_defaults(func=run_editgroup_accept)
    sub_editgroup_accept.add_argument('editgroup_id', help="editgroup to accept")

    sub_editgroup_submit = subparsers.add_parser('editgroup-submit', help="submit an editgroup for review (by ident)")
    sub_editgroup_submit.set_defaults(func=run_editgroup_submit)
    sub_editgroup_submit.add_argument('editgroup_id', help="editgroup to submit")

    args = parser.parse_args()
    if not args.__dict__.get("func"):
        print("tell me what to do!")
        sys.exit(-1)

    args.api = authenticated_api(args.fatcat_api_url)
    args.func(args)

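# Invocation sketches for the utility subcommands above (the script name is
# an assumption; ident/UUID values are placeholders):
#
#   ./fatcat_util.py fcid2uuid aaaaaaaaaaaabkvkaaaaaaaaae
#   ./fatcat_util.py uuid2fcid 00000000-0000-0000-3333-000000000001
#   ./fatcat_util.py editgroup-accept <editgroup_id>
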
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--debug', action='store_true', help="enable debugging interface")
    parser.add_argument('--host-url', default="http://localhost:9411/v0", help="connect to this host/port")
    parser.add_argument('--kafka-hosts', default="localhost:9092", help="list of Kafka brokers (host/port) to use")
    parser.add_argument('--kafka-env', default="dev", help="Kafka topic namespace to use (eg, prod, qa)")
    parser.add_argument('--batch-size', help="size of batch to send", default=50, type=int)
    parser.add_argument('--editgroup-description-override', help="editgroup description override", default=None, type=str)
    subparsers = parser.add_subparsers()

    sub_crossref = subparsers.add_parser('crossref')
    sub_crossref.set_defaults(
        func=run_crossref,
        auth_var="FATCAT_AUTH_WORKER_CROSSREF",
    )
    sub_crossref.add_argument('json_file', help="crossref JSON file to import from", default=sys.stdin, type=argparse.FileType('r'))
    sub_crossref.add_argument('issn_map_file', help="ISSN to ISSN-L mapping file", default=None, type=argparse.FileType('r'))
    sub_crossref.add_argument('--extid-map-file', help="DOI-to-other-identifiers sqlite3 database", default=None, type=str)
    sub_crossref.add_argument('--no-lookup-refs', action='store_true', help="skip lookup of references (PMID or DOI)")
    sub_crossref.add_argument('--kafka-mode', action='store_true', help="consume from kafka topic (not stdin)")
    sub_crossref.add_argument('--bezerk-mode', action='store_true', help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)")

    sub_jalc = subparsers.add_parser('jalc')
    sub_jalc.set_defaults(
        func=run_jalc,
        auth_var="FATCAT_AUTH_WORKER_JALC",
    )
    sub_jalc.add_argument('xml_file', help="Jalc RDF XML file (record-per-line) to import from", default=sys.stdin, type=argparse.FileType('r'))
    sub_jalc.add_argument('issn_map_file', help="ISSN to ISSN-L mapping file", default=None, type=argparse.FileType('r'))
    sub_jalc.add_argument('--extid-map-file', help="DOI-to-other-identifiers sqlite3 database", default=None, type=str)

    sub_arxiv = subparsers.add_parser('arxiv')
    sub_arxiv.set_defaults(
        func=run_arxiv,
        auth_var="FATCAT_AUTH_WORKER_ARXIV",
    )
    sub_arxiv.add_argument('xml_file', help="arXivRaw XML file to import from", default=sys.stdin, type=argparse.FileType('r'))
    sub_arxiv.add_argument('--kafka-mode', action='store_true', help="consume from kafka topic (not stdin)")

    sub_pubmed = subparsers.add_parser('pubmed')
    sub_pubmed.set_defaults(
        func=run_pubmed,
        auth_var="FATCAT_AUTH_WORKER_PUBMED",
    )
    sub_pubmed.add_argument('xml_file', help="Pubmed XML file to import from", default=sys.stdin, type=argparse.FileType('r'))
    sub_pubmed.add_argument('issn_map_file', help="ISSN to ISSN-L mapping file", default=None, type=argparse.FileType('r'))
    sub_pubmed.add_argument('--no-lookup-refs', action='store_true', help="skip lookup of references (PMID or DOI)")
    sub_pubmed.add_argument('--kafka-mode', action='store_true', help="consume from kafka topic (not stdin)")

    sub_jstor = subparsers.add_parser('jstor')
    sub_jstor.set_defaults(
        func=run_jstor,
        auth_var="FATCAT_AUTH_WORKER_JSTOR",
    )
    sub_jstor.add_argument('list_file', help="List of JSTOR XML file paths to import from", default=sys.stdin, type=argparse.FileType('r'))
    sub_jstor.add_argument('issn_map_file', help="ISSN to ISSN-L mapping file", default=None, type=argparse.FileType('r'))

    sub_orcid = subparsers.add_parser('orcid')
    sub_orcid.set_defaults(func=run_orcid, auth_var="FATCAT_AUTH_WORKER_ORCID")
    sub_orcid.add_argument('json_file', help="orcid JSON file to import from (or stdin)", default=sys.stdin, type=argparse.FileType('r'))

    sub_journal_metadata = subparsers.add_parser('journal-metadata')
    sub_journal_metadata.set_defaults(
        func=run_journal_metadata,
        auth_var="FATCAT_AUTH_WORKER_JOURNAL_METADATA",
    )
    sub_journal_metadata.add_argument('json_file', help="Journal JSON metadata file to import from (or stdin)", default=sys.stdin, type=argparse.FileType('r'))

    sub_chocula = subparsers.add_parser('chocula')
    sub_chocula.set_defaults(
        func=run_chocula,
        auth_var="FATCAT_AUTH_WORKER_JOURNAL_METADATA",
    )
    sub_chocula.add_argument('json_file', help="chocula JSON entities file (or stdin)", default=sys.stdin, type=argparse.FileType('r'))

    sub_matched = subparsers.add_parser('matched')
    sub_matched.set_defaults(
        func=run_matched,
        auth_var="FATCAT_API_AUTH_TOKEN",
    )
    sub_matched.add_argument('json_file', help="JSON file to import from (or stdin)", default=sys.stdin, type=argparse.FileType('r'))
    sub_matched.add_argument('--default-mimetype', default=None, help="default mimetype for imported files (if not specified per-file)")
    sub_matched.add_argument('--bezerk-mode', action='store_true', help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)")
    sub_matched.add_argument('--default-link-rel', default="web", help="default URL rel for matches (eg, 'publisher', 'web')")

    sub_arabesque_match = subparsers.add_parser('arabesque')
    sub_arabesque_match.set_defaults(
        func=run_arabesque_match,
        auth_var="FATCAT_AUTH_WORKER_CRAWL",
    )
    sub_arabesque_match.add_argument('--sqlite-file', help="sqlite database file to import from")
    sub_arabesque_match.add_argument('--json-file', help="JSON file to import from (or stdin)", type=argparse.FileType('r'))
    sub_arabesque_match.add_argument('--do-updates', action='store_true', help="update pre-existing file entities if new match (instead of skipping)")
    sub_arabesque_match.add_argument('--no-require-grobid', action='store_true', help="whether postproc_status column must be '200'")
    sub_arabesque_match.add_argument('--extid-type', default="doi", help="identifier type in the database (eg, 'doi', 'pmcid')")
    sub_arabesque_match.add_argument('--crawl-id', help="crawl ID (optionally included in editgroup metadata)")
    sub_arabesque_match.add_argument('--default-link-rel', default="web", help="default URL rel for matches (eg, 'publisher', 'web')")

    sub_grobid_metadata = subparsers.add_parser('grobid-metadata')
    sub_grobid_metadata.set_defaults(
        func=run_grobid_metadata,
        auth_var="FATCAT_API_AUTH_TOKEN",
    )
    sub_grobid_metadata.add_argument('tsv_file', help="TSV file to import from (or stdin)", default=sys.stdin, type=argparse.FileType('r'))
    sub_grobid_metadata.add_argument('--group-size', help="editgroup group size to use", default=75, type=int)
    sub_grobid_metadata.add_argument('--longtail-oa', action='store_true', help="if this is an import of longtail OA content (sets an 'extra' flag)")
    sub_grobid_metadata.add_argument('--bezerk-mode', action='store_true', help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)")

    sub_wayback_static = subparsers.add_parser('wayback-static')
    sub_wayback_static.set_defaults(
        func=run_wayback_static,
        auth_var="FATCAT_API_AUTH_TOKEN",
    )
    sub_wayback_static.add_argument('wayback_url', type=str, help="URL of wayback capture to extract from")
    sub_wayback_static.add_argument('--extid', type=str, help="external identifier for release lookup")
    sub_wayback_static.add_argument('--release-id', type=str, help="release entity identifier")
    sub_wayback_static.add_argument('--editgroup-id', type=str, help="use existing editgroup (instead of creating a new one)")

    sub_cdl_dash_dat = subparsers.add_parser('cdl-dash-dat')
    sub_cdl_dash_dat.set_defaults(
        func=run_cdl_dash_dat,
        auth_var="FATCAT_API_AUTH_TOKEN",
    )
    sub_cdl_dash_dat.add_argument('dat_path', type=str, help="local path dat to import (must be the dat discovery key)")
    sub_cdl_dash_dat.add_argument('--release-id', type=str, help="release entity identifier")
    sub_cdl_dash_dat.add_argument('--editgroup-id', type=str, help="use existing editgroup (instead of creating a new one)")

    args = parser.parse_args()
    if not args.__dict__.get("func"):
        print("tell me what to do!")
        sys.exit(-1)

    # allow editgroup description override via env variable (but CLI arg takes
    # precedence)
    if not args.editgroup_description_override \
            and os.environ.get('FATCAT_EDITGROUP_DESCRIPTION'):
        args.editgroup_description_override = os.environ.get('FATCAT_EDITGROUP_DESCRIPTION')

    args.api = authenticated_api(
        args.host_url,
        # token is an optional kwarg (can be empty string, None, etc)
        token=os.environ.get(args.auth_var))
    args.func(args)

def test_authenticated_api():
    api = authenticated_api("http://localhost:9411/v0")
    api.get_changelog()
    api.auth_check()

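# A minimal sketch of what authenticated_api() is assumed to do, for context
# on the test above (not necessarily the real implementation, and the
# function name here is deliberately different): build a
# fatcat_openapi_client.DefaultApi against the given host, attach a Bearer
# token if one was provided, and fail fast if the token does not authenticate.
def authenticated_api_sketch(host_uri, token=None):
    import os
    import fatcat_openapi_client

    conf = fatcat_openapi_client.Configuration()
    conf.host = host_uri
    if not token:
        # fall back to the generic env var; may still be unset or empty
        token = os.environ.get("FATCAT_API_AUTH_TOKEN")
    if token:
        conf.api_key["Authorization"] = token
        conf.api_key_prefix["Authorization"] = "Bearer"
    api = fatcat_openapi_client.DefaultApi(fatcat_openapi_client.ApiClient(conf))
    if token:
        # verify the token actually works before handing back the client
        api.auth_check()
    return api
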
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--debug', action='store_true', help="enable debugging interface")
    parser.add_argument('--host-url', default="http://localhost:9411/v0", help="connect to this host/port")
    parser.add_argument('--kafka-hosts', default="localhost:9092", help="list of Kafka brokers (host/port) to use")
    parser.add_argument('--kafka-env', default="qa", help="Kafka topic namespace to use (eg, prod, qa)")
    parser.add_argument('--batch-size', help="size of batch to send", default=50, type=int)
    subparsers = parser.add_subparsers()

    sub_crossref = subparsers.add_parser('crossref')
    sub_crossref.set_defaults(
        func=run_crossref,
        auth_var="FATCAT_AUTH_WORKER_CROSSREF",
    )
    sub_crossref.add_argument('json_file', help="crossref JSON file to import from", default=sys.stdin, type=argparse.FileType('r'))
    sub_crossref.add_argument('issn_map_file', help="ISSN to ISSN-L mapping file", default=None, type=argparse.FileType('r'))
    sub_crossref.add_argument('--extid-map-file', help="DOI-to-other-identifiers sqlite3 database", default=None, type=str)
    sub_crossref.add_argument('--kafka-mode', action='store_true', help="consume from kafka topic (not stdin)")
    sub_crossref.add_argument('--bezerk-mode', action='store_true', help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)")

    sub_orcid = subparsers.add_parser('orcid')
    sub_orcid.set_defaults(func=run_orcid, auth_var="FATCAT_AUTH_WORKER_ORCID")
    sub_orcid.add_argument('json_file', help="orcid JSON file to import from (or stdin)", default=sys.stdin, type=argparse.FileType('r'))

    sub_journal_metadata = subparsers.add_parser('journal-metadata')
    sub_journal_metadata.set_defaults(
        func=run_journal_metadata,
        auth_var="FATCAT_AUTH_WORKER_JOURNAL_METADATA",
    )
    sub_journal_metadata.add_argument('json_file', help="Journal JSON metadata file to import from (or stdin)", default=sys.stdin, type=argparse.FileType('r'))

    sub_matched = subparsers.add_parser('matched')
    sub_matched.set_defaults(
        func=run_matched,
        auth_var="FATCAT_API_AUTH_TOKEN",
    )
    sub_matched.add_argument('json_file', help="JSON file to import from (or stdin)", default=sys.stdin, type=argparse.FileType('r'))
    sub_matched.add_argument('--bezerk-mode', action='store_true', help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)")

    sub_grobid_metadata = subparsers.add_parser('grobid-metadata')
    sub_grobid_metadata.set_defaults(
        func=run_grobid_metadata,
        auth_var="FATCAT_API_AUTH_TOKEN",
    )
    sub_grobid_metadata.add_argument('tsv_file', help="TSV file to import from (or stdin)", default=sys.stdin, type=argparse.FileType('r'))
    sub_grobid_metadata.add_argument('--group-size', help="editgroup group size to use", default=75, type=int)
    sub_grobid_metadata.add_argument('--longtail-oa', action='store_true', help="if this is an import of longtail OA content (sets an 'extra' flag)")
    sub_grobid_metadata.add_argument('--bezerk-mode', action='store_true', help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)")

    args = parser.parse_args()
    if not args.__dict__.get("func"):
        print("tell me what to do!")
        sys.exit(-1)

    args.api = authenticated_api(
        args.host_url,
        # token is an optional kwarg (can be empty string, None, etc)
        token=os.environ.get(args.auth_var))
    args.func(args)

def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--host-url', default="http://localhost:9411/v0", help="connect to this host/port")
    parser.add_argument('--kafka-hosts', default="localhost:9092", help="list of Kafka brokers (host/port) to use")
    parser.add_argument('--kafka-env', default="dev", help="Kafka topic namespace to use (eg, prod, qa)")
    parser.add_argument('--batch-size', help="size of batch to send", default=50, type=int)
    parser.add_argument('--editgroup-description-override', help="editgroup description override", default=None, type=str)
    subparsers = parser.add_subparsers()

    sub_crossref = subparsers.add_parser('crossref', help="import Crossref API metadata format (JSON)")
    sub_crossref.set_defaults(
        func=run_crossref,
        auth_var="FATCAT_AUTH_WORKER_CROSSREF",
    )
    sub_crossref.add_argument('json_file', help="crossref JSON file to import from", default=sys.stdin, type=argparse.FileType('r'))
    sub_crossref.add_argument('issn_map_file', help="ISSN to ISSN-L mapping file", default=None, type=argparse.FileType('r'))
    sub_crossref.add_argument('--extid-map-file', help="DOI-to-other-identifiers sqlite3 database", default=None, type=str)
    sub_crossref.add_argument('--no-lookup-refs', action='store_true', help="skip lookup of references (PMID or DOI)")
    sub_crossref.add_argument('--kafka-mode', action='store_true', help="consume from kafka topic (not stdin)")
    sub_crossref.add_argument('--bezerk-mode', action='store_true', help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)")

    sub_jalc = subparsers.add_parser('jalc', help="import JALC DOI metadata from XML dump")
    sub_jalc.set_defaults(
        func=run_jalc,
        auth_var="FATCAT_AUTH_WORKER_JALC",
    )
    sub_jalc.add_argument('xml_file', help="Jalc RDF XML file (record-per-line) to import from", default=sys.stdin, type=argparse.FileType('r'))
    sub_jalc.add_argument('issn_map_file', help="ISSN to ISSN-L mapping file", default=None, type=argparse.FileType('r'))
    sub_jalc.add_argument('--extid-map-file', help="DOI-to-other-identifiers sqlite3 database", default=None, type=str)

    sub_arxiv = subparsers.add_parser('arxiv', help="import arxiv.org metadata from XML files")
    sub_arxiv.set_defaults(
        func=run_arxiv,
        auth_var="FATCAT_AUTH_WORKER_ARXIV",
    )
    sub_arxiv.add_argument('xml_file', nargs='?', help="arXivRaw XML file to import from", default=sys.stdin, type=argparse.FileType('r'))
    sub_arxiv.add_argument('--kafka-mode', action='store_true', help="consume from kafka topic (not stdin)")

    sub_pubmed = subparsers.add_parser('pubmed', help="import MEDLINE/PubMed work-level metadata (XML)")
    sub_pubmed.set_defaults(
        func=run_pubmed,
        auth_var="FATCAT_AUTH_WORKER_PUBMED",
    )
    sub_pubmed.add_argument('xml_file', nargs='?', help="Pubmed XML file to import from", default=sys.stdin, type=argparse.FileType('r'))
    sub_pubmed.add_argument('issn_map_file', help="ISSN to ISSN-L mapping file", default=None, type=argparse.FileType('r'))
    sub_pubmed.add_argument('--no-lookup-refs', action='store_true', help="skip lookup of references (PMID or DOI)")
    sub_pubmed.add_argument('--do-updates', action='store_true', help="update pre-existing release entities")
    sub_pubmed.add_argument('--kafka-mode', action='store_true', help="consume from kafka topic (not stdin)")

    sub_jstor = subparsers.add_parser('jstor', help="import JSTOR work-level metadata from XML dump")
    sub_jstor.set_defaults(
        func=run_jstor,
        auth_var="FATCAT_AUTH_WORKER_JSTOR",
    )
    sub_jstor.add_argument('list_file', help="List of JSTOR XML file paths to import from", default=sys.stdin, type=argparse.FileType('r'))
    sub_jstor.add_argument('issn_map_file', help="ISSN to ISSN-L mapping file", default=None, type=argparse.FileType('r'))

    sub_orcid = subparsers.add_parser('orcid', help="import creator entities from ORCID XML dump")
    sub_orcid.set_defaults(func=run_orcid, auth_var="FATCAT_AUTH_WORKER_ORCID")
    sub_orcid.add_argument('json_file', help="orcid JSON file to import from (or stdin)", default=sys.stdin, type=argparse.FileType('r'))

    sub_journal_metadata = subparsers.add_parser('journal-metadata', help="import/update container metadata from old manual munging format")
    sub_journal_metadata.set_defaults(
        func=run_journal_metadata,
        auth_var="FATCAT_AUTH_WORKER_JOURNAL_METADATA",
    )
    sub_journal_metadata.add_argument('json_file', help="Journal JSON metadata file to import from (or stdin)", default=sys.stdin, type=argparse.FileType('r'))

    sub_chocula = subparsers.add_parser('chocula', help="import/update container metadata from chocula JSON export")
    sub_chocula.set_defaults(
        func=run_chocula,
        auth_var="FATCAT_AUTH_WORKER_JOURNAL_METADATA",
    )
    sub_chocula.add_argument('json_file', help="chocula JSON entities file (or stdin)", default=sys.stdin, type=argparse.FileType('r'))
    sub_chocula.add_argument('--do-updates', action='store_true', help="update pre-existing container entities")

    sub_matched = subparsers.add_parser('matched', help="add file entities matched against existing releases; custom JSON format")
    sub_matched.set_defaults(
        func=run_matched,
        auth_var="FATCAT_API_AUTH_TOKEN",
    )
    sub_matched.add_argument('json_file', help="JSON file to import from (or stdin)", default=sys.stdin, type=argparse.FileType('r'))
    sub_matched.add_argument('--default-mimetype', default=None, help="default mimetype for imported files (if not specified per-file)")
    sub_matched.add_argument('--bezerk-mode', action='store_true', help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)")
    sub_matched.add_argument('--default-link-rel', default="web", help="default URL rel for matches (eg, 'publisher', 'web')")

    sub_arabesque_match = subparsers.add_parser('arabesque', help="add file entities matched to releases from crawl log analysis")
    sub_arabesque_match.set_defaults(
        func=run_arabesque_match,
        auth_var="FATCAT_AUTH_WORKER_CRAWL",
    )
    sub_arabesque_match.add_argument('--sqlite-file', help="sqlite database file to import from")
    sub_arabesque_match.add_argument('--json-file', help="JSON file to import from (or stdin)", type=argparse.FileType('r'))
    sub_arabesque_match.add_argument('--do-updates', action='store_true', help="update pre-existing file entities if new match (instead of skipping)")
    sub_arabesque_match.add_argument('--no-require-grobid', action='store_true', help="whether postproc_status column must be '200'")
    sub_arabesque_match.add_argument('--extid-type', default="doi", help="identifier type in the database (eg, 'doi', 'pmcid')")
    sub_arabesque_match.add_argument('--crawl-id', help="crawl ID (optionally included in editgroup metadata)")
    sub_arabesque_match.add_argument('--default-link-rel', default="web", help="default URL rel for matches (eg, 'publisher', 'web')")

    sub_ingest_file = subparsers.add_parser('ingest-file-results', help="add/update file entities linked to releases based on sandcrawler ingest results")
    sub_ingest_file.set_defaults(
        func=run_ingest_file,
        auth_var="FATCAT_AUTH_WORKER_CRAWL",
    )
    sub_ingest_file.add_argument('json_file', help="ingest_file JSON file to import from", default=sys.stdin, type=argparse.FileType('r'))
    sub_ingest_file.add_argument('--skip-source-whitelist', action='store_true', help="don't filter import based on request source whitelist")
    sub_ingest_file.add_argument('--kafka-mode', action='store_true', help="consume from kafka topic (not stdin)")
    sub_ingest_file.add_argument('--do-updates', action='store_true', help="update pre-existing file entities if new match (instead of skipping)")
    sub_ingest_file.add_argument('--no-require-grobid', action='store_true', help="whether postproc_status column must be '200'")
    sub_ingest_file.add_argument('--default-link-rel', default="web", help="default URL rel for matches (eg, 'publisher', 'web')")

    sub_savepapernow_file = subparsers.add_parser('savepapernow-file-results', help="add file entities crawled due to async Save Paper Now request")
    sub_savepapernow_file.set_defaults(
        func=run_savepapernow_file,
        auth_var="FATCAT_AUTH_WORKER_SAVEPAPERNOW",
    )
    sub_savepapernow_file.add_argument('json_file', help="ingest-file JSON file to import from", default=sys.stdin, type=argparse.FileType('r'))
    sub_savepapernow_file.add_argument('--kafka-mode', action='store_true', help="consume from kafka topic (not stdin)")

    sub_grobid_metadata = subparsers.add_parser('grobid-metadata', help="create release and file entities based on GROBID PDF metadata extraction")
    sub_grobid_metadata.set_defaults(
        func=run_grobid_metadata,
        auth_var="FATCAT_API_AUTH_TOKEN",
    )
    sub_grobid_metadata.add_argument('tsv_file', help="TSV file to import from (or stdin)", default=sys.stdin, type=argparse.FileType('r'))
    sub_grobid_metadata.add_argument('--group-size', help="editgroup group size to use", default=75, type=int)
    sub_grobid_metadata.add_argument('--longtail-oa', action='store_true', help="if this is an import of longtail OA content (sets an 'extra' flag)")
    sub_grobid_metadata.add_argument('--bezerk-mode', action='store_true', help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)")

    sub_shadow_lib = subparsers.add_parser('shadow-lib', help="create file entities from shadow library metadata")
    sub_shadow_lib.set_defaults(
        func=run_shadow_lib,
        auth_var="FATCAT_AUTH_WORKER_SHADOW",
    )
    sub_shadow_lib.add_argument('json_file', help="JSON file to import from (or stdin)", default=sys.stdin, type=argparse.FileType('r'))

    sub_wayback_static = subparsers.add_parser('wayback-static', help="crude crawl+ingest tool for single-page HTML docs from wayback")
    sub_wayback_static.set_defaults(
        func=run_wayback_static,
        auth_var="FATCAT_API_AUTH_TOKEN",
    )
    sub_wayback_static.add_argument('wayback_url', type=str, help="URL of wayback capture to extract from")
    sub_wayback_static.add_argument('--extid', type=str, help="external identifier for release lookup")
    sub_wayback_static.add_argument('--release-id', type=str, help="release entity identifier")
    sub_wayback_static.add_argument('--editgroup-id', type=str, help="use existing editgroup (instead of creating a new one)")

    sub_cdl_dash_dat = subparsers.add_parser('cdl-dash-dat', help="crude helper to import datasets from Dat/CDL mirror pilot project")
    sub_cdl_dash_dat.set_defaults(
        func=run_cdl_dash_dat,
        auth_var="FATCAT_API_AUTH_TOKEN",
    )
    sub_cdl_dash_dat.add_argument('dat_path', type=str, help="local path dat to import (must be the dat discovery key)")
    sub_cdl_dash_dat.add_argument('--release-id', type=str, help="release entity identifier")
    sub_cdl_dash_dat.add_argument('--editgroup-id', type=str, help="use existing editgroup (instead of creating a new one)")

    sub_datacite = subparsers.add_parser('datacite', help="import datacite.org metadata")
    sub_datacite.add_argument('json_file', help="File with jsonlines from datacite.org v2 API to import from", default=sys.stdin, type=argparse.FileType('r'))
    sub_datacite.add_argument('issn_map_file', help="ISSN to ISSN-L mapping file", default=None, type=argparse.FileType('r'))
    sub_datacite.add_argument('--extid-map-file', help="DOI-to-other-identifiers sqlite3 database", default=None, type=str)
    sub_datacite.add_argument('--kafka-mode', action='store_true', help="consume from kafka topic (not stdin)")
    sub_datacite.add_argument('--bezerk-mode', action='store_true', help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)")
    sub_datacite.add_argument('--debug', action='store_true', help="write converted JSON to stdout")
    sub_datacite.add_argument('--insert-log-file', default='', type=str, help="write inserted documents into file (for debugging)")
    sub_datacite.set_defaults(
        func=run_datacite,
        auth_var="FATCAT_AUTH_WORKER_DATACITE",
    )

    sub_file_meta = subparsers.add_parser('file-meta', help="simple update-only importer for file metadata")
    sub_file_meta.set_defaults(
        func=run_file_meta,
        auth_var="FATCAT_API_AUTH_TOKEN",
    )
    sub_file_meta.add_argument('json_file', help="File with jsonlines from file_meta schema to import from", default=sys.stdin, type=argparse.FileType('r'))

    args = parser.parse_args()
    if not args.__dict__.get("func"):
        print("tell me what to do!")
        sys.exit(-1)

    # allow editgroup description override via env variable (but CLI arg takes
    # precedence)
    if not args.editgroup_description_override \
            and os.environ.get('FATCAT_EDITGROUP_DESCRIPTION'):
        args.editgroup_description_override = os.environ.get('FATCAT_EDITGROUP_DESCRIPTION')

    args.api = authenticated_api(
        args.host_url,
        # token is an optional kwarg (can be empty string, None, etc)
        token=os.environ.get(args.auth_var))
    args.func(args)

def api():
    load_dotenv(dotenv_path="./example.env")
    api_client = authenticated_api("http://localhost:9411/v0")
    api_client.editor_id = "aaaaaaaaaaaabkvkaaaaaaaaae"
    return api_client

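# Usage sketch for the pytest fixture above (the test body is hypothetical):
# pytest injects the authenticated client, pinned to a known editor_id so
# that created editgroups are attributable.
def test_changelog_smoke(api):
    entries = api.get_changelog(limit=1)
    assert len(entries) <= 1
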
def main() -> None:
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--host-url", default="http://localhost:9411/v0", help="connect to this host/port")
    parser.add_argument("--kafka-hosts", default="localhost:9092", help="list of Kafka brokers (host/port) to use")
    parser.add_argument("--kafka-env", default="dev", help="Kafka topic namespace to use (eg, prod, qa)")
    parser.add_argument("--batch-size", help="size of batch to send", default=50, type=int)
    parser.add_argument("--editgroup-description-override", help="editgroup description override", default=None, type=str)
    subparsers = parser.add_subparsers()

    sub_crossref = subparsers.add_parser("crossref", help="import Crossref API metadata format (JSON)")
    sub_crossref.set_defaults(
        func=run_crossref,
        auth_var="FATCAT_AUTH_WORKER_CROSSREF",
    )
    sub_crossref.add_argument("json_file", help="crossref JSON file to import from", default=sys.stdin, type=argparse.FileType("r"))
    sub_crossref.add_argument("issn_map_file", help="ISSN to ISSN-L mapping file", default=None, type=argparse.FileType("r"))
    sub_crossref.add_argument("--no-lookup-refs", action="store_true", help="skip lookup of references (PMID or DOI)")
    sub_crossref.add_argument("--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)")
    sub_crossref.add_argument("--bezerk-mode", action="store_true", help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)")

    sub_jalc = subparsers.add_parser("jalc", help="import JALC DOI metadata from XML dump")
    sub_jalc.set_defaults(
        func=run_jalc,
        auth_var="FATCAT_AUTH_WORKER_JALC",
    )
    sub_jalc.add_argument("xml_file", help="Jalc RDF XML file (record-per-line) to import from", default=sys.stdin, type=argparse.FileType("r"))
    sub_jalc.add_argument("issn_map_file", help="ISSN to ISSN-L mapping file", default=None, type=argparse.FileType("r"))

    sub_arxiv = subparsers.add_parser("arxiv", help="import arxiv.org metadata from XML files")
    sub_arxiv.set_defaults(
        func=run_arxiv,
        auth_var="FATCAT_AUTH_WORKER_ARXIV",
    )
    sub_arxiv.add_argument("xml_file", nargs="?", help="arXivRaw XML file to import from", default=sys.stdin, type=argparse.FileType("r"))
    sub_arxiv.add_argument("--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)")

    sub_pubmed = subparsers.add_parser("pubmed", help="import MEDLINE/PubMed work-level metadata (XML)")
    sub_pubmed.set_defaults(
        func=run_pubmed,
        auth_var="FATCAT_AUTH_WORKER_PUBMED",
    )
    sub_pubmed.add_argument("xml_file", nargs="?", help="Pubmed XML file to import from", default=sys.stdin, type=argparse.FileType("r"))
    sub_pubmed.add_argument("issn_map_file", help="ISSN to ISSN-L mapping file", default=None, type=argparse.FileType("r"))
    sub_pubmed.add_argument("--no-lookup-refs", action="store_true", help="skip lookup of references (PMID or DOI)")
    sub_pubmed.add_argument("--do-updates", action="store_true", help="update pre-existing release entities")
    sub_pubmed.add_argument("--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)")

    sub_jstor = subparsers.add_parser("jstor", help="import JSTOR work-level metadata from XML dump")
    sub_jstor.set_defaults(
        func=run_jstor,
        auth_var="FATCAT_AUTH_WORKER_JSTOR",
    )
    sub_jstor.add_argument("list_file", help="List of JSTOR XML file paths to import from", default=sys.stdin, type=argparse.FileType("r"))
    sub_jstor.add_argument("issn_map_file", help="ISSN to ISSN-L mapping file", default=None, type=argparse.FileType("r"))

    sub_orcid = subparsers.add_parser("orcid", help="import creator entities from ORCID XML dump")
    sub_orcid.set_defaults(func=run_orcid, auth_var="FATCAT_AUTH_WORKER_ORCID")
    sub_orcid.add_argument("json_file", help="orcid JSON file to import from (or stdin)", default=sys.stdin, type=argparse.FileType("r"))

    sub_journal_metadata = subparsers.add_parser("journal-metadata", help="import/update container metadata from old manual munging format")
    sub_journal_metadata.set_defaults(
        func=run_journal_metadata,
        auth_var="FATCAT_AUTH_WORKER_JOURNAL_METADATA",
    )
    sub_journal_metadata.add_argument("json_file", help="Journal JSON metadata file to import from (or stdin)", default=sys.stdin, type=argparse.FileType("r"))

    sub_chocula = subparsers.add_parser("chocula", help="import/update container metadata from chocula JSON export")
    sub_chocula.set_defaults(
        func=run_chocula,
        auth_var="FATCAT_AUTH_WORKER_JOURNAL_METADATA",
    )
    sub_chocula.add_argument("json_file", help="chocula JSON entities file (or stdin)", default=sys.stdin, type=argparse.FileType("r"))
    sub_chocula.add_argument("--do-updates", action="store_true", help="update pre-existing container entities")

    sub_matched = subparsers.add_parser("matched", help="add file entities matched against existing releases; custom JSON format")
    sub_matched.set_defaults(
        func=run_matched,
        auth_var="FATCAT_API_AUTH_TOKEN",
    )
    sub_matched.add_argument("json_file", help="JSON file to import from (or stdin)", default=sys.stdin, type=argparse.FileType("r"))
    sub_matched.add_argument("--default-mimetype", default=None, help="default mimetype for imported files (if not specified per-file)")
    sub_matched.add_argument("--bezerk-mode", action="store_true", help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)")
    sub_matched.add_argument("--default-link-rel", default="web", help="default URL rel for matches (eg, 'publisher', 'web')")

    sub_arabesque_match = subparsers.add_parser("arabesque", help="add file entities matched to releases from crawl log analysis")
    sub_arabesque_match.set_defaults(
        func=run_arabesque_match,
        auth_var="FATCAT_AUTH_WORKER_CRAWL",
    )
    sub_arabesque_match.add_argument("--sqlite-file", help="sqlite database file to import from")
    sub_arabesque_match.add_argument("--json-file", help="JSON file to import from (or stdin)", type=argparse.FileType("r"))
    sub_arabesque_match.add_argument("--do-updates", action="store_true", help="update pre-existing file entities if new match (instead of skipping)")
    sub_arabesque_match.add_argument("--no-require-grobid", action="store_true", help="whether postproc_status column must be '200'")
    sub_arabesque_match.add_argument("--extid-type", default="doi", help="identifier type in the database (eg, 'doi', 'pmcid')")
    sub_arabesque_match.add_argument("--crawl-id", help="crawl ID (optionally included in editgroup metadata)")
    sub_arabesque_match.add_argument("--default-link-rel", default="web", help="default URL rel for matches (eg, 'publisher', 'web')")

    sub_ingest_file = subparsers.add_parser("ingest-file-results", help="add/update file entities linked to releases based on sandcrawler ingest results")
    sub_ingest_file.set_defaults(
        func=run_ingest_file,
        auth_var="FATCAT_AUTH_WORKER_CRAWL",
    )
    sub_ingest_file.add_argument("json_file", help="ingest_file JSON file to import from", default=sys.stdin, type=argparse.FileType("r"))
    sub_ingest_file.add_argument("--skip-source-allowlist", action="store_true", help="don't filter import based on request source allowlist")
    sub_ingest_file.add_argument("--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)")
    sub_ingest_file.add_argument("--do-updates", action="store_true", help="update pre-existing file entities if new match (instead of skipping)")
    sub_ingest_file.add_argument("--no-require-grobid", action="store_true", help="whether postproc_status column must be '200'")
    sub_ingest_file.add_argument("--default-link-rel", default="web", help="default URL rel for matches (eg, 'publisher', 'web')")

    sub_ingest_web = subparsers.add_parser("ingest-web-results", help="add/update web entities linked to releases based on sandcrawler ingest results")
    sub_ingest_web.set_defaults(
        func=run_ingest_web,
        auth_var="FATCAT_AUTH_WORKER_CRAWL",
    )
    sub_ingest_web.add_argument("json_file", help="ingest_web JSON file to import from", default=sys.stdin, type=argparse.FileType("r"))
    sub_ingest_web.add_argument("--skip-source-allowlist", action="store_true", help="don't filter import based on request source allowlist")
    sub_ingest_web.add_argument("--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)")
    sub_ingest_web.add_argument("--do-updates", action="store_true", help="update pre-existing web entities if new match (instead of skipping)")
    sub_ingest_web.add_argument("--default-link-rel", default="web", help="default URL rel for matches (eg, 'publisher', 'web')")

    sub_ingest_fileset = subparsers.add_parser("ingest-fileset-results", help="add/update fileset entities linked to releases based on sandcrawler ingest results")
    sub_ingest_fileset.set_defaults(
        func=run_ingest_fileset,
        auth_var="FATCAT_AUTH_WORKER_CRAWL",
    )
    sub_ingest_fileset.add_argument("json_file", help="ingest_fileset JSON file to import from", default=sys.stdin, type=argparse.FileType("r"))
    sub_ingest_fileset.add_argument("--skip-source-allowlist", action="store_true", help="don't filter import based on request source allowlist")
    sub_ingest_fileset.add_argument("--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)")
    sub_ingest_fileset.add_argument("--do-updates", action="store_true", help="update pre-existing fileset entities if new match (instead of skipping)")
    sub_ingest_fileset.add_argument("--default-link-rel", default="fileset", help="default URL rel for matches (eg, 'publisher', 'web')")

    sub_ingest_fileset_file = subparsers.add_parser("ingest-fileset-file-results", help="add/update file entities linked to releases based on sandcrawler dataset/fileset ingest results")
    sub_ingest_fileset_file.set_defaults(
        func=run_ingest_fileset_file,
        auth_var="FATCAT_AUTH_WORKER_CRAWL",
    )
    sub_ingest_fileset_file.add_argument("json_file", help="ingest_fileset JSON file to import from", default=sys.stdin, type=argparse.FileType("r"))
    sub_ingest_fileset_file.add_argument("--skip-source-allowlist", action="store_true", help="don't filter import based on request source allowlist")
    sub_ingest_fileset_file.add_argument("--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)")
    sub_ingest_fileset_file.add_argument("--do-updates", action="store_true", help="update pre-existing fileset entities if new match (instead of skipping)")
    sub_ingest_fileset_file.add_argument("--default-link-rel", default="fileset", help="default URL rel for matches (eg, 'publisher', 'web')")

    sub_savepapernow_file = subparsers.add_parser("savepapernow-file-results", help="add file entities crawled due to async Save Paper Now request")
    sub_savepapernow_file.set_defaults(
        func=run_savepapernow_file,
        auth_var="FATCAT_AUTH_WORKER_SAVEPAPERNOW",
    )
    sub_savepapernow_file.add_argument("json_file", help="ingest-file JSON file to import from", default=sys.stdin, type=argparse.FileType("r"))
    sub_savepapernow_file.add_argument("--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)")

    sub_savepapernow_web = subparsers.add_parser("savepapernow-web-results", help="add webcapture entities crawled due to async Save Paper Now request")
    sub_savepapernow_web.set_defaults(
        func=run_savepapernow_web,
        auth_var="FATCAT_AUTH_WORKER_SAVEPAPERNOW",
    )
    sub_savepapernow_web.add_argument("json_file", help="ingest-file JSON file to import from", default=sys.stdin, type=argparse.FileType("r"))
    sub_savepapernow_web.add_argument("--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)")

    sub_savepapernow_fileset = subparsers.add_parser("savepapernow-fileset-results", help="add fileset entities crawled due to async Save Paper Now request")
    sub_savepapernow_fileset.set_defaults(
        func=run_savepapernow_fileset,
        auth_var="FATCAT_AUTH_WORKER_SAVEPAPERNOW",
    )
    sub_savepapernow_fileset.add_argument("json_file", help="ingest-file JSON file to import from", default=sys.stdin, type=argparse.FileType("r"))
    sub_savepapernow_fileset.add_argument("--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)")

    sub_grobid_metadata = subparsers.add_parser("grobid-metadata", help="create release and file entities based on GROBID PDF metadata extraction")
    sub_grobid_metadata.set_defaults(
        func=run_grobid_metadata,
        auth_var="FATCAT_API_AUTH_TOKEN",
    )
    sub_grobid_metadata.add_argument("tsv_file", help="TSV file to import from (or stdin)", default=sys.stdin, type=argparse.FileType("r"))
    sub_grobid_metadata.add_argument("--group-size", help="editgroup group size to use", default=75, type=int)
    sub_grobid_metadata.add_argument("--longtail-oa", action="store_true", help="if this is an import of longtail OA content (sets an 'extra' flag)")
    sub_grobid_metadata.add_argument("--bezerk-mode", action="store_true", help="don't lookup existing files, just insert (clobbers; only for fast bootstrap)")

    sub_shadow_lib = subparsers.add_parser("shadow-lib", help="create file entities from shadow library metadata")
    sub_shadow_lib.set_defaults(
        func=run_shadow_lib,
        auth_var="FATCAT_AUTH_WORKER_SHADOW",
    )
    sub_shadow_lib.add_argument("json_file", help="JSON file to import from (or stdin)", default=sys.stdin, type=argparse.FileType("r"))

    sub_datacite = subparsers.add_parser("datacite", help="import datacite.org metadata")
    sub_datacite.add_argument("json_file", help="File with jsonlines from datacite.org v2 API to import from", default=sys.stdin, type=argparse.FileType("r"))
    sub_datacite.add_argument("issn_map_file", help="ISSN to ISSN-L mapping file", default=None, type=argparse.FileType("r"))
    sub_datacite.add_argument("--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)")
    sub_datacite.add_argument("--bezerk-mode", action="store_true", help="don't lookup existing DOIs, just insert (clobbers; only for fast bootstrap)")
    sub_datacite.add_argument("--debug", action="store_true", help="write converted JSON to stdout")
    sub_datacite.add_argument("--insert-log-file", default="", type=str, help="write inserted documents into file (for debugging)")
    sub_datacite.set_defaults(
        func=run_datacite,
        auth_var="FATCAT_AUTH_WORKER_DATACITE",
    )

    sub_doaj_article = subparsers.add_parser("doaj-article", help="import doaj.org article metadata")
    sub_doaj_article.add_argument("json_file", help="File with JSON lines from DOAJ API (or bulk dump) to import from", default=sys.stdin, type=argparse.FileType("r"))
    sub_doaj_article.add_argument("--issn-map-file", help="ISSN to ISSN-L mapping file", default=None, type=argparse.FileType("r"))
    sub_doaj_article.add_argument("--kafka-mode", action="store_true", help="consume from kafka topic (not stdin)")
    sub_doaj_article.add_argument("--do-updates", action="store_true", help="update any pre-existing release entities")
    sub_doaj_article.set_defaults(
        func=run_doaj_article,
        auth_var="FATCAT_AUTH_WORKER_DOAJ",
    )

    sub_dblp_release = subparsers.add_parser("dblp-release", help="import dblp release metadata")
    sub_dblp_release.add_argument("xml_file", help="File with DBLP XML to import from", default=sys.stdin, type=argparse.FileType("rb"))
    sub_dblp_release.add_argument("--dblp-container-map-file", help="file path to dblp prefix to container_id TSV file", default=None, type=argparse.FileType("r"))
    sub_dblp_release.add_argument("--do-updates", action="store_true", help="update any pre-existing release entities")
    sub_dblp_release.add_argument("--dump-json-mode", action="store_true", help="print release entities to stdout instead of importing")
    sub_dblp_release.set_defaults(
        func=run_dblp_release,
        auth_var="FATCAT_AUTH_WORKER_DBLP",
    )

    sub_dblp_container = subparsers.add_parser("dblp-container", help="import dblp container metadata")
    sub_dblp_container.add_argument("json_file", help="File with DBLP container JSON to import from (see extra/dblp/)", default=sys.stdin, type=argparse.FileType("rb"))
    sub_dblp_container.add_argument("--dblp-container-map-file", help="file path to dblp pre-existing prefix to container_id TSV file", default=None, type=argparse.FileType("r"))
    sub_dblp_container.add_argument("--dblp-container-map-output", help="file path to output new dblp container map TSV to", default=None, type=argparse.FileType("w"))
    sub_dblp_container.add_argument("--issn-map-file", help="ISSN to ISSN-L mapping file", default=None, type=argparse.FileType("r"))
    sub_dblp_container.add_argument("--do-updates", action="store_true", help="update any pre-existing container entities")
    sub_dblp_container.set_defaults(
        func=run_dblp_container,
        auth_var="FATCAT_AUTH_WORKER_DBLP",
    )

    sub_file_meta = subparsers.add_parser("file-meta", help="simple update-only importer for file metadata")
    sub_file_meta.set_defaults(
        func=run_file_meta,
        auth_var="FATCAT_API_AUTH_TOKEN",
    )
    sub_file_meta.add_argument("json_file", help="File with jsonlines from file_meta schema to import from", default=sys.stdin, type=argparse.FileType("r"))

    sub_fileset = subparsers.add_parser("fileset", help="generic fileset importer")
    sub_fileset.set_defaults(
        func=run_fileset,
        auth_var="FATCAT_API_AUTH_TOKEN",
    )
    sub_fileset.add_argument("json_file", help="File with jsonlines of fileset entities to import", default=sys.stdin, type=argparse.FileType("r"))
    sub_fileset.add_argument("--skip-release-fileset-check", action="store_true", help="create without checking if releases already have related filesets")

    args = parser.parse_args()
    if not args.__dict__.get("func"):
        print("tell me what to do!")
        sys.exit(-1)

    # allow editgroup description override via env variable (but CLI arg takes
    # precedence)
    if not args.editgroup_description_override and os.environ.get("FATCAT_EDITGROUP_DESCRIPTION"):
        args.editgroup_description_override = os.environ.get("FATCAT_EDITGROUP_DESCRIPTION")

    args.api = authenticated_api(
        args.host_url,
        # token is an optional kwarg (can be empty string, None, etc)
        token=os.environ.get(args.auth_var),
    )
    sentry_sdk.init()
    args.func(args)

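# Invocation sketch for one of the importer subcommands above (the script
# name and file paths are assumptions):
#
#   FATCAT_AUTH_WORKER_CROSSREF=<token> ./fatcat_import.py crossref crossref_works.json ISSN-to-ISSN-L.txt
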