Example #1
def test_issue_db_basics() -> None:

    api_conf = fatcat_openapi_client.Configuration()
    api_conf.host = settings.FATCAT_API_HOST
    api = fatcat_openapi_client.DefaultApi(
        fatcat_openapi_client.ApiClient(api_conf))

    es_client = elasticsearch.Elasticsearch(settings.ELASTICSEARCH_FATCAT_BASE)

    issue_db = IssueDB(settings.SCHOLAR_ISSUEDB_PATH)
    issue_db.init_db()

    with open("tests/files/sim_collections.json", "r") as f:
        issue_db.load_pubs(f.readlines(), api)

    with open("tests/files/sim_items.json", "r") as f:
        issue_db.load_issues(f.readlines(), es_client)
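
A quick way to confirm that the two loaders above actually populated the database is to inspect the sqlite3 file directly. A minimal sketch, not part of the original test, that lists whatever tables IssueDB.init_db() created rather than assuming a schema (it reuses the same settings object imported by the test):

import sqlite3

conn = sqlite3.connect(settings.SCHOLAR_ISSUEDB_PATH)
# sqlite_master is the built-in catalog table, so no table names are assumed
for (name,) in conn.execute("SELECT name FROM sqlite_master WHERE type='table'"):
    row_count = conn.execute(f"SELECT COUNT(*) FROM {name}").fetchone()[0]
    print(f"{name}: {row_count} rows")
conn.close()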
Example #2
def main() -> None:
    """
    Run this command like:

        python -m fatcat_scholar.sim_pipeline
    """

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    subparsers = parser.add_subparsers()

    parser.add_argument(
        "--issue-db-file",
        help="sqlite3 database file to open",
        default="data/issue_db.sqlite",
        type=str,
    )

    sub = subparsers.add_parser("run_issue_db",
                                help="iterates through entire IssueDB")
    sub.set_defaults(func="run_issue_db")
    sub.add_argument("--limit",
                     help="maximum number of pages to index",
                     type=int)

    args = parser.parse_args()
    if not args.__dict__.get("func"):
        print("tell me what to do! (try --help)")
        sys.exit(-1)

    sp = SimPipeline(issue_db=IssueDB(args.issue_db_file))

    if args.func == "run_issue_db":
        sp.run_issue_db(limit=args.limit)
    else:
        func = getattr(sp, args.func)
        func()
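
The dispatch at the end simply maps the subcommand name onto a SimPipeline method, so the same work can be driven without the CLI wrapper. A minimal sketch, assuming IssueDB and SimPipeline are importable from fatcat_scholar.issue_db and fatcat_scholar.sim_pipeline:

from fatcat_scholar.issue_db import IssueDB
from fatcat_scholar.sim_pipeline import SimPipeline

# Equivalent of: python -m fatcat_scholar.sim_pipeline run_issue_db --limit 100
sp = SimPipeline(issue_db=IssueDB("data/issue_db.sqlite"))
sp.run_issue_db(limit=100)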
Example #3
def main() -> None:
    """
    Run this command like:

        python -m fatcat_scholar.work_pipeline
    """

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    subparsers = parser.add_subparsers()

    parser.add_argument(
        "--issue-db-file",
        help="sqlite3 database file to open",
        default=settings.SCHOLAR_ISSUEDB_PATH,
        type=str,
    )
    parser.add_argument(
        "--sandcrawler-db-api",
        help="Sandcrawler Postgrest API endpoint",
        default=settings.SANDCRAWLER_DB_API,
        type=str,
    )
    parser.add_argument(
        "--sandcrawler-s3-api",
        help="Sandcrawler S3 (minio/seaweedfs) API endpoint",
        default=settings.SANDCRAWLER_S3_API,
        type=str,
    )

    sub = subparsers.add_parser("fetch-docs-worker", )
    sub.set_defaults(worker="fetch-docs-worker")

    sub = subparsers.add_parser("index-docs-worker", )
    sub.set_defaults(worker="index-docs-worker")

    args = parser.parse_args()
    if not args.__dict__.get("worker"):
        parser.print_help(file=sys.stderr)
        sys.exit(-1)

    if args.worker == "fetch-docs-worker":
        issue_db = IssueDB(args.issue_db_file)
        wp = WorkPipeline(
            issue_db=issue_db,
            sandcrawler_db_client=SandcrawlerPostgrestClient(
                api_url=args.sandcrawler_db_api),
            sandcrawler_s3_client=SandcrawlerMinioClient(
                host_url=args.sandcrawler_s3_api,
                access_key=os.environ.get("MINIO_ACCESS_KEY"),
                secret_key=os.environ.get("MINIO_SECRET_KEY"),
            ),
        )
        sp = SimPipeline(issue_db=issue_db)
        fdw = FetchDocsWorker(
            kafka_brokers=settings.KAFKA_BROKERS,
            consume_topics=[
                f"fatcat-{settings.SCHOLAR_ENV}.work-ident-updates",
                # TODO: f"scholar-{settings.SCHOLAR_ENV}.sim-updates",
            ],
            consumer_group=f"scholar-{settings.SCHOLAR_ENV}-fetch-workers",
            work_pipeline=wp,
            sim_pipeline=sp,
            produce_docs_topic=f"scholar-{settings.SCHOLAR_ENV}.update-docs",
            fatcat_api_host=settings.FATCAT_API_HOST,
        )
        fdw.run()
    elif args.worker == "index-docs-worker":
        es_client = elasticsearch.Elasticsearch(
            settings.ELASTICSEARCH_WRITE_BASE, timeout=25.0)
        idw = IndexDocsWorker(
            kafka_brokers=settings.KAFKA_BROKERS,
            batch_size=settings.INDEX_WORKER_BATCH_SIZE,
            consume_topics=[f"scholar-{settings.SCHOLAR_ENV}.update-docs"],
            consumer_group=f"scholar-{settings.SCHOLAR_ENV}-index-workers",
            es_client=es_client,
            es_index=settings.ELASTICSEARCH_WRITE_FULLTEXT_INDEX,
        )
        idw.run()
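
The fetch-docs-worker branch reads its S3 credentials from the MINIO_ACCESS_KEY and MINIO_SECRET_KEY environment variables (the os.environ.get() calls above), so both must be set before the worker starts. A sketch with placeholder values; the real credentials depend on the deployment:

import os

# Placeholder credentials for a local minio/seaweedfs endpoint.
# Only these two variable names are read by the worker above.
os.environ["MINIO_ACCESS_KEY"] = "example-access-key"
os.environ["MINIO_SECRET_KEY"] = "example-secret-key"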
Example #4
def main() -> None:
    """
    Run this command like:

        python -m fatcat_scholar.work_pipeline
    """

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    subparsers = parser.add_subparsers()

    parser.add_argument(
        "--issue-db-file",
        help="sqlite3 database file to open",
        default="data/issue_db.sqlite",
        type=str,
    )
    parser.add_argument(
        "--sandcrawler-db-api",
        help="Sandcrawler Postgrest API endpoint",
        default=settings.SANDCRAWLER_DB_API,
        type=str,
    )
    parser.add_argument(
        "--sandcrawler-s3-api",
        help="Sandcrawler S3 (minio/seaweedfs) API endpoint",
        default=settings.SANDCRAWLER_S3_API,
        type=str,
    )

    sub = subparsers.add_parser(
        "run_releases",
        help="takes expanded release entity JSON, sorted by work_ident")
    sub.set_defaults(func="run_releases")
    sub.add_argument(
        "json_file",
        help="release entities, as JSON-lines",
        nargs="?",
        default=sys.stdin,
        type=argparse.FileType("r"),
    )

    args = parser.parse_args()
    if not args.__dict__.get("func"):
        parser.print_help(file=sys.stderr)
        sys.exit(-1)

    if settings.SENTRY_DSN:
        sentry_sdk.init(
            dsn=settings.SENTRY_DSN,
            environment=settings.SCHOLAR_ENV,
            max_breadcrumbs=10,
            release=GIT_REVISION,
        )

    wp = WorkPipeline(
        issue_db=IssueDB(args.issue_db_file),
        sandcrawler_db_client=SandcrawlerPostgrestClient(
            api_url=args.sandcrawler_db_api),
        sandcrawler_s3_client=SandcrawlerMinioClient(
            host_url=args.sandcrawler_s3_api,
            access_key=os.environ.get("MINIO_ACCESS_KEY"),
            secret_key=os.environ.get("MINIO_SECRET_KEY"),
        ),
    )

    if args.func == "run_releases":
        wp.run_releases(args.json_file)
    else:
        func = getattr(wp, args.func)
        func()
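
For the run_releases path, the json_file argument is any iterable of JSON lines (stdin by default). A minimal sketch feeding it an open file instead, reusing the WorkPipeline constructed above; the filename is a made-up placeholder:

# Equivalent of: python -m fatcat_scholar.work_pipeline run_releases releases.json
with open("releases.json", "r") as f:
    wp.run_releases(f)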
Example #5
@responses.activate
def test_run_transform(mocker: Any) -> None:

    issue_db = IssueDB(settings.SCHOLAR_ISSUEDB_PATH)
    issue_db.init_db()

    responses.add(
        responses.GET,
        "http://disabled-during-tests-bogus.xyz:3333/grobid?sha1hex=eq.bca1531b0562c6d72e0c283c1ccb97eb5cb02117",
        status=200,
        json=[
            {
                "sha1hex": "bca1531b0562c6d72e0c283c1ccb97eb5cb02117",
                "updated": "2019-11-30T04:44:00+00:00",
                "grobid_version": "0.5.5-fatcat",
                "status_code": 200,
                "status": "success",
                "fatcat_release": "hsmo6p4smrganpb3fndaj2lon4",
                "metadata": {
                    "biblio": {
                        "doi": "10.7717/peerj.4375",
                        "date": "2018-02-13",
                        "title": "Distributed under Creative Commons CC-BY 4.0 The state of OA: a large-scale analysis of the prevalence and impact of Open Access articles",
                        "authors": [],
                    },
                    "language_code": "en",
                    "grobid_timestamp": "2019-11-30T04:44+0000",
                },
            }
        ],
    )

    responses.add(
        responses.GET,
        "http://disabled-during-tests-bogus.xyz:3333/pdf_meta?sha1hex=eq.bca1531b0562c6d72e0c283c1ccb97eb5cb02117",
        status=200,
        json=[
            {
                "sha1hex": "bca1531b0562c6d72e0c283c1ccb97eb5cb02117",
                "updated": "2020-07-07T02:15:52.98309+00:00",
                "status": "success",
                "has_page0_thumbnail": True,
                "page_count": 23,
                "word_count": 10534,
                "page0_height": 792,
                "page0_width": 612,
                "permanent_id": "52f2164b9cc9e47fd150e7ee389b595a",
                "pdf_created": "2018-02-09T06:06:06+00:00",
                "pdf_version": "1.5",
                "metadata": {
                    "title": "",
                    "author": "",
                    "creator": "River Valley",
                    "subject": "Legal Issues, Science Policy, Data Science",
                    "producer": "pdfTeX-1.40.16",
                },
            }
        ],
    )

    es_raw = mocker.patch("fatcat_scholar.work_pipeline.WorkPipeline.fetch_file_grobid")
    es_raw.side_effect = [
        {"tei_xml": "<xml>dummy", "release_ident": "asdf123", "file_ident": "xyq9876"},
    ]

    wp = WorkPipeline(
        issue_db=issue_db,
        sandcrawler_db_client=SandcrawlerPostgrestClient(
            api_url=settings.SANDCRAWLER_DB_API
        ),
        sandcrawler_s3_client=SandcrawlerMinioClient(
            host_url=settings.SANDCRAWLER_S3_API
        ),
    )

    with open("tests/files/release_hsmo6p4smrganpb3fndaj2lon4_sans.json", "r") as f:
        wp.run_releases(f.readlines())
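
Because the Postgrest lookups go through the responses mock, one way to see which of the registered URLs the pipeline actually hit is to inspect responses.calls at the end of the test. A small sketch, not part of the original test, meant to be appended after run_releases():

# Each entry pairs the outgoing request with the mocked response it received
for call in responses.calls:
    print(call.request.url, call.response.status_code)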
Example #6
def main() -> None:
    """
    Run this command like:

        python -m fatcat_scholar.sim_pipeline
    """

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    subparsers = parser.add_subparsers()

    parser.add_argument(
        "--issue-db-file",
        help="sqlite3 database file to open",
        default="data/issue_db.sqlite",
        type=str,
    )

    sub = subparsers.add_parser("run_issue_db",
                                help="iterates through entire IssueDB")
    sub.set_defaults(func="run_issue_db")
    sub.add_argument("--limit",
                     help="maximum number of pages to index",
                     type=int)

    sub = subparsers.add_parser(
        "run_print_issues",
        help="dumps issues as simple TSV rows (for parallel processing)",
    )
    sub.set_defaults(func="run_print_issues")

    sub = subparsers.add_parser("run_fetch_issue",
                                help="fetches pages for given issue item")
    sub.add_argument("issue_item", type=str)
    sub.add_argument("pub_collection", type=str)
    sub.set_defaults(func="run_fetch_issue")

    args = parser.parse_args()
    if not args.__dict__.get("func"):
        parser.print_help(file=sys.stderr)
        sys.exit(-1)

    if settings.SENTRY_DSN:
        sentry_sdk.init(
            dsn=settings.SENTRY_DSN,
            environment=settings.SCHOLAR_ENV,
            max_breadcrumbs=10,
            release=GIT_REVISION,
        )

    sp = SimPipeline(issue_db=IssueDB(args.issue_db_file))

    if args.func == "run_issue_db":
        sp.run_issue_db(limit=args.limit)
    elif args.func == "run_print_issues":
        sp.run_print_issues()
    elif args.func == "run_fetch_issue":
        sp.run_fetch_issue(issue_item=args.issue_item,
                           pub_collection=args.pub_collection)
    else:
        func = getattr(sp, args.func)
        func()
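
run_print_issues and run_fetch_issue are intended as a pair: the former dumps issues as TSV rows so that each issue can be fetched in a separate process. A minimal sketch of calling the fetch step directly, assuming the same imports as the earlier SimPipeline sketch; the identifiers below are made-up placeholders, with real values coming from the run_print_issues output:

# Equivalent of: python -m fatcat_scholar.sim_pipeline run_fetch_issue <issue_item> <pub_collection>
sp = SimPipeline(issue_db=IssueDB("data/issue_db.sqlite"))
sp.run_fetch_issue(
    issue_item="sim_example-journal_1970-01_01_1",  # placeholder issue item
    pub_collection="sim_example-journal",           # placeholder publication collection
)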