Exemplo n.º 1
0
def t_find_pdf_primarily(self, pub_id: str, title: str, authors: list,
                         url: str):
    """Download a publication's PDF directly from *url* and queue indexing,
    or fall back to a secondary search when no url is available.

    :param pub_id: publication identifier used in the Mongo filter.
    :param title: publication title, forwarded to the fallback task.
    :param authors: author ids, resolved to names for the fallback task.
    :param url: direct PDF url; a falsy value triggers the fallback path.
    :returns: result dict with "status"; on the download path also "path"
        and, if text extraction failed, "extraction_failure".
    """
    resd = {"status": "ok"}

    if url:
        files_path = get_config("FILES_PATH")

        # Deterministic on-disk name derived from the url.
        file_name = md5(url.encode("utf-8")).hexdigest()

        # exist_ok avoids a race when concurrent tasks create the dir.
        os.makedirs(files_path, exist_ok=True)

        pdf_raw = download(url)

        # os.path.join instead of manual separator concatenation.
        full_path = os.path.join(files_path, f'{file_name}.pdf')

        with open(full_path, "wb+") as f:
            f.write(pdf_raw)

        resd["path"] = full_path

        try:
            content = extract_text_from_pdf(full_path)
        except Exception as e:
            # Best-effort extraction: record the failure and continue.
            resd["extraction_failure"] = str(e)
            logger.debug(e)
            content = None

        update_result = update_one("publication", {
            "filter": {"id": {"$eq": pub_id}},
            "update": {
                "$set": {
                    "raw_base64": base64.encodebytes(pdf_raw).decode("utf-8"),
                    "content": content
                }
            },
            "upsert": True
        })

        logger.info(f'Update Result: {update_result}')

        t_elasticsearch_indexing.apply_async((pub_id,))
    else:
        # No direct url: resolve author names (distinct local name so the
        # incoming id list is not shadowed) and try the secondary search.
        author_docs = find("author", {
            "filter": {"id": {"$in": authors}},
            "projection": {"name": 1}
        })
        t_find_pdf_secondarily.apply_async(
            (pub_id, title, [a["name"] for a in author_docs])
        )

    return resd
Exemplo n.º 2
0
import base64
def t_elasticsearch_indexing(self, pub_id: str):
    """Build per-language title/content fields for a publication and index
    the document into Elasticsearch.

    :param pub_id: publication id; the Mongo filter also requires the
        document to already carry a "vector" field.
    :returns: dict with "status" and the ES index "result".
    """
    resd = {"status": "ok"}

    # NOTE(review): find_one returns None when nothing matches — the
    # subscripting below would then raise. TODO confirm callers only
    # enqueue this task for existing, vectorized publications.
    pub = find_one("publication", {
        "filter": {"id": {"$eq": pub_id}, "vector": {"$exists": True}}
    })

    # Replace the stored author-id list with embedded author documents.
    pub["authors"] = find("author", {
        "filter": {"id": {"$in": pub.get("authors", [])}},
        "projection": ["id", "name", "affiliation", "citedby", "interests",
                       "organizations"]
    })

    # Strip fields that must not be sent to Elasticsearch.
    for key in ("created_at", "raw_base64", "title_md5", "_id"):
        pub.pop(key, None)

    pub_id = pub.pop("id")

    # Concatenate content and any non-placeholder title into one text
    # blob used for language detection and vectorization.
    vector_field_tokens = []
    if pub.get("content"):
        vector_field_tokens += pub["content"].split()
    if not pub["title"].startswith("unk_"):
        vector_field_tokens += pub["title"].split()

    vector_field = " ".join(vector_field_tokens)
    vectorizer_response = get_vector(preprocess_text(vector_field))

    pub["lang"] = vectorizer_response["lang"]
    pub["vector"] = vectorizer_response["vector"]

    langs = get_config("LANGUAGES")

    # Emit title_<lang>/content_<lang>, translating when the target
    # language differs from the detected one.
    for lang in langs:
        if lang != pub["lang"]:
            pub[f'title_{lang}'] = preprocess_text(
                translate(pub["title"], lang) or ""
            )
            if str(pub.get("content", None)).strip().lower() not in ["none", ""]:
                pub[f'content_{lang}'] = preprocess_text(
                    translate(pub["content"], lang) or ""
                )
        else:
            pub[f'title_{lang}'] = preprocess_text(pub["title"])
            pub[f'content_{lang}'] = preprocess_text(pub.get("content",
                                                             "") or "")

    # Raw title/content are superseded by the per-language fields.
    pub.pop("title", None)
    pub.pop("content", None)

    update_one("publication", {
        "filter": {"id": {"$eq": pub_id}},
        "update": {"$set": {"vector": pub["vector"],
                            "lang": pub["lang"]}}
    })

    # NOTE: mutates the module-level mapping dict; harmless to repeat
    # since the same keys are re-assigned the same values.
    for lang in langs:
        publication_mappings["properties"][f'title_{lang}'] = {"type": "text"}
        publication_mappings["properties"][f'content_{lang}'] = {"type": "text"}

    # Try to create the index with mappings; a 400 means it already
    # exists, so fall back to updating the mappings in place.
    resp = rq.put(
        get_config("ELASTICSEARCH") + "/publication",
        json={"mappings": publication_mappings}
    )
    if resp.status_code == 400:
        resp = rq.put(
            get_config("ELASTICSEARCH") + "/publication/_mappings",
            json=publication_mappings
        )

    logger.info(f'Mapping Response: {resp.json()}')

    result = es.index(index="publication", body=pub, id=pub_id)
    resd["result"] = result
    return resd
Exemplo n.º 4
0
from application import celery, logger
Exemplo n.º 5
0
from threading import Thread
Exemplo n.º 6
0
from flask import Blueprint, request
Exemplo n.º 7
0
def t_authors_scraper(self):
    """Crawl every organization's author listing and synchronize authors.

    Authors not yet in Mongo are scraped asynchronously via t_get_author;
    known authors get the organization's domain added to their
    "organizations" set in one batched update per organization.

    :returns: dict with "status" and the number of organizations visited.
    """
    organizations = find("organization")  # Get all organizations.

    len_organizations = len(organizations)
    logger.info(f'There are {len_organizations} organizations.')

    # NOTE(review): hard-coded proxy pool — presumably these rotate to
    # dodge rate limits; consider moving to config.
    proxies = [
        "http://192.116.142.153:8080", "http://192.241.149.83:80",
        "http://192.241.150.4:80"
    ]

    for org in organizations:
        logger.info(f'Starting for <{org["domain"]}>')

        tree, org_href = get_organization_page(org["domain"], choice(proxies))

        counter = 10  # pagination offset; listing pages step by 10
        updates = []
        while True:
            authors = get_authors(tree)
            if authors is None:  # no further result pages
                break

            for author in authors:
                author = parse_author(author)

                author_in_mongo = find_one(
                    "author", {"filter": {
                        "id": {
                            "$eq": author["id"]
                        }
                    }})

                if not author_in_mongo:
                    # Unknown author: scrape the full profile async.
                    logger.info(f'Starting for <{author["name"]}>')
                    t_get_author.apply_async((author, org["domain"]))
                else:
                    updates.append(author["id"])

            # Rotate the proxy for every page fetch.
            tree = get_next_page(tree, counter, org_href, choice(proxies))
            counter += 10

        # Skip the DB round-trip when nothing needs updating.
        if updates:
            update_many(
                "author", {
                    "filter": {
                        "id": {
                            "$in": updates
                        }
                    },
                    "update": {
                        "$addToSet": {
                            "organizations": org["domain"]
                        }
                    }
                })

    return {"status": "ok", "num_organizations": len_organizations}