Example #1
def search(index: str, text: str):
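    # Hybrid search: documents matching any per-language title/content field
    # are scored by cosine similarity between the query vector and the stored
    # 'vector' field. (`rq` appears to be `requests` under an alias;
    # get_config, preprocess_text and get_vector are project helpers.)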
    query = preprocess_text(text.strip().lower())

    vector = get_vector(text)["vector"]

    langs = get_config("LANGUAGES")

    search_fields = list()
    for lang in langs:
        search_fields += [f'title_{lang}', f'content_{lang}']

    query_json = {
        "_source": ["url", "authors", "citedby", "year", "lang"] +
        [f'title_{l}' for l in langs],
        "query": {
            "script_score": {
                "query": {
                    "bool": {
                        "should": [{
                            "match": {
                                f: query
                            }
                        } for f in search_fields]
                    }
                },
                "script": {
                    "source":
                    "cosineSimilarity(params.query_vector, 'vector') + 1.0",
                    "params": {
                        "query_vector": vector
                    }
                }
            }
        },
        "highlight": {
            "fragment_size": 100,
            "fields": {f: {}
                       for f in search_fields}
        },
        "size":
        100
    }

    url = get_config("ELASTICSEARCH") + f'/{index}/_search'
    response = rq.get(url, json=query_json).json()

    logger.info(f'Resp: {response}')

    return response.get("hits", {}).get("hits", [])
Example #2
def search_publication():
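    # Search publications for the requested text and aggregate their authors:
    # scores are summed per author, publication titles are collected, and only
    # the first 100 distinct authors are kept.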
    pubs = search("publication", request.args["text"])

    langs = get_config("LANGUAGES")
    authors = AutoSortedDict(sort_field="score")
    author_count, pub_count = 0, 0
    for pub in pubs:
        pub_count += 1
        pub.pop("_index")
        pub_lang = pub["_source"]["lang"]
        pub_authors = pub["_source"].get("authors", list())
        for pub_author in pub_authors:
            author_id = pub_author.pop("id")
            if authors.get(author_id, None) is not None:
                current_author = authors[author_id]
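                # Delete and re-insert the entry below so the AutoSortedDict
                # re-sorts it by the updated score.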

                del authors[author_id]

                current_author["pub_counts"] += 1
                current_author["score"] += pub["_score"]
                current_author["pubs"].append({
                    "id":
                    pub["_id"],
                    "title":
                    pub["_source"][f'title_{pub_lang}']
                })

                authors[author_id] = current_author
            elif author_count < 100:
                author_count += 1
                pub_author["score"] = pub["_score"]
                pub_author["pubs"] = [{
                    "id":
                    pub["_id"],
                    "title":
                    pub["_source"][f'title_{pub_lang}']
                }]
                pub_author["pub_counts"] = 1
                authors[author_id] = pub_author

        pub["_source"]["title"] = pub["_source"].get(f'title_{pub_lang}',
                                                     "unknown")

        for k in langs:
            pub["_source"].pop(f'title_{k}', None)
            pub["_source"].pop(f'content_{k}', None)

    return jsonify({
        "authors": {
            "count": author_count,
            "items": authors.values()[:100]
        },
        "pubs": {
            "count": pub_count,
            "items": pubs
        }
    })
Example #3
def get_vector(text: str):
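    # Ask the vectorizer service for the embedding (and detected language) of
    # the given text.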
    url = get_config("VECTORIZER")
    url += "/vectorize"

    response = requests.get(
        url=url,
        json={"text": text}
    ).json()

    return response
Example #4
def lang_detect(text):
    url = get_config("VECTORIZER")
    url += "/detect/lang"

    response = requests.get(
        url=url,
        json={"text": text}
    ).json()

    return response.get("lang", "unknown")
Example #5
def get_docs(ids, projections=None):
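    # Fetch documents by id across all indices; `projections` optionally
    # restricts the returned _source fields.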
    q = {"query": {"ids": {"values": list(ids)}}}

    if projections:
        q["_source"] = projections

    url = get_config("ELASTICSEARCH") + "/_search"
    response = rq.get(url, json=q).json()

    return response.get("hits", {}).get("hits", [])
Example #6
def translate(text, dest_lang):
    url = get_config("VECTORIZER")
    url += "/translate"

    response = requests.get(
        url=url,
        json={"text": text, "dest_lang": dest_lang}
    ).json()

    return response.get("text", None)
Example #7
def t_find_pdf_primarily(self, pub_id: str, title: str, authors: list,
                         url: str):
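    # If a direct URL is known: download the PDF, store the raw file and its
    # extracted text on the publication record, then queue Elasticsearch
    # indexing. Otherwise, fall back to the author-based secondary search.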
    resd = {"status": "ok"}

    if url:
        files_path = get_config("FILES_PATH")

        file_name = md5(url.encode("utf-8")).hexdigest()

        if not os.path.exists(files_path):
            os.makedirs(files_path)

        pdf_raw = download(url)

        full_path = f'{files_path}{os.path.sep}{file_name}.pdf'

        with open(full_path, "wb+") as f:
            f.write(pdf_raw)

        resd["path"] = full_path

        try:
            content = extract_text_from_pdf(full_path)
        except Exception as e:
            resd["extraction_failure"] = str(e)
            logger.debug(e)
            content = None

        update_result = update_one("publication", {
            "filter": {"id": {"$eq": pub_id}},
            "update": {
                "$set": {
                    "raw_base64": base64.encodebytes(pdf_raw).decode("utf-8"),
                    "content": content
                }
            },
            "upsert": True
        })

        logger.info(f'Update Result: {update_result}')

        t_elasticsearch_indexing.apply_async((pub_id,))
    else:
        authors = find("author", {
            "filter": {"id": {"$in": authors}},
            "projection": {"name": 1}
        })
        t_find_pdf_secondarily.apply_async(
            (pub_id, title, [a["name"] for a in authors])
        )

    return resd
Example #8
def translate(text, dest_lang):
    url = get_config("VECTORIZER")
    url += "/translate"

    try:
        response = requests.get(
            url=url,
            json={"text": text, "dest_lang": dest_lang}
        ).json().get("text", None)
    except Exception:
        response = False

    return response
Example #9
def update_vector(index, _id, vector, rcoef, relevance):
    logger.info(f'{type(relevance)}: {relevance}')

    sign = "+" if str(relevance).strip().lower() == "true" else "-"

    inline = "for (int i=0; i<ctx._source.vector.length; ++i){ctx._source.vector[i]=(ctx._source.vector[i]" + sign + "(params.vector[i]*params.rcoef))/2}"

    q = {
        "script": {
            "lang": "painless",
            "params": {
                "vector": list(vector),
                "rcoef": rcoef
            },
            "inline": inline
        }
    }

    response = rq.post(get_config("ELASTICSEARCH") + f'/{index}/_update/{_id}',
                       json=q).json()
    logger.info(response)

    return response
Example #10
import requests
Example #11
import base64
Example #12
import json
Example #13
def t_elasticsearch_indexing(self, pub_id: str):
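    # Build the Elasticsearch document for the publication: attach author data,
    # detect its language and compute the text vector, add per-language
    # title/content fields, then index it.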
    resd = {"status": "ok"}

    pub = find_one("publication", {
        "filter": {"id": {"$eq": pub_id}, "vector": {"$exists": True}}
    })

    pub["authors"] = find("author", {
        "filter": {"id": {"$in": pub.get("authors", [])}},
        "projection": ["id", "name", "affiliation", "citedby", "interests",
                       "organizations"]
    })

    pub.pop("created_at", None)
    pub.pop("raw_base64", None)
    pub.pop("title_md5", None)
    pub.pop("_id", None)

    pub_id = pub.pop("id")

    vector_field_tokens = list()

    if pub.get("content", None):
        vector_field_tokens += pub["content"].split()
    if not pub["title"].startswith("unk_"):
        vector_field_tokens += pub["title"].split()

    vector_field = " ".join(vector_field_tokens)
    vectorizer_response = get_vector(preprocess_text(vector_field))

    pub["lang"] = vectorizer_response["lang"]
    pub["vector"] = vectorizer_response["vector"]

    langs = get_config("LANGUAGES")
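    # Store a preprocessed title (and, when present, content) per configured
    # language; fields in languages other than the detected one are
    # machine-translated.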

    for lang in langs:
        if lang != pub["lang"]:
            pub[f'title_{lang}'] = preprocess_text(
                translate(pub["title"], lang) or ""
            )
            if str(pub.get("content", None)).strip().lower() not in ["none", ""]:
                pub[f'content_{lang}'] = preprocess_text(
                    translate(pub["content"], lang) or ""
                )
        else:
            pub[f'title_{lang}'] = preprocess_text(pub["title"])
            pub[f'content_{lang}'] = preprocess_text(pub.get("content",
                                                             "") or "")

    if "title" in pub: del pub["title"]
    if "content" in pub: del pub["content"]

    update_one("publication", {
        "filter": {"id": {"$eq": pub_id}},
        "update": {"$set": {"vector": pub["vector"],
                            "lang": pub["lang"]}}
    })

    for lang in langs:
        publication_mappings["properties"][f'title_{lang}'] = {"type": "text"}
        publication_mappings["properties"][f'content_{lang}'] = {"type": "text"}
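    # Try to create the index with the extended mappings; if it already exists
    # (HTTP 400), only update the mappings of the existing index.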

    resp = rq.put(
        get_config("ELASTICSEARCH") + "/publication",
        json={"mappings": publication_mappings}
    )

    if resp.status_code == 400:
        resp = rq.put(
            get_config("ELASTICSEARCH") + "/publication/_mappings",
            json=publication_mappings
        )

    logger.info(f'Mapping Response: {resp.json()}')

    # resp = es.indices.create(
    #     index="publication",
    #     body={"mappings": publication_mappings},
    #     ignore=400
    # )

    result = es.index(index="publication", body=pub, id=pub_id)
    resd["result"] = result
    return resd
Example #14
def t_find_pdf_secondarily(self, pub_id: str, title: str, authors: list):
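    # Fallback PDF lookup: search libgen.is for each author, fuzzily compare
    # the result titles with the publication title and, on a close match,
    # download the PDF, store it and queue Elasticsearch indexing.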
    resd = {"status": "ok"}

    try:
        # Go through the authors one by one
        for single_author in authors:
            # Request the libgen search results for this author
            http = urllib3.PoolManager()
            response = http.request(
                'GET', 'https://libgen.is/scimag/?q=' + single_author)
            html_text = response.data
            soup = BeautifulSoup(html_text, 'html.parser')

            # Check whether the search returned any results
            try:
                total_value = str(
                    soup.find('div', attrs={
                        'style': 'float:left'
                    }).getText()).split(" ")[0]
            except Exception:
                total_value = 0
            # If the search returned nothing, move on to the next author
            if total_value == 0:
                continue

            # Compute the number of result pages (25 results per page); if the
            # division leaves a remainder, add one page so the last page is
            # not missed.
            total_page_double = int(total_value) / 25
            total_page = int(int(total_value) / 25)
            if total_page != total_page_double:
                total_page += 1

            # Walk through this author's result pages. The first page was
            # already fetched above, so it is not requested again; if no match
            # is found on the current page and there is more than one page,
            # the next page is fetched at the end of the loop below and the
            # search continues there.
            for i in range(total_page):
                counter = 0
                for row in soup.find_all('tr'):
                    # Skip the header row; it only contains the table headings
                    if counter == 0:
                        counter += 1
                        continue
                    row_item = row.find_all('td')
                    row_title = row_item[1].find_all('a')[0].text
                    # Check whether the row title resembles the requested title
                    ratio = fuzz.ratio(row_title.lower(), title.lower())

                    if ratio > 75:
                        url_for_get = row_item[4].find_all('li')
                        href = url_for_get[1].find_all('a',
                                                       href=True)[0]['href']
                        response_for_pdf = http.request('GET', href)
                        pdf_page = BeautifulSoup(response_for_pdf.data,
                                                 'html.parser')
                        pdf_url = pdf_page.find_all(
                            'td', {'align': 'center'})[0].find_all(
                                'a', href=True)[0]['href']

                        pdf_raw = download(pdf_url)

                        files_path = get_config("FILES_PATH")

                        if not os.path.exists(files_path):
                            os.makedirs(files_path)

                        file_name = md5(pdf_url.encode("utf-8")).hexdigest()

                        full_path = f'{files_path}{os.path.sep}{file_name}.pdf'

                        with open(full_path, "wb+") as f:
                            f.write(pdf_raw)

                        resd["path"] = full_path

                        try:
                            content = extract_text_from_pdf(full_path)
                        except Exception as e:
                            resd["extraction_failure"] = str(e)
                            logger.debug(e)
                            content = None

                        update_one("publication", {
                            "filter": {"id": {"$eq": pub_id}},
                            "update": {
                                "$set": {
                                    "raw_base64":
                                        base64.encodebytes(pdf_raw).decode("utf-8"),
                                    "content": content
                                }
                            },
                            "upsert": True
                        })

                        if content:
                            logger.info('Content added to the publication.')

                            t_elasticsearch_indexing.apply_async((pub_id, ))

                            return resd

                if total_page > 1:
                    response = http.request(
                        'GET', 'https://libgen.is/scimag/?q=' + single_author +
                        '&page=' + str(i + 2))
                    html_text = response.data
                    soup = BeautifulSoup(html_text, 'html.parser')
    except Exception as e:
        logger.exception(e)

    t_elasticsearch_indexing.apply_async((pub_id, ))

    return resd
Example #15
import requests