예제 #1
0
def func_dbpedia_spotlight(d):
    """
    Helper function for processing a paper in a thread with DBpedia Spotlight

    The paper is skipped when an annotation file for it already exists on
    disk. Otherwise the language of the body text is detected (falling back
    to English), the title, abstract, body text, reference entries and back
    matter are annotated, and the result is saved as JSON.

    Relies on the module-level names ``path_output``, ``folder``, ``pbar``,
    ``wa`` and ``Output``.

    :param d: content of the paper (dict providing ``paper_id``,
        ``metadata.title``, ``ref_entries`` and ``back_matter``)
    :return: result of the annotation with DBpedia Spotlight in JSON || None if the JSON annotation exists already
    """
    paper_id = d['paper_id']
    title = d["metadata"]["title"]

    # Build both candidate paths once; the first one is also the save target.
    output_file = (path_output + '/dbpedia-spotlight/' + folder + '/' +
                   paper_id + '.json')
    # NOTE(review): hard-coded entity-fishing location inside the DBpedia
    # Spotlight helper -- confirm this second check is still wanted.
    entity_fishing_file = ('/data/CORD19-Annotation-multi/entity-fishing/' +
                           folder + '/' + paper_id + '.json')

    # Skip before doing any text extraction or language detection, and advance
    # the progress bar on every skip so it stays in step with the paper count.
    if os.path.isfile(output_file) or os.path.isfile(entity_fishing_file):
        pbar.update()
        return None

    # Extract the text outside the try block: a failure here must surface as
    # itself instead of as a NameError on ``body_text`` further down.
    body_text = cotools.text(d)
    try:
        vectors = pycld2.detect(body_text, returnVectors=True)[3]
        # Language code of the first detected span.
        lang = vectors[0][3]
    # Detection failed, or no span was returned (None or out of range).
    except Exception:
        lang = 'en'

    d_json = {"paper_id": paper_id, "lang": lang}

    # Best effort: a paper without an abstract simply gets no "abstract" key.
    # NOTE(review): this also hides request failures for the abstract.
    try:
        abstract = cotools.abstract(d)
        d_json["abstract"] = wa.request_dbpedia_spotlight(abstract, lang)
    except Exception:
        pass

    d_json["title"] = wa.request_dbpedia_spotlight(title, lang)
    d_json["body_text"] = wa.request_dbpedia_spotlight(body_text, lang)

    # NOTE(review): ``lang`` is not forwarded for ref_entries and back_matter,
    # unlike title/abstract/body_text -- confirm whether that is intentional.
    d_json["ref_entries"] = {
        key: wa.request_dbpedia_spotlight(value["text"])
        for key, value in d["ref_entries"].items()
    }

    #d_json["bib_entries"] = {}
    #for key, value in d["bib_entries"].items():
    #    d_json["bib_entries"][key] = wa.request_dbpedia_spotlight(value["title"])

    # Only back-matter sections that carry a 'text' field are annotated.
    d_json["back_matter"] = [
        {'text': wa.request_dbpedia_spotlight(matter['text'])}
        for matter in d["back_matter"]
        if 'text' in matter
    ]

    Output().save_json(d_json, output_file)
    pbar.update()
    return d_json
예제 #2
0
# Size of the dataset object as reported by sys.getsizeof (shallow), followed
# by the number of papers it holds.
print(f"{sys.getsizeof(data)} bytes")

print(len(data), "papers")

# Index of the paper used for every preview below.
sample_index = 13

print()
print("How data[index] looks like:")
pprint(data[sample_index])

print()
print("How text looks like")
pprint(co.text(data[sample_index]))

print()
print("How abstract looks like")
try:
    sample_abstract = co.abstract(data[sample_index])
except KeyError:
    # A KeyError from co.abstract is treated as "this paper has no abstract".
    print("Abstract Not Found")
else:
    pprint(sample_abstract)

# Disabled experiments with the bulk abstract helpers:
# pprint(co.abstracts(data[14:18]))

# abstracts = data.abstracts()
# pprint(abstracts)

## finding abstracts
print()
print("Finding abstracts")
# Disabled scan over a slice of the dataset (the snippet ends mid-loop):
# for x in data[100:5000]:
#     try:
#         pprint(co.abstract(x))
#     except KeyError:
예제 #3
0
# Third party modules
import cotools
from cotools import abstract, text

def _abstract_or_empty(paper):
    """Return the abstract of *paper*, or "" when the paper has none."""
    try:
        return abstract(paper)
    except KeyError:
        # Papers without an abstract must not abort the whole scan.
        return ""


def _mentions_any(haystack, needles):
    """Return True if any of *needles* occurs in *haystack*, ignoring case of *haystack*."""
    # Lower-case once instead of once per keyword.
    lowered = haystack.lower()
    return any(needle in lowered for needle in needles)


data = cotools.Paperset("data/all")

# Papers mentioning "digest" (case-sensitive) in the body text or the abstract.
digest = [
    x for x in data
    if "digest" in text(x) or "digest" in _abstract_or_empty(x)
]

# NOTE(review): "novel_coronavirus" contains an underscore and is unlikely to
# occur in running text -- confirm whether "novel coronavirus" was intended.
cov = ["covid", "novel_coronavirus"]
digest_covid = [
    x for x in digest
    if _mentions_any(text(x), cov) or _mentions_any(_abstract_or_empty(x), cov)
]

# A bare ``len(...)`` expression has no effect in a script; print the count.
print(len(digest_covid))

for d in digest_covid:
    print("-" * 55)
    print("\r\n")
    print("NEW PAPER")
    print("\r\n")
    print(_abstract_or_empty(d))
    print(text(d))