Пример #1
0
def get_article_series(
    dir="/mnt/b5320167-5dbd-4498-bf34-173ac5338c8d/Datasets/bmj_case_reports_data/BMJ_Case_Reports/",
    dataset_file="/mnt/b5320167-5dbd-4498-bf34-173ac5338c8d/Datasets/bmj_case_reports_data/dataset_json_concept_annotated/dataset1.0.json"
):
    """
    Map article ids to the series title found in their ``.full`` file.

    Every path returned by ``get_file_list(dir)`` that ends in ``.full`` is
    scanned for a line containing ``<ul class="series-titles">``. If the
    line right after it is a single ``<li>...</li>`` element, the text
    inside it (lower-cased) becomes the series name of that article. Only
    the first such pair per file is used. The article id is the file's
    base name without the ``.full`` suffix.

    :param dir: location handed to ``get_file_list`` to enumerate files
    :param dataset_file: path handed to ``describe_data.get_doc_ids``;
        articles whose id is not among the returned ids are dropped
    :return: {article_id: series_name}
    """
    from describe_data import get_doc_ids

    marker = '<ul class="series-titles">'
    series = {}
    for f in get_file_list(dir):
        if not f.endswith(".full"):
            continue
        # Context manager so the handle is closed even when many files are
        # scanned in one call.
        with open(f) as fh:
            ls = fh.readlines()
        # Pair each line with its successor; a marker on the very last line
        # has no successor and is skipped instead of raising IndexError.
        for l, following in zip(ls, ls[1:]):
            if marker not in l:
                continue
            s = following.strip()
            if s.startswith("<li>") and s.endswith("</li>"):
                # [:-5] drops ".full"; [4:-5] drops "<li>" and "</li>".
                series[os.path.basename(f)[:-5]] = s[4:-5].lower()
                break

    # filter out those not in json dataset
    doc_ids = get_doc_ids(dataset_file)

    return {doc_id: s for doc_id, s in series.items() if doc_id in doc_ids}
Пример #2
0
def get_article_specialty(
    dir="/mnt/b5320167-5dbd-4498-bf34-173ac5338c8d/Datasets/bmj_case_reports_data/BMJ_Case_Reports/",
    dataset_file="/mnt/b5320167-5dbd-4498-bf34-173ac5338c8d/Datasets/bmj_case_reports_data/dataset_json_concept_annotated/dataset1.0.json"
):
    """
    Map article ids to the set of specialties listed in their ``.full`` file.

    Every path returned by ``get_file_list(dir)`` that ends in ``.full`` is
    scanned for the first ``<meta content="..."`` tag whose
    ``name="DC.subject"`` attribute sits either on the same line or on the
    line directly after it. The ``content`` value is lower-cased and split
    on ``"; "`` to form the set of specialties. The article id is the
    file's base name without the ``.full`` suffix.

    :param dir: location handed to ``get_file_list`` to enumerate files
    :param dataset_file: path handed to ``describe_data.get_doc_ids``;
        articles whose id is not among the returned ids are dropped
    :return: {article_id: {spec1, ...}}
    """
    from describe_data import get_doc_ids

    # Raw strings keep "\." a regex escape rather than an invalid string
    # escape; compiled once because they are applied to every candidate line.
    same_line_re = re.compile(r'<meta content="(.*)" name="DC\.subject')
    split_line_re = re.compile(r'<meta content="(.*)"')

    specs = {}
    for f in get_file_list(dir):
        if not f.endswith(".full"):
            continue
        # Context manager so the handle is closed even when many files are
        # scanned in one call.
        with open(f) as fh:
            ls = fh.readlines()
        for c, l in enumerate(ls):
            if '<meta content="' not in l:
                continue
            if 'name="DC.subject"' in l:
                # Tag and its name attribute are on one line.
                match = same_line_re.search(l)
            elif c + 1 < len(ls) and 'name="DC.subject"' in ls[c + 1]:
                # Tag is wrapped: the name attribute is on the next line.
                # The bounds check avoids IndexError on the file's last line.
                match = split_line_re.search(l)
            else:
                continue
            if match:
                # [:-5] drops the ".full" suffix from the file name.
                specs[os.path.basename(f)[:-5]] = set(
                    match.group(1).lower().split("; "))
                break

    # filter out those not in json dataset
    doc_ids = get_doc_ids(dataset_file)

    return {doc_id: s for doc_id, s in specs.items() if doc_id in doc_ids}