Example #1
def fix_it(timestamp, book_chapter, content_err_list, content_fix_list):
    """Swap known-bad entries in a chapter's JSON for their fixed versions,
    backing up the original file first."""
    fix_dir = '../data/{}.fix'.format(timestamp)
    os.makedirs(fix_dir, exist_ok=True)
    fin_json = '../data/{}/{}.json'.format(timestamp, book_chapter)
    fold_json = '{}/{}.old.json'.format(fix_dir, book_chapter)
    fnew_json = '{}/{}.new.json'.format(fix_dir, book_chapter)

    data = utils.load_json(fin_json)
    new_content = []
    content_changed = False
    for x in data['content']:
        has_issue = False
        for err, fixes in zip(content_err_list, content_fix_list):
            if x['vers'] == err['vers'] and x['text'] == err['text']:
                has_issue = True
                new_content.extend(fixes)  # splice in the corrected entries
                content_changed = True
        if not has_issue:
            new_content.append(x)

    if not content_changed:
        print(':: target problem is not found in {}'.format(fin_json),
              file=sys.stderr,
              flush=True)
        return

    qcmd('cp {} {}'.format(fin_json, fold_json))   # keep a backup of the original
    data['content'] = new_content
    utils.write_json(data, fnew_json)
    qcmd('cp {} {}'.format(fnew_json, fin_json))   # install the fixed file
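
These snippets assume 'import os' and 'import sys' (plus 'from os.path import join' in Example #2) and two project-local helpers, utils and qcmd, whose implementations are not shown. A minimal sketch consistent with how they are called across the examples (signatures inferred from usage; the real module may differ):

import json
import subprocess
import sys
import time

def load_json(fname, verbose=False):
    # Read a JSON file, optionally announcing it the way these scripts log.
    if verbose:
        print(':: reading ' + fname, file=sys.stderr, flush=True)
    with open(fname) as fin:
        return json.load(fin)

def write_json(data, fname):
    # Persist data as pretty-printed JSON.
    with open(fname, 'w') as fout:
        json.dump(data, fout, sort_keys=True, indent=2)

def qprint(msg):
    # Immediate status message to stderr.
    print(':: ' + msg, file=sys.stderr, flush=True)

def get_timestamp():
    # Timestamps look like '201804140057' in the examples, i.e. YYYYMMDDhhmm.
    return time.strftime('%Y%m%d%H%M')

def qcmd(cmd):
    # Run a shell command string such as 'cp src dst'.
    subprocess.run(cmd, shell=True, check=True)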
Example #2
def format_csv(timestamp):
    """Flatten the per-journal JSON files of one run into a single CSV of
    impact-factor data."""
    data_json = '../data/{}.subject.json'.format(timestamp)
    data_dir = '../data/{}'.format(timestamp)
    out_csv = '../data/{}.csv'.format(timestamp)
    data = utils.load_json(data_json, verbose=True)
    # Invert the subject -> urls map so each journal url can be looked up directly.
    url_with_subj = dict()
    for subj in data:
        for url in data[subj]:
            url_with_subj[url] = subj

    utils.qprint('writing ' + out_csv)
    fout = open(out_csv, 'w')
    for fname in os.listdir(data_dir):
        # File names are the journal slugs; strip the extension to recover them.
        journal = os.path.splitext(fname)[0].lower()
        fname = join(data_dir, fname)
        jdata = utils.load_json(fname, verbose=False)
        subj = url_with_subj.get(jdata['url'], 'unknown')
        # Commas are stripped from free-text fields because no quoting is used.
        print(subj,
              journal,
              jdata['title'].replace(',', ''),
              jdata['impact']['2016/2017']['ifact'],
              jdata['impact']['2015']['ifact'],
              jdata['impact']['2014']['ifact'],
              jdata['impact']['2016/2017']['npub'],
              jdata['impact']['2015']['npub'],
              jdata['impact']['2014']['npub'],
              jdata['title_abbrev'].replace(',', ''),
              jdata['issn'].replace(',', ''),
              jdata['url'],
              sep=',',
              file=fout)

    fout.close()
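
The repeated .replace(',', '') calls exist because rows are written without quoting, so a comma inside any field would corrupt the CSV. A sketch of the same row written with the standard csv module, which quotes as needed and makes the stripping unnecessary (an alternative, not how the original does it):

import csv

def write_journal_row(writer, subj, journal, jdata):
    # csv.writer quotes fields that contain commas, so titles can keep theirs.
    writer.writerow([subj,
                     journal,
                     jdata['title'],
                     jdata['impact']['2016/2017']['ifact'],
                     jdata['impact']['2015']['ifact'],
                     jdata['impact']['2014']['ifact'],
                     jdata['impact']['2016/2017']['npub'],
                     jdata['impact']['2015']['npub'],
                     jdata['impact']['2014']['npub'],
                     jdata['title_abbrev'],
                     jdata['issn'],
                     jdata['url']])

# Usage inside the loop above:
#     with open(out_csv, 'w', newline='') as fout:
#         writer = csv.writer(fout)
#         write_journal_row(writer, subj, journal, jdata)  # once per file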
Example #3
def print_csv_short_list():
    """Write the short-list OMIM diseases to a three-column CSV."""
    data = utils.load_json('../data/short_list/omim_diseases.json')

    fname_out = 'csv/omim_diseases.csv'
    utils.qprint('writing ' + fname_out)
    fout = open(fname_out, 'w')

    print('oid,name_cn,name_en', file=fout)
    for k, v in sorted(data.items()):
        print('OMIM:' + k, v['name_cn'].replace(',', ''),
              v['name_en'].replace(',', ''), sep=',', file=fout)
    fout.close()
Example #4
def scrape():
    """Scrape all chapters for one timestamped run, reusing cached links."""
    if len(sys.argv) == 2:
        ts = sys.argv[1]
    else:
        ts = utils.get_timestamp()

    out_dir = '../data'
    os.makedirs(out_dir, exist_ok=True)

    link_json = '{}/{}.links.json'.format(out_dir, ts)
    # Reuse the cached link list when it exists; otherwise scrape it afresh.
    if os.path.isfile(link_json):
        link_data = utils.load_json(link_json)
    else:
        link_data = get_all_chapter_links(link_json)

    get_all_chapter_text(link_data, '{}/{}'.format(out_dir, ts))
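
Examples #4, #6, and #7 all repeat the same cache-or-compute step: load the JSON if it is already on disk, otherwise build it (and, in the later examples, save it). One way to factor that out (load_or_build is a name of my choosing, not the project's):

import json
import os

def load_or_build(cache_json, build):
    # Hypothetical helper: return cached JSON if present, else build and cache it.
    if os.path.isfile(cache_json):
        with open(cache_json) as fin:
            return json.load(fin)
    data = build()
    with open(cache_json, 'w') as fout:
        json.dump(data, fout, sort_keys=True, indent=2)
    return data

# Example #6's main block would then reduce to something like:
#     data = load_or_build(subject_json, get_all_journal_links)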
Example #5
def print_csv_full_data():
    """Merge the full HP/OMIM data dumps into a single three-column CSV."""
    fname_out = 'csv/full_list.csv'
    utils.qprint('writing ' + fname_out)
    fout = open(fname_out, 'w')

    print('oid,name_cn,name_en', file=fout)
    in_dir = '../data/full_data'
    for fname in sorted(os.listdir(in_dir)):
        # File names follow '<db>.<id>.json'; the middle token is the record id.
        oid_fname = fname.split('.')[1]
        db_type = None
        if fname.startswith('hp.'):
            db_type = 'HP'
        elif fname.startswith('omim.'):
            db_type = 'OMIM'
        fname = in_dir + '/' + fname
        data = utils.load_json(fname, verbose=False)

        oid, name_cn, name_en = None, None, None
        if db_type == 'HP':
            # A dump may hold several records; keep only the one matching the id.
            for x in data['results']:
                if x['hpoId'] != 'HP:' + oid_fname:
                    continue
                oid = x['hpoId']
                name_cn = ('{};{}'.format(x['name_cn'], x['definition_cn'])
                           .replace(',',''))
                name_en = ('{};{}'.format(x['name_en'], x['definition_en'])
                           .replace(',',''))

        elif db_type == 'OMIM':
            for x in data['results']:
                if str(x['mimNumber']) != oid_fname:
                    continue
                oid = 'OMIM:' + str(x['mimNumber'])
                name_cn = x['cnTitle'].replace(',','')
                name_en = x['preTitle'].replace(',','')

        if oid:
            print(oid, name_cn, name_en, sep=',', file=fout)

    fout.close()
Example #6
def test():
    """Smoke-test the scraper against a single journal page."""
    data, error = get_journal_data(
        'http://www.bioxbio.com/if/html/Z-PADAGOGIK.html', 'foo.json', 0)
    print(data)
    print(error)


if __name__ == '__main__':

    # timestamp = '201804140057'
    # catch_missing(timestamp)
    # test()

    if len(sys.argv) != 2:
        timestamp = utils.get_timestamp()
    else:
        timestamp = sys.argv[1]

    out_dir = '../data/{}'.format(timestamp)
    subject_json = '../data/{}.subject.json'.format(timestamp)

    # Reuse the cached subject map when it exists; otherwise scrape and save it.
    if os.path.isfile(subject_json):
        data = utils.load_json(subject_json)
    else:
        data = get_all_journal_links()
        utils.write_json(data, subject_json)

    get_all_journal_data(data, out_dir)
Example #7
def test():
    # print(json.dumps(data, sort_keys=True, indent=2))
    # print(len(data))
    # print(error)

    data = get_all_journal_links(nproc=3, nretry=10, tsleep=1)
    utils.write_json(data, 'foo.json')


if __name__ == '__main__':

    # test()

    if len(sys.argv) != 2:
        timestamp = utils.get_timestamp()
    else:
        timestamp = sys.argv[1]

    out_dir = '../data/{}'.format(timestamp)
    url_json = '../data/{}.url.json'.format(timestamp)

    # Reuse the cached url list when it exists; otherwise scrape and save it.
    if os.path.isfile(url_json):
        data = utils.load_json(url_json)
    else:
        data = get_all_journal_links()
        utils.write_json(data, url_json)

    get_all_journal_data(data, out_dir)
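
In Example #7, get_all_journal_links is called with nproc=3, nretry=10, tsleep=1, which suggests a multiprocessing scraper that retries failed requests with a pause in between. Its body is not shown; a minimal sketch of that pattern, assuming nproc is the pool size, nretry the attempt count, and tsleep the seconds slept between attempts:

import time
from multiprocessing import Pool
from urllib.request import urlopen

def fetch_with_retry(url, nretry=10, tsleep=1):
    # Try the url up to nretry times, sleeping tsleep seconds after a failure.
    for _ in range(nretry):
        try:
            return urlopen(url).read()
        except OSError:
            time.sleep(tsleep)
    return None

def fetch_all(urls, nproc=3, nretry=10, tsleep=1):
    # Fan the downloads out across a small worker pool.
    with Pool(nproc) as pool:
        return pool.starmap(fetch_with_retry,
                            [(u, nretry, tsleep) for u in urls])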