import os
import sys
from os.path import join

import utils  # project-local helpers (load_json, write_json, qprint, get_timestamp); import path assumed


def fix_it(timestamp, book_chapter, content_err_list, content_fix_list):
    """Replace known-bad content entries in a scraped chapter JSON.

    Entries matching content_err_list are swapped for the corresponding
    replacement lists in content_fix_list; the original file is backed up
    as *.old.json before being overwritten.
    """
    fix_dir = '../data/{}.fix'.format(timestamp)
    os.makedirs(fix_dir, exist_ok=True)
    fin_json = '../data/{}/{}.json'.format(timestamp, book_chapter)
    fold_json = '{}/{}.old.json'.format(fix_dir, book_chapter)
    fnew_json = '{}/{}.new.json'.format(fix_dir, book_chapter)
    data = utils.load_json(fin_json)

    new_content = []
    content_changed = False
    for x in data['content']:
        has_issue = False
        for y, zz in zip(content_err_list, content_fix_list):
            if x['vers'] == y['vers'] and x['text'] == y['text']:
                has_issue = True
                for z in zz:
                    new_content.append(z)
                content_changed = True
        if not has_issue:
            new_content.append(x)

    if not content_changed:
        print(':: target problem is not found in {}'.format(fin_json),
              file=sys.stderr, flush=True)
        return

    qcmd('cp {} {}'.format(fin_json, fold_json))  # qcmd: project-local shell helper
    data['content'] = new_content
    utils.write_json(data, fnew_json)
    qcmd('cp {} {}'.format(fnew_json, fin_json))
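# Hypothetical usage of fix_it(): the chapter name and entries below are
# invented for illustration and assume matching files under ../data/.
def _demo_fix_it():
    err = [{'vers': '1:3', 'text': 'garbled line'}]        # entries matched on 'vers' and 'text'
    fix = [[{'vers': '1:3', 'text': 'corrected line one'},  # replacements for the matched entry
            {'vers': '1:3', 'text': 'corrected line two'}]]
    fix_it('201804140057', 'chapter01', err, fix)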
def format_csv(timestamp):
    data_json = '../data/{}.subject.json'.format(timestamp)
    data_dir = '../data/{}'.format(timestamp)
    out_csv = '../data/{}.csv'.format(timestamp)
    data = utils.load_json(data_json, verbose=True)

    # Map each journal URL back to its subject category.
    url_with_subj = dict()
    for subj in data:
        for url in data[subj]:
            url_with_subj[url] = subj

    utils.qprint('writing ' + out_csv)
    fout = open(out_csv, 'w')
    for fname in os.listdir(data_dir):
        journal = fname.split('/')[-1].replace('.html', '').lower()
        fname = join(data_dir, fname)
        jdata = utils.load_json(fname, verbose=False)
        if jdata['url'] in url_with_subj:
            subj = url_with_subj[jdata['url']]
        else:
            subj = 'unknown'
        print(subj, journal,
              jdata['title'].replace(',', ''),
              jdata['impact']['2016/2017']['ifact'],
              jdata['impact']['2015']['ifact'],
              jdata['impact']['2014']['ifact'],
              jdata['impact']['2016/2017']['npub'],
              jdata['impact']['2015']['npub'],
              jdata['impact']['2014']['npub'],
              jdata['title_abbrev'].replace(',', ''),
              jdata['issn'].replace(',', ''),
              jdata['url'],
              sep=',', file=fout)

    # NOTE: this loop computes 'journal' and 'fname' but never uses them;
    # it appears to be a leftover from an earlier revision.
    for subj in sorted(data):
        for url in sorted(data[subj]):
            journal = url.split('/')[-1].replace('.html', '').lower()
            fname = '{}/{}.json'.format(data_dir, journal)

    fout.close()
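# Sketch of the per-journal JSON fields read by format_csv(); the journal name
# and numbers are placeholders, not real data.
_EXAMPLE_JOURNAL_JSON = {
    'url': 'http://www.bioxbio.com/if/html/EXAMPLE-JOURNAL.html',
    'title': 'Example Journal',
    'title_abbrev': 'EX J',
    'issn': '0000-0000',
    'impact': {'2016/2017': {'ifact': 1.234, 'npub': 100},
               '2015': {'ifact': 1.111, 'npub': 95},
               '2014': {'ifact': 0.987, 'npub': 90}},
}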
def print_csv_short_list():
    data = utils.load_json('../data/short_list/omim_diseases.json')
    fname_out = 'csv/omim_diseases.csv'
    utils.qprint('writing ' + fname_out)
    fout = open(fname_out, 'w')
    print('oid,name_cn,name_en', file=fout)
    for k, v in sorted(data.items()):
        print('OMIM:' + k,
              v['name_cn'].replace(',', ''),
              v['name_en'].replace(',', ''),
              sep=',', file=fout)
    fout.close()
def scrape():
    if len(sys.argv) == 2:
        ts = sys.argv[1]
    else:
        ts = utils.get_timestamp()
    out_dir = '../data'
    os.makedirs(out_dir, exist_ok=True)
    link_json = '{}/{}.links.json'.format(out_dir, ts)
    link_data = {}
    if os.path.isfile(link_json):
        link_data = utils.load_json(link_json)
    else:
        link_data = get_all_chapter_links(link_json)
    get_all_chapter_text(link_data, '{}/{}'.format(out_dir, ts))
def print_csv_full_data():
    fname_out = 'csv/full_list.csv'
    utils.qprint('writing ' + fname_out)
    fout = open(fname_out, 'w')
    print('oid,name_cn,name_en', file=fout)
    in_dir = '../data/full_data'
    for fname in sorted(os.listdir(in_dir)):
        oid_fname = fname.split('.')[1]
        db_type = None
        if fname.startswith('hp.'):
            db_type = 'HP'
        elif fname.startswith('omim.'):
            db_type = 'OMIM'
        fname = in_dir + '/' + fname
        data = utils.load_json(fname, verbose=False)
        oid, name_cn, name_en = None, None, None
        if db_type == 'HP':
            for x in data['results']:
                if x['hpoId'] != 'HP:' + oid_fname:
                    continue
                oid = x['hpoId']
                name_cn = ('{};{}'.format(x['name_cn'], x['definition_cn'])
                           .replace(',', ''))
                name_en = ('{};{}'.format(x['name_en'], x['definition_en'])
                           .replace(',', ''))
        elif db_type == 'OMIM':
            for x in data['results']:
                if str(x['mimNumber']) != oid_fname:
                    continue
                oid = 'OMIM:' + str(x['mimNumber'])
                name_cn = x['cnTitle'].replace(',', '')
                name_en = x['preTitle'].replace(',', '')
        if oid:
            print(oid, name_cn, name_en, sep=',', file=fout)
    fout.close()
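# Sketch of the input layout assumed by print_csv_full_data(): one JSON file
# per term under ../data/full_data, named '<db>.<id>.json'.  The ids, names,
# and titles below are placeholders for illustration only.
_EXAMPLE_FULL_DATA = {
    'hp.0000001.json': {'results': [{'hpoId': 'HP:0000001',
                                     'name_cn': 'example-cn',
                                     'definition_cn': 'example-cn-def',
                                     'name_en': 'example-en',
                                     'definition_en': 'example-en-def'}]},
    'omim.123456.json': {'results': [{'mimNumber': 123456,
                                      'cnTitle': 'example-cn',
                                      'preTitle': 'example-en'}]},
}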
def test():
    data, error = get_journal_data(
        'http://www.bioxbio.com/if/html/Z-PADAGOGIK.html', 'foo.json', 0)
    print(data)
    print(error)


if __name__ == '__main__':
    # timestamp = '201804140057'
    # catch_missing(timestamp)
    # test()
    if len(sys.argv) != 2:
        timestamp = utils.get_timestamp()
    else:
        timestamp = sys.argv[1]
    out_dir = '../data/{}'.format(timestamp)
    subject_json = '../data/{}.subject.json'.format(timestamp)
    data = {}
    if os.path.isfile(subject_json):
        data = utils.load_json(subject_json)
    else:
        data = get_all_journal_links()
        utils.write_json(data, subject_json)
    get_all_journal_data(data, out_dir)
def test():  # header assumed; only the body of this helper survived in the excerpt
    # print(json.dumps(data, sort_keys=True, indent=2))
    # print(len(data))
    # print(error)
    data = get_all_journal_links(nproc=3, nretry=10, tsleep=1)
    utils.write_json(data, 'foo.json')


if __name__ == '__main__':
    # test()
    if len(sys.argv) != 2:
        timestamp = utils.get_timestamp()
    else:
        timestamp = sys.argv[1]
    out_dir = '../data/{}'.format(timestamp)
    url_json = '../data/{}.url.json'.format(timestamp)
    data = {}
    if os.path.isfile(url_json):
        data = utils.load_json(url_json)
    else:
        data = get_all_journal_links()
        utils.write_json(data, url_json)
    get_all_journal_data(data, out_dir)