def get_subfigure_f(db_file, image_dir, subfigures_dst):
    """Insert newly split subfigures into the database and export all of them.

    For every (pmcid, figure_name) returned by the
    ``sql_select_empty_subfigurejsonfiles`` query, the records produced by
    ``split_subfigure`` are queued for insertion through ``DBHelper``.
    Afterwards every row returned by ``sql_select_subfigure`` is written to
    ``subfigures_dst`` as CSV.

    Args:
        db_file: Path of the SQLite database file.
        image_dir: Root directory under which figure files are looked up.
        subfigures_dst: Destination of the CSV file.
    """
    csv_columns = ['pmcid', 'figure path', 'xtl', 'ytl', 'xbr', 'ybr', 'type']

    conn = sqlite3.connect(db_file)
    # try/finally so the connection is released even if a step below raises.
    try:
        df = select_helper(conn, sql_select_empty_subfigurejsonfiles,
                           ['pmcid', 'figure_name'])

        # One timestamp shared by every record inserted in this run.
        insert_time = f'{datetime.datetime.now():%Y-%m-%d-%H-%M-%S}'
        insert_subfigure_helper = DBHelper(conn, sql_insert_subfigure)
        insert_subfigure_helper.start()
        for pmcid, figure_name in tqdm.tqdm(zip(df['pmcid'], df['figure_name']),
                                            total=len(df),
                                            desc='Query new subfigures'):
            figure_file = (image_dir / generate_path(pmcid)
                           / f'{pmcid}_{figure_name}')
            records = split_subfigure(figure_file)
            insert_subfigure_helper.extend([
                (pmcid, figure_name, r.xtl, r.ytl, r.xbr, r.ybr,
                 'PMCFigureX', insert_time)
                for r in records
            ])
        insert_subfigure_helper.finish()
        insert_subfigure_helper.summarize()

        # get all subfigures
        df = select_helper(conn, sql_select_subfigure,
                           ['pmcid', 'figure_name', 'xtl', 'ytl', 'xbr', 'ybr'])
    finally:
        conn.close()

    data = []
    for _, row in tqdm.tqdm(df.iterrows(), total=len(df), desc='Get subfigures'):
        pmcid = row['pmcid']
        figure_name = row['figure_name']
        xtl = row['xtl']
        ytl = row['ytl']
        xbr = row['xbr']
        ybr = row['ybr']

        # Path is relative (image_dir is not prepended) and encodes the
        # bounding box between the stem and the suffix of the figure file.
        figure_file = generate_path(pmcid) / f'{pmcid}_{figure_name}'
        dst = figure_file.parent / (
            f'{figure_file.stem}_{xtl}x{ytl}_{xbr}x{ybr}{figure_file.suffix}')
        data.append({
            'pmcid': pmcid,
            'figure path': dst.as_posix(),
            'xtl': xtl,
            'ytl': ytl,
            'xbr': xbr,
            'ybr': ybr,
            'type': 'subfigure'
        })

    # Explicit columns keep the CSV header present even when there are no rows.
    pd.DataFrame(data, columns=csv_columns).to_csv(subfigures_dst, index=False)
def get_bioc_f(db_file, bioc_dir):
    """Fetch missing BioC XML files and record the outcome per article.

    For every (pmcid, pmid) returned by the ``sql_select_empty_bioc`` query
    the file ``<bioc_dir>/<generate_path(pmcid)>/<pmcid>.xml`` is checked.
    If it is absent, ``get_bioc(pmid, dst)`` is called. A ``(1, pmcid)`` row
    is queued through ``DBHelper`` when the file exists or was fetched, and a
    ``(0, pmcid)`` row when the fetch raised ``urllib.error.HTTPError``.
    Counters are printed at the end.

    Args:
        db_file: Path of the SQLite database file.
        bioc_dir: Root directory of the BioC XML files.
    """
    cnt = collections.Counter()

    conn = sqlite3.connect(db_file)
    # try/finally so the connection is released even if a step below raises.
    try:
        df = select_helper(conn, sql_select_empty_bioc, ['pmcid', 'pmid'])

        update_article_helper = DBHelper(conn, sql_update_articles)
        update_article_helper.start()
        for pmcid, pmid in tqdm.tqdm(zip(df['pmcid'], df['pmid']), total=len(df)):
            cnt['total pmc'] += 1
            dst = bioc_dir / generate_path(pmcid) / f'{pmcid}.xml'
            if dst.exists():
                update_article_helper.append((1, pmcid))
                continue
            try:
                get_bioc(pmid, dst)
            except urllib.error.HTTPError:
                # Only HTTP errors are treated as "no BioC available";
                # anything else still propagates to the caller.
                update_article_helper.append((0, pmcid))
            else:
                cnt['new bioc'] += 1
                update_article_helper.append((1, pmcid))
        update_article_helper.finish()
    finally:
        conn.close()

    for k, v in cnt.most_common():
        print(k, ':', v)
def get_figures(db_file, image_dir):
    """Download missing figure images and record their pixel size.

    For every (pmcid, figure_name) returned by the ``sql_get_empty_figures``
    query, the local file ``<image_dir>/<generate_path(pmcid)>/
    <pmcid>_<figure_name>`` is downloaded from PMC when it does not exist.
    The image is then opened and ``(width, height, pmcid, figure_name)`` is
    queued through ``DBHelper``. Counters are printed at the end.

    Args:
        db_file: Path of the SQLite database file.
        image_dir: Root directory of the local figure files.
    """
    cnt = collections.Counter()

    conn = sqlite3.connect(db_file)
    # try/finally so the connection is released even if a step below raises.
    try:
        df = select_helper(conn, sql_get_empty_figures, ['pmcid', 'figure_name'])

        update_figure_helper = DBHelper(conn, sql_update_figure_size)
        update_figure_helper.start()
        for pmcid, figure_name in tqdm.tqdm(zip(df['pmcid'], df['figure_name']),
                                            total=len(df)):
            local_file = (image_dir / generate_path(pmcid)
                          / f'{pmcid}_{figure_name}')
            if not local_file.exists():
                url = f'https://www.ncbi.nlm.nih.gov/pmc/articles/{pmcid}/bin/{figure_name}'
                try:
                    urllib.request.urlretrieve(url, local_file)
                    cnt['new figure'] += 1
                except urllib.error.HTTPError:
                    cnt['Http error'] += 1
                    # Leave an empty placeholder so the file "exists" on the
                    # next run; it is then reported as an image error below.
                    with open(local_file, 'w') as _:
                        pass

            try:
                # Context manager closes the underlying file handle, which a
                # bare Image.open() would keep open for every figure.
                with Image.open(local_file) as im:
                    size = (im.width, im.height)
            except Exception:
                # Unreadable/empty image: count it, but do not swallow
                # KeyboardInterrupt/SystemExit as a bare ``except`` would.
                cnt['Image error'] += 1
            else:
                update_figure_helper.append((*size, pmcid, figure_name))
            cnt['total figure'] += 1
        update_figure_helper.finish()
    finally:
        conn.close()

    ppprint.pprint_counter(cnt, percentage=False)
def move1(src_dir, dst_dir):
    """Move every entry of ``src_dir`` into its per-article folder.

    The stem of each entry's name is used as the pmcid; the entry is moved
    to ``<dst_dir>/<generate_path(pmcid)>/<pmcid>.xml``, creating the target
    folder when needed.

    Args:
        src_dir: Directory whose entries are moved.
        dst_dir: Root directory (path object) of the destination tree.
    """
    with os.scandir(src_dir) as entries:
        for item in tqdm.tqdm(entries):
            article_id = Path(item.path).stem
            target_dir = dst_dir / generate_path(article_id)
            target_dir.mkdir(parents=True, exist_ok=True)
            shutil.move(item.path, target_dir / f'{article_id}.xml')
def get_figure_url(db_file, bioc_dir):
    """Extract figure names from BioC files and queue them for insertion.

    For every pmcid returned by the ``sql_select_new_bioc`` query, the file
    ``<bioc_dir>/<generate_path(pmcid)>/<pmcid>.xml`` is passed to
    ``get_figure_link``. Each distinct ``(pmcid, figure_name, insert_time)``
    tuple is queued through ``DBHelper``.

    Args:
        db_file: Path of the SQLite database file.
        bioc_dir: Root directory of the BioC XML files.
    """
    conn = sqlite3.connect(db_file)
    # try/finally so the connection is released even if a step below raises.
    try:
        df = select_helper(conn, sql_select_new_bioc, ['pmcid'])

        # One timestamp shared by every record inserted in this run.
        insert_time = f'{datetime.datetime.now():%Y-%m-%d-%H-%M-%S}'
        insert_helper = DBHelper(conn, sql_insert_figure)
        insert_helper.start()
        for pmcid in tqdm.tqdm(df['pmcid'], total=len(df)):
            biocfile = bioc_dir / generate_path(pmcid) / f'{pmcid}.xml'
            figure_names = get_figure_link(biocfile)
            # A set drops figure names that appear more than once in a file.
            insert_helper.extend({
                (pmcid, figure_name, insert_time)
                for figure_name in figure_names
            })
        insert_helper.finish()
    finally:
        conn.close()
def get_figure_text(src1, src2, dest, history_file, bioc_dir):
    """Attach article text to figures and dump them as JSON.

    The two CSV files are concatenated and turned into figure objects by
    ``create_figures``. For each figure, the first document of the article's
    BioC file ``<bioc_dir>/<generate_path(pmcid)>/<pmcid>.xml`` is passed to
    ``add_text``. The figures' ``to_dict()`` results are written to ``dest``.

    Args:
        src1: First CSV file (read with all columns as ``str``).
        src2: Second CSV file (read with all columns as ``str``).
        dest: Destination of the UTF-8 JSON file.
        history_file: Forwarded to ``create_figures``.
        bioc_dir: Root directory of the BioC XML files.
    """
    df1 = pd.read_csv(src1, dtype=str)
    df2 = pd.read_csv(src2, dtype=str)
    df = pd.concat([df1, df2], axis=0)
    figures = create_figures(df, history_file=history_file)

    # Cache one parsed document per article so each BioC file is read once.
    docs = {}  # type: Dict[str, bioc.BioCDocument]
    for figure in figures:
        pmcid = figure.pmcid
        if pmcid not in docs:
            src = bioc_dir / generate_path(pmcid) / f'{pmcid}.xml'
            # ``with`` closes the handle; the bare open() leaked one file
            # descriptor per article.
            with open(src) as fp:
                collection = bioc.load(fp)
            docs[pmcid] = collection.documents[0]
        add_text(figure, docs[pmcid])

    with open(dest, 'w', encoding='utf8') as fp:
        objs = [f.to_dict() for f in figures]
        json.dump(objs, fp, indent=2)
def split_figure_f(db_file, image_dir, model_pathname, batch_size=16):
    """Run the figure separator in batches and save each result as JSON.

    For every (pmcid, figure_name) returned by the
    ``sql_select_empty_subfigurejsonfiles`` query, the figure file whose
    sibling ``.json`` does not exist yet is queued. Queued figures are sent
    to ``FigureSeparator.extract_batch`` in groups of ``batch_size`` and the
    ``'sub_figures'`` entry of each result is written to the ``.json`` file.

    Args:
        db_file: Path of the SQLite database file.
        image_dir: Root directory of the local figure files.
        model_pathname: Model location handed to ``FigureSeparator`` as str.
        batch_size: Number of figures sent to the separator per call.
    """
    conn = sqlite3.connect(db_file)
    # The connection is only needed for the query; close it even on failure.
    try:
        df = select_helper(conn, sql_select_empty_subfigurejsonfiles,
                           ['pmcid', 'figure_name'])
    finally:
        conn.close()

    cnt = collections.Counter()
    tf.compat.v1.disable_eager_execution()
    separator = FigureSeparator(str(model_pathname))
    with tf.compat.v1.Session(graph=separator.graph) as sess:

        def split_and_save(batch):
            # Split one batch of (image, json) pairs and write the results.
            images = [image for image, _ in batch]
            outputs = [output for _, output in batch]
            results = separator.extract_batch(sess, images)
            assert len(results) == len(images)
            for output, result in zip(outputs, results):
                # ``with`` guarantees the JSON is flushed and closed.
                with open(output, 'w') as fp:
                    json.dump(result['sub_figures'], fp)

        needs_to_split = []
        for pmcid, figure_name in tqdm.tqdm(zip(df['pmcid'], df['figure_name']),
                                            total=len(df)):
            src = image_dir / generate_path(pmcid) / f'{pmcid}_{figure_name}'
            dst = src.with_suffix('.json')
            if not dst.exists():
                needs_to_split.append((src, dst))
                if len(needs_to_split) >= batch_size:
                    split_and_save(needs_to_split)
                    needs_to_split = []
            elif is_file_empty(src):
                # NOTE(review): empty figures are only counted when their
                # JSON already exists - confirm this is the intended branch.
                cnt['empty figure'] += 1

        # Flush the last, possibly partial, batch.
        if needs_to_split:
            split_and_save(needs_to_split)

    for k, v in cnt.most_common():
        print(k, ':', v)
def get_figure_f(db_file, image_dir, figures_dst):
    """Export every whole figure returned by ``sql_select_figure`` to CSV.

    Each row becomes a record whose bounding box spans the full image:
    ``(0, 0)`` to ``(width, height)``, with type ``'figure'``.

    Args:
        db_file: Path of the SQLite database file.
        image_dir: Not used by this function; kept for interface
            compatibility (the written paths are relative).
        figures_dst: Destination of the CSV file.
    """
    csv_columns = ['pmcid', 'figure path', 'xtl', 'ytl', 'xbr', 'ybr', 'type']

    conn = sqlite3.connect(db_file)
    # The connection is only needed for the query; close it even on failure.
    try:
        df = select_helper(conn, sql_select_figure,
                           ['pmcid', 'figure_name', 'width', 'height'])
    finally:
        conn.close()

    data = []
    for _, row in tqdm.tqdm(df.iterrows(), total=len(df), desc='Get whole figures'):
        pmcid = row['pmcid']
        figure_name = row['figure_name']
        figure_file = generate_path(pmcid) / f'{pmcid}_{figure_name}'
        data.append({
            'pmcid': pmcid,
            'figure path': figure_file.as_posix(),
            'xtl': 0,
            'ytl': 0,
            'xbr': row['width'],
            'ybr': row['height'],
            'type': 'figure'
        })

    # Explicit columns keep the CSV header present even when there are no rows.
    pd.DataFrame(data, columns=csv_columns).to_csv(figures_dst, index=False)