import datetime
import re
import traceback
from io import BytesIO

import gridfs

# extract_paragraphs_pdf, clean_title, find_abstract, auth_db, laparams and
# parser_version are assumed to be defined elsewhere in this project.


def _parse_body_text(self, doc):
    """
    Return the body text of a document as a list of dicts of the form
    {'section_heading': str or None, 'text': str}, e.g.

        [{'section_heading': None, 'text': 'First paragraph ...'}, ...]

    Returns None when full-text parsing is disabled.
    """
    body_text = None
    if self.parse_full_text:
        paper_fs = gridfs.GridFS(self.db,
                                 collection='Scraper_share_osf_io_fs')
        pdf_file = paper_fs.get(doc['PDF_gridfs_id'])
        try:
            paragraphs = extract_paragraphs_pdf(BytesIO(pdf_file.read()))
        except Exception as e:
            print('Failed to extract PDF %s(%r) (%r)' %
                  (doc['Doi'], doc['PDF_gridfs_id'], e))
            traceback.print_exc()
            paragraphs = []
        body_text = [{'section_heading': None, 'text': x}
                     for x in paragraphs]
    return body_text
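# A minimal usage sketch, assuming _parse_body_text lives on a parser class
# that exposes `db` (a pymongo Database) and a `parse_full_text` flag; the
# class name below is hypothetical.
#
#     parser = OSFDocumentParser(db, parse_full_text=True)
#     body = parser._parse_body_text(doc)
#     # -> [{'section_heading': None, 'text': '...'}, ...] or None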
def parse_biorxiv_doc(doc, db):
    parsed_doc = dict()
    parsed_doc['title'] = clean_title(doc['Title'])
    parsed_doc['doi'] = doc['Doi']
    parsed_doc['origin'] = "Scraper_chemrxiv_org"
    parsed_doc['link'] = doc['Link']
    parsed_doc['journal'] = doc['Journal']
    parsed_doc['publication_date'] = doc['Publication_Date']
    parsed_doc['authors'] = doc['Authors']
    parsed_doc['abstract'] = ' '.join(
        map(lambda x: re.sub(r'\s+', ' ', x), doc['Abstract']))
    parsed_doc['has_year'] = True
    parsed_doc['has_month'] = True
    parsed_doc['has_day'] = True

    paper_fs = gridfs.GridFS(db, collection='Scraper_chemrxiv_org_fs')
    pdf_file = paper_fs.get(doc['PDF_gridfs_id'])
    try:
        paragraphs = extract_paragraphs_pdf(BytesIO(pdf_file.read()))
    except Exception as e:
        print('Failed to extract PDF %s(%r) (%r)' %
              (doc['Doi'], doc['PDF_gridfs_id'], e))
        traceback.print_exc()
        paragraphs = []
    parsed_doc['body_text'] = [{'section_heading': None, 'text': x}
                               for x in paragraphs]
    return parsed_doc
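# A hedged driver sketch for parse_biorxiv_doc: iterate the raw scraper
# collection and upsert each parsed record. The source collection name
# mirrors the GridFS bucket used above; the `parsed_docs` target collection
# is illustrative, not part of the original code.
def parse_all_biorxiv(db):
    for doc in db['Scraper_chemrxiv_org'].find():
        parsed = parse_biorxiv_doc(doc, db)
        db['parsed_docs'].replace_one({'doi': parsed['doi']}, parsed,
                                      upsert=True)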
def parse_synopsis_doc(doc, db):
    parsed_doc = dict()
    parsed_doc['title'] = clean_title(doc['Title'])
    parsed_doc['link'] = doc['Link']
    parsed_doc['synopsis_link'] = doc['Synopsis_Link']
    parsed_doc['origin'] = "Scraper_public_health_ontario"
    parsed_doc['journal_string'] = doc['Journal_String'].strip(' \t\r.')
    parsed_doc['authors'] = doc['Authors']
    parsed_doc['abstract'] = find_abstract(doc.get('Abstract'))

    paper_fs = gridfs.GridFS(db, collection='Scraper_publichealthontario_fs')
    pdf_file = paper_fs.get(doc['PDF_gridfs_id'])
    try:
        paragraphs = extract_paragraphs_pdf(BytesIO(pdf_file.read()),
                                            return_dicts=True,
                                            only_printable=True)
    except Exception as e:
        print('Failed to extract PDF %s(%r) (%r)' %
              (doc['Doi'], doc['PDF_gridfs_id'], e))
        traceback.print_exc()
        paragraphs = []

    sections = {}
    last_sec = None
    for p in paragraphs:
        # Treat tall (> 18 pt) and narrow (< 230 pt wide) paragraphs as
        # section headings; bbox is (x0, y0, x1, y1).
        height = p['bbox'][3] - p['bbox'][1]
        width = p['bbox'][2] - p['bbox'][0]
        is_heading = height > 18 and width < 230
        if is_heading:
            last_sec = p['text'].lower()
            sections[last_sec] = []
        elif last_sec is not None:
            sections[last_sec].append(p)

    parsed_doc['synopsis'] = {
        'summary': sections.get('one-minute summary', None),
        'additional_info': sections.get('additional information', None),
        'pho_reviewer_comments': sections.get('pho reviewers comments', None),
    }
    # Drop the synopsis entirely if none of the expected headings were found.
    if all(x is None for x in parsed_doc['synopsis'].values()):
        parsed_doc['synopsis'] = None
    return parsed_doc
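# Illustration of the heading heuristic above, assuming pdfminer-style
# bounding boxes (x0, y0, x1, y1) in points; the sample values are made up.
example = {'text': 'One-Minute Summary', 'bbox': (72, 700, 260, 722)}
height = example['bbox'][3] - example['bbox'][1]  # 22 pt: large type
width = example['bbox'][2] - example['bbox'][0]   # 188 pt: short line
assert height > 18 and width < 230                # classified as a heading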
def handle_doc(file_obj):
    collection, fs = auth_db()
    # Re-check the record: skip documents already parsed by the current
    # parser version after their last upload.
    doc = collection.find_one({'_id': file_obj['_id']})
    if 'pdf_extraction_version' in doc and \
            doc['pdf_extraction_version'] == parser_version and \
            'parsed_date' in doc and \
            doc['parsed_date'] > doc['uploadDate']:
        return None, None

    pdf_file = fs.find_one(file_obj['_id'])
    data = BytesIO(pdf_file.read())
    try:
        paragraphs = extract_paragraphs_pdf(data, laparams=laparams,
                                            return_dicts=True)
        collection.update_one({'_id': file_obj['_id']}, {
            '$set': {
                'pdf_extraction_success': True,
                'pdf_extraction_plist': paragraphs,
                'pdf_extraction_exec': None,
                'pdf_extraction_version': parser_version,
                'parsed_date': datetime.datetime.now(),
            }
        })
        exc = None
    except Exception as e:
        paragraphs = None
        traceback.print_exc()
        exc = (f'Failed to extract PDF {file_obj["filename"]} {e}'
               + traceback.format_exc())
        collection.update_one({'_id': file_obj['_id']}, {
            '$set': {
                'pdf_extraction_success': False,
                'pdf_extraction_plist': None,
                'pdf_extraction_exec': exc,
                'pdf_extraction_version': parser_version,
                'parsed_date': datetime.datetime.now(),
            }
        })
    return paragraphs, exc
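# A hedged driver sketch for handle_doc: walk the GridFS files collection
# and re-extract anything stale. Assumes auth_db() returns the same
# (collection, fs) pair used inside handle_doc, where `collection` is the
# bucket's files collection (it carries `uploadDate`).
def reparse_all():
    collection, _ = auth_db()
    for file_obj in collection.find({}, projection=['_id', 'filename']):
        _, exc = handle_doc(file_obj)
        if exc is not None:
            print(exc)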
def try_parse_pdf_hierarchy(pdf_file):
    try:
        paragraphs = extract_paragraphs_pdf(BytesIO(pdf_file.read()))
    except Exception as e:
        print('Failed to extract PDF (%r)' % e)
        traceback.print_exc()
        paragraphs = []

    # A section runs until the next known heading.
    headings = r'\n' \
               r'(?:abstract|backgrounds?|introduction|methods?|' \
               r'results?|discussions?|conclusions?|acknowledgements?|' \
               r'references?)'
    continuing_section = fr'(?:.(?!{headings}))'
    sections = fr"""
    (?:
        (?:^|\n)\s*
        (?:
            abstract\s+(?P<abstract>{continuing_section}+)|
            backgrounds?\s+(?P<background>{continuing_section}+)|
            introduction\s+(?P<introduction>{continuing_section}+)|
            methods?\s+(?P<method>{continuing_section}+)|
            results?\s+(?P<result>{continuing_section}+)|
            discussions?\s+(?P<discussion>{continuing_section}+)|
            conclusions?\s+(?P<conclusion>{continuing_section}+)|
            acknowledgements?\s+(?P<acknowledgement>{continuing_section}+)|
            references?\s+(?P<reference>{continuing_section}+)
        )
    )+
    """
    sections = re.compile(sections, re.VERBOSE | re.DOTALL | re.IGNORECASE)

    body_text = '\n'.join(paragraphs)
    parsed_content = {'body': body_text}
    for match in re.finditer(sections, body_text):
        for name, value in match.groupdict().items():
            if value is not None:
                parsed_content[name] = value
    return parsed_content
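# Hedged usage sketch: try_parse_pdf_hierarchy expects a readable binary
# file-like object; the path below is illustrative.
with open('paper.pdf', 'rb') as f:
    parsed = try_parse_pdf_hierarchy(f)
print(parsed.get('abstract'))  # None if no "abstract" heading was matched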