def test_document_usage(self):
    """Test UsptoXmlReader used via Document.from_file.

    Fix: the original opened the data file without ever closing it, leaking
    the handle if parsing or the assertion failed. A context manager
    guarantees the file is closed on every path.
    """
    fname = 'US06840965B2.xml'
    path = os.path.join(os.path.dirname(__file__), 'data', 'uspto', fname)
    with io.open(path, 'rb') as f:
        d = Document.from_file(f, readers=[UsptoXmlReader()])
    # 112 is the known element count for this fixture document.
    self.assertEqual(len(d.elements), 112)
def test_document_usage(self):
    """Test RscHtmlReader used via Document.from_file.

    Fix: the original opened the data file without ever closing it, leaking
    the handle if parsing or the assertion failed. A context manager
    guarantees the file is closed on every path.
    """
    fname = '10.1039_C6OB02074G.html'
    path = os.path.join(os.path.dirname(__file__), 'data', 'rsc', fname)
    with io.open(path, 'rb') as f:
        d = Document.from_file(f, readers=[RscHtmlReader()])
    # 60 is the known element count for this fixture document.
    self.assertEqual(len(d.elements), 60)
def test_document_usage(self):
    """Test AcsHtmlReader used via Document.from_file.

    Fix: the original opened the data file without ever closing it, leaking
    the handle if parsing or the assertion failed. A context manager
    guarantees the file is closed on every path.
    """
    fname = 'acs.jmedchem.6b00723.html'
    path = os.path.join(os.path.dirname(__file__), 'data', 'acs', fname)
    with io.open(path, 'rb') as f:
        d = Document.from_file(f, readers=[AcsHtmlReader()])
    # 198 is the known element count for this fixture document.
    self.assertEqual(len(d.elements), 198)
def parse(self, html_file):
    """Parse one RSC HTML article into a JSON file next to the input.

    Extracts abstract, body text, and figure captions; each sentence is
    stored with its tokens (with offsets relative to the sentence start),
    POS tags, and chemical entity mentions found by ChemDataExtractor.
    Writes ``<html_file basename>.json`` and returns the article DOI, or
    None for RSC "page load error" pages.

    NOTE(review): assumes `html_file` is a path string using '/' separators
    (see the rsplit('/', ...) calls below) — confirm on Windows.

    TODO: clean body texts. 02-11-2020
    Unlike other XML files, tags for body texts are not quite consistent.
    For now, use CDE's reader to get body texts, and they have not only
    body texts but other preceding texts such as abstract. CDE's scraper
    can only body texts (scrape.paragraphs), but they are pure strings
    unlike Sentence instances.
    -> Exclude abstract and its preceding text from body text by last
    sentence of abstract in body text. 02-18-2020
    """
    # NOTE(review): file opened without encoding or close; handle is released
    # only when garbage-collected — consider a `with` block.
    htmlstring = open(html_file).read()
    '''
    Remove encoding declaration since it causes the following error when Selector reads the string.
    -> ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.
    '''
    htmlstring = re.sub(r'<\?xml.*\?>', '', htmlstring)
    tree = etree.parse(html_file, self.html_parser)
    root = tree.getroot()
    # clean xml and extract essential elements.
    specials, refs = self.rsc_html_reader.preprocess(root)
    document = html.fromstring(htmlstring)
    title = document.findtext(
        './/title'
    )  # this title is only used to filter out the following error. The title from scrape below is used for JSON file.
    if title.strip() == 'RSC - Page load error':  # e.g., 101039c1jm11358e.html
        logger.error('RSC - Page load error')
        return None

    # --- Abstract: sentence-split each element with class="abstract". ---
    abstract_element = document.find_class("abstract")
    abstract = []
    start_para_idx = 1
    for abs in abstract_element:  # NOTE(review): `abs` shadows the builtin
        para_id_prefix = 'abs'
        start_para_idx, sents = self.get_sentence(abs, para_id_prefix,
                                                  start_para_idx, specials,
                                                  refs)
        abstract.extend(sents)

    '''
    Body Text
    '''
    f = open(html_file, 'rb')
    doc = Document.from_file(f, readers=[self.rsc_html_reader])
    body_text = []
    sec_title = ''  # last Heading seen; attached to following paragraphs
    para_id_prefix = 'body'
    para_idx = 1
    for elem in doc.elements:
        if isinstance(elem, Heading):
            sec_title = elem.text
        elif isinstance(elem, Paragraph):
            for sent in elem.sentences:
                token = []
                start = []  # token start offsets, relative to sentence start
                end = []  # token end offsets, relative to sentence start
                for tok in sent.tokens:
                    token.append(tok.text)
                    start.append(tok.start - sent.start)
                    end.append(tok.end - sent.start)
                pos = sent.pos_tags
                cems = []  # chemical entity mentions as [text, start, end]
                for cem in sent.cems:
                    cems.append([
                        cem.text, cem.start - sent.start,
                        cem.end - sent.start
                    ])
                body_text.append({
                    'section_title': sec_title,
                    'para_id': para_id_prefix + '_para_' + str(para_idx),
                    'sent': sent.text,
                    'token_pos': list(zip(token, start, end, pos)),
                    'chemical_entity': cems
                })
                para_idx += 1

    # Exclude abstract and its preceding text from body text. 02-18-2020
    # Find the body sentence matching the abstract's LAST sentence and cut
    # everything up to and including it.
    cut_off = -1
    #if len(abstract) != 0 and len(body_text) != 0 and all(elem in body_text for elem in abstract): # Sometimes, abstract and body have different whitespaces. e.g., 101039c005501h.json
    if len(abstract) != 0 and len(body_text) != 0:
        if len(abstract) < 3:  # debugging
            print('Abstract is a single sentence!!')
        for idx in range(len(body_text)):
            # compare only sent and remove leading and trailing whitespaces. Sometimes, abstract and body have different whitespaces. e.g., 101039c005501h.json
            # also compare preceding two sentences of the last one to increase accuracy. Some abstracts are a single sentence. e.g., 101039c2cp23070d.html
            #if abstract[-1]['sent'].strip() == body_text[idx]['sent'].strip() and abstract[-2]['sent'].strip() == body_text[idx-1]['sent'].strip() and abstract[-3]['sent'].strip() == body_text[idx-2]['sent'].strip():
            if len(
                    re.sub(r"[^a-zA-Z]", '', abstract[-1]['sent'])
            ) > 0:  # ignore sents having non-alphabets such as '.', '\n'
                # compare with ALL whitespace stripped, not just leading/trailing
                if re.sub(r'\s+', '', abstract[-1]['sent']) == re.sub(
                        r'\s+', '', body_text[idx]['sent']):
                    cut_off = idx + 1
                    break
    if cut_off != -1:
        body_text = body_text[cut_off:]

    '''
    Figures
    '''
    sel = Selector.from_text(htmlstring)
    scrape = RscHtmlDocument(sel)
    figures = []
    for fig in scrape.figures:
        # prefer the in-text reference id; fall back to the label
        id = fig.reference if fig.reference is not None else fig.label
        label = fig.label
        if id is None:  # e.g., 101039b918103b.html has an image having only url information.
            print('figure id is none.')
            continue
        # image file assumed to live in the same directory as the HTML file
        fig_file = html_file.rsplit('/', 1)[0] + '/' + fig.url.rsplit(
            '/', 1)[1]
        caption = []
        #cap = Text(fig.caption)
        #print(cap.sentences)
        if fig.caption is not None:
            for sent in Text(fig.caption):
                token = []
                start = []  # start offset
                end = []  # end offset
                for tok in sent.tokens:
                    token.append(tok.text)
                    start.append(tok.start - sent.start)
                    end.append(tok.end - sent.start)
                pos = sent.pos_tags
                cems = []
                for cem in sent.cems:
                    cems.append([
                        cem.text, cem.start - sent.start,
                        cem.end - sent.start
                    ])
                caption.append({
                    'sent': sent.text,
                    'token_pos': list(zip(token, start, end, pos)),
                    'chemical_entity': cems
                })
        figures.append({
            'fig_id': id,
            'label': label,
            'caption': caption,
            'fig_file': fig_file
        })

    # --- Assemble the output record. ---
    data = {}
    data['uid'] = scrape.doi
    data['publisher'] = scrape.publisher + (
        ' - ' + scrape.journal if scrape.journal is not None else '')
    data['type'] = 'journal' if scrape.journal is not None else ''
    data['title'] = scrape.title
    data['year'] = ''
    if scrape.published_date is not None:
        data['year'] = scrape.published_date.strftime("%Y")
    elif scrape.online_date is not None:
        data['year'] = scrape.online_date.strftime("%Y")
    data['author'] = scrape.authors
    data['keywords'] = []
    data['abstract'] = abstract
    data['body_text'] = body_text
    data['figures'] = figures
    # debug
    '''
    if data['year'] == '':
        print('year is unknown.')`
        input('enter')
    if data['type'] == '':
        print('journal is unknown!!') # E.g., 101039c5md00579e.html, 101039c5md00579e.html has no journal value and only abstract.
        input('enter')
    '''
    # write data to file
    output_filename = html_file.replace('.html', '.json')
    # if replace() changed nothing the input had no '.html' suffix — treat
    # as a fatal input error rather than overwrite the source file.
    if output_filename == html_file:
        logger.error('>> HTML file does NOT exist!!')
        sys.exit()
    with open(output_filename, 'w') as outfile:
        json.dump(data, outfile)
    return scrape.doi
writer.writerow([ "number", "Source", "compound_from_rowheader", 'compound_from_caption', 'refractive_index_value', 'row_headers', 'specifier', 'caption', 'wavelength_from_caption', 'wavelength_from_headers' ]) s = 'the refractive index is measured at 485 nm' #print (get_wavelength_fromcaption(s)) count = 0 if True: for i in range(0, 168999): path = r'F:\el_refractive_index_volumn_2000-2020\{}.xml'.format(i) try: f = open(path, 'rb') d = Document.from_file(f) DOI = str(d.metadata.serialize()) # f = open(path, 'rb') # f1 = open(path, 'rb').read() # d = Document.from_file(f) # root = ET.fromstring(f1) # Journal = 'None' # DOI = 'None' # for child in root: # for cchild in child: # if cchild.tag == '{http://prismstandard.org/namespaces/basic/2.0/}publicationName': # Journal = cchild.text[:] # elif cchild.tag == '{http://prismstandard.org/namespaces/basic/2.0/}doi': # DOI = cchild.text[:] for t in d.tables:
from chemdataextractor.doc import Document, Heading, Paragraph from chemdataextractor.scrape import Selector from chemdataextractor.scrape.pub.rsc import RscHtmlDocument from chemdataextractor.reader import AcsHtmlReader, RscHtmlReader, PdfReader import os import sys import csv with open('file_name', 'rb') as file: doc = Document.from_file(file) # initialise with an empty dictionary compoundInfo = {} # Produce the list of dictionaries doc_records = doc.records.serialize() # filter to only ratio information ratio_doc_records = [record for record in doc_records if 'ratio' in record] # using a loop extract the ratio information within ratio_doc_records i = 0 for i in range(len(ratio_doc_records)): for key, value in ratio_doc_records[i].items(): compoundInfo[key] = value # Only allow Name and Ratio information, don't show any other attributes if (key == 'nmr_spectra' or key == 'ir_spectra' or key == 'melting_points' or key == 'labels' or key == 'roles'): del compoundInfo[key] # Open a new CSV file and append this information with open('csv_filename', 'a', newline='') as f: