def test_cems_stoplist(self):
    """Document.cems must drop stoplisted words, including tokens made up
    entirely of an ignored prefix/suffix. Regression test for GitHub issue #12.
    """
    cases = [
        ([Span('benzene', 0, 7)], 'benzene-aromatic'),
        ([], '-aromatic'),
        ([], 'non-aromatic'),
    ]
    for expected, text in cases:
        self.assertEqual(expected, Document(text).cems)
def test_parse_control_character(self):
    """A control character in the text must be handled gracefully.

    The parser is built on the LXML model, so the input has to remain
    XML compatible even when control characters appear.
    """
    document = Document(
        Paragraph('Yielding 2,4,6-trinitrotoluene,\n m.p. 20 \x0eC.'))
    self.assertEqual([{'names': ['2,4,6-trinitrotoluene']}],
                     document.records.serialize())
def get_sentence(self, paragraph, para_id, specials, refs, sec_title=''):
    """Tokenise a parsed paragraph into per-sentence annotation records.

    Each record carries the section title, paragraph id, raw sentence text,
    (token, start, end, pos) tuples with sentence-relative offsets, and the
    chemical entity mentions found in the sentence.
    """
    parsed = self.els_xml_reader._parse_element(paragraph,
                                                specials=specials,
                                                refs=refs)
    document = Document(*parsed)
    results = []
    # Document object doesn't expose sentences directly, so walk paragraphs.
    for para in document.paragraphs:
        for sent in para.sentences:
            offset = sent.start
            words = [t.text for t in sent.tokens]
            starts = [t.start - offset for t in sent.tokens]
            ends = [t.end - offset for t in sent.tokens]
            entities = [[c.text, c.start - offset, c.end - offset]
                        for c in sent.cems]
            results.append({
                'section_title': sec_title,
                'para_id': para_id,
                'sent': sent.text,
                'token_pos': list(zip(words, starts, ends, sent.pos_tags)),
                'chemical_entity': entities,
            })
    return results
def test_document_usage(self):
    """Test RscHtmlReader used via Document.from_file."""
    fname = '10.1039_C6OB02074G.html'
    path = os.path.join(os.path.dirname(__file__), 'data', 'rsc', fname)
    # Use a context manager so the handle is closed even on failure
    # (the original opened the file and never closed it).
    with io.open(path, 'rb') as f:
        d = Document.from_file(f, readers=[RscHtmlReader()])
    self.assertEqual(len(d.elements), 60)
def test_document_usage(self):
    """Test UsptoXmlReader used via Document.from_file."""
    fname = 'US06840965B2.xml'
    path = os.path.join(os.path.dirname(__file__), 'data', 'uspto', fname)
    # Use a context manager so the handle is closed even on failure
    # (the original opened the file and never closed it).
    with io.open(path, 'rb') as f:
        d = Document.from_file(f, readers=[UsptoXmlReader()])
    self.assertEqual(len(d.elements), 112)
def test_document_usage(self):
    """Test AcsHtmlReader used via Document.from_file."""
    fname = 'acs.jmedchem.6b00723.html'
    path = os.path.join(os.path.dirname(__file__), 'data', 'acs', fname)
    # Use a context manager so the handle is closed even on failure
    # (the original opened the file and never closed it).
    with io.open(path, 'rb') as f:
        d = Document.from_file(f, readers=[AcsHtmlReader()])
    self.assertEqual(len(d.elements), 198)
def extract_chemdata(self, text):
    """Extract chemical name lists from raw text.

    Args:
        text: plain text to run ChemDataExtractor over.

    Returns:
        A list where each entry is the ``names`` list of one serialized
        chemical record; records without names are skipped.
    """
    doc = Document(text)
    # NOTE: the original also computed `doc.cems` into an unused local,
    # which triggers entity recognition for nothing — removed.
    materials = []
    for chem in doc.records.serialize():
        # Membership test on the dict directly; `.keys()` was redundant.
        if 'names' in chem:
            materials.append(chem["names"])
    return materials
def get_sentence(self, elem, para_id_prefix, start_para_idx, specials, refs, sec_title=''):
    """Tokenise a parsed HTML element into per-sentence annotation records.

    Paragraph ids are built as ``<prefix>_para_<n>`` starting from
    *start_para_idx*; the counter value after the last paragraph is
    returned together with the records so callers can continue numbering.
    """
    parsed = self.rsc_html_reader._parse_element(elem,
                                                 specials=specials,
                                                 refs=refs)
    document = Document(*parsed)
    collected = []
    idx = start_para_idx
    for para in document.paragraphs:
        for sent in para.sentences:
            offset = sent.start
            words = [t.text for t in sent.tokens]
            starts = [t.start - offset for t in sent.tokens]
            ends = [t.end - offset for t in sent.tokens]
            entities = [[c.text, c.start - offset, c.end - offset]
                        for c in sent.cems]
            collected.append({
                'section_title': sec_title,
                'para_id': para_id_prefix + '_para_' + str(idx),
                'sent': sent.text,
                'token_pos': list(zip(words, starts, ends, sent.pos_tags)),
                'chemical_entity': entities,
            })
        idx += 1
    return idx, collected
def parse(self, html_file):
    """Parse an RSC HTML article into a JSON file of sentence-level records.

    Writes ``<html_file basename>.json`` with uid/publisher/title/year/author
    metadata plus tokenised abstract, body text, and figure captions, and
    returns the article DOI (or None on a page-load-error document).

    TODO: clean body texts. 02-11-2020
    Unlike other XML files, tags for body texts are not quite consistent.
    For now, use CDE's reader to get body texts, and they have not only body
    texts but other preceding texts such as abstract. CDE's scraper can only
    body texts (scrape.paragraphs), but they are pure strings unlike Sentence
    instances.
    -> Exclude abstract and its preceding text from body text by last
    sentence of abstract in body text. 02-18-2020
    """
    htmlstring = open(html_file).read()
    '''
    Remove encoding declaration since it causes the following error when Selector reads the string.
    -> ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.
    '''
    htmlstring = re.sub(r'<\?xml.*\?>', '', htmlstring)
    tree = etree.parse(html_file, self.html_parser)
    root = tree.getroot()
    # clean xml and extract essential elements.
    specials, refs = self.rsc_html_reader.preprocess(root)
    document = html.fromstring(htmlstring)
    # this title is only used to filter out the following error.
    # The title from scrape below is used for the JSON file.
    title = document.findtext('.//title')
    if title.strip() == 'RSC - Page load error':  # e.g., 101039c1jm11358e.html
        logger.error('RSC - Page load error')
        return None

    # --- Abstract ---
    abstract_element = document.find_class("abstract")
    abstract = []
    start_para_idx = 1
    for abs in abstract_element:
        para_id_prefix = 'abs'
        start_para_idx, sents = self.get_sentence(abs, para_id_prefix,
                                                  start_para_idx, specials, refs)
        abstract.extend(sents)

    ''' Body Text '''
    f = open(html_file, 'rb')
    doc = Document.from_file(f, readers=[self.rsc_html_reader])
    body_text = []
    sec_title = ''
    para_id_prefix = 'body'
    para_idx = 1
    for elem in doc.elements:
        if isinstance(elem, Heading):
            # Remember the most recent heading as the section title for
            # the paragraphs that follow it.
            sec_title = elem.text
        elif isinstance(elem, Paragraph):
            for sent in elem.sentences:
                token = []
                start = []
                end = []
                for tok in sent.tokens:
                    token.append(tok.text)
                    start.append(tok.start - sent.start)
                    end.append(tok.end - sent.start)
                pos = sent.pos_tags
                cems = []
                for cem in sent.cems:
                    cems.append([
                        cem.text, cem.start - sent.start, cem.end - sent.start
                    ])
                body_text.append({
                    'section_title': sec_title,
                    'para_id': para_id_prefix + '_para_' + str(para_idx),
                    'sent': sent.text,
                    'token_pos': list(zip(token, start, end, pos)),
                    'chemical_entity': cems
                })
            para_idx += 1

    # Exclude abstract and its preceding text from body text. 02-18-2020
    cut_off = -1
    #if len(abstract) != 0 and len(body_text) != 0 and all(elem in body_text for elem in abstract): # Sometimes, abstract and body have different whitespaces. e.g., 101039c005501h.json
    if len(abstract) != 0 and len(body_text) != 0:
        if len(abstract) < 3:  # debugging
            print('Abstract is a single sentence!!')
        for idx in range(len(body_text)):
            # compare only sent and remove leading and trailing whitespaces.
            # Sometimes, abstract and body have different whitespaces.
            # e.g., 101039c005501h.json
            # also compare preceding two sentences of the last one to increase
            # accuracy. Some abstracts are a single sentence.
            # e.g., 101039c2cp23070d.html
            #if abstract[-1]['sent'].strip() == body_text[idx]['sent'].strip() and abstract[-2]['sent'].strip() == body_text[idx-1]['sent'].strip() and abstract[-3]['sent'].strip() == body_text[idx-2]['sent'].strip():
            if len(re.sub(r"[^a-zA-Z]", '', abstract[-1]['sent'])) > 0:  # ignore sents having non-alphabets such as '.', '\n'
                if re.sub(r'\s+', '', abstract[-1]['sent']) == re.sub(
                        r'\s+', '', body_text[idx]['sent']):
                    cut_off = idx + 1
                    break
    if cut_off != -1:
        body_text = body_text[cut_off:]

    ''' Figures '''
    sel = Selector.from_text(htmlstring)
    scrape = RscHtmlDocument(sel)
    figures = []
    for fig in scrape.figures:
        id = fig.reference if fig.reference is not None else fig.label
        label = fig.label
        if id is None:  # e.g., 101039b918103b.html has an image having only url information.
            print('figure id is none.')
            continue
        fig_file = html_file.rsplit('/', 1)[0] + '/' + fig.url.rsplit('/', 1)[1]
        caption = []
        #cap = Text(fig.caption)
        #print(cap.sentences)
        if fig.caption is not None:
            for sent in Text(fig.caption):
                token = []
                start = []  # start offset
                end = []  # end offset
                for tok in sent.tokens:
                    token.append(tok.text)
                    start.append(tok.start - sent.start)
                    end.append(tok.end - sent.start)
                pos = sent.pos_tags
                cems = []
                for cem in sent.cems:
                    cems.append([
                        cem.text, cem.start - sent.start, cem.end - sent.start
                    ])
                caption.append({
                    'sent': sent.text,
                    'token_pos': list(zip(token, start, end, pos)),
                    'chemical_entity': cems
                })
        figures.append({
            'fig_id': id,
            'label': label,
            'caption': caption,
            'fig_file': fig_file
        })

    # --- Assemble the output record ---
    data = {}
    data['uid'] = scrape.doi
    data['publisher'] = scrape.publisher + (
        ' - ' + scrape.journal if scrape.journal is not None else '')
    data['type'] = 'journal' if scrape.journal is not None else ''
    data['title'] = scrape.title
    data['year'] = ''
    if scrape.published_date is not None:
        data['year'] = scrape.published_date.strftime("%Y")
    elif scrape.online_date is not None:
        data['year'] = scrape.online_date.strftime("%Y")
    data['author'] = scrape.authors
    data['keywords'] = []
    data['abstract'] = abstract
    data['body_text'] = body_text
    data['figures'] = figures

    # debug
    '''
    if data['year'] == '':
        print('year is unknown.')`
        input('enter')
    if data['type'] == '':
        print('journal is unknown!!') # E.g., 101039c5md00579e.html, 101039c5md00579e.html has no journal value and only abstract.
        input('enter')
    '''

    # write data to file
    output_filename = html_file.replace('.html', '.json')
    if output_filename == html_file:
        logger.error('>> HTML file does NOT exist!!')
        sys.exit()
    with open(output_filename, 'w') as outfile:
        json.dump(data, outfile)
    return scrape.doi
def parse(self, xml_file):
    """Parse an IOP article XML file.

    Extracts the title, DOI, publication year, authors, and the abstract
    (per-sentence token/POS/chemical-entity records).

    Returns:
        ``[metadata, copyright_text]`` — the metadata dict and the copyright
        string (the latter is used elsewhere to strip the copyright text from
        text extracted from the PDF).
    """
    # Document encoding is ISO-8859-1, and if resolve_entities is set to
    # True then an XMLSyntaxError occurs, so entity resolution is disabled.
    xml_parser = etree.XMLParser(encoding='ISO-8859-1', resolve_entities=False)
    tree = etree.parse(xml_file, xml_parser)

    # debug: flag files whose declared encoding is not the expected one.
    docinfo = tree.docinfo
    encoding_info = docinfo.encoding
    if encoding_info != 'ISO-8859-1':
        print(encoding_info)
        input("Press Enter to continue...")

    root = tree.getroot()
    title = root.find('.//title_full')
    # retrieve the original markup to show in the TDM webpage.
    title = etree.tostring(title).decode("utf-8")
    title = title.split(">", 1)[1]  # TODO: find a better way to remove the top tag.
    title = title.rsplit("</", 1)[0]
    doi = root.findtext('.//doi')

    # year checking priority: (1) date_history[epub] (2) date_cover (3) date_online[header]
    year = None
    year_elem = root.find('.//date_history')
    if year_elem is not None:
        year = year_elem.get('epub')
        if year is not None:
            year = year.split('-')[0]
    if year is None:
        year_elem = root.find('.//date_cover')
        if year_elem is not None:
            year = root.findtext('.//date_cover')
            year = year.split('-')[0]
    if year is None:
        year_elem = root.find('.//date_online')
        if year_elem is not None:
            year = year_elem.get('header')
            if year is not None:
                year = year.split('-')[0]

    authors = []
    for author in root.findall('.//author'):  # not all article has <author_granular> tag
        authors.append(author.text)

    abstract = []
    abstract_element = root.find('.//header_text')  # not all article has [heading="Abstract"]
    if abstract_element is not None:
        # TODO: Fix the CDE related error - AttributeError:
        # 'cython_function_or_method' object has no attribute 'lower'
        # e.g., jopt13_9_090201.pdf, jopt13_11_114001.pdf
        try:
            elements = self.nxml_reader._parse_element(abstract_element)
            doc = Document(*elements)
            for para in doc.paragraphs:
                for sent in para.sentences:
                    token = []
                    start = []
                    end = []
                    for tok in sent.tokens:
                        token.append(tok.text)
                        start.append(tok.start - sent.start)
                        end.append(tok.end - sent.start)
                    pos = sent.pos_tags
                    cems = []
                    for cem in sent.cems:
                        cems.append([
                            cem.text, cem.start - sent.start,
                            cem.end - sent.start
                        ])
                    abstract.append({
                        'sent': sent.text,
                        'token_pos': list(zip(token, start, end, pos)),
                        'chemical_entity': cems
                    })
        # Was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; narrowed to Exception.
        except Exception:
            print('>> Error:', xml_file)

    # this will be used to remove copyright text from the text extracted from the PDF.
    copyright_text = root.findtext('.//copyright_text')
    if doi is None:
        input("Press Enter to continue...")

    logger.debug(
        f'\n>>> Encoding: {encoding_info}\n'
        f'>>> Title: {title}\n'
        f'>>> Year: {year}\n'
        f'>>> UID: {doi}\n'
        f'>>> Authors: {authors}\n'
        f'>>> Abstract:\n{abstract}\n'
        f'>>> CopyRight:\n{copyright_text}\n')

    metadata = {}
    metadata['uid'] = doi
    metadata['publisher'] = 'IOP'
    metadata['type'] = 'journal-article'  # TODO: exclude non journal articles.
    metadata['title'] = title
    metadata['year'] = year
    metadata['author'] = authors
    metadata['abstract'] = abstract
    return [metadata, copyright_text]
# NOTE(review): this chunk is truncated — the body of the trailing
# `for t in d.tables:` loop (and the `except` for the open `try:`) is not
# visible here, so the code below cannot run as-is.
writer.writerow([
    "number", "Source", "compound_from_rowheader", 'compound_from_caption',
    'refractive_index_value', 'row_headers', 'specifier', 'caption',
    'wavelength_from_caption', 'wavelength_from_headers'
])
s = 'the refractive index is measured at 485 nm'
#print (get_wavelength_fromcaption(s))
count = 0
if True:
    # Iterate over the numbered Elsevier XML dump files.
    for i in range(0, 168999):
        path = r'F:\el_refractive_index_volumn_2000-2020\{}.xml'.format(i)
        try:
            f = open(path, 'rb')
            d = Document.from_file(f)
            DOI = str(d.metadata.serialize())
            # f = open(path, 'rb')
            # f1 = open(path, 'rb').read()
            # d = Document.from_file(f)
            # root = ET.fromstring(f1)
            # Journal = 'None'
            # DOI = 'None'
            # for child in root:
            #     for cchild in child:
            #         if cchild.tag == '{http://prismstandard.org/namespaces/basic/2.0/}publicationName':
            #             Journal = cchild.text[:]
            #         elif cchild.tag == '{http://prismstandard.org/namespaces/basic/2.0/}doi':
            #             DOI = cchild.text[:]
            for t in d.tables:
from chemdataextractor.doc import Document, Heading, Paragraph
from chemdataextractor.scrape import Selector
from chemdataextractor.scrape.pub.rsc import RscHtmlDocument
from chemdataextractor.reader import AcsHtmlReader, RscHtmlReader, PdfReader
import os
import sys
import csv

# Parse the input document with ChemDataExtractor.
with open('file_name', 'rb') as file:
    doc = Document.from_file(file)

# initialise with an empty dictionary
compoundInfo = {}

# Produce the list of dictionaries
doc_records = doc.records.serialize()

# filter to only ratio information
ratio_doc_records = [record for record in doc_records if 'ratio' in record]

# using a loop extract the ratio information within ratio_doc_records
i = 0
for i in range(len(ratio_doc_records)):
    for key, value in ratio_doc_records[i].items():
        compoundInfo[key] = value
        # Only allow Name and Ratio information, don't show any other attributes
        if (key == 'nmr_spectra' or key == 'ir_spectra'
                or key == 'melting_points' or key == 'labels'
                or key == 'roles'):
            del compoundInfo[key]

# Open a new CSV file and append this information
# NOTE(review): this chunk is truncated — the body of the `with` statement
# below is not visible here, so the code cannot run as-is.
with open('csv_filename', 'a', newline='') as f:
def annotate(doi_pmid, text):
    """Annotate one literature text and append the result to *csv_file*.

    Splits *text* into sentences, tags parts of speech, finds chemical
    entity spans (with SMILES via ``get_smiles``) and enzyme-like names
    (regex ``...ase`` words), then writes one row per sentence to the CSV.

    Relies on module-level state: ``count``, ``t0``, ``start_time``,
    ``out_name``, ``text_files``, ``smiles_cache``, ``cache_name``,
    ``csv_file``, ``cpt`` and the helpers ``get_smiles`` / ``igem``.
    """
    global count
    global t0
    t1 = time.time()
    # Every 10th document, append a progress report to the log file and
    # persist the SMILES cache.
    if (count % 10 == 0):
        with open("{}.log".format(out_name), "a") as f:
            f.write("\n")
            f.write("{} out of {} completed\n".format(count,
                                                      len(text_files.keys())))
            f.write("elapsed time: " + str(time.time() - start_time) + "\n")
        igem.save_json(cache_name, smiles_cache)
        print()
        print("{} out of {} completed".format(count, len(text_files.keys())))
        print(t1 - t0)
    t0 = t1
    try:
        sentences = [
            p.sentences for p in Document.from_string(text.encode())
            if hasattr(p, 'sentences')
        ]  # this has character-based indices
    except:
        sentences = [[]]
    sentence_found = []
    starts = []
    ends = []
    indices = []
    tagged = []
    chemicals_found = []
    bio_entities = []
    bio_entities_with_pos = []
    names_found = []
    smiles_found = []
    names_and_smiles = []
    sentences = sentences[0]  # weird nesting from CDE, do not change
    tot = time.time()
    times = 0
    span_total = 0
    successful_spans = 0
    for i in range(len(sentences)):  #TODO: change this to all sentences
        s = sentences[i]
        t_s_0 = time.time()
        # Part of Speech Tagger (used later for NLP)
        try:
            pos = (s.pos_tagged_tokens)
        except Exception as e:
            pos = cpt.tag(s.split())
        spans = s.cems  # generating here for enzyme finding
        span_names = [c.text for c in spans]
        # Enzymes in sentence (using regex)
        # attempt to get full enzyme names:
        enzyme_names = []
        enzyme_names_locs = []
        for i_w in range(len(pos)):
            word = pos[i_w][0]
            for m in re.finditer(r'[a-zA-Z]+ase\b', word):
                enzyme = m.group(0)
                # Walk left from the match, prepending chemical-span words
                # until a non-punctuation, non-chemical token is hit.
                # NOTE(review): the walk starts at i_w, so the first token
                # examined is the matched word itself — confirm intended.
                i_l = i_w
                while i_l > 0:
                    prev_word = pos[i_l][0]
                    prev_pos = pos[i_l][1]
                    if prev_word in span_names:
                        enzyme = prev_word + " " + enzyme
                    elif prev_pos not in ":;{}|,./<>?!":
                        break
                    i_l -= 1
                enzyme_names.append(enzyme)
                enzyme_names_locs.append((enzyme, i_l, i_w))
        spans_sent = []
        smiles_sent = []
        names_sent = []
        names_smiles_sent = []
        for r in range(len(spans)):
            span = spans[r]
            c = span.text
            # Tries to get smiles on entire string, then if it doesn't work,
            # deals with the case where c is a conglomerate of chemicals
            # separated by spaces.
            name_smiles_tuples = get_smiles(s, c)
            print(name_smiles_tuples)
            print()
            # Ignore chemical if not found
            if not name_smiles_tuples or (len(name_smiles_tuples) == 1
                                          and not name_smiles_tuples[0][0]):
                continue
            successful_spans += len(name_smiles_tuples)
            for name, smiles in name_smiles_tuples:
                if name:
                    span_dict = {
                        "text": name,
                        "start": span.start,
                        "end": span.end,
                        "smiles": smiles
                    }
                    # Indexing through pos tokens to find chemical entities
                    p = 0
                    while p < len(pos):
                        token = pos[p][0]
                        if token == span.text:
                            span_dict["pos"] = pos[p][1]
                            break
                        p += 1
                    spans_sent.append(span_dict)
                    names_sent.append(name)
                    smiles_sent.append(smiles)
                    names_smiles_sent.append((name, smiles))
        # Leave for loop and add entries for each sentence in a given
        # literature to lists
        sentence_found.append(s.text)
        chemicals_found.append(spans_sent)
        names_found.append(
            ", ".join(names_sent)
        )  # two commas and a space for redundancy, since IUPAC has commas
        smiles_found.append(", ".join(smiles_sent))
        names_and_smiles.append(names_smiles_sent)
        starts.append(s.start)
        ends.append(s.end)
        indices.append(i)
        bio_entities.append(", ".join(enzyme_names))
        bio_entities_with_pos.append(enzyme_names_locs)
        tagged.append(pos)
        if len(spans) > 0:
            times += time.time() - t_s_0
            span_total += len(spans)
            #print(time.time()-t_s_0)
    # Create a dataframe with annotations from a given literature.
    print()
    print("Average time per span (one identified chemical entity): " +
          str(times / (span_total + 0.01)))
    t_an = time.time()
    print("Time for all sentences in text: " + str(t_an - tot))
    print("Successfully classified span percent in paper: " +
          str(successful_spans / (span_total + 0.01)))
    # put all lists into a dictionary and coerce to dataframe! good riddance
    annotations = {
        "sentence": sentence_found,
        "start": starts,
        "end": ends,
        "indices": indices,
        "sentence_pos": tagged,
        "enzymes": bio_entities,
        "enzyme_locations": bio_entities_with_pos,
        "chemical_entities_full": chemicals_found,
        "chemical_names": names_found,
        "chemical_smiles": smiles_found,
        "name_smile_tuples": names_and_smiles
    }
    annots_csv = pd.DataFrame(annotations)
    annots_csv["lit_id"] = doi_pmid
    # Reorder our dataframe.
    annots_csv = annots_csv[[
        "lit_id", "indices", "start", "end", "sentence", "sentence_pos",
        "enzymes", "enzyme_locations", "chemical_entities_full",
        "chemical_names", "chemical_smiles", "name_smile_tuples"
    ]]
    # Add the datagram to our csv_file, appending if it exists and creating
    # a new one if not.
    if os.path.isfile(csv_file):
        annots_csv.to_csv(csv_file, mode='a', header=False, index=False)
    else:
        annots_csv.to_csv(csv_file, index=False)