def list_of_cems(self):
    """Return a parse element for a list of chemical entity mentions.

    Matches phrases of the form "cem, cem, cem and cem": a first CEM, any
    number of delimiters / further CEMs / bare integers, a hidden
    conjunction ('and', 'or' or 'to'), and a final CEM.  Either the first or
    the last CEM may be followed by a hidden bracketed integer, and the list
    may end with the word 'compounds' or 'samples'.

    Returns:
        The grouped parse element, named ``cem_list``.
    """
    # Raw strings keep the regex escapes (\d) from being read as (invalid)
    # Python string escapes.
    return Group(
        self.single_cem
        + Optional(lbrct + R(r'^\d+$') + rbrct).hide()
        + ZeroOrMore(delim.hide() | self.single_cem | R(r'^\d+$'))
        + (I('and') | I('or') | I('to')).hide()
        + self.single_cem
        + Optional(lbrct + R(r'^\d+$') + rbrct).hide()
        + Optional(I('compounds') | I('samples'))
    )('cem_list')
def value_phrase(self):
    """Return the parse element that recognises a numeric value or range.

    The element accepts, in order of preference, a number written with a
    trailing ``i`` component (``ivalue``), a range (``value_range``) or a
    single number (``value_single``).  The match is named ``raw_value``, may
    be wrapped in hidden brackets, and is rejected when the next token is
    one of the unit-like strings listed in the ``Not(...)`` guard.

    All regex literals are raw strings so that regex escapes such as
    backslash-d are not interpreted as (invalid) Python string escapes.
    """
    # Optionally signed integer or decimal held in a single token.
    number = R(r'^[\+\-–−]?\d+(\.\d+)?$')
    # Two numbers joined by a dash/tilde inside one token, e.g. "10-20".
    joined_range = R(r'^[\+\-–−]?\d+(\.\d+)?[\-–−~∼˜]\d+(\.\d+)?$')('raw_value').add_action(merge)
    # Number followed either by a separator token and a number, or directly
    # by a second number.
    spaced_range = (number + (R(r'^[\-–−~∼˜]$') + number | number))('raw_value').add_action(merge)
    # "<number> to <number>".
    to_range = (number + I('to') + number)('raw_value').add_action(join)
    # "<number> ± <number>"; note this one is named 'value', not 'raw_value'.
    plusminus_range = (number + R('±') + number)('value').add_action(join)
    value_range = (
        Optional(R(r'^[\-–−]$'))
        + (plusminus_range | joined_range | spaced_range | to_range)
    )('raw_value').add_action(merge)
    # Single number with optional approximate/comparison marker and sign.
    value_single = (
        Optional(R(r'^[~∼˜\<\>]$')) + Optional(R(r'^[\-–−]$')) + number
    )('raw_value').add_action(merge)
    # Token ending in "i" (presumably the imaginary part of a complex
    # number -- TODO confirm against callers).
    inumber = (R(r'\d*\.?\d*[i]$')).add_action(join)
    # inumber = R('^([-+]?(\d+\.?\d*|\d*\.?\d+)([Ee][-+]?[0-2]?\d{1,2})?[r]?|[-+]?((\d+\.?\d*|\d*\.?\d+)([Ee][-+]?[0-2]?\d{1,2})?)?[i]|[-+]?(\d+\.?\d*|\d*\.?\d+)([Ee][-+]?[0-2]?\d{1,2})?[r]?[-+]((\d+\.?\d*|\d*\.?\d+)([Ee][-+]?[0-2]?\d{1,2})?)?[i])$')
    ivalue = (R(r'\d*\.?\d*$') + R(r'^[\+\-–−]?') + inumber).add_action(join)
    value = (
        Optional(lbrct).hide()
        + (ivalue | value_range | value_single)('raw_value')
        # Reject the number when it is immediately followed by one of these
        # tokens.
        + Not(I('wt%') | I('vol%') | I('K') | I('times') | I('GPa') | I('wt') | I('vol')
              | I('%') | I('nm') | I('zF') | W('°') | W('KV') | W('kV') | W('MV')
              | I('kHz') | I('Hz') | I('GHz') | W('V') | W('J') | W('eV') | I('MHz'))
        + Optional(rbrct).hide()
    )
    return value
def specifier_value_cem(self):
    """Return the phrase element for "<specifier and value> ... <cem>".

    The specifier/value pair comes first (optionally preceded by 'below' or
    'at'), followed by hidden linking words and finally the chemical entity
    phrase.  The whole match is named ``root_phrase``.
    """
    position = Optional(I('below') | I('at'))
    spec_and_value = self.specifier_and_value
    # "has been found for" / "was observed|determined|measured|calculated"
    reporting_verb = Optional(
        (I('has') + I('been') + I('found') + I('for'))
        | (I('was') + (I('observed') | I('determined') | I('measured') | I('calculated')))
    ).hide()
    preposition = Optional(I('in') | I('for') | I('of')).hide()
    article = Optional(I('the')).hide()
    punctuation = Optional(R('^[:;,]$')).hide()
    material_form = Optional(
        I('bulk') | I('powdered') | I('doped') | I('full') | (I('thin') + I('film'))
    ).hide()
    closing_bracket = Optional(rbrct)

    phrase = (
        position
        + spec_and_value
        + reporting_verb
        + preposition
        + article
        + punctuation
        + material_form
        + closing_bracket
        + self.cem_phrase
    )
    return phrase('root_phrase')
def cem_value_specifier(self):
    """Return the phrase element for "<cem> ... <value> ... <specifier>".

    The chemical entity phrase comes first, then a run of hidden optional
    linking words, the value phrase, more hidden linking words and finally
    the specifier phrase.  The whole match is named ``root_phrase``.
    """
    return (
        self.cem_phrase
        # "is/was/were [reported|found|calculated|measured|shown] [to]"
        + Optional(
            (I('is') | I('was') | I('were'))
            # 'calculate' is kept for backward compatibility; 'calculated'
            # is the form the sibling phrases use.
            + Optional(I('reported') | I('found') | I('calculate') | I('calculated')
                       | I('measured') | I('shown'))
            + Optional(I('to'))
        ).hide()
        + Optional(
            I('exhibit') | I('exhibits') | I('exhibiting') | R('^show[s]*$')
            | I('demonstrates') | I('undergoes') | I('have') | I('has')
            | I('having') | I('determined') | I('with')
        ).hide()
        + Optional(I('the') | I('a') | I('an')).hide()
        + Optional(I('value') | I('values')).hide()
        + Optional(I('varies') + I('from')).hide()
        + Optional(
            W('=') | W('~') | W('≈') | W('≃') | I('was') | I('is') | I('at')
            | I('as') | I('near') | I('above') | I('below')
        ).hide()
        + Optional(I('in') + I('the') + I('range') | I('ranging')).hide()
        + Optional(
            I('of') | I('about') | I('from') | I('approximately') | I('around')
            | (I('high') + I('as'))
            # '+' binds tighter than '|', so the alternatives must be
            # parenthesised for "higher than" to match as a unit.
            | ((I('higher') | I('lower')) + I('than'))
        ).hide()
        + self.value_phrase
        + Optional(I('as') | I('of') | I('for')).hide()
        + Optional(I('its') | I('their') | I('the')).hide()
        + self.specifier_phrase
    )('root_phrase')
def cem_specifier_value(self):
    """Return the phrase element for "<cem> ... <specifier and value>".

    The chemical entity phrase comes first, followed by optional linking
    words (mostly hidden) and then the specifier/value pair, optionally
    wrapped in brackets.  The whole match is named ``root_phrase``.
    """
    compound = self.cem_phrase
    separator = Optional(delim).hide()
    sample_noun = Optional(I('samples') | I('system') | I('systems') | I('sample'))
    connective = Optional(
        I('that') | I('which') | I('was') | I('since') | I('the')
    ).hide()
    adverb = Optional(I('typically')).hide()
    verb = Optional(
        I('exhibits') | I('exhibiting') | R('^show[s]*$') | I('demonstrates')
        | I('undergoes') | I('has') | I('having') | I('determined') | I('with')
        | I('where') | I('orders')
        | (I('is') + Optional(I('classified') + I('as')))
    ).hide()
    reported_to_have = Optional(I('reported') + I('to') + self.have).hide()
    opening_bracket = Optional(lbrct).hide()
    spec_and_value = self.specifier_and_value
    closing_bracket = Optional(rbrct)

    phrase = (
        compound
        + separator
        + sample_noun
        + connective
        + adverb
        + verb
        + reported_to_have
        + opening_bracket
        + spec_and_value
        + closing_bracket
    )
    return phrase('root_phrase')
def prefix(self):
    """Specifier and prefix.

    Builds the element that matches the specifier phrase followed by the
    filler words, punctuation and comparison symbols that may appear before
    a value.  Most filler is hidden; the ``join`` action is attached to the
    combined element.
    """
    # The pieces are listed in matching order and then concatenated strictly
    # left to right, so the resulting element nests exactly like one long
    # ``a + b + c + ...`` expression.
    parts = [
        self.specifier_phrase,
        Optional(I('values')).hide(),
        Optional(delim).hide(),
        Optional((I('varies') + I('from')) | R('^increase(s|d)?') | I('falls') | I('reaches')).hide(),
        Optional(I('steeply')).hide(),
        Optional(I('recorded') | I('reported')).hide(),
        Optional(I('of') | I('was') | I('is') | I('at') | I('near') | I('above') | I('below')
                 | I('with') | I('to') | I('were') | I('a')).hide(),
        Optional(I('reported') | I('determined') | I('estimated') | I('found') | I('occurs')).hide(),
        Optional(I('temperatures')).hide(),
        Optional(I('as') | (I('to') + I('be'))).hide(),
        Optional(I('in') + I('the') + I('range')).hide(),
        # Not hidden in the original either.
        Optional(I('as') + I('high') + I('as')),
        Optional(I('ranging') + I('from')).hide(),
        Optional(I('of')).hide(),
        Optional(I('rather') | I('quite')).hide(),
        Optional(I('high') | I('low') | I('maximum') | I('minimum')).hide(),
        Optional(I('the')).hide(),
        Optional(delim | lbrct | rbrct),
        Optional(
            I('of') | I('about') | I('approximately') | I('typically') | I('ca.')
            | I('around') | I('at') | I('above') | I('below') | I('high') | I('low')
            | ((I('higher') | I('lower') | I('more') | I('less')) + I('than'))
            | I('order') | (I('for') + I('instance')) | (I('up') + I('to'))
            | I('reaching') | I('value')
        ).hide(),
        Optional(I('a') | I('an') | I('as')).hide(),
        Optional(I('maximum')).hide(),
        Optional(I('of')).hide(),
        ZeroOrMore(lbrct | delim | rbrct),
        Optional(self.specifier_phrase),
        Optional(I('of')).hide(),
        Optional(I('the')).hide(),
        Optional(I('order')).hide(),
        Optional((I('up') | I('equal')) + I('to')).hide(),
        Optional(I('of')).hide(),
        ZeroOrMore(lbrct | delim | rbrct),
        Optional(W('=') | W('~') | W('≈') | W('≃') | W('>') | W('<')).hide(),
        ZeroOrMore(lbrct | delim | rbrct).hide(),
    ]

    phrase = parts[0]
    for part in parts[1:]:
        phrase = phrase + part
    return phrase.add_action(join)
class Obstructions:
    """Optional filler tokens that may appear between a prefix and a value.

    Every element is wrapped in ``Optional``, so ``all`` matches whether or
    not any of the fillers is present.
    """
    # Regex literals are raw strings so the backslashes reach the regex
    # engine instead of being parsed as (invalid) Python string escapes.
    bracket = Optional(R(r'\('))   # opening round bracket
    curlLine = Optional(R(r'\~'))  # tilde
    of = Optional(W(u'of'))        # the exact word "of"
    hyphen = Optional(R(r'\-'))    # hyphen
    # NOTE: ``all`` shadows the builtin inside this class body; the name is
    # kept because other code refers to ``Obstructions.all``.
    all = bracket + curlLine + of + hyphen
class MpRegex:
    """Parse elements for a melting-point phrase.

    ``mp`` is: hidden prefix, the optional ``Obstructions.all`` fillers, a
    numeric value and optional units.
    """
    # "m.p." / "mp" (case-insensitive, dots optional) or "melting point";
    # hidden so it does not appear in the parse result.
    prefix = (R(r'^m\.?p\.?$', re.I) | I(u'melting') + I(u'point')).hide()
    # Optional single-letter unit C, F or K, with an optional trailing dot.
    units = Optional(R(r'^[CFK]\.?$'))(u'units').add_action(merge)
    # Integer or decimal number.
    value = R(r'^\d+(\.\d+)?$')(u'value')
    mp = (prefix + Obstructions.all + value + units)(u'mp')
class BpRegex:
    """Parse elements for a boiling-point phrase.

    ``bp`` is: hidden prefix, a numeric value and units.
    """
    # "b.p." / "bp" (case-insensitive, dots optional) or "boiling point";
    # hidden so it does not appear in the parse result.
    prefix = (R(r'^b\.?p\.?$', re.I) | I(u'boiling') + I(u'point')).hide()
    # A degree sign, optionally followed by C, F or K (with optional dot);
    # the tokens are merged into one units string.
    units = (W(u'°') + Optional(R(r'^[CFK]\.?$')))(u'units').add_action(merge)
    # Integer or decimal number.
    value = R(r'^\d+(\.\d+)?$')(u'value')
    bp = (prefix + value + units)(u'bp')
#from chemdataextractor.doc import Sentence
from chemdataextractor.parse import R, I, W, Optional, merge, join
from chemdataextractor.parse.base import BaseSentenceParser
from chemdataextractor.utils import first

# A decimal or integer number followed by a frequency unit token.
# Raw strings keep the regex escapes from being parsed as Python escapes.
frequency_value = (
    (R(r"^\d+?\.\d+?$") | R(r"^\d+?$"))
    + (W('kHz') | W('MHz') | W('GHz') | W('Hz'))
)('frequencyvalue').add_action(join)


class DielectricConstantFrequencyParser(BaseSentenceParser):
    """Sentence parser that extracts a frequency value with its unit."""

    root = frequency_value

    def interpret(self, result, start, end):
        """Yield one model instance built from the parsed frequency text.

        Args:
            result: Parse result queried with XPath for ``frequencyvalue``.
            start: Unused here.
            end: Unused here.
        """
        raw_value = first(result.xpath('//frequencyvalue/text()'))
        frequency = self.model(frequency=raw_value)
        yield frequency
#from chemdataextractor.doc import Sentence
from chemdataextractor.parse import R, I, W, Optional, merge, join
from chemdataextractor.parse.base import BaseSentenceParser
from chemdataextractor.utils import first

# Integer "nm", decimal "μm" or decimal "nm" values.
# Raw strings keep the regex escapes from being parsed as Python escapes.
wavelength_value = (
    (R(r"^\d+?$") + W('nm'))
    | (R(r"^\d+?\.\d+?$") + W('μm'))
    | (R(r"^\d+?\.\d+?$") + W('nm'))
)('wavelengthvalue').add_action(join)


class RefractiveIndexWavelengthParser(BaseSentenceParser):
    """Sentence parser that extracts a wavelength value with its unit."""

    root = wavelength_value

    def interpret(self, result, start, end):
        """Yield one model instance built from the parsed wavelength text.

        Args:
            result: Parse result queried with XPath for ``wavelengthvalue``.
            start: Unused here.
            end: Unused here.
        """
        raw_value = first(result.xpath('//wavelengthvalue/text()'))
        wavelength = self.model(wavelength=raw_value)
        yield wavelength
from chemdataextractor.doc import Paragraph, Heading, Sentence
from lxml import etree


class CurieTemperature(BaseModel):
    """Curie temperature record; every field is stored as a string."""
    specifier = StringType()
    value = StringType()
    units = StringType()


Compound.curie_temperatures = ListType(ModelType(CurieTemperature))

#%% [markdown]
# Now define parse elements that describe how to identify the entities in text. Think of these as tagging processes.

#%%
# Define a very basic entity tagger
specifier = (I('curie') + I('temperature') + Optional(lrb | delim) + Optional(R('^T(C|c)(urie)?')) + Optional(rrb) | R('^T(C|c)(urie)?'))('specifier').add_action(join)
units = (R(r'^[CFK]\.?$'))('units').add_action(merge)
# The decimal separator may be "." or ","; the previous pattern required the
# two-character sequence ".," and therefore never matched ordinary decimals.
value = (R(r'^\d+([.,]\d+)?$'))('value')

#%% [markdown]
# Note we tag each with a unique identifier that will be used later. Now let the entities in a sentence be any ordering of these (or whatever ordering you feel like). Here we specify that the value and units must coincide, but this does not have to be the case.
#
# We also define an extremely general parse phrase, this will be used to identify candidate sentences.

#%%
# Let the entities be any combination of chemical names, specifier values and units
entities = (chemical_name | specifier | value + units)

# Now create a very generic parse phrase that will match any combination of these entities
curie_temperature_phrase = (entities + OneOrMore(entities | Any()))('curie_temperature')
H = Hide Converter for ignoring the results of a parsed expression
'''

'''
R matches the token text to the regular expression
^ anchors the match at the start of the token, so the token must start with b
\ removes special meaning from the . character
? Matches either once or zero times; marks the preceding . as optional
$ Matches at the end of the token
I means the match is case-insensitive
| matches either b.p. or boiling point
.hide() I think just makes it so the prefix does not show up in the output but it still searches the document for the text
'''
prefix = (R(u'^b\.?p\.?$', re.I) | I(u'boiling') + I(u'point')).hide()

'''
W matches the degree symbol exactly
Optional means that the unit letter will be included if it is in the text
R matches the token text to the regular expression
^[CFK] matches a token that starts with C OR F OR K
'''
units = (W(u'°') + Optional(R(u'^[CFK]\.?$')))(u'units').add_action(merge)

'''
R matches the token text to a regular expression
^ matches any string that starts with a digit
\d looks for a number between 0 and 9
+ matches following digits one or more times
\. matches . symbol if it is there
() is the capturing group
import io import logging import os import unittest import numpy as np from chemdataextractor.relex.utils import mode_rows, match, KnuthMorrisPratt from chemdataextractor.doc import Sentence from chemdataextractor.relex import Relation, Entity, Phrase, Cluster from chemdataextractor.parse.cem import chemical_name from chemdataextractor.parse import R, merge logging.basicConfig(level=logging.DEBUG) log = logging.getLogger(__name__) units = (R('^[CFK]\.?$'))('units').add_action(merge) value = (R('^\d+(\.\,\d+)?$'))('value') class TestRelexUtils(unittest.TestCase): maxDiff = None def test_mode_rows(self): a = np.array([[1, 1, 1, 1], [1, 2, 3, 4], [1, 1, 1, 1], [3, 4, 5, 6]]) expected = [1, 1, 1, 1] result = list(mode_rows(a)) self.assertListEqual(result, expected) def test_match(self): s1 = Sentence('BiFeO3 with 1103 K') entities = [
footflow = StringType(contextual=True) footflowunit = StringType(contextual=True) pressure = StringType(contextual=True) presunits = StringType(contextual=True) tofvalue = StringType() yieldtest = StringType(contextual=True) selectivity = ListType(ModelType(Selectivity, contextual=True), contextual=True) Compound.conv = ListType(ModelType(Conv)) supportstring = "ZnO|YSZ|ceria|alumina|silica|SBA|ZSM|CNT|Al2O3|MgO|CeO2|TiO2|CMK|MnO|Y2O3|ZrO2|Tb4O7|HfO2|La2O3|Co3O4|ThO2|SiO2|Fe2O3|Sm2O3|Mo2C|Gd2O3|Yb2O3|CaO|CuO|NiO" symbolstring = "|Li|Be|Ne|Na|Mg|Al|Si|Cl|Ox|Ca|Sc|Ti|V|Cr|Mn|Fe|Co|Ni|Cu|Zn|Ga|Ge|As|Se|Br|Kr|Rb|Sr|Y|Zr|Nb|Mo|Tc|Ru|Rh|Pd|Ag|Cd|In|Sn|Sb|Te|Xe|Cs|Ba|La|Ce|Pr|Nd|Pm|Sm|Eu|Gd|Tb|Dy|Ho|Er|Tm|Yb|Lu|Hf|Ta|Re|Os|Ir|Pt|Au|Hg|Tl|Pb|Bi|Po|At|Rn|Fr|Ra|Ac|Th|Pa|Np|Pu|Am|Cm|Bk|Cf|Es|Fm|Md|No|Lr|Rf|Db|Sg|Bh|Hs|Mt|Ds|Rg|Cn|Nh|Fl|Mc|Lv|Ts|Og" name = (R( u"^((\d|%)*(" + symbolstring + "|" + supportstring + ")(?![a-z])(([A-Zα-ωΑ-Ω0-9\/\-,\.]|(wt.?))[a-zA-Z0-9α-ωΑ-Ω0-9\/\-,\.]*)?)") ) post = R("^([A-Z]|[0-9]|(wt.?)|" + symbolstring + u"|\/|-|\.|,|%|\(|\))+$") | T("SYM") name = (name + ZeroOrMore(post))(u"name").add_action(merge) catalyst = R( "(Catalysts?)|(precursors?)|(abb)|(sample)|(abb)|(materials?)|(composition)", re.IGNORECASE) class CatalystHeadingParser(BaseParser): root = catalyst def interpret(self, result, start, end):
class Anneal(BaseModel):
    """Full list of annealing step parameters (temperatures and times)."""
    temps = ListType(ModelType(AnnealTemp))
    times = ListType(ModelType(AnnealTime))


# Associating anneal parameters with a chemical object
Compound.anneal = ListType(ModelType(Anneal))  # currently not working

# Defining object parameters for the AnnealParser parser

# Delimiters (hidden from the parse result).
# Regex literals below are raw strings so the backslashes reach the regex
# engine instead of being parsed as (invalid) Python string escapes.
delim = R(r'^[;:,\./]$').hide()

# Defining formats for annealing temperature and units
tempprefix = (I('at') | I('or')).hide()
tempunits = (W('°') + R(r'^[CFK]\.?$'))('tempunits').add_action(merge)
# Two to four digits, optionally followed by a delimiter.
tempvalue = R(r'^\d{2,4}?$')('tempvalue').add_action(merge) + Optional(delim)

# Defining formats for annealing time and time units
timeprefix = I('for').hide()
# NOTE(review): every part of these unit patterns is optional, so they also
# accept fragments such as "in", "r" or "ours" -- confirm whether that is
# intended before tightening.
timeunits = (
    R('^s?(ec|econds)?$')
    | R('^m?(in|inute)?(s)?$')
    | R('^h?(ou)?(r)?(s)?$')
)('timeunits').add_action(join) + Optional(delim)
# Up to two digits, optionally followed by a delimiter.
timevalue = R(r'^\d{,2}$')('timevalue') + Optional(delim)

# Putting everything together
temp = (tempvalue)('temp')
Compound.solubility = ListType(ModelType(Solubility))

# In[38]:

import re
from chemdataextractor.parse import R, I, W, Optional, merge

# prefix = (R(u'^m\.?p\.?$', re.I) | I(u'melting') + I(u'point')).hide()
# "solubility", then optional linking words; all hidden from the result.
prefix = (
    (I(u'solubility')).hide()
    + Optional(W('=') | I('of') | I('was') | I('is') | I('at')).hide()
    + Optional(I('in') + I('the') + I('range') + Optional(I('of')) | I('about')).hide()
)
# delim = R(u'^[:;\.,]$')
# Raw strings keep the regex escapes from being parsed as Python escapes.
value = R(r'^\d+(\.\d+)?$')(u'value')
units = (W(u'nM') | W(u'μM') | W(u'mM') | W(u'μg') | W(u'mg'))(u'units').add_action(merge)
# NOTE(review): the optional filler pattern needs whitespace inside a single
# token; presumably tokens never contain whitespace, in which case it never
# matches -- confirm against the tokenizer.
so = (prefix + Optional(R(r'\w+\s\w+')).hide() + value + units)(u'so')
# units = (W(u'°') + Optional(R(u'^[CFK]\.?$')))(u'units').add_action(merge)
# value = R(u'^\d+(\.\d+)?$')(u'value')
# bp = (prefix + + value + units)(u'bp')

# In[39]:

from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first
def find_subset(items, target, acc=None):
    """Return a selection of *items*, in order, that sums to *target*.

    The search is depth-first: for each item it first tries taking it, then
    skipping it.

    Args:
        items: Sequence of numbers; must support slicing.
        target: Remaining sum to reach.
        acc: Items taken so far.  Defaults to a fresh empty list on every
            call (a shared ``[]`` default would be handed back to the caller
            when ``target`` is already 0).

    Returns:
        The list of chosen items, or ``None`` when no selection reaches
        the target.
    """
    if acc is None:
        acc = []
    if target == 0:
        return acc
    if len(items) == 0:
        return None
    acc_take = acc.copy()
    acc_take.append(items[0])
    take = find_subset(items[1:], target - items[0], acc_take)
    if take:
        return take
    return find_subset(items[1:], target, acc)


# Regex literals containing backslashes are raw strings so the escapes reach
# the regex engine instead of being parsed as (invalid) Python escapes.
units = (Optional("(") + Optional(R(u'°')) + R(u'[CFK℃]')).add_action(merge)(u'units')
value = (Optional(R('~')) + R(r'^\d{3,4}')).add_action(merge)(u'value')
value2 = (Optional(R('~')) + R(r'^\d{3,4}(°C|K)')).add_action(merge)(u'value')
temp1 = (value + units)(u'temp1')
temp2 = value2
temp = (temp1 | value2)(u'tempphrase')

#for catalyst name
supportstring = "ZnO|YSZ|ceria|alumina|silica|SBA|ZSM|CNT|Al2O3|MgO|CeO2|TiO2|CMK|MnO|Y2O3|ZrO2|Tb4O7|HfO2|La2O3|Co3O4|ThO2|SiO2|Fe2O3|Sm2O3|Mo2C|Gd2O3|Yb2O3|CaO|CuO|NiO"
# NOTE(review): the leading "|" creates an empty alternative in the patterns
# built below -- confirm that is intended.
symbolstring = "|Li|Be|Ne|Na|Mg|Al|Si|Cl|Ox|Ca|Sc|Ti|V|Cr|Mn|Fe|Co|Ni|Cu|Zn|Ga|Ge|As|Se|Br|Kr|Rb|Sr|Y|Zr|Nb|Mo|Tc|Ru|Rh|Pd|Ag|Cd|In|Sn|Sb|Te|Xe|Cs|Ba|La|Ce|Pr|Nd|Pm|Sm|Eu|Gd|Tb|Dy|Ho|Er|Tm|Yb|Lu|Hf|Ta|Re|Os|Ir|Pt|Au|Hg|Tl|Pb|Bi|Po|At|Rn|Fr|Ra|Ac|Th|Pa|Np|Pu|Am|Cm|Bk|Cf|Es|Fm|Md|No|Lr|Rf|Db|Sg|Bh|Hs|Mt|Ds|Rg|Cn|Nh|Fl|Mc|Lv|Ts|Og"
name = (R(
    u"^((" + symbolstring + "|" + supportstring
    + r")(?![a-z])(([A-Zα-ωΑ-Ω0-9\/\-,\.]|(wt.?))[a-zA-Z0-9α-ωΑ-Ω0-9\/\-,\.]*)?)")
)
post = R("^([A-Z]|[0-9]|(wt.?)|" + symbolstring + r"|\/|-|\.|,|%|\(|\))+$") | T("SYM")
#from chemdataextractor.model import Compound
from chemdataextractor.parse import R, I, W, Optional, merge
from chemdataextractor.parse.base import BaseSentenceParser
#from chemdataextractor.model import Compound
from chemdataextractor.parse.common import lbrct, dt, rbrct
from chemdataextractor.parse.elements import W, I, R, Optional, Any, OneOrMore, Not, ZeroOrMore, SkipTo
import re
from chemdataextractor.parse.auto import BaseAutoParser,construct_unit_element,Group,match_dimensions_of,value_element,value_element_plain
from chemdataextractor.parse.actions import merge, join
from lxml import etree
import logging

log = logging.getLogger(__name__)

# Single punctuation token: colon, semicolon, full stop or comma.
# Raw string so the escaped dot is not parsed as a Python string escape.
delim = R(r'^[:;\.,]$')


class PropertyParserTemplate(BaseAutoParser, BaseSentenceParser):
    """Template parser for QuantityModel-type structures.

    Finds Cem, Specifier, Value and Units from single sentences.
    Other entities are merged contextually.
    """

    @property
    def specifier_phrase(self):
        """Parse element of the model's specifier field, named 'specifier'."""
        return self.model.specifier.parse_expression('specifier')
from chemdataextractor.model import BaseModel, StringType, ListType, ModelType import re from chemdataextractor.parse import R, I, W, Optional, merge class Capacity(BaseModel): value = StringType() units = StringType() Compound.capacity = ListType(ModelType(Capacity)) prefix = (I(u'capacity') | I(u'CO2') + I(u'uptake')).hide() #Left the optional in because if I take it out then there is a syntax error on line 33 units = (W(u'mmol g-1') + Optional(R(u'^[CFK]\.?$')))(u'units').add_action(merge) value = R(u'^\d+(\.\d+)?$')(u'value') cp = (prefix + value + units)(u'cp') from chemdataextractor.parse.base import BaseParser from chemdataextractor.utils import first class CpParser(BaseParser): root = cp def interpret(self, result, start, end): compound = Compound(capacity=[ Capacity(value=first(result.xpath('./value/text()')), units=first(result.xpath('./units/text()')))
class SpinCoat(BaseModel):
    """Full list of spin-coating step parameters (speeds and times)."""
    #solvent = StringType(contextual=True)
    spds = ListType(ModelType(SpinSpd))
    times = ListType(ModelType(SpinTime))


# Associate the spin-coating class with a given compound. May be worth
# getting rid of for our eventual implementation, not yet sure.
Compound.spin_coat = ListType(ModelType(SpinCoat))

# Variable assignments
# Delimiters -- hidden from the parse result.
# Regex literals below are raw strings so the backslashes reach the regex
# engine instead of being parsed as (invalid) Python string escapes.
delim = R(r'^[;:,\./]$').hide()

# Defining formats for spin-coating value and units
# "rpm", "rcf" (dots optional) or "g" with an optional "x"/"×" prefix.
spdunits = (
    R(r'^r(\.)?p(\.)?m(\.)?$')
    | R(r'^r(\.)?c(\.)?f(\.)?$')
    | R(r'^([x×]?)(\s?)?g$')
)('spdunits').add_action(join) + ZeroOrMore(delim)
# A number ending in "00" (optionally with one comma group), which may be
# wrapped in hidden round brackets.
spdvalue = (
    Optional(W('(')).hide()
    + R(r'^\d+(,\d+)?[0][0]$')('spdvalue')
    + Optional(W(')')).hide()
)

# Defining formats for spin-coating time and time units
timeprefix = I('for').hide()
# NOTE(review): every part of these unit patterns is optional, so they also
# accept fragments such as "in", "r" or "ours" -- confirm whether that is
# intended before tightening.
timeunits = (
    R('^s?(ec|econds)?$')
    | R('^m?(in|inute)?(s)?$')
    | R('^h?(ou)?(r)?(s)?$')
)('timeunits').add_action(join) + Optional(delim)
timevalue = R(r'^\d{,3}$')('timevalue') + Optional(delim)  # up to 3 digits
#from chemdataextractor.doc import Sentence
from chemdataextractor.parse import R, I, W, Optional, merge, join
from chemdataextractor.parse.base import BaseSentenceParser
from chemdataextractor.utils import first

# Token starting with "0.0" followed by at least one more digit.
# The old pattern ended in a stray "]?" (an optional literal "]"); with no
# end anchor it did not change which tokens match, so it is dropped.
# NOTE(review): there is no "$" anchor, so trailing characters are accepted.
dielectriclost = R(r'^[0]\.[0][0-9]+')('dielectricloss')


class DielectricLossParser(BaseSentenceParser):
    """Sentence parser that extracts a dielectric-loss value."""

    root = dielectriclost

    def interpret(self, result, start, end):
        """Yield one model instance built from the parsed loss text.

        Args:
            result: Parse result queried with XPath for ``dielectricloss``.
            start: Unused here.
            end: Unused here.
        """
        raw_value = first(result.xpath('//dielectricloss/text()'))
        loss = self.model(dielectricloss=raw_value)
        yield loss