def test_RealPred_str(self):
    """
    Converting a RealPred back with str() should give the 'informal'
    string form, which starts with an underscore
    """
    for informal in ('_the_q', '_cat_n_1'):
        # Parse the informal form, then check that str() reproduces it
        parsed = RealPred.from_string(informal)
        self.assertEqual(str(parsed), informal)
from scipy.special import expit
from numpy import outer, zeros_like, zeros, array
from math import log
from pydmrs.components import RealPred
from utils import make_shared, is_verb

# NOTE(review): `pickle` is used below, but no import for it is visible in
# this part of the file -- confirm it is imported elsewhere.

# Number of columns in the predicate weight matrix
D = 800
# NOTE(review): C is not used in the code visible here; its meaning cannot
# be determined from this span.
C = 40
# Width of each of the two column ranges filled in below (D divided by two)
half = int(D/2)

# Load the vocabulary: an iterable of predicate strings, whose position in
# the iteration order is used as the predicate's row index.
# NOTE(review): pickle.load executes arbitrary code from the file; this is
# only safe if the file is trusted.
with open('/anfs/bigdisc/gete2/wikiwoods/core-5-vocab.pkl', 'rb') as f:
    preds = pickle.load(f)
# Predicate string -> row index
ind = {p:i for i,p in enumerate(preds)}
# The same indices, keyed by RealPred objects parsed from the strings
pred_index = {RealPred.from_string(p):i for p,i in ind.items()}

# A zero matrix with one row per predicate and D columns, passed through
# make_shared (what that wrapper returns is not visible here).
pred_wei = make_shared(zeros((len(preds), D)))

# Fill the weights from two text files: values from the nouns file go in
# columns [0, half), values from the verbs file in columns [half, 2*half).
for filename, offset in [('/anfs/bigdisc/gete2/wikiwoods/word2vec/matrix_nouns400', 0), ('/anfs/bigdisc/gete2/wikiwoods/word2vec/matrix_verbs400', half)]:
    with open(filename, 'r') as f:
        for line in f:
            # Each line is a predicate string, whitespace, then the
            # whitespace-separated vector entries
            pred, vecstr = line.strip().split(maxsplit=1)
            # NOTE(review): this builds an array of string tokens; the
            # conversion to numbers presumably happens on assignment
            # below -- confirm against what make_shared returns.
            vec = array(vecstr.split())
            # A predicate missing from the vocabulary raises KeyError here
            pred_wei[ind[pred], offset:offset+half] = vec

# Make vectors longer (av. sum 1.138 over av. 44.9 nonzero entries)
# An average entry is then 0.2, so a predicate is expit(0.2*30 - 3) = 0.95 true
pred_wei *= 8

# Path prefix of the data set (how it is used is outside this span)
DATA = '/anfs/bigdisc/gete2/wikiwoods/core-5'
def test_RealPred_from_string(self):
    """
    RealPred.from_string should build RealPreds from well-formed strings,
    with or without a trailing '_rel', and reject malformed input
    """
    # (string with '_rel', string without, expected constructor arguments)
    valid_cases = (
        # Two slots
        ('_the_q_rel', '_the_q', ('the', 'q')),
        # Three slots
        ('_cat_n_1_rel', '_cat_n_1', ('cat', 'n', '1')),
        # Intermediate underscores in lemma
        ('_nowhere_near_x_deg_rel', '_nowhere_near_x_deg',
         ('nowhere_near', 'x', 'deg')),
    )
    for rel_string, plain_string, slots in valid_cases:
        from_rel = RealPred.from_string(rel_string)
        from_plain = RealPred.from_string(plain_string)
        self.assertEqual(RealPred(*slots), from_rel)
        self.assertEqual(RealPred(*slots), from_plain)
        self.assertIsInstance(from_rel, RealPred)
        self.assertIsInstance(from_plain, RealPred)

    # Too few slots, or no leading underscore
    for malformed in ("_the_rel", "_the", "udef_q_rel"):
        with self.assertRaises(ValueError):
            RealPred.from_string(malformed)

    # Not a string
    with self.assertRaises(TypeError):
        RealPred.from_string(1)
def loads_xml(bytestring, encoding=None, cls=ListDmrs, **kwargs):
    """
    Read a single "<dmrs>...</dmrs>" element and build a DMRS graph from it
    ("<dmrslist>...</dmrslist>" is not handled yet)
    The input should be a bytestring; to pass a string, give its encoding
    The result is a ListDmrs unless another class is given as cls;
    any further keyword arguments are handed to that class's constructor
    """
    def int_or_none(element, attribute):
        # Integer value of an XML attribute, or None if it is not present
        if attribute in element.attrib:
            return int(element.get(attribute))
        return None

    if encoding:
        bytestring = bytestring.encode(encoding)
    root = ET.XML(bytestring)

    # Graph-level attributes
    dmrs = cls(**kwargs)
    dmrs.cfrom = int_or_none(root, 'cfrom')
    dmrs.cto = int_or_none(root, 'cto')
    dmrs.surface = root.get('surface')
    dmrs.ident = int_or_none(root, 'ident')
    index_id = int_or_none(root, 'index')
    top_id = None

    for child in root:
        if child.tag == 'node':
            nodeid = int_or_none(child, 'nodeid')
            cfrom = int_or_none(child, 'cfrom')
            cto = int_or_none(child, 'cto')
            surface = child.get('surface')
            base = child.get('base')
            carg = child.get('carg')
            pred = sortinfo = None

            for part in child:
                tag = part.tag
                if tag == 'sortinfo':
                    sortinfo = part.attrib
                elif tag == 'realpred':
                    lemma = part.get('lemma')
                    try:
                        pred = RealPred(lemma, part.get('pos'), part.get('sense'))
                    except PydmrsValueError:
                        # The 'lemma' attribute holds the whole pred name,
                        # instead of it being split over 'lemma', 'pos', 'sense'
                        pred = RealPred.from_string(lemma)
                        warn("RealPred given as string rather than lemma, pos, sense", PydmrsWarning)
                elif tag == 'gpred':
                    text = part.text
                    try:
                        pred = GPred.from_string(text)
                    except PydmrsValueError:
                        # The text is really a RealPred string, not a GPred
                        pred = RealPred.from_string(text)
                        warn("RealPred string found in a <gpred> tag", PydmrsWarning)
                else:
                    raise PydmrsValueError(tag)

            node = cls.Node(nodeid=nodeid, pred=pred, carg=carg,
                            sortinfo=sortinfo, cfrom=cfrom, cto=cto,
                            surface=surface, base=base)
            dmrs.add_node(node)

        elif child.tag == 'link':
            start = int(child.get('from'))
            end = int(child.get('to'))
            if start == 0:
                # A link from 0 is not stored as a link: it marks the top node
                top_id = end
                continue

            # Text of the two permitted sub-elements, None when absent
            labels = {'rargname': None, 'post': None}
            for part in child:
                if part.tag not in labels:
                    raise PydmrsValueError(part.tag)
                labels[part.tag] = part.text
            dmrs.add_link(Link(start, end, labels['rargname'], labels['post']))

        else:
            raise PydmrsValueError(child.tag)

    # Resolve top and index only after all nodes have been added
    if top_id:
        dmrs.top = dmrs[top_id]
    if index_id:
        dmrs.index = dmrs[index_id]
    return dmrs