def test_metadata(self):
    """Parse the metadata fixture and compare it with the expected record."""
    investigator = {
        'id': 'INV',
        'name': 'Caroline',
        'role': 'Investigator',
        'language': 'eng',
    }
    target_child = {
        'id': 'CHI',
        'name': 'Nicole',
        'role': 'Target_Child',
        'language': 'eng',
        'age': 'P2Y10M8D',
        'sex': 'female',
        'group': 'typical',
        'SES': 'MC',
    }
    # The mother entry carries no 'role' key in the expected output.
    mother = {
        'id': 'MOT',
        'name': 'Mother',
        'language': 'eng',
    }
    expected = {
        'lang': 'eng',
        'corpus': 'manchester',
        'date': '1984-01-01',
        'participants': [investigator, target_child, mother],
    }

    tree = ElementTree(file='fixtures/metadata.xml')
    actual = MorParser().parse_metadata(tree)

    self.assertEqual(actual, expected)
def test_missing_pos(self):
    """No token in the missing-pos fixture may end up with the 'unk' tag."""
    utterances = MorParser().parse("fixtures/missing_pos.xml")
    for _uid, _speaker, utterance_tokens in utterances:
        for tok in utterance_tokens:
            self.assertNotEqual(tok.pos, 'unk', 'failed to parse known tag')
def test_compound(self):
    """Each of the five compound words must parse to an underscore-joined stem."""
    compound_words = self.compounds.findall("w")
    self.assertEqual(5, len(compound_words))

    for element in compound_words:
        # A fresh parser per word, with the XML namespace prefix disabled.
        mor_parser = MorParser()
        mor_parser.namespace = ""
        mor_child = element.find('mor')
        parsed = mor_parser.parse_mor_element(element, mor_child)
        first_stem = parsed[0].stem
        self.assertGreaterEqual(first_stem.count("_"), 1)
def test_clitics(self):
    """Check clitic splitting on the clitics fixture and on a bare wordform."""
    parser = MorParser()
    # NOTE(review): the original indentation was lost. The three assertions
    # that read `tokens` are laid out inside the loop and the wordform check
    # after it -- confirm this matches the intended structure.
    for uid, speaker, tokens in parser.parse("fixtures/clitics.xml"):
        # Every utterance must produce more than one token.
        self.assertGreater(len(tokens), 1,
                           "failed splitting {0} into clitics".format(tokens))
        # No token may carry "?" as its word.
        self.assertNotIn("?", [w.word for w in tokens])
        # The space-joined str() forms of the tokens must match exactly.
        self.assertEqual(' '.join(map(str, tokens)),
                         ("hidden/part|hide&PERF away/adv|away where/adv:wh|where "
                          "nobody/pro:indef|nobody 'd/mod|genmod be/v:cop|be ./.|."))
    # Splitting a raw wordform yields the head and a list of clitic tails.
    head, tail = parser.split_clitic_wordform("that's")
    self.assertEqual(head, "that")
    self.assertEqual(tail, ["'s"])
def test_commas(self):
    """Every utterance in the commas fixture must contain a ',' stem."""
    utterances = MorParser().parse("fixtures/commas.xml")
    for _uid, _speaker, utterance_tokens in utterances:
        stems = [tok.stem for tok in utterance_tokens]
        self.assertIn(',', stems)
def test_document(self):
    """Parsing the full test document must complete without raising."""
    utterances = MorParser().parse("fixtures/test_doc.xml")
    for _utterance in utterances:
        # Consuming the parse results is the whole test: iterate through
        # and ensure no exceptions are thrown.
        continue
import glob
import re

from talkbank_parser import MorParser
import illegal_utterance_filter

parser = MorParser()

# Age strings look like 'P2Y10M8D'. The leading 'P' and each of the
# year / month / day components are optional; anything after the day
# component is ignored.
_AGE_PATTERN = re.compile(r'P?(?:(\d+)Y)?(?:(\d+)M)?(?:(\d+)D)?')


def age_transform(metadata):
    """Transform an age string such as 'P2Y10M8D' into an age in months.

    Years count as 12 months each. Days are rounded: 15 or more days add
    one month, fewer days add nothing. Components that are absent from the
    string count as zero, so 'P2Y' and 'P2Y10M' are accepted, and years may
    have more than one digit.

    Args:
        metadata: the age string to convert.

    Returns:
        The age in whole months, as an int.

    Raises:
        ValueError: if the string contains no year, month or day component.
    """
    # Every group is optional, so match() always succeeds on a string; an
    # unusable input shows up as all three groups being None.
    years, months, days = _AGE_PATTERN.match(metadata).groups()
    if years is None and months is None and days is None:
        raise ValueError(
            "cannot parse age from {0!r}".format(metadata))

    age_year = int(years) if years is not None else 0
    age_month = int(months) if months is not None else 0
    age_day = int(days) if days is not None else 0

    # Half a month or more rounds up to one extra month.
    extra_month = 1 if age_day >= 15 else 0
    return age_year * 12 + age_month + extra_month
def test_compound(self):
    """Each compound's 'mor' element must parse to an underscore-joined stem."""
    mor_elements = self.compounds.findall("w/mor")
    for mor_element in mor_elements:
        # A fresh parser per element, with the XML namespace prefix disabled.
        mor_parser = MorParser()
        mor_parser.namespace = ""
        first_part = mor_parser.parse_mor_element(None, mor_element)[0]
        underscore_count = first_part.stem.count("_")
        self.assertGreaterEqual(underscore_count, 1)