def _get_participants(self, fileid):
    """Collect the XML attributes of every participant listed in a file.

    :param fileid: file name or file object handed to ``ElementTree.parse``.
    :return: a nested, auto-vivifying ``defaultdict`` keyed by each
        participant element's ``id`` attribute; each value maps that
        element's attribute names to their values.
    """
    def nested_dict():
        # a missing key at any depth yields another nested mapping
        return defaultdict(nested_dict)

    root = ElementTree.parse(fileid).getroot()
    participant_path = './/{%s}Participants/{%s}participant' % (NS, NS)

    participants = nested_dict()
    for node in root.findall(participant_path):
        attributes = node.items()
        # only touch the mapping when there is something to store, so that
        # attribute-less participants do not create an empty entry
        if attributes:
            participants[node.get('id')].update(attributes)
    return participants
def _get_age(self, fileid, month):
    """Return the age recorded for the participant whose id is ``'CHI'``.

    :param fileid: file name or file object handed to ``ElementTree.parse``.
    :param month: when truthy, the raw ``age`` attribute is passed through
        ``self._convert_age`` before being returned.
    :return: the ``age`` attribute of the first ``'CHI'`` participant
        (converted if ``month`` is truthy); ``None`` when there is no such
        participant, when the attribute is absent, or when the conversion
        raises ``TypeError`` / ``AttributeError``.
    """
    xmldoc = ElementTree.parse(fileid).getroot()
    participant_path = './/{%s}Participants/{%s}participant' % (NS, NS)
    for pat in xmldoc.findall(participant_path):
        if pat.get('id') != 'CHI':
            continue
        age = pat.get('age')
        if not month:
            return age
        try:
            return self._convert_age(age)
        # some files don't have age data
        except (TypeError, AttributeError):
            return None
    return None
def _get_age(self, fileid, month):
    """Return the age recorded for the participant whose id is ``'CHI'``.

    :param fileid: file name or file object handed to ``ElementTree.parse``.
    :param month: when truthy, the raw ``age`` attribute is passed through
        ``_convert_age`` before being returned.
    :return: the ``age`` attribute of the first ``'CHI'`` participant
        (converted if ``month`` is truthy); ``None`` when there is no such
        participant, when the attribute is absent, or when the conversion
        raises ``TypeError`` / ``AttributeError``.
    """
    xmldoc = ElementTree.parse(fileid).getroot()
    participant_path = './/{%s}Participants/{%s}participant' % (NS, NS)
    for pat in xmldoc.findall(participant_path):
        if pat.get('id') != 'CHI':
            continue
        age = pat.get('age')
        if not month:
            return age
        try:
            # NOTE(review): ``_convert_age`` is looked up as a bare name, so
            # it must be resolvable at module scope -- confirm this is not
            # meant to be ``self._convert_age``.
            return _convert_age(age)
        # some files don't have age data
        except (TypeError, AttributeError):
            return None
    return None
def _get_participants(self, fileid):
    """Build a per-participant table of XML attributes for one file.

    :param fileid: file name or file object handed to ``ElementTree.parse``.
    :return: a recursively defaulting ``defaultdict``: the outer key is a
        participant element's ``id`` attribute, the inner mapping holds
        that element's attribute names and values.
    """
    def make_level():
        # every level of the table defaults to one more level
        return defaultdict(make_level)

    document_root = ElementTree.parse(fileid).getroot()
    query = './/{%s}Participants/{%s}participant' % (NS, NS)

    by_id = make_level()
    for element in document_root.findall(query):
        speaker_id = element.get('id')
        for name, value in element.items():
            by_id[speaker_id][name] = value
    return by_id
def _get_words(self, fileid, speaker, sent, stem, relation, pos,
               strip_space, replace):
    """Extract the words of the selected utterances of one XML file.

    :param fileid: file name or file object handed to ``ElementTree.parse``.
    :param speaker: ``'ALL'`` to accept every utterance; otherwise an
        utterance is accepted when its ``who`` attribute is ``in speaker``.
    :param sent: when truthy (or when ``relation`` is truthy) the words of
        each utterance are kept together as one list; otherwise all words
        are returned in a single flat list.
    :param stem: when truthy (or when ``relation`` is truthy) the word text
        is replaced by its ``stem`` element text, with ``-<inflection>``
        appended if an ``mk`` element is present, and a post-clitic stem is
        emitted as a separate item.
    :param relation: when equal to ``True`` each item additionally carries
        ``"index|head|relation"`` strings taken from ``gra`` elements.
    :param pos: when truthy (or when ``relation`` is truthy) each item
        becomes a ``(word, tag)`` tuple; the tag is ``None`` if no ``c``
        element is found.
    :param strip_space: when truthy, surrounding whitespace is stripped
        from the raw word text.
    :param replace: when truthy, a ``replacement`` / ``wk`` element found in
        the utterance is used instead of the word element.
    :return: the list of extracted words (flat or per utterance, see
        ``sent``).
    """
    def gra_label(node):
        # "index|head|relation" summary of one <gra> element
        return (node.get('index') + "|" + node.get('head') + "|" +
                node.get('relation'))

    xmldoc = ElementTree.parse(fileid).getroot()

    # The namespace-qualified paths never change; build them once instead
    # of once per word.
    utterance_path = './/{%s}u' % NS
    word_path = './/{%s}w' % NS
    replacement_path = './/{%s}w/{%s}replacement' % (NS, NS)
    replacement_word_path = './/{%s}w/{%s}replacement/{%s}w' % (NS, NS, NS)
    wk_path = './/{%s}w/{%s}wk' % (NS, NS)
    stem_path = './/{%s}stem' % NS
    inflection_path = './/{%s}mor/{%s}mw/{%s}mk' % (NS, NS, NS)
    suffix_path = './/{%s}mor/{%s}mor-post/{%s}mw/{%s}stem' % (NS, NS, NS, NS)
    pos_path = ".//{%s}c" % NS
    gra_path = './/{%s}mor/{%s}gra' % (NS, NS)
    post_gra_path = './/{%s}mor/{%s}mor-post/{%s}gra' % (NS, NS, NS)

    # processing each xml doc
    results = []
    for xmlsent in xmldoc.findall(utterance_path):
        # select speakers
        if not (speaker == 'ALL' or xmlsent.get('who') in speaker):
            continue
        sents = []
        for xmlword in xmlsent.findall(word_path):
            suffixStem = None

            # getting replaced words
            # NOTE(review): the lookups are made on the whole utterance, not
            # on the current word -- confirm that this is intended.
            if replace:
                # An element only counts as "present" when it has children;
                # this spells out the element truth-test used previously.
                replacement = xmlsent.find(replacement_path)
                if replacement is not None and len(replacement):
                    xmlword = xmlsent.find(replacement_word_path)
                else:
                    wk = xmlsent.find(wk_path)
                    if wk is not None and len(wk):
                        xmlword = wk

            # get text
            word = xmlword.text if xmlword.text else ''
            # strip tailing space
            if strip_space:
                word = word.strip()

            # stem
            if relation or stem:
                xmlstem = xmlword.find(stem_path)
                if xmlstem is not None:
                    word = xmlstem.text
                # if there is an inflection
                try:
                    xmlinfl = xmlword.find(inflection_path)
                    word += '-' + xmlinfl.text
                except (AttributeError, TypeError):
                    # no inflection element, or a text value that is None
                    pass
                # if there is a suffix
                xmlsuffix = xmlword.find(suffix_path)
                suffixStem = xmlsuffix.text if xmlsuffix is not None else ""

            # pos
            if relation or pos:
                try:
                    xmlpos = xmlword.findall(pos_path)
                    word = (word, xmlpos[0].text)
                    if len(xmlpos) != 1 and suffixStem:
                        suffixStem = (suffixStem, xmlpos[1].text)
                except (AttributeError, IndexError):
                    word = (word, None)
                    if suffixStem:
                        suffixStem = (suffixStem, None)

            # relational
            # the gold standard is stored in
            # <mor></mor><mor type="trn"><gra type="grt">
            # (equality test kept on purpose: only True/1 enable this step)
            if relation == True:
                for xmlstem_rel in xmlword.findall(gra_path):
                    if xmlstem_rel.get('type') != 'grt':
                        word = (word[0], word[1], gra_label(xmlstem_rel))
                    else:
                        word = (word[0], word[1], word[2],
                                word[0], word[1], gra_label(xmlstem_rel))
                try:
                    for xmlpost_rel in xmlword.findall(post_gra_path):
                        if xmlpost_rel.get('type') != 'grt':
                            suffixStem = (suffixStem[0], suffixStem[1],
                                          gra_label(xmlpost_rel))
                        else:
                            suffixStem = (suffixStem[0], suffixStem[1],
                                          suffixStem[2], suffixStem[0],
                                          suffixStem[1],
                                          gra_label(xmlpost_rel))
                except (AttributeError, IndexError, TypeError):
                    # best effort: the suffix may be empty or lack fields
                    pass

            sents.append(word)
            if suffixStem:
                sents.append(suffixStem)

        if sent or relation:
            results.append(sents)
        else:
            results.extend(sents)
    return results
def _get_corpus(self, fileid): results = dict() xmldoc = ElementTree.parse(fileid).getroot() for key, value in xmldoc.items(): results[key] = value return results
def _get_words(self, fileid, speaker, sent, stem, relation, pos,
               strip_space, replace):
    """Extract the words of the selected utterances of one XML file.

    :param fileid: file name or file object handed to ``ElementTree.parse``.
    :param speaker: ``'ALL'`` to accept every utterance; otherwise an
        utterance is accepted when its ``who`` attribute equals ``speaker``.
    :param sent: when truthy (or when ``relation`` is truthy) the words of
        each utterance are kept together as one list; otherwise all words
        are returned in a single flat list.
    :param stem: when truthy (or when ``relation`` is truthy) the word text
        is replaced by its ``stem`` element text, with ``-<inflection>``
        appended if an ``mk`` element is present, and a post-clitic stem is
        emitted as a separate item.
    :param relation: when equal to ``True`` each item additionally carries
        ``"index|head|relation"`` strings taken from ``gra`` elements.
    :param pos: when truthy (or when ``relation`` is truthy) each item
        becomes a ``(word, tag)`` tuple; the tag is ``None`` if no ``c``
        element is found.
    :param strip_space: when truthy, surrounding whitespace is stripped
        from the raw word text.
    :param replace: when truthy, a ``replacement`` / ``wk`` element found in
        the utterance is used instead of the word element.
    :return: the list of extracted words (flat or per utterance, see
        ``sent``).
    """
    def gra_label(node):
        # "index|head|relation" summary of one <gra> element
        return (node.get('index') + "|" + node.get('head') + "|" +
                node.get('relation'))

    xmldoc = ElementTree.parse(fileid).getroot()

    # The namespace-qualified paths never change; build them once instead
    # of once per word.
    utterance_path = './/{%s}u' % NS
    word_path = './/{%s}w' % NS
    replacement_path = './/{%s}w/{%s}replacement' % (NS, NS)
    replacement_word_path = './/{%s}w/{%s}replacement/{%s}w' % (NS, NS, NS)
    wk_path = './/{%s}w/{%s}wk' % (NS, NS)
    stem_path = './/{%s}stem' % NS
    inflection_path = './/{%s}mor/{%s}mw/{%s}mk' % (NS, NS, NS)
    suffix_path = './/{%s}mor/{%s}mor-post/{%s}mw/{%s}stem' % (NS, NS, NS, NS)
    pos_path = ".//{%s}c" % NS
    gra_path = './/{%s}mor/{%s}gra' % (NS, NS)
    post_gra_path = './/{%s}mor/{%s}mor-post/{%s}gra' % (NS, NS, NS)

    # processing each xml doc
    results = []
    for xmlsent in xmldoc.findall(utterance_path):
        # select speakers
        if not (speaker == 'ALL' or xmlsent.get('who') == speaker):
            continue
        sents = []
        for xmlword in xmlsent.findall(word_path):
            suffixStem = None

            # getting replaced words
            # NOTE(review): the lookups are made on the whole utterance, not
            # on the current word -- confirm that this is intended.
            if replace:
                # An element only counts as "present" when it has children;
                # this spells out the element truth-test used previously.
                replacement = xmlsent.find(replacement_path)
                if replacement is not None and len(replacement):
                    xmlword = xmlsent.find(replacement_word_path)
                else:
                    wk = xmlsent.find(wk_path)
                    if wk is not None and len(wk):
                        xmlword = wk

            # get text
            word = xmlword.text if xmlword.text else ''
            # strip tailing space
            if strip_space:
                word = word.strip()

            # stem
            if relation or stem:
                xmlstem = xmlword.find(stem_path)
                if xmlstem is not None:
                    word = xmlstem.text
                # if there is an inflection
                try:
                    xmlinfl = xmlword.find(inflection_path)
                    word += '-' + xmlinfl.text
                except (AttributeError, TypeError):
                    # no inflection element, or a text value that is None
                    pass
                # if there is a suffix
                xmlsuffix = xmlword.find(suffix_path)
                suffixStem = xmlsuffix.text if xmlsuffix is not None else ""

            # pos
            if relation or pos:
                try:
                    xmlpos = xmlword.findall(pos_path)
                    word = (word, xmlpos[0].text)
                    if len(xmlpos) != 1 and suffixStem:
                        suffixStem = (suffixStem, xmlpos[1].text)
                except (AttributeError, IndexError):
                    word = (word, None)
                    if suffixStem:
                        suffixStem = (suffixStem, None)

            # relational
            # the gold standard is stored in
            # <mor></mor><mor type="trn"><gra type="grt">
            # (equality test kept on purpose: only True/1 enable this step)
            if relation == True:
                for xmlstem_rel in xmlword.findall(gra_path):
                    if xmlstem_rel.get('type') != 'grt':
                        word = (word[0], word[1], gra_label(xmlstem_rel))
                    else:
                        word = (word[0], word[1], word[2],
                                word[0], word[1], gra_label(xmlstem_rel))
                try:
                    for xmlpost_rel in xmlword.findall(post_gra_path):
                        if xmlpost_rel.get('type') != 'grt':
                            suffixStem = (suffixStem[0], suffixStem[1],
                                          gra_label(xmlpost_rel))
                        else:
                            suffixStem = (suffixStem[0], suffixStem[1],
                                          suffixStem[2], suffixStem[0],
                                          suffixStem[1],
                                          gra_label(xmlpost_rel))
                except (AttributeError, IndexError, TypeError):
                    # best effort: the suffix may be empty or lack fields
                    pass

            sents.append(word)
            if suffixStem:
                sents.append(suffixStem)

        if sent or relation:
            results.append(sents)
        else:
            results.extend(sents)
    return results