def padTrigramFeatures(self, maxNumTrigrams, posPaddingVector, depPaddingVector):
    """Pad ``self.trigramFeatures`` with placeholder entries up to a target length.

    Each placeholder is a new ``Features`` object that stores ``-1`` in the
    three lemma slots and the supplied padding vectors in the POS and
    dependency-label slots.  Nothing is appended when the list already holds
    ``maxNumTrigrams`` entries or more.

    Args:
        maxNumTrigrams: Number of entries the trigram list should reach.
        posPaddingVector: Value stored under ``posw1`` .. ``posw3``.
        depPaddingVector: Value stored under ``labelw1`` .. ``labelw3``.
    """
    # (feature name, placeholder value), listed in registration order.
    paddingLayout = (
        ("lemmaw1", -1),
        ("lemmaw2", -1),
        ("lemmaw3", -1),
        ("posw1", posPaddingVector),
        ("posw2", posPaddingVector),
        ("posw3", posPaddingVector),
        ("labelw1", depPaddingVector),
        ("labelw2", depPaddingVector),
        ("labelw3", depPaddingVector),
    )
    # An empty range (list already long enough) means no padding is added.
    for _ in range(len(self.trigramFeatures), maxNumTrigrams):
        placeholder = Features()
        for featureName, paddingValue in paddingLayout:
            placeholder.addFeature(featureName, paddingValue)
        self.trigramFeatures.append(placeholder)
def padBigramFeatures(self, maxNumBigrams, posPaddingVector, depPaddingVector):
    """Pad ``self.bigramFeatures`` with placeholder entries up to a target length.

    Each placeholder is a new ``Features`` object that stores ``-1`` in the
    lemma and child-count slots and the supplied padding vectors in the POS
    and dependency-label slots.  Nothing is appended when the list already
    holds ``maxNumBigrams`` entries or more.

    Args:
        maxNumBigrams: Number of entries the bigram list should reach.
        posPaddingVector: Value stored under the ``pos*`` feature names.
        depPaddingVector: Value stored under the ``label*`` feature names.
    """
    # (feature name, placeholder value), listed in registration order.
    paddingLayout = (
        ("labelw1", depPaddingVector),
        ("labelhead", depPaddingVector),
        ("labelw2", depPaddingVector),
        ("lemmaw1", -1),
        ("lemmaw2", -1),
        ("posw1", posPaddingVector),
        ("posw2", posPaddingVector),
        ("num-childrenw1", -1),
        ("num-childrenw2", -1),
        ("poshead", posPaddingVector),
        ("poschild", posPaddingVector),
    )
    # An empty range (list already long enough) means no padding is added.
    for _ in range(len(self.bigramFeatures), maxNumBigrams):
        placeholder = Features()
        for featureName, paddingValue in paddingLayout:
            placeholder.addFeature(featureName, paddingValue)
        self.bigramFeatures.append(placeholder)
def computeAtomicFeatures(self):
    """Compute the per-node feature set and store it on each node.

    For every node in ``self.node_list`` a ``Features`` object is filled with:

    * ``lemma``: the node's lemma, stored as-is;
    * ``label`` / ``pos``: ``getVectorRepresentation`` of the node's
      dependency relation and POS tag;
    * ``num-children`` / ``num-grandchildren``: sizes of the results of
      ``getChildren(node)`` and ``getGrandChildren(children)``;
    * ``labels-children`` / ``pos-children``: how many children carry each
      dependency label / POS tag, serialized as the counts concatenated
      into one string.

    The result is assigned to ``node.atomicFeatures``.

    Raises:
        KeyError: If a child's label or tag is missing from ``self.dictDep``
            or ``self.dictPos``.
    """
    for node in self.node_list:
        line = node.conllLine
        atomic = Features()
        atomic.addFeature("lemma", line.lemma)
        atomic.addFeature("label", self.getVectorRepresentation(line.deprel, "dep"))
        atomic.addFeature("pos", self.getVectorRepresentation(line.pos, "pos"))

        children = self.getChildren(node)
        grandChildren = self.getGrandChildren(children)
        atomic.addFeature("num-children", len(children))
        atomic.addFeature("num-grandchildren", len(grandChildren))

        childLabels = [child.conllLine.deprel for child in children]
        childTags = [child.conllLine.pos for child in children]

        # One counter slot per known label / tag; dictDep and dictPos map a
        # label / tag to its slot index.
        posCounts = [0] * len(self.dictPos)
        depCounts = [0] * len(self.dictDep)
        for childLabel in childLabels:
            depCounts[self.dictDep[childLabel]] += 1
        for childTag in childTags:
            posCounts[self.dictPos[childTag]] += 1

        # NOTE(review): counts are joined without a separator, so a count of
        # 10 or more makes the string ambiguous; kept because consumers may
        # rely on this exact encoding.
        atomic.addFeature("labels-children", "".join(str(count) for count in depCounts))
        atomic.addFeature("pos-children", "".join(str(count) for count in posCounts))
        node.atomicFeatures = atomic
def computeGlobalFeatures(self):
    """Compute sentence-level features and store them in ``self.globalFeatures``.

    The features describe the first nodes of ``self.node_list`` (label, lemma
    and POS of the first node, POS of the second and third) and the nodes
    counted from the end (``*wn_1`` is ``node_list[-2]``, ``poswn_2`` is
    ``node_list[-3]``, ``poswn_3`` is ``node_list[-4]``), plus the root lemma
    and a 0/1 flag telling whether any word form contains ``"?"``.  When
    ``node_list`` is too short for a slot, the vector representation of
    ``"<PADDING>"`` (or ``"NOLEMMA"`` for the lemma slot) is stored instead.

    Raises:
        IndexError: If ``self.node_list`` is empty.
    """
    nodes = self.node_list
    nodeCount = len(nodes)
    vectorOf = self.getVectorRepresentation
    summary = Features()

    firstLine = nodes[0].conllLine
    summary.addFeature("labelw1", vectorOf(firstLine.deprel, "dep"))
    # Lemmas are stored as raw values; the original author tagged these spots
    # with "GET EMBEDDING".
    # NOTE(review): the root lemma is read from self.root directly, not via
    # a conllLine attribute as for the nodes; confirm that is intended.
    summary.addFeature("lemmaRoot", self.root.lemma)
    summary.addFeature("lemmaw1", firstLine.lemma)
    summary.addFeature("posw1", vectorOf(firstLine.pos, "pos"))

    if nodeCount > 1:
        secondLine = nodes[1].conllLine
        beforeLastLine = nodes[-2].conllLine
        summary.addFeature("posw2", vectorOf(secondLine.pos, "pos"))
        summary.addFeature("poswn_1", vectorOf(beforeLastLine.pos, "pos"))
        summary.addFeature("labelwn_1", vectorOf(beforeLastLine.deprel, "dep"))
        summary.addFeature("lemmawn_1", beforeLastLine.lemma)
    else:
        summary.addFeature("posw2", vectorOf("<PADDING>", "pos"))
        summary.addFeature("poswn_1", vectorOf("<PADDING>", "pos"))
        summary.addFeature("labelwn_1", vectorOf("<PADDING>", "dep"))
        # Lemma placeholder used where no node is available.
        summary.addFeature("lemmawn_1", "NOLEMMA")

    if nodeCount > 2:
        summary.addFeature("posw3", vectorOf(nodes[2].conllLine.pos, "pos"))
        summary.addFeature("poswn_2", vectorOf(nodes[-3].conllLine.pos, "pos"))
    else:
        summary.addFeature("posw3", vectorOf("<PADDING>", "pos"))
        summary.addFeature("poswn_2", vectorOf("<PADDING>", "pos"))

    if nodeCount > 3:
        summary.addFeature("poswn_3", vectorOf(nodes[-4].conllLine.pos, "pos"))
    else:
        summary.addFeature("poswn_3", vectorOf("<PADDING>", "pos"))

    # 1 as soon as any word form contains a question mark, otherwise 0.
    hasQuestionMark = any("?" in node.conllLine.form for node in nodes)
    summary.addFeature("question", 1 if hasQuestionMark else 0)

    self.globalFeatures = summary
def computeTrigramFeatures(self):
    """Compute features for every parent -> child -> grandchild chain.

    For each node in ``self.node_list``, each of its children and each child
    of that child (all obtained through ``getChildren``), a ``Features``
    object is built holding the lemma, the POS vector and the
    dependency-label vector of the three nodes (suffix ``w1`` = parent,
    ``w2`` = child, ``w3`` = grandchild).  The object is appended to the
    parent's ``trigramFeatures`` list.
    """
    vectorOf = self.getVectorRepresentation
    for parent in self.node_list:
        for child in self.getChildren(parent):
            for grandChild in self.getChildren(child):
                # CoNLL lines of the chain, ordered w1, w2, w3.
                chain = (parent.conllLine, child.conllLine, grandChild.conllLine)
                trigram = Features()
                for featureName, line in zip(("lemmaw1", "lemmaw2", "lemmaw3"), chain):
                    trigram.addFeature(featureName, line.lemma)
                for featureName, line in zip(("posw1", "posw2", "posw3"), chain):
                    trigram.addFeature(featureName, vectorOf(line.pos, "pos"))
                for featureName, line in zip(("labelw1", "labelw2", "labelw3"), chain):
                    trigram.addFeature(featureName, vectorOf(line.deprel, "dep"))
                parent.trigramFeatures.append(trigram)
def computeBigramFeatures(self):
    """Compute features for every parent -> child edge.

    For each node in ``self.node_list`` and each child returned by
    ``getChildren`` for it, a ``Features`` object is built and appended to the
    parent's ``bigramFeatures`` list.  Suffix ``w1`` / ``head`` refers to the
    parent and ``w2`` / ``child`` to the child:

    * ``labelw1``, ``labelhead``, ``labelw2``: dependency-label vectors;
    * ``lemmaw1``, ``lemmaw2``: lemmas, stored as-is;
    * ``posw1``, ``posw2``, ``poshead``, ``poschild``: POS vectors;
    * ``num-childrenw1``, ``num-childrenw2``: number of children of the
      parent and of the child.

    ``labelhead`` / ``poshead`` / ``poschild`` repeat the values of
    ``labelw1`` / ``posw1`` / ``posw2``; they are kept so the feature set and
    its order match what ``padBigramFeatures`` pads with.
    """
    for parent in self.node_list:
        parentLine = parent.conllLine
        children = self.getChildren(parent)
        for child in children:
            childLine = child.conllLine
            iFeatures = Features()

            iFeatures.addFeature(
                "labelw1", self.getVectorRepresentation(parentLine.deprel, "dep"))
            iFeatures.addFeature(
                "labelhead", self.getVectorRepresentation(parentLine.deprel, "dep"))
            iFeatures.addFeature(
                "labelw2", self.getVectorRepresentation(childLine.deprel, "dep"))

            iFeatures.addFeature("lemmaw1", parentLine.lemma)
            iFeatures.addFeature("lemmaw2", childLine.lemma)

            iFeatures.addFeature(
                "posw1", self.getVectorRepresentation(parentLine.pos, "pos"))
            iFeatures.addFeature(
                "posw2", self.getVectorRepresentation(childLine.pos, "pos"))

            iFeatures.addFeature("num-childrenw1", len(children))
            iFeatures.addFeature("num-childrenw2", len(self.getChildren(child)))

            iFeatures.addFeature(
                "poshead", self.getVectorRepresentation(parentLine.pos, "pos"))
            iFeatures.addFeature(
                "poschild", self.getVectorRepresentation(childLine.pos, "pos"))

            parent.bigramFeatures.append(iFeatures)