def process(self, file):
    """Compute relative frequencies of POS uni-, bi- and trigrams for a file.

    The input is read through InputReader and parsed with CQPFormat;
    column 1 is used as the tag sequence.  Each tag is truncated to its
    first three characters, and n-gram names are the truncated tags joined
    with "_".  Only n-grams whose name appears in self.posnames are counted.

    Returns a tuple (id, feats): the ID reported by the reader and a dict
    mapping every name in self.posnames to its count divided by the number
    of trigram positions (len(pos) - 2).  When the sequence has fewer than
    three tags there are no trigram positions and every value is 0.0.
    """
    Extractor.process(self, file)
    ir = InputReader(file)
    ir.read()
    cqpf = CQPFormat(ir.getText())
    pos = cqpf.getColumn(1)

    # Every tracked n-gram starts at zero so unseen ones still get a value.
    feats = {}
    for name in self.posnames:
        feats[name] = 0

    # Start at index 2 so a complete trigram window always exists; the
    # first two tags are therefore never counted as a unigram (nor the
    # first pair as a bigram) on their own.
    for i in range(2, len(pos)):
        uni = pos[i][0:3]
        bi = pos[i - 1][0:3] + "_" + uni
        tri = pos[i - 2][0:3] + "_" + bi
        for gram in (uni, bi, tri):
            if gram in feats:
                feats[gram] += 1

    positions = len(pos) - 2
    if positions > 0:
        denom = float(positions)
        for name in self.posnames:
            feats[name] /= denom
    else:
        # Fewer than three tags: nothing was counted, and dividing by
        # zero (or a negative number) would be meaningless.
        for name in self.posnames:
            feats[name] = 0.0

    return ir.getID(), feats
def process(self, file):
    """Extract the feature dict for one input file.

    Steps, in order:
      1. Read and parse the file (InputReader, CQPFormat).
      2. For every "s" annotation, build word/POS tokens from columns 0
         and 1 via getWordsWithPOS / disambiguatePOS, wrapped in
         '<s>' ... '</s> ' markers, and join everything with spaces.
      3. Run extractWithREs for DIRECT_FEATS and CALC_FEATS over that
         string and extractFromLemmatatizedForms for LEMMA_FEATS over
         column 2.
      4. Call calculateFeats and normalizeByLength on the dict (their
         return values are ignored, so presumably they update it in
         place -- confirm against their definitions).
      5. Merge in extractStatistics(cqpf).

    Returns a tuple (id, feats) with the ID reported by the reader.
    """
    Extractor.process(self, file)
    ir = InputReader(file)
    ir.read()
    cqpf = CQPFormat(ir.getText())

    # Fetch each column once instead of once per sentence.
    words = cqpf.getColumn(0)
    pos = cqpf.getColumn(1)
    lemma = cqpf.getColumn(2)

    wordpostmp = []
    for (start, end, _attr) in cqpf.getAnnotations("s"):
        wordpostmp.append('<s>')
        wordpostmp.extend(self.getWordsWithPOS(
            words[start:end],
            self.disambiguatePOS(pos[start:end])))
        # The closing marker carries a trailing space, so the joined
        # string has two spaces after every sentence.
        wordpostmp.append('</s> ')
    wordpos = ' '.join(wordpostmp)

    feats = {}
    feats.update(self.extractWithREs(self.DIRECT_FEATS, wordpos))
    feats.update(self.extractWithREs(self.CALC_FEATS, wordpos))
    feats.update(self.extractFromLemmatatizedForms(self.LEMMA_FEATS, lemma))

    self.calculateFeats(feats)
    self.normalizeByLength(feats, len(lemma))

    # Merged only after normalizeByLength has run, so these entries are
    # not passed through it.
    feats.update(self.extractStatistics(cqpf))

    # NOTE(review): dumps the whole feature dict to stdout on every call;
    # looks like leftover debugging -- confirm nothing consumes it before
    # removing.  Parenthesised form prints the same under Python 2.
    print(feats)
    return ir.getID(), feats
def process(self, file):
    """Return sentence-length statistics for one input file.

    The file is read through InputReader and parsed with CQPFormat.  The
    length of each "s" annotation is taken as end - start, and the list
    of lengths is handed to utils.getStats under the "SENT_LENGTH" label.

    Returns a tuple (id, feats): the ID reported by the reader and
    whatever utils.getStats returns.
    """
    Extractor.process(self, file)
    reader = InputReader(file)
    reader.read()
    parsed = CQPFormat(reader.getText())

    sentence_lengths = []
    for (begin, stop, _attr) in parsed.getAnnotations("s"):
        sentence_lengths.append(stop - begin)

    # NOTE(review): prints the feature-name attribute on every call;
    # presumably leftover debugging -- confirm before removing.  The
    # double-underscore name is subject to class-private name mangling.
    print(self.__featureNames)

    stats = utils.getStats("SENT_LENGTH", sentence_lengths)
    return reader.getID(), stats