示例#1
0
 def get_local_features(self, text):
     """Build and return a fresh feature dictionary for ``text``.

     ``text`` is wrapped in a ``TextObj``; its tokens, sentences and raw
     text are then handed to a fixed sequence of ``feature_extractor``
     helpers, each of which receives the same dictionary (presumably
     filling it in place, since their return values are not used).

     Returns:
         The dictionary that was passed to every extractor.
     """
     doc = TextObj(text)
     features = {}

     # Token-based extractors: n-grams at the extractor's default size,
     # then explicitly with n=2, followed by initialisms.
     feature_extractor.get_ngrams(features, doc.tokens)
     feature_extractor.get_ngrams(features, doc.tokens, n=2)
     feature_extractor.get_initialisms(features, doc.tokens)

     # Length statistics need the text at all three granularities.
     feature_extractor.get_basic_lengths(
         features, doc.text, doc.sentences, doc.tokens
     )

     # Extractors that work on the raw text only.
     feature_extractor.get_repeated_punct(features, doc.text)
     feature_extractor.get_LIWC(features, doc.text)

     return features
示例#2
0
    def build_features(self, feats=None, start=None, end=None): # default: None -> use default features
        """Populate ``self.features`` from the part of the text in ``[start, end)``.

        Args:
            feats: Iterable of feature-family names to extract. Names are
                matched by suffix/prefix: ``*gram`` (n-grams), ``*alism``
                (initialisms, or finalisms for ``'finalism'``), ``lengths*``,
                ``punct*``, ``quot*``, plus ``liwc`` and ``dep`` (these two
                case-insensitively). Names matching none of these are
                ignored. When ``None``, ``self.features`` is reset to an
                empty dict and a default list is used; otherwise features
                are added to the already existing ``self.features``.
            start: Offset into ``self.text`` where the span begins. Defaults
                to the ``start`` of the first occurrence.
            end: Offset into ``self.text`` where the span ends (exclusive,
                it is used as a slice bound). Defaults to the ``end`` of the
                last occurrence.

        Raises:
            AssertionError: If ``start`` or ``end`` lies outside the range
                covered by the occurrences, or if ``start > end``.
        """
        minIndex = self.occurrences[0].start
        maxIndex = self.occurrences[-1].end

        if start is None:
            start = minIndex
        if end is None:
            end = maxIndex

        assert minIndex <= start <= maxIndex, "Start index is beyond bounds." 
        assert minIndex <= end <= maxIndex, "End index is beyond bounds." 
        assert start <= end, "Start index is greater than end index."

        # NOTE(review): getIndex presumably returns the index of the first
        # occurrence satisfying the predicate, or None when there is none --
        # the None handling below relies on that; confirm against getIndex.
        numOccurrences = len(self.occurrences)
        startInd = self.getIndex(self.occurrences, lambda x,y: x.start>=y, start)
        if startInd is None:
            # No occurrence starts inside the span: select nothing rather
            # than letting a None slice bound select from the beginning.
            startInd = numOccurrences
        endInd = self.getIndex(self.occurrences, lambda x,y: x.end>y, end)
        if endInd is None:
            # endInd is an *exclusive* bound, so "no occurrence ends after
            # the span" means "keep everything up to and including the last
            # one". (len - 1 here used to drop the final occurrence.)
            endInd = numOccurrences

        occurrences = self.occurrences[startInd:endInd]
        text = self.text[start:end]
        tokens = [o.text for o in occurrences]

        def in_span(index):
            return startInd <= index < endInd

        # Dependencies with both endpoints inside the span.
        feature_dependencies_in = [
            d for d in self.dependencies
            if in_span(d.gov_index) and in_span(d.dep_index)
        ]

        # Dependencies with exactly one endpoint inside the span. The two
        # range tests must be evaluated separately: written as one chained
        # comparison, Python ANDs every link together instead of comparing
        # the two boolean results.
        # NOTE(review): not consumed by any feature branch visible here.
        feature_dependencies_boundary = [
            d for d in self.dependencies
            if in_span(d.gov_index) != in_span(d.dep_index)
        ]

        # TODO(review): the original note here read "MPQA &c" -- presumably
        # further feature families that are not implemented in this method.
        if feats is None:
            self.features = dict() # start fresh
            # default features
            feats = ['unigram', 'initialism', 'lengths', 'punctuation', 'quotes', 'liwc', 'dep']
        for feat in feats:
            if feat.endswith('gram'):
                n = measure_to_int(feat)
                feature_extractor.get_ngrams(self.features, tokens, n=n)
            elif feat.endswith('alism'):
                feature_extractor.get_initialisms(self.features, tokens, use_lowercase=True, finalism=(feat == 'finalism'))
            elif feat.startswith('lengths'):
                # Collect the sentences overlapping [start, end), clipped to
                # the span on both sides.
                sentences = []
                for sentStart, sentEnd in zip(self.sentstarts, self.sentends):
                    if sentStart >= end:
                        # Assumes sentstarts is ascending (the original
                        # early exit made the same assumption).
                        break
                    if sentEnd <= start:
                        # Sentence lies entirely before the span.
                        continue
                    sentences.append(self.text[max(sentStart, start):min(sentEnd, end)])

                feature_extractor.get_basic_lengths(self.features, text, sentences, tokens)
            elif feat.startswith('punct'):
                feature_extractor.get_repeated_punct(self.features, text)
            elif feat.startswith('quot'):
                feature_extractor.get_quoted_terms(self.features, text)
            elif feat.lower() == 'liwc':
                # Sum the per-occurrence LIWC counts, then normalise them.
                text_scores = Counter()
                text_scores['Word Count'] = len(occurrences)
                for o in occurrences:
                    text_scores.update(o.liwc)
                text_scores = word_category_counter.normalize(text_scores)
                for category, score in text_scores.items():
                    self.features['LIWC:'+category] = score
            elif feat.lower() == 'dep':
                # Count each "relation(governor_lemma,dependent_lemma)".
                # gov_index/dep_index index the full occurrence list, not
                # the sliced one.
                dep_scores = Counter()
                for d in feature_dependencies_in:
                    dep_string = "%s(%s,%s)" % (d.relation, self.occurrences[d.gov_index].lemma, self.occurrences[d.dep_index].lemma)
                    dep_scores[dep_string] += 1
                for dep, score in dep_scores.items():
                    self.features['dep:'+ dep] = score