Пример #1
0
def test_cap_type():
    """Check capitalization_type() against every entry of ``data``.

    ``data`` is defined outside this block; each entry is indexed as
    ``[0]`` (the input text) and ``[1]`` (the expected result).  On any
    failure the offending input and the expected/actual values are printed
    before the original exception is re-raised.
    """
    for sample in data:
        try:
            assert capitalization_type(sample[0]) == sample[1]
        except:
            # Deliberately broad: this handler only adds diagnostics and
            # always re-raises, so nothing is swallowed.
            print('Error: %s' % sample[0])
            expected = sample[1]
            actual = capitalization_type(sample[0])
            print('Result should be %s but got %s' % (expected, actual))
            raise
Пример #2
0
    def tag(cls, text):
        """Return an ordered, de-duplicated list of tags extracted from text.

        The text is tokenized and POS-tagged; tokens whose POS label is in
        ``pos_include`` and which are not in ``stop_words`` are singularized
        and collected in a ``CIList``.  Falsy input yields an empty list.
        ``cls`` is not used by the body.
        """
        if not text:
            return []

        # Apostrophes are stripped before classification and tokenizing.
        text = text.replace("'", "")
        cap_type = capitalization_type(text)

        tokenizer = BasicTokenizer()
        tagged = nltk.pos_tag(tokenizer.tokenize(text))
        log.info('POS before lower casing:%s', str(tagged))

        if cap_type == CapType.ALLCAPS:
            # An all-caps headline makes the POS tagger produce too many
            # proper nouns, so the text is de-capitalized and tagged again.
            tagged = nltk.pos_tag(tokenizer.tokenize(text.lower()))
            log.info('POS after lower casing:%s', str(tagged))

        # Keep tokens whose POS label is in the include list, minus stopwords.
        candidates = [
            entry[0]
            for entry in tagged
            if entry[1] in pos_include and entry[0] not in stop_words
        ]

        # Reduce every remaining token to its singular form.
        singular_tags = [singularize(word) for word in candidates]

        # The order of tags is preserved purely for aesthetic value, which is
        # why set() is not used.  The first occurrence of a tag wins, so an
        # uppercased tag is kept if it appears first (presumably CIList
        # compares case-insensitively -- confirm against its definition).
        unique_tags = CIList()
        for candidate in singular_tags:
            # Skip duplicates and tags shorter than two characters.
            if candidate in unique_tags or len(candidate) < 2:
                continue
            unique_tags.append(candidate)

        return unique_tags
Пример #3
0
import csv

from silcc.lib.capnormalizer import capitalization_type, CapType

if __name__ == '__main__':
    # For every three-column row of the training CSV, print the row, then the
    # name of each integer CapType attribute equal to the capitalization type
    # of the row's second column, then a separator line.
    #
    # The file is opened in a ``with`` block so the handle is closed when the
    # loop finishes or if classifying a row raises; previously it was never
    # closed explicitly.
    with open('data/training/muti_submissions.csv', 'rU') as csv_file:
        reader = csv.reader(csv_file)
        for line in reader:
            # Rows that do not have exactly three columns are skipped.
            if len(line) != 3:
                continue
            print(line)
            text = line[1]
            type_ = capitalization_type(text)
            # Reverse lookup: find the attribute name(s) on CapType whose
            # integer value matches the computed type.
            for k, v in vars(CapType).items():
                if isinstance(v, int) and type_ == v:
                    print(k)
            print('----')