Exemplo n.º 1
0
def test_singularizer_abraxas_tags():
    """Not a true assertion-based test: print the singularize() result for
    every Abraxas tag so the output can be used to bootstrap the
    singularizer rules by hand.
    """
    # 'with' guarantees the CSV handle is closed (the original leaked it),
    # and print() works on both Python 2 and 3 for a single argument
    # (the original print statement was Python-2-only).
    with open('../../data/tests/abraxas_tags.csv') as csvfile:
        for line in csv.reader(csvfile):
            # Column index 2 holds the tag text.
            print('%s ==> %s' % (line[2], singularize(line[2])))
Exemplo n.º 2
0
    def tag(cls, text):
        """Class method that returns tags given some text"""
        if not text:
            return []

        # Apostrophes interfere with tokenization, so strip them first.
        cleaned = text.replace("'", "")
        cap_type = capitalization_type(cleaned)

        tokenizer = BasicTokenizer()
        pos = nltk.pos_tag(tokenizer.tokenize(cleaned))
        log.info('POS before lower casing:%s', str(pos))

        if cap_type == CapType.ALLCAPS:
            # An all-caps headline makes the POS tagger over-produce proper
            # nouns, so re-tokenize and re-tag a lower-cased copy instead.
            pos = nltk.pos_tag(tokenizer.tokenize(cleaned.lower()))
            log.info('POS after lower casing:%s', str(pos))

        # Keep only tokens whose POS tag is on the include list...
        kept = [token for token, pos_tag in pos if pos_tag in pos_include]

        # ...then drop stopwords...
        kept = [token for token in kept if token not in stop_words]

        # ...and reduce each survivor to its singular form.
        kept = [singularize(token) for token in kept]

        # Preserve first-occurrence order (purely aesthetic — so no set()),
        # and keep the casing of the first occurrence; tags shorter than
        # two characters are discarded.
        result = CIList()
        for token in kept:
            if token not in result and len(token) >= 2:
                result.append(token)

        return result
Exemplo n.º 3
0
    def tag(cls, text):
        """Class method that returns tags given some text"""

        # Apostrophes interfere with tokenization, so strip them first.
        stripped = text.replace("'", "")
        cap_type = capitalization_type(stripped)
        tokenizer = BasicTokenizer()

        # Lower-case all-caps text before tokenizing; otherwise keep as-is.
        source = stripped.lower() if cap_type == 'ALLCAPS' else stripped
        context = tokenizer.tokenize(source)

        tags = []
        for idx, word in enumerate(context):
            features = featurize(idx, context)
            sample = {
                'word': word,
                'context': stripped,
                'features': features,
                'tokens': features,
                # NOTE: live reference — 'tags' grows as words are accepted,
                # so later classifications can see earlier decisions.
                'tags': tags,
            }
            label, _score = apply_multinomial_NB(C, V, prior, condprob, sample)
            if label == 'ham':
                tags.append(word)

        # Drop stopwords, then reduce each survivor to its singular form.
        tags = [singularize(t) for t in tags if t not in stop_words]

        # Preserve first-occurrence order (purely aesthetic — so no set()),
        # keep the casing of the first occurrence, and discard tags shorter
        # than two characters.
        unique = CIList()
        for t in tags:
            if t in unique or len(t) < 2:
                continue
            unique.append(t)

        return unique
Exemplo n.º 4
0
def test_singularizer():
    """Table-driven check of singularize() on known plural/singular pairs."""
    cases = [
        ('movies', 'movie'),
        ('business', 'business'),
        ('series', 'series'),
        ('women', 'woman'),
        ('radii', 'radius'),
        ('octopii', 'octopus'),
        ('virii', 'virus'),
        ('fish', 'fish'),
        ('properties', 'property'),
        ('drapes', 'drape'),
        ('types', 'type'),
        ('pass', 'pass'),
        ('balls', 'ball'),
        ('scissors', 'scissors'),
        ('clothes', 'cloth'),
        ('theses', 'thesis'),
        ('indices', 'index'),
        ('knives', 'knife'),
        ('lives', 'life'),
        ('thieves', 'thief'),
        ('fungi', 'fungus'),
    ]
    for plural, expected in cases:
        assert singularize(plural) == expected