示例#1
0
def categorize_phrases(sentphrases, body_part_name, use_stem=True):
    """Classify the words of each phrase against the global ``taxonomy``.

    Every row of ``sentphrases`` supplies a ``'sentence'`` string and a
    ``'phrase'``: a sized iterable of word objects exposing ``.string`` and
    ``.tag``.  Each word that ``taxonomy.classify`` recognises yields one
    output record; unrecognised words yield nothing.

    Args:
        sentphrases: DataFrame with ``'sentence'`` and ``'phrase'`` columns.
        body_part_name: Label copied unchanged into the ``body_part`` column.
        use_stem: When true, each word is Porter-stemmed before it is
            classified; otherwise the raw word string is classified.

    Returns:
        A DataFrame with the columns ``body_part``, ``sentence``, ``phrase``,
        ``category``, ``parent`` and ``score``.  ``parent`` is the first
        entry of ``taxonomy.parents(category)``, or ``''`` when there is none
        or when it equals the category itself.  ``score`` is shared by all
        records of one phrase and equals ``matched ** 2 / len(phrase)``.
        Commas are stripped from the ``sentence`` and ``phrase`` strings.
        Phrases without any recognised word (including empty phrases)
        contribute no rows.
    """
    columns = ['body_part', 'sentence', 'phrase', 'category', 'parent', 'score']

    # Build the stemmer only when it is actually requested, so that
    # use_stem=False does not depend on nltk.
    stem = nltk.stem.PorterStemmer().stem if use_stem else None

    records = []
    for _, row in sentphrases.iterrows():
        sentence = row['sentence']
        phrase = row['phrase']

        # (category, parent) pairs, one per recognised word, in phrase order.
        matches = []
        for word in phrase:
            token = stem(word.string) if stem else word.string
            category = taxonomy.classify(token)
            if not category:
                # Words the taxonomy does not recognise are skipped.
                continue

            parent = taxonomy.parents(category)
            if parent and parent[0] != category:
                matches.append((category, parent[0]))
            else:
                matches.append((category, ''))

        if not matches:
            # Nothing to emit for this phrase.  Returning early here also
            # avoids dividing by zero when the phrase is empty.
            continue

        phrase_str = ' '.join([w.string + '/' + w.tag for w in phrase])
        phrase_str = phrase_str.replace(',', '')
        sentence_str = sentence.replace(',', '')

        # Number of matches weighted by the fraction of words that matched.
        score = len(matches) * (len(matches) / (1. * len(phrase)))

        for category, parent in matches:
            records.append((body_part_name, sentence_str, phrase_str,
                            category, parent, score))

    return pd.DataFrame(records, columns=columns)
示例#2
0
from pattern.search import search, taxonomy, Classifier
from pattern.en import parsetree

# The search module includes a Taxonomy class
# that can be used to define semantic word types.
# For example, suppose you want to extract flower names from a text.
# Listing every name would make search patterns somewhat unwieldy:
# search("rose|lily|daisy|daffodil|begonia", txt).

# A better approach is to use the taxonomy:
# register each flower name under the "flower" type.
for flower in ("rose", "lily", "daisy", "daffodil", "begonia"):
    taxonomy.append(flower, type="flower")

# Query the taxonomy in both directions: terms registered under "flower",
# then the type(s) recorded for "rose".
print taxonomy.children("flower")
print taxonomy.parents("rose")
print taxonomy.classify("rose")  # Yields the most recently added parent.
print

# Taxonomy terms can be included in a pattern by using uppercase:
# NOTE(review): lemmata=True is presumably what lets the plural "daffodils"
# match the registered term "daffodil" -- confirm against the pattern docs.
t = parsetree("A field of white daffodils.", lemmata=True)
m = search("FLOWER", t)
print t
print m
print

# Another example:
# "chicken" is registered under two types ("food", then "bird"), and "bird"
# is itself registered under "animal", giving a two-level hierarchy.
taxonomy.append("chicken", type="food")
taxonomy.append("chicken", type="bird")
taxonomy.append("penguin", type="bird")
taxonomy.append("bird", type="animal")
示例#3
0
from pattern.search import Pattern, Constraint, Classifier, taxonomy
from pattern.en     import Sentence, parse

# The search module includes a Taxonomy class
# that can be used to define semantic word types.
# For example, suppose you want to extract flower names from a text.
# Listing every name would make patterns somewhat unwieldy, e.g.:
# Pattern.fromstring("rose|lily|daisy|daffodil|begonia").

# A better approach is to use the taxonomy:
# register each flower name under the "flower" type.
for flower in ("rose", "lily", "daisy", "daffodil", "begonia"):
    taxonomy.append(flower, type="flower")
    
# Query the taxonomy in both directions: terms registered under "flower",
# then the type(s) recorded for "rose".
print taxonomy.children("flower")
print taxonomy.parents("rose")
print taxonomy.classify("rose") # Yields the most recently added parent.
print
    
# Taxonomy terms can be included in a pattern:
# The first assignment builds the pattern from an explicit Constraint; the
# second immediately rebinds p to a pattern parsed from a string, so only
# the second pattern is used below.
p = Pattern([Constraint(taxa=["flower"])]) # or
p = Pattern.fromstring("FLOWER")

# NOTE(review): lemmata=True is presumably what lets the plural "daffodils"
# match the registered term "daffodil" -- confirm against the pattern docs.
s = Sentence(parse("A field of white daffodils.", lemmata=True))
m = p.search(s)
print s
print m
print

# search is imported here, mid-script, rather than with the imports at the
# top; it is not used in the lines visible in this snippet.
from pattern.search import search
taxonomy.append("chicken", type="food")
        ## Parse the whole document for analyzing
        ## The pattern.en parser groups words that belong together into chunks.
        ##For example, the black cat is one chunk, tagged NP (i.e., a noun phrase)
        t = parsetree(modified_text, lemmata=True)

        ## get target search phrases based on the top freq words.
        results_dict = get_phrases_fr_list_of_keywords(t, list_of_top_freq_words, phrases_num_limit=5)

        ##>>> ['turbine', 'fluid', 'impulse', 'rotor']
        ##>>> keywords:  turbine
        ##>>> [u'Turbine', u'.A steam turbine', u'case openedA turbine', u'useful work .A turbine', u'rotor .Early turbine']
        ##>>> ********
        ##>>> keywords:  fluid
        ##>>> [u'fluid', u'working fluid', u'a high velocity fluid', u'the fluid', u'the working fluid']
        ##>>> ********
        ##>>> keywords:  impulse
        ##>>> [u'impulse', u'reaction and impulse', u'Impulse', u'de Laval type impulse', u'equivalent impulse']
        ##>>> ********
        ##>>> keywords:  rotor
        ##>>> [u'rotor', u'the rotor', u'turbine rotor', u'absolute terms the rotor', u'temperature turbine rotor']
        ##>>> ********

        taxonomy.classifiers.append(WordNetClassifier())

        for n in list_of_top_freq_words:
            pass
            print taxonomy.parents(n)

        sys.exit()
示例#5
0
文件: frames.py 项目: rsteckel/EDA
from pattern.en import conjugate, lemma, lexeme
from pattern.search import search, taxonomy



# Register individual flower names under the 'flower' type.
for f in ('rose', 'lily', 'daisy', 'daffodil', 'begonia'):
    taxonomy.append(f, type='flower')

# Register 'flower' and 'tree' under 'plant', so flower names sit two levels
# below 'plant' in the taxonomy.
for f in ('flower', 'tree'):
    taxonomy.append(f, type='plant')
    

# Search a parsed sentence for anything classified under PLANT.
# NOTE(review): parsetree is not imported in the visible import lines of this
# snippet (only conjugate, lemma, lexeme, search and taxonomy are) -- confirm
# it is imported elsewhere in the file.
t = parsetree('A field of daffodils is white.', lemmata=True)
print search('PLANT', t) 

# The results of these two lookups are discarded; presumably they were run
# interactively to inspect the hierarchy.
taxonomy.parents('daffodil', recursive=True)
taxonomy.children('plant', recursive=False)


#def taxonomy_normalize(sentence):    
#    bp_match = search('BEAUTY_PARTS', parsetree(sentence, lemmata=True))
#    facial_match = search('MAKEUP', parsetree(sentence, lemmata=True))
#    feet_match = search('FEET', parsetree(sentence, lemmata=True))
#    body_match = search('BODY', parsetree(sentence, lemmata=True))    
#    
#    matches = [ [ 'BEAUTY_PARTS-'+word.lemma for word in m] for m in bp_match ] \
#                + [ [ 'MAKEUP-'+word.lemma for word in m] for m in facial_match ] \
#                + [ [ 'FEET-'+word.lemma for word in m] for m in feet_match ] \
#                + [ [ 'BODY-'+word.lemma for word in m] for m in body_match ]
#
#    return matches
示例#6
0
        ##For example, the black cat is one chunk, tagged NP (i.e., a noun phrase)
        t = parsetree(modified_text, lemmata=True)

        ## get target search phrases based on the top freq words.
        results_dict = get_phrases_fr_list_of_keywords(t, list_of_top_freq_words, phrases_num_limit =5)

        ##>>> ['turbine', 'fluid', 'impulse', 'rotor']
        ##>>> keywords:  turbine
        ##>>> [u'Turbine', u'.A steam turbine', u'case openedA turbine', u'useful work .A turbine', u'rotor .Early turbine']
        ##>>> ********
        ##>>> keywords:  fluid
        ##>>> [u'fluid', u'working fluid', u'a high velocity fluid', u'the fluid', u'the working fluid']
        ##>>> ********
        ##>>> keywords:  impulse
        ##>>> [u'impulse', u'reaction and impulse', u'Impulse', u'de Laval type impulse', u'equivalent impulse']
        ##>>> ********
        ##>>> keywords:  rotor
        ##>>> [u'rotor', u'the rotor', u'turbine rotor', u'absolute terms the rotor', u'temperature turbine rotor']
        ##>>> ********

        taxonomy.classifiers.append(WordNetClassifier())

        for n in list_of_top_freq_words:
            pass
            print taxonomy.parents(n)

        sys.exit()



示例#7
0
    




# Register 'reflect' and 'bank' under the 'angle' type.
for f in ('reflect', 'bank'):
    taxonomy.append(f, type='angle')

# Register 'bank' a second time, now under 'finance', so the same term
# belongs to two types.
for f in ('bank', 'financial-institution'):
    taxonomy.append(f, type='finance')
    

# NOTE(review): no 'plant' entries are added in the visible lines of this
# snippet, and parsetree/search/taxonomy are not imported here -- this
# presumably relies on earlier code in the same file or session; confirm.
t = parsetree('A field of daffodils is white.', lemmata=True)
print search('PLANT', t) 

# The results of the next three lookups are discarded; presumably they were
# run interactively to inspect the taxonomy.
taxonomy.parents('daffodil', recursive=True)
taxonomy.children('plant', recursive=False)

# 'bank' was added under 'angle' first and 'finance' second.
taxonomy.classify('bank')





from pattern.en import wordnet


# Pick specific senses by position: the fifth synset returned for 'tone' and
# the first returned for 'color'.
# NOTE(review): the index 4 assumes at least five synsets exist for 'tone'.
a = wordnet.synsets('tone')[4]

b = wordnet.synsets('color')[0]