def categorize_phrases(sentphrases, body_part_name, use_stem=True):
    """Build one table row per phrase word that the taxonomy can classify.

    For every row of ``sentphrases`` each word of ``row['phrase']`` is looked
    up with ``taxonomy.classify`` (on its Porter stem when ``use_stem`` is
    true, on the raw ``word.string`` otherwise).  Every word that receives a
    category yields one output record.

    Args:
        sentphrases: Object whose ``iterrows()`` yields ``(index, row)`` pairs
            where ``row['sentence']`` supports ``.replace`` and
            ``row['phrase']`` is a sized iterable of words exposing
            ``.string`` and ``.tag``.
        body_part_name: Value copied into the ``body_part`` column of every
            record.
        use_stem: When true, classify the stemmed word instead of the raw one.

    Returns:
        A ``pandas.DataFrame`` with the columns ``body_part``, ``sentence``,
        ``phrase``, ``category``, ``parent`` and ``score``.  ``parent`` is the
        first entry of ``taxonomy.parents(category)`` when it differs from the
        category itself, otherwise the empty string.  ``score`` is
        ``n * (n / len(phrase))`` where ``n`` is the number of categorized
        words in the phrase.  Rows without any categorized word contribute no
        records (an empty phrase no longer raises ``ZeroDivisionError``).
    """
    stemmer = nltk.stem.PorterStemmer()
    records = []

    for _, row in sentphrases.iterrows():
        sentence = row['sentence']
        phrase = row['phrase']

        # (category, parent) pairs; keeping them together makes a length
        # mismatch between categories and parents impossible.
        matches = []
        for word in phrase:
            term = stemmer.stem(word.string) if use_stem else word.string
            category = taxonomy.classify(term)
            if not category:
                continue
            ancestors = taxonomy.parents(category)
            if ancestors and ancestors[0] != category:
                parent = ancestors[0]
            else:
                parent = ''
            matches.append((category, parent))

        # Nothing to emit for this row; skipping here also avoids dividing by
        # len(phrase) == 0 when the phrase is empty.
        if not matches:
            continue

        # Commas are stripped from both text columns, presumably so the
        # values are safe for comma-separated export -- TODO confirm.
        phrase_str = ' '.join([w.string + '/' + w.tag for w in phrase])
        phrase_str = phrase_str.replace(',', '')
        sentence_str = sentence.replace(',', '')

        score = len(matches) * (len(matches) / (1. * len(phrase)))
        for category, parent in matches:
            records.append(
                (body_part_name, sentence_str, phrase_str,
                 category, parent, score))

    return pd.DataFrame(
        records,
        columns=['body_part', 'sentence', 'phrase',
                 'category', 'parent', 'score'])
from pattern.search import search, taxonomy, Classifier
from pattern.en import parsetree

# Demo: register words in the pattern.search taxonomy and use a taxonomy
# term inside a search pattern.
#
# The search module includes a Taxonomy class
# that can be used to define semantic word types.
# For example, consider that you want to extract flower names from a text.
# This would make search patterns somewhat unwieldy:
# search("rose|lily|daisy|daffodil|begonia", txt).
# A better approach is to use the taxonomy:
for flower in ("rose", "lily", "daisy", "daffodil", "begonia"):
    taxonomy.append(flower, type="flower")

# Single-argument print(...) calls give the same output on Python 2 and are
# valid syntax on Python 3 (the bare print statement is not).
print(taxonomy.children("flower"))
print(taxonomy.parents("rose"))
print(taxonomy.classify("rose"))  # Yields the most recently added parent.
print("")

# Taxonomy terms can be included in a pattern by using uppercase:
t = parsetree("A field of white daffodils.", lemmata=True)
m = search("FLOWER", t)
print(t)
print(m)
print("")

# Another example: "chicken" is registered under two types, and "bird"
# itself is registered as an "animal".
taxonomy.append("chicken", type="food")
taxonomy.append("chicken", type="bird")
taxonomy.append("penguin", type="bird")
taxonomy.append("bird", type="animal")
from pattern.search import Pattern, Constraint, Classifier, taxonomy
from pattern.en import Sentence, parse

# Demo: register words in the pattern.search taxonomy and match them with a
# Pattern built from a taxonomy Constraint.
#
# The search module includes a Taxonomy class
# that can be used to define semantic word types.
# For example, consider that you want to extract flower names from a text.
# This would make patterns somewhat unwieldy, e.g.:
# Pattern.fromstring("rose|lily|daisy|daffodil|begonia").
# A better approach is to use the taxonomy:
for flower in ("rose", "lily", "daisy", "daffodil", "begonia"):
    taxonomy.append(flower, type="flower")

# Single-argument print(...) calls give the same output on Python 2 and are
# valid syntax on Python 3 (the bare print statement is not).
print(taxonomy.children("flower"))
print(taxonomy.parents("rose"))
print(taxonomy.classify("rose"))  # Yields the most recently added parent.
print("")

# Taxonomy terms can be included in a pattern:
p = Pattern([Constraint(taxa=["flower"])])
# or, as an alternative spelling (this second assignment is the one that
# remains bound to p):
p = Pattern.fromstring("FLOWER")

s = Sentence(parse("A field of white daffodils.", lemmata=True))
m = p.search(s)
print(s)
print(m)
print("")

from pattern.search import search

taxonomy.append("chicken", type="food")
## Parse the whole document for analyzing
## The pattern.en parser groups words that belong together into chunks.
## For example, the black cat is one chunk, tagged NP (i.e., a noun phrase)
t = parsetree(modified_text, lemmata=True)

## get target search phrases based on the top freq words.
results_dict = get_phrases_fr_list_of_keywords(t, list_of_top_freq_words, phrases_num_limit=5)

## Sample output recorded from a previous run:
##>>> ['turbine', 'fluid', 'impulse', 'rotor']
##>>> keywords: turbine
##>>> [u'Turbine', u'.A steam turbine', u'case openedA turbine', u'useful work .A turbine', u'rotor .Early turbine']
##>>> ********
##>>> keywords: fluid
##>>> [u'fluid', u'working fluid', u'a high velocity fluid', u'the fluid', u'the working fluid']
##>>> ********
##>>> keywords: impulse
##>>> [u'impulse', u'reaction and impulse', u'Impulse', u'de Laval type impulse', u'equivalent impulse']
##>>> ********
##>>> keywords: rotor
##>>> [u'rotor', u'the rotor', u'turbine rotor', u'absolute terms the rotor', u'temperature turbine rotor']
##>>> ********

## Register a WordNetClassifier with the taxonomy; presumably this lets
## keywords that were never added by hand still resolve to parents --
## confirm against the pattern.search documentation.
taxonomy.classifiers.append(WordNetClassifier())

## NOTE(review): the source lost its indentation; the print is assumed to be
## the loop body and sys.exit() to follow the loop -- confirm.
for n in list_of_top_freq_words:
    ## Single-argument print(...) is valid on both Python 2 and Python 3.
    print(taxonomy.parents(n))

## Stop the script here.
sys.exit()
from pattern.en import conjugate, lemma, lexeme
from pattern.search import search, taxonomy

# Build a two-level taxonomy: individual flowers are typed as 'flower', and
# 'flower' / 'tree' are in turn typed as 'plant'.
for f in ('rose', 'lily', 'daisy', 'daffodil', 'begonia'):
    taxonomy.append(f, type='flower')
for f in ('flower', 'tree'):
    taxonomy.append(f, type='plant')

# NOTE(review): parsetree is not imported in this snippet; presumably it is
# imported earlier in the file -- confirm.
t = parsetree('A field of daffodils is white.', lemmata=True)
# Single-argument print(...) gives the same output on Python 2 and is valid
# syntax on Python 3 (the bare print statement is not).
print(search('PLANT', t))

# The results of these two lookups are neither printed nor stored, so they
# are only visible when run in an interactive session.
taxonomy.parents('daffodil', recursive=True)
taxonomy.children('plant', recursive=False)

#def taxonomy_normalize(sentence):
#    bp_match = search('BEAUTY_PARTS', parsetree(sentence, lemmata=True))
#    facial_match = search('MAKEUP', parsetree(sentence, lemmata=True))
#    feet_match = search('FEET', parsetree(sentence, lemmata=True))
#    body_match = search('BODY', parsetree(sentence, lemmata=True))
#
#    matches = [ [ 'BEAUTY_PARTS-'+word.lemma for word in m] for m in bp_match ] \
#            + [ [ 'MAKEUP-'+word.lemma for word in m] for m in facial_match ] \
#            + [ [ 'FEET-'+word.lemma for word in m] for m in feet_match ] \
#            + [ [ 'BODY-'+word.lemma for word in m] for m in body_match ]
#
#    return matches
## For example, the black cat is one chunk, tagged NP (i.e., a noun phrase)
t = parsetree(modified_text, lemmata=True)

## get target search phrases based on the top freq words.
results_dict = get_phrases_fr_list_of_keywords(t, list_of_top_freq_words, phrases_num_limit=5)

## Sample output recorded from a previous run:
##>>> ['turbine', 'fluid', 'impulse', 'rotor']
##>>> keywords: turbine
##>>> [u'Turbine', u'.A steam turbine', u'case openedA turbine', u'useful work .A turbine', u'rotor .Early turbine']
##>>> ********
##>>> keywords: fluid
##>>> [u'fluid', u'working fluid', u'a high velocity fluid', u'the fluid', u'the working fluid']
##>>> ********
##>>> keywords: impulse
##>>> [u'impulse', u'reaction and impulse', u'Impulse', u'de Laval type impulse', u'equivalent impulse']
##>>> ********
##>>> keywords: rotor
##>>> [u'rotor', u'the rotor', u'turbine rotor', u'absolute terms the rotor', u'temperature turbine rotor']
##>>> ********

## Register a WordNetClassifier with the taxonomy; presumably this lets
## keywords that were never added by hand still resolve to parents --
## confirm against the pattern.search documentation.
taxonomy.classifiers.append(WordNetClassifier())

## NOTE(review): the source lost its indentation; the print is assumed to be
## the loop body and sys.exit() to follow the loop -- confirm.
for n in list_of_top_freq_words:
    ## Single-argument print(...) is valid on both Python 2 and Python 3.
    print(taxonomy.parents(n))

## Stop the script here.
sys.exit()
# Register an ambiguous word: 'bank' is typed both as 'angle' and as
# 'finance'.
# NOTE(review): taxonomy, parsetree and search are not imported in this
# snippet; presumably they are imported earlier in the file -- confirm.
for f in ('reflect', 'bank'):
    taxonomy.append(f, type='angle')
for f in ('bank', 'financial-institution'):
    taxonomy.append(f, type='finance')

t = parsetree('A field of daffodils is white.', lemmata=True)
# Single-argument print(...) gives the same output on Python 2 and is valid
# syntax on Python 3 (the bare print statement is not).
print(search('PLANT', t))

# The results of these lookups are neither printed nor stored, so they are
# only visible when run in an interactive session.
taxonomy.parents('daffodil', recursive=True)
taxonomy.children('plant', recursive=False)
taxonomy.classify('bank')

from pattern.en import wordnet

# Pick one specific WordNet sense of each word by its position in the
# synsets list.
a = wordnet.synsets('tone')[4]
b = wordnet.synsets('color')[0]