Example #1
    def extract_verb_and_entity_name_from_text(self, text):
        """
        Extract lemmatized verbs and cleaned noun-phrase entity names from the text.
        :param text: the text
        :return: list of unique verb and entity-name strings
        """
        verb_result = []
        np_result = []

        pst = parsetree(text)
        for sentence in pst:
            for chunk in sentence.chunks:
                if chunk.type == "NP":
                    np_result.append(chunk)
                    continue
                if chunk.type == "VP":
                    word_tagged_list = chunk.tagged  # (word string, POS tag) pairs in the VP chunk
                    for word in word_tagged_list:
                        if word[1][0] == "V":  # keep only verb tags (VB, VBD, VBG, ...)
                            verb_result.append(word[0])
                    continue

        verb_result = set(verb_result)

        final_result = []
        for candidate in verb_result:
            candidate = self.lemmatizer.lemmatize(candidate)
            if candidate not in self.stopwords:
                final_result.append(candidate)

        for np in np_result:
            entity_name = self.get_clean_entity_name_for_string(np.string)
            if entity_name:
                final_result.append(entity_name)

        return list(set(final_result))
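# A standalone, runnable sketch of the same idea (illustrative only, not the original
# project's code): self.lemmatizer, self.stopwords and get_clean_entity_name_for_string()
# are replaced here by NLTK's WordNetLemmatizer, NLTK's English stopword list and a trivial
# lower/strip cleaner.
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from pattern.en import parsetree

def extract_verbs_and_entities(text):
    lemmatizer = WordNetLemmatizer()
    stoplist = set(stopwords.words('english'))
    verbs, entities = set(), set()
    for sentence in parsetree(text):
        for chunk in sentence.chunks:
            if chunk.type == "NP":
                entities.add(chunk.string.lower().strip())  # stand-in for the cleaning helper
            elif chunk.type == "VP":
                for word, tag in chunk.tagged:              # (word string, POS tag) pairs
                    if tag.startswith("V"):
                        verbs.add(lemmatizer.lemmatize(word))
    return sorted(v for v in verbs if v not in stoplist) + sorted(entities)

print(extract_verbs_and_entities("The black cat chased the mouse and ate it."))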
Example #2
    def extract_chunk(self, text):
        """
        Return all the chunks extracted from the text.
        :param text: the text
        :return: list of pattern.text.Chunk objects
        """
        result = []
        pst = parsetree(text)
        for sentence in pst:
            for chunk in sentence.chunks:
                if chunk.type == "NP":
                    result.append(chunk)
                    # print(chunk.type, [(w.string, w.type) for w in chunk.words])
        return result
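# A short usage sketch for the chunk extraction above (assumes parsetree is imported from
# pattern.en, which the fragment does not show): print each NP chunk and its tagged words.
from pattern.en import parsetree

for sentence in parsetree("The black cat sat on the old mat."):
    for chunk in sentence.chunks:
        if chunk.type == "NP":
            print(chunk.string, [(w.string, w.type) for w in chunk.words])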
Example #3
    def extract_single_verb(self, text):
        """
        Return all single verbs extracted from the text (the first word of each VP chunk),
        lemmatized and filtered against the stopword list.
        :param text: the text
        :return: list of unique verb strings
        """
        result = []
        pst = parsetree(text)
        for sentence in pst:
            for chunk in sentence.chunks:
                if chunk.type == "VP":
                    result.append(chunk.string.split(" ")[0])

        candidates = set(result)

        final_result = []
        for candidate in candidates:
            candidate = self.lemmatizer.lemmatize(candidate)
            if candidate not in self.stopwords:
                final_result.append(candidate)

        return list(set(final_result))
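# A possible refinement (a sketch, not from the original code): a VP chunk may start with an
# auxiliary or adverb, so filtering on the word-level tag, as Example #1 does, is more robust
# than splitting the chunk string on the first space.
from pattern.en import parsetree

for sentence in parsetree("He has quickly eaten the fish."):
    for chunk in sentence.chunks:
        if chunk.type == "VP":
            print([w.string for w in chunk.words if w.type.startswith("VB")])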
Example #4
    def test_get_np_for_all(self):
        text_list = self.text_list

        from textblob.taggers import NLTKTagger
        from textblob.tokenizers import SentenceTokenizer
        chunker = ConllExtractor()

        tb = Blobber(pos_tagger=NLTKTagger(),
                     tokenizer=SentenceTokenizer(),
                     np_extractor=chunker)

        for text in text_list:
            # tbinstance=tb(text)
            # sentences=tbinstance.sentences
            # print(sentences)
            # for s in sentences:
            #     s.
            pst = parsetree(text)
            print(pst)
            for sentence in pst:
                for chunk in sentence.chunks:
                    if chunk.type == "NP":
                        print(chunk.type, [(w.string, w.type)
                                           for w in chunk.words])
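        # Note: the Blobber above is configured with the ConllExtractor, but the loop uses
        # pattern's parsetree directly. Getting noun phrases from the Blobber itself would
        # look like this (standard TextBlob attributes; added here as an illustrative sketch):
        for text in text_list:
            blob = tb(text)
            print(blob.noun_phrases)  # noun phrases found by the ConllExtractor
            print(blob.tags)          # (word, POS tag) pairs from the NLTKTagger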
Example #5
print(parse("the black cats"      , chunks=False, language="en", tagset=UNIVERSAL))
print(parse("los gatos negros"    , chunks=False, language="es", tagset=UNIVERSAL))
print(parse("les chats noirs"     , chunks=False, language="fr", tagset=UNIVERSAL))
print(parse("i gatti neri"        , chunks=False, language="it", tagset=UNIVERSAL))
print(parse("de zwarte katten"    , chunks=False, language="nl", tagset=UNIVERSAL))
print()

# This comes at the expense of (in this example) losing information about plural nouns (NNS => NN).
# But it may be more comfortable for you to build multilingual apps 
# using the universal constants (e.g., PRON, PREP, CONJ), 
# instead of learning the Penn Treebank tagset by heart,
# or wondering why the Italian "che" is tagged "PRP", "IN" or "CC"
# (in the universal tagset it is a PRON or a CONJ).

from pattern.text import parsetree

for sentence in parsetree("i gatti neri che sono la mia", language="it", tagset=UNIVERSAL):
    for word in sentence.words:
        if word.tag == PRON:
            print(word)
            
# The language() function in pattern.text can be used to guess the language of a text.
# It returns a (language code, confidence)-tuple.
# It can guess en, es, de, fr, it, nl.

from pattern.text import language

print()
print(language(u"the cat sat on the mat"))             # ("en", 1.00)
print(language(u"de kat zat op de mat"))               # ("nl", 0.80)
print(language(u"le chat s'était assis sur le tapis")) # ("fr", 0.86)
Example #6
def getSortedSentenceList(query, raw_sentence_list, english_postagger, min_words=14, max_sentences=2000):
    # build a word graph (an igraph object) from the enhanced, POS-tagged sentences
    enhancedSentences = generateEnhancedSentences(raw_sentence_list, english_postagger)
    print('Enhanced sentences==>', len(enhancedSentences))
    taggedSentences = english_postagger.tag_sents(nltk.word_tokenize(sent) for sent in enhancedSentences)
    taggedSentences = generateTempRewrittenSentences(taggedSentences)
    iobject = generateMultiplePaths(taggedSentences)
    startvertex = getVertex(iobject, '-start-/-/-start-')
    endvertex = getVertex(iobject, '-end-/-/-end-')
    vertexList = iobject.vs()
    allpaths = paths_from_to(iobject, startvertex, endvertex)
    shuffle(allpaths)
    allpaths=allpaths[0:2000]
    generatedSentences = []
    a = datetime.datetime.now()
    print('starting paths...')
    sentence_container = {}
    for path in allpaths:
        paired_parentheses = 0
        quotation_mark_number = 0
        if len(path) >= min_words: 
            sentence = ' '.join(getWordFromVertexName(vertexList[element]['name']) for element in path) 
            for word in sentence.split():
                if word == '(':
                    paired_parentheses -= 1
                elif word == ')':
                    paired_parentheses += 1
                elif word == '"' or word == '\'\'' or word == '``':
                    quotation_mark_number += 1
            if paired_parentheses == 0 and \
                    (quotation_mark_number % 2) == 0 and \
                    sentence.strip() not in sentence_container:
                generatedSentences.append(sentence.strip())
                sentence_container[sentence.strip()] = 1
            
    
    b = datetime.datetime.now()
    print('done with paths', (b - a))
    shuffle(generatedSentences)
    generatedSentences = generatedSentences[0:max_sentences]
    # keep only sentences that contain a verb phrase (VP) and a subject (SBJ) relation;
    # build a new list instead of removing items from the list while iterating over it
    filteredSentences = []
    for gensent in generatedSentences:
        s = parsetree(gensent, tokenize=True, relations=True, lemmata=True)
        chunkList = [chunk.type for row in s for chunk in row.chunks]
        relationList = [rel for row in s for rel in row.relations]
        if 'VP' in chunkList and 'SBJ' in relationList:  # subject and verb present
            filteredSentences.append(gensent)
    generatedSentences = filteredSentences
            
    #shuffle(generatedSentences)        
    
    
    docs=[]
    docs.append(query) ## Query add
    docs.extend(generatedSentences)
       
    bow_matrix = TfidfVectorizer(stop_words=stopwordList).fit_transform(docs)
    normalized = TfidfTransformer().fit_transform(bow_matrix)
    
    cosine_similarity_matrix = (normalized[1:] * normalized[1:].T).A
    sources, targets = cosine_similarity_matrix.nonzero()
    similarity_igraph = igraph.Graph(zip(sources, targets), directed=True)
    scores = igraph.Graph.pagerank(similarity_igraph)
    
    docqueryRelevance = linear_kernel(normalized[0:1], normalized[1:]).flatten()
    
    scoredList = [(scores[i] * docqueryRelevance[i], s, i) for i, s in enumerate(generatedSentences)]
    #scoredList = [(docqueryRelevance[i], s, i) for i, s in enumerate(generatedSentences)]
    
    #for score, sent, i in scoredList:
        #print score, sent, i
    #cosine_similarity_matrix=np.asmatrix(cosine_similarity_matrix)    
    return scoredList, cosine_similarity_matrix    
Example #7

# Assumed imports and data for this fragment (the original snippet omits them): the taggers
# come from nltk.tag, parsetree from pattern.en, and train_data/test_data are assumed to be
# lists of POS-tagged sentences, e.g. from nltk.corpus.treebank.
from nltk.tag import RegexpTagger, UnigramTagger, BigramTagger, TrigramTagger
from pattern.en import parsetree

patterns = [
    # ... (earlier regex patterns omitted in the original snippet)
    (r'.*s$', 'NNS'),                 # plural nouns
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),  # cardinal numbers
    (r'.*', 'NN')                     # nouns (default)
]

rt = RegexpTagger(patterns)

rt.evaluate(test_data)

ut = UnigramTagger(train_data)
bt = BigramTagger(train_data)
tt = TrigramTagger(train_data)

ut.evaluate(test_data)


def combined_tagger(train_data, taggers, backoff=None):
    for tagger in taggers:
        backoff = tagger(train_data, backoff=backoff)
    return backoff


ct = combined_tagger(train_data=train_data,
                     taggers=[UnigramTagger, BigramTagger, TrigramTagger],
                     backoff=rt)
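# A short usage sketch (not part of the original fragment): the combined tagger backs off
# from TrigramTagger to BigramTagger to UnigramTagger and finally to the regex tagger; it
# can be evaluated on the held-out data or applied to a tokenised sentence directly.
print(ct.evaluate(test_data))
print(ct.tag("The quick brown fox jumps over the lazy dog".split()))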

tree = parsetree(sentence)

for sentence_tree in tree:
    print(sentence_tree.chunks)
Example #8
'''
Created on Apr 9, 2015

@author: sub253
'''
from pattern.text import parsetree, Chunk, pprint, parse
sent = 'A number of methods have been recommended to help ease symptoms, including adequate liquid intake and rest.'
#sent ='There are two more scripts of interest.'
s = parsetree(sent, tokenize=True, relations=True, lemmata=True)
#chunk = Chunk(s)
#

#parse(sent, relations=True))
print(s)
relationList = s.sentences[0].relations

print('Relationlist=>', relationList)
sbjstring = ''
objstring = ''
if 'SBJ' in relationList:
    for chunk in relationList['SBJ'].values():
        print(chunk.words)
        sbjstring = sbjstring + ' ' + ' '.join(word.string
                                               for word in chunk.words)

if 'OBJ' in relationList:
    for chunk in relationList['OBJ'].values():
        print(chunk.words)
        objstring = objstring + ' ' + ' '.join(word.string
                                               for word in chunk.words)
print(sbjstring.strip())
print(objstring.strip())
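# A small extension (a sketch, not from the original script): the relations dict maps each
# role to {relation index: chunk}; assuming SBJ/OBJ indices pair up with the VP entries,
# rough subject-verb-object triples can be read off.
if 'VP' in relationList:
    for index, vp in relationList['VP'].items():
        subj = relationList.get('SBJ', {}).get(index)
        obj = relationList.get('OBJ', {}).get(index)
        print(subj.string if subj else '-', vp.string, obj.string if obj else '-')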