def process(statement, database_name=DATABASE_NAME):
    '''
    Allows us to create entities via statements like
    "There is a course CSCI4702 called Mobile Programming"
    and modify entities with statements like
    "CSCI4702 has a start date of Jan 31st 2013".

    A statement like "There is a game engine Unity3d" already gives us
    trouble: it seems we need named entity recognition to extract types
    like that, or perhaps to rely on capitalization, which doesn't really
    work for things like CTO as a category of items.

    >>> sent = "There is a game engine Unreal Engine".split()
    >>> print nltk.ne_chunk(nltk.pos_tag(sent))
    '''
    # This runs fast, but it doesn't quite get the NN/NNP combination hoped
    # for from "There is a game engine Unity3D". It does with light=True,
    # but then it misses the NNP in "There is a game engine Source".
    s = parse(statement, relations=True, lemmata=True, light=True)
    s = split(s)
    #result = search('There be DT NN+ (DT) (RB) (JJ) NNP+ (call) (DT) (RB) (JJ) (NNPS|NNP)+', s)
    s, result = extract(statement)
    if result:
        noun = search('(NN)+', s)[0].string
        table = pluralize(noun.replace(' ', '_'))
        # This pulls in adjectives too; a better fix is supposedly coming.
        result = search('(JJ|NNPS|NNP)+', s)
        ident = result[0].string
        name = result[1].string if len(result) > 1 else ident
        return newTable(table, ident, name, database_name)
    else:
        return regexMatch(statement, database_name)
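A minimal usage sketch for process(), assuming DATABASE_NAME and the helpers extract(), newTable(), and regexMatch() are defined elsewhere in the same module; the statement comes from the docstring above:

if __name__ == '__main__':
    # Expected to create a "courses" table with ident CSCI4702
    # and name Mobile Programming, per the docstring above.
    print(process("There is a course CSCI4702 called Mobile Programming"))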
def sentiment(content):
    if len(wordnet.sentiment) == 0:
        wordnet.sentiment.load()
    relevant_types = ['JJ', 'VB', 'RB']  # adjectives, verbs, adverbs
    score = 0
    sentences = split(parse(content))
    for sentence in sentences:
        for index, word in enumerate(sentence.words):
            if word.string != '' and word.type in relevant_types:
                try:
                    synset = wordnet.synsets(word.string, word.type)
                except KeyError:
                    # Incorrect part-of-speech tag, or not in WordNet: skip it.
                    continue
                pos, neg, obj = synset[0].weight
                # Weight concluding statements more heavily
                # (idea from [Ohana, Tierney '09]).
                documentpos = index / float(len(sentence.words))
                # Weight more subjective statements.
                subjscore = (pos - neg) * (1 - obj)
                score = score + subjscore * documentpos
    return score
def sentiment(content):
    wordnet.sentiment.load()
    relevant_types = ['JJ', 'VB', 'RB']  # adjectives, verbs, adverbs
    score = 0
    sentences = split(parse(content, lemmata=True))
    for sentence in sentences:
        for word in sentence.words:
            if word.type in relevant_types:
                pos, neg, obj = wordnet.sentiment[word.lemma]
                score = score + (pos - neg) * (1 - obj)  # weight subjective words
    return score
def sentiment(content):
    from pattern.en import parse, split, wordnet  # must have SentiWordNet available
    wordnet.sentiment.load()
    relevant_types = ['JJ', 'VB', 'RB']  # adjectives, verbs, adverbs
    score = 0
    sentences = split(parse(content, lemmata=True))
    for sentence in sentences:
        for word in sentence.words:
            if word.type in relevant_types:
                pos, neg, obj = wordnet.sentiment[word.lemma]
                score = score + (pos - neg) * (1 - obj)  # weight subjective words heavily
    # Collapse the raw score into a binary polarity label.
    return 1 if score >= 0 else -1
def sentiment(content):
    from pattern.en import parse, split, wordnet
    wordnet.sentiment.load()
    # Adjectives, verbs (including inflected forms), and adverbs.
    relevant_types = ['JJ', 'VB', 'VBD', 'VBN', 'VBG', 'RB']
    score = 0
    sentences = split(parse(content, lemmata=True))
    for sentence in sentences:
        for word in sentence.words:
            if word.type in relevant_types:
                pos, neg, obj = wordnet.sentiment[word.lemma]
                score = score + (pos - neg) * (1 - obj)
    #return 1 if score >= 0 else -1
    return score
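A short usage sketch for the sentiment() variants above, assuming the pattern library and its SentiWordNet data are installed; the sample sentence is purely illustrative:

# The last variant returns a raw score; the earlier one returns a -1/1 label.
print(sentiment('The movie was surprisingly good, but the ending felt weak.'))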
def tokenize(self, text):
    """
    Tokenize words in a text and return the relevant ones.

    Parameters
    ----------
    text : str
        Text to tokenize.
    """
    for f in self.filters:
        text = f(text)
    words = []
    for s in nlp.split(nlp.parse(text)):
        for word, tag in s.tagged:
            if tag in self.nlp_tags:
                word = word.lower()
                if word not in self.exclude_words:
                    words.append(word)
    return words
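A sketch of the class this method could belong to, under the assumptions that nlp is the pattern.en module and that filters, nlp_tags, and exclude_words are plain attributes; every name here other than tokenize() is hypothetical:

import pattern.en as nlp

class Tokenizer(object):  # hypothetical host class for tokenize() above
    def __init__(self, filters=(), nlp_tags=('NN', 'NNS', 'JJ'), exclude_words=()):
        self.filters = list(filters)              # callables applied to the raw text
        self.nlp_tags = set(nlp_tags)             # POS tags considered relevant
        self.exclude_words = set(exclude_words)   # lowercased words to drop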
def test_split(self):
    # Assert split(parse(s)) == Text.
    v = en.split(en.parse("The cat purs."))
    self.assertTrue(isinstance(v, en.Text))
    print("pattern.en.split()")
spacy_pos = pd.concat(
    [
        labeled_pos,
        pd.DataFrame({
            "Spacy_pos_pred": tokens_pos,
            "Spacy_pos_full_pred": tokens_pos_full,
            "Spacy_tag": tokens_tag,
        }),
    ],
    axis=1,
)

# %%
# Pattern
s = parse(last_5_sent_full_clean)
s = split(s)
pattern_pos = []
for i in range(len(s)):
    pattern_pos.extend(list(s.sentences[i].pos))
if len(pattern_pos) != labeled_pos.shape[0]:
    print("inconsistency between pattern pos and labeled pos")
#labeled_pos["Pattern_pos_pred"] = pattern_pos
pattern_pos = pd.concat(
    [labeled_pos, pd.DataFrame({"Pattern_pos_pred": pattern_pos})], axis=1
)
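A hypothetical next step once the frames are aligned: score each tagger against the labeled tags. The gold column name 'Labeled_pos' below is an assumption, not taken from the code above:

# 'Labeled_pos' is a hypothetical name for the gold-tag column in labeled_pos.
spacy_acc = (spacy_pos["Spacy_pos_pred"] == spacy_pos["Labeled_pos"]).mean()
pattern_acc = (pattern_pos["Pattern_pos_pred"] == pattern_pos["Labeled_pos"]).mean()
print("spaCy: %.3f  pattern: %.3f" % (spacy_acc, pattern_acc))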
def main():
    # The first two vars hold the number of relevant sentences,
    # the other two the weighted float values.
    police_killer_i = 0
    police_killed_i = 0
    police_killer_value = 0.0
    police_killed_value = 0.0
    total_sentences = 0

    # Init Twitter query engine
    engine = Twitter(license=None, language='en')
    results_list = []

    print('Performing twitter queries...')
    # 4 different queries with 100 results each = 400 results
    results_list.append(engine.search('policeman kill', start=1, count=100, cached=False))
    results_list.append(engine.search('policeman killed', start=1, count=100, cached=False))
    results_list.append(engine.search('police kill', start=1, count=100, cached=False))
    results_list.append(engine.search('police killed', start=1, count=100, cached=False))

    # Open a file to collect some recognized examples
    examples_file = open('examples.txt', 'w')

    # For each list of results
    for ii in xrange(len(results_list)):
        print('Starting to analyze query results: ' + str(ii + 1) +
              ' out of ' + str(len(results_list)))
        for res in results_list[ii]:
            # Parse and split the tweet into sentences
            s = parse(string.lower(res.description), chunks=True, relations=True, lemmata=True)
            ss = split(s)
            # Then, for each sentence
            for sent in ss:
                # Update the sentence count
                total_sentences += 1
                found = False
                i = 0
                value = 0.0
                # First check the reliability of the sentence:
                # 0.5 if a bad word is found, 1.0 otherwise.
                while not found and i < len(sent.words):
                    if sent.words[i].string in PROFANITY:
                        found = True
                    i += 1
                if found:
                    value = 0.5
                else:
                    # No bad words found -> give maximum reliability
                    value = 1.0

                # Clear the sentence of PNP elements by filtering out
                # the words that belong to a PNP.
                cleared_sentence_words = filter(lambda w: w.pnp is None, sent.words)
                cleared_string = ''
                # There seems to be no way to reconstruct a parsed sentence
                # other than reassembling a string and parsing it again.
                for word in cleared_sentence_words:
                    cleared_string += ' ' + word.string
                cleared_sentence = parse(cleared_string, chunks=True, relations=True, lemmata=True)
                cleared_sentence = split(cleared_sentence)

                sentence_type1 = False
                # cleared_sentence is now a sentence without PNPs.
                # Check whether it is a standard active sentence.
                for match in search('NP kill NP', cleared_sentence):
                    sentence_type1 = True
                    # Check whether the subject is the police
                    if match.constituents()[0].role == 'SBJ':
                        for word in match.constituents()[0].words:
                            if word.string in search_list:
                                police_killer_i += 1
                                police_killer_value += value
                                # Write the recognized match to the examples file
                                for sword in match.words:
                                    examples_file.write(str(sword.string.encode("utf-8")) + ' ')
                                examples_file.write('\r\n')
                                examples_file.write(' Recognized as: police killed somebody' + '\r\n')
                                examples_file.write(' TYPE: ACTIVE - SUBJECT' + '\r\n')
                                examples_file.write('\r\n')
                    if len(match.constituents()) > 2:
                        # Or check whether the police is the object
                        if match.constituents()[2].role == 'OBJ':
                            for word in match.constituents()[2].words:
                                if word.string in search_list:
                                    police_killed_i += 1
                                    police_killed_value += value
                                    # Write the recognized match to the examples file
                                    for sword in match.words:
                                        examples_file.write(str(sword.string.encode("utf-8")) + ' ')
                                    examples_file.write('\r\n')
                                    examples_file.write(' Recognized as: police killed by somebody' + '\r\n')
                                    examples_file.write(' TYPE: ACTIVE - OBJECT' + '\r\n')
                                    examples_file.write('\r\n')

                # If it was not an active sentence, check whether it is a passive one
                if not sentence_type1:
                    for match in search('NP kill (PP)+ (NP)+', cleared_sentence):
                        # match.constituents() returns a mixed list that can contain
                        # both Chunks and Words. We are interested in roles, hence in
                        # Chunks, so some non-safe isinstance() tricks are needed.
                        # Check the subject.
                        if isinstance(match.constituents()[0], Chunk):
                            if match.constituents()[0].role == 'SBJ':
                                for word in match.constituents()[0]:
                                    if word.string in search_list:
                                        police_killer_i += 1
                                        police_killer_value += value
                                        # Write the recognized match to the examples file
                                        for sword in match.words:
                                            examples_file.write(str(sword.string.encode("utf-8")) + ' ')
                                        examples_file.write('\r\n')
                                        examples_file.write(' Recognized as: police killed somebody' + '\r\n')
                                        examples_file.write(' TYPE: PASSIVE - SUBJECT - CHUNK' + '\r\n')
                                        examples_file.write('\r\n')
                        elif isinstance(match.constituents()[0], Word):
                            if match.constituents()[0].string in search_list:
                                police_killer_i += 1
                                police_killer_value += value
                                # Write the recognized match to the examples file
                                for sword in match.words:
                                    examples_file.write(str(sword.string.encode("utf-8")) + ' ')
                                examples_file.write('\r\n')
                                examples_file.write(' Recognized as: police killed somebody' + '\r\n')
                                examples_file.write(' TYPE: PASSIVE - SUBJECT - WORD' + '\r\n')
                                examples_file.write('\r\n')

                        # Check the object. First filter the Word objects out of the
                        # match results to see whether there are enough Chunks.
                        if len(filter(lambda c: isinstance(c, Chunk), match.constituents())) == 4:
                            if match.constituents()[3].role == 'OBJ':
                                for word in match.constituents()[3]:
                                    if word.string in search_list:
                                        police_killed_i += 1
                                        police_killed_value += value
                                        # Write the recognized match to the examples file
                                        for sword in match.words:
                                            examples_file.write(str(sword.string.encode("utf-8")) + ' ')
                                        examples_file.write('\r\n')
                                        examples_file.write(' Recognized as: police was killed by somebody' + '\r\n')
                                        examples_file.write(' TYPE: PASSIVE - OBJECT - CHUNK' + '\r\n')
                                        examples_file.write('\r\n')
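main() accumulates its counters but the visible snippet never reports them or closes examples_file; a hypothetical helper along these lines could be called at the end of main():

def report(total_sentences, police_killer_i, police_killer_value,
           police_killed_i, police_killed_value):
    # Print the aggregate counts and their profanity-weighted values.
    print('Total sentences analyzed: %d' % total_sentences)
    print('Police as killer: %d matches, weighted value %.1f'
          % (police_killer_i, police_killer_value))
    print('Police as killed: %d matches, weighted value %.1f'
          % (police_killed_i, police_killed_value))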