def check_phrase_for_occupations_nobreaks(s):
    import nlp
    from occ import set2code

    found = []

    words = nlp.word_tokenize(s)
    words = [nlp.lemmatize(x) for x in words]

    # collect every contiguous 1- to 4-word combination as an unordered set
    sets = set()
    for n in range(1, 5):
        sets.update(nlp.getCloseUnorderedSets(words, minTuple=n, maxTuple=n, maxBuffer=0))

    for fs in sets:
        if fs in set2code:
            c = set2code[fs]["code"]
            found.append({"word": " ".join(fs), "occ": [c], "fs": fs})

    # keep only maximal matches: drop any match whose word set is strictly
    # contained in another match's word set
    def is_subset_anyone(x):
        for y in found:
            if x['fs'] != y['fs'] and x['fs'].issubset(y['fs']):
                return True
        return False

    found = [x for x in found if not is_subset_anyone(x)]
    return found
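
# A minimal, dependency-free sketch of the maximal-match filtering above: the
# frozensets stand in for matched term sets, and only matches not strictly
# contained in another match survive (the terms here are hypothetical).
def _demo_subset_suppression():
    found = [
        {"word": "teacher", "fs": frozenset({"teacher"})},
        {"word": "school teacher", "fs": frozenset({"school", "teacher"})},
    ]
    keep = [x for x in found
            if not any(x["fs"] != y["fs"] and x["fs"].issubset(y["fs"])
                       for y in found)]
    return keep  # only the two-word match survives; "teacher" is dropped
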
def choose_function(self, r_content):
    start_function_point = 0
    shutdown_function_point = 0

    # score the request against each function's keyword list
    for _key in self.start_function_keywords:
        if _key.strip() in r_content:
            start_function_point += 1
    for _key in self.shutdown_function_keywords:
        if _key.strip() in r_content:
            shutdown_function_point += 1

    max_point = max(start_function_point, shutdown_function_point)

    # debug
    if self.properties['debugmode']:
        print('[start_function: ' + str(start_function_point) + ']')
        print('[shutdown_function: ' + str(shutdown_function_point) + ']')

    # on a tie, ask the user, then fold the request's tokens into the chosen
    # function's keyword list so the next decision is sharper
    if [start_function_point, shutdown_function_point].count(max_point) > 1:
        a = input("Not very clear. Which function should I choose?\n"
                  "1. start_function\n"
                  "2. shutdown_function\n"
                  "Your choice: ")
        if a == '2':
            self.shutdown_function_keywords += nlp.word_tokenize(r_content)
            self.shutdown_function_keywords = nlp.remove_duplicate_in_list(self.shutdown_function_keywords)
            self.save_keywords_data_xml()
            return 'shutdown_function'
        else:
            self.start_function_keywords += nlp.word_tokenize(r_content)
            self.start_function_keywords = nlp.remove_duplicate_in_list(self.start_function_keywords)
            self.save_keywords_data_xml()
            return 'start_function'

    if max_point == start_function_point:
        return 'start_function'
    return 'shutdown_function'
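
# A standalone sketch of the keyword-vote idea in choose_function (the keyword
# lists here are illustrative; the real ones live on self and grow over time):
def _demo_function_scores(r_content,
                          start_kw=("start", "open", "launch"),
                          shutdown_kw=("shutdown", "stop", "close")):
    start = sum(k in r_content for k in start_kw)
    stop = sum(k in r_content for k in shutdown_kw)
    # a tie would trigger the interactive prompt above; ties are broken here
    # in favor of start_function, as in choose_function
    return "shutdown_function" if stop > start else "start_function"
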
def check(s):
    import nlp
    from occ import term2code

    found = []

    words = nlp.word_tokenize(s.lower())
    # scan two copies of the token list, separated by a "-" sentinel
    words += ["-"] + nlp.word_tokenize(s.lower())

    # proceed from the largest tuples down to single words; a term matched at
    # size n blocks its direct (n-1)-word sub-tuples on the next pass, so
    # codes are not counted inside longer codes
    max_tuples = 4
    current_tuples = max_tuples
    process_now = set(nlp.getTuples(words, minTuple=max_tuples, maxTuple=max_tuples))
    while current_tuples > 0:
        dont_process_next = set()
        for tup in process_now:
            tocheck = " ".join(tup)
            if tocheck in term2code:
                c = term2code[tocheck]["code"]
                found.append({"word": tocheck, "occ": [c]})
                dont_process_next.update(
                    nlp.getTuples(list(tup), minTuple=current_tuples - 1, maxTuple=current_tuples - 1))
        process_now = set(
            nlp.getTuples(words, minTuple=current_tuples - 1, maxTuple=current_tuples - 1))
        process_now = process_now.difference(dont_process_next)
        current_tuples -= 1

    return found
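
# A pure-Python sketch of the longest-match-first scan in check(): a term
# matched at size n blocks its two direct (n-1)-gram sub-tuples on the next
# pass (the term table here is illustrative; real lookups use occ.term2code).
def _demo_longest_first(words, table, max_n=4):
    found = []
    process = {tuple(words[i:i + max_n]) for i in range(len(words) - max_n + 1)}
    for n in range(max_n, 0, -1):
        skip = set()
        for g in process:
            if " ".join(g) in table:
                found.append(" ".join(g))
                skip.update((g[:-1], g[1:]))
        process = {tuple(words[i:i + n - 1])
                   for i in range(len(words) - n + 2)} - skip
    return found

# e.g. _demo_longest_first("high school teacher".split(),
#                          {"high school teacher", "school teacher"})
# returns ['high school teacher'] only; like check(), the suppression is one
# level deep per pass.
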
def run(self):
    from nlp import sent_tokenize, word_tokenize, lemmatize

    fb = self['fullBody']
    possibilities = sent_tokenize(fb)
    # return the first sentence whose lemmas contain "die"
    for p in possibilities:
        words = word_tokenize(p)
        words = [lemmatize(x) for x in words]
        if 'die' in words:
            return p
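
# A standalone sketch of the scan in run(), with a toy lemmatizer standing in
# for nlp.lemmatize (the assumption, implied by the check above, is that the
# project lemmatizer maps "died"/"dies"/"dying" to "die"):
def _demo_find_death_sentence(sentences):
    toy_lemmas = {"died": "die", "dies": "die", "dying": "die"}
    for s in sentences:
        words = [toy_lemmas.get(w.lower(), w.lower()) for w in s.split()]
        if "die" in words:
            return s

# _demo_find_death_sentence(["He was born in 1902.", "He died on Tuesday."])
# -> "He died on Tuesday."
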
def run(self):
    import nlp

    my_props = set()

    toSearch = self.ofWhat['firstSentence']
    # don't want a lexicon word (or a name) to be spotted in
    # "died on the 3rd with his family"
    toSearch = toSearch.split("died")[0]
    toSearch = toSearch.split("dead")[0]
    toSearch = toSearch.split("killed")[0]
    toSearch = toSearch.split("drowned")[0]
    # their own name might get confusing for this analysis...
    toSearch = toSearch.replace(self.ofWhat["name"], "")

    # intelligent tokenization
    toSearchWords = nlp.word_tokenize(toSearch)

    kinMatch = 0
    kinMatchStronger = 0
    lexicon = nlp.inquirer_lexicon["KIN"]
    for x in toSearchWords:
        if x.upper() in lexicon:
            kinMatch += 1
    # "son of", "mother of", ... count as stronger evidence
    for x in nlp.getTuples(toSearchWords, 2, 2):
        if x[0].upper() in lexicon and x[1].upper() == "OF":
            kinMatchStronger += 1

    if kinMatch > 0:
        my_props.add("lex_match")
    if kinMatchStronger > 0:
        my_props.add("lex_match_strong")

    # I also need a full name that matches in the last name...
    for names in nlp.getTuples(toSearchWords, 2, 3):
        # every word must be capitalized...
        if any(x[0].lower() == x[0] for x in names):
            continue
        # the last word must be the same last name!
        if names[-1].lower() != self.ofWhat["last_name"]:
            continue
        my_props.add("name_match")
        break

    return list(my_props)
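
# A standalone sketch of the two kinship checks in run() (the KIN entries here
# are illustrative; the real list is nlp.inquirer_lexicon["KIN"]):
def _demo_kin_match(words, kin=frozenset({"SON", "DAUGHTER", "FATHER", "MOTHER", "WIFE"})):
    weak = sum(w.upper() in kin for w in words)
    # a kin word directly followed by "of" is the stronger pattern
    strong = sum(a.upper() in kin and b.upper() == "OF"
                 for a, b in zip(words, words[1:]))
    return weak, strong

# _demo_kin_match("the son of a blacksmith".split())  # -> (1, 1)
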
def loadAssociations():
    global codes
    global term2code
    global set2code

    CSV_fn = path.join(path.dirname(__file__), "..", "w2c_source", "compiledCodes.csv")

    print("Loading term-code associations into variable 'codes' from %s..." % CSV_fn)
    print("Loading term dictionary into variable 'term2code' from %s..." % CSV_fn)

    with open(CSV_fn, 'r') as codesF:
        CSV_r = DictReader(codesF)
        codes = list(CSV_r)

    for code in codes:
        # exact-string lookup
        term2code[code["term"]] = code
        # order-insensitive lookup, keyed by the frozenset of lemmas
        words = nlp.word_tokenize(code["term"])
        words = [nlp.lemmatize(x) for x in words]
        set2code[frozenset(words)] = code
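
# Illustrative shape of compiledCodes.csv, inferred from the DictReader fields
# used above (at minimum a "term" and a "code" column; the values here are
# hypothetical placeholders):
#
#     term,code
#     school teacher,<code>
#     teacher,<code>
#
# After loading, term2code["school teacher"] returns the full CSV row, and,
# assuming the lemmatizer leaves these words unchanged,
# set2code[frozenset({"school", "teacher"})] returns the same row regardless
# of word order.
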
def run_old(self):
    import nlp

    ret = None

    # most consistently, the name is the first noun chunk
    def isName(x):
        if len(x.split()) < 2:
            return False
        if not nlp.isTitleCase(x):
            return False
        return True

    # start with NER from spacy:
    guesses = self.ofWhat['spacyFirstSentence'].ents
    guesses = [x for x in guesses if x.label_ == 'PERSON' and isName(x.text)]
    if len(guesses) > 0:
        # just use the first one; we'll probably need expansion, since NER
        # many times doesn't get parens, or Dr., Rev., etc.
        ret = guesses[0].text

    # then just try some noun chunking; the name is almost always the first
    # title-cased noun chunk
    if ret is None:
        nc = list(self.ofWhat['spacyFirstSentence'].noun_chunks)
        nc = list(filter(isName, map(str, nc)))
        if len(nc) > 0:
            ret = nc[0]

    # earlier experiments, kept for reference: checking the guessed name's
    # words against the title, and recovering the name by splitting the title
    # on "is dead", ",", "dies", "is slain", "of", "dead"
    return ret
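
# The isName gate above, shown on plain strings (str.istitle stands in for
# nlp.isTitleCase, which may be more permissive about particles like "de"):
def _demo_is_name(x):
    return len(x.split()) >= 2 and x.istitle()

# _demo_is_name("John Smith")  # -> True
# _demo_is_name("Smith")       # -> False (single token)
# _demo_is_name("the victim")  # -> False (not title case)
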
def run(self):
    import g
    from occ import set2code
    import nlp
    import wiki
    import re
    import coding

    """
    if len(self.ofWhat['spacyFirstSentence']) == 0:
        if self.debug:
            g.p("Skipping. No content after trim.")
        coding.stateCounter.update(["zeroLengthSkip"])
        return

    if self.debug:
        g.p.depth = 0
        g.p()
        g.p(self.ofWhat['spacyFirstSentence'])
        g.p.depth += 1
    """

    # WikiData lookup: occupational titles attached to the deceased's name
    dead_guys_occs = set()
    if len(self.ofWhat["name"]) > 0:
        words = wiki.lookupOccupationalTitles(self.ofWhat["name"])
        for x in words:
            # set2code is keyed by frozensets of lemmas (see loadAssociations)
            key = frozenset(nlp.lemmatize(w) for w in nlp.word_tokenize(x))
            if key in set2code:
                dead_guys_occs.add(set2code[key]["code"])
        if len(dead_guys_occs) > 0 and self.debug:
            g.p("WikiData returns %s which gives OCC %s" % (words, dead_guys_occs))

    if self.debug:
        g.p("Extracted name: %s" % self.ofWhat["name"])

    # extract information from the title
    dieWords = ['dies', 'die', 'dead']
    t = self.ofWhat['title']
    ts = [x.strip() for x in re.split(r'[;,]|--', t)]
    ts = ts[1:]  # the name is always the first one
    for tp in ts:
        tpW = [x.lower() for x in nlp.word_tokenize(tp)]
        # skip parts that announce the death itself
        if any(dW in tpW for dW in dieWords):
            continue
        # if it's a number (usually the age), skip it too
        try:
            int(tp)
            continue
        except ValueError:
            pass
        if self.debug:
            g.p("Extracted from title:", tp)

    didSomething = False
    guesses = []

    # Alec McGail, scientist and genius, died today.
    nameChildren = list(self.ofWhat["spacyName"].root.children)
    apposHooks = [c for c in nameChildren if c.dep_ == 'appos']
    if len(apposHooks) > 0:
        didSomething = True
        # painter, scientist, and architect
        baseNouns = nlp.followRecursive(apposHooks, 'conj')
        # one of the first **novelists**
        for i, x in enumerate(baseNouns):
            if nlp.isPrepPhrase(x) and str(x) == 'one':
                baseNouns[i] = nlp.enterPrepPhrase(x)[0]
        # now that the important "what they were" nouns are identified,
        # look up what OCC they are
        for n in baseNouns:
            key = frozenset([nlp.lemmatize(n.text.lower())])
            if key in set2code:
                guesses.append({"word": n.text, "occ": [set2code[key]["code"]]})
        return guesses

    # Alec McGail, who ..., died today.
    relcls = [c for c in nameChildren if c.dep_ == 'relcl']
    if len(relcls) > 0:
        g.p.depth += 1
        for relcl in relcls:
            # need to follow advcl and conj
            goDeep = nlp.followRecursive(relcl, ['advcl', 'conj'])
            be = ['was', 'became']
            for v in goDeep:
                # "as _"
                followPreps = nlp.followRecursive(v, ['relcl', 'prep', 'pobj'])
                asWhat = [x for x in followPreps
                          if next(x.ancestors).text == 'as' and x.pos_ == 'pobj']
                if self.debug and len(asWhat):
                    g.p('whoAs', asWhat)
                if len(asWhat):
                    didSomething = True
                # who was a scientist and inventor
                if v.pos_ == 'VERB' and v.text in be:
                    for vc in v.children:
                        if vc.dep_ != 'attr':
                            continue
                        if self.debug:
                            g.p('Expanded be verb', vc, vc.dep_)
                        # guesses.append(result)
                        didSomething = True

    finalGuess = []
    for guess in guesses:
        if len(guess['occ']) != 1:
            continue
        finalGuess.append(guess['occ'][0])
    if self.debug:
        g.p("finalGuess", finalGuess)

    if False:
        # more stupid guesses... literally expand every noun
        moreGuesses = []
        for w in self.ofWhat['spacyFirstSentence']:
            if w.pos_ != 'NOUN':
                continue
            moreGuesses.append(coding.nounOCC(w))
        stupidFinalGuess = []
        for guess in moreGuesses:
            stupidFinalGuess += guess['occ']
        if self.debug:
            g.p("stupidFinalGuess", stupidFinalGuess)
        if set(stupidFinalGuess) != set(finalGuess):
            g.p("And they're different!", extrad=1)

    if not didSomething:
        if len(dead_guys_occs) > 0:
            coding.stateCounter.update(["justWikidata"])
        else:
            if self.debug:
                g.p("Skipping. Strange grammatical construction.")
            coding.stateCounter.update(["strangeGrammar"])
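
# The dependency pattern the appos branch walks, shown on a plain spaCy doc.
# A hedged sketch: it needs a model such as en_core_web_sm, and the parse
# shown is the typical one for the example sentence from the comments above:
#
#     import spacy
#     nlp_model = spacy.load("en_core_web_sm")
#     doc = nlp_model("Alec McGail, scientist and genius, died today.")
#     name = next(doc.noun_chunks).root                         # McGail
#     appos = [t for t in name.children if t.dep_ == "appos"]   # [scientist]
#     conjs = [t for a in appos for t in a.conjuncts]           # [genius]
#
# followRecursive(apposHooks, 'conj') above presumably collects the appos
# heads plus their conj descendants, i.e. every "what they were" noun hanging
# off the name.
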
def choose_bot(self, r_content):
    chatbot_point = 0
    infobot_point = 0
    cmdbot_point = 0
    learningbot_point = 0

    # score the request against each bot's keyword list
    r_content_list = nlp.word_tokenize(r_content)
    for _key in self.chatbot_keywords:
        if _key.strip() in r_content_list:
            chatbot_point += 1
    for _key in self.infobot_keywords:
        if _key.strip() in r_content_list:
            infobot_point += 1
    for _key in self.cmdbot_keywords:
        if _key.strip() in r_content_list:
            cmdbot_point += 1
    for _key in self.learningbot_keywords:
        if _key.strip() in r_content_list:
            learningbot_point += 1

    max_point = max(chatbot_point, infobot_point, cmdbot_point, learningbot_point)

    # debug
    if self.properties['debugmode']:
        print('[chatBot: ' + str(chatbot_point) + ']')
        print('[infoBot: ' + str(infobot_point) + ']')
        print('[cmdBot: ' + str(cmdbot_point) + ']')
        print('[learningBot: ' + str(learningbot_point) + ']')
        print('[max_point: ' + str(max_point) + ']')

    # ask the user when two or more bots have the same score
    if [chatbot_point, infobot_point, cmdbot_point, learningbot_point].count(max_point) > 1:
        a = input("Not very clear. Which bot should I choose?\n"
                  "1. ChatBot\n"
                  "2. InfoBot\n"
                  "3. CmdBot\n"
                  "4. LearningBot\n"
                  "Your choice: ")
        if a == '2':
            self.infobot_keywords += nlp.word_tokenize(r_content)
            self.infobot_keywords = nlp.remove_duplicate_in_list(self.infobot_keywords)
            self.save_keywords_data_xml()
            return 'infobot'
        elif a == '3':
            self.cmdbot_keywords += nlp.word_tokenize(r_content)
            self.cmdbot_keywords = nlp.remove_duplicate_in_list(self.cmdbot_keywords)
            self.save_keywords_data_xml()
            return 'cmdbot'
        elif a == '4':
            self.learningbot_keywords += nlp.word_tokenize(r_content)
            self.learningbot_keywords = nlp.remove_duplicate_in_list(self.learningbot_keywords)
            self.save_keywords_data_xml()
            return 'learningbot'
        else:
            self.chatbot_keywords += nlp.word_tokenize(r_content)
            self.chatbot_keywords = nlp.remove_duplicate_in_list(self.chatbot_keywords)
            self.save_keywords_data_xml()
            return 'chatbot'

    if max_point == chatbot_point:
        return 'chatbot'
    elif max_point == infobot_point:
        return 'infobot'
    elif max_point == cmdbot_point:
        return 'cmdbot'
    else:
        return 'learningbot'
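
# A sketch of the self-training step on ambiguous input: the user's answer
# folds the request's tokens into that bot's keyword list. The dedupe below is
# a plain-Python stand-in for nlp.remove_duplicate_in_list, assuming it keeps
# first occurrences:
def _demo_learn_keywords(keywords, r_content):
    return list(dict.fromkeys(keywords + r_content.split()))

# _demo_learn_keywords(["weather", "news"], "news about the weather today")
# -> ['weather', 'news', 'about', 'the', 'today']
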
def _tokenizer(text):
    return nlp.word_tokenize(text, remove_punct=False, remove_num=True)
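
# Usage sketch (remove_punct/remove_num are flags of the project-specific
# word_tokenize signature above; the exact token split is illustrative):
#
#     _tokenizer("He wrote 3 novels!")   # -> ['He', 'wrote', 'novels', '!']
#
# i.e. numeral tokens are dropped while punctuation tokens are kept.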