def find(self, refs, tagged_sentences):
    """Extract per-sentence interactions between people.

    Each sentence is chunked, re-tagged and name-marked through the project
    helpers, then scanned for pronouns, person names and action words.

    Parameters:
        refs: iterable of ``[pronoun, fullname, sentence_index]`` triples.
        tagged_sentences: list of tagged sentences handed to
            ``regexp.CustomChunker().parse``.

    Returns:
        A list of dicts ``{'who': [...], 'prp': [...], 'what': [...]}``, one
        per sentence that mentions more than one person/pronoun and contains
        at least one action. ``who`` and ``prp`` are parallel lists: ``who``
        holds capitalized names (or None for an unmapped pronoun), ``prp``
        holds the pronoun as written (or None for a plain name).
    """
    # Only the first reference recorded for a sentence is ever consulted, so
    # keep just its full name instead of a mixed flat/nested list.
    # NOTE(review): every pronoun of a sentence therefore resolves to that
    # first reference - confirm whether per-pronoun matching was intended.
    first_ref_name = {}
    for _prp, fullname, index in refs:
        first_ref_name.setdefault(index, fullname)

    names = utils.get_names_dict(utils.people)
    mapped_pronouns = ("he", "she", "his", "him", "her", "i", "me", "our")

    # TODO:
    # find PRP/name, VRB, PRP/name for - 1
    # memory for people: name/name/name/... (they did) - 2
    interact = []
    for index, sentence in enumerate(tagged_sentences):
        chunked_sentence = regexp.CustomChunker().parse(sentence)
        retagged_sentence = utils.retag_chunked(chunked_sentence)
        marked_sentence = utils.mark_sentence_names(retagged_sentence, names)

        # find pronouns, replace them with real names and collect the actions
        who, what, prp, seq = [], [], [], []
        for word, tag, piece, _pt in marked_sentence:
            in_action = False
            w = word.lower()
            if tag.startswith("PRP"):  # this is a reference
                if w in mapped_pronouns:
                    # None when no person is mapped for this sentence
                    who.append(first_ref_name.get(index))
                    prp.append(word)
            elif w in names:
                # the word belongs to a person name, there is no PRP for it
                who.append(word)
                prp.append(None)
            elif piece == 'TARINYS':
                # presumably the chunker's label for action words - confirm.
                # Exact comparison: `piece in ('TARINYS')` was a substring
                # test against a plain string, not tuple membership.
                # TODO: add some details to extracted actions
                in_action = True
                seq.append(word)

            if not in_action and seq:
                # a non-action word ends the current run of neighbouring verbs
                what.append(" ".join(seq))
                seq = []

        if seq:
            # flush an action run that reaches the end of the sentence
            what.append(" ".join(seq))

        # only keep people & their interactions that include an action
        if len(who) > 1 and what:
            # capitalize the first letter of each name/surname part; the
            # slice keeps empty parts (double spaces) from raising IndexError
            who = [
                " ".join(part[:1].upper() + part[1:] for part in person.split(" "))
                if person else person
                for person in who
            ]
            interact.append({'who': who, 'prp': prp, 'what': what})
    return interact
def find(self, people, sentences, tagged_sentences):
    """Resolve personal pronouns to the most recently mentioned people.

    Words are lowercased and matched against the names dictionary built by
    ``utils.get_names_dict(people)``. While scanning, the method remembers
    the last male, last female and last unknown-sex person together with the
    running word index at which each was mentioned.

    Parameters:
        people: passed to ``utils.get_names_dict``; the resulting values are
            read via the ``'sex'`` and ``'fullname'`` keys.
        sentences: not used by this method (only referenced by disabled debug
            output); kept so the signature stays unchanged.
        tagged_sentences: list of sentences, each a list of ``(word, tag)``.

    Returns:
        A list of ``[pronoun, fullname, sentence_index]`` lists, where
        ``pronoun`` is lowercased and ``fullname`` is ``"?"`` when no person
        could be matched.

    Side effects:
        Prints a line whenever a pending unknown-sex person is checked against
        a pronoun, and writes the resolved gender into that person's
        ``'sex'`` entry.
    """
    # all known names, keyed by their lowercase form
    names = utils.get_names_dict(people)

    # mark every word: '+' with the person data for names, 'o' otherwise
    marked_sentences = []
    for sentence in tagged_sentences:
        marked = []
        for word, tag in sentence:
            key = word.lower()
            if key in names:
                marked.append((key, tag, '+', names[key]))
            else:
                marked.append((key, tag, 'o', None))
        marked_sentences.append(marked)

    # pronouns that never point at a single person
    skipped_pronouns = ("it", "our", "their", "us", "its", "we", "they")

    # references are stored as lists: [PRP, fullname, sentence_index]
    refs = []

    # memory slots, stored as: [people_data, word_index_in_text]
    last_he = [None, 0]
    last_she = [None, 0]
    last_unknown = [None, 0]  # name of unknown sex, typed on a later PRP

    word_index = 0
    for index, sentence in enumerate(marked_sentences):
        for word, tag, flag, data in sentence:
            word_index += 1  # count every processed word

            if (tag.startswith("PRP") and len(word) <= 5
                    and word not in skipped_pronouns):
                # --- unknown sex resolver ---
                if last_unknown[0]:
                    gender = self.get_gender(word)
                    # single-argument form prints the same text on Python 2
                    # and Python 3
                    print("Last unknown person - %s - was assigned sex: %s"
                          % (last_unknown[0]['fullname'], gender))
                    if gender in ("male", "female"):
                        # Record the sex only once it is actually determined,
                        # so an undetermined result never overwrites the
                        # person's entry.
                        last_unknown[0]['sex'] = gender
                        if gender == "male":
                            # take the slot if it is empty or holds an older
                            # mention
                            if not last_he[0] or last_he[1] < last_unknown[1]:
                                last_he = last_unknown
                        else:
                            if not last_she[0] or last_she[1] < last_unknown[1]:
                                last_she = last_unknown
                        last_unknown = [None, 0]  # resolved - clear the slot

                # --- determine the person mentioned ---
                matched = None
                if word in ("he", "his", "him") and last_he[0]:  # male
                    matched = last_he
                elif word in ("she", "her") and last_she[0]:  # female
                    matched = last_she
                elif word in ("i", "me", "our"):
                    # NOTE(review): "our" never reaches this point because it
                    # is filtered out above - confirm which was intended.
                    # Sex cannot be told from the pronoun: pick whoever was
                    # mentioned last.
                    if last_he[0] and last_she[0]:
                        if last_he[1] > last_she[1]:
                            matched = last_he
                        else:
                            matched = last_she
                    elif last_he[0]:
                        matched = last_he
                    elif last_she[0]:
                        matched = last_she

                if matched:
                    # refresh index - this person has just been mentioned
                    matched[1] = word_index
                    refs.append([word, matched[0]['fullname'], index])
                else:
                    refs.append([word, "?", index])

            elif flag == "+":
                # the word is a name, so put the person into memory
                sex = data["sex"]
                if sex == "male":
                    last_he = [data, word_index]
                elif sex == "female":
                    last_she = [data, word_index]
                elif sex == "?":
                    last_unknown = [data, word_index]
    return refs
def find(self, people, sentences, tagged_sentences):
    """Resolve personal pronouns to the most recently mentioned people.

    Words are lowercased and matched against the names dictionary built by
    ``utils.get_names_dict(people)``. While scanning, the method remembers
    the last male, last female and last unknown-sex person together with the
    running word index at which each was mentioned.

    Parameters:
        people: passed to ``utils.get_names_dict``; the resulting values are
            read via the ``'sex'`` and ``'fullname'`` keys.
        sentences: not used by this method (only referenced by disabled debug
            output); kept so the signature stays unchanged.
        tagged_sentences: list of sentences, each a list of ``(word, tag)``.

    Returns:
        A list of ``[pronoun, fullname, sentence_index]`` lists, where
        ``pronoun`` is lowercased and ``fullname`` is ``"?"`` when no person
        could be matched.

    Side effects:
        Prints a line whenever a pending unknown-sex person is checked against
        a pronoun, and writes the resolved gender into that person's
        ``'sex'`` entry.
    """
    # all known names, keyed by their lowercase form
    names = utils.get_names_dict(people)

    # mark every word: '+' with the person data for names, 'o' otherwise
    marked_sentences = []
    for sentence in tagged_sentences:
        marked = []
        for word, tag in sentence:
            key = word.lower()
            if key in names:
                marked.append((key, tag, '+', names[key]))
            else:
                marked.append((key, tag, 'o', None))
        marked_sentences.append(marked)

    # pronouns that never point at a single person
    skipped_pronouns = ("it", "our", "their", "us", "its", "we", "they")

    # references are stored as lists: [PRP, fullname, sentence_index]
    refs = []

    # memory slots, stored as: [people_data, word_index_in_text]
    last_he = [None, 0]
    last_she = [None, 0]
    last_unknown = [None, 0]  # name of unknown sex, typed on a later PRP

    word_index = 0
    for index, sentence in enumerate(marked_sentences):
        for word, tag, flag, data in sentence:
            word_index += 1  # count every processed word

            if (tag.startswith("PRP") and len(word) <= 5
                    and word not in skipped_pronouns):
                # --- unknown sex resolver ---
                if last_unknown[0]:
                    gender = self.get_gender(word)
                    # single-argument form prints the same text on Python 2
                    # and Python 3
                    print("Last unknown person - %s - was assigned sex: %s"
                          % (last_unknown[0]['fullname'], gender))
                    if gender in ("male", "female"):
                        # Record the sex only once it is actually determined,
                        # so an undetermined result never overwrites the
                        # person's entry.
                        last_unknown[0]['sex'] = gender
                        if gender == "male":
                            # take the slot if it is empty or holds an older
                            # mention
                            if not last_he[0] or last_he[1] < last_unknown[1]:
                                last_he = last_unknown
                        else:
                            if not last_she[0] or last_she[1] < last_unknown[1]:
                                last_she = last_unknown
                        last_unknown = [None, 0]  # resolved - clear the slot

                # --- determine the person mentioned ---
                matched = None
                if word in ("he", "his", "him") and last_he[0]:  # male
                    matched = last_he
                elif word in ("she", "her") and last_she[0]:  # female
                    matched = last_she
                elif word in ("i", "me", "our"):
                    # NOTE(review): "our" never reaches this point because it
                    # is filtered out above - confirm which was intended.
                    # Sex cannot be told from the pronoun: pick whoever was
                    # mentioned last.
                    if last_he[0] and last_she[0]:
                        if last_he[1] > last_she[1]:
                            matched = last_he
                        else:
                            matched = last_she
                    elif last_he[0]:
                        matched = last_he
                    elif last_she[0]:
                        matched = last_she

                if matched:
                    # refresh index - this person has just been mentioned
                    matched[1] = word_index
                    refs.append([word, matched[0]['fullname'], index])
                else:
                    refs.append([word, "?", index])

            elif flag == "+":
                # the word is a name, so put the person into memory
                sex = data["sex"]
                if sex == "male":
                    last_he = [data, word_index]
                elif sex == "female":
                    last_she = [data, word_index]
                elif sex == "?":
                    last_unknown = [data, word_index]
    return refs
def find(self, refs, tagged_sentences):
    """Extract per-sentence interactions between people.

    Each sentence is chunked, re-tagged and name-marked through the project
    helpers, then scanned for pronouns, person names and action words.

    Parameters:
        refs: iterable of ``[pronoun, fullname, sentence_index]`` triples.
        tagged_sentences: list of tagged sentences handed to
            ``regexp.CustomChunker().parse``.

    Returns:
        A list of dicts ``{'who': [...], 'prp': [...], 'what': [...]}``, one
        per sentence that mentions more than one person/pronoun and contains
        at least one action. ``who`` and ``prp`` are parallel lists: ``who``
        holds capitalized names (or None for an unmapped pronoun), ``prp``
        holds the pronoun as written (or None for a plain name).
    """
    # Only the first reference recorded for a sentence is ever consulted, so
    # keep just its full name instead of a mixed flat/nested list.
    # NOTE(review): every pronoun of a sentence therefore resolves to that
    # first reference - confirm whether per-pronoun matching was intended.
    first_ref_name = {}
    for _prp, fullname, index in refs:
        first_ref_name.setdefault(index, fullname)

    names = utils.get_names_dict(utils.people)
    mapped_pronouns = ("he", "she", "his", "him", "her", "i", "me", "our")

    # TODO:
    # find PRP/name, VRB, PRP/name for - 1
    # memory for people: name/name/name/... (they did) - 2
    interact = []
    for index, sentence in enumerate(tagged_sentences):
        chunked_sentence = regexp.CustomChunker().parse(sentence)
        retagged_sentence = utils.retag_chunked(chunked_sentence)
        marked_sentence = utils.mark_sentence_names(retagged_sentence, names)

        # find pronouns, replace them with real names and collect the actions
        who, what, prp, seq = [], [], [], []
        for word, tag, piece, _pt in marked_sentence:
            in_action = False
            w = word.lower()
            if tag.startswith("PRP"):  # this is a reference
                if w in mapped_pronouns:
                    # None when no person is mapped for this sentence
                    who.append(first_ref_name.get(index))
                    prp.append(word)
            elif w in names:
                # the word belongs to a person name, there is no PRP for it
                who.append(word)
                prp.append(None)
            elif piece == 'TARINYS':
                # presumably the chunker's label for action words - confirm.
                # Exact comparison: `piece in ('TARINYS')` was a substring
                # test against a plain string, not tuple membership.
                # TODO: add some details to extracted actions
                in_action = True
                seq.append(word)

            if not in_action and seq:
                # a non-action word ends the current run of neighbouring verbs
                what.append(" ".join(seq))
                seq = []

        if seq:
            # flush an action run that reaches the end of the sentence
            what.append(" ".join(seq))

        # only keep people & their interactions that include an action
        if len(who) > 1 and what:
            # capitalize the first letter of each name/surname part; the
            # slice keeps empty parts (double spaces) from raising IndexError
            who = [
                " ".join(part[:1].upper() + part[1:] for part in person.split(" "))
                if person else person
                for person in who
            ]
            interact.append({'who': who, 'prp': prp, 'what': what})
    return interact