Пример #1
0
	def find(self, refs, tagged_sentences):
		"""Collect, per sentence, the people mentioned and the actions between them.

		Args:
			refs: iterable of (pronoun, fullname, sentence_index) triples.
			tagged_sentences: sequence of tagged sentences; each one is passed
				to regexp.CustomChunker().parse() and then re-tagged and
				name-marked through the utils helpers.

		Returns:
			A list of dicts, one per sentence that mentions more than one
			person/pronoun and at least one action:
				'who'  - names (each space-separated part capitalised), or None
				         for a pronoun in a sentence that has no reference
				'prp'  - parallel to 'who': the pronoun as written, or None when
				         the entry came from a literal name
				'what' - verb groups, neighbouring verb tokens joined by a space
		"""
		# Only the first reference recorded for a sentence is ever consulted,
		# so remember just its full name instead of a ragged list of lists.
		first_ref_name = {}
		for _prp, fullname, index in refs:
			first_ref_name.setdefault(index, fullname)

		names = utils.get_names_dict(utils.people)
		pronouns = ("he", "she", "his", "him", "her", "i", "me", "our")

		# TODO:
		# 		find PRP/name, VRB, PRP/name for - 1
		# 		memory for people: name/name/name/... (they did) - 2

		interact = []
		for index, sentence in enumerate(tagged_sentences):
			chunked_sentence = regexp.CustomChunker().parse(sentence)
			retagged_sentence = utils.retag_chunked(chunked_sentence)
			marked_sentence = utils.mark_sentence_names(retagged_sentence, names)

			# find pronouns, replace them with real names and record what was done
			who, what, prp, seq = [], [], [], []
			for word, tag, piece, _pt in marked_sentence:
				in_verb_group = False
				w = word.lower()
				if tag.startswith("PRP"):  # this is a reference
					if w in pronouns:
						# None when no reference is known for this sentence
						who.append(first_ref_name.get(index))
						prp.append(word)
				elif w in names:
					# the word is part of a person name, there's no PRP for it
					who.append(word)
					prp.append(None)
				elif piece == 'TARINYS':  # TODO: add some details to extracted actions
					# Exact comparison: `piece in ('TARINYS')` would be a
					# substring test against the string, not tuple membership.
					# NOTE(review): 'TARINYS' is presumably the chunk label for
					# a predicate/verb group - confirm against CustomChunker.
					in_verb_group = True
					seq.append(word)

				if not in_verb_group and seq:  # a verb run just ended - join it
					what.append(" ".join(seq))
					seq = []

			# A verb run that reaches the end of the sentence has no following
			# token to trigger the flush above, so emit it here.
			if seq:
				what.append(" ".join(seq))

			# only keep people & their interactions that include an action
			if len(who) > 1 and what:
				# capitalise the first letter of each name/surname part;
				# part[:1] tolerates empty parts produced by repeated spaces
				for i, person in enumerate(who):
					if person:
						who[i] = " ".join(
							[part[:1].upper() + part[1:] for part in person.split(" ")])
				interact.append({'who': who, 'prp': prp, 'what': what})

		return interact
Пример #2
0
	def find(self, people, sentences, tagged_sentences):
		"""Resolve personal pronouns to the most recently mentioned person.

		Args:
			people: passed to utils.get_names_dict() to build a lookup keyed by
				lowercase name; each value is read here as a dict with at least
				the keys 'sex' ("male", "female" or "?") and 'fullname'.
			sentences: not used by this method; kept for interface
				compatibility.
			tagged_sentences: sequence of sentences, each a sequence of
				(word, tag) pairs.

		Returns:
			A list of [pronoun, fullname, sentence_index] lists, one per
			resolved-or-not pronoun; fullname is "?" when nobody matched.

		Side effects:
			When a person of unknown sex is followed by a pronoun, the person's
			record is updated in place ('sex' is overwritten with the result of
			self.get_gender) and a diagnostic line is printed.
		"""
		names = utils.get_names_dict(people)

		# Lowercase every token; mark names with '+' and attach their record,
		# mark everything else with 'o'.
		marked_sentences = []
		for sentence in tagged_sentences:
			marked = []
			for word, tag in sentence:
				key = word.lower()
				if key in names:
					marked.append((key, tag, '+', names[key]))
				else:
					marked.append((key, tag, 'o', None))
			marked_sentences.append(marked)

		# pronouns that never point at a single person
		skipped = ("it", "our", "their", "us", "its", "we", "they")

		# references are stored as lists: [PRP, fullname, sentence_index]
		refs = []

		# Memory of the last male / female / unknown-sex person seen, each as
		# [person_record, word_index_in_text]. These lists are mutated in place
		# through `matched` below, so they must stay lists.
		last_he = [None, 0]
		last_she = [None, 0]
		last_unknown = [None, 0]
		word_index = 0
		for index, sentence in enumerate(marked_sentences):
			for word, tag, flag, data in sentence:
				word_index += 1  # running position over the whole text

				if tag.startswith("PRP") and len(word) <= 5 and word not in skipped:
					# --- unknown sex resolver: the first pronoun after an
					# unknown-sex name decides that person's sex
					if last_unknown[0]:
						gender = self.get_gender(word)
						# Single parenthesised argument: prints the same text
						# under Python 2 and Python 3.
						print("Last unknown person - %s - was assigned sex: %s"
							% (last_unknown[0]['fullname'], gender))
						last_unknown[0]['sex'] = gender

						# take over the he/she slot only if it is empty or older
						if gender == "male":
							if not last_he[0] or last_he[1] < last_unknown[1]:
								last_he = last_unknown
						elif gender == "female":
							if not last_she[0] or last_she[1] < last_unknown[1]:
								last_she = last_unknown

						if gender in ("male", "female"):  # clear only if actually determined
							last_unknown = [None, 0]

					# --- determine the person mentioned
					matched = None
					if word in ("he", "his", "him") and last_he[0]:
						matched = last_he
					elif word in ("she", "her") and last_she[0]:
						matched = last_she
					elif word in ("i", "me"):
						# First person carries no sex: use whoever was mentioned
						# last. ("our" is already filtered out above.)
						if last_he[0] and last_she[0]:
							if last_he[1] > last_she[1]:
								matched = last_he
							else:
								matched = last_she
						elif last_he[0]:
							matched = last_he
						elif last_she[0]:
							matched = last_she

					if matched:
						# refresh position - this person has just been mentioned
						matched[1] = word_index
						refs.append([word, matched[0]['fullname'], index])
					else:
						refs.append([word, "?", index])

				elif flag == "+":  # the word is a name, so put it into memory
					sex = data["sex"]
					if sex == "male":
						last_he = [data, word_index]
					elif sex == "female":
						last_she = [data, word_index]
					elif sex == "?":
						last_unknown = [data, word_index]

		return refs
Пример #3
0
    def find(self, people, sentences, tagged_sentences):
        """Resolve personal pronouns to the most recently mentioned person.

        Args:
            people: passed to utils.get_names_dict() to build a lookup keyed
                by lowercase name; each value is read here as a dict with at
                least the keys 'sex' ("male", "female" or "?") and 'fullname'.
            sentences: not used by this method; kept for interface
                compatibility.
            tagged_sentences: sequence of sentences, each a sequence of
                (word, tag) pairs.

        Returns:
            A list of [pronoun, fullname, sentence_index] lists, one per
            resolved-or-not pronoun; fullname is "?" when nobody matched.

        Side effects:
            When a person of unknown sex is followed by a pronoun, the
            person's record is updated in place ('sex' is overwritten with the
            result of self.get_gender) and a diagnostic line is printed.
        """
        names = utils.get_names_dict(people)

        # Lowercase every token; mark names with '+' and attach their record,
        # mark everything else with 'o'.
        marked_sentences = []
        for sentence in tagged_sentences:
            marked = []
            for word, tag in sentence:
                key = word.lower()
                if key in names:
                    marked.append((key, tag, '+', names[key]))
                else:
                    marked.append((key, tag, 'o', None))
            marked_sentences.append(marked)

        # pronouns that never point at a single person
        skipped = ("it", "our", "their", "us", "its", "we", "they")

        # references are stored as lists: [PRP, fullname, sentence_index]
        refs = []

        # Memory of the last male / female / unknown-sex person seen, each as
        # [person_record, word_index_in_text]. These lists are mutated in
        # place through `matched` below, so they must stay lists.
        last_he = [None, 0]
        last_she = [None, 0]
        last_unknown = [None, 0]
        word_index = 0
        for index, sentence in enumerate(marked_sentences):
            for word, tag, flag, data in sentence:
                word_index += 1  # running position over the whole text

                if (tag.startswith("PRP") and len(word) <= 5
                        and word not in skipped):
                    # --- unknown sex resolver: the first pronoun after an
                    # unknown-sex name decides that person's sex
                    if last_unknown[0]:
                        gender = self.get_gender(word)
                        # Single parenthesised argument: prints the same text
                        # under Python 2 and Python 3.
                        print("Last unknown person - %s - was assigned sex: %s"
                              % (last_unknown[0]['fullname'], gender))
                        last_unknown[0]['sex'] = gender

                        # take over the he/she slot only if empty or older
                        if gender == "male":
                            if not last_he[0] or last_he[1] < last_unknown[1]:
                                last_he = last_unknown
                        elif gender == "female":
                            if not last_she[0] or last_she[1] < last_unknown[1]:
                                last_she = last_unknown

                        # clear only if actually determined
                        if gender in ("male", "female"):
                            last_unknown = [None, 0]

                    # --- determine the person mentioned
                    matched = None
                    if word in ("he", "his", "him") and last_he[0]:
                        matched = last_he
                    elif word in ("she", "her") and last_she[0]:
                        matched = last_she
                    elif word in ("i", "me"):
                        # First person carries no sex: use whoever was
                        # mentioned last. ("our" is already filtered out
                        # above.)
                        if last_he[0] and last_she[0]:
                            if last_he[1] > last_she[1]:
                                matched = last_he
                            else:
                                matched = last_she
                        elif last_he[0]:
                            matched = last_he
                        elif last_she[0]:
                            matched = last_she

                    if matched:
                        # refresh position - this person was just mentioned
                        matched[1] = word_index
                        refs.append([word, matched[0]['fullname'], index])
                    else:
                        refs.append([word, "?", index])

                elif flag == "+":  # the word is a name, put it into memory
                    sex = data["sex"]
                    if sex == "male":
                        last_he = [data, word_index]
                    elif sex == "female":
                        last_she = [data, word_index]
                    elif sex == "?":
                        last_unknown = [data, word_index]

        return refs
Пример #4
0
    def find(self, refs, tagged_sentences):
        """Collect, per sentence, the people mentioned and the actions between them.

        Args:
            refs: iterable of (pronoun, fullname, sentence_index) triples.
            tagged_sentences: sequence of tagged sentences; each one is passed
                to regexp.CustomChunker().parse() and then re-tagged and
                name-marked through the utils helpers.

        Returns:
            A list of dicts, one per sentence that mentions more than one
            person/pronoun and at least one action:
                'who'  - names (each space-separated part capitalised), or
                         None for a pronoun in a sentence with no reference
                'prp'  - parallel to 'who': the pronoun as written, or None
                         when the entry came from a literal name
                'what' - verb groups, neighbouring verb tokens joined by a
                         space
        """
        # Only the first reference recorded for a sentence is ever consulted,
        # so remember just its full name instead of a ragged list of lists.
        first_ref_name = {}
        for _prp, fullname, index in refs:
            first_ref_name.setdefault(index, fullname)

        names = utils.get_names_dict(utils.people)
        pronouns = ("he", "she", "his", "him", "her", "i", "me", "our")

        # TODO:
        # 		find PRP/name, VRB, PRP/name for - 1
        # 		memory for people: name/name/name/... (they did) - 2

        interact = []
        for index, sentence in enumerate(tagged_sentences):
            chunked_sentence = regexp.CustomChunker().parse(sentence)
            retagged_sentence = utils.retag_chunked(chunked_sentence)
            marked_sentence = utils.mark_sentence_names(
                retagged_sentence, names)

            # find pronouns, replace them with real names and record actions
            who, what, prp, seq = [], [], [], []
            for word, tag, piece, _pt in marked_sentence:
                in_verb_group = False
                w = word.lower()
                if tag.startswith("PRP"):  # this is a reference
                    if w in pronouns:
                        # None when no reference is known for this sentence
                        who.append(first_ref_name.get(index))
                        prp.append(word)
                elif w in names:
                    # the word is part of a person name, there's no PRP for it
                    who.append(word)
                    prp.append(None)
                elif piece == 'TARINYS':  # TODO: add some details to extracted actions
                    # Exact comparison: `piece in ('TARINYS')` would be a
                    # substring test against the string, not tuple membership.
                    # NOTE(review): 'TARINYS' is presumably the chunk label
                    # for a predicate/verb group - confirm against
                    # CustomChunker.
                    in_verb_group = True
                    seq.append(word)

                if not in_verb_group and seq:  # a verb run just ended
                    what.append(" ".join(seq))
                    seq = []

            # A verb run that reaches the end of the sentence has no following
            # token to trigger the flush above, so emit it here.
            if seq:
                what.append(" ".join(seq))

            # only keep people & their interactions that include an action
            if len(who) > 1 and what:
                # capitalise the first letter of each name/surname part;
                # part[:1] tolerates empty parts produced by repeated spaces
                for i, person in enumerate(who):
                    if person:
                        who[i] = " ".join([
                            part[:1].upper() + part[1:]
                            for part in person.split(" ")
                        ])
                interact.append({'who': who, 'prp': prp, 'what': what})

        return interact