예제 #1
0
def get_entities(text):
	"""Label the noun phrases found in *text* as persons or organizations.

	Matches of ``PERSON_NP`` and ``ORGANIZATION_NP`` are each run through
	``process_ne``, de-duplicated with a set and filtered by
	``utility.remove_subsets``; every surviving phrase becomes a key of the
	result. Organizations are recorded after persons, so a phrase that
	survives in both groups ends up labelled ``"ORGANIZATION"``.

	Diagnostics are printed to stdout along the way; some of them only when
	``DEBUG`` is truthy.

	Returns a dict mapping phrase -> ``"PERSON"`` or ``"ORGANIZATION"``.
	"""
	labels = {}

	# ---- persons ----
	unique_persons = set(process_ne(re.findall(PERSON_NP, text)))
	print("plist ")
	print(unique_persons)
	if DEBUG:
		print("%s %s" % ("set of all the persos =", unique_persons))
	persons = utility.remove_subsets(list(unique_persons))
	labels.update((phrase, "PERSON") for phrase in persons)
	if DEBUG:
		print(persons)

	# ---- organizations ----
	org_matches = re.findall(ORGANIZATION_NP, text)
	print(org_matches)
	unique_orgs = set(process_ne(org_matches))
	print("o_list")
	print(unique_orgs)
	orgs = utility.remove_subsets(list(unique_orgs))
	if DEBUG:
		print(orgs)
	# Written last on purpose: an organization label replaces a person label
	# for the same phrase.
	labels.update((phrase, "ORGANIZATION") for phrase in orgs)
	return labels
예제 #2
0
def _add_stripped(found, dest):
	"""Strip every string in *found* and add the non-empty ones to the set *dest*."""
	for item in found or ():
		item = item.strip()
		if item:
			dest.add(item)


def _read_pattern_lines(name):
	"""Return the text of the pattern file *name* split on newlines."""
	return utility.f_read(name).split('\n')


def _trace(items, *message):
	"""Print *message* parts space-separated, then *items*; return *items* unchanged."""
	print(" ".join("%s" % (part,) for part in message))
	print(items)
	return items


def process_input_text(file_text,id_name):
	"""Extract victims, targets and perpetrators from one document and report them.

	The text is split into meta and main parts with ``preprocess.split_text``;
	if either part is empty an error is printed and the function returns.
	The main part is then scanned sentence by sentence against the victim,
	target and perpetrator pattern files, the collected phrases are cleaned
	through a sequence of ``utility`` filters (each stage is printed), and the
	final lists are handed to ``print_outf`` together with the predicted
	incident type.

	Parameters:
		file_text: raw text of the document.
		id_name: identifier of the document; used in the progress messages
			and passed to ``print_outf``.

	Returns:
		None. Results are emitted through ``print_outf`` and stdout.
	"""
	(meta, main) = preprocess.split_text(file_text)
	if not meta or not main:
		print("ERROR IN SPLITTING MAIN AND META")
		return

	# remove the \n from in between the lines
	flat_text = re.sub(NEWLINE, " ", main)
	text_lines = flat_text.split('\n')
	if DEBUG:
		# NOTE(review): under Python 2 the next line prints the repr of a
		# tuple, not two space-separated values; kept as in the original.
		print ("processing text",main)
		print ("")

	# pass file text instead of main in infoextract2.py
	incident_type = incident_predictor.get_predicted_event(main)

	# TODO NER CALL A FUNCTION THAT returns NER DICT
	# Start from None so the check below never reads an unbound name when the
	# tagger yields nothing.
	ner_dict = None
	ner_tagged_text = process_ner.java_ner_tagger(flat_text)
	if ner_tagged_text:
		# strip() returns a new string; keep it so whitespace-only output is
		# treated as empty.
		ner_tagged_text = ner_tagged_text.strip()
		if ner_tagged_text:
			# NOTE(review): called without arguments, as in the original --
			# confirm whether the tagged text should be passed in.
			ner_dict = process_ner.get_entities()
	if ner_dict:
		print(ner_dict)

	# open the files containing the extraction patterns
	victim_patt_lines = _read_pattern_lines('victim_out_patterns_regex2')
	target_patt_lines = _read_pattern_lines('target_out_patterns_regex2')  # has only back patt
	perp_patt_lines = _read_pattern_lines('perp_out_patterns_regex2')  # has both front and back patterns

	final_victim_set = set()
	final_target_set = set()
	final_perpi_set = set()

	# ALGO: read one line at a time, split it into sentences and collect
	# whatever each pattern set extracts from every sentence.
	for line in text_lines:
		line = line.strip()
		if not line:
			continue
		for sent in utility.sent_splitter(line):
			# TODO remove 's and `` from sentence remove `` as well ?
			sent = re.sub(SPATT, "", sent.strip())
			# make sure there are no consecutive white spaces in the line
			input_line = re.sub(COLL_SPACES, SPACES_REPL, sent)
			_add_stripped(pattern_extractor.get_victims(input_line, victim_patt_lines), final_victim_set)
			_add_stripped(pattern_extractor.get_targets(input_line, target_patt_lines), final_target_set)
			_add_stripped(pattern_extractor.get_perpi(input_line, perp_patt_lines), final_perpi_set)

	# now clean the collected phrases and remove redundant stuff

	# a victim cannot be an org or location ?? has to be  a person
	v_new_list = _trace(utility.remove_subsets(list(final_victim_set)),
		"after subset removal")
	v_new_list = _trace(utility.remove_syn(v_new_list),
		"after duplicate removal for ", id_name)
	v_new_list = _trace(utility.rmv_flagged_np(v_new_list, 'victim'),  # e.g headquarters
		"after removing flag words   for ", id_name)
	v_new_list = _trace(utility.first_word_flag(v_new_list, 'victim'),  # e.g suspects
		"after one removing first word flags  for ", id_name)
	v_new_list = _trace(utility.first_word_rmv(v_new_list),  # e.g COLONEL REPORTER
		"after removing first title words like COLONEL etc ", id_name)
	v_new_list = _trace(utility.one_word_cleaner(v_new_list),
		"after one word and digit removal for ", id_name)
	v_new_list = _trace(utility.victim_hacks(v_new_list),  # e.g hacks
		"after adding some hacks make unique", id_name)
	print("###########################")

	# a target cannot be a a person or location
	t_new_list = _trace(utility.remove_subsets(list(final_target_set)),
		"after subset removal")
	t_new_list = _trace(utility.remove_syn(t_new_list),
		"after duplicate removal")
	t_new_list = _trace(utility.rmv_flagged_np(t_new_list, 'target'),  # e.g headquarters
		"after removing flag words   for ", id_name)
	t_new_list = _trace(utility.first_word_flag(t_new_list, 'target'),  # e.g suspects
		"after one removing first word flags  for ", id_name)
	t_new_list = _trace(utility.one_word_cleaner(t_new_list),
		"###Final after one word removal for ", id_name)

	# NER HINT a perpetrator cannot be a LOCATION or an org ??
	p_new_list = _trace(utility.remove_subsets(list(final_perpi_set)),
		"after subset removal")
	p_new_list = _trace(utility.remove_syn(p_new_list),
		"after duplicate removal")
	p_new_list = _trace(utility.rmv_flagged_np(p_new_list, 'perp'),  # e.g headquarters
		"after removing flag words   for ", id_name)
	p_new_list = _trace(utility.first_word_flag(p_new_list, 'perp'),  # e.g suspects
		"after one removing first word flags  for ", id_name)
	p_new_list = _trace(utility.one_word_cleaner(p_new_list),
		" Final after one word and digit removal for ", id_name)

	print_outf(id_name,incident_type,[],p_new_list,[],t_new_list,v_new_list)