def get_entities(text):
    sem_dict = {}

    # collect PERSON noun phrases
    m_persons = re.findall(PERSON_NP, text)
    p_list = process_ne(m_persons)
    p_list_set = set(p_list)  # make unique
    if DEBUG:
        print "set of all the persons =", p_list_set
    temp_list = utility.remove_subsets(list(p_list_set))
    if DEBUG:
        print temp_list
    for np in temp_list:
        sem_dict[np] = "PERSON"

    # collect ORGANIZATION noun phrases
    m_organizations = re.findall(ORGANIZATION_NP, text)
    o_list = process_ne(m_organizations)
    o_list_set = set(o_list)  # make unique
    if DEBUG:
        print "set of all the organizations =", o_list_set
    temp_list = utility.remove_subsets(list(o_list_set))
    if DEBUG:
        print temp_list
    for np in temp_list:
        sem_dict[np] = "ORGANIZATION"

    return sem_dict
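# Illustrative sketch only (not called by the pipeline): this is the behavior
# assumed of utility.remove_subsets above, namely dropping any NP that occurs
# as a substring of a longer NP in the same list, so the longest mention wins.
def _remove_subsets_sketch(np_list):
    # e.g. _remove_subsets_sketch(["SMITH", "JOHN SMITH"]) -> ["JOHN SMITH"]
    return [np for np in np_list
            if not any(np != other and np in other for other in np_list)]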
def process_input_text(file_text, id_name):
    # split the document into its meta header and its main body
    (meta, main) = preprocess.split_text(file_text)
    if not meta or not main:
        print "ERROR IN SPLITTING MAIN AND META"
        return

    final_victim_set = set([])
    final_target_set = set([])
    final_perpi_set = set([])

    # remove the \n from in between the lines
    file_text = re.sub(NEWLINE, " ", main)
    file_text_list = file_text.split('\n')
    if DEBUG:
        print ("processing text", main)
        print ("")

    # TODO pass file_text instead of main, as in infoextract2.py
    incident_type = incident_predictor.get_predicted_event(main)

    # NER: tag the text, then build the {NP: semantic tag} dict from it
    ner_tagged_text = process_ner.java_ner_tagger(file_text)
    if ner_tagged_text:
        ner_tagged_text = ner_tagged_text.strip()  # strip() returns a new string
        # assumed to take the tagged text; the original called it with no args
        ner_dict = process_ner.get_entities(ner_tagged_text)
        if ner_dict:
            print ner_dict

    # load the extraction patterns
    text = utility.f_read('victim_out_patterns_regex2')
    victim_patt_lines = text.split('\n')
    text = utility.f_read('target_out_patterns_regex2')  # has only back patterns
    target_patt_lines = text.split('\n')
    text = utility.f_read('perp_out_patterns_regex2')    # has both front and back patterns
    perp_patt_lines = text.split('\n')

    # ALGO: read the input one line at a time; whenever a line matches one of
    # the patterns, parse it and collect the extracted noun phrases
    for line in file_text_list:
        line = line.strip()
        if not line:
            continue
        # split each line into its sentences
        sents = utility.sent_splitter(line)
        for sent in sents:
            sent = sent.strip()
            # remove 's and `` from the sentence, then collapse consecutive spaces
            sent = re.sub(SPATT, "", sent)
            input_line = re.sub(COLL_SPACES, SPACES_REPL, sent)

            temp_victim_list = pattern_extractor.get_victims(input_line, victim_patt_lines)
            if temp_victim_list:
                for victim in temp_victim_list:
                    victim = victim.strip()
                    if victim:
                        final_victim_set.add(victim)

            temp_target_list = pattern_extractor.get_targets(input_line, target_patt_lines)
            if temp_target_list:
                for target in temp_target_list:
                    target = target.strip()
                    if target:
                        final_target_set.add(target)

            temp_perpi_list = pattern_extractor.get_perpi(input_line, perp_patt_lines)
            if temp_perpi_list:
                for perp in temp_perpi_list:
                    perp = perp.strip()
                    if perp:
                        final_perpi_set.add(perp)
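    # Illustrative sketch (an assumption, not the real pattern_extractor code):
    # each line of victim_out_patterns_regex2 is assumed to be a regex whose
    # first capture group is the victim NP, in which case get_victims above
    # reduces to roughly:
    #
    #     def get_victims(sent, patt_lines):
    #         hits = []
    #         for patt in patt_lines:
    #             if patt:
    #                 hits.extend(m.group(1) for m in re.finditer(patt, sent))
    #         return hits
    #
    # get_targets and get_perpi would work the same way over their pattern files.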
    # now use algorithms to clean each list and remove the redundant entries
    # NER HINT: a victim cannot be an org or a location; it has to be a person

    # subset removal
    v_new_list = list(final_victim_set)
    v_new_list = utility.remove_subsets(v_new_list)
    print "after subset removal"
    print v_new_list
    v_new_list = utility.remove_syn(v_new_list)
    print "after duplicate removal for", id_name
    print v_new_list
    v_new_list = utility.rmv_flagged_np(v_new_list, 'victim')   # e.g. HEADQUARTERS
    print "after removing flag words for", id_name
    print v_new_list
    v_new_list = utility.first_word_flag(v_new_list, 'victim')  # e.g. SUSPECTS
    print "after removing first-word flags for", id_name
    print v_new_list
    v_new_list = utility.first_word_rmv(v_new_list)             # e.g. COLONEL, REPORTER
    print "after removing first title words like COLONEL for", id_name
    print v_new_list
    v_new_list = utility.one_word_cleaner(v_new_list)
    print "after one-word and digit removal for", id_name
    print v_new_list
    v_new_list = utility.victim_hacks(v_new_list)               # victim-specific hacks
    print "after adding some hacks, made unique for", id_name
    print v_new_list
    print "###########################"

    # NER HINT: a target cannot be a person or a location
    t_new_list = list(final_target_set)
    t_new_list = utility.remove_subsets(t_new_list)
    print "after subset removal"
    print t_new_list
    t_new_list = utility.remove_syn(t_new_list)
    print "after duplicate removal"
    print t_new_list
    t_new_list = utility.rmv_flagged_np(t_new_list, 'target')   # e.g. HEADQUARTERS
    print "after removing flag words for", id_name
    print t_new_list
    t_new_list = utility.first_word_flag(t_new_list, 'target')  # e.g. SUSPECTS
    print "after removing first-word flags for", id_name
    print t_new_list
    t_new_list = utility.one_word_cleaner(t_new_list)
    print "### Final after one-word removal for", id_name
    print t_new_list

    # NER HINT: a perpetrator cannot be a LOCATION or an org
    p_new_list = list(final_perpi_set)
    p_new_list = utility.remove_subsets(p_new_list)
    print "after subset removal"
    print p_new_list
    p_new_list = utility.remove_syn(p_new_list)
    print "after duplicate removal"
    print p_new_list
    p_new_list = utility.rmv_flagged_np(p_new_list, 'perp')     # e.g. HEADQUARTERS
    print "after removing flag words for", id_name
    print p_new_list
    p_new_list = utility.first_word_flag(p_new_list, 'perp')    # e.g. SUSPECTS
    print "after removing first-word flags for", id_name
    print p_new_list
    p_new_list = utility.one_word_cleaner(p_new_list)
    print "Final after one-word and digit removal for", id_name
    print p_new_list

    print_outf(id_name, incident_type, [], p_new_list, [], t_new_list, v_new_list)
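# Minimal driver sketch, not part of the original module: the command-line
# interface below is an assumption for illustration. It reads one document
# with utility.f_read and runs the full pipeline, using the file name as the
# document id passed to process_input_text.
if __name__ == "__main__":
    import sys
    if len(sys.argv) != 2:
        print "usage: %s <input_file>" % sys.argv[0]
        sys.exit(1)
    doc_text = utility.f_read(sys.argv[1])
    process_input_text(doc_text, sys.argv[1])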