def process_input_text(file_text, id_name): global KEY (meta, main) = preprocess.split_text(file_text) if not meta: print "ERROR IN SPLITTING MAIN AND META" return if not main: print "ERROR IN SPLITTING MAIN AND META" return file_text = re.sub(NEWLINE, " ", main) if DEBUG: print ("processing text", main) print ("") d = answr_dict() if not KEY: make_key() grammar = r""" NP: {<RB|PP\$>?<JJ>*<NN>+<POS>?} NP: {<RB|PP\$>?<JJ>*<NN>+<NNS>*} {<NNP>+} {<RB|PP\$>?<JJ>*<NNS>*<POS>?} """ # sents = map(pos_tag, map(word_tokenize, [s for s in sent_tokenize(file_text.lower())])) # cp = RegexpParser(grammar) # for s in sents: # print cp.parse(s) weapons = get_weapon(file_text, d) print weapons weapon = weapons[0][0] print id_name print "C", KEY[id_name], "\n", "D", weapon print # perpindiv = get_perp_indiv(file_text, d) perpindiv = "-" # perporg = get_perp_org(file_text, d) perporg = "-" # targets = get_target(file_text, d) # target = targets[0][0] target = "-" # victims = get_victim(file_text, d) # victim = victims[0][0] victim = "-" incident_type = incident_predictor.get_predicted_event(main) print_out(id_name, incident_type, weapon, perpindiv, perporg, target, victim)
def process_input_text(file_text,id_name): # remove the \n from in between the lines (meta,main) = preprocess.split_text(file_text) if (not meta): print "ERROR IN SPLITTING MAIN AND META" return if(not main): print "ERROR IN SPLITTING MAIN AND META" return #print proc_meta(meta) temp_victim_list = [] final_victim_set =set([]) temp_target_list = [] final_target_set = set([]) temp_perpi_list = [] final_perpi_set = set([]) file_text = re.sub(NEWLINE," ",main) file_text_list = file_text.split('\n') if(DEBUG): print ("processing text",main) print ("") # pass file text instead of main in infoextract2.py incident_type = incident_predictor.get_predicted_event(main) # TODO NER CALL A FUNCTION THAT returns NER DICT ner_tagged_text = process_ner.java_ner_tagger(file_text) if (ner_tagged_text): ner_tagged_text.strip() if(ner_tagged_text): ner_dict = process_ner.get_entities() if(ner_dict): print ner_dict # open file containing victim patterns text = utility.f_read('victim_out_patterns_regex2') victim_patt_lines = text.split('\n') text = utility.f_read('target_out_patterns_regex2') # has only back patt target_patt_lines = text.split('\n') text = utility.f_read('perp_out_patterns_regex2') # has both front and back patterns perp_patt_lines = text.split('\n') # ALGO read one line at a time .. if it matches one of the patterns then parse that line and do ur thing # READ EACH LINE IN THE from input file for line in file_text_list: line = line.strip() if(not line): continue # split each line into several sentences sents = utility.sent_splitter(line) for sent in sents: #print "processing line",line # make sure no consecutive white spaces in ur line sent = sent.strip() # TODO remove 's and `` from sentence remove `` as well ? sent = re.sub(SPATT,"",sent) input_line = re.sub(COLL_SPACES,SPACES_REPL,sent) temp_victim_list = pattern_extractor.get_victims(input_line,victim_patt_lines) if temp_victim_list: for victim in temp_victim_list: victim = victim.strip() if victim: final_victim_set.add(victim) # TARGET LIST temp_target_list = pattern_extractor.get_targets(input_line,target_patt_lines) if temp_target_list: for target in temp_target_list: target = target.strip() if target: final_target_set.add(target) # PERPI LIST temp_perpi_list = pattern_extractor.get_perpi(input_line,perp_patt_lines) if temp_perpi_list: for perp in temp_perpi_list: perp = perp.strip() if perp: final_perpi_set.add(perp) # now use algorithms to clean this list and to remove redundant stuff # get target_list # a victim cannot be an org or location ?? has to be a person #subset removal v_new_list = list(final_victim_set) v_new_list = utility.remove_subsets(v_new_list) print "after subset removal" print v_new_list v_new_list = utility.remove_syn(v_new_list) print "after duplicate removal for ",id_name print v_new_list v_new_list = utility.rmv_flagged_np(v_new_list,'victim')# e.g headquarters print "after removing flag words for ",id_name print v_new_list v_new_list = utility.first_word_flag(v_new_list,'victim')# e.g suspects print "after one removing first word flags for ",id_name print v_new_list v_new_list = utility.first_word_rmv(v_new_list)# e.g COLONEL REPORTER print "after removing first title words like COLONEL etc ",id_name print v_new_list v_new_list = utility.one_word_cleaner(v_new_list) print "after one word and digit removal for ",id_name print v_new_list v_new_list = utility.victim_hacks(v_new_list)# e.g hacks print "after adding some hacks make unique",id_name print v_new_list print "###########################" # a target cannot be a a person or location t_new_list = list(final_target_set) t_new_list = utility.remove_subsets(t_new_list) print "after subset removal" print t_new_list t_new_list = utility.remove_syn(t_new_list) print "after duplicate removal" print t_new_list t_new_list = utility.rmv_flagged_np(t_new_list,'target')# e.g headquarters print "after removing flag words for ",id_name print t_new_list t_new_list = utility.first_word_flag(t_new_list,'target')# e.g suspects print "after one removing first word flags for ",id_name print t_new_list t_new_list = utility.one_word_cleaner(t_new_list) print "###Final after one word removal for ",id_name print t_new_list #print "###########################" # NER HINT a perpetrator cannot be a LOCATION or an org ?? p_new_list = list(final_perpi_set) p_new_list = utility.remove_subsets(p_new_list) print "after subset removal" print p_new_list p_new_list = utility.remove_syn(p_new_list) print "after duplicate removal" print p_new_list p_new_list = utility.rmv_flagged_np(p_new_list,'perp')# e.g headquarters print "after removing flag words for ",id_name print p_new_list p_new_list = utility.first_word_flag(p_new_list,'perp')# e.g suspects print "after one removing first word flags for ",id_name print p_new_list p_new_list = utility.one_word_cleaner(p_new_list) print " Final after one word and digit removal for ",id_name print p_new_list #print "###########################" #dict_out = matching.match(parsed_text) #print ("") print_outf(id_name,incident_type,[],p_new_list,[],t_new_list,v_new_list)