def process_input_text(file_text, id_name):
    """Extract the weapon slot from one input text and print its template.

    The text is split into (meta, main) with ``preprocess.split_text``;
    matches of ``NEWLINE`` in ``main`` are replaced by spaces before weapon
    extraction, while the incident type is predicted from the unmodified
    ``main``.  The perpetrator, target and victim slots are not extracted
    here and are reported as "-".

    NOTE(review): as the module is laid out, a later ``def`` with the same
    name rebinds ``process_input_text``, so this version is only reachable
    if that later definition is removed or renamed.

    Args:
        file_text: raw text handed to ``preprocess.split_text``.
        id_name: identifier used to index ``KEY`` and passed to ``print_out``.

    Returns:
        None.  Results are written through ``print`` and ``print_out``; the
        function returns early (after printing an error) when either part
        of the split is empty.
    """
    (meta, main) = preprocess.split_text(file_text)
    if not meta or not main:
        print("ERROR IN SPLITTING MAIN AND META")
        return

    file_text = re.sub(NEWLINE, " ", main)
    if DEBUG:
        # Formatted as one string so the output is the same readable line
        # under Python 2 and 3 (a parenthesised pair prints as a tuple in 2).
        print("processing text %s" % main)
        print("")

    d = answr_dict()
    if not KEY:
        # presumably make_key() populates the module-level KEY -- confirm.
        make_key()

    weapons = get_weapon(file_text, d)
    print(weapons)
    # The original indexed weapons[0][0] unconditionally; fall back to the
    # "-" placeholder used for the other empty slots when nothing is found.
    # Assumes get_weapon returns a sequence of indexable candidates whose
    # first item is the value to report -- TODO confirm ordering semantics.
    weapon = weapons[0][0] if weapons else "-"

    # Diagnostic output: the KEY entry for this id next to the chosen weapon.
    # The KEY lookup is left unguarded, as in the original.
    print(id_name)
    print("C %s \nD %s" % (KEY[id_name], weapon))
    print("")

    # Extraction of these slots (get_perp_indiv, get_perp_org, get_target,
    # get_victim) is currently disabled; they are emitted as "-".
    perpindiv = "-"
    perporg = "-"
    target = "-"
    victim = "-"

    incident_type = incident_predictor.get_predicted_event(main)
    print_out(id_name, incident_type, weapon, perpindiv, perporg, target, victim)
def process_input_text(file_text, id_name):
    """Fill and print the answer template for one input text.

    The text is split into (meta, main) with ``preprocess.split_text``.
    The incident type is predicted from ``main``; the weapon and
    perpetrator-organisation slots are taken from the first candidate
    returned by ``get_weapon`` / ``get_perp_org``, which run on ``main``
    with ``NEWLINE`` matches replaced by spaces.  The perpetrator-individual,
    target and victim slots are reported as ["-"].

    Args:
        file_text: raw text handed to ``preprocess.split_text``.
        id_name: identifier passed through to ``print_outf``.

    Returns:
        None.  Output is produced by ``print_outf``; the function returns
        early (after printing an error) when either part of the split is
        empty.
    """
    (meta, main) = preprocess.split_text(file_text)
    if not meta or not main:
        print("ERROR IN SPLITTING MAIN AND META")
        return

    # Remove the newlines from in between the lines.
    file_text = re.sub(NEWLINE, " ", main)

    if DEBUG:
        # Formatted as one string so the output is the same readable line
        # under Python 2 and 3 (a parenthesised pair prints as a tuple in 2).
        print("processing text %s" % main)
        print("")

    ### BEGIN EXPERIMENTAL ###
    # The predictor receives `main`; infoextract2.py passes the flattened
    # file text instead.
    incident_type = incident_predictor.get_predicted_event(main)

    # TODO: NER -- call a function that returns the NER dict.
    d = answr_dict()

    # The original indexed [0][0] unconditionally; fall back to the "-"
    # placeholder when an extractor returns nothing instead of raising.
    # Assumes each extractor returns a sequence of indexable candidates whose
    # first item is the value to report -- TODO confirm ordering semantics.
    weapon = get_weapon(file_text, d)
    weapon_l = [weapon[0][0] if weapon else "-"]

    perp_org = get_perp_org(file_text, d)
    perp_org_l = [perp_org[0][0] if perp_org else "-"]

    p_new_list = ["-"]
    t_new_list = ["-"]
    v_new_list = ["-"]
    ### END EXPERIMENTAL ###

    # NOTE: the earlier pattern-based pipeline is disabled.  It read the
    # 'victim_out_patterns_regex2', 'target_out_patterns_regex2' and
    # 'perp_out_patterns_regex2' pattern files via utility.f_read, split each
    # input line into sentences with utility.sent_splitter, collected
    # candidates with pattern_extractor.get_victims / get_targets / get_perpi,
    # and cleaned them with the utility filters (remove_subsets, remove_syn,
    # rmv_flagged_np, first_word_flag, first_word_rmv, one_word_cleaner,
    # victim_hacks).  Until it is restored, the three slots above stay "-".

    print_outf(id_name, incident_type, weapon_l, p_new_list, perp_org_l,
               t_new_list, v_new_list)