else: # use the first word in the patt to split the parsed sentence patt = patt.strip() get_np(split_word,parsed_sent,BACK) # this file reads a file which contains the extracted pattern tuples , takes those patterns and used it extract NP from a parsed file # writes the regex patterns to a file , The actual matching can be done easily if __name__ =="__main__": # reads the patterns tuples filename from the command line filename = sys.argv[1] filename_out = filename +"_regex2" text = utility.f_read(filename) new_list = process_patterns(text) new_list.sort() # write each of these into a file f_w = open(filename_out,'w') for line in new_list: f_w.write(line) f_w.write("\n") f_w.close() text = utility.f_read('victim_out_patterns_regex2') lines = text.split('\n') """ # read sample parsed text file
# Sentinel so _trace can tell "no id supplied" apart from an id that is None.
_NO_ID = object()


def _trace(label, items, id_name=_NO_ID):
    """Print a progress label (optionally followed by the document id), then the list.

    The single-argument, parenthesised print form is used on purpose: it
    produces the same output as the Python 2 statements ``print label`` and
    ``print label, id_name`` while also being valid Python 3 syntax.
    """
    if id_name is _NO_ID:
        print(label)
    else:
        # "print a, b" separates the two items with one space.
        print("%s %s" % (label, id_name))
    print(items)


def _read_pattern_lines(filename):
    """Read a pattern file through utility.f_read and return its lines."""
    return utility.f_read(filename).split('\n')


def _add_stripped(found, into):
    """Add every non-blank, whitespace-stripped phrase in `found` to the set `into`."""
    if not found:
        return
    for phrase in found:
        phrase = phrase.strip()
        if phrase:
            into.add(phrase)


def _clean_victims(candidates, id_name):
    """Run the victim clean-up chain, printing the list after each step."""
    # a victim cannot be an org or location ?? has to be a person
    items = utility.remove_subsets(list(candidates))
    _trace("after subset removal", items)
    items = utility.remove_syn(items)
    _trace("after duplicate removal for ", items, id_name)
    items = utility.rmv_flagged_np(items, 'victim')  # e.g headquarters
    _trace("after removing flag words for ", items, id_name)
    items = utility.first_word_flag(items, 'victim')  # e.g suspects
    _trace("after one removing first word flags for ", items, id_name)
    items = utility.first_word_rmv(items)  # e.g COLONEL REPORTER
    _trace("after removing first title words like COLONEL etc ", items, id_name)
    items = utility.one_word_cleaner(items)
    _trace("after one word and digit removal for ", items, id_name)
    items = utility.victim_hacks(items)  # e.g hacks
    _trace("after adding some hacks make unique", items, id_name)
    print("###########################")
    return items


def _clean_role(candidates, role, id_name, final_label):
    """Run the clean-up chain shared by targets and perpetrators.

    `role` is passed through to the utility filters ('target' or 'perp');
    `final_label` is the message printed before the final list.
    """
    items = utility.remove_subsets(list(candidates))
    _trace("after subset removal", items)
    items = utility.remove_syn(items)
    _trace("after duplicate removal", items)
    items = utility.rmv_flagged_np(items, role)  # e.g headquarters
    _trace("after removing flag words for ", items, id_name)
    items = utility.first_word_flag(items, role)  # e.g suspects
    _trace("after one removing first word flags for ", items, id_name)
    items = utility.one_word_cleaner(items)
    _trace(final_label, items, id_name)
    return items


def process_input_text(file_text, id_name):
    """Extract victims, targets and perpetrators from one document and emit them.

    The text is split into a meta part and a main part with
    preprocess.split_text.  Every sentence of the main part is matched
    against the three pattern files, the collected noun phrases are cleaned
    with the utility filters, and the result is handed to print_outf.

    Parameters:
        file_text: raw text of the document.
        id_name:   identifier of the document; used in the progress messages
                   and passed to print_outf.

    Returns:
        None.  When the meta/main split yields an empty part, an error
        message is printed and nothing else is done.
    """
    (meta, main) = preprocess.split_text(file_text)
    if not meta or not main:
        print("ERROR IN SPLITTING MAIN AND META")
        return

    # remove the \n from in between the lines
    # NOTE(review): whether any '\n' survives for the split below depends on
    # what NEWLINE matches - confirm against its definition.
    body_text = re.sub(NEWLINE, " ", main)
    body_lines = body_text.split('\n')

    if DEBUG:
        # Printed as a tuple so the output matches the original Python 2
        # statement `print ("processing text",main)`.
        print(("processing text", main))
        print("")

    # pass file text instead of main in infoextract2.py
    incident_type = incident_predictor.get_predicted_event(main)

    # TODO NER CALL A FUNCTION THAT returns NER DICT
    ner_tagged_text = process_ner.java_ner_tagger(body_text)
    if ner_tagged_text:
        # str.strip() returns a new string; the original discarded it, so a
        # whitespace-only tagger output was still treated as non-empty.
        ner_tagged_text = ner_tagged_text.strip()
    ner_dict = None  # always bound, even when the tagger produced nothing
    if ner_tagged_text:
        ner_dict = process_ner.get_entities()
    if ner_dict:
        print(ner_dict)

    victim_patt_lines = _read_pattern_lines('victim_out_patterns_regex2')
    target_patt_lines = _read_pattern_lines('target_out_patterns_regex2')  # has only back patt
    perp_patt_lines = _read_pattern_lines('perp_out_patterns_regex2')  # has both front and back patterns

    final_victim_set = set()
    final_target_set = set()
    final_perpi_set = set()

    # Match every sentence of every non-empty line against the three pattern sets.
    for line in body_lines:
        line = line.strip()
        if not line:
            continue
        for sent in utility.sent_splitter(line):
            # TODO remove 's and `` from sentence remove `` as well ?
            sent = re.sub(SPATT, "", sent.strip())
            # make sure no consecutive white spaces in the line
            input_line = re.sub(COLL_SPACES, SPACES_REPL, sent)

            _add_stripped(pattern_extractor.get_victims(input_line, victim_patt_lines),
                          final_victim_set)
            _add_stripped(pattern_extractor.get_targets(input_line, target_patt_lines),
                          final_target_set)
            _add_stripped(pattern_extractor.get_perpi(input_line, perp_patt_lines),
                          final_perpi_set)

    # now use algorithms to clean these lists and to remove redundant stuff
    v_new_list = _clean_victims(final_victim_set, id_name)
    # a target cannot be a person or location
    t_new_list = _clean_role(final_target_set, 'target', id_name,
                             "###Final after one word removal for ")
    # NER HINT a perpetrator cannot be a LOCATION or an org ??
    p_new_list = _clean_role(final_perpi_set, 'perp', id_name,
                             " Final after one word and digit removal for ")

    print_outf(id_name, incident_type, [], p_new_list, [], t_new_list, v_new_list)
out_list = [] for line in lines: # handle empty lines if(not line): print "line",line print "skipped a line" continue m = re.match(PATT,line) if(m): line = line.lstrip('#') line = line.strip() out_list.append(line) # sort list so that we can identify duplicate patterns out_list.sort() return out_list if (__name__=="__main__"): filename = sys.argv[1] print filename text = utility.f_read(filename) out_lines = get_hash_lines(text) filename_n = filename+"_patterns" f_w = open(filename_n,'w') for line in out_lines: s = "%s"%(line,) f_w.write(s) f_w.write("\n") f_w.close()