def get_targets(sent, patt_lines):
    """Extract candidate 'target' noun phrases from ``sent``.

    Every non-empty entry of ``patt_lines`` is whitespace-collapsed and used
    as a regular expression against ``sent``.  For each pattern that matches,
    the sentence is parsed with ``parse_file``, NP-chunked through
    ``process_parse`` and, for every chunked line that contains the first
    word of the pattern, ``get_np`` is called with direction ``BACK`` and
    role ``'target'``.

    Args:
        sent: sentence text to search.
        patt_lines: iterable of pattern strings; falsy entries are skipped.

    Returns:
        The result of ``and_detector`` applied to the candidate list from the
        most recent ``get_np`` call (an empty list if ``get_np`` was never
        called).
    """
    # NOTE(review): each successful get_np call replaces the previous result
    # instead of extending it, so the last matching pattern/line wins.  This
    # is kept as-is; confirm whether accumulation was intended.
    pot_target_list = []

    for patt in patt_lines:
        if not patt:
            continue

        # collapse multiple white spaces
        patt = re.sub(COLL_SPACES, SPACES_REPL, patt)
        m = re.findall(patt, sent)
        if not m:
            continue

        # Single-argument print(...) emits the same text under both the
        # Python 2 print statement and the Python 3 print function.
        if DEBUG:
            print("pattern matched  %s for patt  %s and sent %s"
                  % (m, patt, sent))

        # The key word depends only on the pattern, so work it out once here
        # (before the parse) instead of once per parsed line.
        key_patt = patt.strip()
        if not key_patt:
            print("patt was empty line move to next")
            continue
        # use the first word in the patt to split the parsed sentence
        split_word = key_patt.split()[0]

        # NOTE(review): sent is re-parsed for every matching pattern; caching
        # would only be safe if parse_file is deterministic - confirm.
        parsed_sent = parse_file(sent)
        if not parsed_sent:
            # parsed_sent is falsy here (possibly None), so report the input
            # sentence rather than concatenating the failed parse result.
            print("could not parse line: %s" % sent)
            continue

        # NOW NP CHUNK THE SENTENCE
        pos_dict, parse_dict, _ = process_parse.pprocess_pline(parsed_sent)

        # the parse might cover multiple lines; both dicts are indexed 0..n-1
        for i in range(len(pos_dict)):
            np_sent = process_parse.extract_np(pos_dict[i], parse_dict[i])
            np_chunk_sent = process_parse.assemble_extracts(np_sent)

            if not re.search(split_word, np_chunk_sent):
                print("split word= %s not in sent" % split_word)
                continue

            pot_target_list = get_np(split_word, np_chunk_sent, BACK, 'target')

    # search for AND in the NP; if it exists the NP is divided into two parts
    return and_detector(pot_target_list)
def get_perpi(sent, patt_lines):
    """Extract candidate 'perpi' noun phrases from ``sent``.

    Every non-empty entry of ``patt_lines`` is whitespace-collapsed and used
    as a regular expression against ``sent``.  For each pattern that matches,
    the sentence is parsed with ``parse_file`` and NP-chunked through
    ``process_parse``.  ``is_front`` then selects how the pattern is used:

    * front pattern: the first word is the key word and ``get_np`` is called
      with direction ``FRONT``;
    * back pattern: the second word is the key word and ``get_np`` is called
      with direction ``BACK``, unless a front pattern with the same key word
      has already produced candidates.

    Args:
        sent: sentence text to search.
        patt_lines: iterable of pattern strings; falsy entries are skipped.

    Returns:
        The result of ``and_detector`` applied to the candidate list from the
        most recent ``get_np`` call (an empty list if ``get_np`` was never
        called).
    """
    # NOTE(review): each get_np call replaces the previous result instead of
    # extending it, so the last matching pattern/line wins.  This is kept
    # as-is; confirm whether accumulation was intended.
    pot_perpi_list = []
    # Key words of front patterns that yielded candidates.  A back pattern
    # with the same key word is skipped so it cannot replace that result.
    matched_patt_word = []

    for patt in patt_lines:
        if not patt:
            continue

        # collapse multiple white spaces
        patt = re.sub(COLL_SPACES, SPACES_REPL, patt)
        # check if this pattern exists in the sentence
        m = re.findall(patt, sent)
        if not m:
            continue

        # Single-argument print(...) emits the same text under both the
        # Python 2 print statement and the Python 3 print function.
        if DEBUG:
            print("pattern matched  %s for patt  %s and sent %s"
                  % (m, patt, sent))

        # Direction and key word depend only on the pattern, so decide them
        # once per pattern (before the parse) instead of once per parsed line.
        key_patt = patt.strip()
        if not key_patt:
            print("patt was empty line move to next")
            continue
        words = key_patt.split()

        front = is_front(patt)
        if front:
            # front patterns have just one word (as of now): split by it
            split_word = words[0]
        else:
            # Back patterns are expected to have three words; the second one
            # is the main / split word.  Guard against shorter patterns
            # instead of failing with IndexError.
            if len(words) < 2:
                print("back patt has no second word move to next")
                continue
            split_word = words[1]
            if split_word in matched_patt_word:
                print("###not matching back pattern since back pattern "
                      "with same key word was matched ,back key word = %s"
                      % split_word)
                continue

        # NOTE(review): sent is re-parsed for every matching pattern; caching
        # would only be safe if parse_file is deterministic - confirm.
        parsed_sent = parse_file(sent)
        if not parsed_sent:
            # parsed_sent is falsy here (possibly None), so report the input
            # sentence rather than concatenating the failed parse result.
            print("could not parse line: %s" % sent)
            continue

        # NOW NP CHUNK THE SENTENCE
        pos_dict, parse_dict, _ = process_parse.pprocess_pline(parsed_sent)
        direction = FRONT if front else BACK

        # the parse might cover multiple lines; both dicts are indexed 0..n-1
        for i in range(len(pos_dict)):
            np_sent = process_parse.extract_np(pos_dict[i], parse_dict[i])
            np_chunk_sent = process_parse.assemble_extracts(np_sent)

            if not re.search(split_word, np_chunk_sent):
                print("split word= %s not in sent" % split_word)
                continue

            pot_perpi_list = get_np(split_word, np_chunk_sent, direction, 'perpi')
            if front and pot_perpi_list:
                # makes sure a front patt is not matched again by a back patt
                matched_patt_word.append(split_word)

    # search for AND in the NP; if it exists the NP is divided into two parts
    return and_detector(pot_perpi_list)