Пример #1
0
def process_input_text(file_text, id_name):
    """Extract the weapon slot from one raw document and print its template.

    The text is split into a meta part and a main part with
    ``preprocess.split_text``; every ``NEWLINE`` match in the main part is
    replaced by a space before the weapon extractor runs, while the incident
    predictor receives the unmodified main part.  Results are handed to
    ``print_out``.

    Args:
        file_text: Raw document text, passed to ``preprocess.split_text``.
        id_name: Document identifier; used as a lookup key into the
            module-level ``KEY`` and forwarded to ``print_out``.

    Returns:
        None.  If the split yields an empty meta or main part, an error line
        is printed and the function returns without producing a template.

    Note:
        Written for Python 2 (``print`` statements).
    """
    (meta, main) = preprocess.split_text(file_text)
    # Both parts are required; a missing one is reported with the same message.
    if not meta or not main:
        print "ERROR IN SPLITTING MAIN AND META"
        return

    file_text = re.sub(NEWLINE, " ", main)
    if DEBUG:
        # Print the items directly: wrapping them in parentheses makes the
        # Python 2 print statement emit a tuple repr instead of the text.
        print "processing text", main
        print ""

    d = answr_dict()

    # KEY is only read here, so no ``global`` declaration is needed.
    # Presumably make_key() populates the module-level KEY -- confirm.
    if not KEY:
        make_key()

    weapons = get_weapon(file_text, d)
    print weapons
    # NOTE(review): get_weapon's return shape is not visible from here; it is
    # indexed as a sequence of sequences.  Fall back to the "-" placeholder
    # used for the other slots instead of raising IndexError on an empty
    # result.
    weapon = weapons[0][0] if weapons else "-"
    print id_name
    print "C", KEY[id_name], "\n", "D", weapon
    print

    # Only the weapon extractor is active.  The remaining slots are emitted as
    # the "-" placeholder (get_perp_indiv, get_perp_org, get_target and
    # get_victim are currently disabled).
    perpindiv = "-"
    perporg = "-"
    target = "-"
    victim = "-"

    incident_type = incident_predictor.get_predicted_event(main)
    print_out(id_name, incident_type, weapon, perpindiv, perporg, target, victim)
Пример #2
0
def process_input_text(file_text, id_name):
    """Extract weapon and perpetrator-organisation slots and print a template.

    The text is split into a meta part and a main part with
    ``preprocess.split_text``.  The incident type is predicted from the main
    part as-is; the weapon and perpetrator-organisation extractors run on the
    main part after every ``NEWLINE`` match has been replaced by a space.
    Results are handed to ``print_outf`` as lists.

    Args:
        file_text: Raw document text, passed to ``preprocess.split_text``.
        id_name: Document identifier, forwarded to ``print_outf``.

    Returns:
        None.  If the split yields an empty meta or main part, an error line
        is printed and the function returns without producing a template.

    Note:
        Written for Python 2 (``print`` statements).  The pattern-based
        victim / target / perpetrator-individual extraction
        (``pattern_extractor`` plus the ``utility`` clean-up passes) is
        disabled; those three slots are always emitted as ``["-"]``.
    """
    (meta, main) = preprocess.split_text(file_text)
    # Both parts are required; a missing one is reported with the same message.
    if not meta or not main:
        print "ERROR IN SPLITTING MAIN AND META"
        return

    # NEWLINE is defined elsewhere; presumably it matches the line breaks
    # inside the body text -- confirm.
    file_text = re.sub(NEWLINE, " ", main)
    if DEBUG:
        # Print the items directly: wrapping them in parentheses makes the
        # Python 2 print statement emit a tuple repr instead of the text.
        print "processing text", main
        print ""

    # NOTE (from the original author): pass file text instead of main in
    # infoextract2.py.
    incident_type = incident_predictor.get_predicted_event(main)
    # TODO: call a function that returns an NER dict.

    d = answr_dict()

    # NOTE(review): the extractors' return shape is not visible from here;
    # results are indexed as a sequence of sequences.  Fall back to the "-"
    # placeholder instead of raising IndexError on an empty result.
    weapon = get_weapon(file_text, d)
    weapon_l = [weapon[0][0]] if weapon else ["-"]
    perp_org = get_perp_org(file_text, d)
    perp_org_l = [perp_org[0][0]] if perp_org else ["-"]

    # Placeholders for the slots whose extractors are disabled.
    p_new_list = ["-"]
    t_new_list = ["-"]
    v_new_list = ["-"]

    print_outf(id_name, incident_type, weapon_l, p_new_list, perp_org_l, t_new_list, v_new_list)