def train_perceptron(all_sentences, feature_dict, tbank, history):
    weight_matrix = init_weights(len(feature_dict))
    pre_pros = []  # per sentence: (parsed_tree, target feature dicts, histories)
    t1 = time()
    current_sen = 1
    for sentence in all_sentences:
        print "train sentence: " + str(current_sen)
        current_sen += 1
        try:
            parsed_tree = tbank.parse(sentence.raw_sentence)
            # The surrounding for loop walks over all sentences, so the
            # weights are updated for every sentence.
            # ==== comment 0
            # This shows how to walk the tree in the right way, depending on
            # the global boolean golinear and the iterator function iterloop
            # (which comes from depTree). There is probably a cleaner way to
            # fit this into the other code than copy-pasting, but it is an option.
            histories = []
            target_feature_vectors = []
            if golinear:
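                # linear branch: the history of each word is simply the
                # previous `history` words in sentence order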
                context_words = [w.orth_ for w in iterloop(parsed_tree)]
                context_pos_tags = [w.tag_ for w in iterloop(parsed_tree)]
                context_tags = [
                    sentence.words_tags[dt.sen_idx(sentence.raw_sentence,
                                                   wrd)][1]
                    for wrd in iterloop(parsed_tree)
                ]
                for i, wrd in enumerate(context_words):
                    if i < history:
                        history_tags = tuple(['-TAGSTART-'] + context_tags[0:i])
                        history_words = ['-START-'] + context_words[0:i]
                        history_pos_tags = ['-POSTAGSTART-'] + context_pos_tags[0:i]
                    else:
                        history_tags = context_tags[i - history:i]
                        history_words = context_words[i - history:i]
                        history_pos_tags = context_pos_tags[i - history:i]
                    history_vectors = ('ph', [history_tags])  # 'ph' = placeholder
                    cur_idx = i
                    prev_idx = cur_idx - 1
                    distance = 0
                    if prev_idx >= 0:
                        distance = parsed_tree[cur_idx].similarity(
                            parsed_tree[prev_idx])
                    target_feature_vectors.append(
                        dp.construct_feature_vector(wrd, context_tags[i],
                                                    feature_dict,
                                                    history_words, history,
                                                    history_vectors,
                                                    history_pos_tags,
                                                    distance))
                    histories.append(
                        (prev_idx, history_words, history_pos_tags, distance))
            else:
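                # tree branch: walk up the dependency heads to collect the
                # history instead of taking the previous linear words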
                for i, wrd in enumerate(iterloop(parsed_tree)):
                    cur = wrd
                    history_words = []
                    history_tags = []
                    history_pos_tags = []
                    for j in range(history):
                        par = cur.head
                        if cur == par:
                            parw = '-START-'
                            idx = -1
                            tag = '-TAGSTART-'
                            pos = '-POSTAGSTART-'
                            history_tags.insert(0, tag)
                            history_words.insert(0, parw)
                            history_pos_tags.insert(0, pos)
                            break
                        else:
                            parw = par.orth_
                            idx = dt.sen_idx(sentence.raw_sentence, par)
                            tag = sentence.words_tags[idx][1]
                            pos = par.tag_
                            cur = par
                            history_tags.insert(0, tag)
                            history_words.insert(0, parw)
                            history_pos_tags.insert(0, pos)
                    history_vectors = ('ph', [history_tags])
                    cur_idx = dt.sen_idx(sentence.raw_sentence, wrd)

                    for prev_idx, w in enumerate(iterloop(parsed_tree)):
                        if w == wrd.head:
                            break
                    if wrd.head == wrd:
                        prev_idx = -1

                    distance = 0
                    if prev_idx >= 0:
                        distance = parsed_tree[cur_idx].similarity(
                            parsed_tree[prev_idx])
                    #else:
                    #	prev_idx = dt.sen_idx(sentence.raw_sentence,wrd.head)
                    # the word's own tag; cur_idx (not the leftover ancestor
                    # index idx) is the current word's position
                    cur_tag = sentence.words_tags[cur_idx][1]
                    target_feature_vectors.append(
                        dp.construct_feature_vector(wrd.orth_, cur_tag,
                                                    feature_dict,
                                                    history_words, history,
                                                    history_vectors,
                                                    history_pos_tags,
                                                    distance))
                    # hist_hist = []
                    # for tag in all_tags:
                    # 	hist_hist.append(
                    # 		dp.construct_feature_vector(wrd.orth_,tag,feature_dict,history_words,history, history_vectors, history_pos_tags, distance)
                    # 	)
                    # histories.append(hist_hist)
                    histories.append(
                        (prev_idx, history_words, history_pos_tags, distance))
            # """
            # /==== end comment 0
            # """
            #print histories
            dict_target_feature_vectors = [
                v2d(target_feature_vector[0][0])
                for target_feature_vector in target_feature_vectors
            ]
            pre_pros.append(
                (parsed_tree, dict_target_feature_vectors, histories))
            #weight_matrix = train_perceptron_once(parsed_tree, target_feature_vectors, feature_dict,
            #			history, weight_matrix, context_words, context_pos_tags)
        except Exception as ex:
            print "error:", ex
            pipeline.log('train', sentence)

    print 'pre_pros', time() - t1
    t2 = time()
    print 'sentences preprocessed:', len(pre_pros)

    for i in range(iters):
        iter_time = time()
        print "at iter", i
        cum_weights = i * weight_matrix  # i times the running average so far
        for parsed_tree, dict_target_feature_vectors, histories in pre_pros:
            target_feature_vectors = [
                d2v(dict_target_feature_vector)
                for dict_target_feature_vector in dict_target_feature_vectors
            ]
            weight_matrix = train_perceptron_once(parsed_tree,
                                                  target_feature_vectors,
                                                  feature_dict, history,
                                                  weight_matrix, histories)
        weight_matrix = (cum_weights + weight_matrix) / (i + 1)  # new running average
        print "one iter: ", time() - iter_time
    print 'train', time() - t2
    return weight_matrix
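
# The iteration loop above implements an averaged perceptron: after iteration
# i, weight_matrix holds the mean of the i + 1 matrices produced so far (each
# new matrix comes from train_perceptron_once starting at the current average).
# A minimal standalone sketch (made-up values, numpy assumed) showing that the
# running update is equivalent to a plain mean:
import numpy as np

per_iter_weights = [np.array([1.0, 2.0]), np.array([3.0, 0.0]), np.array([5.0, 4.0])]
running = np.zeros(2)
for i, w in enumerate(per_iter_weights):
    cum = i * running              # cum_weights = i * weight_matrix
    running = (cum + w) / (i + 1)  # (cum_weights + weight_matrix) / (i + 1)
assert np.allclose(running, np.mean(per_iter_weights, axis=0))
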
def flaws(dt,
          all_sentences,
          feature_dict,
          tbank,
          history,
          weight_matrix=None,
          with_tags=True):
    if weight_matrix is None:
        weight_matrix = sp.init_weights(len(feature_dict))
    E = 0.0
    counter_flaw = 1
    for sentence in all_sentences:
        current_time = time()
        print "at flaws, sentence ", counter_flaw
        counter_flaw += 1
        try:
            parsed_tree = tbank.parse(sentence.raw_sentence)

            histories = []
            target_feature_vectors = []

            if sp.golinear:
                context_words = [w.orth_ for w in dt.linear(parsed_tree)]
                context_pos_tags = [w.tag_ for w in dt.linear(parsed_tree)]
                context_tags = [
                    sentence.words_tags[dt.sen_idx(sentence.raw_sentence,
                                                   wrd)][1]
                    for wrd in dt.linear(parsed_tree)
                ]
                for i, wrd in enumerate(context_words):
                    if i < history:
                        history_tags = tuple(['-TAGSTART-'] + context_tags[0:i])
                        history_words = ['-START-'] + context_words[0:i]
                        history_pos_tags = ['-POSTAGSTART-'] + context_pos_tags[0:i]
                    else:
                        history_tags = context_tags[i - history:i]
                        history_words = context_words[i - history:i]
                        history_pos_tags = context_pos_tags[i - history:i]
                    history_vectors = ('ph', [history_tags])

                    cur_idx = i
                    prev_idx = cur_idx - 1
                    distance = 0
                    if prev_idx >= 0:
                        distance = parsed_tree[cur_idx].similarity(
                            parsed_tree[prev_idx])

                    target_feature_vectors.append(
                        dp.construct_feature_vector(wrd, context_tags[i],
                                                    feature_dict,
                                                    history_words, history,
                                                    history_vectors,
                                                    history_pos_tags,
                                                    distance))
                    histories.append(
                        (prev_idx, history_words, history_pos_tags, distance))
            else:
                context_tags = [
                    sentence.words_tags[dt.sen_idx(sentence.raw_sentence,
                                                   wrd)][1]
                    for wrd in dt.dfirst(parsed_tree)
                ]
                for i, wrd in enumerate(dt.dfirst(parsed_tree)):
                    cur = wrd
                    history_words = []
                    history_tags = []
                    history_pos_tags = []
                    for j in range(history):
                        par = cur.head
                        if cur == par:
                            parw = '-START-'
                            idx = -1
                            tag = '-TAGSTART-'
                            pos = '-POSTAGSTART-'
                            history_tags.insert(0, tag)
                            history_words.insert(0, parw)
                            history_pos_tags.insert(0, pos)
                            break
                        else:
                            parw = par.orth_
                            idx = dt.sen_idx(sentence.raw_sentence, par)
                            tag = sentence.words_tags[idx][1]
                            pos = par.tag_
                            cur = par
                            history_tags.insert(0, tag)
                            history_words.insert(0, parw)
                            history_pos_tags.insert(0, pos)
                    history_vectors = ('ph', [history_tags])
                    cur_idx = dt.sen_idx(sentence.raw_sentence, wrd)

                    for prev_idx, w in enumerate(dt.dfirst(parsed_tree)):
                        if w == wrd.head:
                            break
                    if wrd.head == wrd:
                        prev_idx = -1

                    distance = 0
                    if prev_idx >= 0:
                        distance = parsed_tree[cur_idx].similarity(
                            parsed_tree[prev_idx])

                    # the word's own tag (cur_idx, not the ancestor index idx)
                    cur_tag = sentence.words_tags[cur_idx][1]
                    target_feature_vectors.append(
                        dp.construct_feature_vector(wrd.orth_, cur_tag,
                                                    feature_dict,
                                                    history_words, history,
                                                    history_vectors,
                                                    history_pos_tags,
                                                    distance))

                    histories.append(
                        (prev_idx, history_words, history_pos_tags, distance))

            if not with_tags:
                context_tags = None

            E = sp.test_perceptron_once(E, parsed_tree, feature_dict, history,
                                        weight_matrix, histories, context_tags)

        except Exception as ex:
            log('flaw', sentence)

        print time() - current_time
    return E
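
# Both functions above build a fixed-length history window per word: the first
# `history` positions get a window padded with a start dummy, later positions
# get the previous `history` items. A minimal standalone sketch of that
# slicing logic (toy sentence, no project dependencies):
def history_window(items, i, history, start_symbol):
    if i < history:
        return [start_symbol] + items[0:i]
    return items[i - history:i]

words = ['the', 'cat', 'sat', 'on', 'the', 'mat']
for i in range(len(words)):
    print i, history_window(words, i, 2, '-START-')
# 0 ['-START-']
# 1 ['-START-', 'the']
# 2 ['the', 'cat']
# 3 ['cat', 'sat']  ...and so on
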
def viterbi(parsed_tree, feature_dict, history, weight_matrix, histories):
    """ Input:  parsed_tree: the parsed sentence to be tagged
                feature_dict: the feature dictionary
                history: how far you want to look back
                weight_matrix: the current perceptron weights
                histories: per-word (prev_idx, words, pos_tags, distance) tuples

        Output: a list with the best feature vector for each word
    """

    sentence_dict = {}  # per word: (tag scores, feature vectors, history words)
    no_tags = len(all_tags)

    # --------------------------- Viterbi forward path --------------------------- #
    t1 = time()
    # i is the position of the word in the sentence
    for i, wrd in enumerate(iterloop(parsed_tree)):

        # one row of SIZE feature values for every candidate tag
        feature_vector_array = np.zeros((no_tags, SIZE))
        tag_score_array = np.zeros(no_tags)
        history_list = []
        t2 = time()
        # scores/vectors of the previous word (histories[i][0] is prev_idx),
        # or a -TAGSTART- dummy when there is none
        history_vectors = sentence_dict.get(histories[i][0],
                                            (0, 0, [('-TAGSTART-',)]))[1:3]
        calc_feat = [None]
        for j, tag in enumerate(all_tags):

            t4 = time()
            feature_vectors_tag = dp.construct_feature_vector(
                wrd.orth_, tag, feature_dict, histories[i][1], history,
                history_vectors, histories[i][2], histories[i][3], calc_feat)

            t4 = time() - t4
            t5 = time()
            best_tag_score = float('-inf')  # running best over this tag's candidates
            best_feature_vector = np.zeros(SIZE)
            history_word = 'Um'  # placeholder, overwritten by the first candidate
            for tple in feature_vectors_tag:
                tag_score = np.dot(tple[0], weight_matrix.transpose())
                if tag_score > best_tag_score:
                    best_tag_score = tag_score
                    best_feature_vector = tple[0]
                    history_word = tple[1]
            t5 = time() - t5

            tag_score_array[j] = best_tag_score
            feature_vector_array[j, :] = best_feature_vector
            history_list.append(history_word)
        t2 = time() - t2
        sentence_dict[i] = (tag_score_array, feature_vector_array,
                            history_list)
    t1 = time() - t1

    # --------------------------- Viterbi backward path --------------------------- #
    t6 = time()
    final_feature_vectors = []

    dict_len = len(sentence_dict)
    for entry in range(dict_len - 1, -1, -1):
        (score, vector, history_list) = sentence_dict[entry]
        if entry == dict_len - 1:
            # start from the best-scoring tag of the last word
            high_score = score.argmax()
            best_vector = vector[high_score]
            history_best_vector = tag_idxes[history_list[high_score][-2]]
        else:
            # follow the backpointer chosen one step later (the original
            # indexed history_list with the stale high_score here)
            best_vector = vector[history_best_vector]
            history_best_vector = tag_idxes[history_list[history_best_vector][-2]]
        final_feature_vectors.append(best_vector)  # collected last word first
    t6 = time() - t6
    return final_feature_vectors
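
# The backward path above starts from the best-scoring tag of the last word
# and then follows the stored backpointers towards the start. A tiny
# standalone sketch of that pattern (toy trellis with two tags; the scores
# and backpointers are invented for illustration):
import numpy as np

scores = np.array([[0.1, 0.9],   # scores[t, j]: best score of tag j at position t
                   [0.7, 0.2],
                   [0.4, 0.6]])
backptr = np.array([[0, 0],      # backptr[t, j]: best previous tag for tag j at t
                    [1, 1],
                    [0, 1]])

best = scores[-1].argmax()       # best tag of the last word
path = [best]
for t in range(len(scores) - 1, 0, -1):
    best = backptr[t, best]      # follow the backpointer one step back
    path.append(best)
path.reverse()                   # entries were collected last-to-first
print path                       # -> [1, 1, 1]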