Example #1
	def finalize_data(self, mode, u_dataset, babi_id):
		# Vectorize every (question, answer) pair of the requested dataset split
		# with GloVe embeddings; returns (q_vector, a_vector, label, tfidf, word_cnt) tuples.
		final_data = []

		if u_dataset == 'wikiqa':
			print('> Getting dataset: {} {}'.format(u_dataset, mode))
			dataset = utils.get_wikiqa_for_abcnn(mode)
		else:
			print('> Getting dataset: {} {} {}'.format(u_dataset, mode, babi_id))
			dataset = utils.get_babi_for_abcnn(babi_id, mode)

		glove = utils.load_glove(200)
		print('> Vectorizing the questions and answers')
		for data in tqdm(dataset, total=len(dataset), ncols=75, unit='Pairs'):
			q, a, label, tfidf, word_cnt = data
			q_vector, a_vector = self.qa_vectorize(q, a, glove)
			final_data.append((q_vector, a_vector, label, tfidf, word_cnt))

		return final_data
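A minimal usage sketch for finalize_data; the AnsSelect class name and the argument values are illustrative assumptions, not part of the source:

# Hedged usage sketch: 'AnsSelect' is a hypothetical name for the class that
# owns finalize_data; the real class is not shown in this example.
model = AnsSelect()
train_data = model.finalize_data(mode='train', u_dataset='babi', babi_id='1')
q_vector, a_vector, label, tfidf, word_cnt = train_data[0]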
Example #2
def test_sick_preprocess():
    from Helpers.preprocess import SICK
    from Helpers import utils

    import spacy
    nlp = spacy.load('en')  # shorthand model name; modern spaCy uses 'en_core_web_sm'

    sick = SICK.get_data()
    glove = utils.load_glove(200)

    data = sick[0]
    # The misspelled key 'senetnce_A' should not survive preprocessing.
    assert ('senetnce_A' not in data['A'])

    dtree_entry, dtne_entry = SICK.get_input_tree_single(data, nlp, glove)

    for entry in [dtree_entry, dtne_entry]:
        for x in ['A', 'B', 'score']:
            assert (x in entry)

            if x != 'score':
                for y in [
                        'word_vectors', 'parent_indices', 'is_leaf',
                        'dep_tags', 'text'
                ]:
                    assert (y in entry[x])

    assert ('ent_type' in dtne_entry['A'])
    assert ('ent_type' in dtne_entry['B'])
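The test above can be run in isolation with pytest; the file path here is a guess at where such a test would live, not taken from the source:

# pytest -q Tests/test_preprocess.py::test_sick_preprocess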
Example #3
	def ans_select(self, question, ans_list):

		ans_sents = []

		tfidf, word_cnt = self.extract_features(question, ans_list)
		
		_, _, output_layer_test, _ = self.model()
		saver = tf.train.Saver()

		with tf.Session() as sessn:
			
			filename, _, _ = self.model_state_loader()
			try:
				print(filename)
				saver.restore(sessn, filename)
				print(' > Model state restored from ' + filename)
			except Exception:
				print(' > No saved state found. Exiting')
				sessn.close()
				sys.exit()

			glove = utils.load_glove(200)

			for i, ans in enumerate(ans_list):
			
				q_vector, a_vector = self.qa_vectorize(question, ans, glove)

				# The label is omitted from the feed: it is assumed unused by the
				# test-time graph, and feeding None to a placeholder errors out in TF.
				input_dict = {self.q: q_vector, self.a: a_vector, self.word_cnt: word_cnt[i], self.tfidf: tfidf[i]}
				pred = sessn.run(output_layer_test, feed_dict=input_dict)

				ans_sents.append((ans, pred))

		ans_sents = sorted(ans_sents, key=operator.itemgetter(1), reverse=True) # Sorts by scores in desc order
		
		return ans_sents
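Since ans_select returns (answer, score) pairs sorted best-first, a caller only needs the head of the list. A short usage sketch (the selector instance and its inputs are assumptions):

ranked = selector.ans_select(question, candidate_sentences)
best_answer, best_score = ranked[0]  # highest-scoring candidate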
Example #4
def vis_tokenize(context, question):
    # Tokenize the context into paragraphs, sentences and words, attaching a
    # GloVe vector to every token.
    glove = utils.load_glove(dim=200)

    para_list = []
    # Note: this splits on the literal two-character sequence '\n' (escaped
    # newlines, e.g. from a JSON payload), not on real newline characters.
    paras = [para for para in context.split('\\n') if para != '']
    for para in paras:
        sent_list = []
        for sent in sent_tokenize(para):
            temp = {}
            temp['words'] = word_tokenize(sent)
            temp['vectors'] = [
                np.array(glove[word.lower()]) for word in temp['words']
            ]
            sent_list.append(temp)
        para_list.append(sent_list)

    q_dict = {}
    q_dict['words'] = word_tokenize(question)
    q_dict['vectors'] = [
        np.array(glove[word.lower()]) for word in q_dict['words']
    ]
    return para_list, q_dict
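The return value nests paragraph, sentence and token level data. A sketch of how a caller might walk it (assuming context and question strings are in scope):

para_list, q_dict = vis_tokenize(context, question)
for sent in para_list[0]:  # sentences of the first paragraph
    for word, vec in zip(sent['words'], sent['vectors']):
        print(word, vec.shape)  # each vec is a 200-dim GloVe vector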
Example #5
def main():
    glove = utils.load_glove()
    vector = []

    # Demo inputs; a CLI variant would read these from sys.argv instead.
    file_name = "../data/corpus/cricket.txt"
    query = "what does the batsman do with a ball"

    with open(file_name, 'r') as f:
        doc = list(filter(('\n').__ne__, f.readlines()))  # drop blank lines
        tfidf_measure = np.array(tf_idf(doc, query))
        top_indices = tfidf_measure.argsort()[-3:][::-1]  # top 3 paragraphs by TF-IDF

    for index in top_indices:
        para = doc[index]
        para_word_vec = get_word_vecs(para, glove)
        measure = centroid(para_word_vec)
        vector.append((para, measure))

    query_measure = centroid(get_word_vecs(query, glove))
    print("\n" + query)
    print("\n" + get_most_relevant(vector, query_measure))
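The helpers get_word_vecs and centroid are imported from elsewhere and not shown. A minimal sketch of what they are assumed to do (mean-of-word-vectors passage embedding):

import numpy as np
from nltk.tokenize import word_tokenize

def get_word_vecs(text, glove):
    # Look up a GloVe vector for every in-vocabulary token.
    return [np.array(glove[w.lower()]) for w in word_tokenize(text)
            if w.lower() in glove]

def centroid(word_vecs):
    # Represent a passage as the mean of its word vectors.
    return np.mean(word_vecs, axis=0)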
Example #6
	def test_ans_select(self):
		babi = utils.get_babi_raw_for_abcnn(babi_id='1', mode='test')
		babi = utils.process_babi_for_abcnn(babi)

		shuffle(babi)
		babi = babi[:100]

		instances, correct_op = len(babi), 0

		_, _, output_layer_test, _ = self.model()

		with tf.Session() as sess:

			filename, _, _ = self.model_state_loader()
			try:
				saver = tf.train.Saver()
				print(filename)
				saver.restore(sess, filename)
				print(' > Model state restored from ' + filename)
			except Exception as e:
				print(e)
				print(' > No saved state found. Exiting')
				sess.close()
				sys.exit()

			glove = utils.load_glove(200)

			for sample in tqdm(babi, total=len(babi), ncols=75, unit='Sample '):
				line_numbers, context, question, _, support = sample
				ans_sents = []

				tfidf, word_cnt = self.extract_features(question, context)

				for i, ans in enumerate(context):
					q_vector, a_vector = self.qa_vectorize(question, ans, glove)

					# The label is omitted from the feed: it is assumed unused by the
					# test-time graph, and feeding None to a placeholder errors out in TF.
					input_dict = {self.q: q_vector, self.a: a_vector, self.word_cnt: word_cnt[i], self.tfidf: tfidf[i]}
					pred = sess.run(output_layer_test, feed_dict=input_dict)

					ans_sents.append((ans, pred))

				# Rank candidates by score, best first; the top one is the predicted
				# supporting sentence.
				ans_sents = sorted(ans_sents, key=operator.itemgetter(1), reverse=True)
				all_labels = [line_numbers[context.index(ans)] for ans, _ in ans_sents]
				pred_labl = all_labels[0]
				with open('context_accuracy_abcnn.txt', 'a') as f:
					res = 'Correct Label: {}\tPredicted Label: {}\tSorted Labels: {}\n'.format(support, pred_labl, all_labels)
					f.write(res)

				if pred_labl == support:
					correct_op += 1

			accuracy = correct_op / instances
			print('Accuracy: {0:.2f}'.format(accuracy))
Example #7
def read_dataset():
    global glove
    global dep_tags
    global nlp
    nlp = spacy.load('en')
    # Pre-processing before training the SentEmbd model: convert every word of
    # the SICK corpus to its vector representation.
    training_set = os.path.join(BASE, 'data', 'SICK.txt')
    with open(training_set, 'r') as file1:
        raw_dataset = file1.read().split('\n')
    dataset = []
    training_dataset = []
    sim_dataset = []
    relatedness_scores = []
    depTags_training_dataset = []
    depTags_sim_dataset = []
    raw_dataset = raw_dataset[1:-1]  # drop the header row and the trailing empty line
    for item in raw_dataset:
        fields = item.split('\t')
        # keep sentence_A, sentence_B and the relatedness score
        dataset.append([fields[1], fields[2], fields[4].strip()])

    glove = utils.load_glove()
    dep_tags = utils.load_dep_tags()

    for item in dataset:

        vectorized_sent1, dep_tags_1 = utils.get_sent_details(
            item[0].strip(), glove, dep_tags, nlp)
        vectorized_sent2, dep_tags_2 = utils.get_sent_details(
            item[1].strip(), glove, dep_tags, nlp)

        training_dataset.append(vectorized_sent1)
        depTags_training_dataset.append(dep_tags_1)
        sim_dataset.append(vectorized_sent2)
        depTags_sim_dataset.append(dep_tags_2)

        relatedness_scores.append(float(item[2]))
    return dataset, training_dataset, sim_dataset, relatedness_scores, depTags_training_dataset, depTags_sim_dataset
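A short sketch of consuming the six parallel lists returned above; each index describes one SICK pair:

dataset, train_vecs, sim_vecs, scores, train_deps, sim_deps = read_dataset()
for s1, s2, score in zip(train_vecs, sim_vecs, scores):
    pass  # e.g. feed (s1, s2) to the SentEmbd model and regress against score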
Example #8
def _retrieve_info(doc, query):
    glove = utils.load_glove(dim=200)
    vector = []
    ir_dict = {}

    doc = [x for x in doc.split('\n') if x != '']

    tfidf_measure = np.array(tf_idf(doc, query)[0])
    top_indices = tfidf_measure.argsort()[-10:][::-1]  # top 10 paragraphs by TF-IDF

    for index in top_indices:
        para = doc[index]
        para_word_vec = get_word_vecs(para, glove)
        p_centr = centroid(para_word_vec)
        p_tfidf = tfidf_measure[index]
        vector.append((para, p_centr, p_tfidf))

    query_measure = centroid(get_word_vecs(query, glove))

    # Ranked paras: (para, centroid, tfidf, cosine_sim)
    top_para_list = get_most_relevant(vector, query_measure)
    top_paras = []
    for para, p_centroid, p_tfidf, p_cos in top_para_list:
        top_paras.append({
            'para': para,
            'centroid': p_centroid,
            'tf_idf': p_tfidf,
            'cosine_sim': p_cos,
        })

    ir_dict['question'] = query
    ir_dict['q_centroid'] = query_measure
    ir_dict['top_paras'] = top_paras

    return ir_dict
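get_most_relevant is not shown in this example. A plausible sketch, matching the (para, centroid, tfidf, cosine_sim) tuples documented above and assuming it ranks paragraphs by cosine similarity between paragraph and query centroids (Examples #5 and #10 call a two-tuple variant, which would differ slightly):

import numpy as np

def get_most_relevant(vector, query_measure):
    # Attach the cosine similarity of each paragraph centroid to the query
    # centroid, then sort best-first.
    ranked = []
    for para, p_centr, p_tfidf in vector:
        cos = np.dot(p_centr, query_measure) / (
            np.linalg.norm(p_centr) * np.linalg.norm(query_measure))
        ranked.append((para, p_centr, p_tfidf, cos))
    return sorted(ranked, key=lambda entry: entry[3], reverse=True)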
Example #9
def main():
    start = time.time()
    query = sys.argv[1]
    glove = utils.load_glove()
    quest = utils.init_babi_deploy(
        os.path.join(os.path.dirname(os.path.realpath(__file__)),
                     'data.bak', 'corpus', 'babi1.txt'), query)

    dmn = dmn_basic.DMN_basic(babi_train_raw=quest,
                              babi_test_raw=[],
                              word2vec=glove,
                              word_vector_size=50,
                              dim=40,
                              mode='deploy',
                              answer_module='feedforward',
                              input_mask_mode="sentence",
                              memory_hops=5,
                              l2=0,
                              normalize_attention=False,
                              answer_vec='index',
                              debug=False)

    dmn.load_state(
        'states/dmn_basic/dmn_basic.mh5.n40.bs10.babi1.epoch2.test1.20454.state'
    )

    prediction = dmn.step_deploy()

    prediction = prediction[0][0]
    # Print every candidate answer with its score, best first.
    for ind in prediction.argsort()[::-1]:
        if ind < dmn.answer_size:
            print(dmn.ivocab[ind], prediction[ind])
    print('Time taken:', time.time() - start)
Example #10
def retrieve_info(doc, query):
    glove = utils.load_glove()
    vector = []

    # Rank paragraphs by TF-IDF and keep the top 10.
    tfidf_measure = np.array(tf_idf(doc, query)[0])
    top_indices = tfidf_measure.argsort()[-10:][::-1]

    for index in top_indices:
        para = doc[index]
        para_word_vec = get_word_vecs(para, glove)
        measure = centroid(para_word_vec)
        vector.append((para, measure))

    query_measure = centroid(get_word_vecs(query, glove))

    return get_most_relevant(vector, query_measure)
Example #11
assert args.word_vector_size in [50, 100, 200, 300]

network_name = args.prefix + '%s.mh%d.n%d.bs%d%s%s%s.babi%s' % (
	args.network,
	args.memory_hops,
	args.dim,
	args.batch_size,
	".na" if args.normalize_attention else "",
	".bn" if args.batch_norm else "",
	(".d" + str(args.dropout)) if args.dropout>0 else "",
	args.babi_id)

if args.mode != 'deploy':
	babi_train_raw, babi_test_raw = utils.get_babi_raw(args.babi_id, args.babi_test_id)
word2vec = utils.load_glove(args.word_vector_size)
args_dict = dict(args._get_kwargs())

if args.mode != 'deploy':
	args_dict['babi_train_raw'] = babi_train_raw
	args_dict['babi_test_raw'] = babi_test_raw
	args_dict['babi_deploy_raw'] = None
else:
	# Note: this corpus path is machine-specific.
	raw_task = utils.init_babi_deploy('/home/mit/Desktop/EruditeX/data/corpus/babi.txt', args.query)
	args_dict['babi_train_raw'] = None
	args_dict['babi_test_raw'] = None
	args_dict['babi_deploy_raw'] = raw_task
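Downstream, args_dict is typically expanded into the network constructor, as in Example #9. A hedged sketch of that hand-off (the word2vec key and the constructor call are assumptions about the surrounding script):

args_dict['word2vec'] = word2vec
# e.g. for the basic network variant:
# dmn = dmn_basic.DMN_basic(**args_dict)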