Example #1
import re

import load_data  # project-local module (assumed import path)


def reformat():
    # Load the question types from the template-selector data.
    path = "../nlidb/template_selectors/data2.txt"
    _, _, types = load_data.load_data(path)
    # Read the raw questions, one per line.
    with open("more.txt") as f:
        questions = f.readlines()
    both = []
    for i, q in enumerate(questions):
        t = types[i]
        q = re.sub("\n", '', q)
        # Pair each question with its type, separated by a double tab.
        both.append(q + "\t\t" + t)
    # Append the merged records to the output file.
    with open('more2.txt', 'a') as f:
        f.write("\n".join(both))
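
Each record reformat() writes is a question and its type separated by a double tab. A minimal sketch of reading the merged file back, assuming the file was produced by the example above (the parsing itself is not part of the original):

# Hedged sketch: split each record of more2.txt on the double-tab
# separator that reformat() writes above.
with open('more2.txt') as f:
    for line in f:
        question, qtype = line.rstrip('\n').split('\t\t')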
Example #2
import load_data  # project-local module (assumed import path)
# nlp_nlidb is assumed to be in scope from the surrounding project.


def rewrite():
    path = "../nlidb/template_selectors/data2.txt"
    questions, _, types = load_data.load_data(path)
    supplemented = []
    for i, q in enumerate(questions):
        t = types[i]
        # An earlier (commented-out) revision unpacked six return values
        # from nlp_nlidb (allWords, required_values, target, conditions,
        # tables, question_type) and joined each before concatenating;
        # here its result is assumed to be a single string.
        data = nlp_nlidb(q) + q
        # Append the question type, separated by a double tab.
        data = data + "\t\t" + t
        supplemented.append(data)
    # Append all records to the output file.
    with open('more.txt', 'a') as new_file:
        new_file.write("\n".join(supplemented))
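
The loop in rewrite() indexes types[i] by position, which assumes load_data returns parallel lists. A minimal alternative sketch using zip(), which makes that assumption explicit (the helper name rewrite_zipped is hypothetical, not part of the original):

# Hedged sketch: same record construction as rewrite() above, with the
# parallel-lists assumption made explicit via zip().
def rewrite_zipped(questions, types):
    supplemented = []
    for q, t in zip(questions, types):
        # nlp_nlidb is assumed in scope, as in the example above.
        supplemented.append(nlp_nlidb(q) + q + "\t\t" + t)
    return "\n".join(supplemented)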
Example #3
from nltk.corpus import brown

import question_type  # project-local module (assumed import path)
from load_data import load_data  # project-local module (assumed import path)
from sentence_rae import SentenceRAE  # project-local module (assumed import path)

sents = brown.sents()
formatted = []
# Dummy targets, so that question_type.train doesn't complain when it
# tries to train a RandomForest.
dummy_targets = []
for s in sents[0:500]:
    # Drop punctuation tokens and skip one-word sentences.
    s = [token for token in s if token not in ['?', ',', '.', '(', ')']]
    if len(s) > 1:
        formatted.append(' '.join(s))
        dummy_targets.append('a')
brown = formatted
print("Training sentences: " + str(len(brown)))

questions, _, targets = load_data("../test/lat_data.txt")  # training
test_questions, _, test_targets = load_data("../test/test_data.txt")  # cross validation
questions = [q.strip('?') for q in questions]
test_questions = [q.strip('?') for q in test_questions]

# Train word vectors on the Brown sentences; the dummy targets satisfy the
# training interface and are otherwise unused. (An earlier, commented-out
# revision trained on questions/targets instead.)
_, word_vectors = question_type.train(brown, dummy_targets)
for word, vector in word_vectors.items():
    word_vectors[word] = list(vector)


rae = SentenceRAE(200, 100, word_vectors, brown)
epochs = 10
rae.train(1)
for i in range(epochs):
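    # The original snippet is truncated at this loop; a per-epoch training
    # call is assumed here, mirroring the rae.train(1) call above.
    rae.train(1)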