def reformat():
    """Pair each line of ``more.txt`` with a type label and append to ``more2.txt``.

    The labels are the third value returned by ``load_data.load_data`` for
    ``../nlidb/template_selectors/data2.txt``. The label at position ``i`` is
    attached to the ``i``-th line of ``more.txt``.

    Each output record is the line (newlines removed), two tab characters,
    then the label. Records are joined with a single newline and appended to
    ``more2.txt``; no trailing newline is written.

    NOTE(review): labels are looked up by line position, so this presumably
    fails with an IndexError when ``more.txt`` has more lines than there are
    labels -- confirm against ``load_data.load_data``'s return type.
    """
    types_path = "../nlidb/template_selectors/data2.txt"
    _, _, types = load_data.load_data(types_path)

    # Context managers guarantee both handles are closed, even on error.
    with open("more.txt", "r") as source:
        questions = source.readlines()

    merged = [
        question.replace("\n", "") + "\t\t" + types[i]
        for i, question in enumerate(questions)
    ]

    with open("more2.txt", "a") as out:
        out.write("\n".join(merged))
def rewrite():
    """Append an ``nlp_nlidb``-augmented, labelled copy of each question to ``more.txt``.

    Questions and type labels are the first and third values returned by
    ``load_data.load_data`` for ``../nlidb/template_selectors/data2.txt``.

    Each output record is ``nlp_nlidb(question)`` immediately followed by the
    question itself (no separator), two tab characters, then the label at the
    same position. Records are joined with a single newline and appended to
    ``more.txt``; no trailing newline is written.

    NOTE(review): the result of ``nlp_nlidb`` is concatenated directly with
    the question string, so it presumably returns a string -- confirm.
    """
    path = "../nlidb/template_selectors/data2.txt"
    questions, _, types = load_data.load_data(path)

    supplemented = []
    for i, question in enumerate(questions):
        # Look the label up first so a missing label fails before the
        # call to nlp_nlidb is made.
        label = types[i]
        supplemented.append(nlp_nlidb(question) + question + "\t\t" + label)

    # The context manager guarantees the handle is closed, even on error.
    with open("more.txt", "a") as new_file:
        new_file.write("\n".join(supplemented))
sents = brown.sents() formatted = [] dummy_targets = [] # So that the question_type.train function doesn't complain when it tries to train a RandomForest. for s in sents[0:500]: s = [token for token in s if token not in ['?', ',', '.', '(', ')']] if len(s) > 1: sent = ' '.join(s) formatted.append(sent) dummy_targets.append('a') brown = formatted print "Training sentences: " + str(len(brown)) #brown = ' '.join(brown) #brown = brown.split('. ') #brown = [s for s in brown if s != ' ' and s != '' and len(s) > 11] questions, _, targets = load_data("../test/lat_data.txt") # training test_questions, _, test_targets = load_data("../test/test_data.txt")# cross validaiotn questions = [q.strip('?') for q in questions] test_questions = [q.strip('?') for q in test_questions] #_, word_vectors = question_type.train(questions, targets) _, word_vectors = question_type.train(brown, dummy_targets) for word,vector in word_vectors.items(): vector = list(vector) word_vectors[word] = vector rae = SentenceRAE(200,100, word_vectors, brown) epochs = 10 rae.train(1) for i in range(epochs):