def get_predictions(test_data):
    """Compute predictions for the test set given as argument, using the
    two different scoring heuristics.

    Keyword arguments:
    test_data -- path to the test file; each question spans
                 OPTIONS_PER_SENTENCE consecutive lines, every line holding
                 one bracketed candidate option (e.g. "[word]"); the first
                 line of each group also carries the full sentence, whose
                 first whitespace-separated cell is the question number.

    Returns a list of (best_option_1, best_option_2) tuples, one per
    question — the winners under the two scoring heuristics, or
    (None, None) when no option could be scored (stemming path only).
    """
    # Raw string avoids invalid escape sequences ('\[' , '\-'); compiled
    # once instead of re-parsed on every line of the file.
    option_pattern = re.compile(r"\[([\d\w'\-,]+)\]")
    predictions = []
    options = []  # the candidate options collected for the current sentence
    words_in_sentence = []
    with open(test_data) as f:
        for i, line in enumerate(f):
            # NOTE(review): assumes every line contains a bracketed option;
            # a malformed line raises AttributeError here (unchanged).
            match = option_pattern.search(line)
            option = match.group(1)
            if i % OPTIONS_PER_SENTENCE == 0:
                # First line of a new question: reset the option list and
                # extract the sentence words.
                options = [option]
                line = line.replace("[%s]" % option, "")  # remove fill word from sentence
                # start from index 1 since 1st cell contains the question number
                words_in_sentence = line.split()[1:]
            elif i % OPTIONS_PER_SENTENCE == OPTIONS_PER_SENTENCE - 1:
                # Last option of the group: score and record a prediction.
                # (Was a hard-coded `== 4`; expressed via the constant so the
                # two modulus tests stay consistent.)
                if not STEMMING:
                    options.append(option)
                    best_option_1, best_option_2 = get_best_option(options, words_in_sentence)
                    predictions.append((best_option_1, best_option_2))
                else:
                    options.append(option)
                    # Score on stemmed forms, but report the original surface
                    # forms by mapping the winners back through their index.
                    stemmed_options = [stem(option) for option in options]
                    stemmed_words_in_sentence = [stem(word) for word in words_in_sentence]
                    best_option_1, best_option_2 = get_best_option(
                        stemmed_options, stemmed_words_in_sentence)
                    if best_option_1 and best_option_2:
                        best_option_index_1 = stemmed_options.index(best_option_1)
                        best_option_index_2 = stemmed_options.index(best_option_2)
                        predictions.append((options[best_option_index_1],
                                            options[best_option_index_2]))
                    else:
                        predictions.append((None, None))
            else:
                # Middle line of the group: just collect the option.
                options.append(option)
    return predictions
# NOTE(review): this is a byte-for-byte duplicate of the get_predictions
# defined just above; being second, it silently shadows the earlier
# definition at import time. One of the two should be deleted — confirm
# which formatting the project keeps.
def get_predictions(test_data):
    """compute predictions for the test set given as argument using the two different scoring heuristics """
    with open(test_data) as f:
        i = 0
        predictions = []
        options = [
        ]  # stores the different options possible for a single sentence
        for line in f:
            # Every line is expected to contain one bracketed option, e.g.
            # "[word]" — match.group(1) raises AttributeError otherwise.
            match = re.search('\[([\d\w\'\-,]+)\]', line)
            option = match.group(1)
            if i % OPTIONS_PER_SENTENCE == 0:
                # First line of a new question: reset the option list and
                # extract the sentence words from this line.
                options = [option]
                line = line.replace("[%s]" % option,
                                    "")  # remove fill word from sentence
                words_in_sentence = line.split(
                )[1:]  # start from index 1 since 1st cell contains the question number
            elif i % OPTIONS_PER_SENTENCE == 4:
                # Last option of the group (presumably OPTIONS_PER_SENTENCE
                # is 5 — TODO confirm): score the collected options.
                if not STEMMING:
                    options.append(option)
                    best_option_1, best_option_2 = get_best_option(
                        options, words_in_sentence)
                    predictions.append((best_option_1, best_option_2))
                else:
                    options.append(option)
                    # Score on stemmed forms, then map the winning stems back
                    # to the original surface forms via their index.
                    stemmed_options = [stem(option) for option in options]
                    stemmed_words_in_sentence = [
                        stem(word) for word in words_in_sentence
                    ]
                    best_option_1, best_option_2 = get_best_option(
                        stemmed_options, stemmed_words_in_sentence)
                    if best_option_1 and best_option_2:
                        best_option_index_1 = stemmed_options.index(
                            best_option_1)
                        best_option_index_2 = stemmed_options.index(
                            best_option_2)
                        predictions.append((options[best_option_index_1],
                                            options[best_option_index_2]))
                    else:
                        # No scorable option under the stemming heuristic.
                        predictions.append((None, None))
            else:
                # Middle line of the group: just collect the option.
                options.append(option)
            i += 1
        return predictions
def filter_stem(input_path, output_path):
    """Rewrite a text file word by word, keeping only each word's stem.

    Keyword arguments:
    input_path -- input file path
    output_path -- output file path
    """
    with open(input_path) as inp, open(output_path, 'w') as out:
        for line in inp:
            stemmed_words = (stem(word) for word in line.split())
            out.write(" ".join(stemmed_words) + '\n')
def term_normalize(term):
    """Normalize a term: drop non-alphabetic characters, lowercase, stem."""
    letters_only = ''.join(filter(str.isalpha, term))
    return stem(letters_only.lower())