def FixLastQuestion():
    """Define the 'choice' and 'answer' features, cleaning up the last question.

    The last question (number 10 in the input, number 9 in our 0-based
    system) is special for two reasons:

    1. Every answer starts with one of four options, followed by '::'.
    2. Answers have extra random whitespace spread all over the place, which
       is cleaned up to get better signals.

    Does nothing when both 'choice' and 'answer' are already known features.
    Otherwise defines on G:

    * 'choice': for answers to the last question, the index of the selected
      option in ['lightgray', 'darkgray', 'white', 'black'], or -1 when the
      prefix is not one of them; -1 for answers to every other question.
    * 'answer': the raw answer for every other question, and the
      whitespace-cleaned text after the '::' separator for the last question.

    Raises:
        AssertionError: if the number of last-question entries reported by
            G.question differs from the number of answers collected here.
    """
    if G._KnownFeature('choice') and G._KnownFeature('answer'):
        return

    last_question = 9
    options = ['lightgray', 'darkgray', 'white', 'black']

    total = len(G.raw_answer)
    answer = [0] * total
    choice = [0] * total

    # NOTE(review): the corpora for questions 0-8 are never read in this
    # function; the calls are kept in case BuildCorpus has side effects that
    # other code relies on -- confirm and drop if it is pure.
    for question in range(last_question):
        BuildCorpus(question, G.raw_answer.ValuesForQuestion(question))

    # Answers to every other question are copied through untouched.
    for index, _ in enumerate(G.raw_answer):
        if G.question[index] != last_question:
            answer[index] = G.raw_answer[index]
            choice[index] = -1

    # Split "<option> :: <text>" for the last question.
    new_data = []
    for index, line in G.raw_answer.ItemsForQuestion(last_question):
        if G.question[index] != last_question:
            continue
        prefix, separator, text = line.partition('::')
        if not separator:
            # No '::' present: there is no option prefix to parse. Keep the
            # whole line as the answer instead of slicing with find()'s -1,
            # which used to drop the first character of the text.
            choice[index] = -1
            new_data.append(line)
            continue
        option = util.OnlyAlnum(prefix.lower())
        choice[index] = options.index(option) if option in options else -1
        new_data.append(text)
    G.Define('choice', signal.IntFeature(choice, 'Choice question.'))

    # Rebuild the corpus from the current texts and re-clean every answer,
    # repeating until a full pass leaves all answers unchanged.
    while True:
        corpus = BuildCorpus(last_question, new_data)
        changed = False
        for position, text in enumerate(new_data):
            cleaned = RemoveSpacesFromAnswer(corpus, text)
            if cleaned != text:
                new_data[position] = cleaned
                changed = True
        if not changed:
            break
        # Progress marker: one line per pass that modified something.
        print('-----------------------')

    # Write the cleaned texts back to their positions in the full answer list.
    # Each item's first element is used as the index into `answer`.
    last_items = G.question.ItemsForQuestion(last_question)
    assert len(last_items) == len(new_data)
    for item, cleaned in zip(last_items, new_data):
        answer[item[0]] = cleaned
    G.Define('answer', signal.StringFeature(answer, 'Original: answer'))
def GenerateBasicFeatures():
    """Define the basic per-answer features derived from the raw data.

    Checks that FLAGS.data_dir is a directory and that 'ids', 'question' and
    'raw_answer' exist, then defines:

    * Q 'max_score': a fixed list of ten integers (presumably the maximum
      score per question -- confirm).
    * G 'choice' and 'answer', via processing.FixLastQuestion().
    * G 'answer_length': len() of each answer.
    * G 'num_words': NumWords() of each answer.
    * G 'num_sentences': NumSentences() of each answer.
    * G 'word_length': answer_length / num_words, or 0 when num_words is 0.
    * G 'is_crap': GenerateCrap(), only if that feature is not already known.

    Raises:
        AssertionError: if the data directory or a required input is missing.
    """
    assert os.path.isdir(FLAGS.data_dir)
    assert Exists('ids')
    assert Exists('question')
    assert Exists('raw_answer')

    Q.Define('max_score', IntFeature([3, 3, 2, 2, 3, 3, 2, 2, 2, 2]))

    processing.FixLastQuestion()

    G.Define('answer_length',
             Unary(G.answer, len, comment='answer_length', T=IntFeature))
    G.Define('num_words',
             Unary(G.answer, NumWords, comment='NumWords', T=IntFeature))
    G.Define('num_sentences',
             Unary(G.answer, NumSentences, comment='num_sentences',
                   T=IntFeature))

    # The callback receives one (num_words, answer_length) pair. It is
    # written with explicit indexing rather than a tuple parameter
    # (`lambda (a, b): ...`), which is a syntax error on Python 3.
    G.Define('word_length',
             Binary(G.num_words, G.answer_length,
                    lambda pair: float(pair[1]) / pair[0] if pair[0] else 0))

    if not G._KnownFeature('is_crap'):
        G.Define('is_crap', GenerateCrap())