예제 #1
0
def FixLastQuestion():
  """Define the cleaned-up 'choice' and 'answer' features on G.

  The last question (number 10 in the input, number 9 in our 0-based system)
  is special for two reasons:
    1. Every answer starts with one of four options, followed by '::' and
       then the free-text part of the answer.
    2. Answers have extra random whitespace spread all over the place. It is
       cleaned up here to get better signals.

  For every other question, 'answer' is the raw answer and 'choice' is -1.
  For question 9, 'choice' is the index of the leading option in `options`
  (-1 when the option is not recognised or the '::' separator is missing) and
  'answer' is the text after the separator, passed through
  RemoveSpacesFromAnswer until a full pass changes nothing.

  Returns immediately when both features are already known.
  """
  if G._KnownFeature('choice') and G._KnownFeature('answer'):
    return
  total = len(G.raw_answer)
  answer = [0] * total
  choice = [0] * total

  # Corpora for questions 0-8 are built but never read again in this function.
  # NOTE(review): kept in case BuildCorpus has side effects (e.g. caching) --
  # confirm, and drop this loop if it has none.
  corpora = [0] * 10
  for n in range(9):
    corpora[n] = BuildCorpus(n, G.raw_answer.ValuesForQuestion(n))

  # Every question except the last one is copied through unchanged.
  for row_id, _ in enumerate(G.raw_answer):
    if G.question[row_id] != 9:
      answer[row_id] = G.raw_answer[row_id]
      choice[row_id] = -1

  options = ['lightgray', 'darkgray', 'white', 'black']
  new_data = []
  for row_id, line in G.raw_answer.ItemsForQuestion(9):
    if G.question[row_id] != 9:
      continue
    prefix, separator, rest = line.partition('::')
    if not separator:
      # No '::' in this answer. The previous find()-based slicing used the -1
      # "not found" result as an index, which chopped the last character off
      # the option and the first character off the answer text. Keep the
      # answer intact and mark the choice as unknown instead.
      choice[row_id] = -1
      new_data.append(line)
      continue
    key = util.OnlyAlnum(prefix.lower())
    choice[row_id] = options.index(key) if key in options else -1
    new_data.append(rest)

  G.Define('choice', signal.IntFeature(choice, 'Choice question.'))

  # Removing spaces changes the corpus, which in turn may allow more spaces
  # to be removed, so rebuild and retry until a full pass changes nothing.
  while True:
    corpora[9] = BuildCorpus(9, new_data)
    changed = False
    for n, text in enumerate(new_data):
      cleaned = RemoveSpacesFromAnswer(corpora[9], text)
      if cleaned != text:
        new_data[n] = cleaned
        changed = True
    if not changed:
      break
    # Progress marker, one per pass that changed something. Parenthesised so
    # it prints the same text under both Python 2 and Python 3.
    print('-----------------------')

  # Write the cleaned texts back to the rows that belong to question 9.
  ids_9 = G.question.ItemsForQuestion(9)
  assert len(ids_9) == len(new_data)
  for item, text in zip(ids_9, new_data):
    answer[item[0]] = text
  G.Define('answer', signal.StringFeature(answer, 'Original: answer'))
예제 #2
0
def GenerateBasicFeatures():
  """Define the basic features derived from the answers.

  Asserts that FLAGS.data_dir is a directory and that the 'ids', 'question'
  and 'raw_answer' features exist, then defines:
    - Q.max_score: a fixed 10-entry list of integers.
    - G.answer (and G.choice) via processing.FixLastQuestion().
    - G.answer_length, G.num_words, G.num_sentences: integer features
      computed from G.answer with len, NumWords and NumSentences.
    - G.word_length: answer_length divided by num_words, or 0 when
      num_words is zero.
    - G.is_crap: from GenerateCrap(), only if it is not already known.
  """
  assert os.path.isdir(FLAGS.data_dir)
  for required in ('ids', 'question', 'raw_answer'):
    assert Exists(required)

  max_scores = [3, 3, 2, 2, 3, 3, 2, 2, 2, 2]
  Q.Define('max_score', IntFeature(max_scores))
  processing.FixLastQuestion()

  # (feature name, per-answer function, comment) for each unary feature.
  unary_specs = (
      ('answer_length', len, 'answer_length'),
      ('num_words', NumWords, 'NumWords'),
      ('num_sentences', NumSentences, 'num_sentences'),
  )
  for feature_name, func, comment in unary_specs:
    G.Define(feature_name, Unary(G.answer, func, comment=comment, T=IntFeature))

  def chars_per_word(pair):
    # Receives one (num_words, answer_length) pair as a single argument.
    words, chars = pair
    return float(chars) / words if words else 0

  G.Define('word_length', Binary(G.num_words, G.answer_length, chars_per_word))

  if G._KnownFeature('is_crap'):
    return
  G.Define('is_crap', GenerateCrap())