示例#1
0
def getTagTermAffinityScores(questions, includeCounts=True, frequentWords=None):
  """Compute tag-term affinity scores for a collection of questions.

  For every frequent word ``w`` and tag ``t`` the score is

      (# questions tagged t whose post text contains w) / (# questions tagged t)

  The post text of a question is the single line read from
  ``posts_body_file`` at offset ``question.bodyByte``. A word is counted at
  most once per question, however many times it occurs in the text.

  Args:
    questions: mapping of question id to question object; each question
      must expose ``tags`` (iterable of tag ids) and ``bodyByte``.
    includeCounts: when true, also return the per-tag question counts.
    frequentWords: collection of words to score. When falsy (``None`` or
      empty) it is derived via ``wordvectors.getFrequentWords(questions)``.

  Returns:
    ``ttas`` - a dict ``{word: {tagID: score}}`` - when ``includeCounts`` is
    false; otherwise the tuple ``(ttas, finalTagCounts)`` where
    ``finalTagCounts`` maps tag id to question count for tags that occur in
    at least 50 questions. ``ttas`` itself is not filtered by that threshold.
  """
  if not frequentWords:
    frequentWords = set(wordvectors.getFrequentWords(questions)[0])
  ttas = {}
  tagCounts = {}
  # The context manager guarantees the body file is closed even when seeking,
  # reading or tokenising a post raises part-way through the loop.
  with codecs.open(posts_body_file, 'r', 'utf-8') as infile_body:
    for question in questions.values():
      for tagID in question.tags:
        tagCounts[tagID] = tagCounts.get(tagID, 0) + 1
      infile_body.seek(question.bodyByte)
      postWords = wordvectors.getWordsFromPost(infile_body.readline())
      # set() de-duplicates so each word counts once per question.
      for word in set(postWords):
        if word not in frequentWords:
          continue
        inner_dict = ttas.setdefault(word, {})
        for tagID in question.tags:
          inner_dict[tagID] = inner_dict.get(tagID, 0) + 1

  # Turn raw co-occurrence counts into fractions of each tag's questions.
  # Only existing keys are reassigned, so the dict size never changes while
  # it is being iterated.
  for inner_dict in ttas.values():
    for (tagID, freq) in inner_dict.items():
      inner_dict[tagID] = float(freq) / tagCounts[tagID]

  if not includeCounts:
    return ttas
  finalTagCounts = {}
  for (tagID, count) in tagCounts.items():
    if count >= 50:
      finalTagCounts[tagID] = count
  return (ttas, finalTagCounts)
示例#2
0
def getTagTermAffinityScores(questions,
                             includeCounts=True,
                             frequentWords=None):
    """Compute tag-term affinity scores for a collection of questions.

    For every frequent word ``w`` and tag ``t`` the score is

        (# questions tagged t whose post text contains w) / (# questions tagged t)

    The post text of a question is the single line read from
    ``posts_body_file`` at offset ``question.bodyByte``. A word is counted at
    most once per question, however many times it occurs in the text.

    Args:
        questions: mapping of question id to question object; each question
            must expose ``tags`` (iterable of tag ids) and ``bodyByte``.
        includeCounts: when true, also return the per-tag question counts.
        frequentWords: collection of words to score. When falsy (``None`` or
            empty) it is derived via
            ``wordvectors.getFrequentWords(questions)``.

    Returns:
        ``ttas`` - a dict ``{word: {tagID: score}}`` - when ``includeCounts``
        is false; otherwise the tuple ``(ttas, finalTagCounts)`` where
        ``finalTagCounts`` maps tag id to question count for tags that occur
        in at least 50 questions. ``ttas`` itself is not filtered by that
        threshold.
    """
    if not frequentWords:
        frequentWords = set(wordvectors.getFrequentWords(questions)[0])
    ttas = {}
    tagCounts = {}
    # The context manager guarantees the body file is closed even when
    # seeking, reading or tokenising a post raises part-way through the loop.
    with codecs.open(posts_body_file, 'r', 'utf-8') as infile_body:
        for question in questions.values():
            for tagID in question.tags:
                tagCounts[tagID] = tagCounts.get(tagID, 0) + 1
            infile_body.seek(question.bodyByte)
            postWords = wordvectors.getWordsFromPost(infile_body.readline())
            # set() de-duplicates so each word counts once per question.
            for word in set(postWords):
                if word not in frequentWords:
                    continue
                inner_dict = ttas.setdefault(word, {})
                for tagID in question.tags:
                    inner_dict[tagID] = inner_dict.get(tagID, 0) + 1

    # Turn raw co-occurrence counts into fractions of each tag's questions.
    # Only existing keys are reassigned, so the dict size never changes while
    # it is being iterated.
    for inner_dict in ttas.values():
        for (tagID, freq) in inner_dict.items():
            inner_dict[tagID] = float(freq) / tagCounts[tagID]

    if not includeCounts:
        return ttas
    finalTagCounts = {}
    for (tagID, count) in tagCounts.items():
        if count >= 50:
            finalTagCounts[tagID] = count
    return (ttas, finalTagCounts)
                        alpha, beta, gamma, delta, r5_tuple, r10_tuple)
                    if outfile:
                        outfile.write(
                            '%f,%f,%f,%f,%s,%s\n' %
                            (alpha, beta, gamma, delta, r5_tuple, r10_tuple))

    return bestParams5, best_r5_avg, bestParams10, best_r10_avg


## Module-level setup: loads the data set and builds the word vectors.
## Comment out this entire block if not running from the Python shell
## (presumably because the caller has already populated `ld` - confirm).
# NOTE(review): the meaning of the `True` argument is defined by
# ld.loadData, which is not visible here.
ld.loadData(True)
# This function must be run. Be careful if this is commented out.
setQuestionModelModifications(ld.questions)
# The cross-validation folds; each fold is indexed as fold[0] / fold[1] below.
folds = ld.getCVFolds()
print 'Generating word vectors'
# getFrequentWords returns a pair; the second element (wordToIndex) is what
# getWordVectors needs to build the vectors.
frequentWords, wordToIndex = wordvectors.getFrequentWords(ld.questions)
wordVecs = wordvectors.getWordVectors(ld.questions, wordToIndex)
## End block

# Cross-validation driver: train and test once per fold.
# `counter` is the 1-based fold number used in log messages and file names.
counter = 0
# Two-element accumulator; how it is filled is not visible in this excerpt.
recall_test_scores = [0.0, 0.0]
# Only the first five folds are processed.
for fold in folds[0:5]:
    # Clear model state so nothing carries over from the previous fold.
    resetModels()
    counter += 1
    print 'Starting Fold %d' % counter
    # fold[0] is the training split, fold[1] the test split.
    trainQuestions = fold[0]
    print 'Fold size %d' % len(fold[0])
    comTagCombineModelTrain(trainQuestions)
    print 'Train complete. Beginning test.'
    testQuestions = fold[1]
    # One output CSV per fold; 'w+' truncates any existing file.
    # NOTE(review): no matching close() is visible in this excerpt - confirm
    # the file is closed once the fold's results are written.
    outfile = open('temp/ctc-out_%d.csv' % counter, 'w+')
            updateParameters(alpha, beta, gamma, delta, bestParams10)
          r5_tuple = (r5_avg, r5pop_avg, r5syn_avg, r5synpop_avg)
          r10_tuple = (r10_avg, r10pop_avg, r10syn_avg, r10synpop_avg)
          print '(%f, %f, %f, %f): r5 = %s, r10 = %s' % (alpha, beta, gamma, delta, r5_tuple, r10_tuple)
          if outfile:
            outfile.write('%f,%f,%f,%f,%s,%s\n' % (alpha, beta, gamma, delta, r5_tuple, r10_tuple))

  return bestParams5, best_r5_avg, bestParams10, best_r10_avg

## Module-level setup: loads the data set and builds the word vectors.
## Comment out this entire block if not running from the Python shell
## (presumably because the caller has already populated `ld` - confirm).
# NOTE(review): the meaning of the `True` argument is defined by
# ld.loadData, which is not visible here.
ld.loadData(True)
# This function must be run. Be careful if this is commented out.
setQuestionModelModifications(ld.questions)
# The cross-validation folds; each fold is indexed as fold[0] / fold[1] below.
folds = ld.getCVFolds()
print 'Generating word vectors'
# getFrequentWords returns a pair; the second element (wordToIndex) is what
# getWordVectors needs to build the vectors.
frequentWords, wordToIndex = wordvectors.getFrequentWords(ld.questions)
wordVecs = wordvectors.getWordVectors(ld.questions, wordToIndex)
## End block

# Cross-validation driver: train and test once per fold.
# `counter` is the 1-based fold number used in log messages and file names.
counter = 0
# Two-element accumulator; how it is filled is not visible in this excerpt.
recall_test_scores = [0.0, 0.0]
# Only the first five folds are processed.
for fold in folds[0:5]:
  # Clear model state so nothing carries over from the previous fold.
  resetModels()
  counter += 1
  print 'Starting Fold %d' % counter
  # fold[0] is the training split, fold[1] the test split.
  trainQuestions = fold[0]
  print 'Fold size %d' % len(fold[0])
  comTagCombineModelTrain(trainQuestions)
  print 'Train complete. Beginning test.'
  testQuestions = fold[1]
  # One output CSV per fold; 'w+' truncates any existing file.
  # NOTE(review): no matching close() is visible in this excerpt - confirm
  # the file is closed once the fold's results are written.
  outfile = open('temp/ctc-out_%d.csv' % counter, 'w+')