def getTagTermAffinityScores(questions, includeCounts=True, frequentWords=None,
                             minTagCount=50):
    """Compute tag-term affinity scores over a collection of questions.

    For a word ``w`` and a tag ``t`` the affinity is::

        (# questions tagged t whose body contains w) / (# questions tagged t)

    Only words present in ``frequentWords`` are scored, and a word is counted
    at most once per question no matter how often it occurs in the body.

    Args:
        questions: dict mapping question id -> question object. Each question
            must expose ``tags`` (iterable of tag ids) and ``bodyByte`` (the
            offset passed to ``seek`` on ``posts_body_file``; the line read at
            that position is handed to ``wordvectors.getWordsFromPost``).
        includeCounts: when true, per-tag question counts are returned
            alongside the affinity table.
        frequentWords: collection of words to score. When omitted or empty it
            is computed as ``wordvectors.getFrequentWords(questions)[0]``.
        minTagCount: minimum number of questions a tag must appear on to be
            included in the returned tag counts. It does not filter the
            affinity table itself. Defaults to 50.

    Returns:
        ``ttas`` if ``includeCounts`` is false, otherwise the tuple
        ``(ttas, finalTagCounts)`` where ``ttas`` maps
        word -> {tagID: affinity as float} and ``finalTagCounts`` maps
        tagID -> question count for tags with at least ``minTagCount``
        questions.
    """
    if not frequentWords:
        frequentWords = wordvectors.getFrequentWords(questions)[0]
    # Membership is tested for every distinct word of every post, so make sure
    # a caller-supplied sequence does not turn each test into a linear scan.
    if isinstance(frequentWords, (list, tuple)):
        frequentWords = set(frequentWords)

    ttas = {}
    tagCounts = {}

    # The context manager guarantees the body file is closed even if reading
    # or tokenizing a post raises.
    with codecs.open(posts_body_file, 'r', 'utf-8') as infile_body:
        for question in questions.values():
            for tagID in question.tags:
                tagCounts[tagID] = tagCounts.get(tagID, 0) + 1

            infile_body.seek(question.bodyByte)
            postWords = wordvectors.getWordsFromPost(infile_body.readline())

            # set() so that each word contributes once per question.
            for word in set(postWords):
                if word not in frequentWords:
                    continue
                tagFreqs = ttas.setdefault(word, {})
                for tagID in question.tags:
                    tagFreqs[tagID] = tagFreqs.get(tagID, 0) + 1

    # Turn raw co-occurrence counts into fractions of each tag's questions.
    # Only existing keys are reassigned, so the dict size never changes while
    # it is being iterated.
    for tagFreqs in ttas.values():
        for tagID in tagFreqs:
            tagFreqs[tagID] = float(tagFreqs[tagID]) / tagCounts[tagID]

    if not includeCounts:
        return ttas

    finalTagCounts = dict(
        (tagID, count)
        for (tagID, count) in tagCounts.items()
        if count >= minTagCount
    )
    return (ttas, finalTagCounts)
alpha, beta, gamma, delta, r5_tuple, r10_tuple) if outfile: outfile.write( '%f,%f,%f,%f,%s,%s\n' % (alpha, beta, gamma, delta, r5_tuple, r10_tuple)) return bestParams5, best_r5_avg, bestParams10, best_r10_avg ## Comment out this entire block if not running from Python shell ld.loadData(True) # This function must be run. Be careful if this is commented out. setQuestionModelModifications(ld.questions) folds = ld.getCVFolds() print 'Generating word vectors' frequentWords, wordToIndex = wordvectors.getFrequentWords(ld.questions) wordVecs = wordvectors.getWordVectors(ld.questions, wordToIndex) ## End block counter = 0 recall_test_scores = [0.0, 0.0] for fold in folds[0:5]: resetModels() counter += 1 print 'Starting Fold %d' % counter trainQuestions = fold[0] print 'Fold size %d' % len(fold[0]) comTagCombineModelTrain(trainQuestions) print 'Train complete. Beginning test.' testQuestions = fold[1] outfile = open('temp/ctc-out_%d.csv' % counter, 'w+')
updateParameters(alpha, beta, gamma, delta, bestParams10) r5_tuple = (r5_avg, r5pop_avg, r5syn_avg, r5synpop_avg) r10_tuple = (r10_avg, r10pop_avg, r10syn_avg, r10synpop_avg) print '(%f, %f, %f, %f): r5 = %s, r10 = %s' % (alpha, beta, gamma, delta, r5_tuple, r10_tuple) if outfile: outfile.write('%f,%f,%f,%f,%s,%s\n' % (alpha, beta, gamma, delta, r5_tuple, r10_tuple)) return bestParams5, best_r5_avg, bestParams10, best_r10_avg ## Comment out this entire block if not running from Python shell ld.loadData(True) # This function must be run. Be careful if this is commented out. setQuestionModelModifications(ld.questions) folds = ld.getCVFolds() print 'Generating word vectors' frequentWords, wordToIndex = wordvectors.getFrequentWords(ld.questions) wordVecs = wordvectors.getWordVectors(ld.questions, wordToIndex) ## End block counter = 0 recall_test_scores = [0.0, 0.0] for fold in folds[0:5]: resetModels() counter += 1 print 'Starting Fold %d' % counter trainQuestions = fold[0] print 'Fold size %d' % len(fold[0]) comTagCombineModelTrain(trainQuestions) print 'Train complete. Beginning test.' testQuestions = fold[1] outfile = open('temp/ctc-out_%d.csv' % counter, 'w+')