import heapq

# Project-local dependencies (assumed from how they are used below): APIREC
# holds the extracted target_context entries and the candidate list P_list,
# score() scores a candidate against the change and code contexts, and
# Mining.atmoic_change() rebuilds the ground-truth atomic change for a target.


def predict(total_number):
    hit = 0
    for n in range(total_number):
        target = APIREC.target_context[n][0]
        change_context = APIREC.target_context[n][1]
        code_context = APIREC.target_context[n][2]

        # Score every candidate API change against this transaction's contexts.
        h = []
        for candidate in APIREC.P_list:
            candidate_target = dict(target)
            candidate_target['label'] = candidate[2]
            _score = score(candidate_target, change_context, code_context)
            heapq.heappush(h, (_score, candidate))

        # Count a hit if the ground-truth change appears among the top-10 candidates.
        top10 = heapq.nlargest(10, h)
        prediction = Mining.atmoic_change(target)
        for item in top10:
            if item[1] == prediction:
                hit += 1
                break
    print("Final prediction accuracy: " + str(1.0 * hit / total_number))
def _predict(n):
    """Score transaction n against every candidate, using the module-level
    globals `weights` (w_change) and `top` (list of five top-k cutoffs)."""
    global top
    global weights
    target = APIREC.target_context[n][0]
    change_context = APIREC.target_context[n][1]
    code_context = APIREC.target_context[n][2]

    h = []
    for candidate in APIREC.P_list:
        candidate_target = dict(target)
        candidate_target['label'] = candidate[2]
        change_score = APIREC.change_score_transaction(candidate_target, change_context)
        code_score = APIREC.code_score_transaction(candidate_target, code_context)
        w_change = weights[0]
        w_code = 1 - w_change
        _score = w_change * change_score + w_code * code_score
        heapq.heappush(h, (_score, candidate))

    # One hit counter per top-k cutoff in `top`.
    hit = [0] * 5
    prediction = Mining.atmoic_change(target)
    for i in range(5):
        topX = heapq.nlargest(top[i], h)
        for item in topX:
            if item[1] == prediction:
                hit[i] += 1
                break
    return hit
def _predict(n, training=[0.5]):
    """Same as above, but evaluates every w_change value in `training`
    in a single pass and returns one top-10 hit count per value."""
    target = APIREC.target_context[n][0]
    change_context = APIREC.target_context[n][1]
    code_context = APIREC.target_context[n][2]

    # One heap of scored candidates per candidate weight.
    h = [[] for _ in training]
    for candidate in APIREC.P_list:
        candidate_target = dict(target)
        candidate_target['label'] = candidate[2]
        change_score = APIREC.change_score_transaction(candidate_target, change_context)
        code_score = APIREC.code_score_transaction(candidate_target, code_context)
        for i in range(len(training)):
            w_change = training[i]
            w_code = 1 - w_change
            _score = w_change * change_score + w_code * code_score
            heapq.heappush(h[i], (_score, candidate))

    hit = [0] * len(training)
    prediction = Mining.atmoic_change(target)
    for i in range(len(training)):
        top10 = heapq.nlargest(10, h[i])
        for item in top10:
            if item[1] == prediction:
                hit[i] += 1
                break
    return hit
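# Usage sketch (not in the original source): sweep several candidate values of
# the change/code interpolation weight over the whole test set and report the
# top-10 accuracy for each. The weight grid and the assumption that
# APIREC.target_context has already been populated with total_number entries
# are mine.
def sweep_weights(total_number, candidates=(0.1, 0.3, 0.5, 0.7, 0.9)):
    candidates = list(candidates)
    totals = [0] * len(candidates)
    for n in range(total_number):
        hits = _predict(n, training=candidates)
        totals = [a + b for a, b in zip(totals, hits)]
    for w, h in zip(candidates, totals):
        print("w_change={}: top-10 accuracy {}".format(w, 1.0 * h / total_number))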
import heapq
import operator

import numpy as np
import theano

# init_tparams, build_model, load_data, padMatrixWithTime, padMatrixWithoutTime,
# recallTop and calculate_r_squared are assumed to be defined elsewhere in this script.


def test_doctorAI(modelFile='model.txt',
                  seqFile='seq.txt',
                  inputDimSize=20000,
                  labelFile='label.txt',
                  numClass=500,
                  timeFile='',
                  predictTime=False,
                  useLogTime=True,
                  hiddenDimSize=[200, 200],
                  batchSize=100,
                  logEps=1e-8,
                  mean_duration=20.0,
                  verbose=False):
    options = locals().copy()

    if len(timeFile) > 0:
        useTime = True
    else:
        useTime = False
    options['useTime'] = useTime

    models = np.load(modelFile)
    tparams = init_tparams(models)

    print 'build model ... ',
    if predictTime:
        x, t, mask, codePred, timePred = build_model(tparams, options)
        predict_code = theano.function(inputs=[x, t, mask], outputs=codePred, name='predict_code')
        predict_time = theano.function(inputs=[x, t, mask], outputs=timePred, name='predict_time')
    elif useTime:
        x, t, mask, codePred = build_model(tparams, options)
        predict_code = theano.function(inputs=[x, t, mask], outputs=codePred, name='predict_code')
    else:
        x, mask, codePred = build_model(tparams, options)
        predict_code = theano.function(inputs=[x, mask], outputs=codePred, name='predict_code')

    options['inputDimSize'] = models['W_emb'].shape[0]
    options['numClass'] = models['b_output'].shape[0]

    print 'load data ... ',
    testSet = load_data(seqFile, labelFile, timeFile)
    n_batches = int(np.ceil(float(len(testSet[0])) / float(batchSize)))
    print 'done'

    predVec = []
    trueVec = []
    predTimeVec = []
    trueTimeVec = []
    iteration = 0
    for batchIndex in range(n_batches):
        tempX = testSet[0][batchIndex * batchSize:(batchIndex + 1) * batchSize]
        tempY = testSet[1][batchIndex * batchSize:(batchIndex + 1) * batchSize]
        if predictTime:
            tempT = testSet[2][batchIndex * batchSize:(batchIndex + 1) * batchSize]
            x, t, mask, lengths = padMatrixWithTime(tempX, tempT, options)
            codeResults = predict_code(x, t, mask)
            timeResults = predict_time(x, t, mask)
        elif useTime:
            tempT = testSet[2][batchIndex * batchSize:(batchIndex + 1) * batchSize]
            x, t, mask, lengths = padMatrixWithTime(tempX, tempT, options)
            codeResults = predict_code(x, t, mask)
        else:
            x, mask, lengths = padMatrixWithoutTime(tempX, options)
            codeResults = predict_code(x, mask)

        # Collect the true codes and the top-30 predicted codes for every visit.
        for i in range(codeResults.shape[1]):
            tensorMatrix = codeResults[:, i, :]
            thisY = tempY[i][1:]
            for timeIndex in range(lengths[i]):
                if len(thisY[timeIndex]) == 0:
                    continue
                trueVec.append(thisY[timeIndex])
                output = tensorMatrix[timeIndex]
                predVec.append(zip(*heapq.nlargest(30, enumerate(output), key=operator.itemgetter(1)))[0])

        if predictTime:
            for i in range(timeResults.shape[1]):
                timeVec = timeResults[:, i]
                trueTimeVec.extend(tempT[i][1:])
                for timeIndex in range(lengths[i]):
                    predTimeVec.append(timeVec[timeIndex])

        if (iteration % 10 == 0) and verbose:
            print 'iteration:%d/%d' % (iteration, n_batches)
        iteration += 1
        if iteration == 10:  # evaluate only the first 10 batches
            break

    recall = recallTop(trueVec, predVec)
    print 'recall@10:%f, recall@20:%f, recall@30:%f' % (recall[0], recall[1], recall[2])

    if predictTime:
        r_squared = calculate_r_squared(trueTimeVec, predTimeVec, options)
        print 'R2:%f' % r_squared
import heapq
import operator
import pickle

import numpy as np
import theano
from theano import config


def test_doctorAI(modelFile='model.txt',
                  seqFile='seq.txt',
                  inputDimSize=20000,
                  labelFile='label.txt',
                  numClass=500,
                  timeFile='',
                  predictTime=False,
                  useLogTime=True,
                  hiddenDimSize=[200, 200],
                  batchSize=100,
                  logEps=1e-8,
                  mean_duration=20.0,
                  verbose=False,
                  embFile='embFile.txt',
                  topicSize=20,
                  topicFile='',
                  experiment='1'):
    options = locals().copy()

    if len(timeFile) > 0:
        useTime = True
    else:
        useTime = False
    options['useTime'] = useTime

    if len(topicFile) > 0:
        useTopics = True
    else:
        useTopics = False
    options['useTopics'] = useTopics

    models = np.load(modelFile)
    tparams = init_tparams(models)
    # print tparams.keys()

    def load_embedding(infile):
        Wemb = np.array(pickle.load(open(infile, 'rb'))).astype(config.floatX)
        return Wemb

    if len(embFile) > 0:
        W_emb = load_embedding(embFile)
    else:
        W_emb = None

    print 'build model ... ',
    if useTime and useTopics:
        x, t, topics, mask, codePred = build_model(tparams, options, W_emb)
        predict_code = theano.function(inputs=[x, t, topics, mask], outputs=codePred, name='predict_code')
    elif useTime:
        x, t, topics, mask, codePred = build_model(tparams, options, W_emb)
        predict_code = theano.function(inputs=[x, t, mask], outputs=codePred, name='predict_code')
    elif useTopics:
        x, topics, mask, codePred = build_model(tparams, options, W_emb)
        predict_code = theano.function(inputs=[x, topics, mask], outputs=codePred, name='predict_code')
    else:
        x, topics, mask, codePred = build_model(tparams, options, W_emb)
        predict_code = theano.function(inputs=[x, mask], outputs=codePred, name='predict_code')

    if 'W_emb' in models.keys():
        options['inputDimSize'] = models['W_emb'].shape[0]
    elif W_emb is not None:
        options['inputDimSize'] = W_emb.shape[0]
    options['numClass'] = models['b_output'].shape[0]

    print 'load data ... ',
    testSet = load_data(seqFile, labelFile, timeFile, topicFile)
    n_batches = int(np.ceil(float(len(testSet[0])) / float(batchSize)))
    print 'done'

    predVec = []
    trueVec = []
    predTimeVec = []
    trueTimeVec = []
    iteration = 0
    for batchIndex in range(n_batches):
        tempX = testSet[0][batchIndex * batchSize:(batchIndex + 1) * batchSize]
        tempY = testSet[1][batchIndex * batchSize:(batchIndex + 1) * batchSize]
        if useTime and useTopics:
            tempT = testSet[2][batchIndex * batchSize:(batchIndex + 1) * batchSize]
            tempTopics = testSet[3][batchIndex * batchSize:(batchIndex + 1) * batchSize]
            x, t, topics, mask, lengths = padMatrixWithTime(tempX, tempT, options, tempTopics)
            codeResults = predict_code(x, t, topics, mask)
        elif useTime:
            tempT = testSet[2][batchIndex * batchSize:(batchIndex + 1) * batchSize]
            x, t, mask, lengths = padMatrixWithTime(tempX, tempT, options)
            codeResults = predict_code(x, t, mask)
        elif useTopics:
            tempTopics = testSet[3][batchIndex * batchSize:(batchIndex + 1) * batchSize]
            x, topics, mask, lengths = padMatrixWithoutTime(tempX, options, tempTopics)  # todo
            codeResults = predict_code(x, topics, mask)
        else:
            x, mask, lengths = padMatrixWithoutTime(tempX, options)
            codeResults = predict_code(x, mask)

        # Collect the true codes and the top-30 predicted codes for every visit.
        for i in range(codeResults.shape[1]):
            tensorMatrix = codeResults[:, i, :]
            thisY = tempY[i][1:]
            for timeIndex in range(lengths[i]):
                if len(thisY[timeIndex]) == 0:
                    continue
                trueVec.append(thisY[timeIndex])
                output = tensorMatrix[timeIndex]
                predVec.append(zip(*heapq.nlargest(30, enumerate(output), key=operator.itemgetter(1)))[0])

        if (iteration % 10 == 0) and verbose:
            print 'iteration:%d/%d' % (iteration, n_batches)
        iteration += 1
        if iteration == 10:  # evaluate only the first 10 batches
            break

    # pickle.dump(trueVec, open('experiments/trueVec_exp' + str(experiment), 'wb'))
    # pickle.dump(predVec, open('experiments/predVec_exp' + str(experiment), 'wb'))

    recall = recallTop(trueVec, predVec)
    # for cross validation
    # pickle.dump(recall, open('experiments-cv/recall' + str(experiment), 'wb'))
    # for experiments
    pickle.dump(recall, open('experiments/recall' + str(experiment), 'wb'))
    print 'recall@10:%f, recall@20:%f, recall@30:%f' % (recall[0], recall[1], recall[2])
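# Example invocation sketch (an assumption, not part of the original script):
# the file names below are hypothetical placeholders for a trained model archive
# and the pickled test sequence/label files produced by the preprocessing step.
if __name__ == '__main__':
    test_doctorAI(modelFile='doctorAI.model.npz',
                  seqFile='visits.test',
                  labelFile='labels.test',
                  timeFile='',
                  topicFile='',
                  embFile='',
                  verbose=True)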