def test(test_data_file, model, task):
    """Build test features for ``task``, predict with ``model`` and emit a ranking.

    Args:
        test_data_file: Path of the query/url feature file to rank.
        model: Trained model handed to the task-specific testing helper
            (or whose ``predict`` is called directly in the fallback branch).
        task: Integer selecting the pipeline:
            1 -> pointwise features and pointwise testing,
            2 -> pairwise features and pairwise testing,
            3 -> pairwise features extended with BM25F scores and window sizes,
            4 -> extra credit (currently the same calls as task 1),
            anything else -> a tiny hard-coded example fed to ``model.predict``.

    The ranking is built with ``DocUtils.getRankedQueries`` and handed to
    ``DocUtils.printRankedResults`` together with the name "ranked.txt".

    NOTE(review): the task 3 branch reads ``corpus`` and ``extraFeaturesInfo``
    from the enclosing (module) scope; they are not parameters of this
    function -- confirm they are initialised before ``test`` is called.
    """
    sys.stderr.write('\n## Testing with feature_file = %s ... \n' % (test_data_file))

    if task == 1:
        # Step (1): construct the test feature arrays.
        (X, queries, index_map) = pointwise_test_features(test_data_file)
        # Step (2): predict.
        y = pointwise_testing(X, model)
    elif task == 2:
        # Step (1): construct the test feature arrays.
        (X, queries, index_map) = pairwise_test_features(test_data_file)
        # Step (2): predict.
        y = pairwise_testing(X, model)
    elif task == 3:
        # Add more features.
        # sys.stderr.write is used instead of the Python 2-only
        # ``print >> stream`` statement; the extra "\n" reproduces the
        # newline that ``print`` appended.
        sys.stderr.write("Task 3\n" + "\n")

        # Generate the BM25F score and window-size files for test_data_file,
        # then load both into the shared extraFeaturesInfo object.
        bm25f_scores_output_file = "bm25f_scores.txt"
        Pa3Utils.generateBM25FScoreFile(test_data_file, bm25f_scores_output_file, corpus)
        window_sizes_output_file = "window_sizes.txt"
        Pa3Utils.generateWindowSizesFile(test_data_file, window_sizes_output_file, corpus)
        extraFeaturesInfo.load(bm25f_scores_output_file, window_sizes_output_file)

        # Pointwise alternative, kept disabled as in the original:
        # (X, queries, index_map) = pointwise_test_features(test_data_file, extraFeaturesInfo)
        # y = pointwise_testing(X, model)

        # Step (1): construct the test feature arrays.
        (X, queries, index_map) = pairwise_test_features(test_data_file, extraFeaturesInfo)
        # Step (2): predict.
        y = pairwise_testing(X, model)
    elif task == 4:
        # Extra credit.
        sys.stderr.write("Extra credit\n" + "\n")
        # Step (1): construct the test feature arrays.
        (X, queries, index_map) = pointwise_test_features(test_data_file)
        # Step (2): predict.
        y = pointwise_testing(X, model)
    else:
        # Unknown task: run the model on a fixed two-query example.
        queries = ['query1', 'query2']
        index_map = {'query1': {'url1': 0}, 'query2': {'url2': 1}}
        X = [[0.5, 0.5], [1.5, 1.5]]
        y = model.predict(X)

    # Step (3): output the ranking in the format scored by the ndcg.py code.
    rankedQueries = DocUtils.getRankedQueries(queries, index_map, y)
    DocUtils.printRankedResults(rankedQueries, "ranked.txt")
def generateBM25FScoreFile(queryUrlFeaturesFile, bm25fScoreFile, corpus):
    """Score every query/url pair with BM25F and print the results.

    Args:
        queryUrlFeaturesFile: Feature file passed to ``DocUtils.extractFeatures``.
        bm25fScoreFile: Output name passed to ``Pa3Utils.printResults``
            (e.g. "bm25f_scores.txt").
        corpus: Corpus object forwarded to
            ``Pa3Utils.bm25fRankQueries_withScores``.

    Side effect: overwrites the BM25F tuning attributes on the
    ``QueryPageBM25F`` class before scoring.
    """
    # Populate the feature map from file; the query list is not needed here.
    _, feature_map = DocUtils.extractFeatures(queryUrlFeaturesFile)

    # Per-field parameters, ordered [url, title, header, body, anchor].
    QueryPageBM25F.bm25f_B = [1.0, 0.1, 1.0, 1.0, 0.1]
    QueryPageBM25F.bm25f_W = [1.0, 0.9, 0.8, 0.9, 0.7]

    # Scalar tuning parameters and the V_f function.
    QueryPageBM25F.K1 = 1
    QueryPageBM25F.lamd = 3.0
    QueryPageBM25F.lamd_prime = 2.0
    QueryPageBM25F.lamd_prime2 = 1.0
    QueryPageBM25F.Vf = Pa3Utils.v_logarithmic

    avg_field_lengths = Pa3Utils.features_avg_len(feature_map)
    scored_queries = Pa3Utils.bm25fRankQueries_withScores(
        feature_map, avg_field_lengths, corpus
    )
    Pa3Utils.printResults(scored_queries, bm25fScoreFile)
def generateWindowSizesFile(queryUrlFeaturesFile, windowSizesFile, corpus):
    """Compute per-page window sizes for every query and print them.

    For each query in the feature file, each of its pages is run through
    ``Pa3Utils.findSmallestWindow``. The returned list of window sizes is
    written as "<page> <size> <size> ...", with any size equal to
    ``sys.maxsize`` replaced by 0.

    Args:
        queryUrlFeaturesFile: Feature file passed to ``DocUtils.extractFeatures``.
        windowSizesFile: Output name passed to ``Pa3Utils.printResults``
            (e.g. "window_sizes.txt").
        corpus: Accepted for signature parity with the other generators;
            not used by this function.
    """
    # Populate the feature map from file; the query list is not needed here.
    _, features = DocUtils.extractFeatures(queryUrlFeaturesFile)

    # Value treated as "no window"; it is written out as 0 instead.
    no_window = sys.maxsize

    window_sizes = {}
    for query in features:
        queryObject = Query(query, features[query])
        page_lines = []
        # .items() rather than the Python 2-only .iteritems(), so the loop
        # runs on both Python 2 and Python 3.
        # NOTE(review): assumes queryObject.pages is a dict-like mapping.
        for pageStr, pageObject in queryObject.pages.items():
            # Only the full list of window sizes is used; the first return
            # value is ignored.
            _, windowSizesList = Pa3Utils.findSmallestWindow(queryObject, pageObject)
            sizes_str = " ".join(
                str(0 if w == no_window else w) for w in windowSizesList
            )
            page_lines.append(pageStr + " " + sizes_str)
        window_sizes[query] = page_lines

    Pa3Utils.printResults(window_sizes, windowSizesFile)
def pairwise_test_features(test_data_file, extraFeaturesInfo=None):
    """Build the pairwise test inputs for ``test_data_file``.

    Delegates to ``DocUtils.extractX_pairWise``, passing the ``corpus``
    object from the enclosing scope and the optional ``extraFeaturesInfo``.

    Returns:
        A ``(X, queries, index_map)`` tuple as produced by the extractor.
    """
    feature_rows, query_list, url_index = DocUtils.extractX_pairWise(
        test_data_file, corpus, extraFeaturesInfo
    )
    return (feature_rows, query_list, url_index)
def pairwise_train_features(train_data_file, train_rel_file, extraFeaturesInfo=None):
    """Build the pairwise training inputs from feature and relevance files.

    Delegates to ``DocUtils.extractXy_pairWise``, passing the ``corpus``
    object from the enclosing scope and the optional ``extraFeaturesInfo``.

    Returns:
        An ``(X, y)`` tuple as produced by the extractor.
    """
    feature_rows, labels = DocUtils.extractXy_pairWise(
        train_data_file, train_rel_file, corpus, extraFeaturesInfo
    )
    return (feature_rows, labels)