def generateBM25FScoreFile(queryUrlFeaturesFile, bm25fScoreFile, corpus):
    """Rank the pages of every query with BM25F and write the scored ranking.

    Args:
        queryUrlFeaturesFile: features file handed to
            DocUtils.extractFeatures.
        bm25fScoreFile: output file name handed to Pa3Utils.printResults
            (e.g. "bm25f_scores.txt").
        corpus: passed through unchanged to
            Pa3Utils.bm25fRankQueries_withScores.

    Note:
        The tuning values below are assigned as attributes on
        QueryPageBM25F itself, so they stay in effect after this function
        returns.
    """
    # Only the per-query feature map is needed; the first element of the
    # returned pair is not used here.
    _, features = DocUtils.extractFeatures(queryUrlFeaturesFile)

    # Per-field parameter vectors, in the order
    # [url, title, header, body, anchor].
    # NOTE(review): presumably B = per-field length normalisation and
    # W = per-field weight, as in standard BM25F -- confirm against
    # QueryPageBM25F.
    QueryPageBM25F.bm25f_B = [1.0, 0.1, 1.0, 1.0, 0.1]
    QueryPageBM25F.bm25f_W = [1.0, 0.9, 0.8, 0.9, 0.7]

    # Scalar tuning parameters and the V_f function used by the scorer.
    QueryPageBM25F.K1 = 1
    QueryPageBM25F.lamd = 3.0
    QueryPageBM25F.lamd_prime = 2.0
    QueryPageBM25F.lamd_prime2 = 1.0
    QueryPageBM25F.Vf = Pa3Utils.v_logarithmic

    fields_avg_len = Pa3Utils.features_avg_len(features)
    rankedQueries = Pa3Utils.bm25fRankQueries_withScores(
        features, fields_avg_len, corpus)
    Pa3Utils.printResults(rankedQueries, bm25fScoreFile)
def generateWindowSizesFile(queryUrlFeaturesFile, windowSizesFile, corpus):
    """Write, for every query, one "<url> <size> <size> ..." line per page.

    The sizes come from the list returned by Pa3Utils.findSmallestWindow;
    any entry equal to sys.maxsize is written as 0 instead.

    Args:
        queryUrlFeaturesFile: features file handed to
            DocUtils.extractFeatures.
        windowSizesFile: output file name handed to Pa3Utils.printResults
            (e.g. "window_sizes.txt").
        corpus: not used by this function.
    """
    # Only the per-query feature map is needed; the first element of the
    # returned pair is not used here.
    _, features = DocUtils.extractFeatures(queryUrlFeaturesFile)

    # Sentinel value that gets rewritten to 0 in the output.
    # NOTE(review): presumably findSmallestWindow uses sys.maxsize to mean
    # "no window found" -- confirm against Pa3Utils.
    no_window = sys.maxsize

    sizes_by_query = {}
    for query in features:
        query_obj = Query(query, features[query])
        page_lines = []
        for page_url, page in query_obj.pages.iteritems():
            # The first element of the result (the overall smallest window)
            # is not needed; only the list of sizes is written out.
            _, sizes = Pa3Utils.findSmallestWindow(query_obj, page)
            sizes_text = " ".join(
                str(0 if size == no_window else size) for size in sizes)
            page_lines.append(page_url + " " + sizes_text)
        sizes_by_query[query] = page_lines

    Pa3Utils.printResults(sizes_by_query, windowSizesFile)