from book import Book from nltk.stem.porter import PorterStemmer from nltk import pos_tag from util import getArgMap import sys,re,copy import collections from collections import * from nltk.util import ngrams from keyword_tool import * import nltk from loadFile import * from nltk.tokenize import word_tokenize ps = PorterStemmer() argMap = getArgMap(sys.argv[1:]) bookname = argMap.get('-b','') doc_suffix=argMap.get('-s','') path = '../test/'+bookname+'/' #vocabulary is the original vocabulary #stem2dic is the defaultdict(list) key is the stemming form of the word. values are words #df defaultdict(int) # c is content def exactMatch_byWords(c,vocabulary,df,stem2dic,stopword=()): tf_temp=wordFreq(c,stopword) new_voc=stem2dic.keys() wl=set() tf=defaultdict(int) for w in tf_temp.keys(): if w in stopword or len(w) < 3:
def load_matrix(k):
    """Load precomputed upper-triangular similarity values from disk.

    Reads ``../data/u{k+1}.usersim_method{DIST_FUNC}``, consuming one line
    per row of the module-level matrix ``m``.  Line ``i`` must hold at least
    ``m.shape[0] - i - 1`` whitespace-separated floats; the token at
    position ``j - i - 1`` is stored as the value for the pair ``(i, j)``
    with ``j > i``.  Extra tokens on a line are ignored.

    Relies on the globals ``m`` and ``DIST_FUNC`` being set before the call.

    Args:
        k: Zero-based index; the file name is built from ``k + 1``.

    Returns:
        A list of ``m.shape[0]`` lists of floats, where
        ``result[i][j - i - 1]`` is the value read for the pair ``(i, j)``.
        The last row is always an empty list.

    Raises:
        IOError: If the file cannot be opened.
        IndexError: If a line holds fewer tokens than required for its row.
        ValueError: If a token cannot be parsed as a float.
    """
    size = m.shape[0]
    sim_mat = [[] for _ in range(size)]
    # Single-argument parenthesised print behaves identically on Python 2
    # and Python 3.
    print('loading precompued sim matrix...')
    with open('../data/u{}.usersim_method{}'.format(k + 1, DIST_FUNC)) as f:
        for i in range(size):
            s = f.readline().strip().split()
            # Row i only stores the entries to the right of the diagonal.
            for j in range(i + 1, size):
                sim_mat[i].append(float(s[j - i - 1]))
    print('loading done')
    return sim_mat


# parameters
argMap = getArgMap(sys.argv[1:])
N = int(argMap.get('-n', 10))  # N-nearest neighbors
DIST_FUNC = int(argMap.get('-d', 0))  # 0: cos 1: pearson
AGGREGATION_METHOD = int(argMap.get('-a', 0))  # score aggregation method
PRECALCULATION = int(argMap.get('-p', 0))
USE_COMPUTED_MATRIX = int(argMap.get('-m', 0))
SAVE_RESULTS = int(argMap.get('-save', 0))

if __name__ == "__main__":
    mae_set = []
    rmse_set = []
    # One pass per input file u1.base .. u5.base; ``m`` is rebound as a
    # module global on every pass, which is what load_matrix() reads.
    for i in range(5):
        m, col2rows, rows, cols = construct_ui_matrix(
            data_folder + "u{}.base".format(i + 1))
        # pre-computation
        mean_val = []