def run(source, target, num_topics=100, passes=20, lang='en',
        distance_measure=euclidean, percentage=0.05):
    """
    Main entry point for this package. Contains and executes the whole
    data pipeline: load reviews, filter by language, tokenize, build the
    dictionary and DTM, run LDA, average the per-user posteriors, and write
    one JSON file of close neighbors per user.

    Arguments:
    source -- The path string to the source file containing all reviews,
              one JSON document per line (blank lines are skipped)
    target -- The path string to the target directory where the neighbors
              for all users will be saved. An empty string writes the files
              relative to the current working directory. The directory is
              not created here, so it must already exist.

    Keyword arguments:
    num_topics -- The number of topics LDA is supposed to discover
                  (default 100)
    passes -- The number of iterations for the statistical inference
              algorithm (default 20)
    lang -- The language the reviews shall be sorted by (default 'en')
    distance_measure -- A python function that measures the distance between
                        two vectors in a num_topics-dimensional vector
                        space. Must take two numpy arrays and return a
                        float. (default euclidean)
    percentage -- The cutoff for being a close neighbor, i.e. two users are
                  close if their distance is within the closest percentage
                  percent of all distances (default 0.05)
    """
    # Function-scope import so this block does not rely on the file-level
    # import list, which is not visible from here.
    import os

    # Load the reviews; a trailing or stray blank line would otherwise make
    # json.loads raise.
    with open(source) as f:
        all_reviews = [json.loads(line) for line in f if line.strip()]

    reviews = filter_by_language(all_reviews, lang)

    # Tokenize -> dictionary -> DTM -> LDA model.
    rt = ReviewTokenizer(reviews)
    rt.tokenize()

    db = DictionaryBuilder(rt.tokenized_docs)
    db.build()

    dtmb = DTMBuilder(db.dictionary, db.srcTexts)
    dtmb.build()

    ldaw = LDAWrapper(dtmb.dtm, db.dictionary)
    ldaw.run(num_topics=num_topics, passes=passes)

    modelwrapper = LDAModelWrapper(ldaw.ldamodel, db.dictionary,
                                   sortByUsers(rt.tokenized_docs))
    posteriors = modelwrapper.get_all_posteriors()

    # .items() instead of .iteritems(): works on both Python 2 and 3.
    means = {key: mean(value).tolist() for key, value in posteriors.items()}

    recommender = Recommender(means)
    distances = recommender.calc_distances(distance_measure)
    cutoff = fivePercent(distances, percentage)

    # Iterating the dict directly yields its keys on Python 2 and 3 alike.
    for user in means:
        # NOTE: 'threshhold' (sic) is the keyword Recommender.calc_neighbors
        # is called with in the original code; keep the spelling.
        neighbors = recommender.calc_neighbors(user, distance_measure,
                                               threshhold=cutoff)
        # os.path.join returns just the file name when target is empty, so
        # no special case for an empty target is needed.
        file_name = os.path.join(target, user + '.json')
        with open(file_name, 'w') as g:
            json.dump(neighbors, g)
import json
import os

from gensim import corpora
import numpy as np

from ldamodelwrapper import LdaModelWrapper as LMW
from recommender import Recommender


def euclidean(x, y):
    """Return the Euclidean (L2) distance between the vectors x and y.

    Both arguments are coerced with np.asarray, so numpy arrays behave as
    before and plain sequences (e.g. lists decoded from JSON) work as well.
    """
    diff = np.asarray(x) - np.asarray(y)
    return np.sqrt(np.sum(diff ** 2))


# Collect the users belonging to the current partition. Each line of the part
# file is a JSON object; its first key is taken as the user.
# NOTE(review): presumably each object holds exactly one key - confirm.
userCurrPart = []
with open('parts/part5.json') as part_file:
    for line in part_file:
        dct = json.loads(line)
        # next(iter(...)) instead of dct.keys()[0]: dict views are not
        # indexable on Python 3, while this form works on Python 2 and 3.
        key = next(iter(dct))
        userCurrPart.append(key)

# Load the means that the recommender is built from.
with open('means.json') as means_file:
    means = json.loads(means_file.read())

x = Recommender(means)

# Write one JSON file of close neighbors per user in the partition, using a
# fixed distance threshold of 0.21.
for user in userCurrPart:
    y = x.calc_neighbors(user, euclidean, threshhold = 0.21)
    with open('close_neighbors/close_neighbors_neighbors_' + user + '.json', 'w') as out_file:
        json.dump(y, out_file)