def user_agg(si=None):
    '''
    Loads search.gl and aggregates it by UserID to get some features.
    NB: this did not help.
    '''
    start = datetime.now()
    if si is None:
        si = load('search.gl')
    # Hash search queries into D buckets.
    D = 2**20
    si['SQexists'] = si['SearchQuery'].apply(lambda s: s != '')
    si['SQhash'] = si['SearchQuery'].apply(lambda s: abs(hash(s)) % D)
    si['SPexists'] = si['SearchParams'].apply(lambda d: d is not None)
    # Per-user aggregations.
    f = {'pctSQE'      : agg.AVG('SQexists'),
         'pctSPE'      : agg.AVG('SPexists'),
         'numSearches' : agg.COUNT(),
         'allCat'      : agg.CONCAT('CategoryID'),
         'allSQ'       : agg.CONCAT('SQhash')}
    si = si[['UserID', 'CategoryID', 'SearchParams',
             'SQexists', 'SPexists', 'SQhash']]
    usr = si.groupby('UserID', f)
    # De-duplicate the concatenated lists.
    usr['allSQ'] = usr['allSQ'].apply(lambda l: list(set(l)))
    usr['allCat'] = usr['allCat'].apply(lambda l: list(set(l)))
    usr_dict = sframe_to_dict('UserID', usr)
    avito2_io.put_artifact(usr_dict, 'user_si.pkl')
    print('elapsed time: %s' % (datetime.now() - start))
def _progress_multi_combiner(results):
    res = results.unpack('metadata').unpack('parameters')
    metadatas = [c for c in res.column_names() if c.startswith('metadata')]
    context = [c for c in res.column_names() if c.startswith('parameters')]

    # Unpack metrics if possible
    try:
        res = res.unpack('metric')
        metrics = [c for c in res.column_names() if c.startswith('metric')]
    except:
        metrics = ['metric']

    metadatas.sort()
    context.sort()
    metrics.sort()
    res = res[metadatas + context + metrics]

    # Get aggregators for all metrics
    aggs = {}
    for m in metrics:
        aggs['mean_' + m] = _agg.MEAN(m)
    for m in metadatas:
        aggs[m] = _agg.CONCAT(m)
    aggs['num_folds'] = _agg.COUNT()
    res = res.groupby(context, aggs)

    # Clean up column names
    for s in ['parameters.', 'metric.', 'metadata.']:
        res = res.rename({c: c.replace(s, '') for c in res.column_names()})
    return res
def summary_by_precision(self, cutoffs=[5, 10], num_users_to_sample=20):
    """
    Create a histogram of precision values across users.

    Parameters
    ----------
    cutoffs : list, optional
        Cutoff values at which precision and recall are computed.

    num_users_to_sample : int, optional
        For each cutoff and each bin of precision scores, sample the
        provided number of users.

    See Also
    --------
    graphlab.evaluation.precision
    """
    results = self._pr_results
    if not set(cutoffs).issubset(set(self.cutoffs)):
        results = self.evaluate_precision_recall(cutoffs)

    num_items = self._item_counts.num_rows()

    out = {}
    for model_name, result in _six.iteritems(results):
        by_user = result['precision_recall_by_user'].filter_by(cutoffs, 'cutoff')
        by_user['num_recs'] = by_user['cutoff'].apply(lambda x: min(x, num_items))
        by_user['num_correct'] = (by_user['precision'] *
                                  by_user['num_recs']).astype(int)
        # Bin precision and recall to two decimal places.
        by_user['precision'] = by_user['precision'].apply(
            lambda x: _math.floor(x * 100) / 100)
        by_user['recall'] = by_user['recall'].apply(
            lambda x: _math.floor(x * 100) / 100)

        cols = ['cutoff', 'num_correct', 'precision', 'recall']
        by_user = by_user.groupby(cols, {
            'num_users': _aggregate.COUNT,
            'random_user_sample': _aggregate.CONCAT(self._user_column)})
        by_user['random_user_sample'] = by_user['random_user_sample'].apply(
            lambda x: x[:num_users_to_sample], dtype=list)

        out[model_name] = by_user.sort(['cutoff', 'precision'])
    return out
def predict_price():
    title = session['title']
    description = session['description']

    # Build dataframe to vectorize input data
    sf = graphlab.SFrame({'title': [title], 'description': [description]})
    sf = count_words(sf)

    filename = app.config['UPLOAD_FOLDER'] + request.args['filename']
    image_sf = image_deep_features(filename, deep_learning_model)
    sf['deep_features'] = image_sf['deep_features']

    # Define category
    category = boosted_trees_category_classifier.predict(sf, output_type='class')[0]

    # Define data class
    if category == 'Cell Phones':
        topic_model = topic_model_phones
        price_model = boosted_trees_regression_for_phones
        neighbors_model = similar_images_for_phones
        vectorizer = vectorizer_phones
        data = phones
        category_name = 'phones'
    elif category in ['Furniture', 'Household', 'Home & Garden']:
        topic_model = topic_model_home
        price_model = boosted_trees_regression_for_home
        neighbors_model = similar_images_for_home
        vectorizer = vectorizer_home
        data = home
        category_name = 'home'
    else:  # 'Baby & Kids', 'Clothing & Shoes'
        topic_model = topic_model_apparel
        price_model = boosted_trees_regression_for_apparel
        neighbors_model = similar_images_for_apparel
        vectorizer = vectorizer_apparel
        data = apparel
        category_name = 'apparel'

    # Add topic fields
    sf = add_topic_fields(sf, topic_model)

    # Add TF-IDF
    transformed_sf = vectorizer.transform(sf)
    sf['tfidf'] = transformed_sf['count_words']

    # Predict price
    price = round(price_model.predict(sf)[0])

    # Find nearest neighbors
    neighbors = neighbors_model.query(sf, k=5)
    neighbors = neighbors.groupby(
        key_columns='query_label',
        operations={"neighbours": agg.CONCAT("reference_label")})
    neighbors_lst = neighbors['neighbours'][0]
    similar_offers = data.filter_by(neighbors_lst, 'id')
    similar_offers['image_path'] = similar_offers['id'].apply(
        lambda x: IMAGES_FOLDER + category_name + "/" + str(x) + '.jpg')

    return render_template('price.html', price=price, category=category,
                           image=filename, offers=similar_offers)
import graphlab
import pickle
from graphlab import aggregate as agg
import itertools

authors = graphlab.SFrame(
    './170331_PURE_Data_Challenge/PURE Data Challenge/authors.csv')
pub_authors = authors.groupby(
    key_columns='PERSON_ID',
    operations={'publications': agg.CONCAT('PUBLICATION_ID')})

solo_count = 0
#links = dict()
# Link every pair of publications that share an author.
links = [l for pub in pub_authors
         for l in itertools.combinations(pub['publications'], 2)]

print "Established %d links in the publication network." % len(links)

f = open('publication_net_links_dict.pkl', 'wb')
pickle.dump(file=f, obj=links)
f.close()
import graphlab
import pickle
from graphlab import aggregate as agg

authors = graphlab.SFrame(
    './170331_PURE_Data_Challenge/PURE Data Challenge/authors.csv')
pub_authors = authors.groupby(
    key_columns='PUBLICATION_ID',
    operations={'authors': agg.CONCAT('PERSON_ID')})

f = open('./publication_net_links_dict.pkl', 'rb')
_pub_links = pickle.load(f)
f.close()

# Weight each link by the number of authors shared by its two publications.
def _shared_author_count(link):
    authors0 = pub_authors[pub_authors['PUBLICATION_ID'] == link[0]]['authors'][0]
    authors1 = pub_authors[pub_authors['PUBLICATION_ID'] == link[1]]['authors'][0]
    return len([author for author in authors0 if author in authors1])

weights = [_shared_author_count(link) for link in _pub_links]

f = open('publication_net_links_weights.pkl', 'wb')
pickle.dump(file=f, obj=weights)
f.close()

links = dict(zip(_pub_links, weights))
f = open('publication_net_links_dict_comp.pkl', 'wb')
pickle.dump(file=f, obj=links)
f.close()
import random

import graphlab
from graphlab import aggregate as agg

train_data = graphlab.SFrame(data="FILE_PATH/rating_final.csv")

# Train model
graphlab_model = graphlab.recommender.create(train_data,
                                             user_id='userID',
                                             item_id='placeID',
                                             target='rating')

# Make recommendations
graphlab_recomm = graphlab_model.recommend()
#graphlab_recomm.print_rows(num_rows=45)
graphlab_recomm.remove_columns(['score', 'rank'])

# Collect each user's recommended placeIDs into a single list per user.
graphlab_recomm = graphlab_recomm.groupby(
    key_columns='userID',
    operations={'placeIDs': agg.CONCAT('placeID')})

df = graphlab_recomm.to_dataframe().set_index('userID')
recommendations = df['placeIDs'].to_dict()

print "Enter user ID : "
current_user = raw_input()
if current_user not in recommendations:
    print "User does not exist!"
    exit()

print "Your surprise me option is :" + str(
    random.sample(recommendations[current_user], 1)[0])