Example #1
from datetime import datetime
from graphlab import aggregate as agg
# 'load', 'sframe_to_dict' and 'avito2_io' are project-specific helpers
# defined elsewhere in this repository.

def user_agg(si=None):
  '''
  Loads search.gl and aggregates it by UserID to get some features.
  NB: this did not help.
  '''
  start = datetime.now()
  if si is None:
    si = load('search.gl')
  D = 2**20
  si['SQexists'] = si['SearchQuery'].apply(lambda s : s != '')
  si['SQhash']   = si['SearchQuery'].apply(lambda s : abs(hash(s)) % D)
  si['SPexists'] = si['SearchParams'].apply(lambda d : d is not None)
  
  f = {'pctSQE'      : agg.AVG('SQexists'),
       'pctSPE'      : agg.AVG('SPexists'),
       'numSearches' : agg.COUNT(),
       'allCat'      : agg.CONCAT('CategoryID'),
       'allSQ'       : agg.CONCAT('SQhash')}
       
  si = si[['UserID', 
           'CategoryID', 
           'SearchParams', 
           'SQexists', 
           'SPexists', 
           'SQhash']]
  usr = si.groupby('UserID', f)
  usr['allSQ']  = usr['allSQ'].apply(lambda l : list(set(l)))
  usr['allCat'] = usr['allCat'].apply(lambda l : list(set(l)))
  usr_dict = sframe_to_dict('UserID', usr)
  avito2_io.put_artifact(usr_dict, 'user_si.pkl')
  print('elapsed time: %s' % (datetime.now() - start))
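
Here agg.CONCAT collects the grouped column's values into one list per key, duplicates included, which is why the function deduplicates with set afterwards. A minimal sketch of that pattern on made-up data (assuming GraphLab Create is installed):

import graphlab
from graphlab import aggregate as agg

toy = graphlab.SFrame({'UserID': [1, 1, 2],
                       'CategoryID': [7, 7, 9]})
per_user = toy.groupby('UserID', {'allCat': agg.CONCAT('CategoryID')})
# CONCAT keeps duplicates (user 1 gets [7, 7]), hence the dedupe step
per_user['allCat'] = per_user['allCat'].apply(lambda l: list(set(l)))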
Example #2
def _progress_multi_combiner(results):
    res = results.unpack('metadata').unpack('parameters')
    metadatas = [c for c in res.column_names() if c.startswith('metadata')]
    context = [c for c in res.column_names() if c.startswith('parameters')]

    # Unpack metrics if possible
    try:
        res = res.unpack('metric')
        metrics = [c for c in res.column_names() if c.startswith('metric')]
    except Exception:
        # 'metric' is not a dict column here, so it cannot be unpacked further
        metrics = ['metric']

    metadatas.sort()
    context.sort()
    metrics.sort()
    res = res[metadatas + context + metrics]

    # Get aggregators for all metrics
    aggs = {}
    for m in metrics:
        aggs['mean_' + m] = _agg.MEAN(m)
    for m in metadatas:
        aggs[m] = _agg.CONCAT(m)
    aggs['num_folds'] = _agg.COUNT()

    res = res.groupby(context, aggs)

    # Clean up column names
    for s in ['parameters.', 'metric.', 'metadata.']:
        res = res.rename({c: c.replace(s, '') for c in res.column_names()})

    return res
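
For context, unpack expands a dict column into one column per key, prefixed with the original column name; that is where the 'metric.', 'metadata.' and 'parameters.' prefixes stripped at the end come from. A toy sketch (assuming GraphLab Create is installed):

import graphlab

sf = graphlab.SFrame({'metric': [{'rmse': 0.5, 'max_error': 2.0}]})
sf = sf.unpack('metric')
print(sf.column_names())   # e.g. ['metric.max_error', 'metric.rmse']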
Example #3
    def summary_by_precision(self, cutoffs=[5, 10], num_users_to_sample=20):
        """
        Create a histogram of precision values across users.

        Parameters
        ----------
        cutoffs : list, optional
            A list of cutoff values at which to compute precision and recall.

        num_users_to_sample : int, optional
           For each cutoff and each bin of precision scores, sample the
           provided number of users.

        See Also
        --------
        graphlab.evaluation.precision
        """

        results = self._pr_results
        if not set(cutoffs).issubset(set(self.cutoffs)):
            results = self.evaluate_precision_recall(cutoffs)

        num_items = self._item_counts.num_rows()

        out = {}
        for model_name, result in _six.iteritems(results):
            by_user = result['precision_recall_by_user'].filter_by(
                cutoffs, 'cutoff')
            by_user['num_recs'] = by_user['cutoff'].apply(
                lambda x: min(x, num_items))
            by_user['num_correct'] = (by_user['precision'] *
                                      by_user['num_recs']).astype(int)
            by_user['precision'] = by_user['precision'].apply(
                lambda x: _math.floor(x * 100) / 100)
            by_user['recall'] = by_user['recall'].apply(
                lambda x: _math.floor(x * 100) / 100)
            cols = ['cutoff', 'num_correct', 'precision', 'recall']
            by_user = by_user.groupby(
                cols, {
                    'num_users': _aggregate.COUNT,
                    'random_user_sample': _aggregate.CONCAT(self._user_column)
                })
            by_user['random_user_sample'] = by_user[
                'random_user_sample'].apply(lambda x: x[:num_users_to_sample],
                                            dtype=list)
            out[model_name] = by_user.sort(['cutoff', 'precision'])
        return out
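
The precision and recall bins above come from flooring each score to two decimal places; the binning step in isolation:

import math

scores = [0.314, 0.316, 0.42]
print([math.floor(s * 100) / 100 for s in scores])   # [0.31, 0.31, 0.42]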
Example #4
# Flask view; the route decorator and the globals used below (session, app,
# request, the trained models and SFrames) are defined elsewhere in the app.
def predict_price():
    title = session['title']
    description = session['description']

    # Build dataframe to vectorize input data
    sf = graphlab.SFrame({'title': [title], 'description': [description]})
    sf = count_words(sf)
    filename = app.config['UPLOAD_FOLDER'] + request.args['filename']
    image_sf = image_deep_features(filename, deep_learning_model)
    sf['deep_features'] = image_sf['deep_features']

    # Predict the product category
    category = boosted_trees_category_classifier.predict(sf, output_type='class')[0]

    # Pick the per-category models and data
    if category == 'Cell Phones':
        topic_model = topic_model_phones
        price_model = boosted_trees_regression_for_phones
        neighbors_model = similar_images_for_phones
        vectorizer = vectorizer_phones
        data = phones
        category_name = 'phones'
    elif category in ['Furniture', 'Household', 'Home & Garden']:
        topic_model = topic_model_home
        price_model = boosted_trees_regression_for_home
        neighbors_model = similar_images_for_home
        vectorizer = vectorizer_home
        data = home
        category_name = 'home'
    else:  # 'Baby & Kids', 'Clothing & Shoes'
        topic_model = topic_model_apparel
        price_model = boosted_trees_regression_for_apparel
        neighbors_model = similar_images_for_apparel
        vectorizer = vectorizer_apparel
        data = apparel
        category_name = 'apparel'

    # Add topic fields
    sf = add_topic_fields(sf, topic_model)

    # Add TF-IDF
    transformed_sf = vectorizer.transform(sf)
    sf['tfidf'] = transformed_sf['count_words']

    # Predict price
    price = round(price_model.predict(sf)[0])

    # Find nearest_neighbors
    neighbors = neighbors_model.query(sf, k=5)
    neighbors = neighbors.groupby(key_columns='query_label',
                                  operations={"neighbours":
                                              agg.CONCAT("reference_label")})
    neighbors_lst = neighbors['neighbours'][0]
    similar_offers = data.filter_by(neighbors_lst, 'id')
    similar_offers['image_path'] = similar_offers['id'].apply(lambda x:
                                                              IMAGES_FOLDER +
                                                              category_name +
                                                              "/" + str(x) +
                                                              '.jpg')
    return render_template('price.html', price=price, category=category,
                           image=filename, offers=similar_offers)
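
After the groupby collapses the k matches into one list per query, filter_by pulls the corresponding offers out of the catalogue: it keeps exactly the rows whose column value appears in the given list. A toy sketch (assuming GraphLab Create is installed; names are made up):

import graphlab

catalogue = graphlab.SFrame({'id': [1, 2, 3, 4],
                             'price': [10, 20, 30, 40]})
print(catalogue.filter_by([2, 4], 'id'))   # keeps only the rows with id 2 or 4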
Example #5
import graphlab
import pickle
from graphlab import aggregate as agg
import itertools

authors = graphlab.SFrame(
    './170331_PURE_Data_Challenge/PURE Data Challenge/authors.csv')

# One row per author, holding the list of that author's publications
pub_authors = authors.groupby(
    key_columns='PERSON_ID',
    operations={'publications': agg.CONCAT('PUBLICATION_ID')})

# Every unordered pair of one author's publications is a co-publication link
links = [
    l for pub in pub_authors
    for l in itertools.combinations(pub['publications'], 2)
]
print "Established %d links in the publication network." % len(links)

with open('publication_net_links_dict.pkl', 'wb') as f:
    pickle.dump(links, f)
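
The links list relies on itertools.combinations, which yields every unordered pair from one author's publication list, i.e. one co-publication edge per pair:

import itertools

pubs = [10, 20, 30]
print(list(itertools.combinations(pubs, 2)))
# [(10, 20), (10, 30), (20, 30)]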
Example #6
import graphlab
import pickle
from graphlab import aggregate as agg
import itertools

authors = graphlab.SFrame(
    './170331_PURE_Data_Challenge/PURE Data Challenge/authors.csv')

# One row per publication, holding the list of its authors
pub_authors = authors.groupby(
    key_columns='PUBLICATION_ID',
    operations={'authors': agg.CONCAT('PERSON_ID')})


with open('./publication_net_links_dict.pkl', 'rb') as f:
    _pub_links = pickle.load(f)

# Weight of each link: the number of authors the two publications share.
# (The original comprehension iterated over SFrame rows instead of the
# author lists, so it never counted shared authors.)
def shared_author_count(link):
    a = pub_authors[pub_authors['PUBLICATION_ID'] == link[0]]['authors'][0]
    b = pub_authors[pub_authors['PUBLICATION_ID'] == link[1]]['authors'][0]
    return len(set(a) & set(b))

weights = [shared_author_count(link) for link in _pub_links]


with open('publication_net_links_weights.pkl', 'wb') as f:
    pickle.dump(weights, f)

links = dict(zip(_pub_links, weights))

with open('publication_net_links_dict_comp.pkl', 'wb') as f:
    pickle.dump(links, f)
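
Once the two author lists are converted to sets, the link weight is just the size of their intersection:

a = set([101, 102, 103])
b = set([102, 103, 104])
print(len(a & b))   # 2 shared authors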
Example #7
import random

import graphlab
from graphlab import aggregate as agg

train_data = graphlab.SFrame(data="FILE_PATH/rating_final.csv")
# Train the model
graphlab_model = graphlab.recommender.create(train_data,
                                             user_id='userID',
                                             item_id='placeID',
                                             target='rating')

# Make recommendations
graphlab_recomm = graphlab_model.recommend()

graphlab_recomm.remove_columns(['score', 'rank'])
# groupby returns a new SFrame rather than modifying in place, so keep the
# result: one row per user, with the list of that user's recommended places
grouped = graphlab_recomm.groupby(key_columns='userID',
                                  operations={'placeIDs': agg.CONCAT('placeID')})
recommendations = {row['userID']: row['placeIDs'] for row in grouped}

print "Enter user ID : "
current_user = raw_input()

if current_user not in recommendations:
    print "User does not exist!"
    exit()

print "Your surprise me option is :" + str(
    random.sample(recommendations[current_user], 1)[0])
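
An aside on the final line: random.sample(xs, 1)[0] draws a single element, for which random.choice is the shorter equivalent:

import random

options = [135085, 132825, 135032]
print(random.choice(options))   # same as random.sample(options, 1)[0]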