Example #1
def main():
    # create csvs to train models
    _load_data()
    is_df = gl.SFrame.read_csv(IS_DF_PATH)
    nmf_df = gl.SFrame.read_csv(NMF_DF_PATH)
    pop_df = gl.SFrame.read_csv(POP_DF_PATH)
    item_df = gl.SFrame(get.item_data(BEERS_PATH, BREWERS_PATH))
    # list of beers to exclude from recs, by user
    exclude_beers = pd.read_csv(FTISO_PATH)[['user_id', 'beer_id']]
    exclude_beers.columns = ['user_id', 'item_id']
    exclude_beers = gl.SFrame(exclude_beers)
    # build & save models
    build.is_model(is_df, IS_MODEL_PATH)
    build.nmf_model(nmf_df, item_df, NMF_MODEL_PATH)
    build.pop_model(pop_df, POP_MODEL_PATH)
    # load all models
    is_model = gl.load_model(IS_MODEL_PATH)
    nmf_model = gl.load_model(NMF_MODEL_PATH)
    pop_model = gl.load_model(POP_MODEL_PATH)
    # get recommendations & export as json
    users = pd.read_csv(USERS_PATH)['id'].to_frame(name='id')
    users = gl.SFrame(users)
    is_recs = is_model.recommend(users=users['id'],
                                 exclude=exclude_beers,
                                 diversity=3)
    nmf_recs = nmf_model.recommend(users=users['id'],
                                   exclude=exclude_beers,
                                   diversity=3)
    pop_recs = pop_model.recommend(users=users['id'],
                                   exclude=exclude_beers,
                                   diversity=3)
    # save recommendations
    is_recs.export_json(IS_REC_PATH, orient='records')
    nmf_recs.export_json(NMF_REC_PATH, orient='records')
    pop_recs.export_json(POP_REC_PATH, orient='records')
Example #2
def main():
    # create csvs to train models
    _load_data()
    is_df = gl.SFrame.read_csv(IS_DF_PATH)
    nmf_df = gl.SFrame.read_csv(NMF_DF_PATH)
    pop_df = gl.SFrame.read_csv(POP_DF_PATH)
    item_df = gl.SFrame(get.item_data(BEERS_PATH, BREWERS_PATH))
    # list of beers to exclude from recs, by user
    exclude_beers = pd.read_csv(FTISO_PATH)[['user_id', 'beer_id']]
    exclude_beers.columns = ['user_id', 'item_id']
    exclude_beers = gl.SFrame(exclude_beers)
    # build & save models
    build.is_model(is_df, IS_MODEL_PATH)
    build.nmf_model(nmf_df, item_df, NMF_MODEL_PATH)
    build.pop_model(pop_df, POP_MODEL_PATH)
    # load all models
    is_model = gl.load_model(IS_MODEL_PATH)
    nmf_model = gl.load_model(NMF_MODEL_PATH)
    pop_model = gl.load_model(POP_MODEL_PATH)
    # get recommendations & export as json
    users = pd.read_csv(USERS_PATH)['id'].to_frame(name='id')
    users = gl.SFrame(users)
    is_recs = is_model.recommend(users=users['id'],
                                 exclude=exclude_beers,
                                 diversity=3)
    nmf_recs = nmf_model.recommend(users=users['id'],
                                   exclude=exclude_beers,
                                   diversity=3)
    pop_recs = pop_model.recommend(users=users['id'],
                                   exclude=exclude_beers,
                                   diversity=3)
    # save recommendations
    is_recs.export_json(IS_REC_PATH, orient='records')
    nmf_recs.export_json(NMF_REC_PATH, orient='records')
    pop_recs.export_json(POP_REC_PATH, orient='records')
Example #3
def load_models():
    popularity_model_path = os.path.join(MODEL_DIR, 'popularity',
                                         'popularity_model')
    item_model_path = os.path.join(MODEL_DIR, 'item', 'item_model')
    popularity_model = graphlab.load_model(popularity_model_path)
    item_model = graphlab.load_model(item_model_path)
    user_pkl_path = os.path.join(MODEL_DIR, 'user.pkl')
    item_pkl_path = os.path.join(MODEL_DIR, 'item.pkl')
    users = pd.read_pickle(user_pkl_path)
    items = pd.read_pickle(item_pkl_path)
    return popularity_model, item_model, users, items
Example #4
def test_models():
    '''
    INPUT: None
    DESCRIPTION: Tests the model
    OUTPUT: None
    '''
    list_methods = ['factorization_recommender', 'factorization_recommender',
                    'ranking_factorization_recommender']
    model = Model(model=gl.load_model(fp+'item_similarity_recommender'))
    print model.sample_recommendation(20, 10)
    for model_name in list_methods:
        model.model = gl.load_model(model_name)
        print model.sample_recommendation(20, 10)
Example #5
def _get_gl_object_from_persistent_id(type_tag, gl_archive_abs_path):
    """
    Internal util to get a GLC object from a persistent ID in the pickle file.

    Parameters
    ----------
    type_tag : The name of the glc class as saved in the GLC pickler.

    gl_archive_abs_path: An absolute path to the GLC archive where the
                         object was saved.

    Returns
    -------
    The GLC object.

    """
    if type_tag == "SFrame":
        obj = _gl.SFrame(gl_archive_abs_path)
    elif type_tag == "SGraph":
        obj = _gl.load_graph(gl_archive_abs_path)
    elif type_tag == "SArray":
        obj = _gl.SArray(gl_archive_abs_path)
    elif type_tag == "Model":
        obj = _gl.load_model(gl_archive_abs_path)
    else:
        raise _pickle.UnpicklingError(
            "GraphLab pickling Error: Unsupported object."
            " Only SFrames, SGraphs, SArrays, and Models are supported.")
    return obj
Example #6
    def test_exception(self):
        # load model from empty file
        with util.TempDirectory() as tmp_empty_file:
            with self.assertRaises(IOError):
                gl.load_model(tmp_empty_file)

        # load model from non-existing file
        if (os.path.exists('./tmp_model-%d' % temp_number)):
            shutil.rmtree('./tmp_model-%d' % temp_number)
        with self.assertRaises(IOError):
            gl.load_model('./tmp_model-%d' % temp_number)

        # save model to invalid url
        for url in ['http://test', '/root/tmp/testmodel']:
            with self.assertRaises(IOError):
                self.pr_model.save(url)
Example #7
 def evaluate_model(self, dataset):
     from graphlab import load_model
     
     model = None
     for folder in dataset.folders:
         model_file = self._get_model_file(dataset, folder)
     
         user_evaluation_file = self._get_evaluation_file(dataset, folder, evaluation_type = 'user')
         item_evaluation_file = self._get_evaluation_file(dataset, folder, evaluation_type = 'item')
         
         user = item = False
         
         if os.path.exists(user_evaluation_file):
             user = True
             print 'RecommendationModel ' + self.id + ' already evaluated by user in folder ' + folder.id + '.'
         
         if os.path.exists(item_evaluation_file):
             item = True
             print 'RecommendationModel ' + self.id + ' already evaluated by item in folder ' + folder.id + '.'
         
         if user and item:
             continue
         
         model       = load_model(location=model_file)
         evaluation  = model.evaluate(dataset=folder.train_sframe, metric='rmse')
         if not user:
             evaluation['rmse_by_user'].save(user_evaluation_file)
         if not item:
             evaluation['rmse_by_item'].save(item_evaluation_file)
Example #8
def predict_options(options):
    """
    Run predictions on potential options
    :param options: array of dictionary, expected format [{"user": __, "content_id": __}]
    :return: an array with predicted scores for each option; None if invalid
    """
    # TODO - Need to format option in a way that makes sense for the predictor
    if os.path.exists(MODEL_LOCATION):
        model = gl.load_model(MODEL_LOCATION)
    else:
        logger.warn("couldn't load model, re-training", exc_info=True)
        model = train()

    if "user" in options[0] and "content_id" in options[0]:
        temp_users = []
        temp_content = []
        for option in options:
            temp_users.append(option["user"])
            temp_content.append(option["content_id"])
        users = gl.SArray(temp_users)
        content = gl.SArray(temp_content)
        frame = gl.SFrame({
            "user": users,
            "content_id": content
        }, format="dict")
        prediction = model.predict(frame)
        logger.info("prediction: %s", prediction)
    else:
        logger.error(
            "options not in the correct format, expected key 'user' and key 'content_id'"
        )
        prediction = None

    return list(prediction) if prediction is not None else None
Example #9
    def __load_data_structure__(self, filepath):
        """Return data structure if can be loaded, otherwise returns None and logs warning"""
        # try to load different supported types, since don't know what type just try all and swallow exceptions
        obj = None
        try:
            obj = _gl.load_sframe(filepath)
            return obj
        except:
            pass
        try:
            obj = _gl.load_sgraph(filepath)
            return obj
        except:
            pass
        
        try:
            obj = _gl.load_model(filepath)
            return obj
        except:
            pass

        try:
            obj = _gl.SArray(data=filepath)
            return obj
        except:
            pass

        __LOGGER__.debug("Unable to load dependency, unsupported type at path: %s" % filepath)
        return None
Example #10
def predict(modelname, jsonfile):
    filename, file_extension = os.path.splitext(modelname)
    fileList = filename.split('_')

    zipname = modelname + '.zip'

    download("models", zipname)

    zip_ref = zipfile.ZipFile(zipname, 'r')
    zip_ref.extractall(modelname)
    zip_ref.close()

    # predict
    # reqUserList = {"userList": ["A3SGXH7AUHU8GW"]}
    with open(jsonfile) as data_file:
        userDict = json.load(data_file)
    userList = userDict['userList']

    resultList = {}
    if os.path.exists(modelname):
        model = gl.load_model(modelname)
        recommendedItemList = model.recommend(users=userList)
        for user in userList:
            outRowList = recommendedItemList[recommendedItemList['user_id'] ==
                                             user]
            resultList[user] = list(outRowList['item_id'])
        print resultList
        return json.dumps(resultList)

    else:
        raise Exception('model does not exist.')
Example #11
    def __init__(self, env, create_model=False, data_set=None):
        self._env = env
        self._model = None
        if not create_model:
            self._model = gl.load_model('data/FMmodel')
            print "FM policy model loaded."
Example #12
def recommend():
    # get user input
    input_boxes = ['beer_input1_id', 'beer_input2_id', 'beer_input3_id']
    user_input = []
    for box in input_boxes:
        submission = request.form[box]
        if submission != '':
            submission = int(submission)
            user_input.append(submission)
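    # attach the selected beers to a placeholder user id (20000) that is not in
    # the training data; recommend() below treats it as a new user via
    # new_observation_data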
    user_input = pd.DataFrame({'user_id': [20000 for _ in user_input],
                               'item_id': user_input})

    # identify user-favored styles for additional filtering
    beers = pd.read_csv('../data/input/beers.csv')
    user_styles = user_input.merge(beers, left_on='item_id', right_on='id')['style']
    items = beers[beers['style'].isin(list(user_styles))]
    items = gl.SFrame(items)['id']

    # load model and generate recommendations
    model = gl.load_model('../models/item_similarity_model')
    user_input = gl.SFrame(user_input)
    pred = list(model.recommend(users=[20000], items=items, k=5,
                                new_observation_data=user_input,
                                diversity=3)['item_id'])

    # format recommendations for output
    beer_recs = beers[beers['id'].isin(pred)]
    beer_recs = beer_recs[['name', 'brewery_name', 'style', 'score']]
    beer_recs.columns = ['brew', 'brewery', 'style', 'untappd score']
    beer_recs = beer_recs.to_html(columns=['brew', 'brewery', 'style', 'untappd score'],
                                  index=False)
    beer_recs = beer_recs.replace('border="1" class="dataframe"',
                                  'class="table table-hover"')
    return render_template('index.html', recommend=True, beer_recs=beer_recs)
Example #13
    def __init__(self, features, model='auto', output_column_prefix=None):
        """
        Parameters
        ----------
        """
        _raise_error_if_not_of_type(features, [str, list, type(None)])
        _raise_error_if_not_of_type(model, [str, _NeuralNetClassifier])
        _raise_error_if_not_of_type(output_column_prefix, [str, type(None)])

        if isinstance(features, str):
            features = [features]

        # Set the model.
        self._state = {}
        self._state["features"] = features
        if not output_column_prefix:
            output_column_prefix = "deep_features"
        self._state["output_column_prefix"] = output_column_prefix

        self._state['model'] = model
        if self._state["model"] == 'auto':
            model_path = ("https://static.turi.com/products/graphlab-create/"
                          "resources/models/python2.7/imagenet_model_iter45")
            import graphlab as gl
            self._state['model'] = gl.load_model(model_path)
        if type(self._state['model']) is not _NeuralNetClassifier:
            raise ValueError(
                "Model parameters must be of type NeuralNetClassifier " +
                "or string literal 'auto'")
Example #14
    def test_exception(self):
        # load model from empty file
        with util.TempDirectory() as tmp_empty_file:
            with self.assertRaises(IOError):
                gl.load_model(tmp_empty_file)

        # load model from non-existing file
        if (os.path.exists('./tmp_model-%d' % temp_number)):
            shutil.rmtree('./tmp_model-%d' % temp_number)
        with self.assertRaises(IOError):
            gl.load_model('./tmp_model-%d' % temp_number)

        # save model to invalid url
        for url in ['http://test', '/root/tmp/testmodel']:
            with self.assertRaises(IOError):
                self.pr_model.save(url)
Example #15
def predict(modelname, jsonfile):
	filename, file_extension = os.path.splitext(modelname)
	fileList = filename.split('_')

	zipname = modelname + '.zip'

	download("models", zipname)
	
	zip_ref = zipfile.ZipFile(zipname, 'r')
	zip_ref.extractall(modelname)
	zip_ref.close()

	# predict
	# reqUserList = {"userList": ["A3SGXH7AUHU8GW"]}
	with open(jsonfile) as data_file:
		userDict = json.load(data_file)
	userList = userDict['userList']

	resultList = {}
	if os.path.exists(modelname):
		model = gl.load_model(modelname)
		recommendedItemList = model.recommend(users=userList)
		for user in userList:
			outRowList = recommendedItemList[recommendedItemList['user_id']==user]
			resultList[user] = list(outRowList['item_id'])
		print resultList
		return json.dumps(resultList)

	else:
		raise Exception('model does not exist.')
Example #16
def applyMF(path, model_num, limit=np.Inf):
    logger = logging.getLogger('signature.aMF')
    logger.info('starting applyMF')

    # get data
    r_file = path + 'yelp_reviews_test_predictions.json'
    testReviews = list()
    for counter, line in enumerate(open(r_file, 'r')):
        if not counter % 1000:
            logger.debug('%d reviews loaded' % counter)
        if counter > limit:
            break
        testReviews.append(json.loads(line.strip()))

    # load model
    model_path = path + 'regularModels/matrixFactorization_%d.model' % model_num
    model = graphlab.load_model(model_path)

    # run prediction
    reviewsPrediction = applyMatrixFactorization(testReviews, model)

    # save result (overwrites the input file with the predicted reviews)
    outfile = open(path + 'yelp_reviews_test_predictions.json', 'w')
    for review in reviewsPrediction:
        outfile.write(json.dumps(review).encode('utf8', 'ignore') + '\n')
    outfile.close()
Example #17
def get_topics():
    lda = gl.load_model('../lda_25topics')
    topics = lda.get_topics(output_type='topic_words')
    topic_names = []
    for topic in topics:
        topic_names.append(' '.join(topic['words']))
    return topic_names
Example #18
    def test_model(self, dataset, type_prediction = 'test'):
        from graphlab import load_model

        for folder in dataset.folders:
            prediction_file = self._get_prediction_file(dataset, folder, type_prediction) 
            
            model_file = self._get_model_file(dataset, folder)

            if os.path.exists(prediction_file):
                print 'RecommendationModel ' + self.id + ' already tested in folder ' + folder.id + '.'
                continue 
            
            elif not os.path.exists(model_file):
                print 'Impossible testing this model. It should be trained first.'
                return
            
            else:
                print 'Starting to test model ' + self.id + '.'
                model = load_model(location=model_file)
                if type_prediction == 'test':
                    predictions = model.predict(dataset=folder.test_sframe)
                else:
                    predictions = model.predict(dataset=folder.train_sframe)
                predictions.save(filename=prediction_file)
                print 'RecommendationModel ' + self.id + ' tested.'
Example #19
    def LoadModel(self, train, type, modelName, dir=None):

        # validate arguments before loading anything
        if train not in ['train', 'full', 'elite']:
            raise ValueError("train is not valid")
        if modelName not in ['ranking', 'factorization']:
            raise ValueError("bad modelName")

        myDir = os.getcwd() if dir is None else dir
        myModelName = myDir + "/" + modelName

        myModel = graphlab.load_model(myModelName)

        if (train, modelName) == ('train', 'ranking'):
            self.trained_model['ranking'] = myModel
        elif (train, modelName) == ('train', 'factorization'):
            self.trained_model['factorization'] = myModel
        elif (train, modelName) == ('full', 'ranking'):
            self.full_model['ranking'] = myModel
        elif (train, modelName) == ('full', 'factorization'):
            self.full_model['factorization'] = myModel
        elif (train, modelName) == ('elite', 'ranking'):
            self.elite_model['ranking'] = myModel
        elif (train, modelName) == ('elite', 'factorization'):
            self.elite_model['factorization'] = myModel

        print("the model loaded from %s" % (myModelName))
    def __init__(self, feature, model = 'auto', output_column_name=None):
        """
        Parameters
        ----------
        """
        _raise_error_if_not_of_type(feature, [str])
        _raise_error_if_not_of_type(model, [str, _NeuralNetClassifier])
        _raise_error_if_not_of_type(output_column_name, [str, _NoneType])

        # Set the model.
        self._state = {}
        self._state["features"] = feature
        if not output_column_name:
            self._state["output_column_name"] = "deep_features_%s" % feature
        else:
            self._state["output_column_name"] = output_column_name
        self._state['model'] = model
        if self._state["model"] == 'auto':
            model_path = ("http://s3.amazonaws.com/dato-datasets/"
                          "deeplearning/imagenet_model_iter45")
            import graphlab as gl
            self._state['model'] = gl.load_model(model_path)
        if type(self._state['model']) is not _NeuralNetClassifier:
            raise ValueError("Model parameters must be of type NeuralNetClassifier " +
                             "or string literal 'auto'")
Example #21
def _get_gl_object_from_persistent_id(type_tag, gl_archive_abs_path):
    """
    Internal util to get a GLC object from a persistent ID in the pickle file.

    Parameters
    ----------
    type_tag : The name of the glc class as saved in the GLC pickler.

    gl_archive_abs_path: An absolute path to the GLC archive where the
                         object was saved.

    Returns
    -------
    The GLC object.

    """
    if type_tag == "SFrame":
        obj = _gl.SFrame(gl_archive_abs_path)
    elif type_tag == "SGraph":
        obj = _gl.load_graph(gl_archive_abs_path)
    elif type_tag == "SArray":
        obj = _gl.SArray(gl_archive_abs_path)
    elif type_tag == "Model":
        obj = _gl.load_model(gl_archive_abs_path)
    else:
        raise _pickle.UnpicklingError("GraphLab pickling Error: Unspported object."
              " Only SFrames, SGraphs, SArrays, and Models are supported.")
    return obj
Example #22
  def _set_model(self, model):
    '''Extract supported methods from the model. Each model needs to implement
    a class method called

      _get_queryable_methods()

    that tells this Predictive Object whether it expects an SFrame, SArray,
    or other type as input; the 'query' method of this class will automatically
    convert the input to the appropriate SFrame or SArray. A model method can
    also accept either an SArray or an SFrame; for example, the
    recommender.recommend() method may expect its first parameter, 'users', to
    be either a list of users or an SFrame with more information about the users.

    For example, a recommender model would return the following information:

               {'predict': {
                    'dataset': 'sframe',
                    'new_observation_data': 'sframe',
                    'new_user_data': 'sframe',
                    'new_item_data': 'sframe'
                },
                'recommend': {
                    'users': ['sframe', 'sarray'],
                    'items': ['sframe', 'sarray'],
                    'new_observation_data': 'sframe',
                    'new_user_data': 'sframe',
                    'new_item_data': 'sframe',
                    'exclude': 'sframe'}
                }
    '''
    if is_path(model):
      # This is a path, download the file and load it
      model = graphlab.load_model(model)

    self.model = model

    self._model_methods = model._get_queryable_methods()
    if type(self._model_methods) != dict:
      raise RuntimeError("_get_queryable_methods for model %s should return a"
                         " dictionary" % model.__class__)

    for (method, description) in self._model_methods.iteritems():
      if type(description) != dict:
        raise RuntimeError("model %s _get_queryable_methods should use dict as"
                           " method description." % model.__class__)

      for (param_name, param_types) in description.iteritems():
        # support either "sframe", "sarray" or ["sframe", "sarray"]
        if not isinstance(param_types, list):
          param_types = [param_types]

        for param_type in param_types:
          if (param_type not in ['sframe', 'sarray']):
            raise RuntimeError("model %s _get_queryable_methods should only use"
                               " 'sframe' or 'sarray' type. %s is not supported"
                               % (model.__class__, param_type))

        description.update({param_name: param_types})

      self._model_methods.update({method: description})
Example #23
 def mostPopular(self, topk):
     items = self.items
     model = gl.load_model('models/Popular')
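     # recommend based on the user's own highly rated items (Score > 4),
     # restricted to candidate products with Score > 2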
     reco = model.recommend_from_interactions(
         items[items['Score'] > 4].remove_column('UserId'),
         k=topk,
         items=items[items['Score'] > 2].select_column('ProductId'))
     return self.getData(reco)
Example #24
def load_model(model):
    """ input:
    model name in str format
    output:
    loaded model
    """
    loaded_model = graphlab.load_model(model)
    return loaded_model
Example #25
def load_topic_model(model_path):
    logger_9988.info('load_topic_model {} begin ...'.format(model_path))
    global model_instance
    if not model_instance:
        model_instance = TopicModel()
        model_instance.version = os.path.split(model_path)[-1]
        model_instance.model = gl.load_model(model_path)
    logger_9988.info('load_topic_model finished!')
Example #26
def RandomForestfunc():

    data = gl.SFrame.read_csv('Phase2_data/mergedFeaturesMod.csv')
    model = gl.load_model('Phase2_codes/Random_Forest_Model')
    predictions = model.predict(data)
    results = model.evaluate(data)
    print results
    predictions.save('Phase2_data/ItemsBought.csv', format='csv')
Example #27
def load_models(models_dir):
    global g_channel_kmeans_model_dict, model_v
    import os
    model_v = os.path.split(models_dir)[1]
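    # cache one kmeans model per channel, keyed by the model file name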
    if len(g_channel_kmeans_model_dict) != 0:
        g_channel_kmeans_model_dict.clear()
    models_files = os.listdir(models_dir)
    for mf in models_files:
        g_channel_kmeans_model_dict[mf] = gl.load_model(os.path.join(models_dir, mf))
Example #28
 def whatsTrending(self, topk):
     trends = self.trends[:5000]
     model = gl.load_model('models/Trending')
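     # seed recommendations with ten interactions scored above 4, restricted
     # to a 1000-item candidate slice scored above 3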
     reco = model.recommend_from_interactions(
         trends[trends['Score'] > 4][:10].remove_column('UserId'),
         k=topk,
         items=trends[trends['Score'] > 3][100:1100].select_column(
             'ProductId'))
     return self.getData(reco)
Example #29
 def load_model(self, model, model_cols, user_col, item_col, listen_col=None):
     if isinstance(model, str):
         self.model = gl.load_model(model)
     else:
         self.model = model
     self.model_cols = model_cols
     self.user_col = user_col
     self.item_col = item_col
     self.listen_col = listen_col
Example #30
def build_prediction_results(topic_count, model_file_name):
    """
    Writes the results from the model build phase into html files
    Creates files for each topic and its paragraphs and their probabilities
    and file for each topic and its belonging words
    :param topic_count: number of topics
    :param model_file_name: the file name of the model for loading it
    :return: None
    """
    model = gl.load_model(model_file_name)
    root_results_dir = 'my-privacypolicy-thesis/results{}'.format(topic_count)
    if os.path.exists(root_results_dir):
        shutil.rmtree(root_results_dir)
    os.makedirs(root_results_dir)

    results_html_file = open(root_results_dir + "/results.html", "w+")
    results_html_file.write(
        "<html><table border='1'><tr><td>Topic Number</td><td>Words</td><td>Paragraphs</td></tr>"
    )

    # Creates html file for each topic and its belonging words
    print('started phase 1 of build predictions results')
    paragraphs_html_list = []
    for i in range(topic_count):
        paragraphs_html_list.append("<html><table border='1'>")
        words_list = model.get_topics(num_words=20,
                                      output_type='topic_words')['words'][i]
        print_words_list = ', '.join(words_list)
        paragraphs_url = "<a href='./paragraphs_topic_{}.html'>paragraphs</a>".format(
            i)
        results_html_file.write(
            "<tr><td>{}</td><td>{}</td><td>{}</td></tr>".format(
                i, print_words_list, paragraphs_url))
        print('{} out of {}'.format(i + 1, topic_count))
    results_html_file.write("</table></html>")
    results_html_file.close()

    # Creating html files for each topic and its paragraphs and their probabilities
    print('started phase 2 of build predictions results')
    for topic_id in range(0, topic_count):
        results_records = db_utils.db_select(
            "select probability,paragraph from privacy_policy_paragraphs_prediction "
            "where running_id= {} and topic_id={} "
            "group by paragraph,probability "
            "order by probability desc".format(topic_count, topic_id))
        topic_html = "<html><table border='1'><tr><td>Probability</td><td>Paragraph</td></tr>"
        for results_record in results_records:
            topic_html += "<tr><td>{:.4f}</td><td>{}</td></tr>".format(
                results_record.get('probability'),
                results_record.get('paragraph'))
        topic_html += "</table></html>"
        paragraphs_html_file = open(
            root_results_dir + "/paragraphs_topic_{}.html".format(topic_id),
            "w+")
        paragraphs_html_file.write(topic_html)
        paragraphs_html_file.close()
        print('{} out of {}'.format(topic_id + 1, topic_count))
    print("done")
Example #31
def comments_sentimenting(book_id):

    comments_data = graphlab.load_sframe('helper/coeffi_comments_data')
    sentiment_model = graphlab.load_model(
        'helper/books_comments_sentiment_model')
    commentsFromABook = comments_data[comments_data['book_id'] == int(book_id)]
    commentsFromABook['predicted_sentiment'] = sentiment_model.predict(
        commentsFromABook, output_type='probability')
    # comments_data['predicted_sentiment'] = sentiment_model.predict(comments_data, output_type='probability')
    return commentsFromABook.sort('created_time', ascending=True)
Example #32
 def __init__(self, filePath='../../data/ranking_factorization_recommender', model=None):
     '''
     INPUT: String
     DESCRIPTION: Loads and saves the model given a filepath
     OUTPUT: None
     '''
     if model is None:
         self.model = gl.load_model(filePath)
     else:
         self.model = model
Example #33
def recommend(userid=None):
    if request.method == 'POST':
        userid = request.args.get('userid')
        model = gl.load_model('../models/pickled_models/mf_model')
        recs = model.recommend(users=[str(userid)], k=5)
        perfume_id = [str(i) for i in recs['perfume_id']]
        rec_perfumes = list(collection.find({'perfume_id': {'$in': perfume_id}},
                                            {'item_name': 1, 'brand': 1, 'gender': 1,
                                             'note': 1, 'tags': 1, 'theme': 1, '_id': 0}))
        return render_template('recommend.html', rec_perfumes=rec_perfumes)
Example #34
 def _load_graphlab_object(cls, obj_type, obj_path):
   if obj_type == 'model':
     return graphlab.load_model(obj_path)
   elif obj_type == 'sarray':
     return graphlab.SArray(obj_path)
   elif obj_type == 'sframe':
     return graphlab.load_sframe(obj_path)
   elif obj_type == 'sgraph':
     return graphlab.load_sgraph(obj_path)
   else:
     raise RuntimeError(str(obj_type) + ' is not supported')
Example #35
def load_model(location):

    if not os.path.exists(location):
        raise IOError(location + ' does not exist')

    with open(os.path.join(location, "data.json"), "r") as f:
        data = json.load(f)

    lst = [gl.load_model(os.path.join(location, f))
           for f in os.listdir(location) if f != 'data.json']

    return Ensemble(lst, weights=data['weights'], vote_fn=data['vote_fn'])
Example #36
 def test_exception(self):
     self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("/root/tmp"))
     self.assertRaises(IOError, lambda: glconnect.get_unity().__write__("/root/tmp", '.....'))
     self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("/root/tmp"))
     self.assertRaises(IOError, lambda: glconnect.get_unity().__write__("/root/tmp", '.....'))
     self.assertRaises(IOError, lambda: self.graph.save("/root/tmp.graph"))
     self.assertRaises(IOError, lambda: self.sframe.save("/root/tmp.frame_idx"))
     self.assertRaises(IOError, lambda: self.model.save("/root/tmp.model"))
     self.assertRaises(IOError, lambda: graphlab.load_graph("/root/tmp.graph"))
     self.assertRaises(IOError, lambda: graphlab.load_sframe("/root/tmp.frame_idx"))
     self.assertRaises(IOError, lambda: graphlab.load_model("/root/tmp.model"))
Example #37
def popular(cur):
    import graphlab as gl
    model = gl.load_model("mymodel1")
    l = []
    nn = model.recommend_from_interactions([5000], k=10)
    for i in nn:
        rows = cur.execute("select name from anime where anime_id = (?)",
                           [i['anime_id']]).fetchone()
        l.append((rows[0], i['rank']))

    return l
Example #38
def main():

    # load selected photos
    photos = graphlab.SFrame('photos_las_vegas_food_drinks.gl')

    # load AlexNet model pre-trained and provided by Dato
    alexnet_model = graphlab.load_model('imagenet_model_iter45.gl')

    # extract and save deep features of selected photos
    photos['deep_features'] = alexnet_model.extract_features(photos)
    photos.save('photos_deep_features.gl')
Example #39
def main():

    # load and save photos in graphlab format
    photos = graphlab.image_analysis.load_images(
        'photos_las_vegas_food_drinks')
    photos.save('photos_las_vegas_food_drinks.gl')

    # load and save AlexNet model pre-trained and provided by Dato
    deep_learning_model = graphlab.load_model(
        'http://s3.amazonaws.com/GraphLab-Datasets/deeplearning/imagenet_model_iter45'
    )
    deep_learning_model.save('imagenet_model_iter45.gl')
Example #40
 def test_exception(self):
     self.assertRaises(ValueError, lambda: self._test_read_write_helper(self.tempfile, 'hello world'))
     self.assertRaises(ValueError, lambda: self._test_read_write_helper("local://" + self.tempfile + ".csv.gz", 'hello,world,woof'))
     self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("remote:///root/tmp"))
     self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("remote:///root/tmp"))
     self.assertRaises(IOError, lambda: glconnect.get_unity().__write__("remote:///root/tmp", '.....'))
     self.assertRaises(IOError, lambda: self.graph.save("remote:///root/tmp.graph"))
     self.assertRaises(IOError, lambda: self.sframe.save("remote:///root/tmp.frame_idx"))
     self.assertRaises(IOError, lambda: self.model.save("remote:///root/tmp.model"))
     self.assertRaises(IOError, lambda: graphlab.load_graph("remote:///root/tmp.graph"))
     self.assertRaises(IOError, lambda: graphlab.load_sframe("remote:///root/tmp.frame_idx"))
     self.assertRaises(IOError, lambda: graphlab.load_model("remote:///root/tmp.model"))
Example #41
def fetch_mainmodel():
    '''
    INPUT:
        - None: None

    OUTPUT:
        - None: None
        
    Graphlab allows you to store your model, similar to storing a model as a pickle.
    This function will grab and load the chosen model.
    '''
    return gl.load_model('game_recommender')
Example #42
    def test_basic_save_load(self):
        # save and load the pagerank model
        with util.TempDirectory() as tmp_pr_model_file:
            self.pr_model.save(tmp_pr_model_file)
            pr_model2 = gl.load_model(tmp_pr_model_file)
            self.__assert_model_equals__(self.pr_model, pr_model2)

        # save and load the connected_component model
        with util.TempDirectory() as tmp_cc_model_file:
            self.cc_model.save(tmp_cc_model_file)
            cc_model2 = gl.load_model(tmp_cc_model_file)
            self.__assert_model_equals__(self.cc_model, cc_model2)

        # handle different types of urls.
        # TODO: test hdfs and s3 urls.
        for url in ['./tmp_model-%d' % temp_number,
                    '/tmp/tmp_model-%d' % temp_number,
                    'remote:///tmp/tmp_model2-%d' % temp_number]:

            self.pr_model.save(url)
            self.__assert_model_equals__(self.pr_model, gl.load_model(url))
Example #43
def main():
    model_path = 'dato_model/neuralnet.model'
    model = gl.load_model(model_path)
    mnist_path = os.path.expanduser("~/model-serving/data/mnist_data")
    X, y = load_digits(mnist_path, "test-mnist-dense-with-labels.data")

    # take a single test example and resize it to the network's expected input
    first_x = X[1]
    data = gl.SFrame(first_x)
    data['image'] = gl.image_analysis.resize(data['image'], 256, 256, 3)
    fs = model.extract_features(data)

    print fs
Example #44
 def visualize(self):
     # visualize the raw item data first
     self.items.show()
     training_data, test_data = self.items.random_split(0.8, seed=0)
     model = gl.load_model("item-model1")
     pred = model.predict(test_data)
     results = model.evaluate(test_data)
     print(results)
     view = model.views.overview(validation_set=test_data)
     view.show()
     # rmse needs targets and predictions; 'rating' is an assumed target column
     gl.evaluation.rmse(self.validation_data['rating'],
                        model.predict(self.validation_data))
     view = model.views.overview(validation_set=self.validation_data)
     view.show()
Example #45
def extract_image_features():
    # Use graphlab.neuralnet_classifier.NeuralNetClassifier.extract_features,
    # which takes an input dataset, propagates each example through the
    # network, and returns an SArray of dense feature vectors, each the
    # concatenation of all hidden unit values at layer[layer_id].
    # The model is pre-trained on ImageNet, as described by Alex Krizhevsky
    # et al., and is located at
    # http://s3.amazonaws.com/dato-datasets/deeplearning/imagenet_model_iter45

    items = graphlab.SFrame.read_json('../data/items.json')
    # Remove duplicate rows of the SFrame
    items = items.unique()
    items.remove_column('image')

    # Split data by category
    phones = items.filter_by(['Cell Phones'], 'category_name')
    apparel = items.filter_by(['Baby & Kids', 'Clothing & Shoes'], 'category_name')
    home = items.filter_by(['Furniture', 'Household', 'Home & Garden'], 'category_name')

    # Load images
    phone_images = graphlab.image_analysis.load_images(
        'data/images_by_category/Cell Phones', "auto", with_path=True, recursive=True)
    baby_images = graphlab.image_analysis.load_images(
        'data/images_by_category/Baby & Kids', "auto", with_path=True, recursive=True)
    clothing_images = graphlab.image_analysis.load_images(
        'data/images_by_category/Clothing & Shoes', "auto", with_path=True, recursive=True)
    furniture_images = graphlab.image_analysis.load_images(
        'data/images_by_category/Furniture', "auto", with_path=True, recursive=True)
    household_images = graphlab.image_analysis.load_images(
        'data/images_by_category/Household', "auto", with_path=True, recursive=True)
    home_garden_images = graphlab.image_analysis.load_images(
        'data/images_by_category/Home & Garden', "auto", with_path=True, recursive=True)

    apparel_images = baby_images.append(clothing_images)
    home_images = furniture_images.append(household_images).append(home_garden_images)

    phone_images['id'] = phone_images['path'].apply(get_id)
    apparel_images['id'] = apparel_images['path'].apply(get_id)
    home_images['id'] = home_images['path'].apply(get_id)

    phones_with_images = phones.join(phone_images, on='id', how='inner')
    apparel_with_images = apparel.join(apparel_images, on='id', how='inner')
    home_with_images = home.join(home_images, on='id', how='inner')

    # Split data into train and test sets
    phones_train, phones_test = phones_with_images.random_split(.8, seed=0)
    apparel_train, apparel_test = apparel_with_images.random_split(.8, seed=0)
    home_train, home_test = home_with_images.random_split(.8, seed=0)

    # Used the neural network trained on the 1.2 million images of the ImageNet Challenge.
    deep_learning_model = graphlab.load_model('../data/imagenet_model')

    phones_train['deep_features'] = deep_learning_model.extract_features(phones_train)
    apparel_train['deep_features'] = deep_learning_model.extract_features(apparel_train)
    home_train['deep_features'] = deep_learning_model.extract_features(home_train)
    phones_test['deep_features'] = deep_learning_model.extract_features(phones_test)
    apparel_test['deep_features'] = deep_learning_model.extract_features(apparel_test)
    home_test['deep_features'] = deep_learning_model.extract_features(home_test)

    # Store into data folder
    phones_train.save('data/phones_train')
Example #46
def train_model(filename):
    # load already prepared data in form of an SFrame
    image_train = graphlab.SFrame(filename)
    # load the pre-trained model
    loaded_model = graphlab.load_model('model/')
    # extract features of the model on the given pictures
    image_train['deep_features'] = loaded_model.extract_features(image_train)
    # add ids to the SFrame to be able to find the closest images
    ids = graphlab.SArray(list(range(0, len(image_train))))
    image_train.add_column(ids, name='id')
    # print image_train.head()
    # train the NN model on the extracted features
    knn_model = graphlab.nearest_neighbors.create(image_train,
                                                  features=['deep_features'],
                                                  label='id')
    return knn_model, image_train
Example #47
    def test_basic_save_load(self):
        # save and load the pagerank model
        with util.TempDirectory() as tmp_pr_model_file:
            self.pr_model.save(tmp_pr_model_file)
            pr_model2 = gl.load_model(tmp_pr_model_file)
            self.__assert_model_equals__(self.pr_model, pr_model2)

        # save and load the connected_component model
        with util.TempDirectory() as tmp_cc_model_file:
            self.cc_model.save(tmp_cc_model_file)
            cc_model2 = gl.load_model(tmp_cc_model_file)
            self.__assert_model_equals__(self.cc_model, cc_model2)

        # handle different types of urls.
        # TODO: test hdfs and s3 urls.
        for url in [
                './tmp_model-%d' % temp_number,
                '/tmp/tmp_model-%d' % temp_number,
                'remote:///tmp/tmp_model2-%d' % temp_number
        ]:

            self.pr_model.save(url)
            self.__assert_model_equals__(self.pr_model, gl.load_model(url))
Example #48
 def _get_recommened_movie_ids():
     if not request.user.is_authenticated():
         avg_scores =  Rating.objects. \
             exclude(item__in=excluded_items).values('item'). \
             annotate(average_rating=Avg('rating'))
         top_items = avg_scores. \
             order_by('-average_rating', 'item')[:cnt]
         return [item['item'] for item in top_items]
     else:
         cf_model = graphlab.load_model('cf_model')
         recomm = cf_model.recommend(
             users=[request.user.id],
             k=int(cnt),
             items=list(set(all_items) - set(excluded_items)))
         return list(recomm['item'])
Example #49
 def test_exception(self):
     bad_url = "hdfs:///root/"
     if self.has_hdfs:
         self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("hdfs:///"))
         self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("hdfs:///tmp"))
         self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("hdfs://" + self.tempfile))
         self.assertRaises(IOError, lambda: glconnect.get_unity().__write__(bad_url + "/tmp", "somerandomcontent"))
         self.assertRaises(IOError, lambda: self.graph.save(bad_url + "x.graph"))
         self.assertRaises(IOError, lambda: self.sframe.save(bad_url + "x.frame_idx"))
         self.assertRaises(IOError, lambda: self.model.save(bad_url + "x.model"))
         self.assertRaises(IOError, lambda: graphlab.load_graph(bad_url + "mygraph"))
         self.assertRaises(IOError, lambda: graphlab.load_sframe(bad_url + "x.frame_idx"))
         self.assertRaises(IOError, lambda: graphlab.load_model(bad_url + "x.model"))
     else:
         logging.getLogger(__name__).info("No hdfs available. Test passed.")
Example #50
def get_wines_for_movie(movie):
    path_to_wine = '/home/cully/Documents/capstone/data/gridsearch_sf'
    path_to_movies = '/home/cully/Documents/capstone/data/flask_movies_sf'
    wine_rec = gl.load_model(path_to_wine)
    movies_sf = gl.load_sframe(path_to_movies)
    cols = movies_sf.column_names()
    movies_df = movies_sf.to_dataframe()
    ids = [i for i in movies_df.index]
    movies_df.insert(0, 'id', ids)
    value_vars = [x for x in movies_df.columns if x != 'id']
    movies_melted = gl.SFrame(pd.melt(movies_df, id_vars='id', value_vars=value_vars)).dropna()
    movies_rec = gl.factorization_recommender.create(movies_melted, user_id='id', item_id='variable', target='value')
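    # cosine distance between the chosen movie's latent factors and each
    # wine's latent factors (first 8 dimensions)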
    movie_pos = movie_order_dict[movie]
    sims = pairwise_distances(
        np.array(movies_rec.coefficients['variable']['factors'])[movie_pos].reshape(1, -1),
        np.array(wine_rec.coefficients['wine_name']['factors'])[:, :8],
        metric='cosine')
    wine_names = np.array(wine_rec.coefficients['wine_name']['wine_name'])
    # smallest cosine distance = most similar, so sort ascending
    return wine_names[np.argsort(sims[0])][:5]
Example #51
 def test_exception(self):
     if self.has_s3:
         bad_bucket = "i_am_a_bad_bucket"
         prefix = "s3://" + bad_bucket
         self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("s3:///"))
         self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("s3://" + self.standard_bucket + "/somerandomfile"))
         self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("s3://" + "/somerandomfile"))
         self.assertRaises(IOError, lambda: glconnect.get_unity().__write__("s3://" + "/somerandomfile", "somerandomcontent"))
         self.assertRaises(IOError, lambda: glconnect.get_unity().__write__("s3://" + self.standard_bucket + "I'amABadUrl/", "somerandomcontent"))
         self.assertRaises(IOError, lambda: self.graph.save(prefix + "/x.graph"))
         self.assertRaises(IOError, lambda: self.sframe.save(prefix + "/x.frame_idx"))
         self.assertRaises(IOError, lambda: self.model.save(prefix + "/x.model"))
         self.assertRaises(IOError, lambda: graphlab.load_graph(prefix + "/x.graph"))
         self.assertRaises(IOError, lambda: graphlab.load_sframe(prefix + "/x.frame_idx"))
         self.assertRaises(IOError, lambda: graphlab.load_model(prefix + "/x.model"))
     else:
         logging.getLogger(__name__).info("No s3 bucket available. Test passed.")
Example #52
def main():

    # Cursor to extract userId and business from all reviews
    reviews_cursor = tags_collection.find()
    reviews_cursor.batch_size(1000)

    # Create userList and businessList to store unique ids
    userList = []
    businessList = []

    # Create an LDA model from graphlab
    if os.path.exists("ldamodel"):
        print "reading from the file"
        lda = gl.load_model("ldamodel")
    else:
        lda = create()

    # Go through all reviews in the collection and record userId and business
    counter = 0
    for review in reviews_cursor:
        userList.append(review["userId"])
        businessList.append(review["business"])
        counter = counter + 1
        if counter % 100000 == 0:
            print str(counter) + ' records read from reviews collection'

    # Remove duplicate businesses and userIds from the lists
    userList = list(Set(userList))
    businessList = list(Set(businessList))

    print 'Number of users in dataset: ' + str(len(userList))
    print 'Number of businesses in dataset: ' + str(len(businessList))

    # Process user reviews to create user profiles
    count = len(userList)
    worker(1, 0, count, userList, "userId", lda)

    # Process business reviews to create business profiles
    count = len(businessList)
    worker(1, 0, count, businessList, "business", lda)
Example #53
def _test_save_load_object_helper(testcase, obj, url):
    """
    Helper function to test save and load a server side object to a given url.
    """
    def cleanup(url):
        """
        Remove the saved file from temp directory.
        """
        protocol = None
        path = None
        splits = url.split("://")
        if len(splits) > 1:
            protocol = splits[0]
            path = splits[1]
        else:
            path = url
        if not protocol or protocol == "local" or protocol == "remote":
            tempdir = tempfile.gettempdir()
            pattern = path + ".*"
            for f in os.listdir(tempdir):
                if re.search(pattern, f):
                    os.remove(os.path.join(tempdir, f))

    if isinstance(obj, graphlab.SGraph):
        obj.save(url + ".graph")
        newobj = graphlab.load_graph(url + ".graph")
        testcase.assertItemsEqual(obj.get_fields(), newobj.get_fields())
        testcase.assertDictEqual(obj.summary(), newobj.summary())
    elif isinstance(obj, graphlab.Model):
        obj.save(url + ".model")
        newobj = graphlab.load_model(url + ".model")
        testcase.assertItemsEqual(obj.list_fields(), newobj.list_fields())
        testcase.assertEqual(type(obj), type(newobj))
    elif isinstance(obj, graphlab.SFrame):
        obj.save(url + ".frame_idx")
        newobj = graphlab.load_sframe(url + ".frame_idx")
        testcase.assertEqual(obj.shape, newobj.shape)
        testcase.assertEqual(obj.column_names(), newobj.column_names())
        testcase.assertEqual(obj.column_types(), newobj.column_types())
        assert_frame_equal(obj.head(obj.num_rows()).to_dataframe(),
                           newobj.head(newobj.num_rows()).to_dataframe())
    else:
        raise TypeError
    cleanup(url)
Example #54
def get_wine_recs(ratings):
    path_to_movies = '/home/cully/Documents/capstone/data/flask_movies_sf'
    path_to_wine = '/home/cully/Documents/capstone/data/gridsearch_sf'
    wine_rec = gl.load_model(path_to_wine)
    movies_sf = gl.load_sframe(path_to_movies)
    movies_df = movies_sf.to_dataframe()
    value_vars = [x for x in movies_df.columns if x != 'id']
    new_ratings = {movie_dict[name]:int(ratings[name]) for name in ratings}
    new_df = pd.DataFrame.from_dict([new_ratings], orient='columns').replace(-1, np.nan)
    movies_df = pd.concat([movies_df, new_df]).reset_index(drop=True)
    ids = [i for i in movies_df.index]
    movies_df.insert(0, 'id', ids)
    movies_melted = gl.SFrame(pd.melt(movies_df, id_vars='id', value_vars=value_vars)).dropna()
    movies_rec = gl.factorization_recommender.create(movies_melted, user_id='id',
                                                     item_id='variable', target='value')
    movies_user_intercept, movies_user_factors, _, _, movies_intercept = get_rec_coeffs(movies_rec)
    wine_item_factors = np.array(wine_rec.coefficients['wine_name']['factors'])[:,:8]
    wine_names = np.array(wine_rec.coefficients['wine_name']['wine_name'])
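    # score wines by the dot product of the new user's movie-side latent
    # factors with each wine's item factors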
    comb = np.dot(np.array(movies_user_factors[-1]), wine_item_factors.T)
    return wine_names[np.argsort(comb)[::-1]]
Example #55
def main():
    parser = argparse.ArgumentParser(
        description="Classifies given dataset and saves the results.")
    parser.add_argument("--dataset_dir", required=False, default=None, type=str,
                        help="Dataset directory ex: my_dataset_test or my_dataset")
    parser.add_argument("--classified_dir", required=True, default=None, type=str,
                        help="Directory for dataset after classification ex: result_dataset")
    parser.add_argument("--print", required=False, action='store_true', dest='print_results',
                        help="")

    args = parser.parse_args()
    if args.dataset_dir:
        vec_model = word2vec.Word2Vec.load_word2vec_format('word2vec_model.txt', binary=False)
        cls = gl.load_model("graphlab/my_classifier")
        dataset = gl.load_sframe(args.dataset_dir)
        result171_dataset = test_classifier(cls, dataset, vec_model)
        dataset.add_column(result171_dataset.select_column("class"), "class")
        dataset.add_column(result171_dataset.select_column("probability"), "probability")
        dataset.save(args.classified_dir)
    elif args.classified_dir:
        result171_dataset = gl.load_sframe(args.classified_dir)
    if args.print_results:
        print_positives_and_confidence(result171_dataset, result171_dataset)