Example No. 1
def loadNeigh_2():
    print 'loadNeighbor 2'
    zhima_usr = loadZhima()
    usr_id = zhima_usr['snwb']

    #loadZhima_1
    subgraph_edges = gl.load_sframe(
        os.path.join(resultDataFolder, 'subgraph_zhima_1'))

    #add zhima 2_neighbor for subgraph
    neighbor_1 = subgraph_edges['dst']
    sub_vertices = usr_id.append(neighbor_1)

    sframeFiles = os.listdir(sFrameFolder)
    for sf in sframeFiles:
        edgesData = gl.load_sframe(os.path.join(sFrameFolder, sf))
        edgesData.rename({'X1': 'src', 'X2': 'dst'})
        #     zhima_neighbors_2 = edgesData.filter_by(sub_vertices,'src')
        # 'in' on an SArray is not element-wise, so filter on src and dst explicitly
        zhima_neighbors_2 = edgesData.filter_by(sub_vertices, 'src') \
                                     .filter_by(sub_vertices, 'dst')
        subgraph_edges = subgraph_edges.append(zhima_neighbors_2)
        print sf

    print 'save subgraph'
    subgraph_edges.save(os.path.join(resultDataFolder, 'subgraph_zhima_2'))
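loadNeigh_2 relies on module-level names (sFrameFolder, resultDataFolder) and a loadZhima() helper that are not part of this snippet. A minimal sketch of the assumed setup, with hypothetical paths and file names (the /home/lt/sframe directory is borrowed from Example No. 24, which comes from the same project):

import os
import graphlab as gl

sFrameFolder = '/home/lt/sframe'       # hypothetical: directory of raw edge SFrames
resultDataFolder = '/home/lt/result'   # hypothetical: directory for saved results

def loadZhima():
    # hypothetical: an SFrame of labelled users whose 'snwb' column holds user ids
    return gl.load_sframe(os.path.join(resultDataFolder, 'zhima_usr'))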
Example No. 2
def get_song_recs(ratings, n_features):

    '''
    Takes a user's new movie ratings from the website and returns
    recommended song titles
    '''

    path_to_songs_sf = '/home/cully/Documents/capstone/data/flask_songs_sf'
    path_to_movies_sf = '/home/cully/Documents/capstone/data/flask_movies_sf'
    songs_sf = gl.load_sframe(path_to_songs_sf)
    songs_df = songs_sf.to_dataframe()
    value_vars = [x for x in songs_df.columns if x != 'id']
    ids = [x for x in songs_df.index]
    if 'id' not in songs_df.columns:
        songs_df.insert(0, 'id', ids)
    songs_melted = gl.SFrame(pd.melt(songs_df, id_vars = 'id', value_vars=value_vars))
    songs_rec = gl.factorization_recommender.create(songs_melted, user_id = 'id', item_id='variable', target='value', num_factors = n_features)
    _, _, songs_item_intercept, songs_item_factors, songs_intercept = get_rec_coeffs(songs_rec)
    movies_sf = gl.load_sframe(path_to_movies_sf)
    movies_df = movies_sf.to_dataframe()
    
    value_vars = [x for x in movies_df.columns if x != 'id']

    new_ratings = {movie_dict[name]:int(ratings[name]) for name in ratings}
    new_df = pd.DataFrame.from_dict([new_ratings], orient='columns').replace(-1,np.nan)
    movies_df = pd.concat([movies_df, new_df]).reset_index(drop=True)
    ids = [str(i) for i in movies_df.index]
    movies_df.insert(0, 'id', ids)
    movies_melted = gl.SFrame(pd.melt(movies_df, id_vars='id', value_vars=value_vars)).dropna()
    movies_rec = gl.factorization_recommender.create(movies_melted, user_id='id', item_id='variable', target='value', num_factors=n_features)
    movies_user_intercept, movies_user_factors, _, _, movies_intercept = get_rec_coeffs(movies_rec)
    comb = np.dot(np.array(movies_user_factors)[-1], np.array(songs_item_factors).T)
    return songs_df.columns[1:][np.argsort(comb)[::-1]]
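The helper get_rec_coeffs is not defined in this listing (it is also used in Example No. 39 below). A minimal sketch of what it might look like, assuming it simply unpacks the recommender's coefficients dictionary in the order (user intercepts, user factors, item intercepts, item factors, global intercept); the default column names mirror the user_id='id', item_id='variable' arguments used above:

def get_rec_coeffs(model, user_col='id', item_col='variable'):
    # model.coefficients is a dict keyed by the user and item column names plus
    # 'intercept'; each per-column entry is an SFrame with 'linear_terms'
    # (per-row intercepts) and 'factors' (latent factor vectors).
    coeffs = model.coefficients
    user_intercept = coeffs[user_col]['linear_terms']
    user_factors = coeffs[user_col]['factors']
    item_intercept = coeffs[item_col]['linear_terms']
    item_factors = coeffs[item_col]['factors']
    return user_intercept, user_factors, item_intercept, item_factors, coeffs['intercept']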
Example No. 3
def read_data():
    # Load image analysis datasets.
    # Data was reduced to 6 categories in 3 groups: phones, home
    # (Furniture, Household, Home & Garden), apparel
    # (Baby & Kids, Clothing & Shoes).

    phones_set = graphlab.load_sframe(DATA_PATH + 'phones_with_ids')
    home_set = graphlab.load_sframe(DATA_PATH + 'home_with_ids')
    apparel_set = graphlab.load_sframe(DATA_PATH + 'apparel_with_ids')

    return phones_set, home_set, apparel_set
Example No. 4
	def __init__(self,version,session_id,diff):
		if(int(version) == 2):
			sf = gl.load_sframe("data/py2_session_clean")
		else:
			sf = gl.load_sframe("data/py3_session_clean")

		self.session = sf.filter_by([int(session_id)],"session_id")
		self.python_version = version
		self.session_id = session_id
		

		if(diff == "Full"):
			self.diffs = False
		else:
			self.diffs = True
Example No. 5
def pagerank():
    sf = graphlab.load_sframe('/users/erwin/work/ml_datasets/freebase_performances.csv')
    print sf
    g = graphlab.SGraph()
    g = g.add_edges(sf, 'actor_name', 'film_name')
    pr = graphlab.pagerank.create(g)
    print(pr.get('pagerank').topk(column_name='pagerank'))
Example No. 6
def load_data():
    data = None
    if os.path.exists(DATA_SFRAME):
        print "\n----- Loading SFrame -----"
        print " filename: ", DATA_SFRAME
        print "--------------------------\n"
        data = gl.load_sframe(DATA_SFRAME)
    else:
        print "\n----- Creating SFrame -----"
        print " filename: ", DATA_FILE
        print "---------------------------\n"
        raw_data = {'user_id': [], 'item_id': [], 'rating': []}
        with open(DATA_FILE, 'r') as f:
            lines = f.readlines()[0].split('\r')
            for line in lines:
                raw_data['user_id'].append(int(line.split('\t')[0]))
                raw_data['item_id'].append(int(line.split('\t')[1]))
                raw_data['rating'].append(int(line.split('\t')[2]))
        data = gl.SFrame(raw_data)
        print "\n----- Saving SFrame -----"
        print " filename: ", DATA_SFRAME
        print "-------------------------\n"
        data.save(DATA_SFRAME)

    return data
Example No. 7
 def predict(self, ratings, user, n=5):
     book_data = gl.load_sframe("./book_data_clean/")
     ids_ratings = self.getRecommendations(ratings, user, n+50)
     #list storing details of recommended books
     list_of_books = []
     list_of_ids = []
 
     # Search for a book by its id in book_data and append all its details along with the rating to list_of_books
     count = 0
     for item in ids_ratings:
         if count == n: break
         # if book details not present in book_data, skip over to next until (n) books are appended to list
         if item[0] not in book_data["book_id"]: continue
         
         count += 1
         book = book_data[book_data["book_id"] == item[0]][0]
         if item[1] > 10:
             book["rating"] = 10
         else:
             book["rating"] = item[1]
         # append id to another list and delete book id from dictionary
         list_of_ids.append(book["book_id"])
         del(book["book_id"])
         del(book["rating"])
         list_of_books.append(book)
     
     return list_of_books[0:n], list_of_ids[0:n]
Example No. 8
def load_data():
    data = None
    if os.path.exists(DATA_SFRAME):
        print "\n----- Loading SFrame -----"
        print " filename: ", DATA_SFRAME
        print "--------------------------\n"
        data = gl.load_sframe(DATA_SFRAME)
    else:
        print "\n----- Creating SFrame -----"
        print " filename: ", DATA_FILE
        print "---------------------------\n"
        raw_data = {'user_id': [], 'item_id': [], 'rating': []}
        with open(DATA_FILE, 'r') as f:
            lines = f.readlines()[0].split('\r')
            for line in lines:
                raw_data['user_id'].append(int(line.split('\t')[0]))
                raw_data['item_id'].append(int(line.split('\t')[1]))
                raw_data['rating'].append(int(line.split('\t')[2]))
        data = gl.SFrame(raw_data)
        print "\n----- Saving SFrame -----"
        print " filename: ", DATA_SFRAME
        print "-------------------------\n"
        data.save(DATA_SFRAME)

    return data
Example No. 9
    def __load_data_structure__(self, filepath):
        """Return the data structure if it can be loaded; otherwise return None and log a message"""
        # The type is unknown, so try each supported loader in turn, swallowing exceptions
        obj = None
        try:
            obj = _gl.load_sframe(filepath)
            return obj
        except:
            pass
        try:
            obj = _gl.load_sgraph(filepath)
            return obj
        except:
            pass
        
        try:
            obj = _gl.load_model(filepath)
            return obj
        except:
            pass

        try:
            obj = _gl.SArray(data=filepath)
            return obj
        except:
            pass

        __LOGGER__.debug("Unable to load dependency, unsupported type at path: %s" % filepath)
        return None
Example No. 10
def main(args):
    # Load the dataset.
    sf = gl.load_sframe(args.dataset)

    # Get the pos_tag count column names.
    tag_cols = [i for i in sf.column_names() if i.startswith('pos_count')]
    
    # Set up some distance metrics
    dists = [[('unigrams', 'bigrams'), 'jaccard', 1],
             [('pos_bigrams',), 'weighted_jaccard', 1],
             [('doc_vecs',), 'cosine', 1],
             [tuple(['time'] + tag_cols), 'euclidean', 1]
            ]
    feats = []
    for dist_spec in dists:
        feats.extend(list(dist_spec[0]))
    # If a valid sample size was provided
    # then replace the full dataset with a sample.
    if 0. < args.sample_size < 1.:
        sf = sf.sample(args.sample_size)

    # Create and fit the model.
    nnh = NNGraphHierarchy()
    nnh.fit(
        sf,
        label=args.label,
        features=feats,
        dist=dists,
        split_column=args.split_column,
        window_size=args.win_size,
        window_offset=args.win_offset,
        path=args.output,
        quantile=args.quantile,
        k=args.num_neighbors,
        radius=args.radius,
    )

    # Save the results.
    nnh.sf.save(args.output)
    nnh.g.save(args.output+'.graph')

    # If a path to rumor-related tweets was provided
    # then run an analysis of rumor-tweet distribution
    # across top-level components.
    if args.rel_path:
        # Load the list of related tweet ids for each rumor.
        related = gl.SFrame.read_csv(args.rel_path)
        rumor_report = rumor_component_distribution(
            nnh.sf,
            related,
        )
        rumor_report.save(args.output + 'rumor_report.csv', format='csv')

    # Save a report containing various information about
    # the top-level components.
    #hier_report = top_level_report(nnh.sf)
    #hier_report.save(args.output + '_hier_report.csv')

    print 'Success!'
    exit()
Example No. 11
def loadData():
	edgesData = gl.load_sframe(sframeDataFolder)
	print 'num_rows:%d ' %edgesData.num_rows()

	#create graph
	G = gl.SGraph()
	G = G.add_edges(edges = edgesData, src_field ='src',dst_field = 'dst')
	print 'create graph done!'
	return G
Example No. 12
def load_all_sframes(repos_path, edges_path, nn_items_path=None, nn_text_path=None):
    """
    loads all the precomputed sframes, which are the following:
    repos: sframe containing repo_name as unique id, readme, language, watchers, etc...
    edges: sframe containing the edges between repos and watchers, along with the weights
    nn_items: precomputed nearest neighbors for all the repos
    """
    repos = gl.load_sframe(repos_path)
    edges = gl.load_sframe(edges_path)
    nn_items = None
    nn_text = None
    if nn_items_path:
        nn_items = gl.load_sframe(nn_items_path)

    if nn_text_path:
        nn_text = gl.load_sframe(nn_text_path)

    return repos, edges, nn_items, nn_text
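A call sketch for load_all_sframes with hypothetical paths; the two nearest-neighbour SFrames are optional and default to None:

repos, edges, nn_items, nn_text = load_all_sframes(
    'data/repos_sframe',
    'data/edges_sframe',
    nn_items_path='data/nn_items_sframe')
# nn_text is None here because nn_text_path was not supplied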
Example No. 13
def build_docs_for_modeling(in_docs, sframe_raw_filename):
    # Remove stop words and convert to bag of words
    in_docs = gl.text_analytics.count_words(in_docs['X1'])
    in_docs = in_docs.dict_trim_by_keys(gl.text_analytics.stopwords(),
                                        exclude=True)
    freq_words = get_freq_words(gl.load_sframe(sframe_raw_filename))
    in_docs = in_docs.dict_trim_by_keys(freq_words['word'], exclude=False)
    in_docs = in_docs.dict_trim_by_keys(['information', 'data', 'privacy'],
                                        exclude=True)
    return in_docs
Example No. 14
def main():
    
    loaded_frame = graphlab.load_sframe("py2_session_clean")

    check = [25]
    
    test = loaded_frame.filter_by(check, 'session_id')
    
    for e in test:
        print e
Example No. 15
def comments_sentimenting(book_id):

    comments_data = graphlab.load_sframe('helper/coeffi_comments_data')
    sentiment_model = graphlab.load_model(
        'helper/books_comments_sentiment_model')
    commentsFromABook = comments_data[comments_data['book_id'] == int(book_id)]
    commentsFromABook['predicted_sentiment'] = sentiment_model.predict(
        commentsFromABook, output_type='probability')
    # comments_data['predicted_sentiment'] = sentiment_model.predict(comments_data, output_type='probability')
    return commentsFromABook.sort('created_time', ascending=True)
Example No. 16
 def from_previous_reduction(cls, input_dir):
     parent = gl.load_sgraph(input_dir+'parent')
     verticy_descriptions = gl.load_sframe(input_dir+'verticy_descriptions')
     child = gl.load_sgraph(input_dir+'child')
     gw = cls()
     gw.g = parent
     gw.verticy_descriptions = verticy_descriptions
     gw.child = cls()
     gw.child.g = child
     return gw
Example No. 17
 def test_exception(self):
     self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("/root/tmp"))
     self.assertRaises(IOError, lambda: glconnect.get_unity().__write__("/root/tmp", '.....'))
     self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("/root/tmp"))
     self.assertRaises(IOError, lambda: glconnect.get_unity().__write__("/root/tmp", '.....'))
     self.assertRaises(IOError, lambda: self.graph.save("/root/tmp.graph"))
     self.assertRaises(IOError, lambda: self.sframe.save("/root/tmp.frame_idx"))
     self.assertRaises(IOError, lambda: self.model.save("/root/tmp.model"))
     self.assertRaises(IOError, lambda: graphlab.load_graph("/root/tmp.graph"))
     self.assertRaises(IOError, lambda: graphlab.load_sframe("/root/tmp.frame_idx"))
     self.assertRaises(IOError, lambda: graphlab.load_model("/root/tmp.model"))
Example No. 18
def save_positive_results_with_event_type_and_date(result_dataset):
    csvfile = "classification/data/extraction_fields.tsv"
    with codecs.open(csvfile, "r", encoding="utf8") as infile:
        lines = infile.readlines()
    #types = set()
    #for line in lines[5:]:
    #        types.add(line.split(",")[2].strip().lower())

    sf = gl.load_sframe("graphlab/my_training_dataset")

    # lines[-1].split("\t") = [u'620', u'E1', u'protest', u'', u'', u'T2', u'NGO' ...]
    size = int(lines[-1].split("\t")[0]) + 1  # latest news index is 620; indices start at 0, so size is 620 + 1
    labels = [0] * size

    for line in lines:
        fields = line.split("\t")
        key = fields[2].strip().lower()
        if key:
            ind = int(fields[0].strip())
            labels[ind] = types[key]

    #rel_folder="classification/data/v6/class_rel/"
    ef = sf.filter_by([1], "rel")  # add_arguments(None,rel_folder,1,vec_model)

    ef['event_type'] = ef['filenames'].apply(lambda p: labels[int(p[1:5])])

    # event type classifier
    event_type_cls = gl.classifier.create(
        ef, target="event_type", features=['vectors', '1gram features'])

    pos_results = result_dataset.filter_by([1], "class")

    pos_res_res = event_type_cls.classify(pos_results)

    pos_results.add_column(pos_res_res.select_column("class"), "event_type")
    pos_results.add_column(pos_res_res.select_column("probability"),
                           "et_probability")

    pos_results.filter_by([5], "event_type")

    pos_results['date'] = pos_results['filenames'].apply(
        lambda x: x[:-5].split('_'))
    pos_results = pos_results.unpack('date')
    pos_results.rename({
        'date.0': 'year',
        'date.1': 'month',
        'date.2': 'day',
        'date.3': 'index'
    })
    pos_results['year'] = pos_results['year'].apply(
        lambda year_str: int(year_str))
    pos_results['month'] = pos_results['month'].apply(lambda m_str: int(m_str))
    pos_results.save("graphlab/pos_results")  ##_2005")
Example No. 19
 def from_previous_reduction(cls, input_dir):
     parent = gl.load_sgraph(input_dir + 'parent')
     verticy_descriptions = gl.load_sframe(input_dir +
                                           'verticy_descriptions')
     child = gl.load_sgraph(input_dir + 'child')
     gw = cls()
     gw.g = parent
     gw.verticy_descriptions = verticy_descriptions
     gw.child = cls()
     gw.child.g = child
     return gw
Example No. 20
def process_frame(filename):
    sf = gl.load_sframe(filename)
    
    output_frame = SFrame()
    
    #Setup our output frame
    id = []
    ip = []
    sub_count = []
    error_count = []
    time_count = []
    error_sequence_raw = []
    error_sequence = []
    
    #How many session ID's do we have?
    sa = gl.SArray()
    sa = sf['session_id']
    test = sa.unique()
    
    limit = len(test)
    
    #Start grabbing each session
    for i in range(1,limit):
        
        #Test output
        if (i % 100 == 0):   
            break 
        
        #Get the session and sort it by the date time
        session_frame = sf.filter_by([i], "session_id")
        #sorted_session = session_frame.sort("dt")

        # take the first row of this session's frame
        row = session_frame[0]

        id += [i]
        ip += [row['ip']]
        sub_count += [len(session_frame)]
        #time_count += [fn_time_count(sorted_session)]
        #error_count += [fn_error_count(sorted_session)]
        #error_sequence_raw += [fn_error_sequence_raw(sorted_session)]
    
    print len(id)
    print len(ip)
    print len(sub_count)
    #print len(time_count)
    
    output_frame = output_frame.add_column(SArray(id), name='id')
    output_frame.add_column(SArray(ip), name='ip')
    output_frame.add_column(SArray(sub_count),name='sub_count')
    #output_frame.add_column(SArray(time_count),name='sub_length')
    #output_frame.add_column(SArray(error_count),name='error_count')
    #output_frame.add_column(SArray(error_sequence_raw,dtype=str),name='err_seq_raw')

    output_frame.save('py2_session_analysis')
Example No. 21
 def _load_graphlab_object(cls, obj_type, obj_path):
   if obj_type == 'model':
     return graphlab.load_model(obj_path)
   elif obj_type == 'sarray':
     return graphlab.SArray(obj_path)
   elif obj_type == 'sframe':
     return graphlab.load_sframe(obj_path)
   elif obj_type == 'sgraph':
     return graphlab.load_sgraph(obj_path)
   else:
     raise RuntimeError(str(obj_type) + ' is not supported')
Example No. 22
def load(infile):
  '''
  Reads a binary format SFrame from GL_DATA/
  
  args:
    infile - name of a graphlab binary to read from GL_DATA/
  
  return:
    the SFrame stored at GL_DATA/infile  
  '''
  path = os.path.join(GL_DATA, infile)
  return gl.load_sframe(path)
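A call sketch with a hypothetical file name:

ratings = load('ratings_train')  # returns the SFrame stored at GL_DATA/ratings_train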
Example No. 23
 def test_exception(self):
     self.assertRaises(ValueError, lambda: self._test_read_write_helper(self.tempfile, 'hello world'))
     self.assertRaises(ValueError, lambda: self._test_read_write_helper("local://" + self.tempfile + ".csv.gz", 'hello,world,woof'))
     self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("remote:///root/tmp"))
     self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("remote:///root/tmp"))
     self.assertRaises(IOError, lambda: glconnect.get_unity().__write__("remote:///root/tmp", '.....'))
     self.assertRaises(IOError, lambda: self.graph.save("remote:///root/tmp.graph"))
     self.assertRaises(IOError, lambda: self.sframe.save("remote:///root/tmp.frame_idx"))
     self.assertRaises(IOError, lambda: self.model.save("remote:///root/tmp.model"))
     self.assertRaises(IOError, lambda: graphlab.load_graph("remote:///root/tmp.graph"))
     self.assertRaises(IOError, lambda: graphlab.load_sframe("remote:///root/tmp.frame_idx"))
     self.assertRaises(IOError, lambda: graphlab.load_model("remote:///root/tmp.model"))
Example No. 24
def loadSubData():
	sframeFiles = os.listdir('/home/lt/sframe')
	edgesData = gl.SFrame()
	for sf in sframeFiles:
		edgesData = edgesData.append(gl.load_sframe(os.path.join('/home/lt/sframe', sf)))
	edgesData.rename({'X1':'src','X2':'dst'})

	#create graph
	G = gl.SGraph()
	G = G.add_edges(edges = edgesData, src_field ='src',dst_field = 'dst')
	print 'create graph done!'
	return G
Example No. 25
def load(infile):
  '''
  Reads a binary format SFrame from GL_DATA/
  
  args:
    infile - name of a graphlab binary to read from GL_DATA/
  
  return:
    the SFrame stored at GL_DATA/infile  
  '''
  path = os.path.join(GL_DATA, infile)
  return gl.load_sframe(path)
Example No. 26
def main():
    parser = argparse.ArgumentParser(description="Classifies given dataset and saves the results.")
    parser.add_argument("--dataset_dir", required=False, default=None, type=str,
                        help="Dataset directory ex: my_dataset_test or my_dataset")
    parser.add_argument("--classified_dir", required=True, default=None, type=str,
                        help="Directory for dataset after classification ex: result_dataset")
    parser.add_argument("--print", required=False, action='store_true', dest='print_results',
                        help="")

    args = parser.parse_args()
    if args.dataset_dir:
        vec_model = word2vec.Word2Vec.load_word2vec_format('word2vec_model.txt',binary=False)
        cls = gl.load_model("graphlab/my_classifier")
        dataset = gl.load_sframe(args.dataset_dir)
        result171_dataset = test_classifier(cls,dataset,vec_model)
        dataset.add_column(result171_dataset.select_column("class"),"class")
        dataset.add_column(result171_dataset.select_column("probability"),"probability")
        dataset.save(args.classified_dir)
    elif args.classified_dir:
        result171_dataset = gl.load_sframe(args.classified_dir)
    if args.print_results:
        print_positives_and_confidence(result171_dataset,result171_dataset)
Example No. 27
def main():
    parser = argparse.ArgumentParser(description="Classifies given dataset and saves the results.")
    parser.add_argument("--classified_dir", required=False, default=None, type=str,
                        help="Directory for dataset after classification ex: result_dataset")
    parser.add_argument("--print", required=False, action='store_true', dest="print_results", help="")
    parser.add_argument("--pprint", required=False, default=10, type=int, dest="print_pretty", help="")

    args = parser.parse_args()
    if args.classified_dir:
        result_dataset = gl.load_sframe(args.classified_dir)
        save_positive_results_with_event_type_and_date(result_dataset)
    if args.print_results:
        pos_results = gl.load_sframe("graphlab/pos_results")
        sframe = count_monthly(pos_results)
        sframe.print_rows(sframe.shape[0])
    if args.print_pretty < 10:
        pos_results = gl.load_sframe("graphlab/pos_results")
        my_dict = get_count_dict(pos_results)

        if args.print_pretty == 0:
            print("\n".join(["%d-%d %s" %(year,month, " ".join([str(l) for l in my_dict[year][month]]) ) for year in pos_results['year'].unique().sort()  for month in range(1,13) ]))
        elif args.print_pretty == 1:
            count_dict = get_norm_dict(pos_results,events = [3,5])
            print("\n".join(["%d-%d %.4f %.4f %.4f" %(year,month,
                                                  my_dict[year][month][3]/count_dict[3][year],
                                                  my_dict[year][month][5]/count_dict[5][year],
                                                  sum(my_dict[year][month])/count_dict['total'][year])
                         for month in range(1,13) for year in pos_results['year'].unique()]))
        elif args.print_pretty == 2:
            count_dict = get_norm_dict(pos_results)
            print("\n".join(["%d-%d %.4f %.4f %.4f %.4f %.4f %.4f %.4f" %(year,month,
                                                  my_dict[year][month][0]/count_dict[0][year],
                                                  my_dict[year][month][1]/count_dict[1][year],
                                                  my_dict[year][month][2]/count_dict[2][year],
                                                  my_dict[year][month][3]/count_dict[3][year],
                                                  my_dict[year][month][4]/count_dict[4][year],
                                                  my_dict[year][month][5]/count_dict[5][year],
                                                  sum(my_dict[year][month])/count_dict['total'][year])
                             for month in range(1,13) for year in pos_results['year'].unique().sort()]))
Example No. 28
    def predict(self, location, age, search_over, n=3):
        # Load required models and data
        regression_model = gl.load_model("./regression_model_file/")
        book_data = gl.load_sframe("./book_data_clean/")
        implicit_data = gl.load_sframe("./implicit_rating_data/")
        # filter_by returns a new SFrame, so keep the result
        book_data = book_data.filter_by(implicit_data["book_id"], "book_id")
        
        # Select approx (search_over) books by splitting data RANDOMLY
        split = search_over/45000.0
        book_data, other_data = book_data.random_split(split)
        
        predicted_ratings = []
        count = 0
        for book in book_data:
            if count == search_over:
                break
            count += 1
            book["location"] = location
            book["age"] = age
            rating = regression_model.predict(book)[0]
            if rating >= 8.0:
                predicted_ratings.append((book["book_id"], rating))
    
        predicted_ratings = sorted(predicted_ratings, key=itemgetter(1), reverse=True)

        # Recommended books in decreasing order of rating
        recommended_books_id = []
        for i in range(5):
            recommended_books_id.append(predicted_ratings[i][0])

        recommended_books = []
        for book in recommended_books_id:
            for item in book_data:
                if book == item["book_id"]:
                    del(item["book_id"])
                    recommended_books.append(item)
                    break
        return recommended_books[0:n], recommended_books_id[0:n]
Example No. 29
 def __init__(self, sf_path=None, g_path=None, cache_max=0.75):
     self.sf = None
     self.label = None
     self.bin_sfs = None
     self.reps = gl.SArray(dtype=str)
     self.hier_graph = None
     self.num_bins = 0
     self.features = None
     self.distance = None
     self.cache_max = cache_max
     if g_path:
         self.g = gl.load_sgraph(g_path)
         self.sf = self.g.vertices
     elif sf_path:
         self.sf = gl.load_sframe(sf_path)
Example No. 30
 def test_exception(self):
     bad_url = "hdfs:///root/"
     if self.has_hdfs:
         self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("hdfs:///"))
         self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("hdfs:///tmp"))
         self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("hdfs://" + self.tempfile))
         self.assertRaises(IOError, lambda: glconnect.get_unity().__write__(bad_url + "/tmp", "somerandomcontent"))
         self.assertRaises(IOError, lambda: self.graph.save(bad_url + "x.graph"))
         self.assertRaises(IOError, lambda: self.sframe.save(bad_url + "x.frame_idx"))
         self.assertRaises(IOError, lambda: self.model.save(bad_url + "x.model"))
         self.assertRaises(IOError, lambda: graphlab.load_graph(bad_url + "mygraph"))
         self.assertRaises(IOError, lambda: graphlab.load_sframe(bad_url + "x.frame_idx"))
         self.assertRaises(IOError, lambda: graphlab.load_model(bad_url + "x.model"))
     else:
         logging.getLogger(__name__).info("No hdfs available. Test pass.")
Example No. 31
def test():
    sf = gl.load_sframe('sydney_processed')
    label = 'mongo_id'

    # Use half of the data.
    sf = sf.sample(0.5)

    # Run the algorithm
    nnh = NNGraphHierarchy()
    radius = nnh.find_radius(sf, label=label, z_val=1.)
    nnh.radius = radius
    nnh.fit(sf, label=label, split_column='time', num_bins=150)
    nnh.sf.save('final/final_results')
    accuracy_report(nnh.sf)
    print 'Success!'
    exit()
Example No. 32
def get_wines_for_movie(movie):
    path_to_wine = '/home/cully/Documents/capstone/data/gridsearch_sf'
    path_to_movies = '/home/cully/Documents/capstone/data/flask_movies_sf'
    wine_rec = gl.load_model(path_to_wine)
    movies_sf = gl.load_sframe(path_to_movies)
    cols = movies_sf.column_names()
    movies_df = movies_sf.to_dataframe()
    ids = [i for i in movies_df.index]
    movies_df.insert(0, 'id', ids)
    value_vars = [x for x in movies_df.columns if x != 'id']
    movies_melted = gl.SFrame(pd.melt(movies_df, id_vars='id', value_vars=value_vars)).dropna()
    movies_rec = gl.factorization_recommender.create(movies_melted, user_id='id', item_id='variable', target='value')
    movie_pos = movie_order_dict[movie]
    sims = pairwise_distances(np.array(movies_rec.coefficients['variable']['factors'])[movie_pos].reshape(1,-1), np.array(wine_rec.coefficients['wine_name']['factors'])[:,:8], metric='cosine')
    wine_names = np.array(wine_rec.coefficients['wine_name']['wine_name'])
    return wine_names[np.argsort(sims[0])[::-1]][:5]
Example No. 33
 def test_exception(self):
     if self.has_s3:
         bad_bucket = "i_am_a_bad_bucket"
         prefix = "s3://" + bad_bucket
         self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("s3:///"))
         self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("s3://" + self.standard_bucket + "/somerandomfile"))
         self.assertRaises(IOError, lambda: glconnect.get_unity().__read__("s3://" + "/somerandomfile"))
         self.assertRaises(IOError, lambda: glconnect.get_unity().__write__("s3://" + "/somerandomfile", "somerandomcontent"))
         self.assertRaises(IOError, lambda: glconnect.get_unity().__write__("s3://" + self.standard_bucket + "I'amABadUrl/", "somerandomcontent"))
         self.assertRaises(IOError, lambda: self.graph.save(prefix + "/x.graph"))
         self.assertRaises(IOError, lambda: self.sframe.save(prefix + "/x.frame_idx"))
         self.assertRaises(IOError, lambda: self.model.save(prefix + "/x.model"))
         self.assertRaises(IOError, lambda: graphlab.load_graph(prefix + "/x.graph"))
         self.assertRaises(IOError, lambda: graphlab.load_sframe(prefix + "/x.frame_idx"))
         self.assertRaises(IOError, lambda: graphlab.load_model(prefix + "/x.model"))
     else:
         logging.getLogger(__name__).info("No s3 bucket available. Test pass.")
Example No. 34
def loadNeigh_1():
    print 'loadNeighbor 1'
    zhima_usr = loadZhima()
    usr_id = zhima_usr['snwb']

    #add zhima 1_neighbor for subgraph
    subgraph_edges = gl.SFrame()
    sframeFiles = os.listdir(sFrameFolder)
    for sf in sframeFiles:
        edgesData = gl.load_sframe(os.path.join(sFrameFolder, sf))
        edgesData.rename({'X1': 'src', 'X2': 'dst'})
        zhima_neighbors = edgesData.filter_by(usr_id, 'src')
        subgraph_edges = subgraph_edges.append(zhima_neighbors)
        print sf

    #save subgraph
    subgraph_edges.save(os.path.join(resultDataFolder, 'subgraph_zhima_1'))
Example No. 35
def save_positive_results_with_event_type_and_date(result_dataset):
    csvfile = "classification/data/extraction_fields.tsv"
    with codecs.open(csvfile,"r",encoding="utf8") as infile:
        lines = infile.readlines()
    #types = set()
    #for line in lines[5:]:
    #        types.add(line.split(",")[2].strip().lower())

    sf = gl.load_sframe("graphlab/my_training_dataset")

    # lines[-1].split("\t") = [u'620', u'E1', u'protest', u'', u'', u'T2', u'NGO' ...]
    size = int(lines[-1].split("\t")[0]) + 1  # latest news index is 620; indices start at 0, so size is 620 + 1
    labels = [0]*size

    for line in lines:
        fields = line.split("\t")
        key = fields[2].strip().lower()
        if key:
            ind = int(fields[0].strip())
            labels[ind] = types[key]

    #rel_folder="classification/data/v6/class_rel/"
    ef = sf.filter_by([1], "rel") # add_arguments(None,rel_folder,1,vec_model)

    ef['event_type'] = ef['filenames'].apply(lambda p: labels[int(p[1:5])])

    # event type classifier
    event_type_cls = gl.classifier.create(ef, target="event_type",features=['vectors','1gram features'])

    pos_results = result_dataset.filter_by([1], "class")

    pos_res_res = event_type_cls.classify(pos_results)

    pos_results.add_column(pos_res_res.select_column("class"),"event_type")
    pos_results.add_column(pos_res_res.select_column("probability"),"et_probability")

    pos_results.filter_by([5],"event_type")


    pos_results['date'] = pos_results['filenames'].apply(lambda x: x[:-5].split('_'))
    pos_results = pos_results.unpack('date')
    pos_results.rename({'date.0':'year', 'date.1':'month','date.2':'day', 'date.3':'index'})
    pos_results['year'] = pos_results['year'].apply(lambda year_str : int(year_str))
    pos_results['month'] = pos_results['month'].apply(lambda m_str : int(m_str))
    pos_results.save("graphlab/pos_results") ##_2005")
Example No. 36
def createGraph():
    zhima_usr = loadZhima()
    zhima_usr.rename({'snwb': '_id'})

    subgraph_edges = gl.load_sframe(
        os.path.join(resultDataFolder, 'subgraph_zhima_2'))
    #create graph
    sub_G = gl.SGraph()
    sub_G = sub_G.add_edges(edges=subgraph_edges,
                            src_field='src',
                            dst_field='dst')

    #join label to vertices
    sub_G.vertices.join(zhima_usr, on='_id', how='left')
    # sub_G.vertices.head(5)

    print 'save graph'
    sub_G.save(os.path.join(resultDataFolder, 'subgraph_zhima'))
Example No. 37
def _test_save_load_object_helper(testcase, obj, url):
    """
    Helper function to test save and load a server side object to a given url.
    """
    def cleanup(url):
        """
        Remove the saved file from temp directory.
        """
        protocol = None
        path = None
        splits = url.split("://")
        if len(splits) > 1:
            protocol = splits[0]
            path = splits[1]
        else:
            path = url
        if not protocol or protocol == "local" or protocol == "remote":
            tempdir = tempfile.gettempdir()
            pattern = path + ".*"
            for f in os.listdir(tempdir):
                if re.search(pattern, f):
                    os.remove(os.path.join(tempdir, f))

    if isinstance(obj, graphlab.SGraph):
        obj.save(url + ".graph")
        newobj = graphlab.load_graph(url + ".graph")
        testcase.assertItemsEqual(obj.get_fields(), newobj.get_fields())
        testcase.assertDictEqual(obj.summary(), newobj.summary())
    elif isinstance(obj, graphlab.Model):
        obj.save(url + ".model")
        newobj = graphlab.load_model(url + ".model")
        testcase.assertItemsEqual(obj.list_fields(), newobj.list_fields())
        testcase.assertEqual(type(obj), type(newobj))
    elif isinstance(obj, graphlab.SFrame):
        obj.save(url + ".frame_idx")
        newobj = graphlab.load_sframe(url + ".frame_idx")
        testcase.assertEqual(obj.shape, newobj.shape)
        testcase.assertEqual(obj.column_names(), newobj.column_names())
        testcase.assertEqual(obj.column_types(), newobj.column_types())
        assert_frame_equal(obj.head(obj.num_rows()).to_dataframe(),
                           newobj.head(newobj.num_rows()).to_dataframe())
    else:
        raise TypeError
    cleanup(url)
Example No. 38
 def _get_frame(self, fname, url):
     if os.path.isdir(self.folder + fname + '.gl'):
         return gl.load_sframe(self.folder + fname + '.gl')
     else:
         if fname.endswith('.gz') and os.path.isfile(self.folder +
                                                     fname[:-3]):
             frame = gl.SFrame.read_csv(self.folder + fname[:-3],
                                        delimiter='\t')
         elif os.path.isfile(self.folder + fname):
             frame = gl.SFrame.read_csv(self.folder + fname, delimiter='\t')
         else:
             urllib2.urlopen(url)
             print 'Downloading data from STITCH:', fname
             with file(self.folder + fname, 'wb') as f:
                 f.write(urllib2.urlopen(url).read())
             frame = gl.SFrame.read_csv(self.folder + fname, delimiter='\t')
             os.remove(self.folder + fname)
         frame.save(self.folder + fname + '.gl')
         return frame
Example No. 39
def get_wine_recs(ratings):
    path_to_movies = '/home/cully/Documents/capstone/data/flask_movies_sf'
    path_to_wine = '/home/cully/Documents/capstone/data/gridsearch_sf'
    wine_rec = gl.load_model(path_to_wine)
    movies_sf = gl.load_sframe(path_to_movies)
    movies_df = movies_sf.to_dataframe()
    value_vars = [x for x in movies_df.columns if x != 'id']
    new_ratings = {movie_dict[name]:int(ratings[name]) for name in ratings}
    new_df = pd.DataFrame.from_dict([new_ratings], orient='columns').replace(-1, np.nan)
    movies_df = pd.concat([movies_df, new_df]).reset_index(drop=True)
    ids = [i for i in movies_df.index]
    movies_df.insert(0, 'id', ids)
    movies_melted = gl.SFrame(pd.melt(movies_df, id_vars='id', value_vars=value_vars)).dropna()
    movies_rec = gl.factorization_recommender.create(movies_melted, user_id = 'id', item_id='variable', target='value')
    movies_user_intercept, movies_user_factors, _, _, movies_intercept = get_rec_coeffs(movies_rec)
    wine_item_factors = np.array(wine_rec.coefficients['wine_name']['factors'])[:,:8]
    wine_names = np.array(wine_rec.coefficients['wine_name']['wine_name'])
    comb = np.dot(np.array(movies_user_factors[-1]), wine_item_factors.T)
    return wine_names[np.argsort(comb)[::-1]]
Example No. 40
    def test_graphlab_classifier(self):
        """ Test the graphlab classifier from file

        :return:
        """

        this_dir, _ = os.path.split(os.path.abspath(__file__))
        this_dir = os.path.abspath(this_dir)
        model_path = os.path.join(this_dir, 'data', 'gl_mdl')

        model = GraphLabClassifierFromFile(model_path)
        self.assertEqual(model._model.name(), 'NeuralNetClassifier')

        x = gl.load_sframe(os.path.join(this_dir, 'data', 'img_10'))
        pred = model.predict(x)
        self.assertEqual(len(pred), 10)
        pred_prob = model.predict_proba(x)
        rows, cols = pred_prob.shape
        self.assertEqual(rows, 10 * 7)
        self.assertEqual(cols, 3)
Example No. 41
def load_gl_object(filename):
    """
    Load a GLC datastructure from a filename.

    Parameters
    ----------
    filename : Filename for the archive

    Returns
    ----------
    The GLC object.

    """
    obj = None
    if not os.path.exists(filename):
        raise IOError("Loading error: %s is not a valid filename." % filename)

    try:
        obj = _gl.load_sframe(filename)
        return obj
    except:
        pass
    try:
        obj = _gl.load_sgraph(filename)
        return obj
    except:
        pass

    try:
        obj = _gl.load_model(filename)
        return obj
    except:
        pass

    try:
        obj = _gl.SArray(data=filename)
        return obj
    except:
        pass

    return obj
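A call sketch with a hypothetical path; whichever loader succeeds first determines the type of the returned object:

obj = load_gl_object('saved/my_artifact')
# obj is an SFrame, SGraph, model, or SArray, depending on what was saved at that path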
Example No. 42
    def __init__(self, dataframe):
        '''Must be run in a python 2 environment.
        INPUT: Cleaned and preprocessed pandas dataframe'''
        self.dataframe = dataframe
        #Graphlab LDA needs a bag of words dictionary for each document in the dataset.
        self.dataframe['bow'] = dataframe.ttl_ctxt.apply(
            lambda x: dict(Counter(x.split())))
        #Graphlab also requires an SFrame object (built from the dataframe, not loaded from a path)
        self.sframe = gl.SFrame(self.dataframe)
        self.bow = self.sframe['bow']

    def topic_modelling(self, n_topics, n_iterations):
        #Train Graphlab topic model
        topic_model = gl.topic_model.create(self.bow, num_topics=n_topics,
                                            num_iterations=n_iterations)
        return topic_model

    def lda_vis(self, topic_model):
        #Visualize graphlab topic model
        plt.figure()
        pyLDAvis.graphlab.prepare(topic_model, self.bow)
        plt.show()
Example No. 43
    def test_graphlab_classifier(self):
        """ Test the graphlab classifier from file

        :return:
        """

        this_dir, _ = os.path.split(os.path.abspath(__file__))
        this_dir = os.path.abspath(this_dir)
        model_path = os.path.join(this_dir, 'data', 'gl_mdl')

        model = GraphLabClassifierFromFile(model_path)
        self.assertEqual(model._model.name(),
                         'NeuralNetClassifier')

        x = gl.load_sframe(os.path.join(this_dir,
                                        'data',
                                        'img_10'))
        pred = model.predict(x)
        self.assertEqual(len(pred), 10)
        pred_prob = model.predict_proba(x)
        rows, cols = pred_prob.shape
        self.assertEqual(rows, 10 * 7)
        self.assertEqual(cols, 3)
Example No. 44
    if not isinstance(s3_bucket, boto.s3.bucket.Bucket):
        s3_bucket = S3Connection().get_bucket(s3_bucket)

    for _id, url in id_url_pairs:
        results.append(get_source(s3_bucket, s3_save_path, _id, url))

    return results

# Divvy up a list of items as evenly as possible into n lists
def divvy(items, n):
    q, r = divmod(len(items), n)
    indices = [q * i + min(i, r) for i in xrange(n + 1)]
    return [items[indices[i]:indices[i + 1]] for i in xrange(n)]
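A quick worked example of divvy (not part of the original script): splitting 10 items across the n=4 workers used below spreads the remainder over the first chunks.

# divvy(range(10), 4) -> [[0, 1, 2], [3, 4, 5], [6, 7], [8, 9]]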

# Load Hacker News metadata SFrame from S3
stories_sf = gl.load_sframe("s3://dato-datasets/hacker_news/stories.sframe")

# Get a list of ID, URL pairs from SFrame
id_url_pairs = [(x["id"], x["url"]) for x in stories_sf if x["url"]]

# Divvy the list of ID, URL pairs from above and pass to n=4 workers
chunks = divvy(id_url_pairs, 4)

# The S3 bucket and path to where source articles are to be stored in S3
# Set this to a bucket to which you have write access
s3_bucket = "my-bucket"
s3_save_path = "hacker_news/source_html"

# Specify EC2 execution environment
# The 2nd parameter should be set to an S3 bucket to which you will write logs
ec2 = gl.deploy.environment.EC2("ec2", "s3://my-bucket/logs")
Example No. 45
cursor = conn.cursor()
pcursor = conn2.cursor()
#
#
# cursor.execute("select owner_id,name,language,forked_from from projects where id=5")
#
# r=cursor.fetchone()
# print r[0]

# sf = graphlab.SFrame({'user_id': ["0", "0", "0", "1", "1", "2", "2", "2"],'item_id': ["a", "b", "c", "a", "b", "b", "c", "d"],'rating': [1, 3, 2, 5, 4, 1, 4, 3]})
# m = graphlab.recommender.create(sf, target='rating')
# model=gl.load_model('models/watcher_item')

# print 'loaded model',time()

sf = gl.load_sframe('csv/data')
pdata = gl.load_sframe('csv/project_data')
pdata = gl.SFrame({
    'item_id': pdata['item_id'],
    'language': pdata['language'],
    'name': pdata['name'],
    'description': pdata['description']
})
# print sf['user_id']
model = gl.recommender.ranking_factorization_recommender.create(
    sf, target='rating', item_data=pdata)
model.save('models/pdata_max')
# model=gl.load_model('models/rfm_rating')

while True:
    try:
Example No. 46
 def __init__(self):
     self.dbCon = None
     self.department_dist = {"departments": [], "orders": []}
     self.item_similarity_top_k = gl.load_sframe(
         '/home/ec2-user/insta/data/item_similarity_top_5_model')
Example No. 47
def load(name):
    return gl.load_sframe('data/%s_train.sframe' % name), \
        gl.load_sframe('data/%s_test.sframe' % name)
Example No. 48
import graphlab
import pandas as pd

binary_sf = graphlab.load_sframe('data/presence_absence.sframe/')
proportional_sf = graphlab.load_sframe('data/survey_proportion.sframe/')

proportional_df = proportional_sf.to_dataframe()
proportional_lookup_table = proportional_df.pivot_table(values='%present', index='site', columns='taxa', fill_value=0.)

binary_df = binary_sf.to_dataframe()
binary_lookup_table = binary_df.pivot_table(values='present', index='site', columns='taxa', fill_value=0)

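Once built, a single site/taxon cell can be read straight from either lookup table; the row and column labels below are hypothetical:

# fraction of surveys at a site in which a taxon was recorded
frac_present = proportional_lookup_table.loc['site_01', 'some_taxon']
# 0/1 presence flag for the same site/taxon pair
is_present = binary_lookup_table.loc['site_01', 'some_taxon']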
Example No. 49
#            else:
#                dtypes.pop()
#                chunk = gl.SFrame.read_csv(file_name, header=True, verbose=True, column_type_hints=dtypes)
#
#            whole = gl.SFrame(chunk)
#        else:
#            chunk = gl.SFrame.read_csv(file_name, header=True, verbose=True, column_type_hints=dtypes)
#            whole = whole.append(chunk)
#        print whole.shape
#
#    whole.save('~/datasets/Springleaf/' + prefix +'_binary')

#        unmatches = [i for i, j in zip(dtypes, tocheck) if i != j]
#'  '.join(data.column_names())

data = gl.load_sframe("~/datasets/Springleaf/train_binary")
test = gl.load_sframe("~/datasets/Springleaf/test_binary")

# row & column count
print "training data has " + str(data.shape[0]) + " rows and " + str(data.shape[1]) + " columns"
print "test data has " + str(test.shape[0]) + " rows and " + str(test.shape[1]) + " columns"

# Pre-process data
## add the target columns to test data as 9999
# test['target'] = 9999
#
## combine the training and test datasets for data preprocessing
# combined = data.append(test)

counter = 0
for i in data.column_names():
Example No. 50
import graphlab as gl
from graphlab import mxnet as mx
import os
##mx.pretrained_model.download_model('https://static.turi.com/models/mxnet_models/release/image_classifier/imagenet1k_inception_bn-1.0.tar.gz')

mx.pretrained_model.list_models()

image_classifier = mx.pretrained_model.load_model('imagenet1k_inception_bn',
                                                  ctx=mx.gpu(0))

# Load image data into SFrame
data_file = 'cats_dogs_sf'
if os.path.exists(data_file):
    sf = gl.load_sframe(data_file)
else:
    url = 'https://static.turi.com/datasets/' + data_file
    sf = gl.load_sframe(url)
    sf.save(data_file)

# Predict using the pretrained image classifier
prediction = image_classifier.predict_topk(sf['image'], k=1)

# Extract features from images
features = image_classifier.extract_features(sf['image'])
Example No. 51
        dog_info['dpt'] = dog_data[i]['refuge_name'][:2]
        dog_info['image_filename'] = dog_data[i]['image_filename']
        dog_info['name'] = dog_data[i]['name']
        dog_info['url'] = dog_data[i]['url']
        dog_json['dog' + str(i)] = dog_info
    response['response'] = dog_json

    return response


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Name of database')
    parser.add_argument('db_name')
    args = parser.parse_args()

    images = graphlab.load_sframe(path + 'my_images')
    model = graphlab.load_model(path + 'my_model')
    client = MongoClient("localhost")
    db = client.dogos
    db.dogos_temp.drop()

    # Issue the serverStatus command and print the results
    # serverStatusResult=db.command("serverStatus")
    # pprint(serverStatusResult)

    num_images = len(images)

    for dog_id in xrange(num_images):
        print "Query %d in %d" % (dog_id, num_images)
        dogo = images[dog_id:dog_id + 1]
        neighbours = query_model(dogo, model, images)
Example No. 52
def main():
    #sf = create_frame_from_file('../../Data/data_file_modified.txt')
    x = gl.load_sframe('py2_ready_for_session')
    sessions = create_sessions(x)
    sessions.save('session')