def load_dataset(filename):
    """ Load given file with SFrame """
    data_frame = sframe.SFrame()
    train_data = sframe.SFrame()
    dataset = data_frame.read_csv(filename, delimiter='|', header=False, nrows=100)
    train_data['topics'] = dataset['X1']
    train_data['content'] = dataset['X2']
    return train_data, dataset['X3']
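# Usage sketch (comments only): 'articles.psv' below is a hypothetical
# pipe-delimited file; X1 maps to 'topics', X2 to 'content', and the third
# column X3 is returned separately.
# train_data, third_column = load_dataset('articles.psv')
# print train_data.column_names()   # ['topics', 'content']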
def query(vec, model, k, max_search_radius):
    data = model['data']
    table = model['table']
    random_vectors = model['random_vectors']
    num_vector = random_vectors.shape[1]

    # Compute bin index for the query vector, in bit representation.
    bin_index_bits = (vec.dot(random_vectors) >= 0).flatten()

    # Search nearby bins and collect candidates
    candidate_set = set()
    for search_radius in xrange(max_search_radius + 1):
        candidate_set = search_nearby_bins(bin_index_bits, table, search_radius,
                                           initial_candidates=candidate_set)

    # Sort candidates by their true distances from the query
    nearest_neighbors = sframe.SFrame({'id': candidate_set})
    candidates = data[np.array(list(candidate_set)), :]
    nearest_neighbors['distance'] = pairwise_distances(candidates, vec, metric='cosine').flatten()

    return nearest_neighbors.topk('distance', k, reverse=True), len(candidate_set)
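# Usage sketch (comments only), assuming `model` was built by a train_lsh()-style
# helper and `corpus` is the sparse matrix it was trained on:
# result, num_candidates = query(corpus[doc_id], model, k=10, max_search_radius=3)
# print result   # SFrame with the k nearest ids and their cosine distances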
def test_zero_size_array(self):
    self.data = gl.SFrame()
    self.data['x'] = [array.array('d')] * 10
    it = mxnet.io.SFrameIter(self.data, data_field='x')
    data_actual = []
    for d in it:
        data_actual.extend(d.data[0].asnumpy().flatten())
    self.assertEquals(data_actual, [])
def test_variable_size_image(self):
    shape1 = (2, 3, 1)
    shape2 = (2, 2, 2)
    tmp1 = gl.SArray([array.array('d', [0] * 6)])
    tmp2 = gl.SArray([array.array('d', [0] * 8)])
    data = gl.SFrame({'x': [tmp1.pixel_array_to_image(*shape1)[0],
                            tmp2.pixel_array_to_image(*shape2)[0]]})
    it = mxnet.io.SFrameIter(data, data_field='x')
    # assertRaises needs the expected exception class; iterating over the
    # variable-size images is what should fail here.
    self.assertRaises(Exception, lambda: list(it))
def brute_force_query(vec, data, k):
    num_data_points = data.shape[0]

    # Compute distances for ALL data points in training set
    nearest_neighbors = sframe.SFrame({'id': range(num_data_points)})
    nearest_neighbors['distance'] = metrics.pairwise_distances(data, vec, metric='cosine').flatten()

    return nearest_neighbors.topk('distance', k, reverse=True)
def setUp(self):
    self.data = gl.SFrame({'x': [np.random.randn(8)] * 10,
                           'y': np.random.randint(2, size=10)})
    self.shape = (8,)
    self.label_field = 'y'
    self.data_field = 'x'
    self.data_size = len(self.data)
    self.data_expected = list(x for arr in self.data['x'] for x in arr)
    self.label_expected = list(self.data['y'])
def setUp(self):
    self.data = gl.SFrame({'x': np.random.randn(10),
                           'y': np.random.randint(2, size=10)})
    self.shape = (1,)
    self.label_field = 'y'
    self.data_field = 'x'
    self.data_size = len(self.data)
    self.data_expected = list(self.data['x'])
    self.label_expected = list(self.data['y'])
def setUp(self):
    (w, h, c, n) = (2, 4, 3, 100)
    self.images = [np.random.randint(256, size=(h, w, c)) for i in range(n)]
    self.data = gl.SFrame({'arr': [array.array('d', x.flatten()) for x in self.images],
                           'y': np.random.randint(2, size=n)})
    self.data['img'] = self.data['arr'].pixel_array_to_image(w, h, c)
    self.shape = (c, h, w)
    self.label_field = 'y'
    self.data_field = 'img'
    self.data_size = len(self.data)
    self.data_expected = list(x for arr in self.data['arr'] for x in arr)
    self.label_expected = list(self.data['y'])
def polynomial_sframe(feature, degree):
    # assume that degree >= 1 & initialize the SFrame
    poly_sframe = sframe.SFrame()
    # first degree
    poly_sframe['power_1'] = feature
    if degree > 1:
        # then loop over the remaining degrees:
        # range usually starts at 0 and stops at the endpoint-1.
        # We want it to start at 2 and stop at degree.
        for power in range(2, degree + 1):
            name = 'power_' + str(power)
            poly_sframe[name] = feature.apply(lambda x: x ** power)
    return poly_sframe
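# Quick self-contained check of polynomial_sframe (toy values; assumes
# `import sframe` at module level, as the function itself does):
tiny_feature = sframe.SArray([1., 2., 3.])
print polynomial_sframe(tiny_feature, 3)   # columns power_1, power_2, power_3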
def test_size1_array(self):
    # setup data
    self.data = gl.SFrame({'x': [np.random.randn(1)] * 10,
                           'y': np.random.randint(2, size=10)})
    self.shape = (1,)
    self.label_field = 'y'
    self.data_field = 'x'
    self.data_size = len(self.data)
    self.data_expected = list(x for arr in self.data['x'] for x in arr)
    self.label_expected = list(self.data['y'])
    self.test_one_batch()
    self.test_non_divisible_batch()
    self.test_padding()
    self.test_shape_inference()
def CreateDataFrame(csvFileName, protocol, sframe):
    if sframe:
        import sframe
        frameName = csvFileName.replace(".csv", "_SFRAME")
        dataframe = sframe.SFrame(csvFileName)  # create dataframe in SFrame
        dataframe.save(frameName)  # save sframe
        print dat(), "Creating SFRAME:", frameName
    else:
        import pandas
        frameName = csvFileName.replace(".csv", ".PANDAS")
        pDataframe = pandas.read_csv(csvFileName).fillna('N/A')  # create pandas dataframe
        pDataframe.to_pickle(frameName)  # save pandas dataframe
        print dat(), "Creating:", frameName
def read_and_clean_data():
    '''Reads the data and strips punctuation from the words,
       e.g. Hello! => Hello
    '''
    products = sframe.SFrame('amazon_baby.gl/')
    products['clean_review'] = products['review'].apply(remove_punctuation)
    # Filter out the neutral ratings; neutral ratings do not help much in the
    # learning process
    products = products[products['rating'] != 3]
    # Add a sentiment feature to indicate whether a review is positive or negative;
    # this will be our prediction target
    products['sentiment'] = products['rating'].apply(lambda rating:
                                                     1 if rating > 3 else -1)
    return products
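# Usage sketch (comments only; the split fraction and seed are illustrative):
# products = read_and_clean_data()
# train_data, test_data = products.random_split(.8, seed=1)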
def get_numpy_data(data_sframe, features, output):
    # add a constant column to an SFrame
    data_sframe['constant'] = 1
    # prepend variable 'constant' to the features list
    features = ['constant'] + features
    # select the columns of data_sframe given by the 'features' list into the
    # SFrame 'features_sframe'
    features_sframe = sframe.SFrame()
    features_sframe[features] = data_sframe[features]
    # this will convert the features_sframe into a numpy matrix with GraphLab Create >= 1.7!!
    features_matrix = features_sframe.to_numpy()
    # assign the column of data_sframe associated with the target to the variable 'output_sarray'
    output_sarray = data_sframe[output]
    # this will convert the SArray into a numpy array (GraphLab Create >= 1.7!!)
    output_array = output_sarray.to_numpy()
    return features_matrix, output_array
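# Usage sketch (comments only; `sales` is an assumed SFrame of house sales
# with 'sqft_living' and 'price' columns):
# (example_features, example_output) = get_numpy_data(sales, ['sqft_living'], 'price')
# print example_features[0:3, :]   # the first column is the constant of 1s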
def setUp(self):
    w = 2
    h = 3
    c = 1
    d = 6
    n = 5
    self.data = gl.SFrame({'arr': [array.array('d', range(0, 6)),
                                   array.array('d', range(50, 56)),
                                   array.array('d', range(100, 106)),
                                   array.array('d', range(200, 206)),
                                   array.array('d', range(249, 255))],
                           'y': np.random.randint(2, size=n)})
    self.data['img'] = self.data['arr'].pixel_array_to_image(w, h, c)
    self.shape = (c, h, w)
    self.label_field = 'y'
    self.data_field = 'img'
    self.data_size = len(self.data)
    self.data_expected = list(x for arr in self.data['arr'] for x in arr)
    self.label_expected = list(self.data['y'])
def setUp(self):
    self.data = gl.SFrame({'i': [x for x in range(10)],
                           '-i': [-x for x in range(10)],
                           'f': [float(x) for x in range(10)],
                           '-f': [-float(x) for x in range(10)],
                           'arr': [range(2) for x in range(10)],
                           'y': np.random.randint(2, size=10)})
    self.shape = (6,)
    self.label_field = 'y'
    self.data_field = ['i', '-i', 'f', '-f', 'arr']
    self.data_size = len(self.data)

    def val_iter():
        for row in self.data:
            for col in self.data_field:
                v = row[col]
                if type(v) is array.array:
                    for x in v:
                        yield x
                else:
                    yield float(v)

    self.data_expected = list(val_iter())
    self.label_expected = list(self.data['y'])
def lag(df, features, numlag, type='pd'):
    if len(features) == len(numlag):
        if type == 'sf':
            import sframe as SF
            sf = SF.SFrame(df)
            for j in range(0, len(features)):
                feature = features[j]
                column = sf[feature]
                for i in range(1, numlag[j] + 1):
                    lead = [0] * i
                    lead.extend(column[0:len(column) - i])
                    sf[feature + 'lag' + str(i)] = lead
            # drop the leading rows that only contain padding zeros
            sf = sf[max(numlag):len(sf)]
            return sf
        elif type == 'pd':
            import pandas as pd
            sf = pd.DataFrame(df)
            for j in range(0, len(features)):
                feature = features[j]
                column = sf[feature]
                for i in range(1, numlag[j] + 1):
                    lead = [0] * i
                    lead.extend(column[0:len(column) - i])
                    sf[feature + 'lag' + str(i)] = lead
            # drop the leading rows that only contain padding zeros
            sf = sf[max(numlag):len(sf)]
            return sf
    else:
        print('len(features) != len(numlag)')
        return 0
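# Quick check of lag() through its pandas branch (toy data, illustrative names);
# the result gains pricelag1/pricelag2 columns and drops the first max(numlag) rows:
import pandas as pd
toy = pd.DataFrame({'price': [10, 11, 12, 13, 14]})
print lag(toy, ['price'], [2], type='pd')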
import sframe

loans = sframe.SFrame('lending-club-data.gl/')

features = ['grade',           # grade of the loan
            'term',            # the term of the loan
            'home_ownership',  # home_ownership status: own, mortgage or rent
            'emp_length',      # number of years of employment
           ]
target = 'safe_loans'

loans['safe_loans'] = loans['bad_loans'].apply(lambda x: +1 if x == 0 else -1)
loans = loans.remove_column('bad_loans')

# Subsample dataset to make sure classes are balanced
safe_loans_raw = loans[loans[target] == 1]
risky_loans_raw = loans[loans[target] == -1]

# Since there are fewer risky loans than safe loans, find the ratio of the sizes
# and use that percentage to undersample the safe loans.
percentage = len(risky_loans_raw) / float(len(safe_loans_raw))
safe_loans = safe_loans_raw.sample(percentage, seed=1)
risky_loans = risky_loans_raw
loans_data = risky_loans.append(safe_loans)

print "Percentage of safe loans  :", len(safe_loans) / float(len(loans_data))
print "Percentage of risky loans :", len(risky_loans) / float(len(loans_data))
print "Total number of loans in our new dataset :", len(loans_data)

loans_data_features = loans_data[features]


def one_hot_normalize_to_columns(sfData):
    word_count_table = row[['tf_idf']].stack('tf_idf', new_column_name=['word', 'weight'])
    return word_count_table.sort('weight', ascending=False)


def has_top_words(word_count_vector):
    # extract the keys of word_count_vector and convert it to a set
    unique_words = set(word_count_vector.keys())
    # return True if common_words is a subset of unique_words
    # return False otherwise
    return common_words.issubset(unique_words)


# The corpus of all the Wikipedia data we will be working with
wiki = sframe.SFrame('..//w2-a1//people_wiki.gl/')
wiki = wiki.add_row_number()

# The word count of each article is provided, although it could be extracted
# by our own means (explore sklearn.CountVectorizer)
word_count = load_sparse_csr('..//w2-a1//people_wiki_word_count.npz')
map_index_to_word = sframe.SFrame('..//w2-a1//people_wiki_map_index_to_word.gl/')

# Now we will experiment with KNN, first with the raw word counts
model = NearestNeighbors(metric='euclidean', algorithm='brute')
model.fit(word_count)

# Look for the article most similar to Obama
obama_id = wiki[wiki['name'] == 'Barack Obama']['id'][0]
distances, indices = model.kneighbors(word_count[obama_id], n_neighbors=10)
Created on Sun Dec 17 11:37:56 2017

@author: Abhishek S
"""
import sframe as sf
import numpy as np
import sframe.aggregate as agg
import sklearn
import sklearn.ensemble
from sklearn.ensemble import GradientBoostingClassifier

loans = sf.SFrame('E:/Machine learning Classification/Week 5/lending-club-data.gl')
loans
loans.print_rows(5, 68)

dt = {0: 1, 1: -1}
loans['safe_loans'] = loans['bad_loans'].apply(lambda x: 1 if x == 0 else -1)
del loans['bad_loans']
loans

target = 'safe_loans'
features = ['grade',              # grade of the loan (categorical)
            'sub_grade_num',      # sub-grade of the loan as a number from 0 to 1
            'short_emp',          # one year or less of employment
            'emp_length_num',     # number of years of employment
            'home_ownership',     # home_ownership status: own, mortgage or rent
            'dti',                # debt to income ratio
            'purpose',            # the purpose of the loan
            'payment_inc_ratio',  # ratio of the monthly payment to income
            for i in xrange(num_doc)]


def top_words(name):
    """
    Get a table of the most frequent words in the given person's wikipedia page.
    """
    row = wiki[wiki['name'] == name]
    print row
    word_count_table = row[['word_count']].stack('word_count', new_column_name=['word', 'count'])
    return word_count_table.sort('count', ascending=False)


wiki = sframe.SFrame('people_wiki.gl/')
wiki = wiki.add_row_number()

word_count = load_sparse_csr('people_wiki_word_count.npz')
# print word_count[35817]

map_index_to_word = sframe.SFrame('people_wiki_map_index_to_word.gl/')

# model = NearestNeighbors(metric='euclidean', algorithm='brute')
# model.fit(word_count)
# print wiki[wiki['name'] == 'Barack Obama']
# distances, indices = model.kneighbors(word_count[35817], n_neighbors=10)  # 1st
# print distances, indices
# neighbors = sframe.SFrame({'distance': distances.flatten(), 'id': indices.flatten()})
# print wiki.join(neighbors, on='id').sort('distance')[['id', 'name', 'distance']]

wiki['word_count'] = unpack_dict(word_count, map_index_to_word)
# neogi - just numpy functions BEST ONE
# ramaranjanruj - uses sklearn
# corylstewart - uses graphlab
# justindomingue - very high level only functions

import pandas as pd
import sframe
import numpy as np
from sklearn.model_selection import train_test_split
from math import sqrt
#import graphlab

# import data
sales = sframe.SFrame(
    '/Users/davidbartram-shaw/Machine Learning Course/Course 2 - Regression/kc_house_data.gl/'
)
#sales=sframe.SFrame.to_dataframe(sales)

# Test/Train split
(train_and_validation, test) = sales.random_split(.8, seed=1)  # initial train/test split
(train, validation) = train_and_validation.random_split(.8, seed=1)  # split training set into training and validation sets

#################################################################
# REGRESSION - KNN REGRESSION & KERNELS
#################################################################

# optimize matrix operations
# brute force query, for comparison
def brute_force_query(vec, data, k):
    num_data_points = data.shape[0]

    # Compute distances for ALL data points in training set
    nearest_neighbors = sframe.SFrame({'id': range(num_data_points)})
    nearest_neighbors['distance'] = metrics.pairwise_distances(data, vec, metric='cosine').flatten()

    return nearest_neighbors.topk('distance', k, reverse=True)


# -------------------------------------------------------------------------------
# Test #
# -------------------------------------------------------------------------------
people = sframe.SFrame('data/people_wiki.gl')
people = people.add_row_number()

# preprocessed tf-idf vectors
words = load_sparse_csr('data/people_wiki_tf_idf.npz')
map_index_to_word = sframe.SFrame('data/people_wiki_map_index_to_word.gl')

model = train_lsh(words, num_vector=16, seed=143)

example_search = 'Alberto Contador'
example_search_document = people[people['name'] == example_search]
example_search_id = example_search_document['id'][0]
example_bin = [key for key, value in model['table'].iteritems()
               if example_search_id in value][0]
print('\nBin for example search({}): {}, content: {}\n'.format(
    example_search, example_bin, model['table'][example_bin]))
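# Comparison sketch (comments only): the LSH query() helper defined earlier in
# this project can be checked against brute force on the same document:
# lsh_nn, num_candidates = query(words[example_search_id], model, k=10, max_search_radius=3)
# exact_nn = brute_force_query(words[example_search_id], words, k=10)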
import string

import numpy as np
import sframe
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression


# Function to remove punctuation
def remove_punctuation(text):
    return string.translate(text, None, string.punctuation)


# Function to compute sigmoid response
def sigmoid(x):
    return 1.0 / (1 + np.exp(-x))


# Read data
products = sframe.SFrame('../data/Week01/amazon_baby.gl/')
products['review_clean'] = products['review'].apply(remove_punctuation)

# Discard ratings with value 3; these are treated as neither negative nor positive
products = products[products['rating'] != 3]
products['sentiment'] = products['rating'].apply(lambda rating: +1 if rating > 3 else -1)

# Split data into training and testing sets
train_data, test_data = products.random_split(fraction=0.8, seed=1)

# Create training and test matrices from the corresponding data using a CountVectorizer
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
test_matrix = vectorizer.transform(test_data['review_clean'])
words = vectorizer.get_feature_names()
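# A minimal continuation sketch: fit a sentiment classifier on the count matrix.
# Default sklearn settings are used here; the original script's exact model
# configuration is not shown in this fragment.
sentiment_model = LogisticRegression()
sentiment_model.fit(train_matrix, list(train_data['sentiment']))
print sentiment_model.score(test_matrix, list(test_data['sentiment']))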
#!/usr/bin/env python
# coding: utf-8

# In[10]:

import sframe


# # Read product review data

# In[23]:

products = sframe.SFrame('../Data/amazon_baby.gl')


# In[33]:

pwd  # IPython line magic (prints working directory); only valid inside a notebook


# In[34]:

products.save('../Data/products.csv', format='csv')


# In[2]:
import sframe
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

products = sframe.SFrame('amazon_baby.gl/')


def remove_punctuation(text):
    import string
    return text.translate(None, string.punctuation)


# method for printing out a confusion matrix
# In the case of binary classification, the confusion matrix is a 2-by-2 matrix
def print_confusion_matrix(y, y_hat, classifier):
    from sklearn.metrics import confusion_matrix
    # use the same order of classes as the LR model
    cmat = confusion_matrix(y_true=y, y_pred=y_hat, labels=classifier.classes_)
    print ' target_label | predicted_label | count '
    print '--------------+-----------------+-------'
    # Print out the confusion matrix.
    for i, target_label in enumerate(classifier.classes_):
        for j, predicted_label in enumerate(classifier.classes_):
            print '{0:^13} | {1:^15} | {2:5d}'.format(target_label, predicted_label, cmat[i, j])


def apply_threshold(probabilities, threshold):
    return probabilities.applymap(lambda x: 1 if x >= threshold else -1)  ## !!
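# Hypothetical usage sketch for print_confusion_matrix on a tiny toy problem
# (names and values below are illustrative, not from the original script):
toy_X = np.array([[0.], [1.], [2.], [3.]])
toy_y = np.array([-1, -1, 1, 1])
toy_clf = LogisticRegression().fit(toy_X, toy_y)
print_confusion_matrix(toy_y, toy_clf.predict(toy_X), toy_clf)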
import sframe
import json
import numpy as np
from math import sqrt

products = sframe.SFrame('amazon_baby_subset.gl/')

# For this assignment, we eliminated class imbalance by choosing a
# subset of the data with a similar number of positive and negative reviews.
print '# of positive reviews =', len(products[products['sentiment'] == 1])
print '# of negative reviews =', len(products[products['sentiment'] == -1])


def remove_punctuation(text):
    import string
    return text.translate(None, string.punctuation)


# the following function extracts columns from an SFrame and converts
# them into a NumPy array
#
# the feature matrix includes an additional column 'intercept'
# to take account of the intercept term - all 1s
def get_numpy_data(data_sframe, features, label):
    data_sframe['intercept'] = 1
    features = ['intercept'] + features
    features_sframe = data_sframe[features]
    feature_matrix = features_sframe.to_numpy()
    label_sarray = data_sframe[label]
    label_array = label_sarray.to_numpy()
    return (feature_matrix, label_array)
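# Usage sketch (comments only; `important_words` is an assumed list of words whose
# per-review counts have already been added as columns of `products`):
# feature_matrix, sentiment = get_numpy_data(products, important_words, 'sentiment')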
def test_variable_size_array(self):
    self.data = gl.SFrame({'x': [[0], [0, 1], [0, 1, 2]]})
    self.assertRaises(ValueError, lambda: mxnet.io.SFrameIter(self.data, data_field='x'))
# Function to compute accuracy
def accuracy(prediction, actual):
    """
    Purpose: Compute accuracy
    Input  : Predicted output values, true output values
    Output : Accuracy
    """
    prediction_correct = sum((actual == prediction) * 1.0)
    prediction_total = len(prediction)
    accuracy = prediction_correct / prediction_total
    return accuracy


# Read data
loans = sframe.SFrame('../data/Week05/lending-club-data.gl/')

# Preprocess data
loans['safe_loans'] = loans['bad_loans'].apply(lambda x: +1 if x == 0 else -1)
loans = loans.remove_column('bad_loans')

# Selected features
features = [
    'grade',              # grade of the loan (categorical)
    'sub_grade_num',      # sub-grade of the loan as a number from 0 to 1
    'short_emp',          # one year or less of employment
    'emp_length_num',     # number of years of employment
    'home_ownership',     # home_ownership status: own, mortgage or rent
    'dti',                # debt to income ratio
    'purpose',            # the purpose of the loan
    'payment_inc_ratio',  # ratio of the monthly payment to income
# Function to plot likelihood curves
def make_plot(log_likelihood_all, len_data, batch_size, smoothing_window=1, label=''):
    plt.rcParams.update({'figure.figsize': (9, 5)})
    log_likelihood_all_ma = np.convolve(np.array(log_likelihood_all),
                                        np.ones((smoothing_window,)) / smoothing_window,
                                        mode='valid')
    plt.plot(np.array(range(smoothing_window - 1, len(log_likelihood_all))) * float(batch_size) / len_data,
             log_likelihood_all_ma, linewidth=4.0, label=label)
    plt.rcParams.update({'font.size': 16})
    plt.tight_layout()
    plt.xlabel('# of passes over data')
    plt.ylabel('Average log likelihood per data point')
    plt.legend(loc='lower right', prop={'size': 14})


# Read data
products = sframe.SFrame('../data/Week02/amazon_baby_subset.gl/')

# Set of important words; these will be the features
list_of_words = ["baby", "one", "great", "love", "use", "would", "like", "easy", "little",
                 "seat", "old", "well", "get", "also", "really", "son", "time", "bought",
                 "product", "good", "daughter", "much", "loves", "stroller", "put", "months",
                 "car", "still", "back", "used", "recommend", "first", "even", "perfect",
                 "nice", "bag", "two", "using", "got", "fit", "around", "diaper", "enough",
                 "month", "price", "go", "could", "soft", "since", "buy", "room", "works",
                 "made", "child", "keep", "size", "small", "need", "year", "big", "make",
                 "take", "easily", "think", "crib", "clean", "way", "quality", "thing",
                 "better", "without", "set", "new", "every", "cute", "best", "bottles",
                 "work", "purchased", "right", "lot", "side", "happy", "comfortable", "toy",
                 "able", "kids", "bit", "night", "long", "fits", "see", "us", "another",
                 "play", "day", "money", "monitor", "tried", "thought", "never", "item",
                 "hard", "plastic", "however", "disappointed", "reviews", "something",
                 "going", "pump", "bottle", "cup", "waste", "return", "amazon", "different",
                 "top", "want", "problem", "know", "water", "try", "received", "sure",
                 "times", "chair", "find", "hold", "gate", "open", "bottom", "away",
                 "actually", "cheap", "worked", "getting", "ordered", "came", "milk", "bad",
                 "part", "worth", "found", "cover", "many", "design", "looking", "weeks",
                 "say", "wanted", "look", "place", "purchase", "looks", "second", "piece",
                 "box", "pretty", "trying", "difficult", "together", "though", "give",
                 "started", "anything", "last", "company", "come", "returned", "maybe",
                 "took", "broke", "makes", "stay", "instead", "idea", "head", "said", "less",
                 "went", "working", "high", "unit", "seems", "picture", "completely", "wish",
                 "buying", "babies", "won", "tub", "almost", "either"]

# The label
label = ['sentiment']

# Remove punctuation
products['review_clean'] = products['review'].apply(lr_mle_sg.remove_punctuation)

# For each important word, add a new column holding the count of that word in each review
for word in list_of_words:
    products[word] = products['review_clean'].apply(lambda x: x.split().count(word))

train_data, validation_data = products.random_split(.9, seed=1)
        'sentiment': 1
    },
    {
        'text': 'I hate this f*****g product. Piece of shit.',
        'sentiment': -1
    },
    {
        'text': 'I love this excellent product. Great great great',
        'sentiment': 1
    },
    {
        'text': 'Hate hate hate! Never again. Bad',
        'sentiment': 1
    },
    {
        'text': 'Bad product. Really bad. I hate it.',
        'sentiment': -1
    }]

sf = sframe.SFrame(train_dataset)
data = sf.unpack('X1')

count_vect = CountVectorizer()
counts = count_vect.fit_transform(data['X1.text'])

regression = LogisticRegression()
model = regression.fit(counts, data['X1.sentiment'])

predict_data = ["I love this product.", "This is a really bad product."]
predict_count_vects = CountVectorizer(vocabulary=count_vect.vocabulary_)
predict_counts = predict_count_vects.fit_transform(predict_data)
predictions = model.predict(predict_counts)

for text, prediction in zip(predict_data, predictions):