Example #1
def load_dataset(filename):
    """
    Load given file with SFrame
    """
    data_frame = sframe.SFrame()
    train_data = sframe.SFrame()
    dataset = data_frame.read_csv(filename,
                                  delimiter='|',
                                  header=False,
                                  nrows=100)
    train_data['topics'] = dataset['X1']
    train_data['content'] = dataset['X2']
    return train_data, dataset['X3']
Example #2
def query(vec, model, k, max_search_radius):

    data = model['data']
    table = model['table']
    random_vectors = model['random_vectors']
    num_vector = random_vectors.shape[1]

    # Compute bin index for the query vector, in bit representation.
    bin_index_bits = (vec.dot(random_vectors) >= 0).flatten()

    # Search nearby bins and collect candidates
    candidate_set = set()
    for search_radius in xrange(max_search_radius + 1):
        candidate_set = search_nearby_bins(bin_index_bits,
                                           table,
                                           search_radius,
                                           initial_candidates=candidate_set)

    # Sort candidates by their true distances from the query
    nearest_neighbors = sframe.SFrame({'id': candidate_set})
    candidates = data[np.array(list(candidate_set)), :]
    nearest_neighbors['distance'] = pairwise_distances(
        candidates, vec, metric='cosine').flatten()

    return nearest_neighbors.topk('distance', k,
                                  reverse=True), len(candidate_set)
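A minimal usage sketch for the query function above. The train_lsh and search_nearby_bins helpers and a sparse tf-idf matrix named corpus are assumed to exist elsewhere in the same assignment; the document id is only illustrative.

# Hypothetical usage of the LSH query above (corpus and train_lsh are assumptions).
model = train_lsh(corpus, num_vector=16, seed=143)
nearest, num_candidates = query(corpus[35817, :], model, k=10, max_search_radius=3)
# nearest is an SFrame of ids and cosine distances; num_candidates is the number
# of points whose true distance was actually computed.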
Example #3
 def test_zero_size_array(self):
     self.data = gl.SFrame()
     self.data['x'] = [array.array('d')] * 10
     it = mxnet.io.SFrameIter(self.data, data_field='x')
     data_actual = []
     for d in it:
         data_actual.extend(d.data[0].asnumpy().flatten())
     self.assertEquals(data_actual, [])
Example #4
 def test_variable_size_image(self):
     shape1 = (2, 3, 1)
     shape2 = (2, 2, 2)
     tmp1 = gl.SArray([array.array('d', [0] * 6)])
     tmp2 = gl.SArray([array.array('d', [0] * 8)])
     data = gl.SFrame({'x': [tmp1.pixel_array_to_image(*shape1)[0], tmp2.pixel_array_to_image(*shape2)[0]]})
     it = mxnet.io.SFrameIter(data, data_field='x')
     self.assertRaises(Exception, lambda: [d for d in it])
Example #5
def brute_force_query(vec, data, k):
    num_data_points = data.shape[0]
    
    # Compute distances for ALL data points in training set
    nearest_neighbors = sframe.SFrame({'id':range(num_data_points)})
    nearest_neighbors['distance'] = metrics.pairwise_distances(data, vec, metric='cosine').flatten()
    
    return nearest_neighbors.topk('distance', k, reverse=True)
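As a sketch only, the brute-force variant can serve as a ground truth to check approximate results; corpus is assumed to be the same tf-idf matrix used elsewhere, and sklearn.metrics is assumed to be imported as metrics.

# Hypothetical check: exact top-10 cosine neighbours of one document.
exact_top10 = brute_force_query(corpus[35817, :], corpus, k=10)
print(exact_top10)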
Example #6
 def setUp(self):
     self.data = gl.SFrame({'x': [np.random.randn(8)] * 10,
                           'y': np.random.randint(2, size=10)})
     self.shape = (8,)
     self.label_field = 'y'
     self.data_field = 'x'
     self.data_size = len(self.data)
     self.data_expected = list(x for arr in self.data['x'] for x in arr)
     self.label_expected = list(self.data['y'])
Example #7
 def setUp(self):
     self.data = gl.SFrame({'x': np.random.randn(10),
                           'y': np.random.randint(2, size=10)})
     self.shape = (1,)
     self.label_field = 'y'
     self.data_field = 'x'
     self.data_size = len(self.data)
     self.data_expected = list(self.data['x'])
     self.label_expected = list(self.data['y'])
Example #8
 def setUp(self):
     (w, h, c, n) = (2, 4, 3, 100)
     self.images = [np.random.randint(256, size=(h,w,c)) for i in range(n)]
     self.data = gl.SFrame({'arr': [array.array('d', x.flatten()) for x in self.images],
                           'y': np.random.randint(2, size=n)})
     self.data['img'] = self.data['arr'].pixel_array_to_image(w, h, c)
     self.shape = (c, h, w)
     self.label_field = 'y'
     self.data_field = 'img'
     self.data_size = len(self.data)
     self.data_expected = list(x for arr in self.data['arr'] for x in arr)
     self.label_expected = list(self.data['y'])
Example #9
def polynomial_sframe(feature, degree):
    # assume that degree >= 1 & initialize the SFrame
    poly_sframe = sframe.SFrame()
    # first degree
    poly_sframe['power_1'] = feature
    if degree > 1:
        # then loop over the remaining degrees:
        # range usually starts at 0 and stops at the endpoint-1. We want it to start at 2 and stop at degree
        for power in range(2, degree + 1):
            name = 'power_' + str(power)
            poly_sframe[name] = feature.apply(lambda x: x**power)
    return poly_sframe
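A short usage sketch, assuming sales is an SFrame with a numeric sqft_living column as in the course data.

# Hypothetical usage: build a cubic feature set from square footage.
poly3_data = polynomial_sframe(sales['sqft_living'], 3)
# poly3_data now holds the columns power_1, power_2 and power_3.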
Example #10
    def test_size1_array(self):
        # setup data
        self.data = gl.SFrame({'x': [np.random.randn(1)] * 10,
                              'y': np.random.randint(2, size=10)})
        self.shape = (1,)
        self.label_field = 'y'
        self.data_field = 'x'
        self.data_size = len(self.data)
        self.data_expected = list(x for arr in self.data['x'] for x in arr)
        self.label_expected = list(self.data['y'])

        self.test_one_batch()
        self.test_non_divisible_batch()
        self.test_padding()
        self.test_shape_inference()
Example #11
def CreateDataFrame(csvFileName, protocol, sframe):
    if sframe:
        import sframe

        frameName = csvFileName.replace(".csv", "_SFRAME")
        dataframe = sframe.SFrame(csvFileName)  #create dataframe in SFrame
        dataframe.save(frameName)  #save sframe
        print dat(), "Creating SFRAME:", frameName
    else:
        import pandas

        frameName = csvFileName.replace(".csv", ".PANDAS")
        pDataframe = pandas.read_csv(csvFileName).fillna(
            'N/A')  #create pandas dataframe
        pDataframe.to_pickle(frameName)  #save pandas dataframe
        print dat(), "Creating:", frameName
Example #12
def read_and_clean_data():
    '''Reads the data and cleans the review text by removing punctuation,
    e.g. Hello! => Hello
    '''
    products = sframe.SFrame('amazon_baby.gl/')
    products['clean_review'] = products['review'].apply(remove_punctuation)

    # Filter out the neutral (3-star) ratings; they do not help much in
    # the learning process
    products = products[products['rating'] != 3]

    # Add a sentiment feature indicating whether the review is positive or negative;
    # this will be our prediction target
    products['sentiment'] = products['rating'].apply(lambda rating : \
          1 if rating > 3 else -1)
    return products
Example #13
def get_numpy_data(data_sframe, features, output):
    # add a constant column to an SFrame
    data_sframe['constant'] = 1
    # prepend variable 'constant' to the features list
    features = ['constant'] + features
    # select the columns of data_SFrame given by the 'features' list into the SFrame 'features_sframe'
    features_sframe = sframe.SFrame()
    features_sframe[features] = data_sframe[features]
    # this will convert the features_sframe into a numpy matrix with GraphLab Create >= 1.7!!
    features_matrix = features_sframe.to_numpy()
    features_matrix = features_matrix
    # assign the column of data_sframe associated with the target to the variable 'output_sarray'
    output_sarray = data_sframe[output]
    # this will convert the SArray into a numpy array:
    output_array = output_sarray.to_numpy()  # GraphLab Create>= 1.7!!
    return features_matrix, output_array
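A hedged usage sketch; sales and the column names are assumptions taken from the house-price assignments.

# Hypothetical usage: constant term plus sqft_living against price.
(feature_matrix, output) = get_numpy_data(sales, ['sqft_living'], 'price')
# feature_matrix[:, 0] is all 1s (the constant); feature_matrix[:, 1] is sqft_living.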
Example #14
 def setUp(self):
     w = 2
     h = 3
     c = 1
     d = 6
     n = 5
     self.data = gl.SFrame({'arr': [array.array('d', range(0, 6)),
                                    array.array('d', range(50, 56)),
                                    array.array('d', range(100, 106)),
                                    array.array('d', range(200, 206)),
                                    array.array('d', range(249, 255))],
                           'y': np.random.randint(2, size=n)})
     self.data['img'] = self.data['arr'].pixel_array_to_image(w, h, c)
     self.shape = (c, h, w)
     self.label_field = 'y'
     self.data_field = 'img'
     self.data_size = len(self.data)
     self.data_expected = list(x for arr in self.data['arr'] for x in arr)
     self.label_expected = list(self.data['y'])
Example #15
 def setUp(self):
     self.data = gl.SFrame({'i': [x for x in range(10)],
                           '-i': [-x for x in range(10)],
                           'f': [float(x) for x in range(10)],
                           '-f': [-float(x) for x in range(10)],
                           'arr': [range(2) for x in range(10)],
                           'y': np.random.randint(2, size=10)})
     self.shape = (6,)
     self.label_field = 'y'
     self.data_field = ['i', '-i', 'f', '-f', 'arr']
     self.data_size = len(self.data)
     def val_iter():
         for row in self.data:
             for col in self.data_field:
                 v = row[col]
                 if type(v) is array.array:
                     for x in v:
                         yield x
                 else:
                     yield float(v)
     self.data_expected = list(val_iter())
     self.label_expected = list(self.data['y'])
Example #16
def lag(df, features, numlag, type='pd'):
    if len(features) == len(numlag):
        if type == 'sf':
            import sframe as SF

            sf = SF.SFrame(df)

            for j in range(0, len(features)):

                feature = features[j]
                column = sf[feature]

                for i in range(1, numlag[j] + 1):
                    lead = [0] * i
                    stri = str(i)
                    lead.extend(column[0:len(column) - i])
                    sf[feature + 'lag' + stri] = lead
            sf = sf[max(numlag):len(sf)]

            return (sf)
        elif type == 'pd':

            import pandas as pd
            sf = pd.DataFrame(df)
            for j in range(0, len(features)):
                feature = features[j]
                column = sf[feature]
                for i in range(1, numlag[j] + 1):
                    lead = [0] * i
                    stri = str(i)
                    lead.extend(column[0:len(column) - i])
                    sf[feature + 'lag' + stri] = lead
            sf = sf[max(numlag):len(sf)]
            return (sf)
    else:
        print('len(features) != len(numlag)')
        return (0)
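A small usage sketch, assuming df is a pandas DataFrame with a numeric price column (the names are illustrative only).

# Hypothetical usage: add price lagged by one and two steps, pandas backend.
lagged = lag(df, features=['price'], numlag=[2], type='pd')
# lagged contains the original columns plus 'pricelag1' and 'pricelag2',
# with the first max(numlag) rows dropped.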
Example #17
import sframe
loans = sframe.SFrame('lending-club-data.gl/')

features = ['grade',              # grade of the loan
            'term',               # the term of the loan
            'home_ownership',     # home_ownership status: own, mortgage or rent
            'emp_length',         # number of years of employment
           ]
target = 'safe_loans'

loans['safe_loans'] = loans['bad_loans'].apply(lambda x : +1 if x==0 else -1)
loans = loans.remove_column('bad_loans')

# Subsample dataset to make sure classes are balanced
safe_loans_raw = loans[loans[target] == 1]
risky_loans_raw = loans[loans[target] == -1]

# Since there are fewer risky loans than safe loans, find the ratio of the sizes
# and use that percentage to undersample the safe loans.
percentage = len(risky_loans_raw)/float(len(safe_loans_raw))
safe_loans = safe_loans_raw.sample(percentage, seed = 1)
risky_loans = risky_loans_raw
loans_data = risky_loans.append(safe_loans)

print "Percentage of safe loans                 :", len(safe_loans) / float(len(loans_data))
print "Percentage of risky loans                :", len(risky_loans) / float(len(loans_data))
print "Total number of loans in our new dataset :", len(loans_data)

loans_data_features = loans_data[features]

def one_hot_normalize_to_columns(sfData):
Example #18
    word_count_table = row[['tf_idf'
                            ]].stack('tf_idf',
                                     new_column_name=['word', 'weight'])
    return word_count_table.sort('weight', ascending=False)


def has_top_words(word_count_vector):
    # extract the keys of word_count_vector and convert it to a set
    unique_words = set(word_count_vector.keys())
    # return True if common_words is a subset of unique_words
    # return False otherwise
    return common_words.issubset(unique_words)
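has_top_words relies on a global common_words set; a hedged sketch of how it is typically applied, assuming wiki carries a dict-typed word_count column (for example the one built with unpack_dict in a later example).

# Hypothetical usage: flag every article containing all of the common words.
wiki['has_top_words'] = wiki['word_count'].apply(has_top_words)
print(wiki['has_top_words'].sum())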


# The corpus of all the Wikipedia data we will work with
wiki = sframe.SFrame('..//w2-a1//people_wiki.gl/')
wiki = wiki.add_row_number()

# The word count of each article is given to us, although it could be extracted by our own means (explore sklearn.CountVectorizer)
word_count = load_sparse_csr('..//w2-a1//people_wiki_word_count.npz')
map_index_to_word = sframe.SFrame(
    '..//w2-a1//people_wiki_map_index_to_word.gl/')

# Now we will experiment with KNN. First with the raw word counts
model = NearestNeighbors(metric='euclidean', algorithm='brute')
model.fit(word_count)

# Find the article most similar to Obama
obama_id = wiki[wiki['name'] == 'Barack Obama']['id'][0]

distances, indices = model.kneighbors(word_count[obama_id], n_neighbors=10)
Example #19
"""
Created on Sun Dec 17 11:37:56 2017

@author: Abhishek S
"""

import sframe as sf
import numpy as np
import sframe.aggregate as agg
import sklearn
import sklearn.ensemble
from sklearn.ensemble import GradientBoostingClassifier


 

loans=sf.SFrame('E:/Machine learning Classification/Week 5/lending-club-data.gl')
loans
loans.print_rows(5,68)
dt={0:1,1:-1}
loans['safe_loans']=loans['bad_loans'].apply(lambda x:1 if x==0 else -1)
del loans['bad_loans']
loans
target = 'safe_loans'
features = ['grade',                     # grade of the loan (categorical)
            'sub_grade_num',             # sub-grade of the loan as a number from 0 to 1
            'short_emp',                 # one year or less of employment
            'emp_length_num',            # number of years of employment
            'home_ownership',            # home_ownership status: own, mortgage or rent
            'dti',                       # debt to income ratio
            'purpose',                   # the purpose of the loan
            'payment_inc_ratio',         # ratio of the monthly payment to income
           ]

Example #20
def top_words(name):
    """
    Get a table of the most frequent words in the given person's wikipedia page.
    """
    row = wiki[wiki['name'] == name]
    print row
    word_count_table = row[['word_count'
                            ]].stack('word_count',
                                     new_column_name=['word', 'count'])
    return word_count_table.sort('count', ascending=False)


wiki = sframe.SFrame('people_wiki.gl/')
wiki = wiki.add_row_number()
word_count = load_sparse_csr('people_wiki_word_count.npz')
# print word_count[35817]
map_index_to_word = sframe.SFrame('people_wiki_map_index_to_word.gl/')
# model = NearestNeighbors(metric='euclidean', algorithm='brute')
# model.fit(word_count)
# print wiki[wiki['name'] == 'Barack Obama']

# distances, indices = model.kneighbors(word_count[35817], n_neighbors=10) # 1st

# print distances, indices

# neighbors = sframe.SFrame({'distance':distances.flatten(), 'id':indices.flatten()})
# print wiki.join(neighbors, on='id').sort('distance')[['id','name','distance']]
wiki['word_count'] = unpack_dict(word_count, map_index_to_word)
Example #21
# neogi - just numpy functions BEST ONE
# ramaranjanruj - uses sklearn
# corylstewart - uses graphlab
# justindomingue - very high level only functions

import pandas as pd
import sframe
import numpy as np
from sklearn.model_selection import train_test_split
from math import sqrt
#import graphlab

# import data
sales = sframe.SFrame(
    '/Users/davidbartram-shaw/Machine Learning Course/Course 2 - Regression/kc_house_data.gl/'
)
#sales=sframe.SFrame.to_dataframe(sales)

# Test/Train split
(train_and_validation,
 test) = sales.random_split(.8, seed=1)  # initial train/test split
(train, validation) = train_and_validation.random_split(
    .8, seed=1)  # split training set into training and validation sets

#################################################################
# REGRESSION - KNN REGRESSION & KERNELS
#################################################################


# optimize matrix operations
Example #22
# brute force query, for comparison 
def brute_force_query(vec, data, k):
    num_data_points = data.shape[0]
    
    # Compute distances for ALL data points in training set
    nearest_neighbors = sframe.SFrame({'id':range(num_data_points)})
    nearest_neighbors['distance'] = metrics.pairwise_distances(data, vec, metric='cosine').flatten()
    
    return nearest_neighbors.topk('distance', k, reverse=True)
  
# ------------------------------------------------------------------------------- 
#                      Test                                                     #
# -------------------------------------------------------------------------------


people = sframe.SFrame('data/people_wiki.gl')
people = people.add_row_number()

# preprocessed tf-idf vectors
words = load_sparse_csr('data/people_wiki_tf_idf.npz')

map_index_to_word = sframe.SFrame('data/people_wiki_map_index_to_word.gl')


model = train_lsh(words, num_vector=16, seed=143)

example_search = 'Alberto Contador'
example_search_document = people[people['name'] == example_search]
example_search_id = example_search_document['id'][0]
example_bin = [key for key, value in model['table'].iteritems() if example_search_id in value][0]
print('\nBin for example search({}): {}, content: {}\n'.format(example_search, example_bin, model['table'][example_bin]))
Example #23
import string
import sframe
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression


# Function to remove punctuations
def remove_punctuation(text):
    return string.translate(text, None, string.punctuation)


# Function to compute sigmoid response
def sigmoid(x):
    return 1.0 / (1 + np.exp(-x))


# Read data
products = sframe.SFrame('../data/Week01/amazon_baby.gl/')
products['review_clean'] = products['review'].apply(remove_punctuation)

# Discard rating with value 3; these are treated as neither negative nor positive
products = products[products['rating'] != 3]
products['sentiment'] = products['rating'].apply(lambda rating: +1
                                                 if rating > 3 else -1)

# Split data into training and testing sets
train_data, test_data = products.random_split(fraction=0.8, seed=1)

# Create training and test matrices from the corresponding data using a CountVectorizer
vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
train_matrix = vectorizer.fit_transform(train_data['review_clean'])
test_matrix = vectorizer.transform(test_data['review_clean'])
words = vectorizer.get_feature_names()
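A hedged sketch of the typical next step with these matrices; it is not part of the original snippet, and the sentiment_model and test_accuracy names are assumptions.

# Hypothetical continuation: train and evaluate a sentiment classifier.
sentiment_model = LogisticRegression()
sentiment_model.fit(train_matrix, train_data['sentiment'].to_numpy())
test_accuracy = sentiment_model.score(test_matrix, test_data['sentiment'].to_numpy())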
Example #24
#!/usr/bin/env python
# coding: utf-8

# In[10]:


import sframe


# # Read product review data

# In[23]:


products = sframe.SFrame('../Data/amazon_baby.gl')


# In[33]:


pwd


# In[34]:


products.save('../Data/products.csv', format='csv')

Example #25
# In[2]:
import sframe
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

products = sframe.SFrame('amazon_baby.gl/')

def remove_punctuation(text):
    import string
    return text.translate(None, string.punctuation) 
    
# method for printing out confusion matrix
# In the case of binary classification, the confusion matrix is a 2-by-2 matrix
def print_confusion_matrix(y, y_hat, classifier):
    from sklearn.metrics import confusion_matrix
    # use the same order of class as the LR model
    cmat = confusion_matrix(y_true = y, y_pred = y_hat, labels = classifier.classes_)
    print ' target_label | predicted_label | count '
    print '--------------+-----------------+-------'
    # Print out the confusion matrix.    
    for i, target_label in enumerate(classifier.classes_):
        for j, predicted_label in enumerate(classifier.classes_):
            print '{0:^13} | {1:^15} | {2:5d}'.format(target_label, predicted_label, cmat[i,j])
            
def apply_threshold(probabilities, threshold):
    return probabilities.applymap(lambda x : 1 if x >= threshold else -1) ## !!
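apply_threshold expects probabilities with an applymap method, i.e. a pandas DataFrame; a hedged sketch follows, where classifier and test_matrix are assumptions standing in for a fitted sklearn LogisticRegression and the vectorized test reviews.

# Hypothetical usage: threshold P(y = +1) at 0.9 instead of the default 0.5.
probabilities = pd.DataFrame(classifier.predict_proba(test_matrix)[:, 1])
predictions_90 = apply_threshold(probabilities, 0.9)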
    
    
Example #26
import sframe
import json
import numpy as np
from math import sqrt

products = sframe.SFrame('amazon_baby_subset.gl/')

# For this assignment, we eliminated class imbalance by choosing a
# subset of the data with a similar number of positive and negative reviews.
print '# of positive reviews =', len(products[products['sentiment'] == 1])
print '# of negative reviews =', len(products[products['sentiment'] == -1])


def remove_punctuation(text):
    import string
    return text.translate(None, string.punctuation)


# the following function extracts columns from an SFrame and converts
# them into a NumPy array
#
# the feature matrix includes an additional column 'intercept'
# to take account of the intercept term - all 1s
def get_numpy_data(data_sframe, features, label):
    data_sframe['intercept'] = 1
    features = ['intercept'] + features
    features_sframe = data_sframe[features]
    feature_matrix = features_sframe.to_numpy()
    label_sarray = data_sframe[label]
    label_array = label_sarray.to_numpy()
    return (feature_matrix, label_array)
Example #27
 def test_variable_size_array(self):
     self.data = gl.SFrame({'x': [[0], [0, 1], [0, 1, 2]]})
     self.assertRaises(ValueError, lambda: mxnet.io.SFrameIter(self.data, data_field='x'))
Example #28
# Function to compute accuracy
def accuracy(prediction, actual):
    """
    Purpose: Compute accuracy
    Input  : Predicted output values, true output values
    Output : Accuracy
    """
    prediction_correct = sum((actual == prediction) * 1.0)
    prediction_total = len(prediction)
    accuracy = prediction_correct / prediction_total
    return accuracy
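A quick hedged check of the accuracy helper on toy arrays (numpy is assumed to be available).

# Hypothetical usage with small numpy arrays of +1/-1 labels.
import numpy as np
toy_predicted = np.array([1, -1, 1, 1])
toy_actual = np.array([1, -1, -1, 1])
print(accuracy(toy_predicted, toy_actual))  # 0.75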


# Read data
loans = sframe.SFrame('../data/Week05/lending-club-data.gl/')

# Preprocess data
loans['safe_loans'] = loans['bad_loans'].apply(lambda x: +1 if x == 0 else -1)
loans = loans.remove_column('bad_loans')

# Selected features
features = [
    'grade',  # grade of the loan (categorical)
    'sub_grade_num',  # sub-grade of the loan as a number from 0 to 1
    'short_emp',  # one year or less of employment
    'emp_length_num',  # number of years of employment
    'home_ownership',  # home_ownership status: own, mortgage or rent
    'dti',  # debt to income ratio
    'purpose',  # the purpose of the loan
    'payment_inc_ratio',  # ratio of the monthly payment to income
]
Example #29
# Function to plot likelihood curves
def make_plot(log_likelihood_all, len_data, batch_size, smoothing_window=1, label=''):
    plt.rcParams.update({'figure.figsize': (9,5)})
    log_likelihood_all_ma = np.convolve(np.array(log_likelihood_all), \
                                        np.ones((smoothing_window,))/smoothing_window, mode='valid')
    plt.plot(np.array(range(smoothing_window-1, len(log_likelihood_all)))*float(batch_size)/len_data,
             log_likelihood_all_ma, linewidth=4.0, label=label)
    plt.rcParams.update({'font.size': 16})
    plt.tight_layout()
    plt.xlabel('# of passes over data')
    plt.ylabel('Average log likelihood per data point')
    plt.legend(loc='lower right', prop={'size':14})
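A hedged usage sketch for make_plot; log_likelihood_sgd, train_data and a matplotlib.pyplot import as plt are assumptions, standing in for the traces produced by a stochastic-gradient run.

# Hypothetical usage: smooth and plot one SGD log-likelihood trace.
make_plot(log_likelihood_sgd, len_data=len(train_data), batch_size=100,
          smoothing_window=30, label='stochastic gradient, step_size=1e-1')
plt.show()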

# Read data
products = sframe.SFrame('../data/Week02/amazon_baby_subset.gl/')

# Set of important words; these will be the features
list_of_words = ["baby", "one", "great", "love", "use", "would", "like", "easy", "little", "seat", "old", "well", "get", "also", "really", "son", "time", "bought", "product", "good", "daughter", "much", "loves", "stroller", "put", "months", "car", "still", "back", "used", "recommend", "first", "even", "perfect", "nice", "bag", "two", "using", "got", "fit", "around", "diaper", "enough", "month", "price", "go", "could", "soft", "since", "buy", "room", "works", "made", "child", "keep", "size", "small", "need", "year", "big", "make", "take", "easily", "think", "crib", "clean", "way", "quality", "thing", "better", "without", "set", "new", "every", "cute", "best", "bottles", "work", "purchased", "right", "lot", "side", "happy", "comfortable", "toy", "able", "kids", "bit", "night", "long", "fits", "see", "us", "another", "play", "day", "money", "monitor", "tried", "thought", "never", "item", "hard", "plastic", "however", "disappointed", "reviews", "something", "going", "pump", "bottle", "cup", "waste", "return", "amazon", "different", "top", "want", "problem", "know", "water", "try", "received", "sure", "times", "chair", "find", "hold", "gate", "open", "bottom", "away", "actually", "cheap", "worked", "getting", "ordered", "came", "milk", "bad", "part", "worth", "found", "cover", "many", "design", "looking", "weeks", "say", "wanted", "look", "place", "purchase", "looks", "second", "piece", "box", "pretty", "trying", "difficult", "together", "though", "give", "started", "anything", "last", "company", "come", "returned", "maybe", "took", "broke", "makes", "stay", "instead", "idea", "head", "said", "less", "went", "working", "high", "unit", "seems", "picture", "completely", "wish", "buying", "babies", "won", "tub", "almost", "either"]

# The label
label = ['sentiment']

# Remove punctuations
products['review_clean'] = products['review'].apply(lr_mle_sg.remove_punctuation)

# For each important word add a new column and determine count of that word in all reviews
for word in list_of_words:
    products[word] = products['review_clean'].apply(lambda x: x.split().count(word))

train_data, validation_data = products.random_split(.9, seed=1)
Example #30
    'sentiment': 1
}, {
    'text': 'I hate this f*****g product. Piece of shit.',
    'sentiment': -1
}, {
    'text': 'I love this excellent product. Great great great',
    'sentiment': 1
}, {
    'text': 'Hate hate hate! Never again. Bad',
    'sentiment': 1
}, {
    'text': 'Bad product. Really bad. I hate it.',
    'sentiment': -1
}]

sf = sframe.SFrame(train_dataset)
data = sf.unpack('X1')

count_vect = CountVectorizer()
counts = count_vect.fit_transform(data['X1.text'])

regression = LogisticRegression()
model = regression.fit(counts, data['X1.sentiment'])

predict_data = ["I love this product.", "This is a really bad product."]
predict_count_vects = CountVectorizer(vocabulary=count_vect.vocabulary_)
predict_counts = predict_count_vects.fit_transform(predict_data)

predictions = model.predict(predict_counts)

for text, prediction in zip(predict_data, predictions):