def outliersSvdReduction(self):
    svd = TruncatedSVD(n_components=1)
    ordersSvd = svd.fit_transform(
        self.training_order_start_end_districts_and_time,
        self.training_number_of_orders)
    priceSvd = svd.fit_transform(
        self.training_order_start_end_districts_and_time,
        self.training_order_median_price)
    self.outliersPriceOrders(ordersSvd, priceSvd)
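A minimal standalone sketch of the same reduction step on made-up stand-in data: TruncatedSVD is unsupervised, so the target array passed as the second argument of fit_transform is accepted for API compatibility but ignored.

import numpy as np
from sklearn.decomposition import TruncatedSVD

features = np.random.rand(100, 20)   # hypothetical stand-in for the order feature matrix
targets = np.random.rand(100)        # ignored by TruncatedSVD
svd = TruncatedSVD(n_components=1)
reduced = svd.fit_transform(features, targets)
print(reduced.shape)                 # (100, 1)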
Example No. 2
class RegressionRecommender(object):
    def __init__(self, feature_size=10, regressor=None):
        self.feature_size = feature_size
        self.user_svd = TruncatedSVD(n_components=feature_size)
        self.item_svd = TruncatedSVD(n_components=feature_size)
        if regressor is None:
            regressor = LinearRegression()
        self.regressor = regressor

    def fit(self, rating):
        # rating (item x user)
        item_features = self.item_svd.fit_transform(rating)
        user_features = self.user_svd.fit_transform(rating.T)
        self.item_features = item_features
        self.user_features = user_features

        n_item, n_user = rating.shape
        n_examples = rating.count_nonzero()
        X = zeros((n_examples, self.feature_size + self.feature_size))
        y = zeros((n_examples, 1))
        for i, (item, user) in enumerate(zip(*rating.nonzero())):
            X[i] = concatenate([item_features[item], user_features[user]],
                               axis=0)
            y[i] = rating[item, user]

        self.regressor.fit(X, y)
        return self

    def predict(self, item, user):
        user_features = self.user_features[user]
        item_features = self.item_features[item]

        input_features = concatenate([user_features, item_features]).reshape(1, -1)
        return self.regressor.predict(input_features)

    def save(self, filepath):
        to_save = {
            'regressor': self.regressor,
            'user_svd': self.user_svd,
            'item_svd': self.item_svd
        }
        with open(filepath, 'wb') as handle:
            saver = Pickler(handle, protocol=HIGHEST_PROTOCOL)
            saver.dump(to_save)

    def load(self, filepath):
        with open(filepath, 'rb') as handle:
            loader = Unpickler(handle)
            state = loader.load()
            self.regressor = state['regressor']
            self.user_svd = state['user_svd']
            self.item_svd = state['item_svd']
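A minimal usage sketch, assuming the class above is defined together with the imports its methods rely on (numpy's zeros/concatenate, scikit-learn's TruncatedSVD/LinearRegression, pickle's Pickler/Unpickler/HIGHEST_PROTOCOL) and given a small made-up item x user rating matrix; feature_size must stay below the smaller matrix dimension for TruncatedSVD.

import numpy as np
from scipy.sparse import csr_matrix

ratings = csr_matrix(np.array([
    [5, 0, 3, 0],
    [0, 4, 0, 2],
    [1, 0, 0, 5],
    [0, 2, 4, 0],
], dtype=float))                     # 4 items x 4 users; zeros mean "unrated"

recommender = RegressionRecommender(feature_size=2).fit(ratings)
print(recommender.predict(item=0, user=2))   # predicted rating for one (item, user) pair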
Example No. 3
class RawModel:
    def __init__(self):
        # 2015-05-15 GEL Found that n_components=20 gives a nice balance of 
        # speed (substantial improvement), accuracy, and reduced memory usage 
        # (25% decrease).
        self.decomposer = TruncatedSVD(n_components=20)

        # 2015-05-15 GEL algorithm='ball_tree' uses less memory on average than 
        # algorithm='kd_tree'
        
        # 2015-05-15 GEL Evaluation of metrics by accuracy (based on 8000 training examples)
        # euclidean        0.950025
        # manhattan        0.933533
        # chebyshev        0.675662
        # hamming          0.708646
        # canberra         0.934033
        # braycurtis       0.940530
        self.model = KNeighborsClassifier(n_neighbors=5, algorithm='ball_tree', metric='euclidean')

    def fit(self, trainExamples):       
        X = self.decomposer.fit_transform( vstack( [reshape(x.X, (1, x.WIDTH * x.HEIGHT)) for x in trainExamples] ) )
        Y = [x.Y for x in trainExamples]

        self.model.fit(X, Y)
        return self

    def predict(self, examples):
        X = self.decomposer.transform( vstack( [reshape(x.X, (1, x.WIDTH * x.HEIGHT)) for x in examples] ) )
        return self.model.predict( X )
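A usage sketch under stated assumptions: the original module presumably imports vstack/reshape from numpy, and training examples are objects exposing an image X, a label Y, and WIDTH/HEIGHT dimensions. The Example class below is a hypothetical stand-in for that interface, with tiny 8x8 images so that n_components=20 stays below the 64 pixel features.

import numpy as np

class Example:                        # hypothetical stand-in for the real example type
    WIDTH, HEIGHT = 8, 8
    def __init__(self, X, Y):
        self.X, self.Y = X, Y

rng = np.random.RandomState(0)
train = [Example(rng.rand(8, 8), label) for label in [0] * 20 + [1] * 20]
model = RawModel().fit(train)
print(model.predict(train[:3]))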
Example No. 4
def write_spacy_vocab(output_dirpath, vocab_size, embedding_dim):
    if not os.path.exists(output_dirpath):
        os.makedirs(output_dirpath)

    allowed_chars = set(string.ascii_letters + string.punctuation)
    ascii = set(string.ascii_letters)
    ascii_plus_period = set(string.ascii_letters + '.')
    word_set = set()
    spacy_vocab = spacy.load('en').vocab
    top_words = []

    for w in spacy_vocab:
        if w.rank > 2 * vocab_size:
            continue
        try:
            word_string = str(w.lower_).strip()
            if not word_string:
                continue
            if word_string in word_set:
                continue
            if any(bad_char in word_string
                   for bad_char in ('[', ']', '<', '>', '{', '}')):
                # these are used to mark word types and person ids.
                continue
            if any(c not in allowed_chars for c in word_string):
                continue
            if sum(1 for c in word_string if c not in ascii_plus_period) > 2:
                continue
            if word_string[-1] == '.' and sum(
                    1 for c in word_string if c in ascii) > 2:
                continue

            top_words.append(w)
            word_set.add(word_string)
        except Exception:
            pass

    top_words.sort(key=lambda w: w.rank)
    top_words = top_words[:vocab_size]

    with open(os.path.join(output_dirpath, 'vocab'), 'w') as f:
        for word in top_words:
            f.write('%s\n' % word.lower_.strip())

    vectors = np.array([w.vector for w in top_words])
    svd = TruncatedSVD(n_components=embedding_dim, algorithm='arpack')
    embeddings = svd.fit_transform(vectors)

    print embeddings.shape
    print [
        sum(svd.explained_variance_ratio_[:i])
        for i in range(1, embedding_dim + 1)
    ]
    np.save(os.path.join(output_dirpath, 'pretrained_embeddings.npy'),
            embeddings)
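A standalone sketch of just the embedding-reduction step above, skipping the spaCy vocabulary filtering; random vectors stand in for the word vectors, and the dimensions are assumptions.

import numpy as np
from sklearn.decomposition import TruncatedSVD

vectors = np.random.rand(1000, 300).astype(np.float32)   # stand-in for 300-d word vectors
svd = TruncatedSVD(n_components=64, algorithm='arpack')   # 64 is an assumed embedding_dim
embeddings = svd.fit_transform(vectors)
print(embeddings.shape)                                   # (1000, 64)
print(sum(svd.explained_variance_ratio_))                 # variance kept by the 64 components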
Example No. 5
def fit_transform(self, X, Y):
    if self.standardize:
        X = _standardize(X)
        Y = _standardize(Y)
    K = X.dot(Y.T)
    model = TruncatedSVD(n_components=self.n_components,
                         random_state=self.random_state)
    U = model.fit_transform(K)
    U = U / np.linalg.norm(U, axis=0)
    V = model.components_.T
    self.correlations = np.array(
        [pearsonr(u.dot(X), v.dot(Y))[0] for u, v in zip(U.T, V.T)])
    return U, V
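The fragment above comes from a larger class whose _standardize helper and attributes are not shown. A standalone sketch with assumed shapes, X as (p features x n samples) and Y as (q features x n samples), illustrates the same cross-product SVD and correlation scoring:

import numpy as np
from scipy.stats import pearsonr
from sklearn.decomposition import TruncatedSVD

rng = np.random.RandomState(0)
X = rng.rand(6, 200)                 # hypothetical: 6 features over 200 samples
Y = rng.rand(4, 200)                 # hypothetical: 4 features over the same samples
K = X.dot(Y.T)                       # (6, 4) cross-product matrix

model = TruncatedSVD(n_components=2, random_state=0)
U = model.fit_transform(K)           # left directions, one column per component
U = U / np.linalg.norm(U, axis=0)
V = model.components_.T              # right directions
print([pearsonr(u.dot(X), v.dot(Y))[0] for u, v in zip(U.T, V.T)])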
Example No. 6
class SimilarityRecommender(object):
    def __init__(self, feature_size=10):
        self.feature_size = feature_size
        self.svd = TruncatedSVD(n_components=feature_size)
        self.rating = None

    def fit(self, rating):
        # rating (item x user)
        self.rating = rating
        item = self.svd.fit_transform(rating)
        similarity = defaultdict(lambda: dict())

        n_item, n_user = rating.shape
        for first in tqdm(range(n_item)):
            for second in range(first):
                first_item = item[first].reshape(1, -1)
                second_item = item[second].reshape(1, -1)
                similarity[first][second] = float(
                    cosine_similarity(first_item, second_item)[0, 0])

        self.similarity = dict(similarity)
        return self

    def predict(self, user, item):
        # Items this user has rated, paired with the corresponding ratings.
        rated_items = self.rating[:, user].nonzero()[0]
        history = [(rated, self.rating[rated, user]) for rated in rated_items]
        absolute_score = sum(
            self.get_similarity(item, rated) * rated_rating
            for rated, rated_rating in history)
        score = float(absolute_score) / sum(r for _, r in history)
        return score

    def similar_to(self, item, n=5):

        return

    def get_similarity(self, item, target):
        return (self.similarity[item][target]
                if item > target else self.similarity[target][item])

    def save(self, filepath):
        with open(filepath, 'w') as handle:
            json.dump(self.similarity, handle)
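A minimal usage sketch with made-up data, assuming the class's module-level imports (defaultdict, tqdm, cosine_similarity, json) are in place; feature_size must stay below the number of users for the internal TruncatedSVD.

import numpy as np
from scipy.sparse import csr_matrix

ratings = csr_matrix(np.array([
    [5, 0, 3, 1],
    [0, 4, 0, 2],
    [1, 0, 2, 5],
    [0, 2, 4, 0],
], dtype=float))                     # 4 items x 4 users

rec = SimilarityRecommender(feature_size=2).fit(ratings)
print(rec.get_similarity(2, 0))      # cosine similarity between items 2 and 0
print(rec.predict(user=1, item=0))   # similarity-weighted score of item 0 for user 1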
Example No. 7
def compute_reduced_embeddings_original_vocab(output_vocab_filepath,
                                              output_embeddings_filepath,
                                              input_vocab_filepath, vocab_size,
                                              embedding_dim):
    print N_FREE_TOKENS
    vocab = Vocab(input_vocab_filepath, 1.5 * vocab_size)
    spacy_vocab = spacy.load('en').vocab
    matrix = np.zeros((vocab_size, spacy_vocab.vectors_length),
                      dtype=np.float32)
    new_i = 0
    final_vocab = []

    for i, word in vocab._id_to_word.iteritems():
        if new_i == vocab_size:
            break

        if i >= N_FREE_TOKENS and unicode(word) not in spacy_vocab:
            continue

        if i >= N_FREE_TOKENS:
            final_vocab.append(word)

        matrix[new_i] = spacy_vocab[unicode(word)].vector
        new_i += 1

    print 'Last word added:', final_vocab[-1]
    if embedding_dim < spacy_vocab.vectors_length:
        svd = TruncatedSVD(n_components=embedding_dim, algorithm='arpack')
        embeddings = svd.fit_transform(matrix)
        print embeddings.shape
        print [
            sum(svd.explained_variance_ratio_[:i])
            for i in range(1, embedding_dim + 1)
        ]
    else:
        embeddings = matrix

    with open(output_vocab_filepath, 'w') as output:
        for word in final_vocab:
            output.write('%s\n' % word)
    np.save(output_embeddings_filepath, embeddings)
Example No. 8
    def __init__(self, path, corpusName, query=None):
        self.query = query
        documents = (line.lower().split() for line in codecs.open(
            corpusName + ".txt", mode='r', encoding='utf-8', errors='ignore'))
        self.corpus = [' '.join(i) for i in documents]
        if self.query is not None:
            self.corpus.append(' '.join(query.getTokens()))

        # Make models
        t0 = time()
        print "Creating SciKit TF-IDF Model"
        self.tfidfModel = TfidfVectorizer().fit_transform(self.corpus)
        print("Done in %0.3fs." % (time() - t0))

        print "Creating SciKit LSA Model"
        t0 = time()
        lsa = TruncatedSVD(n_components=300)
        self.lsaModel = lsa.fit_transform(self.tfidfModel)
        self.lsaModel = Normalizer(copy=False).fit_transform(self.lsaModel)
        print("Done in %0.3fs." % (time() - t0))

        print "Creating SciKit LDA Model"
        # Use tf (raw term count) features for LDA.
        print("Extracting tf features for LDA")
        tf_vectorizer = CountVectorizer(max_features=2000)
        t0 = time()
        tf = tf_vectorizer.fit_transform(self.corpus)
        print("Done in %0.3fs." % (time() - t0))
        print("Fitting LDA model")
        lda = LatentDirichletAllocation(n_topics=300,
                                        max_iter=5,
                                        learning_method='online',
                                        learning_offset=50.,
                                        random_state=0)
        t0 = time()
        self.ldaModel = lda.fit_transform(tf)
        self.ldaModel = Normalizer(copy=False).fit_transform(self.ldaModel)
        print("Done in %0.3fs." % (time() - t0))
Example No. 10
    words.append(word)
    vecs.append(model[word])

print len(vecs)
CUTAWAY = 6000

vecs = np.matrix(np.random.permutation(vecs))[:CUTAWAY, :]
words = words[:CUTAWAY]
print vecs.shape

print 'Dimensionality reduction'
# dr = TruncatedSVD(n_iter=15)
# X = dr.fit_transform(vecs)
print 'Manifold learning with RandomTreesEmbedding'
# Note: despite the variable name below, this is a RandomTreesEmbedding, not t-SNE.
tsne = RandomTreesEmbedding(n_estimators=15, random_state=0, max_depth=5, verbose=2, n_jobs=3)
X = tsne.fit_transform(vecs)
print X.shape
print 'Reducing the embedding to 2 components with TruncatedSVD'
dr = TruncatedSVD(n_components=2)
X = dr.fit_transform(X)
print 'Dimensionality reduction done'

# X = vecs
PLOT_CUTAWAY = 250

plt.figure()
plt.scatter(X[:PLOT_CUTAWAY, 0], X[:PLOT_CUTAWAY, 1], c='green')
for i in xrange(min(X.shape[0], PLOT_CUTAWAY)):
    plt.annotate(words[i], xy=(X[i, 0], X[i, 1]))
plt.show()
Example No. 11
from sklearn.cross_validation import train_test_split, cross_val_score


df = pd.read_csv('/path/file.csv',
                     header=0, sep=',', names=['SentenceId', 'Sentence', 'Sentiment'])



reduced_data = tfidf_vect.fit_transform(df['Sentence'].values)
y = df['Sentiment'].values



from sklearn.decomposition.truncated_svd import TruncatedSVD
svd = TruncatedSVD(n_components=5)
reduced_data = svd.fit_transform(reduced_data)

X_train, X_test, y_train, y_test = train_test_split(reduced_data,
                                                    y, test_size=0.33,
                                                    random_state=42)

from sklearn.ensemble import RandomForestClassifier

# it froze with 1000000
# try with more parameters
classifier=RandomForestClassifier(n_estimators=100)
classifier.fit(X_train, y_train)
prediction = classifier.predict(X_test)


#print X_train.shape
print(X_train_counts.shape)
print(count_vect.vocabulary_.get(u'algorithm'))
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)


df= pd.DataFrame({'text':test_doc, 'class': test_data.target})

X = tfidf_vect.fit_transform(df['text'].values)
y = df['class'].values

from sklearn.decomposition.truncated_svd import TruncatedSVD 
pca = TruncatedSVD(n_components=2)                                
X_reduced_train = pca.fit_transform(X)
a_train, a_test, b_train, b_test = train_test_split(X, y, test_size=0.33, random_state=42)

from sklearn.ensemble import RandomForestClassifier 
classifier=RandomForestClassifier(n_estimators=10)                  
classifier.fit(a_train.toarray(), b_train)                            



clf = svm.SVC(kernel=my_kernel)
# Support Vector Machine model
#text_clf = Pipeline([('vect', CountVectorizer()),
#(#'tfidf', TfidfTransformer()),
#('clf', clf),])

clf.fit(a_train.toarray(), b_train)
    ch2 = SelectKBest(chi2, k=opts.select_chi2)
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    if feature_names:
        # keep selected feature names
        feature_names = [feature_names[i] for i
                         in ch2.get_support(indices=True)]
    print("done in %fs" % (time() - t0))
    print()

if feature_names:
    feature_names = np.asarray(feature_names)


print(X_train.shape)
X_train = svd.fit_transform(X_train)
X_test = svd.transform(X_test)
#u,o,X_train = fastica(X_train.toarray(),n_comp=1000)
print(X_train)
print(X_train.shape)
def trim(s):
    """Trim string to fit on terminal (assuming 80-column display)"""
    return s if len(s) <= 80 else s[:77] + "..."



def final_accuracy(predicted):
    count_same = 0
    total = 0
    for i in xrange(0,len(predicted)):
        tags_needed =  question_tags[(target_files[i].split("/")[-1])]
Example No. 14
class RunRegression(object):

    REGRESSION_TRAINING_INPUT_FILE_NAME = "RegressionTrainingInput.npz"
    REGRESSION_TESTING_INPUT_FILE_NAME = "RegressionTestingInput.npz"
    MAXIMUM_NUMBER_OF_JOBS = -1
    NUMBER_OF_CROSS_VALIDATION_FOLDS = 5
    ROWS_TO_USE_FOR_GAUSSIAN_KERNEL_REGRESSION = 15
    DISTRICT_SIZE = 132
    TIME_SIZE = 152
    POI_SIZE = 352
    WEATHER_SIZE = 9
    TRAFFIC_SIZE = 8

    def __init__(self):

        self.components = 2
        self.svd = TruncatedSVD(n_components=self.components)
        self.reductCount = 0
        for file_name, data_set in [
            (RunRegression.REGRESSION_TRAINING_INPUT_FILE_NAME,
             FileIo.TRAINING_DATA_SET),
            (RunRegression.REGRESSION_TESTING_INPUT_FILE_NAME,
             FileIo.TEST_DATA_SET)
        ]:

            # Check and see if the data has already been saved
            try:

                logging.info("RunRegression: Trying to load " + data_set +
                             " data")

                saved_data = numpy.load(file_name, mmap_mode='r')

            # If the data is not found, load it
            except IOError:

                logging.info(
                    "RunRegression: Saved data not found. Generating " +
                    data_set + " data")

                # Generate inputs
                poi_district_lookup = PoiDistrictLookup.PoiDistrictLookup()
                order_categorical_lookup = OrderCategoricalLookup.OrderCategoricalLookup(
                    poi_district_lookup)
                regression_input = RegressionInput.RegressionInput(
                    data_set, order_categorical_lookup, poi_district_lookup)

                if data_set == FileIo.TRAINING_DATA_SET:

                    self.training_order_start_end_districts_and_time, self.training_order_median_price, \
                        self.training_number_of_orders = regression_input.get_regression_inputs()

                    # Save the data for next time
                    numpy.savez(
                        file_name,
                        order_keys=self.
                        training_order_start_end_districts_and_time,
                        order_value_price=self.training_order_median_price,
                        order_value_number=self.training_number_of_orders)

                else:

                    self.testing_order_start_end_districts_and_time, self.testing_order_median_price, \
                        self.testing_number_of_orders  = regression_input.get_regression_inputs()

                    # Save the data for next time
                    numpy.savez(
                        file_name,
                        order_keys=self.
                        testing_order_start_end_districts_and_time,
                        order_value_price=self.testing_order_median_price,
                        order_value_number=self.testing_number_of_orders)

            # If the saved data is found, load it
            else:

                logging.info("RunRegression: Loading " + data_set + " data")

                if data_set == FileIo.TRAINING_DATA_SET:

                    self.training_order_start_end_districts_and_time, self.training_order_median_price, \
                        self.training_number_of_orders = saved_data['order_keys'], \
                                                         saved_data['order_value_price'], \
                                                         saved_data['order_value_number']

                    self.dimensions = self.training_order_start_end_districts_and_time.shape[
                        1]
                    self.initial = self.training_order_start_end_districts_and_time
                    logging.info("RunRegression: Loaded " +
                                 str(len(self.training_number_of_orders)) +
                                 " train data rows")
                else:

                    self.testing_order_start_end_districts_and_time, self.testing_order_median_price, \
                        self.testing_number_of_orders = saved_data['order_keys'], \
                                                        saved_data['order_value_price'], \
                                                        saved_data['order_value_number']

                    self.initialTesting = self.testing_order_start_end_districts_and_time
                    logging.info("RunRegression: Loaded " +
                                 str(len(self.testing_number_of_orders)) +
                                 " test data rows")

    """
    Run sgd regression
    """

    def run_sgd_regression(self):

        losses = ["squared_loss"]
        penalties = ["none", "l2", "l1", "elasticnet"]
        initial_learning_rates = [0.1, 0.01, 0.001]
        learning_rates = ["constant", "optimal", "invscaling"]

        lowest_ride_prediction_error = float('inf')

        best_loss = ""
        best_penalty = ""
        best_initial_learning_rate = 0.0
        best_learning_rate = ""

        # Find the best hyper-parameters
        for loss in losses:
            for penalty in penalties:
                for initial_learning_rate in initial_learning_rates:
                    for learning_rate in learning_rates:

                        mean_ride_prediction_error = 0.0

                        # Do k-fold cross-validation using mini-batch training.
                        for testing_fold_number in range(
                                RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS
                        ):

                            # Create the sgd regressor using the input parameters
                            sgd_regressor = linear_model.SGDRegressor(
                                loss=loss,
                                penalty=penalty,
                                eta0=initial_learning_rate,
                                learning_rate=learning_rate)

                            # Run mini batch training for the fold if its not the training fold
                            for fold_number in range(
                                    RunRegression.
                                    NUMBER_OF_CROSS_VALIDATION_FOLDS):

                                if fold_number == testing_fold_number:
                                    continue

                                training_start_row = fold_number * \
                                                     len(self.training_order_start_end_districts_and_time) // \
                                                     RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS

                                training_end_row = (fold_number + 1) * \
                                                   len(self.training_order_start_end_districts_and_time) // \
                                                    RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS

                                logging.info(
                                    "RunRegression: " +
                                    str(RunRegression.
                                        NUMBER_OF_CROSS_VALIDATION_FOLDS) +
                                    " fold cross validation training SGD Regressor for fold "
                                    + str(fold_number) + ", starting row " +
                                    str(training_start_row) + ", ending row " +
                                    str(training_end_row) + ", loss " + loss +
                                    ", penalty " + penalty +
                                    ", initial learning rate " +
                                    str(initial_learning_rate) +
                                    " and learning rate " + learning_rate)

                                # Train regression model
                                sgd_regressor\
                                   .partial_fit(X=self.training_order_start_end_districts_and_time[training_start_row :
                                                                                                   training_end_row],
                                                y=self.training_number_of_orders[training_start_row:training_end_row])

                            testing_start_row = testing_fold_number * \
                                                len(self.testing_order_start_end_districts_and_time) // \
                                                 RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS

                            testing_end_row = (testing_fold_number + 1 )* \
                                                len(self.testing_order_start_end_districts_and_time) // \
                                                 RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS

                            predicted_number_of_orders = sgd_regressor\
                                .predict(self.testing_order_start_end_districts_and_time[testing_start_row :
                                                                                         testing_end_row])

                            current_ride_prediction_error = numpy.mean(
                                (predicted_number_of_orders -
                                 self.testing_number_of_orders[
                                     testing_start_row:testing_end_row])**2)

                            logging.info(
                                "RunRegression: Prediction error for fold " +
                                str(testing_fold_number) + " is " +
                                str(current_ride_prediction_error))

                            mean_ride_prediction_error += current_ride_prediction_error

                            if RunRegression.__is_mean_prediction_error_too_high(
                                    mean_ride_prediction_error,
                                    lowest_ride_prediction_error):
                                logging.info(
                                    "RunRegression: Mean prediction error of "
                                    + str(mean_ride_prediction_error) +
                                    "is too high compared to best so far " +
                                    str(lowest_ride_prediction_error) +
                                    ". Ending current cross validation.")
                                break

                        else:

                            mean_ride_prediction_error /= RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS

                            logging.info(
                                "RunRegression: Mean prediction error is " +
                                str(mean_ride_prediction_error))

                            # Save values if better than previous best
                            if mean_ride_prediction_error < lowest_ride_prediction_error:

                                logging.info(
                                    "RunRegression: mean error of " +
                                    str(mean_ride_prediction_error) +
                                    " is the best so far. Saving loss " +
                                    loss + ", penalty " + penalty +
                                    ", initial learning rate " +
                                    str(initial_learning_rate) +
                                    " and learning rate " + learning_rate)

                                lowest_ride_prediction_error = mean_ride_prediction_error
                                best_loss = loss
                                best_penalty = penalty
                                best_initial_learning_rate = initial_learning_rate
                                best_learning_rate = learning_rate

        logging.info(
            "RunRegression: Running regression with best values so far: loss "
            + best_loss + ", penalty " + best_penalty +
            ", initial learning rate " + str(best_initial_learning_rate) +
            " and learning rate " + best_learning_rate)

        sgd_regressor = linear_model.SGDRegressor(
            loss=best_loss,
            penalty=best_penalty,
            eta0=best_initial_learning_rate,
            learning_rate=best_learning_rate)

        sgd_regressor.fit(X=self.training_order_start_end_districts_and_time,
                          y=self.training_number_of_orders)
        best_predicted_number_of_orders = sgd_regressor.predict(
            self.testing_order_start_end_districts_and_time)

        coef = sgd_regressor.coef_
        print(coef)

        logging.info(
            "RunRegression: Mean squared prediction error after cross validation is "
            + str(
                numpy.mean((best_predicted_number_of_orders -
                            self.testing_number_of_orders)**2)))

    """
    Check if mean prediction error is too high to qualify as the best so far
    """

    @staticmethod
    def __is_mean_prediction_error_too_high(cumulative_mean_prediction_error,
                                            best_prediction_error_so_far):

        return cumulative_mean_prediction_error / RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS > \
               best_prediction_error_so_far

    """
    Run regression based on multidimensional scaling
    """

    def run_mds_regression(self):

        # Create a square matrix with number of test data rows preserved
        training_data_square_matrix = numpy.dot(
            self.training_order_start_end_districts_and_time.T,
            self.training_order_start_end_districts_and_time)

        logging.info("RunRegression: Square matrix shape " +
                     str(training_data_square_matrix.shape))

        # Get Eigen values and eigen vectors
        training_data_eigen_values, training_data_eigen_vectors = linalg.eig(
            training_data_square_matrix)
        #print(training_data_eigen_values)
        #print(training_data_eigen_vectors)
        print(self.training_order_start_end_districts_and_time)
        sorted_index = training_data_eigen_values.argsort()[::-1]
        sorted_training_data_eigen_values = training_data_eigen_values[
            sorted_index]
        sorted_training_data_eigen_vectors = training_data_eigen_vectors[:,
                                                                         sorted_index]

        logging.info("RunRegression: Found " +
                     str(len(sorted_training_data_eigen_values)) +
                     " eigen values.")
        logging.info("RunRegression: Eigen vectors have length " +
                     str(len(sorted_training_data_eigen_vectors[0])))

        if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
            RunRegression.__show_eigen_values_trend(
                eigen_values=sorted_training_data_eigen_values)

    """
    Show Eigen values trend
    """

    @staticmethod
    def __show_eigen_values_trend(eigen_values):

        # Plot eigen values
        plt.plot(eigen_values)
        plt.ylabel('Eigen Values')
        plt.title('Sorted Eigen Values')
        plt.show()

    def leastAngleRegression(self):
        lar = linear_model.Lars()
        lar.fit(self.training_order_start_end_districts_and_time,
                self.training_number_of_orders)
        predicted_number_of_orders = lar.predict(
            self.testing_order_start_end_districts_and_time)
        current_ride_prediction_error = numpy.mean(
            (predicted_number_of_orders - self.testing_number_of_orders)**2)
        print(current_ride_prediction_error)
        print(lar.coef_)

    def orthogonalMatchingPursuit(self):
        omp = linear_model.OrthogonalMatchingPursuit(n_nonzero_coefs=10)
        omp.fit(self.training_order_start_end_districts_and_time,
                self.training_number_of_orders)
        predicted_number_of_orders = omp.predict(
            self.testing_order_start_end_districts_and_time)
        current_ride_prediction_error = numpy.mean(
            (predicted_number_of_orders - self.testing_number_of_orders)**2)
        print(current_ride_prediction_error)
        print(omp.coef_)

    def theilSenRegressor(self):
        tsr = linear_model.TheilSenRegressor()
        tsr.fit(self.training_order_start_end_districts_and_time,
                self.training_number_of_orders)
        predicted_number_of_orders = tsr.predict(
            self.testing_order_start_end_districts_and_time)
        current_ride_prediction_error = numpy.mean(
            (predicted_number_of_orders - self.testing_number_of_orders)**2)
        print(current_ride_prediction_error)
        print(tsr.coef_)

    def polynomial(self):
        poly = PolynomialFeatures(degree=3)
        self.training_order_start_end_districts_and_time = poly.fit_transform(
            self.training_order_start_end_districts_and_time,
            self.training_number_of_orders)
        predict = poly.transform(
            self.testing_order_start_end_districts_and_time)

        clf = linear_model.LinearRegression()
        clf.fit(self.training_order_start_end_districts_and_time,
                self.training_number_of_orders)
        predicted_number_of_orders = clf.predict(predict)
        current_ride_prediction_error = numpy.mean(
            (predicted_number_of_orders - self.testing_number_of_orders)**2)
        print(current_ride_prediction_error)
        print(clf.coef_)

    def svm(self):
        oneClass = svm.OneClassSVM()
        logging.info("svm fit")
        oneClass.fit(self.training_order_start_end_districts_and_time,
                     self.training_number_of_orders)
        logging.info("svm predict")
        predicted_number_of_orders = oneClass.predict(
            self.testing_order_start_end_districts_and_time)
        current_ride_prediction_error = numpy.mean(
            (predicted_number_of_orders - self.testing_number_of_orders)**2)
        print(current_ride_prediction_error)
        print(oneClass.coef_)

    def districtReduction(self, keyType, key):
        y = key
        districts = numpy.apply_along_axis(sliceTransform, 1, y, 0,
                                           self.DISTRICT_SIZE)
        if keyType == "training":
            districtRed = self.svd.fit_transform(
                districts, self.training_number_of_orders)
        else:
            districtRed = self.svd.transform(districts)
        nonDistrict = numpy.apply_along_axis(sliceTransform, 1, y,
                                             self.DISTRICT_SIZE,
                                             self.dimensions)
        keyWithDist = numpy.append(districtRed, nonDistrict, axis=1)
        return keyWithDist

    def timeReduction(self, keyType, key):
        y = key
        time = numpy.apply_along_axis(sliceTransform, 1, y, self.components,
                                      self.TIME_SIZE + self.components)
        if keyType == "training":
            timeRed = self.svd.fit_transform(time,
                                             self.training_number_of_orders)
        else:
            timeRed = self.svd.transform(time)
        befTime = numpy.apply_along_axis(sliceTransform, 1, y, 0,
                                         self.components)
        aftTime = numpy.apply_along_axis(sliceTransform, 1, y,
                                         self.TIME_SIZE + self.components,
                                         self.dimensions)
        keyWithTime = numpy.append(befTime, timeRed, axis=1)
        keyWithTime = numpy.append(keyWithTime, aftTime, axis=1)
        return keyWithTime

    def POIReduction(self, keyType, key):
        y = key
        poi = numpy.apply_along_axis(sliceTransform, 1, y, self.components * 2,
                                     self.POI_SIZE + self.components * 2)
        if keyType == "training":
            poiRed = self.svd.fit_transform(poi,
                                            self.training_number_of_orders)
        else:
            poiRed = self.svd.transform(poi)
        befPoi = numpy.apply_along_axis(sliceTransform, 1, y, 0,
                                        self.components * 2)
        aftPoi = numpy.apply_along_axis(sliceTransform, 1, y,
                                        self.POI_SIZE + self.components * 2,
                                        self.dimensions)
        keyWithPoi = numpy.append(befPoi, poiRed, axis=1)
        keyWithPoi = numpy.append(keyWithPoi, aftPoi, axis=1)
        return keyWithPoi

    def WeatherReduction(self, keyType, key):
        y = key
        weather = numpy.apply_along_axis(
            sliceTransform, 1, y, self.components * 3,
            self.WEATHER_SIZE + self.components * 3)
        if keyType == "training":
            weatherRed = self.svd.fit_transform(weather,
                                                self.training_number_of_orders)
        else:
            weatherRed = self.svd.transform(weather)
        befWeather = numpy.apply_along_axis(sliceTransform, 1, y, 0,
                                            self.components * 3)
        aftWeather = numpy.apply_along_axis(
            sliceTransform, 1, y, self.WEATHER_SIZE + self.components * 3,
            self.dimensions)
        keyWithWeather = numpy.append(befWeather, weatherRed, axis=1)
        keyWithWeather = numpy.append(keyWithWeather, aftWeather, axis=1)
        return keyWithWeather

    def TrafficReduction(self, keyType, key):
        y = key
        traffic = numpy.apply_along_axis(
            sliceTransform, 1, y, self.components * 4,
            self.TRAFFIC_SIZE + self.components * 4)
        if keyType == "training":
            trafficRed = self.svd.fit_transform(traffic,
                                                self.training_number_of_orders)
            if self.reductCount == 0:
                self.boxPlot(trafficRed)
                self.reductCount = 1
        else:
            trafficRed = self.svd.transform(traffic)
        befTraffic = numpy.apply_along_axis(sliceTransform, 1, y, 0,
                                            self.components * 4)
        keyWithTraffic = numpy.append(befTraffic, trafficRed, axis=1)
        return keyWithTraffic

    def wholeReductionTraining(self):
        y = self.training_order_start_end_districts_and_time
        b = self.svd.fit_transform(y, self.training_number_of_orders)
        if self.reductCount < 2:
            self.boxPlot(b)
        self.reductCount += 1
        self.training_order_start_end_districts_and_time = b

    def wholeReductionTesting(self):
        y = self.testing_order_start_end_districts_and_time
        b = self.svd.transform(y)
        self.testing_order_start_end_districts_and_time = b

    def reduction(self):
        self.training_order_start_end_districts_and_time = self.initial
        self.dimensions = self.training_order_start_end_districts_and_time.shape[
            1]
        self.testing_order_start_end_districts_and_time = self.initialTesting

        logging.info("RunRegression: Reducing Districts")
        self.training_order_start_end_districts_and_time = run_regression.districtReduction(
            'training', self.training_order_start_end_districts_and_time)
        self.testing_order_start_end_districts_and_time = run_regression.districtReduction(
            'testing', self.testing_order_start_end_districts_and_time)
        x = self.testing_order_start_end_districts_and_time[:, 0:1]
        y = self.testing_order_start_end_districts_and_time[:, 1:2]
        self.dimensions = self.training_order_start_end_districts_and_time.shape[
            1]
        print(self.dimensions)

        logging.info("RunRegression: Reducing Time")
        self.training_order_start_end_districts_and_time = run_regression.timeReduction(
            'training', self.training_order_start_end_districts_and_time)
        self.testing_order_start_end_districts_and_time = run_regression.timeReduction(
            'testing', self.testing_order_start_end_districts_and_time)
        x = self.training_order_start_end_districts_and_time[:, 2:3]
        y = self.training_order_start_end_districts_and_time[:, 3:4]
        self.dimensions = self.training_order_start_end_districts_and_time.shape[
            1]
        #plt.scatter(x,y)
        #plt.show()
        logging.info("RunRegression: Reducing POI")
        self.training_order_start_end_districts_and_time = run_regression.POIReduction(
            'training', self.training_order_start_end_districts_and_time)
        self.testing_order_start_end_districts_and_time = run_regression.POIReduction(
            'testing', self.testing_order_start_end_districts_and_time)
        x = self.training_order_start_end_districts_and_time[:, 4:5]
        y = self.training_order_start_end_districts_and_time[:, 5:6]
        self.dimensions = self.training_order_start_end_districts_and_time.shape[
            1]
        #plt.scatter(x,y)
        #plt.show()
        logging.info("RunRegression: Reducing Weather")
        self.training_order_start_end_districts_and_time = run_regression.WeatherReduction(
            'training', self.training_order_start_end_districts_and_time)
        self.testing_order_start_end_districts_and_time = run_regression.WeatherReduction(
            'testing', self.testing_order_start_end_districts_and_time)
        x = self.training_order_start_end_districts_and_time[:, 6:7]
        y = self.training_order_start_end_districts_and_time[:, 7:8]
        self.dimensions = self.training_order_start_end_districts_and_time.shape[
            1]
        #plt.scatter(x,y)
        #plt.show()
        logging.info("RunRegression: Reducing Traffic")
        self.training_order_start_end_districts_and_time = run_regression.TrafficReduction(
            'training', self.training_order_start_end_districts_and_time)
        self.testing_order_start_end_districts_and_time = run_regression.TrafficReduction(
            'testing', self.testing_order_start_end_districts_and_time)
        x = self.training_order_start_end_districts_and_time[:, 8:9]
        y = self.training_order_start_end_districts_and_time[:, 9:10]
        self.dimensions = self.training_order_start_end_districts_and_time.shape[
            1]
        print(self.initial.shape)

    def boxPlot(self, arrayBox):
        a = plt.boxplot(arrayBox)
        plt.show()
        idx = set()
        idxSet = set(
            numpy.arange(len(
                self.training_order_start_end_districts_and_time)))
        for d in a['fliers']:
            print(len(d.get_ydata()))
            for point in d.get_ydata():
                pIdx = numpy.where(arrayBox == point)
                for rIdx in pIdx[0]:
                    idx.add(rIdx)
        logging.info("done with loop")
        idxKeep = list(idxSet.difference(idx))
        self.initial = self.initial[[idxKeep], :]
        self.training_number_of_orders = self.training_number_of_orders[[
            idxKeep
        ]]
        self.initial = self.initial.reshape(self.initial.shape[1:])
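The sliceTransform helper used by the *Reduction methods above is not included in this snippet. A plausible minimal reconstruction, consistent with how it is applied row-wise via numpy.apply_along_axis, would simply return the [start, end) column slice of each row (hypothetical; the original may differ):

def sliceTransform(row, start, end):
    # Hypothetical reconstruction: slice one feature row between the given column offsets.
    return row[start:end]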
Example No. 15
def buildModel(self):
    tfidfModel = TfidfVectorizer().fit_transform(self.corpus)
    lsa = TruncatedSVD(n_components=200)
    self.Model = lsa.fit_transform(tfidfModel)
    self.Model = Normalizer(copy=False).fit_transform(self.Model)
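A standalone sketch of the same TF-IDF + LSA pipeline on a small assumed in-memory corpus; with such a tiny vocabulary the component count must be far smaller than the 200 used above.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer

corpus = ["the cat sat on the mat",
          "dogs and cats play",
          "the dog barked at the cat"]                     # assumed toy corpus
tfidf = TfidfVectorizer().fit_transform(corpus)            # documents x vocabulary
lsa = TruncatedSVD(n_components=2)                         # must be < vocabulary size
model = Normalizer(copy=False).fit_transform(lsa.fit_transform(tfidf))
print(model.shape)                                         # (3, 2)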
Example No. 16
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=False)
from sklearn.cross_validation import train_test_split, cross_val_score

df = pd.read_csv('/path/file.csv',
                 header=0,
                 sep=',',
                 names=['SentenceId', 'Sentence', 'Sentiment'])

reduced_data = tfidf_vect.fit_transform(df['Sentence'].values)
y = df['Sentiment'].values

from sklearn.decomposition.truncated_svd import TruncatedSVD
svd = TruncatedSVD(n_components=5)
reduced_data = svd.fit_transform(reduced_data)

X_train, X_test, y_train, y_test = train_test_split(reduced_data,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

from sklearn.ensemble import RandomForestClassifier

# it froze with 1000000
# try with more parameters
classifier = RandomForestClassifier(n_estimators=100)
classifier.fit(X_train, y_train)
prediction = classifier.predict(X_test)

#print X_train.shape
Example No. 17
    def train(self, model_name, corpus, log, opts, chain_features=None):
        from whim.entity_narrative import DistributionalVectorsNarrativeChainModel
        log.info("Training context vectors model")

        training_metadata = {
            "data": corpus.directory,
            "pmi": opts.pmi or opts.ppmi,
            "ppmi": opts.ppmi,
        }

        log.info("Extracting event counts")
        pbar = get_progress_bar(len(corpus), title="Event feature extraction")
        # Loop over all the chains again to collect events
        event_counts = Counter()
        for doc_num, document in enumerate(corpus):
            chains = document.get_chains()
            if len(chains):
                event_chains = list(
                    DistributionalVectorsNarrativeChainModel.
                    extract_chain_feature_lists(chains,
                                                only_verb=opts.only_verb,
                                                adjectives=opts.adj))
                # Count all the events
                for chain in event_chains:
                    event_counts.update(chain)

            pbar.update(doc_num)
        pbar.finish()

        if opts.event_threshold is not None and opts.event_threshold > 0:
            log.info("Applying event threshold")
            # Apply a threshold event count
            to_remove = [
                event for (event, count) in event_counts.items()
                if count < opts.event_threshold
            ]
            pbar = get_progress_bar(len(to_remove), title="Filtering counts")
            for i, event in enumerate(to_remove):
                del event_counts[event]
                pbar.update(i)
            pbar.finish()

        log.info("Extracting pair counts")
        pbar = get_progress_bar(len(corpus), title="Pair feature extraction")
        # Loop over all the chains again to collect pairs of events
        pair_counts = Counter()
        for doc_num, document in enumerate(corpus):
            chains = document.get_chains()
            if len(chains):
                event_chains = list(
                    DistributionalVectorsNarrativeChainModel.
                    extract_chain_feature_lists(chains,
                                                only_verb=opts.only_verb,
                                                adjectives=opts.adj))
                # Count all the events
                for chain in event_chains:
                    # Count all pairs
                    pairs = []
                    for i in range(len(chain) - 1):
                        for j in range(i + 1, len(chain)):
                            if chain[i] in event_counts and chain[
                                    j] in event_counts:
                                pairs.append(
                                    tuple(sorted([chain[i], chain[j]])))
                    pair_counts.update(pairs)

            pbar.update(doc_num)
        pbar.finish()

        if opts.pair_threshold is not None and opts.pair_threshold > 0:
            log.info("Applying pair threshold")
            # Apply a threshold pair count
            to_remove = [
                pair for (pair, count) in pair_counts.items()
                if count < opts.pair_threshold
            ]
            if to_remove:
                pbar = get_progress_bar(len(to_remove),
                                        title="Filtering pair counts")
                for i, pair in enumerate(to_remove):
                    del pair_counts[pair]
                    pbar.update(i)
                pbar.finish()
            else:
                log.info("No counts removed")

        # Create a dictionary of the remaining vocabulary
        log.info("Building dictionary")
        dictionary = Dictionary([[event] for event in event_counts.keys()])
        # Put all the co-occurrence counts into a big matrix
        log.info("Building counts matrix: vocab size %d" % len(dictionary))
        vectors = numpy.zeros((len(dictionary), len(dictionary)),
                              dtype=numpy.float64)
        # Fill the matrix with raw counts
        for (event0, event1), count in pair_counts.items():
            if event0 in dictionary.token2id and event1 in dictionary.token2id:
                e0, e1 = dictionary.token2id[event0], dictionary.token2id[
                    event1]
                vectors[e0, e1] = count
                # Add the count both ways (it's only stored once above)
                vectors[e1, e0] = count

        # Now there are many things we could do to these counts
        if opts.pmi or opts.ppmi:
            log.info("Applying %sPMI" % "P" if opts.ppmi else "")
            # Apply PMI to the matrix
            # Compute the total counts for each event (note row and col totals are the same)
            log_totals = numpy.ma.log(vectors.sum(axis=0))
            vectors = numpy.ma.log(vectors * vectors.sum()) - log_totals
            vectors = (vectors.T - log_totals).T
            vectors = vectors.filled(0.)

            if opts.ppmi:
                # Threshold the PMIs at zero
                vectors[vectors < 0.] = 0.

        # Convert to sparse for SVD and storage
        vectors = csr_matrix(vectors)

        if opts.svd:
            log.info("Fitting SVD with %d dimensions" % opts.svd)
            training_metadata["svd from"] = vectors.shape[1]
            training_metadata["svd"] = opts.svd
            vector_svd = TruncatedSVD(opts.svd)
            vectors = vector_svd.fit_transform(vectors)

        log.info("Saving model: %s" % model_name)
        model = DistributionalVectorsNarrativeChainModel(
            dictionary,
            vectors,
            only_verb=opts.only_verb,
            training_metadata=training_metadata,
            adjectives=opts.adj)
        model.save(model_name)
        return model
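A standalone sketch of the (P)PMI step above on a tiny made-up co-occurrence matrix, mirroring the masked-log computation used before the optional SVD:

import numpy as np

counts = np.array([[0., 4., 1.],
                   [4., 0., 2.],
                   [1., 2., 0.]])               # symmetric co-occurrence counts
log_totals = np.ma.log(counts.sum(axis=0))
pmi = np.ma.log(counts * counts.sum()) - log_totals
pmi = (pmi.T - log_totals).T
pmi = pmi.filled(0.)                            # masked log(0) entries become 0
ppmi = np.where(pmi < 0., 0., pmi)              # threshold at zero for PPMI
print(ppmi)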
Example No. 18
    # strip_accents='unicode': replace all accented unicode chars; use_idf=True: enable inverse-document-frequency reweighting;
    # smooth_idf=True: prevents zero division for unseen words;
    # max_features: if not None, build a vocabulary that only considers the top max_features terms ordered by term frequency across the corpus
    tfidf_vect = TfidfVectorizer(strip_accents='unicode',
                                 max_features=500,
                                 use_idf=True,
                                 smooth_idf=True,
                                 sublinear_tf=False)
    trainForVector = tfidf_vect.fit_transform(trainForVector)
    num_features = len(tfidf_vect.get_feature_names())
    # n_components : Desired dimensionality of output data. Must be strictly less than the number of features.
    # n_iter : Number of iterations for randomized SVD solver.
    # random_state : If int, random_state is the seed used by the random number generator.
    #pca = TruncatedSVD(n_components = num_features-1, n_iter = 7, random_state = 42)
    pca = TruncatedSVD(n_components=300, n_iter=7, random_state=42)
    trainForVector = pca.fit_transform(trainForVector)

    # train
    i = 0
    trainFeat = []
    for pdf in trainF:
        v_all = list(pdf.getImgHistogram()) + list(trainForVector[i]) + list(
            pdf.getFeatVec())
        trainFeat.append(v_all)
        i += 1
    # test
    testFeat = []
    for pdf in testF:
        v_all = list(pdf.getImgHistogram()) + list(trainForVector[i]) + list(
            pdf.getFeatVec())
        testFeat.append(v_all)