def _bootstrap_pool(X, Y, X_saliences, Y_saliences, n_components, procrustes, algorithm, boot_i):
    """ basic version for parallel implementation of bootstrapping using pool
    """
    #seed the RNG so that each process draws different random numbers
    np.random.seed(int(time()) + boot_i)
    #resample row indices with replacement to build a bootstrap sample of the same size
    sample_indices = np.random.choice(range(X.shape[0]), size=X.shape[0], replace=True)
    X_boot = X[sample_indices,:]
    Y_boot = Y[sample_indices,:]
    X_boot_scaled = scale(X_boot)
    Y_boot_scaled = scale(Y_boot)

    covariance_boot = np.dot(Y_boot_scaled.T, X_boot_scaled)
    svd = TruncatedSVD(n_components, algorithm=algorithm)
    Y_saliences_boot, _, X_saliences_boot = svd._fit(covariance_boot)
    X_saliences_boot = X_saliences_boot.T
    
    #It does not matter which side we use to calculate the rotated singular values
    #let's pick the smaller one for optimization
    if len(X_saliences_boot) > len(Y_saliences_boot):
        #use procrustes_rotation on smaller dataset
        Y_bootstraps, rotation_matrix = _procrustes_rotation(Y_saliences, Y_saliences_boot)
        X_bootstraps = np.dot(X_saliences_boot, rotation_matrix)
    else:
        X_bootstraps, rotation_matrix = _procrustes_rotation(X_saliences, X_saliences_boot)
        Y_bootstraps = np.dot(Y_saliences_boot, rotation_matrix)  
         
    
    #print np.shape(X_bootstraps)
    #print np.shape(Y_bootstraps)
   
    return X_bootstraps, Y_bootstraps
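
A minimal driver sketch (not part of the original code): it assumes X and Y are NumPy arrays with matching row counts and that X_saliences / Y_saliences come from an initial fit such as fit_pls below; run_bootstraps and its parameters are illustrative names.

from functools import partial
from multiprocessing import Pool

def run_bootstraps(X, Y, X_saliences, Y_saliences, n_components,
                   n_boot=1000, n_procs=4, algorithm="randomized"):
    #bind everything except boot_i so Pool.map can hand out bootstrap indices
    worker = partial(_bootstrap_pool, X, Y, X_saliences, Y_saliences,
                     n_components, True, algorithm)
    with Pool(n_procs) as pool:
        results = pool.map(worker, range(n_boot))
    #each element is an (X_bootstraps, Y_bootstraps) tuple for one resample
    return results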
Example #2
class TruncatedSVDImpl():
    def __init__(self,
                 n_components=2,
                 algorithm='randomized',
                 n_iter=5,
                 random_state=None,
                 tol=0.0):
        self._hyperparams = {
            'n_components': n_components,
            'algorithm': algorithm,
            'n_iter': n_iter,
            'random_state': random_state,
            'tol': tol
        }
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if (y is not None):
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def transform(self, X):
        return self._wrapped_model.transform(X)
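
A short usage sketch, assuming Op aliases sklearn.decomposition.TruncatedSVD (the wrapper only forwards its hyperparameters):

import numpy as np

X = np.random.rand(100, 50)
impl = TruncatedSVDImpl(n_components=2, algorithm='randomized')
X_reduced = impl.fit(X).transform(X)   #shape (100, 2)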
Example #3
def _permute_and_calc_singular_values_pool(X, Y, X_saliences, Y_saliences,
                                           n_components, procrustes, algorithm,
                                           perm_i):
    """ basic version for parallel implementation using pool
    """
    #seed the RNG so that each process draws different random numbers
    np.random.seed(int(time()) + perm_i)

    if len(X) < len(Y):
        #apply permutation to shorter list
        #print "randomization X<Y"
        X_perm = np.random.permutation(X)
        covariance_perm = np.dot(Y.T, X_perm)
    else:
        #print "other permutation"
        Y_perm = np.random.permutation(Y)
        covariance_perm = np.dot(Y_perm.T, X)

    svd = TruncatedSVD(n_components, algorithm=algorithm)

    Y_saliences_perm, singular_values_perm, X_saliences_perm = svd._fit(
        covariance_perm)

    if procrustes:
        #It does not matter which side we use to calculate the rotated singular values
        #let's pick the smaller one for optimization
        if len(X_saliences_perm) > len(Y_saliences_perm):
            _, _, singular_values_perm = _procrustes_rotation(
                Y_saliences, Y_saliences_perm, singular_values_perm)
        else:
            X_saliences_perm = X_saliences_perm.T
            _, _, singular_values_perm = _procrustes_rotation(
                X_saliences, X_saliences_perm, singular_values_perm)

    return singular_values_perm
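
An illustrative follow-up (hypothetical helper, not from the original code): the permuted singular values returned by this worker can be stacked into a null distribution and turned into per-component p-values.

import numpy as np

def permutation_p_values(observed_singular_values, perm_singular_values):
    #perm_singular_values is assumed to be shaped (n_permutations, n_components)
    perm = np.asarray(perm_singular_values)
    exceed = (perm >= observed_singular_values[np.newaxis, :]).sum(axis=0)
    #add-one correction keeps the p-values strictly positive
    return (exceed + 1.0) / (perm.shape[0] + 1.0)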
Example #4
class RawModel:
    def __init__(self):
        # 2015-05-15 GEL Found that n_components=20 gives a nice balance of 
        # speed (substantial improvement), accuracy, and reduced memory usage 
        # (25% decrease).
        self.decomposer = TruncatedSVD(n_components=20)

        # 2015-05-15 GEL algorithm='ball_tree' uses less memory on average than 
        # algorithm='kd_tree'
        
        # 2015-05-15 GEL Evaluation of metrics by accuracy (based on 8000 training examples)
        # euclidean        0.950025
        # manhattan        0.933533
        # chebyshev        0.675662
        # hamming          0.708646
        # canberra         0.934033
        # braycurtis       0.940530
        self.model = KNeighborsClassifier(n_neighbors=5, algorithm='ball_tree', metric='euclidean')

    def fit(self, trainExamples):       
        X = self.decomposer.fit_transform( vstack( [reshape(x.X, (1, x.WIDTH * x.HEIGHT)) for x in trainExamples] ) )
        Y = [x.Y for x in trainExamples]

        self.model.fit(X, Y)
        return self

    def predict(self, examples):
        X = self.decomposer.transform( vstack( [reshape(x.X, (1, x.WIDTH * x.HEIGHT)) for x in examples] ) )
        return self.model.predict( X )
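
A hedged sketch of how the metric comparison quoted in the comments could be reproduced (the helper name and the 5-fold split are assumptions, not taken from the original code):

from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

def compare_knn_metrics(X, y, metrics=('euclidean', 'manhattan', 'chebyshev',
                                       'canberra', 'braycurtis')):
    scores = {}
    for metric in metrics:
        clf = KNeighborsClassifier(n_neighbors=5, algorithm='ball_tree', metric=metric)
        scores[metric] = cross_val_score(clf, X, y, cv=5, scoring='accuracy').mean()
    return scores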
Example #5
    def __init__(self, *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 docker_containers: Dict[str, DockerContainer] = None) -> None:

        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
        
        # False
        self._clf = TruncatedSVD(
              n_components=self.hyperparams['n_components'],
              algorithm=self.hyperparams['algorithm']['choice'],
              n_iter=self.hyperparams['algorithm'].get('n_iter', 5),
              tol=self.hyperparams['algorithm'].get('tol', 0),
              random_state=self.random_seed,
        )

        self.primitiveNo = PrimitiveCount.primitive_no
        PrimitiveCount.primitive_no += 1

        
        
        self._inputs = None
        self._outputs = None
        self._training_inputs = None
        self._training_outputs = None
        self._target_names = None
        self._training_indices = None
        self._target_column_indices = None
        self._target_columns_metadata: List[OrderedDict] = None
        self._input_column_names = None
        self._fitted = False
Example #6
def _permute_and_calc_singular_values(X,
                                      Y,
                                      X_saliences,
                                      Y_saliences,
                                      singular_values_samples,
                                      perm_i,
                                      n_components,
                                      procrustes=False,
                                      algorithm="randomized"):
    if len(X) < len(Y):
        X_perm = np.random.permutation(X)
        covariance_perm = np.dot(Y.T, X_perm)
    else:
        Y_perm = np.random.permutation(Y)
        covariance_perm = np.dot(Y_perm.T, X)
    svd = TruncatedSVD(n_components, algorithm=algorithm)
    Y_saliences_perm, singular_values_perm, X_saliences_perm = svd._fit(
        covariance_perm)

    if procrustes:
        #It does not matter which side we use to calculate the rotated singular values
        #let's pick the smaller one for optimization
        if len(X_saliences_perm) > len(Y_saliences_perm):
            _, _, singular_values_samples[:, perm_i] = _procrustes_rotation(
                Y_saliences, Y_saliences_perm, singular_values_perm)
        else:
            X_saliences_perm = X_saliences_perm.T
            _, _, singular_values_samples[:, perm_i] = _procrustes_rotation(
                X_saliences, X_saliences_perm, singular_values_perm)
    else:
        singular_values_samples[:, perm_i] = singular_values_perm
Example #7
def _permute_and_calc_singular_values_process(X, Y, a, b, n_components,
                                              algorithm, output, x):  #perm_i
    """ basic version for parallel implementation using processes and output queue
    """

    #seed the RNG so that each call draws different random numbers
    #pid = current_process()._identity[0]
    #randst = np.random.mtrand.RandomState(pid)
    np.random.seed(int(time()) + x + 50)

    #test how permutation works
    c = np.random.permutation(a)
    print(a)
    print(c)

    if len(X) < len(Y):
        #apply permutation to shorter list
        #print "randomization X<Y"
        X_perm = np.random.permutation(X)
        covariance_perm = np.dot(Y.T, X_perm)
    else:
        #print "other permutation"
        Y_perm = np.random.permutation(Y)
        covariance_perm = np.dot(Y_perm.T, X)

    svd = TruncatedSVD(n_components, algorithm=algorithm)

    #print covariance_perm
    Y_saliences_perm, singular_values_perm, X_saliences_perm = svd._fit(
        covariance_perm)

    output.put(singular_values_perm)
Example #8
 def outliersSvdReduction(self):
     svd = TruncatedSVD(n_components=1)
     ordersSvd = svd.fit_transform(
         self.training_order_start_end_districts_and_time,
         self.training_number_of_orders)
     priceSvd = svd.fit_transform(
         self.training_order_start_end_districts_and_time,
         self.training_order_median_price)
     self.outliersPriceOrders(ordersSvd, priceSvd)
Example #9
def write_spacy_vocab(output_dirpath, vocab_size, embedding_dim):
    if not os.path.exists(output_dirpath):
        os.makedirs(output_dirpath)

    allowed_chars = set(string.ascii_letters + string.punctuation)
    ascii = set(string.ascii_letters)
    ascii_plus_period = set(string.ascii_letters + '.')
    word_set = set()
    spacy_vocab = spacy.load('en').vocab
    top_words = []

    for w in spacy_vocab:
        if w.rank > 2 * vocab_size:
            continue
        try:
            word_string = str(w.lower_).strip()
            if not word_string:
                continue
            if word_string in word_set:
                continue
            if any(bad_char in word_string
                   for bad_char in ('[', ']', '<', '>', '{', '}')):
                # these are used to mark word types and person ids.
                continue
            if any(c not in allowed_chars for c in word_string):
                continue
            if sum(1 for c in word_string if c not in ascii_plus_period) > 2:
                continue
            if word_string[-1] == '.' and sum(
                    1 for c in word_string if c in ascii) > 2:
                continue

            top_words.append(w)
            word_set.add(word_string)
        except:
            pass

    top_words.sort(key=lambda w: w.rank)
    top_words = top_words[:vocab_size]

    with open(os.path.join(output_dirpath, 'vocab'), 'w') as f:
        for word in top_words:
            f.write('%s\n' % word.lower_.strip())

    vectors = np.array([w.vector for w in top_words])
    svd = TruncatedSVD(n_components=embedding_dim, algorithm='arpack')
    embeddings = svd.fit_transform(vectors)

    print(embeddings.shape)
    print([
        sum(svd.explained_variance_ratio_[:i])
        for i in range(1, embedding_dim + 1)
    ])
    np.save(os.path.join(output_dirpath, 'pretrained_embeddings.npy'),
            embeddings)
Example #10
 def fit_transform(self, X, Y):
     if self.standardize:
         X = _standardize(X)
         Y = _standardize(Y)
     K = X.dot(Y.T)
     model = TruncatedSVD(n_components=self.n_components,
                          random_state=self.random_state)
     U = model.fit_transform(K)
     U = U / np.linalg.norm(U, axis=0)
     V = model.components_.T
     self.correlations = np.array(
         [pearsonr(u.dot(X), v.dot(Y))[0] for u, v in zip(U.T, V.T)])
     return U, V
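
A self-contained restatement of what this method computes, using toy data (shapes are arbitrary; variables sit on the rows and the two matrices share their columns/samples):

import numpy as np
from scipy.stats import pearsonr
from sklearn.decomposition import TruncatedSVD

rng = np.random.RandomState(0)
X = rng.randn(40, 300)   #40 X-variables measured on 300 shared samples
Y = rng.randn(25, 300)   #25 Y-variables on the same samples
K = X.dot(Y.T)           #cross-product between the two variable sets
svd = TruncatedSVD(n_components=3, random_state=0)
U = svd.fit_transform(K)
U = U / np.linalg.norm(U, axis=0)
V = svd.components_.T
correlations = np.array(
    [pearsonr(u.dot(X), v.dot(Y))[0] for u, v in zip(U.T, V.T)])
print(correlations)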
Example #11
class RegressionRecommender(object):
    def __init__(self, feature_size=10, regressor=None):
        self.feature_size = feature_size
        self.user_svd = TruncatedSVD(n_components=feature_size)
        self.item_svd = TruncatedSVD(n_components=feature_size)
        if regressor is None:
            regressor = LinearRegression()
        self.regressor = regressor

    def fit(self, rating):
        # rating (item x user)
        item_features = self.item_svd.fit_transform(rating)
        user_features = self.user_svd.fit_transform(rating.T)
        self.item_features = item_features
        self.user_features = user_features

        n_item, n_user = rating.shape
        n_examples = rating.count_nonzero()
        X = zeros((n_examples, self.feature_size + self.feature_size))
        y = zeros((n_examples, 1))
        for i, (item, user) in enumerate(zip(*rating.nonzero())):
            X[i] = concatenate([item_features[item], user_features[user]],
                               axis=0)
            y[i] = rating[item, user]

        self.regressor.fit(X, y)
        return self

    def predict(self, item, user):
        user_features = self.user_features[user]
        item_features = self.item_features[item]

        input_features = concatenate([user_features, item_features]).reshape(1, -1)
        return self.regressor.predict(input_features)

    def save(self, filepath):
        to_save = {
            'regressor': self.regressor,
            'user_svd': self.user_svd,
            'item_svd': self.item_svd
        }
        with open(filepath, 'wb') as handle:
            saver = Pickler(handle, protocol=HIGHEST_PROTOCOL)
            saver.dump(to_save)

    def load(self, filepath):
        with open(filepath, 'rb') as handle:
            loader = Unpickler(handle)
            state = loader.load()
            self.regressor = state['regressor']
            self.user_svd = state['user_svd']
            self.item_svd = state['item_svd']
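
Illustrative usage only (the synthetic rating matrix below is made up for this sketch; a real item x user matrix from explicit feedback is assumed):

import numpy as np
from scipy.sparse import csr_matrix

rng = np.random.RandomState(0)
dense = rng.randint(0, 6, size=(200, 50)) * (rng.rand(200, 50) < 0.1)
rating = csr_matrix(dense)   #items on the rows, users on the columns
rec = RegressionRecommender(feature_size=10).fit(rating)
print(rec.predict(item=3, user=7))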
Example #12
 def fit(self, X, y=None):
     self._sklearn_model = SKLModel(**self._hyperparams)
     if (y is not None):
         self._sklearn_model.fit(X, y)
     else:
         self._sklearn_model.fit(X)
     return self
Example #13
def create_union_transf(_):
    pca2c_transformer = make_pipeline(
        drop_transform,
        SimpleImputer(),
        StandardScaler(),
        PCA(n_components=2),
    )

    os_transformer = make_pipeline(
        FunctionTransformer(lambda x: x.os, validate=False),
        CountVectorizer(),
        TruncatedSVD(n_components=10),
    )

    arch_transformer = FunctionTransformer(lambda x: pd.get_dummies(x.cpuArch),
                                           validate=False)

    gmm_transformer = make_pipeline(
        drop_transform, SimpleImputer(), StandardScaler(), PCA(n_components=2),
        FunctionTransformer(lambda x: GaussianMixture(n_components=3).
                            fit_predict(x)[np.newaxis].T))

    transf = make_union(
        drop_transform,
        gmm_transformer,
        os_transformer,
        arch_transformer,
        pca2c_transformer,
    )
    return transf
Example #14
def fit_pls(X, Y, n_components, scale=True, algorithm="randomized"):
    #scaling
    if scale:
        X_scaled = zscore(X, axis=0, ddof=1)
        Y_scaled = zscore(Y, axis=0, ddof=1)
        covariance = np.dot(Y_scaled.T, X_scaled)
    else:
        covariance = np.dot(Y.T, X)

    svd = TruncatedSVD(n_components, algorithm=algorithm)
    Y_saliences, singular_values, X_saliences = svd._fit(covariance)
    X_saliences = X_saliences.T
    inertia = singular_values.sum()

    if scale:
        return X_saliences, Y_saliences, singular_values, inertia, X_scaled, Y_scaled
    else:
        return X_saliences, Y_saliences, singular_values, inertia
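
A small usage sketch, assuming X and Y are 2-D arrays with the same number of rows (observations) and that the TruncatedSVD used here is the variant whose _fit returns (U, singular_values, VT):

import numpy as np

X = np.random.randn(100, 40)
Y = np.random.randn(100, 10)
X_sal, Y_sal, svals, inertia, X_scaled, Y_scaled = fit_pls(X, Y, n_components=3)
#X_sal has shape (40, 3) and Y_sal has shape (10, 3); svals are the top-3
#singular values of the (10, 40) cross-covariance matrix and inertia is their sum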
Example #15
def reduce_dimensionality(dataframe, maxvariance, columns_to_drop):
    '''
    Performs PCA on a pandas feature dataframe and keeps only the principal
    components needed to explain a given fraction of the variance
    '''
    dataframe_without_columns = dataframe.drop(columns_to_drop, axis=1)
    LOGGER.info('Columns to be used by pca:')
    print(dataframe_without_columns.columns)
    LOGGER.info('Adding noise to dataframe')
    dataframe_without_columns = dataframe_without_columns + numpy.random.normal(
        size=dataframe_without_columns.shape) * 1.e-19
    LOGGER.info('Starting PCA')
    try:
        pca = PCA(n_components='mle')
        pca.fit(dataframe_without_columns)
        # transform
        samples = pca.transform(dataframe_without_columns)
        # aggregated sum of variances
        sum_variance = sum(pca.explained_variance_)
        list_variance = pca.explained_variance_
        #print sum_variance, pca.explained_variance_
        # get those having aggregated variance below threshold
    except ValueError:
        LOGGER.info('PCA failed, using truncated SVD')
        svd = TruncatedSVD(n_components=3)
        svd.fit(dataframe_without_columns)
        samples = svd.transform(dataframe_without_columns)
        sum_variance = sum(svd.explained_variance_)
        list_variance = svd.explained_variance_

    scomp = 0
    ncomp = 0
    while scomp < maxvariance:
        #c = pca.explained_variance_[ncomp]
        c = list_variance[ncomp]
        scomp = scomp + c / sum_variance
        ncomp = ncomp + 1
    # reduce dimensionality
    samples = samples[:, :ncomp]
    LOGGER.info("Number of features after PCA transformation %s" %
                samples.shape[1])
    return samples
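
The while loop above keeps the smallest number of components whose variance fractions reach maxvariance; the same count can be computed with a cumulative sum (illustrative helper, not part of the original code):

import numpy

def n_components_for_variance(explained_variance, maxvariance):
    fractions = numpy.asarray(explained_variance) / numpy.sum(explained_variance)
    return int(numpy.searchsorted(numpy.cumsum(fractions), maxvariance)) + 1

#e.g. n_components_for_variance([4.0, 3.0, 2.0, 1.0], 0.75) -> 3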
Example #16
    def fit_transform(self, Xs):
        """
        Optimize each CC component and return per-data-set projections.
        :param Xs: List of matrices with the same number of columns.
        :return: CCA subspace.
        """
        p = len(Xs)
        Ws = [np.zeros((X.shape[0], self.n_components)) for X in Xs]
        if self.standardize:
            Xs = list(map(_standardize, Xs))
        Ws_init = [
            TruncatedSVD(n_components=self.n_components,
                         random_state=self.random_state).fit_transform(X)
            for X in Xs
        ]
        correlations = np.zeros((self.n_components, ))

        # Optimize each CC component individually
        for cc in range(self.n_components):
            w_cur = [Wi[:, cc] / np.linalg.norm(Wi[:, cc]) for Wi in Ws_init]
            for itr in range(self.max_iter):
                o1 = self._objective(Xs, w_cur)
                for i in range(p):
                    wi = 0
                    for j in range(p):
                        if i == j:
                            continue
                        wj = w_cur[j]
                        Dj = np.diag(
                            np.diagonal(Ws[i].T.dot(Xs[i]).dot(Xs[j].T.dot(
                                Ws[j]))))
                        wi += Xs[i].dot((Xs[j].T.dot(wj))) - Ws[i].dot(Dj).dot(
                            Ws[j].T).dot(wj)
                    w_cur[i] = wi / np.linalg.norm(wi)
                o2 = self._objective(Xs, w_cur)
                if abs(o2 - o1) / abs(o1) < self.tol:
                    break
            for i in range(p):
                Ws[i][:, cc] = w_cur[i]

            # Compute average correlations
            n_pairs = p * (p - 1) / 2
            for i, j in it.combinations(range(p), 2):
                wi = Ws[i][:, cc].T.dot(Xs[i])
                wj = Ws[j][:, cc].T.dot(Xs[j])
                correlations[cc] += pearsonr(wi, wj)[0] / n_pairs

        # Orientate vectors
        s = np.sign(Ws[0][0, :])
        for i in range(p):
            Ws[i] = Ws[i] * s

        self.correlations = correlations
        return Ws
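
The TruncatedSVD warm start used above, shown in isolation on toy data (variables on the rows, a shared set of samples on the columns):

import numpy as np
from sklearn.decomposition import TruncatedSVD

rng = np.random.RandomState(0)
Xs = [rng.randn(d, 300) for d in (40, 25, 60)]
Ws_init = [TruncatedSVD(n_components=3, random_state=0).fit_transform(X) for X in Xs]
#one (d_i, 3) starting weight matrix per data set, later refined component by component
print([W.shape for W in Ws_init])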
Example #17
def compute_reduced_embeddings_original_vocab(output_vocab_filepath,
                                              output_embeddings_filepath,
                                              input_vocab_filepath, vocab_size,
                                              embedding_dim):
    print(N_FREE_TOKENS)
    vocab = Vocab(input_vocab_filepath, 1.5 * vocab_size)
    spacy_vocab = spacy.load('en').vocab
    matrix = np.zeros((vocab_size, spacy_vocab.vectors_length),
                      dtype=np.float32)
    new_i = 0
    final_vocab = []

    for i, word in vocab._id_to_word.items():
        if new_i == vocab_size:
            break

        if i >= N_FREE_TOKENS and word not in spacy_vocab:
            continue

        if i >= N_FREE_TOKENS:
            final_vocab.append(word)

        matrix[new_i] = spacy_vocab[word].vector
        new_i += 1

    print('Last word added:', final_vocab[-1])
    if embedding_dim < spacy_vocab.vectors_length:
        svd = TruncatedSVD(n_components=embedding_dim, algorithm='arpack')
        embeddings = svd.fit_transform(matrix)
        print(embeddings.shape)
        print([
            sum(svd.explained_variance_ratio_[:i])
            for i in range(1, embedding_dim + 1)
        ])
    else:
        embeddings = matrix

    with open(output_vocab_filepath, 'w') as output:
        for word in final_vocab:
            output.write('%s\n' % word)
    np.save(output_embeddings_filepath, embeddings)
Example #18
    def __init__(self, path, corpusName, query=None):
        self.query = query
        documents = (line.lower().split() for line in codecs.open(
            corpusName + ".txt", mode='r', encoding='utf-8', errors='ignore'))
        self.corpus = [' '.join(i) for i in documents]
        if self.query is not None:
            self.corpus.append(' '.join(query.getTokens()))

        # Make models
        t0 = time()
        print "Creating SciKit TF-IDF Model"
        self.tfidfModel = TfidfVectorizer().fit_transform(self.corpus)
        print("Done in %0.3fs." % (time() - t0))

        print "Creating SciKit LSA Model"
        t0 = time()
        lsa = TruncatedSVD(n_components=300)
        self.lsaModel = lsa.fit_transform(self.tfidfModel)
        self.lsaModel = Normalizer(copy=False).fit_transform(self.lsaModel)
        print("Done in %0.3fs." % (time() - t0))

        print "Creating SciKit LDA Model"
        # Use tf (raw term count) features for LDA.
        print("Extracting tf features for LDA")
        tf_vectorizer = CountVectorizer(max_features=2000)
        t0 = time()
        tf = tf_vectorizer.fit_transform(self.corpus)
        print("Done in %0.3fs." % (time() - t0))
        print("Fitting LDA model")
        lda = LatentDirichletAllocation(n_topics=300,
                                        max_iter=5,
                                        learning_method='online',
                                        learning_offset=50.,
                                        random_state=0)
        t0 = time()
        self.ldaModel = lda.fit_transform(tf)
        self.ldaModel = Normalizer(copy=False).fit_transform(self.ldaModel)
        print("Done in %0.3fs." % (time() - t0))
Example #19
def _boostrap(X,
              Y,
              X_saliences,
              Y_saliences,
              X_saliences_bootstraps,
              Y_saliences_bootstraps,
              bootstrap_i,
              n_components,
              algorithm="randomized"):
    sample_indices = np.random.choice(list(range(X.shape[0])),
                                      size=X.shape[0],
                                      replace=True)
    X_boot = X[sample_indices, :]
    Y_boot = Y[sample_indices, :]
    X_boot_scaled = scale(X_boot)
    Y_boot_scaled = scale(Y_boot)

    covariance_boot = np.dot(Y_boot_scaled.T, X_boot_scaled)
    svd = TruncatedSVD(n_components, algorithm=algorithm)
    Y_saliences_boot, _, X_saliences_boot = svd._fit(covariance_boot)
    X_saliences_boot = X_saliences_boot.T

    #It does not matter which side we use to calculate the rotated singular values
    #let's pick the smaller one for optimization
    if len(X_saliences_boot) > len(Y_saliences_boot):
        Y_saliences_bootstraps[:, :,
                               bootstrap_i], rotation_matrix = _procrustes_rotation(
                                   Y_saliences, Y_saliences_boot)
        X_saliences_bootstraps[:, :,
                               bootstrap_i] = np.dot(X_saliences_boot,
                                                     rotation_matrix)
    else:
        X_saliences_bootstraps[:, :,
                               bootstrap_i], rotation_matrix = _procrustes_rotation(
                                   X_saliences, X_saliences_boot)
        Y_saliences_bootstraps[:, :,
                               bootstrap_i] = np.dot(Y_saliences_boot,
                                                     rotation_matrix)
Example #20
 def __init__(self,
              n_components=2,
              algorithm='randomized',
              n_iter=5,
              random_state=None,
              tol=0.0):
     self._hyperparams = {
         'n_components': n_components,
         'algorithm': algorithm,
         'n_iter': n_iter,
         'random_state': random_state,
         'tol': tol
     }
     self._wrapped_model = Op(**self._hyperparams)
Example #21
    def __init__(self, path, corpusName, query=None):
        self.query = query
        documents = (line.lower().split() for line in codecs.open(
            corpusName + ".txt", mode='r', encoding='utf-8', errors='ignore'))
        self.corpus = [' '.join(i) for i in documents]
        if self.query is not None:
            self.corpus.append(' '.join(query.getTokens()))

        # Make models
        t0 = time()
        print "Creating SciKit TF-IDF Model"
        self.tfidfModel = TfidfVectorizer().fit_transform(self.corpus)
        print("Done in %0.3fs." % (time() - t0))

        print "Creating SciKit LSA Model"
        t0 = time()
        lsa = TruncatedSVD(n_components=300)
        self.lsaModel = lsa.fit_transform(self.tfidfModel)
        self.lsaModel = Normalizer(copy=False).fit_transform(self.lsaModel)
        print("Done in %0.3fs." % (time() - t0))

        print "Creating SciKit LDA Model"
        # Use tf (raw term count) features for LDA.
        print("Extracting tf features for LDA")
        tf_vectorizer = CountVectorizer(max_features=2000)
        t0 = time()
        tf = tf_vectorizer.fit_transform(self.corpus)
        print("Done in %0.3fs." % (time() - t0))
        print("Fitting LDA model")
        lda = LatentDirichletAllocation(n_topics=300, max_iter=5,
                                        learning_method='online',
                                        learning_offset=50.,
                                        random_state=0)
        t0 = time()
        self.ldaModel = lda.fit_transform(tf)
        self.ldaModel = Normalizer(copy=False).fit_transform(self.ldaModel)
        print("Done in %0.3fs." % (time() - t0))
Example #22
def fit_pls(X, Y, n_components, scale=True, algorithm="randomized"):
    #scaling

    print "calculating SVD"
    if scale:
        X_scaled = zscore(X, axis=0, ddof=1)
        Y_scaled = zscore(Y, axis=0, ddof=1)
        covariance = np.dot(Y_scaled.T, X_scaled)
    else:
        covariance = np.dot(Y.T, X)

    print(np.shape(covariance))
    sum_var = covariance
    svd = TruncatedSVD(n_components, algorithm=algorithm)
    #computes only the first n_components largest singular values
    #produces a low-rank approximation of covariance matrix
    Y_saliences, singular_values, X_saliences = svd._fit(covariance)
    X_saliences = X_saliences.T
    inertia = singular_values.sum()

    if scale:
        return X_saliences, Y_saliences, singular_values, inertia, X_scaled, Y_scaled, sum_var
    else:
        return X_saliences, Y_saliences, singular_values, inertia
Example #23
    def __init__(self):
        # 2015-05-15 GEL Found that n_components=20 gives a nice balance of 
        # speed (substantial improvement), accuracy, and reduced memory usage 
        # (25% decrease).
        self.decomposer = TruncatedSVD(n_components=20)

        # 2015-05-15 GEL algorithm='ball_tree' uses less memory on average than 
        # algorithm='kd_tree'
        
        # 2015-05-15 GEL Evaluation of metrics by accuracy (based on 8000 training examples)
        # euclidean        0.950025
        # manhattan        0.933533
        # chebyshev        0.675662
        # hamming          0.708646
        # canberra         0.934033
        # braycurtis       0.940530
        self.model = KNeighborsClassifier(n_neighbors=5, algorithm='ball_tree', metric='euclidean')
Example #24
def build_accesson(options):
    ngroups, ncell_cut = int(options.ngroup), int(options.ncell)
    reads = scipy.io.mmread(options.s + '/matrix/filtered_reads.mtx')
    reads = scipy.sparse.csr_matrix(reads) * 1.0
    cells = pandas.read_csv(options.s + '/matrix/filtered_cells.csv',
                            sep='\t',
                            index_col=0,
                            engine='c',
                            na_filter=False,
                            low_memory=False)
    cells = cells.index.values
    peaks = ['peak' + str(x) for x in range(0, reads.shape[0])]
    scale = numpy.array(10000.0 / reads.sum(axis=0))[0]
    sklearn.utils.sparsefuncs.inplace_column_scale(reads, scale)
    reads.data = numpy.log2(reads.data + 1)
    npc = min(int(options.npc), reads.shape[0], reads.shape[1])
    if len(cells) > ncell_cut:
        pca_result = TruncatedSVD(n_components=npc,
                                  algorithm='arpack',
                                  random_state=0).fit_transform(reads)
    else:
        pca_result = PCA(n_components=npc,
                         svd_solver='full').fit_transform(reads.A)
    connectivity = kneighbors_graph(pca_result,
                                    n_neighbors=10,
                                    include_self=False)
    connectivity = 0.5 * (connectivity + connectivity.T)
    ward_linkage = cluster.AgglomerativeClustering(n_clusters=ngroups,
                                                   linkage='ward',
                                                   connectivity=connectivity)
    #    ward_linkage = cluster.AgglomerativeClustering(n_clusters=ngroups, linkage='ward')
    y_predict = ward_linkage.fit_predict(pca_result)
    peak_labels_df = pandas.DataFrame(y_predict,
                                      index=peaks,
                                      columns=['group'])
    peak_labels_df.to_csv(options.s + '/matrix/Accesson_peaks.csv', sep='\t')
    groups = list(set(y_predict))
    coAccess_matrix = numpy.array(
        [reads[numpy.where(y_predict == x)[0], :].sum(axis=0) for x in groups])
    coAccess_matrix = coAccess_matrix[:, 0, :].T
    coAccess_df = pandas.DataFrame(coAccess_matrix,
                                   index=cells,
                                   columns=groups)
    coAccess_df.to_csv(options.s + '/matrix/Accesson_reads.csv', sep=',')
    return
Example #25
class SimilarityRecommender(object):
    def __init__(self, feature_size=10):
        self.feature_size = feature_size
        self.svd = TruncatedSVD(n_components=feature_size)
        self.rating = None

    def fit(self, rating):
        # rating (item x user)
        self.rating = rating
        item = self.svd.fit_transform(rating)
        similarity = defaultdict(lambda: dict())

        n_item, n_user = rating.shape
        for first in tqdm(range(n_item)):
            for second in range(first):
                first_item = item[first].reshape(1, -1)
                second_item = item[second].reshape(1, -1)
                similarity[first][second] = float(
                    cosine_similarity(first_item, second_item)[0, 0])

        self.similarity = dict(similarity)
        return self

    def predict(self, user, item):
        #indices of the items this user has rated, paired with the ratings themselves
        rated_items = self.rating[:, user].nonzero()[0]
        history = [(rated_item, self.rating[rated_item, user])
                   for rated_item in rated_items]
        absolute_score = sum(
            self.get_similarity(item, user_item) * rating
            for user_item, rating in history)
        score = float(absolute_score) / sum(rating for _, rating in history)
        return score

    def similar_to(self, item, n=5):

        return

    def get_similarity(self, item, target):
        return self.similarity[item][
            target] if item > target else self.similarity[target][item]

    def save(self, filepath):
        with open(filepath, 'w') as handle:
            json.dump(self.similarity, handle)
Example #26
                               names=header,
                               engine='python')

# Number of users in current set
print('Number of unique users in current data-set',
      active_time_data.user_id.unique().shape[0])
print('Number of unique articles in current data-set',
      active_time_data.item_id.unique().shape[0])

# SVD allows us to look at our input matrix as a product of three smaller matrices; U, Z and V.
# In short this will help us discover concepts from the original input matrix,
# (subsets of users that like subsets of items)
# Note that use of SVD is not strictly restricted to user-item matrices
# https://www.youtube.com/watch?v=P5mlg91as1c

algorithm = TruncatedSVD()

# Finally we run our cross validation in n folds, where n is given by the cv parameter.
# Verbosity can be adjusted with the integer passed as verbose.
# We pass in our SVD algorithm as the estimator used to fit the data.
# X is the data set that we want to fit.
# Since our estimator (the SVD algorithm) has no built-in notion of success, we must either define our own
# estimator or simply define how to score the fit.
# Since we currently rate the enjoyment of our users per article in a highly binary way (please see the rate_article fn
# in the filter script), we can easily decide precision and recall based on whether or not our prediction exactly
# matches the binary rating field in the test set.
# Thus, the F1 scoring metric seems an intuitive choice for measuring our success, as it provides a balanced score
# based on the two.

cv(estimator=algorithm, X=active_time_data, scoring='f1', cv=5, verbose=True)
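
A toy check of the U·Z·V factorization described in the comments above (names and data are invented for this sketch):

import numpy as np
from sklearn.decomposition import TruncatedSVD

X = np.random.rand(20, 8)
svd = TruncatedSVD(n_components=3).fit(X)
U_Z = svd.transform(X)                  #U scaled by the singular values (Z)
approximation = U_Z @ svd.components_   #rank-3 approximation of X
print(np.linalg.norm(X - approximation))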
Example #27
    def __init__(self):

        self.components = 2
        self.svd = TruncatedSVD(n_components=self.components)
        self.reductCount = 0
        for file_name, data_set in [
            (RunRegression.REGRESSION_TRAINING_INPUT_FILE_NAME,
             FileIo.TRAINING_DATA_SET),
            (RunRegression.REGRESSION_TESTING_INPUT_FILE_NAME,
             FileIo.TEST_DATA_SET)
        ]:

            # Check and see if the data has already been saved
            try:

                logging.info("RunRegression: Trying to load " + data_set +
                             " data")

                saved_data = numpy.load(file_name, mmap_mode='r')

            # If the data is not found, load it
            except IOError:

                logging.info(
                    "RunRegression: Saved data not found. Generating " +
                    data_set + " data")

                # Generate inputs
                poi_district_lookup = PoiDistrictLookup.PoiDistrictLookup()
                order_categorical_lookup = OrderCategoricalLookup.OrderCategoricalLookup(
                    poi_district_lookup)
                regression_input = RegressionInput.RegressionInput(
                    data_set, order_categorical_lookup, poi_district_lookup)

                if data_set == FileIo.TRAINING_DATA_SET:

                    self.training_order_start_end_districts_and_time, self.training_order_median_price, \
                        self.training_number_of_orders = regression_input.get_regression_inputs()

                    # Save the data for next time
                    numpy.savez(
                        file_name,
                        order_keys=self.
                        training_order_start_end_districts_and_time,
                        order_value_price=self.training_order_median_price,
                        order_value_number=self.training_number_of_orders)

                else:

                    self.testing_order_start_end_districts_and_time, self.testing_order_median_price, \
                        self.testing_number_of_orders  = regression_input.get_regression_inputs()

                    # Save the data for next time
                    numpy.savez(
                        file_name,
                        order_keys=self.
                        testing_order_start_end_districts_and_time,
                        order_value_price=self.testing_order_median_price,
                        order_value_number=self.testing_number_of_orders)

            # If the saved data is found, load it
            else:

                logging.info("RunRegression: Loading " + data_set + " data")

                if data_set == FileIo.TRAINING_DATA_SET:

                    self.training_order_start_end_districts_and_time, self.training_order_median_price, \
                        self.training_number_of_orders = saved_data['order_keys'], \
                                                         saved_data['order_value_price'], \
                                                         saved_data['order_value_number']

                    self.dimensions = self.training_order_start_end_districts_and_time.shape[
                        1]
                    self.initial = self.training_order_start_end_districts_and_time
                    logging.info("RunRegression: Loaded " +
                                 str(len(self.training_number_of_orders)) +
                                 " train data rows")
                else:

                    self.testing_order_start_end_districts_and_time, self.testing_order_median_price, \
                        self.testing_number_of_orders = saved_data['order_keys'], \
                                                        saved_data['order_value_price'], \
                                                        saved_data['order_value_number']

                    self.initialTesting = self.testing_order_start_end_districts_and_time
                    logging.info("RunRegression: Loaded " +
                                 str(len(self.testing_number_of_orders)) +
                                 " test data rows")
Example #28
 def buildModel(self):
     tfidfModel = TfidfVectorizer().fit_transform(self.corpus)
     lsa = TruncatedSVD(n_components=200)
     self.Model = lsa.fit_transform(tfidfModel)
     self.Model = Normalizer(copy=False).fit_transform(self.Model)
Example #29
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=False)
from sklearn.model_selection import train_test_split, cross_val_score

df = pd.read_csv('/path/file.csv',
                 header=0,
                 sep=',',
                 names=['SentenceId', 'Sentence', 'Sentiment'])

reduced_data = tfidf_vect.fit_transform(df['Sentence'].values)
y = df['Sentiment'].values

from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=5)
reduced_data = svd.fit_transform(reduced_data)

X_train, X_test, y_train, y_test = train_test_split(reduced_data,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

from sklearn.ensemble import RandomForestClassifier

#it hung with 1000000
#try with more parameters
classifier = RandomForestClassifier(n_estimators=100)
classifier.fit(X_train, y_train)
prediction = classifier.predict(X_test)
Example #30
class RunRegression(object):

    REGRESSION_TRAINING_INPUT_FILE_NAME = "RegressionTrainingInput.npz"
    REGRESSION_TESTING_INPUT_FILE_NAME = "RegressionTestingInput.npz"
    MAXIMUM_NUMBER_OF_JOBS = -1
    NUMBER_OF_CROSS_VALIDATION_FOLDS = 5
    ROWS_TO_USE_FOR_GAUSSIAN_KERNEL_REGRESSION = 15
    DISTRICT_SIZE = 132
    TIME_SIZE = 152
    POI_SIZE = 352
    WEATHER_SIZE = 9
    TRAFFIC_SIZE = 8

    def __init__(self):

        self.components = 2
        self.svd = TruncatedSVD(n_components=self.components)
        self.reductCount = 0
        for file_name, data_set in [
            (RunRegression.REGRESSION_TRAINING_INPUT_FILE_NAME,
             FileIo.TRAINING_DATA_SET),
            (RunRegression.REGRESSION_TESTING_INPUT_FILE_NAME,
             FileIo.TEST_DATA_SET)
        ]:

            # Check and see if the data has already been saved
            try:

                logging.info("RunRegression: Trying to load " + data_set +
                             " data")

                saved_data = numpy.load(file_name, mmap_mode='r')

            # If the data is not found, load it
            except IOError:

                logging.info(
                    "RunRegression: Saved data not found. Generating " +
                    data_set + " data")

                # Generate inputs
                poi_district_lookup = PoiDistrictLookup.PoiDistrictLookup()
                order_categorical_lookup = OrderCategoricalLookup.OrderCategoricalLookup(
                    poi_district_lookup)
                regression_input = RegressionInput.RegressionInput(
                    data_set, order_categorical_lookup, poi_district_lookup)

                if data_set == FileIo.TRAINING_DATA_SET:

                    self.training_order_start_end_districts_and_time, self.training_order_median_price, \
                        self.training_number_of_orders = regression_input.get_regression_inputs()

                    # Save the data for next time
                    numpy.savez(
                        file_name,
                        order_keys=self.
                        training_order_start_end_districts_and_time,
                        order_value_price=self.training_order_median_price,
                        order_value_number=self.training_number_of_orders)

                else:

                    self.testing_order_start_end_districts_and_time, self.testing_order_median_price, \
                        self.testing_number_of_orders  = regression_input.get_regression_inputs()

                    # Save the data for next time
                    numpy.savez(
                        file_name,
                        order_keys=self.
                        testing_order_start_end_districts_and_time,
                        order_value_price=self.testing_order_median_price,
                        order_value_number=self.testing_number_of_orders)

            # If the saved data is found, load it
            else:

                logging.info("RunRegression: Loading " + data_set + " data")

                if data_set == FileIo.TRAINING_DATA_SET:

                    self.training_order_start_end_districts_and_time, self.training_order_median_price, \
                        self.training_number_of_orders = saved_data['order_keys'], \
                                                         saved_data['order_value_price'], \
                                                         saved_data['order_value_number']

                    self.dimensions = self.training_order_start_end_districts_and_time.shape[
                        1]
                    self.initial = self.training_order_start_end_districts_and_time
                    logging.info("RunRegression: Loaded " +
                                 str(len(self.training_number_of_orders)) +
                                 " train data rows")
                else:

                    self.testing_order_start_end_districts_and_time, self.testing_order_median_price, \
                        self.testing_number_of_orders = saved_data['order_keys'], \
                                                        saved_data['order_value_price'], \
                                                        saved_data['order_value_number']

                    self.initialTesting = self.testing_order_start_end_districts_and_time
                    logging.info("RunRegression: Loaded " +
                                 str(len(self.testing_number_of_orders)) +
                                 " test data rows")

    """
    Run sgd regression
    """

    def run_sgd_regression(self):

        losses = ["squared_loss"]
        penalties = ["none", "l2", "l1", "elasticnet"]
        initial_learning_rates = [0.1, 0.01, 0.001]
        learning_rates = ["constant", "optimal", "invscaling"]

        lowest_ride_prediction_error = float('inf')

        best_loss = ""
        best_penalty = ""
        best_initial_learning_rate = 0.0
        best_learning_rate = ""

        # Find the best hyper-parameters
        for loss in losses:
            for penalty in penalties:
                for initial_learning_rate in initial_learning_rates:
                    for learning_rate in learning_rates:

                        mean_ride_prediction_error = 0.0

                        # Do k-fold cross-validation using mini-batch training.
                        for testing_fold_number in range(
                                RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS
                        ):

                            # Create the sgd regressor using the input parameters
                            sgd_regressor = linear_model.SGDRegressor(
                                loss=loss,
                                penalty=penalty,
                                eta0=initial_learning_rate,
                                learning_rate=learning_rate)

                            # Run mini batch training for the fold if its not the training fold
                            for fold_number in range(
                                    RunRegression.
                                    NUMBER_OF_CROSS_VALIDATION_FOLDS):

                                if fold_number == testing_fold_number:
                                    continue

                                training_start_row = fold_number * \
                                                     len(self.training_order_start_end_districts_and_time) // \
                                                     RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS

                                training_end_row = (fold_number + 1) * \
                                                   len(self.training_order_start_end_districts_and_time) // \
                                                    RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS

                                logging.info(
                                    "RunRegression: " +
                                    str(RunRegression.
                                        NUMBER_OF_CROSS_VALIDATION_FOLDS) +
                                    " fold cross validation training SGD Regressor for fold "
                                    + str(fold_number) + ", starting row " +
                                    str(training_start_row) + ", ending row " +
                                    str(training_end_row) + ", loss " + loss +
                                    ", penalty " + penalty +
                                    ", initial learning rate " +
                                    str(initial_learning_rate) +
                                    " and learning rate " + learning_rate)

                                # Train regression model
                                sgd_regressor\
                                   .partial_fit(X=self.training_order_start_end_districts_and_time[training_start_row :
                                                                                                   training_end_row],
                                                y=self.training_number_of_orders[training_start_row:training_end_row])

                            testing_start_row = testing_fold_number * \
                                                len(self.testing_order_start_end_districts_and_time) // \
                                                 RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS

                            testing_end_row = (testing_fold_number + 1 )* \
                                                len(self.testing_order_start_end_districts_and_time) // \
                                                 RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS

                            predicted_number_of_orders = sgd_regressor\
                                .predict(self.testing_order_start_end_districts_and_time[testing_start_row :
                                                                                         testing_end_row])

                            current_ride_prediction_error = numpy.mean(
                                (predicted_number_of_orders -
                                 self.testing_number_of_orders[
                                     testing_start_row:testing_end_row])**2)

                            logging.info(
                                "RunRegression: Prediction error for fold " +
                                str(testing_fold_number) + " is " +
                                str(current_ride_prediction_error))

                            mean_ride_prediction_error += current_ride_prediction_error

                            if RunRegression.__is_mean_prediction_error_too_high(
                                    mean_ride_prediction_error,
                                    lowest_ride_prediction_error):
                                logging.info(
                                    "RunRegression: Mean prediction error of "
                                    + str(mean_ride_prediction_error) +
                                    "is too high compared to best so far " +
                                    str(lowest_ride_prediction_error) +
                                    ". Ending current cross validation.")
                                break

                        else:

                            mean_ride_prediction_error /= RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS

                            logging.info(
                                "RunRegression: Mean prediction error is " +
                                str(mean_ride_prediction_error))

                            # Save values if better than previous best
                            if mean_ride_prediction_error < lowest_ride_prediction_error:

                                logging.info(
                                    "RunRegression: mean error of " +
                                    str(mean_ride_prediction_error) +
                                    " is the best so far. Saving loss " +
                                    loss + ", penalty " + penalty +
                                    ", initial learning rate " +
                                    str(initial_learning_rate) +
                                    " and learning rate " + learning_rate)

                                lowest_ride_prediction_error = mean_ride_prediction_error
                                best_loss = loss
                                best_penalty = penalty
                                best_initial_learning_rate = initial_learning_rate
                                best_learning_rate = learning_rate

        logging.info(
            "RunRegression: Running regression with best values so far: loss "
            + best_loss + ", penalty " + best_penalty +
            ", initial learning rate " + str(best_initial_learning_rate) +
            " and learning rate " + best_learning_rate)

        sgd_regressor = linear_model.SGDRegressor(
            loss=best_loss,
            penalty=best_penalty,
            eta0=best_initial_learning_rate,
            learning_rate=best_learning_rate)

        sgd_regressor.fit(X=self.training_order_start_end_districts_and_time,
                          y=self.training_number_of_orders)
        best_predicted_number_of_orders = sgd_regressor.predict(
            self.testing_order_start_end_districts_and_time)

        coef = sgd_regressor.coef_
        print(coef)

        logging.info(
            "RunRegression: Mean squared prediction error after cross validation is "
            + str(
                numpy.mean((best_predicted_number_of_orders -
                            self.testing_number_of_orders)**2)))

    """
    Check if the mean prediction error is too high to qualify as the best so far
    """

    @staticmethod
    def __is_mean_prediction_error_too_high(cumulative_mean_prediction_error,
                                            best_prediction_error_so_far):

        return cumulative_mean_prediction_error / RunRegression.NUMBER_OF_CROSS_VALIDATION_FOLDS > \
               best_prediction_error_so_far

    """
    Run regression based on multidimensional scaling
    """

    def run_mds_regression(self):

        # Create a square matrix with number of test data rows preserved
        training_data_square_matrix = numpy.dot(
            self.training_order_start_end_districts_and_time.T,
            self.training_order_start_end_districts_and_time)

        logging.info("RunRegression: Square matrix shape " +
                     str(training_data_square_matrix.shape))

        # Get Eigen values and eigen vectors
        training_data_eigen_values, training_data_eigen_vectors = linalg.eig(
            training_data_square_matrix)
        #print(training_data_eigen_values)
        #print(training_data_eigen_vectors)
        print(self.training_order_start_end_districts_and_time)
        sorted_index = training_data_eigen_values.argsort()[::-1]
        sorted_training_data_eigen_values = training_data_eigen_values[
            sorted_index]
        sorted_training_data_eigen_vectors = training_data_eigen_vectors[:,
                                                                         sorted_index]

        logging.info("RunRegression: Found " +
                     str(len(sorted_training_data_eigen_values)) +
                     " eigen values.")
        logging.info("RunRegression: Eigen vectors have length " +
                     str(len(sorted_training_data_eigen_vectors[0])))

        if logging.getLogger().getEffectiveLevel() == logging.DEBUG:
            RunRegression.__show_eigen_values_trend(
                eigen_values=sorted_training_data_eigen_values)

    """
    Show Eigen values trend
    """

    @staticmethod
    def __show_eigen_values_trend(eigen_values):

        # Plot eigen values
        plt.plot(eigen_values)
        plt.ylabel('Eigen Values')
        plt.title('Sorted Eigen Values')
        plt.show()

    def leastAngleRegression(self):
        lar = linear_model.Lars()
        lar.fit(self.training_order_start_end_districts_and_time,
                self.training_number_of_orders)
        predicted_number_of_orders = lar.predict(
            self.testing_order_start_end_districts_and_time)
        current_ride_prediction_error = numpy.mean(
            (predicted_number_of_orders - self.testing_number_of_orders)**2)
        print(current_ride_prediction_error)
        print(lar.coef_)

    def orthogonalMatchingPursuit(self):
        omp = linear_model.OrthogonalMatchingPursuit(n_nonzero_coefs=10)
        omp.fit(self.training_order_start_end_districts_and_time,
                self.training_number_of_orders)
        predicted_number_of_orders = omp.predict(
            self.testing_order_start_end_districts_and_time)
        current_ride_prediction_error = numpy.mean(
            (predicted_number_of_orders - self.testing_number_of_orders)**2)
        print(current_ride_prediction_error)
        print(omp.coef_)

    def theilSenRegressor(self):
        tsr = linear_model.TheilSenRegressor()
        tsr.fit(self.training_order_start_end_districts_and_time,
                self.training_number_of_orders)
        predicted_number_of_orders = tsr.predict(
            self.testing_order_start_end_districts_and_time)
        current_ride_prediction_error = numpy.mean(
            (predicted_number_of_orders - self.testing_number_of_orders)**2)
        print(current_ride_prediction_error)
        print(tsr.coef_)

    def polynomial(self):
        poly = PolynomialFeatures(degree=3)
        self.training_order_start_end_districts_and_time = poly.fit_transform(
            self.training_order_start_end_districts_and_time,
            self.training_number_of_orders)
        predict = poly.transform(
            self.testing_order_start_end_districts_and_time)

        clf = linear_model.LinearRegression()
        clf.fit(self.training_order_start_end_districts_and_time,
                self.training_number_of_orders)
        predicted_number_of_orders = clf.predict(predict)
        current_ride_prediction_error = numpy.mean(
            (predicted_number_of_orders - self.testing_number_of_orders)**2)
        print(current_ride_prediction_error)
        print(clf.coef_)

    def svm(self):
        oneClass = svm.OneClassSVM()
        logging.info("svm fit")
        # OneClassSVM is unsupervised; the label argument is ignored by fit
        oneClass.fit(self.training_order_start_end_districts_and_time,
                     self.training_number_of_orders)
        logging.info("svm predict")
        # predict returns +1/-1 (inlier/outlier), so this error is not
        # directly comparable with the regressors above
        predicted_number_of_orders = oneClass.predict(
            self.testing_order_start_end_districts_and_time)
        current_ride_prediction_error = numpy.mean(
            (predicted_number_of_orders - self.testing_number_of_orders)**2)
        print(current_ride_prediction_error)
        # coef_ only exists for a linear kernel; the default RBF kernel
        # would raise AttributeError here
        if oneClass.kernel == 'linear':
            print(oneClass.coef_)

    def districtReduction(self, keyType, key):
        y = key
        districts = numpy.apply_along_axis(sliceTransform, 1, y, 0,
                                           self.DISTRICT_SIZE)
        if keyType == "training":
            districtRed = self.svd.fit_transform(
                districts, self.training_number_of_orders)
        else:
            districtRed = self.svd.transform(districts)
        nonDistrict = numpy.apply_along_axis(sliceTransform, 1, y,
                                             self.DISTRICT_SIZE,
                                             self.dimensions)
        keyWithDist = numpy.append(districtRed, nonDistrict, axis=1)
        return keyWithDist

    def timeReduction(self, keyType, key):
        y = key
        time = numpy.apply_along_axis(sliceTransform, 1, y, self.components,
                                      self.TIME_SIZE + self.components)
        if keyType == "training":
            timeRed = self.svd.fit_transform(time,
                                             self.training_number_of_orders)
        else:
            timeRed = self.svd.transform(time)
        befTime = numpy.apply_along_axis(sliceTransform, 1, y, 0,
                                         self.components)
        aftTime = numpy.apply_along_axis(sliceTransform, 1, y,
                                         self.TIME_SIZE + self.components,
                                         self.dimensions)
        keyWithTime = numpy.append(befTime, timeRed, axis=1)
        keyWithTime = numpy.append(keyWithTime, aftTime, axis=1)
        return keyWithTime

    def POIReduction(self, keyType, key):
        y = key
        poi = numpy.apply_along_axis(sliceTransform, 1, y, self.components * 2,
                                     self.POI_SIZE + self.components * 2)
        if keyType == "training":
            poiRed = self.svd.fit_transform(poi,
                                            self.training_number_of_orders)
        else:
            poiRed = self.svd.transform(poi)
        befPoi = numpy.apply_along_axis(sliceTransform, 1, y, 0,
                                        self.components * 2)
        aftPoi = numpy.apply_along_axis(sliceTransform, 1, y,
                                        self.POI_SIZE + self.components * 2,
                                        self.dimensions)
        keyWithPoi = numpy.append(befPoi, poiRed, axis=1)
        keyWithPoi = numpy.append(keyWithPoi, aftPoi, axis=1)
        return keyWithPoi

    def WeatherReduction(self, keyType, key):
        y = key
        weather = numpy.apply_along_axis(
            sliceTransform, 1, y, self.components * 3,
            self.WEATHER_SIZE + self.components * 3)
        if keyType == "training":
            weatherRed = self.svd.fit_transform(weather,
                                                self.training_number_of_orders)
        else:
            weatherRed = self.svd.transform(weather)
        befWeather = numpy.apply_along_axis(sliceTransform, 1, y, 0,
                                            self.components * 3)
        aftWeather = numpy.apply_along_axis(
            sliceTransform, 1, y, self.WEATHER_SIZE + self.components * 3,
            self.dimensions)
        keyWithWeather = numpy.append(befWeather, weatherRed, axis=1)
        keyWithWeather = numpy.append(keyWithWeather, aftWeather, axis=1)
        return keyWithWeather

    def TrafficReduction(self, keyType, key):
        y = key
        traffic = numpy.apply_along_axis(
            sliceTransform, 1, y, self.components * 4,
            self.TRAFFIC_SIZE + self.components * 4)
        if keyType == "training":
            trafficRed = self.svd.fit_transform(traffic,
                                                self.training_number_of_orders)
            if self.reductCount == 0:
                self.boxPlot(trafficRed)
                self.reductCount = 1
        else:
            trafficRed = self.svd.transform(traffic)
        befTraffic = numpy.apply_along_axis(sliceTransform, 1, y, 0,
                                            self.components * 4)
        keyWithTraffic = numpy.append(befTraffic, trafficRed, axis=1)
        return keyWithTraffic
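
    # Hypothetical consolidation (not part of the original code): the five
    # *Reduction methods above all slice out one block of columns, reduce it
    # with the shared TruncatedSVD instance, and splice the reduced block back
    # between the untouched columns. A generic sketch of that pattern:
    def __reduceBlock(self, keyType, key, start, end):
        block = numpy.apply_along_axis(sliceTransform, 1, key, start, end)
        if keyType == "training":
            reduced = self.svd.fit_transform(block,
                                             self.training_number_of_orders)
        else:
            reduced = self.svd.transform(block)
        before = numpy.apply_along_axis(sliceTransform, 1, key, 0, start)
        after = numpy.apply_along_axis(sliceTransform, 1, key, end,
                                       self.dimensions)
        keyReduced = numpy.append(before, reduced, axis=1)
        keyReduced = numpy.append(keyReduced, after, axis=1)
        return keyReduced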

    def wholeReductionTraining(self):
        y = self.training_order_start_end_districts_and_time
        b = self.svd.fit_transform(y, self.training_number_of_orders)
        if self.reductCount < 2:
            self.boxPlot(b)
        self.reductCount += 1
        self.training_order_start_end_districts_and_time = b

    def wholeReductionTesting(self):
        y = self.testing_order_start_end_districts_and_time
        b = self.svd.transform(y)
        self.testing_order_start_end_districts_and_time = b

    def reduction(self):
        self.training_order_start_end_districts_and_time = self.initial
        self.dimensions = self.training_order_start_end_districts_and_time.shape[
            1]
        self.testing_order_start_end_districts_and_time = self.initialTesting

        logging.info("RunRegression: Reducing Districts")
        self.training_order_start_end_districts_and_time = run_regression.districtReduction(
            'training', self.training_order_start_end_districts_and_time)
        self.testing_order_start_end_districts_and_time = run_regression.districtReduction(
            'testing', self.testing_order_start_end_districts_and_time)
        x = self.testing_order_start_end_districts_and_time[:, 0:1]
        y = self.testing_order_start_end_districts_and_time[:, 1:2]
        self.dimensions = self.training_order_start_end_districts_and_time.shape[
            1]
        print(self.dimensions)

        logging.info("RunRegression: Reducing Time")
        self.training_order_start_end_districts_and_time = run_regression.timeReduction(
            'training', self.training_order_start_end_districts_and_time)
        self.testing_order_start_end_districts_and_time = run_regression.timeReduction(
            'testing', self.testing_order_start_end_districts_and_time)
        x = self.training_order_start_end_districts_and_time[:, 2:3]
        y = self.training_order_start_end_districts_and_time[:, 3:4]
        self.dimensions = self.training_order_start_end_districts_and_time.shape[
            1]
        #plt.scatter(x,y)
        #plt.show()
        logging.info("RunRegression: Reducing POI")
        self.training_order_start_end_districts_and_time = run_regression.POIReduction(
            'training', self.training_order_start_end_districts_and_time)
        self.testing_order_start_end_districts_and_time = run_regression.POIReduction(
            'testing', self.testing_order_start_end_districts_and_time)
        x = self.training_order_start_end_districts_and_time[:, 4:5]
        y = self.training_order_start_end_districts_and_time[:, 5:6]
        self.dimensions = self.training_order_start_end_districts_and_time.shape[
            1]
        #plt.scatter(x,y)
        #plt.show()
        logging.info("RunRegression: Reducing Weather")
        self.training_order_start_end_districts_and_time = run_regression.WeatherReduction(
            'training', self.training_order_start_end_districts_and_time)
        self.testing_order_start_end_districts_and_time = run_regression.WeatherReduction(
            'testing', self.testing_order_start_end_districts_and_time)
        x = self.training_order_start_end_districts_and_time[:, 6:7]
        y = self.training_order_start_end_districts_and_time[:, 7:8]
        self.dimensions = self.training_order_start_end_districts_and_time.shape[
            1]
        #plt.scatter(x,y)
        #plt.show()
        logging.info("RunRegression: Reducing Traffic")
        self.training_order_start_end_districts_and_time = run_regression.TrafficReduction(
            'training', self.training_order_start_end_districts_and_time)
        self.testing_order_start_end_districts_and_time = run_regression.TrafficReduction(
            'testing', self.testing_order_start_end_districts_and_time)
        x = self.training_order_start_end_districts_and_time[:, 8:9]
        y = self.training_order_start_end_districts_and_time[:, 9:10]
        self.dimensions = self.training_order_start_end_districts_and_time.shape[
            1]
        print(self.initial.shape)

    def boxPlot(self, arrayBox):
        a = plt.boxplot(arrayBox)
        plt.show()
        idx = set()
        idxSet = set(
            numpy.arange(len(
                self.training_order_start_end_districts_and_time)))
        for d in a['fliers']:
            print(len(d.get_ydata()))
            for point in d.get_ydata():
                pIdx = numpy.where(arrayBox == point)
                for rIdx in pIdx[0]:
                    idx.add(rIdx)
        logging.info("done with loop")
        idxKeep = list(idxSet.difference(idx))
        self.initial = self.initial[[idxKeep], :]
        self.training_number_of_orders = self.training_number_of_orders[[
            idxKeep
        ]]
        self.initial = self.initial.reshape(self.initial.shape[1:])
Exemplo n.º 31
0
            ("ravel", Ravel()),
            ('tfid_vect', TfidfVectorizer(max_df=0.743, min_df=0.036, ngram_range=(1, 4),
                                          strip_accents='ascii', analyzer="word",
                                          stop_words='english', norm="l1", use_idf=True))
        ])

# des_rescu_pipe = Pipeline([
#             ('sel_num', DataFrameSelector(["Description", "RescuerID"], ravel = True)),
#             add rescuer to description
#             ('rm_nan', FnanToStr()),
#             ('tfid_vect', TfidfVectorizer(max_df= 0.743, min_df=0.036, ngram_range=(1,4),\
# strip_accents='ascii', analyzer= "word", stop_words='english', use_idf = True, norm = None))
#         ])

des_pipe_svd = Pipeline([
    ('des_pipe', des_pipe),
    ('SVD', TruncatedSVD(n_components=20))
    # ValueError: n_components must be < n_features; got 140 >= 124
])

des_pipe_for_svd = replace_step(
    des_pipe,
    "tfid_vect",
    ("tfid_vect", TfidfVectorizer(max_df= 0.95, min_df=0.005, ngram_range=(1,4),\
                                  strip_accents='ascii', analyzer= "word", stop_words='english',
                                  norm = "l1", use_idf = True))
)
des_pipe_svd_v2 = Pipeline([('des_pipe_for_svd', des_pipe_for_svd),
                            ('SVD', TruncatedSVD(n_components=20))])

des_pipe_svd_v3 = replace_step(des_pipe_svd_v2, "SVD",
                               ('SVD', TruncatedSVD(n_components=100)))
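
# "replace_step" above is a project helper that is not shown in this snippet.
# A minimal hypothetical sketch of such a helper (not the author's
# implementation): build a new Pipeline with one named step swapped out.
from sklearn.pipeline import Pipeline


def _replace_step_sketch(pipeline, step_name, new_step):
    steps = [new_step if name == step_name else (name, est)
             for name, est in pipeline.steps]
    return Pipeline(steps)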
X_train = vectorizer.fit_transform(X_train)
duration = time() - t0
print("n_samples: %d, n_features: %d" % X_train.shape)
print()

print("Extracting features from the test data using the same vectorizer")
t0 = time()

#X_test,y_test = get_test_data()

X_test = vectorizer.transform(X_test)
duration = time() - t0
print(X_train.shape)

#x,z, X_train = fastica(X_train.toarray())
svd = TruncatedSVD(n_components=1000)
print(X_train)

#print("n_samples: %d, n_features: %d" % X_test.shape)
print()

# mapping from integer feature name to original token string
if opts.use_hashing:
    feature_names = None
else:
    feature_names = vectorizer.get_feature_names()

if opts.select_chi2:
    print("Extracting %d best features by a chi-squared test" %
          opts.select_chi2)
    t0 = time()
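    # (The snippet is cut off here; a plausible continuation, following the
    # scikit-learn text-classification example this appears to be based on,
    # and assuming y_train holds the training labels:)
    from sklearn.feature_selection import SelectKBest, chi2
    ch2 = SelectKBest(chi2, k=opts.select_chi2)
    X_train = ch2.fit_transform(X_train, y_train)
    X_test = ch2.transform(X_test)
    print("done in %fs" % (time() - t0))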
Exemplo n.º 33
0
    def train(self, model_name, corpus, log, opts, chain_features=None):
        from whim.entity_narrative import DistributionalVectorsNarrativeChainModel
        log.info("Training context vectors model")

        training_metadata = {
            "data": corpus.directory,
            "pmi": opts.pmi or opts.ppmi,
            "ppmi": opts.ppmi,
        }

        log.info("Extracting event counts")
        pbar = get_progress_bar(len(corpus), title="Event feature extraction")
        # Loop over all the chains again to collect events
        event_counts = Counter()
        for doc_num, document in enumerate(corpus):
            chains = document.get_chains()
            if len(chains):
                event_chains = list(
                    DistributionalVectorsNarrativeChainModel.
                    extract_chain_feature_lists(chains,
                                                only_verb=opts.only_verb,
                                                adjectives=opts.adj))
                # Count all the events
                for chain in event_chains:
                    event_counts.update(chain)

            pbar.update(doc_num)
        pbar.finish()

        if opts.event_threshold is not None and opts.event_threshold > 0:
            log.info("Applying event threshold")
            # Apply a threshold event count
            to_remove = [
                event for (event, count) in event_counts.items()
                if count < opts.event_threshold
            ]
            pbar = get_progress_bar(len(to_remove), title="Filtering counts")
            for i, event in enumerate(to_remove):
                del event_counts[event]
                pbar.update(i)
            pbar.finish()

        log.info("Extracting pair counts")
        pbar = get_progress_bar(len(corpus), title="Pair feature extraction")
        # Loop over all the chains again to collect pairs of events
        pair_counts = Counter()
        for doc_num, document in enumerate(corpus):
            chains = document.get_chains()
            if len(chains):
                event_chains = list(
                    DistributionalVectorsNarrativeChainModel.
                    extract_chain_feature_lists(chains,
                                                only_verb=opts.only_verb,
                                                adjectives=opts.adj))
                # Count all the events
                for chain in event_chains:
                    # Count all pairs
                    pairs = []
                    for i in range(len(chain) - 1):
                        for j in range(i + 1, len(chain)):
                            if chain[i] in event_counts and chain[
                                    j] in event_counts:
                                pairs.append(
                                    tuple(sorted([chain[i], chain[j]])))
                    pair_counts.update(pairs)

            pbar.update(doc_num)
        pbar.finish()

        if opts.pair_threshold is not None and opts.pair_threshold > 0:
            log.info("Applying pair threshold")
            # Apply a threshold pair count
            to_remove = [
                pair for (pair, count) in pair_counts.items()
                if count < opts.pair_threshold
            ]
            if to_remove:
                pbar = get_progress_bar(len(to_remove),
                                        title="Filtering pair counts")
                for i, pair in enumerate(to_remove):
                    del pair_counts[pair]
                    pbar.update(i)
                pbar.finish()
            else:
                log.info("No counts removed")

        # Create a dictionary of the remaining vocabulary
        log.info("Building dictionary")
        dictionary = Dictionary([[event] for event in event_counts.keys()])
        # Put all the co-occurrence counts into a big matrix
        log.info("Building counts matrix: vocab size %d" % len(dictionary))
        vectors = numpy.zeros((len(dictionary), len(dictionary)),
                              dtype=numpy.float64)
        # Fill the matrix with raw counts
        for (event0, event1), count in pair_counts.items():
            if event0 in dictionary.token2id and event1 in dictionary.token2id:
                e0, e1 = dictionary.token2id[event0], dictionary.token2id[
                    event1]
                vectors[e0, e1] = count
                # Add the count both ways (it's only stored once above)
                vectors[e1, e0] = count

        # Now there are many things we could do to these counts
        if opts.pmi or opts.ppmi:
            log.info("Applying %sPMI" % "P" if opts.ppmi else "")
            # Apply PMI to the matrix
            # Compute the total counts for each event (note row and col totals are the same)
            log_totals = numpy.ma.log(vectors.sum(axis=0))
            vectors = numpy.ma.log(vectors * vectors.sum()) - log_totals
            vectors = (vectors.T - log_totals).T
            vectors = vectors.filled(0.)

            if opts.ppmi:
                # Threshold the PMIs at zero
                vectors[vectors < 0.] = 0.

        # Convert to sparse for SVD and storage
        vectors = csr_matrix(vectors)

        if opts.svd:
            log.info("Fitting SVD with %d dimensions" % opts.svd)
            training_metadata["svd from"] = vectors.shape[1]
            training_metadata["svd"] = opts.svd
            vector_svd = TruncatedSVD(opts.svd)
            vectors = vector_svd.fit_transform(vectors)

        log.info("Saving model: %s" % model_name)
        model = DistributionalVectorsNarrativeChainModel(
            dictionary,
            vectors,
            only_verb=opts.only_verb,
            training_metadata=training_metadata,
            adjectives=opts.adj)
        model.save(model_name)
        return model
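
# Standalone sketch (not part of the model above) illustrating the same
# PPMI + TruncatedSVD pipeline on a tiny co-occurrence matrix; the counts
# below are made up purely for illustration.
import numpy
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD

counts = numpy.array([[0., 3., 1.],
                      [3., 0., 2.],
                      [1., 2., 0.]])
log_totals = numpy.ma.log(counts.sum(axis=0))
pmi = numpy.ma.log(counts * counts.sum()) - log_totals   # subtract column totals
pmi = (pmi.T - log_totals).T                             # subtract row totals
pmi = pmi.filled(0.)
pmi[pmi < 0.] = 0.                                       # threshold at zero -> PPMI
reduced = TruncatedSVD(n_components=2).fit_transform(csr_matrix(pmi))
print(reduced.shape)                                     # (3, 2)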
X_train_counts = count_vect.fit_transform(train_data.data)
print(X_train_counts.shape)
print(count_vect.vocabulary_.get(u'algorithm'))
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)


df = pd.DataFrame({'text': test_doc, 'class': test_data.target})

X = tfidf_vect.fit_transform(df['text'].values)
y = df['class'].values

from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=2)
X_reduced_train = svd.fit_transform(X)
a_train, a_test, b_train, b_test = train_test_split(X, y, test_size=0.33, random_state=42)

from sklearn.ensemble import RandomForestClassifier 
classifier=RandomForestClassifier(n_estimators=10)                  
classifier.fit(a_train.toarray(), b_train)                            



clf = svm.SVC(kernel=my_kernel)
# Support Vector Machine model
#text_clf = Pipeline([('vect', CountVectorizer()),
#                     ('tfidf', TfidfTransformer()),
#                     ('clf', clf)])
Exemplo n.º 35
0
    words.append(word)
    vecs.append(model[word])

print len(vecs)
CUTAWAY = 6000

vecs = np.matrix(np.random.permutation(vecs))[:CUTAWAY, :]
words = words[:CUTAWAY]
print vecs.shape

print 'Dimensionality reduction'
# dr = TruncatedSVD(n_iter=15)
# X = dr.fit_transform(vecs)
print 'Dimensionality reduction done, manifold learning'
rt_embedding = RandomTreesEmbedding(n_estimators=15, random_state=0, max_depth=5, verbose=2, n_jobs=3)
X = rt_embedding.fit_transform(vecs)
print X.shape
print 'Dim reduction'
dr = TruncatedSVD(n_components=2)
X = dr.fit_transform(X)
print 'Manifold learning done'

# X = vecs
PLOT_CUTAWAY = 250

plt.figure()
plt.scatter(X[:PLOT_CUTAWAY, 0], X[:PLOT_CUTAWAY, 1], c='green')
for i in xrange(min(X.shape[0], PLOT_CUTAWAY)):
    plt.annotate(words[i], xy=(X[i, 0], X[i, 1]))
plt.show()
Exemplo n.º 36
0
tfidf_vect = TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=False)
from sklearn.model_selection import train_test_split, cross_val_score


df = pd.read_csv('/path/file.csv',
                     header=0, sep=',', names=['SentenceId', 'Sentence', 'Sentiment'])



reduced_data = tfidf_vect.fit_transform(df['Sentence'].values)
y = df['Sentiment'].values



from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=5)
reduced_data = svd.fit_transform(reduced_data)

X_train, X_test, y_train, y_test = train_test_split(reduced_data,
                                                    y, test_size=0.33,
                                                    random_state=42)

from sklearn.ensemble import RandomForestClassifier

# it stalled with 1000000
# try with more parameters
classifier=RandomForestClassifier(n_estimators=100)
classifier.fit(X_train, y_train)
prediction = classifier.predict(X_test)
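
# A natural follow-up (not part of the original snippet): score the held-out
# split with scikit-learn's accuracy metric.
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, prediction))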