Example #1
    def __init__(self, *,
                 hyperparams: Hyperparams,
                 random_seed: int = 0,
                 docker_containers: Dict[str, DockerContainer] = None) -> None:

        super().__init__(hyperparams=hyperparams, random_seed=random_seed, docker_containers=docker_containers)
        
        # False
        self._clf = TruncatedSVD(
              n_components=self.hyperparams['n_components'],
              algorithm=self.hyperparams['algorithm']['choice'],
              n_iter=self.hyperparams['algorithm'].get('n_iter', 5),
              tol=self.hyperparams['algorithm'].get('tol', 0),
              random_state=self.random_seed,
        )

        self.primitiveNo = PrimitiveCount.primitive_no
        PrimitiveCount.primitive_no += 1

        
        
        self._inputs = None
        self._outputs = None
        self._training_inputs = None
        self._training_outputs = None
        self._target_names = None
        self._training_indices = None
        self._target_column_indices = None
        self._target_columns_metadata: List[OrderedDict] = None
        self._input_column_names = None
        self._fitted = False
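
For reference, a minimal sketch of the hyperparameter shape this constructor assumes (the plain-dict form and the values are hypothetical; in the real primitive the Hyperparams class supplies them):

from sklearn.decomposition import TruncatedSVD

# Hypothetical hyperparameter structure mirroring the lookups in __init__ above.
hyperparams = {
    'n_components': 10,
    'algorithm': {'choice': 'randomized', 'n_iter': 5, 'tol': 0.0},
}

# Equivalent plain scikit-learn construction of the wrapped estimator.
clf = TruncatedSVD(
    n_components=hyperparams['n_components'],
    algorithm=hyperparams['algorithm']['choice'],
    n_iter=hyperparams['algorithm'].get('n_iter', 5),
    tol=hyperparams['algorithm'].get('tol', 0),
    random_state=0,
)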
Example #2
def _permute_and_calc_singular_values(X,
                                      Y,
                                      X_saliences,
                                      Y_saliences,
                                      singular_values_samples,
                                      perm_i,
                                      n_components,
                                      procrustes=False,
                                      algorithm="randomized"):
    if len(X) < len(Y):
        X_perm = np.random.permutation(X)
        covariance_perm = np.dot(Y.T, X_perm)
    else:
        Y_perm = np.random.permutation(Y)
        covariance_perm = np.dot(Y_perm.T, X)
    svd = TruncatedSVD(n_components, algorithm=algorithm)
    Y_saliences_perm, singular_values_perm, X_saliences_perm = svd._fit(
        covariance_perm)

    if procrustes:
        #It does not matter which side we use to calculate the rotated singular values
        #let's pick the smaller one for optimization
        if len(X_saliences_perm) > len(Y_saliences_perm):
            _, _, singular_values_samples[:, perm_i] = _procrustes_rotation(
                Y_saliences, Y_saliences_perm, singular_values_perm)
        else:
            X_saliences_perm = X_saliences_perm.T
            _, _, singular_values_samples[:, perm_i] = _procrustes_rotation(
                X_saliences, X_saliences_perm, singular_values_perm)
    else:
        singular_values_samples[:, perm_i] = singular_values_perm
Example #3
def _bootstrap_pool(X, Y, X_saliences, Y_saliences, n_components, procrustes, algorithm, boot_i):
    """ basic version for parallel implementation of bootstrapping using pool
    """
    # seed the RNG so that each process does not use the same random numbers
    np.random.seed(int(time()) + boot_i)
    #choose indices to resample randomly with replacement for a sample of same size
    sample_indices = np.random.choice(range(X.shape[0]), size=X.shape[0], replace=True)
    X_boot = X[sample_indices,:]
    Y_boot = Y[sample_indices,:]
    X_boot_scaled = scale(X_boot)
    Y_boot_scaled = scale(Y_boot)

    covariance_boot = np.dot(Y_boot_scaled.T, X_boot_scaled)
    svd = TruncatedSVD(n_components, algorithm=algorithm)
    Y_saliences_boot, _, X_saliences_boot = svd._fit(covariance_boot)
    X_saliences_boot = X_saliences_boot.T
    
    #It does not matter which side we use to calculate the rotated singular values
    #let's pick the smaller one for optimization
    if len(X_saliences_boot) > len(Y_saliences_boot):
        #use procrustes_rotation on smaller dataset
        Y_bootstraps, rotation_matrix = _procrustes_rotation(Y_saliences, Y_saliences_boot)
        X_bootstraps = np.dot(X_saliences_boot, rotation_matrix)
    else:
        X_bootstraps, rotation_matrix = _procrustes_rotation(X_saliences, X_saliences_boot)
        Y_bootstraps = np.dot(Y_saliences_boot, rotation_matrix)  
         
    
    #print np.shape(X_bootstraps)
    #print np.shape(Y_bootstraps)
   
    return X_bootstraps, Y_bootstraps
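
A minimal driver sketch for the pool-based bootstrap above; X, Y and the reference saliences are assumed to come from an earlier fit_pls call (see Example #12), and the names below are illustrative only:

from functools import partial
from multiprocessing import Pool

import numpy as np

n_boot = 100
worker = partial(_bootstrap_pool, X, Y, X_saliences, Y_saliences,
                 n_components, True, "randomized")
with Pool() as pool:
    # Each worker returns (X_bootstraps, Y_bootstraps) for one resample.
    results = pool.map(worker, range(n_boot))
X_boot_saliences = np.stack([r[0] for r in results], axis=-1)
Y_boot_saliences = np.stack([r[1] for r in results], axis=-1)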
Example #4
def _permute_and_calc_singular_values_pool(X, Y, X_saliences, Y_saliences,
                                           n_components, procrustes, algorithm,
                                           perm_i):
    """ basic version for parallel implementation using pool
    """
    # seed the RNG so that each process does not use the same random numbers
    np.random.seed(int(time()) + perm_i)

    if len(X) < len(Y):
        #apply permutation to shorter list
        #print "randomization X<Y"
        X_perm = np.random.permutation(X)
        covariance_perm = np.dot(Y.T, X_perm)
    else:
        #print "other permutation"
        Y_perm = np.random.permutation(Y)
        covariance_perm = np.dot(Y_perm.T, X)

    svd = TruncatedSVD(n_components, algorithm=algorithm)

    Y_saliences_perm, singular_values_perm, X_saliences_perm = svd._fit(
        covariance_perm)

    if procrustes:
        #It does not matter which side we use to calculate the rotated singular values
        #let's pick the smaller one for optimization
        if len(X_saliences_perm) > len(Y_saliences_perm):
            _, _, singular_values_perm = _procrustes_rotation(
                Y_saliences, Y_saliences_perm, singular_values_perm)
        else:
            X_saliences_perm = X_saliences_perm.T
            _, _, singular_values_perm = _procrustes_rotation(
                X_saliences, X_saliences_perm, singular_values_perm)

    return singular_values_perm
Example #5
def create_union_transf(_):
    pca2c_transformer = make_pipeline(
        drop_transform,
        SimpleImputer(),
        StandardScaler(),
        PCA(n_components=2),
    )

    os_transformer = make_pipeline(
        FunctionTransformer(lambda x: x.os, validate=False),
        CountVectorizer(),
        TruncatedSVD(n_components=10),
    )

    arch_transformer = FunctionTransformer(lambda x: pd.get_dummies(x.cpuArch),
                                           validate=False)

    gmm_transformer = make_pipeline(
        drop_transform, SimpleImputer(), StandardScaler(), PCA(n_components=2),
        FunctionTransformer(lambda x: GaussianMixture(n_components=3).
                            fit_predict(x)[np.newaxis].T))

    transf = make_union(
        drop_transform,
        gmm_transformer,
        os_transformer,
        arch_transformer,
        pca2c_transformer,
    )
    return transf
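
A hedged usage sketch: assuming train_df is a pandas DataFrame with the os and cpuArch text columns plus the numeric columns handled by drop_transform (both defined elsewhere in this project), the union can feed any downstream estimator:

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

transf = create_union_transf(None)
model = make_pipeline(transf, RandomForestClassifier(n_estimators=100))
model.fit(train_df, train_labels)        # train_df / train_labels are assumed inputs
predictions = model.predict(test_df)     # test_df is an assumed held-out frame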
Example #6
def _permute_and_calc_singular_values_process(X, Y, a, b, n_components,
                                              algorithm, output, x):  #perm_i
    """ basic version for parallel implementation using processes and output queue
    """

    # seed the RNG so that the same random numbers are not used each time
    #pid = current_process()._identity[0]
    #randst = np.random.mtrand.RandomState(pid)
    np.random.seed(int(time()) + x + 50)

    #test how permutation works
    c = np.random.permutation(a)
    print(a)
    print(c)

    if len(X) < len(Y):
        #apply permutation to shorter list
        #print "randomization X<Y"
        X_perm = np.random.permutation(X)
        covariance_perm = np.dot(Y.T, X_perm)
    else:
        #print "other permutation"
        Y_perm = np.random.permutation(Y)
        covariance_perm = np.dot(Y_perm.T, X)

    svd = TruncatedSVD(n_components, algorithm=algorithm)

    #print covariance_perm
    Y_saliences_perm, singular_values_perm, X_saliences_perm = svd._fit(
        covariance_perm)

    output.put(singular_values_perm)
Example #7
 def outliersSvdReduction(self):
     svd = TruncatedSVD(n_components=1)
     ordersSvd = svd.fit_transform(
         self.training_order_start_end_districts_and_time,
         self.training_number_of_orders)
     priceSvd = svd.fit_transform(
         self.training_order_start_end_districts_and_time,
         self.training_order_median_price)
     self.outliersPriceOrders(ordersSvd, priceSvd)
Example #8
def write_spacy_vocab(output_dirpath, vocab_size, embedding_dim):
    if not os.path.exists(output_dirpath):
        os.makedirs(output_dirpath)

    allowed_chars = set(string.ascii_letters + string.punctuation)
    ascii = set(string.ascii_letters)
    ascii_plus_period = set(string.ascii_letters + '.')
    word_set = set()
    spacy_vocab = spacy.load('en').vocab
    top_words = []

    for w in spacy_vocab:
        if w.rank > 2 * vocab_size:
            continue
        try:
            word_string = str(w.lower_).strip()
            if not word_string:
                continue
            if word_string in word_set:
                continue
            if any(bad_char in word_string
                   for bad_char in ('[', ']', '<', '>', '{', '}')):
                # these are used to mark word types and person ids.
                continue
            if any(c not in allowed_chars for c in word_string):
                continue
            if sum(1 for c in word_string if c not in ascii_plus_period) > 2:
                continue
            if word_string[-1] == '.' and sum(
                    1 for c in word_string if c in ascii) > 2:
                continue

            top_words.append(w)
            word_set.add(word_string)
        except:
            pass

    top_words.sort(key=lambda w: w.rank)
    top_words = top_words[:vocab_size]

    with open(os.path.join(output_dirpath, 'vocab'), 'w') as f:
        for word in top_words:
            f.write('%s\n' % word.lower_.strip())

    vectors = np.array([w.vector for w in top_words])
    svd = TruncatedSVD(n_components=embedding_dim, algorithm='arpack')
    embeddings = svd.fit_transform(vectors)

    print(embeddings.shape)
    print([
        sum(svd.explained_variance_ratio_[:i])
        for i in range(1, embedding_dim + 1)
    ])
    np.save(os.path.join(output_dirpath, 'pretrained_embeddings.npy'),
            embeddings)
Example #9
    def fit_transform(self, Xs):
        """
        Optimize each CC components and return per-data set projections.
        :param Xs: List of matrices with the same number of columns.
        :return: CCA subspace.
        """
        p = len(Xs)
        Ws = [np.zeros((X.shape[0], self.n_components)) for X in Xs]
        if self.standardize:
            Xs = list(map(_standardize, Xs))
        Ws_init = [
            TruncatedSVD(n_components=self.n_components,
                         random_state=self.random_state).fit_transform(X)
            for X in Xs
        ]
        correlations = np.zeros((self.n_components, ))

        # Optimize each CC component individually
        for cc in range(self.n_components):
            w_cur = [Wi[:, cc] / np.linalg.norm(Wi[:, cc]) for Wi in Ws_init]
            for itr in range(self.max_iter):
                o1 = self._objective(Xs, w_cur)
                for i in range(p):
                    wi = 0
                    for j in range(p):
                        if i == j:
                            continue
                        wj = w_cur[j]
                        Dj = np.diag(
                            np.diagonal(Ws[i].T.dot(Xs[i]).dot(Xs[j].T.dot(
                                Ws[j]))))
                        wi += Xs[i].dot((Xs[j].T.dot(wj))) - Ws[i].dot(Dj).dot(
                            Ws[j].T).dot(wj)
                    w_cur[i] = wi / np.linalg.norm(wi)
                o2 = self._objective(Xs, w_cur)
                if abs(o2 - o1) / abs(o1) < self.tol:
                    break
            for i in range(p):
                Ws[i][:, cc] = w_cur[i]

            # Compute average correlations
            n_pairs = p * (p - 1) / 2
            for i, j in it.combinations(range(p), 2):
                wi = Ws[i][:, cc].T.dot(Xs[i])
                wj = Ws[j][:, cc].T.dot(Xs[j])
                correlations[cc] += pearsonr(wi, wj)[0] / n_pairs

        # Orientate vectors
        s = np.sign(Ws[0][0, :])
        for i in range(p):
            Ws[i] = Ws[i] * s

        self.correlations = correlations
        return Ws
Example #10
 def fit_transform(self, X, Y):
     if self.standardize:
         X = _standardize(X)
         Y = _standardize(Y)
     K = X.dot(Y.T)
     model = TruncatedSVD(n_components=self.n_components,
                          random_state=self.random_state)
     U = model.fit_transform(K)
     U = U / np.linalg.norm(U, axis=0)
     V = model.components_.T
     self.correlations = np.array(
         [pearsonr(u.dot(X), v.dot(Y))[0] for u, v in zip(U.T, V.T)])
     return U, V
Example #11
def build_accesson(options):
    ngroups, ncell_cut = int(options.ngroup), int(options.ncell)
    reads = scipy.io.mmread(options.s + '/matrix/filtered_reads.mtx')
    reads = scipy.sparse.csr_matrix(reads) * 1.0
    cells = pandas.read_csv(options.s + '/matrix/filtered_cells.csv',
                            sep='\t',
                            index_col=0,
                            engine='c',
                            na_filter=False,
                            low_memory=False)
    cells = cells.index.values
    peaks = ['peak' + str(x) for x in range(0, reads.shape[0])]
    scale = numpy.array(10000.0 / reads.sum(axis=0))[0]
    sklearn.utils.sparsefuncs.inplace_column_scale(reads, scale)
    reads.data = numpy.log2(reads.data + 1)
    npc = min(int(options.npc), reads.shape[0], reads.shape[1])
    if len(cells) > ncell_cut:
        pca_result = TruncatedSVD(n_components=npc,
                                  algorithm='arpack',
                                  random_state=0).fit_transform(reads)
    else:
        pca_result = PCA(n_components=npc,
                         svd_solver='full').fit_transform(reads.A)
    connectivity = kneighbors_graph(pca_result,
                                    n_neighbors=10,
                                    include_self=False)
    connectivity = 0.5 * (connectivity + connectivity.T)
    ward_linkage = cluster.AgglomerativeClustering(n_clusters=ngroups,
                                                   linkage='ward',
                                                   connectivity=connectivity)
    #    ward_linkage = cluster.AgglomerativeClustering(n_clusters=ngroups, linkage='ward')
    y_predict = ward_linkage.fit_predict(pca_result)
    peak_labels_df = pandas.DataFrame(y_predict,
                                      index=peaks,
                                      columns=['group'])
    peak_labels_df.to_csv(options.s + '/matrix/Accesson_peaks.csv', sep='\t')
    groups = list(set(y_predict))
    coAccess_matrix = numpy.array(
        [reads[numpy.where(y_predict == x)[0], :].sum(axis=0) for x in groups])
    coAccess_matrix = coAccess_matrix[:, 0, :].T
    coAccess_df = pandas.DataFrame(coAccess_matrix,
                                   index=cells,
                                   columns=groups)
    coAccess_df.to_csv(options.s + '/matrix/Accesson_reads.csv', sep=',')
    return
Example #12
def fit_pls(X, Y, n_components, scale=True, algorithm="randomized"):
    #scaling
    if scale:
        X_scaled = zscore(X, axis=0, ddof=1)
        Y_scaled = zscore(Y, axis=0, ddof=1)
        covariance = np.dot(Y_scaled.T, X_scaled)
    else:
        covariance = np.dot(Y.T, X)

    svd = TruncatedSVD(n_components, algorithm)
    Y_saliences, singular_values, X_saliences = svd._fit(covariance)
    X_saliences = X_saliences.T
    inertia = singular_values.sum()

    if scale:
        return X_saliences, Y_saliences, singular_values, inertia, X_scaled, Y_scaled
    else:
        return X_saliences, Y_saliences, singular_values, inertia
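
A minimal sketch of calling fit_pls on two synthetic data blocks (the shapes are arbitrary, and this assumes a scikit-learn version in which TruncatedSVD exposes the private _fit used above):

import numpy as np

rng = np.random.RandomState(0)
X = rng.randn(100, 50)    # e.g. imaging features
Y = rng.randn(100, 20)    # e.g. behavioural features

X_sal, Y_sal, singular_values, inertia, X_scaled, Y_scaled = fit_pls(X, Y, n_components=5)
explained = singular_values / inertia    # share of total inertia per latent variable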
Example #13
def reduce_dimensionality(dataframe, maxvariance, columns_to_drop):
    '''
    Performs PCA on feature pandas dataframe and reduces number of
    principal components to those which explain a defined variance
    '''
    dataframe_without_columns = dataframe.drop(columns_to_drop, axis=1)
    LOGGER.info('Columns to be used by pca:')
    print(dataframe_without_columns.columns)
    LOGGER.info('Adding noise to dataframe')
    dataframe_without_columns = dataframe_without_columns + numpy.random.normal(
        size=dataframe_without_columns.shape) * 1.e-19
    LOGGER.info('Starting PCA')
    try:
        pca = PCA(n_components='mle')
        pca.fit(dataframe_without_columns)
        # transform
        samples = pca.transform(dataframe_without_columns)
        # aggregated sum of variances
        sum_variance = sum(pca.explained_variance_)
        list_variance = pca.explained_variance_
        #print sum_variance, pca.explained_variance_
        # get those having aggregated variance below threshold
    except ValueError:
        LOGGER.info('PCA failed, using truncated SVD')
        svd = TruncatedSVD(n_components=3)
        svd.fit(dataframe_without_columns)
        samples = svd.transform(dataframe_without_columns)
        sum_variance = sum(svd.explained_variance_)
        list_variance = svd.explained_variance_

    scomp = 0
    ncomp = 0
    while scomp < maxvariance:
        #c = pca.explained_variance_[ncomp]
        c = list_variance[ncomp]
        scomp = scomp + c / sum_variance
        ncomp = ncomp + 1
    # reduce dimensionality
    samples = samples[:, :ncomp]
    LOGGER.info("Number of features after PCA transformation %s" %
                samples.shape[1])
    return samples
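
For example (hypothetical DataFrame and column names), keeping enough components to explain 95% of the variance:

reduced_samples = reduce_dimensionality(features_df,
                                        maxvariance=0.95,
                                        columns_to_drop=['event_id', 'label'])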
Example #14
def compute_reduced_embeddings_original_vocab(output_vocab_filepath,
                                              output_embeddings_filepath,
                                              input_vocab_filepath, vocab_size,
                                              embedding_dim):
    print(N_FREE_TOKENS)
    vocab = Vocab(input_vocab_filepath, 1.5 * vocab_size)
    spacy_vocab = spacy.load('en').vocab
    matrix = np.zeros((vocab_size, spacy_vocab.vectors_length),
                      dtype=np.float32)
    new_i = 0
    final_vocab = []

    for i, word in vocab._id_to_word.items():
        if new_i == vocab_size:
            break

        if i >= N_FREE_TOKENS and str(word) not in spacy_vocab:
            continue

        if i >= N_FREE_TOKENS:
            final_vocab.append(word)

        matrix[new_i] = spacy_vocab[str(word)].vector
        new_i += 1

    print('Last word added:', final_vocab[-1])
    if embedding_dim < spacy_vocab.vectors_length:
        svd = TruncatedSVD(n_components=embedding_dim, algorithm='arpack')
        embeddings = svd.fit_transform(matrix)
        print(embeddings.shape)
        print([
            sum(svd.explained_variance_ratio_[:i])
            for i in range(1, embedding_dim + 1)
        ])
    else:
        embeddings = matrix

    with open(output_vocab_filepath, 'w') as output:
        for word in final_vocab:
            output.write('%s\n' % word)
    np.save(output_embeddings_filepath, embeddings)
Example #15
    def __init__(self, path, corpusName, query=None):
        self.query = query
        documents = (line.lower().split() for line in codecs.open(
            corpusName + ".txt", mode='r', encoding='utf-8', errors='ignore'))
        self.corpus = [' '.join(i) for i in documents]
        if self.query is not None:
            self.corpus.append(' '.join(query.getTokens()))

        # Make models
        t0 = time()
        print "Creating SciKit TF-IDF Model"
        self.tfidfModel = TfidfVectorizer().fit_transform(self.corpus)
        print("Done in %0.3fs." % (time() - t0))

        print "Creating SciKit LSA Model"
        t0 = time()
        lsa = TruncatedSVD(n_components=300)
        self.lsaModel = lsa.fit_transform(self.tfidfModel)
        self.lsaModel = Normalizer(copy=False).fit_transform(self.lsaModel)
        print("Done in %0.3fs." % (time() - t0))

        print "Creating SciKit LDA Model"
        # Use tf (raw term count) features for LDA.
        print("Extracting tf features for LDA")
        tf_vectorizer = CountVectorizer(max_features=2000)
        t0 = time()
        tf = tf_vectorizer.fit_transform(self.corpus)
        print("Done in %0.3fs." % (time() - t0))
        print("Fitting LDA model")
        lda = LatentDirichletAllocation(n_components=300,
                                        max_iter=5,
                                        learning_method='online',
                                        learning_offset=50.,
                                        random_state=0)
        t0 = time()
        self.ldaModel = lda.fit_transform(tf)
        self.ldaModel = Normalizer(copy=False).fit_transform(self.ldaModel)
        print("Done in %0.3fs." % (time() - t0))
Example #16
def _boostrap(X,
              Y,
              X_saliences,
              Y_saliences,
              X_saliences_bootstraps,
              Y_saliences_bootstraps,
              bootstrap_i,
              n_components,
              algorithm="randomized"):
    sample_indices = np.random.choice(list(range(X.shape[0])),
                                      size=X.shape[0],
                                      replace=True)
    X_boot = X[sample_indices, :]
    Y_boot = Y[sample_indices, :]
    X_boot_scaled = scale(X_boot)
    Y_boot_scaled = scale(Y_boot)

    covariance_boot = np.dot(Y_boot_scaled.T, X_boot_scaled)
    svd = TruncatedSVD(n_components, algorithm=algorithm)
    Y_saliences_boot, _, X_saliences_boot = svd._fit(covariance_boot)
    X_saliences_boot = X_saliences_boot.T

    #It does not matter which side we use to calculate the rotated singular values
    #let's pick the smaller one for optimization
    if len(X_saliences_boot) > len(Y_saliences_boot):
        Y_saliences_bootstraps[:, :, bootstrap_i], rotation_matrix = _procrustes_rotation(
            Y_saliences, Y_saliences_boot)
        X_saliences_bootstraps[:, :, bootstrap_i] = np.dot(X_saliences_boot, rotation_matrix)
    else:
        X_saliences_bootstraps[:, :, bootstrap_i], rotation_matrix = _procrustes_rotation(
            X_saliences, X_saliences_boot)
        Y_saliences_bootstraps[:, :, bootstrap_i] = np.dot(Y_saliences_boot, rotation_matrix)
Example #17
def fit_pls(X, Y, n_components, scale=True, algorithm="randomized"):
    #scaling

    print "calculating SVD"
    if scale:
        X_scaled = zscore(X, axis=0, ddof=1)
        Y_scaled = zscore(Y, axis=0, ddof=1)
        covariance = np.dot(Y_scaled.T, X_scaled)
    else:
        covariance = np.dot(Y.T, X)

    print(np.shape(covariance))
    sum_var = covariance
    svd = TruncatedSVD(n_components, algorithm)
    #computes only the first n_components largest singular values
    #produces a low-rank approximation of covariance matrix
    Y_saliences, singular_values, X_saliences = svd._fit(covariance)
    X_saliences = X_saliences.T
    inertia = singular_values.sum()

    if scale:
        return X_saliences, Y_saliences, singular_values, inertia, X_scaled, Y_scaled, sum_var
    else:
        return X_saliences, Y_saliences, singular_values, inertia
Example #18
 def __init__(self, feature_size=10):
     self.feature_size = feature_size
     self.svd = TruncatedSVD(n_components=feature_size)
     self.rating = None
Example #19
X_train_counts = count_vect.fit_transform(train_data.data)
print(X_train_counts.shape)
print(count_vect.vocabulary_.get(u'algorithm'))
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)


df= pd.DataFrame({'text':test_doc, 'class': test_data.target})

X = tfidf_vect.fit_transform(df['text'].values)
y = df['class'].values

from sklearn.decomposition import TruncatedSVD
pca = TruncatedSVD(n_components=2)
X_reduced_train = pca.fit_transform(X)
a_train, a_test, b_train, b_test = train_test_split(X, y, test_size=0.33, random_state=42)

from sklearn.ensemble import RandomForestClassifier 
classifier=RandomForestClassifier(n_estimators=10)                  
classifier.fit(a_train.toarray(), b_train)                            



clf = svm.SVC(kernel=my_kernel)
# Support Vector Machine model
#text_clf = Pipeline([('vect', CountVectorizer()),
#                     ('tfidf', TfidfTransformer()),
#                     ('clf', clf),])
Example #20
    def train(self, model_name, corpus, log, opts, chain_features=None):
        from whim.entity_narrative import DistributionalVectorsNarrativeChainModel
        log.info("Training context vectors model")

        training_metadata = {
            "data": corpus.directory,
            "pmi": opts.pmi or opts.ppmi,
            "ppmi": opts.ppmi,
        }

        log.info("Extracting event counts")
        pbar = get_progress_bar(len(corpus), title="Event feature extraction")
        # Loop over all the chains again to collect events
        event_counts = Counter()
        for doc_num, document in enumerate(corpus):
            chains = document.get_chains()
            if len(chains):
                event_chains = list(
                    DistributionalVectorsNarrativeChainModel.
                    extract_chain_feature_lists(chains,
                                                only_verb=opts.only_verb,
                                                adjectives=opts.adj))
                # Count all the events
                for chain in event_chains:
                    event_counts.update(chain)

            pbar.update(doc_num)
        pbar.finish()

        if opts.event_threshold is not None and opts.event_threshold > 0:
            log.info("Applying event threshold")
            # Apply a threshold event count
            to_remove = [
                event for (event, count) in event_counts.items()
                if count < opts.event_threshold
            ]
            pbar = get_progress_bar(len(to_remove), title="Filtering counts")
            for i, event in enumerate(to_remove):
                del event_counts[event]
                pbar.update(i)
            pbar.finish()

        log.info("Extracting pair counts")
        pbar = get_progress_bar(len(corpus), title="Pair feature extraction")
        # Loop over all the chains again to collect pairs of events
        pair_counts = Counter()
        for doc_num, document in enumerate(corpus):
            chains = document.get_chains()
            if len(chains):
                event_chains = list(
                    DistributionalVectorsNarrativeChainModel.
                    extract_chain_feature_lists(chains,
                                                only_verb=opts.only_verb,
                                                adjectives=opts.adj))
                # Count all the events
                for chain in event_chains:
                    # Count all pairs
                    pairs = []
                    for i in range(len(chain) - 1):
                        for j in range(i + 1, len(chain)):
                            if chain[i] in event_counts and chain[
                                    j] in event_counts:
                                pairs.append(
                                    tuple(sorted([chain[i], chain[j]])))
                    pair_counts.update(pairs)

            pbar.update(doc_num)
        pbar.finish()

        if opts.pair_threshold is not None and opts.pair_threshold > 0:
            log.info("Applying pair threshold")
            # Apply a threshold pair count
            to_remove = [
                pair for (pair, count) in pair_counts.items()
                if count < opts.pair_threshold
            ]
            if to_remove:
                pbar = get_progress_bar(len(to_remove),
                                        title="Filtering pair counts")
                for i, pair in enumerate(to_remove):
                    del pair_counts[pair]
                    pbar.update(i)
                pbar.finish()
            else:
                log.info("No counts removed")

        # Create a dictionary of the remaining vocabulary
        log.info("Building dictionary")
        dictionary = Dictionary([[event] for event in event_counts.keys()])
        # Put all the co-occurrence counts into a big matrix
        log.info("Building counts matrix: vocab size %d" % len(dictionary))
        vectors = numpy.zeros((len(dictionary), len(dictionary)),
                              dtype=numpy.float64)
        # Fill the matrix with raw counts
        for (event0, event1), count in pair_counts.items():
            if event0 in dictionary.token2id and event1 in dictionary.token2id:
                e0, e1 = dictionary.token2id[event0], dictionary.token2id[
                    event1]
                vectors[e0, e1] = count
                # Add the count both ways (it's only stored once above)
                vectors[e1, e0] = count

        # Now there are many things we could do to these counts
        if opts.pmi or opts.ppmi:
            log.info("Applying %sPMI" % "P" if opts.ppmi else "")
            # Apply PMI to the matrix
            # Compute the total counts for each event (note row and col totals are the same)
            log_totals = numpy.ma.log(vectors.sum(axis=0))
            vectors = numpy.ma.log(vectors * vectors.sum()) - log_totals
            vectors = (vectors.T - log_totals).T
            vectors = vectors.filled(0.)

            if opts.ppmi:
                # Threshold the PMIs at zero
                vectors[vectors < 0.] = 0.

        # Convert to sparse for SVD and storage
        vectors = csr_matrix(vectors)

        if opts.svd:
            log.info("Fitting SVD with %d dimensions" % opts.svd)
            training_metadata["svd from"] = vectors.shape[1]
            training_metadata["svd"] = opts.svd
            vector_svd = TruncatedSVD(opts.svd)
            vectors = vector_svd.fit_transform(vectors)

        log.info("Saving model: %s" % model_name)
        model = DistributionalVectorsNarrativeChainModel(
            dictionary,
            vectors,
            only_verb=opts.only_verb,
            training_metadata=training_metadata,
            adjectives=opts.adj)
        model.save(model_name)
        return model
Example #21
    def __init__(self):

        self.components = 2
        self.svd = TruncatedSVD(n_components=self.components)
        self.reductCount = 0
        for file_name, data_set in [
            (RunRegression.REGRESSION_TRAINING_INPUT_FILE_NAME,
             FileIo.TRAINING_DATA_SET),
            (RunRegression.REGRESSION_TESTING_INPUT_FILE_NAME,
             FileIo.TEST_DATA_SET)
        ]:

            # Check and see if the data has already been saved
            try:

                logging.info("RunRegression: Trying to load " + data_set +
                             " data")

                saved_data = numpy.load(file_name, mmap_mode='r')

            # If the data is not found, load it
            except IOError:

                logging.info(
                    "RunRegression: Saved data not found. Generating " +
                    data_set + " data")

                # Generate inputs
                poi_district_lookup = PoiDistrictLookup.PoiDistrictLookup()
                order_categorical_lookup = OrderCategoricalLookup.OrderCategoricalLookup(
                    poi_district_lookup)
                regression_input = RegressionInput.RegressionInput(
                    data_set, order_categorical_lookup, poi_district_lookup)

                if data_set == FileIo.TRAINING_DATA_SET:

                    self.training_order_start_end_districts_and_time, self.training_order_median_price, \
                        self.training_number_of_orders = regression_input.get_regression_inputs()

                    # Save the data for next time
                    numpy.savez(
                        file_name,
                        order_keys=self.
                        training_order_start_end_districts_and_time,
                        order_value_price=self.training_order_median_price,
                        order_value_number=self.training_number_of_orders)

                else:

                    self.testing_order_start_end_districts_and_time, self.testing_order_median_price, \
                        self.testing_number_of_orders  = regression_input.get_regression_inputs()

                    # Save the data for next time
                    numpy.savez(
                        file_name,
                        order_keys=self.
                        testing_order_start_end_districts_and_time,
                        order_value_price=self.testing_order_median_price,
                        order_value_number=self.testing_number_of_orders)

            # If the saved data is found, load it
            else:

                logging.info("RunRegression: Loading " + data_set + " data")

                if data_set == FileIo.TRAINING_DATA_SET:

                    self.training_order_start_end_districts_and_time, self.training_order_median_price, \
                        self.training_number_of_orders = saved_data['order_keys'], \
                                                         saved_data['order_value_price'], \
                                                         saved_data['order_value_number']

                    self.dimensions = self.training_order_start_end_districts_and_time.shape[
                        1]
                    self.initial = self.training_order_start_end_districts_and_time
                    logging.info("RunRegression: Loaded " +
                                 str(len(self.training_number_of_orders)) +
                                 " train data rows")
                else:

                    self.testing_order_start_end_districts_and_time, self.testing_order_median_price, \
                        self.testing_number_of_orders = saved_data['order_keys'], \
                                                        saved_data['order_value_price'], \
                                                        saved_data['order_value_number']

                    self.initialTesting = self.testing_order_start_end_districts_and_time
                    logging.info("RunRegression: Loaded " +
                                 str(len(self.testing_number_of_orders)) +
                                 " test data rows")
Example #22
def build_accesson(options):
    ngroups, ncell_cut = int(options.ngroup), int(options.ncell)
    if options.format == 'mtx':
        reads = scipy.io.mmread(options.s + '/matrix/filtered_reads.mtx')
        reads = scipy.sparse.csr_matrix(reads)
        cells = pandas.read_csv(options.s + '/matrix/filtered_cells.csv',
                                sep='\t',
                                index_col=0,
                                engine='c',
                                na_filter=False,
                                low_memory=False)
        cells = cells.index.values
        peaks = ['peak' + str(x) for x in range(0, reads.shape[1])]
        if len(cells) > ncell_cut:
            normal = copy.deepcopy(reads)
            normal.data = numpy.ones(len(normal.data))
            index_sampled = random.sample(
                numpy.arange(0, len(cells), 1, dtype=int), ncell_cut)
            normal2 = normal[index_sampled, :]
        else:
            reads = numpy.array(reads.todense())
            normal = numpy.array([x * 10000.0 / x.sum() for x in reads])
            normal = numpy.log2(normal + 1)
            normal2 = normal
    else:
        reads_df = pandas.read_csv(options.s + '/matrix/filtered_reads.csv',
                                   sep=',',
                                   index_col=0,
                                   engine='c',
                                   na_filter=False,
                                   low_memory=False)
        cells, peaks = reads_df.index.values, reads_df.columns.values
        reads = reads_df.values
        normal = numpy.array([x * 10000.0 / x.sum() for x in reads])
        normal = numpy.log2(normal + 1)
        normal2 = normal
        normal_df = pandas.DataFrame(normal2,
                                     index=reads_df.index,
                                     columns=reads_df.columns)
        normal_df.to_csv(options.s + '/matrix/normal_reads.csv', sep=',')
    npc = min(int(options.npc), normal2.shape[0], normal2.shape[1])
    if len(cells) > ncell_cut:
        pca_result = TruncatedSVD(n_components=npc).fit_transform(normal2.T)
    else:
        pca_result = PCA(n_components=npc,
                         svd_solver='full').fit_transform(normal2.T)
    connectivity = kneighbors_graph(pca_result,
                                    n_neighbors=10,
                                    include_self=False)
    connectivity = 0.5 * (connectivity + connectivity.T)
    ward_linkage = cluster.AgglomerativeClustering(n_clusters=ngroups,
                                                   linkage='ward',
                                                   connectivity=connectivity)
    ward_linkage.fit(pca_result)
    y_predict = ward_linkage.labels_.astype(int)
    peak_labels_df = pandas.DataFrame(y_predict,
                                      index=peaks,
                                      columns=['group'])
    peak_labels_df.to_csv(options.s + '/matrix/Accesson_peaks.csv', sep='\t')
    groups = list(set(y_predict))
    coAccess_matrix = numpy.array([
        normal[:, numpy.where(y_predict == x)[0]].sum(axis=1) for x in groups
    ]).T
    if (options.format == 'mtx') & (len(cells) > ncell_cut):
        coAccess_df = pandas.DataFrame(coAccess_matrix[0],
                                       index=cells,
                                       columns=groups)
    else:
        coAccess_df = pandas.DataFrame(coAccess_matrix,
                                       index=cells,
                                       columns=groups)
    coAccess_df.to_csv(options.s + '/matrix/Accesson_reads.csv', sep=',')
    return
Example #23
                               names=header,
                               engine='python')

# Number of users in current set
print('Number of unique users in current data-set',
      active_time_data.user_id.unique().shape[0])
print('Number of unique articles in current data-set',
      active_time_data.item_id.unique().shape[0])

# SVD allows us to look at our input matrix as a product of three smaller matrices; U, Z and V.
# In short this will help us discover concepts from the original input matrix,
# (subsets of users that like subsets of items)
# Note that use of SVD is not strictly restricted to user-item matrices
# https://www.youtube.com/watch?v=P5mlg91as1c

algorithm = TruncatedSVD()

# Finally we run our cross validation in n folds, where n is denoted by the cv parameter.
# Verbosity can be adjusted by passing an integer to the verbose parameter.
# We pass in our SVD algorithm as the estimator used to fit the data.
# X is the data set that we want to fit.
# Since our estimator (the SVD algorithm) provides no scoring of its own, we must either define our own
# estimator or simply define how to score the fitting.
# Since we currently rate the enjoyment of our users per article in a highly binary way (please see the
# rate_article fn in the filter script), we can easily compute precision and recall based on whether or not
# our prediction exactly matches the binary rating field in the test set.
# Thus, the F1 scoring metric seems an intuitive choice for measuring our success, as it provides a
# balanced score based on the two.

cv(estimator=algorithm, X=active_time_data, scoring='f1', cv=5, verbose=True)
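
One hedged way to supply such a scoring rule is a custom callable; the reconstruction-error scorer below is purely illustrative (interaction_matrix stands in for a prepared dense user-item matrix) and relies only on the fact that scikit-learn's cross_validate accepts a scorer with an (estimator, X, y) signature:

import numpy as np
from sklearn.model_selection import cross_validate

def reconstruction_score(estimator, X, y=None):
    # Negative mean squared reconstruction error: higher is better, as scikit-learn expects.
    X_reduced = estimator.transform(X)
    X_restored = estimator.inverse_transform(X_reduced)
    return -np.mean((np.asarray(X) - X_restored) ** 2)

cross_validate(TruncatedSVD(n_components=10), X=interaction_matrix,
               scoring=reconstruction_score, cv=5)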
Example #24
 def __init__(self, feature_size=10, regressor=None):
     self.feature_size = feature_size
     self.user_svd = TruncatedSVD(n_components=feature_size)
     self.item_svd = TruncatedSVD(n_components=feature_size)
     if regressor is None:
         self.regressor = LinearRegression()
Example #25
 def buildModel(self):
     tfidfModel = TfidfVectorizer().fit_transform(self.corpus)
     lsa = TruncatedSVD(n_components=200)
     self.Model = lsa.fit_transform(tfidfModel)
     self.Model = Normalizer(copy=False).fit_transform(self.Model)
Example #26
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=False)
from sklearn.model_selection import train_test_split, cross_val_score

df = pd.read_csv('/path/file.csv',
                 header=0,
                 sep=',',
                 names=['SentenceId', 'Sentence', 'Sentiment'])

reduced_data = tfidf_vect.fit_transform(df['Sentence'].values)
y = df['Sentiment'].values

from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=5)
reduced_data = svd.fit_transform(reduced_data)

X_train, X_test, y_train, y_test = train_test_split(reduced_data,
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

from sklearn.ensemble import RandomForestClassifier

# it froze with 1000000
# try with more parameters
classifier = RandomForestClassifier(n_estimators=100)
classifier.fit(X_train, y_train)
prediction = classifier.predict(X_test)
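
A small follow-up sketch to quantify the fit, using only standard scikit-learn metrics on the objects already defined above:

from sklearn.metrics import accuracy_score, classification_report

print(accuracy_score(y_test, prediction))
print(classification_report(y_test, prediction))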
Example #27
            ("ravel", Ravel()),
            ('tfid_vect', TfidfVectorizer(max_df= 0.743, min_df=0.036, ngram_range=(1,4),\
strip_accents='ascii', analyzer= "word", stop_words='english', norm = "l1", use_idf = True))
        ])

# des_rescu_pipe = Pipeline([
#             ('sel_num', DataFrameSelector(["Description", "RescuerID"], ravel = True)),
#             add rescuer to description
#             ('rm_nan', FnanToStr()),
#             ('tfid_vect', TfidfVectorizer(max_df= 0.743, min_df=0.036, ngram_range=(1,4),\
# strip_accents='ascii', analyzer= "word", stop_words='english', use_idf = True, norm = None))
#         ])

des_pipe_svd = Pipeline([
    ('des_pipe', des_pipe),
    ('SVD', TruncatedSVD(n_components=20)
     )  #ValueError: n_components must be < n_features; got 140 >= 124
])

des_pipe_for_svd = replace_step(
    des_pipe,
    "tfid_vect",
    ("tfid_vect", TfidfVectorizer(max_df= 0.95, min_df=0.005, ngram_range=(1,4),\
                                  strip_accents='ascii', analyzer= "word", stop_words='english',
                                  norm = "l1", use_idf = True))
)
des_pipe_svd_v2 = Pipeline([('des_pipe_for_svd', des_pipe_for_svd),
                            ('SVD', TruncatedSVD(n_components=20))])

des_pipe_svd_v3 = replace_step(des_pipe_svd_v2, "SVD",
                               ('SVD', TruncatedSVD(n_components=100)))
Example #28
    # strip_accents = 'unicode' : replace all accented unicode char ;  use_idf = True : enable inverse-document-frequency reweighting ;
    # smooth_idf = True : prevents zero division for unseen words   ;
    # max_features : If not None, build a vocabulary that only consider the top max_features ordered by term frequency across the corpus
    tfidf_vect = TfidfVectorizer(strip_accents='unicode',
                                 max_features=500,
                                 use_idf=True,
                                 smooth_idf=True,
                                 sublinear_tf=False)
    trainForVector = tfidf_vect.fit_transform(trainForVector)
    num_features = len(tfidf_vect.get_feature_names())
    # n_components : Desired dimensionality of output data. Must be strictly less than the number of features.
    # n_iter : Number of iterations for randomized SVD solver.
    # random_state : If int, random_state is the seed used by the random number generator.
    #pca = TruncatedSVD(n_components = num_features-1, n_iter = 7, random_state = 42)
    pca = TruncatedSVD(n_components=300, n_iter=7, random_state=42)
    trainForVector = pca.fit_transform(trainForVector)

    # train
    i = 0
    trainFeat = []
    for pdf in trainF:
        v_all = list(pdf.getImgHistogram()) + list(trainForVector[i]) + list(
            pdf.getFeatVec())
        trainFeat.append(v_all)
        i += 1
    # test
    testFeat = []
    for pdf in testF:
        v_all = list(pdf.getImgHistogram()) + list(trainForVector[i]) + list(
            pdf.getFeatVec())