def __init__(self, *, hyperparams: Hyperparams, random_seed: int = 0,
             docker_containers: Dict[str, DockerContainer] = None) -> None:
    super().__init__(hyperparams=hyperparams, random_seed=random_seed,
                     docker_containers=docker_containers)  # False
    self._clf = TruncatedSVD(
        n_components=self.hyperparams['n_components'],
        algorithm=self.hyperparams['algorithm']['choice'],
        n_iter=self.hyperparams['algorithm'].get('n_iter', 5),
        tol=self.hyperparams['algorithm'].get('tol', 0),
        random_state=self.random_seed,
    )
    self.primitiveNo = PrimitiveCount.primitive_no
    PrimitiveCount.primitive_no += 1

    self._inputs = None
    self._outputs = None
    self._training_inputs = None
    self._training_outputs = None
    self._target_names = None
    self._training_indices = None
    self._target_column_indices = None
    self._target_columns_metadata: List[OrderedDict] = None
    self._input_column_names = None
    self._fitted = False
def _permute_and_calc_singular_values(X, Y, X_saliences, Y_saliences, singular_values_samples,
                                      perm_i, n_components, procrustes=False,
                                      algorithm="randomized"):
    if len(X) < len(Y):
        X_perm = np.random.permutation(X)
        covariance_perm = np.dot(Y.T, X_perm)
    else:
        Y_perm = np.random.permutation(Y)
        covariance_perm = np.dot(Y_perm.T, X)

    svd = TruncatedSVD(n_components, algorithm=algorithm)
    Y_saliences_perm, singular_values_perm, X_saliences_perm = svd._fit(covariance_perm)

    if procrustes:
        # It does not matter which side we use to calculate the rotated singular values;
        # let's pick the smaller one for optimization.
        if len(X_saliences_perm) > len(Y_saliences_perm):
            _, _, singular_values_samples[:, perm_i] = _procrustes_rotation(
                Y_saliences, Y_saliences_perm, singular_values_perm)
        else:
            X_saliences_perm = X_saliences_perm.T
            _, _, singular_values_samples[:, perm_i] = _procrustes_rotation(
                X_saliences, X_saliences_perm, singular_values_perm)
    else:
        singular_values_samples[:, perm_i] = singular_values_perm
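# A minimal, self-contained sketch of the permutation test implemented by the helper above,
# using only the public TruncatedSVD API (the helper relies on the private svd._fit).
# The toy shapes, component count and permutation count below are invented for illustration.
import numpy as np
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import scale

rng = np.random.RandomState(0)
X = scale(rng.rand(50, 20))   # 50 samples x 20 features
Y = scale(rng.rand(50, 8))    # 50 samples x 8 features
n_components, n_perm = 3, 200

svd = TruncatedSVD(n_components=n_components, algorithm="randomized", random_state=0)
svd.fit(np.dot(Y.T, X))
observed = svd.singular_values_

null = np.zeros((n_components, n_perm))
for perm_i in range(n_perm):
    X_perm = rng.permutation(X)            # permute the rows of one block only
    svd.fit(np.dot(Y.T, X_perm))
    null[:, perm_i] = svd.singular_values_

# One-sided p-value per component: how often a permuted singular value beats the observed one.
p_values = (null >= observed[:, None]).mean(axis=1)
print(p_values)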
def _bootstrap_pool(X, Y, X_saliences, Y_saliences, n_components, procrustes, algorithm, boot_i):
    """Basic version for parallel implementation of bootstrapping using a pool."""
    # Reseed so that each worker process draws different random numbers.
    np.random.seed(int(time()) + boot_i)
    # Choose indices randomly with replacement to resample a set of the same size.
    sample_indices = np.random.choice(range(X.shape[0]), size=X.shape[0], replace=True)
    X_boot = X[sample_indices, :]
    Y_boot = Y[sample_indices, :]
    X_boot_scaled = scale(X_boot)
    Y_boot_scaled = scale(Y_boot)

    covariance_boot = np.dot(Y_boot_scaled.T, X_boot_scaled)
    svd = TruncatedSVD(n_components, algorithm=algorithm)
    Y_saliences_boot, _, X_saliences_boot = svd._fit(covariance_boot)
    X_saliences_boot = X_saliences_boot.T

    # It does not matter which side we use to calculate the rotated singular values;
    # let's pick the smaller one for optimization.
    if len(X_saliences_boot) > len(Y_saliences_boot):
        # Use the Procrustes rotation on the smaller dataset.
        Y_bootstraps, rotation_matrix = _procrustes_rotation(Y_saliences, Y_saliences_boot)
        X_bootstraps = np.dot(X_saliences_boot, rotation_matrix)
    else:
        X_bootstraps, rotation_matrix = _procrustes_rotation(X_saliences, X_saliences_boot)
        Y_bootstraps = np.dot(Y_saliences_boot, rotation_matrix)

    # print(np.shape(X_bootstraps))
    # print(np.shape(Y_bootstraps))
    return X_bootstraps, Y_bootstraps
def _permute_and_calc_singular_values_pool(X, Y, X_saliences, Y_saliences, n_components,
                                           procrustes, algorithm, perm_i):
    """Basic version for parallel implementation using a pool."""
    # Reseed so that each worker process draws different random numbers.
    np.random.seed(int(time()) + perm_i)

    if len(X) < len(Y):
        # Apply the permutation to the shorter list.
        # print("randomization X<Y")
        X_perm = np.random.permutation(X)
        covariance_perm = np.dot(Y.T, X_perm)
    else:
        # print("other permutation")
        Y_perm = np.random.permutation(Y)
        covariance_perm = np.dot(Y_perm.T, X)

    svd = TruncatedSVD(n_components, algorithm=algorithm)
    Y_saliences_perm, singular_values_perm, X_saliences_perm = svd._fit(covariance_perm)

    if procrustes:
        # It does not matter which side we use to calculate the rotated singular values;
        # let's pick the smaller one for optimization.
        if len(X_saliences_perm) > len(Y_saliences_perm):
            _, _, singular_values_perm = _procrustes_rotation(
                Y_saliences, Y_saliences_perm, singular_values_perm)
        else:
            X_saliences_perm = X_saliences_perm.T
            _, _, singular_values_perm = _procrustes_rotation(
                X_saliences, X_saliences_perm, singular_values_perm)

    return singular_values_perm
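# The *_pool helpers above put every shared argument first and the permutation index last,
# which is the shape functools.partial + Pool.map expect. This is a self-contained toy showing
# only that call pattern; _toy_worker is a stand-in, not the real permutation function.
from functools import partial
from multiprocessing import Pool

def _toy_worker(offset, perm_i):
    # Same argument shape as _permute_and_calc_singular_values_pool(X, Y, ..., perm_i).
    return offset + perm_i

if __name__ == "__main__":
    worker = partial(_toy_worker, 100)               # freeze the shared arguments
    with Pool(processes=2) as pool:
        null_values = pool.map(worker, range(10))    # one call per permutation index
    print(null_values)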
def create_union_transf(_):
    pca2c_transformer = make_pipeline(
        drop_transform,
        SimpleImputer(),
        StandardScaler(),
        PCA(n_components=2),
    )
    os_transformer = make_pipeline(
        FunctionTransformer(lambda x: x.os, validate=False),
        CountVectorizer(),
        TruncatedSVD(n_components=10),
    )
    arch_transformer = FunctionTransformer(lambda x: pd.get_dummies(x.cpuArch), validate=False)
    gmm_transformer = make_pipeline(
        drop_transform,
        SimpleImputer(),
        StandardScaler(),
        PCA(n_components=2),
        FunctionTransformer(lambda x: GaussianMixture(n_components=3).fit_predict(x)[np.newaxis].T),
    )
    transf = make_union(
        drop_transform,
        gmm_transformer,
        os_transformer,
        arch_transformer,
        pca2c_transformer,
    )
    return transf
def _permute_and_calc_singular_values_process(X, Y, a, b, n_components, algorithm, output, x):  # x == perm_i
    """Basic version for parallel implementation using processes and an output queue."""
    # Reseed so that each process does not draw the same random numbers.
    # pid = current_process()._identity[0]
    # randst = np.random.mtrand.RandomState(pid)
    np.random.seed(int(time()) + x + 50)

    # Test how the permutation works.
    c = np.random.permutation(a)
    print(a)
    print(c)

    if len(X) < len(Y):
        # Apply the permutation to the shorter list.
        # print("randomization X<Y")
        X_perm = np.random.permutation(X)
        covariance_perm = np.dot(Y.T, X_perm)
    else:
        # print("other permutation")
        Y_perm = np.random.permutation(Y)
        covariance_perm = np.dot(Y_perm.T, X)

    svd = TruncatedSVD(n_components, algorithm=algorithm)
    # print(covariance_perm)
    Y_saliences_perm, singular_values_perm, X_saliences_perm = svd._fit(covariance_perm)
    output.put(singular_values_perm)
def outliersSvdReduction(self):
    svd = TruncatedSVD(n_components=1)
    ordersSvd = svd.fit_transform(self.training_order_start_end_districts_and_time,
                                  self.training_number_of_orders)
    priceSvd = svd.fit_transform(self.training_order_start_end_districts_and_time,
                                 self.training_order_median_price)
    self.outliersPriceOrders(ordersSvd, priceSvd)
def write_spacy_vocab(output_dirpath, vocab_size, embedding_dim):
    if not os.path.exists(output_dirpath):
        os.makedirs(output_dirpath)
    allowed_chars = set(string.ascii_letters + string.punctuation)
    ascii = set(string.ascii_letters)
    ascii_plus_period = set(string.ascii_letters + '.')
    word_set = set()
    spacy_vocab = spacy.load('en').vocab
    top_words = []
    for w in spacy_vocab:
        if w.rank > 2 * vocab_size:
            continue
        try:
            word_string = str(w.lower_).strip()
            if not word_string:
                continue
            if word_string in word_set:
                continue
            if any(bad_char in word_string for bad_char in ('[', ']', '<', '>', '{', '}')):
                # These are used to mark word types and person ids.
                continue
            if any(c not in allowed_chars for c in word_string):
                continue
            if sum(1 for c in word_string if c not in ascii_plus_period) > 2:
                continue
            if word_string[-1] == '.' and sum(1 for c in word_string if c in ascii) > 2:
                continue
            top_words.append(w)
            word_set.add(word_string)
        except Exception:
            pass

    top_words.sort(key=lambda w: w.rank)
    top_words = top_words[:vocab_size]
    with open(os.path.join(output_dirpath, 'vocab'), 'w') as f:
        for word in top_words:
            f.write('%s\n' % word.lower_.strip())

    vectors = np.array([w.vector for w in top_words])
    svd = TruncatedSVD(n_components=embedding_dim, algorithm='arpack')
    embeddings = svd.fit_transform(vectors)
    print(embeddings.shape)
    print([sum(svd.explained_variance_ratio_[:i]) for i in range(1, embedding_dim + 1)])
    np.save(os.path.join(output_dirpath, 'pretrained_embeddings.npy'), embeddings)
def fit_transform(self, Xs):
    """
    Optimize each CC component and return per-data-set projections.

    :param Xs: List of matrices with the same number of columns.
    :return: CCA subspace.
    """
    p = len(Xs)
    Ws = [np.zeros((X.shape[0], self.n_components)) for X in Xs]
    if self.standardize:
        Xs = list(map(_standardize, Xs))
    Ws_init = [
        TruncatedSVD(n_components=self.n_components,
                     random_state=self.random_state).fit_transform(X)
        for X in Xs
    ]
    correlations = np.zeros((self.n_components,))

    # Optimize each CC component individually
    for cc in range(self.n_components):
        w_cur = [Wi[:, cc] / np.linalg.norm(Wi[:, cc]) for Wi in Ws_init]
        for itr in range(self.max_iter):
            o1 = self._objective(Xs, w_cur)
            for i in range(p):
                wi = 0
                for j in range(p):
                    if i == j:
                        continue
                    wj = w_cur[j]
                    Dj = np.diag(np.diagonal(Ws[i].T.dot(Xs[i]).dot(Xs[j].T.dot(Ws[j]))))
                    wi += Xs[i].dot((Xs[j].T.dot(wj))) - Ws[i].dot(Dj).dot(Ws[j].T).dot(wj)
                w_cur[i] = wi / np.linalg.norm(wi)
            o2 = self._objective(Xs, w_cur)
            if abs(o2 - o1) / abs(o1) < self.tol:
                break
        for i in range(p):
            Ws[i][:, cc] = w_cur[i]

        # Compute average correlations
        n_pairs = p * (p - 1) / 2
        for i, j in it.combinations(range(p), 2):
            wi = Ws[i][:, cc].T.dot(Xs[i])
            wj = Ws[j][:, cc].T.dot(Xs[j])
            correlations[cc] += pearsonr(wi, wj)[0] / n_pairs

    # Orient the vectors consistently
    s = np.sign(Ws[0][0, :])
    for i in range(p):
        Ws[i] = Ws[i] * s

    self.correlations = correlations
    return Ws
def fit_transform(self, X, Y):
    if self.standardize:
        X = _standardize(X)
        Y = _standardize(Y)
    K = X.dot(Y.T)
    model = TruncatedSVD(n_components=self.n_components, random_state=self.random_state)
    U = model.fit_transform(K)
    U = U / np.linalg.norm(U, axis=0)
    V = model.components_.T
    self.correlations = np.array(
        [pearsonr(u.dot(X), v.dot(Y))[0] for u, v in zip(U.T, V.T)])
    return U, V
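# A runnable toy version of the cross-covariance SVD above: two views stored as
# (features x samples) arrays sharing one latent signal, projected onto the leading
# singular vectors, with the per-component correlation computed exactly as in fit_transform.
# The shapes and noise level are invented.
import numpy as np
from scipy.stats import pearsonr
from sklearn.decomposition import TruncatedSVD

rng = np.random.RandomState(0)
z = rng.randn(100)                                              # shared latent signal
X = np.vstack([z + 0.1 * rng.randn(100) for _ in range(6)])     # view 1: 6 x 100
Y = np.vstack([z + 0.1 * rng.randn(100) for _ in range(4)])     # view 2: 4 x 100

model = TruncatedSVD(n_components=2, random_state=0)
U = model.fit_transform(X.dot(Y.T))
U = U / np.linalg.norm(U, axis=0)
V = model.components_.T

correlations = [pearsonr(u.dot(X), v.dot(Y))[0] for u, v in zip(U.T, V.T)]
print(correlations)   # the first component should be close to +/- 1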
def build_accesson(options):
    ngroups, ncell_cut = int(options.ngroup), int(options.ncell)
    reads = scipy.io.mmread(options.s + '/matrix/filtered_reads.mtx')
    reads = scipy.sparse.csr_matrix(reads) * 1.0
    cells = pandas.read_csv(options.s + '/matrix/filtered_cells.csv', sep='\t',
                            index_col=0, engine='c', na_filter=False, low_memory=False)
    cells = cells.index.values
    peaks = ['peak' + str(x) for x in range(0, reads.shape[0])]
    scale = numpy.array(10000.0 / reads.sum(axis=0))[0]
    sklearn.utils.sparsefuncs.inplace_column_scale(reads, scale)
    reads.data = numpy.log2(reads.data + 1)
    npc = min(int(options.npc), reads.shape[0], reads.shape[1])
    if len(cells) > ncell_cut:
        pca_result = TruncatedSVD(n_components=npc, algorithm='arpack',
                                  random_state=0).fit_transform(reads)
    else:
        pca_result = PCA(n_components=npc, svd_solver='full').fit_transform(reads.A)
    connectivity = kneighbors_graph(pca_result, n_neighbors=10, include_self=False)
    connectivity = 0.5 * (connectivity + connectivity.T)
    ward_linkage = cluster.AgglomerativeClustering(n_clusters=ngroups, linkage='ward',
                                                   connectivity=connectivity)
    # ward_linkage = cluster.AgglomerativeClustering(n_clusters=ngroups, linkage='ward')
    y_predict = ward_linkage.fit_predict(pca_result)
    peak_labels_df = pandas.DataFrame(y_predict, index=peaks, columns=['group'])
    peak_labels_df.to_csv(options.s + '/matrix/Accesson_peaks.csv', sep='\t')
    groups = list(set(y_predict))
    coAccess_matrix = numpy.array(
        [reads[numpy.where(y_predict == x)[0], :].sum(axis=0) for x in groups])
    coAccess_matrix = coAccess_matrix[:, 0, :].T
    coAccess_df = pandas.DataFrame(coAccess_matrix, index=cells, columns=groups)
    coAccess_df.to_csv(options.s + '/matrix/Accesson_reads.csv', sep=',')
    return
def fit_pls(X, Y, n_components, scale=True, algorithm="randomized"):
    # Scaling
    if scale:
        X_scaled = zscore(X, axis=0, ddof=1)
        Y_scaled = zscore(Y, axis=0, ddof=1)
        covariance = np.dot(Y_scaled.T, X_scaled)
    else:
        covariance = np.dot(Y.T, X)

    svd = TruncatedSVD(n_components, algorithm=algorithm)
    Y_saliences, singular_values, X_saliences = svd._fit(covariance)
    X_saliences = X_saliences.T
    inertia = singular_values.sum()

    if scale:
        return X_saliences, Y_saliences, singular_values, inertia, X_scaled, Y_scaled
    else:
        return X_saliences, Y_saliences, singular_values, inertia
def reduce_dimensionality(dataframe, maxvariance, columns_to_drop):
    '''
    Performs PCA on a feature pandas dataframe and reduces the number of principal
    components to those which explain a defined variance.
    '''
    dataframe_without_columns = dataframe.drop(columns_to_drop, axis=1)
    LOGGER.info('Columns to be used by pca:')
    print(dataframe_without_columns.columns)
    LOGGER.info('Adding noise to dataframe')
    dataframe_without_columns = dataframe_without_columns + numpy.random.normal(
        size=dataframe_without_columns.shape) * 1.e-19
    LOGGER.info('Starting PCA')
    try:
        pca = PCA(n_components='mle')
        pca.fit(dataframe_without_columns)
        # transform
        samples = pca.transform(dataframe_without_columns)
        # aggregated sum of variances
        sum_variance = sum(pca.explained_variance_)
        list_variance = pca.explained_variance_
        # print(sum_variance, pca.explained_variance_)
        # get those having aggregated variance below threshold
    except ValueError:
        LOGGER.info('PCA failed, using truncated SVD')
        svd = TruncatedSVD(n_components=3)
        svd.fit(dataframe_without_columns)
        samples = svd.transform(dataframe_without_columns)
        sum_variance = sum(svd.explained_variance_)
        list_variance = svd.explained_variance_

    scomp = 0
    ncomp = 0
    while scomp < maxvariance:
        # c = pca.explained_variance_[ncomp]
        c = list_variance[ncomp]
        scomp = scomp + c / sum_variance
        ncomp = ncomp + 1

    # reduce dimensionality
    samples = samples[:, :ncomp]
    LOGGER.info("Number of features after PCA transformation %s" % samples.shape[1])
    return samples
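# The component-selection loop above can also be written with a cumulative sum over
# explained_variance_ratio_. A small sketch on random data; the 0.95 threshold and the
# shapes are chosen only for illustration.
import numpy as np
from sklearn.decomposition import PCA

rng = np.random.RandomState(0)
data = rng.randn(200, 10) * np.arange(1, 11)          # features with increasing variance
pca = PCA().fit(data)
cumulative = np.cumsum(pca.explained_variance_ratio_)
ncomp = int(np.searchsorted(cumulative, 0.95) + 1)    # smallest n reaching 95% of the variance
samples = pca.transform(data)[:, :ncomp]
print(ncomp, samples.shape)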
def compute_reduced_embeddings_original_vocab(output_vocab_filepath, output_embeddings_filepath,
                                              input_vocab_filepath, vocab_size, embedding_dim):
    print(N_FREE_TOKENS)
    vocab = Vocab(input_vocab_filepath, 1.5 * vocab_size)
    spacy_vocab = spacy.load('en').vocab
    matrix = np.zeros((vocab_size, spacy_vocab.vectors_length), dtype=np.float32)
    new_i = 0
    final_vocab = []
    for i, word in vocab._id_to_word.items():
        if new_i == vocab_size:
            break
        if i >= N_FREE_TOKENS and word not in spacy_vocab:
            continue
        if i >= N_FREE_TOKENS:
            final_vocab.append(word)
        matrix[new_i] = spacy_vocab[word].vector
        new_i += 1
    print('Last word added:', final_vocab[-1])

    if embedding_dim < spacy_vocab.vectors_length:
        svd = TruncatedSVD(n_components=embedding_dim, algorithm='arpack')
        embeddings = svd.fit_transform(matrix)
        print(embeddings.shape)
        print([sum(svd.explained_variance_ratio_[:i])
               for i in range(1, embedding_dim + 1)])
    else:
        embeddings = matrix

    with open(output_vocab_filepath, 'w') as output:
        for word in final_vocab:
            output.write('%s\n' % word)
    np.save(output_embeddings_filepath, embeddings)
def __init__(self, path, corpusName, query=None):
    self.query = query
    documents = (line.lower().split() for line in codecs.open(
        corpusName + ".txt", mode='r', encoding='utf-8', errors='ignore'))
    self.corpus = [' '.join(i) for i in documents]
    if self.query is not None:
        self.corpus.append(' '.join(query.getTokens()))

    # Make models
    t0 = time()
    print("Creating SciKit TF-IDF Model")
    self.tfidfModel = TfidfVectorizer().fit_transform(self.corpus)
    print("Done in %0.3fs." % (time() - t0))

    print("Creating SciKit LSA Model")
    t0 = time()
    lsa = TruncatedSVD(n_components=300)
    self.lsaModel = lsa.fit_transform(self.tfidfModel)
    self.lsaModel = Normalizer(copy=False).fit_transform(self.lsaModel)
    print("Done in %0.3fs." % (time() - t0))

    print("Creating SciKit LDA Model")
    # Use tf (raw term count) features for LDA.
    print("Extracting tf features for LDA")
    tf_vectorizer = CountVectorizer(max_features=2000)
    t0 = time()
    tf = tf_vectorizer.fit_transform(self.corpus)
    print("Done in %0.3fs." % (time() - t0))

    print("Fitting LDA model")
    lda = LatentDirichletAllocation(n_components=300, max_iter=5, learning_method='online',
                                    learning_offset=50., random_state=0)
    t0 = time()
    self.ldaModel = lda.fit_transform(tf)
    self.ldaModel = Normalizer(copy=False).fit_transform(self.ldaModel)
    print("Done in %0.3fs." % (time() - t0))
def _boostrap(X, Y, X_saliences, Y_saliences, X_saliences_bootstraps, Y_saliences_bootstraps,
              bootstrap_i, n_components, algorithm="randomized"):
    sample_indices = np.random.choice(list(range(X.shape[0])), size=X.shape[0], replace=True)
    X_boot = X[sample_indices, :]
    Y_boot = Y[sample_indices, :]
    X_boot_scaled = scale(X_boot)
    Y_boot_scaled = scale(Y_boot)

    covariance_boot = np.dot(Y_boot_scaled.T, X_boot_scaled)
    svd = TruncatedSVD(n_components, algorithm=algorithm)
    Y_saliences_boot, _, X_saliences_boot = svd._fit(covariance_boot)
    X_saliences_boot = X_saliences_boot.T

    # It does not matter which side we use to calculate the rotated singular values;
    # let's pick the smaller one for optimization.
    if len(X_saliences_boot) > len(Y_saliences_boot):
        Y_saliences_bootstraps[:, :, bootstrap_i], rotation_matrix = _procrustes_rotation(
            Y_saliences, Y_saliences_boot)
        X_saliences_bootstraps[:, :, bootstrap_i] = np.dot(X_saliences_boot, rotation_matrix)
    else:
        X_saliences_bootstraps[:, :, bootstrap_i], rotation_matrix = _procrustes_rotation(
            X_saliences, X_saliences_boot)
        Y_saliences_bootstraps[:, :, bootstrap_i] = np.dot(Y_saliences_boot, rotation_matrix)
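# The bootstrap helpers above rotate each resampled salience matrix back onto the original one
# before storing it. _procrustes_rotation itself is not shown, so this is only a guess at the
# standard orthogonal-Procrustes solution (R = U V^T from the SVD of B^T A), included to make
# the rotation step concrete on toy data.
import numpy as np

def procrustes_rotation(A, B):
    """Return B rotated onto A, together with the rotation matrix."""
    U, _, Vt = np.linalg.svd(B.T.dot(A))
    R = U.dot(Vt)
    return B.dot(R), R

rng = np.random.RandomState(0)
A = rng.randn(20, 3)
Q_true, _ = np.linalg.qr(rng.randn(3, 3))   # a random orthogonal matrix
B = A.dot(Q_true)                           # B is A expressed in a rotated basis
B_aligned, R = procrustes_rotation(A, B)
print(np.allclose(B_aligned, A))            # True: the rotation is recovered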
def fit_pls(X, Y, n_components, scale=True, algorithm="randomized"):
    # Scaling
    print("calculating SVD")
    if scale:
        X_scaled = zscore(X, axis=0, ddof=1)
        Y_scaled = zscore(Y, axis=0, ddof=1)
        covariance = np.dot(Y_scaled.T, X_scaled)
    else:
        covariance = np.dot(Y.T, X)
    print(np.shape(covariance))
    sum_var = covariance

    # Computes only the first n_components largest singular values and
    # produces a low-rank approximation of the covariance matrix.
    svd = TruncatedSVD(n_components, algorithm=algorithm)
    Y_saliences, singular_values, X_saliences = svd._fit(covariance)
    X_saliences = X_saliences.T
    inertia = singular_values.sum()

    if scale:
        return X_saliences, Y_saliences, singular_values, inertia, X_scaled, Y_scaled, sum_var
    else:
        return X_saliences, Y_saliences, singular_values, inertia
def __init__(self, feature_size=10):
    self.feature_size = feature_size
    self.svd = TruncatedSVD(n_components=feature_size)
    self.rating = None
X_train_counts = count_vect.fit_transform(train_data.data)
print(X_train_counts.shape)
print(count_vect.vocabulary_.get(u'algorithm'))

tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

df = pd.DataFrame({'text': test_doc, 'class': test_data.target})
X = tfidf_vect.fit_transform(df['text'].values)
y = df['class'].values

from sklearn.decomposition import TruncatedSVD
pca = TruncatedSVD(n_components=2)
X_reduced_train = pca.fit_transform(X)

a_train, a_test, b_train, b_test = train_test_split(X, y, test_size=0.33, random_state=42)

from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=10)
classifier.fit(a_train.toarray(), b_train)

clf = svm.SVC(kernel=my_kernel)  # Support Vector Machine model
# text_clf = Pipeline([('vect', CountVectorizer()),
#                      ('tfidf', TfidfTransformer()),
#                      ('clf', clf)])
def train(self, model_name, corpus, log, opts, chain_features=None):
    from whim.entity_narrative import DistributionalVectorsNarrativeChainModel

    log.info("Training context vectors model")
    training_metadata = {
        "data": corpus.directory,
        "pmi": opts.pmi or opts.ppmi,
        "ppmi": opts.ppmi,
    }

    log.info("Extracting event counts")
    pbar = get_progress_bar(len(corpus), title="Event feature extraction")
    # Loop over all the chains to collect events
    event_counts = Counter()
    for doc_num, document in enumerate(corpus):
        chains = document.get_chains()
        if len(chains):
            event_chains = list(
                DistributionalVectorsNarrativeChainModel.extract_chain_feature_lists(
                    chains, only_verb=opts.only_verb, adjectives=opts.adj))
            # Count all the events
            for chain in event_chains:
                event_counts.update(chain)
        pbar.update(doc_num)
    pbar.finish()

    if opts.event_threshold is not None and opts.event_threshold > 0:
        log.info("Applying event threshold")
        # Apply a threshold event count
        to_remove = [event for (event, count) in event_counts.items()
                     if count < opts.event_threshold]
        pbar = get_progress_bar(len(to_remove), title="Filtering counts")
        for i, event in enumerate(to_remove):
            del event_counts[event]
            pbar.update(i)
        pbar.finish()

    log.info("Extracting pair counts")
    pbar = get_progress_bar(len(corpus), title="Pair feature extraction")
    # Loop over all the chains again to collect pairs of events
    pair_counts = Counter()
    for doc_num, document in enumerate(corpus):
        chains = document.get_chains()
        if len(chains):
            event_chains = list(
                DistributionalVectorsNarrativeChainModel.extract_chain_feature_lists(
                    chains, only_verb=opts.only_verb, adjectives=opts.adj))
            # Count all pairs of events within each chain
            for chain in event_chains:
                pairs = []
                for i in range(len(chain) - 1):
                    for j in range(i + 1, len(chain)):
                        if chain[i] in event_counts and chain[j] in event_counts:
                            pairs.append(tuple(sorted([chain[i], chain[j]])))
                pair_counts.update(pairs)
        pbar.update(doc_num)
    pbar.finish()

    if opts.pair_threshold is not None and opts.pair_threshold > 0:
        log.info("Applying pair threshold")
        # Apply a threshold pair count
        to_remove = [pair for (pair, count) in pair_counts.items()
                     if count < opts.pair_threshold]
        if to_remove:
            pbar = get_progress_bar(len(to_remove), title="Filtering pair counts")
            for i, pair in enumerate(to_remove):
                del pair_counts[pair]
                pbar.update(i)
            pbar.finish()
        else:
            log.info("No counts removed")

    # Create a dictionary of the remaining vocabulary
    log.info("Building dictionary")
    dictionary = Dictionary([[event] for event in event_counts.keys()])

    # Put all the co-occurrence counts into a big matrix
    log.info("Building counts matrix: vocab size %d" % len(dictionary))
    vectors = numpy.zeros((len(dictionary), len(dictionary)), dtype=numpy.float64)
    # Fill the matrix with raw counts
    for (event0, event1), count in pair_counts.items():
        if event0 in dictionary.token2id and event1 in dictionary.token2id:
            e0, e1 = dictionary.token2id[event0], dictionary.token2id[event1]
            vectors[e0, e1] = count
            # Add the count both ways (it's only stored once above)
            vectors[e1, e0] = count

    # Now there are many things we could do to these counts
    if opts.pmi or opts.ppmi:
        log.info("Applying %sPMI" % ("P" if opts.ppmi else ""))
        # Apply PMI to the matrix
        # Compute the total counts for each event (note row and col totals are the same)
        log_totals = numpy.ma.log(vectors.sum(axis=0))
        vectors = numpy.ma.log(vectors * vectors.sum()) - log_totals
        vectors = (vectors.T - log_totals).T
        vectors = vectors.filled(0.)
        if opts.ppmi:
            # Threshold the PMIs at zero
            vectors[vectors < 0.] = 0.
    # Convert to sparse for SVD and storage
    vectors = csr_matrix(vectors)

    if opts.svd:
        log.info("Fitting SVD with %d dimensions" % opts.svd)
        training_metadata["svd from"] = vectors.shape[1]
        training_metadata["svd"] = opts.svd
        vector_svd = TruncatedSVD(opts.svd)
        vectors = vector_svd.fit_transform(vectors)

    log.info("Saving model: %s" % model_name)
    model = DistributionalVectorsNarrativeChainModel(
        dictionary, vectors, only_verb=opts.only_verb,
        training_metadata=training_metadata, adjectives=opts.adj)
    model.save(model_name)
    return model
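# Stand-alone sketch of the PPMI + TruncatedSVD combination used in the training function above,
# on a tiny symmetric co-occurrence matrix; the counts and output dimensionality are invented.
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD

counts = np.array([[0., 8., 1., 0.],
                   [8., 0., 2., 1.],
                   [1., 2., 0., 6.],
                   [0., 1., 6., 0.]])

# PPMI: log(p(i, j) / (p(i) * p(j))), clipped at zero, with empty cells left at zero.
log_totals = np.ma.log(counts.sum(axis=0))
pmi = np.ma.log(counts * counts.sum()) - log_totals
pmi = (pmi.T - log_totals).T
ppmi = np.clip(pmi.filled(0.), 0., None)

vectors = TruncatedSVD(n_components=2, random_state=0).fit_transform(csr_matrix(ppmi))
print(vectors.shape)   # (4, 2): one dense vector per vocabulary item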
def __init__(self):
    self.components = 2
    self.svd = TruncatedSVD(n_components=self.components)
    self.reductCount = 0

    for file_name, data_set in [
            (RunRegression.REGRESSION_TRAINING_INPUT_FILE_NAME, FileIo.TRAINING_DATA_SET),
            (RunRegression.REGRESSION_TESTING_INPUT_FILE_NAME, FileIo.TEST_DATA_SET)]:

        # Check and see if the data has already been saved
        try:
            logging.info("RunRegression: Trying to load " + data_set + " data")
            saved_data = numpy.load(file_name, mmap_mode='r')

        # If the data is not found, load it
        except IOError:
            logging.info("RunRegression: Saved data not found. Generating " + data_set + " data")

            # Generate inputs
            poi_district_lookup = PoiDistrictLookup.PoiDistrictLookup()
            order_categorical_lookup = OrderCategoricalLookup.OrderCategoricalLookup(poi_district_lookup)
            regression_input = RegressionInput.RegressionInput(data_set, order_categorical_lookup,
                                                               poi_district_lookup)

            if data_set == FileIo.TRAINING_DATA_SET:
                self.training_order_start_end_districts_and_time, self.training_order_median_price, \
                    self.training_number_of_orders = regression_input.get_regression_inputs()

                # Save the data for next time
                numpy.savez(file_name,
                            order_keys=self.training_order_start_end_districts_and_time,
                            order_value_price=self.training_order_median_price,
                            order_value_number=self.training_number_of_orders)
            else:
                self.testing_order_start_end_districts_and_time, self.testing_order_median_price, \
                    self.testing_number_of_orders = regression_input.get_regression_inputs()

                # Save the data for next time
                numpy.savez(file_name,
                            order_keys=self.testing_order_start_end_districts_and_time,
                            order_value_price=self.testing_order_median_price,
                            order_value_number=self.testing_number_of_orders)

        # If the saved data is found, load it
        else:
            logging.info("RunRegression: Loading " + data_set + " data")

            if data_set == FileIo.TRAINING_DATA_SET:
                self.training_order_start_end_districts_and_time, self.training_order_median_price, \
                    self.training_number_of_orders = saved_data['order_keys'], \
                    saved_data['order_value_price'], \
                    saved_data['order_value_number']
                self.dimensions = self.training_order_start_end_districts_and_time.shape[1]
                self.initial = self.training_order_start_end_districts_and_time
                logging.info("RunRegression: Loaded " +
                             str(len(self.training_number_of_orders)) + " train data rows")
            else:
                self.testing_order_start_end_districts_and_time, self.testing_order_median_price, \
                    self.testing_number_of_orders = saved_data['order_keys'], \
                    saved_data['order_value_price'], \
                    saved_data['order_value_number']
                self.initialTesting = self.testing_order_start_end_districts_and_time
                logging.info("RunRegression: Loaded " +
                             str(len(self.testing_number_of_orders)) + " test data rows")
def build_accesson(options):
    ngroups, ncell_cut = int(options.ngroup), int(options.ncell)
    if options.format == 'mtx':
        reads = scipy.io.mmread(options.s + '/matrix/filtered_reads.mtx')
        reads = scipy.sparse.csr_matrix(reads)
        cells = pandas.read_csv(options.s + '/matrix/filtered_cells.csv', sep='\t',
                                index_col=0, engine='c', na_filter=False, low_memory=False)
        cells = cells.index.values
        peaks = ['peak' + str(x) for x in range(0, reads.shape[1])]
        if len(cells) > ncell_cut:
            normal = copy.deepcopy(reads)
            normal.data = numpy.ones(len(normal.data))
            index_sampled = random.sample(numpy.arange(0, len(cells), 1, dtype=int), ncell_cut)
            normal2 = normal[index_sampled, :]
        else:
            reads = numpy.array(reads.todense())
            normal = numpy.array([x * 10000.0 / x.sum() for x in reads])
            normal = numpy.log2(normal + 1)
            normal2 = normal
    else:
        reads_df = pandas.read_csv(options.s + '/matrix/filtered_reads.csv', sep=',',
                                   index_col=0, engine='c', na_filter=False, low_memory=False)
        cells, peaks = reads_df.index.values, reads_df.columns.values
        reads = reads_df.values
        normal = numpy.array([x * 10000.0 / x.sum() for x in reads])
        normal = numpy.log2(normal + 1)
        normal2 = normal
        normal_df = pandas.DataFrame(normal2, index=reads_df.index, columns=reads_df.columns)
        normal_df.to_csv(options.s + '/matrix/normal_reads.csv', sep=',')

    npc = min(int(options.npc), normal2.shape[0], normal2.shape[1])
    if len(cells) > ncell_cut:
        pca_result = TruncatedSVD(n_components=npc).fit_transform(normal2.T)
    else:
        pca_result = PCA(n_components=npc, svd_solver='full').fit_transform(normal2.T)
    connectivity = kneighbors_graph(pca_result, n_neighbors=10, include_self=False)
    connectivity = 0.5 * (connectivity + connectivity.T)
    ward_linkage = cluster.AgglomerativeClustering(n_clusters=ngroups, linkage='ward',
                                                   connectivity=connectivity)
    ward_linkage.fit(pca_result)
    y_predict = ward_linkage.labels_.astype(int)

    peak_labels_df = pandas.DataFrame(y_predict, index=peaks, columns=['group'])
    peak_labels_df.to_csv(options.s + '/matrix/Accesson_peaks.csv', sep='\t')
    groups = list(set(y_predict))
    coAccess_matrix = numpy.array([
        normal[:, numpy.where(y_predict == x)[0]].sum(axis=1) for x in groups]).T
    if (options.format == 'mtx') & (len(cells) > ncell_cut):
        coAccess_df = pandas.DataFrame(coAccess_matrix[0], index=cells, columns=groups)
    else:
        coAccess_df = pandas.DataFrame(coAccess_matrix, index=cells, columns=groups)
    coAccess_df.to_csv(options.s + '/matrix/Accesson_reads.csv', sep=',')
    return
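# Condensed toy version of the dimensionality-reduction + Ward clustering step above, run on a
# random sparse matrix; the sizes, neighbour count and number of groups are placeholders.
import numpy as np
from scipy.sparse import random as sparse_random
from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import kneighbors_graph

reads = sparse_random(500, 200, density=0.05, random_state=0, format='csr')
pca_result = TruncatedSVD(n_components=20, random_state=0).fit_transform(reads)
connectivity = kneighbors_graph(pca_result, n_neighbors=10, include_self=False)
connectivity = 0.5 * (connectivity + connectivity.T)
ward = AgglomerativeClustering(n_clusters=5, linkage='ward', connectivity=connectivity)
labels = ward.fit_predict(pca_result)
print(np.bincount(labels))   # cluster sizes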
names=header, engine='python')

# Number of users and articles in the current set
print('Number of unique users in current data-set', active_time_data.user_id.unique().shape[0])
print('Number of unique articles in current data-set', active_time_data.item_id.unique().shape[0])

# SVD lets us view the input matrix as a product of three smaller matrices: U, Z and V.
# In short this will help us discover concepts from the original input matrix
# (subsets of users that like subsets of items).
# Note that the use of SVD is not strictly restricted to user-item matrices.
# https://www.youtube.com/watch?v=P5mlg91as1c
algorithm = TruncatedSVD()

# Finally we run our cross-validation in n folds, where n is given by the cv parameter.
# Verbosity can be adjusted with the integer passed as verbose.
# We pass in our SVD algorithm as the estimator used to fit the data; X is the data set we want to fit.
# Since the SVD algorithm is the estimator, we must either define our own estimator or simply
# define how to score the fit.
# Because we currently rate each user's enjoyment of an article in a binary way (see the
# rate_article fn in the filter script), precision and recall can be based on whether or not the
# prediction exactly matches the binary rating field in the test set.
# Thus, the F1 scoring metric seems an intuitive choice for measuring success, as it provides a
# balanced score based on the two.
cv(estimator=algorithm, X=active_time_data, scoring='f1', cv=5, verbose=True)
def __init__(self, feature_size=10, regressor=None):
    self.feature_size = feature_size
    self.user_svd = TruncatedSVD(n_components=feature_size)
    self.item_svd = TruncatedSVD(n_components=feature_size)
    if regressor is None:
        self.regressor = LinearRegression()
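# A guess at how the two SVDs above might be used: factor a (user x item) rating matrix into
# user features (rows) and item features (rows of the transposed matrix), then concatenate the
# two latent vectors as the regressor's input. The matrix and sizes are toy values.
import numpy as np
from sklearn.decomposition import TruncatedSVD

rng = np.random.RandomState(0)
ratings = rng.randint(0, 6, size=(30, 12)).astype(float)   # 30 users x 12 items

user_svd = TruncatedSVD(n_components=4, random_state=0)
item_svd = TruncatedSVD(n_components=4, random_state=0)
user_features = user_svd.fit_transform(ratings)            # one row per user
item_features = item_svd.fit_transform(ratings.T)          # one row per item

# Feature vector for the pair (user u, item i): concatenate the two latent representations.
u, i = 3, 7
x = np.concatenate([user_features[u], item_features[i]])
print(x.shape)                                              # (8,)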
def buildModel(self):
    tfidfModel = TfidfVectorizer().fit_transform(self.corpus)
    lsa = TruncatedSVD(n_components=200)
    self.Model = lsa.fit_transform(tfidfModel)
    self.Model = Normalizer(copy=False).fit_transform(self.Model)
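# A compact, self-contained variant of the LSA pipeline above on a toy corpus; the corpus and
# component count are invented, and make_pipeline simply bundles the same three steps
# (TF-IDF -> TruncatedSVD -> length normalisation) so a query can be projected the same way.
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

corpus = [
    "the cat sat on the mat",
    "dogs and cats are pets",
    "stock markets fell sharply today",
    "investors sold shares as markets dropped",
]
lsa = make_pipeline(TfidfVectorizer(),
                    TruncatedSVD(n_components=2, random_state=0),
                    Normalizer(copy=False))
doc_vectors = lsa.fit_transform(corpus)
query_vector = lsa.transform(["cats and dogs"])
print(cosine_similarity(query_vector, doc_vectors))   # the pet documents should score highest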
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=False)

from sklearn.model_selection import train_test_split, cross_val_score

df = pd.read_csv('/path/file.csv', header=0, sep=',',
                 names=['SentenceId', 'Sentence', 'Sentiment'])

reduced_data = tfidf_vect.fit_transform(df['Sentence'].values)
y = df['Sentiment'].values

from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=5)
reduced_data = svd.fit_transform(reduced_data)

X_train, X_test, y_train, y_test = train_test_split(reduced_data, y, test_size=0.33, random_state=42)

from sklearn.ensemble import RandomForestClassifier
# It stalled with 1000000; try with more parameters
classifier = RandomForestClassifier(n_estimators=100)
classifier.fit(X_train, y_train)
prediction = classifier.predict(X_test)
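# Roughly the same flow, but bundled into a single Pipeline so the TF-IDF and SVD steps are
# refit inside each cross-validation fold instead of once on the full data. The CSV path and
# column names are the placeholders from the snippet above.
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline

df = pd.read_csv('/path/file.csv', header=0, sep=',',
                 names=['SentenceId', 'Sentence', 'Sentiment'])
model = make_pipeline(
    TfidfVectorizer(use_idf=True, smooth_idf=True, sublinear_tf=False),
    TruncatedSVD(n_components=5),
    RandomForestClassifier(n_estimators=100),
)
print(cross_val_score(model, df['Sentence'].values, df['Sentiment'].values, cv=5))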
("ravel", Ravel()), ('tfid_vect', TfidfVectorizer(max_df= 0.743, min_df=0.036, ngram_range=(1,4),\ strip_accents='ascii', analyzer= "word", stop_words='english', norm = "l1", use_idf = True)) ]) # des_rescu_pipe = Pipeline([ # ('sel_num', DataFrameSelector(["Description", "RescuerID"], ravel = True)), # add rescuer to description # ('rm_nan', FnanToStr()), # ('tfid_vect', TfidfVectorizer(max_df= 0.743, min_df=0.036, ngram_range=(1,4),\ # strip_accents='ascii', analyzer= "word", stop_words='english', use_idf = True, norm = None)) # ]) des_pipe_svd = Pipeline([ ('des_pipe', des_pipe), ('SVD', TruncatedSVD(n_components=20) ) #ValueError: n_components must be < n_features; got 140 >= 124 ]) des_pipe_for_svd = replace_step( des_pipe, "tfid_vect", ("tfid_vect", TfidfVectorizer(max_df= 0.95, min_df=0.005, ngram_range=(1,4),\ strip_accents='ascii', analyzer= "word", stop_words='english', norm = "l1", use_idf = True)) ) des_pipe_svd_v2 = Pipeline([('des_pipe_for_svd', des_pipe_for_svd), ('SVD', TruncatedSVD(n_components=20))]) des_pipe_svd_v3 = replace_step(des_pipe_svd_v2, "SVD", ('SVD', TruncatedSVD(n_components=100)))
# strip_accents='unicode': replace all accented unicode chars; use_idf=True: enable
# inverse-document-frequency reweighting; smooth_idf=True: prevents zero division for unseen words;
# max_features: if not None, build a vocabulary that only considers the top max_features terms
# ordered by term frequency across the corpus.
tfidf_vect = TfidfVectorizer(strip_accents='unicode', max_features=500, use_idf=True,
                             smooth_idf=True, sublinear_tf=False)
trainForVector = tfidf_vect.fit_transform(trainForVector)
num_features = len(tfidf_vect.get_feature_names())

# n_components: desired dimensionality of the output data; must be strictly less than the number of features.
# n_iter: number of iterations for the randomized SVD solver.
# random_state: if int, random_state is the seed used by the random number generator.
# pca = TruncatedSVD(n_components=num_features - 1, n_iter=7, random_state=42)
pca = TruncatedSVD(n_components=300, n_iter=7, random_state=42)
trainForVector = pca.fit_transform(trainForVector)

# train
i = 0
trainFeat = []
for pdf in trainF:
    v_all = list(pdf.getImgHistogram()) + list(trainForVector[i]) + list(pdf.getFeatVec())
    trainFeat.append(v_all)
    i += 1

# test
testFeat = []
for pdf in testF:
    v_all = list(pdf.getImgHistogram()) + list(trainForVector[i]) + list(pdf.getFeatVec())