def main(): A = load_npz("Atrain.npz") A_test = load_npz("Atest.npz") N, M = A.shape rbm = RBM(M, 50, 10) rbm.fit(A, A_test)
def main(): A = load_npz("Atrain.npz") A_test = load_npz("Atest.npz") mask = (A > 0) * 1.0 mask_test = (A_test > 0) * 1.0 N, M = A.shape rbm = RBM(M, 50, 10) rbm.fit(A, mask, A_test, mask_test)
def test_py23_compatibility():
    # Try loading files saved on Python 2 and Python 3.  They are not
    # the same, since files saved with Scipy versions < 1.0.0 may
    # contain unicode.
    a = load_npz(os.path.join(DATA_DIR, 'csc_py2.npz'))
    b = load_npz(os.path.join(DATA_DIR, 'csc_py3.npz'))
    c = csc_matrix([[0]])

    assert_equal(a.toarray(), c.toarray())
    assert_equal(b.toarray(), c.toarray())
def test_can_715(self):
    # this test is just to show the superiority of bicoloring vs. single coloring in
    # either direction.  Bicoloring gives only 21 colors in this case vs. 105 for either
    # fwd or rev.
    matdir = os.path.join(os.path.dirname(openmdao.test_suite.__file__), 'matrices')

    # uses matrix can_715 from the sparse matrix collection website
    mat = load_npz(os.path.join(matdir, 'can_715.npz')).toarray()
    mat = np.asarray(mat, dtype=bool)
    coloring = get_simul_meta(None, 'auto', include_sparsity=False, setup=False,
                              run_model=False, bool_jac=mat, stream=None)
    tot_size, tot_colors, fwd_solves, rev_solves, pct = _solves_info(coloring)

    self.assertEqual(tot_colors, 21)

    # verify that unidirectional colorings are much worse (105 vs 21 for bidirectional)
    coloring = get_simul_meta(None, 'fwd', include_sparsity=False, setup=False,
                              run_model=False, bool_jac=mat, stream=None)
    tot_size, tot_colors, fwd_solves, rev_solves, pct = _solves_info(coloring)

    self.assertEqual(tot_colors, 105)

    coloring = get_simul_meta(None, 'rev', include_sparsity=False, setup=False,
                              run_model=False, bool_jac=mat, stream=None)
    tot_size, tot_colors, fwd_solves, rev_solves, pct = _solves_info(coloring)

    self.assertEqual(tot_colors, 105)
def from_disk(self, file_path):
    file_name, ext = os.path.splitext(file_path)
    self.raw_data = load_npz(file_path)
    with open(file_name + ".voc", "rb") as vocab_file:
        self.vectorizer = pickle.load(vocab_file)
    self.identifiers = pd.read_pickle(file_name + ".pkl")
    self.load_features(self.vectorizer)
    self.load_data(self.raw_data)
def _save_and_load(matrix):
    fd, tmpfile = tempfile.mkstemp(suffix='.npz')
    os.close(fd)
    try:
        save_npz(tmpfile, matrix)
        loaded_matrix = load_npz(tmpfile)
    finally:
        os.remove(tmpfile)
    return loaded_matrix
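# Hedged usage sketch (not part of the original snippet): round-tripping a small
# csr_matrix through the helper above and checking the result is identical.
from scipy.sparse import csr_matrix

m = csr_matrix([[0, 1], [2, 0]])
assert (_save_and_load(m) != m).nnz == 0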
def main():
    args = get_args()
    V = np.load(args.v)
    U = np.load(args.u)
    item_map = np.array(load_json(args.item_map)) if args.item_map else None
    lookup_table = load_json(args.lookup_table) if args.lookup_table else None
    M = sum([load_npz(x).tocsr() for x in args.dataset])

    while True:
        visualise(U, V, item_map=item_map, lookup_table=lookup_table, M=M,
                  r=args.r, x_axis=args.x_axis, y_axis=args.y_axis,
                  return_file=False)
        plt.show()
def load_embedding(fname, format="word2vec_bin", normalize=True,
                   lower=False, clean_words=False, load_kwargs={}):
    """
    Loads embeddings from file

    Parameters
    ----------
    fname: string
      Path to file containing embedding

    format: string
      Format of the embedding. Possible values are:
      'word2vec_bin', 'word2vec', 'glove', 'dict', 'csr'

    normalize: bool, default: True
      If true will normalize all vectors to unit length

    clean_words: bool, default: True
      If true will only keep alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ascii characters

    load_kwargs:
      Additional parameters passed to load function. Mostly useful for 'glove'
      format where you should pass vocab_size and dim.
    """
    assert format in ['word2vec_bin', 'word2vec', 'glove', 'dict', 'csr'], "Unrecognized format"
    if format == "word2vec_bin":
        w = Embedding.from_word2vec(fname, binary=True)
    elif format == "word2vec":
        w = Embedding.from_word2vec(fname, binary=False)
    elif format == "glove":
        w = Embedding.from_glove(fname, **load_kwargs)
    elif format == "dict":
        d = pickle.load(open(fname + '.npy', "rb"), encoding='latin1')
        w = Embedding.from_dict(d)
    elif format == "csr":
        d = sparse.load_npz(fname + '.npz')
        filepath = fname.replace('csr', 'keys') + '.npy'
        with open(filepath, 'rb') as handle:
            keys = pickle.load(handle)
        w = Embedding.from_dict(d, keys)
    if normalize:
        w.normalize_words(inplace=True)
    if lower or clean_words:
        w.standardize_words(lower=lower, clean_words=clean_words, inplace=True)
    return w
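# Hedged usage sketch (the file name is a placeholder, not from the original
# source): the 'csr' branch above expects '<fname>.npz' with the sparse vectors
# and a pickled key list at '<fname with "csr" replaced by "keys">.npy'.
w = load_embedding("embeddings_csr", format="csr", normalize=True, lower=True)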
def test_from_csc162x162():
    from siconos.numerics import SBM_from_csparse, SBM_get_value, NM_display
    from scipy.sparse import csr_matrix, linalg
    try:
        from scipy.sparse import load_npz
    except ImportError:
        return 0

    M = load_npz(os.path.join(working_dir, 'data/csc162x162.npz'))
    #M = load_npz('data/csc162x162.npz')
    blocksize = 9
    r, SBM = SBM_from_csparse(blocksize, M)

    assert SBM_get_value(SBM, 0, 0) == M[0, 0]
    assert SBM_get_value(SBM, 161, 161) == M[161, 161]
def main():
    args = get_args()
    user_map = load_json(args.user_map) if args.user_map else None
    user_idx = user_map.get(args.user_id, -1) if args.user_map else int_or_neg(args.user_id)
    item_map = np.array(load_json(args.item_map)) if args.item_map else None
    lookup_table = load_json(args.lookup_table) if args.lookup_table else None

    if user_idx != -1 or not args.fallback:
        U = np.load(args.u)
        V = np.load(args.v)
        M = load_npz(args.dataset).tocsr() if args.dataset and user_idx > 0 else None
        fallback = None
    else:
        U, V, M = None, None, None
        fallback = np.array(load_json(args.fallback))

    print(predict(U, V, args.user_id, n_recs=args.n_recs, user_map=user_map,
                  item_map=item_map, lookup_table=lookup_table, M=M,
                  fallback=fallback))
def load_matrix(loc):
    if loc.startswith('gs://'):
        return load_npz(BytesIO(file_io.read_file_to_string(loc, binary_mode=True))).tocoo()
    return load_npz(loc).tocoo()
            k += 1
            if k % 10000 == 0:
                print("%s/%s" % (k, num_tokens))

            start = max(0, i - context_size)
            end = min(len(line_as_idx), i + context_size)
            for c in line_as_idx[start:i]:
                wc_counts[w, c] += 1
            for c in line_as_idx[i+1:end]:
                wc_counts[w, c] += 1
    print("Finished counting")

    save_npz('pmi_counts_%s.npz' % V, csr_matrix(wc_counts))
else:
    wc_counts = load_npz('pmi_counts_%s.npz' % V)

# context counts get raised ^ 0.75
c_counts = wc_counts.sum(axis=0).A.flatten() ** 0.75
c_probs = c_counts / c_counts.sum()
c_probs = c_probs.reshape(1, V)

# PMI(w, c) = #(w, c) / #(w) / p(c)
# pmi = wc_counts / wc_counts.sum(axis=1) / c_probs  # works only if numpy arrays
pmi = wc_counts.multiply(1.0 / wc_counts.sum(axis=1) / c_probs).tocsr()
# this operation changes it to a coo_matrix
# which doesn't have functions we need, e.g log1p()
# so convert it back to a csr
print("type(pmi):", type(pmi))
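# Hedged sketch (an assumption, not part of the original snippet): a common next
# step is to take an elementwise log of the PMI values and keep only positive
# PMI. `pmi` is the csr_matrix built above; `np` is assumed to be numpy.
logX = pmi.copy()
logX.data = np.log(logX.data)     # log-PMI on the stored (nonzero) entries
logX.data[logX.data < 0] = 0.0    # positive PMI only
logX.eliminate_zeros()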
def _save_and_load(matrix):
    with tempfile.NamedTemporaryFile(suffix='.npz') as file:
        file = file.name
        save_npz(file, matrix)
        loaded_matrix = load_npz(file)
    return loaded_matrix
                cur_thetas[j]) + (1 - x[0, j]) * math.log(1 - cur_thetas[j])
        likelihood = feature_likelihood + math.log(self.class_probs[cls])
        return likelihood

    def predict_y(self, x):
        likelihoods = dict()
        for cls in self.thetas:
            likelihoods[cls] = self.get_class_likelihood(x, cls)
        max_likelihood = -math.inf
        best_cls = None
        for cls in likelihoods:
            if likelihoods[cls] > max_likelihood:
                best_cls = cls
                max_likelihood = likelihoods[cls]
        return best_cls


# Testing
model = SparseNaiveBayes()
x = sparse.load_npz('../data/xtrainbin.npz')
x_test = sparse.load_npz('../data/xtestbin.npz')
print(x.shape)
y = np.load('../data/y_train.npy')
model.fit(x, y)
print('Fitting Complete.')
pred = model.predict(x_test)
np.save('../results/mynb.csv', pred)
import json

import numpy as np
import pandas as pd
from src import constants
from scipy.sparse import load_npz
from sklearn.preprocessing import Normalizer

# directory path
data_directory = constants.DATA_DIR
clean_directory = constants.CLEAN_DIR
imp_matrix_filename = constants.normalized_imp_matrix_filename

# load presaved matrix and switch back to dense matrix
img_imp_matrix_sparse = load_npz(f'{data_directory}/{imp_matrix_filename}')
img_imp_matrix = img_imp_matrix_sparse.todense()

# save filename
tag_idx_filename = constants.tag_idx_filename
img_idx_filename = constants.img_idx_filename

# load index dictionaries
with open(f'{data_directory}/{tag_idx_filename}', 'r') as tag_idx_file, \
        open(f'{data_directory}/{img_idx_filename}', 'r') as img_idx_file:
    tag_idx_dict = json.load(tag_idx_file)
    img_idx_dict = json.load(img_idx_file)


class T2I:
    "class function for tag-to-image recommendation"

    def __init__(self, art_id, ranked_tags, ranked_imp):
        self.idx = art_id                # article id
        self.ranked_tags = ranked_tags   # tags used in recommendation search
        cleaned = pipeline(row['title'] + ' ' + row['abstract'])
        text.append(cleaned)

    print("Building vectors")
    hv = HashingVectorizer(n_features=2**10)
    hv.fit(text)
    X = hv.transform(text)

    print("Saving")
    save_npz(VECTORS_F, X)
    pickle.dump(hv, open(MODEL_F, 'wb+'))
    return X, hv


# Only build everything once if necessary
if os.path.exists(VECTORS_F) and os.path.exists(MODEL_F):
    X = load_npz(VECTORS_F)
    hv = pickle.load(open(MODEL_F, 'rb'))
else:
    vectors, hv = build_vectors()


# Return top 10 nearest neighbors to search term
def query(s, n=25):
    v = hv.transform([pipeline(s)])
    knn = cosine_similarity(X, v)

    # Return top 10 indexes
    return knn[:, 0].argsort()[-n:][::-1]
def get_urm_train_2():
    global _urm_train_2
    if _urm_train_2 is None:
        _urm_train_2 = load_npz(_URM_TRAIN_PATH_2)
    return _urm_train_2
def load_numpy(path, name):
    return load_npz(path + name).tocsr()
def build_input_pipeline(data_dir, batch_size, random_state,
                         counts_transformation="nothing"):
    """Load data and build iterator for minibatches.

    Args:
      data_dir: The directory where the data is located. There must be four
        files inside the rep: `counts.npz`, `author_indices.npy`,
        `author_map.txt`, and `vocabulary.txt`.
      batch_size: The batch size to use for training.
      random_state: A NumPy `RandomState` object, used to shuffle the data.
      counts_transformation: A string indicating how to transform the counts.
        One of "nothing", "binary", "log", or "sqrt".
    """
    counts = sparse.load_npz(os.path.join(data_dir, "counts.npz"))
    num_documents, num_words = counts.shape
    author_indices = np.load(
        os.path.join(data_dir, "author_indices.npy")).astype(np.int32)
    num_authors = np.max(author_indices + 1)
    author_map = np.loadtxt(os.path.join(data_dir, "author_map.txt"),
                            dtype=str, delimiter="\n", encoding='latin-1')

    # Shuffle data.
    documents = random_state.permutation(num_documents)
    shuffled_author_indices = author_indices[documents]
    shuffled_counts = counts[documents]

    # Apply counts transformation.
    if counts_transformation == "nothing":
        count_values = shuffled_counts.data
    elif counts_transformation == "binary":
        count_values = np.int32(shuffled_counts.data > 0)
    elif counts_transformation == "log":
        count_values = np.round(np.log(1 + shuffled_counts.data))
    elif counts_transformation == "sqrt":
        count_values = np.round(np.sqrt(shuffled_counts.data))
    else:
        raise ValueError("Unrecognized counts transformation.")

    # Store counts as sparse tensor so it occupies less memory.
    shuffled_counts = tf.SparseTensor(
        indices=np.array(shuffled_counts.nonzero()).T,
        values=count_values,
        dense_shape=shuffled_counts.shape)

    dataset = tf.data.Dataset.from_tensor_slices(
        (documents, shuffled_counts, shuffled_author_indices))
    batches = dataset.repeat().batch(batch_size).prefetch(batch_size)
    iterator = batches.make_one_shot_iterator()

    vocabulary = np.loadtxt(os.path.join(data_dir, "vocabulary.txt"),
                            dtype=str, delimiter="\n", comments="<!-")

    total_counts_per_author = np.bincount(
        author_indices,
        weights=np.array(np.sum(counts, axis=1)).flatten())
    counts_per_document_per_author = (
        total_counts_per_author / np.bincount(author_indices))
    # Author weights measure how much lengthier each author's opinions are than average.
    author_weights = (counts_per_document_per_author /
                      np.mean(np.sum(counts, axis=1))).astype(np.float32)

    return (iterator, author_weights, vocabulary, author_map, num_documents,
            num_words, num_authors)
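# Hedged usage sketch (an assumption, not from the original file; the data
# directory is a placeholder): consuming the one-shot iterator in a TF1-style
# session.
random_state = np.random.RandomState(0)
(iterator, author_weights, vocabulary, author_map,
 num_documents, num_words, num_authors) = build_input_pipeline(
    "data/", batch_size=512, random_state=random_state)
document_batch, count_batch, author_batch = iterator.get_next()
with tf.Session() as sess:
    docs, counts, authors = sess.run([document_batch, count_batch, author_batch])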
from scipy.sparse import load_npz, save_npz
from NDOCD.NDOCD import NDOCD
import numpy as np
from NDOCD.load_data import write_communities_to_file, get_communities_list2, get_amazon_graph
from measures.mutual_information import normalized_mutual_information
from measures.link_belong_modularity import cal_modularity, get_graph_info
import time
from measures.modularity import convert_communities_to_dict, get_modularity

# graph = get_amazon_graph()
# save_npz("data/amazon/graph.npz", graph)
graph = load_npz("data/amazon/graph.npz")

start = time.time()
ndocd = NDOCD(graph, modification=True, modification_type="percent", modification_percent=0.2)
ndocd.JS_threshold = 0.3
ndocd.MD_threshold = 0.3
coms = ndocd.find_all_communities()
end = time.time()

bigger_than = 6
file = "data/amazon/coms"
write_communities_to_file(
    [com for com in coms if len(list(com.indices)) > bigger_than], file)

nmi = normalized_mutual_information(file, "data/amazon/communities")

coms2 = [
    list(com.indices) for com in coms if len(list(com.indices)) > bigger_than
run_tokenization = False
if (os.path.exists(train_tok_path.format("data") + ".npy")
        and os.path.exists(train_tok_path.format("target") + ".npy")
        and os.path.exists(val_tok_path.format("data") + ".npy")
        and os.path.exists(val_tok_path.format("target") + ".npy")
        and os.path.exists(test_tok_path.format("data") + ".npy")
        and os.path.exists(test_tok_path.format("target") + ".npy")) or (
            os.path.exists(train_tok_path.format("data") + ".npz")
            and os.path.exists(train_tok_path.format("target") + ".npy")
            and os.path.exists(val_tok_path.format("data") + ".npz")
            and os.path.exists(val_tok_path.format("target") + ".npy")
            and os.path.exists(test_tok_path.format("data") + ".npz")
            and os.path.exists(test_tok_path.format("target") + ".npy")):
    if "bow-" in train_tok_path or "tfidf-" in train_tok_path:
        train_data = sparse.load_npz(
            train_tok_path.format("data") + ".npz")
    else:
        train_data = np.load(train_tok_path.format("data") + ".npy",
                             allow_pickle=True)
    train_target = np.load(train_tok_path.format("target") + ".npy",
                           allow_pickle=True)

    if "bow-" in val_tok_path or "tfidf-" in val_tok_path:
        val_data = sparse.load_npz(val_tok_path.format("data") + ".npz")
    else:
        val_data = np.load(val_tok_path.format("data") + ".npy",
                           allow_pickle=True)
    val_target = np.load(val_tok_path.format("target") + ".npy",
                         allow_pickle=True)

    if "bow-" in test_tok_path or "tfidf-" in test_tok_path:
        test_data = sparse.load_npz(test_tok_path.format("data") + ".npz")
    else:
def train_sampler(self, train_data): features = train_data[1] batch_size = 512 if features is not None: features = np.vstack([features, np.zeros((features.shape[1], ))]) node_size = len(features) node_dim = len(features[0]) # build model # input (features of vertex and its neighbor, label) x1_ph = tf.compat.v1.placeholder(shape=[batch_size, node_dim], dtype=tf.float32) x2_ph = tf.compat.v1.placeholder(shape=[batch_size, node_dim], dtype=tf.float32) y_ph = tf.compat.v1.placeholder(shape=[batch_size], dtype=tf.float32) with tf.compat.v1.variable_scope("MLsampler"): if self.nonlinear_sampler is True: print("Non-linear regression sampler used") l = tf.compat.v1.layers.dense( tf.concat([x1_ph, x2_ph], axis=1), 1, activation=None, trainable=True, kernel_initializer=tf.compat.v1.keras.initializers. VarianceScaling(scale=1.0, mode="fan_avg", distribution="uniform"), name='dense') out = tf.nn.relu(tf.exp(l), name='relu') else: print("Linear regression sampler used") l = tf.compat.v1.layers.dense( x1_ph, node_dim, activation=None, trainable=True, kernel_initializer=tf.compat.v1.keras.initializers. VarianceScaling(scale=1.0, mode="fan_avg", distribution="uniform"), name='dense') l = tf.matmul(l, x2_ph, transpose_b=True, name='matmul') out = tf.nn.relu(l, name='relu') loss = tf.nn.l2_loss(out - y_ph, name='loss') / batch_size optimizer = tf.compat.v1.train.AdamOptimizer( learning_rate=self.learning_rate, name='Adam').minimize(loss) init = tf.compat.v1.global_variables_initializer() # configuration config = tf.compat.v1.ConfigProto( log_device_placement=self.log_device_placement) config.gpu_options.allow_growth = True config.allow_soft_placement = True # load data loss_node_path = self._loss_node_path("Uniform") loss_node = sparse.load_npz(loss_node_path + 'loss_node.npz') loss_node_count = sparse.load_npz(loss_node_path + 'loss_node_count.npz') idx_nz = sparse.find(loss_node_count) # due to out of memory, select randomly limited number of data node vertex = features[idx_nz[0]] neighbor = features[idx_nz[1]] count = idx_nz[2] y = np.divide(sparse.find(loss_node)[2], count) # partition train/validation data vertex_tr = vertex[:-batch_size] neighbor_tr = neighbor[:-batch_size] y_tr = y[:-batch_size] vertex_val = vertex[-batch_size:] neighbor_val = neighbor[-batch_size:] y_val = y[-batch_size:] iter_size = int(vertex_tr.shape[0] / batch_size) # initialize session sess = tf.compat.v1.Session(config=config) # summary tf.compat.v1.summary.scalar('loss', loss) merged_summary_op = tf.compat.v1.summary.merge_all() # summary_writer = tf.compat.v1.summary.FileWriter( # self._sampler_log_dir(), sess.graph) # save model saver = tf.compat.v1.train.Saver() model_path = self._sampler_model_path() if not os.path.exists(model_path): os.makedirs(model_path) # init variables sess.run(init) # train total_steps = 0 avg_time = 0.0 validation_losses = [] for epoch in range(self.epochs): # shuffle perm = np.random.permutation(vertex_tr.shape[0]) validation_loss_epoch = [] print("Epoch: %04d" % (epoch)) for iter in range(iter_size): # allocate batch vtr = vertex_tr[perm[iter * batch_size:(iter + 1) * batch_size]] ntr = neighbor_tr[perm[iter * batch_size:(iter + 1) * batch_size]] ytr = y_tr[perm[iter * batch_size:(iter + 1) * batch_size]] t = time.time() outs = sess.run([loss, optimizer, merged_summary_op], feed_dict={ x1_ph: vtr, x2_ph: ntr, y_ph: ytr }) train_loss = outs[0] # validation if iter % self.validate_iter == 0: outs = sess.run([loss, optimizer, merged_summary_op], feed_dict={ x1_ph: vertex_val, x2_ph: neighbor_val, 
y_ph: y_val }) val_loss = outs[0] validation_loss_epoch.append(val_loss) avg_time = (avg_time * total_steps + time.time() - t) / (total_steps + 1) # print if total_steps % self.print_every == 0: print("Iter:", "%04d" % iter, "train_loss=", "{:.5f}".format(train_loss), "val_loss=", "{:.5f}".format(val_loss)) total_steps += 1 if total_steps > self.max_total_steps: break validation_losses.append( sum(validation_loss_epoch) / len(validation_loss_epoch)) if validation_losses[-1] == min(validation_losses): print( "Minimum validation loss so far ({}) at epoch {}.".format( validation_losses[-1], epoch)) # save_model save_path = saver.save(sess, model_path + 'model.ckpt') sess.close() tf.compat.v1.reset_default_graph()
def loadRedditFromNPZ(dataset_dir):
    adj = sp.load_npz(dataset_dir + "reddit_adj.npz")
    data = np.load(dataset_dir + "reddit.npz")

    return (adj, data['feats'], data['y_train'], data['y_val'], data['y_test'],
            data['train_index'], data['val_index'], data['test_index'])
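# Hedged usage sketch (the directory path is a placeholder, not from the original):
adj, feats, y_train, y_val, y_test, train_index, val_index, test_index = \
    loadRedditFromNPZ("data/reddit/")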
    populationScale = np.array([214, 1287, (1046 + 194) / 2, 194, 251])
    populationScale = populationScale / populationScale.sum() * 5
else:  # North Sea
    indices = ravel(np.array([53.0, 52.5, 53.5, 53.0, 53.0]),
                    np.array([3.0, 3.0, 3.0, 2.5, 3.5]))
    populationScale = np.array([1, 1, 1, 1, 1])

# rename target area(s) from createrawmatrix for brevity
lon = np.array(M.lons[-1])
lat = np.array(M.lats[-1])

transitMatrix = [[] for x in range(M.nt)]
for x in range(M.nt):
    transitMatrix[x] = sparse.load_npz(os.path.join(
        inputFolder, 'hybridtransitmatrix_' + M.dir + '_month_' + str(x) + '.npz'))

for s in range(len(scaling)):
    if beaching:
        # load beaching data corresponding to set model and determine the
        # chance of particle tracer ending on shore
        beachdata = np.genfromtxt(os.path.join(
            'beachingCoastlines',
            beachingmodel + '_coast_dx_' + str((M.dd[-1] * 100).astype(int)) +
            '_region_' + region + '.csv'), delimiter=',', skip_header=1)
        beachdata = np.transpose(beachdata)
        beach = 1 - beachdata[3] * scaling[s]  # chance of tracer ending on shore
        if beach.size != M.nc[-1]:
            print("Resolution of beaching grid and transition matrix don't match")
            sys.exit()

    for i in indices:
        if landpoints[i] != 0:
            print("Point " + str(i) + " is a landpoint. Lon/lat = " +
                  str(np.unravel_index(i, (M.nx[-1], M.ny[-1]), order='F')))
def get_urm_test_2():
    global _urm_test_2
    if _urm_test_2 is None:
        _urm_test_2 = load_npz(_URM_TEST_PATH_2)
    return _urm_test_2
        continue
    if statement == 'Done':
        break
    t = feature_transformer(statement)
    prediction = clf.predict_proba(t)
    if prediction[0][1] > 0.9:  # margin of confidence
        print 'Extreme bullying detected!'
    elif prediction[0][1] > 0.7:
        print 'Serious bullying detected!'
    elif prediction[0][1] > 0.6:
        print 'Some bullying detected. Please moderate your language.'
    prevState = t

# Re-update SVM
labels = list(pickle.load(open('master_labels.txt', 'rb')))
mat = load_npz('master_convo.npz')
mat = vstack([mat, convStore])
mat = hstack([mat, csr_matrix((mat.shape[0], num_new_feats))])  # adds columns for new features
labels = labels + labelStore
clf = svm.NuSVC(.05, probability=True)
print 'Number of features added to the list: ' + str(num_new_feats)
print 'Fitting new model...'
clf.fit(mat, labels)
print 'Done'

# Redump SVM objects
joblib.dump(clf, 'model.pkl')
save_npz('master_convo.npz', mat)  # for updating during simulation
with open('master_labels.txt', 'wb') as f:
def get_urm_train_explicit():
    global _urm_train_explicit
    if _urm_train_explicit is None:
        _urm_train_explicit = load_npz(_URM_TRAIN_EXPLICIT_PATH)
    return _urm_train_explicit
""" take a model, predictor matrix and paramater grid and return the optimal paramater set """ _gsearch = GridSearchCV(xgb_model, xgb_param_grid, scoring='roc_auc', n_jobs=4, iid=False, cv=3) _gsearch.fit(x_vals, y_vals) return _gsearch.best_params_ if __name__ == '__main__': #load in the processed data from train_and_test_to_matrix.py train_sparse = sparse.load_npz('sparse_train_punc.npz') test_sparse = sparse.load_npz('sparse_test_punc.npz') train = pd.read_csv('train.csv') test = pd.read_csv('test.csv') sub_file = pd.read_csv('sample_submission.csv') to_predict = list(train.columns[2:]) for col in to_predict: xgtrain_input = xgb.DMatrix(train_sparse, label=train[col].values) xgb_initial = xgb.XGBClassifier(learning_rate =0.1, n_estimators=1000,
def get_urm_test_explicit():
    global _urm_test_explicit
    if _urm_test_explicit is None:
        _urm_test_explicit = load_npz(_URM_TEST_EXPLICIT_PATH)
    return _urm_test_explicit
from sklearn.utils import shuffle
from scipy.sparse import save_npz, load_npz

import keras.backend as K
from keras.models import Model
from keras.layers import Input, Dropout, Dense
from keras.regularizers import l2
from keras.optimizers import SGD

# config
batch_size = 128
epochs = 20
reg = 0.0001
# reg = 0

A = load_npz("Atrain.npz")
A_test = load_npz("Atest.npz")
mask = (A > 0) * 1.0
mask_test = (A_test > 0) * 1.0

# make copies since we will shuffle
A_copy = A.copy()
mask_copy = mask.copy()
A_test_copy = A_test.copy()
mask_test_copy = mask_test.copy()

N, M = A.shape
print("N:", N, "M:", M)
print("N // batch_size:", N // batch_size)

# center the data
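# Hedged continuation sketch (an assumption, not from the original file): one way
# to center sparse rating data is to subtract the mean of the observed entries,
# computed only over the nonzero positions marked by `mask`.
mu = A.sum() / mask.sum()
print("mu:", mu)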
def get_urm_sequential_masked():
    global _urm
    if _urm is None:
        _urm = load_npz(_URM_SEQUENTIAL_MASKED_PATH)
    return _urm
import torch
from torch.utils.data import Dataset, DataLoader
from torch import optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import numpy as np
from scipy import sparse
import sys

import params
from nn.mikolov_rnnlm import MikolovRNNLM

data = sparse.load_npz('./../../2013MikolovWV/data/mikolov_bi_sg_data.npz')
with open('./../../2013MikolovWV/data/vocabs.txt', 'r', encoding='utf-8') as f:
    vocabs = f.readlines()
vocabs = [word.strip() for word in vocabs]
params.rnnlm['v'] = len(vocabs)


class PrepareData(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        # default of torch: float32, default of np: float64 (double)
        return (torch.as_tensor(self.X[idx].toarray()).float(),
                torch.as_tensor(self.y[idx].toarray()).view(1, 1).float())
def get_urm_train_sequential_masked():
    global _urm_train_1
    if _urm_train_1 is None:
        _urm_train_1 = load_npz(_URM_TRAIN_SEQUENTIAL_MASKED_PATH)
    return _urm_train_1
def redis_get_helper(key, npz=False):
    if npz:
        return load_npz(BytesIO(r.get(key)))
    return np.load(BytesIO(r.get(key)))
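# Hedged counterpart sketch (an assumption, not from the original): storing a
# sparse matrix under a redis key so that redis_get_helper(key, npz=True) above
# can read it back. `r` is the same redis client; save_npz is assumed to be
# imported from scipy.sparse.
def redis_set_helper(key, matrix):
    buf = BytesIO()
    save_npz(buf, matrix)
    r.set(key, buf.getvalue())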
def get_urm_test_sequential_masked():
    global _urm_test_1
    if _urm_test_1 is None:
        _urm_test_1 = load_npz(_URM_TEST_SEQUENTIAL_MASKED_PATH)
    return _urm_test_1
def get_icm():
    global _icm
    if _icm is None:
        _icm = load_npz(_ICM_PATH)
    return _icm
import scipy.sparse as sps
import sys
from utils.evaluator import Evaluator
from utils.datareader import Datareader
from utils.post_processing import eurm_to_recommendation_list
from utils.ensembler import ensembler
import numpy as np
import os.path

dr = Datareader(verbose=False, mode="offline", only_load="False")
cat = 3

a = sps.load_npz("../offline/offline-cbf_item_album-cat" + str(cat) + ".npz")
b = sps.load_npz("../offline/offline-cbf_item_artist-cat" + str(cat) + ".npz")
c = sps.load_npz("../offline/nlp_eurm_offline_bm25-cat" + str(cat) + ".npz")
d = sps.load_npz("../offline/offline-rp3beta-cat" + str(cat) + ".npz")
e = sps.load_npz("../offline/offline-cfuser-cat" + str(cat) + ".npz")
f = sps.load_npz("../offline/slim_bpr_completo_test1-cat" + str(cat) + ".npz")
g = sps.load_npz("../offline/eurm_cbfu_artists_offline-cat" + str(cat) + ".npz")

matrix = [a, b, c, d, e, f, g]

a = float(sys.argv[1])
b = float(sys.argv[2])
c = float(sys.argv[3])
d = float(sys.argv[4])
e = float(sys.argv[5])
f = float(sys.argv[6])
g = float(sys.argv[7])
def labels(self):
    labels = sparse.load_npz(self.labels_npz)
    labels = labels.toarray().ravel()
    labels[labels != 1] = -1
    return labels
import scipy.sparse as sps
from scripts.scikit_ensemble.scikit_ensamble import Optimizer
from utils.definitions import *
from utils.datareader import Datareader

cat = 8
matrix = list()

from utils.definitions import load_obj

name = load_obj("name")
directory = ROOT_DIR + "/scripts/scikit_ensemble/offline/"
matrix_dict = load_obj("matrix_dict", path="")

m = list()
for n in name[cat - 1]:
    m.append(sps.load_npz(directory + matrix_dict[n]))
matrix.append(m)

dr = Datareader(verbose=False, mode="offline", only_load="False")

opt = Optimizer(matrices_array=matrix[0], matrices_names=name[cat - 1],
                dr=dr, cat=cat, start=0, end=1)
del matrix

opt.run()
rxn_id_file = "/data/hanyu/preprocessed_data/separate/fullset_rearrange_test_chiral/rxn_ids.pickle" pfp_mtx_file = "/data/hanyu/preprocessed_data/separate/fullset_rearrange_test_chiral/pfp_mtx.npz" rfp_mtx_file = "/data/hanyu/preprocessed_data/separate/fullset_rearrange_test_chiral/rfp_mtx.npz" rgt_1_mtx_file = "/data/hanyu/preprocessed_data/separate/fullset_rearrange_test_chiral/rgt_1_mtx.npz" rgt_2_mtx_file = "/data/hanyu/preprocessed_data/separate/fullset_rearrange_test_chiral/rgt_2_mtx.npz" slv_1_mtx_file = "/data/hanyu/preprocessed_data/separate/fullset_rearrange_test_chiral/slv_1_mtx.npz" slv_2_mtx_file = "/data/hanyu/preprocessed_data/separate/fullset_rearrange_test_chiral/slv_2_mtx.npz" cat_1_mtx_file = "/data/hanyu/preprocessed_data/separate/fullset_rearrange_test_chiral/cat_1_mtx.npz" rgt_file = "/data/hanyu/preprocessed_data/separate/fullset_rearrange_test_chiral/rgts.pickle" slv_file = "/data/hanyu/preprocessed_data/separate/fullset_rearrange_test_chiral/slvs.pickle" cat_file = "/data/hanyu/preprocessed_data/separate/fullset_rearrange_test_chiral/cats.pickle" temp_file = "/data/hanyu/preprocessed_data/separate/fullset_rearrange_test_chiral/temps.pickle" yd_file = "/data/hanyu/preprocessed_data/separate/fullset_rearrange_test_chiral/yds.pickle" pfp_csrmtx = sparse.load_npz(pfp_mtx_file) rfp_csrmtx = sparse.load_npz(rfp_mtx_file) # context_csrmtx = sparse.load_npz(context_mtx_file) rgt_1_mtx = sparse.load_npz(rgt_1_mtx_file) rgt_2_mtx = sparse.load_npz(rgt_2_mtx_file) slv_1_mtx = sparse.load_npz(slv_1_mtx_file) slv_2_mtx = sparse.load_npz(slv_2_mtx_file) cat_1_mtx = sparse.load_npz(cat_1_mtx_file) with open(temp_file,"r") as T_L_F: temp_list = pickle.load(T_L_F) with open(rxn_id_file,"r") as RID: rxn_id_list = pickle.load(RID)
def get_urm_train_1():
    global _urm_train_1
    if _urm_train_1 is None:
        _urm_train_1 = load_npz(_URM_TRAIN_PATH_1)
    return _urm_train_1
    print('{} maxidx'.format(maxidx))

elif mode == 'test_csr':
    if len(sys.argv) != 3:
        print('USAGE: python {} test_csr logid'.format(sys.argv[0]))
        sys.exit(1)
    id = sys.argv[2]

    w = 'meaner'
    filepath_keys = 'keys_{}_final.npy'.format(id)
    with open('./images/fp_{}/{}'.format(id, filepath_keys), 'rb') as f:
        keys = pickle.load(f)
    filepath_csr = 'csr_{}_final.npz'.format(id)
    csr = sparse.load_npz('./images/fp_{}/{}'.format(id, filepath_csr))

    index = keys.index(w)
    word_csr1 = csr[index]

elif mode == 'join_csr':
    if len(sys.argv) != 3:
        print('USAGE: python {} join_csr logid'.format(sys.argv[0]))
        sys.exit(1)
    id = sys.argv[2]

    """
    filepath_keys = 'keys_{}_datasets.npy'.format(id)
    with open('./images/fp_{}/{}'.format(id, filepath_keys), 'rb') as f:
        keys = pickle.load(f)
def get_urm_test_1():
    global _urm_test_1
    if _urm_test_1 is None:
        _urm_test_1 = load_npz(_URM_TEST_PATH_1)
    return _urm_test_1
def load_proximity(ds, radius):
    logger.info("Loading proximity matrix...")
    fname = os.path.join(ds.a.data_path,
                         "proximity_radius_%s_%s.npz" % (str(radius), ds.a.brain_mask))
    A = load_npz(fname)
    return A.tolil()
def featureSelectionTraining(training=False):
    threshold = 0.01
    np.set_printoptions(suppress=True)

    # ----------------- READ PREPROCESSING FILE -----------------------
    VECT_SEL_FOLDER = "ml_core/vector_selection/training/"
    VECT_FOLDER = "ml_core/vector/training/"
    VECT_TEMPLATE = "ml_core/template/tfidf_sparse_template.npz"
    TEMPLATE_FOLDER = "ml_core/template/"
    FEATURE_CONFIG = "feature_template.json"
    LABEL_PATH = "ml_core/data/training/Preprocessed_Dataset_Training.csv"

    TWEET_DATA = pd.read_csv(LABEL_PATH, usecols=["label"])
    tags = TWEET_DATA.label

    # ----------------------- LOAD SPARSE MATRIX -------------------------
    FileName = "tfidf_sparse_training.npz"
    tfidf_mat = sparse.load_npz(VECT_FOLDER + FileName).toarray()

    json_feature = "tfidf_feature_training.json"
    features = readJson_config(VECT_FOLDER + "features/", json_feature, 'feature')[0]

    tfidf_mat_selection = None
    features_template = None
    tfidf_mat_template = None
    selected_idx = []

    now = datetime.now()
    dt_string = now.strftime("%d%m%Y_%H%M%S")

    # ---------------------------- TRAINING -----------------------------
    mi = mutual_info_classif(tfidf_mat, tags)
    norm_mi = mi / np.max(mi)
    column_idx = [i for i, mi_item in enumerate(norm_mi) if mi_item < 0.01]
    tfidf_mat_selection = np.delete(tfidf_mat, column_idx, 1)

    # template data
    selected_idx = [j for j in range(len(norm_mi)) if j not in column_idx]
    selected_features = []
    for idx in selected_idx:
        selected_features.append(features[idx])

    tfidf_mat_template = [0.0] * len(selected_features)
    features_template = selected_features

    # -------------------------------- SAVE -----------------------------------
    # Save template
    tfidf_sparse_template = sparse.csr_matrix(tfidf_mat_template)
    sparse.save_npz(VECT_TEMPLATE, tfidf_sparse_template)

    feature_dict = {}
    feature_dict['feature'] = features_template
    writeJson_config(TEMPLATE_FOLDER, FEATURE_CONFIG, feature_dict, append=False)

    # save training data
    tfidf_sparse = sparse.csr_matrix(tfidf_mat_selection)
    sparse.save_npz(VECT_SEL_FOLDER + "tfidf_selection_sparse_training.npz", tfidf_sparse)
    writeJson_config(VECT_SEL_FOLDER + "features/", "tfidf_feature_training.json",
                     features_template, append=False)

    return 'success'
def get_ucm_train():
    global _ucm_train
    if _ucm_train is None:
        _ucm_train = load_npz(_UCM_TRAIN_PATH)
    return _ucm_train
from NDOCD.Community import Community
from NDOCD.NDOCD import NDOCD
from NDOCD.load_data import random_graph, get_actors_graph
from scipy.sparse import save_npz, load_npz
import numpy as np

np.random.seed(123)
graph = random_graph(size=150, edges=1200)
ndocd = NDOCD(graph)
# ndocd.neighbours_edges
# ndocd.compute_neighbours_edges()
# com.get_graph().toarray()
# graph.toarray()
# ndocd.graph.toarray()

# graph = get_actors_graph()
# save_npz("data/actors_graph.npz", graph)
graph = load_npz("data/actors_graph.npz")
# np.save("data/actors_neighbours.npy", ndocd.neighbours_edges)
ndocd = NDOCD(graph, np.load("data/actors_neighbours.npy"))
# graph.toarray()

com = ndocd.initialize_new_community()
com.get_graph()[com.get_vertex_indices()].toarray()[:, com.get_vertex_indices()]

ndocd.JS_threshold = 0.18
ndocd.MD_threshold = 0.18
com = ndocd.initialize_new_community()
com = ndocd.algorithm_step2(com)
print(np.sum(com.vertices.toarray()))
# com = ndocd.create_new_community()
def get_urm():
    global _urm
    if _urm is None:
        _urm = load_npz(_URM_PATH)
    return _urm
def featurize_dataset(self, dataset: Dataset): logger.info( f"Loading dataset {dataset.key} and {self.split.key} split") data = dataset.load_x() for required_field in ['product', 'substrates']: if required_field not in data: raise NotImplementedError( f"Need to have field '{required_field} in the dataset") split = self.split.load(dataset.dir) feat_dir = self.dir(dataset.feat_dir) metadata = dataset.load_metadata() reaction_type_given = False if 'reaction_type_id' in metadata: rtypes = metadata['reaction_type_id'].values ntypes = len(np.unique(rtypes)) logger.info(f'Found {ntypes} unique reaction types in the dataset') reaction_type_given = True data['reaction_type'] = rtypes if not os.path.exists(feat_dir): os.makedirs(feat_dir) if 'max_n_nodes' in dataset.meta_info: max_n_nodes = dataset.meta_info['max_n_nodes'] else: max_n_nodes = 1024 logger.info("Max. number of nodes: {}".format(max_n_nodes)) # we do not featurize test set for training all_inds = np.argwhere(split['test'] == 0).flatten() # shuffle indices for featurization in multiple threads np.random.shuffle(all_inds) data_len = len(data) samples_len = data_len * self.max_n_steps chunk_size = int(len(all_inds) / self.n_jobs) chunk_ends = [chunk_size * i for i in range(self.n_jobs + 1)] chunk_ends[-1] = len(all_inds) chunk_inds = [ all_inds[chunk_ends[i]:chunk_ends[i + 1]] for i in range(len(chunk_ends) - 1) ] logger.info(f'Finding all possible values of atom and bond properties ' f'on {len(all_inds)} reactions using {self.n_jobs} chunks') parallel_args = [] for i, ch_inds in enumerate(chunk_inds): new_x = dict((k, x.values[ch_inds]) for k, x in data.items()) parallel_args.append((i, new_x, tqdm)) prop_dict = {'atom': {}, 'bond': {}} if self.n_jobs == 1: chunk_results = [find_properties_parallel(parallel_args[0])] else: pool = Pool(self.n_jobs) chunk_results = pool.imap(find_properties_parallel, parallel_args) for chunk_prop_dict in chunk_results: for type_key in prop_dict.keys(): for key, values in chunk_prop_dict[type_key].items(): if key not in prop_dict[type_key]: prop_dict[type_key][key] = set() prop_dict[type_key][key].update(values) # add some 'special' atom/bond feature values prop_dict['atom']['is_supernode'].update([0, 1]) prop_dict['atom']['is_edited'].update([0, 1]) prop_dict['atom']['is_reactant'].update([0, 1]) prop_dict['bond']['bond_type'].update(['supernode', 'self']) prop_dict['bond']['is_edited'].update([0, 1]) atom_feat_counts = ', '.join([ '{:s}: {:d}'.format(key, len(values)) for key, values in prop_dict['atom'].items() ]) logger.info(f'Found atom features: {atom_feat_counts}') bond_feat_counts = ', '.join([ '{:s}: {:d}'.format(key, len(values)) for key, values in prop_dict['bond'].items() ]) logger.info(f'Found bond features: {bond_feat_counts}') # make a dictionary for conversion of atom/bond features to OH numbers prop2oh = {'atom': {}, 'bond': {}} props = {'atom': {}, 'bond': {}} for type_key, prop_values in prop_dict.items(): for prop_key, values in prop_values.items(): sorted_vals = list( sorted(values, key=lambda x: x if isinstance(x, int) else 0)) props[type_key][prop_key] = sorted_vals oh = dict((k, i + 1) for i, k in enumerate(sorted_vals)) prop2oh[type_key][prop_key] = oh # save 'prop2oh' dictionary with open(get_prop2oh_vocab_path(feat_dir), 'w') as fp: json.dump( { 'atom': props['atom'], 'bond': props['bond'], 'atom_2oh': prop2oh['atom'], 'bond_2oh': prop2oh['bond'] }, fp, indent=2) atom_feature_keys = [ k for k in ORDERED_ATOM_OH_KEYS if k in prop2oh['atom'] ] bond_feature_keys = [ k for k in 
ORDERED_BOND_OH_KEYS if k in prop2oh['bond'] ] action_vocab = { 'prop2oh': prop2oh, 'atom_feature_keys': atom_feature_keys, 'bond_feature_keys': bond_feature_keys, 'atom_feat_ind': dict( (k, i) for i, k in enumerate(atom_feature_keys)), 'bond_feat_ind': dict( (k, i) for i, k in enumerate(bond_feature_keys)) } parallel_args = [] chunk_save_paths = [] for i, ch_inds in enumerate(chunk_inds): new_x = dict((k, x.values[ch_inds]) for k, x in data.items()) is_train = split['train'][ch_inds].values chunk_save_path = os.path.join(feat_dir, f'chunk_result_{i}') chunk_save_paths.append(chunk_save_path) parallel_args.append( (i, samples_len, ch_inds, new_x, max_n_nodes, tqdm, self.max_n_steps, is_train, reaction_type_given, self.forward, self.action_order, action_vocab, chunk_save_path)) logger.info( f'Featurizing {len(all_inds)} reactions with {self.n_jobs} threads' ) logger.info(f"Number of generated paths (train+valid): {data_len}") logger.info( f"Upper bound for number of generated samples: {samples_len} ({data_len} * {self.max_n_steps})" ) if self.n_jobs == 1: chunk_results = [featurize_parallel(parallel_args[0])] else: # leave one job for merging results pool = Pool(max(self.n_jobs - 1, 1)) chunk_results = pool.imap(featurize_parallel, parallel_args) logger.info(f"Merging featurized data from {self.n_jobs} chunks") nodes_mat = sparse.csr_matrix(([], ([], [])), shape=(samples_len, max_n_nodes)) adj_mat = sparse.csr_matrix(([], ([], [])), shape=(samples_len, max_n_nodes**2)) n_sample_data = 6 if reaction_type_given else 5 sample_data_mat = sparse.csr_matrix(([], ([], [])), shape=(samples_len, n_sample_data)) meta = [] # vocabulary of actions actions_vocab = [] action2ind = {} action_inds = [] action_tuples = [] sample_inds = [] for ch_inds, result_code, chunk_save_path in tqdm( zip(chunk_inds, chunk_results, chunk_save_paths), desc='merging reactions from chunks', total=self.n_jobs): sample_data_path = os.path.join(chunk_save_path, 'sample_data.npz') sample_data_mat += sparse.load_npz(sample_data_path) nodes_mat_path = os.path.join(chunk_save_path, 'nodes_mat.npz') nodes_mat += sparse.load_npz(nodes_mat_path) adj_mat_path = os.path.join(chunk_save_path, 'adj_mat.npz') adj_mat += sparse.load_npz(adj_mat_path) meta_save_path = os.path.join(chunk_save_path, 'metadata.csv') chunk_meta = pd.read_csv(meta_save_path) meta.append(chunk_meta) actions_save_path = os.path.join(chunk_save_path, 'actions.txt') chunk_action_tuples = [] for line in open(actions_save_path, 'r'): action = eval(line.strip()) chunk_action_tuples.append(action) for sample_ind, action in chunk_action_tuples: if action in action2ind: action_inds.append(action2ind[action]) else: action_ind = len(actions_vocab) action2ind[action] = action_ind actions_vocab.append(action) action_tuples.append(action) action_inds.append(action_ind) sample_inds.append(sample_ind) # remove temporary chunk files shutil.rmtree(chunk_save_path) logger.info( f"Merged chunk {len(meta)} (unparsed samples: {result_code}/{len(ch_inds)})" ) logger.info("Concatenating metadata") meta = pd.concat(meta) logger.info("Saving found actions") sample_data_mat[sample_inds, 0] = action_inds with open(get_actions_vocab_path(feat_dir), 'w') as fp: json.dump(action_tuples, fp) logger.info(f"Found {len(action_tuples)} reaction actions") n_samples = meta['n_samples'] logger.info( f"Number of steps: max: {np.max(n_samples)}, avg: {np.mean(n_samples)}" ) logger.info("Saving featurized data") meta.to_csv(get_metadata_path(feat_dir)) sparse.save_npz(get_sample_data_path(feat_dir), 
sample_data_mat) sparse.save_npz(get_nodes_path(feat_dir), nodes_mat) sparse.save_npz(get_adj_path(feat_dir), adj_mat) n_saved_reacs = len(np.unique(meta['reaction_ind'])) logger.info( f"Saved {n_saved_reacs}/{len(all_inds)} reactions ({n_saved_reacs / len(all_inds) * 100}%)" ) logger.info( f"Saved {len(meta)} paths (avg. {len(meta) / n_saved_reacs} paths per reaction)" ) logger.info("Saving featurization metadata") meta_info = { 'description': 'Graph representation of molecules with discrete node and edge features for MEGAN', 'features': ['atom', 'bond'], 'features_type': ['atom', 'bond'], 'max_n_nodes': max_n_nodes, 'format': 'sparse' } meta_path = self.meta_info_path(dataset.feat_dir) with open(meta_path, 'w') as fp: json.dump(meta_info, fp, indent=2)