def main():
    A = load_npz("Atrain.npz")
    A_test = load_npz("Atest.npz")

    N, M = A.shape
    rbm = RBM(M, 50, 10)
    rbm.fit(A, A_test)
def main():
    A = load_npz("Atrain.npz")
    A_test = load_npz("Atest.npz")
    mask = (A > 0) * 1.0
    mask_test = (A_test > 0) * 1.0

    N, M = A.shape
    rbm = RBM(M, 50, 10)
    rbm.fit(A, mask, A_test, mask_test)
Example #3
def test_py23_compatibility():
    # Try loading files saved on Python 2 and Python 3.  They are not
    # the same, since files saved with Scipy versions < 1.0.0 may
    # contain unicode.

    a = load_npz(os.path.join(DATA_DIR, 'csc_py2.npz'))
    b = load_npz(os.path.join(DATA_DIR, 'csc_py3.npz'))
    c = csc_matrix([[0]])

    assert_equal(a.toarray(), c.toarray())
    assert_equal(b.toarray(), c.toarray())
Example #4
    def test_can_715(self):
        # this test is just to show the superiority of bicoloring vs. single coloring in
        # either direction.  Bicoloring gives only 21 colors in this case vs. 105 for either
        # fwd or rev.
        matdir = os.path.join(os.path.dirname(openmdao.test_suite.__file__), 'matrices')

        # uses matrix can_715 from the sparse matrix collection website
        mat = load_npz(os.path.join(matdir, 'can_715.npz')).toarray()
        mat = np.asarray(mat, dtype=bool)
        coloring = get_simul_meta(None, 'auto', include_sparsity=False, setup=False,
                                  run_model=False, bool_jac=mat,
                                  stream=None)

        tot_size, tot_colors, fwd_solves, rev_solves, pct = _solves_info(coloring)

        self.assertEqual(tot_colors, 21)

        # verify that unidirectional colorings are much worse (105 vs 21 for bidirectional)
        coloring = get_simul_meta(None, 'fwd', include_sparsity=False, setup=False,
                                  run_model=False, bool_jac=mat,
                                  stream=None)

        tot_size, tot_colors, fwd_solves, rev_solves, pct = _solves_info(coloring)

        self.assertEqual(tot_colors, 105)

        coloring = get_simul_meta(None, 'rev', include_sparsity=False, setup=False,
                                  run_model=False, bool_jac=mat,
                                  stream=None)

        tot_size, tot_colors, fwd_solves, rev_solves, pct = _solves_info(coloring)

        self.assertEqual(tot_colors, 105)
Example #5
    def from_disk(self, file_path):
        file_name, ext = os.path.splitext(file_path)
        self.raw_data = load_npz(file_path)
        with open(file_name + ".voc", "rb") as vocab_file:
            self.vectorizer = pickle.load(vocab_file)
        self.identifiers = pd.read_pickle(file_name + ".pkl")
        self.load_features(self.vectorizer)
        self.load_data(self.raw_data)
Example #6
def _save_and_load(matrix):
    fd, tmpfile = tempfile.mkstemp(suffix='.npz')
    os.close(fd)
    try:
        save_npz(tmpfile, matrix)
        loaded_matrix = load_npz(tmpfile)
    finally:
        os.remove(tmpfile)
    return loaded_matrix
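A hedged usage sketch of this round-trip helper (the imports and the tiny matrix below are illustrative assumptions, not part of the original test file):

import numpy as np
from scipy.sparse import csr_matrix

m = csr_matrix(np.array([[0, 1, 0],
                         [2, 0, 3]]))
loaded = _save_and_load(m)
assert (loaded != m).nnz == 0          # round trip preserves every stored entry
assert isinstance(loaded, csr_matrix)  # save_npz/load_npz keep the sparse format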
def main():
	args = get_args()

	V = np.load(args.v)
	U = np.load(args.u)
	item_map = np.array(load_json(args.item_map)) if args.item_map else None
	lookup_table = load_json(args.lookup_table) if args.lookup_table else None
	M = sum([load_npz(x).tocsr() for x in args.dataset])

	while True:
		visualise(U, V, item_map = item_map, lookup_table = lookup_table, M = M, r = args.r, x_axis = args.x_axis, y_axis = args.y_axis, return_file = False)
		plt.show()
Example #8
def load_embedding(fname, format="word2vec_bin", normalize=True,
                   lower=False, clean_words=False, load_kwargs={}):
    """
    Loads embeddings from file

    Parameters
    ----------
    fname: string
      Path to file containing embedding

    format: string
      Format of the embedding. Possible values are:
      'word2vec_bin', 'word2vec', 'glove', 'dict', 'csr'

    normalize: bool, default: True
      If True, normalizes all vectors to unit length

    lower: bool, default: False
      If True, lowercases all words

    clean_words: bool, default: False
      If True, keeps only alphanumeric characters and "_", "-"
      Warning: shouldn't be applied to embeddings with non-ASCII characters

    load_kwargs:
      Additional parameters passed to the load function. Mostly useful for the 'glove' format,
      where you should pass vocab_size and dim.
    """
    assert format in ['word2vec_bin', 'word2vec', 'glove', 'dict', 'csr'], "Unrecognized format"
    if format == "word2vec_bin":
        w = Embedding.from_word2vec(fname, binary=True)
    elif format == "word2vec":
        w = Embedding.from_word2vec(fname, binary=False)
    elif format == "glove":
        w = Embedding.from_glove(fname, **load_kwargs)
    elif format == "dict":
        d = pickle.load(open(fname+'.npy', "rb"), encoding='latin1')
        w = Embedding.from_dict(d)
    elif format == "csr":
        d = sparse.load_npz(fname+'.npz')
        filepath = fname.replace('csr', 'keys')+'.npy'
        with open(filepath, 'rb') as handle:
            keys = pickle.load(handle)
        w = Embedding.from_dict(d, keys)
    if normalize:
        w.normalize_words(inplace=True)
    if lower or clean_words:
        w.standardize_words(lower=lower, clean_words=clean_words, inplace=True)
    return w
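A hedged usage sketch of this loader; the file names and sizes are placeholders, and the load_kwargs values follow the docstring's note for the 'glove' format:

# Hypothetical calls to load_embedding (paths and sizes are placeholders).
w_glove = load_embedding("glove.6B.300d.txt", format="glove",
                         normalize=True, lower=True,
                         load_kwargs={"vocab_size": 400000, "dim": 300})
w_w2v = load_embedding("GoogleNews-vectors-negative300.bin",
                       format="word2vec_bin", normalize=True)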
Example #9
def test_from_csc162x162():

    from siconos.numerics import SBM_from_csparse, SBM_get_value, NM_display
    from scipy.sparse import csr_matrix, linalg
    try:
        from scipy.sparse import load_npz
    except ImportError:
        return 0

    M = load_npz(os.path.join(working_dir, 'data/csc162x162.npz'))
    # M = load_npz('data/csc162x162.npz')

    blocksize = 9
    r, SBM = SBM_from_csparse(blocksize, M)
    assert SBM_get_value(SBM, 0, 0) == M[0, 0]
    assert SBM_get_value(SBM, 161, 161) == M[161, 161]
Example #10
def main():
    args = get_args()

    user_map = load_json(args.user_map) if args.user_map else None
    user_idx = user_map.get(args.user_id, -1) if args.user_map else int_or_neg(args.user_id)
    item_map = np.array(load_json(args.item_map)) if args.item_map else None
    lookup_table = load_json(args.lookup_table) if args.lookup_table else None

    if user_idx != -1 or not args.fallback:
        U = np.load(args.u)
        V = np.load(args.v)
        M = load_npz(args.dataset).tocsr() if args.dataset and user_idx > 0 else None
        fallback = None
    else:
        U, V, M = None, None, None
        fallback = np.array(load_json(args.fallback))

    print(predict(U, V, args.user_id, n_recs = args.n_recs, user_map = user_map, item_map = item_map, lookup_table = lookup_table, M = M, fallback = fallback))
Example #11
def load_matrix(loc):
	if loc.startswith('gs://'):
		return load_npz(BytesIO(file_io.read_file_to_string(loc, binary_mode=True))).tocoo()
	return load_npz(loc).tocoo()
Example #12
          k += 1
          if k % 10000 == 0:
            print("%s/%s" % (k, num_tokens))

          start = max(0, i - context_size)
          end   = min(len(line_as_idx), i + context_size)
          for c in line_as_idx[start:i]:
            wc_counts[w, c] += 1
          for c in line_as_idx[i+1:end]:
            wc_counts[w, c] += 1
  print("Finished counting")

  save_npz('pmi_counts_%s.npz' % V, csr_matrix(wc_counts))

else:
  wc_counts = load_npz('pmi_counts_%s.npz' % V)


# context counts get raised ^ 0.75
c_counts = wc_counts.sum(axis=0).A.flatten() ** 0.75
c_probs = c_counts / c_counts.sum()
c_probs = c_probs.reshape(1, V)


# PMI(w, c) = #(w, c) / #(w) / p(c)
# pmi = wc_counts / wc_counts.sum(axis=1) / c_probs # works only if numpy arrays
pmi = wc_counts.multiply(1.0 / wc_counts.sum(axis=1) / c_probs).tocsr()
# this operation changes it to a coo_matrix
# which doesn't have functions we need, e.g log1p()
# so convert it back to a csr
print("type(pmi):", type(pmi))
Example #13
def _save_and_load(matrix):        
    with tempfile.NamedTemporaryFile(suffix='.npz') as file:
        file = file.name
        save_npz(file, matrix)
        loaded_matrix = load_npz(file)
    return loaded_matrix
Example #14
                cur_thetas[j]) + (1 - x[0, j]) * math.log(1 - cur_thetas[j])
        likelihood = feature_likelihood + math.log(self.class_probs[cls])
        return likelihood

    def predict_y(self, x):
        likelihoods = dict()
        for cls in self.thetas:
            likelihoods[cls] = self.get_class_likelihood(x, cls)
        max_likelihood = -math.inf
        best_cls = None
        for cls in likelihoods:
            if likelihoods[cls] > max_likelihood:
                best_cls = cls
                max_likelihood = likelihoods[cls]
        return best_cls


# Testing
model = SparseNaiveBayes()
x = sparse.load_npz('../data/xtrainbin.npz')
x_test = sparse.load_npz('../data/xtestbin.npz')
print(x.shape)
y = np.load('../data/y_train.npy')

model.fit(x, y)

print('Fitting Complete.')

pred = model.predict(x_test)
np.save('../results/mynb.csv', pred)
Example #15
import json
import numpy as np
import pandas as pd
from src import constants
from scipy.sparse import load_npz
from sklearn.preprocessing import Normalizer

#directory path
data_directory = constants.DATA_DIR
clean_directory = constants.CLEAN_DIR

imp_matrix_filename = constants.normalized_imp_matrix_filename
#load presaved matrix and switch back to dense matrix
img_imp_matrix_sparse = load_npz(f'{data_directory}/{imp_matrix_filename}')
img_imp_matrix = img_imp_matrix_sparse.todense()
#save filename
tag_idx_filename = constants.tag_idx_filename
img_idx_filename = constants.img_idx_filename
#load index dictionaries
with open(f'{data_directory}/{tag_idx_filename}', 'r') as tag_idx_file, \
     open(f'{data_directory}/{img_idx_filename}', 'r') as img_idx_file:
    tag_idx_dict = json.load(tag_idx_file)
    img_idx_dict = json.load(img_idx_file)


class T2I:
    "class function for tag-to-image recommendation"

    def __init__(self, art_id, ranked_tags, ranked_imp):
        self.idx = art_id  #article id
        self.ranked_tags = ranked_tags  #tags used in recommendation search
Example #16
        cleaned = pipeline(row['title'] + ' ' + row['abstract'])
        text.append(cleaned)

    print("Building vectors")
    hv = HashingVectorizer(n_features=2**10)
    hv.fit(text)
    X = hv.transform(text)

    print("Saving")
    save_npz(VECTORS_F, X)
    pickle.dump(hv, open(MODEL_F, 'wb+'))

    return X, hv


# Only build everything once if necessary
if os.path.exists(VECTORS_F) and os.path.exists(MODEL_F):
    X = load_npz(VECTORS_F)
    hv = pickle.load(open(MODEL_F, 'rb'))
else:
    X, hv = build_vectors()


# Return the top-n nearest neighbors to a search term (n=25 by default)
def query(s, n=25):
    v = hv.transform([pipeline(s)])
    knn = cosine_similarity(X, v)

    # Return the indexes of the n most similar documents, most similar first
    return knn[:, 0].argsort()[-n:][::-1]
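A hedged usage sketch (the search string is only illustrative):

# Indices of the 5 abstracts most similar to the query string.
top_idx = query("sparse matrix factorization", n=5)
print(top_idx)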
Example #17
def get_urm_train_2():
    global _urm_train_2
    if _urm_train_2 is None:
        _urm_train_2 = load_npz(_URM_TRAIN_PATH_2)
    return _urm_train_2
Example #18
def load_numpy(path, name):
    return load_npz(path+name).tocsr()
Example #19
def build_input_pipeline(data_dir, 
                         batch_size, 
                         random_state, 
                         counts_transformation="nothing"):
  """Load data and build iterator for minibatches.
  
  Args:
    data_dir: The directory where the data is located. There must be four
      files inside this directory: `counts.npz`, `author_indices.npy`,
      `author_map.txt`, and `vocabulary.txt`.
    batch_size: The batch size to use for training.
    random_state: A NumPy `RandomState` object, used to shuffle the data.
    counts_transformation: A string indicating how to transform the counts.
      One of "nothing", "binary", "log", or "sqrt".
  """
  counts = sparse.load_npz(os.path.join(data_dir, "counts.npz"))
  num_documents, num_words = counts.shape
  author_indices = np.load(
      os.path.join(data_dir, "author_indices.npy")).astype(np.int32) 
  num_authors = np.max(author_indices + 1)
  author_map = np.loadtxt(os.path.join(data_dir, "author_map.txt"),
                          dtype=str, 
                          delimiter="\n",
                          encoding='latin-1')
  # Shuffle data.
  documents = random_state.permutation(num_documents)
  shuffled_author_indices = author_indices[documents]
  shuffled_counts = counts[documents]
  
  # Apply counts transformation.
  if counts_transformation == "nothing":
    count_values = shuffled_counts.data
  elif counts_transformation == "binary":
    count_values = np.int32(shuffled_counts.data > 0)
  elif counts_transformation == "log":
    count_values = np.round(np.log(1 + shuffled_counts.data))
  elif counts_transformation == "sqrt":
    count_values = np.round(np.sqrt(shuffled_counts.data))
  else:
    raise ValueError("Unrecognized counts transformation.")
  
  # Store counts as sparse tensor so it occupies less memory.
  shuffled_counts = tf.SparseTensor(
      indices=np.array(shuffled_counts.nonzero()).T, 
      values=count_values,
      dense_shape=shuffled_counts.shape)
  dataset = tf.data.Dataset.from_tensor_slices(
      (documents, shuffled_counts, shuffled_author_indices))
  batches = dataset.repeat().batch(batch_size).prefetch(batch_size)
  iterator = batches.make_one_shot_iterator()
  vocabulary = np.loadtxt(os.path.join(data_dir, "vocabulary.txt"), 
                          dtype=str, 
                          delimiter="\n",
                          comments="<!-")

  total_counts_per_author = np.bincount(
      author_indices, 
      weights=np.array(np.sum(counts, axis=1)).flatten())
  counts_per_document_per_author = (
      total_counts_per_author / np.bincount(author_indices))
  # Author weights measure how much longer than average each author's documents are.
  author_weights = (counts_per_document_per_author / 
                    np.mean(np.sum(counts, axis=1))).astype(np.float32)
  return (iterator, author_weights, vocabulary, author_map,
          num_documents, num_words, num_authors)
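A hedged usage sketch of this pipeline builder (TF1-style graph execution; the directory path and batch size are placeholder assumptions):

random_state = np.random.RandomState(0)
(iterator, author_weights, vocabulary, author_map,
 num_documents, num_words, num_authors) = build_input_pipeline(
     "data/speeches", batch_size=512,
     random_state=random_state, counts_transformation="nothing")
# Each call to get_next() yields a (document ids, counts, author indices) batch.
document_batch, counts_batch, author_batch = iterator.get_next()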
Example #20
from scipy.sparse import load_npz, save_npz
from NDOCD.NDOCD import NDOCD
import numpy as np
from NDOCD.load_data import write_communities_to_file, get_communities_list2, get_amazon_graph
from measures.mutual_information import normalized_mutual_information
from measures.link_belong_modularity import cal_modularity, get_graph_info
import time
from measures.modularity import convert_communities_to_dict, get_modularity

# graph = get_amazon_graph()
# save_npz("data/amazon/graph.npz", graph)
graph = load_npz("data/amazon/graph.npz")
start = time.time()
ndocd = NDOCD(graph,
              modification=True,
              modification_type="percent",
              modification_percent=0.2)

ndocd.JS_threshold = 0.3
ndocd.MD_threshold = 0.3

coms = ndocd.find_all_communities()
end = time.time()

bigger_than = 6
file = "data/amazon/coms"
write_communities_to_file(
    [com for com in coms if len(list(com.indices)) > bigger_than], file)
nmi = normalized_mutual_information(file, "data/amazon/communities")
coms2 = [
    list(com.indices) for com in coms if len(list(com.indices)) > bigger_than
Example #21
    run_tokenization = False

    if (os.path.exists(train_tok_path.format("data") + ".npy")
            and os.path.exists(train_tok_path.format("target") + ".npy")
            and os.path.exists(val_tok_path.format("data") + ".npy")
            and os.path.exists(val_tok_path.format("target") + ".npy")
            and os.path.exists(test_tok_path.format("data") + ".npy")
            and os.path.exists(test_tok_path.format("target") + ".npy")) or (
                os.path.exists(train_tok_path.format("data") + ".npz")
                and os.path.exists(train_tok_path.format("target") + ".npy")
                and os.path.exists(val_tok_path.format("data") + ".npz")
                and os.path.exists(val_tok_path.format("target") + ".npy")
                and os.path.exists(test_tok_path.format("data") + ".npz")
                and os.path.exists(test_tok_path.format("target") + ".npy")):
        if "bow-" in train_tok_path or "tfidf-" in train_tok_path:
            train_data = sparse.load_npz(
                train_tok_path.format("data") + ".npz")
        else:
            train_data = np.load(train_tok_path.format("data") + ".npy",
                                 allow_pickle=True)
        train_target = np.load(train_tok_path.format("target") + ".npy",
                               allow_pickle=True)
        if "bow-" in val_tok_path or "tfidf-" in val_tok_path:
            val_data = sparse.load_npz(val_tok_path.format("data") + ".npz")
        else:
            val_data = np.load(val_tok_path.format("data") + ".npy",
                               allow_pickle=True)
        val_target = np.load(val_tok_path.format("target") + ".npy",
                             allow_pickle=True)
        if "bow-" in test_tok_path or "tfidf-" in test_tok_path:
            test_data = sparse.load_npz(test_tok_path.format("data") + ".npz")
        else:
Example #22
    def train_sampler(self, train_data):
        features = train_data[1]
        batch_size = 512

        if features is not None:
            features = np.vstack([features, np.zeros((features.shape[1], ))])

        node_size = len(features)
        node_dim = len(features[0])

        # build model
        # input (features of vertex and its neighbor, label)
        x1_ph = tf.compat.v1.placeholder(shape=[batch_size, node_dim],
                                         dtype=tf.float32)
        x2_ph = tf.compat.v1.placeholder(shape=[batch_size, node_dim],
                                         dtype=tf.float32)
        y_ph = tf.compat.v1.placeholder(shape=[batch_size], dtype=tf.float32)

        with tf.compat.v1.variable_scope("MLsampler"):
            if self.nonlinear_sampler is True:
                print("Non-linear regression sampler used")
                l = tf.compat.v1.layers.dense(
                    tf.concat([x1_ph, x2_ph], axis=1),
                    1,
                    activation=None,
                    trainable=True,
                    kernel_initializer=tf.compat.v1.keras.initializers.
                    VarianceScaling(scale=1.0,
                                    mode="fan_avg",
                                    distribution="uniform"),
                    name='dense')
                out = tf.nn.relu(tf.exp(l), name='relu')
            else:
                print("Linear regression sampler used")
                l = tf.compat.v1.layers.dense(
                    x1_ph,
                    node_dim,
                    activation=None,
                    trainable=True,
                    kernel_initializer=tf.compat.v1.keras.initializers.
                    VarianceScaling(scale=1.0,
                                    mode="fan_avg",
                                    distribution="uniform"),
                    name='dense')
                l = tf.matmul(l, x2_ph, transpose_b=True, name='matmul')
                out = tf.nn.relu(l, name='relu')

        loss = tf.nn.l2_loss(out - y_ph, name='loss') / batch_size
        optimizer = tf.compat.v1.train.AdamOptimizer(
            learning_rate=self.learning_rate, name='Adam').minimize(loss)
        init = tf.compat.v1.global_variables_initializer()

        # configuration
        config = tf.compat.v1.ConfigProto(
            log_device_placement=self.log_device_placement)
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True

        # load data
        loss_node_path = self._loss_node_path("Uniform")
        loss_node = sparse.load_npz(loss_node_path + 'loss_node.npz')
        loss_node_count = sparse.load_npz(loss_node_path +
                                          'loss_node_count.npz')

        idx_nz = sparse.find(loss_node_count)

        # due to memory limits, select only a limited number of data nodes at random
        vertex = features[idx_nz[0]]
        neighbor = features[idx_nz[1]]
        count = idx_nz[2]
        y = np.divide(sparse.find(loss_node)[2], count)

        # partition train/validation data
        vertex_tr = vertex[:-batch_size]
        neighbor_tr = neighbor[:-batch_size]
        y_tr = y[:-batch_size]

        vertex_val = vertex[-batch_size:]
        neighbor_val = neighbor[-batch_size:]
        y_val = y[-batch_size:]

        iter_size = int(vertex_tr.shape[0] / batch_size)

        # initialize session
        sess = tf.compat.v1.Session(config=config)

        # summary
        tf.compat.v1.summary.scalar('loss', loss)
        merged_summary_op = tf.compat.v1.summary.merge_all()
        #        summary_writer = tf.compat.v1.summary.FileWriter(
        #                self._sampler_log_dir(), sess.graph)

        # save model
        saver = tf.compat.v1.train.Saver()
        model_path = self._sampler_model_path()
        if not os.path.exists(model_path):
            os.makedirs(model_path)

        # init variables
        sess.run(init)

        # train
        total_steps = 0
        avg_time = 0.0
        validation_losses = []

        for epoch in range(self.epochs):
            # shuffle
            perm = np.random.permutation(vertex_tr.shape[0])
            validation_loss_epoch = []

            print("Epoch: %04d" % (epoch))
            for iter in range(iter_size):
                # allocate batch
                vtr = vertex_tr[perm[iter * batch_size:(iter + 1) *
                                     batch_size]]
                ntr = neighbor_tr[perm[iter * batch_size:(iter + 1) *
                                       batch_size]]
                ytr = y_tr[perm[iter * batch_size:(iter + 1) * batch_size]]

                t = time.time()
                outs = sess.run([loss, optimizer, merged_summary_op],
                                feed_dict={
                                    x1_ph: vtr,
                                    x2_ph: ntr,
                                    y_ph: ytr
                                })
                train_loss = outs[0]

                # validation
                if iter % self.validate_iter == 0:
                    outs = sess.run([loss, optimizer, merged_summary_op],
                                    feed_dict={
                                        x1_ph: vertex_val,
                                        x2_ph: neighbor_val,
                                        y_ph: y_val
                                    })
                    val_loss = outs[0]
                    validation_loss_epoch.append(val_loss)

                avg_time = (avg_time * total_steps + time.time() -
                            t) / (total_steps + 1)

                # print
                if total_steps % self.print_every == 0:
                    print("Iter:", "%04d" % iter, "train_loss=",
                          "{:.5f}".format(train_loss), "val_loss=",
                          "{:.5f}".format(val_loss))
                total_steps += 1

                if total_steps > self.max_total_steps:
                    break

            validation_losses.append(
                sum(validation_loss_epoch) / len(validation_loss_epoch))
            if validation_losses[-1] == min(validation_losses):
                print(
                    "Minimum validation loss so far ({}) at epoch {}.".format(
                        validation_losses[-1], epoch))
                # save_model
                save_path = saver.save(sess, model_path + 'model.ckpt')

        sess.close()
        tf.compat.v1.reset_default_graph()
def loadRedditFromNPZ(dataset_dir):
    adj = sp.load_npz(dataset_dir+"reddit_adj.npz")
    data = np.load(dataset_dir+"reddit.npz")

    return adj, data['feats'], data['y_train'], data['y_val'], data['y_test'], data['train_index'], data['val_index'], data['test_index']
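A hedged usage sketch (the directory is a placeholder and must end with a separator, since the function concatenates it directly):

adj, feats, y_train, y_val, y_test, train_index, val_index, test_index = \
    loadRedditFromNPZ("./reddit_data/")
print(adj.shape, feats.shape)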
    populationScale = np.array([214,1287,(1046+194)/2,194,251])
    populationScale = populationScale / populationScale.sum() * 5

else:
#North Sea
    indices = ravel(np.array([53.0, 52.5, 53.5, 53.0, 53.0]), np.array([3.0, 3.0, 3.0, 2.5, 3.5]))
    populationScale = np.array([1,1,1,1,1])


#rename target area(s) from createrawmatrix for brevity
lon = np.array(M.lons[-1])
lat = np.array(M.lats[-1])

transitMatrix = [[] for x in range(M.nt)]
for x in range(M.nt):
    transitMatrix[x] = sparse.load_npz(os.path.join(inputFolder, 'hybridtransitmatrix_'+M.dir+'_month_'+str(x)+'.npz'))


for s in range(len(scaling)):
    if beaching: #load beaching data corresponding to set model and determine the chance of particle tracer ending on shore
        beachdata = np.genfromtxt(os.path.join('beachingCoastlines', beachingmodel + '_coast_dx_' + str((M.dd[-1] * 100).astype(int)) + '_region_'+region+'.csv'),delimiter=',',skip_header=1)
        beachdata = np.transpose(beachdata)
        beach = 1 - beachdata[3] * scaling[s] #chance of tracer ending on shore

        if beach.size != M.nc[-1]:
            print("Resolution of beaching grid and transition matrix don't match")
            sys.exit()

    for i in indices:
        if landpoints[i] != 0 :
            print("Point" + str(i) + "is a landpoint. Lon/lat = " + str(np.unravel_index(i,(M.nx[-1],M.ny[-1]),order='F')))
Example #25
def get_urm_test_2():
    global _urm_test_2
    if _urm_test_2 is None:
        _urm_test_2 = load_npz(_URM_TEST_PATH_2)
    return _urm_test_2
Example #26
        continue
    if statement == 'Done':
        break
    t = feature_transformer(statement)
    prediction = clf.predict_proba(t)
    if prediction[0][1] > 0.9:  #margin of confidence
        print 'Extreme bullying detected!'
    elif prediction[0][1] > 0.7:
        print 'Serious bullying detected!'
    elif prediction[0][1] > 0.6:
        print 'Some bullying detected. Please moderate your language.'
    prevState = t

#Re-update SVM
labels = list(pickle.load(open('master_labels.txt', 'rb')))
mat = load_npz('master_convo.npz')
mat = vstack([mat, convStore])
mat = hstack([mat, csr_matrix(
    (mat.shape[0], num_new_feats))])  #adds columns for new features
labels = labels + labelStore
clf = svm.NuSVC(.05, probability=True)

print 'Number of features added to the list: ' + str(num_new_feats)
print 'Fitting new model...'
clf.fit(mat, labels)
print 'Done'

#Redump SVM objects
joblib.dump(clf, 'model.pkl')
save_npz('master_convo.npz', mat)  #for updating during simulation
with open('master_labels.txt', 'wb') as f:
Example #27
def get_urm_train_explicit():
    global _urm_train_explicit
    if _urm_train_explicit is None:
        _urm_train_explicit = load_npz(_URM_TRAIN_EXPLICIT_PATH)
    return _urm_train_explicit
Example #28
	""" take a model, predictor matrix and paramater grid and
		return the optimal paramater set """
	_gsearch = GridSearchCV(xgb_model,  xgb_param_grid, 
								scoring='roc_auc', 
								n_jobs=4, 
								iid=False, 
								cv=3)
	_gsearch.fit(x_vals, y_vals)

	return _gsearch.best_params_


if __name__ == '__main__':

	#load in the processed data from train_and_test_to_matrix.py
	train_sparse = sparse.load_npz('sparse_train_punc.npz')
	test_sparse = sparse.load_npz('sparse_test_punc.npz')

	train = pd.read_csv('train.csv')
	test = pd.read_csv('test.csv')
	sub_file = pd.read_csv('sample_submission.csv')


	to_predict = list(train.columns[2:])

	for col in to_predict:

		xgtrain_input = xgb.DMatrix(train_sparse, label=train[col].values)

		xgb_initial = xgb.XGBClassifier(learning_rate =0.1,
									n_estimators=1000,
Example #29
def get_urm_test_explicit():
    global _urm_test_explicit
    if _urm_test_explicit is None:
        _urm_test_explicit = load_npz(_URM_TEST_EXPLICIT_PATH)
    return _urm_test_explicit
Example #30
from sklearn.utils import shuffle
from scipy.sparse import save_npz, load_npz

import keras.backend as K
from keras.models import Model
from keras.layers import Input, Dropout, Dense
from keras.regularizers import l2
from keras.optimizers import SGD

# config
batch_size = 128
epochs = 20
reg = 0.0001
# reg = 0

A = load_npz("Atrain.npz")
A_test = load_npz("Atest.npz")
mask = (A > 0) * 1.0
mask_test = (A_test > 0) * 1.0

# make copies since we will shuffle
A_copy = A.copy()
mask_copy = mask.copy()
A_test_copy = A_test.copy()
mask_test_copy = mask_test.copy()

N, M = A.shape
print("N:", N, "M:", M)
print("N // batch_size:", N // batch_size)

# center the data
Example #31
def get_urm_sequential_masked():
    global _urm
    if _urm is None:
        _urm = load_npz(_URM_SEQUENTIAL_MASKED_PATH)
    return _urm
Example #32
import torch
from torch.utils.data import Dataset, DataLoader
from torch import optim
from torch.optim.lr_scheduler import ReduceLROnPlateau
import numpy as np
from scipy import sparse
import sys

import params
from nn.mikolov_rnnlm import MikolovRNNLM

data = sparse.load_npz('./../../2013MikolovWV/data/mikolov_bi_sg_data.npz')
with open('./../../2013MikolovWV/data/vocabs.txt', 'r', encoding='utf-8') as f:
    vocabs = f.readlines()
    vocabs = [word.strip() for word in vocabs]
params.rnnlm['v'] = len(vocabs)


class PrepareData(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(
            self,
            idx):  #default of torch: float32, default of np: float64 (double)
        return torch.as_tensor(self.X[idx].toarray()).float(), torch.as_tensor(
            self.y[idx].toarray()).view(1, 1).float()
Example #33
def get_urm_train_sequential_masked():
    global _urm_train_1
    if _urm_train_1 is None:
        _urm_train_1 = load_npz(_URM_TRAIN_SEQUENTIAL_MASKED_PATH)
    return _urm_train_1
def redis_get_helper(key, npz = False):
	if npz:
		return load_npz(BytesIO(r.get(key)))
	return np.load(BytesIO(r.get(key)))
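For context, a hedged sketch of a matching write side (redis_set_helper is hypothetical; it assumes the same redis client r plus numpy and scipy.sparse.save_npz imports):

def redis_set_helper(key, value, npz = False):
	# Serialize to an in-memory buffer, then store the raw bytes under the key.
	buf = BytesIO()
	if npz:
		save_npz(buf, value)   # scipy sparse matrix
	else:
		np.save(buf, value)    # dense numpy array
	r.set(key, buf.getvalue())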
Example #35
def get_urm_test_sequential_masked():
    global _urm_test_1
    if _urm_test_1 is None:
        _urm_test_1 = load_npz(_URM_TEST_SEQUENTIAL_MASKED_PATH)
    return _urm_test_1
Example #36
def get_icm():
    global _icm
    if _icm is None:
        _icm = load_npz(_ICM_PATH)
    return _icm
Example #37
import scipy.sparse as sps
import sys
from utils.evaluator import Evaluator
from utils.datareader import Datareader
from utils.post_processing import eurm_to_recommendation_list
from utils.ensembler import ensembler
import numpy as np

import os.path

dr = Datareader(verbose=False, mode = "offline", only_load="False")
cat = 3

a = sps.load_npz("../offline/offline-cbf_item_album-cat"+str(cat)+".npz")
b = sps.load_npz("../offline/offline-cbf_item_artist-cat"+str(cat)+".npz")
c = sps.load_npz("../offline/nlp_eurm_offline_bm25-cat"+str(cat)+".npz")
d = sps.load_npz("../offline/offline-rp3beta-cat"+str(cat)+".npz")
e = sps.load_npz("../offline/offline-cfuser-cat"+str(cat)+".npz")
f = sps.load_npz("../offline/slim_bpr_completo_test1-cat"+str(cat)+".npz")
g = sps.load_npz("../offline/eurm_cbfu_artists_offline-cat"+str(cat)+".npz")

matrix = [a, b, c, d, e, f, g]

a = float(sys.argv[1])
b = float(sys.argv[2])
c = float(sys.argv[3])
d = float(sys.argv[4])
e = float(sys.argv[5])
f = float(sys.argv[6])
g = float(sys.argv[7])
Example #38
    def labels(self):
        labels = sparse.load_npz(self.labels_npz)
        labels = labels.toarray().ravel()
        labels[labels != 1] = -1

        return labels
import scipy.sparse as sps
from scripts.scikit_ensemble.scikit_ensamble import Optimizer
from utils.definitions import *
from utils.datareader import Datareader

cat = 8

matrix = list()
from utils.definitions import load_obj
name = load_obj("name")
directory = ROOT_DIR + "/scripts/scikit_ensemble/offline/"
matrix_dict = load_obj("matrix_dict", path="")

m = list()
for n in name[cat-1]:
    m.append(sps.load_npz(directory + matrix_dict[n]))
matrix.append(m)

dr = Datareader(verbose=False, mode = "offline", only_load="False")

opt = Optimizer(matrices_array=matrix[0], matrices_names=name[cat-1],
                dr=dr, cat=cat, start=0, end=1)
del matrix

opt.run()
rxn_id_file = "/data/hanyu/preprocessed_data/separate/fullset_rearrange_test_chiral/rxn_ids.pickle"
pfp_mtx_file = "/data/hanyu/preprocessed_data/separate/fullset_rearrange_test_chiral/pfp_mtx.npz"
rfp_mtx_file = "/data/hanyu/preprocessed_data/separate/fullset_rearrange_test_chiral/rfp_mtx.npz"
rgt_1_mtx_file = "/data/hanyu/preprocessed_data/separate/fullset_rearrange_test_chiral/rgt_1_mtx.npz"
rgt_2_mtx_file = "/data/hanyu/preprocessed_data/separate/fullset_rearrange_test_chiral/rgt_2_mtx.npz"
slv_1_mtx_file = "/data/hanyu/preprocessed_data/separate/fullset_rearrange_test_chiral/slv_1_mtx.npz"
slv_2_mtx_file = "/data/hanyu/preprocessed_data/separate/fullset_rearrange_test_chiral/slv_2_mtx.npz"
cat_1_mtx_file = "/data/hanyu/preprocessed_data/separate/fullset_rearrange_test_chiral/cat_1_mtx.npz"
rgt_file = "/data/hanyu/preprocessed_data/separate/fullset_rearrange_test_chiral/rgts.pickle"
slv_file = "/data/hanyu/preprocessed_data/separate/fullset_rearrange_test_chiral/slvs.pickle"
cat_file = "/data/hanyu/preprocessed_data/separate/fullset_rearrange_test_chiral/cats.pickle"
temp_file = "/data/hanyu/preprocessed_data/separate/fullset_rearrange_test_chiral/temps.pickle"
yd_file = "/data/hanyu/preprocessed_data/separate/fullset_rearrange_test_chiral/yds.pickle"

pfp_csrmtx = sparse.load_npz(pfp_mtx_file)
rfp_csrmtx = sparse.load_npz(rfp_mtx_file)
# context_csrmtx = sparse.load_npz(context_mtx_file)
rgt_1_mtx = sparse.load_npz(rgt_1_mtx_file)
rgt_2_mtx = sparse.load_npz(rgt_2_mtx_file)
slv_1_mtx = sparse.load_npz(slv_1_mtx_file)
slv_2_mtx = sparse.load_npz(slv_2_mtx_file)
cat_1_mtx = sparse.load_npz(cat_1_mtx_file)

with open(temp_file,"r") as T_L_F:
	temp_list = pickle.load(T_L_F)

with open(rxn_id_file,"r") as RID:
	rxn_id_list = pickle.load(RID)	

Example #41
def get_urm_train_1():
    global _urm_train_1
    if _urm_train_1 is None:
        _urm_train_1 = load_npz(_URM_TRAIN_PATH_1)
    return _urm_train_1
Example #42
        print ('{} maxidx'.format(maxidx))
    
    elif mode == 'test_csr':

        if len(sys.argv) != 3:
            print('USAGE: python {} test_csr logid'.format(sys.argv[0]))
            sys.exit(1)
        id = sys.argv[2]

        w = 'meaner'

        filepath_keys = 'keys_{}_final.npy'.format(id)
        with open('./images/fp_{}/{}'.format(id, filepath_keys), 'rb') as f:
            keys = pickle.load(f)
        filepath_csr = 'csr_{}_final.npz'.format(id)
        csr = sparse.load_npz('./images/fp_{}/{}'.format(id, filepath_csr))

        index = keys.index(w)
        word_csr1 = csr[index]

    elif mode == 'join_csr':

        if len(sys.argv) != 3:
            print('USAGE: python {} join_csr logid'.format(sys.argv[0]))
            sys.exit(1)
        id = sys.argv[2]

        """
        filepath_keys = 'keys_{}_datasets.npy'.format(id)
        with open('./images/fp_{}/{}'.format(id, filepath_keys), 'rb') as f:
            keys = pickle.load(f)
Example #43
def get_urm_test_1():
    global _urm_test_1
    if _urm_test_1 is None:
        _urm_test_1 = load_npz(_URM_TEST_PATH_1)
    return _urm_test_1
Example #44
def load_proximity(ds, radius):
    
    logger.info("Loading proximity matrix...")
    fname = os.path.join(ds.a.data_path, "proximity_radius_%s_%s.npz" %(str(radius), ds.a.brain_mask))
    A = load_npz(fname)
    return A.tolil()
Example #45
def featureSelectionTraining(training=False):

    threshold = 0.01
    np.set_printoptions(suppress=True)

    # ----------------- READ PREPROCESSING FILE -----------------------
    VECT_SEL_FOLDER = "ml_core/vector_selection/training/"
    VECT_FOLDER = "ml_core/vector/training/"

    VECT_TEMPLATE = "ml_core/template/tfidf_sparse_template.npz"
    TEMPLATE_FOLDER = "ml_core/template/"
    FEATURE_CONFIG = "feature_template.json"

    LABEL_PATH = "ml_core/data/training/Preprocessed_Dataset_Training.csv"
    TWEET_DATA = pd.read_csv(LABEL_PATH, usecols=["label"])
    tags = TWEET_DATA.label

    # ----------------------- LOAD SPARSE MATRIX -------------------------
    FileName = "tfidf_sparse_training.npz"
    tfidf_mat = sparse.load_npz(VECT_FOLDER + FileName).toarray()

    json_feature = "tfidf_feature_training.json"
    features = readJson_config(VECT_FOLDER + "features/", json_feature,
                               'feature')[0]

    tfidf_mat_selection = None
    features_template = None
    tfidf_mat_template = None
    selected_idx = []

    now = datetime.now()
    dt_string = now.strftime("%d%m%Y_%H%M%S")

    # ---------------------------- TRAINING -----------------------------
    mi = mutual_info_classif(tfidf_mat, tags)
    norm_mi = mi / np.max(mi)

    column_idx = [i for i, mi_item in enumerate(norm_mi) if mi_item < threshold]
    tfidf_mat_selection = np.delete(tfidf_mat, column_idx, 1)

    # template data
    selected_idx = [j for j in range(len(norm_mi)) if j not in column_idx]
    selected_features = []
    for idx in selected_idx:
        selected_features.append(features[idx])

    tfidf_mat_template = [0.0] * len(selected_features)
    features_template = selected_features

    #-------------------------------- SAVE -----------------------------------
    # Save template
    tfidf_sparse_template = sparse.csr_matrix(tfidf_mat_template)
    sparse.save_npz(VECT_TEMPLATE, tfidf_sparse_template)

    feature_dict = {}
    feature_dict['feature'] = features_template
    writeJson_config(TEMPLATE_FOLDER,
                     FEATURE_CONFIG,
                     feature_dict,
                     append=False)

    # save training data
    tfidf_sparse = sparse.csr_matrix(tfidf_mat_selection)
    sparse.save_npz(VECT_SEL_FOLDER + "tfidf_selection_sparse_training.npz",
                    tfidf_sparse)
    writeJson_config(VECT_SEL_FOLDER + "features/",
                     "tfidf_feature_training.json",
                     features_template,
                     append=False)

    return 'success'
Example #46
def get_ucm_train():
    global _ucm_train
    if _ucm_train is None:
        _ucm_train = load_npz(_UCM_TRAIN_PATH)
    return _ucm_train
Example #47
from NDOCD.NDOCD import NDOCD
from NDOCD.Community import Community
from NDOCD.load_data import random_graph, get_actors_graph
from scipy.sparse import save_npz, load_npz
import numpy as np

np.random.seed(123)
graph = random_graph(size=150, edges=1200)
ndocd = NDOCD(graph)
# ndocd.neighbours_edges
# ndocd.compute_neighbours_edges()
# com.get_graph().toarray()
# graph.toarray()
# ndocd.graph.toarray()
# graph = get_actors_graph()
# save_npz("data/actors_graph.npz", graph)
graph = load_npz("data/actors_graph.npz")
# np.save("data/actors_neighbours.npy", ndocd.neighbours_edges)

ndocd = NDOCD(graph, np.load("data/actors_neighbours.npy"))

# graph.toarray()
com = ndocd.initialize_new_community()
com.get_graph()[com.get_vertex_indices()].toarray()[:, com.get_vertex_indices()]

ndocd.JS_threshold = 0.18
ndocd.MD_threshold = 0.18

com = ndocd.initialize_new_community()
com = ndocd.algorithm_step2(com)
print(np.sum(com.vertices.toarray()))
# com = ndocd.create_new_community()
Example #48
def get_urm():
    global _urm
    if _urm is None:
        _urm = load_npz(_URM_PATH)
    return _urm
Example #49
    def featurize_dataset(self, dataset: Dataset):
        logger.info(
            f"Loading dataset {dataset.key} and {self.split.key} split")
        data = dataset.load_x()
        for required_field in ['product', 'substrates']:
            if required_field not in data:
                raise NotImplementedError(
                    f"Need to have field '{required_field} in the dataset")

        split = self.split.load(dataset.dir)
        feat_dir = self.dir(dataset.feat_dir)

        metadata = dataset.load_metadata()
        reaction_type_given = False
        if 'reaction_type_id' in metadata:
            rtypes = metadata['reaction_type_id'].values
            ntypes = len(np.unique(rtypes))
            logger.info(f'Found {ntypes} unique reaction types in the dataset')
            reaction_type_given = True
            data['reaction_type'] = rtypes

        if not os.path.exists(feat_dir):
            os.makedirs(feat_dir)

        if 'max_n_nodes' in dataset.meta_info:
            max_n_nodes = dataset.meta_info['max_n_nodes']
        else:
            max_n_nodes = 1024
        logger.info("Max. number of nodes: {}".format(max_n_nodes))

        # we do not featurize test set for training
        all_inds = np.argwhere(split['test'] == 0).flatten()

        # shuffle indices for featurization in multiple threads
        np.random.shuffle(all_inds)

        data_len = len(data)
        samples_len = data_len * self.max_n_steps

        chunk_size = int(len(all_inds) / self.n_jobs)
        chunk_ends = [chunk_size * i for i in range(self.n_jobs + 1)]
        chunk_ends[-1] = len(all_inds)
        chunk_inds = [
            all_inds[chunk_ends[i]:chunk_ends[i + 1]]
            for i in range(len(chunk_ends) - 1)
        ]

        logger.info(f'Finding all possible values of atom and bond properties '
                    f'on {len(all_inds)} reactions using {self.n_jobs} chunks')
        parallel_args = []
        for i, ch_inds in enumerate(chunk_inds):
            new_x = dict((k, x.values[ch_inds]) for k, x in data.items())
            parallel_args.append((i, new_x, tqdm))

        prop_dict = {'atom': {}, 'bond': {}}
        if self.n_jobs == 1:
            chunk_results = [find_properties_parallel(parallel_args[0])]
        else:
            pool = Pool(self.n_jobs)
            chunk_results = pool.imap(find_properties_parallel, parallel_args)

        for chunk_prop_dict in chunk_results:
            for type_key in prop_dict.keys():
                for key, values in chunk_prop_dict[type_key].items():
                    if key not in prop_dict[type_key]:
                        prop_dict[type_key][key] = set()
                    prop_dict[type_key][key].update(values)

        # add some 'special' atom/bond feature values
        prop_dict['atom']['is_supernode'].update([0, 1])
        prop_dict['atom']['is_edited'].update([0, 1])
        prop_dict['atom']['is_reactant'].update([0, 1])
        prop_dict['bond']['bond_type'].update(['supernode', 'self'])
        prop_dict['bond']['is_edited'].update([0, 1])

        atom_feat_counts = ', '.join([
            '{:s}: {:d}'.format(key, len(values))
            for key, values in prop_dict['atom'].items()
        ])
        logger.info(f'Found atom features: {atom_feat_counts}')

        bond_feat_counts = ', '.join([
            '{:s}: {:d}'.format(key, len(values))
            for key, values in prop_dict['bond'].items()
        ])
        logger.info(f'Found bond features: {bond_feat_counts}')

        # make a dictionary for conversion of atom/bond features to OH numbers
        prop2oh = {'atom': {}, 'bond': {}}
        props = {'atom': {}, 'bond': {}}
        for type_key, prop_values in prop_dict.items():
            for prop_key, values in prop_values.items():
                sorted_vals = list(
                    sorted(values,
                           key=lambda x: x if isinstance(x, int) else 0))
                props[type_key][prop_key] = sorted_vals
                oh = dict((k, i + 1) for i, k in enumerate(sorted_vals))
                prop2oh[type_key][prop_key] = oh

        # save 'prop2oh' dictionary
        with open(get_prop2oh_vocab_path(feat_dir), 'w') as fp:
            json.dump(
                {
                    'atom': props['atom'],
                    'bond': props['bond'],
                    'atom_2oh': prop2oh['atom'],
                    'bond_2oh': prop2oh['bond']
                },
                fp,
                indent=2)

        atom_feature_keys = [
            k for k in ORDERED_ATOM_OH_KEYS if k in prop2oh['atom']
        ]
        bond_feature_keys = [
            k for k in ORDERED_BOND_OH_KEYS if k in prop2oh['bond']
        ]
        action_vocab = {
            'prop2oh': prop2oh,
            'atom_feature_keys': atom_feature_keys,
            'bond_feature_keys': bond_feature_keys,
            'atom_feat_ind': dict(
                (k, i) for i, k in enumerate(atom_feature_keys)),
            'bond_feat_ind': dict(
                (k, i) for i, k in enumerate(bond_feature_keys))
        }

        parallel_args = []
        chunk_save_paths = []
        for i, ch_inds in enumerate(chunk_inds):
            new_x = dict((k, x.values[ch_inds]) for k, x in data.items())
            is_train = split['train'][ch_inds].values
            chunk_save_path = os.path.join(feat_dir, f'chunk_result_{i}')
            chunk_save_paths.append(chunk_save_path)
            parallel_args.append(
                (i, samples_len, ch_inds, new_x, max_n_nodes, tqdm,
                 self.max_n_steps, is_train, reaction_type_given, self.forward,
                 self.action_order, action_vocab, chunk_save_path))

        logger.info(
            f'Featurizing {len(all_inds)} reactions with {self.n_jobs} threads'
        )
        logger.info(f"Number of generated paths (train+valid): {data_len}")
        logger.info(
            f"Upper bound for number of generated samples: {samples_len} ({data_len} * {self.max_n_steps})"
        )

        if self.n_jobs == 1:
            chunk_results = [featurize_parallel(parallel_args[0])]
        else:
            # leave one job for merging results
            pool = Pool(max(self.n_jobs - 1, 1))
            chunk_results = pool.imap(featurize_parallel, parallel_args)

        logger.info(f"Merging featurized data from {self.n_jobs} chunks")

        nodes_mat = sparse.csr_matrix(([], ([], [])),
                                      shape=(samples_len, max_n_nodes))
        adj_mat = sparse.csr_matrix(([], ([], [])),
                                    shape=(samples_len, max_n_nodes**2))

        n_sample_data = 6 if reaction_type_given else 5
        sample_data_mat = sparse.csr_matrix(([], ([], [])),
                                            shape=(samples_len, n_sample_data))
        meta = []

        # vocabulary of actions
        actions_vocab = []
        action2ind = {}
        action_inds = []
        action_tuples = []
        sample_inds = []

        for ch_inds, result_code, chunk_save_path in tqdm(
                zip(chunk_inds, chunk_results, chunk_save_paths),
                desc='merging reactions from chunks',
                total=self.n_jobs):
            sample_data_path = os.path.join(chunk_save_path, 'sample_data.npz')
            sample_data_mat += sparse.load_npz(sample_data_path)

            nodes_mat_path = os.path.join(chunk_save_path, 'nodes_mat.npz')
            nodes_mat += sparse.load_npz(nodes_mat_path)

            adj_mat_path = os.path.join(chunk_save_path, 'adj_mat.npz')
            adj_mat += sparse.load_npz(adj_mat_path)

            meta_save_path = os.path.join(chunk_save_path, 'metadata.csv')
            chunk_meta = pd.read_csv(meta_save_path)
            meta.append(chunk_meta)

            actions_save_path = os.path.join(chunk_save_path, 'actions.txt')
            chunk_action_tuples = []
            for line in open(actions_save_path, 'r'):
                action = eval(line.strip())
                chunk_action_tuples.append(action)

            for sample_ind, action in chunk_action_tuples:
                if action in action2ind:
                    action_inds.append(action2ind[action])
                else:
                    action_ind = len(actions_vocab)
                    action2ind[action] = action_ind
                    actions_vocab.append(action)
                    action_tuples.append(action)
                    action_inds.append(action_ind)
                sample_inds.append(sample_ind)

            # remove temporary chunk files
            shutil.rmtree(chunk_save_path)
            logger.info(
                f"Merged chunk {len(meta)} (unparsed samples: {result_code}/{len(ch_inds)})"
            )

        logger.info("Concatenating metadata")
        meta = pd.concat(meta)

        logger.info("Saving found actions")
        sample_data_mat[sample_inds, 0] = action_inds
        with open(get_actions_vocab_path(feat_dir), 'w') as fp:
            json.dump(action_tuples, fp)
        logger.info(f"Found {len(action_tuples)} reaction actions")

        n_samples = meta['n_samples']
        logger.info(
            f"Number of steps: max: {np.max(n_samples)}, avg: {np.mean(n_samples)}"
        )

        logger.info("Saving featurized data")
        meta.to_csv(get_metadata_path(feat_dir))
        sparse.save_npz(get_sample_data_path(feat_dir), sample_data_mat)
        sparse.save_npz(get_nodes_path(feat_dir), nodes_mat)
        sparse.save_npz(get_adj_path(feat_dir), adj_mat)

        n_saved_reacs = len(np.unique(meta['reaction_ind']))

        logger.info(
            f"Saved {n_saved_reacs}/{len(all_inds)} reactions ({n_saved_reacs / len(all_inds) * 100}%)"
        )
        logger.info(
            f"Saved {len(meta)} paths (avg. {len(meta) / n_saved_reacs} paths per reaction)"
        )

        logger.info("Saving featurization metadata")
        meta_info = {
            'description':
            'Graph representation of molecules with discrete node and edge features for MEGAN',
            'features': ['atom', 'bond'],
            'features_type': ['atom', 'bond'],
            'max_n_nodes': max_n_nodes,
            'format': 'sparse'
        }
        meta_path = self.meta_info_path(dataset.feat_dir)
        with open(meta_path, 'w') as fp:
            json.dump(meta_info, fp, indent=2)