Exemplo n.º 1
0
def test_perfect_separation_of_latents():
    # Rebuild a rating matrix from known latent factors via an NMF model
    # whose components are set directly (no fitting involved).
    profiles = data()
    w_mat, h_mat, _mapping = generate_w_h(profiles, n_users=100)
    model = NMF(solver='mu', init='custom', n_components=3)
    model.components_ = h_mat
    model.n_components_ = h_mat.shape[0]
    X = model.inverse_transform(w_mat)
Exemplo n.º 2
0
    def sci_nmf(self,
                components=2,
                procedure=None,
                separate=False,
                max_iter=1000,
                w_init=False,
                h_init=None):
        """Perform non-negative matrix factorization of self.array (X = W * H).

        Parameters
        ----------
        components : int
            Number of latent components (rank of the factorization).
        procedure : str or None
            Initialization method forwarded to sklearn NMF's ``init``; see
            https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.NMF.html
        separate : bool
            If True, return the factor matrices ``(W, H)``; otherwise return
            a new mat_opr wrapping the reconstruction ``W @ H``.
        max_iter : int
            Maximum number of NMF iterations.
        w_init : bool
            Sklearn only allows initializing H.  To initialize W instead, set
            ``w_init=True`` and pass W^T in ``h_init``: the transposed problem
            X^T = H^T * W^T is solved and the factors are transposed back.
        h_init : ndarray or None
            Initial H (or W^T when ``w_init`` is True).  When None a fresh
            model is fit from scratch.
        """
        arr = np.array(self.array)

        if h_init is None:
            # No initialization supplied: fit a fresh model.
            model = NMF(n_components=components,
                        init=procedure,
                        random_state=0,
                        max_iter=max_iter,  # fix: was hard-coded to 1000, ignoring the parameter
                        tol=1e-10)
            W = model.fit_transform(arr)
            H = model.components_
        else:
            model = NMF(n_components=components,
                        init=None,
                        max_iter=max_iter,  # fix: was hard-coded to 1000, ignoring the parameter
                        tol=1e-10)
            # Seed the fitted-model attributes directly so transform() can be
            # used without calling fit.
            model.n_components_ = components
            model.components_ = h_init
            if w_init:
                # h_init actually holds W^T: solve X^T = H^T * W^T and
                # transpose the resulting factors back.
                H_transpose = model.transform(X=np.transpose(arr))
                W = np.transpose(model.components_)
                H = np.transpose(H_transpose)
            else:
                W = model.transform(X=arr)
                H = model.components_

        # Return either the reconstruction as a new object OR the raw factors.
        if not separate:
            tor = mat_opr(pd.DataFrame(np.dot(W, H)))
            tor.dataframe.columns = self.dataframe.columns
            tor.dataframe.index = self.dataframe.index
            return tor
        return W, H
Exemplo n.º 3
0
def generate_array_data_file():
    # Build synthetic latent factors, reconstruct the rating matrix, and
    # print it as a generated C# source file.
    latents = data()
    user_count = 100
    W, H, mapping = generate_w_h(latents,
                                 n_users=user_count,
                                 use_random=True,
                                 sigma=.5,
                                 mean=-1.5)
    model = NMF(solver='mu', init='custom', n_components=3)
    model.components_ = H
    model.n_components_ = H.shape[0]
    X = model.inverse_transform(W)
    # inverse_transform yields user_count + 1 rows; the final row is the
    # player's, whose ratings are not yet determined — blank them out.
    X[user_count, :] = np.nan
    template = dedent('''
namespace MonsterMatch.CollaborativeFiltering
{
    public static class MonsterMatchArrayData
    {
        // @formatter:off
        public const int UserCount = %d;
        public const int ItemCount = %d;
        public const int PlayerUserId = %d;
        public const int FactorCount = %d;
        public static readonly int[] ForProfiles = %s;
        public static readonly double[,] Data = %s;
        public static readonly double[,] Pu = %s;
        public static readonly double[,] Qi = %s;
        // @formatter:on
    }
}
    ''')
    profile_ids = 'new [] {' + ','.join(
        [str(profile.index) for profile in latents]) + '}'
    file_contents = template % (user_count + 1, len(latents), user_count,
                                model.n_components_, profile_ids,
                                csharp_repr_ndarray(X), csharp_repr_ndarray(W),
                                csharp_repr_ndarray(H.T))
    print(file_contents)
Exemplo n.º 4
0
def main(argvs):
    """Annotate one chromosome region with per-window NMF state weights.

    Expects argvs like sys.argv:
      [1] process_id, [2] state_n (K), [3] chromosome, [4] start_index,
      [5] end_index, [6] resolution_size, [7] annotation_result_dir.
    Reads per-assay per-bin signal files, transforms windows of signal with a
    pre-trained NMF model, and writes one BED-like line per bin to
    ``<annotation_result_dir>/<chromosome>_<process_id>.bed``.
    Returns 0.
    """
    process_id = int(argvs[1])
    state_n = int(argvs[2])
    chromosome = argvs[3]
    start_index = int(argvs[4])
    end_index = int(argvs[5])
    resolution_size = int(argvs[6])
    annotation_result_dir = argvs[7]
    print(process_id, state_n, chromosome, start_index, end_index,
          resolution_size, annotation_result_dir)

    # E = number of assays (signal tracks); K = number of NMF states.
    E = len(configs.chromHMM_assay)
    assay_list = configs.chromHMM_assay
    K = state_n
    window_size = 100
    # load nmf model
    # The model is never fit here: its learned factors are restored from a
    # JSON parameter file produced by the training step.
    nmf = NMF(n_components=K, init='random', random_state=0)
    with open(os.path.join(script_dir,
                           "../train/param_{}.json".format(K))) as param_f:
        param = json.load(param_f)
    nmf.components_ = np.array(param['components_'])
    nmf.n_components_ = param['n_components_']

    if not os.path.exists(annotation_result_dir):
        try:
            os.makedirs(annotation_result_dir)
        except (FileExistsError):
            # Another process may have created the directory between the
            # exists() check and makedirs(); that is fine.
            pass

    annotation_result_file = os.path.join(
        annotation_result_dir, chromosome + "_" + str(process_id) + ".bed")
    with open(annotation_result_file, 'w') as annotation_f:
        # start annotation
        # One open handle per assay, all iterated in lockstep.
        # NOTE(review): these handles are never closed — consider
        # contextlib.ExitStack so they are released on exit.
        resol_fs = []
        for assay in assay_list:
            resol_fs.append(
                open(
                    os.path.join(
                        configs.blacklist_rm_data_path,
                        assay + "/resolution-" + str(resolution_size) + "bp/",
                        chromosome + ".bed"), "r"))
        # read in genome data
        done = False
        while not done:
            # data_array[i] collects up to window_size signals for assay i.
            data_array = [[] for i in range(E)]
            start_annotation_index = None
            for i in range(E):
                resol_f = resol_fs[i]
                g = 0
                while g < window_size:
                    line = resol_f.readline()
                    if not line:
                        done = True
                        # raise(Exception("Reach end of file, wroing End index."))
                        break
                    # Each input line: "<bin_index> <signal_value>".
                    inf = line.strip('\n').split()
                    index = int(inf[0])
                    signal = float(inf[1])
                    if index < start_index:
                        # Skip bins before the assigned region; g is not
                        # incremented for skipped lines.
                        continue
                    if start_annotation_index is None:
                        start_annotation_index = index
                    # Variance-stabilizing transform of the raw signal.
                    signal = np.arcsinh(signal)
                    data_array[i].append(signal)
                    print(index)
                    print("append" + assay_list[i] + ":" + str(index))
                    g += 1
                    if index >= end_index:
                        done = True
                        break
            # currently skip the data when it is too small
            # a better implementation is to fetch the data from former position
            # test if data array lines up
            is_lined_up = True
            for myarray in data_array:
                if len(myarray) != len(data_array[0]):
                    is_lined_up = False
                    for array in data_array:
                        print(len(array))
                    break
            # NOTE(review): g here is left over from the LAST assay's read
            # loop only — combined with is_lined_up this is equivalent to
            # checking every assay, but it is fragile; verify intent.
            if g >= window_size / 2 and is_lined_up:
                # NOTE(review): data_array holds arcsinh-transformed floats;
                # casting to np.int8 truncates them to small integers (and can
                # wrap above 127). Looks like a bug — confirm whether float
                # dtype was intended.
                x_m = np.asarray(data_array, dtype=np.int8)
                result = nmf.transform(x_m.T)  # shape (G,E)
                index = start_annotation_index
                print(result.shape)
                # Emit one line per bin: "start end w_0 w_1 ... w_{K-1}".
                for i in range(g):
                    print("{} {} ".format(index * resolution_size,
                                          (index + 1) * resolution_size),
                          end="",
                          file=annotation_f)
                    for k in range(K):
                        print("{} ".format(result[i, k]),
                              end="",
                              file=annotation_f)
                    print(file=annotation_f)
                    index += 1

    return 0