Example #1
def cadd_generate_batched_lmdb_from_many_csv(lmdb_batch_dir,
                                             csv_folder,
                                             variant_ids_file,
                                             batch_size,
                                             num_batches=-1):
    """Build a batched LMDB database from a folder of CSV files.

    A first batch is read to estimate the required LMDB map size, then the
    iterator is recreated and every batch is written to lmdb_batch_dir.
    """
    it = dir_batch_generator(csv_folder, batch_size)
    test_batch = next(it)
    variant_ids = load_pickle(variant_ids_file)
    nrows = len(variant_ids)

    row_example = {
        "batch_id": np.int32(0),
        "inputs": test_batch[0].values.astype(np.float16),
        "targets": test_batch[1].values.astype(np.float16),
        "metadata": {
            "row_num":
            np.array(test_batch[0].index, dtype=np.int32),
            "variant_id":
            np.array(variant_ids.loc[test_batch[0].index], dtype='<U20')
        }
    }

    ms = calculate_map_size(row_example, nrows)
    it = dir_batch_generator(csv_folder, batch_size)
    create_batched_lmdb_from_iterator(it,
                                      lmdb_batch_dir,
                                      variant_ids_file,
                                      num_batches=num_batches,
                                      map_size=ms)
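A minimal usage sketch for the helper above; the paths and batch size are hypothetical placeholders.

# Hypothetical paths and batch size; adjust to the local data layout.
cadd_generate_batched_lmdb_from_many_csv("data/lmdb_batched",
                                         "data/csv_chunks",
                                         "data/variant_ids/all.pkl",
                                         batch_size=256)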
Example #2
    def __init__(self,
                 lmbd_dir,
                 batch_idx_file,
                 version="1.3",
                 hg_assembly="GRCh37"):

        self.version = version

        # indexed by location
        self.batch_idxs = load_pickle(batch_idx_file)

        self.lmdb_cadd_path = lmbd_dir
        self.lmdb_cadd = None
        self.txn = None
Example #3
def reorder_sparse_matrix(input_npz, row_ids, output_npz):
    """Re-order the rows of a sparse matrix.

    Args:
      input_npz: path to a .npz file
      row_ids List[int]: a list of integer row indices or a path
        to a pickled list containing the shuffled row order
      output_npz: output .npz file path
    """
    from scipy.sparse import load_npz, save_npz
    if isinstance(row_ids, str):
        row_ids = load_pickle(row_ids)

    npz = load_npz(input_npz)
    npz = npz[row_ids]
    save_npz(output_npz, npz)
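A minimal usage sketch with hypothetical file names:

# Hypothetical file names; the row order is a pickled list of integer indices.
reorder_sparse_matrix("data/matrix.npz", "data/shuffled_rows.pkl",
                      "data/matrix_shuffled.npz")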
Example #4
    def __init__(self,
                 lmbd_dir,
                 variant_id_file,
                 version="1.3",
                 hg_assembly="GRCh37"):

        self.version = version

        self.lmdb_cadd_path = lmbd_dir
        self.lmdb_cadd = None
        self.txn = None

        # indexed by location
        self.variant_ids_file = variant_id_file
        self.variant_ids = load_pickle(self.variant_ids_file)
        self.variant_ids = self.variant_ids.values
Example #5
def sparse_cadd_dataset(sparse_matrix,
                        variant_ids_file,
                        targets_col=0,
                        split=0.3,
                        random_state=42,
                        output_npz=None,
                        output_ids=None,
                        separate_x_y=False):
    """Splits a sparse matrix into train and test set.
    Args:
      sparse_matrix: path-like or csr_matrix instance.
    """
    from sklearn.model_selection import ShuffleSplit
    import os

    if isinstance(sparse_matrix, str):
        sparse_matrix = load_npz(sparse_matrix)
    elif not isinstance(sparse_matrix, csr_matrix):
        raise ValueError(
            "Input must be either a path to a sparse matrix or an object of csr_matrix type."
        )

    keep_cols = list(range(sparse_matrix.shape[1]))
    keep_cols.remove(targets_col)
    assert targets_col not in keep_cols

    variant_ids = load_pickle(variant_ids_file)
    rs = ShuffleSplit(n_splits=1, test_size=split, random_state=random_state)
    train_index, valid_index = next(rs.split(variant_ids))

    train, valid = sparse_matrix[train_index], sparse_matrix[valid_index]
    train_ids, valid_ids = variant_ids[train_index], variant_ids[valid_index]

    if separate_x_y:
        train = train[:, keep_cols], train[:, targets_col]
        valid = valid[:, keep_cols], valid[:, targets_col]

    del sparse_matrix

    if output_npz is not None:
        save_npz(os.path.join(output_npz, "train.npz"), train)
        save_npz(os.path.join(output_npz, "valid.npz"), valid)
    if output_ids is not None:
        dump_to_pickle(os.path.join(output_ids, "train.pkl"), train_ids)
        dump_to_pickle(os.path.join(output_ids, "valid.pkl"), valid_ids)

    return (train, train_ids), (valid, valid_ids)
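A minimal usage sketch, assuming hypothetical paths and keeping the split in memory:

# Hypothetical paths; nothing is written to disk in this call.
(train, train_ids), (valid, valid_ids) = sparse_cadd_dataset(
    "data/all.npz",
    "data/variant_ids/all.pkl",
    split=0.2,
    separate_x_y=True)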
Example #6
def create_batched_lmdb_from_iterator(it,
                                      lmdb_batch_dir,
                                      variant_ids_file,
                                      num_batches=-1,
                                      map_size=23399354270):
    start = time.time()

    index_mapping = OrderedDict()
    batch_num = 0
    variant_ids = load_pickle(variant_ids_file)

    env = lmdb.Environment(lmdb_batch_dir,
                           map_size=map_size,
                           max_dbs=0,
                           lock=False)
    with env.begin(write=True, buffers=True) as txn:
        for batch in tqdm(it):
            b = {
                "batch_id": np.int32(batch_num),
                "inputs": batch[0].values.astype(np.float16),
                "targets": batch[1].values.astype(np.float16),
                "metadata": {
                    "row_num":
                    np.array(batch[0].index, dtype=np.int32),
                    "variant_id":
                    np.array(variant_ids.loc[batch[0].index], dtype='<U20')
                }
            }

            # Serialize and compress
            buff = pa.serialize(b).to_buffer()
            blzpacked = blosc.compress(buff, typesize=8, cname='blosclz')

            try:
                txn.put(str(batch_num).encode('ascii'), blzpacked)
            except lmdb.MapFullError as err:
                print(str(err) + ". Exiting the program.")
                raise
            batch_num += 1
            if num_batches > 0 and batch_num >= num_batches:
                break

    print("Finished putting " + str(batch_num) + " batches to lmdb.")
    end = time.time()
    print("Total elapsed time: {:.2f} minutes.".format((end - start) / 60))
Example #7
    def __init__(self,
                 lmdb_dir,
                 variant_id_file,
                 version="1.3",
                 hg_assembly="GRCh37"):
        """Reads LMDB database and obtains all predictions available for each variant.
        """
        self.version = version

        self.lmdb_dir = lmdb_dir
        self.lmdb_kipoi = None
        self.txn = None
        self._column_names = None

        self.variant_ids_file = variant_id_file
        self.variant_ids = load_pickle(self.variant_ids_file)
        self.variant_ids = self.variant_ids.values
Example #8
    def __init__(self,
                 sparse_npz,
                 variant_ids,
                 version="1.3",
                 hg_assembly="GRCh37"):
        if isinstance(sparse_npz, str) and isinstance(variant_ids, str):
            self.data = load_npz(sparse_npz)
            self.variant_ids = load_pickle(variant_ids)
        elif isinstance(sparse_npz, csr_matrix) and isinstance(
                variant_ids, pd.Series):
            self.data = sparse_npz
            self.variant_ids = variant_ids
        else:
            raise ValueError(
                "Inputs must be either paths or objects of csr_matrix and pd.Series types."
            )

        self.variant_ids = self.variant_ids.values
Example #9
    def __init__(self,
                 lmdb_dirs_list,
                 variant_ids_file,
                 version="1.3",
                 hg_assembly="GRCh37"):
        self.version = version

        self.lmdb_dirs_list = lmdb_dirs_list

        self.variant_ids_file = variant_ids_file
        self.variant_ids = load_pickle(self.variant_ids_file)
        self.variant_ids = self.variant_ids.values

        self._column_names = None

        self.datasets = [
            KipoiLmdbDataset(db, variant_ids_file, version)
            for db in lmdb_dirs_list
        ]
Example #10
def reorder_vcf(input_vcf, row_ids, output_vcf, discard_metadata=False):
    """Re-order the vcf file. Note: the output vcf 
    
    Args:
      input_vcf: path to a vcf file
      row_ids List[int]: a list of integer numbers or a path 
        to a .txt file containing the shuffled rows
      output_vcf: output vcf file path
      discard_metadata: if True, the INFO field of the vcf is ignored
    """
    if isinstance(row_ids, str):
        row_ids = load_pickle(row_ids)

    colnames = ""
    with open(input_vcf, 'r') as f:
        for l in f.readlines():
            if "#CHROM" in l:
                colnames = l.replace("\n", "").split("\t")
                break

    vcf_df = pd.read_csv(input_vcf,
                         sep="\t",
                         header=None,
                         names=colnames,
                         comment="#")
    vcf_df = vcf_df.loc[row_ids]

    if discard_metadata:
        vcf_df.drop(columns=['ID', 'QUAL', 'FILTER', 'INFO'], inplace=True)
        header = "##fileformat=VCFv4.0\n"
        with open(output_vcf, 'w') as f:
            f.write(header)
        vcf_df.to_csv(output_vcf, sep="\t", mode='a', index=False)
    else:
        header_lines = ""
        with open(input_vcf, 'r') as f:
            for l in f.readlines():
                if l.startswith("#"): header_lines += l
        with open(output_vcf, 'w') as f:
            f.write(header_lines)
        vcf_df.to_csv(output_vcf, sep="\t", mode='a', header=False, index=False)
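A minimal usage sketch with hypothetical file names:

# Hypothetical file names; the row order is a pickled list of integer indices.
reorder_vcf("data/variants.vcf", "data/shuffled_rows.pkl",
            "data/variants_shuffled.vcf", discard_metadata=True)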
Example #11
    def train(self, sample_weight=None, scaler_path=None, training_type=np.float32):
        """Train the model
        Args:
          batch_size:
          num_workers: how many workers to use in parallel
        """
        from sklearn.externals import joblib

        print("Started loading training dataset")
        
        X_train, y_train = self.train_dataset.load_all()

        if len(self.valid_dataset) == 0:
            raise ValueError("The validation dataset is empty.")

        if scaler_path:
            scaler = load_pickle(scaler_path)
            print("Started scaling X.")
            X_infl = X_train.astype(np.float32)
            X_infl = scaler.transform(X_infl)

            if training_type is not np.float32:
                # Downcast to float16 and clip below the float16 max (~65504).
                X_train = X_infl.astype(np.float16)
                if isinstance(X_train, csr_matrix):
                    X_train.data = np.minimum(X_train.data, 65500)
                else:
                    X_train = np.minimum(X_train, 65500)
                del X_infl
                print("The dataset was downscaled.")
            else:
                # Keep the scaled float32 data; otherwise the transform would be discarded.
                X_train = X_infl
            print("Finished scaling X.")
        
        print("Finished loading training dataset. Shape: ", X_train.shape, "True values:", y_train.sum()/y_train.shape[0])
        self.model.fit(X_train,
                       y_train,
                       sample_weight=sample_weight)
        
        print("Calculating training accuracy:")
        acc = self.model.score(X_train, y_train)
        print("Obtained training accuracy: ", acc)

        joblib.dump(self.model, self.ckp_file)
Example #12
    def evaluate(self, metric, scaler_path=None, eval_type=np.float32, save=True):
        """Evaluate the model on the validation set
        Args:
          metrics: a list or a dictionary of metrics
          batch_size:
          num_workers:
        """
        print("Started loading validation dataset")
        
        X_valid, y_valid = self.valid_dataset.load_all()

        if scaler_path:
            scaler = load_pickle(scaler_path)
            print("Started scaling X.")
            X_infl = X_valid.astype(np.float32)
            X_infl = scaler.transform(X_infl)

            if eval_type is not np.float32:
                # Downcast to float16 and clip below the float16 max (~65504).
                X_valid = X_infl.astype(np.float16)
                if isinstance(X_valid, csr_matrix):
                    X_valid.data = np.minimum(X_valid.data, 65500)
                else:
                    X_valid = np.minimum(X_valid, 65500)
                del X_infl
            else:
                # Keep the scaled float32 data; otherwise the transform would be discarded.
                X_valid = X_infl
            print("Finished scaling X.")

        print("Finished loading validation dataset. Shape: ", X_valid.shape, "True values:", y_valid.sum()/y_valid.shape[0])
        
        y_pred = self.model.predict(X_valid)
        metric_res = metric(y_valid, y_pred)
        print("metric_res", metric_res, np.amax(X_valid))

        if save:
            write_json(metric_res, self.evaluation_path, indent=2)

        if self.cometml_experiment:
            self.cometml_experiment.log_multiple_metrics(flatten(metric_res), prefix="best/")

        return metric_res
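A usage sketch, assuming a hypothetical trainer object exposing this method; sklearn's accuracy_score matches the (y_true, y_pred) metric signature expected above.

# Hypothetical trainer instance; any (y_true, y_pred) callable works as the metric.
from sklearn.metrics import accuracy_score
result = trainer.evaluate(accuracy_score, scaler_path="data/scaler.pkl", save=False)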
Example #13
    for f in TRAINING_DATA_FILES:
        inputfile = os.path.join(training_dir, f + ".tsv.gz")
        out = os.path.join(variant_ids_dir, f + ".pkl")
        if not os.path.isfile(out):  # skip if file exists
            generate_variant_ids(inputfile,
                                 out,
                                 variant_cols=variant_cols,
                                 dtype=dtype)

    print("Generating sparse matrices...")
    # Generate sparse matrices
    for f in TRAINING_DATA_FILES:
        # Get the base of the name
        inputfile = os.path.join(training_dir, f + ".csv.gz")
        # num_lines sets the tqdm total, which gives useful progress feedback in this lengthy step
        num_lines = len(load_pickle(os.path.join(variant_ids_dir, f + ".pkl")))
        output = os.path.join(sparse_matrices_dir, f + ".npz")
        if not os.path.isfile(output):
            load_csv_chunks_tosparse(inputfile,
                                     10000,
                                     np.float32,
                                     num_lines=num_lines,
                                     output=output,
                                     header=None)

    # Merge variant ids
    output = os.path.join(variant_ids_dir, "all.pkl")
    if not os.path.isfile(output):
        print("Merging variant ids...")
        all_ids = None
        for f in tqdm(TRAINING_DATA_FILES):
Example #14
def train_test_split_indexes(variant_id_file, test_size, random_state=1):
    """Split the pickled variant ids into train and test subsets."""
    variants = load_pickle(variant_id_file)
    train_vars, test_vars = train_test_split(variants,
                                             test_size=test_size,
                                             random_state=random_state)
    return train_vars, test_vars
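A minimal usage sketch with a hypothetical ids file:

# Hypothetical path; 10% of the variants are held out for testing.
train_vars, test_vars = train_test_split_indexes("data/variant_ids/all.pkl",
                                                 test_size=0.1)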