예제 #1
0
 def test_double_iter(self):
     """Check that one batch from ``read`` pairs genotypes with phenotypes.

     Draws a single batch for phenotype ``'V1'`` (requested size 100) and
     verifies that the genotype array has as many rows as there are
     phenotype entries.
     """
     major = Major_reader(self.sample_major, self.pheno_file)
     batches = major.read('V1', 100)
     genotypes, phenotypes = next(batches)
     self.assertEqual(genotypes.shape[0], len(phenotypes))
예제 #2
0
    def __init__(self,
                 plink_file: str,
                 pheno_file,
                 pheno_name: str,
                 batch_size: int,
                 ldblock_file: str = None,
                 shuffle=True):
        """
        Primary data generator for keras

        Initialises the underlying ``Major_reader`` and then prepares the
        per-epoch sample index and the input dimensions of the batches.

        After construction ``self.dims`` is either a list with the second
        dimension of every array in the first batch (when ``ldblock_file``
        is given) or ``self.p`` (when it is not).

        :param plink_file: path of a plink file in sample major format
        :param pheno_file: path of the pheno file
        :param pheno_name: name of the phenotype
        :param batch_size: size of the mini batches
        :param ldblock_file: path of the ld block file (bed) (optional)
        :param shuffle: bool if the data should be shuffled
        """
        # Explicit base-class initialisation; presumably this is what provides
        # self.n and self.p used below -- confirm against Major_reader.
        Major_reader.__init__(self, plink_file, pheno_file, ldblock_file)
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.pheno_name = pheno_name
        # One integer index per position 0 .. self.n - 1.
        self.indexes = np.arange(0, self.n, dtype=int)
        # NOTE(review): on_epoch_end is defined outside this block; it runs
        # after self.indexes / self.shuffle are set but before self.dims and
        # self.block_sequence exist, so it must not rely on those.
        self.on_epoch_end()
        self.dims = None
        if ldblock_file is not None:
            self.block_sequence = self._generate_ld_split_sequence()
            # check dims
            # Fetch the first batch once and record the second dimension of
            # each input array it contains.
            tx, ty = self.__getitem__(0)
            self.dims = [k.shape[1] for k in tx]
        else:
            # Without an LD block file there is no block split; the input
            # dimension is taken directly from self.p.
            self.block_sequence = None
            self.dims = self.p
예제 #3
0
 def test_pheno_reader(self):
     """Check that the first phenotype batch matches the phenotype file.

     Reads one batch of ``batch_size`` values for phenotype ``'V1'`` via
     ``_iter_pheno`` and compares it element-wise with the first
     ``batch_size`` entries of the ``V1`` column loaded with pandas.
     """
     pheno = pd.read_table(self.pheno_file)
     batch_size = 100
     out = Major_reader(self.sample_major, self.pheno_file)
     reader = out._iter_pheno('V1', batch_size)
     batch = next(reader)
     compare = batch == pheno.V1.values[:batch_size]
     lg.debug(compare[0:10])
     # assertTrue(a, b) only checks the truthiness of ``a`` and uses ``b`` as
     # the failure message, so it could never fail here. The intent is that
     # every one of the batch_size entries matches.
     self.assertEqual(batch_size, np.sum(compare))
예제 #4
0
 def test_one_iter_geno(self):
     """Check the first rows yielded by ``_one_iter_geno`` against the gold data.

     Each of the first ``maxiter + 1`` yielded genotype rows is flattened
     and compared with the corresponding row of the stored numpy matrix;
     every row must match completely (match rate of 1.0 per row).
     """
     reader = Major_reader(self.sample_major, self.pheno_file)
     geno_rows = reader._one_iter_geno()
     reference = np.load(self.sample_major_numpy)
     maxiter = 20
     # The range comes first in zip so that exactly maxiter + 1 rows are
     # pulled from the iterator and nothing beyond that is consumed.
     match_rates = [
         np.mean(reference[0][idx] == row.flatten())
         for idx, row in zip(range(maxiter + 1), geno_rows)
     ]
     self.assertEqual(np.sum(match_rates), maxiter + 1)
예제 #5
0
    def test_one_iter_pheno(self):
        """Check the first values yielded by ``_one_iter_pheno`` for ``'V1'``.

        The first ``maxiter + 1`` yielded phenotype values must equal the
        corresponding entries of the ``V1`` column read with pandas.
        """
        reader = Major_reader(self.sample_major, self.pheno_file)
        pheno_values = reader._one_iter_pheno('V1')
        pheno = pd.read_table(self.pheno_file)
        maxiter = 20
        # The range comes first in zip so that exactly maxiter + 1 values are
        # pulled from the iterator and nothing beyond that is consumed.
        matches = [
            np.mean(pheno.V1[idx] == value)
            for idx, value in zip(range(maxiter + 1), pheno_values)
        ]

        self.assertEqual(np.sum(matches), maxiter + 1)
예제 #6
0
 def test_binary_genotype(self):
     """Check that ``_bgeno`` decodes one byte into the expected genotypes.

     The bit string ``'00011011'`` is packed into bytes and passed to
     ``_bgeno``; the first four decoded values must be ``[0, 1, 9, 2]``.
     """
     bits = '00011011'
     expected_genotypes = [0, 1, 9, 2]
     input_bytes = bitarray(bits).tobytes()
     lg.debug('Used bytes: %s', input_bytes)
     reader = Major_reader(self.sample_major, self.pheno_file)
     genotypes = reader._bgeno(input_bytes)
     lg.debug('outputed genotypes: %s', genotypes)
     # Compare position by position so a short result fails loudly.
     matches = []
     for position in range(4):
         matches.append(genotypes[position] == expected_genotypes[position])
     lg.debug('Comparision result: %s', matches)
     self.assertEqual(sum(matches), 4)
예제 #7
0
 def test_geno_read(self):
     """Check a full-size batch from ``_iter_geno`` against the gold matrix.

     Requests a single batch with as many rows as the gold matrix, then
     verifies that the shapes agree and that the first row matches the
     gold data in every column.
     """
     gold = np.load(self.sample_major_numpy)
     lg.debug('index gold: %s', gold[1][0:10])
     gold_matrix = gold[0]
     n_gold, p_gold = gold_matrix.shape
     lg.debug('Number of samples: %s Number of SNPs %s in gold', n_gold,
              p_gold)

     major = Major_reader(self.sample_major, self.pheno_file)
     genotype_matrix = next(major._iter_geno(n_gold))
     n, p = genotype_matrix.shape
     self.assertEqual(n_gold, n)
     self.assertEqual(p_gold, p)

     lg.debug('Gold: %s', gold_matrix[0, 0:10])
     lg.debug('Sample-Major: %s', genotype_matrix[0, 0:10])
     first_row_matches = genotype_matrix[0] == gold_matrix[0]
     self.assertEqual(np.sum(first_row_matches), p)
예제 #8
0
class Predict(object):
    """Fit a linear model on a training set and evaluate it on a dev set.

    Both sets are wrapped in ``Major_reader`` instances; the number of
    samples in each must be an exact multiple of the mini-batch size.
    """

    def __init__(self,
                 train_path: str,
                 dev_path: str,
                 pheno: str,
                 batch_size: int,
                 pheno_dev: str = None):
        """Set up the training and dev readers.

        :param train_path: path passed to ``Major_reader`` for training data
        :param dev_path: path passed to ``Major_reader`` for dev data
        :param pheno: phenotype file used for training (and for dev when
            ``pheno_dev`` is not given)
        :param batch_size: mini-batch size used for both sets
        :param pheno_dev: optional separate phenotype file for the dev set
        """
        super().__init__()
        self._plink_train_path = train_path
        self._plink_dev_path = dev_path

        self.train = Major_reader(train_path, pheno)
        # The sample count must split evenly into mini batches.
        assert (self.train.n / batch_size).is_integer()

        # Fall back to the training phenotype file for the dev set.
        dev_pheno = pheno if pheno_dev is None else pheno_dev
        self.dev = Major_reader(dev_path, dev_pheno)
        assert (self.dev.n / batch_size).is_integer()

        self.batch_size = batch_size
        self.results = None
        self.num_dev_iter = int(self.dev.n / batch_size)

        lg.info(
            'Using %s for training and %s for devop. Mini-batch size for both is set to %s',
            self.train.n, self.dev.n, batch_size)

    def fit(self,
            pheno: str,
            penal: str,
            lamb: float,
            l_rate: float,
            epochs: int = 201,
            logging_freq: int = 100,
            type: str = 'c'):
        """Train the linear model and store its output in ``self.results``.

        :param pheno: phenotype name; must be present in both readers
        :param penal: penalty argument forwarded to ``model.run``
        :param lamb: penalty strength forwarded to ``model.run``
        :param l_rate: learning rate forwarded to ``model.run``
        :param epochs: number of epochs forwarded to ``model.run``
        :param logging_freq: logging frequency forwarded to ``model.run``
        :param type: forwarded unchanged to ``pytorch_linear``
        """
        # The phenotype has to be known to the training and the dev reader.
        for dataset in (self.train, self.dev):
            assert pheno in dataset.pheno_names

        train_batches = self.train.read(pheno, self.batch_size)
        dev_batches = self.dev.read(pheno, self.batch_size)
        lg.debug('Finished setting up the iterators')

        linear_model = pytorch_linear(train_batches, dev_batches,
                                      self.train.p, self.train.n,
                                      self.num_dev_iter, self.batch_size,
                                      type)
        lg.debug('Set up linear model')

        self.results = linear_model.run(penal, lamb, epochs, l_rate,
                                        logging_freq)
        lg.debug('Model finished')
예제 #9
0
 def test_binary_genotype_overflow(self):
     """Check ``_binary_genotype`` when the bytes hold more slots than genotypes.

     Two bytes are supplied for six expected genotypes; the positions past
     the sixth are placed in ``_to_remove`` and the decoded result must
     then have exactly the six expected values.
     """
     expected_genotypes = [0, 1, 0, 2, 2, 2]
     n_expected = len(expected_genotypes)
     a = bitarray('00011011' '11110000', endian='big')
     # Ceiling division: number of 4-genotype groups needed for n_expected.
     n_groups = -(-n_expected // 4)
     surplus = n_groups * 4 - n_expected
     # Positions directly after the last expected genotype.
     to_remove = list(range(n_expected, n_expected + surplus))
     input_bytes = a.tobytes()
     lg.debug('Used bytes: %s', input_bytes)
     reader = Major_reader(self.sample_major, self.pheno_file)
     reader._to_remove = to_remove
     lg.debug('Removing the following: %s', to_remove)
     genotypes = reader._binary_genotype(input_bytes)
     self.assertEqual(len(genotypes), n_expected)
     lg.debug('outputed genotypes: %s', genotypes)
     matches = []
     for position in range(6):
         matches.append(genotypes[position] == expected_genotypes[position])
     lg.debug('Comparision result: %s', matches)
     self.assertEqual(sum(matches), 6)
예제 #10
0
    def test_one_iter(self):
        """Check ``one_iter`` on complete data and on data with a missing value.

        First part: the first ``maxiter + 1`` (genotype, phenotype) pairs
        must match the gold matrix and the ``V1`` column exactly.

        Second part: one randomly chosen phenotype among the first
        ``maxiter`` rows is replaced by NaN and written to a temporary
        phenotype file. The number of yielded genotype rows that still line
        up with the gold matrix is then expected to equal the position of
        the missing value.
        """
        import os  # local import: only needed to clean up the temp file

        out = Major_reader(self.sample_major, self.pheno_file)
        iterat = out.one_iter('V1')
        mat = np.load(self.sample_major_numpy)
        pheno = pd.read_table(self.pheno_file)
        maxiter = 20
        compare = list()

        for i, value in enumerate(iterat):
            geno, ph = value
            compare.append(np.mean(pheno.V1[i] == ph))
            compare.append(np.mean(mat[0][i] == geno.flatten()))
            if i >= maxiter:
                break
        self.assertEqual(np.sum(compare), (maxiter + 1) * 2)

        # test with missingness
        r = np.random.choice(range(maxiter), 1)
        lg.debug('Replacing position %s with nan', r[0])
        # NaN requires a float column; make that explicit instead of relying
        # on an implicit upcast during assignment.
        pheno['V1'] = pheno['V1'].astype(float)
        # Assign through the DataFrame itself. The chained form
        # ``pheno.V1.iloc[r] = ...`` may write to a temporary Series and
        # leave the frame unchanged (always the case under Copy-on-Write).
        pheno.iloc[r, pheno.columns.get_loc('V1')] = np.nan
        path_to_missing_file = '.pheno_with_missing.csv'
        pheno.to_csv(path_to_missing_file, index=False, sep='\t')
        # Do not leave the generated file behind in the working directory.
        self.addCleanup(os.remove, path_to_missing_file)
        out = Major_reader(self.sample_major, path_to_missing_file)

        iterat = out.one_iter('V1')
        compare = list()
        for i, value in enumerate(iterat):
            geno, ph = value
            if np.isnan(ph):
                lg.debug('ph %s', ph)
            # 1 when the whole row matches the gold row at the same index,
            # 0 otherwise.
            compare.append(np.mean(1 == np.mean(mat[0][i] == geno.flatten())))
            if i >= maxiter:
                break
        # NOTE(review): this presumably relies on one_iter skipping the
        # sample with the missing phenotype, so rows after it no longer
        # line up with the gold matrix -- confirm against one_iter.
        expected = r[0]
        self.assertEqual(expected, np.sum(compare))
예제 #11
0
    def test_continious_geno_read(self):
        """Check that ``_iter_geno`` keeps yielding batches past the data end.

        Reads ``nn // batch_size + 1`` batches in total and verifies that
        row 4 of the last batch equals the very first row that was read.
        """
        fam = pd.read_table(self.pheno_file)
        batch_size = 100
        nn = fam.shape[0]
        out = Major_reader(self.sample_major, self.pheno_file)
        # Use the same batch size that the arithmetic below is based on.
        reader = out._iter_geno(batch_size)
        first = next(reader)

        to_end = nn // batch_size - 1
        # Rows of the final batch that lie beyond the end of the data,
        # assuming the reader continues from the start again (which is what
        # the assertion at the end relies on).
        overlap = batch_size * (nn // batch_size + 1) - nn
        lg.debug('estimated overlap is %s', overlap)
        lg.debug('steps to end: %s', to_end)
        for i in range(to_end):
            lg.debug(i)
            batch = next(reader)
        # One more batch: the one that crosses the end of the data.
        batch = next(reader)
        lg.debug('shape of first is %s', first.shape)
        lg.debug('shape of last is %s', batch.shape)
        # NOTE(review): the index 4 is hard-coded; presumably it equals
        # nn % batch_size for the fixture in use -- confirm before reusing
        # this test with a different data set.
        compare = batch[4] == first[0]
        lg.debug(compare)
        self.assertEqual(out.p, np.sum(compare))
예제 #12
0
    def test_shuffle(self):
        """Check that genotype and phenotype iterators honour a custom order.

        A random permutation of all sample indices is passed to
        ``_one_iter_geno`` and ``_one_iter_pheno``; every yielded pair must
        equal the gold genotype row and the ``V1`` value at that index.
        """
        out = Major_reader(self.sample_major, self.pheno_file)
        mat = np.load(self.sample_major_numpy)[0]
        n, _ = mat.shape
        pheno = pd.read_table(self.pheno_file).V1.values
        ids = np.arange(0, n, dtype=int)
        np.random.shuffle(ids)
        geno_iter = out._one_iter_geno(ids)
        pheno_iter = out._one_iter_pheno('V1', ids)

        compare = []
        for idx, geno_row, pheno_value in zip(ids, geno_iter, pheno_iter):
            geno_ok = np.equal(geno_row, mat[idx, :]).all()
            pheno_ok = np.equal(pheno_value, pheno[idx]).all()
            lg.debug('Index: %s: Geno: %s Pheno: %s', idx, geno_ok, pheno_ok)
            # A sample counts only when both genotype and phenotype agree.
            compare.append(bool(geno_ok and pheno_ok))
        self.assertEqual(np.sum(compare), len(ids))
예제 #13
0
    def __init__(self,
                 train_path: str,
                 dev_path: str,
                 pheno: str,
                 batch_size: int,
                 pheno_dev: str = None):
        """Set up the training and dev readers.

        :param train_path: path passed to ``Major_reader`` for training data
        :param dev_path: path passed to ``Major_reader`` for dev data
        :param pheno: phenotype file used for training (and for dev when
            ``pheno_dev`` is not given)
        :param batch_size: mini-batch size used for both sets
        :param pheno_dev: optional separate phenotype file for the dev set
        """
        super(Predict, self).__init__()
        self._plink_train_path = train_path
        self._plink_dev_path = dev_path

        self.train = Major_reader(train_path, pheno)
        # The sample count must split evenly into mini batches.
        assert (self.train.n / batch_size).is_integer()

        # Fall back to the training phenotype file for the dev set.
        dev_pheno = pheno if pheno_dev is None else pheno_dev
        self.dev = Major_reader(dev_path, dev_pheno)
        assert (self.dev.n / batch_size).is_integer()

        self.batch_size = batch_size
        self.results = None
        self.num_dev_iter = int(self.dev.n / batch_size)

        lg.info(
            'Using %s for training and %s for devop. Mini-batch size for both is set to %s',
            self.train.n, self.dev.n, batch_size)
예제 #14
0
 def test_check_magic_number(self):
     """Check how ``Major_reader`` reacts to the two fixture files.

     Constructing a reader from ``self.plink_file`` must raise
     ``ValueError``; constructing one from ``self.sample_major`` must
     succeed and report ``_is_sample_major`` as truthy.
     """
     self.assertRaises(ValueError, Major_reader, self.plink_file,
                       self.pheno_file)
     reader = Major_reader(self.sample_major, self.pheno_file)
     self.assertTrue(reader._is_sample_major)