Пример #1
0
    def __init__(self, config_file=combivep_settings.COMBIVEP_CONFIGURATION_FILE):
        """Set up the manager with a configured Referer and a new DataSet.

        config_file is assigned to the Referer's ``config_file`` attribute
        before ``load_config()`` is called on it.
        """
        CombiVEPBase.__init__(self)

        # Configure the referer completely before publishing it on self.
        referer = Referer()
        referer.config_file = config_file
        referer.load_config()
        self.referer = referer
        self.dataset = DataSet()
Пример #2
0
    def __init__(self, cfg_file=cbv_const.CBV_CFG_FILE):
        """Set up the manager with a configured Referer and a new DataSet.

        cfg_file is assigned to the Referer's ``cfg_file`` attribute before
        ``load_cfg()`` is called on it.
        """
        CombiVEPBase.__init__(self)

        # Configure the referer completely before publishing it on self.
        referer = Referer()
        referer.cfg_file = cfg_file
        referer.load_cfg()
        self.referer = referer
        self.dataset = DataSet()
class TestReferer(SafePreProcTester):
    """Test cases for Referer.validate_snp() and Referer.get_scores().

    Both tests configure the Referer from
    combivep_settings.COMBIVEP_CENTRAL_TEST_CONFIGURATION_FILE.
    """

    def __init__(self, test_name):
        SafePreProcTester.__init__(self, test_name)

    def setUp(self):
        self.test_class = 'referer'

    def init_referer_instance(self):
        """Create a fresh Referer for the running test."""
        self.__referer = Referer()

    def test_validate_snp(self):
        """Check that validate_snp() accepts five SNPs and rejects one."""
        self.init_test('test_validate_snp')
        self.init_referer_instance()
        referer = self.__referer
        referer.config_file = combivep_settings.COMBIVEP_CENTRAL_TEST_CONFIGURATION_FILE
        referer.load_config()

        failure_msg = "Incorrect SNP validating"
        # (chrom, pos, ref, alt) tuples that must validate; chromosome names
        # are given both with and without the 'chr' prefix.
        expected_valid = (
            ('1', 887560, 'A', 'C'),
            ('chr3', 25836088, 'C', 'A'),
            ('20', 17474690, 'T', 'G'),
            ('chrX', 56296488, 'G', 'C'),
            ('Y', 15581983, 'G', 'A'),
        )
        for chrom, pos, ref, alt in expected_valid:
            self.assertTrue(referer.validate_snp(chrom, pos, ref, alt), failure_msg)
        # This one must be rejected.
        self.assertFalse(referer.validate_snp('chr16', 21086416, 'T', 'A'), failure_msg)

    def test_get_scores(self):
        """Check the score fields returned by get_scores() for one SNP."""
        self.init_test('test_get_scores')
        self.init_referer_instance()
        referer = self.__referer
        referer.config_file = combivep_settings.COMBIVEP_CENTRAL_TEST_CONFIGURATION_FILE
        referer.load_config()

        scores = referer.get_scores('3', 108541778, 'T', 'C')
        failure_msg = "Incorrect LJB formatting"
        # NOTE: assertions on the SNP-info fields of the record (chrom, pos,
        # ref, alt) were disabled in this test; only the scores are checked.
        expected_scores = (
            (combivep_settings.KEY_PHYLOP_SCORE, '0.102322'),
            (combivep_settings.KEY_SIFT_SCORE, '0.91'),
            (combivep_settings.KEY_PP2_SCORE, '0'),
            (combivep_settings.KEY_LRT_SCORE, '0.312516'),
            (combivep_settings.KEY_MT_SCORE, '0.000000'),
            (combivep_settings.KEY_GERP_SCORE, '-3.16'),
        )
        for score_key, expected_value in expected_scores:
            self.assertEqual(scores[score_key], expected_value, failure_msg)

    def tearDown(self):
        self.remove_working_dir()
 def init_referer_instance(self):
     """Create a new Referer and store it on the instance as ``__referer``."""
     self.__referer = Referer()
Пример #5
0
class DataSetManager(CombiVEPBase):
    """Load SNP records into a DataSet, filter them and split them.

    Each record appended to ``self.dataset`` is a dictionary with a SNP-info
    section (chrom, pos, ref, alt) and a prediction section (targets);
    calculate_scores() adds a scores section to every record.
    """

    def __init__(self, config_file=combivep_settings.COMBIVEP_CONFIGURATION_FILE):
        """Create a Referer configured from config_file and a new DataSet."""
        CombiVEPBase.__init__(self)

        referer = Referer()
        referer.config_file = config_file
        referer.load_config()
        self.referer = referer
        self.dataset = DataSet()

    def __clear_data(self):
        self.dataset.clear()

    def load_data(self, file_name, file_type=combivep_settings.FILE_TYPE_VCF):
        """Replace the dataset content with the SNPs read from file_name.

        file_type selects the reader (VCF or CBV). Any other value leaves
        the dataset untouched and returns None.
        """
        if file_type == combivep_settings.FILE_TYPE_VCF:
            loader = self.__load_vcf_data
        elif file_type == combivep_settings.FILE_TYPE_CBV:
            loader = self.__load_cbv_data
        else:
            return None
        return loader(file_name)

    def __load_vcf_data(self, file_name):
        """Load SNPs from a VCF file; targets are set to None."""
        info_key = combivep_settings.KEY_SNP_INFO_SECTION
        prediction_key = combivep_settings.KEY_PREDICTION_SECTION

        self.__clear_data()
        reader = VcfReader()
        reader.read(file_name)
        for record in reader.fetch_hash_snps():
            snp_info = {
                combivep_settings.KEY_CHROM: record[info_key][combivep_settings.KEY_VCF_CHROM],
                combivep_settings.KEY_POS: record[info_key][combivep_settings.KEY_VCF_POS],
                combivep_settings.KEY_REF: record[info_key][combivep_settings.KEY_VCF_REF],
                combivep_settings.KEY_ALT: record[info_key][combivep_settings.KEY_VCF_ALT],
            }
            entry = {
                info_key: snp_info,
                prediction_key: {combivep_settings.KEY_TARGETS: None},
            }
            self.dataset.append(entry)

    def __load_cbv_data(self, file_name):
        """Load SNPs and their targets from a CBV file."""
        info_key = combivep_settings.KEY_SNP_INFO_SECTION
        prediction_key = combivep_settings.KEY_PREDICTION_SECTION

        self.__clear_data()
        reader = CbvReader()
        reader.read(file_name)
        for record in reader.fetch_hash_snps():
            snp_info = {
                combivep_settings.KEY_CHROM: record[info_key][combivep_settings.KEY_CBV_CHROM],
                combivep_settings.KEY_POS: record[info_key][combivep_settings.KEY_CBV_POS],
                combivep_settings.KEY_REF: record[info_key][combivep_settings.KEY_CBV_REF],
                combivep_settings.KEY_ALT: record[info_key][combivep_settings.KEY_CBV_ALT],
            }
            targets = record[prediction_key][combivep_settings.KEY_CBV_TARGETS]
            entry = {
                info_key: snp_info,
                prediction_key: {combivep_settings.KEY_TARGETS: targets},
            }
            self.dataset.append(entry)

    def validate_data(self):
        """Keep only the records accepted by ``self.referer.validate_snp()``.

        This prevents misinterpretation caused by version differences
        between data points: items that do not exist in the UCSC database
        are removed from self.dataset.
        """
        info_key = combivep_settings.KEY_SNP_INFO_SECTION
        accepted = []
        for entry in self.dataset:
            is_valid = self.referer.validate_snp(entry[info_key][combivep_settings.KEY_CHROM],
                                                 entry[info_key][combivep_settings.KEY_POS],
                                                 entry[info_key][combivep_settings.KEY_REF],
                                                 entry[info_key][combivep_settings.KEY_ALT])
            if is_valid:
                accepted.append(entry)
        # Slice assignment filters the existing dataset object in place.
        self.dataset[:] = accepted

    def calculate_scores(self):
        """Attach scores to every record and drop records without scores."""
        info_key = combivep_settings.KEY_SNP_INFO_SECTION
        scores_key = combivep_settings.KEY_SCORES_SECTION

        # Get scores from the LJB database.
        for entry in self.dataset:
            entry[scores_key] = self.referer.get_scores(entry[info_key][combivep_settings.KEY_CHROM],
                                                        entry[info_key][combivep_settings.KEY_POS],
                                                        entry[info_key][combivep_settings.KEY_REF],
                                                        entry[info_key][combivep_settings.KEY_ALT])

        # Remove the records for which no scores were returned.
        scored = []
        for entry in self.dataset:
            if entry[combivep_settings.KEY_SCORES_SECTION] is not None:
                scored.append(entry)
        self.dataset[:] = scored

    def partition_data(self,
                       proportion_training_data=combivep_settings.PROPORTION_TRAINING_DATA,
                       proportion_validation_data=combivep_settings.PROPORTION_VALIDATION_DATA,
                       ):
        """Compute training/validation sizes from the current dataset length.

        The training size is the floor of the training share of the
        dataset; the validation size is whatever remains. Both are stored
        as ``training_data_size`` and ``validation_data_size``.
        """
        total_proportion = proportion_training_data + proportion_validation_data
        training_share = len(self.dataset) * proportion_training_data / total_proportion
        self.training_data_size = int(math.floor(training_share))
        self.validation_data_size = len(self.dataset) - self.training_data_size

    def get_training_data(self):
        """Return a new DataSet with the first ``training_data_size`` records."""
        training = DataSet()
        for index in xrange(self.training_data_size):
            training.append(self.dataset[index])
        return training

    def get_validation_data(self):
        """Return a new DataSet with the records after the training part."""
        validation = DataSet()
        for index in xrange(self.training_data_size, len(self.dataset)):
            validation.append(self.dataset[index])
        return validation

    def set_shuffle_seed(self, shuffle_seed):
        """Forward shuffle_seed to the underlying dataset."""
        self.dataset.set_shuffle_seed(shuffle_seed)

    def shuffle_data(self):
        """Shuffle the underlying dataset."""
        self.dataset.shuffle()
Пример #6
0
class DataSetManager(CombiVEPBase):
    """Load SNP records into a DataSet, filter them and split them.

    Each record in ``self.dataset`` is a dictionary holding a SnpDataRecord
    under cbv_const.KW_SNP_DATA; calculate_scores() adds the scores under
    cbv_const.KW_SCORES.
    """

    def __init__(self, cfg_file=cbv_const.CBV_CFG_FILE):
        """Create a Referer configured from cfg_file and a new DataSet."""
        CombiVEPBase.__init__(self)

        self.referer = Referer()
        self.referer.cfg_file = cfg_file
        self.referer.load_cfg()
        self.dataset = DataSet()
        # Start with the default split so that n_training_data and
        # n_validation_data are usable even if partition_data() has not
        # been called yet (they used to raise AttributeError).
        self.__prop_training = cbv_const.PROPORTION_TRAINING_DATA
        self.__prop_validation = cbv_const.PROPORTION_VALIDATION_DATA

    def clear_data(self):
        """Remove every record from the dataset."""
        self.dataset.clear()

    def load_data(self, file_name, file_type=cbv_const.FILE_TYPE_VCF):
        """Replace the dataset content with the SNPs read from file_name.

        file_type selects the reader (VCF or CBV). Any other value leaves
        the dataset untouched and returns None.
        """
        if file_type == cbv_const.FILE_TYPE_VCF:
            return self.__load_vcf_data(file_name)
        if file_type == cbv_const.FILE_TYPE_CBV:
            return self.__load_cbv_data(file_name)

    def __load_vcf_data(self, file_name):
        """Clear the dataset and fill it from a VCF file."""
        self.clear_data()
        vcf_reader = VcfReader()
        vcf_reader.read(file_name)
        for rec in vcf_reader.fetch_snps():
            snp_data = SnpDataRecord(rec)
            self.dataset.append({cbv_const.KW_SNP_DATA: snp_data})

    def __load_cbv_data(self, file_name):
        """Clear the dataset and fill it from a CBV file."""
        self.clear_data()
        cbv_reader = CbvReader()
        cbv_reader.read(file_name)
        for rec in cbv_reader.fetch_snps():
            snp_data = SnpDataRecord(rec)
            self.dataset.append({cbv_const.KW_SNP_DATA: snp_data})

    def validate_data(self):
        """Keep only the records accepted by ``self.referer.validate_snp()``.

        This prevents misinterpretation caused by version differences
        between data points: items that do not exist in the UCSC database
        are removed from self.dataset.
        """
        valid_items = []
        for item in self.dataset:
            snp_data = item[cbv_const.KW_SNP_DATA]
            if self.referer.validate_snp(snp_data.chrom,
                                         snp_data.pos,
                                         snp_data.ref,
                                         snp_data.alt,
                                         ):
                valid_items.append(item)
        # Filter in place (same approach as calculate_scores) instead of
        # rebinding self.dataset to a brand-new DataSet, so the existing
        # object is kept.
        # NOTE(review): this also keeps anything configured on the DataSet
        # instance, e.g. via set_shuffle_seed() - confirm DataSet stores
        # the seed on the instance.
        self.dataset[:] = valid_items

    def calculate_scores(self):
        """Attach scores to every record and drop records without scores."""
        # Get scores from the LJB database.
        for item in self.dataset:
            snp_data = item[cbv_const.KW_SNP_DATA]
            scores = self.referer.get_scores(snp_data.chrom,
                                             snp_data.pos,
                                             snp_data.ref,
                                             snp_data.alt,
                                             )
            item[cbv_const.KW_SCORES] = scores
        # Remove items from self.dataset if they don't have scores.
        self.dataset[:] = [item for item in self.dataset
                           if item[cbv_const.KW_SCORES] is not None]

    def partition_data(self,
                       prop_training=cbv_const.PROPORTION_TRAINING_DATA,
                       prop_validation=cbv_const.PROPORTION_VALIDATION_DATA,
                       ):
        """Record the training/validation proportions used for the split.

        Only the proportions are stored; the actual sizes are derived from
        the current dataset length each time they are requested.
        """
        self.__prop_training = prop_training
        self.__prop_validation = prop_validation

    @property
    def n_training_data(self):
        """Number of records in the training part (floor of its share)."""
        # float() forces true division under Python 2 for int proportions.
        total_prop = float(self.__prop_training + self.__prop_validation)
        ratio_training = self.__prop_training / total_prop
        return int(math.floor(len(self.dataset) * ratio_training))

    @property
    def n_validation_data(self):
        """Number of records left after the training part."""
        return len(self.dataset) - self.n_training_data

    def get_training_data(self):
        """Return a new DataSet with the first ``n_training_data`` records."""
        dataset = DataSet()
        for i in xrange(0, self.n_training_data):
            dataset.append(self.dataset[i])
        return dataset

    def get_validation_data(self):
        """Return a new DataSet with the records after the training part."""
        dataset = DataSet()
        for i in xrange(self.n_training_data, len(self.dataset)):
            dataset.append(self.dataset[i])
        return dataset

    def set_shuffle_seed(self, shuffle_seed):
        """Forward shuffle_seed to the underlying dataset."""
        self.dataset.set_shuffle_seed(shuffle_seed)

    def shuffle_data(self):
        """Shuffle the underlying dataset."""
        self.dataset.shuffle()