def filter_cbv_data(cbv_file, config_file=combivep_settings.COMBIVEP_CONFIGURATION_FILE): (dir_name, file_name) = os.path.split(cbv_file) print print "> > > " + file_name dm = DataSetManager() dm.load_data(cbv_file, file_type=combivep_settings.FILE_TYPE_CBV) print "%-25s: %5d\n" % ("Original", len(dm.dataset)) dm.validate_data() f_clean = open(cbv_file + '.clean', 'w') for item in dm.dataset: f_clean.write("%s\t%s\t%s\t%s\t%s\n" % (item[combivep_settings.KEY_SNP_INFO_SECTION][combivep_settings.KEY_CHROM], item[combivep_settings.KEY_SNP_INFO_SECTION][combivep_settings.KEY_POS], item[combivep_settings.KEY_SNP_INFO_SECTION][combivep_settings.KEY_REF], item[combivep_settings.KEY_SNP_INFO_SECTION][combivep_settings.KEY_ALT], item[combivep_settings.KEY_PREDICTION_SECTION][combivep_settings.KEY_TARGETS], ) ) f_clean.close() print "%-25s: %5d" % ("Clean pathogenic", len([item for item in dm.dataset if item[combivep_settings.KEY_PREDICTION_SECTION][combivep_settings.KEY_TARGETS] == '1'])) print "%-25s: %5d" % ("Clean neutral", len([item for item in dm.dataset if item[combivep_settings.KEY_PREDICTION_SECTION][combivep_settings.KEY_TARGETS] == '0'])) print "%-25s: %5d\n" % ("Total", len(dm.dataset)) dm.calculate_scores() f_scores = open(cbv_file + '.scores', 'w') for item in dm.dataset: f_scores.write("%s\t%s\t%s\t%s\t%s\n" % (item[combivep_settings.KEY_SNP_INFO_SECTION][combivep_settings.KEY_CHROM], item[combivep_settings.KEY_SNP_INFO_SECTION][combivep_settings.KEY_POS], item[combivep_settings.KEY_SNP_INFO_SECTION][combivep_settings.KEY_REF], item[combivep_settings.KEY_SNP_INFO_SECTION][combivep_settings.KEY_ALT], item[combivep_settings.KEY_PREDICTION_SECTION][combivep_settings.KEY_TARGETS], ) ) f_scores.close() print "%-25s: %5d" % ("Scored pathogenic", len([item for item in dm.dataset if item[combivep_settings.KEY_PREDICTION_SECTION][combivep_settings.KEY_TARGETS] == '1'])) print "%-25s: %5d" % ("Scored neutral", len([item for item in dm.dataset if 
item[combivep_settings.KEY_PREDICTION_SECTION][combivep_settings.KEY_TARGETS] == '0'])) print "%-25s: %5d\n" % ("Total", len(dm.dataset)) dm.set_shuffle_seed(combivep_settings.DEMO_SEED) dm.shuffle_data() dm.partition_data() #partition data training_dataset = dm.get_training_data() validation_dataset = dm.get_validation_data() print "%-25s: %5d" % ("Training pathogenic", len([item for item in training_dataset if item[combivep_settings.KEY_PREDICTION_SECTION][combivep_settings.KEY_TARGETS] == '1'])) print "%-25s: %5d" % ("Training neutral", len([item for item in training_dataset if item[combivep_settings.KEY_PREDICTION_SECTION][combivep_settings.KEY_TARGETS] == '0'])) print "%-25s: %5d\n" % ("Total", len(training_dataset)) print "%-25s: %5d" % ("Validation pathogenic", len([item for item in validation_dataset if item[combivep_settings.KEY_PREDICTION_SECTION][combivep_settings.KEY_TARGETS] == '1'])) print "%-25s: %5d" % ("Validation neutral", len([item for item in validation_dataset if item[combivep_settings.KEY_PREDICTION_SECTION][combivep_settings.KEY_TARGETS] == '0'])) print "%-25s: %5d\n" % ("Total", len(validation_dataset))
def test_add_dataset(self):
    """Two independently pre-processed training sets must concatenate."""
    self.init_test('test_add_dataset')

    def load_training_data(file_name):
        # Run the full pre-processing pipeline on one CBV fixture and
        # return its training partition.
        dm = DataSetManager(cfg_file=cbv_const.CBV_SAMPLE_CFG_FILE)
        dm.load_data(os.path.join(self.data_dir, file_name),
                     file_type=cbv_const.FILE_TYPE_CBV)
        dm.validate_data()
        dm.calculate_scores()
        dm.shuffle_data()
        dm.partition_data()
        return dm.get_training_data()

    combine_dataset = (load_training_data('test_add_dataset1.cbv')
                       + load_training_data('test_add_dataset2.cbv'))
    self.assertEqual(len(combine_dataset),
                     10,
                     'DataSetManager does not load VCF data correctly')
def train_combivep_using_cbv_data(training_data_file,
                                  params_out_file=cbv_const.USER_PARAMS_FILE,
                                  random_seed=cbv_const.DFLT_SEED,
                                  n_hidden_nodes=cbv_const.DFLT_HIDDEN_NODES,
                                  figure_dir=cbv_const.DFLT_FIGURE_DIR,
                                  iterations=cbv_const.DFLT_ITERATIONS,
                                  cfg_file=cbv_const.CBV_CFG_FILE,
                                  ):
    """Pre-process a CBV training file and train the CombiVEP network.

    CBV (CombiVEP format) is a parsed format intended to be used by
    CombiVEP.  CBV has 5 tab-separated fields:
    - CHROM
    - POS
    - REF
    - ALT
    - EFFECT(1=deleterious, 0=neutral)

    Required argument
    - training_data_file : list of SNPs with their effect labels, CBV format

    Optional arguments
    - params_out_file : where the best trained parameters are exported
    - random_seed     : seed used for both shuffling and the trainer
    - n_hidden_nodes  : hidden-layer size of the network
    - figure_dir      : directory for training figures
    - iterations      : number of training iterations
    - cfg_file        : CombiVEP configuration file
    """
    #pre-processing dataset
    print >> sys.stderr, 'pre-processing dataset, this may take a while (around 750 SNPs/mins). . .'
    dm = DataSetManager(cfg_file=cfg_file)
    dm.load_data(training_data_file, file_type=cbv_const.FILE_TYPE_CBV)
    dm.validate_data()
    dm.calculate_scores()
    dm.set_shuffle_seed(random_seed)
    dm.shuffle_data()
    dm.partition_data()

    #partition data
    training_data = dm.get_training_data()
    validation_data = dm.get_validation_data()

    #train !!!
    print >> sys.stderr, 'Training CombiVEP, please wait (around 500 SNPs/mins) . . .'
    trainer = Trainer(training_data,
                      validation_data,
                      random_seed,
                      n_hidden_nodes,
                      figure_dir)
    trainer.train(iterations)
    # make sure the output directory exists before exporting parameters
    if not os.path.exists(cbv_const.USER_PARAMS_DIR):
        os.makedirs(cbv_const.USER_PARAMS_DIR)
    trainer.export_best_parameters(params_out_file)
def filter_cbv_data(cbv_file, cfg_file=cbv_const.CBV_CFG_FILE): (dir_name, file_name) = os.path.split(cbv_file) report_fmt = "{caption:<25}:{value:>6d}" clean_out_fmt = "{chrom}\t{pos}\t{ref}\t{alt}\t{target}\n" scores_out_fmt = "{chrom}\t{pos}\t{ref}\t{alt}\t{target}\t" scores_out_fmt += "{phylop_score}\t{sift_score}\t{pp2_score}\t" scores_out_fmt += "{lrt_score}\t{mt_score}\t{gerp_score}\n" print print "> > > " + file_name dm = DataSetManager() dm.load_data(cbv_file, file_type=cbv_const.FILE_TYPE_CBV) print report_fmt.format(caption="original", value=len(dm.dataset)) dm.validate_data() f_clean = open(cbv_file + '.clean', 'w') for item in dm.dataset: snp_data = item[cbv_const.KW_SNP_DATA] f_clean.write(clean_out_fmt.format(chrom=snp_data.chrom, pos=snp_data.pos, ref=snp_data.ref, alt=snp_data.alt, target=snp_data.target, )) f_clean.close() print report_fmt.format(caption="Clean pathogenic", value=len([item for item in dm.dataset if item[cbv_const.KW_SNP_DATA].target == '1'])) print report_fmt.format(caption="Clean neutral", value=len([item for item in dm.dataset if item[cbv_const.KW_SNP_DATA].target == '0'])) print report_fmt.format(caption="Total", value=len(dm.dataset)) dm.calculate_scores() f_scores = open(cbv_file + '.scores', 'w') for item in dm.dataset: snp_data = item[cbv_const.KW_SNP_DATA] scores = item[cbv_const.KW_SCORES] f_scores.write(scores_out_fmt.format(chrom=snp_data.chrom, pos=snp_data.pos, ref=snp_data.ref, alt=snp_data.alt, target=snp_data.target, phylop_score=scores.phylop_score, sift_score=scores.sift_score, pp2_score=scores.pp2_score, lrt_score=scores.lrt_score, mt_score=scores.mt_score, gerp_score=scores.gerp_score )) f_scores.close() print report_fmt.format(caption="Scored pathogenic", value=len([item for item in dm.dataset if item[cbv_const.KW_SNP_DATA].target == '1'])) print report_fmt.format(caption="Scored neutral", value=len([item for item in dm.dataset if item[cbv_const.KW_SNP_DATA].target == '0'])) print 
report_fmt.format(caption="Total", value=len(dm.dataset)) dm.set_shuffle_seed(cbv_const.DEMO_SEED) dm.shuffle_data() dm.partition_data() #partition data training_dataset = dm.get_training_data() validation_dataset = dm.get_validation_data() print report_fmt.format(caption="Training pathogenic", value=len([item for item in training_dataset if item[cbv_const.KW_SNP_DATA].target == '1'])) print report_fmt.format(caption="Training neutral", value=len([item for item in training_dataset if item[cbv_const.KW_SNP_DATA].target == '0'])) print report_fmt.format(caption="Total", value=len(training_dataset)) print report_fmt.format(caption="Validation pathogenic", value=len([item for item in validation_dataset if item[cbv_const.KW_SNP_DATA].target == '1'])) print report_fmt.format(caption="Validation neutral", value=len([item for item in validation_dataset if item[cbv_const.KW_SNP_DATA].target == '0'])) print report_fmt.format(caption="Total", value=len(validation_dataset))
class TestDataSetManager(SafePreProcTester):
    """Unit tests for the DataSetManager pre-processing pipeline."""

    def __init__(self, test_name):
        SafePreProcTester.__init__(self, test_name)

    def setUp(self):
        self.test_class = 'dataset_manager'

    def init_dataset_instance(self):
        # fresh manager bound to the central test configuration file
        self.__dataset_manager = DataSetManager(config_file=combivep_settings.COMBIVEP_CENTRAL_TEST_CONFIGURATION_FILE)

    def test_vcf_load(self):
        self.init_test('test_vcf_load')
        self.init_dataset_instance()
        test_file = os.path.join(self.data_dir, 'test_vcf_load.vcf')
        self.__dataset_manager.load_data(test_file)
        self.assertEqual(len(self.__dataset_manager.dataset),
                         10,
                         'DataSetManager does not load VCF data correctly')

    def test_cbv_load(self):
        self.init_test('test_cbv_load')
        self.init_dataset_instance()
        test_file = os.path.join(self.data_dir, 'test_cbv_load.cbv')
        self.__dataset_manager.load_data(test_file,
                                         file_type=combivep_settings.FILE_TYPE_CBV)
        self.assertEqual(len(self.__dataset_manager.dataset),
                         11,
                         'DataSetManager does not load CBV data correctly')

    def test_validate_data(self):
        self.init_test('test_validate_data')
        self.init_dataset_instance()
        test_file = os.path.join(self.data_dir, 'test_vcf_load.vcf')
        self.__dataset_manager.load_data(test_file)
        self.__dataset_manager.validate_data()
        self.assertEqual(len(self.__dataset_manager.dataset),
                         7,
                         'DataSetManager does not clean data correctly')

    def test_calculate_scores(self):
        self.init_test('test_calculate_scores')
        self.init_dataset_instance()
        test_file = os.path.join(self.data_dir, 'test_calculate_scores.vcf')
        self.__dataset_manager.load_data(test_file)
        self.__dataset_manager.validate_data()
        self.__dataset_manager.calculate_scores()
        self.assertEqual(len(self.__dataset_manager.dataset),
                         3,
                         'DataSetManager does not calculate scores properly')

    def test_shuffle_data(self):
        self.init_test('test_shuffle_data')
        self.init_dataset_instance()
        test_file = os.path.join(self.data_dir, 'test_shuffle.vcf')
        self.__dataset_manager.load_data(test_file)
        self.__dataset_manager.validate_data()
        self.__dataset_manager.calculate_scores()
        # before shuffling, item 5 is at a known position
        self.assertEqual(self.__dataset_manager.dataset[5][combivep_settings.KEY_SNP_INFO_SECTION][combivep_settings.KEY_POS],
                         '190999917',
                         'DataSetManager does not calculate scores properly')
        self.__dataset_manager.set_shuffle_seed(combivep_settings.DEMO_SEED)
        self.__dataset_manager.shuffle_data()
        # after shuffling with the demo seed, that item should have moved
        self.assertNotEqual(self.__dataset_manager.dataset[5][combivep_settings.KEY_SNP_INFO_SECTION][combivep_settings.KEY_POS],
                            '190999917',
                            'DataSetManager may not shuffle data correctly')

    def test_partition_data(self):
        self.init_test('test_partition_data')
        self.init_dataset_instance()
        test_file = os.path.join(self.data_dir, 'test_partition.cbv')
        self.__dataset_manager.load_data(test_file,
                                         file_type=combivep_settings.FILE_TYPE_CBV)
        self.__dataset_manager.validate_data()
        self.__dataset_manager.calculate_scores()
        self.__dataset_manager.set_shuffle_seed(combivep_settings.DEMO_SEED)
        self.__dataset_manager.shuffle_data()
        self.__dataset_manager.partition_data()
        self.assertEqual(len(self.__dataset_manager.get_training_data()),
                         11,
                         'DataSetManager does not correctly partition data')
        self.assertEqual(len(self.__dataset_manager.get_validation_data()),
                         3,
                         'DataSetManager does not correctly partition data')

    def test_vcf_dataset(self):
        self.init_test('test_dataset')
        self.init_dataset_instance()
        test_file = os.path.join(self.data_dir, 'test_shuffle.vcf')
        self.__dataset_manager.load_data(test_file)
        self.__dataset_manager.validate_data()
        self.__dataset_manager.calculate_scores()
        self.__dataset_manager.set_shuffle_seed(combivep_settings.DEMO_SEED)
        self.__dataset_manager.shuffle_data()
        self.__dataset_manager.partition_data()
        training_dataset = self.__dataset_manager.get_training_data()
        self.assertEqual(training_dataset.n_features,
                         6,
                         msg='Dataset does not functional properly')
        self.assertEqual(training_dataset.n_data,
                         8,
                         msg='Dataset does not functional properly')

    def test_cbv_dataset(self):
        self.init_test('test_dataset')
        self.init_dataset_instance()
        test_file = os.path.join(self.data_dir, 'test_cbv_dataset.cbv')
        self.__dataset_manager.load_data(test_file,
                                         file_type=combivep_settings.FILE_TYPE_CBV)
        # BUG FIX: was 'self.__dataset_manager.validate_data' without
        # parentheses, so validation was silently never executed
        self.__dataset_manager.validate_data()
        self.__dataset_manager.calculate_scores()
        self.__dataset_manager.set_shuffle_seed(combivep_settings.DEMO_SEED)
        self.__dataset_manager.shuffle_data()
        self.__dataset_manager.partition_data()
        training_dataset = self.__dataset_manager.get_training_data()
        self.assertEqual(training_dataset.n_features,
                         6,
                         msg='Dataset does not functional properly')
        self.assertEqual(training_dataset.n_data,
                         15,
                         msg='Dataset does not functional properly')

    def test_add_dataset(self):
        self.init_test('test_add_dataset')
        test_file = os.path.join(self.data_dir, 'test_add_dataset1.cbv')
        self.dataset_manager1 = DataSetManager(config_file=combivep_settings.COMBIVEP_CENTRAL_TEST_CONFIGURATION_FILE)
        self.dataset_manager1.load_data(test_file,
                                        file_type=combivep_settings.FILE_TYPE_CBV)
        self.dataset_manager1.validate_data()
        self.dataset_manager1.calculate_scores()
        self.dataset_manager1.shuffle_data()
        self.dataset_manager1.partition_data()
        training_dataset1 = self.dataset_manager1.get_training_data()
        test_file = os.path.join(self.data_dir, 'test_add_dataset2.cbv')
        self.dataset_manager2 = DataSetManager(config_file=combivep_settings.COMBIVEP_CENTRAL_TEST_CONFIGURATION_FILE)
        self.dataset_manager2.load_data(test_file,
                                        file_type=combivep_settings.FILE_TYPE_CBV)
        self.dataset_manager2.validate_data()
        self.dataset_manager2.calculate_scores()
        self.dataset_manager2.shuffle_data()
        self.dataset_manager2.partition_data()
        training_dataset2 = self.dataset_manager2.get_training_data()
        combine_dataset = training_dataset1 + training_dataset2
        self.assertEqual(len(combine_dataset),
                         10,
                         'DataSetManager does not load VCF data correctly')

    def tearDown(self):
        self.remove_working_dir()