def get_bx_ids():
    """Collect the BX_IDs found in the ready-input report files, by source.

    Every file name returned by ``get_reports_files(ARGS.ready_input_dir)``
    is read. Files whose path ends in ``.tsv`` are parsed as tab-delimited
    text and filed under the source name ``"ENIGMA"``; every other file is
    parsed with ``vcf.Reader`` and filed under its file name minus the
    trailing ``len('.vcf')`` characters.

    Returns:
        dict mapping source name -> list of int BX_IDs, in file order.

    Raises:
        ValueError: if a BX_ID in a ``.tsv`` file is not an integer. A
            ValueError raised while reading a VCF file is printed instead,
            and reading of that file stops there.
    """
    bx_ids = {}
    for file_name in get_reports_files(ARGS.ready_input_dir):
        file_path = abspath(ARGS.ready_input_dir + file_name)
        if file_path.endswith('.tsv'):
            source = "ENIGMA"
            # NOTE(review): the list is reset for every .tsv file, so with
            # more than one .tsv only the last one's ids survive -- confirm
            # that a single ENIGMA file is the expected input.
            bx_ids[source] = []
            with open(file_path, "r") as tsv_handle:
                tsv_file = csv.DictReader(tsv_handle, delimiter='\t')
                for report in tsv_file:
                    # BX_ID holds a comma-separated list of integer ids.
                    ids = [int(bx_id) for bx_id in report['BX_ID'].split(',')]
                    bx_ids[source].extend(ids)
        else:
            suffix = '.vcf'
            source = file_name[:len(file_name) - len(suffix)]
            bx_ids[source] = []
            with open(file_path, 'r') as vcf_handle:
                vcf_reader = vcf.Reader(vcf_handle, strict_whitespace=True)
                try:
                    for record in vcf_reader:
                        # Convert the whole record first so a bad value adds
                        # none of that record's ids.
                        ids = [int(bx_id) for bx_id in record.INFO['BX_ID']]
                        bx_ids[source].extend(ids)
                except ValueError as e:
                    # Best effort: report the problem and keep the ids
                    # gathered from this file so far.
                    print(e)
    return bx_ids
def test_aggregate_reports_maintains_proper_variant_effect_lovd_formatting(self):
    # Aggregate only the LOVD.vcf report found in the input directory.
    lovd_paths = []
    for name in aggregate_reports.get_reports_files(INPUT_DIRECTORY):
        if name == 'LOVD.vcf':
            lovd_paths.append(INPUT_DIRECTORY + name)

    aggregated = aggregate_reports.aggregate_reports(
        lovd_paths, self.columns, self.genome_regions_symbol_dict)

    # The first Variant_effect_LOVD entry of every aggregated variant must
    # be one of the expected values.
    effect_column = self.columns.index("Variant_effect_LOVD")
    for row in aggregated:
        first_effect = row[effect_column][0]
        self.assertIn(first_effect, ['?/.', '+/+'])
def test_aggregate_reports_maintains_proper_variant_effect_lovd_formatting(self):
    # Aggregate only the LOVD.vcf report found in the input directory.
    lovd_paths = []
    for name in aggregate_reports.get_reports_files(INPUT_DIRECTORY):
        if name == 'LOVD.vcf':
            lovd_paths.append(INPUT_DIRECTORY + name)

    aggregated = aggregate_reports.aggregate_reports(lovd_paths, self.columns)

    # The first Variant_effect_LOVD entry of every aggregated variant must
    # be one of the expected values.
    effect_column = self.columns.index("Variant_effect_LOVD")
    for row in aggregated:
        first_effect = row[effect_column][0]
        self.assertIn(first_effect, ['?/.', '+/+'])
def test_aggregate_reports(self):
    # Build the full path of every report file in the input directory.
    report_paths = []
    for name in aggregate_reports.get_reports_files(INPUT_DIRECTORY):
        report_paths.append(INPUT_DIRECTORY + name)

    reports = aggregate_reports.aggregate_reports(
        report_paths, self.columns, self.genome_regions_symbol_dict)

    # Each test file contains two reports, so the total must be twice the
    # number of sources.
    expected_total = len(self.sources) * 2
    self.assertEqual(len(reports), expected_total)

    # Tally the aggregated reports by the value in their "Source" column.
    source_position = self.columns.index("Source")
    tally = {}
    for row in reports:
        origin = row[source_position]
        tally[origin] = tally.get(origin, 0) + 1

    # Every known source must contribute exactly two reports.
    for expected_source in self.sources:
        self.assertEqual(tally[expected_source], 2)
def test_aggregate_reports(self):
    # Build the full path of every report file in the input directory.
    report_paths = []
    for name in aggregate_reports.get_reports_files(INPUT_DIRECTORY):
        report_paths.append(INPUT_DIRECTORY + name)

    reports = aggregate_reports.aggregate_reports(report_paths, self.columns)

    # Each test file contains two reports, so the total must be twice the
    # number of sources.
    expected_total = len(self.sources) * 2
    self.assertEqual(len(reports), expected_total)

    # Tally the aggregated reports by the value in their "Source" column.
    source_position = self.columns.index("Source")
    tally = {}
    for row in reports:
        origin = row[source_position]
        tally[origin] = tally.get(origin, 0) + 1

    # Every known source must contribute exactly two reports.
    for expected_source in self.sources:
        self.assertEqual(tally[expected_source], 2)
def test_get_reports_files(self):
    # The input directory is expected to yield exactly nine report files.
    discovered = aggregate_reports.get_reports_files(INPUT_DIRECTORY)
    discovered_count = len(discovered)
    self.assertEqual(discovered_count, 9)

    # This file name must not be among the listed report files.
    excluded_name = "1000_Genomesready.vcf"
    self.assertNotIn(excluded_name, discovered)