def validate(input_file):
    """
    Validates a file for annotation.

    Makes sure that the file meets validation criteria before passing it
    on to the filetype-specific validation.

    Parameters
    ----------
    input_file: str
        the path to a file to be validated.

    Returns
    -------
    bool
        either ``True`` or ``False`` if the file passed validation

    Notes
    -----
    Criteria:

    - file must exist
    - file must have >0 lines
    - file must pass its filetype-specific validation
    """
    # check file existence
    if not tools.item_exists(item = input_file, item_type = 'file'):
        logger.warning('File does not exist and will not be annotated: {0}'.format(input_file))
        return False

    # check number of lines
    num_lines = tools.num_lines(input_file)
    if num_lines <= 0:
        logger.warning('File has {0} lines and will not be annotated: {1}'.format(num_lines, input_file))
        return False

    # return the boolean value from the filetype specific validations
    return filetype_validation(input_file)
def test_snp_pileup2(self):
    """
    Run the snp-pileup CWL against a single bam and validate the
    pileup output file.
    """
    self.maxDiff = None
    bam_path = os.path.join(DATA_SETS['Proj_08390_G']['BAM_DIR'], "Sample24.bam")
    input_json = {
        "bam_file": {
            "path": bam_path,
            "class": "File"
        },
        "ref_fasta": {
            "path": REF_FASTA,
            "class": "File"
        },
        "regions_bed": {
            "path": CONPAIR_MARKERS_BED,
            "class": "File"
        },
        "output_filename": "Sample24.pileup"
    }
    with TemporaryDirectory() as tmpdir:
        output_json, output_dir = run_cwl(
            testcase=self,
            tmpdir=tmpdir,
            input_json=input_json,
            cwl_file=cwl_file,
            CWL_ARGS=CWL_ARGS)
        pileup_path = os.path.join(output_dir, "Sample24.pileup")
        expected_output = {
            'output_file': {
                'location': 'file://' + pileup_path,
                'basename': "Sample24.pileup",
                'class': 'File',
                'checksum': 'sha1$a27039b65de9272e9a7b180e1b0e911c6484efbe',
                'size': 2358580,
                'path': pileup_path
            }
        }
        self.assertDictEqual(output_json, expected_output)
        # sanity check the pileup contents: line and field counts
        self.assertEqual(num_lines(pileup_path), 1024)
        self.assertEqual(num_fields(pileup_path, delimiter=' '), 8)
def test_partial_overlap(self):
    """
    Writing the overlap of two partially overlapping tables should
    produce the expected number of output lines.
    """
    head_file = os.path.join(fixture_dir, 'variants_head200.tsv')
    tail_file = os.path.join(fixture_dir, 'variants_tail200.tsv')
    output_file = os.path.join(fixture_dir, 'foo_{0}.tsv'.format(t.timestamp()))
    t.write_tabular_overlap(file1=head_file, ref_file=tail_file, output_file=output_file)
    self.assertTrue(
        t.num_lines(input_file=output_file, skip=0) == 38,
        'Number of lines output in partial overlap files does not match')
def test_workflow1(self):
    """
    Run the combined Conpair + Somalier concordance workflow on a single
    tumor bam against one normal genotype and validate all outputs.
    """
    proj = DATA_SETS['Proj_08390_G']
    input_json = {
        "tumor_bam": {
            "path": os.path.join(proj['BAM_DIR'], "Sample23.bam"),
            "class": "File"
        },
        "ref_fasta": {
            "path": REF_FASTA,
            "class": "File"
        },
        "conpair_markers_bed": {
            "path": CONPAIR_MARKERS_BED,
            "class": "File"
        },
        "conpair_markers_txt": {
            "path": CONPAIR_MARKERS_TXT,
            "class": "File"
        },
        "somalier_sites": {
            "path": SOMALIER_SITES,
            "class": "File"
        },
        "conpair_normal_genotypes": [
            {
                "path": os.path.join(proj['LIKELIHOODS_DIR'], "Sample24.pickle"),
                "class": "File"
            }
        ],
        "somalier_normal_genotypes": [
            {
                "path": os.path.join(proj['GENOTYPES_DIR'], "Sample24.somalier"),
                "class": "File"
            }
        ]
    }
    with TemporaryDirectory() as tmpdir:
        output_json, output_dir = run_cwl(
            testcase = self,
            tmpdir = tmpdir,
            input_json = input_json,
            cwl_file = cwl_file)
        concordance_file = os.path.join(output_dir, "Sample23.pickle.concordance.tsv")
        html_file = os.path.join(output_dir, "somalier.html")
        pairs_tsv = os.path.join(output_dir, "somalier.pairs.tsv")
        samples_tsv = os.path.join(output_dir, "somalier.samples.tsv")
        expected_output = {
            'conpair_tsv': {
                'location': 'file://' + concordance_file,
                'basename': "Sample23.pickle.concordance.tsv",
                'class': 'File',
                # checksum and size are not compared because the file
                # contains filepaths that change on every run
                'path': concordance_file
            },
            'somalier_html': {
                'location': 'file://' + html_file,
                'basename': "somalier.html",
                'class': 'File',
                'checksum': 'sha1$d78cc873ba6219ab57d5d76836eed8212cd96ce6',
                'size': 22950,
                'path': html_file
            },
            'somalier_pairs_tsv': {
                'location': 'file://' + pairs_tsv,
                'basename': "somalier.pairs.tsv",
                'class': 'File',
                'checksum': 'sha1$fdb140d0db51b0ede23208210ae59d03e496c5f6',
                'size': 244,
                'path': pairs_tsv
            },
            'somalier_samples_tsv': {
                'location': 'file://' + samples_tsv,
                'basename': "somalier.samples.tsv",
                'class': 'File',
                'checksum': 'sha1$c98bd07ac5a3a90c1386438171fa1113a06a14e8',
                'size': 468,
                'path': samples_tsv
            },
        }
        # drop the volatile fields before comparing
        output_json['conpair_tsv'].pop('checksum')
        output_json['conpair_tsv'].pop('size')
        self.assertDictEqual(output_json, expected_output)

        #
        # Validate Conpair output
        #
        with open(concordance_file) as fin:
            rows = list(csv.DictReader(fin, delimiter='\t'))
        # remove the file paths from the results because they always change
        rows[0].pop('tumor_pileup')
        rows[0].pop('normal_pileup')
        expected_rows = [
            {
                'concordance': '0.9885297184567258',
                'num_markers_used': '959',
                'num_total_markers': '1024',
                'tumor': 'Sample23',
                'normal': 'Sample24',
            }
        ]
        self.assertEqual(rows, expected_rows)

        #
        # Validate Somalier output
        #
        self.assertEqual(num_lines(pairs_tsv), 2)
        self.assertEqual(num_fields(pairs_tsv), 17)
        with open(pairs_tsv) as fin:
            rows = list(csv.DictReader(fin, delimiter='\t'))
        expected_rows = [
            {'#sample_a': 'Sample23', 'sample_b': 'Sample24', 'relatedness': '1.000',
             'ibs0': '0', 'ibs2': '627', 'hom_concordance': '0.870',
             'hets_a': '403', 'hets_b': '370', 'hets_ab': '694', 'shared_hets': '347',
             'hom_alts_a': '207', 'hom_alts_b': '146', 'shared_hom_alts': '127',
             'n': '627', 'x_ibs0': '0', 'x_ibs2': '17', 'expected_relatedness': '-1.0'}
        ]
        self.assertEqual(rows, expected_rows)

        self.assertEqual(num_lines(samples_tsv), 3)
        self.assertEqual(num_fields(samples_tsv), 25)
        with open(samples_tsv) as fin:
            rows = list(csv.DictReader(fin, delimiter='\t'))
        expected_rows = [
            {'#family_id': 'Sample23', 'sample_id': 'Sample23', 'paternal_id': '-9',
             'maternal_id': '-9', 'sex': '-9', 'phenotype': '-9',
             'original_pedigree_sex': '-9', 'gt_depth_mean': '31.5', 'gt_depth_sd': '21.3',
             'depth_mean': '30.8', 'depth_sd': '21.5', 'ab_mean': '0.44', 'ab_std': '0.41',
             'n_hom_ref': '258', 'n_het': '403', 'n_hom_alt': '207', 'n_unknown': '122',
             'p_middling_ab': '0.048', 'X_depth_mean': '16.28', 'X_n': '25',
             'X_hom_ref': '7', 'X_het': '1', 'X_hom_alt': '17',
             'Y_depth_mean': '0.00', 'Y_n': '0'},
            {'#family_id': 'Sample24', 'sample_id': 'Sample24', 'paternal_id': '-9',
             'maternal_id': '-9', 'sex': '-9', 'phenotype': '-9',
             'original_pedigree_sex': '-9', 'gt_depth_mean': '25.8', 'gt_depth_sd': '13.8',
             'depth_mean': '24.8', 'depth_sd': '14.2', 'ab_mean': '0.43', 'ab_std': '0.44',
             'n_hom_ref': '174', 'n_het': '370', 'n_hom_alt': '146', 'n_unknown': '300',
             'p_middling_ab': '0.147', 'X_depth_mean': '15.44', 'X_n': '18',
             'X_hom_ref': '7', 'X_het': '1', 'X_hom_alt': '10',
             'Y_depth_mean': '0.00', 'Y_n': '0'}
        ]
        self.assertEqual(rows, expected_rows)
def test_skip(self):
    """
    ``num_lines`` should not count the lines it was told to skip.
    """
    input_file = os.path.join(fixture_dir, 'variants_ref.tsv')
    # assertEqual instead of assertTrue(x == y) so a failure shows the
    # actual line count instead of just "False is not true"
    self.assertEqual(t.num_lines(input_file=input_file, skip=1), 366)
def test_workflow1(self):
    """
    Run the Somalier concordance workflow on one tumor bam against a
    single normal genotype and validate the outputs.
    """
    proj = DATA_SETS['Proj_08390_G']
    input_json = {
        "tumor_bam": {
            "path": os.path.join(proj['BAM_DIR'], "Sample23.bam"),
            "class": "File"
        },
        "ref_fasta": {
            "path": REF_FASTA,
            "class": "File"
        },
        "sites": {
            "path": SOMALIER_SITES,
            "class": "File"
        },
        "normal_genotypes": [{
            "path": os.path.join(proj['GENOTYPES_DIR'], "Sample24.somalier"),
            "class": "File"
        }]
    }
    with TemporaryDirectory() as tmpdir:
        output_json, output_dir = run_cwl(
            testcase=self,
            tmpdir=tmpdir,
            input_json=input_json,
            cwl_file=cwl_file)
        html_file = os.path.join(output_dir, "somalier.html")
        pairs_tsv = os.path.join(output_dir, "somalier.pairs.tsv")
        samples_tsv = os.path.join(output_dir, "somalier.samples.tsv")
        expected_output = {
            'somalier_html': {
                'location': 'file://' + html_file,
                'basename': "somalier.html",
                'class': 'File',
                'checksum': 'sha1$d78cc873ba6219ab57d5d76836eed8212cd96ce6',
                'size': 22950,
                'path': html_file
            },
            'somalier_pairs_tsv': {
                'location': 'file://' + pairs_tsv,
                'basename': "somalier.pairs.tsv",
                'class': 'File',
                'checksum': 'sha1$fdb140d0db51b0ede23208210ae59d03e496c5f6',
                'size': 244,
                'path': pairs_tsv
            },
            'somalier_samples_tsv': {
                'location': 'file://' + samples_tsv,
                'basename': "somalier.samples.tsv",
                'class': 'File',
                'checksum': 'sha1$c98bd07ac5a3a90c1386438171fa1113a06a14e8',
                'size': 468,
                'path': samples_tsv
            },
        }
        self.assertDictEqual(output_json, expected_output)

        # validate the pairs table contents
        self.assertEqual(num_lines(pairs_tsv), 2)
        self.assertEqual(num_fields(pairs_tsv), 17)
        with open(pairs_tsv) as fin:
            rows = list(csv.DictReader(fin, delimiter='\t'))
        expected_rows = [{
            '#sample_a': 'Sample23', 'sample_b': 'Sample24', 'relatedness': '1.000',
            'ibs0': '0', 'ibs2': '627', 'hom_concordance': '0.870',
            'hets_a': '403', 'hets_b': '370', 'hets_ab': '694', 'shared_hets': '347',
            'hom_alts_a': '207', 'hom_alts_b': '146', 'shared_hom_alts': '127',
            'n': '627', 'x_ibs0': '0', 'x_ibs2': '17', 'expected_relatedness': '-1.0'
        }]
        self.assertEqual(rows, expected_rows)

        # validate the samples table contents
        self.assertEqual(num_lines(samples_tsv), 3)
        self.assertEqual(num_fields(samples_tsv), 25)
        with open(samples_tsv) as fin:
            rows = list(csv.DictReader(fin, delimiter='\t'))
        expected_rows = [{
            '#family_id': 'Sample23', 'sample_id': 'Sample23', 'paternal_id': '-9',
            'maternal_id': '-9', 'sex': '-9', 'phenotype': '-9',
            'original_pedigree_sex': '-9', 'gt_depth_mean': '31.5', 'gt_depth_sd': '21.3',
            'depth_mean': '30.8', 'depth_sd': '21.5', 'ab_mean': '0.44', 'ab_std': '0.41',
            'n_hom_ref': '258', 'n_het': '403', 'n_hom_alt': '207', 'n_unknown': '122',
            'p_middling_ab': '0.048', 'X_depth_mean': '16.28', 'X_n': '25',
            'X_hom_ref': '7', 'X_het': '1', 'X_hom_alt': '17',
            'Y_depth_mean': '0.00', 'Y_n': '0'
        }, {
            '#family_id': 'Sample24', 'sample_id': 'Sample24', 'paternal_id': '-9',
            'maternal_id': '-9', 'sex': '-9', 'phenotype': '-9',
            'original_pedigree_sex': '-9', 'gt_depth_mean': '25.8', 'gt_depth_sd': '13.8',
            'depth_mean': '24.8', 'depth_sd': '14.2', 'ab_mean': '0.43', 'ab_std': '0.44',
            'n_hom_ref': '174', 'n_het': '370', 'n_hom_alt': '146', 'n_unknown': '300',
            'p_middling_ab': '0.147', 'X_depth_mean': '15.44', 'X_n': '18',
            'X_hom_ref': '7', 'X_het': '1', 'X_hom_alt': '10',
            'Y_depth_mean': '0.00', 'Y_n': '0'
        }]
        self.assertEqual(rows, expected_rows)
def test_concordance1(self):
    """
    Run Somalier concordance for one tumor genotype against a cohort
    directory holding a single normal genotype, and validate the outputs.
    """
    self.maxDiff = None
    with TemporaryDirectory() as tmpdir:
        # stage a Normal genotype file inside the temporary cohort dir
        normals_dir_path = os.path.join(tmpdir, "normals")
        os.mkdir(normals_dir_path)
        shutil.copyfile(
            os.path.join(DATA_SETS['Proj_08390_G']['GENOTYPES_DIR'], "Sample24.somalier"),
            os.path.join(normals_dir_path, "Sample24.somalier"))
        input_json = {
            "tumor_genotype_file": {
                "path": os.path.join(DATA_SETS['Proj_08390_G']['GENOTYPES_DIR'], "Sample23.somalier"),
                "class": "File"
            },
            "cohort_genotype_dir": {
                "path": normals_dir_path,
                "class": "Directory"
            }
        }
        output_json, output_dir = run_cwl(
            testcase=self,
            tmpdir=tmpdir,
            input_json=input_json,
            CWL_ARGS=CWL_ARGS,
            cwl_file=cwl_file,
            print_stdout=False,
            print_command=False,
            check_returncode=True)
        html_file = os.path.join(output_dir, "somalier.html")
        pairs_tsv = os.path.join(output_dir, "somalier.pairs.tsv")
        samples_tsv = os.path.join(output_dir, "somalier.samples.tsv")
        expected_output = {
            'html': {
                'location': 'file://' + html_file,
                'basename': "somalier.html",
                'class': 'File',
                'checksum': 'sha1$d78cc873ba6219ab57d5d76836eed8212cd96ce6',
                'size': 22950,
                'path': html_file
            },
            'pairs_tsv': {
                'location': 'file://' + pairs_tsv,
                'basename': "somalier.pairs.tsv",
                'class': 'File',
                'checksum': 'sha1$fdb140d0db51b0ede23208210ae59d03e496c5f6',
                'size': 244,
                'path': pairs_tsv
            },
            'samples_tsv': {
                'location': 'file://' + samples_tsv,
                'basename': "somalier.samples.tsv",
                'class': 'File',
                'checksum': 'sha1$c98bd07ac5a3a90c1386438171fa1113a06a14e8',
                'size': 468,
                'path': samples_tsv
            },
        }
        self.assertDictEqual(output_json, expected_output)

        # validate the pairs table contents
        self.assertEqual(num_lines(pairs_tsv), 2)
        self.assertEqual(num_fields(pairs_tsv), 17)
        with open(pairs_tsv) as fin:
            rows = list(csv.DictReader(fin, delimiter='\t'))
        expected_rows = [{
            '#sample_a': 'Sample23', 'sample_b': 'Sample24', 'relatedness': '1.000',
            'ibs0': '0', 'ibs2': '627', 'hom_concordance': '0.870',
            'hets_a': '403', 'hets_b': '370', 'hets_ab': '694', 'shared_hets': '347',
            'hom_alts_a': '207', 'hom_alts_b': '146', 'shared_hom_alts': '127',
            'n': '627', 'x_ibs0': '0', 'x_ibs2': '17', 'expected_relatedness': '-1.0'
        }]
        self.assertEqual(rows, expected_rows)

        # validate the samples table contents
        self.assertEqual(num_lines(samples_tsv), 3)
        self.assertEqual(num_fields(samples_tsv), 25)
        with open(samples_tsv) as fin:
            rows = list(csv.DictReader(fin, delimiter='\t'))
        expected_rows = [{
            '#family_id': 'Sample23', 'sample_id': 'Sample23', 'paternal_id': '-9',
            'maternal_id': '-9', 'sex': '-9', 'phenotype': '-9',
            'original_pedigree_sex': '-9', 'gt_depth_mean': '31.5', 'gt_depth_sd': '21.3',
            'depth_mean': '30.8', 'depth_sd': '21.5', 'ab_mean': '0.44', 'ab_std': '0.41',
            'n_hom_ref': '258', 'n_het': '403', 'n_hom_alt': '207', 'n_unknown': '122',
            'p_middling_ab': '0.048', 'X_depth_mean': '16.28', 'X_n': '25',
            'X_hom_ref': '7', 'X_het': '1', 'X_hom_alt': '17',
            'Y_depth_mean': '0.00', 'Y_n': '0'
        }, {
            '#family_id': 'Sample24', 'sample_id': 'Sample24', 'paternal_id': '-9',
            'maternal_id': '-9', 'sex': '-9', 'phenotype': '-9',
            'original_pedigree_sex': '-9', 'gt_depth_mean': '25.8', 'gt_depth_sd': '13.8',
            'depth_mean': '24.8', 'depth_sd': '14.2', 'ab_mean': '0.43', 'ab_std': '0.44',
            'n_hom_ref': '174', 'n_het': '370', 'n_hom_alt': '146', 'n_unknown': '300',
            'p_middling_ab': '0.147', 'X_depth_mean': '15.44', 'X_n': '18',
            'X_hom_ref': '7', 'X_het': '1', 'X_hom_alt': '10',
            'Y_depth_mean': '0.00', 'Y_n': '0'
        }]
        self.assertEqual(rows, expected_rows)