예제 #1
0
파일: annotate.py 프로젝트: jlevy44/util
def validate(input_file):
    """
    Decide whether a file is eligible for annotation.

    Parameters
    ----------
    input_file: str
        the path to the file to check.

    Returns
    -------
    bool
        ``False`` when one of the basic criteria below is not met; otherwise
        the result of ``filetype_validation`` for the file

    Notes
    -----
    Criteria checked here, in order:

    - file must exist

    - file must have >0 lines

    A warning is logged for whichever criterion fails first.
    """
    # criterion 1: the path has to point at an existing file
    file_found = tools.item_exists(item=input_file, item_type='file')
    if not file_found:
        message = 'File does not exist and will not be annotated: {0}'
        logger.warning(message.format(input_file))
        return False

    # criterion 2: the file has to contain at least one line; when it does,
    # the verdict is delegated to the filetype specific validations
    line_count = tools.num_lines(input_file)
    if line_count > 0:
        return filetype_validation(input_file)

    message = 'File has {0} lines and will not be annotated: {1}'
    logger.warning(message.format(line_count, input_file))
    return False
    def test_snp_pileup2(self):
        """
        Call ``run_cwl`` with the Sample24 bam as input, then check the
        reported output file entry and the line / field counts of the pileup.
        """
        self.maxDiff = None
        pileup_name = "Sample24.pileup"
        bam_path = os.path.join(DATA_SETS['Proj_08390_G']['BAM_DIR'],
                                "Sample24.bam")
        input_json = {
            "bam_file": {"path": bam_path, "class": "File"},
            "ref_fasta": {"path": REF_FASTA, "class": "File"},
            "regions_bed": {"path": CONPAIR_MARKERS_BED, "class": "File"},
            "output_filename": pileup_name,
        }

        with TemporaryDirectory() as tmpdir:
            output_json, output_dir = run_cwl(
                testcase=self,
                tmpdir=tmpdir,
                input_json=input_json,
                cwl_file=cwl_file,
                CWL_ARGS=CWL_ARGS)

            # the same path is used for the expected entry and the content checks
            pileup_path = os.path.join(output_dir, pileup_name)
            expected_output = {
                'output_file': {
                    'location': 'file://' + pileup_path,
                    'basename': pileup_name,
                    'class': 'File',
                    'checksum': 'sha1$a27039b65de9272e9a7b180e1b0e911c6484efbe',
                    'size': 2358580,
                    'path': pileup_path,
                }
            }
            self.assertDictEqual(output_json, expected_output)

            # 1024 lines, 8 fields when splitting on a single space
            self.assertEqual(num_lines(pileup_path), 1024)
            self.assertEqual(num_fields(pileup_path, delimiter=' '), 8)
예제 #3
0
 def test_partial_overlap(self):
     """
     Write the tabular overlap of the ``variants_head200.tsv`` and
     ``variants_tail200.tsv`` fixtures and check the output line count.
     """
     file1 = os.path.join(fixture_dir, 'variants_head200.tsv')
     ref_file = os.path.join(fixture_dir, 'variants_tail200.tsv')
     # output file name embeds t.timestamp()
     output_file = os.path.join(fixture_dir,
                                'foo_{0}.tsv'.format(t.timestamp()))
     t.write_tabular_overlap(file1=file1,
                             ref_file=ref_file,
                             output_file=output_file)
     num_lines = t.num_lines(input_file=output_file, skip=0)
     # assertEqual reports the actual line count on failure, which
     # assertTrue(num_lines == 38) would hide behind "False is not true"
     self.assertEqual(
         num_lines, 38,
         'Number of lines output in partial overlap files does not match')
예제 #4
0
    def test_workflow1(self):
        """
        Call ``run_cwl`` with a Sample23 tumor bam and Sample24 normal
        genotype files, then check the reported output entries and the
        contents of the Conpair concordance table and the Somalier tables.
        """
        proj = DATA_SETS['Proj_08390_G']
        input_json = {
            "tumor_bam": {
                "path": os.path.join(proj['BAM_DIR'], "Sample23.bam"),
                "class": "File",
            },
            "ref_fasta": {"path": REF_FASTA, "class": "File"},
            "conpair_markers_bed": {"path": CONPAIR_MARKERS_BED, "class": "File"},
            "conpair_markers_txt": {"path": CONPAIR_MARKERS_TXT, "class": "File"},
            "somalier_sites": {"path": SOMALIER_SITES, "class": "File"},
            "conpair_normal_genotypes": [{
                "path": os.path.join(proj['LIKELIHOODS_DIR'], "Sample24.pickle"),
                "class": "File",
            }],
            "somalier_normal_genotypes": [{
                "path": os.path.join(proj['GENOTYPES_DIR'], "Sample24.somalier"),
                "class": "File",
            }],
        }
        with TemporaryDirectory() as tmpdir:
            output_json, output_dir = run_cwl(
                testcase=self,
                tmpdir=tmpdir,
                input_json=input_json,
                cwl_file=cwl_file)

            concordance_file = os.path.join(output_dir, "Sample23.pickle.concordance.tsv")
            html_path = os.path.join(output_dir, "somalier.html")
            pairs_tsv = os.path.join(output_dir, "somalier.pairs.tsv")
            samples_tsv = os.path.join(output_dir, "somalier.samples.tsv")

            expected_output = {
                # no checksum or size for the concordance table: the file
                # contains file paths, and those change on every run
                'conpair_tsv': {
                    'location': 'file://' + concordance_file,
                    'basename': "Sample23.pickle.concordance.tsv",
                    'class': 'File',
                    'path': concordance_file,
                },
                'somalier_html': {
                    'location': 'file://' + html_path,
                    'basename': "somalier.html",
                    'class': 'File',
                    'checksum': 'sha1$d78cc873ba6219ab57d5d76836eed8212cd96ce6',
                    'size': 22950,
                    'path': html_path,
                },
                'somalier_pairs_tsv': {
                    'location': 'file://' + pairs_tsv,
                    'basename': "somalier.pairs.tsv",
                    'class': 'File',
                    'checksum': 'sha1$fdb140d0db51b0ede23208210ae59d03e496c5f6',
                    'size': 244,
                    'path': pairs_tsv,
                },
                'somalier_samples_tsv': {
                    'location': 'file://' + samples_tsv,
                    'basename': "somalier.samples.tsv",
                    'class': 'File',
                    'checksum': 'sha1$c98bd07ac5a3a90c1386438171fa1113a06a14e8',
                    'size': 468,
                    'path': samples_tsv,
                },
            }
            # drop the run-dependent fields from the actual output as well
            for volatile_key in ('checksum', 'size'):
                output_json['conpair_tsv'].pop(volatile_key)

            self.assertDictEqual(output_json, expected_output)

            #
            # Validate Conpair output
            #
            with open(concordance_file) as handle:
                conpair_rows = list(csv.DictReader(handle, delimiter='\t'))

            # remove the file paths from the results because they always change
            for path_column in ('tumor_pileup', 'normal_pileup'):
                conpair_rows[0].pop(path_column)

            expected_conpair_rows = [{
                'concordance': '0.9885297184567258',
                'num_markers_used': '959',
                'num_total_markers': '1024',
                'tumor': 'Sample23',
                'normal': 'Sample24',
            }]
            self.assertEqual(conpair_rows, expected_conpair_rows)

            #
            # Validate Somalier output
            #
            self.assertEqual(num_lines(pairs_tsv), 2)
            self.assertEqual(num_fields(pairs_tsv), 17)
            with open(pairs_tsv) as handle:
                pair_rows = list(csv.DictReader(handle, delimiter='\t'))
            expected_pair_rows = [{
                '#sample_a': 'Sample23', 'sample_b': 'Sample24',
                'relatedness': '1.000', 'ibs0': '0', 'ibs2': '627',
                'hom_concordance': '0.870',
                'hets_a': '403', 'hets_b': '370', 'hets_ab': '694',
                'shared_hets': '347',
                'hom_alts_a': '207', 'hom_alts_b': '146',
                'shared_hom_alts': '127',
                'n': '627', 'x_ibs0': '0', 'x_ibs2': '17',
                'expected_relatedness': '-1.0',
            }]
            self.assertEqual(pair_rows, expected_pair_rows)

            self.assertEqual(num_lines(samples_tsv), 3)
            self.assertEqual(num_fields(samples_tsv), 25)
            with open(samples_tsv) as handle:
                sample_rows = list(csv.DictReader(handle, delimiter='\t'))
            expected_sample_rows = [
                {
                    '#family_id': 'Sample23', 'sample_id': 'Sample23',
                    'paternal_id': '-9', 'maternal_id': '-9',
                    'sex': '-9', 'phenotype': '-9',
                    'original_pedigree_sex': '-9',
                    'gt_depth_mean': '31.5', 'gt_depth_sd': '21.3',
                    'depth_mean': '30.8', 'depth_sd': '21.5',
                    'ab_mean': '0.44', 'ab_std': '0.41',
                    'n_hom_ref': '258', 'n_het': '403',
                    'n_hom_alt': '207', 'n_unknown': '122',
                    'p_middling_ab': '0.048',
                    'X_depth_mean': '16.28', 'X_n': '25',
                    'X_hom_ref': '7', 'X_het': '1', 'X_hom_alt': '17',
                    'Y_depth_mean': '0.00', 'Y_n': '0',
                },
                {
                    '#family_id': 'Sample24', 'sample_id': 'Sample24',
                    'paternal_id': '-9', 'maternal_id': '-9',
                    'sex': '-9', 'phenotype': '-9',
                    'original_pedigree_sex': '-9',
                    'gt_depth_mean': '25.8', 'gt_depth_sd': '13.8',
                    'depth_mean': '24.8', 'depth_sd': '14.2',
                    'ab_mean': '0.43', 'ab_std': '0.44',
                    'n_hom_ref': '174', 'n_het': '370',
                    'n_hom_alt': '146', 'n_unknown': '300',
                    'p_middling_ab': '0.147',
                    'X_depth_mean': '15.44', 'X_n': '18',
                    'X_hom_ref': '7', 'X_het': '1', 'X_hom_alt': '10',
                    'Y_depth_mean': '0.00', 'Y_n': '0',
                },
            ]
            self.assertEqual(sample_rows, expected_sample_rows)
예제 #5
0
 def test_skip(self):
     """
     Check the value ``t.num_lines`` returns for the ``variants_ref.tsv``
     fixture when called with ``skip=1``.
     """
     input_file = os.path.join(fixture_dir, 'variants_ref.tsv')
     # assertEqual reports the actual count on failure, which
     # assertTrue(... == 366) would hide behind "False is not true"
     self.assertEqual(t.num_lines(input_file=input_file, skip=1), 366)
    def test_workflow1(self):
        """
        Call ``run_cwl`` with a Sample23 tumor bam and a Sample24 normal
        genotype file, then check the reported output entries and the
        contents of the Somalier pairs and samples tables.
        """

        def load_rows(tsv_path):
            # read a tab-delimited file into a list of per-row dicts
            with open(tsv_path) as handle:
                return list(csv.DictReader(handle, delimiter='\t'))

        proj = DATA_SETS['Proj_08390_G']
        tumor_bam = os.path.join(proj['BAM_DIR'], "Sample23.bam")
        input_json = {
            "tumor_bam": {"path": tumor_bam, "class": "File"},
            "ref_fasta": {"path": REF_FASTA, "class": "File"},
            "sites": {"path": SOMALIER_SITES, "class": "File"},
            "normal_genotypes": [{
                "path": os.path.join(proj['GENOTYPES_DIR'], "Sample24.somalier"),
                "class": "File",
            }],
        }
        with TemporaryDirectory() as tmpdir:
            output_json, output_dir = run_cwl(
                testcase=self,
                tmpdir=tmpdir,
                input_json=input_json,
                cwl_file=cwl_file)

            html_path = os.path.join(output_dir, "somalier.html")
            pairs_tsv = os.path.join(output_dir, "somalier.pairs.tsv")
            samples_tsv = os.path.join(output_dir, "somalier.samples.tsv")

            expected_output = {
                'somalier_html': {
                    'location': 'file://' + html_path,
                    'basename': "somalier.html",
                    'class': 'File',
                    'checksum': 'sha1$d78cc873ba6219ab57d5d76836eed8212cd96ce6',
                    'size': 22950,
                    'path': html_path,
                },
                'somalier_pairs_tsv': {
                    'location': 'file://' + pairs_tsv,
                    'basename': "somalier.pairs.tsv",
                    'class': 'File',
                    'checksum': 'sha1$fdb140d0db51b0ede23208210ae59d03e496c5f6',
                    'size': 244,
                    'path': pairs_tsv,
                },
                'somalier_samples_tsv': {
                    'location': 'file://' + samples_tsv,
                    'basename': "somalier.samples.tsv",
                    'class': 'File',
                    'checksum': 'sha1$c98bd07ac5a3a90c1386438171fa1113a06a14e8',
                    'size': 468,
                    'path': samples_tsv,
                },
            }
            self.assertDictEqual(output_json, expected_output)

            # pairs table: 2 lines, 17 fields, a single Sample23/Sample24 row
            self.assertEqual(num_lines(pairs_tsv), 2)
            self.assertEqual(num_fields(pairs_tsv), 17)
            pair_rows = load_rows(pairs_tsv)
            expected_pair_rows = [{
                '#sample_a': 'Sample23', 'sample_b': 'Sample24',
                'relatedness': '1.000', 'ibs0': '0', 'ibs2': '627',
                'hom_concordance': '0.870',
                'hets_a': '403', 'hets_b': '370', 'hets_ab': '694',
                'shared_hets': '347',
                'hom_alts_a': '207', 'hom_alts_b': '146',
                'shared_hom_alts': '127',
                'n': '627', 'x_ibs0': '0', 'x_ibs2': '17',
                'expected_relatedness': '-1.0',
            }]
            self.assertEqual(pair_rows, expected_pair_rows)

            # samples table: 3 lines, 25 fields, one row per sample
            self.assertEqual(num_lines(samples_tsv), 3)
            self.assertEqual(num_fields(samples_tsv), 25)
            sample_rows = load_rows(samples_tsv)
            expected_sample_rows = [
                {
                    '#family_id': 'Sample23', 'sample_id': 'Sample23',
                    'paternal_id': '-9', 'maternal_id': '-9',
                    'sex': '-9', 'phenotype': '-9',
                    'original_pedigree_sex': '-9',
                    'gt_depth_mean': '31.5', 'gt_depth_sd': '21.3',
                    'depth_mean': '30.8', 'depth_sd': '21.5',
                    'ab_mean': '0.44', 'ab_std': '0.41',
                    'n_hom_ref': '258', 'n_het': '403',
                    'n_hom_alt': '207', 'n_unknown': '122',
                    'p_middling_ab': '0.048',
                    'X_depth_mean': '16.28', 'X_n': '25',
                    'X_hom_ref': '7', 'X_het': '1', 'X_hom_alt': '17',
                    'Y_depth_mean': '0.00', 'Y_n': '0',
                },
                {
                    '#family_id': 'Sample24', 'sample_id': 'Sample24',
                    'paternal_id': '-9', 'maternal_id': '-9',
                    'sex': '-9', 'phenotype': '-9',
                    'original_pedigree_sex': '-9',
                    'gt_depth_mean': '25.8', 'gt_depth_sd': '13.8',
                    'depth_mean': '24.8', 'depth_sd': '14.2',
                    'ab_mean': '0.43', 'ab_std': '0.44',
                    'n_hom_ref': '174', 'n_het': '370',
                    'n_hom_alt': '146', 'n_unknown': '300',
                    'p_middling_ab': '0.147',
                    'X_depth_mean': '15.44', 'X_n': '18',
                    'X_hom_ref': '7', 'X_het': '1', 'X_hom_alt': '10',
                    'Y_depth_mean': '0.00', 'Y_n': '0',
                },
            ]
            self.assertEqual(sample_rows, expected_sample_rows)
    def test_concordance1(self):
        """
        Call ``run_cwl`` with the Sample23 genotype file and a directory
        holding a copy of the Sample24 genotype file, then check the reported
        output entries and the contents of the pairs and samples tables.
        """
        self.maxDiff = None

        def load_rows(tsv_path):
            # read a tab-delimited file into a list of per-row dicts
            with open(tsv_path) as handle:
                return list(csv.DictReader(handle, delimiter='\t'))

        with TemporaryDirectory() as tmpdir:
            # copy over a Normal genotype file to the temporary dir
            normals_dir_path = os.path.join(tmpdir, "normals")
            os.mkdir(normals_dir_path)
            normal_source = os.path.join(
                DATA_SETS['Proj_08390_G']['GENOTYPES_DIR'], "Sample24.somalier")
            normal_copy = os.path.join(normals_dir_path, "Sample24.somalier")
            shutil.copyfile(normal_source, normal_copy)

            tumor_genotype = os.path.join(
                DATA_SETS['Proj_08390_G']['GENOTYPES_DIR'], "Sample23.somalier")
            input_json = {
                "tumor_genotype_file": {"path": tumor_genotype, "class": "File"},
                "cohort_genotype_dir": {"path": normals_dir_path, "class": "Directory"},
            }

            output_json, output_dir = run_cwl(
                testcase=self,
                tmpdir=tmpdir,
                input_json=input_json,
                CWL_ARGS=CWL_ARGS,
                cwl_file=cwl_file,
                print_stdout=False,
                print_command=False,
                check_returncode=True)

            # (output key, file name, checksum, size) for each expected file
            file_specs = (
                ('html', "somalier.html",
                 'sha1$d78cc873ba6219ab57d5d76836eed8212cd96ce6', 22950),
                ('pairs_tsv', "somalier.pairs.tsv",
                 'sha1$fdb140d0db51b0ede23208210ae59d03e496c5f6', 244),
                ('samples_tsv', "somalier.samples.tsv",
                 'sha1$c98bd07ac5a3a90c1386438171fa1113a06a14e8', 468),
            )
            expected_output = {}
            for output_key, file_name, checksum, size in file_specs:
                file_path = os.path.join(output_dir, file_name)
                expected_output[output_key] = {
                    'location': 'file://' + file_path,
                    'basename': file_name,
                    'class': 'File',
                    'checksum': checksum,
                    'size': size,
                    'path': file_path,
                }

            self.assertDictEqual(output_json, expected_output)

            # pairs table: 2 lines, 17 fields, a single Sample23/Sample24 row
            pairs_tsv = os.path.join(output_dir, "somalier.pairs.tsv")
            self.assertEqual(num_lines(pairs_tsv), 2)
            self.assertEqual(num_fields(pairs_tsv), 17)
            pair_rows = load_rows(pairs_tsv)
            expected_pair_rows = [{
                '#sample_a': 'Sample23', 'sample_b': 'Sample24',
                'relatedness': '1.000', 'ibs0': '0', 'ibs2': '627',
                'hom_concordance': '0.870',
                'hets_a': '403', 'hets_b': '370', 'hets_ab': '694',
                'shared_hets': '347',
                'hom_alts_a': '207', 'hom_alts_b': '146',
                'shared_hom_alts': '127',
                'n': '627', 'x_ibs0': '0', 'x_ibs2': '17',
                'expected_relatedness': '-1.0',
            }]
            self.assertEqual(pair_rows, expected_pair_rows)

            # samples table: 3 lines, 25 fields, one row per sample
            samples_tsv = os.path.join(output_dir, "somalier.samples.tsv")
            self.assertEqual(num_lines(samples_tsv), 3)
            self.assertEqual(num_fields(samples_tsv), 25)
            sample_rows = load_rows(samples_tsv)
            expected_sample_rows = [
                {
                    '#family_id': 'Sample23', 'sample_id': 'Sample23',
                    'paternal_id': '-9', 'maternal_id': '-9',
                    'sex': '-9', 'phenotype': '-9',
                    'original_pedigree_sex': '-9',
                    'gt_depth_mean': '31.5', 'gt_depth_sd': '21.3',
                    'depth_mean': '30.8', 'depth_sd': '21.5',
                    'ab_mean': '0.44', 'ab_std': '0.41',
                    'n_hom_ref': '258', 'n_het': '403',
                    'n_hom_alt': '207', 'n_unknown': '122',
                    'p_middling_ab': '0.048',
                    'X_depth_mean': '16.28', 'X_n': '25',
                    'X_hom_ref': '7', 'X_het': '1', 'X_hom_alt': '17',
                    'Y_depth_mean': '0.00', 'Y_n': '0',
                },
                {
                    '#family_id': 'Sample24', 'sample_id': 'Sample24',
                    'paternal_id': '-9', 'maternal_id': '-9',
                    'sex': '-9', 'phenotype': '-9',
                    'original_pedigree_sex': '-9',
                    'gt_depth_mean': '25.8', 'gt_depth_sd': '13.8',
                    'depth_mean': '24.8', 'depth_sd': '14.2',
                    'ab_mean': '0.43', 'ab_std': '0.44',
                    'n_hom_ref': '174', 'n_het': '370',
                    'n_hom_alt': '146', 'n_unknown': '300',
                    'p_middling_ab': '0.147',
                    'X_depth_mean': '15.44', 'X_n': '18',
                    'X_hom_ref': '7', 'X_het': '1', 'X_hom_alt': '10',
                    'Y_depth_mean': '0.00', 'Y_n': '0',
                },
            ]
            self.assertEqual(sample_rows, expected_sample_rows)