示例#1
0
    def test_find_test_locations(self):
        """Check the location strings produced by gnali.find_test_locations.

        A two-row gene description table (CCR5, ALCAM) is passed to
        gnali.find_test_locations together with fresh Gene objects. The
        expected locations are then set by hand, using the
        "<prefix><chrom>:<start>-<end>" layout, and compared with the
        returned genes.
        """
        # Context manager guarantees the config file handle is closed.
        with open(DB_CONFIG_FILE, 'r') as db_config_file:
            db_config = Config(
                None,
                yaml.load(db_config_file.read(), Loader=yaml.FullLoader))
        db_config = RuntimeConfig(db_config)

        column_names = ['', 'hgnc_symbol', 'chromosome_name',
                        'start_position', 'end_position']
        # NOTE(review): the ALCAM end_position below equals the CCR5 one and
        # is smaller than the ALCAM start_position -- confirm whether this
        # fixture value is intentional.
        gene_descs = pd.DataFrame(
            {
                '': [46843, 58454],
                'hgnc_symbol': ['CCR5', 'ALCAM'],
                'chromosome_name': [3, 3],
                'start_position': [46411633, 105085753],
                'end_position': [46417697, 46417697],
            },
            columns=column_names)

        genes = [Gene(gene) for gene in ['CCR5', 'ALCAM']]

        method_test_locations = gnali.find_test_locations(
            genes, gene_descs, db_config)

        # GRCh38 contig names carry a "chr" prefix; other builds do not.
        prefix = "chr" if db_config.ref_genome_name == "GRCh38" else ""
        for index, gene in enumerate(genes):
            # Only genes without a status get a location assigned.
            if gene.status is None:
                row_label = gene_descs.index[index]
                chrom = gene_descs.loc[row_label, 'chromosome_name']
                start = gene_descs.loc[row_label, 'start_position']
                end = gene_descs.loc[row_label, 'end_position']

                gene.set_location(location="{prefix}{}:{}-{}"
                                  .format(chrom, start, end,
                                          prefix=prefix))

        assert method_test_locations == genes
示例#2
0
    def test_get_variants_happy(self, monkeypatch):
        """Check that gnali.get_variants attaches the expected variants.

        gnali.get_db_tbi is patched to return the bundled test index files,
        get_variants is run for CCR5 with the "homozygous-controls" filter,
        and the record strings stored on the gene are compared line by line
        with the EXPECTED_PLOF_VARIANTS fixture.
        """
        target_genes = [Gene('CCR5', location="3:46411633-46417697")]

        # Lines keep their trailing newline, matching the "\n" appended to
        # each record string below.
        with open(EXPECTED_PLOF_VARIANTS, 'r') as test_file:
            expected_variants = list(test_file)

        # Context manager guarantees the config file handle is closed.
        with open(DB_CONFIG_FILE, 'r') as db_config_file:
            db_config = Config(
                'gnomadv2.1.1',
                yaml.load(db_config_file.read(), Loader=yaml.FullLoader))
        db_config = RuntimeConfig(db_config)

        def mock_get_db_tbi(data_file, data_path, max_time):
            # Serve the local test index that matches the requested file.
            if 'exomes' in data_file.path:
                return TEST_EXOMES_TBI
            return TEST_GENOMES_TBI
        monkeypatch.setattr(gnali, "get_db_tbi", mock_get_db_tbi)

        # The temporary directory is removed as soon as the block exits.
        with tempfile.TemporaryDirectory() as temp_dir:
            gnali.get_variants(
                target_genes, db_config,
                [Filter("homozygous-controls", "controls_nhomalt>0")],
                temp_dir, None, False)
            method_variants = [var.record_str + "\n"
                               for gene in target_genes
                               for var in gene.variants]

        assert expected_variants == method_variants
示例#3
0
    def test_extract_lof_annotations(self):
        """Check the results table built by gnali.extract_lof_annotations.

        Variants from the EXPECTED_PLOF_VARIANTS fixture are attached to a
        CCR5 gene. The expected table is rebuilt here from the same variants
        (one row per transcript when a variant has several) and compared
        with the DataFrame returned by the method.
        """
        # Context manager guarantees the config file handle is closed.
        with open(DB_CONFIG_FILE, 'r') as config_file:
            config = Config(
                'gnomadv2.1.1',
                yaml.load(config_file.read(), Loader=yaml.FullLoader))
        config = RuntimeConfig(config)

        data_file = next(
            (file for file in config.files if file.name == 'exomes'), None)
        tbx = pysam.TabixFile(data_file.path)
        try:
            # Header line describing the "vep" annotation field.
            header = [line for line in tbx.header if "ID=vep" in line][0]
        finally:
            # Only the header is needed, so release the handle right away.
            tbx.close()

        genes = [Gene("CCR5")]
        with open(EXPECTED_PLOF_VARIANTS, 'r') as test_file:
            test_variants = [Variant("CCR5", str(row), "vep", "LoF", header)
                             for row in test_file]
        genes[0].add_variants(test_variants)

        # The second return value (VCF records) is not checked by this test.
        method_results, _ = gnali.extract_lof_annotations(genes, config, False)

        lof_id = config.lof.get('id')
        variant_tuple = []
        for variant in genes[0].variants:
            if not variant.multiple_transcripts():
                variant_tuple.append(variant.as_tuple_vep(lof_id))
            else:
                # One row per transcript, each sharing the basic columns.
                for trans in variant.transcripts:
                    variant_tuple.append(
                        variant.as_tuple_basic() + (str(trans),))

        results = pd.DataFrame(data=np.asarray(variant_tuple, dtype=str))
        results.columns = ["Chromosome", "Position_Start", "RSID",
                           "Reference_Allele", "Alternate_Allele",
                           "Score", "Quality", "VEP"]

        # `n` is passed by keyword: recent pandas releases no longer accept
        # it positionally.
        results_codes = pd.DataFrame(
            results['VEP'].str.split('|', n=5).tolist(),
            columns=["LoF_Variant", "LoF_Annotation",
                     "Confidence", "HGNC_Symbol",
                     "Ensembl Code", "Rest"])
        results_codes['HGVSc'] = \
            results_codes.Rest.str.split("|", n=6).str[5]

        results_codes.drop(['Rest', 'Confidence'], axis=1, inplace=True)
        results.drop('VEP', axis=1, inplace=True)
        results = pd.concat([results, results_codes], axis=1)

        expected_results = results.drop_duplicates(keep='first', inplace=False)

        assert expected_results.equals(method_results)
示例#4
0
 def test_get_db_tbi_happy(self, monkeypatch):
     """Check that gnali.get_db_tbi returns a truthy value.

     gnali.tbi_needed is patched to always report that an index is needed,
     and gnali.download_file is patched out so nothing is downloaded.
     """
     def mock_tbi_needed(url, dest_path):
         return True
     monkeypatch.setattr(gnali, "tbi_needed", mock_tbi_needed)

     def mock_download_file(url, dest_path, max_time):
         # Deliberate no-op: the download is skipped and nothing is written
         # to dest_path.
         return None
     monkeypatch.setattr(gnali, "download_file", mock_download_file)

     with tempfile.TemporaryDirectory() as temp:
         # Context manager guarantees the config file handle is closed.
         with open(DB_CONFIG_FILE, 'r') as db_config_file:
             db_config = Config(
                 None,
                 yaml.load(db_config_file.read(), Loader=yaml.FullLoader))
         db_config = RuntimeConfig(db_config)
         assert gnali.get_db_tbi(db_config.files[0], temp, MAX_TIME)
示例#5
0
    def test_vep_annotate(self):
        """Check that VEP.annotate_vep_loftee reproduces the expected records.

        Records from the EXOMES_CCR5_NO_LOF fixture are annotated, wrapped
        in Variant objects, and their record strings are compared with the
        non-header lines of the EXOMES_CCR5 fixture.

        If the GRCh37 dependency version file does not exist, it is created
        for the duration of the annotation and removed afterwards, even
        when annotation fails.
        """
        deps_version = Dependencies.versions['GRCh37']
        deps_version_file = Dependencies.files['GRCh37']
        deps_exist = os.path.exists(deps_version_file)
        if not deps_exist:
            with open(deps_version_file, 'w') as fh:
                fh.write(deps_version)

        try:
            with open(EXOMES_CCR5_NO_LOF, 'r') as stream:
                lines = stream.readlines()
            input_headers = [line for line in lines if line.startswith('#')]
            input_recs = [line for line in lines if not line.startswith('#')]

            with open(DB_CONFIG_FILE, 'r') as config_stream:
                db_config = Config(
                    'gnomadv2.1.1nolof',
                    yaml.load(config_stream.read(), Loader=yaml.FullLoader))
            db_config = RuntimeConfig(db_config)

            method_headers, method_recs = VEP.annotate_vep_loftee(
                input_headers, input_recs, db_config)

            lof_id = db_config.lof['id']
            lof_annot = db_config.lof['annot']
            # Header line describing the loss-of-function annotation field.
            annot_header = [
                line for line in method_headers
                if "ID={}".format(lof_id) in line
            ][0]
            method_recs = [
                Variant("CCR5", rec, lof_id, lof_annot,
                        str(annot_header)).record_str
                for rec in method_recs
            ]
        finally:
            # Only remove the marker file if this test created it.
            if not deps_exist:
                os.remove(deps_version_file)

        with open(EXOMES_CCR5, 'r') as stream:
            expected_recs = [line for line in stream
                             if not line.startswith('#')]
        assert method_recs == expected_recs
示例#6
0
 def test_get_db_tbi_lock_timeout_exception(self, monkeypatch):
     """Check gnali.get_db_tbi when acquiring the file lock times out.

     A copy of the test index is placed in a temporary directory,
     filelock.FileLock.acquire is patched to raise TimeoutError, and
     gnali.download_file is patched out. The call is still expected to
     return a truthy value.
     """
     # A single temporary directory, removed when the block exits.
     with tempfile.TemporaryDirectory() as temp:
         tbi_path = "{}/{}".format(temp, TEST_DB_TBI_NAME)
         shutil.copyfile(TEST_DB_TBI, tbi_path)

         def mock_lock_acquire(*args, **kwargs):
             raise TimeoutError
         monkeypatch.setattr(filelock.FileLock, "acquire", mock_lock_acquire)

         def mock_download_file(url, dest_path, max_time):
             # Deliberate no-op: the download is skipped and nothing is
             # written to dest_path.
             return None
         monkeypatch.setattr(gnali, "download_file", mock_download_file)

         # Context manager guarantees the config file handle is closed.
         with open(DB_CONFIG_FILE, 'r') as db_config_file:
             db_config = Config(
                 None,
                 yaml.load(db_config_file.read(), Loader=yaml.FullLoader))
         db_config = RuntimeConfig(db_config)
         assert gnali.get_db_tbi(db_config.files[0], tbi_path, MAX_TIME)
示例#7
0
    def test_get_test_gene_descs(self, monkeypatch):
        """Check gnali.get_test_gene_descriptions for CCR5 and ALCAM.

        gnali.get_human_genes is patched to read the ENSEMBL_HUMAN_GENES
        CSV fixture. The expected description table is built by dropping
        rows whose chromosome name contains 'PATCH' and keeping only the
        target genes; target genes missing from that table are expected to
        get the status "Unknown gene".
        """
        genes_list = ['CCR5', 'ALCAM']

        def mock_get_human_genes(db_config):
            # The first CSV column is dropped before the frame is returned.
            human_genes = pd.read_csv(ENSEMBL_HUMAN_GENES)
            human_genes.drop(human_genes.columns[0], axis=1, inplace=True)
            return human_genes
        monkeypatch.setattr(gnali, "get_human_genes", mock_get_human_genes)

        # Context manager guarantees the config file handle is closed.
        with open(DB_CONFIG_FILE, 'r') as db_config_file:
            db_config = Config(
                None,
                yaml.load(db_config_file.read(), Loader=yaml.FullLoader))
        db_config = RuntimeConfig(db_config)

        gene_descriptions = gnali.get_human_genes(db_config)
        gene_descriptions.columns = ['hgnc_symbol', 'chromosome_name',
                                     'start_position', 'end_position']

        genes_data = [Gene(gene) for gene in genes_list]
        target_gene_names = [gene.name for gene in genes_data]

        not_patch = ~gene_descriptions['chromosome_name'].str.contains('PATCH')
        gene_descriptions = gene_descriptions[not_patch]
        is_target = gene_descriptions['hgnc_symbol'].isin(target_gene_names)
        gene_descriptions = gene_descriptions[is_target]
        expected_gene_descs = gene_descriptions.reset_index(drop=True)

        available_names = set(gene_descriptions['hgnc_symbol'])
        unavailable_genes = [name for name in target_gene_names
                             if name not in available_names]

        method_genes, method_gene_descs = gnali.get_test_gene_descriptions(
            genes_data, db_config, None, False)

        for gene in genes_data:
            if gene.name in unavailable_genes:
                gene.set_status("Unknown gene")
        expected_genes = genes_data

        assert method_genes == expected_genes
        assert expected_gene_descs.equals(method_gene_descs)
示例#8
0
    def test_get_variants_tabix_error(self, monkeypatch, capfd):
        """Check the error log written by gnali.get_variants.

        get_variants is run in verbose mode for a gene located at
        "Y:2000000000-2000000001", with gnali.get_db_tbi patched to return
        the bundled test index files. The resulting gnali_errors.log is
        compared with the TEST_LOG_FILE fixture.
        """
        target_list = [Gene('GENE1', location="Y:2000000000-2000000001")]

        # Context manager guarantees the config file handle is closed.
        with open(DB_CONFIG_FILE, 'r') as db_config_file:
            db_config = Config(
                'gnomadv2.1.1',
                yaml.load(db_config_file.read(), Loader=yaml.FullLoader))
        db_config = RuntimeConfig(db_config)

        def mock_get_db_tbi(data_file, data_path, max_time):
            # Serve the local test index that matches the requested file.
            if 'exomes' in data_file.path:
                return TEST_EXOMES_TBI
            return TEST_GENOMES_TBI
        monkeypatch.setattr(gnali, "get_db_tbi", mock_get_db_tbi)

        # The temporary directory (and the log inside it) is removed when
        # the block exits, so the comparison happens inside it.
        with tempfile.TemporaryDirectory() as temp_dir:
            output_dir = "{}/output".format(temp_dir)
            pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)

            logger = Logger(output_dir)
            gnali.get_variants(
                target_list, db_config,
                [Filter("homozygous-controls", "controls_nhomalt>0")],
                output_dir, logger, True)
            method_log_file = "{}/gnali_errors.log".format(output_dir)
            assert filecmp.cmp(method_log_file, TEST_LOG_FILE)
示例#9
0
文件: gnali.py 项目: phac-nml/gnali
def main():
    """Command-line entry point.

    Parses the arguments, then either writes a config template and returns,
    or runs the full pipeline: load the database config, read the input
    genes, resolve their descriptions and locations, validate and transform
    the filters, fetch the variants, extract the LoF annotations and write
    the results to the output directory.

    Raises:
        FileExistsError: re-raised, after printing a hint, when the output
            directory already exists and --force was not given.

    NoVariantsAvailableError is handled here: basic results are written and
    the function returns normally. Any other exception propagates.
    """
    run_id = uuid.uuid4()
    arg_parser = init_parser(run_id)
    # With no arguments at all, show the help text and exit.
    if len(sys.argv) == 1:
        arg_parser.print_help()
        arg_parser.exit()
    args = arg_parser.parse_args()
    results_dir = args.output_dir

    if args.config_template_grch37:
        create_template('grch37')
        return
    if args.config_template_grch38:
        create_template('grch38')
        return

    try:
        # Fall back to the bundled config when none was supplied.
        config_path = args.config if args.config is not None \
            else DB_CONFIG_FILE
        db_config = get_db_config(config_path, args.database)
        if args.pop_freqs:
            db_config.validate_pop_freqs_present()

        genes = open_test_file(args.input_file)
        genes_data = [Gene(gene) for gene in genes]

        db_config = RuntimeConfig(db_config)
        # check that VEP dependencies are present if necessary
        if not db_config.has_lof_annots:
            verify_files_present(db_config.ref_genome_name,
                                 db_config.cache_path)

        logger = Logger(results_dir)
        # Raises FileExistsError if the directory exists and --force is off.
        Path(results_dir).mkdir(parents=True, exist_ok=args.force)
        genes, gene_descs = get_test_gene_descriptions(genes_data, db_config,
                                                       logger, args.verbose)
        genes = find_test_locations(genes, gene_descs, db_config)

        validate_filters(db_config, args.predefined_filters,
                         args.additional_filters)

        filters = transform_filters(db_config, args.predefined_filters,
                                    args.additional_filters)

        header = get_variants(genes, db_config, filters, results_dir, logger,
                              args.verbose)

        results, results_as_vcf = \
            extract_lof_annotations(genes, db_config, args.pop_freqs)

        write_results_all(results, genes, header, results_as_vcf, results_dir,
                          args.vcf)

        print("Finished. Output in {}".format(results_dir))
    except FileExistsError:
        print("Output directory already exists. Use a different name or "
              "--force to overwrite")
        raise
    except NoVariantsAvailableError:
        # No variants survived filtering: still write the basic per-gene
        # results so the run leaves output behind.
        # NOTE(review): assumes `genes` is already bound when this is
        # raised -- confirm which pipeline step raises it.
        write_results_basic(genes, results_dir)
        print("No variants passed filtering")
        print("Finished. Output in {}".format(results_dir))