def test_find_test_locations(self):
    """Check the gene locations produced by ``gnali.find_test_locations``.

    Builds a two-gene description table (CCR5, ALCAM), runs
    ``find_test_locations`` on fresh ``Gene`` objects, then sets the
    expected ``chrom:start-end`` location (prefixed with ``chr`` for
    GRCh38) on every gene whose status is still ``None`` and compares.
    """
    # Context manager so the config file handle is closed once parsed.
    with open(DB_CONFIG_FILE, 'r') as db_config_file:
        db_config = Config(None, yaml.load(db_config_file.read(),
                                           Loader=yaml.FullLoader))
    db_config = RuntimeConfig(db_config)

    columns = ['', 'hgnc_symbol', 'chromosome_name',
               'start_position', 'end_position']
    # NOTE(review): both rows share the same end_position and ALCAM's end
    # is smaller than its start -- looks like copied fixture data; confirm.
    gene_descs = pd.DataFrame(
        {
            '': [46843, 58454],
            'hgnc_symbol': ['CCR5', 'ALCAM'],
            'chromosome_name': [3, 3],
            'start_position': [46411633, 105085753],
            'end_position': [46417697, 46417697],
        },
        columns=columns)

    genes = [Gene(name) for name in ['CCR5', 'ALCAM']]
    method_test_locations = gnali.find_test_locations(genes, gene_descs,
                                                      db_config)

    prefix = "chr" if db_config.ref_genome_name == "GRCh38" else ""
    for index, gene in enumerate(genes):
        if gene.status is None:
            row = gene_descs.index[index]
            chrom = gene_descs.loc[row, 'chromosome_name']
            start = gene_descs.loc[row, 'start_position']
            end = gene_descs.loc[row, 'end_position']
            gene.set_location(location="{prefix}{}:{}-{}"
                              .format(chrom, start, end, prefix=prefix))

    # NOTE(review): if find_test_locations returns the very list it was
    # given, both sides are the same objects -- confirm this still
    # exercises the method's output.
    assert method_test_locations == genes
def test_get_variants_happy(self, monkeypatch):
    """Check that ``gnali.get_variants`` collects the expected CCR5 records.

    ``gnali.get_db_tbi`` is patched to return the bundled test index
    files. After the call, the ``record_str`` of every variant attached
    to the target genes (newline-terminated) must equal the lines of
    ``EXPECTED_PLOF_VARIANTS``.
    """
    target_genes = [Gene('CCR5', location="3:46411633-46417697")]

    with open(EXPECTED_PLOF_VARIANTS, 'r') as test_file:
        expected_variants = list(test_file)

    # Context manager so the config file handle is closed once parsed.
    with open(DB_CONFIG_FILE, 'r') as db_config_file:
        db_config = Config('gnomadv2.1.1',
                           yaml.load(db_config_file.read(),
                                     Loader=yaml.FullLoader))
    db_config = RuntimeConfig(db_config)

    def mock_get_db_tbi(data_file, data_path, max_time):
        """Return a bundled index, chosen by the data file's path."""
        if 'exomes' in data_file.path:
            return TEST_EXOMES_TBI
        return TEST_GENOMES_TBI
    monkeypatch.setattr(gnali, "get_db_tbi", mock_get_db_tbi)

    filters = [Filter("homozygous-controls", "controls_nhomalt>0")]
    # The temporary output directory is removed as soon as the call ends.
    with tempfile.TemporaryDirectory() as temp_dir:
        gnali.get_variants(target_genes, db_config, filters,
                           temp_dir, None, False)
        method_variants = [var.record_str + "\n"
                           for gene in target_genes
                           for var in gene.variants]

    assert expected_variants == method_variants
def test_extract_lof_annotations(self):
    """Compare ``gnali.extract_lof_annotations`` with a hand-built table.

    Variants are loaded from ``EXPECTED_PLOF_VARIANTS`` using the
    ``ID=vep`` header line of the 'exomes' data file. The expected frame
    is built from the variants' tuples: one row per variant, or one row
    per transcript when a variant reports multiple transcripts. The VEP
    field is then split into its leading columns and duplicate rows are
    dropped.
    """
    # Context manager so the config file handle is closed once parsed.
    with open(DB_CONFIG_FILE, 'r') as config_file:
        config = Config('gnomadv2.1.1',
                        yaml.load(config_file.read(),
                                  Loader=yaml.FullLoader))
    config = RuntimeConfig(config)

    data_file = next((file for file in config.files
                      if file.name == 'exomes'), None)
    assert data_file is not None, "no 'exomes' file in the database config"
    # Only the header is needed, so close the tabix handle straight away.
    with pysam.TabixFile(data_file.path) as tbx:
        header = [line for line in tbx.header if "ID=vep" in line][0]

    genes = [Gene("CCR5")]
    with open(EXPECTED_PLOF_VARIANTS, 'r') as test_file:
        test_variants = [Variant("CCR5", str(row), "vep", "LoF", header)
                         for row in test_file]
    genes[0].add_variants(test_variants)

    # The second return value (VCF records) is not checked by this test.
    method_results, _ = gnali.extract_lof_annotations(genes, config, False)

    lof_id = config.lof.get('id')
    variant_tuple = []
    for variant in genes[0].variants:
        if not variant.multiple_transcripts():
            variant_tuple.append(variant.as_tuple_vep(lof_id))
        else:
            # One expected row per transcript.
            for trans in variant.transcripts:
                variant_tuple.append(
                    variant.as_tuple_basic() + (str(trans),))

    results = pd.DataFrame(data=np.asarray(variant_tuple, dtype=str))
    results.columns = ["Chromosome", "Position_Start", "RSID",
                       "Reference_Allele", "Alternate_Allele",
                       "Score", "Quality", "VEP"]

    # ``n`` is passed by keyword: it is keyword-only in recent pandas.
    results_codes = pd.DataFrame(
        results['VEP'].str.split('|', n=5).tolist(),
        columns=["LoF_Variant", "LoF_Annotation", "Confidence",
                 "HGNC_Symbol", "Ensembl Code", "Rest"])
    results_codes['HGVSc'] = \
        results_codes.Rest.str.split("|", n=6).str[5]
    results_codes = results_codes.drop(['Rest', 'Confidence'], axis=1)

    results = results.drop('VEP', axis=1)
    results = pd.concat([results, results_codes], axis=1)
    expected_results = results.drop_duplicates(keep='first', inplace=False)

    assert expected_results.equals(method_results)
def test_get_db_tbi_happy(self, monkeypatch):
    """Check that ``gnali.get_db_tbi`` returns a truthy value.

    ``gnali.tbi_needed`` is patched to always report that an index is
    needed, and ``gnali.download_file`` is patched to do nothing, so no
    network access takes place.
    """
    def mock_tbi_needed(url, dest_path):
        """Always report that the index must be fetched."""
        return True
    monkeypatch.setattr(gnali, "tbi_needed", mock_tbi_needed)

    def mock_download_file(url, dest_path, max_time):
        """Stand in for the real download without touching disk or network."""
        return None
    monkeypatch.setattr(gnali, "download_file", mock_download_file)

    # Context manager so the config file handle is closed once parsed.
    with open(DB_CONFIG_FILE, 'r') as db_config_file:
        db_config = Config(None, yaml.load(db_config_file.read(),
                                           Loader=yaml.FullLoader))
    db_config = RuntimeConfig(db_config)

    with tempfile.TemporaryDirectory() as temp:
        assert gnali.get_db_tbi(db_config.files[0], temp, MAX_TIME)
def test_vep_annotate(self):
    """Check that ``VEP.annotate_vep_loftee`` reproduces the expected records.

    The records of ``EXOMES_CCR5_NO_LOF`` are annotated, wrapped in
    ``Variant`` objects, and their ``record_str`` values are compared
    with the non-header lines of ``EXOMES_CCR5``.

    If the GRCh37 dependency version file does not exist, a placeholder
    is written for the duration of the test and removed afterwards, even
    when the annotation step fails.
    """
    deps_version = Dependencies.versions['GRCh37']
    deps_version_file = Dependencies.files['GRCh37']
    created_placeholder = not os.path.exists(deps_version_file)
    if created_placeholder:
        with open(deps_version_file, 'w') as fh:
            fh.write(deps_version)

    try:
        with open(EXOMES_CCR5_NO_LOF, 'r') as stream:
            lines = stream.readlines()
        input_headers = [line for line in lines if line.startswith('#')]
        input_recs = [line for line in lines if not line.startswith('#')]

        with open(DB_CONFIG_FILE, 'r') as config_stream:
            db_config = Config(
                'gnomadv2.1.1nolof',
                yaml.load(config_stream.read(), Loader=yaml.FullLoader))
        db_config = RuntimeConfig(db_config)

        method_headers, method_recs = VEP.annotate_vep_loftee(
            input_headers, input_recs, db_config)

        lof_id = db_config.lof['id']
        lof_annot = db_config.lof['annot']
        annot_header = [
            line for line in method_headers
            if "ID={}".format(lof_id) in line
        ][0]
        method_recs = [
            Variant("CCR5", rec, lof_id, lof_annot,
                    str(annot_header)).record_str
            for rec in method_recs
        ]
    finally:
        # Only remove the version file if this test created it.
        if created_placeholder:
            os.remove(deps_version_file)

    with open(EXOMES_CCR5, 'r') as stream:
        expected_recs = [line for line in stream
                         if not line.startswith('#')]

    assert method_recs == expected_recs
def test_get_db_tbi_lock_timeout_exception(self, monkeypatch):
    """Check ``gnali.get_db_tbi`` when acquiring the file lock times out.

    ``filelock.FileLock.acquire`` is patched to raise ``TimeoutError``
    and ``gnali.download_file`` is patched to do nothing. A copy of the
    test index is placed in a temporary directory and the call is
    expected to still return a truthy value.
    """
    def mock_lock_acquire(*args, **kwargs):
        """Simulate a lock that can never be acquired."""
        raise TimeoutError
    monkeypatch.setattr(filelock.FileLock, "acquire", mock_lock_acquire)

    def mock_download_file(url, dest_path, max_time):
        """Stand in for the real download without touching disk or network."""
        return None
    monkeypatch.setattr(gnali, "download_file", mock_download_file)

    # Context manager so the config file handle is closed once parsed.
    with open(DB_CONFIG_FILE, 'r') as db_config_file:
        db_config = Config(None, yaml.load(db_config_file.read(),
                                           Loader=yaml.FullLoader))
    db_config = RuntimeConfig(db_config)

    # A single temporary directory, removed when the block exits.
    with tempfile.TemporaryDirectory() as temp_dir:
        tbi_path = "{}/{}".format(temp_dir, TEST_DB_TBI_NAME)
        shutil.copyfile(TEST_DB_TBI, tbi_path)
        # NOTE(review): the index file path (not its directory) is passed
        # as the second argument here -- confirm that is intended.
        assert gnali.get_db_tbi(db_config.files[0], tbi_path, MAX_TIME)
def test_get_test_gene_descs(self, monkeypatch):
    """Compare ``gnali.get_test_gene_descriptions`` with a manual filter.

    ``gnali.get_human_genes`` is patched to read ``ENSEMBL_HUMAN_GENES``
    (dropping its first column). The expected descriptions are that
    table without rows whose chromosome name contains 'PATCH', limited
    to the target genes and re-indexed. Target genes missing from the
    table are expected to carry the status "Unknown gene".
    """
    genes_list = ['CCR5', 'ALCAM']
    column_names = ['hgnc_symbol', 'chromosome_name',
                    'start_position', 'end_position']

    def mock_get_human_genes(db_config):
        """Load the bundled gene table instead of querying for it."""
        human_genes = pd.read_csv(ENSEMBL_HUMAN_GENES)
        human_genes.drop(human_genes.columns[0], axis=1, inplace=True)
        return human_genes
    monkeypatch.setattr(gnali, "get_human_genes", mock_get_human_genes)

    # Context manager so the config file handle is closed once parsed.
    with open(DB_CONFIG_FILE, 'r') as db_config_file:
        db_config = Config(None, yaml.load(db_config_file.read(),
                                           Loader=yaml.FullLoader))
    db_config = RuntimeConfig(db_config)

    gene_descriptions = gnali.get_human_genes(db_config)
    gene_descriptions.columns = column_names

    genes_data = [Gene(name) for name in genes_list]
    target_gene_names = [gene.name for gene in genes_data]

    not_patch = ~gene_descriptions['chromosome_name'].str.contains('PATCH')
    gene_descriptions = gene_descriptions[not_patch]
    is_target = gene_descriptions['hgnc_symbol'].isin(target_gene_names)
    gene_descriptions = gene_descriptions[is_target]
    expected_gene_descs = gene_descriptions.reset_index(drop=True)

    # Build the lookup once rather than per membership test.
    known_symbols = set(gene_descriptions['hgnc_symbol'])
    unavailable_genes = {name for name in target_gene_names
                         if name not in known_symbols}

    method_genes, method_gene_descs = gnali.get_test_gene_descriptions(
        genes_data, db_config, None, False)

    for gene in genes_data:
        if gene.name in unavailable_genes:
            gene.set_status("Unknown gene")

    assert method_genes == genes_data
    assert expected_gene_descs.equals(method_gene_descs)
def test_get_variants_tabix_error(self, monkeypatch, capfd):
    """Check the error log ``gnali.get_variants`` writes for a bad region.

    A gene with location ``Y:2000000000-2000000001`` is queried with
    ``gnali.get_db_tbi`` patched to return the bundled index files. The
    resulting ``gnali_errors.log`` in the output directory must match
    ``TEST_LOG_FILE``. The ``capfd`` fixture is requested but not read
    in this test.
    """
    target_list = [Gene('GENE1', location="Y:2000000000-2000000001")]

    # Context manager so the config file handle is closed once parsed.
    with open(DB_CONFIG_FILE, 'r') as db_config_file:
        db_config = Config('gnomadv2.1.1',
                           yaml.load(db_config_file.read(),
                                     Loader=yaml.FullLoader))
    db_config = RuntimeConfig(db_config)

    def mock_get_db_tbi(data_file, data_path, max_time):
        """Return a bundled index, chosen by the data file's path."""
        if 'exomes' in data_file.path:
            return TEST_EXOMES_TBI
        return TEST_GENOMES_TBI
    monkeypatch.setattr(gnali, "get_db_tbi", mock_get_db_tbi)

    filters = [Filter("homozygous-controls", "controls_nhomalt>0")]
    # The temporary directory is removed once the comparison is done.
    with tempfile.TemporaryDirectory() as temp_dir:
        output_dir = "{}/output".format(temp_dir)
        pathlib.Path(output_dir).mkdir(parents=True, exist_ok=True)
        logger = Logger(output_dir)

        gnali.get_variants(target_list, db_config, filters,
                           output_dir, logger, True)

        method_log_file = "{}/gnali_errors.log".format(output_dir)
        # shallow=False forces a content comparison rather than allowing
        # a match on os.stat() signatures alone.
        assert filecmp.cmp(method_log_file, TEST_LOG_FILE, shallow=False)
def main():
    """Command-line entry point.

    Parses the arguments (printing help and exiting when none are
    given). If a config-template flag is set, writes that template and
    returns. Otherwise loads the database config, reads the input genes,
    resolves their descriptions and locations, applies the filters,
    fetches the variants, extracts the LoF annotations and writes the
    results to the output directory.

    Raises:
        FileExistsError: re-raised, after printing a hint, when the
            output directory already exists and ``--force`` was not
            given.
    """
    run_id = uuid.uuid4()
    arg_parser = init_parser(run_id)
    if len(sys.argv) == 1:
        arg_parser.print_help()
        arg_parser.exit()
    args = arg_parser.parse_args()
    results_dir = args.output_dir

    if args.config_template_grch37:
        create_template('grch37')
        return
    if args.config_template_grch38:
        create_template('grch38')
        return

    try:
        # Fall back to the bundled config when none was supplied.
        config_path = (args.config if args.config is not None
                       else DB_CONFIG_FILE)
        db_config = get_db_config(config_path, args.database)

        if args.pop_freqs:
            db_config.validate_pop_freqs_present()

        genes = open_test_file(args.input_file)
        genes_data = [Gene(gene) for gene in genes]

        db_config = RuntimeConfig(db_config)

        # check that VEP dependencies are present if necessary
        if not db_config.has_lof_annots:
            verify_files_present(db_config.ref_genome_name,
                                 db_config.cache_path)

        logger = Logger(results_dir)
        Path(results_dir).mkdir(parents=True, exist_ok=args.force)

        genes, gene_descs = get_test_gene_descriptions(
            genes_data, db_config, logger, args.verbose)
        genes = find_test_locations(genes, gene_descs, db_config)

        validate_filters(db_config, args.predefined_filters,
                         args.additional_filters)
        filters = transform_filters(db_config, args.predefined_filters,
                                    args.additional_filters)

        header = get_variants(genes, db_config, filters, results_dir,
                              logger, args.verbose)
        results, results_as_vcf = \
            extract_lof_annotations(genes, db_config, args.pop_freqs)

        write_results_all(results, genes, header, results_as_vcf,
                          results_dir, args.vcf)
        print("Finished. Output in {}".format(results_dir))
    except FileExistsError:
        print("Output directory already exists. Use a different name or "
              "--force to overwrite")
        raise
    except NoVariantsAvailableError:
        # Nothing passed filtering: still write the basic per-gene
        # results so the run leaves output behind.
        write_results_basic(genes, results_dir)
        print("No variants passed filtering")
        print("Finished. Output in {}".format(results_dir))