def test_gwas_unpickle():
    """Check that pickled Gwas records can be read back sorted by position.

    Each record is pickled to a temporary file while a heap of
    ``(pos, file offset)`` pairs is built. Popping the heap and seeking to
    each stored offset must return the records in ascending position order.
    """
    g1 = [
        Gwas("1", 101, "A", "T", 1, 0, 5e-8, 1000, 0.4, "rs1234", None, None, None),
        Gwas("1", 105, "A", "T", 1, 0, 5e-8, 1000, 0.4, "rs1234", None, None, None),
        Gwas("1", 102, "A", "T", 1, 0, 5e-8, 1000, 0.4, "rs1234", None, None, None),
    ]
    g2 = []
    idx = []

    # The context manager closes the temporary file even if pickling or
    # unpickling raises, so a failing run does not leak the file handle.
    with tempfile.TemporaryFile() as results:
        for result in g1:
            # record the offset *before* dumping so we can seek back to it
            heappush(idx, (result.pos, results.tell()))
            pickle.dump(result, results)

        while idx:
            _, offset = heappop(idx)
            results.seek(offset)
            g2.append(pickle.load(results))

    # comparing the whole list also verifies that exactly three records came back
    assert [g.pos for g in g2] == [101, 102, 105]
def test_reverse_sign():
    """After reverse_sign() the test expects ref/alt swapped and b negated."""
    record = Gwas('test', 1, 'A', 'T', 1, *([None] * 8))
    record.reverse_sign()

    # chromosome and position must be untouched; alleles swap and b flips sign
    observed = (record.chrom, record.pos, record.ref, record.alt, record.b)
    assert observed == ("test", 1, "T", "A", -1)
def test_are_alleles_iupac():
    """Allele validation passes for 'A'/'T' and asserts for the alt 'wdeT'."""
    unset = [None] * 9

    accepted = Gwas('test', 1, 'A', 'T', *unset)
    accepted.check_alleles_are_vaild()

    with pytest.raises(AssertionError):
        rejected = Gwas('test', 1, 'A', 'wdeT', *unset)
        rejected.check_alleles_are_vaild()
def test_check_reference_allele():
    """check_reference_allele passes for ref 'A' at test:1 and asserts for 'T'.

    Uses the ``test.fasta`` file located next to this module.
    """
    fasta_path = os.path.join(os.path.dirname(__file__), "test.fasta")
    unset = [None] * 8

    with pysam.FastaFile(fasta_path) as fasta:
        matching = Gwas('test', 1, 'A', 'T', 1, *unset)
        matching.check_reference_allele(fasta)

        with pytest.raises(AssertionError):
            mismatching = Gwas('test', 1, 'T', 'A', 1, *unset)
            mismatching.check_reference_allele(fasta)
def test_update_dbsnp():
    """update_dbsnp sets dbsnpid to 'rs1234' at test:1 and leaves it None at test:2.

    Uses the ``dbsnp.vcf.gz`` file located next to this module.
    """
    vcf_path = os.path.join(os.path.dirname(__file__), "dbsnp.vcf.gz")
    unset = [None] * 8

    with pysam.VariantFile(vcf_path) as dbsnp:
        # position 1: the test expects an identifier to be assigned
        annotated = Gwas('test', 1, 'A', 'T', 1, *unset)
        assert annotated.dbsnpid is None
        annotated.update_dbsnp(dbsnp)
        assert annotated.dbsnpid == "rs1234"

        # position 2: the test expects the identifier to stay unset
        unannotated = Gwas('test', 2, 'A', 'T', 1, *unset)
        assert unannotated.dbsnpid is None
        unannotated.update_dbsnp(dbsnp)
        assert unannotated.dbsnpid is None
def main():
    """Command-line entry point: map GWAS summary statistics to VCF/BCF.

    Steps:

    1. Parse the command-line arguments and configure logging.
    2. Load the JSON parameter file and validate it with ``Param``.
    3. Fill in ``data``, ``out``, ``id`` and the cohort sizes from the JSON
       parameters when they were not given on the command line.
    4. Check that the input files and the output directory exist.
    5. Read the summary statistics with ``Gwas.read_from_file`` and write
       them out with ``Vcf.write_to_file``.

    Every validation failure is logged and terminates the process with exit
    status 1 so that calling scripts can detect the error.
    """
    version = "1.3.0"

    parser = argparse.ArgumentParser(description='Map GWAS summary statistics to VCF/BCF')
    parser.add_argument('-v', '--version', action='version', version='%(prog)s {}'.format(version))
    parser.add_argument('--out', dest='out', required=False,
                        help='Path to output VCF/BCF. If not present then must be specified as \'out\' in json file')
    parser.add_argument('--data', dest='data', required=False,
                        help='Path to GWAS summary stats. If not present then must be specified as \'data\' in json file')
    parser.add_argument('--ref', dest='ref', required=True, help='Path to reference FASTA')
    parser.add_argument('--dbsnp', dest='dbsnp', required=False, help='Path to reference dbSNP VCF')
    parser.add_argument('--json', dest='json', required=True, help='Path to parameters JSON')
    parser.add_argument('--id', dest='id', required=False,
                        help='Study identifier. If not present then must be specified as \'id\' in json file')
    parser.add_argument('--cohort_controls', type=int, dest='cohort_controls', required=False, default=None,
                        help='Total study number of controls (if case/control) or total sample size if continuous. Overwrites value if present in json file.')
    parser.add_argument('--cohort_cases', type=int, dest='cohort_cases', required=False, default=None,
                        help='Total study number of cases. Overwrites value if present in json file.')
    parser.add_argument('--csi', dest='csi', action='store_true', default=False, required=False,
                        help='Default is to index tbi but use this flag to index csi')
    parser.add_argument("--log", dest="log", required=False, default='INFO',
                        choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
                        help="Set the logging level")
    parser.add_argument('--alias', dest='alias', required=False, help='Optional chromosome alias file')
    args = parser.parse_args()

    # set logging level
    if args.log:
        logging.basicConfig(level=getattr(logging, args.log), format='%(asctime)s %(levelname)s %(message)s')

    logging.info("Gwas2VCF {}".format(version))
    logging.info("Arguments: {}".format(vars(args)))

    logging.info("Reading JSON parameters")
    try:
        with open(args.json) as f:
            raw_params = json.load(f)
    except OSError as e:
        # missing/unreadable file: report it like any other input problem
        logging.error("Could not read json parameter file: {}".format(e))
        sys.exit(1)
    except json.decoder.JSONDecodeError as e:
        logging.error("Could not read json parameter file: {}".format(e))
        sys.exit(1)

    try:
        j = Param(strict=True).load(raw_params).data
    except marshmallow.exceptions.ValidationError as e:
        logging.error("Could not validate json parameter file: {}".format(e))
        sys.exit(1)
    logging.info("Parameters: {}".format(j))

    logging.info("Checking input arguments")

    # mandatory settings: the command line wins, the JSON file is the fallback
    for name, label in (("data", "'data' filename"), ("out", "out filename"), ("id", "id")):
        if getattr(args, name) is not None:
            continue
        if name not in j:
            logging.error("{} not provided in arguments or json file".format(label))
            sys.exit(1)
        setattr(args, name, j[name])

    # optional settings: only taken from the JSON file when not given explicitly
    for name in ("cohort_cases", "cohort_controls"):
        if getattr(args, name) is None and name in j:
            setattr(args, name, j[name])

    # check values are valid
    if args.cohort_cases is not None and args.cohort_cases < 1:
        logging.error("Total study number of cases must be a positive number")
        sys.exit(1)

    if args.cohort_controls is not None and args.cohort_controls < 1:
        logging.error("Total study number of controls must be a positive number")
        sys.exit(1)

    if not os.path.isfile(args.data):
        logging.error("{} file does not exist".format(args.data))
        sys.exit(1)

    if not os.path.isfile(args.ref):
        logging.error("{} file does not exist".format(args.ref))
        sys.exit(1)

    # Resolve to an absolute path first: dirname() of a bare filename is '',
    # which would wrongly be reported as a missing directory.
    if not os.path.exists(os.path.dirname(os.path.abspath(args.out))):
        logging.error("{} output directory does not exist".format(args.out))
        sys.exit(1)

    # optional chromosome alias table: two tab-separated columns per line
    alias = None
    if args.alias is not None:
        alias = {}
        with open(args.alias) as f:
            for line in f:
                key, val = line.strip().split("\t")
                alias[key] = val

    # The FASTA handle is also passed to Vcf.write_to_file below, so it is
    # kept open until the VCF has been written.
    with pysam.FastaFile(args.ref) as fasta:
        dbsnp = pysam.VariantFile(args.dbsnp) if args.dbsnp is not None else None

        # read in data
        # harmonise, left align and trim on-the-fly and write to pickle format
        # keep file index for each record and chromosome position to write out karyotypically sorted records later
        try:
            gwas, idx, sample_metadata = Gwas.read_from_file(
                args.data,
                fasta,
                j['chr_col'],
                j['pos_col'],
                j['ea_col'],
                j['oa_col'],
                j['beta_col'],
                j['se_col'],
                j['pval_col'],
                j['delimiter'],
                j['header'],
                ncase_field=j.get('ncase_col'),
                rsid_field=j.get('snp_col'),
                ea_af_field=j.get('eaf_col'),
                nea_af_field=j.get('oaf_col'),
                imp_z_field=j.get('imp_z_col'),
                imp_info_field=j.get('imp_info_col'),
                ncontrol_field=j.get('ncontrol_col'),
                alias=alias,
                dbsnp=dbsnp
            )
        finally:
            # dbSNP is only needed while reading; release it even on failure
            if dbsnp is not None:
                dbsnp.close()

        try:
            # metadata
            file_metadata = {
                'Gwas2VCF_command': ' '.join(sys.argv[1:]) + "; " + version,
                'file_date': datetime.now().isoformat()
            }

            if args.cohort_controls is not None:
                sample_metadata['TotalControls'] = args.cohort_controls

            if args.cohort_cases is not None:
                sample_metadata['TotalCases'] = args.cohort_cases

            if 'ncase_col' in j or args.cohort_cases is not None:
                sample_metadata['StudyType'] = 'CaseControl'
            else:
                sample_metadata['StudyType'] = 'Continuous'

            # write to VCF
            # loop over sorted chromosome position and get record using random access
            Vcf.write_to_file(gwas, idx, args.out, fasta, j['build'], args.id, sample_metadata, file_metadata,
                              args.csi)
        finally:
            # close temp file to release disk space
            gwas.close()
def test_normalise():
    """Check normalise() output for an SNV, padded SNVs, an insertion and a deletion.

    Each case is validated against ``test.fasta`` (located next to this
    module) with check_reference_allele() and then normalised with
    ``padding=5``; the chromosome must remain "test".
    """
    # (input pos, input ref, input alt, expected pos, expected ref, expected alt)
    cases = (
        (1, 'A', 'T', 1, "A", "T"),             # SNV
        (10, 'ACACA', 'ACACT', 14, "A", "T"),   # left pad SNV
        (10, 'ACACA', 'TCACA', 10, "A", "T"),   # right pad SNV
        (10, 'ACA', 'ACAGT', 12, "A", "AGT"),   # left pad ins
        (10, 'ACACA', 'ACA', 9, "TAC", "T"),    # left-align pad del
    )
    unset = [None] * 9
    fasta_path = os.path.join(os.path.dirname(__file__), "test.fasta")

    with pysam.FastaFile(fasta_path) as fasta:
        for pos, ref, alt, want_pos, want_ref, want_alt in cases:
            g = Gwas('test', pos, ref, alt, *unset)
            g.check_reference_allele(fasta)
            g.normalise(fasta, padding=5)

            assert g.chrom == "test"
            assert g.pos == want_pos
            assert g.ref == want_ref
            assert g.alt == want_alt