def test_init_hail_context_twice(self):
    hl.init(idempotent=True)  # Should be no error
    hl.stop()

    hl.init(idempotent=True)
    hl.experimental.define_function(lambda x: x + 2, hl.tint32)
    # ensure functions are cleaned up without error
    hl.stop()

    hl.init(idempotent=True)  # Should be no error
    hl.init(hl.spark_context(), idempotent=True)  # Should be no error
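For context, a minimal, self-contained sketch of the hl.experimental.define_function call exercised in the test above; the add_two name and the literal input are illustrative only, not part of the original test.

import hail as hl

hl.init(idempotent=True)

# Register a simple function of one int32 argument.
add_two = hl.experimental.define_function(lambda x: x + 2, hl.tint32)

# The returned wrapper can be applied to Hail expressions like any other function.
print(hl.eval(add_two(hl.int32(40))))  # expected: 42

hl.stop()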
def test_init_hail_context_twice(self):
    hl.init(hl.spark_context(), idempotent=True)  # Should be no error
def create_meta(related_data: GnomADRelatedData, fake_fam_prop: float, old_version: str, overwrite: bool) -> None:
    """
    Creates and writes a dataframe with metadata to evaluate gnomAD trios from the raw ped file.
    To allow comparison with the raw ped, metadata is also generated for:
    1) a number of generated fake families,
    2) the previous iteration of the ped file (`old_version`).

    :param GnomADRelatedData related_data: Input data
    :param float fake_fam_prop: Number of fake trios to generate, as a proportion of the number of real families in the data
    :param str old_version: Version of the previous iteration to load
    :param bool overwrite: Whether to overwrite previous data
    :return: Nothing
    :rtype: None
    """

    raw_ped = hl.Pedigree.read(raw_fam_path(related_data.data_type), delimiter="\\t")

    n_fake_trios = int(fake_fam_prop * len(raw_ped.complete_trios()))
    logger.info(f"Generating fake pedigree with {n_fake_trios} trios for {related_data.data_type}")
    fake_fams = create_fake_pedigree(n_fake_trios, list(related_data.meta_pd.s), raw_ped)
    fake_fams.write(fake_fam_path(related_data.data_type))

    logger.info(f"Running mendel_errors on {related_data.data_type}")
    # Run mendel errors on families made of random samples to establish expectation in non-trios:
    pedigrees = [
        ('new', raw_ped),
        ('old', hl.Pedigree.read(fam_path(related_data.data_type, version=old_version), delimiter="\\t")),
        ('fake', hl.Pedigree.read(fake_fam_path(related_data.data_type), delimiter="\\t"))
    ]
    ped_pd = merge_pedigree_pandas([(name, ped_to_pandas(ped)) for name, ped in pedigrees],
                                   related_data.sample_to_dups, True)

    # Run mendel_errors
    all_ped = pandas_to_ped(ped_pd)
    gnomad = get_gnomad_data(related_data.data_type)
    fam_samples = hl.literal({
        s for trio in all_ped.trios
        for s in [trio.s, trio.mat_id, trio.pat_id]
    })
    gnomad = gnomad.filter_cols(fam_samples.contains(gnomad.s))
    all_errors, per_fam, per_sample, _ = hl.mendel_errors(gnomad['GT'], all_ped)
    all_errors.write(sample_qc_mendel_ht_path(related_data.data_type, "all_errors"), overwrite=overwrite)
    per_fam.write(sample_qc_mendel_ht_path(related_data.data_type, "per_fam"), overwrite=overwrite)
    per_sample.write(sample_qc_mendel_ht_path(related_data.data_type, "per_sample"), overwrite=overwrite)

    # Merge all metadata
    ped_pd = add_pedigree_meta(ped_pd=ped_pd,
                               meta_pd=related_data.meta_pd,
                               kin_ht=related_data.kin_ht,
                               mendel_per_sample_ht=per_sample)

    # Write merged pedigrees as HT
    sql_context = SQLContext(hl.spark_context())
    hl.Table.from_spark(sql_context.createDataFrame(ped_pd)).write(
        merged_pedigrees_ht_path(related_data.data_type), overwrite=overwrite)
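The final step above relies on the pandas -> Spark DataFrame -> Hail Table round trip. Below is a minimal sketch of that conversion, with hypothetical column names and assuming an already-initialized Hail context; it is not part of the gnomAD code.

import pandas as pd
import hail as hl
from pyspark.sql import SQLContext

hl.init(idempotent=True)

# Hypothetical pedigree-like pandas DataFrame.
ped_pd = pd.DataFrame({"s": ["sample1", "sample2"], "fam_id": ["fam1", "fam1"]})

# Convert to a Spark DataFrame on Hail's SparkContext, then to a Hail Table.
sql_context = SQLContext(hl.spark_context())
ht = hl.Table.from_spark(sql_context.createDataFrame(ped_pd))
ht.describe()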
def test_init_hail_context_twice(self):
    hl.init(idempotent=True)  # Should be no error
    hl.stop()

    hl.init(idempotent=True)  # Should be no error
    hl.init(hl.spark_context(), idempotent=True)  # Should be no error
print("Error! One of --genes, --variants, or --variant_list must be given!")
exit()

if (args.pheno_col is not None) and (args.female_col is None):
    print("Error! if giving --pheno_col, --female_col must also be given")

args.output_stem = os.path.join(args.output_dir, args.output_name)

##########################
# Import python scripts  #
##########################
scripts = ["variant_annotation.py", "find_putative_causal_variants.py"]
for script in scripts:
    hl.spark_context().addPyFile(os.path.join(args.scripts_dir, script))

import variant_annotation as va
from find_putative_causal_variants import count_case_control_carriers

########################
# Load in matrix table #
########################
fullmt = hl.read_matrix_table(args.mt)

####################################
# Check if variant annotation done #
####################################
try:
    fullmt.gene.describe()
except Exception as e:
def main(sqlContext, configuration, chrom, nchroms, step):
    call(["ls", "-l"])

    if (chrom == "" or step == ""):
        usage()
        sys.exit(2)

    destination = configuration["destination"] + "/" + configuration["version"]
    sourceFileName = utils.buildFileName(configuration["source_path"], chrom)
    fileName = "variants" + chrom + ".ht"
    fileNameCnv = "variants.ht"
    number_partitions = configuration["number_of_partitions"]
    current_dir = utils.buildFileName(configuration["origin_path"], chrom)

    print("sourcefilename is " + sourceFileName)

    # Pipeline steps

    if ("createIndex" in step):
        if ("createIndexCNV" in step):
            print("step to create index CNV")
            index.create_index_cnv(configuration["elasticsearch"]["host"],
                                   configuration["elasticsearch"]["port"],
                                   configuration["elasticsearch"]["index_cnv_name"],
                                   configuration["version"],
                                   configuration["elasticsearch"]["num_shards"],
                                   configuration["elasticsearch"]["num_replicas"],
                                   configuration["elasticsearch"]["user"],
                                   configuration["elasticsearch"]["pwd"])
        else:
            print("step to create index")
            index.create_index_snv(configuration["elasticsearch"]["host"],
                                   configuration["elasticsearch"]["port"],
                                   configuration["elasticsearch"]["index_name"],
                                   configuration["version"],
                                   configuration["elasticsearch"]["num_shards"],
                                   configuration["elasticsearch"]["num_replicas"],
                                   configuration["elasticsearch"]["user"],
                                   configuration["elasticsearch"]["pwd"])

    if ("loadGermline" in step):
        print("step loadGermline")
        annotations.importGermline(hl, current_dir, sourceFileName,
                                   destination + "/loaded/" + fileName,
                                   number_partitions)
        current_dir = destination + "/loaded/" + "variants" + chrom + ".ht"

    if ("loadSomatic" in step):
        print("step loadSomatics")
        print("Somatics list path: " + utils.buildFileName(configuration["somatic_paths"], chrom))
        # Read somatic vcf file
        sc = hl.spark_context()
        somatic_paths = sc.textFile(
            utils.buildFileName(configuration["somatic_paths"], chrom)).collect()
        # Import and merge somatic files
        annotations.importSomatic(hl, current_dir, somatic_paths,
                                  destination + "/loadedSomatic/" + fileName,
                                  number_partitions)
        current_dir = destination + "/loadedSomatic/" + fileName

    if ("loadCNV" in step):
        print("step loadCNV")
        annotations.loadCNV(hl, configuration["source_path_cnv"],
                            destination + "/loadedCNV/" + fileNameCnv,
                            number_partitions)

    if ("loaddbNSFP" in step):
        print("step loaddbNSFP")
        annotations.importDbNSFPTable(
            hl, utils.buildFileName(configuration["dbNSFP_Raw"], chrom),
            utils.buildFileName(configuration["dnNSFP_path"], chrom),
            number_partitions)

    if ("loadcadd" in step):
        print("step loadCADD")
        annotations.importDBVcf(
            hl, utils.buildFileName(configuration["cadd_Raw"], chrom),
            utils.buildFileName(configuration["cadd_path"], chrom),
            number_partitions)

    if ("loadclinvar" in step):
        print("step loadclinvar")
        annotations.importDBVcf(
            hl, utils.buildFileName(configuration["clinvar_Raw"], ""),
            utils.buildFileName(configuration["clinvar_path"], ""),
            number_partitions)

    if ("loadExomesGnomad" in step):
        print("step load exomes gnomad")
        annotations.importDBVcf(
            hl, utils.buildFileName(configuration["exomesGnomad_Raw"], chrom),
            utils.buildFileName(configuration["exomesGnomad_path"], chrom),
            number_partitions)

    if ("loadExAC" in step):
        print("step load ExAC")
        annotations.importDBVcf(
            hl, utils.buildFileName(configuration["ExAC_Raw"], chrom),
            utils.buildFileName(configuration["ExAC_path"], chrom),
            number_partitions)

    if ("loadCGI" in step):
        print("step load CGI")
        annotations.importCGITable(
            hl, utils.buildFileName(configuration["CGI_Raw"], ""),
            utils.buildFileName(configuration["CGI_path"], ""),
            number_partitions)

    if ("annotateCGI" in step):
        print("step annotate CGI")
        variants = hl.read_table(current_dir)
        annotations.annotateCGI(hl, variants,
                                utils.buildFileName(configuration["CGI_path"], chrom),
                                destination + "/annotatedCGI/" + fileName)
        current_dir = destination + "/annotatedCGI/" + fileName

    if ("annotateVEP" in step):
        print("step annotate VEP")
        print("source file is " + current_dir)
        variants = hl.read_table(current_dir)
        annotations.annotateVEP(hl, variants,
                                destination + "/annotatedVEP/" + fileName,
                                configuration["vep"], number_partitions)

    if ("annotatedbNSFP" in step):
        print("step annotate dbNSFP")
        variants = hl.read_table(destination + "/annotatedVEP/" + fileName)
        annotations.annotateDbNSFP(hl, variants,
                                   utils.buildFileName(configuration["dnNSFP_path"], chrom),
                                   destination + "/annotatedVEPdbnSFP/" + fileName)

    if ("annotatecadd" in step):
        print("step annotate dbcadd")
        variants = hl.read_table(destination + "/annotatedVEPdbnSFP/" + fileName)
        annotations.annotateCADD(hl, variants,
                                 utils.buildFileName(configuration["cadd_path"], chrom),
                                 destination + "/annotatedVEPdbnSFPCadd/" + fileName)

    if ("annotateclinvar" in step):
        print("step annotate clinvar")
        variants = hl.read_table(destination + "/annotatedVEPdbnSFPCadd/" + fileName)
        annotations.annotateClinvar(hl, variants,
                                    utils.buildFileName(configuration["clinvar_path"], ""),
                                    destination + "/annotatedVEPdbnSFPCaddClinvar/" + fileName)

    if ("annotateExomesGnomad" in step):
        print("step annotate exomes gnomad")
        variants = hl.read_table(destination + "/annotatedVEPdbnSFPCaddClinvar/" + fileName)
        annotations.annotateGnomADEx(hl, variants,
                                     utils.buildFileName(configuration["exomesGnomad_path"], chrom),
                                     destination + "/annotatedVEPdbnSFPCaddClinvarExGnomad/" + fileName)

    if ("annotateExAC" in step):
        print("step annotate ExAC")
        variants = hl.read_table(destination + "/annotatedVEPdbnSFPCaddClinvarExGnomad/" + fileName)
        annotations.annotateExAC(hl, variants,
                                 utils.buildFileName(configuration["ExAC_path"], chrom),
                                 destination + "/annotatedVEPdbnSFPCaddClinvarExGnomadExAC/" + fileName)

    # Transforming step. It sets all fields to the corresponding ElasticSearch format
    if ("transform" in step):
        print("step transform")
        annotated = hl.read_table(
            destination + "/annotatedVEPdbnSFPCaddClinvarExGnomadExAC/" + fileName)
        transform.transform(annotated, destination, chrom)

    # Uploading step. It uploads all annotated variants to ElasticSearch
    if ("toElastic" in step):
        print("step to elastic")
        es_conf = {
            "es.net.http.auth.user": configuration["elasticsearch"]["user"],
            "es.net.http.auth.pass": configuration["elasticsearch"]["pwd"],
            "es.nodes": configuration["elasticsearch"]["host"],
            "es.port": configuration["elasticsearch"]["port"]
        }
        # print(es_conf)
        index_name = configuration["elasticsearch"]["index_name"]

        if ("toElasticCNV" in step):
            print("step toElasticCNV")
            variants = hl.read_table(destination + "/loadedCNV/" + fileNameCnv).to_spark()
            variants = variants.withColumn("chrom", variants["chrom"].cast(IntegerType())) \
                               .withColumn("start", variants["start"].cast(IntegerType())) \
                               .withColumn("end", variants["end"].cast(IntegerType())) \
                               .withColumn("cnt", variants["cnt"].cast(IntegerType())) \
                               .withColumn("bf", variants["bf"].cast(FloatType())) \
                               .withColumn("omim_number", variants["omim_number"].cast(IntegerType())) \
                               .withColumn("tool", lit("ExomeDepth"))
            index_name = configuration["elasticsearch"]["index_cnv_name"]
            variants.printSchema()
        else:
            # Getting annotated variants and adding the chromosome column
            variants = sqlContext.read.load(destination + "/variants/chrom=" + chrom) \
                .withColumn("chrom", lit(chrom))
            variants.printSchema()

        variants.write.format("org.elasticsearch.spark.sql") \
            .options(**es_conf) \
            .save(index_name + "/" + configuration["version"], mode='append')

    # Counting step to check whether the number of variants in Spark corresponds to the number of variants that
    # have been uploaded to ElasticSearch
    if ("count" in step):
        if (nchroms == ""):
            usage()
            sys.exit(2)
        count = 0
        for chrom in range(1, int(nchroms) + 1):
            variants = sqlContext.read.load(destination + "/variants/chrom=" + str(chrom))
            count += variants.count()
        print("\nTotal number of variants: " + str(count) + "\n")
if __name__ == "__main__":
    # Command line options parsing
    chrom, path, nchroms, step, cores = optionParser(sys.argv[1:])
    main_conf = config.readConfig(path)

    spark_conf = SparkConf().setAppName(APP_NAME).set('spark.executor.cores', cores)
    spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()
    spark.sparkContext._jsc.hadoopConfiguration().setInt("dfs.block.size", main_conf["dfs_block_size"])
    spark.sparkContext._jsc.hadoopConfiguration().setInt("parquet.block.size", main_conf["dfs_block_size"])

    hl.init(spark.sparkContext)
    sqlContext = SQLContext(hl.spark_context())

    # Execute Main functionality
    main(sqlContext, main_conf, chrom, nchroms, step)
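For reference, a stripped-down sketch of the initialization pattern in the __main__ block above: the SparkSession is built first, and Hail is pointed at its SparkContext so Spark SQL and Hail share one context. The app name and master below are placeholders, not values from the original configuration.

import hail as hl
from pyspark import SparkConf
from pyspark.sql import SparkSession, SQLContext

# Placeholder app name / master; the real values come from the pipeline configuration.
conf = SparkConf().setAppName("vcf-loader").setMaster("local[*]")
spark = SparkSession.builder.config(conf=conf).getOrCreate()

# Hail attaches to the existing SparkContext instead of creating its own,
# so hl.spark_context() afterwards hands back the context it was initialized with.
hl.init(sc=spark.sparkContext)
sql_context = SQLContext(hl.spark_context())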
parser.add_argument("--chr_prefix", action='store_true', help="Chromosomes are of form 'chr1', NOT '1' etc.") parser.add_argument("--force_bgz", action='store_true', help="Force blog gzip import? Default true.") parser.add_argument("--call_fields", default="PGT", help="Name of genotype call field in VCF, default PGT.") parser.add_argument("--test", action='store_true', help="Filters data to just chr 22 for testing purposes.") args = parser.parse_args() ################## # Import scripts # ################## hl.init() scripts = ["helper_scripts.py"] for script in scripts: hl.spark_context().addPyFile(args.scripts_dir + script) import helper_scripts as h ##################################### # Configure logging, define outputs # ##################################### logstem = 'import_vep_annotate-' datestr, timestr, log_file = h.configure_logging(logstem=logstem) log_dir = os.path.join(args.log_dir, logstem + datestr) # Configure logger root = logging.getLogger() log_formatter = '%(asctime)s - %(levelname)s - %(message)s' logging.basicConfig(filename=log_file, format=log_formatter, level=logging.INFO)