def check_and_write_data(self, current_lo, bed_file):
    if self.gene is None:
        return self.trans[0].check_and_write_data(current_lo, bed_file)
    # just write the gene if we're a lone gene and it's passed
    elif len(self.trans) == 0 and current_lo > self.gene['hi']:
        output_row = default_bed_line[:]
        # the original assigned every value to bed_col['chr']; the per-field keys
        # below (and the final write) are the assumed intent
        output_row[bed_col['chr']] = self.gene['chr']
        output_row[bed_col['lo']] = str(self.gene['lo'])
        output_row[bed_col['hi']] = str(self.gene['hi'])
        output_row[bed_col['name']] = self.gene['name']
        output_row[bed_col['strand']] = self.gene['strand']
        if "thick_start" in self.gene:
            if self.gene['thick_start'] != dxpy.NULL:
                output_row[bed_col['thick_start']] = str(self.gene['thick_start'])
        if "thick_end" in self.gene:
            if self.gene['thick_end'] != dxpy.NULL:
                output_row[bed_col['thick_end']] = str(self.gene['thick_end'])
        if "score" in self.gene:
            if self.gene['score'] != dxpy.NULL:
                output_row[bed_col['score']] = str(self.gene['score'])
        bed_file.write("\t".join(output_row) + "\n")
        return True
    elif current_lo > self.gene['hi']:
        for t in self.trans:
            if t.check_and_write_data(current_lo, bed_file) != True:
                raise dxpy.AppError("found end of gene but not end of transcript: " + str(self.gene))
        return True
    else:
        return False
def import_BED(**args):
    if len(args) == 0:
        cmd_line_args = parser.parse_args(sys.argv[1:])
        args['filename'] = cmd_line_args.filename
        args['reference'] = cmd_line_args.reference
        args['file_id'] = cmd_line_args.file_id
        args['additional_type'] = cmd_line_args.additional_type
        args['property_key'] = cmd_line_args.property_key
        args['property_value'] = cmd_line_args.property_value
        args['tag'] = cmd_line_args.tag

    bed_filename = args['filename']
    reference = args['reference']
    file_id = args['file_id']
    additional_types = args['additional_type']
    property_keys = args['property_key']
    property_values = args['property_value']
    tags = args['tag']

    job_outputs = []

    # uncompresses file if necessary.  Returns new filename
    bed_filename_uncomp = unpack(bed_filename)

    current_file = 1

    for import_filename in split_on_track(bed_filename_uncomp):
        try:
            bed_basename = os.path.basename(bed_filename)
        except:
            bed_basename = bed_filename
        if current_file == 1:
            name = bed_basename
        else:
            name = bed_basename + "_" + str(current_file)
        current_file += 1

        # detect the type once and reuse the result
        detected = detect_type(import_filename)
        bed_type = detected["type"]
        delimiter = detected["delimiter"]

        print("Bed type is : " + bed_type, file=sys.stderr)

        if bed_type == "genes":
            print("Importing as Genes Type", file=sys.stderr)
            job_outputs.append(import_genes(import_filename, name, reference, file_id,
                                            additional_types, property_keys,
                                            property_values, tags, delimiter))
        elif bed_type == "spans" or bed_type == "bedDetail":
            print("Importing as Spans Type", file=sys.stderr)
            if bed_type == "bedDetail":
                print("input file is in 'bedDetails' format...", file=sys.stderr)
                bedDetail = True
            else:
                bedDetail = False
            job_outputs.append(import_spans(import_filename, name, reference, file_id,
                                            additional_types, property_keys,
                                            property_values, tags, bedDetail, delimiter))
        else:
            raise dxpy.AppError("Unable to determine type of BED file")

        subprocess.check_call(" ".join(["rm", import_filename]), shell=True)

    if bed_filename != bed_filename_uncomp:
        subprocess.check_call(" ".join(["rm", bed_filename_uncomp]), shell=True)

    print(json.dumps(job_outputs))
    return job_outputs
def RunPindel(kwargs, pindel_command, output_path):
    folder = output_path.split("/")[0]
    print "Making folder for output: " + str(folder)
    os.mkdir(folder)

    print "Running pindel with: "
    print '\t' + str(pindel_command)
    start_time = time.time()
    try:
        p = subprocess.check_output(pindel_command, stderr=subprocess.STDOUT, shell=True)
        print p
        tot_time = time.time() - start_time
        hours = int(tot_time / 3600)
        mins = int(float(tot_time % 3600) / 60)
        secs = tot_time % 60
        print "Pindel ran in: {hrs}h {mins}m {secs}s".format(hrs=hours, mins=mins, secs=secs)
    except subprocess.CalledProcessError, e:
        print "\n" + str(e.output)
        raise dxpy.AppError(
            "Pindel failed to run. Please check job logs for pindel output. If the error is a segmentation fault "
            "raised as pindel begins to run, check that the reference FASTA file is the same reference used to produce the mappings")
    # callers rebind output_path to this function's return value, so pass it back through
    return output_path
def ExportVCF(kwargs, output_path, ref_fn):
    ref_name_version = dxpy.describe(kwargs["reference_fasta"])["name"]
    # str.rstrip(".fa") strips any trailing '.', 'f', or 'a' characters rather than
    # the suffix, so drop the extension explicitly
    if ref_name_version.endswith(".fa"):
        ref_name_version = ref_name_version[:-len(".fa")]
    vcf_out_fn = kwargs["output_prefix"] + '.pindel.vcf'

    command_args = ["pindel2vcf"]
    command_args.append("-r {input}".format(input=ref_fn))
    command_args.append("-P {input}".format(input=output_path))
    command_args.append("-v {input}".format(input=vcf_out_fn))
    if kwargs["vcf_gatk_compatible"]:
        command_args.append("-G")

    if "export_vcf_advanced_options" in kwargs:
        command_args.append(kwargs["export_vcf_advanced_options"])
    else:
        # ref_date is computed but the original passed an empty -d value
        ref_date = str(datetime.date.today())
        command_args.append("-R {input}".format(input=ref_name_version))
        command_args.append("-d ''")

    try:
        vcf_command = " ".join(command_args)
        print "Executing: " + vcf_command
        print subprocess.check_output(vcf_command, stderr=subprocess.STDOUT, shell=True)
    except subprocess.CalledProcessError, e:
        print e
        print e.output
        raise dxpy.AppError(
            "APP ERROR: App was not able to convert pindel output to VCF. Please check pindel2vcf inputs")
def group_files_by_read(fastq_files):
    """
    Function : Groups a list of FASTQ files by the values of their Read property, which
               indicates the read number. Returns a dict mapping each observed value of
               the property (or 'none' if a file does not have a value for the property)
               to a list of the files with that value. Within each group, the files are
               sorted by their value of the Chunk property (to ensure that left and right
               reads of a given chunk are handled together).
    Args     : fastq_files - a list of dxpy.DXFile objects representing FASTQ files.
    Returns  : dict.
    """
    print("Grouping Fastq files by read number")
    read_dict = {}
    for fastq_file in fastq_files:
        props = fastq_file.get_properties()
        read_num = props["read"]
        if read_num not in ["1", "2", "none"]:
            raise dxpy.AppError("%s has invalid Read property: %s" % (fastq_file.get_id(), read_num))
        if read_num not in read_dict:
            read_dict[read_num] = []
        read_dict[read_num].append(fastq_file)
    #for read_num in read_dict:
    #    read_dict[read_num] = sorted(read_dict[read_num], key=chunk_property)
    return read_dict
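# Illustrative usage sketch for group_files_by_read (not part of the original app). A
# stub class stands in for dxpy.DXFile so the grouping contract can be exercised
# without platform access; real inputs carry a "read" property of "1", "2", or "none".
def _demo_group_files_by_read():
    class _StubFile(object):
        def __init__(self, file_id, read):
            self._id, self._read = file_id, read
        def get_id(self):
            return self._id
        def get_properties(self):
            return {"read": self._read}
    groups = group_files_by_read([_StubFile("file-A", "1"),
                                  _StubFile("file-B", "2"),
                                  _StubFile("file-C", "1")])
    assert sorted(groups.keys()) == ["1", "2"]
    # within a group, input order is preserved
    assert [f.get_id() for f in groups["1"]] == ["file-A", "file-C"]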
def main(**job_inputs):
    # If we weren't provided a mmi index for the reference, generate it.
    if 'genome_mmi' not in job_inputs:
        mmi_input = {'genome_fastagz': job_inputs['genome_fastagz']}
        minimap_index_job = dxpy.new_dxjob(mmi_input, 'run_minimap_index')
        job_inputs['genome_mmi'] = minimap_index_job.get_output_ref('genome_mmi')

    output = {'genome_mmi': job_inputs['genome_mmi']}

    # check if we're dealing with pacbio or ONT reads and what the filetype is
    datatype = job_inputs['datatype']
    one_reads_file = dxpy.DXFile(job_inputs['reads'][0]).describe()['name']
    try:
        file_ext = re.search("(fastq|fasta|fa|fq){1}(.gz)?$", one_reads_file,
                             flags=re.I).group(1).lower()
    except AttributeError:
        raise dxpy.AppError("Invalid filetype extension supplied.")

    # for fasta and fastq inputs, run jobs using native minimap2
    jobs = run_minimap2_subjobs(job_inputs)

    output['bam_files'] = [j.get_output_ref('mapped_reads') for j in jobs]
    output['bai_files'] = [j.get_output_ref('mapped_reads_index') for j in jobs]

    return output
def main(tumor_bams=None, normal_bams=None, cn_reference=None, baits=None, fasta=None,
         annotation=None, method='hybrid', is_male_normal=True, drop_low_coverage=False,
         antitarget_avg_size=150000, target_avg_size=267, do_parallel=True):
    if not tumor_bams and not normal_bams:
        raise dxpy.AppError("Must provide tumor_bams or normal_bams (or both)")
    if cn_reference and any((normal_bams, baits, fasta, annotation)):
        raise dxpy.AppError("Reference profile (cn_reference) cannot be used "
                            "alongside normal_bams, baits, fasta, or annotation")
    if tumor_bams and not any((baits, cn_reference)):
        raise dxpy.AppError("Need cn_reference or baits to process tumor_bams")

    print("Downloading file inputs to the local file system")
    cn_reference = download_link(cn_reference)
    baits = download_link(baits)
    fasta = download_link(fasta)
    annotation = download_link(annotation)
    if tumor_bams is not None:
        # wrap map() in list() so the result can be reused
        # (map is a one-shot iterator on Python 3)
        tumor_bams = list(map(download_link, tumor_bams))
    if normal_bams is not None:
        normal_bams = list(map(download_link, normal_bams))

    # If these input files are gzipped, decompress them
    fasta = maybe_gunzip(fasta, "ref", "fa")
    annotation = maybe_gunzip(annotation, "annot", "txt")

    out_fnames = run_cnvkit(tumor_bams, normal_bams, cn_reference, baits, fasta,
                            annotation, method, is_male_normal, drop_low_coverage,
                            antitarget_avg_size, target_avg_size, do_parallel)

    print("Uploading local file outputs to the DNAnexus platform")
    output = {}
    for filekey in ("cn_reference", "seg", "metrics", "genders", "scatter_pdf",
                    "diagram_pdf"):
        if filekey in out_fnames:
            output[filekey] = dxpy.dxlink(dxpy.upload_local_file(out_fnames[filekey]))
    for listkey in ("copy_ratios", "copy_segments", "gainloss", "breaks"):
        if listkey in out_fnames:
            output[listkey] = [dxpy.dxlink(dxpy.upload_local_file(fname))
                               for fname in out_fnames[listkey]]
    return output
def ValidateBamConfig(bam_config_fn, bam_name_array):
    print "\nValidating bam config file"
    with open(bam_config_fn) as config_fh:
        for line in config_fh:
            name = line.split()[0]
            if name not in bam_name_array:
                raise dxpy.AppError(
                    "Bam config file contains filenames which do not match input bam files")
    print "\tBam config file is valid"
    return True
def main(**job_inputs):
    # If we weren't provided a mmi index for the reference, generate it.
    if 'genome_mmi' not in job_inputs:
        mmi_input = {'genome_fastagz': job_inputs['genome_fastagz']}
        minimap_index_job = dxpy.new_dxjob(mmi_input, 'run_minimap_index')
        job_inputs['genome_mmi'] = minimap_index_job.get_output_ref('genome_mmi')

    output = {'genome_mmi': job_inputs['genome_mmi']}

    # check if we're dealing with pacbio or ONT reads and what the filetype is
    datatype = job_inputs['datatype']
    one_reads_file = dxpy.DXFile(job_inputs['reads'][0]).describe()['name']
    try:
        file_ext = re.search("(bam|fastq|fasta|fa|fq){1}(.gz)?$", one_reads_file,
                             flags=re.I).group(1).lower()
    except AttributeError:
        raise dxpy.AppError("Unknown filetype extension supplied.")

    if file_ext == 'bam':
        # input bam files must be pacbio raw reads
        if datatype == 'ONT':
            raise dxpy.AppError("Invalid file input for provided datatype.")
        # for bam input, run jobs using pbmm2
        jobs = run_pbmm2_subjobs(job_inputs)
    else:
        # for fasta and fastq inputs, run jobs using native minimap2
        if job_inputs['pbbamify']:
            print('WARNING: The "Run pbbamify" option is only valid for BAM input')
        jobs = run_minimap2_subjobs(job_inputs)

    output['bam_files'] = [j.get_output_ref('mapped_reads') for j in jobs]
    output['bai_files'] = [j.get_output_ref('mapped_reads_index') for j in jobs]

    return output
def check_reads(reads_tables):
    # validate that tables contain data that can be used together
    # (all paired or all unpaired, etc)
    if len(reads_tables) == 0:
        raise dxpy.AppError("Please enter at least one Reads table as input")

    single = 0
    paired = 0
    for table in reads_tables:
        if 'sequence2' in dxpy.DXGTable(table).get_col_names():
            paired += 1
        else:
            single += 1

    if single > 0 and paired > 0:
        raise dxpy.AppError("Found both single and paired-end reads. Please only input one type.")

    return
def main(**kwargs):
    if len(kwargs) == 0:
        kwargs = vars(arg_parser.parse_args(sys.argv[1:]))

    try:
        spans = dxpy.DXGTable(kwargs['Spans'])
    except:
        raise dxpy.AppError("Failed to open Spans object for export")

    spans_types = spans.describe()['types']
    if 'Genes' in spans_types:
        export_genes(spans, kwargs['output'])
    else:
        export_generic_bed(spans, kwargs['output'])
def validate_per_tumor(values, n_expected, title, criterion=None):
    """Ensure a per-tumor input array matches the number of tumor BAMs, etc.

    Also allow a value of None to skip checks & downstream processing, or a
    single value to apply to every tumor sample.

    Returns: list of values of length `n_expected`.
    """
    if values is None:
        out_vals = [None] * n_expected
    else:
        if criterion is not None and not all(map(criterion, values)):
            raise dxpy.AppError("Tumor {} must all be between 0 and 1; got: {}"
                                .format(title, values))
        if len(values) == n_expected:
            out_vals = values
        elif len(values) == 1 and n_expected > 1:
            # broadcast the single given value across all tumor samples
            # (the original `[values] * n_expected` produced a list of lists)
            out_vals = values * n_expected
        else:
            raise dxpy.AppError("Number of tumor {} specified ({}) does not match "
                                "the number of tumor BAM files given ({})"
                                .format(title, len(values), n_expected))
    return out_vals
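# Hedged usage sketch for validate_per_tumor (illustrative values): None skips checks,
# a single value is broadcast across all tumor samples, and a length mismatch or a
# value failing `criterion` raises dxpy.AppError.
def _demo_validate_per_tumor():
    assert validate_per_tumor(None, 3, "purities") == [None, None, None]
    assert validate_per_tumor([0.5], 3, "purities") == [0.5, 0.5, 0.5]
    assert validate_per_tumor([0.2, 0.9], 2, "purities",
                              criterion=lambda v: 0 <= v <= 1) == [0.2, 0.9]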
def find_delimiter(bed_file):
    with open(bed_file, "rU") as bf:
        line = bf.readline()
        # skip the optional track definition line
        if line.startswith("track"):
            line = bf.readline()
        tab_split = line.split("\t")
        if len(tab_split) >= 3:
            print("Bed file is tab delimited", file=sys.stderr)
            return "\t"
        else:
            space_split = line.split()
            if len(space_split) < 3:
                raise dxpy.AppError("File is not a valid bed file (neither space delimited nor tab delimited)")
            print("Bed file is space delimited", file=sys.stderr)
            return " "
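# Minimal sketch exercising find_delimiter. The filename is an illustrative temp file;
# assumes write access to the working directory and a Python version that still accepts
# the 'rU' open mode used above.
def _demo_find_delimiter():
    with open("_demo.bed", "w") as fh:
        fh.write("track name=demo\n")
        fh.write("chr1\t100\t200\n")
    assert find_delimiter("_demo.bed") == "\t"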
def convert_qual(qualString, qual_encode):
    convQualString = ''
    if qual_encode == 'phred64':
        # convert to phred33 by subtracting the difference in ASCII offsets (64 - 33 = 31)
        # should be scaling values here? Lose some top end values by doing this
        for i in range(len(qualString)):
            convQualString += chr(ord(qualString[i]) - 31)
    elif qual_encode == 'qual_file':
        convQualString = ''.join(chr(int(i) + 33) for i in qualString.strip(' ').split(' '))
    elif qual_encode == 'phred33':
        convQualString = qualString
    else:
        raise dxpy.AppError("Unknown quality encoding. Supported encodings are Phred33 and Phred64.")
    return convQualString
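# Self-check sketch for convert_qual (illustrative): Phred+64 'h' (ASCII 104) maps to
# Phred+33 'I' (ASCII 73), phred33 input passes through unchanged, and a space-separated
# qual-file value of 40 also encodes as 'I'.
def _demo_convert_qual():
    assert convert_qual('hh', 'phred64') == 'II'
    assert convert_qual('II', 'phred33') == 'II'
    assert convert_qual('40 40', 'qual_file') == 'II'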
def main(**kwargs):
    mappings_ids = kwargs["mappings_files"]
    mappings_names = sorted([dxpy.describe(id)["name"] for id in mappings_ids])
    if "num_threads_per_instance" not in kwargs:
        kwargs["num_threads_per_instance"] = multiprocessing.cpu_count()
    if "num_instances" not in kwargs:
        kwargs["num_instances"] = 1

    # Set output prefix here (str.rstrip strips characters, not a suffix,
    # so drop known extensions explicitly)
    if "output_prefix" not in kwargs:
        prefix = mappings_names[0]
        for ext in (".bam", ".txt"):
            if prefix.endswith(ext):
                prefix = prefix[:-len(ext)]
        kwargs["output_prefix"] = prefix

    # Set output suffixes (for consistency through app)
    kwargs["variant_suffixes"] = {
        "deletions": 'D',
        "short_inserts": 'SI',
        "tandem_duplications": 'TD',
        "large_inserts": 'LI',
        "inversions": 'INV',
        "breakpoints": 'BP',
        #"breakdancer_outputs": 'BD',
        "close_mapped_reads": 'CloseEndMapped'
    }

    """
    if kwargs["export_vcf"]:
        print "\nTESTING pindel2vcf command line inputs on dummy inputs"
        ExportVCF(kwargs, output_path="/usr/test_vcf/dummy", ref_fn="/usr/test_vcf/dummy.fa")
    """

    # Check if input files have .bam extension
    if mappings_names[0][-4:] == ".bam":
        for name in mappings_names:
            if name[-4:] != ".bam":
                raise dxpy.AppError("Input mappings files are not all bam files with .bam extensions")
        app_outputs = RunWithBamInput(kwargs=kwargs)
    else:
        app_outputs = RunWithPindelInput(kwargs=kwargs, sam2pindel=False)

    return app_outputs
def main(sam_file, probability):
    # The following line(s) initialize your data object inputs on the platform
    # into dxpy.DXDataObject instances that you can start using immediately.
    sam_file = dxpy.DXFile(sam_file)

    # The following line(s) download your file inputs to the local file system
    # using variable names for the filenames.
    dxpy.download_dxfile(sam_file.get_id(), "sam_file")

    if probability < 0 or probability > 1:
        raise dxpy.AppError(
            "Probability parameter determines the % of mappings included in the output. Must be between 0 and 1.")

    subprocess.check_call(" ".join(["java", "-Xmx2g", "-jar",
                                    "/usr/local/bin/DownsampleSam.jar",
                                    "INPUT=sam_file", "OUTPUT=downsampled_sam",
                                    "PROBABILITY=" + str(probability)]),
                          shell=True)

    # The following line(s) use the Python bindings to upload your file outputs
    # after you have created them on the local file system. It assumes that you
    # have used the output field name for the filename for each output, but you
    # can change that behavior to suit your needs.
    downsampled_sam = dxpy.upload_local_file("downsampled_sam")
    downsampled_sam.rename(sam_file.describe()['name'] + "_downsampled")

    # The following line fills in some basic dummy output and assumes
    # that you have created variables to represent your output with
    # the same name as your output fields.
    output = {}
    output["downsampled_sam"] = dxpy.dxlink(downsampled_sam)

    return output
def iterate_reads(fastqa1_filename, fastqa2_filename, qual1_filename, qual2_filename,
                  is_fasta, is_colorspace, qual_encoding):
    fastqa1_iter = unpack_and_open(fastqa1_filename).__iter__()
    fastqa2_iter, qual1_iter, qual2_iter = None, None, None
    if fastqa2_filename != None:
        fastqa2_iter = unpack_and_open(fastqa2_filename).__iter__()
    if qual1_filename != None:
        qual1_iter = unpack_and_open(qual1_filename).__iter__()
    if qual2_filename != None:
        qual2_iter = unpack_and_open(qual2_filename).__iter__()

    read_iter = get_read(fastqa1_iter, qual1_iter, is_fasta, is_colorspace, qual_encoding).__iter__()
    # only set up the second read iterator when a second reads file was given
    # (the original tested fastqa1_filename here)
    if fastqa2_filename != None:
        read_iter2 = get_read(fastqa2_iter, qual2_iter, is_fasta, is_colorspace, qual_encoding).__iter__()

    try:
        while True:
            name1, seq1, qual1 = read_iter.next()
            name2, seq2, qual2 = None, None, None
            if fastqa2_filename != None:
                name2, seq2, qual2 = read_iter2.next()
            yield name1, seq1, qual1, name2, seq2, qual2
    except StopIteration:
        # check to make sure all files we're reading from are finished at the same time
        for file_iter in fastqa1_iter, fastqa2_iter, qual1_iter, qual2_iter:
            if file_iter != None:
                try:
                    line = file_iter.next().rstrip("\n")
                    raise dxpy.AppError("Number of reads in each file must be equal")
                except StopIteration:
                    pass
def RunWithPindelInput(kwargs, sam2pindel=False):
    pindel_config_fn = "pindel_config.txt"
    mappings_names = DownloadFilesFromArray(input_ids=kwargs["mappings_files"])
    if sam2pindel:
        print "\nInput was not produced by BWA/MOSAIK, running sam2pindel script on input BAM files"
        if "sequence_platform" not in kwargs:
            raise dxpy.AppError(
                "If BAM files were not produced by BWA, must ALSO specify which sequence platform was used to produce the mappings")
        pindel_config_fn = RunSam2Pindel(bam_names=mappings_names,
                                         insert_size=kwargs["insert_size"],
                                         seq_platform=kwargs["sequence_platform"],
                                         num_threads=kwargs["num_threads_per_instance"],
                                         config_fn=pindel_config_fn)
    else:
        print "\nInput is pindel input. Making pindel configuration file"
        pindel_config_fn = WriteConfigFile(mappings_names=mappings_names,
                                           fn=pindel_config_fn,
                                           is_pindel=True)

    chrom = kwargs["chromosome"] if "chromosome" in kwargs else "ALL"
    command, output_path = BuildPindelCommand(kwargs=kwargs,
                                              chrom=chrom,
                                              input_fn=pindel_config_fn,
                                              is_pindel_input_type=True)
    output_path = RunPindel(kwargs=kwargs, pindel_command=command, output_path=output_path)
    app_outputs = UploadPindelOutputs(kwargs=kwargs, output_path=output_path)
    if kwargs["export_vcf"]:
        app_outputs["vcf"] = ExportVCF(kwargs=kwargs, output_path=output_path,
                                       ref_fn="reference_fasta")
    return app_outputs
def map_contaminant(Contig, Reads):
    # get ID of our mapper
    try:
        bwa = dxpy.DXApp(dxpy.find_apps(name="bwa_mem_fastq_read_mapper").next()['id'])
    except StopIteration:
        raise dxpy.AppError(
            "Unable to find app 'bwa_mem_fastq_read_mapper'. Please install it to enable contaminant mapping")

    # TODO: find optimal chunk size so we don't launch too many bwa jobs
    map_job = bwa.run({"reads": Reads,
                       "reference": Contig,
                       "discard_unmapped_rows": True,
                       "chunk_size": 10000000})

    total_reads = 0
    for r in Reads:
        desc = dxpy.DXGTable(r).describe()
        current_reads = desc['length']
        if 'sequence2' in desc['columns']:
            current_reads *= 2
        total_reads += current_reads

    # launch a job that waits for the mapping and calculates what % has mapped
    calc_job = dxpy.new_dxjob({"num_reads": total_reads,
                               "mappings": {"job": map_job.get_id(), "field": "mappings"}},
                              "calc_contam")

    return calc_job.get_id()
def importGTF(**args):
    if len(args) == 0:
        command_line_args = parser.parse_args(sys.argv[1:])
        fileName = command_line_args.fileName
        reference = command_line_args.reference
        outputName = command_line_args.outputName
        tag = command_line_args.tag
        property_key = command_line_args.property_key
        property_value = command_line_args.property_value
        additional_type = command_line_args.additional_type
        file_id = command_line_args.file_id
    else:
        fileName = args['fileName']
        reference = args['reference']
        outputName = ''
        if args.get('outputName') != None:
            outputName = args['outputName']
        tag = []
        if args.get('tag'):
            tag = args['tag']
        # default the remaining optional arguments so they are always bound
        # (the original left them unassigned when absent)
        property_key = []
        if args.get('property_key') != None:
            property_key = args['property_key']
        property_value = []
        if args.get('property_value') != None:
            property_value = args['property_value']
        additional_type = []
        if args.get('additional_type') != None:
            additional_type = args['additional_type']
        file_id = None
        if args.get('file_id') != None:
            file_id = args['file_id']

    inputFileName = unpack(fileName)

    # start_codon/stop_codon rows will not be written to the gtable as their
    # information is fully encompassed by the rest of the data
    capturedTypes = {"5UTR": "5' UTR", "3UTR": "3' UTR", "CDS": "CDS",
                     "inter": "intergenic", "inter_CNS": "intergenic_conserved",
                     "intron_CNS": "intron_conserved", "exon": "exon",
                     "transcript": "transcript", "gene": "gene",
                     "stop_codon": "stop_codon", "start_codon": "start_codon"}

    ##Isolate the attribute tags from the file and check integrity
    spansTable, additionalColumns = constructTable(inputFileName)
    spansTable.add_tags(tag)

    types = ["Genes", "gri"]
    for x in additional_type:
        types.append(x)
    spansTable.add_types(types)

    details = {'original_contigset': dxpy.dxlink(reference)}
    if len(property_key) != len(property_value):
        raise dxpy.AppError("Expected each provided property to have a corresponding value")
    for i in range(len(property_key)):
        details[property_key[i]] = property_value[i]
    if file_id != None:
        details['original_file'] = dxpy.dxlink(file_id)
    spansTable.set_details(details)

    if outputName == '':
        spansTable.rename(fileName)
    else:
        spansTable.rename(outputName)

    #This pass through the file calculates the gene and transcript models
    genes = {}
    transcripts = {}
    spanId = 0
    frames = {}
    stopCodons = {}

    inputFile = open(inputFileName, 'r')
    for line in inputFile:
        if line[0] != "#":
            values = parseLine(line, capturedTypes)

            if values["type"] == "CDS":
                if frames.get(values["transcriptId"]) == None:
                    frames[values["transcriptId"]] = {}
                frames[values["transcriptId"]][values["lo"]] = values["frame"]

            for [element, hashId, elementType] in [[genes, values["geneId"], "geneName"],
                                                   [transcripts, values["transcriptId"], "transcriptName"]]:
                if element.get(hashId) == None:
                    element[hashId] = {values["chromosome"]: {
                        "lo": values["lo"], "hi": values["hi"],
                        "codingLo": -1, "codingHi": -1,
                        "strand": values["strand"], "score": values["score"],
                        "geneId": values["geneId"], "coding": False,
                        "spanId": spanId, "name": values[elementType],
                        "originalGeneId": values["attributes"]["gene_id"],
                        "originalTranscriptId": values["attributes"]["transcript_id"]}}
                    spanId += 1
                elif element[hashId].get(values["chromosome"]) == None:
                    element[hashId][values["chromosome"]] = {
                        "lo": values["lo"], "hi": values["hi"],
                        "codingLo": -1, "codingHi": -1,
                        "strand": values["strand"], "score": values["score"],
                        "geneId": values["geneId"], "coding": False,
                        "spanId": spanId, "name": values[elementType],
                        "originalGeneId": values["attributes"]["gene_id"],
                        "originalTranscriptId": values["attributes"]["transcript_id"]}
                    spanId += 1
                else:
                    if values["lo"] < element[hashId][values["chromosome"]]["lo"]:
                        element[hashId][values["chromosome"]]["lo"] = values["lo"]
                    if values["hi"] > element[hashId][values["chromosome"]]["hi"]:
                        element[hashId][values["chromosome"]]["hi"] = values["hi"]

            if values["type"] == "stop_codon":
                if stopCodons.get(values["transcriptId"]) == None:
                    stopCodons[values["transcriptId"]] = [[values["lo"], values["hi"]]]
                else:
                    stopCodons[values["transcriptId"]].append([values["lo"], values["hi"]])

            if values["type"] == "CDS" or values["type"] == "start_codon" or values["type"] == "stop_codon":
                transcriptEntry = transcripts[values["transcriptId"]][values["chromosome"]]
                if values["hi"] > transcriptEntry["codingHi"]:
                    transcriptEntry["codingHi"] = values["hi"]
                if values["lo"] < transcriptEntry["codingLo"] or transcriptEntry["codingLo"] == -1:
                    transcriptEntry["codingLo"] = values["lo"]
                genes[values["geneId"]][values["chromosome"]]["coding"] = True
                transcriptEntry["coding"] = True

    for gId, chrList in genes.iteritems():
        for k, v in chrList.iteritems():
            entry = [k, v["lo"], v["hi"], v["name"], v["spanId"], "gene", v["strand"],
                     v["score"], v["coding"], -1, -1, '', '', v["originalGeneId"], '']
            for x in additionalColumns:
                if x != "gene_id" and x != "transcript_id":
                    entry.append('')
            spansTable.add_rows([entry])

    for gId, chrList in transcripts.iteritems():
        for k, v in chrList.iteritems():
            entry = [k, v["lo"], v["hi"], v["name"], v["spanId"], "transcript", v["strand"],
                     v["score"], genes[v["geneId"]][k]["coding"], genes[v["geneId"]][k]["spanId"],
                     -1, '', '', v["originalGeneId"], v["originalTranscriptId"]]
            for x in additionalColumns:
                if x != "gene_id" and x != "transcript_id":
                    entry.append('')
            spansTable.add_rows([entry])

    exons = {}
    inputFile = open(inputFileName, 'r')
    for line in inputFile:
        if line[0] != "#":
            values = parseLine(line, capturedTypes)
            if exons.get(values["transcriptId"]) != None:
                if exons[values["transcriptId"]].get(values["chromosome"]) == None:
                    exons[values["transcriptId"]][values["chromosome"]] = []
            else:
                exons[values["transcriptId"]] = {values["chromosome"]: []}

            if capturedTypes.get(values["type"]) != None:
                #If type is 5'UTR, 3'UTR, intergenic, or conserved intron, type is always noncoding
                if values["type"] in ["5UTR", "3UTR", "inter", "inter_CNS", "intron_CNS"]:
                    # the original indexed values["name"] (a key parseLine never sets),
                    # omitted the chromosome level of the transcripts dict, and did not
                    # rebind spanId from writeEntry's return; all three are assumed slips
                    spanId = writeEntry(spansTable, spanId, exons[values["transcriptId"]],
                                        additionalColumns, values["chromosome"], values["lo"],
                                        values["hi"], values["attributes"],
                                        [values["chromosome"], values["lo"], values["hi"],
                                         values["transcriptName"], spanId,
                                         capturedTypes[values["type"]], values["strand"],
                                         values["score"], False,
                                         transcripts[values["transcriptId"]][values["chromosome"]]["spanId"],
                                         values["frame"], '', values["source"]])

                if "exon_number" in values["attributes"]:
                    values["transcriptName"] += "." + values["attributes"]["exon_number"]

                #If type is CDS, always of type coding
                if values["type"] == "CDS":
                    if stopCodons.get(values["transcriptId"]) != None:
                        for x in stopCodons[values["transcriptId"]]:
                            if values["hi"] == x[0]:
                                values["hi"] = x[1]
                                break
                    if [values["lo"], values["hi"]] not in exons[values["transcriptId"]][values["chromosome"]]:
                        spanId = writeEntry(spansTable, spanId, exons[values["transcriptId"]],
                                            additionalColumns, values["chromosome"], values["lo"],
                                            values["hi"], values["attributes"],
                                            [values["chromosome"], values["lo"], values["hi"],
                                             values["transcriptName"], spanId,
                                             capturedTypes[values["type"]], values["strand"],
                                             values["score"], True,
                                             transcripts[values["transcriptId"]][values["chromosome"]]["spanId"],
                                             values["frame"], '', values["source"]])

                #If type is stop_codon, convert it to an exon before the coding calculation
                if values["type"] == "stop_codon":
                    values["type"] = "exon"
                    values["frame"] = 3 - (values["hi"] - values["lo"])
                    #if values["strand"] == "-":
                    #    values["lo"] = transcripts[values["transcriptId"]][values["chromosome"]]["lo"]
                    #else:
                    #    values["hi"] = transcripts[values["transcriptId"]][values["chromosome"]]["hi"]

                #If type is exon do calculation as to whether coding or non-coding
                if values["type"] == "exon":
                    if (transcripts[values["transcriptId"]][values["chromosome"]]["codingLo"] != -1 and
                            transcripts[values["transcriptId"]][values["chromosome"]]["codingHi"] != -1):
                        if frames.get(values["transcriptId"]) != None:
                            if frames[values["transcriptId"]].get(values["lo"]) != None:
                                values["frame"] = frames[values["transcriptId"]][values["lo"]]
                        for x in splitExons(transcripts[values["transcriptId"]],
                                            values["chromosome"], values["lo"],
                                            values["hi"], values["strand"]):
                            spanId = writeEntry(spansTable, spanId, exons[values["transcriptId"]],
                                                additionalColumns, values["chromosome"], x[1], x[2],
                                                values["attributes"],
                                                [values["chromosome"], x[1], x[2],
                                                 values["transcriptName"], spanId, x[0],
                                                 values["strand"], values["score"], x[3],
                                                 transcripts[values["transcriptId"]][values["chromosome"]]["spanId"],
                                                 values["frame"], '', values["source"]])
                    else:
                        spanId = writeEntry(spansTable, spanId, exons[values["transcriptId"]],
                                            additionalColumns, values["chromosome"], values["lo"],
                                            values["hi"], values["attributes"],
                                            [values["chromosome"], values["lo"], values["hi"],
                                             values["transcriptName"], spanId,
                                             capturedTypes[values["type"]], values["strand"],
                                             values["score"], False,
                                             transcripts[values["transcriptId"]][values["chromosome"]]["spanId"],
                                             values["frame"], '', values["source"]])

    spansTable.flush()
    spansTable.close()
    outputFile = open("result.txt", 'w')
    outputFile.write(spansTable.get_id())
    outputFile.close()
    print(spansTable.get_id())
    return spansTable.get_id()
def constructTable(inputFileName):
    inputFile = open(inputFileName, 'r')
    attributes = {"gene_id": True, "transcript_id": True}
    for line in inputFile:
        if line[0] != "#":
            tabSplit = line.split("\t")
            if len(tabSplit) == 1:
                tabSplit = line.split(" ")
                if len(tabSplit) < 9:
                    raise dxpy.AppError("One row did not have 9 entries, it had " +
                                        str(len(tabSplit)) + " instead. Offending line: " + line)
                tabSplit[8] = " ".join(tabSplit[8:])
                tabSplit = tabSplit[:9]
            if len(tabSplit) != 9:
                raise dxpy.AppError("One row did not have 9 entries, it had " +
                                    str(len(tabSplit)) + " instead. Offending line: " + line)
            else:
                entrySplit = tabSplit[8].split(";")
                geneIdPresent = False
                transcriptIdPresent = False
                for x in entrySplit:
                    keyValue = x.strip().split(" ")
                    key = keyValue[0]
                    if key == "gene_id":
                        geneIdPresent = True
                    elif key == "transcript_id":
                        transcriptIdPresent = True
                    attributes[key] = True
                if not geneIdPresent:
                    raise dxpy.AppError("One row did not have a gene_id. Offending line: " + line)
                if not transcriptIdPresent:
                    raise dxpy.AppError("One row did not have a transcript_id. Offending line: " + line)

    #Construct table
    schema = [{"name": "chr", "type": "string"},
              {"name": "lo", "type": "uint32"},
              {"name": "hi", "type": "uint32"},
              {"name": "name", "type": "string"},
              {"name": "span_id", "type": "int32"},
              {"name": "type", "type": "string"},
              {"name": "strand", "type": "string"},
              {"name": "score", "type": "float"},
              {"name": "is_coding", "type": "boolean"},
              {"name": "parent_id", "type": "int32"},
              {"name": "frame", "type": "int16"},
              {"name": "description", "type": "string"},
              {"name": "source", "type": "string"},
              {"name": "gene_id", "type": "string"},
              {"name": "transcript_id", "type": "string"}]

    additionalColumns = ['gene_id', 'transcript_id']
    for k, v in attributes.iteritems():
        if k != '' and k != 'gene_id' and k != 'transcript_id' and len(k) < 100:
            schema.append({"name": k, "type": "string"})
            additionalColumns.append(k)

    indices = [dxpy.DXGTable.genomic_range_index("chr", "lo", "hi", 'gri'),
               dxpy.DXGTable.lexicographic_index([
                   dxpy.DXGTable.lexicographic_index_column("name", True, False),
                   dxpy.DXGTable.lexicographic_index_column("chr"),
                   dxpy.DXGTable.lexicographic_index_column("lo"),
                   dxpy.DXGTable.lexicographic_index_column("hi"),
                   dxpy.DXGTable.lexicographic_index_column("type")], "search")]

    spansTable = dxpy.new_dxgtable(columns=schema, indices=indices)
    return spansTable, additionalColumns
def parseLine(line, capturedTypes):
    tabSplit = line.split("\t")
    if len(tabSplit) == 1:
        tabSplit = line.split(" ")
        if len(tabSplit) < 9:
            raise dxpy.AppError("One row did not have 9 entries, it had " +
                                str(len(tabSplit)) + " instead. Offending line: " + line)
        tabSplit[8] = " ".join(tabSplit[8:])
        tabSplit = tabSplit[:9]

    chromosome = tabSplit[0]
    source = tabSplit[1]
    typ = tabSplit[2]
    if capturedTypes.get(typ) == None:
        message = 'Permitted types: ' + ", ".join(capturedTypes.keys())
        raise dxpy.AppError("One row had a type which is not in the list of permitted types. " +
                            message + "\nOffending line: " + line + "\nOffending type: " + typ)

    try:
        score = float(tabSplit[5])
    except ValueError:
        if tabSplit[5] == "." or tabSplit[5] == '':
            score = dxpy.NULL
        else:
            raise dxpy.AppError("The score for one line could not be translated into a number and was not \".\"" +
                                "\nOffending line: " + line + "\nOffending value: " + tabSplit[5])

    if tabSplit[6] != "+" and tabSplit[6] != "-" and tabSplit[6] != ".":
        raise dxpy.AppError("The strand indicated for an element was not \"+\", \"-\", or \".\"" +
                            "\nOffending line: " + line + "\nOffending value: " + tabSplit[6])
    else:
        strand = tabSplit[6]

    try:
        lo = int(tabSplit[3]) - 1
    except ValueError:
        raise dxpy.AppError("One of the start values could not be translated to an integer. " +
                            "\nOffending line: " + line + "\nOffending value: " + tabSplit[3])

    try:
        hi = int(tabSplit[4])
    except ValueError:
        raise dxpy.AppError("One of the end values could not be translated to an integer. " +
                            "\nOffending line: " + line + "\nOffending value: " + tabSplit[4])

    try:
        frame = int(tabSplit[7])
        if frame > 2 or frame < 0:
            raise dxpy.AppError("The frame indicated for an element was not \".\", \"0\", \"1\", or \"2\"" +
                                "\nOffending line: " + line + "\nOffending value: " + tabSplit[7])
    except ValueError:
        if tabSplit[7] == ".":
            frame = -1
        else:
            raise dxpy.AppError("The frame indicated for an element was not \".\", \"0\", \"1\", or \"2\"" +
                                "\nOffending line: " + line + "\nOffending value: " + tabSplit[7])

    lineAttributes = {}
    ##Extract the attributes from the file
    entrySplit = tabSplit[8].split(";")
    for x in entrySplit:
        keyValue = x.strip().split(" ")
        key = keyValue[0]
        if key != '':
            if len(key) < 100:
                lineAttributes[key.strip('"')] = keyValue[1].strip('"')

    geneId = lineAttributes["gene_id"]
    transcriptId = lineAttributes["transcript_id"]

    geneName = geneId
    if "gene_name" in lineAttributes:
        geneName = lineAttributes["gene_name"]
    transcriptName = transcriptId
    if "transcript_name" in lineAttributes:
        transcriptName = lineAttributes["transcript_name"]

    values = {"chromosome": chromosome,
              "lo": lo,
              "hi": hi,
              "geneName": geneName,
              "transcriptName": transcriptName,
              "source": source,
              "type": typ,
              "strand": strand,
              "score": score,
              "frame": frame,
              "geneId": geneId,
              "transcriptId": transcriptId,
              "attributes": lineAttributes}
    return values
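# Illustrative sketch of the GTF parseLine defined above (a minimal capturedTypes
# mapping stands in for the one built by importGTF): the 1-based start is converted
# to a 0-based lo, and attribute values are unquoted.
def _demo_parseLine_gtf():
    line = 'chr1\tdemo\texon\t11\t20\t0.5\t+\t0\tgene_id "g1"; transcript_id "t1";'
    values = parseLine(line, {"exon": "exon"})
    assert (values["lo"], values["hi"]) == (10, 20)
    assert values["geneId"] == "g1" and values["transcriptId"] == "t1"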
def unpack(input):
    m = magic.Magic()

    # determine compression format
    try:
        file_type = m.from_file(input)
    except Exception as e:
        raise dxpy.AppError("Error while identifying compression format: " + str(e))

    # if we find a tar file throw a program error telling the user to unpack it
    if file_type == 'application/x-tar':
        raise dxpy.AppError("App does not support tar files. Please unpack.")

    # since we haven't returned, the file is compressed.  Determine what program to use to uncompress
    uncomp_util = None
    if file_type == 'XZ compressed data':
        uncomp_util = 'xzcat'
    elif file_type[:21] == 'bzip2 compressed data':
        uncomp_util = 'bzcat'
    elif file_type[:20] == 'gzip compressed data':
        uncomp_util = 'zcat'
    elif file_type == 'POSIX tar archive (GNU)' or 'tar' in file_type:
        raise dxpy.AppError("Found a tar archive. Please untar your sequences before importing")
    else:
        # just return input filename since it's already uncompressed
        return input

    if uncomp_util != None:
        # bzcat does not support -t.  Use non-streaming decompressors for testing input
        test_util = None
        if uncomp_util == 'xzcat':
            test_util = 'xz'
        elif uncomp_util == 'bzcat':
            test_util = 'bzip2'
        elif uncomp_util == 'zcat':
            test_util = 'gzip'

        try:
            subprocess.check_call(" ".join([test_util, "-t", input]), shell=True)
        except subprocess.CalledProcessError:
            raise dxpy.AppError("File failed integrity check by " + test_util +
                                ". Compressed file is corrupted.")

    # with that in hand, unzip file.  If we find a tar archive then exit with error.
    try:
        with subprocess.Popen([uncomp_util, input], stdout=subprocess.PIPE).stdout as pipe:
            line = pipe.next()
            uncomp_type = m.from_buffer(line)
    except Exception as e:
        raise dxpy.AppError("Error detecting file format after decompression: " + str(e))

    if uncomp_type == 'POSIX tar archive (GNU)' or 'tar' in uncomp_type:
        raise dxpy.AppError("Found a tar archive after decompression. Please untar your files before importing")
    elif 'ASCII text' not in uncomp_type:
        raise dxpy.AppError("After decompression found file type other than plain text")

    try:
        out_name = id_generator()
        subprocess.check_call(" ".join([uncomp_util, "--stdout", input, ">", out_name]), shell=True)
        return out_name
    except subprocess.CalledProcessError as e:
        raise dxpy.AppError("Unable to open compressed input for reading: " + str(e))
def importGFF(**args):
    if len(args) == 0:
        args = parser.parse_args(sys.argv[1:])
        fileName = args.fileName
        reference = args.reference
        outputName = args.outputName
        file_id = args.file_id
        property_key = args.property_key
        property_value = args.property_value
        tag = args.tag
        additional_type = args.additional_type
    else:
        fileName = args['fileName']
        reference = args['reference']
        outputName = ''
        if args.get('outputName') != None:
            outputName = args['outputName']
        tag = []
        if args.get('tag'):
            tag = args['tag']
        # default the remaining optional arguments so they are always bound
        # (the original left them unassigned when absent)
        property_key = []
        if args.get('property_key') != None:
            property_key = args['property_key']
        property_value = []
        if args.get('property_value') != None:
            property_value = args['property_value']
        additional_type = []
        if args.get('additional_type') != None:
            additional_type = args['additional_type']
        file_id = None
        if args.get('file_id') != None:
            file_id = args['file_id']

    inputFileName = unpack(fileName)

    #Rows of this type will not be written to the gtable as their information is fully encompassed by the rest of the data
    discardedTypes = {"start_codon": True, "stop_codon": True}

    ##Isolate the attribute tags from the file and check integrity
    spansTable, additionalColumns = constructTable(inputFileName)

    details = {'original_contigset': dxpy.dxlink(reference)}
    if file_id != None:
        details['original_file'] = dxpy.dxlink(file_id)
    if len(property_key) != len(property_value):
        raise dxpy.AppError("Expected each provided property to have a corresponding value.")
    for i in range(len(property_key)):
        details[property_key[i]] = property_value[i]
    spansTable.set_details(details)
    spansTable.add_tags(tag)

    if outputName == '':
        spansTable.rename(fileName)
    else:
        spansTable.rename(outputName)

    hasGenes = False

    #This pass through the file calculates the gene and transcript models
    genes = {}
    transcripts = {}
    spanId = 0

    sequenceOntology = {}
    for x in ["five_prime_UTR", "5' UTR", "five prime UTR", "five_prime_untranslated_region",
              "five_prime_coding_exon_noncoding_region", "five_prime_exon_noncoding_region",
              "five prime coding exon noncoding region"]:
        sequenceOntology[x] = "5' UTR"
    for x in ["three_prime_UTR", "3' UTR", "three prime UTR", "three_prime_untranslated_region",
              "three_prime_coding_exon_noncoding_region", "three_prime_exon_noncoding_region",
              "three prime coding exon noncoding region"]:
        sequenceOntology[x] = "3' UTR"
    for x in ["mRNA", "rRNA", "tRNA", "snRNA", "snoRNA", "miRNA", "ncRNA", "transcript",
              "mature_transcript", "rRNA_large_subunit_primary_transcript",
              "35S rRNA primary transcript", "rRNA large subunit primary transcript",
              "rRNA_primary_transcript", "enzymatic_RNA", "nc_primary_transcript", "scRNA",
              "protein_coding_primary_transcript", "antisense_RNA",
              "antisense_primary_transcript", "primary_transcript",
              "ribosomal_subunit_rRNA", "small subunit rRNA", "SSU RNA", "SSU rRNA",
              "large_subunit_rRNA", "LSU RNA", "LSU rRNA"]:
        sequenceOntology[x] = "transcript"
    for x in ["exon", "interior_coding_exon", "interior coding exon", "coding_exon",
              "coding exon", "five_prime_coding_exon_region", "five prime exon coding region",
              "three_prime_coding_exon_region", "three prime coding exon region",
              "five_prime_coding_exon", "three_prime_coding_exon", "non_coding_exon",
              "non coding exon"]:
        sequenceOntology[x] = "exon"

    isCoding = {}
    for x in ["CDS", "interior_coding_exon", "interior coding exon", "coding_exon",
              "five_prime_coding_exon_region", "five prime exon coding region",
              "three_prime_coding_exon_region", "three prime coding exon region",
              "five_prime_coding_exon", "three_prime_coding_exon"]:
        isCoding[x] = True

    codingRegions = {}
    spans = {}

    inputFile = open(inputFileName, 'r')
    for line in inputFile:
        if line[0] != "#":
            values = parseLine(line.split("#")[0])
            if values["attributes"].get("Parent") != None:
                for parent in values["attributes"]["Parent"].split(","):
                    if codingRegions.get(parent) == None:
                        codingRegions[parent] = {values["chromosome"]: {"codingLo": -1, "codingHi": -1}}
                    if isCoding.get(values["type"]) != None:
                        region = codingRegions[parent][values["chromosome"]]
                        if values["lo"] < region["codingLo"] or region["codingLo"] == -1:
                            region["codingLo"] = values["lo"]
                        # the original tested codingLo here; codingHi is the assumed intent
                        if values["hi"] > region["codingHi"] or region["codingHi"] == -1:
                            region["codingHi"] = values["hi"]
            if values["attributes"].get("ID") != None:
                spans[values["attributes"]["ID"]] = spanId
                spanId += 1

    inputFile = open(inputFileName, 'r')
    overflowSpans = spanId
    spanId = 0

    for line in inputFile:
        if line[0] != "#":
            values = parseLine(line)
            entryIsCoding = False
            if isCoding.get(values["type"]) != None:
                entryIsCoding = True
            if values["attributes"].get("Name") != None:
                name = values["attributes"]["Name"]
            elif values["attributes"].get("name") != None:
                name = values["attributes"]["name"]
            elif values["attributes"].get("NAME") != None:
                name = values["attributes"]["NAME"]
            elif values["attributes"].get("ID") != None:
                name = values["attributes"]["ID"]
            else:
                name = ''
            if sequenceOntology.get(values["type"]) != None:
                values["type"] = sequenceOntology[values["type"]]
                hasGenes = True
            description = ''
            if values["attributes"].get("description") != None:
                description = values["attributes"]["description"]
            if values["attributes"].get("Description") != None:
                description = values["attributes"]["Description"]
            if values["type"] not in discardedTypes:
                if values["attributes"].get("Parent") != None:
                    parentSplit = values["attributes"]["Parent"].split(",")
                else:
                    parentSplit = ["-1"]
                for parent in parentSplit:
                    currentSpan = spanId
                    parentId = -1
                    if spans.get(parent) != None:
                        parentId = spans[parent]
                    if parentSplit.index(parent) > 0:
                        currentSpan = overflowSpans
                        overflowSpans += 1
                    # mark the entry coding if it falls inside the coding region of its
                    # own span or its parent (the original looked up the literal key
                    # "chromosome", which never matches; the record's chromosome is the
                    # assumed intent)
                    for x in ["ID", "Parent"]:
                        if not entryIsCoding and values["attributes"].get(x) != None:
                            region = codingRegions.get(values["attributes"][x], {}).get(values["chromosome"])
                            if region != None and region["codingHi"] > -1:
                                if region["codingLo"] <= values["lo"] <= region["codingHi"]:
                                    entryIsCoding = True
                                if region["codingLo"] <= values["hi"] <= region["codingHi"]:
                                    entryIsCoding = True
                    entry = [values["chromosome"], values["lo"], values["hi"], name,
                             currentSpan, values["type"], values["strand"], values["score"],
                             entryIsCoding, parentId, values["frame"], description,
                             values["source"]]
                    for x in additionalColumns:
                        if values["attributes"].get(x) != None:
                            entry.append(values["attributes"][x])
                        else:
                            entry.append('')
                    spansTable.add_rows([entry])
                spanId += 1

    if hasGenes:
        types = ["Genes", "gri"]
    else:
        types = ["Spans", "gri"]
    for x in additional_type:
        types.append(x)
    spansTable.add_types(types)

    spansTable.flush()
    spansTable.close()
    print(spansTable.get_id())
    job_outputs = dxpy.dxlink(spansTable.get_id())
    return job_outputs
def import_spans(bed_file, table_name, ref_id, file_id, additional_types, property_keys,
                 property_values, tags, isBedDetail, delimiter="\t"):
    num_cols = find_num_columns(bed_file, delimiter)

    # if this is a bedDetail file we should treat the last two columns separately
    if isBedDetail:
        num_cols -= 2

    possible_columns = [("chr", "string"), ("lo", "int32"), ("hi", "int32"),
                        ("name", "string"), ("score", "float"), ("strand", "string"),
                        ("thick_start", "int32"), ("thick_end", "int32"),
                        ("item_rgb", "string")]
    bedDetail_columns = [("bedDetail_ID", "string"), ("bedDetail_desc", "string")]
    possible_default_row = ["", 0, 0, "", 0, ".", 0, 0, ""]

    columns = possible_columns[:num_cols]
    if isBedDetail:
        columns.extend(bedDetail_columns)
    if num_cols > len(columns):
        for i in range(len(columns), num_cols):
            columns.append(("BED_column_" + str(i + 1), "string"))
            possible_default_row.append("")

    default_row = possible_default_row[:num_cols]
    if isBedDetail:
        default_row.extend(["", ""])

    column_descs = [dxpy.DXGTable.make_column_desc(name, type) for name, type in columns]

    indices = [dxpy.DXGTable.genomic_range_index("chr", "lo", "hi", 'gri')]
    for c in columns:
        if "name" in c:
            indices.append(dxpy.DXGTable.lexicographic_index([
                dxpy.DXGTable.lexicographic_index_column("name", True, False),
                dxpy.DXGTable.lexicographic_index_column("chr"),
                dxpy.DXGTable.lexicographic_index_column("lo"),
                dxpy.DXGTable.lexicographic_index_column("hi")], "search"))
            break

    with open(bed_file, 'rU') as bed, dxpy.new_dxgtable(column_descs, indices=indices, mode='w') as span:
        details = {"original_contigset": dxpy.dxlink(ref_id)}
        if file_id != None:
            details["original_file"] = dxpy.dxlink(file_id)
        if len(property_keys) != len(property_values):
            raise dxpy.AppError("Expected each provided property to have a corresponding value.")
        for i in range(len(property_keys)):
            details[property_keys[i]] = property_values[i]
        span.set_details(details)
        span.add_types(["Spans", "gri"])
        span.rename(table_name)

        for line in bed:
            row = list(default_row)

            if line.startswith("track"):
                details = span.get_details()
                details['track'] = line
                span.set_details(details)
                continue

            line = line.rstrip("\n")
            line = line.split(delimiter)

            if isBedDetail:
                # only the first 4 columns are guaranteed to be defined by UCSC
                validate_line(line[:4])
                # save last two fields separately
                bedDetailFields = line[-2:]
                line = line[:-2]
            else:
                validate_line(line[:num_cols])

            # check to see if this is a weird line
            if len(line) == 0:
                break
            if len(line) < 3:
                raise dxpy.AppError("Line: " + "\t".join(line) + " in BED file contains less than the minimum 3 columns. Invalid BED file.")

            try:
                row[0] = line[0]
                row[1] = int(line[1])
                row[2] = int(line[2])
                row[3] = line[3]
                # dashes are sometimes used when a field is invalid
                if line[4] == "-" or line[4] == ".":
                    line[4] = 0
                row[4] = float(line[4])
                row[5] = line[5]
                if line[6] == "-" or line[6] == ".":
                    line[6] = 0
                row[6] = int(line[6])
                if line[7] == "-" or line[7] == ".":
                    line[7] = 0
                row[7] = int(line[7])
                row[8] = line[8]
            # an IndexError would come from having fewer columns in a row, which we handle OK
            except IndexError:
                pass
            # ValueError when fields are messed up and a string gets converted to int, etc.  Throw these out.
            except ValueError:
                continue

            if isBedDetail:
                # add these in at the end if we have a bedDetail file
                row[num_cols] = bedDetailFields[0]
                row[num_cols + 1] = bedDetailFields[1]

            span.add_row(row)

        span.flush()

    return dxpy.dxlink(span.get_id())
def process(project_id, output_folder, fastq_file, genome_fasta_file, genome_index_file,
            mapper, mark_duplicates, fastq_file2=None, sample_name=None, properties=None):
    """Download a single FASTQ file, map it, and output a coordinate-sorted BAM file."""
    logger = []
    bams_subfolder = output_folder + '/bams'

    if mapper not in SUPPORTED_MAPPERS:
        raise dxpy.AppError("Unsupported mapper: " + mapper)

    if mapper == "bwa_mem":
        if fastq_file2 is None:
            run_bwa_mem_single(fastq_file, genome_fasta_file, genome_index_file,
                               mark_duplicates, logger)
        else:
            run_bwa_mem_paired(fastq_file, fastq_file2, genome_fasta_file,
                               genome_index_file, mark_duplicates, logger)
    elif mapper == "bwa" or mapper == "bwa_aln":
        if fastq_file2 is None:
            run_bwa_backtrack_single(fastq_file, genome_fasta_file, genome_index_file,
                                     mark_duplicates, logger)
        else:
            run_bwa_backtrack_paired(fastq_file, fastq_file2, genome_fasta_file,
                                     genome_index_file, mark_duplicates, logger)
    else:
        raise dxpy.AppError("Unsupported mapper: " + mapper)

    run_samtools_calmd(logger)

    # From the bwa_mem_fastq_read_mapper bash source:
    #   bwa mem -t `nproc` "$genome_file" $input $opts | samtools view -u -S - | samtools sort -m 256M -@ `nproc` - output
    #   samtools index output.bam
    index_cmd = 'samtools index sample.bam'
    run_cmd(index_cmd, logger)

    bam_file = dxpy.upload_local_file(filename="sample.bam",
                                      name=sample_name + ".bam",
                                      properties=properties,
                                      project=project_id,
                                      folder=bams_subfolder,
                                      parents=True)
    bai_file = dxpy.upload_local_file(filename="sample.bam.bai",
                                      name=sample_name + ".bai",
                                      properties=properties,
                                      project=project_id,
                                      folder=bams_subfolder,
                                      parents=True)

    return {"bam": dxpy.dxlink(bam_file),
            "bai": dxpy.dxlink(bai_file),
            "tools_used": logger}
def import_genes(bed_file, table_name, ref_id, file_id, additional_types, property_keys,
                 property_values, tags, delimiter="\t"):
    # implement BED importing from this format:
    # http://genome.ucsc.edu/FAQ/FAQformat.html#format1

    columns = [("chr", "string"), ("lo", "int32"), ("hi", "int32"), ("name", "string"),
               ("span_id", "int32"), ("type", "string"), ("strand", "string"),
               ("is_coding", "boolean"), ("parent_id", "int32"), ("frame", "int16"),
               ("description", "string")]
    column_descs = [dxpy.DXGTable.make_column_desc(name, type) for name, type in columns]
    indices = [dxpy.DXGTable.genomic_range_index("chr", "lo", "hi", 'gri'),
               dxpy.DXGTable.lexicographic_index([
                   dxpy.DXGTable.lexicographic_index_column("name", True, False),
                   dxpy.DXGTable.lexicographic_index_column("chr"),
                   dxpy.DXGTable.lexicographic_index_column("lo"),
                   dxpy.DXGTable.lexicographic_index_column("hi"),
                   dxpy.DXGTable.lexicographic_index_column("type")], "search")]

    default_row = ["", 0, 0, "", -1, "", ".", False, -1, -1, ""]

    with open(bed_file, 'rU') as bed, dxpy.new_dxgtable(column_descs, indices=indices, mode='w') as span:
        span_table_id = span.get_id()

        details = {"original_contigset": dxpy.dxlink(ref_id)}
        if file_id != None:
            details["original_file"] = dxpy.dxlink(file_id)
        if len(property_keys) != len(property_values):
            raise dxpy.AppError("Expected each provided property to have a corresponding value.")
        for i in range(len(property_keys)):
            details[property_keys[i]] = property_values[i]
        span.set_details(details)
        span.add_types(["gri", "Genes"])
        span.rename(table_name)

        current_span_id = 0

        # where the parsing magic happens
        for line in bed:
            if line.startswith("track"):
                details = span.get_details()
                details['track'] = line
                span.set_details(details)
                continue
            line = line.rstrip("\n")
            row = list(default_row)
            line = line.split(delimiter)
            validate_line(line)
            if len(line) < 12:
                raise dxpy.AppError("Line: " + "\t".join(line) + " in gene model-like BED file contains less than 12 columns. Invalid BED file.")

            # add parent gene track
            row = generate_gene_row(line, 0, 0, "transcript", default_row, -1, current_span_id)
            if row != None:
                span.add_row(row)
                current_parent_id = current_span_id
                current_span_id += 1

                # add all children
                blockCount = int(line[9])
                line[10] = line[10].rstrip(",").split(",")
                blockSizes = [int(line[10][n]) for n in range(blockCount)]
                line[11] = line[11].rstrip(",").split(",")
                blockStarts = [int(line[11][n]) for n in range(blockCount)]

                gene_lo = int(line[1])
                gene_hi = int(line[2])

                # set thick* to be within the gene if outside
                thickStart = min(max(int(line[6]), gene_lo), gene_hi)
                thickEnd = max(min(int(line[7]), gene_hi), gene_lo)

                for i in range(blockCount):
                    # look to thickStart and thickEnd to get information about the type of this region
                    # if thick* are the same or cover the whole transcript then we ignore them
                    # else, we partition the exons into CDS and UTR based on their boundaries
                    if thickStart == thickEnd or (thickStart == gene_lo and thickEnd == gene_hi):
                        span.add_row(generate_gene_row(line, blockSizes[i], blockStarts[i],
                                                       "exon", default_row,
                                                       current_parent_id, current_span_id))
                        current_span_id += 1
                    else:
                        exon_lo = int(line[1]) + blockStarts[i]
                        exon_hi = int(exon_lo + blockSizes[i])

                        # we're all UTR if we enter either of these
                        if (exon_hi <= thickStart and line[5] == '+') or (exon_lo >= thickEnd and line[5] == '-'):
                            span.add_row(generate_gene_row(line, blockSizes[i], blockStarts[i],
                                                           "5' UTR", default_row,
                                                           current_parent_id, current_span_id))
                            current_span_id += 1
                        elif (exon_hi <= thickStart and line[5] == '-') or (exon_lo >= thickEnd and line[5] == '+'):
                            span.add_row(generate_gene_row(line, blockSizes[i], blockStarts[i],
                                                           "3' UTR", default_row,
                                                           current_parent_id, current_span_id))
                            current_span_id += 1
                        # if this is true then we overlap CDS partially or completely
                        elif (exon_lo < thickEnd and exon_hi > thickStart):
                            # entirely contained
                            if exon_lo >= thickStart and exon_hi <= thickEnd:
                                span.add_row(generate_gene_row(line, blockSizes[i], blockStarts[i],
                                                               "CDS", default_row,
                                                               current_parent_id, current_span_id))
                                current_span_id += 1
                            else:
                                # left portion is UTR
                                if exon_lo < thickStart:
                                    if line[5] == '+':
                                        UTR_type = "5' UTR"
                                    else:
                                        UTR_type = "3' UTR"
                                    UTR_size = (min(blockSizes[i], thickStart - exon_lo))
                                    span.add_row(generate_gene_row(line, UTR_size, blockStarts[i],
                                                                   UTR_type, default_row,
                                                                   current_parent_id, current_span_id))
                                    current_span_id += 1

                                # CDS portion
                                CDS_size = blockSizes[i] - (max(exon_lo, thickStart) - exon_lo)
                                CDS_size -= (exon_hi - min(exon_hi, thickEnd))
                                CDS_start = (max(exon_lo, thickStart) - exon_lo) + blockStarts[i]
                                span.add_row(generate_gene_row(line, CDS_size, CDS_start,
                                                               "CDS", default_row,
                                                               current_parent_id, current_span_id))
                                current_span_id += 1

                                # right portion is UTR
                                if exon_hi > thickEnd:
                                    if line[5] == '+':
                                        UTR_type = "3' UTR"
                                    else:
                                        UTR_type = "5' UTR"
                                    UTR_size = (min(blockSizes[i], exon_hi - thickEnd))
                                    UTR_start = blockStarts[i] + thickEnd - exon_lo
                                    span.add_row(generate_gene_row(line, UTR_size, UTR_start,
                                                                   UTR_type, default_row,
                                                                   current_parent_id, current_span_id))
                                    current_span_id += 1

    return dxpy.dxlink(span.get_id())
def parseLine(line):
    line = line.strip().split("#")[0]
    tabSplit = line.split("\t")
    if len(tabSplit) == 1:
        tabSplit = line.split(" ")
        if len(tabSplit) < 8:
            raise dxpy.AppError("One row did not have 8 or 9 entries, it had " +
                                str(len(tabSplit)) + " instead. Offending line: " + line)
        # only fold the trailing fields back together when an attributes column exists
        # (the original indexed tabSplit[8] unconditionally, which breaks on 8-column rows)
        if len(tabSplit) > 8:
            tabSplit[8] = " ".join(tabSplit[8:])
            tabSplit = tabSplit[:9]

    chromosome = tabSplit[0]
    source = tabSplit[1]
    typ = tabSplit[2]

    try:
        lo = int(tabSplit[3]) - 1
    except ValueError:
        raise dxpy.AppError("One of the start values could not be translated to an integer. " +
                            "\nOffending line: " + line + "\nOffending value: " + tabSplit[3])

    try:
        hi = int(tabSplit[4])
    except ValueError:
        raise dxpy.AppError("One of the end values could not be translated to an integer. " +
                            "\nOffending line: " + line + "\nOffending value: " + tabSplit[4])

    try:
        score = float(tabSplit[5])
    except ValueError:
        if tabSplit[5] == "." or tabSplit[5] == '':
            score = dxpy.NULL
        else:
            raise dxpy.AppError("The score for one line could not be translated into a number and was not \".\"" +
                                "\nOffending line: " + line + "\nOffending value: " + tabSplit[5])

    tabSplit[6] = tabSplit[6].replace("?", ".")
    if tabSplit[6] != "+" and tabSplit[6] != "-" and tabSplit[6] != ".":
        raise dxpy.AppError("The strand indicated for an element was not \"+\", \"-\", \"?\", or \".\"" +
                            "\nOffending line: " + line + "\nOffending value: " + tabSplit[6])
    else:
        strand = tabSplit[6]

    try:
        frame = int(tabSplit[7])
        if frame > 2 or frame < 0:
            raise dxpy.AppError("The frame indicated for an element was not \".\", \"0\", \"1\", or \"2\"" +
                                "\nOffending line: " + line + "\nOffending value: " + tabSplit[7])
    except ValueError:
        if tabSplit[7] == ".":
            frame = -1
        else:
            raise dxpy.AppError("The frame indicated for an element was not \".\", \"0\", \"1\", or \"2\"" +
                                "\nOffending line: " + line + "\nOffending value: " + tabSplit[7])

    lineAttributes = {}
    ##Extract the attributes from the file
    if len(tabSplit) >= 9:
        reg = re.findall("([^=]*)=([^;]*);", tabSplit[8].strip() + ";")
        for x in reg:
            if len(x[0]) < 100:
                lineAttributes[x[0]] = x[1].strip().strip("\"")
    else:
        lineAttributes = {}

    values = {"chromosome": chromosome,
              "lo": lo,
              "hi": hi,
              "source": source,
              "type": typ,
              "strand": strand,
              "score": score,
              "frame": frame,
              "attributes": lineAttributes}
    return values
def validate_line(line):
    line_str = "\t".join(line)
    entries = list(line)

    if len(entries) > 1:
        try:
            if int(entries[1]) < 0:
                raise dxpy.AppError("The start position for one entry was unexpectedly negative. \nOffending line_str: " + line_str + "\nOffending value: " + str(entries[1]))
        except ValueError:
            raise dxpy.AppError("One of the start values could not be translated to an integer. " + "\nOffending line_str: " + line_str + "\nOffending value: " + str(entries[1]))

    if len(entries) > 2:
        try:
            if int(entries[2]) < 0:
                raise dxpy.AppError("The end position for one entry was unexpectedly negative. \nOffending line_str: " + line_str + "\nOffending value: " + str(entries[2]))
        except ValueError:
            raise dxpy.AppError("One of the end values could not be translated to an integer. " + "\nOffending line_str: " + line_str + "\nOffending value: " + str(entries[2]))

    if len(entries) > 4:
        try:
            if entries[4] != "." and entries[4] != "-":
                float(entries[4])
        except ValueError:
            raise dxpy.AppError("One of the score values for one entry could not be translated to a number. " + "\nOffending line_str: " + line_str + "\nOffending value: " + str(entries[4]))

    if len(entries) > 5:
        if entries[5] != "+" and entries[5] != "-" and entries[5] != ".":
            raise dxpy.AppError("The strand indicated for an element was not \"+\", \"-\", or \".\"" + "\nOffending line_str: " + line_str + "\nOffending value: " + str(entries[5]))

    if len(entries) > 6:
        try:
            if entries[6] != "." and entries[6] != "-":
                if int(entries[6]) < 0:
                    raise dxpy.AppError("The thickStart position for one entry was unexpectedly negative. \nOffending line_str: " + line_str + "\nOffending value: " + str(entries[6]))
        except ValueError:
            raise dxpy.AppError("One of the thickStart values could not be translated to an integer. " + "\nOffending line_str: " + line_str + "\nOffending value: " + str(entries[6]))

    if len(entries) > 7:
        try:
            if entries[7] != "." and entries[7] != "-":
                if int(entries[7]) < 0:
                    raise dxpy.AppError("The thickEnd position for one entry was unexpectedly negative. \nOffending line_str: " + line_str + "\nOffending value: " + str(entries[7]))
        except ValueError:
            raise dxpy.AppError("One of the thickEnd values could not be translated to an integer. " + "\nOffending line_str: " + line_str + "\nOffending value: " + str(entries[7]))

    if len(entries) > 9:
        try:
            if int(entries[9]) < 0:
                raise dxpy.AppError("The number of exons (blockCount) for one entry was unexpectedly negative. \nOffending line_str: " + line_str + "\nOffending value: " + str(entries[9]))
        except ValueError:
            raise dxpy.AppError("One of the blockCount values could not be translated to an integer. " + "\nOffending line_str: " + line_str + "\nOffending value: " + str(entries[9]))

    if len(entries) > 10:
        try:
            entries[10] = entries[10].rstrip(",").split(",")
            blockSizes = [int(entries[10][n]) for n in range(int(entries[9]))]
        except (IndexError, ValueError):
            raise dxpy.AppError("Could not parse the blockSizes entry as a comma-separated list of integers \nOffending line_str: " + line_str + "\nOffending value: " + str(entries[10]))

    if len(entries) > 11:
        try:
            entries[11] = entries[11].rstrip(",").split(",")
            blockStarts = [int(entries[11][n]) for n in range(int(entries[9]))]
        except (IndexError, ValueError):
            raise dxpy.AppError("Could not parse the blockStarts entry as a comma-separated list of integers \nOffending line_str: " + line_str + "\nOffending value: " + str(entries[11]))
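# Minimal sketch of validate_line's contract (fields arrive as a list of strings; the
# values here are illustrative): a well-formed prefix passes silently, while a negative
# start position raises dxpy.AppError.
def _demo_validate_line():
    validate_line(["chr1", "100", "200", "feat", "0.9", "+"])
    try:
        validate_line(["chr1", "-5", "200"])
        raise AssertionError("expected dxpy.AppError")
    except dxpy.AppError:
        pass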
def constructTable(inputFileName):
    inputFile = open(inputFileName, 'r')
    attributes = {}
    for line in inputFile:
        if line[0] != "#":
            line = line.strip().split("#")[0]
            tabSplit = line.split("\t")
            if len(tabSplit) == 1:
                tabSplit = line.split(" ")
                # the original raised when a space-split row had fewer than 9 fields,
                # which contradicts the 8-or-9 rule enforced below; 8 is the assumed minimum
                if len(tabSplit) < 8:
                    raise dxpy.AppError("One row did not have 8 or 9 entries, it had " +
                                        str(len(tabSplit)) + " instead. Offending line: " + line)
                if len(tabSplit) > 8:
                    tabSplit[8] = " ".join(tabSplit[8:])
                    tabSplit = tabSplit[:9]
            if len(tabSplit) != 8 and len(tabSplit) != 9:
                raise dxpy.AppError("One row did not have 8 or 9 entries, it had " +
                                    str(len(tabSplit)) + " instead. Offending line: " + line)
            elif len(tabSplit) == 9:
                reg = re.findall("([^=]*)=([^;]*);", tabSplit[8].strip() + ";")
                for x in reg:
                    attributes[x[0]] = True

    reservedColumns = ["", "chr", "lo", "hi", "name", "span_id", "type", "score",
                       "is_coding", "parent_id", "frame", "description", "source"]

    #Construct table
    schema = [{"name": "chr", "type": "string"},
              {"name": "lo", "type": "uint32"},
              {"name": "hi", "type": "uint32"},
              {"name": "name", "type": "string"},
              {"name": "span_id", "type": "int32"},
              {"name": "type", "type": "string"},
              {"name": "strand", "type": "string"},
              {"name": "score", "type": "float"},
              {"name": "is_coding", "type": "boolean"},
              {"name": "parent_id", "type": "int32"},
              {"name": "frame", "type": "int16"},
              {"name": "description", "type": "string"},
              {"name": "source", "type": "string"}]

    additionalColumns = []
    for k, v in attributes.iteritems():
        if k not in reservedColumns and len(k) < 100:
            schema.append({"name": k, "type": "string"})
            additionalColumns.append(k)

    indices = [dxpy.DXGTable.genomic_range_index("chr", "lo", "hi", 'gri'),
               dxpy.DXGTable.lexicographic_index([
                   dxpy.DXGTable.lexicographic_index_column("name", True, False),
                   dxpy.DXGTable.lexicographic_index_column("chr"),
                   dxpy.DXGTable.lexicographic_index_column("lo"),
                   dxpy.DXGTable.lexicographic_index_column("hi"),
                   dxpy.DXGTable.lexicographic_index_column("type")], "search")]

    spansTable = dxpy.new_dxgtable(columns=schema, indices=indices)
    return spansTable, additionalColumns