def process(input_gtable_id, start_row, end_row, output_gtable_id):
    DX_APP_WIZARD_||_INPUT = dxpy.DXGTable(input_gtable_id)

    # Using the context manager here is useful so that the flush()
    # method is called once the context manager exits, and any rows
    # added will be flushed to the platform.  The mode is set to "a"
    # for "append".
    with dxpy.open_dxgtable(output_gtable_id, mode="a") as DX_APP_WIZARD_||_OUTPUT:
        # The following loop iterates over each row from start_row to
        # end_row (not including end_row).  You can find documentation on
        # other useful GTable methods (such as iterating over a genomic
        # range query with iterate_query_rows) in the dxpy library here:
        # http://autodoc.dnanexus.com/bindings/python/current/dxpy_dxgtable.html
        for row in DX_APP_WIZARD_||_INPUT.iterate_rows(start_row, end_row):
            # Fill in code here to perform whatever computation is
            # necessary to process the row and compute the new row.
            #
            # *row* is an array where the first element is the row ID,
            # and the rest of the elements appear in the same order as
            # the GTable's column specification.  You can retrieve the
            # column specifications or names by using
            # DX_APP_WIZARD_||_INPUT.get_columns() or
            # DX_APP_WIZARD_||_INPUT.get_col_names().
            new_row = []

            # The following line queues up the array new_row as a row
            # of data that should be added to the output GTable.
            # Queued rows will be flushed to the platform periodically.
            DX_APP_WIZARD_||_OUTPUT.add_row(new_row)
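
# A minimal driver sketch for the scatter template above: it walks the input
# GTable in fixed-size chunks and calls process() on each half-open row range.
# This assumes a live DNAnexus session and valid GTable IDs; the 10000-row
# chunk size is illustrative, not a recommendation.
def example_scatter_driver(input_gtable_id, output_gtable_id, chunk_size=10000):
    num_rows = dxpy.DXGTable(input_gtable_id).describe()['length']
    for start in range(0, num_rows, chunk_size):
        # Each call handles rows [start, end), matching iterate_rows semantics.
        process(input_gtable_id, start, min(start + chunk_size, num_rows), output_gtable_id)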
def main(**kwargs):
    if len(kwargs) == 0:
        opts = parser.parse_args(sys.argv[1:])
    else:
        opts = parser.parse_args(kwargs)

    if opts.mappings_id is None:
        parser.print_help()
        sys.exit(1)

    mappingsTable = dxpy.DXGTable(opts.mappings_id)

    if opts.file_name is not None:
        fh = open(opts.file_name, "w")
    else:
        fh = sys.stdout

    # A "quality" column means we can emit FASTQ; otherwise fall back to FASTA.
    outputFastq = 'quality' in mappingsTable.get_col_names()

    for row in mappingsTable.iterate_rows(want_dict=True):
        if outputFastq:
            writeFastq(row, fh)
        else:
            writeFasta(row, fh)
def process(gtable_id, start_row, end_row):
    DX_APP_WIZARD_||_INPUT = dxpy.DXGTable(gtable_id)

    # The following loop iterates over each row from start_row to
    # end_row (not including end_row).  You can find documentation on
    # other useful GTable methods (such as iterating over a genomic
    # range query with iterate_query_rows) in the dxpy library here:
    # http://autodoc.dnanexus.com/bindings/python/current/dxpy_dxgtable.html
    for row in DX_APP_WIZARD_||_INPUT.iterate_rows(start_row, end_row):
        # Fill in code here to perform whatever computation is
        # necessary to process the row.
        #
        # *row* is an array where the first element is the row ID, and
        # the rest of the elements appear in the same order as the
        # GTable's column specification.  You can retrieve the column
        # specifications or names by using
        # DX_APP_WIZARD_||_INPUT.get_columns() or
        # DX_APP_WIZARD_||_INPUT.get_col_names().
        pass

    # If your subproblem is to compute some value over the rows it was
    # given, you can return it here:
    return {"output": "placeholder value"}
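
# A hedged sketch of fanning out the subproblem above with dxpy.new_dxjob,
# collecting each subjob's "output" field as a job-based object reference that
# a downstream entry point can consume. It assumes "process" is declared as an
# entry point of this applet and that a 10000-row chunk size is reasonable.
def example_fan_out(gtable_id, num_rows, chunk_size=10000):
    output_refs = []
    for start in range(0, num_rows, chunk_size):
        subjob = dxpy.new_dxjob({"gtable_id": gtable_id,
                                 "start_row": start,
                                 "end_row": min(start + chunk_size, num_rows)},
                                "process")
        output_refs.append({"job": subjob.get_id(), "field": "output"})
    return output_refs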
def postprocess(**job_inputs):
    print "Postprocess:", job_inputs
    job_outputs = {}

    # Collect the per-chunk timing reports emitted by each map job.
    time_report = {k: v for k, v in job_inputs.iteritems() if re.match(r"chunk\d+debug", k)}

    t = dxpy.DXGTable(job_inputs["table_id"])
    d = t.get_details()
    d['time_report'] = time_report
    t.set_details(d)
    t.close()

    job_outputs['mappings'] = dxpy.dxlink(t)
    return job_outputs
def main(**kwargs):
    if len(kwargs) == 0:
        kwargs = vars(arg_parser.parse_args(sys.argv[1:]))

    try:
        spans = dxpy.DXGTable(kwargs['Spans'])
    except:
        raise dxpy.AppError("Failed to open Spans object for export")

    spans_types = spans.describe()['types']

    if 'Genes' in spans_types:
        export_genes(spans, kwargs['output'])
    else:
        export_generic_bed(spans, kwargs['output'])
def dump_fastqa(reads_ID, output_base):
    # A second sequence column means the reads are paired.
    paired = 'sequence2' in dxpy.DXGTable(reads_ID).get_col_names()

    if paired:
        run_shell(" ".join(["dx-reads-to-fastq",
                            reads_ID,
                            "--output " + output_base + "_1",
                            "--output2 " + output_base + "_2"]))
    else:
        run_shell(" ".join(["dx-reads-to-fastq",
                            reads_ID,
                            "--output " + output_base + "_1"]))

    if paired:
        return output_base + "_1", output_base + "_2"
    else:
        return output_base + "_1", None
def check_reads(reads_tables):
    # Validate that the tables contain data that can be used together
    # (all paired or all unpaired, etc.).
    if len(reads_tables) == 0:
        raise dxpy.AppError("Please enter at least one Reads table as input")

    single = 0
    paired = 0
    for table in reads_tables:
        if 'sequence2' in dxpy.DXGTable(table).get_col_names():
            paired += 1
        else:
            single += 1

    if single > 0 and paired > 0:
        raise dxpy.AppError("Found both single and paired-end reads. Please only input one type.")

    return
def map_contaminant(Contig, Reads):
    # Get the ID of our mapper app.
    try:
        bwa = dxpy.DXApp(dxpy.find_apps(name="bwa_mem_fastq_read_mapper").next()['id'])
    except StopIteration:
        raise dxpy.AppError("Unable to find app 'bwa_mem_fastq_read_mapper'. Please install it to enable contaminant mapping")

    # TODO: find the optimal chunk size so we don't launch too many bwa jobs
    map_job = bwa.run({"reads": Reads,
                       "reference": Contig,
                       "discard_unmapped_rows": True,
                       "chunk_size": 10000000})

    total_reads = 0
    for r in Reads:
        desc = dxpy.DXGTable(r).describe()
        current_reads = desc['length']
        # Paired tables count two reads per row; 'columns' holds column
        # descriptors, so compare against the column names.
        if 'sequence2' in [c['name'] for c in desc['columns']]:
            current_reads *= 2
        total_reads += current_reads

    # Launch a job that waits for the mapping and calculates what fraction of reads mapped.
    calc_job = dxpy.new_dxjob({"num_reads": total_reads,
                               "mappings": {"job": map_job.get_id(), "field": "mappings"}},
                              "calc_contam")
    return calc_job.get_id()
def postprocess(output_gtable_id):
    DX_APP_WIZARD_||_OUTPUT = dxpy.DXGTable(output_gtable_id)
    DX_APP_WIZARD_||_OUTPUT.close()
def main(**kwargs):
    if len(kwargs) == 0:
        opts = parser.parse_args(sys.argv[1:])
    else:
        opts = parser.parse_args(kwargs)

    if opts.genes_id is None:
        parser.print_help()
        sys.exit(1)

    if opts.file_name is not None:
        outputFile = open(opts.file_name, 'w')
    else:
        outputFile = None

    tableId = opts.genes_id
    table = dxpy.DXGTable(tableId)

    genesTypes = {"exon": True, "CDS": True, "5' UTR": True, "3' UTR": True,
                  "transcript": True, "gene": True}
    translatedTypes = {"transcript": "mRNA", "5' UTR": "five_prime_UTR",
                       "3' UTR": "three_prime_UTR"}

    columns = table.get_col_names()

    if "ID" in columns:
        idColumn = "ID"
    elif "Id" in columns:
        idColumn = "Id"
    elif "id" in columns:
        idColumn = "id"
    else:
        idColumn = "span_id"

    if "Parent" in columns:
        parentColumn = "Parent"
    elif "PARENT" in columns:
        parentColumn = "PARENT"
    elif "parent" in columns:
        parentColumn = "parent"
    else:
        parentColumn = "parent_id"

    for row in table.iterate_rows(want_dict=True):
        typ = row["type"]
        if opts.only_genes_types == False or genesTypes.get(typ) is not None:
            if translatedTypes.get(typ) is not None:
                typ = translatedTypes[typ]

            reservedColumns = ["chr", "lo", "hi", "span_id", "type", "strand",
                               "score", "is_coding", "parent_id", "frame",
                               "source", "__id__", "ID", "Id", "id", "Parent",
                               "PARENT", "parent"]

            attributes = ""
            rowId = str(row[idColumn])
            parentId = str(row[parentColumn])

            attributes += "ID=\"" + rowId + "\";"
            if not (parentColumn == "parent_id" and parentId == "-1"):
                attributes += "Parent=\"" + parentId + "\";"

            for k, v in row.iteritems():
                if k not in reservedColumns and v != '':
                    attributes += k + "=" + '"' + str(v) + '";'

            chromosome = row["chr"]
            lo = str(row["lo"] + 1)
            hi = str(row["hi"])

            strand = row["strand"]
            if strand == '':
                strand = '.'
            if row["frame"] == -1:
                frame = '.'
            else:
                frame = str(row["frame"])

            source = '.'
            # 2**31 and 2**31-1 are legacy null values that will be removed when possible
            if row.get("score") is None:
                score = "."
            elif row["score"] == dxpy.NULL or row["score"] == 2**31 - 1 or row["score"] == float(2**31):
                score = "."
            else:
                score = str(row["score"])

            if row.get("source") is not None:
                if row["source"] != '':
                    source = row["source"]

            result = "\t".join([chromosome, source, typ, lo, hi, score, strand,
                                frame, attributes.rstrip(";")]) + "\n"

            if outputFile is not None:
                outputFile.write(result)
            else:
                sys.stdout.write(result)
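
# A self-contained worked example of the coordinate conversion above: GTable
# rows store 0-based half-open [lo, hi) spans, while GFF is 1-based inclusive,
# hence the lo + 1 shift (hi is unchanged). The row values are made up.
def example_gff_line():
    row = {"chr": "chr1", "lo": 999, "hi": 2000, "type": "gene", "strand": "+"}
    lo, hi = str(row["lo"] + 1), str(row["hi"])
    fields = [row["chr"], ".", row["type"], lo, hi, ".", row["strand"], ".", 'ID="gene0"']
    return "\t".join(fields)  # chr1 . gene 1000 2000 . + . ID="gene0"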
def map(**job_inputs):
    print "Map:", job_inputs
    job_outputs = {}
    times = [('start', time.time())]

    reads_inputs = job_inputs['reads']
    reads_ids = [r['$dnanexus_link'] for r in reads_inputs]
    reads_descriptions = {r: dxpy.DXGTable(r).describe() for r in reads_ids}
    reads_columns = {r: [col['name'] for col in desc['columns']] for r, desc in reads_descriptions.items()}
    reads_are_paired = any(['sequence2' in columns for columns in reads_columns.values()])

    times.append(('preamble', time.time()))

    dxpy.download_dxfile(dxpy.get_details(job_inputs["indexed_reference"])['index_archive'],
                         "reference.tar.xz")

    times.append(('download reference', time.time()))

    # TODO: Async everything below
    # subprocess.check_call("pixz -d reference.tar.xz && tar -xf reference.tar", shell=True)
    subprocess.check_call("tar -xJf reference.tar.xz", shell=True)

    if job_inputs["algorithm"] == "bwasw":
        bwa_algorithm = "bwasw"
    else:
        # algorithm = aln or auto. TODO: check what auto should do
        bwa_algorithm = "aln"

    aln_opts, sampe_opts, sw_opts, samse_opts = parse_bwa_cmd_opts(job_inputs)

    # Set the number-of-threads BWA parameter to the apparent number of CPUs.
    aln_opts += " -t " + str(cpu_count())
    sw_opts += " -t " + str(cpu_count())

    row_offsets = job_inputs['row_offsets']  # starting row of each reads table in the virtual concatenation
    start_row = job_inputs['start_row']      # position of this chunk relative to the row_offsets total
    num_rows = job_inputs['num_rows']        # size of the chunk to process this time

    subjobs = []
    for i in range(len(reads_ids)):
        reads_length = reads_descriptions[reads_ids[i]]["length"]
        read_group = i
        # See whether this reads table overlaps the current chunk, using the
        # standard interval-overlap test (A_start < B_end) and (A_end > B_start),
        # where A is the reads table [row_offsets[i], row_offsets[i] + reads_length)
        # and B is the current chunk [start_row, start_row + num_rows).
        if row_offsets[i] < (start_row + num_rows) and (row_offsets[i] + reads_length) > start_row:
            rel_start = max(start_row - row_offsets[i], 0)
            # Using half-open intervals: [start, end)
            rel_end = min(reads_length, start_row - row_offsets[i] + num_rows)
            subjobs.append({'reads_id': reads_ids[i],
                            'start_row': rel_start,
                            'end_row': rel_end,
                            'read_group': read_group})

    times.append(('parse parameters', time.time()))
    print 'SUBJOBS:', subjobs

    for subchunk_id in range(len(subjobs)):
        subjob = subjobs[subchunk_id]
        reads_id = subjob['reads_id']

        # TODO: FlowReads trimming support
        if 'quality' in reads_columns[reads_id]:
            if reads_are_paired:
                reads_file1 = "input" + str(subchunk_id) + "_1.fastq"
                reads_file2 = "input" + str(subchunk_id) + "_2.fastq"
                write_reads_to_fastq(reads_id, reads_file1, seq_col='sequence', qual_col='quality',
                                     start_row=subjob['start_row'], end_row=subjob['end_row'])
                write_reads_to_fastq(reads_id, reads_file2, seq_col='sequence2', qual_col='quality2',
                                     start_row=subjob['start_row'], end_row=subjob['end_row'])
                times.append(('fetch reads (subchunk %d)' % subchunk_id, time.time()))
                run_alignment(bwa_algorithm, reads_file1, reads_file2, aln_opts=aln_opts,
                              sampe_opts=sampe_opts, sw_opts=sw_opts, samse_opts=samse_opts)
                times.append(('run alignment (subchunk %d)' % subchunk_id, time.time()))
            else:
                reads_file1 = "input" + str(subchunk_id) + ".fastq"
                write_reads_to_fastq(reads_id, reads_file1,
                                     start_row=subjob['start_row'], end_row=subjob['end_row'])
                run_alignment(bwa_algorithm, reads_file1, aln_opts=aln_opts,
                              sampe_opts=sampe_opts, sw_opts=sw_opts, samse_opts=samse_opts)
        else:
            # No qualities, use plain FASTA.
            if reads_are_paired:
                reads_file1 = "input" + str(subchunk_id) + "_1.fasta"
                reads_file2 = "input" + str(subchunk_id) + "_2.fasta"
                write_reads_to_fasta(reads_id, reads_file1, seq_col='sequence',
                                     start_row=subjob['start_row'], end_row=subjob['end_row'])
                write_reads_to_fasta(reads_id, reads_file2, seq_col='sequence2',
                                     start_row=subjob['start_row'], end_row=subjob['end_row'])
                run_alignment(bwa_algorithm, reads_file1, reads_file2, aln_opts=aln_opts,
                              sampe_opts=sampe_opts, sw_opts=sw_opts, samse_opts=samse_opts)
            else:
                reads_file1 = "input" + str(subchunk_id) + ".fasta"
                write_reads_to_fasta(reads_id, reads_file1,
                                     start_row=subjob['start_row'], end_row=subjob['end_row'])
                run_alignment(bwa_algorithm, reads_file1, aln_opts=aln_opts,
                              sampe_opts=sampe_opts, sw_opts=sw_opts, samse_opts=samse_opts)
            times.append(('run alignment (subchunk %d)' % subchunk_id, time.time()))

        cmd = "dx_storeSamAsMappingsTable_bwa"
        cmd += " --alignments '%s.sam'" % reads_file1
        cmd += " --table_id '%s'" % job_inputs["table_id"]
        cmd += " --reads_id '%s'" % reads_id
        cmd += " --start_row %d" % subjob['start_row']
        cmd += " --read_group %d" % subjob['read_group']
        if job_inputs.get('discard_unmapped_rows'):
            cmd += " --discard_unmapped_rows"
        run_shell(cmd)
        times.append(('run table upload (subchunk %d)' % subchunk_id, time.time()))

    job_outputs["ok"] = True

    timing_report = {}
    for i in range(len(times) - 1):
        timing_report[times[i + 1][0]] = times[i + 1][1] - times[i][1]
    job_outputs["debug"] = {'times': timing_report}

    return job_outputs
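
# A standalone illustration of the half-open interval-overlap test used above
# to assign reads tables to a chunk; all numbers are made up.
def example_chunk_overlap():
    def overlaps(a_start, a_len, b_start, b_len):
        # [a_start, a_start + a_len) overlaps [b_start, b_start + b_len)
        # iff A_start < B_end and A_end > B_start.
        return a_start < (b_start + b_len) and (a_start + a_len) > b_start

    row_offsets, lengths = [0, 100], [100, 250]  # two reads tables
    start_row, num_rows = 80, 50                 # current chunk: rows [80, 130)
    hits = [i for i in range(2) if overlaps(row_offsets[i], lengths[i], start_row, num_rows)]
    return hits  # [0, 1]: the chunk spans the tail of table 0 and the head of table 1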
def main(**kwargs):
    if len(kwargs) == 0:
        opts = parser.parse_args(sys.argv[1:])
    else:
        opts = parser.parse_args(kwargs)

    if opts.mappings_id is None:
        parser.print_help()
        sys.exit(1)

    mappingsTable = dxpy.DXGTable(opts.mappings_id)
    idAsName = opts.id_as_name
    idPrepend = opts.id_prepend
    writeRowId = opts.write_row_id
    paired = "chr2" in mappingsTable.get_col_names()

    regions = []
    if opts.region_file != "":
        regions = re.findall(r"-L ([^:]*):(\d+)-(\d+)", open(opts.region_file, 'r').read())

    name = mappingsTable.describe()['name']

    if opts.reference is not None:
        originalContig = opts.reference
    else:
        try:
            originalContig = mappingsTable.get_details()['original_contigset']['$dnanexus_link']
        except:
            raise dxpy.AppError("The original reference genome must be attached to the mappings table")

    try:
        contigDetails = dxpy.DXRecord(originalContig).get_details()['contigs']
    except:
        raise dxpy.AppError("Unable to access reference with ID " + originalContig)

    contigNames = contigDetails['names']
    contigSizes = contigDetails['sizes']

    if opts.file_name is not None:
        outputFile = open(opts.file_name, 'w')
    else:
        outputFile = None

    header = ""
    for i in range(len(contigNames)):
        header += "@SQ\tSN:" + str(contigNames[i]) + "\tLN:" + str(contigSizes[i]) + "\n"

    assignReadGroup = opts.assign_read_group
    if assignReadGroup != "":
        header += "@RG\tID:" + assignReadGroup + "\tSM:Sample_0"
    else:
        for i in range(len(mappingsTable.get_details()['read_groups'])):
            header += "@RG\tID:" + str(i) + "\tSM:Sample_" + str(i)
            if opts.read_group_platform != '':
                header += "\tPL:" + opts.read_group_platform
            header += "\n"

    if outputFile is not None:
        outputFile.write(header)
    else:
        sys.stdout.write(header)

    col = {}
    names = mappingsTable.get_col_names()
    for i in range(len(names)):
        col[names[i]] = i + 1

    column_descs = mappingsTable.describe()['columns']
    sam_cols = []
    sam_col_names = []
    sam_col_types = {}
    for c in column_descs:
        if c['name'].startswith("sam_field_") or c['name'] == "sam_optional_fields":
            sam_cols.append(c)
            sam_col_names.append(c['name'])
            sam_col_types[c['name']] = c['type']

    defaultCol = {"sequence": "",
                  "name": "",
                  "quality": "",
                  "status": "UNMAPPED",
                  "chr": "",
                  "lo": 0,
                  "hi": 0,
                  "negative_strand": False,
                  "error_probability": 0,
                  "qc_fail": False,
                  "duplicate": False,
                  "cigar": "",
                  "mate_id": -1,
                  "status2": "",
                  "chr2": "",
                  "lo2": 0,
                  "hi2": 0,
                  "negative_strand2": False,
                  "proper_pair": False,
                  "read_group": 0}

    #unmappedFile = open("unmapped.txt", 'w')

    if len(regions) == 0:
        if opts.start_row > mappingsTable.describe()['length']:
            raise dxpy.AppError("Starting row is larger than the number of rows in the table")
        elif opts.end_row < opts.start_row:
            raise dxpy.AppError("Ending row must not be before the starting row")

        if opts.end_row > 0:
            generator = mappingsTable.iterate_rows(start=opts.start_row, end=opts.end_row, want_dict=True)
        else:
            generator = mappingsTable.iterate_rows(start=opts.start_row, want_dict=True)

        # Write each row, unless we're throwing out unmapped reads.
        for row in generator:
            if row["status"] != "UNMAPPED" or opts.discard_unmapped == False:
                if not paired:
                    writeRow(row, col, defaultCol, outputFile, idAsName, idPrepend,
                             writeRowId, assignReadGroup, column_descs, sam_cols,
                             sam_col_names, sam_col_types)
                elif opts.no_interchromosomal and row["chr"] == row["chr2"]:
                    writeRow(row, col, defaultCol, outputFile, idAsName, idPrepend,
                             writeRowId, assignReadGroup, column_descs, sam_cols,
                             sam_col_names, sam_col_types)
                elif opts.only_interchromosomal and opts.no_interchromosomal == False and \
                        (row["chr"] != row["chr2"] or (row["chr"] == "" and row["chr2"] == "")):
                    writeRow(row, col, defaultCol, outputFile, idAsName, idPrepend,
                             writeRowId, assignReadGroup, column_descs, sam_cols,
                             sam_col_names, sam_col_types)
                elif opts.no_interchromosomal == False and opts.only_interchromosomal == False:
                    writeRow(row, col, defaultCol, outputFile, idAsName, idPrepend,
                             writeRowId, assignReadGroup, column_descs, sam_cols,
                             sam_col_names, sam_col_types)
    else:
        for x in regions:
            # Generate the query for this region.
            query = mappingsTable.genomic_range_query(x[0],
                                                      int(x[1]) + opts.region_index_offset,
                                                      int(x[2]) + opts.region_index_offset,
                                                      index='gri')
            startRow = None
            for row in mappingsTable.get_rows(query=query, limit=1)['data']:
                startRow = row[0]
            if startRow is None:
                # No mappings fall in this region.
                continue

            for row in mappingsTable.iterate_rows(start=startRow, want_dict=True):
                if row["chr"] != x[0] or row["lo"] > int(x[2]) + opts.region_index_offset:
                    break
                if row["status"] != "UNMAPPED" or opts.discard_unmapped == False:
                    if not paired:
                        writeRow(row, col, defaultCol, outputFile, idAsName, idPrepend,
                                 writeRowId, assignReadGroup, column_descs, sam_cols,
                                 sam_col_names, sam_col_types)
                    elif opts.no_interchromosomal and row["chr"] == row["chr2"]:
                        writeRow(row, col, defaultCol, outputFile, idAsName, idPrepend,
                                 writeRowId, assignReadGroup, column_descs, sam_cols,
                                 sam_col_names, sam_col_types)
                    elif opts.only_interchromosomal and opts.no_interchromosomal == False and \
                            (row["chr"] != row["chr2"] or (row["chr"] == "" and row["chr2"] == "")):
                        writeRow(row, col, defaultCol, outputFile, idAsName, idPrepend,
                                 writeRowId, assignReadGroup, column_descs, sam_cols,
                                 sam_col_names, sam_col_types)
                    elif opts.no_interchromosomal == False and opts.only_interchromosomal == False:
                        writeRow(row, col, defaultCol, outputFile, idAsName, idPrepend,
                                 writeRowId, assignReadGroup, column_descs, sam_cols,
                                 sam_col_names, sam_col_types)

    if outputFile is not None:
        outputFile.close()
def main(**kwargs):
    if len(kwargs) == 0:
        opts = parser.parse_args(sys.argv[1:])
    else:
        opts = parser.parse_args(kwargs)

    if opts.genes_id is None:
        parser.print_help()
        sys.exit(1)

    if opts.file_name is not None:
        outputFile = open(opts.file_name, 'w')
    else:
        outputFile = None

    tableId = opts.genes_id
    table = dxpy.DXGTable(tableId)

    transcripts = {}
    genes = {}
    acceptedTypes = {"CDS": "CDS", "start_codon": "start_codon",
                     "stop_codon": "stop_codon", "5' UTR": "5UTR",
                     "3' UTR": "3UTR", "intergenic": "inter",
                     "intergenic_conserved": "inter_CNS", "exon": "exon"}

    biotypePresent = "gene_biotype" in table.get_col_names()

    for row in table.iterate_rows(want_dict=True):
        if row["type"] == "gene":
            if genes.get(row["span_id"]) is None:
                genes[row["span_id"]] = str(row["span_id"])
                if row.get("gene_id") is not None:
                    if row["gene_id"] != "":
                        genes[row["span_id"]] = row["gene_id"]
                if row.get("name") is not None and genes[row["span_id"]] == str(row["span_id"]):
                    if row["name"] != '':
                        genes[row["span_id"]] = row["name"]
            else:
                raise dxpy.AppError("Error: span_id was not unique, in violation of the type spec for Genes. As a result, some gene_id data may be overwritten")
        if row["type"] == "transcript":
            if transcripts.get(row["span_id"]) is None:
                transcriptInfo = {"name": str(row["span_id"])}
                if row.get("transcript_id") is not None:
                    if row["transcript_id"] != '':
                        transcriptInfo["name"] = row["transcript_id"]
                if row.get("name") is not None and transcriptInfo["name"] == str(row["span_id"]):
                    if row["name"] != '':
                        transcriptInfo["name"] = row["name"]
                transcriptInfo['parent'] = row["parent_id"]
                transcriptInfo['gene'] = ''
                transcripts[row["span_id"]] = transcriptInfo
            else:
                raise dxpy.AppError("Error: span_id was not unique, in violation of the type spec for Genes. As a result, some transcript_id data may be overwritten")

    for k, v in transcripts.iteritems():
        if genes.get(v["parent"]) is not None:
            transcripts[k]["gene"] = genes[v["parent"]]

    warnedGeneId = False
    warnedTranscriptId = False

    for row in table.iterate_rows(want_dict=True):
        if acceptedTypes.get(row["type"]) is not None:
            reservedColumns = ["chr", "lo", "hi", "span_id", "type", "strand",
                               "score", "is_coding", "parent_id", "frame",
                               "source", "gene_id", "transcript_id", "__id__"]
            attributes = ""
            transcriptId = ''
            geneId = ''

            try:
                transcriptId = transcripts[row["parent_id"]]["name"]
            except:
                if not warnedTranscriptId:
                    print "Warning: at least one position had a transcriptId that could not be determined. Future warnings of this type will not be printed"
                    print "Offending position - Chr: " + row["chr"] + " lo: " + str(row["lo"]) + " hi: " + str(row["hi"])
                    warnedTranscriptId = True
            try:
                geneId = transcripts[row["parent_id"]]["gene"]
            except:
                if not warnedGeneId:
                    print "Warning: at least one position had a geneId that could not be determined. Future warnings of this type will not be printed"
                    print "Offending position - Chr: " + row["chr"] + " lo: " + str(row["lo"]) + " hi: " + str(row["hi"])
                    warnedGeneId = True

            attributes += "gene_id " + '"' + geneId + '"' + ";"
            attributes += " transcript_id " + '"' + transcriptId + '"' + ";"

            for k, v in row.iteritems():
                if k not in reservedColumns and v != '':
                    attributes += " " + k + " " + '"' + str(v) + '";'

            if opts.add_gene_biotype and not biotypePresent:
                if row["is_coding"]:
                    entry = "protein_coding"
                else:
                    entry = "non_protein_coding"
                attributes += " gene_biotype " + '"' + entry + '";'

            chromosome = row["chr"]
            lo = str(row["lo"] + 1)
            hi = str(row["hi"])
            typ = acceptedTypes[row["type"]]

            strand = row["strand"]
            if strand == '':
                strand = '.'
            if row["frame"] == -1:
                frame = '.'
            else:
                frame = str(row["frame"])

            # Null values 2**31 and 2**31-1 are legacy values and will be removed when possible
            if row.get("score") is None:
                score = "."
            elif row["score"] == dxpy.NULL or row["score"] == 2**31 - 1 or row["score"] == float(2**31):
                score = "."
            else:
                score = str(row["score"])

            source = "."
            if row.get("source") is not None:
                if row["source"] != '':
                    source = row["source"]
                if opts.add_gene_biotype and not biotypePresent:
                    if row["is_coding"]:
                        source = "protein_coding"
                    else:
                        source = "non_protein_coding"

            result = "\t".join([chromosome, source, typ, lo, hi, score, strand,
                                frame, attributes.rstrip(";")]) + "\n"

            if outputFile is not None:
                outputFile.write(result)
            else:
                sys.stdout.write(result)
def main(**kwargs):
    if len(kwargs) == 0:
        kwargs = vars(arg_parser.parse_args(sys.argv[1:]))

    if "end_row" not in kwargs:
        kwargs["end_row"] = None
    if kwargs["end_row"] is not None and kwargs["end_row"] <= kwargs["start_row"]:
        arg_parser.error("End row %d must be greater than start row %d" % (kwargs["end_row"], kwargs["start_row"]))

    try:
        table = dxpy.DXGTable(kwargs['reads_table'])
    except:
        raise dxpy.AppError("Failed to open table for export")

    existCols = table.get_col_names()

    ### Sort out which columns to download.
    col = []
    col2 = []

    # If there's a second sequence column, the reads are paired.
    isPaired = "sequence2" in existCols

    if "name" in existCols and kwargs['discard_names'] != True:
        hasName = True
        col.append("name")
        if isPaired:
            col2.append("name2")
    else:
        hasName = False

    col.append("sequence")
    if isPaired:
        col2.append("sequence2")

    if "quality" in existCols:
        hasQual = True
        col.append("quality")
        if isPaired:
            col2.append("quality2")
    else:
        hasQual = False
        # Without qualities we must output FASTA instead.
        kwargs['output_FASTA'] = True

    if kwargs['output'] is None:
        raise dxpy.AppError("output parameter is required")

    with open(kwargs['output'], 'wb') as out_fh:
        exportToFile(columns=col, table=table, output_file=out_fh,
                     hasName=hasName, hasQual=hasQual,
                     FASTA=kwargs['output_FASTA'],
                     start_row=kwargs['start_row'], end_row=kwargs['end_row'])

    if isPaired:
        if kwargs['output2'] is None:
            raise dxpy.AppError("output2 parameter is required for paired reads")
        with open(kwargs['output2'], 'wb') as out_fh2:
            exportToFile(columns=col2, table=table, output_file=out_fh2,
                         hasName=hasName, hasQual=hasQual,
                         FASTA=kwargs['output_FASTA'],
                         start_row=kwargs['start_row'], end_row=kwargs['end_row'])
def main(**job_inputs):
    job_outputs = {}

    reads_inputs = job_inputs['reads']
    reads_ids = [r['$dnanexus_link'] for r in reads_inputs]
    reads_descriptions = {r: dxpy.DXGTable(r).describe() for r in reads_ids}
    reads_columns = {r: [col['name'] for col in desc['columns']] for r, desc in reads_descriptions.items()}

    print reads_inputs
    print reads_ids
    print reads_descriptions
    print reads_columns

    all_reads_have_FlowReads_tag = all(['FlowReads' in desc['types'] for desc in reads_descriptions.values()])
    all_reads_have_LetterReads_tag = all(['LetterReads' in desc['types'] for desc in reads_descriptions.values()])
    reads_have_names = any(['name' in columns for columns in reads_columns.values()])
    reads_are_paired = any(['sequence2' in columns for columns in reads_columns.values()])
    reads_have_qualities = any(['quality' in columns for columns in reads_columns.values()])
    if reads_have_qualities:
        assert(all(['quality' in columns for columns in reads_columns.values()]))

    if reads_are_paired:
        all_paired = all(['sequence2' in columns for columns in reads_columns.values()])
        if not all_paired:
            raise dxpy.AppError("Reads to be mapped must be either all paired or all unpaired. App input contains both paired and unpaired reads.")

    if job_inputs["algorithm"] == "bwasw":
        assert(not reads_are_paired)  # bwasw does not support paired inputs

    assert(all_reads_have_FlowReads_tag or all_reads_have_LetterReads_tag)

    reference_record_types = dxpy.describe(job_inputs['reference'])['types']
    if "BwaLetterContigSetV3" in reference_record_types:
        input_ref_is_indexed = True
    elif "ContigSet" in reference_record_types:
        input_ref_is_indexed = False
    else:
        raise dxpy.ProgramError("Unrecognized object passed as reference. It must be a ContigSet record or a BwaLetterContigSetV3 file")

    if input_ref_is_indexed:
        job_outputs['indexed_reference'] = job_inputs['reference']
    else:
        # Look for a cached index built from this reference before building a new one.
        found_cached_idx = False
        for result in dxpy.find_data_objects(classname='record',
                                             typename='BwaLetterContigSetV3',
                                             link=job_inputs['reference']['$dnanexus_link']):
            job_outputs['indexed_reference'] = dxpy.dxlink(result['id'])
            found_cached_idx = True
            break
        if not found_cached_idx:
            job_outputs['indexed_reference'] = dxpy.dxlink(make_indexed_reference(job_inputs))

    table_columns = [("sequence", "string")]
    if reads_have_names:
        table_columns.append(("name", "string"))
    if reads_have_qualities:
        table_columns.append(("quality", "string"))
    table_columns.extend([("status", "string"),
                          ("chr", "string"),
                          ("lo", "int32"),
                          ("hi", "int32"),
                          ("negative_strand", "boolean"),
                          ("error_probability", "uint8"),
                          ("qc_fail", "boolean"),
                          ("duplicate", "boolean"),
                          ("cigar", "string"),
                          ("template_id", "int64"),
                          ("read_group", "int32")])
    # Optional SAM fields: RG BC XC XT NM CM XN SM AM XM X0 X1 XG MD XA
    if reads_are_paired:
        table_columns.extend([("mate_id", "int32"),  # TODO: int8
                              ("status2", "string"),
                              ("chr2", "string"),
                              ("lo2", "int32"),
                              ("hi2", "int32"),
                              ("negative_strand2", "boolean"),
                              ("proper_pair", "boolean")])
    if all_reads_have_FlowReads_tag:
        table_columns.extend([("flowgram", "string"),
                              ("flow_indices", "string"),
                              ("clip_qual_left", "int32"),
                              ("clip_qual_right", "int32"),
                              ("clip_adapter_left", "int32"),
                              ("clip_adapter_right", "int32")])
    table_columns.extend([("sam_field_BC", "string"),
                          ("sam_field_XC", "int32"),
                          ("sam_field_XT", "string"),
                          ("sam_field_NM", "int32"),
                          ("sam_field_CM", "int32"),
                          ("sam_field_XN", "int32"),
                          ("sam_field_SM", "int32"),
                          ("sam_field_AM", "int32"),
                          ("sam_field_XM", "int32"),
                          ("sam_field_X0", "int32"),
                          ("sam_field_X1", "int32"),
                          ("sam_field_XG", "int32"),
                          ("sam_field_MD", "string"),
                          ("sam_field_XA", "string"),
                          ("sam_optional_fields", "string")])

    column_descriptors = [dxpy.DXGTable.make_column_desc(name, type) for name, type in table_columns]
    gri_index = dxpy.DXGTable.genomic_range_index("chr", "lo", "hi")
    t = dxpy.new_dxgtable(column_descriptors, indices=[gri_index])

    if input_ref_is_indexed:
        original_contigset = dxpy.get_details(job_inputs['reference'])['original_contigset']
    else:
        original_contigset = job_inputs['reference']
    t.set_details({'original_contigset': original_contigset})
    t.add_types(["LetterMappings", "Mappings", "gri"])

    # Name the table.
    if 'output_name' in job_inputs:
        t.rename(job_inputs['output_name'])
    else:
        first_reads_name = dxpy.DXGTable(job_inputs['reads'][0]).describe()['name']
        contig_set_name = dxpy.describe(job_inputs['reference'])['name']
        # If we're working on an indexed reference, we're not guaranteed to
        # have access to the original_contigset.
        if input_ref_is_indexed:
            contig_set_name = contig_set_name.split(' (index')[0]
        t.rename(first_reads_name + " mapped to " + contig_set_name)

    # Declare how many paired or single reads are in each reads table.
    read_group_lengths = []
    for i in range(len(reads_ids)):
        current_length = reads_descriptions[reads_ids[i]]["length"]
        if 'sequence2' in dxpy.DXGTable(reads_ids[i]).get_col_names():
            num_pairs = current_length
            num_singles = 0
        else:
            num_pairs = 0
            num_singles = current_length
        read_group_lengths.append({"num_singles": num_singles, "num_pairs": num_pairs})

    details = t.get_details()
    details['read_groups'] = read_group_lengths
    t.set_details(details)

    row_offsets = []
    row_cursor = 0
    for i in range(len(reads_ids)):
        row_offsets.append(row_cursor)
        row_cursor += reads_descriptions[reads_ids[i]]["length"]

    chunk_size = job_inputs["chunk_size"]

    map_job_inputs = job_inputs.copy()
    map_job_inputs["row_offsets"] = row_offsets
    map_job_inputs["num_rows"] = chunk_size
    map_job_inputs["table_id"] = t.get_id()
    map_job_inputs["indexed_reference"] = job_outputs['indexed_reference']

    postprocess_job_inputs = job_inputs.copy()
    postprocess_job_inputs["table_id"] = t.get_id()

    for start_row in xrange(0, row_cursor, chunk_size):
        map_job_inputs["start_row"] = start_row
        map_job = dxpy.new_dxjob(map_job_inputs, "map")
        print "Launched map job with", map_job_inputs
        postprocess_job_inputs["chunk%dresult" % start_row] = {'job': map_job.get_id(), 'field': 'ok'}
        postprocess_job_inputs["chunk%ddebug" % start_row] = {'job': map_job.get_id(), 'field': 'debug'}

    postprocess_job = dxpy.new_dxjob(postprocess_job_inputs, "postprocess")

    job_outputs['mappings'] = {'job': postprocess_job.get_id(), 'field': 'mappings'}
    print "MAIN OUTPUT:", job_outputs
    return job_outputs
def generate_report(geneBody, inner_dist, junc_ann, read_dist, read_dup, mappings, contam, names):
    report_details = {}

    #########################
    # Gene body distribution
    loc_in_gene = [n for n in range(100)]
    report_details['Gene Body Coverage'] = {"Normalized Location in Gene": loc_in_gene,
                                            "% of Reads Covering": geneBody}

    #########################
    # Inner distance
    if inner_dist is not None:
        dxpy.download_dxfile(inner_dist, "inner_dist.txt")
        inner_bucket = []
        inner_num_reads = []
        inner_total_reads = 0

        # If a bucket has less than 0.1% of reads in it, don't include it.
        cutoff = 0.001

        with open("inner_dist.txt", "r") as fh:
            line = fh.readline().rstrip("\n")
            while line != "":
                inner_total_reads += int(line.split()[2])
                line = fh.readline().rstrip("\n")

        bucket_cutoff = cutoff * inner_total_reads
        print "Applying cutoff of: " + str(cutoff) + " for inner distance calculation"

        with open("inner_dist.txt", "r") as fh:
            line = fh.readline().rstrip("\n")
            while line != "":
                start, end, num_reads = [int(x) for x in line.split()]
                if num_reads > bucket_cutoff:
                    # Store the center position of this bucket.
                    inner_bucket.append(int(end - ((end - start) / 2)))
                    inner_num_reads.append(num_reads)
                line = fh.readline().rstrip("\n")

        # Find the total, to normalize.
        inner_total_reads = sum(inner_num_reads)
        print "Total reads for inner distance calculation: " + str(inner_total_reads)

        inner_median = None
        running_total = 0
        inner_length_sum = 0
        for i in range(len(inner_bucket)):
            # Multiply read length by number of observations for the mean.
            inner_length_sum += inner_bucket[i] * inner_num_reads[i]
            # Calculate the median.
            running_total += inner_num_reads[i]
            if running_total >= inner_total_reads / 2 and inner_median is None:
                inner_median = inner_bucket[i]

        inner_mean = inner_length_sum / inner_total_reads
        print "inner distance metrics: " + " ".join([str(inner_length_sum), str(inner_total_reads)])

        # Calculate the standard deviation.
        std_sum = 0
        for i in range(len(inner_bucket)):
            std_sum += ((inner_bucket[i] - inner_mean) ** 2) * inner_num_reads[i]
        std_sum /= inner_total_reads
        inner_std = int(math.sqrt(std_sum) + 0.5)

        report_details['Paired Read Inner Distance'] = {"Inner Distance (bp)": inner_bucket,
                                                        "Count": inner_num_reads,
                                                        "Mean": inner_mean,
                                                        "Median": inner_median,
                                                        "Standard Deviation": inner_std}

    ############################
    # Junction annotation
    dxpy.download_dxfile(junc_ann, "junc_ann.r")

    # Initialize splicing values in case there was no splicing.
    sj_k = 0
    sj_pn = 0
    sj_cn = 0
    se_k = 0
    se_pn = 0
    se_cn = 0

    if os.path.getsize("junc_ann.r") == 0:
        print "No splicing events found so setting all junction stats to 0"
    else:
        with open("junc_ann.r", "r") as fh:
            line = fh.readline()
            while line != "":
                line = line.rstrip("\n")
                if line.startswith("events"):
                    # Parse out the percentages and assign them.
                    se_pn, se_cn, se_k = [float(n) / 100 for n in line[9:-1].split(",")]
                if line.startswith("junction"):
                    sj_pn, sj_cn, sj_k = [float(n) / 100 for n in line[11:-1].split(",")]
                line = fh.readline()

    report_details['Junction Annotation'] = {"Splicing Junctions": {"known": sj_k,
                                                                    "partial novel": sj_pn,
                                                                    "complete novel": sj_cn},
                                             "Splicing Events": {"known": se_k,
                                                                 "partial novel": se_pn,
                                                                 "complete novel": se_cn}}

    ############################
    # Read duplication
    dxpy.download_dxfile(read_dup, "read_dup.txt")
    pos_copy = []
    pos_num_reads = []
    pos_total_reads = 0
    seq_copy = []
    seq_num_reads = []
    seq_total_reads = 0

    with open("read_dup.txt", "r") as fh:
        # Pull off the first header.
        line = fh.readline()
        line = fh.readline()
        # Read until we hit the stats for sequence-based duplication.
        while not line.startswith("Occurrence"):
            c, r = [int(n) for n in line.split()]
            pos_copy.append(c)
            pos_num_reads.append(float(r))
            pos_total_reads += r
            line = fh.readline()

        # Get the next line, to start with the data.
        line = fh.readline()
        while line != "":
            c, r = [int(n) for n in line.split()]
            seq_copy.append(c)
            seq_num_reads.append(float(r))
            seq_total_reads += r
            line = fh.readline()

    pos_total_reads = float(pos_total_reads)
    seq_total_reads = float(seq_total_reads)
    for i in range(len(pos_num_reads)):
        pos_num_reads[i] /= pos_total_reads
    for i in range(len(seq_num_reads)):
        seq_num_reads[i] /= seq_total_reads

    report_details['Read Duplication'] = {"Position Based": {"Read Occurrences": pos_copy,
                                                             "% Reads": pos_num_reads},
                                          "Sequence Based": {"Read Occurrences": seq_copy,
                                                             "% Reads": seq_num_reads}}

    ############################
    # Read distribution report
    if read_dist is not None:
        dxpy.download_dxfile(read_dist, "read_dist.txt")
        report_details['Read Distribution'] = {}
        with open("read_dist.txt", "r") as rd_file:
            report_details['Read Distribution']['Total Reads'] = int(rd_file.readline().split()[-1])
            report_details['Read Distribution']['Total Tags'] = int(rd_file.readline().split()[-1])
            report_details['Read Distribution']['Total Assigned Tags'] = int(rd_file.readline().split()[-1])
            # Pull out the line of "="s.
            rd_file.readline()
            # Pull out the header line.
            rd_file.readline()
            line = rd_file.readline()
            while not line.startswith("="):
                fields = line.split()
                report_details['Read Distribution'][fields[0]] = [int(fields[1]), int(fields[2]), float(fields[3])]
                line = rd_file.readline()

    #############################
    # Add a report of contaminations, if calculated.
    if contam is not None:
        contam_report = []
        for i in range(len(contam)):
            contam_report.append({"Contaminant Name": names[i],
                                  "% Reads Mapping": contam[i]})
        report_details['Contamination'] = contam_report

    #############################
    # Add a link to the mappings.
    report_details['original_mappings'] = mappings

    report_name = dxpy.DXGTable(mappings).describe()['name'] + " RSeQC report"

    # Create the report.
    report = dxpy.new_dxrecord(name=report_name, details=report_details, types=["Report", "RSeQC"])
    report.close()

    return {"Report": dxpy.dxlink(report.get_id())}
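
# A standalone illustration of the weighted mean/median/standard deviation
# computed from the inner-distance histogram buckets above; values are made up.
def example_bucket_stats():
    import math
    buckets = [100, 200, 300]  # bucket centers (bp)
    counts = [2, 5, 3]         # reads observed per bucket
    total = sum(counts)
    mean = sum(b * c for b, c in zip(buckets, counts)) / float(total)
    running, median = 0, None
    for b, c in zip(buckets, counts):
        running += c
        if running >= total / 2.0 and median is None:
            median = b
    var = sum(((b - mean) ** 2) * c for b, c in zip(buckets, counts)) / float(total)
    return mean, median, int(math.sqrt(var) + 0.5)  # (210.0, 200, 70)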
def calc_contam(num_reads, mappings):
    # Fraction of input reads that mapped to the contaminant:
    # rows in the mappings table divided by total input reads.
    percent_mapped = float(dxpy.DXGTable(mappings).describe()['length']) / float(num_reads)
    return {"percent_mapped": percent_mapped}
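
# A worked example of the ratio above, with made-up numbers and no platform
# calls: 2,500,000 mapped rows out of 10,000,000 input reads gives 0.25.
def example_calc_contam():
    num_reads = 10000000
    mapped_rows = 2500000  # stands in for the mappings table's 'length'
    return float(mapped_rows) / float(num_reads)  # 0.25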
def main(**job_inputs):
    output = {}
    reportInput = {}

    run_shell("dx-spans-to-bed --output genes.bed " + job_inputs["gene_model"]["$dnanexus_link"])
    bed_id = dxpy.upload_local_file("genes.bed").get_id()

    mappings_id = job_inputs["mappings"]["$dnanexus_link"]

    # Get contaminant mapping started, if we're doing it.
    if "contaminants" in job_inputs:
        if not "original_reads" in job_inputs:
            raise dxpy.AppError("Original Reads must be input to calculate contamination levels. Please also supply the reads object that corresponds to these RNA-Seq mappings")

        name_input = []
        contam_input = []

        # Spawn a mappings job for each ContigSet.
        for contaminant in job_inputs['contaminants']:
            calc_job = map_contaminant(Reads=job_inputs['original_reads'], Contig=contaminant)
            name_input.append(dxpy.DXRecord(contaminant).describe()['name'])
            contam_input.append({"job": calc_job, "field": "percent_mapped"})

        reportInput['contam'] = contam_input
        reportInput['names'] = name_input
    else:
        reportInput['contam'] = None
        reportInput['names'] = None

    # Output mappings as SAM for the analysis modules.
    run_shell(" ".join(["dx-mappings-to-sam", "--discard_unmapped", "--output mappings.sam", mappings_id]))
    run_shell(" ".join(["samtools", "view", "-S", "-b", "mappings.sam", ">", "mappings.bam"]))

    bam_id = dxpy.upload_local_file("mappings.bam", wait_on_close=True).get_id()

    job1 = dxpy.new_dxjob({'BED_file': bed_id, "BAM_file": dxpy.dxlink(bam_id)}, "geneBody_coverage")

    # If the mappings are paired, do the inner distance calculation.
    if "chr2" in dxpy.DXGTable(mappings_id).get_col_names():
        job2 = dxpy.new_dxjob({'BED_file': bed_id, "BAM_file": dxpy.dxlink(bam_id)}, "inner_distance")
    else:
        job2 = None

    job3 = dxpy.new_dxjob({'BED_file': bed_id, "BAM_file": dxpy.dxlink(bam_id)}, "junction_annotation")
    job4 = dxpy.new_dxjob({"BAM_file": dxpy.dxlink(bam_id)}, "read_duplication")

    # Implement this one when we can request a large-RAM instance - requires 19GB for the human genome.
    job5 = dxpy.new_dxjob({'BED_file': bed_id, "BAM_file": dxpy.dxlink(bam_id)}, "read_distribution")
    # {"systemRequirements": {"instanceType": "dx_m2.2xlarge"}})

    reportInput['geneBody'] = {"job": job1.get_id(), "field": "results"}
    if job2 is not None:
        reportInput['inner_dist'] = {"job": job2.get_id(), "field": "results"}
    else:
        reportInput['inner_dist'] = None
    reportInput['junc_ann'] = {"job": job3.get_id(), "field": "results"}
    reportInput['read_dup'] = {"job": job4.get_id(), "field": "results"}
    reportInput['read_dist'] = {"job": job5.get_id(), "field": "results"}
    reportInput['mappings'] = job_inputs["mappings"]

    reportJob = dxpy.new_dxjob(reportInput, "generate_report")

    output['report'] = {"job": reportJob.get_id(), "field": "Report"}

    return output