def generate_read_from_sam(input_lines, keep_seq=True):
    """Convert a set of lines from a SAM file into an RNASeqRead object.

    Every line must be a tab-delimited SAM alignment record belonging to the
    same read (identical first column).

    Args:
        input_lines: iterable of SAM alignment lines sharing one read ID.
        keep_seq: when True, record each mate's sequence in ``seq1`` (mate 1
            or unpaired) / ``seq2`` (mate 2). Records flagged as
            reverse-strand are passed through ``fu.rc`` first, so the stored
            sequence is independent of the mapping strand.

    Returns:
        The populated RNASeqRead. ``primary`` and ``secondary`` map a
        position key to ``[chrom, strand, [(pos, cigar, mate), ...]]``. The
        key is the record's own position for mate 1 / unpaired reads and the
        mate position (column 8) for mate 2, so both mates of one alignment
        land under the same key. ``Nmap`` is left at 0 (the NH:i based
        consistency check is disabled).

    Raises:
        AssertionError: if the lines disagree on the read ID or on the
            sequence of a mate. Raised explicitly (not via ``assert``) so the
            check is still enforced under ``python -O``.
    """
    generated_read = RNASeqRead(
        ID=None, Nmap=0, primary={}, secondary={}, seq1=None, seq2=None)

    for line in input_lines:
        fields = line.rstrip().split('\t')

        read_id = fields[0]
        if not generated_read.ID:
            generated_read.ID = read_id
        if read_id != generated_read.ID:
            raise AssertionError(
                'ERROR: nonmatching IDs in input_lines:\n{}\t{}'.format(
                    generated_read.ID, read_id))

        # Interpret the bitwise SAM FLAG (column 2).
        # Bits not needed here: 0x4 read unmapped, 0x8 mate unmapped,
        # 0x20 mate reverse, 0x80 second in pair, 0x200 quality fail,
        # 0x400 PCR duplicate.
        flag = int(fields[1])
        is_paired = bool(flag & 0x1)
        pair_is_mapped = bool(flag & 0x2)
        read_reverse = bool(flag & 0x10)
        first_in_pair = bool(flag & 0x40)
        secondary = bool(flag & 0x100)
        supplementary = bool(flag & 0x800)

        chrom = fields[2]
        pos = int(fields[3])
        cigar = fields[5]
        pairpos = int(fields[7])
        seq = fields[9]

        generated_read.paired = is_paired

        # Unpaired reads are handled exactly like the first mate of a pair.
        is_mate1 = first_in_pair or not is_paired
        mate = 1 if is_mate1 else 2

        if keep_seq:
            # Store the + stranded read of the respective mate.
            oriented = fu.rc(seq) if read_reverse else seq
            slot = 'seq1' if is_mate1 else 'seq2'
            if not getattr(generated_read, slot):
                setattr(generated_read, slot, oriented)
            stored = getattr(generated_read, slot)
            if oriented != stored:
                raise AssertionError(
                    'ERROR: nonmatching sequence in input_lines:\n{}\t{}'.format(
                        stored, oriented))

        # Mate 1 reports its own strand; mate 2 reports the opposite one, so
        # both mates of a fragment agree on the fragment's strand.
        strand = '-' if read_reverse == is_mate1 else '+'

        if is_mate1:
            key = pos
        elif pair_is_mapped:
            # File mate 2 under its mate's position to join it with mate 1.
            key = pairpos
        else:
            # Mate 2 without the 0x2 bit set is not recorded as an alignment.
            continue

        if secondary or supplementary:
            target = generated_read.secondary
        else:
            target = generated_read.primary
        # chrom/strand are fixed by the first record seen for a given key.
        target.setdefault(key, [chrom, strand, []])[2].append(
            (pos, cigar, mate))

    return generated_read
s = '+' for t in group: positions = set( flatten([ list(range(a - 1, b)) for a, b in zip(transcripts[t].get_exon_start(), transcripts[t].get_exon_end()) ])) if args.FEATURE != 'transcript': positions = sorted(list(positions)) strand = transcripts[t].strand sequence = ''.join([genome[chrom][i] for i in positions]) if strand == '-': aa, ss, f = fu.longest_orf(fu.rc(sequence)) if len(aa) < MIN_ORF: continue ORFstart, ORFstop = ss if ORFstart == 0: ORFstart = 1 if ORFstop == len(positions): ORFstop = len(positions) - 1 if args.FEATURE == '5UTR': positions = positions[-ORFstart:] elif args.FEATURE == 'CDS': positions = positions[-ORFstop:-ORFstart] elif args.FEATURE == '3UTR': positions = positions[:-ORFstop] else:
transcript_coverage = np.empty(0, dtype='float32') if args.write_fasta: transcript_fasta = '' if strand not in ['+', '-']: print('# WARNING: {} is unstranded'.format(ID)) continue for a, b in zip(exon_starts, exon_ends): exon = coverage[strand][chrom][a - 1:b] transcript_coverage = np.append(transcript_coverage, exon) if args.write_fasta: transcript_fasta += genome[chrom][a - 1:b] if strand == '-': # Flip the strand for minus-stranded data transcript_coverage = transcript_coverage[::-1] if args.write_fasta: transcript_fasta = fu.rc(transcript_fasta) for line in vector_to_bedgraph_lines(transcript_coverage, ID): output_file.write(line + '\n') if args.write_fasta: fasta_outfile.write('>{}\n'.format(ID)) fasta_outfile.write('{}\n'.format(transcript_fasta)) output_file.close() if args.write_fasta: fasta_outfile.close()
feature, len(bed_features[feature]['positions']))) values = [ sum([ coverage[graph][bed_features[feature]['chrom']].get(i, 0) for i in bed_features[feature]['positions'] ]) for graph in ingraphs ] if args.g_content: nucleotides = [ genome[bed_features[feature]['chrom']][i] for i in bed_features[feature]['positions'] ] if bed_features[feature]['strand'] == '-': nucleotides = [i for i in fu.rc(''.join(nucleotides))] G = round( float(sum([i == 'G' for i in nucleotides])) / len(nucleotides), 3) g_file.write('{}\t{}\n'.format(feature, G)) if args.BED_OUT: outfile.write('\t'.join([ str(i) for i in [ bed_features[feature]['chrom'], min(bed_features[feature]['positions']), max(bed_features[feature]['positions']), feature, '.', bed_features[feature]['strand'], ] + values ]) + '\n')
# args.reference_GFF
# ))

#################################
### EVALUATE THE GIVEN SAMPLE ###
#################################
# Print every reference transcript as a FASTA record (header line '>ID'
# followed by the spliced sequence) on standard output.
for ID in ref_IDs:
    input_transcript = ref_transcripts[ID]
    chrom = input_transcript.chrom
    strand = input_transcript.strand

    # Gather information about exons
    exon_starts = input_transcript.get_exon_start()
    exon_ends = input_transcript.get_exon_end()
    if len(exon_starts) != len(exon_ends):
        print('# WARNING: inconsistent exon structure with {}'.format(ID))
        continue

    # Make a list of all nucleotide positions in the input transcript.
    # Exon coordinates are treated as 1-based inclusive, so each exon
    # contributes the 0-based indices a-1 .. b-1.
    positions = flatten(
        [list(range(a - 1, b)) for a, b in zip(exon_starts, exon_ends)])
    length = len(positions)  # position count before the bounds filter
    # Keep only indices that exist on the chromosome. Index 0 is the first
    # base and must be kept; negative indices would silently wrap around.
    # NOTE(review): assumes chromosomes[chrom] is the chromosome length.
    positions = [i for i in positions if 0 <= i < chromosomes[chrom]]

    # Get transcript's nucleotide sequence from the FASTA file
    sequence = ''.join(genome[chrom][i] for i in positions)
    if strand == '-':
        # Report minus-strand transcripts via fu.rc (presumably the
        # reverse complement -- confirm against the fu module).
        sequence = fu.rc(sequence)

    print('>{}\n{}'.format(ID, sequence))