예제 #1
0
def generate_read_from_sam(input_lines, keep_seq=True):
    """Convert a set of lines from a SAM file into an RNASeqRead object.

    Every line in ``input_lines`` must describe the same read (same first
    column). Each alignment is filed under ``generated_read.primary`` or, if
    the SAM flag marks it secondary or supplementary, under
    ``generated_read.secondary``. Entries have the form
    ``{key: [chrom, strand, [(pos, cigar, mate), ...]]}`` where:

    * mate 1 (or a single-end read) is keyed by its own position and its
      strand is reported as-is;
    * mate 2 is keyed by the mate position column (SAM column 8, PNEXT) so
      it lands in the same entry as its mate 1, and its strand is inverted
      so both mates report the strand of the fragment. Mate 2 alignments
      whose "properly paired" flag bit is unset are not recorded.

    When a key already exists only the ``(pos, cigar, mate)`` tuple is
    appended; the stored chrom and strand are left untouched.

    Args:
        input_lines: iterable of tab-delimited SAM alignment lines.
        keep_seq: if True, store the sequence of each mate (reverse
            complemented when the alignment is on the reverse strand) in
            ``seq1`` / ``seq2`` and check that later lines agree with it.

    Returns:
        The populated RNASeqRead. ``Nmap`` is left at 0 (the NH:i tag is not
        parsed) and ``paired`` reflects the flag of the last line processed.

    Raises:
        AssertionError: if the lines disagree on read ID or, with
            ``keep_seq``, on the sequence of a mate.
    """
    generated_read = RNASeqRead(ID=None,
                                Nmap=0,
                                primary={},
                                secondary={},
                                seq1=None,
                                seq2=None)
    for line in input_lines:
        fields = line.rstrip().split('\t')
        ID = fields[0]
        if not generated_read.ID:
            generated_read.ID = ID
        assert ID == generated_read.ID, 'ERROR: nonmatching IDs in input_lines:\n{}\t{}'.format(
            generated_read.ID, ID)

        # Decode the SAM FLAG bit field with masks rather than by slicing a
        # zero-padded binary string.
        flag = int(fields[1])
        is_paired = bool(flag & 0x1)
        pair_is_mapped = bool(flag & 0x2)
        read_reverse = bool(flag & 0x10)
        first_in_pair = bool(flag & 0x40)
        is_secondary = bool(flag & 0x100)
        is_supplementary = bool(flag & 0x800)

        chrom = fields[2]
        pos = int(fields[3])
        pairpos = int(fields[7])
        cigar = fields[5]
        seq = fields[9]

        generated_read.paired = is_paired

        # Single-end reads are handled exactly like the first mate of a pair.
        is_mate1 = first_in_pair or not is_paired

        if keep_seq:
            # Store the + stranded read of the respective mate.
            plus_seq = fu.rc(seq) if read_reverse else seq
            if is_mate1:
                if not generated_read.seq1:
                    generated_read.seq1 = plus_seq
                stored_seq = generated_read.seq1
            else:
                if not generated_read.seq2:
                    generated_read.seq2 = plus_seq
                stored_seq = generated_read.seq2
            assert plus_seq == stored_seq, 'ERROR: nonmatching sequence in input_lines:\n{}\t{}'.format(
                stored_seq, plus_seq)

        # Mate 1 reports its own strand; mate 2 reports the opposite one, so
        # '-' results exactly when the two booleans agree.
        strand = '-' if read_reverse == is_mate1 else '+'

        if is_mate1:
            key, mate = pos, 1
        else:
            if not pair_is_mapped:
                continue
            key, mate = pairpos, 2

        if is_secondary or is_supplementary:
            alignments = generated_read.secondary
        else:
            alignments = generated_read.primary

        if key in alignments:
            alignments[key][2].append((pos, cigar, mate))
        else:
            alignments[key] = [chrom, strand, [(pos, cigar, mate)]]

    return generated_read
예제 #2
0
                s = '+'

        for t in group:
            positions = set(
                flatten([
                    list(range(a - 1, b))
                    for a, b in zip(transcripts[t].get_exon_start(),
                                    transcripts[t].get_exon_end())
                ]))

            if args.FEATURE != 'transcript':
                positions = sorted(list(positions))
                strand = transcripts[t].strand
                sequence = ''.join([genome[chrom][i] for i in positions])
                if strand == '-':
                    aa, ss, f = fu.longest_orf(fu.rc(sequence))
                    if len(aa) < MIN_ORF:
                        continue
                    ORFstart, ORFstop = ss
                    if ORFstart == 0:
                        ORFstart = 1
                    if ORFstop == len(positions):
                        ORFstop = len(positions) - 1

                    if args.FEATURE == '5UTR':
                        positions = positions[-ORFstart:]
                    elif args.FEATURE == 'CDS':
                        positions = positions[-ORFstop:-ORFstart]
                    elif args.FEATURE == '3UTR':
                        positions = positions[:-ORFstop]
                else:
예제 #3
0
    transcript_coverage = np.empty(0, dtype='float32')
    if args.write_fasta:
        transcript_fasta = ''

    if strand not in ['+', '-']:
        print('# WARNING: {} is unstranded'.format(ID))
        continue

    for a, b in zip(exon_starts, exon_ends):
        exon = coverage[strand][chrom][a - 1:b]
        transcript_coverage = np.append(transcript_coverage, exon)
        if args.write_fasta:
            transcript_fasta += genome[chrom][a - 1:b]

    if strand == '-':  # Flip the strand for minus-stranded data
        transcript_coverage = transcript_coverage[::-1]
        if args.write_fasta:
            transcript_fasta = fu.rc(transcript_fasta)

    for line in vector_to_bedgraph_lines(transcript_coverage, ID):
        output_file.write(line + '\n')

    if args.write_fasta:
        fasta_outfile.write('>{}\n'.format(ID))
        fasta_outfile.write('{}\n'.format(transcript_fasta))

# Close the output handle; the FASTA handle is only closed when
# args.write_fasta is set (the same condition that guards every write to it
# above).
output_file.close()
if args.write_fasta:
    fasta_outfile.close()
        feature, len(bed_features[feature]['positions'])))

    values = [
        sum([
            coverage[graph][bed_features[feature]['chrom']].get(i, 0)
            for i in bed_features[feature]['positions']
        ]) for graph in ingraphs
    ]

    if args.g_content:
        nucleotides = [
            genome[bed_features[feature]['chrom']][i]
            for i in bed_features[feature]['positions']
        ]
        if bed_features[feature]['strand'] == '-':
            nucleotides = [i for i in fu.rc(''.join(nucleotides))]

        G = round(
            float(sum([i == 'G' for i in nucleotides])) / len(nucleotides), 3)
        g_file.write('{}\t{}\n'.format(feature, G))
    if args.BED_OUT:
        outfile.write('\t'.join([
            str(i) for i in [
                bed_features[feature]['chrom'],
                min(bed_features[feature]['positions']),
                max(bed_features[feature]['positions']),
                feature,
                '.',
                bed_features[feature]['strand'],
            ] + values
        ]) + '\n')
예제 #5
0
# args.reference_GFF
# ))

#################################
### EVALUATE THE GIVEN SAMPLE ###
#################################

# Print every reference transcript as a FASTA record (header line '>ID'
# followed by the spliced exon sequence) on standard output.
for ID in ref_IDs:
    input_transcript = ref_transcripts[ID]
    chrom = input_transcript.chrom
    strand = input_transcript.strand
    # Gather information about exons
    exon_starts = input_transcript.get_exon_start()
    exon_ends = input_transcript.get_exon_end()
    if len(exon_starts) != len(exon_ends):
        print('# WARNING: inconsistent exon structure with {}'.format(ID))
        continue

    # Make a list of all nucleotide positions in the input transcript.
    # Exon coordinates are treated as 1-based inclusive, so range(a - 1, b)
    # yields the matching 0-based indices.
    positions = flatten(
        [list(range(a - 1, b)) for a, b in zip(exon_starts, exon_ends)])
    length = len(positions)
    # Keep only indices that fall inside the chromosome. Index 0 is the
    # first nucleotide and must be kept; only negative indices (which would
    # silently wrap around) and indices past the end are dropped.
    # NOTE(review): assumes chromosomes[chrom] is the chromosome length.
    positions = [i for i in positions if 0 <= i < chromosomes[chrom]]

    # Get transcript's nucleotide sequence from the FASTA file
    sequence = ''.join([genome[chrom][i] for i in positions])
    if strand == '-':
        # Minus-strand transcripts are reported as the reverse complement.
        sequence = fu.rc(sequence)

    print('>{}\n{}'.format(ID, sequence))