Пример #1
0
def replace_genome(filename, tp, outfile1, outfile2, BWA):
    '''
    Write two base-substituted copies of a FASTA file and index both with bwa.

    Header lines (starting with '>') are copied unchanged. Every other line
    is upper-cased; in outfile1 every tp[0] is replaced by tp[1], in outfile2
    every BT[tp[0]] is replaced by BT[tp[1]] (BT is a module-level mapping,
    presumably the base-complement table -- confirm against its definition).

    filename  -- input genome, opened through shell.IsGzipFile
    tp        -- two-item sequence (from_base, to_base)
    outfile1  -- path of the first converted genome
    outfile2  -- path of the second converted genome
    BWA       -- path of the bwa executable

    Both converted FASTA files are deleted once `bwa index` succeeded, so
    only the index files written by bwa remain. Exits the process with
    status 1 if either indexing run fails. Returns 0.
    '''
    src = shell.IsGzipFile(filename)
    try:
        with open(outfile1, 'wt') as out1, open(outfile2, 'wt') as out2:
            for line in src:
                if line[0] == '>':
                    out1.write(line)
                    out2.write(line)
                else:
                    upper = line.upper()
                    out1.write(upper.replace(tp[0], tp[1]))
                    out2.write(upper.replace(BT[tp[0]], BT[tp[1]]))
    finally:
        src.close()

    # Run both indexing jobs in parallel. Their output is never read, so it
    # is discarded instead of piped: an unread PIPE combined with wait()
    # blocks forever once the child has filled the pipe buffer.
    jobs = [
        subprocess.Popen([BWA, 'index', path],
                         stdout=subprocess.DEVNULL,
                         stderr=subprocess.DEVNULL)
        for path in (outfile1, outfile2)
    ]
    codes = [job.wait() for job in jobs]
    if any(code != 0 for code in codes):
        shell.eprint('[' + PROGRAM + '] Error: bwa index failed')
        sys.exit(1)

    # Same effect as `rm -f`, without going through a shell.
    for path in (outfile1, outfile2):
        try:
            os.remove(path)
        except OSError:
            pass

    return 0
Пример #2
0
def bwaaln(tp, genomefile, fq1, fq2, phred, outdir, config):
    '''
    Align reads with `bwa aln` + `bwa sampe`/`samse` and store them as BAM.

    tp         -- key selecting the option sets config['bwa']['aln'][tp] and
                  config['bwa']['sampe'][tp]; also the output name prefix
    genomefile -- bwa-indexed reference
    fq1, fq2   -- FASTQ files; fq2 == '' selects single-end mode (samse)
    phred      -- 33 or 64 (64 adds `-I` to bwa aln); anything else aborts
    outdir     -- output directory; it is concatenated directly with file
                  names, so it has to end with a path separator
    config     -- nested option mapping; empty-string values are skipped for
                  `aln`, all sampe options are passed through as given

    Exits with status 1 if any `bwa aln` run fails. The intermediate .sai
    files are removed afterwards. Returns the path of the written BAM file.
    '''
    if phred == 33:
        args = []
    elif phred == 64:
        args = ['-I']
    else:
        shell.eprint('[' + PROGRAM + '] Error: phred error')
        sys.exit(1)
    for k, v in config['bwa']['aln'][tp].items():
        if v != '':
            args += [k, v]

    paired = fq2 != ''
    sai1 = outdir + os.path.basename(fq1) + '_1.sai'
    sai2 = None
    if paired:
        sai2 = outdir + os.path.basename(fq2) + '_2.sai'

    # Both mates are aligned concurrently.
    aln_jobs = [
        subprocess.Popen([BWA, 'aln'] + args + [genomefile, fq1, '-f', sai1],
                         stdout=subprocess.DEVNULL,
                         stderr=subprocess.DEVNULL)
    ]
    if paired:
        aln_jobs.append(
            subprocess.Popen(
                [BWA, 'aln'] + args + [genomefile, fq2, '-f', sai2],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL))
    # Wait for every job (list, not generator) and fail if ANY of them
    # failed; in single-end mode there is no second job to inspect.
    codes = [job.wait() for job in aln_jobs]
    if any(code != 0 for code in codes):
        shell.eprint('[' + PROGRAM + '] Error: bwa aln error')
        sys.exit(1)

    sampe_args = []
    for k, v in config['bwa']['sampe'][tp].items():
        sampe_args += [k, v]
    if paired:
        cmd = [BWA, 'sampe'] + sampe_args + [genomefile, sai1, sai2, fq1, fq2]
    else:
        cmd = [BWA, 'samse', genomefile, sai1, fq1]
    p = subprocess.Popen(cmd,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.DEVNULL)

    # Stream the SAM output of bwa straight into a BAM file.
    bam_path = outdir + tp + '_aln.bam'
    infile = pysam.AlignmentFile(p.stdout, 'r')
    try:
        oufile = pysam.AlignmentFile(bam_path, 'wb', template=infile)
        try:
            for r in infile:
                oufile.write(r)
            p.wait()
        finally:
            oufile.close()
    finally:
        infile.close()

    # Same effect as `rm -f`, without going through a shell.
    for sai in (sai1, sai2):
        if sai is None:
            continue
        try:
            os.remove(sai)
        except OSError:
            pass

    return bam_path
Пример #3
0
def spawnSourceFiles(entries, isTarget=False, systemHeaders=False):
    """Dispatch every path in *entries* to the matching spawn helper.

    Regular files go to spawnSourceFile, directories to spawnSourceDirectory;
    both receive *isTarget* and *systemHeaders* unchanged. Any other path is
    reported through eprint and skipped. A falsy *entries* is a no-op.
    """
    for entry in entries or ():
        if os.path.isfile(entry):
            handler = spawnSourceFile
        elif os.path.isdir(entry):
            handler = spawnSourceDirectory
        else:
            eprint("Error: couldn't process path '%s'." % entry)
            continue
        handler(entry, isTarget, systemHeaders)
Пример #4
0
 def _transfer_cigar(cigartuples, start, query, qual):
     '''
        Judge the extend end of scaffold by read if out, and return  
     '''
     #shell.eprint(cigartuples)
     d = {}  # record handle of each cigar
     offset = 0  # the offset on read
     i = start  # i for Delete position in genome
     #shell.eprint(query)
     for operation, num in cigartuples:
         #shell.eprint(offset)
         #shell.eprint(query[offset: offset+num])
         if operation == 0:  ## M
             try:
                 d['M'].append(
                     (start, start + num, query[offset:offset + num],
                      qual[offset:offset + num]))
             except KeyError:
                 d['M'] = [(start, start + num, query[offset:offset + num],
                            qual[offset:offset + num])]
             #shell.eprint(d)
             start += num - 1
             i += num
             offset += num
         elif operation == 1:  ## I
             try:
                 d['I'].append((start, query[offset:offset + num],
                                qual[offset:offset + num]))
             except KeyError:
                 d['I'] = [(start, query[offset:offset + num],
                            qual[offset:offset + num])]
             start += 1
             offset += num
         elif operation == 2:  ## D
             try:
                 d['D'].append((i, num))
             except KeyError:
                 d['D'] = [(i, num)]
             start += num + 1
             i += num
         elif operation == 3:  ## N
             start += num + 1
             i += num
             offset += num
         elif operation == 4:  ## S
             offset += num
         else:
             shell.eprint(
                 '[' + PROGRAM +
                 '] Error: the cigar string may have unsupported char, should only have [MISDN]'
             )
     return start, d
Пример #5
0
def bamsortindex(bamfile):
    '''
    Sort and index a BAM file with pysam, then delete the unsorted input.

    The output name is the input name with its last dot-separated component
    replaced by 'sorted.bam'. A sort failure terminates the process with
    status 1; an index failure is only reported. Returns the sorted BAM path.
    '''
    prefix = '.'.join(bamfile.split('.')[:-1])
    sorted_bam = prefix + '.sorted.bam'
    # `except Exception` instead of a bare except, so that Ctrl-C and
    # SystemExit are not reported as sort/index errors.
    try:
        pysam.sort('-o', sorted_bam, bamfile)
    except Exception:
        shell.eprint('[' + PROGRAM + '] Error: ' + bamfile + ' sort error')
        sys.exit(1)
    try:
        pysam.index(sorted_bam)
    except Exception:
        shell.eprint('[' + PROGRAM + '] Error: ' + bamfile + ' index error')
    # Same effect as `rm -f`, without going through a shell.
    try:
        os.remove(bamfile)
    except OSError:
        pass
    return sorted_bam
Пример #6
0
 def _trim(cigartuples, t, start, seq, qual):
     '''
         Trim the reads' 5' and 3' end.
     '''
     newcigarlist, flag, tr = [], 1, 0  ## tr
     for operation, num in cigartuples:
         if flag:
             if operation in (0, 1, 4):  ## 0 for M, 1 for I, 4 for S
                 if num + tr > t[0]:
                     #shell.eprint(num, tr, t[0])
                     newcigarlist.append((operation, num + tr - t[0]))
                     if operation == 0 and len(t) == 2:
                         start += t[0] - tr
                     flag = 0
                 else:
                     tr += num
                     if operation == 0 and len(t) == 2:
                         start += num
                     #shell.eprint(newcigarlist, 'ok')
             elif operation in (2, 3):  ## 2 for D, 3 for N
                 if len(t) == 2:
                     start += num
             else:
                 shell.eprint(
                     '[' + PROGRAM +
                     '] Error: the cigar string may have unsupported char, should only have [MISDN]'
                 )
         else:
             newcigarlist.append((operation, num))
         #shell.eprint(newcigarlist, t)
     if len(t) == 2:
         return _trim(newcigarlist[::-1], [t[1]], start, seq[t[0]:-t[1]],
                      qual[t[0]:-t[1]])
     elif len(t) == 1:
         ##
         if newcigarlist[0][0] == 1:
             newcigarlist[0] = (4, newcigarlist[0][1])
         elif newcigarlist[0][0] in (2, 3):
             newcigarlist = newcigarlist[1:]
         ##
         if newcigarlist[-1][0] == 1:
             newcigarlist[-1] = (4, newcigarlist[-1][1])
         elif newcigarlist[-1][0] in (2, 3):
             newcigarlist = newcigarlist[:-1]
         return newcigarlist[::-1], start, seq, qual
def getDotPicture(graph, engine):
    """Render *graph* (dot source) to PNG with *engine* and return the bytes.

    graph  -- dot source, either bytes or text (text is encoded as UTF-8)
    engine -- name/path of the layout executable placed at the front of the
              command line handed to runCommand

    A missing executable is only reported through eprint; the command is
    still attempted, as before. Both temporary files are removed before
    returning, whether or not rendering succeeded.
    """
    if not executableExists(engine):
        eprint("No 'dot' executable!")

    # NamedTemporaryFile is opened in binary mode, so text has to be encoded.
    if not isinstance(graph, bytes):
        graph = graph.encode("utf-8")

    dotFileName = None
    pngFileName = None
    try:
        with tempfile.NamedTemporaryFile(delete=False) as dotFile:
            dotFileName = os.path.abspath(dotFile.name)
            dotFile.write(graph)

        # Only the name is needed; the engine writes the content.
        with tempfile.NamedTemporaryFile(delete=False) as pngFile:
            pngFileName = os.path.abspath(pngFile.name)

        runCommand("%s -Tpng -o %s %s" % (engine, pngFileName, dotFileName))

        # PNG is binary data: text mode would decode or translate it.
        with open(pngFileName, "rb") as graphFile:
            return graphFile.read()
    finally:
        # delete=False means nobody else cleans these up.
        for tmpName in (dotFileName, pngFileName):
            if tmpName is None:
                continue
            try:
                os.remove(tmpName)
            except OSError:
                pass
Пример #8
0
def get_config():
    '''
    Parse sys.argv for the sam2base command line.

    Options:
      -h / --help          print the help text and exit with status 0
      -t / --trim  A,B     two comma-separated integers, stored as the tuple
                           config['sam2base']['trim']
      -p / --prefix NAME   stored as config['sam2base']['output']; must not
                           end with '.'
    Exactly two positional arguments are required: genome file and BAM file.

    Returns (config, genomefile, bamfile). Invalid input is reported through
    shell.eprint and terminates the process (status 2 for getopt errors,
    1 otherwise). Running without any argument prints the help text.
    '''
    try:
        optlist, args = getopt.getopt(sys.argv[1:], 'hp:t:',
                                      ['help', 'trim=', 'prefix='])
    except getopt.GetoptError as e:
        shell.eprint('[' + PROGRAM + '] Error: ' + str(e))
        sys.exit(2)

    if not optlist and not args:
        print_help()
        sys.exit(0)

    config = {'sam2base': {}}
    for opt, value in optlist:
        if opt in ('-h', '--help'):
            print_help()
            sys.exit(0)
        elif opt in ('-t', '--trim'):
            parts = value.split(',')
            try:
                config['sam2base']['trim'] = (int(parts[0]), int(parts[1]))
            except (ValueError, IndexError):
                # IndexError: only one number was given (no comma)
                shell.eprint('[' + PROGRAM +
                             '] Error: --trim parameter should be integer')
                sys.exit(1)
        elif opt in ('-p', '--prefix'):
            # getopt reports the short option WITH its dash ('-p')
            if value.endswith('.'):
                shell.eprint(
                    '[' + PROGRAM +
                    '] Error: --prefix parameter should not end with \'.\'')
                sys.exit(1)
            else:
                config['sam2base']['output'] = value
        else:
            # Not reachable with the option spec above; raised explicitly so
            # the check also survives `python -O`.
            raise AssertionError('unhandled option')
    try:
        genomefile, bamfile = args
    except ValueError:
        shell.eprint('[' + PROGRAM +
                     '] Error: only two input files should be provided')
        sys.exit(1)

    return config, genomefile, bamfile
Пример #9
0
def soapnuke(tp, fq1, fq2, phred, config):
    '''
    Filter reads with `SOAPnuke filter` and return the cleaned file names.

    tp       -- key selecting config['soapnuke']['filter'][tp]; also the
                output sub-directory ('soapnuke/' + tp)
    fq1, fq2 -- FASTQ inputs; fq2 == '' selects single-end mode
    phred    -- 33 (passes '-Q 2') or 64 (passes '-Q 1'); anything else
                aborts
    config   -- nested option mapping; empty-string values are skipped

    The combined stdout/stderr of the tool is written to the file 'log' in
    the current directory; on a non-zero exit status that log is echoed
    through shell.eprint and the process exits with status 1.

    Returns (clean_fq1, clean_fq2, 33); clean_fq2 stays '' in single-end
    mode. The tool is run with '-G', so the returned phred value is always
    33 (presumably -G converts qualities to phred33 -- confirm against the
    SOAPnuke manual).
    '''
    if phred == 33:
        args = ['-Q', '2']
    elif phred == 64:
        args = ['-Q', '1']
    else:
        shell.eprint('[' + PROGRAM + '] Error: phred error')
        sys.exit(1)

    outdir = 'soapnuke/' + tp
    if fq2 == '':
        clean1 = os.path.basename(fq1) + '.clean.fq.gz'
        args += ['-1', fq1, '-o', outdir, '-C', clean1]
        fq1 = outdir + '/' + clean1
    else:
        clean1 = os.path.basename(fq1) + '_1.clean.fq.gz'
        clean2 = os.path.basename(fq2) + '_2.clean.fq.gz'
        args += [
            '-1', fq1, '-2', fq2, '-o', outdir, '-C', clean1, '-D', clean2
        ]
        fq1, fq2 = outdir + '/' + clean1, outdir + '/' + clean2

    for k, v in config['soapnuke']['filter'][tp].items():
        if v != '':
            args += [k, v]

    # The log handle is closed as soon as the tool has finished.
    with open('log', 'w') as log:
        p = subprocess.Popen([SOAPNUKE, 'filter'] + args + ['-G', '-5', '1'],
                             stdout=log,
                             stderr=subprocess.STDOUT)
        p.wait()
    if p.returncode != 0:
        shell.eprint('[' + PROGRAM + '] Error: soapnuke run error')
        with open('log', 'r') as log:
            shell.eprint(log.read())
        sys.exit(1)

    return fq1, fq2, 33
Пример #10
0
def sam2base(genomefile, sortbamfile, trim=(0, 0), output='', suffix='.sb.gz'):
    '''
    Convert a sorted, indexed BAM into a gzip-compressed single-base (sb)
    table.

    genomefile  -- reference, iterated with shell.Fa2Geno as
                   (name, sequence, length) triples
    sortbamfile -- sorted and indexed BAM; the name must end with '.bam'
    trim        -- (5' bases, 3' bases) removed from every read; the pair is
                   swapped for reverse-strand reads (flag 16). Negative
                   values abort with status 255.
    output      -- output prefix; defaults to sortbamfile
    suffix      -- appended to the prefix

    One line is written per reference position that has at least one
    matched base: scaffold, position, reference base, observed bases
    ('[M]:..;[I]:..;[D]:..'), the corresponding qualities, and the number of
    matched bases.

    A ValueError while fetching/processing reads is reported as "input
    should be sorted and indexed bam"; the partial output is removed and the
    process exits with status 1. Returns the output path.
    '''
    def _trim(cigartuples, t, start, seq, qual):
        '''
        Trim t[0] read bases from the front and (when t has two items) t[1]
        from the back; returns (cigar, start, seq, qual). The back pass is a
        recursive call on the reversed CIGAR with the one-item list [t[1]].
        '''
        newcigarlist, flag, tr = [], 1, 0  # tr: read bases trimmed so far
        for operation, num in cigartuples:
            if not flag:
                # trimming finished: copy the remaining operations
                newcigarlist.append((operation, num))
                continue
            if operation in (0, 1, 4):  ## 0 for M, 1 for I, 4 for S
                if num + tr > t[0]:
                    newcigarlist.append((operation, num + tr - t[0]))
                    if operation == 0 and len(t) == 2:
                        start += t[0] - tr
                    flag = 0
                else:
                    tr += num
                    if operation == 0 and len(t) == 2:
                        start += num
            elif operation in (2, 3):  ## 2 for D, 3 for N
                if len(t) == 2:
                    start += num
            else:
                shell.eprint(
                    '[' + PROGRAM +
                    '] Error: the cigar string may have unsupported char, should only have [MISDN]'
                )
        if len(t) == 2:
            # Explicit end index: seq[a:-0] would be empty when nothing is
            # trimmed from the back.
            seq_end = max(len(seq) - t[1], 0)
            qual_end = max(len(qual) - t[1], 0)
            return _trim(newcigarlist[::-1], [t[1]], start,
                         seq[t[0]:seq_end], qual[t[0]:qual_end])
        elif len(t) == 1:
            # Still in reversed order here. An insertion at either end
            # becomes a soft clip, a D/N at either end is dropped. The list
            # is empty when the read was trimmed away completely.
            if newcigarlist:
                if newcigarlist[0][0] == 1:
                    newcigarlist[0] = (4, newcigarlist[0][1])
                elif newcigarlist[0][0] in (2, 3):
                    newcigarlist = newcigarlist[1:]
            if newcigarlist:
                if newcigarlist[-1][0] == 1:
                    newcigarlist[-1] = (4, newcigarlist[-1][1])
                elif newcigarlist[-1][0] in (2, 3):
                    newcigarlist = newcigarlist[:-1]
            return newcigarlist[::-1], start, seq, qual

    def _transfer_cigar(cigartuples, start, query, qual):
        '''
        Walk the CIGAR and return (last_position, d) where d maps
        'M' -> [(first, end_exclusive, bases, quals)],
        'I' -> [(pos, bases, quals)] and 'D' -> [(pos, length)].
        '''
        d = {}  # collected segments per operation letter
        offset = 0  # offset on the read
        i = start  # position used for deletions
        for operation, num in cigartuples:
            if operation == 0:  ## M
                d.setdefault('M', []).append(
                    (start, start + num, query[offset:offset + num],
                     qual[offset:offset + num]))
                # start stays on the LAST matched base; the following
                # operation adds the missing +1.
                start += num - 1
                i += num
                offset += num
            elif operation == 1:  ## I
                d.setdefault('I', []).append(
                    (start, query[offset:offset + num],
                     qual[offset:offset + num]))
                start += 1
                offset += num
            elif operation == 2:  ## D
                d.setdefault('D', []).append((i, num))
                start += num + 1
                i += num
            elif operation == 3:  ## N
                # A reference skip consumes no read bases, so the read
                # offset must not move.
                start += num + 1
                i += num
            elif operation == 4:  ## S
                offset += num
            else:
                shell.eprint(
                    '[' + PROGRAM +
                    '] Error: the cigar string may have unsupported char, should only have [MISDN]'
                )
        return start, d

    def _update(seq, dw, **kw):
        '''
        Merge the segments of one read (as returned by _transfer_cigar) into
        the per-position table dw and return dw. Matched bases are
        concatenated per position; insertions and deletions are
        comma-separated, deletions get '0' as quality placeholder.
        '''
        for first, end, bases, quals in kw.get('M', ()):
            for j, pos in enumerate(range(first, end)):
                site = dw.setdefault(pos, {})
                if 'M' in site:
                    site['M'][0] += bases[j]
                    site['M'][1] += quals[j]
                else:
                    site['M'] = [bases[j], quals[j]]
        for pos, bases, quals in kw.get('I', ()):
            site = dw.setdefault(pos, {})
            if 'I' in site:
                site['I'][0] += ',' + bases
                site['I'][1] += ',' + quals
            else:
                site['I'] = [bases, quals]
        for pos, length in kw.get('D', ()):
            deleted = ''.join(seq[pos + k - 1] for k in range(length))
            placeholder = '0' * length
            site = dw.setdefault(pos, {})
            if 'D' in site:
                site['D'][0] += ',' + deleted
                site['D'][1] += ',' + placeholder
            else:
                site['D'] = [deleted, placeholder]
        return dw

    def _write2gzipfile(dw, fw):
        '''
        Write every position of dw that has matched bases to fw (sorted by
        position) and return a fresh, empty table. Uses the scaffold name
        `sca` and sequence `seq` of the enclosing loop.
        '''
        wr = []
        for key in sorted(dw):
            site = dw[key]
            if 'M' not in site:
                continue
            present = [op for op in ('M', 'I', 'D') if op in site]
            bases = ';'.join('[' + op + ']:' + site[op][0] for op in present)
            quals = ';'.join('[' + op + ']:' + site[op][1] for op in present)
            wr.append('\t'.join((sca, str(key), seq[key - 1], bases, quals,
                                 str(len(site['M'][0])))) + '\n')
        fw.write(''.join(wr))
        return {}

    if not sortbamfile.endswith('.bam'):
        shell.eprint('[' + PROGRAM +
                     '] Error: input should be sorted and indexed bam')
        sys.exit(1)
    if trim[0] < 0 or trim[1] < 0:
        shell.eprint(
            'Are you serious ? --trim should not be negative integer.')
        sys.exit(255)
    if output == '':
        output = sortbamfile
    out_path = output + suffix

    # Process the genome scaffold by scaffold and fetch its reads.
    failed = False
    with gzip.open(out_path, 'wt') as fw, \
            pysam.AlignmentFile(sortbamfile, 'rb') as f:
        for sca, seq, seqlen in shell.Fa2Geno(genomefile):
            dw, newend = {}, 1
            try:
                for read in f.fetch(contig=sca):
                    # No earlier read reaches this one: flush the table.
                    if 1 < newend < read.reference_start + 1:
                        dw = _write2gzipfile(dw, fw)
                    query, quality = read.query_sequence, read.qual
                    cigar = read.cigartuples
                    # Records without sequence, qualities or CIGAR cannot
                    # contribute any base.
                    if query is None or quality is None or cigar is None:
                        continue
                    if len(query) != len(quality):
                        continue
                    if trim == (0, 0):
                        newstart, d = _transfer_cigar(
                            cigar, read.reference_start + 1, query, quality)
                    else:
                        # Reverse-strand reads are stored reverse
                        # complemented, so their 5' end is on the right.
                        # A per-read local is used: re-binding `trim` itself
                        # flipped it for every following read as well.
                        read_trim = trim[::-1] if read.flag & 16 else trim
                        newstart, d = _transfer_cigar(
                            *_trim(list(cigar), read_trim,
                                   read.reference_start + 1, query, quality))
                    if seqlen < newstart:
                        continue
                    if newstart > newend:
                        newend = newstart
                    dw = _update(seq, dw, **d)
                _write2gzipfile(dw, fw)
            except ValueError:
                failed = True
                break

    if failed:
        shell.eprint('[' + PROGRAM +
                     '] Error: input should be sorted and indexed bam')
        # The output is closed by now; drop the partial file (like `rm -f`).
        try:
            os.remove(out_path)
        except OSError:
            pass
        sys.exit(1)

    return out_path
Пример #11
0
def merge_RES(Config,
              Genomefile,
              Phred_DNA=33,
              Phred_RNA=33,
              Qual_cutoff=30,
              HomoPrior=0.99,
              Rate=2,
              Method='Bayesian',
              Ploidy=2,
              Intron=None,
              DNAdepth=10,
              RNAdepth=3,
              Bayesian_Posterior_Probability=0.95,
              FDR_DNA_Heterozygosis=0.05,
              Non_Ref_BaseCount=0,
              Paralogous_D=1,
              Intronic=6,
              Homopolymer=1,
              out_path='./'):
    '''
        merge the result of scanner and check
    '''

    BT = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}
    BASE = ('A', 'C', 'G', 'T')

    def read(f, s, table, several_tp):
        for line in f:
            if line[0] == '#':
                continue
            fd = line.strip().split()
            key1, key2, key3, key4, key5 = (fd[0], fd[1], fd[2]), (s, 'DNA'), (
                s, 'RNA'), (s, 'SNPvalue'), (s, 'Editvalue')
            try:
                table[key1].update({key2: fd[5]})
            except KeyError:
                table[key1] = {key2: fd[5]}
            table[key1][key3] = fd[9]
            if Method == 'Bayesian':
                table[key1][key4] = (fd[6], fd[7])
            else:
                table[key1][key4] = fd[6]
            table[key1][key5] = fd[12]
            table[key1]['gbase'] = fd[3].upper()
            try:
                if table[key1]['type'] != fd[10]:
                    several_tp.add(key1)
            except KeyError:
                table[key1]['type'] = fd[10]
        return 0

    def deleteintron(sca, l, table):
        for i in l:
            try:
                del table[(sca, str(i), '+')]
            except KeyError:
                pass
            try:
                del table[(sca, str(i), '-')]
            except KeyError:
                pass
            try:
                del table[(sca, str(i), '.')]
            except KeyError:
                pass
        return 0

    def caculus(f, table, errorRate, st):
        for line in f:
            fd = line.strip().split()
            ref = fd[2]
            for strand in st:
                key1 = (fd[0], fd[1], strand)
                if key1 in table:
                    key2 = (s, 'RNA')
                    if key2 not in table[key1]:
                        editV = (s, 'Editvalue')
                        if fd[5] == 0:
                            table[key1][key2] = '0,0,0,0'
                            table[key1][editV] = '1'
                        else:
                            seq, qual = shell._DealBasequality(
                                fd[3][4:4 + int(fd[-1])].upper(),
                                fd[4][4:4 + int(fd[-1])], Phred_RNA)
                            l = [seq.count(i) for b in BASE]
                            table[key1][key2] = '{},{},{},{}'.format(*l)
                            tol, editdep = 0, 0
                            for i in range(4):
                                tol += l[i]
                                if BASE[i] != ref and l[i] > editdep:
                                    editdep = l[i]
                            table[key1][editV] = shell._SNVvalue_Binomial(
                                (editdep, tol), errorRate, RNA=True)
        return 0

    if Method not in ('Bayesian', 'Binomial', 'Frequency'):
        print_help()
        sys.exit(1)
    elif Method == 'Bayesian' and Genomefile == None:
        print_help()
        sys.exit(1)
    HetePrior = 1 - HomoPrior
    ## read config file
    sample, order = {}, []
    with open(Config, 'r') as f:
        for line in f:
            fd = line.strip().split()
            order.append(fd[0])
            if os.path.isfile(fd[1] + '.stat') == True:
                sample[fd[0]] = {3: fd[1] + '.stat'}
            else:
                shell.eprint(
                    '[bigtable] Error: DNA single base file do not have .stat file in the same directory'
                )
                sys.exit(1)
            sample[fd[0]][0] = fd[1]
            if len(fd) < 4:
                shell.eprint('[bigtable] Error: config file format error')
                sys.exit(1)
            sample[fd[0]][1.1] = fd[2]
            sample[fd[0]][1.2] = fd[3]
            if len(fd) == 6:
                sample[fd[0]][2.1] = fd[4]
                sample[fd[0]][2.2] = fd[5]
    ## read table
    several_tp, table, peak_dep = set(), {}, {}
    for s, v in sample.items():
        f = shell.IsGzipFile(v[1.2])
        f.readline()
        read(f, s, table, several_tp)
        f.close()
        with open(v[3], 'r') as f:
            fd = f.readlines()[1].strip().split()
            peak_dep[s] = float(
                fd[4]) if float(fd[4]) > float(fd[3]) else float(fd[3])
        try:
            f = shell.IsGzipFile(v[2.2])
            f.readline()
            read(f, s, table, several_tp)
            f.close()
        except:
            pass
    ## delete sites have different editing type in different sample
    for key in several_tp:
        del table[key]
    ## filter sites locating near junctions
    if Intron != None:
        f = shell.IsGzipFile(Intron)
        for line in f:
            fd = line.strip().split()
            if int(fd[3]) < int(fd[4]):
                beg, end = int(fd[3]), int(fd[4])
            else:
                beg, end = int(fd[4]), int(fd[3])
            deleteintron(fd[1], range(beg, beg + Intronic), table)
            deleteintron(fd[1], range(end - Intronic + 1, end + 1), table)
        f.close()
    ## filter homopolymer
    if Homopolymer:
        seq, leng, st = {}, {}, ''
        m = [re.compile(i * 5) for i in BASE]
        f = shell.IsGzipFile(Genomefile)
        for line in f:
            if line[0] == '>':
                if st:
                    seq[key] = st
                    leng[key] = len(st)
                st = ''
                key = line.strip().split()[0][1:]
            else:
                st += line.strip().upper()
        if st:
            seq[key] = st
            leng[key] = len(st)
        delkey = set()
        for key in table:
            sca, pos, strand = key
            beg = int(pos) - 4 if int(pos) - 4 > 1 else 1
            end = int(pos) + 4 if int(pos) + 4 < leng[sca] else leng[sca]
            nt = seq[sca][beg - 1:end]
            for m1 in m:
                if m1.match(nt):
                    delkey.add(key)
                    break
        for key in delkey:
            del table[key]
        del seq
        del leng
        del delkey

    if Method == 'Bayesian':
        basecontent = shell._BasePercent(Genomefile)
        weight = 0.5
        weight_other = (1 - weight) / 2
        ## adjust substitution rate for illumina
        FixedError = {
            'A': {
                'C': weight,
                'T': weight_other,
                'G': weight_other
            },
            'C': {
                'A': weight,
                'T': weight_other,
                'G': weight_other
            },
            'G': {
                'T': weight,
                'A': weight_other,
                'C': weight_other
            },
            'T': {
                'G': weight,
                'A': weight_other,
                'C': weight_other
            }
        }
        FixedKey = []
        for key in FixedError.keys():
            for ke in FixedError[key].keys():
                if (key, ke) not in FixedKey and (ke, key) not in FixedKey:
                    FixedKey.append((key, ke))
    errorRate = 10**(-1 * Qual_cutoff / 10)
    for s, v in sample.items():
        with gzip.open(v[0], 'rt') as f:
            for line in f:
                fd = line.strip().split()
                ref = fd[2].upper()
                for strand in ('+', '-', '.'):
                    key1 = (fd[0], fd[1], strand)
                    if key1 in table:
                        key2 = (s, 'DNA')
                        if key2 not in table[key1]:
                            snpV = (s, 'SNPvalue')
                            if fd[5] == 0:
                                table[key1][snpV] = 'NA'
                                table[key1][key2] = '0,0,0,0'
                            else:
                                seq, qual = shell._DealBasequality(
                                    fd[3][4:4 + int(fd[-1])].upper(),
                                    fd[4][4:4 + int(fd[-1])], Phred_DNA)
                                table[key1][key2] = ','.join(
                                    [str(seq.count(i)) for b in BASE])
                                if seq == '':
                                    table[key1][snpV] = 'NA'
                                    continue
                                if Method == 'Bayesian':
                                    result = shell._SNPvalue_Bayesian(
                                        seq, qual, Ploidy, FixedKey,
                                        FixedError, basecontent)
                                    table[key1][snpV] = (result[0], result[1])
                                elif Method == 'Binomial':
                                    table[key1][
                                        snpV] = shell._SNVvalue_Binomial(
                                            ','.join([
                                                str(seq.count(i)) for b in BASE
                                            ]), ref)
                                elif Method == 'Frequency':
                                    table[key1][
                                        snpV] = shell._SNPvalue_Frequency(
                                            seq, ref)[1]
                                else:
                                    shell.eprint(
                                        '[bigtable] Error: --method not recognize'
                                    )
                                    sys.exit(1)
        with gzip.open(v[1.1], 'rt') as f:
            caculus(f, table, errorRate, ['+', '.'])
        if 2.1 not in v.keys():
            continue
        with gzip.open(v[2.1], 'rt') as f:
            caculus(f, table, errorRate, ['-'])
    ## Complement DNA and RNA information for all sites in the table
    for k, v in table.items():
        for spl in order:
            keyDNA, keysnpV, keyRNA, keyeditV = (spl, 'DNA'), (
                spl, 'SNPvalue'), (spl, 'RNA'), (spl, 'Editvalue')
            if keyDNA not in v:
                table[k][keyDNA] = '0,0,0,0'
                table[k][keysnpV] = 'NA'
            if keyRNA not in v:
                table[k][keyRNA] = '0,0,0,0'
                table[k][keyeditV] = '1'
    ## Remove sites with high DNA depth and multiple RNA editing types
    delkey = set()
    for key1 in table:
        for s in order:
            fd = [int(i) for i in table[key1][(s, 'DNA')].split(',')]
            if Paralogous_D:
                dep = sum(fd)
                if dep > 2 * peak_dep[s]:
                    delkey.add(key1)
                    break
            fd = [int(i) for i in table[key1][(s, 'RNA')].split(',')]
            rna_count = {}
            for i in range(4):
                if BASE[i] != table[key1]['gbase']:
                    rna_count[BASE[i]] = fd[i]
            key3 = sorted(rna_count.keys(),
                          key=lambda x: rna_count[x],
                          reverse=True)
            if len(key3) != 3:
                shell.eprint('[bigtable] Error: rna depth error')
                sys.exit(1)
            if rna_count[key3[0]] > 0 and (rna_count[key3[1]] /
                                           rna_count[key3[0]]) > 0.01:
                delkey.add(key1)
                break
    for key in delkey:
        del table[key]

    delkey = set()
    for key1 in table:
        kplus, kminus = (k[0], k[1], '+'), (k[0], k[1], '-')
        if kplus in table and kminus in table:
            plusdep, miusdep = 0, 0
            for s in order:
                plusdep += sum(
                    [int(i) for i in table[kplus][(s, 'RNA')].split(',')])
                miusdep += sum(
                    [int(i) for i in table[kminus][(s, 'RNA')].split(',')])
            if plusdep > miusdep:
                delkey.add(kminus)
            elif plusdep < miusdep:
                delkey.add(kplus)
    for key in delkey:
        del table[key]
    del delkey

    fdr, binomial = {}, {}
    for key1 in table:
        for s in order:
            try:
                fdr[s].append([key1, table[key1][(s, 'Editvalue')]])
            except KeyError:
                fdr[s] = [[key1, table[key1][(s, 'Editvalue')]]]
            if Method == 'Binomial':
                try:
                    binomial[s].append([key1, table[key1][(s, 'SNPvalue')]])
                except KeyError:
                    binomial[s] = [[key1, table[key1][(s, 'SNPvalue')]]]
    for s, v in fdr.items():
        fd = sorted(v, key=lambda x: float(x[1]), reverse=True)
        fd_fdr = shell._Fdr([float(i[1]) for i in fd])
        for i in range(len(fd_fdr)):
            table[fd[i][0]][(s, 'Editvalue')] = fd_fdr[i]
    least_dep = 1
    if Method == 'Binomial':
        for s, v in binomial.items():
            fd = sorted(v, key=lambda x: float(x[1]), reverse=True)
            fd_fdr = shell._Fdr([float(i[1]) for i in fd])
            for i in range(len(fd_fdr)):
                table[fd[i][0]][(s, 'SNPvalue')] = shell._FormatP(fd_fdr[i])
        p = sorted(peak_dep.keys(), key=lambda x: peak_dep[x])
        ratio = 1 / Ploidy
        while least_dep < p[0]:
            if shell._SNVvalue_Binomial(
                (0, least_dep), ratio, RNA=True) < 0.05:
                break
            least_dep += 1

    fw = open(out_path + '/RES.txt', 'w')
    title = '#1.Chromosome\t2.Coordinate\t3.Strand\t4.Gbase\t5.EditType'
    for idx, s in enumerate(order):
        title += '\t' + str(
            6 + idx * 2) + '.' + s + '.DNA_baseCount[A,C,G,T]\t' + str(
                7 + idx * 2) + '.' + s + '.RNA_basecount[A,C,G,T];P_value'
    title += '\n'
    fw.write(title)
    for key1 in sorted(table.keys(), key=lambda x: (x[0], int(x[1]))):
        info = []
        flag = 1
        for s in order:
            rnainfo = table[key1][(s, 'RNA')].split(',')
            rna_dep = sum([int(i) for i in rnainfo])
            dna_dep = sum([int(i) for i in table[key1][(s, 'DNA')].split(',')])
            if table[key1][(s, 'SNPvalue')] != 'NA':
                if Method == 'Bayesian':
                    if len(
                            set(table[key1][(s, 'SNPvalue')][0]) -
                            set(table[key1]['gbase'])) != 0 or float(
                                table[key1][(s, 'SNPvalue')]
                                [1]) < Bayesian_Posterior_Probability:
                        flag = 0
                elif Method == 'Binomial':
                    if dna_dep < least_dep:
                        n = 0
                        for i in table[key1][(s, 'RNA')].split(','):
                            if int(i) == 0: n += 1
                        if n < 3: flag = 0
                    else:
                        if table[key1][(s,
                                        'SNPvalue')] < FDR_DNA_Heterozygosis:
                            flag = 0
                elif Method == 'Frequency':
                    if table[key1][(s, 'SNPvalue')] < Non_Ref_BaseCount:
                        flag = 0
            if dna_dep >= DNAdepth and rna_dep >= RNAdepth and float(
                    table[key1][(s, 'Editvalue')]) < 0.05:
                alt_base = table[key1]['type'].split('->')[1]
                if key1[2] == '-':
                    alt_base = BT[alt_base]
                base_dep = {}
                for i in range(4):
                    base_dep[BASE[i]] = int(rnainfo[i])
                if base_dep[alt_base] > 0:
                    info.append(table[key1][(s, 'DNA')] + '\t' +
                                table[key1][(s, 'RNA')] + ';' +
                                str(table[key1][(s, 'Editvalue')]) + '*')
                else:
                    info.append(table[key1][(s, 'DNA')] + '\t' +
                                table[key1][(s, 'RNA')] + ';' +
                                str(table[key1][(s, 'Editvalue')]))
            else:
                info.append(table[key1][(s, 'DNA')] + '\t' +
                            table[key1][(s, 'RNA')] + ';' +
                            str(table[key1][(s, 'Editvalue')]))
        if flag == 1:
            fw.write(('{}\t' * 5).format(
                *(list(key1) + [table[key1]['gbase'], table[key1]['type']])) +
                     '\t'.join(info) + '\n')

    return 0
Пример #12
0
def get_config():
    """Parse the command line and prepare bwa and the output directory.

    Recognised options are ``-h/--help``, ``-o/--outdir <dir>`` and
    ``--bwa <path>``; the first positional argument is the genome file.

    Side effects: rebinds the module globals ``BWA`` and ``OUTDIR``, creates
    ``OUTDIR`` when it does not exist and makes it the working directory.

    Returns:
        The absolute path of the genome file.

    Exits via ``sys.exit``: status 2 on an option-parsing error, 0 after
    printing help, 1 on any other problem (no genome argument, genome file
    missing, ``--bwa`` path missing, bwa not found by ``which``, output
    directory cannot be created).
    """
    global BWA, OUTDIR

    try:
        optlist, args = getopt.getopt(sys.argv[1:], 'ho:',
                                      ['help', 'outdir=', 'bwa='])
    except getopt.GetoptError as e:
        shell.eprint('[' + PROGRAM + '] Error: ' + str(e))
        sys.exit(2)

    if not optlist and not args:
        print_help()
        sys.exit(0)

    for opt, value in optlist:
        if opt in ('-h', '--help'):
            # An explicit help request is a success, same as the
            # no-argument case above.
            print_help()
            sys.exit(0)
        elif opt in ('-o', '--outdir'):
            OUTDIR = os.path.abspath(value) + '/'
        elif opt == '--bwa':
            if not os.path.exists(value):
                shell.eprint('[' + PROGRAM +
                             '] Error: bwa path does not exist')
                sys.exit(1)
            BWA = os.path.abspath(value)

    # Options without a positional argument used to die with an uncaught
    # IndexError (the old handler caught ValueError, which args[0] never
    # raises); report it like every other usage error instead.
    if not args:
        shell.eprint('[' + PROGRAM + '] Error: genome file is required')
        sys.exit(1)
    genomefile = args[0]
    if not os.path.isfile(genomefile):
        shell.eprint('[' + PROGRAM + '] Error: genome file does not exist')
        sys.exit(1)
    genomefile = os.path.abspath(genomefile)

    if BWA == '':
        status, output = subprocess.getstatusoutput('which bwa')
        # Any non-zero status means "not found"; do not rely on it being 1.
        if status != 0:
            shell.eprint('[' + PROGRAM + '] Warning: lack bwa program')
            sys.exit(1)
        BWA = output

    # Resolve the default BEFORE trying to create the directory: creating
    # '' always fails, so the default could never take effect when this
    # check came second.  The trailing '/' matches the --outdir form.
    if OUTDIR == '':
        OUTDIR = os.getcwd() + '/regeo/'
    try:
        os.makedirs(OUTDIR, exist_ok=True)
    except OSError:
        shell.eprint('[' + PROGRAM +
                     '] Error: outdir could not be created, please check')
        sys.exit(1)
    os.chdir(OUTDIR)

    return genomefile
Пример #13
0
def main():
    """Drive the whole pipeline from raw input to filtered, sorted BAM files.

    Steps, in order:
      1. ``get_config()`` parses the command line (it also changes the
         working directory, so every relative path below is inside it).
      2. If ``FLAG & 2``: run ``soapnuke`` on the DNA and/or RNA fastq files.
      3. If ``FLAG & 4``: align DNA reads with ``bwa mem``, split the genome
         with ``shell.splitFa``, run Pilon (``--fix snps``) on every piece
         and continue with the concatenated ``<genome>.fix_snps.fa``.
      4. Make sure the genome has a bwa index, then run ``bwaaln`` for each
         sample that was given as fastq rather than BAM.
      5. Filter each BAM with ``bestuniqbam`` and sort/index the result.

    External commands write to a scratch file ``log`` in the working
    directory; its content is printed when a command fails and the file is
    removed at the end.

    Returns:
        0 on success.  Any failing step terminates with ``sys.exit(1)``.
    """
    global DBAM, RBAM, DFQ1, DFQ2, RFQ1, RFQ2, PDFQ1, PRFQ1

    index_suffixes = ('.amb', '.ann', '.bwt', '.pac', '.sa')

    def _has_bwa_index(fasta):
        """Return True when every bwa index file for *fasta* exists."""
        return all(os.path.isfile(fasta + suffix)
                   for suffix in index_suffixes)

    def _run_logged(cmd):
        """Run *cmd* with stdout+stderr sent to ./log; return its status."""
        with open('log', 'w') as log:
            return subprocess.call(cmd,
                                   stdout=log,
                                   stderr=subprocess.STDOUT)

    def _abort_with_log(message):
        """Print *message* and the content of ./log, then exit(1)."""
        shell.eprint('[' + PROGRAM + '] Error: ' + message)
        with open('log', 'r') as log:
            shell.eprint(log.read())
        sys.exit(1)

    def _make_dir(path, message):
        """Create *path*; an existing directory is fine, else exit(1)."""
        try:
            os.makedirs(path)
        except FileExistsError:
            pass
        except OSError:
            shell.eprint('[' + PROGRAM + '] Error: ' + message)
            sys.exit(1)

    config, genomefile = get_config()
    logging.info('Program Start')

    if FLAG & 2:
        logging.info('do soapnuke')
        if DFQ1 != '':
            DFQ1, DFQ2, PDFQ1 = soapnuke('DNA', DFQ1, DFQ2, PDFQ1, config)
        if RFQ1 != '':
            RFQ1, RFQ2, PRFQ1 = soapnuke('RNA', RFQ1, RFQ2, PRFQ1, config)
        logging.info('soapnuke has done')

    if FLAG & 4:
        logging.info('do pilon')
        logging.info('check genome index')
        if not _has_bwa_index(genomefile):
            logging.info('genome index not found')
            if _run_logged([BWA, 'index', genomefile]) != 0:
                _abort_with_log('bwa index run error')
            logging.info('genome index done')
        _make_dir('pilon/', 'mkdir pilon/ error')

        # bwa mem options come from the config as {option: value} pairs.
        args = []
        for k, v in config['bwa']['mem'].items():
            args.extend((k, v))
        args += [genomefile, DFQ1] if DFQ2 == '' else [genomefile, DFQ1, DFQ2]
        # Stream the SAM produced by bwa mem straight into a BAM file.
        with open('log', 'w') as log:
            p = subprocess.Popen([BWA, 'mem'] + args,
                                 stdout=subprocess.PIPE,
                                 stderr=log)
            infile = pysam.AlignmentFile(p.stdout, 'r')
            oufile = pysam.AlignmentFile('pilon/mem_just.bam',
                                         'wb',
                                         template=infile)
            for r in infile:
                oufile.write(r)
            infile.close()
            oufile.close()
            p.wait()
        if p.returncode != 0:
            _abort_with_log('bwa mem run error')
        DBAM_mem = bamsortindex('pilon/mem_just.bam')

        _make_dir('pilon/sub/', 'mkdir pilon/sub/ error')
        shell.splitFa(genomefile, 'pilon/sub/')
        # One BAM per .bed file found in pilon/sub/, holding the reads of the
        # contigs named in the first column of that .bed file.
        f = pysam.AlignmentFile(DBAM_mem, 'rb')
        for fn in [
                'pilon/sub/' + bed for bed in os.listdir('pilon/sub/')
                if bed.endswith('.bed')
        ]:
            sub_bam = '.'.join(fn.split('.')[:-1]) + '.bam'
            fw = pysam.AlignmentFile(sub_bam, 'wb', template=f)
            with open(fn, 'r') as fg:
                for line in fg:
                    for r in f.fetch(contig=line.strip().split()[0]):
                        # Skip secondary (256) and supplementary (2048)
                        # alignments.
                        if r.flag & 256 or r.flag & 2048:
                            continue
                        fw.write(r)
            fw.close()
            try:
                pysam.index(sub_bam)
            except Exception:
                # Best effort, as before: report and carry on.
                shell.eprint('[' + PROGRAM + '] Error: ' + sub_bam +
                             ' index error')
        f.close()

        for fn in [
                'pilon/sub/' + fa for fa in os.listdir('pilon/sub/')
                if fa.endswith('.fa')
        ]:
            stem = '.'.join(fn.split('.')[:-1])
            # Sum of column 3 of the matching .bed file decides the Java
            # heap size handed to Pilon.
            maxn = int(
                subprocess.getstatusoutput(
                    'awk \'BEGIN{x};{x+=$3};END{print x}\' ' + stem +
                    '.bed')[1])
            maxmem = '-Xmx8g' if maxn <= 100000000 else '-Xmx16g'
            status = _run_logged([
                JAVA, maxmem, '-jar', PILON, '--fix', 'snps', '--genome', fn,
                '--bam', stem + '.bam', '--output', fn + '_fix_snps'
            ])
            if status != 0:
                _abort_with_log('pilon run error')
        os.system('cat pilon/sub/*_fix_snps.fasta > ' + genomefile +
                  '.fix_snps.fa')
        os.system('sed -i s/_pilon// ' + genomefile + '.fix_snps.fa')
        os.system('rm -rf pilon/sub/*')
        genomefile = genomefile + '.fix_snps.fa'
        logging.info('pilon has done')

    # check and do bwa aln
    logging.info('do bwa aln')
    logging.info('check fixed genome index')
    if not _has_bwa_index(genomefile):
        logging.info('fixed genome not found')
        if _run_logged([BWA, 'index', genomefile]) != 0:
            _abort_with_log('bwa index for fixed snps genome run error')
        logging.info('fixed genome index done')
    _make_dir('aln/', 'make aln directory error')
    if DFQ1 != '' and DBAM == '':
        DBAM = bwaaln('DNA', genomefile, DFQ1, DFQ2, PDFQ1, 'aln/', config)
    if RFQ1 != '' and RBAM == '':
        RBAM = bwaaln('RNA', genomefile, RFQ1, RFQ2, PRFQ1, 'aln/', config)
    logging.info('bwa aln has done')

    logging.info('do get best bam')
    if DBAM:
        bestuniqbam(DBAM, DNA=True, **config['bestuniqbam']['DNA'])
        dna_stem = '.'.join(os.path.basename(DBAM).split('.')[:-1])
        DBAM = bamsortindex(dna_stem + '.best.bam')
        logging.info('The output file of DNA alignment is ' + DBAM)
    if RBAM:
        bestuniqbam(RBAM, RNA=True, **config['bestuniqbam']['RNA'])
        rna_stem = '.'.join(os.path.basename(RBAM).split('.')[:-1])
        # Test the VALUE of SS, not the presence of the key: '--ss F' stores
        # SS=False, in which case bestuniqbam wrote <stem>.best.bam and the
        # strand-specific files do not exist.
        if config['bestuniqbam']['RNA'].get('SS'):
            bamsortindex(rna_stem + '.negative.bam')
            bamsortindex(rna_stem + '.positive.bam')
            logging.info(
                'The output files of strand-specific RNA alignment are ' +
                rna_stem + '.negative.bam and ' + rna_stem + '.positive.bam')
        else:
            bamsortindex(rna_stem + '.best.bam')
            logging.info('The output file of RNA alignment is ' + rna_stem +
                         '.best.bam')
    os.system('rm -f log')
    logging.info('All things have been done! Have a good day!')

    return 0
Пример #14
0
def get_config():
    """Parse the command line into module globals and a config dict.

    Options (all long options take a value):
      -h/--help, -o/--outdir,
      --DNA-fq1/--DNA-fq2/--RNA-fq1/--RNA-fq2  fastq inputs,
      --DNA-bam/--RNA-bam                      pre-aligned inputs,
      --bwa/--soapnuke/--java/--pilon          tool paths,
      --config                                 JSON file merged into config,
      --DNA-mapQ/--RNA-mapQ                    integer mapping-quality cutoffs,
      --rmdup/--uniq/--ss                      'T' or 'F' switches for RNA.
    The first positional argument is the genome file; a symlink to it is
    created inside the output directory.

    Side effects: rebinds the input/tool/OUTDIR globals, adds 2 to ``FLAG``
    when soapnuke is given and 4 when pilon is given, fills ``PDFQ1`` /
    ``PRFQ1`` from ``shell.checkFqQuality``, creates ``OUTDIR``, changes the
    working directory to it and configures ``logging`` to write
    ``OUTDIR + PROGRAM + '.log'``.

    Returns:
        ``(config, genomefile)`` where ``config`` always contains
        ``config['bestuniqbam']['DNA']`` and ``['RNA']`` dicts and
        ``genomefile`` is the base name of the genome (relative to OUTDIR).

    Exits via ``sys.exit``: status 2 on an option-parsing error, 0 after
    printing help, 1 on any validation failure.
    """
    global DBAM, DFQ1, DFQ2, RBAM, RFQ1, RFQ2
    global PDFQ1, PDFQ2, PRFQ1, PRFQ2
    global BWA, SOAPNUKE, JAVA, PILON
    global FLAG, OUTDIR

    tobool = {'T': True, 'F': False}

    def _fail(message):
        """Print '[PROGRAM] Error: <message>' and exit with status 1."""
        shell.eprint('[' + PROGRAM + '] Error: ' + message)
        sys.exit(1)

    def _existing_path(value, what):
        """Return the absolute form of *value*, or fail if it is missing."""
        if not os.path.exists(value):
            _fail(what + ' does not exist')
        return os.path.abspath(value)

    def _as_int(value, option):
        """Convert an option value to int or fail with a usage message."""
        try:
            return int(value)
        except ValueError:
            _fail(option + ' should be integer')

    def _as_bool(value, option):
        """Convert 'T'/'F' to a bool or fail with a usage message."""
        try:
            return tobool[value]
        except KeyError:
            _fail(option + ' should be T or F')

    shortopts = 'ho:'
    longopts = [
        'help', 'outdir=', 'DNA-fq1=', 'DNA-fq2=', 'RNA-fq1=', 'RNA-fq2=',
        'DNA-bam=', 'RNA-bam=', 'bwa=', 'soapnuke=', 'java=', 'pilon=',
        'config=', 'DNA-mapQ=', 'RNA-mapQ=', 'rmdup=', 'uniq=', 'ss=',
        'DNA-I=', 'RNA-I='
    ]
    try:
        optlist, args = getopt.getopt(sys.argv[1:], shortopts, longopts)
    except getopt.GetoptError as e:
        shell.eprint('[' + PROGRAM + '] Error: ' + str(e))
        sys.exit(2)

    if not optlist and not args:
        print_help()
        sys.exit(0)

    config = {'bestuniqbam': {'DNA': {}, 'RNA': {}}}
    for opt, value in optlist:
        if opt in ('-h', '--help'):
            print_help()
            sys.exit(0)
        elif opt in ('-o', '--outdir'):
            OUTDIR = os.path.abspath(value) + '/'
        elif opt == '--DNA-fq1':
            DFQ1 = _existing_path(value, 'dna fq1 file')
        elif opt == '--DNA-fq2':
            DFQ2 = _existing_path(value, 'dna fq2 file')
        elif opt == '--RNA-fq1':
            RFQ1 = _existing_path(value, 'rna fq1 file')
        elif opt == '--RNA-fq2':
            RFQ2 = _existing_path(value, 'rna fq2 file')
        elif opt == '--DNA-bam':
            DBAM = _existing_path(value, 'dna bam file')
        elif opt == '--RNA-bam':
            RBAM = _existing_path(value, 'rna bam file')
        elif opt == '--bwa':
            BWA = _existing_path(value, 'bwa path')
        elif opt == '--soapnuke':
            SOAPNUKE = _existing_path(value, 'soapnuke path')
        elif opt == '--java':
            JAVA = _existing_path(value, 'java path')
        elif opt == '--pilon':
            PILON = _existing_path(value, 'pilon path')
        elif opt == '--config':
            if not os.path.exists(value):
                _fail('config file does not exist')
            with open(value, 'r') as f:
                config.update(json.load(f))
            # The file may replace 'bestuniqbam' wholesale; keep the DNA/RNA
            # sub-dicts present because later options and main() index them.
            best = config.setdefault('bestuniqbam', {})
            best.setdefault('DNA', {})
            best.setdefault('RNA', {})
            # A config without a soapnuke filter section is legal; only
            # reject the parameters this program sets itself.
            soapnuke_filter = config.get('soapnuke', {}).get('filter', {})
            if {'-C', '-D', '-o', '-1', '-2'} & soapnuke_filter.keys():
                _fail('soapnuke parameters -C -D -o -1 -2 do not needed to '
                      'provided')
        elif opt == '--DNA-mapQ':
            config['bestuniqbam']['DNA']['mapQ'] = _as_int(value, opt)
        elif opt == '--RNA-mapQ':
            config['bestuniqbam']['RNA']['mapQ'] = _as_int(value, opt)
        elif opt == '--rmdup':
            config['bestuniqbam']['RNA']['RmDup'] = _as_bool(value, opt)
        elif opt == '--uniq':
            config['bestuniqbam']['RNA']['Uniq'] = _as_bool(value, opt)
        elif opt == '--ss':
            config['bestuniqbam']['RNA']['SS'] = _as_bool(value, opt)
        else:
            # NOTE(review): '--DNA-I' and '--RNA-I' are accepted by getopt
            # above but have no handler, so they end up here.  An explicit
            # error replaces the former `assert False`, which is stripped
            # under `python -O` and would then ignore the option silently.
            _fail('unhandled option ' + opt)

    if BWA == '':
        _fail('bwa is necessary')

    if PILON != '' and JAVA == '':
        _fail('pilon process needs java')

    if not os.path.exists(OUTDIR):
        try:
            os.makedirs(OUTDIR)
        except FileExistsError:
            shell.eprint(
                '[' + PROGRAM +
                '] Warning: outdir have existed, may have some conflict')
        except OSError:
            _fail('outdir could not be created, please check')

    # args[0] raises IndexError when only options were given; the previous
    # `except ValueError` never fired and referenced an undefined name.
    if not args:
        _fail('genome file is required')
    genomefile = os.path.abspath(args[0])
    if not os.path.isfile(genomefile):
        # Fail here rather than leave a dangling symlink for bwa to trip on.
        _fail('genome file does not exist')
    try:
        os.symlink(genomefile, OUTDIR + os.path.basename(genomefile))
    except FileExistsError:
        pass
    except OSError:
        _fail('could not create soft link of genomefile')
    genomefile = os.path.basename(genomefile)

    # Exactly one of fastq / bam must be supplied for each of DNA and RNA.
    if (DFQ1 != '' and DBAM != '') or (DFQ1 == '' and DBAM == ''):
        _fail('DNA fastq file or bamfile should be provided only one')

    if (RFQ1 != '' and RBAM != '') or (RFQ1 == '' and RBAM == ''):
        _fail('RNA fastq file or bamfile should be provided only one')

    if (DBAM != '' and RBAM != '') and (SOAPNUKE != '' or PILON != ''):
        _fail('there are some logical error. Offering bam file means there '
              'is no need to do soapnuke or pilon')

    if DFQ1 != '':
        PDFQ1 = shell.checkFqQuality(DFQ1)
    if RFQ1 != '':
        PRFQ1 = shell.checkFqQuality(RFQ1)

    # FLAG bits tell main() which optional stages to run.
    if SOAPNUKE != '':
        FLAG += 2

    if PILON != '':
        FLAG += 4

    os.chdir(OUTDIR)

    logging.basicConfig(level=logging.INFO,
                        filename=OUTDIR + PROGRAM + '.log',
                        filemode='w',
                        format='%(asctime)s : %(message)s',
                        datefmt='%m/%d/%Y %I:%M:%S %p')

    return config, genomefile
Пример #15
0
def bestuniqbam(bamfile,
                mapQ=20,
                DNA=None,
                RNA=None,
                SS=False,
                RmDup=True,
                Uniq=True,
                outdir='./'):
    '''
        Filter a bam/sam file down to its best, unique alignments.

        bamfile: input path; must end in ``.bam`` or ``.sam``.
        mapQ:    reads with a lower mapping quality are dropped.
        DNA/RNA: exactly one of the two must be given (not None); it selects
                 the filtering rules.  DNA reads are only filtered on mapQ
                 and (with RmDup) the duplicate flag.
        SS:      RNA only - split the output by strand into
                 ``<name>.negative.bam`` and ``<name>.positive.bam``.
                 Ignored (with a warning) for DNA.
        RmDup:   drop reads flagged as duplicates (flag bit 1024).
        Uniq:    RNA only - keep a read only if it carries the tags
                 XT:U, X0:1 and X1:0.
        outdir:  directory for the output files; created when missing.

        Returns ``(negative_path, positive_path)`` when strand-specific
        output was written, otherwise the path of ``<name>.best.bam``.
        Invalid arguments or an unreadable input end in ``sys.exit(1)``.

        ##eg: bestuniqbam(bamfile, mapQ=20, RNA=True, SS=True, RmDup=True, Uniq=True, outdir='./scanner')
    '''
    # check bamfile
    suffix = bamfile.split('.')[-1]
    if suffix == 'bam':
        file_type_tag = 'rb'
    elif suffix == 'sam':
        file_type_tag = 'r'
    else:
        shell.eprint(
            '[' + PROGRAM +
            '] Error: the input bamfile/samfile should be *.bam or *.sam.')
        sys.exit(1)
    # parameter tickle
    if (RNA is None and DNA is None) or (RNA is not None
                                         and DNA is not None):
        shell.eprint('[' + PROGRAM + '] Error: --DNA or --RNA is needed!')
        sys.exit(1)
    if DNA and SS:
        shell.eprint(
            '[' + PROGRAM +
            '] Warning: DNA do not have --ss, but it\'s ok to run this program.'
        )
        # DNA reads always go to the single "best" file.  Leaving SS on
        # opened only the two strand files and then failed on the first
        # DNA read because the "best" writer did not exist.
        SS = False
    if outdir and not os.path.exists(outdir):
        try:
            os.makedirs(outdir, exist_ok=True)
        except OSError:
            shell.eprint('[' + PROGRAM +
                         '] Error: outdir could not be created, please check')
            sys.exit(1)
    # core
    try:
        f = pysam.AlignmentFile(bamfile, file_type_tag)
    except Exception:
        shell.eprint('[' + PROGRAM + '] Error: check the bam file ' +
                     os.path.basename(bamfile) + ' please!')
        sys.exit(1)
    ## DNA and RNA's bamfile name should be different by yourself.
    stem = '.'.join(os.path.basename(bamfile).split('.')[:-1])
    # os.path.join gives the same path as before when outdir ends in '/',
    # and keeps the file inside outdir when it does not.
    negative_path = os.path.join(outdir, stem + '.negative.bam')
    positive_path = os.path.join(outdir, stem + '.positive.bam')
    best_path = os.path.join(outdir, stem + '.best.bam')

    writers = []
    try:
        if SS:
            negative_out = pysam.AlignmentFile(negative_path,
                                               'wb',
                                               template=f)
            writers.append(negative_out)
            positive_out = pysam.AlignmentFile(positive_path,
                                               'wb',
                                               template=f)
            writers.append(positive_out)
        else:
            best_out = pysam.AlignmentFile(best_path, 'wb', template=f)
            writers.append(best_out)

        for line in f.fetch(until_eof=True):
            if int(line.mapping_quality) < mapQ:
                continue
            flag = int(line.flag)
            if 1024 & flag and RmDup:
                continue
            if RNA:
                if Uniq:
                    tags = line.get_tags()
                    if (('XT', 'U') not in tags or ('X0', 1) not in tags
                            or ('X1', 0) not in tags):
                        continue
                if 256 & flag:  # secondary alignment
                    continue
                # Proper-pair flag values in which both mates are on the
                # same strand (both forward: 67/131, both reverse: 115/179).
                if flag in (67, 131, 115, 179):
                    continue
                if 32 & flag and 16 & flag:
                    continue
                if SS:
                    first = 64 & flag  # first read of the pair
                    second = 128 & flag  # second read of the pair
                    reverse = 16 & flag  # this read is reverse-complemented
                    mate_reverse = 32 & flag  # its mate is
                    tlen = line.template_length
                    if first and mate_reverse and tlen >= 0:
                        negative_out.write(line)
                    elif second and reverse and tlen <= 0:
                        negative_out.write(line)
                    elif first and reverse and tlen <= 0:
                        positive_out.write(line)
                    elif second and mate_reverse and tlen >= 0:
                        positive_out.write(line)
                    elif first and not reverse and not mate_reverse:
                        negative_out.write(line)
                    elif second and not reverse and not mate_reverse:
                        positive_out.write(line)
                    elif flag == 16:
                        positive_out.write(line)
                    elif flag == 0:
                        negative_out.write(line)
                else:
                    best_out.write(line)
            elif DNA:
                best_out.write(line)
    finally:
        # Close everything even if a read could not be processed.
        for writer in writers:
            writer.close()
        f.close()

    if SS:
        return negative_path, positive_path
    return best_path