Example #1
def main(args):

    if not isfile(args.batch_file):
        ms.error_message("Input batch file does not exist ({})".format(
            args.batch_file))
        return 1

    flist = []
    err_flag = False

    ms.message("Checking batch file")
    t0 = time()

    # load the batch file and confirm all of the input files exist.
    with open(args.batch_file, "r") as fin:
        for szl in fin:
            fname = szl.strip()
            if not isfile(fname):
                err_flag = True
                ms.error_message("{} does not exist".format(fname))
            else:
                flist.append(fname)

    ms.time_diff(t0)

    if err_flag:
        return 1

    rres = core(flist)

    return rres
Example #2
def main(args):

    ##
    ## check input files
    ##

    if not isfile(args.bam):
        ms.error_message("Input file does not exist")
        return 1

    if args.o is not None:

        if isfile(args.o):
            ms.message("Output file exists. Overwriting.")

        if args.z:
            if not re.search(r"\.gz$", args.o):
                args.o += ".gz"

        else:
            if re.search(r"\.gz$", args.o):
                # auto enable gzip if the output file name has .gz at the end
                args.z = True

    rres = core(args)

    return 0
Example #3
def main(args):

    if not isfile(args.gtf):
        ms.error_message("Input file does not exist: {}".format(args.gtf))
        return 1

    rres = core(args)

    return rres
Example #4
def main(args):

    ##
    ## check input files
    ##

    if not isfile(args.fastq):
        ms.error_message("Input file does not exist")
        return 1

    rres = core(args)

    return 0
Example #5
def main(args):

    # variables

    # check input files
    if not isfile(args.fasta):
        ms.error_message("Input FASTA file does not exist")
        return 1

    if not isfile(args.ref):
        ms.error_message("Input refFlat annotation does not exist")
        return 1

    if args.o is None:
        args.o = "{}.shreds.{}.fasta".format(drop_file_ext(args.fasta), args.l)
    else:
        tmp = "{}.shreds.{}.fasta".format(args.o, args.l)
        args.o = tmp

    if not args.just_alignment and not args.just_quantification:

        if isfile(args.o):
            unlink(args.o)

        if isfile("{}.gz".format(args.o)):
            unlink("{}.gz".format(args.o))

    if isfile("shred.log"):
        unlink("shred.log")

    if isfile("dedupe.log"):
        unlink("dedupe.log")

    if args.quantify:
        args.bbmap = True

    # good to go!

    if args.s:
        rres = core2(args)
    else:
        rres = core(args)

    return rres
Example #6
def main(args):

    # check input file
    if not os.path.isfile(args.bam):
        ms.error_message("Input file does not exist")
        return 1

    if not re.search(r"\.bam$", args.bam):
        ms.error_message("Input file should be a BAM file.")
        return 1

    ##
    ## check for output folder
    ##
    if not os.path.isdir(args.outpath):
        ms.message("Creating output folder {}".format(args.outpath))
        os.mkdir(args.outpath)

    rres = core(args)

    return rres
Example #7
def main(args):

    # check input file
    if not os.path.isfile(args.gtf):
        ms.error_message("Input file is missing")
        return 1

    t0 = time()
    ms.message("Loading annotation")
    dtid, dtid2gname, all_attrs, tid_order = load(args.gtf)
    ms.time_diff(t0)

    tmp = all_attrs.difference(set(["transcript_id", "gene_id", "gene_name"]))
    all_attrs = sorted(list(tmp))

    # header
    sys.stdout.write(
        "chrom\tdb\tfeature\tstart\tend\tscore\tstrand\tframe\ttranscript_id\tgene_id\tgene_name\t"
    )
    sys.stdout.write("\t".join(all_attrs))
    sys.stdout.write("\n")

    for tid in tid_order:
        for gg in dtid[tid]:
            # print each row
            lout = gg.parts[0:8]
            lout.append(gg.transcript_id())
            lout.append(gg.gene_id())
            lout.append(gg.gene_name())

            for aid in all_attrs:
                if aid in gg.attr:
                    lout.append(gg.attr[aid])
                else:
                    lout.append("na")

            sys.stdout.write("\t".join(lout) + "\n")

    return 0
Example #8
def main(args):

    # variables
    left = []
    right = []

    # figure out what's up

    if args.a is not None or args.b is not None:
        if not (args.a is not None and args.b is not None):
            ms.error_message(
                "You must specify both -a and -b if running a single sample.")
            return 1

    if args.a is not None and args.b is not None:
        # single sample
        left = [args.a]
        right = [args.b]

    elif args.f is not None:
        if not isfile(args.f):
            ms.error_message("Batch file does not exist")
            return 1

        # we have a batch file
        with open(args.f, "r") as fin:

            for szl in fin:
                aln = szl.strip().split("\t")
                left.append(aln[0])
                right.append(aln[1])

    if len(left) < 1 or len(right) < 1:
        ms.error_message("No samples to process!")
        return 1

    rres = core(left, right, args)

    return rres
Example #9
    if argc < 2:
        ms.message(usage())
        sys.exit(1)

    argv = argv[1:]

    stat_list = []

    #	t0 = time()
    #	lktable2 = ta.build_lktable()
    #	ms.time_diff(t0)

    for i in range(len(argv)):
        if not isfile(argv[i]):
            ms.error_message("file does not exist ({})".format(argv[i]))
            continue

        mstat = Stats()

        ms.message("parsing {}".format(argv[i]))
        t0 = time()
        with ps.AlignmentFile(argv[i]) as fin:
            rnames = ps_tools.get_alignmentfile_rnames(fin)
            numhit = 0

            for aln in fin:
                mstat.lines += 1

                if (aln.flag & 0x100) == 0:
                    mstat.reads += 1
Example #10
def main(args):

    # variables
    bam_file = False

    bc = {}
    bc_readcount = defaultdict(int)
    bc_umi = {}
    num_bc = 0
    offset = 0
    lnum = 0
    sz_umi = ""
    umi_file = ""
    # string to capture file summary table that's used with kallisto pseudo -b
    sz_table = "#id\tumiFile\tcellFile\n"
    dumi = None
    bam_flag = False
    sam_header = ""
    quant_mode = False

    ##
    ## check for output folder
    ##
    if not os.path.isdir(args.outpath):
        ms.message("Creating output folder {}".format(args.outpath))
        os.mkdir(args.outpath)

    file_queue = JoinableQueue()
    p = None
    pool = []

    if args.R is not None:
        if not os.path.isfile(args.R):
            ms.error_message(
                "Supplied annotation file does not exist ({})".format(args.R))
            return 1
        else:
            quant_mode = True

    ##
    ## figure out if we have a bam as input. if so we have to convert it to sam for indexing
    ##
    if re.search(r"\.bam$", args.fin):
        # send the sam file into the output folder
        sam_name = args.outpath + "/" + os.path.basename(
            re.sub(r"\.bam$", ".sam", args.fin))

        if not os.path.isfile(sam_name):
            # need to convert alignments to sam
            bam_flag = True
            cmd = "samtools view -h {} > {}".format(args.fin, sam_name)
            t0 = time()
            message("Temporarily converting BAM to SAM format")
            rres = runcmd(cmd)
            if rres[0] != 0:
                sys.stderr.write(
                    "Error: samtools exited with non-zero exit status!\n")
                return 1

            sys.stderr.write("{} sec\n".format(time() - t0))

    else:
        sam_name = args.fin

    ##
    ## we need to index all barcodes and track umi per barcode. if these pickle
    ## files exist we can use them
    ##

    bc_pkl = args.outpath + "/" + BC_PICKLE
    bc_umi_pkl = args.outpath + "/" + BC_UMI_PICKLE
    bc_readcount_pkl = args.outpath + "/" + BC_READCOUNT
    sam_header_pkl = args.outpath + "/sam_header.pkl"

    if os.path.isfile(bc_pkl) and os.path.isfile(bc_umi_pkl) and \
            os.path.isfile(bc_readcount_pkl) and os.path.isfile(sam_header_pkl):
        ##
        # load indexes from pickles
        ms.message(
            "Loading existing barcode and umi indexes from output folder")
        t0 = time()
        bc = pickle.load(open(bc_pkl, "rb"))
        bc_umi = pickle.load(open(bc_umi_pkl, "rb"))
        bc_readcount = pickle.load(open(bc_readcount_pkl, "rb"))
        sam_header = pickle.load(open(sam_header_pkl, "rb"))
        num_bc = len(bc.keys())
        ms.time_diff(t0)

    else:
        # we have to index

        #
        # parse the alignments. in this loop we only extract the cell barcode and the umi
        # plus record the file position offsets for barcodes. the dict that is built
        # is indexed by the barcodes and each element contains a list of file offsets for
        # reads that came from that barcode. we also get all of the distinct umis collected
        # per barcode in this loop in order to estimate the actual cell count before
        # writing all of the read files out to disk
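        #
        # sketch of the structures built below:
        #   bc           = { barcode: [offset1, offset2, ...], ... }  # byte offsets of that barcode's SAM lines
        #   bc_umi       = { barcode: {umi: count, ...}, ... }        # distinct UMIs seen per barcode
        #   bc_readcount = { barcode: n_primary_aligned_reads, ... }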
        message('Indexing cell barcodes from alignments and counting raw UMI.')
        t0 = time()
        with open(sam_name, "r") as fin:

            for szl in fin:
                if szl[0] == "@":
                    # append header line to header string
                    sam_header += szl
                    offset += len(szl)
                    continue

                # count lines and produce progress message so we know this thing is
                # running
                lnum += 1
                if lnum % 1000000 == 0:
                    progress_message("read {} lines".format(lnum))

                # fetch the cell barcode from the read name
                line_bc = parse_barcode(szl)

                if line_bc not in bc:
                    # first encounter with this barcode
                    num_bc += 1
                    # init a list for this barcode's line offsets within this sam file
                    bc[line_bc] = []
                    # init a dict for the barcode to track umis
                    bc_umi[line_bc] = defaultdict(int)

                # append line offset to this barcode's list
                bc[line_bc].append(offset)
                # get the umi and add it to this barcode's dict IF this is not a
                # secondary alignment
                aln = szl.split("\t")

                if (int(aln[1]) & 0x100) == 0:
                    # not a secondary alignment. track it.
                    umi = parse_umi(szl)
                    bc_umi[line_bc][umi] += 1

                if ((int(aln[1]) & 0x4) == 0) and ((int(aln[1]) & 0x100) == 0):
                    # this read is aligned and is a primary alignment so we can count this one
                    # into this barcode's aligned read count
                    bc_readcount[line_bc] += 1

                # update offset to the next line
                offset += len(szl)

        # final progress message and total time of parsing
        progress_message("read {} lines".format(lnum), last=True)
        sys.stderr.write("{} sec\n".format(time() - t0))

        t0 = time()

        if not args.no_pickles:
            ms.message("saving indexes to disk")
            pickle.dump(bc, open(bc_pkl, "wb"))
            pickle.dump(bc_umi, open(bc_umi_pkl, "wb"))
            pickle.dump(bc_readcount, open(bc_readcount_pkl, "wb"))
            pickle.dump(sam_header, open(sam_header_pkl, "wb"))
            ms.time_diff(t0)

    #
    # implement cell number detection per 10x.
    # here's what happens. you take the 'exp-cells' value (expected cells)
    # and multiply that by 0.01 to get an index. sort the barcodes and the
    # barcode umi counts in descending order and jump to the index you just
    # calculated and then take that index's umi count. scale that count
    # by 0.1. now you take as many cells, starting from the top of the umi
    # count sorted list, that have at least that many UMI.  that's literally
    # how they do it.
    #
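    #
    # worked example (hypothetical numbers): with exp-cells = 3000 the index is
    # floor(3000 * 0.01 - 1) = 29; if the barcode at index 29 (the 30th-highest
    # UMI count) has 10,000 UMI, the threshold is 10,000 * 0.1 = 1,000, and every
    # barcode with at least 1,000 UMI is counted as a cell.
    #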

    t0 = time()
    message("Determining cell count")

    #
    # write a file that will contain the cell id, umi count and read count
    # for each cell id. might be informative...who knows.
    with open("{}/barcode_umi_counts.txt".format(args.outpath), "w") as fout:
        bc_umi_counts = []

        fout.write("barcode\tumi_count\tdistinct_reads\talignments\n")

        for lbc in bc.keys():
            num_umi = len(bc_umi[lbc].keys())
            bc_umi_counts.append([lbc, num_umi])
            # write the cell id, distinct umi count and total read count to file
            fout.write("\t".join(
                map(str, [lbc, num_umi, bc_readcount[lbc],
                          len(bc[lbc])])))
            fout.write("\n")

    #
    # sort by umi count in descending order and threshold
    bc_umi_counts.sort(key=lambda x: x[1], reverse=True)
    exp_cells = int(math.floor(args.exp_cells * 0.01 - 1))

    num_reads = 0
    num_umi = 0

    i = 0
    while i < len(bc_umi_counts):
        if bc_umi_counts[i][1] < bc_umi_counts[exp_cells][1] * 1.0 / 10:
            break

        # count umi and count distinct reads
        lbc = bc_umi_counts[i][0]
        num_reads += bc_readcount[lbc]
        num_umi += len(bc_umi[lbc].keys())

        i += 1

    #
    # number of actual cells is 'i' because 'i' is incremented before
    # checking if the umi count passes the threshold. i-1 is the index
    # of the last cell we would accept
    num_cells = i

    #
    # now we can generate a summary for the detected cells
    with open("{}/cell_summary.tsv".format(args.outpath), "w") as fout:

        fout.write("estimated_cells\t{}\n".format(num_cells))
        fout.write("total_reads\t{}\n".format(num_reads))
        fout.write("total_umi\t{}\n".format(num_umi))
        fout.write("reads_per_cell\t{}\n".format(num_reads * 1.0 / num_cells))
        fout.write("umi_per_cell\t{}\n".format(num_umi * 1.0 / num_cells))

        # find the median barcode and corresponding read count
        if num_cells % 2 == 0:
            # even count
            median_idx = num_cells // 2
        else:
            median_idx = num_cells // 2 + 1

        median_lbc = bc_umi_counts[median_idx][0]
        fout.write("median_reads_per_cell\t{}\n".format(
            bc_readcount[median_lbc]))

    #
    # let user know what's up
    sys.stderr.write("{} sec\n".format(time() - t0))
    sys.stderr.write("Total distinct barcodes:  {}\n".format(num_bc))
    sys.stderr.write("Cell number estimate:     {}\n".format(num_cells))

    if args.estimate_only:
        if bam_flag:
            # input was BAM so we can dump the converted file. just putting in
            # some logic to be certain that the original file is not deleted.
            if os.path.isfile(args.fin) and os.path.isfile(sam_name) and (
                    sam_name != args.fin):
                os.unlink(sam_name)

        ms.message("Done.")
        return 0

    if args.force_cells is not None:
        # change number of cells to either the total barcodes or the
        # value provided by the user, whichever is smaller
        num_cells = min([args.force_cells, num_bc])
        sys.stderr.write("Forced cell output:       {}\n".format(num_cells))

    t0 = time()

    message(
        "Parsing individual detected cell alignments out to individual files")

    if quant_mode:
        # start quantificaion child processes for parsed sam files
        for i in range(args.p):
            p = Process(target=quantification_worker,
                        args=(
                            file_queue,
                            args,
                        ))
            p.daemon = True
            p.start()
            pool.append(p)

    else:
        # start child process for sam to bam conversion
        for i in range(args.p):
            p = Process(target=compress_reads, args=(file_queue, ))
            p.daemon = True
            p.start()
            pool.append(p)

    fin = open(sam_name, "r")

    # write individual cell files
    i = 0
    sz_umi = ""
    while i < num_cells:
        # get barcode
        lbc = bc_umi_counts[i][0]
        # start output strings
        szout = sam_header
        #sz_umi = ""
        # setup output file name
        cell_file = "{}/{}.sam".format(args.outpath, lbc)
        #umi_file = "{}.umi".format(lbc)

        # update user on progress
        progress_message("Writing {} - {}/{} ({} reads)".format(
            cell_file, i + 1, num_cells, len(bc[lbc])))

        if 0 < args.samplerate < 1:

            ##
            # to subsample we have to run through all read offsets for this cell and index the reads
            # then take a subset of them to write out to disk. I have to do this because the
            # alignment file contains secondary alignments which have to be collapsed by
            # read name prior to the subsampling.
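            # read_index maps read name -> all of that read's alignment offsets, so a
            # single random draw keeps or drops every alignment of a read together.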
            read_index = defaultdict(list)
            for offset in bc[lbc]:
                fin.seek(offset)
                aln = fin.readline().strip().split("\t")
                rname = aln[0]
                read_index[rname].append(offset)

            #
            # now by looping through distinct reads we can dump out only those that are at the specified rate
            for rname in read_index.keys():
                if random.random() > args.samplerate:
                    continue

                # dump this read
                for offset in read_index[rname]:
                    fin.seek(offset)
                    szout += fin.readline()

        else:

            # loop through line offsets for this barcode and append lines to the output string
            for offset in bc[lbc]:
                fin.seek(offset)
                szout += fin.readline()

        # write the file
        with open(cell_file, "w") as fout:
            fout.write(szout)

        # send the file off for bam compression
        file_queue.put(cell_file)

        i += 1

    fin.close()

    sys.stderr.write("\n")
    sys.stderr.write("{} sec\n".format(time() - t0))

    if bam_flag:
        # input was BAM so we can dump the converted file. just putting in
        # some logic to be certain that the original file is not deleted.
        if os.path.isfile(args.fin) and os.path.isfile(sam_name) and (
                sam_name != args.fin):
            os.unlink(sam_name)

    sys.stderr.write("Waiting for child process to finish compressing files\n")

    for p in pool:
        file_queue.put(None)
    file_queue.join()

    for p in pool:
        p.join()

    message("finished!")

    return 0
Example #11
def core(left, right, args):

    ##
    ## create queue and child process for compressing the fastq files
    ##

    bc_len = args.barcode_length
    umi_len = args.umi_length

    tasks = JoinableQueue()
    p = Process(target=gz_worker, args=(tasks, ))
    p.daemon = True
    p.start()

    ##
    ## input files are paired from the sequencer so we just have to read through them
    ## and write them back out
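    ## ('left' (-a) files supply the cell barcode + UMI reads; 'right' (-b) files
    ## supply the reads that are written out. each FASTQ record is 4 lines,
    ## tracked below with the 'nidx' counter.)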

    for i in range(len(left)):

        m1 = left[i]
        m2 = right[i]

        # pick apart the name of the second file to build the output name
        base = basename(m2)
        path = dirname(m2)

        base_parts = base.split(".")
        stub = base_parts[0]

        outfile = "{}_prepped.fastq".format(stub)

        if outfile == m1 or outfile == m2:
            ms.error_message(
                "Output file path matches input file path. WTF? {}".format(
                    outfile))
            sys.exit(1)

        if isfile(outfile):
            ms.warning_message(
                "output file exists. overwriting. {}".format(outfile))

        fout = open(outfile, "w")

        try:
            ms.message("Processing {}".format(m1))
            with open_reads(m1) as fin1, open_reads(m2) as fin2:

                nidx = 0
                nreads = 0
                lread = []
                for szl2 in fin2:

                    if (nreads % 1000000) == 0:
                        ms.progress_message("Parsed {} reads".format(nreads))

                    nidx += 1
                    if nidx == 1:
                        # read name line
                        rname = szl2.strip().split()
                        rname = rname[0]
                        # read two lines from the barcode file
                        szl1 = fin1.readline()
                        szl1 = fin1.readline().strip()
                        # this is the barcode so we can pick it apart. I'm going to put the cell barcode
                        # at the front of the read so that I can maybe leverage samtools sort to sort
                        # barcodes together for me prior to parsing cells out
                        rname_tmp = re.sub(r"^@", "", rname)
                        rname = "@{}:{}".format(szl1[0:bc_len], rname_tmp)
                        # 20180226
                        # moved the cell barcode to the front of the read name so we only
                        # need to write the umi at the end and not both
                        #rname += ":{}:{}".format(szl1[0:16], szl1[16:len(szl1)])
                        rname += ":{}".format(szl1[bc_len:(bc_len + umi_len)])
                        lread.append(rname + "\n")
                        # read the remaining lines for this read from the barcodes file
                        szl1 = fin1.readline()
                        szl1 = fin1.readline()
                    elif nidx < 4:
                        lread.append(szl2)

                    if nidx == 4:
                        # finished with read
                        lread.append(szl2)
                        fout.write("".join(lread))
                        nidx = 0
                        lread = []
                        nreads += 1

            ms.progress_message("Parsed {} reads".format(nreads), last=True)

        except Exception:
            fout.close()
            sys.exit(1)

        fout.close()

        #ms.message("compressing {}".format(outfile))
        #system("gzip -f {}".format(outfile))
        tasks.put(outfile)

    tasks.put(None)
    ms.message("Waiting for gzip compression to complete.")
    tasks.join()
    p.join()

    # done

    return 0
Example #12
def main(args):

    # variables

    dkey_index = defaultdict(list)
    ltable = []
    lheader = []
    idx = 0
    col_idx = args.c - 1
    num_col = []

    if not isfile(args.source_file):
        ms.error_message("Input file does not exist")
        return 1

    ##
    ## open input file and parse it in
    with open(args.source_file, "r") as fin:

        if args.H:
            szl = fin.readline()
            lheader = szl.strip().split("\t")

        for szl in fin:
            # put each row of the file into a list. also keep track of which rows belong to
            # which keys in a dict
            aln = szl.strip().split("\t")
            ltable.append(aln)
            num_col.append(len(aln))
            dkey_index[aln[col_idx]].append(idx)
            idx += 1

    ##
    # check column count for consistency
    if min(num_col) != max(num_col):
        ms.error_message(
            "Column count is not consistent throughout the input file!")
        return 1

    num_col = min(num_col)

    ##
    ## build collapsed version
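    ## e.g. (hypothetical rows) two lines sharing key "k1" with values "a"/"a" in one
    ## column and "x"/"y" in another collapse to the single tab-separated row: k1, a, x,y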
    lout = []
    for kid in sorted(dkey_index.keys()):
        tmp = []
        num_row = len(dkey_index[kid])

        for i in range(num_col):
            if i == col_idx:
                continue

            if num_row > 1:
                # this key has multiple rows so we have to collapse the value
                # from the current column within each row into a single string
                ltmp = []
                for j in dkey_index[kid]:
                    ltmp.append(ltable[j][i])

                # check to see if this field has more than a single level
                set_tmp = set(ltmp)
                if len(set_tmp) == 1:
                    # single level
                    tmp.append(ltmp[0])
                else:
                    # more than one level so keep all of them
                    tmp.append(",".join(ltmp))

            else:
                # no collapse, single row
                tmp.append(ltable[dkey_index[kid][0]][i])

        lout.append("\t".join([kid] + tmp) + "\n")

    if args.H:
        hout = [lheader[col_idx]]
        for i in range(num_col):
            if i != col_idx:
                hout.append(lheader[i])

        sys.stdout.write("\t".join(hout))
        sys.stdout.write("\n")

    for i in range(len(lout)):
        sys.stdout.write(lout[i])

    return 0
Example #13
def core(args):

    annot = {}
    tid2gname = {}

    stub = hashlib.md5(args.fasta.encode()).hexdigest()

    fixed_case = "{}.ref.fa".format(stub)
    #gene_seq = "{}.gene.fa".format(stub)
    gene_shred = "{}.shred.fa".format(stub)
    gene_final = "{}.final.fa".format(stub)
    sub_chars = re.escape(r"[]{}\|/?!@#$%^&*()+=.") + r"\s"

    fset = [fixed_case, "{}.fai".format(fixed_case), gene_shred, gene_final]

    ##
    # parse refflat
    annot, tid2gname = load_refflat(args.ref)

    ##
    # change case of input fasta
    ms.message("Converting all reference bases to uppercase")
    rres = fasta_fix_case(args.fasta, fixed_case)

    if args.just_alignment:
        args.bbmap = True

    if args.just_quantification:
        args.quantify = True

    if not args.just_alignment and not args.just_quantification:

        ##
        # create worker processes to deal with all of the shredding
        tasks = JoinableQueue()
        results = Queue()
        p = None
        pool = []

        for i in range(args.p):
            p = Process(target=worker, args=(
                tasks,
                results,
            ))
            p.daemon = True
            p.start()
            pool.append(p)

        ##
        # get to work!
        i = 0
        n = len(annot.keys())
        ms.message("Starting main loop")
        for gid in annot.keys():
            i += 1
            if (i % 3) == 0:
                ms.progress_message("processing {}. {} of {}".format(
                    gid, i, n))

            gidHat = re.sub("[{}]".format(sub_chars), "", gid)

            # export all sequences belonging to this gene
            gene_seq = "{}.{}.fa".format(stub, gidHat)

            rres = samtools_faidx(fixed_case, gene_seq, annot[gid])
            tasks.put([stub, gidHat])

        for p in pool:
            tasks.put(None)

        ms.progress_message("Waiting for shredding to complete", last=True)

        tasks.join()
        for p in pool:
            p.join()

        ms.message("done")

        results.put(None)

        while True:
            fname = results.get()
            if fname is None:
                break

            if not isfile(fname):
                continue

            ms.message("Joining {}".format(fname))

            rres = cat_result(fname, args.o)
            unlink(fname)

    # done!

    bam_out = re.sub(r"\.fasta$", ".bam", args.o)

    if args.bbmap:
        if not isfile(args.o):
            if not isfile("{}.gz".format(args.o)):
                ms.error_message(
                    "Shredded reads file doest not exist, gzipped or not ({})".
                    format(args.o))
                return 1
            else:
                # gunzip the reads file
                runcmd("gunzip {}.gz".format(args.o))

        ms.message("Aligning shreds back against the reference")

        rres = bbmap(args.o, fixed_case, bam_out, args.t)

    if args.quantify:
        if not isfile(bam_out):
            ms.error_message(
                "Expected alignment file does not exist ({})".format(bam_out))
            return 1

        ms.message("Parsing alignments")
        pares = parse_alignments(bam_out, tid2gname)
        rres = process_pares(pares)

        tsvout = re.sub(r"\.bam$", ".tsv", bam_out)

        # output:
        # gene_name, total_reads, min_mapp, max_mapp, mean_mapp, most_similar, all_genes, all_gene_counts
        ms.message("Writing mappability report")
        with open(tsvout, "w") as fout:

            fout.write("#read_length={}\n".format(args.l))
            fout.write("\t".join([
                "#gene_name", "total_reads", "min_mapp", "max_mapp",
                "mean_mapp", "most_similar", "all_targets", "target_counts"
            ]) + "\n")

            for gname in sorted(rres.keys()):

                total_reads = rres[gname]['total_reads']
                if total_reads == 0:
                    ms.warning_message("{} had zero reads".format(gname))
                min_mapp = 1
                max_mapp = 1
                mean_mapp = 1
                most_similar = "na"
                all_genes = "na"
                all_gene_counts = "na"

                if len(rres[gname]['target']) > 0:
                    mapp = []
                    for n in rres[gname]['target_count']:
                        if total_reads > 0:
                            mapp.append(1 - n * 1.0 / total_reads)
                        else:
                            mapp.append(0)

                    min_mapp = min(mapp)
                    max_mapp = max(mapp)
                    mean_mapp = np.mean(mapp)

                    for i in range(len(mapp)):
                        if mapp[i] == min_mapp:
                            most_similar = rres[gname]['target'][i]

                    all_genes = ",".join(rres[gname]['target'])
                    all_gene_counts = ",".join(
                        map(str, rres[gname]['target_count']))

                lout = [
                    gname, total_reads, min_mapp, max_mapp, mean_mapp,
                    most_similar, all_genes, all_gene_counts
                ]
                fout.write("\t".join(map(str, lout)) + "\n")

    if not args.just_quantification and not args.just_alignment:
        if args.z:
            if isfile("{}.gz".format(args.o)):
                unlink("{}.gz".format(args.o))

            cmd = "gzip {}".format(args.o)
            runcmd(cmd)

    #
    # clear out temp files
    for fname in fset:
        if isfile(fname):
            unlink(fname)

    return 0
Example #14
def core(args):

    sam0 = ["", "4", "*", "0", "0", "*", "*", "0", "0", "", ""]
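    # sam0 holds the 11 mandatory SAM fields (QNAME, FLAG, RNAME, POS, MAPQ, CIGAR,
    # RNEXT, PNEXT, TLEN, SEQ, QUAL) for an unmapped record; FLAG 4 = unmapped.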
    linen = 0
    rnum = 0
    tmpname = hashlib.md5(args.fastq.encode()).hexdigest()
    samout = "@HD\tVN:1.0\tSO:unsorted\n"

    # open input file
    if re.search(r"\.gz$", args.fastq):
        fin = gzip.open(args.fastq, "r")
    else:
        fin = open(args.fastq, "r")

    # open SAM output
    fout = open("{}.sam".format(tmpname), "w")
    fout.write(samout)

    t0 = time()

    for szl in fin:
        linen += 1
        if linen == 1:
            # read name
            rname = szl.strip()
        elif linen == 2:
            # read
            seq = szl.strip()
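        # (linen == 3 is the FASTQ "+" separator line and is skipped)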
        elif linen == 4:
            qual = szl.strip()
            # get the sam alignment read for writing
            sam = list(sam0)
            sam[SAM_QNAME] = re.sub(r"^@", "", rname)
            sam[SAM_SEQ] = seq
            sam[SAM_QUAL] = qual
            # write line out to file
            fout.write("\t".join(sam))
            fout.write("\n")
            # reset line counter
            linen = 0
            rnum += 1

        if rnum > 0 and (rnum % 1000000) == 0:
            ms.progress_message("parsed {} reads".format(rnum))

    ms.progress_message("parsed {} reads".format(rnum), last=True)

    ms.time_diff(t0)

    fout.close()
    fin.close()

    ms.message("Converting to BAM")
    t0 = time()

    t = args.t
    if t == 0:
        t = cpu_count() // 2
    elif t > cpu_count():
        t = cpu_count()

    cmd = "samtools view -bS -@ {} -o {} {}.sam".format(t, args.bam, tmpname)
    rres = utils.runcmd(cmd)
    if rres[0] != 0:
        ms.error_message("Failed to create BAM file!")
        return 1

    ms.time_diff(t0)

    unlink("{}.sam".format(tmpname))

    return 0