예제 #1
0
 def start(self, fastq_file1, fastq_file2, output_prefix, samplesFile, batchsize=10000, uncompressed=False, output_unidentified=False, verbose=True, debug=False):
     """
     Split a double barcoded Illumina sequencing run by project.

     Reads are pulled from the run in batches, each read is assigned
     against the sample table, and reads flagged as good are added to the
     output of their project. Reads that are not flagged as good are
     counted and, when requested, written to a separate output.

     Args:
         fastq_file1: first read file, passed to TwoReadIlluminaRun.
         fastq_file2: second read file, passed to TwoReadIlluminaRun.
         output_prefix: directory under which one output per project is
             created.
         samplesFile: sample sheet, loaded with sampleTable.
         batchsize: number of reads requested from the run per batch.
         uncompressed: forwarded to IlluminaTwoReadOutput.
         output_unidentified: when true, reads that are not flagged as good
             are written to the 'UnidentifiedProject' output.
         verbose: when true, progress and summary messages are written.
         debug: when true, the traceback of a fatal error is written to
             stderr instead of the generic message.

     Returns:
         0 on success, 1 when interrupted or when any error occurred.
     """
     self.verbose = verbose

     def percent(part, whole):
         # A run that yielded no reads must not abort the summary with a
         # ZeroDivisionError, so report 0% in that case.
         if not whole:
             return 0.0
         return (float(part) / float(whole)) * 100

     try:
         # read in the sample sheet
         sTable = sampleTable(samplesFile)
         if self.verbose:
             sys.stdout.write("sample table length: %s, and %s projects.\n" % (sTable.getSampleNumber(), len(sTable.getProjectList())))
         # setup output files, one per project
         identified_count = 0
         unidentified_count = 0
         self.run_out = {}
         for project in sTable.getProjectList():
             self.run_out[project] = IlluminaTwoReadOutput(os.path.join(output_prefix, project), uncompressed)
         if output_unidentified:
             self.run_out["Unidentified"] = IlluminaTwoReadOutput(os.path.join(output_prefix, 'UnidentifiedProject'), uncompressed)
         # establish and open the Illumina run
         self.run = TwoReadIlluminaRun(fastq_file1, fastq_file2)
         self.run.open()
         lasttime = time.time()
         while True:
             # get next batch of reads; an empty batch ends the run
             reads = self.run.next(batchsize)
             if len(reads) == 0:
                 break
             # process individual reads
             for read in reads:
                 read.assignRead(sTable)  # barcode
                 if read.goodRead is True:
                     self.run_out[read.project].addRead(read.getFastq())
                     identified_count += 1
                 else:
                     unidentified_count += 1
                     if output_unidentified:
                         self.run_out["Unidentified"].addRead(read.getFastq())
             # Write out reads
             for key in self.run_out:
                 self.run_out[key].writeReads()
             if self.verbose:
                 total = self.run.count()
                 elapsed = time.time() - lasttime
                 # guard against a zero interval on coarse clocks
                 rate = round(total / elapsed, 0) if elapsed > 0 else 0.0
                 sys.stderr.write("processed %s total reads, %s Reads/second, %s identified reads, %s unidentified reads (%s%%)\n" % (total, rate, identified_count, unidentified_count, round(percent(identified_count, total))))
         total = self.run.count()
         if self.verbose:
             sys.stdout.write("%s reads processed in %s minutes, %s (%s%%) identified\n\n" % (total, round((time.time()-lasttime)/(60), 2), identified_count, round(percent(identified_count, total), 1)))
         # per-project summary (written regardless of the verbose flag)
         for key in self.run_out:
             sys.stdout.write("%s (%s%%)\treads found for project\t%s\n" % (self.run_out[key].count(), round(percent(self.run_out[key].count(), total), 1), key))
         self.clean()
         return 0
     except (KeyboardInterrupt, SystemExit):
         self.clean()
         sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
         return 1
     except Exception:
         self.clean()
         if debug:
             sys.stderr.write("".join(traceback.format_exception(*sys.exc_info())))
         else:
             sys.stderr.write("A fatal error was encountered. trying turning on debug\n")
         return 1
예제 #2
0
    def start(self,
              barcodesFile,
              primerFile,
              samplesFile,
              verbose=True,
              debug=False):
        """
        Validate the barcode, primer and sample sheet input files.

        The barcode table is always loaded. The primer table is loaded and
        checked with validatePrimer only when primerFile is not None. The
        sample sheet is then loaded and checked with validateSample against
        the barcode table and the primer table (or None).

        Returns 0 when both checks report 0, otherwise 1. Also returns 1
        when interrupted or when any error occurred.
        """
        self.verbose = verbose
        try:
            # barcode sequences
            barcodes = barcodeTable(barcodesFile)
            if self.verbose:
                sys.stdout.write("barcode table length: %s\n" %
                                 barcodes.getLength())

            # primer sequences are optional; without them the primer check
            # counts as passed
            primers = None
            primer_status = 0
            if primerFile is not None:
                primers = primerTable(primerFile)
                if verbose:
                    p5_count = len(primers.getP5sequences())
                    p7_count = len(primers.getP7sequences())
                    sys.stdout.write(
                        "primer table length P5 Primer Sequences:%s, P7 Primer Sequences:%s\n"
                        % (p5_count, p7_count))
                primer_status = self.validatePrimer(primers, debug)

            # sample sheet
            samples = sampleTable(samplesFile)
            if verbose:
                sample_count = samples.getSampleNumber()
                project_count = len(samples.getProjectList())
                sys.stdout.write(
                    "sample table length: %s, and %s projects.\n" %
                    (sample_count, project_count))
            sample_status = self.validateSample(barcodes, primers, samples,
                                                debug)

            if primer_status != 0 or sample_status != 0:
                sys.stderr.write("Failed validation\n")
                return 1
            sys.stderr.write("Validation confirmed, files are ok\n")
            return 0

        except (KeyboardInterrupt, SystemExit):
            self.clean()
            sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
            return 1

        except Exception:
            self.clean()
            if debug:
                sys.stderr.write(traceback.format_exc())
            else:
                sys.stderr.write(
                    "A fatal error was encountered. trying turning on debug\n")
            return 1
예제 #3
0
    def start(self, barcodesFile, primerFile, samplesFile, verbose=True, debug=False):
        """
        Check that the barcode, primer and sample sheet files are usable.

        Loads the barcode table, the primer table (only when primerFile is
        given) and the sample sheet, then runs validatePrimer and
        validateSample on them. Returns 0 when every check returned 0 and 1
        otherwise, including when interrupted or when an error occurred.
        """
        self.verbose = verbose
        try:
            # barcode table
            barcode_tbl = barcodeTable(barcodesFile)
            if self.verbose:
                sys.stdout.write("barcode table length: %s\n" % barcode_tbl.getLength())

            # optional primer table; a missing primer file counts as valid
            if primerFile is None:
                primer_tbl, primer_rc = None, 0
            else:
                primer_tbl = primerTable(primerFile)
                if verbose:
                    lengths = (len(primer_tbl.getP5sequences()), len(primer_tbl.getP7sequences()))
                    sys.stdout.write("primer table length P5 Primer Sequences:%s, P7 Primer Sequences:%s\n" % lengths)
                primer_rc = self.validatePrimer(primer_tbl, debug)

            # sample sheet
            sample_tbl = sampleTable(samplesFile)
            if verbose:
                sizes = (sample_tbl.getSampleNumber(), len(sample_tbl.getProjectList()))
                sys.stdout.write("sample table length: %s, and %s projects.\n" % sizes)
            sample_rc = self.validateSample(barcode_tbl, primer_tbl, sample_tbl, debug)

            passed = primer_rc == 0 and sample_rc == 0
            if passed:
                sys.stderr.write("Validation confirmed, files are ok\n")
            else:
                sys.stderr.write("Failed validation\n")
            return 0 if passed else 1

        except (KeyboardInterrupt, SystemExit):
            self.clean()
            sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
            return 1

        except Exception:
            self.clean()
            if debug:
                exc_type, exc_value, exc_tb = sys.exc_info()
                sys.stderr.write("".join(traceback.format_exception(exc_type, exc_value, exc_tb)))
            else:
                sys.stderr.write("A fatal error was encountered. trying turning on debug\n")
            return 1
예제 #4
0
    def start(self,
              fastq_file1,
              fastq_file2,
              fastq_file3,
              fastq_file4,
              output_prefix,
              barcodesFile,
              primerFile,
              samplesFile,
              barcodeMaxDiff=1,
              I1rc=True,
              I2rc=False,
              dedup_float=4,
              primerMaxDiff=4,
              primerEndMatch=4,
              batchsize=10000,
              uncompressed=False,
              output_unidentified=False,
              minQ=None,
              minL=0,
              verbose=True,
              debug=False,
              kprimer=False,
              test=False):
        """
        Preprocess a double barcoded Illumina sequencing run.

        Reads are pulled from a four-file run in batches. Each read is
        assigned a barcode, then optionally a primer and a sample, and is
        optionally trimmed. Reads flagged as good are written to the output
        of their project (or to a single output when no sample sheet is
        given). A barcode-by-primer count table is written at the end.

        Args:
            fastq_file1, fastq_file2, fastq_file3, fastq_file4: the four
                read files, passed to FourReadIlluminaRun.
            output_prefix: output directory when a sample sheet is given,
                otherwise a file name prefix.
            barcodesFile: barcode file, loaded with barcodeTable.
            primerFile: primer file, or None to skip primer assignment.
            samplesFile: sample sheet, or None to skip sample assignment.
            barcodeMaxDiff: forwarded to read.assignBarcode.
            I1rc, I2rc: forwarded to barcodeTable.
            dedup_float, primerMaxDiff, primerEndMatch: forwarded to
                read.assignPrimer.
            batchsize: number of reads requested from the run per batch.
            uncompressed: forwarded to IlluminaTwoReadOutput.
            output_unidentified: when true, reads not flagged as good are
                written to a separate output.
            minQ, minL: when minQ is not None, good reads are passed to
                read.trimRead(minQ, minL).
            verbose: when true, progress and summary messages are written.
            debug: forwarded to the validators; when true, the traceback of
                a fatal error is written to stderr.
            kprimer: forwarded to read.getFastq for identified reads.
            test: when true, stop after the first batch.

        Returns:
            0 on success, 1 on failed validation, interruption or error.
        """
        self.verbose = verbose
        evalPrimer = primerFile is not None
        evalSample = samplesFile is not None
        # Handle of the barcode count table; defined before the try block
        # so the ``finally`` clause can close it on every exit path.
        bcFile = None

        def percent(part, whole):
            # Percentage rounded to one decimal; 0.0 for an empty run
            # instead of a ZeroDivisionError.
            if not whole:
                return 0.0
            return round((float(part) / float(whole)) * 100, 1)

        def table_row(label, counts):
            # One tab separated line of the barcode table for ``counts``.
            if evalPrimer:
                fields = [str(counts[pr]) for pr in prTable.getPrimers()]
                fields.append(str(counts['-']))
            else:
                fields = [str(counts["Total"])]
            return '\t'.join([label] + fields) + '\n'

        try:
            v = validateApp()
            # read in barcode sequences
            bcTable = barcodeTable(barcodesFile, I1rc, I2rc)
            if self.verbose:
                sys.stdout.write("barcode table length: %s\n" %
                                 bcTable.getLength())
            # read in primer sequences if present
            if evalPrimer:
                prTable = primerTable(primerFile)
                if verbose:
                    sys.stdout.write(
                        "primer table length P5 Primer Sequences:%s, P7 Primer Sequences:%s\n"
                        % (len(prTable.getP5sequences()),
                           len(prTable.getP7sequences())))
                if v.validatePrimer(prTable, debug) != 0:
                    sys.stderr.write("Failed validation\n")
                    self.clean()
                    return 1
            else:
                prTable = None
            # read in the sample sheet if present
            if evalSample:
                sTable = sampleTable(samplesFile)
                if verbose:
                    sys.stdout.write(
                        "sample table length: %s, and %s projects.\n" %
                        (sTable.getSampleNumber(),
                         len(sTable.getProjectList())))
                if v.validateSample(bcTable, prTable, sTable, debug) != 0:
                    sys.stderr.write("Failed validation\n")
                    self.clean()
                    return 1

            # output table
            if evalSample:
                bctable_name = os.path.join(output_prefix,
                                            'Identified_Barcodes.txt')
            else:
                bctable_name = output_prefix + '_Identified_Barcodes.txt'
            try:
                misc.make_sure_path_exists(os.path.dirname(bctable_name))
                bcFile = open(bctable_name, 'w')
            except Exception:
                sys.stderr.write("ERROR: Can't open file %s for writing\n" %
                                 bctable_name)
                raise

            # barcode key -> {primer key or "Total" -> read count};
            # '-' stands for "no barcode" / "no primer"
            barcode_counts = {}
            bcsuccesscount = 0
            prsuccesscount = 0
            sampsuccesscount = 0
            trimsuccesscount = 0
            identified_count = 0
            # setup output files
            self.run_out = {}
            if evalSample:
                for project in sTable.getProjectList():
                    self.run_out[project] = IlluminaTwoReadOutput(
                        os.path.join(output_prefix, project), uncompressed)
            else:
                self.run_out["Identified"] = IlluminaTwoReadOutput(
                    output_prefix, uncompressed)
            if output_unidentified:
                if evalSample:
                    self.run_out["Unidentified"] = IlluminaTwoReadOutput(
                        os.path.join(output_prefix, 'UnidentifiedProject'),
                        uncompressed)
                else:
                    self.run_out["Unidentified"] = IlluminaTwoReadOutput(
                        output_prefix + "_Unidentified", uncompressed)
            # establish and open the Illumina run
            self.run = FourReadIlluminaRun(fastq_file1, fastq_file2,
                                           fastq_file3, fastq_file4)
            self.run.open()
            totaltime = time.time()
            while True:
                lasttime = time.time()
                # get next batch of reads; an empty batch ends the run
                reads = self.run.next(batchsize)
                if len(reads) == 0:
                    break
                # process individual reads
                for read in reads:
                    bcsuccesscount += read.assignBarcode(
                        bcTable, barcodeMaxDiff)  # barcode
                    if evalPrimer:  # primer
                        prsuccesscount += read.assignPrimer(
                            prTable, dedup_float, primerMaxDiff,
                            primerEndMatch)
                    if evalSample:  # sample
                        sampsuccesscount += read.assignRead(
                            sTable)  # barcode + primer
                    if minQ is not None and read.goodRead:
                        trimsuccesscount += read.trimRead(minQ, minL)
                    if read.goodRead is True:
                        identified_count += 1
                        if evalSample:
                            self.run_out[read.getProject()].addRead(
                                read.getFastq(kprimer))
                        else:
                            self.run_out["Identified"].addRead(
                                read.getFastq(kprimer))
                    elif output_unidentified:
                        self.run_out["Unidentified"].addRead(
                            read.getFastq(True))
                    ###############################################
                    # Record data for final barcode table
                    barcode = read.getBarcode()
                    bc_key = '-' if barcode is None else barcode
                    counts = barcode_counts.get(bc_key)
                    if counts is None:
                        # setup blank primer count table for the new barcode
                        counts = barcode_counts[bc_key] = {}
                        if evalPrimer:
                            for pr in prTable.getPrimers():
                                counts[pr] = 0
                            counts['-'] = 0
                        else:
                            counts["Total"] = 0
                    if evalPrimer:
                        primer = read.getPrimer()
                        pr_key = '-' if primer is None else primer
                    else:
                        pr_key = "Total"
                    counts[pr_key] = counts.get(pr_key, 0) + 1

                # Write out reads
                for key in self.run_out:
                    self.run_out[key].writeReads()
                if self.verbose:
                    elapsed = time.time() - lasttime
                    # rate of this batch; the last batch may be shorter
                    # than batchsize, so use its real length
                    if elapsed > 0:
                        rate = round(len(reads) / elapsed, 0)
                    else:
                        rate = 0.0
                    sys.stderr.write(
                        "processed %s total reads, %s Reads/second, %s identified reads(%s%%), %s unidentified reads\n"
                        % (self.run.count(), rate, identified_count,
                           percent(identified_count, self.run.count()),
                           self.run.count() - identified_count))
                if test:  # exit after the first batch to test the inputs
                    break

            total = self.run.count()
            if self.verbose:
                sys.stdout.write(
                    "%s reads processed in %s minutes, %s (%s%%) identified\n\n"
                    % (total, round((time.time() - totaltime) / (60), 2),
                       identified_count, percent(identified_count, total)))
            # Write out barcode and primer table
            if identified_count > 0:
                # write out header line
                if evalPrimer:
                    txt = 'Barcode\t' + '\t'.join(
                        prTable.getPrimers()) + '\tNone' + '\n'
                else:
                    txt = 'Barcode\tTotal\n'
                bcFile.write(txt)
                # rows follow the barcode table order; unseen barcodes
                # are left out
                for bc in bcTable.getBarcodes():
                    if bc in barcode_counts:
                        bcFile.write(table_row(str(bc), barcode_counts[bc]))
                # reads without a barcode go last
                if '-' in barcode_counts:
                    bcFile.write(table_row('None', barcode_counts['-']))

            # write out project table
            sys.stdout.write(
                "%s reads (%s%% of total run) successfully identified barcode\n"
                % (bcsuccesscount, percent(bcsuccesscount, total)))
            if evalPrimer:  # primer
                sys.stdout.write(
                    "%s reads (%s%% of total run) successfully identified barcode and primer\n"
                    % (prsuccesscount, percent(prsuccesscount, total)))
            if evalSample:  # sample
                sys.stdout.write(
                    "%s reads (%s%% of total run) successfully assigned to sample\n"
                    % (sampsuccesscount, percent(sampsuccesscount, total)))
            if minQ is not None:
                sys.stdout.write(
                    "%s reads (%s%% of total run) successfully pass trimming criteria\n"
                    % (trimsuccesscount, percent(trimsuccesscount, total)))

            sys.stdout.write("%s reads (%s%% of total run) unidentified\n\n" %
                             (total - identified_count,
                              percent(total - identified_count, total)))

            if evalSample and self.verbose:
                for key in self.run_out:
                    sys.stdout.write(
                        "%s reads (%s%% of total run) found for project\t%s\n"
                        % (self.run_out[key].count(),
                           percent(self.run_out[key].count(), total), key))
            self.clean()
            return 0
        except (KeyboardInterrupt, SystemExit):
            self.clean()
            sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
            return 1
        except Exception:
            self.clean()
            if debug:
                sys.stderr.write("".join(
                    traceback.format_exception(*sys.exc_info())))
            else:
                sys.stderr.write(
                    "A fatal error was encountered. trying turning on debug\n")
            return 1
        finally:
            # make sure the barcode table is flushed and released
            if bcFile is not None:
                bcFile.close()
예제 #5
0
    def start(self, fixrank_file, samplesFile, output_prefix='table', rank='genus', threshold=0.5, minsize=None, maxsize=None, biom=False, hdf5=False, verbose=True, debug=False):
        """
        Build abundance tables from classification fixrank files.

        Every line of every matched fixrank file is parsed with fixrankLine,
        optionally filtered by size and assigned to a sample, and the
        accepted lines are counted per taxon and sample. The results are
        written to '<output_prefix>.abundance.txt',
        '<output_prefix>.proportions.txt' and
        '<output_prefix>.taxa_info.txt', and to '<output_prefix>.biom' when
        biom or hdf5 is requested.

        Args:
            fixrank_file: iterable of glob patterns naming fixrank files.
            samplesFile: sample sheet, or None to skip sample assignment.
            output_prefix: prefix of every output file name.
            rank, threshold: forwarded to fixrankLine.
            minsize, maxsize: when set, lines whose size is not "PAIR" and
                falls outside the limits are discarded.
            biom: when true, write a json formatted biom file.
            hdf5: when true, write an hdf5 formatted biom file, falling
                back to json when h5py cannot be imported.
            verbose: when true, progress messages are written.
            debug: when true, the traceback of a fatal error is written to
                stderr.

        Returns:
            0 on success, 1 when interrupted or when any error occurred.

        Raises:
            ImportError: when biom output is requested and the biom module
                cannot be imported.
        """
        self.verbose = verbose
        evalSample = samplesFile is not None
        if biom or hdf5:
            try:
                # NOTE: this rebinds the local name ``biom`` (the boolean
                # parameter) to the imported module; the module object is
                # truthy, which the later ``if biom or hdf5`` relies on.
                import biom
            except ImportError:
                sys.stderr.write("Cannot import python biom module")
                raise
        # Output handles; defined before the try block so the ``finally``
        # clause can close whichever of them were opened.
        abFile = propFile = cntFile = None
        try:
            lines = 0
            lasttime = time.time()
            self.ffixrank = []
            # samples
            if evalSample:
                sTable = sampleTable(samplesFile)
                if verbose:
                    sys.stdout.write("sample table length: %s, and %s projects.\n" % (sTable.getSampleNumber(), len(sTable.getProjectList())))

            # check input fixrank files
            for ffile in fixrank_file:
                self.ffixrank.extend(glob.glob(ffile))
                if len(self.ffixrank) == 0 or not all(os.path.isfile(f) for f in self.ffixrank):
                    sys.stderr.write('ERROR:[abundance_app] fixrank file(s) not found\n')
                    # an explicit exception instead of a bare ``raise``,
                    # which has no active exception to re-raise here
                    raise IOError('fixrank file(s) not found')

            abundanceTable = dict()  # taxon -> Counter(sample -> reads)
            bootscore = dict()       # taxon -> summed bootstrap values
            tax_len = dict()         # taxon -> {"PAIR": n, "SINGLE": summed size}
            primers = dict()         # sample -> list of primers seen
            tax_level_counts = OrderedDict()
            for level in ('domain', 'phylum', 'class', 'order', 'family', 'genus', 'species', 'isolate'):
                tax_level_counts[level] = 0
            sampleList = []
            sampleCounts = Counter()
            discardedReads = 0
            for ffile in self.ffixrank:
                # NOTE(review): the file is opened in binary mode but each
                # line is stripped with a text newline; that only works
                # where bytes and str are the same type (Python 2).
                # Confirm the target interpreter.
                with open(ffile, "rb") as infile:
                    for line in infile:
                        lrank = fixrankLine(line.rstrip('\n'), rank, threshold)
                        if (minsize is not None or maxsize is not None) and lrank.getSize() != "PAIR":
                            if minsize is not None and int(lrank.getSize()) < minsize:
                                discardedReads += 1
                                continue
                            if maxsize is not None and int(lrank.getSize()) > maxsize:
                                discardedReads += 1
                                continue
                        if evalSample:
                            lrank.assignRead(sTable)
                        if lrank.isOk():
                            level = lrank.getLevel()
                            sample_id = lrank.getSampleID()
                            primer = lrank.getPrimer()
                            call = lrank.getCall()
                            size = lrank.getSize()
                            if level not in tax_level_counts:
                                tax_level_counts[level] = 0
                            tax_level_counts[level] += 1
                            # ``primers`` has exactly the samples of
                            # ``sampleList`` as keys, so test the dict
                            if sample_id not in primers:
                                sampleList.append(sample_id)
                                primers[sample_id] = []
                            sampleCounts[sample_id] += 1
                            if primer is not None and primer not in primers[sample_id]:
                                primers[sample_id].append(primer)
                            if call in abundanceTable:
                                bootscore[call] += lrank.getBootstrap()
                            else:
                                abundanceTable[call] = Counter()
                                bootscore[call] = lrank.getBootstrap()
                                tax_len[call] = {"PAIR": 0, "SINGLE": 0}
                            abundanceTable[call][sample_id] += 1
                            if size == "PAIR":
                                tax_len[call]["PAIR"] += 1
                            else:
                                tax_len[call]["SINGLE"] += int(size)
                        lines += 1
                        if lines % 100000 == 0 and self.verbose:
                            sys.stderr.write("processed %s total lines, %s lines/second\n" % (lines, round(lines / (time.time() - lasttime), 0)))
            sys.stdout.write("%s lines processed in %s minutes\n" % (lines, round((time.time() - lasttime) / (60), 2)))

            sys.stdout.write("Classification numbers (reads):\n")
            for level in tax_level_counts:
                sys.stdout.write("   %s:\t%i\n" % (level, tax_level_counts[level]))
            if discardedReads > 0:
                sys.stdout.write("discarded %s reads for size\n" % str(discardedReads))
            if self.verbose:
                sys.stderr.write("Writing output\n")

            def calc_single_len(single, pairs, total):
                # mean size over the non-paired reads; 0 when all are pairs
                try:
                    return single / (total - pairs)
                except ZeroDivisionError:
                    return 0

            # samples and taxa in case-insensitive order, shared by every
            # output below
            sampleList = sorted(sampleList, key=lambda s: s.lower())
            taxa_keys = sorted(abundanceTable.keys(), key=lambda s: s.lower())
            # total reads per taxon across all samples
            taxa_totals = {v: sum(abundanceTable[v].values()) for v in taxa_keys}

            # output files
            # biom format
            if biom or hdf5:
                # generate table formats
                data = []
                obs_ids = []
                sampleList_md = [{'primers': ";".join(primers[v])} for v in sampleList]
                if evalSample:
                    if sTable.hasMetadata() is True:
                        for i, v in enumerate(sampleList):
                            sampleList_md[i].update(sTable.sampleMetadata[v]["Metadata"])

                # taxonomic metadata
                # NOTE(review): the divisions below are integer divisions
                # when both operands are ints and true division is not
                # enabled. Confirm the intended arithmetic.
                mbootscore = {v: round(bootscore[v] / taxa_totals[v], 3) for v in taxa_keys}
                mtax_len_s = {v: calc_single_len(tax_len[v]["SINGLE"], tax_len[v]["PAIR"], taxa_totals[v]) for v in taxa_keys}
                mtax_len_p = {v: round(tax_len[v]["PAIR"] / (taxa_totals[v]), 3) for v in taxa_keys}

                taxa_keys_md = [{'taxonomy': v.split(';'), 'mean_rdp_bootstrap_value': mbootscore[v], 'mean_sequence_length_single': mtax_len_s[v], 'percentage_paired': mtax_len_p[v]} for v in taxa_keys]

                # build the data object
                for i, taxa in enumerate(taxa_keys):
                    obs_ids.append("Taxa_%05d" % (i))
                    data.append([float(abundanceTable[taxa][sample]) for sample in sampleList])

                # build the biom Table object
                biomT = biom.Table(data=data, observation_ids=obs_ids, sample_ids=sampleList,
                                   observation_metadata=taxa_keys_md, sample_metadata=sampleList_md,
                                   input_is_dense=True,
                                   table_id=None, type="OTU table", create_date=None, generated_by="dbcAmplicons",
                                   observation_group_metadata=None, sample_group_metadata=None)

                if hdf5:
                    try:
                        import h5py
                        sys.stderr.write("Writing hd5 formatted biom file to: %s\n" % (output_prefix + '.biom'))
                        with h5py.File(output_prefix + '.biom', 'w') as f:
                            biomT.to_hdf5(f, "dbcAmplicons")
                    except ImportError:
                        # without h5py fall back to the json representation
                        sys.stderr.write("h5py Import Error: Writing json formatted biom file to: %s\n" % (output_prefix + '.biom'))
                        with open(output_prefix + '.biom', 'w') as f:
                            f.write(biomT.to_json("dbcAmplicons"))
                else:
                    sys.stderr.write("Writing json formatted biom file to: %s\n" % (output_prefix + '.biom'))
                    with open(output_prefix + '.biom', 'w') as f:
                        f.write(biomT.to_json("dbcAmplicons"))

            # abundance and proportions tables
            ab_name = output_prefix + '.abundance.txt'
            prop_name = output_prefix + '.proportions.txt'
            try:
                abFile = open(ab_name, 'w')
                propFile = open(prop_name, 'w')
            except Exception:
                sys.stderr.write("ERROR:[abundance_app] Can't open files (%s,%s) for writing\n" % (ab_name, prop_name))
                # nothing can be written without the files; let the outer
                # handler report the failure
                raise
            sys.stderr.write("Writing abundance file to: %s\n" % (ab_name))
            sys.stderr.write("Writing proportions file to: %s\n" % (prop_name))
            # write out header line
            txt = 'Taxon_Name\tLevel\t' + '\t'.join(sampleList) + '\n'
            abFile.write(txt)
            propFile.write(txt)

            levels = {'d': 'domain', 'p': 'phylum', 'c': 'class', 'o': 'order', 'f': 'family', 'g': 'genus', 's': 'species', 'i': 'isolate'}
            for taxa in taxa_keys:
                # the last ';' separated element is split on '__' into a
                # one-letter level code and the taxon name
                tmp = taxa.split(";")[-1].split("__")

                txt1 = txt2 = tmp[1] + '\t' + levels[tmp[0]]
                for sample in sampleList:
                    txt1 = '\t'.join([txt1, str(abundanceTable[taxa][sample])])
                    if sampleCounts[sample] > 0:
                        txt2 = '\t'.join([txt2, str(float(abundanceTable[taxa][sample]) / float(sampleCounts[sample]))])
                    else:
                        txt2 = '\t'.join([txt2, str(0.0)])
                abFile.write(txt1 + '\n')
                propFile.write(txt2 + '\n')
            # the per-sample totals go to the proportions file only
            txt = "Sample Counts\tNA"
            for sample in sampleList:
                txt = '\t'.join([txt, str(sampleCounts[sample])])
            propFile.write(txt + '\n')

            # output total counts and info (across all samples) for each taxa
            cntFile = open(output_prefix + '.taxa_info.txt', 'w')
            cntFile.write("Taxon_Name\tMeanBootstrapValue\tMeanLengthMerged\tPercentageAsPairs\tTotal\n")

            for abt in taxa_keys:
                cntFile.write(str(abt) + '\t' +
                              str(round(bootscore[abt] / taxa_totals[abt], 3)) + '\t' +
                              str(calc_single_len(tax_len[abt]["SINGLE"], tax_len[abt]["PAIR"], taxa_totals[abt])) + '\t' +
                              str(round(tax_len[abt]["PAIR"] / (taxa_totals[abt]), 3)) + '\t' +
                              str(taxa_totals[abt]) + '\n')
            if self.verbose:
                sys.stderr.write("finished in %s minutes\n" % (round((time.time() - lasttime) / (60), 2)))
            self.clean()
            return 0
        except (KeyboardInterrupt, SystemExit):
            self.clean()
            sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
            return 1
        except Exception:
            self.clean()
            sys.stderr.write("A fatal error was encountered.\n")
            if debug:
                sys.stderr.write("".join(traceback.format_exception(*sys.exc_info())))
            return 1
        finally:
            # flush and release whichever output tables were opened
            for handle in (abFile, propFile, cntFile):
                if handle is not None:
                    handle.close()
예제 #6
0
 def start(self,
           fastq_file1,
           fastq_file2,
           output_prefix,
           samplesFile,
           batchsize=10000,
           uncompressed=False,
           output_unidentified=False,
           verbose=True,
           debug=False):
     """
     Split a double barcoded Illumina sequencing run by project.

     Reads are pulled from the run in batches and assigned against the
     sample table.  Every read flagged as a good read is added to the output
     of its project; the remaining reads are only counted, unless
     ``output_unidentified`` is set, in which case they are added to the
     'UnidentifiedProject' output.

     Args:
         fastq_file1, fastq_file2: inputs handed to TwoReadIlluminaRun.
         output_prefix: path prefix joined with each project name.
         samplesFile: input handed to sampleTable.
         batchsize: number of reads requested from the run per batch.
         uncompressed: passed through to IlluminaTwoReadOutput.
         output_unidentified: also write the reads that are not good reads.
         verbose: emit progress and summary messages.
         debug: write the traceback when an unexpected error occurs.

     Returns:
         0 on success, 1 when interrupted or when any other error occurs.
     """
     self.verbose = verbose

     def percent(part, total, ndigits=None):
         # Share of `part` in `total` as a percentage.  An empty run
         # (total == 0) reports 0.0 instead of raising ZeroDivisionError,
         # which used to turn an empty input into a "fatal error".
         if not total:
             return 0.0
         value = (float(part) / float(total)) * 100
         if ndigits is None:
             return round(value)
         return round(value, ndigits)

     try:
         # load the sample table
         sTable = sampleTable(samplesFile)
         if self.verbose:
             sys.stdout.write(
                 "sample table length: %s, and %s projects.\n" %
                 (sTable.getSampleNumber(), len(sTable.getProjectList())))
         # setup output files, one per project
         identified_count = 0
         unidentified_count = 0
         self.run_out = {}
         for project in sTable.getProjectList():
             self.run_out[project] = IlluminaTwoReadOutput(
                 os.path.join(output_prefix, project), uncompressed)
         if output_unidentified:
             self.run_out["Unidentified"] = IlluminaTwoReadOutput(
                 os.path.join(output_prefix, 'UnidentifiedProject'),
                 uncompressed)
         # establish and open the Illumina run
         self.run = TwoReadIlluminaRun(fastq_file1, fastq_file2)
         self.run.open()
         lasttime = time.time()
         while True:
             # get next batch of reads; an empty batch ends the run
             reads = self.run.next(batchsize)
             if len(reads) == 0:
                 break
             # process individual reads
             for read in reads:
                 read.assignRead(sTable)  # barcode
                 if read.goodRead is True:
                     self.run_out[read.project].addRead(read.getFastq())
                     identified_count += 1
                 else:
                     unidentified_count += 1
                     if output_unidentified:
                         self.run_out["Unidentified"].addRead(
                             read.getFastq())
             # Write out reads
             for key in self.run_out:
                 self.run_out[key].writeReads()
             if self.verbose:
                 processed = self.run.count()
                 elapsed = time.time() - lasttime
                 rate = round(processed / elapsed, 0) if elapsed > 0 else 0.0
                 # The percentage is the identified share, so it is printed
                 # next to the identified count (it used to follow the
                 # unidentified count).
                 sys.stderr.write(
                     "processed %s total reads, %s Reads/second, %s identified reads(%s%%), %s unidentified reads\n"
                     % (processed, rate, identified_count,
                        percent(identified_count, processed),
                        unidentified_count))
         total = self.run.count()
         if self.verbose:
             sys.stdout.write(
                 "%s reads processed in %s minutes, %s (%s%%) identified\n\n"
                 % (total, round((time.time() - lasttime) / (60), 2),
                    identified_count, percent(identified_count, total, 1)))
         # per project totals are reported regardless of `verbose`
         for key in self.run_out:
             project_reads = self.run_out[key].count()
             sys.stdout.write(
                 "%s (%s%%)\treads found for project\t%s\n" %
                 (project_reads, percent(project_reads, total, 1), key))
         self.clean()
         return 0
     except (KeyboardInterrupt, SystemExit):
         self.clean()
         sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
         return 1
     except Exception:
         # Only ordinary errors are reported as fatal; other BaseException
         # subclasses are left to propagate.
         self.clean()
         if debug:
             sys.stderr.write("".join(
                 traceback.format_exception(*sys.exc_info())))
         else:
             sys.stderr.write(
                 "A fatal error was encountered. trying turning on debug\n")
         return 1
예제 #7
0
    def start(self,
              fastq_file1, fastq_file2, fastq_file3, fastq_file4, output_prefix,
              bcFile, primerFile=None, samplesFile=None,
              barcodeMaxDiff=1, I1rc=True, I2rc=False, dedup_float=4,
              primerMaxDiff=4, primerEndMatch=4, flip=False, batchsize=10000,
              uncompressed=False, output_unidentified=False, minQ=None,
              minL=0, verbose=True, debug=False, kprimer=False, test=False):
        """
        Preprocess a double barcoded Illumina sequencing run.

        Every read gets a barcode assigned and, when the matching input file
        was supplied, a primer and a sample.  Reads still flagged as good
        reads afterwards are written per project (with a samples file) or to
        a single "Identified" output; a barcode by primer count table is
        written next to them.

        Args:
            fastq_file1..fastq_file4: inputs handed to FourReadIlluminaRun.
            output_prefix: directory (with a samples file) or file name
                prefix (without one) of everything that is written.
            bcFile: input handed to barcodeTable.
            primerFile: optional input handed to primerTable; primers are
                only evaluated when it is given.
            samplesFile: optional input handed to sampleTable; samples are
                only evaluated when it is given.
            barcodeMaxDiff: passed to read.assignBarcode.
            I1rc, I2rc: passed to barcodeTable.
            dedup_float, primerMaxDiff, primerEndMatch, flip: passed to
                read.assignPrimer.
            batchsize: number of reads requested from the run per batch.
            uncompressed: passed through to IlluminaTwoReadOutput.
            output_unidentified: also write the reads that are not good reads.
            minQ, minL: passed to read.trimRead; trimming is skipped when
                minQ is None.
            verbose: emit progress and per project messages.
            debug: write the traceback when an unexpected error occurs.
            kprimer: passed to read.getFastq for identified reads.
            test: stop after the first batch, to test the inputs.

        Returns:
            0 on success, 1 on failed validation, interrupt or any other
            error.
        """
        print('---')
        print('Running preprocessApp')
        print('')
        self.verbose = verbose
        evalPrimer = primerFile is not None
        evalSample = samplesFile is not None
        # Handle of the barcode count table.  It is kept apart from the
        # `bcFile` argument and closed in the `finally` clause, so the table
        # is flushed on every exit path.
        bc_out = None

        def percent(part, total):
            # Share of `part` in `total` as a percentage with one decimal.
            # An empty run (total == 0) reports 0.0 instead of raising
            # ZeroDivisionError.
            if not total:
                return 0.0
            return round((float(part) / float(total)) * 100, 1)

        def count_row(label, counts, columns):
            # One tab separated table line: the label followed by the counts
            # of `columns`, in that order.
            return '\t'.join([label] + [str(counts[col]) for col in columns])

        try:
            v = validateApp()
            # read in barcode sequences
            bcTable = barcodeTable(bcFile, I1rc, I2rc)
            if self.verbose:
                sys.stdout.write("barcode table length: %s\n" % bcTable.getLength())
            # read in primer sequences if present
            if evalPrimer:
                prTable = primerTable(primerFile)
                if verbose:
                    sys.stdout.write("primer table length P5 Primer Sequences:%s, P7 Primer Sequences:%s\n" % (len(prTable.getP5sequences()), len(prTable.getP7sequences())))
                if v.validatePrimer(prTable, debug) != 0:
                    sys.stderr.write("Failed validation\n")
                    self.clean()
                    return 1
            else:
                prTable = None
            # read in the sample table if present
            if evalSample:
                sTable = sampleTable(samplesFile)
                if verbose:
                    sys.stdout.write("sample table length: %s, and %s projects.\n" % (sTable.getSampleNumber(), len(sTable.getProjectList())))
                if v.validateSample(bcTable, prTable, sTable, debug) != 0:
                    sys.stderr.write("Failed validation\n")
                    self.clean()
                    return 1

            # output table; the name is built before the guarded open so the
            # error message below can always refer to it
            if evalSample:
                bctable_name = os.path.join(output_prefix, 'Identified_Barcodes.txt')
            else:
                bctable_name = output_prefix + '_Identified_Barcodes.txt'
            try:
                misc.make_sure_path_exists(os.path.dirname(bctable_name))
                bc_out = open(bctable_name, 'w')
            except Exception:
                sys.stderr.write("ERROR: Can't open file %s for writing\n" % bctable_name)
                raise
            # barcode (or '-') -> {primer (or '-') | "Total": read count}
            barcode_counts = {}
            bcsuccesscount = 0
            prsuccesscount = 0
            sampsuccesscount = 0
            trimsuccesscount = 0
            identified_count = 0
            # setup output files
            self.run_out = {}
            if evalSample:
                for project in sTable.getProjectList():
                    self.run_out[project] = IlluminaTwoReadOutput(os.path.join(output_prefix, project), uncompressed)
            else:
                self.run_out["Identified"] = IlluminaTwoReadOutput(output_prefix, uncompressed)
            if output_unidentified:
                if evalSample:
                    self.run_out["Unidentified"] = IlluminaTwoReadOutput(os.path.join(output_prefix, 'UnidentifiedProject'), uncompressed)
                else:
                    self.run_out["Unidentified"] = IlluminaTwoReadOutput(output_prefix + "_Unidentified", uncompressed)
            # establish and open the Illumina run
            self.run = FourReadIlluminaRun(fastq_file1, fastq_file2, fastq_file3, fastq_file4)
            self.run.open()
            totaltime = time.time()
            while True:
                lasttime = time.time()
                # get next batch of reads; an empty batch ends the run
                reads = self.run.next(batchsize)
                if len(reads) == 0:
                    break
                # process individual reads
                for read in reads:
                    bcsuccesscount += read.assignBarcode(bcTable, barcodeMaxDiff)  # barcode
                    if evalPrimer:  # primer
                        prsuccesscount += read.assignPrimer(prTable, dedup_float, primerMaxDiff, primerEndMatch, flip)
                    if evalSample:  # sample
                        sampsuccesscount += read.assignRead(sTable)  # barcode + primer
                    if minQ is not None and read.goodRead:
                        trimsuccesscount += read.trimRead(minQ, minL)
                    if read.goodRead is True:
                        identified_count += 1
                        if evalSample:
                            self.run_out[read.getProject()].addRead(read.getFastq(kprimer))
                        else:
                            self.run_out["Identified"].addRead(read.getFastq(kprimer))
                    elif output_unidentified:
                        self.run_out["Unidentified"].addRead(read.getFastq(True))
                    ###############################################
                    # Record data for final barcode table; a missing barcode
                    # or primer is tallied under '-'
                    bc_key = read.getBarcode()
                    if bc_key is None:
                        bc_key = '-'
                    if evalPrimer:
                        column = read.getPrimer()
                        if column is None:
                            column = '-'
                    else:
                        column = "Total"
                    counts = barcode_counts.get(bc_key)
                    if counts is None:
                        # setup blank primer count table for the new barcode
                        if evalPrimer:
                            counts = dict.fromkeys(prTable.getPrimers(), 0)
                            counts['-'] = 0
                        else:
                            counts = {}
                        # the first read seen for a barcode always gets its
                        # column
                        counts.setdefault(column, 0)
                        barcode_counts[bc_key] = counts
                    counts[column] += 1

                # Write out reads
                for key in self.run_out:
                    self.run_out[key].writeReads()
                if self.verbose:
                    processed = self.run.count()
                    elapsed = time.time() - lasttime
                    # rate of this batch, based on the reads actually
                    # returned (the final batch may be shorter than
                    # batchsize)
                    rate = round(len(reads) / elapsed, 0) if elapsed > 0 else 0.0
                    sys.stderr.write("processed %s total reads, %s Reads/second, %s identified reads(%s%%), %s unidentified reads\n" %
                                     (processed,
                                      rate,
                                      identified_count,
                                      percent(identified_count, processed),
                                      processed - identified_count))
                if test:  # exit after the first batch to test the inputs
                    break
            total = self.run.count()
            if self.verbose:
                sys.stdout.write("%s reads processed in %s minutes, %s (%s%%) identified\n\n" %
                                 (total,
                                  round((time.time() - totaltime) / (60), 2),
                                  identified_count,
                                  percent(identified_count, total)))
            # Write out barcode and primer table
            if identified_count > 0:
                # write out header line
                if evalPrimer:
                    columns = list(prTable.getPrimers()) + ['-']
                    txt = 'Barcode\t' + '\t'.join(prTable.getPrimers()) + '\tNone' + '\n'
                else:
                    columns = ["Total"]
                    txt = 'Barcode\tTotal\n'
                bc_out.write(txt)
                # rows follow the barcode table order; unseen barcodes are
                # left out
                for bc in bcTable.getBarcodes():
                    if bc not in barcode_counts:
                        continue
                    bc_out.write(count_row(str(bc), barcode_counts[bc], columns) + '\n')
                # reads without a barcode go last, labelled 'None'
                if '-' in barcode_counts:
                    bc_out.write(count_row('None', barcode_counts['-'], columns) + '\n')

            # write out project table
            sys.stdout.write("%s reads (%s%% of total run) successfully identified barcode\n" %
                             (bcsuccesscount, percent(bcsuccesscount, total)))
            if evalPrimer:  # primer
                sys.stdout.write("%s reads (%s%% of total run) successfully identified barcode and primer\n" %
                                 (prsuccesscount, percent(prsuccesscount, total)))
            if evalSample:  # sample
                sys.stdout.write("%s reads (%s%% of total run) successfully assigned to sample\n" %
                                 (sampsuccesscount, percent(sampsuccesscount, total)))
            if minQ is not None:
                sys.stdout.write("%s reads (%s%% of total run) successfully pass trimming criteria\n" %
                                 (trimsuccesscount, percent(trimsuccesscount, total)))

            sys.stdout.write("%s reads (%s%% of total run) unidentified\n\n" %
                             (total - identified_count,
                              percent(total - identified_count, total)))

            if evalSample and self.verbose:
                for key in self.run_out:
                    project_reads = self.run_out[key].count()
                    sys.stdout.write("%s reads (%s%% of total run) found for project\t%s\n" %
                                     (project_reads, percent(project_reads, total), key))
            self.clean()
            return 0
        except (KeyboardInterrupt, SystemExit):
            self.clean()
            sys.stderr.write("%s unexpectedly terminated\n" % (__name__))
            return 1
        except Exception:
            self.clean()
            if debug:
                sys.stderr.write("".join(traceback.format_exception(*sys.exc_info())))
            else:
                sys.stderr.write("A fatal error was encountered. trying turning on debug\n")
            return 1
        finally:
            # runs after every return above as well
            if bc_out is not None:
                bc_out.close()