Example #1
 def start(self):
     if not ('assembler' in self.params):
         raise exceptions.FatalError("assembler not defined in params")
     if self.params['map_against_reads'] and self.params['iteration'] == 1:
         self.RunMapAgainstReads()
     elif self.params['assembler'] == 'newbler':
         self.RunNewbler()
     elif self.params['assembler'] == 'spades':
         self.RunSpades()
     else:
         raise exceptions.FatalError("Assembler %s isn't recognized." %
                                     self.params['assembler'])
Example #2
 def check_bins(self, bins):
     for bin in bins:
         try:
             subprocess.check_output(['which', bin])
         except CalledProcessError:
             raise exceptions.FatalError(
                 "Cannot find %s in path, or the 'which' "
                 "command is missing" % (bin))
Example #3
File: mapper.py Project: kdm9/ARC
 def SAM_to_dict(self, filename):
     """ Read a SAM file to a mapping dict and return it """
     #Check for necessary files:
     if os.path.exists(filename) is False:
         raise exceptions.FatalError("Missing SAM file")
     try:
         inf = open(filename, 'r')
     except Exception as exc:
         txt = "Failed to open SAM file %s" % filename
         txt += '\n\t' + str(exc)
         raise exceptions.FatalError(txt)
     read_map = {}  # target:{read} dictionary of dictionaries
     i = 0
     discards = 0
     startT = time.time()
     for l in inf:
         i += 1
         if l[0] != "@":  # skip header lines
             l2 = l.strip().split()
             if l2[2] == "*":  # skip unmapped
                 continue
             readid = keyfunction(self.params['sra'])(
                 l2[0])  # .split("/")[0]
             target = l2[2]
             # handle references built using assembled contigs:
             if len(target.split("_:_")) == 3:
                 target, status = target.split("_:_")[1:]
                 # This keeps ARC from writing reads which mapped to finished contigs
                 if status.startswith("Contig") or status.startswith(
                         "isogroup"):
                     discards += 1
                     continue
             if target not in read_map:
                 read_map[target] = {}
             read_map[target][readid] = 1
     inf.close()
     # Report total time:
     logger.info("Sample: %s, Processed %s lines from SAM in %s seconds." %
                 (self.params['sample'], i, time.time() - startT))
     if discards > 0:
         logger.info(
             "%s out of %s reads mapped to finished contigs and were not recruited for assembly."
             % (discards, i))
     return read_map
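
For orientation, a minimal sketch of the dictionary shape SAM_to_dict returns; the target and read names below are invented for illustration and are not taken from ARC output:

    # read_map is keyed by target name; each value is a dict whose keys are the
    # IDs of the reads that mapped to that target (the value 1 is just a flag).
    read_map = {
        'targetA': {'read_0001': 1, 'read_0002': 1},
        'targetB': {'read_0003': 1},
    }
    # splitreads() (Example #13) later iterates read_map[target] to pull the
    # matching records out of the PE1/PE2/SE SeqIO indexes.
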
Example #4
File: mapper.py Project: kdm9/ARC
 def start(self):
     if not ('mapper' in self.params):
         raise exceptions.FatalError("mapper not defined in params")
     if self.params['mapper'] == 'bowtie2':
         logger.info("Sample: %s Running bowtie2." % self.params['sample'])
         self.run_bowtie2()
     elif self.params['mapper'] == 'blat':
         logger.info("Sample: %s Running blat." % self.params['sample'])
         self.run_blat()
     #Mapping is done, run splitreads:
     logger.info("Sample: %s Running splitreads." % self.params['sample'])
     self.splitreads()
Example #5
    def check(self):
        # Check that the reference file exists
        if 'reference' in self.config:
            self.config['reference'] = os.path.realpath(
                self.config['reference'])
            if not os.path.exists(self.config['reference']):
                raise exceptions.FatalError("Error, cannot find reference %s" %
                                            (self.config['reference']))
        else:
            raise exceptions.FatalError('Error, reference not included in %s' %
                                        self.filename)

        # Check to see if the samples are valid
        if len(self.config['Samples']) > 0:
            for sample in self.config['Samples']:
                pe_one = 'PE1' in self.config['Samples'][sample]
                pe_two = 'PE2' in self.config['Samples'][sample]
                pe = pe_one and pe_two
                se = 'SE' in self.config['Samples'][sample]

                if not (pe or se):
                    raise exceptions.FatalError(
                        "Error you must specify PE files and/or a SE file for "
                        "each sample.")
        else:
            raise exceptions.FatalError("Could not find samples in %s" %
                                        self.filename)

        if self.config['format'] not in self.FORMATS:
            raise exceptions.FatalError(
                "Error, file format not specificed in ARC_self.txt.")

        if self.config['mapper'] not in self.MAPPERS:
            raise exceptions.FatalError("Error mapper must be either %s" %
                                        (', '.join(self.MAPPERS.keys())))
        else:
            self.check_bins(self.MAPPERS[self.config['mapper']])

        if self.config['assembler'] not in self.ASSEMBLERS:
            raise exceptions.FatalError("Error assembler must be either: %s" %
                                        (', '.join(self.ASSEMBLERS.keys())))
        else:
            self.check_bins(self.ASSEMBLERS[self.config['assembler']])

        if self.config['subsample'] <= 0 or self.config['subsample'] > 1:
            raise exceptions.FatalError(
                "Error, you must specify a value greater than 0 and less than or equal to 1 for subsample"
            )
Example #6
 def set_defaults(self):
     for key, value in self.OPTIONS.iteritems():
         if key not in self.config:
             if value is None:
                 raise exceptions.FatalError(
                     "Error, %s required but not specificed in "
                     "ARC_self.config.txt" % key)
             else:
                 logger.info(
                     "%s not specified in ARC_config.txt, defaulting to "
                     "%s" % (key, value))
                 self.config[key] = value
     # Anything listed below here is not expected to be in the config but
     # needs to be initialized
     self.config['iteration'] = 0
Example #7
    def __init__(self, filename):
        if os.path.exists(filename) is False:
            raise exceptions.FatalError(
                "Error, you must run ARC in a folder containing "
                "ARC_config.txt")
        self.filename = filename

        # Initialize config
        self.config = {}

        # Read config file, set the defaults, and check
        self.read()
        self.set_defaults()
        self.check()
        self.convert()
Example #8
    def read(self):
        infile = open(self.filename, 'r')

        # Read in comments and globals.  Treats '##' as comments and '#' as
        # global variables.
        while True:
            line = infile.readline()
            if not line:
                break

            line = line.strip()
            # Blank line
            if line == "":
                continue

            arr = line.split()

            if arr[0] == "#":
                cfg = arr[1].split('=')
                if len(cfg) != 2 or cfg[1] == "":
                    raise exceptions.FatalError(
                        "Error, parameters not specified correctly, please "
                        "use # name=value. Offending entry: \n\t%s" % arr[1])
                # Convert values that look numeric or boolean to their proper types
                key = cfg[0].strip()
                value = cfg[1].strip()
                if re.match(r"^[0-9]*\.[0-9]+$", value):
                    self.config[key] = float(value)
                elif re.match(r"^[0-9]+$", value):
                    self.config[key] = int(value)
                elif value in ('True', 'true'):
                    self.config[key] = True
                elif value in ('False', 'false'):
                    self.config[key] = False
                else:
                    self.config[key] = value

            elif arr[0] == "##":
                pass
            else:
                # We just sucked in the header for the samples
                break

        # Now get the sample information
        self.config['Samples'] = {}
        while True:
            line = infile.readline()
            if not line:
                break

            line = line.strip()
            # Blank line
            if line == "" or line[0] == '#':
                continue

            arr = line.split()
            if len(arr) != 3:
                raise exceptions.FatalError(
                    "Error, sample description entry is not properly "
                    "formatted! Offending entry: %s" % line)

            sample_id = arr[0].strip()
            filename = arr[1].strip()
            filetype = arr[2].strip()

            if sample_id not in self.config['Samples']:
                self.config['Samples'][sample_id] = {}

            if filetype in self.config['Samples'][sample_id]:
                raise exceptions.FatalError(
                    "Error same FileType specified more than once "
                    "for sample_id %s." % sample_id)
            if not os.path.exists(filename):
                raise exceptions.FatalError(
                    "%s file indicated but not found: %s" %
                    (filetype, filename))
            else:
                self.config['Samples'][sample_id][filetype] = os.path.realpath(
                    filename)
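
As a hedged sketch, an ARC_config.txt this parser would accept looks like the following: '#' lines set name=value globals, '##' lines are comments, the first other line is consumed as the sample-table header, and every following row must be a whitespace-separated Sample_ID / FileName / FileType triple. All option values, sample IDs and paths below are invented for illustration:

    ## Illustrative ARC_config.txt -- values are examples only
    # reference=reference.fasta
    # numcycles=3
    # format=fastq
    # mapper=bowtie2
    # assembler=spades
    Sample_ID FileName FileType
    sample1 sample1_R1.fastq PE1
    sample1 sample1_R2.fastq PE2
    sample1 sample1_SE.fastq SE

Note that read() also checks that each FileName exists on disk, so in a real run these paths must point at real files.
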
Example #9
    def RunNewbler(self):
        """
        Run a Newbler assembly.

        Expects params keys:
            assembly_PE1 and assembly_PE2, and/or assembly_SE
            target_dir
            urt
        """
        #Check for necessary params:
        if not (
            ('assembly_PE1' in self.params and 'assembly_PE2' in self.params)
                or 'assembly_SE' in self.params):
            raise exceptions.FatalError('Missing self.params in RunNewbler.')

        #Check for necessary files:
        if 'assembly_PE1' in self.params and 'assembly_PE2' in self.params and not (
                os.path.exists(self.params['assembly_PE1'])
                and os.path.exists(self.params['assembly_PE2'])):
            raise exceptions.FatalError('Missing PE files in RunNewbler.')

        if 'assembly_SE' in self.params and not (os.path.exists(
                self.params['assembly_SE'])):
            raise exceptions.FatalError('Missing SE file in RunNewbler.')

        sample = self.params['sample']
        target = self.params['target']
        killed = False
        failed = False

        #determine whether to pipe output to a file or /dev/null
        if self.params['verbose']:
            out = open(os.path.join(self.params['target_dir'], "assembly.log"),
                       'w')
        else:
            out = open(os.devnull, 'w')

        #Build args for newAssembly:
        args = ['newAssembly', '-force']
        if self.params['last_assembly'] and self.params['cdna']:
            #only run with cdna switch on the final assembly
            args += ['-cdna']
        args += [os.path.join(self.params['target_dir'], 'assembly')]
        logger.debug("Calling newAssembly for sample: %s target %s" %
                     (sample, target))
        logger.info(" ".join(args))
        ret = subprocess.call(args, stdout=out, stderr=out)
        #Build args for addRun:
        if 'assembly_PE1' in self.params and 'assembly_PE2' in self.params:
            args = [
                'addRun',
                os.path.join(self.params['target_dir'], 'assembly')
            ]
            args += [self.params['assembly_PE1']]
            logger.debug("Calling addRun for sample: %s target %s" %
                         (sample, target))
            logger.debug(" ".join(args))
            ret = subprocess.call(args, stdout=out, stderr=out)

            args = [
                'addRun',
                os.path.join(self.params['target_dir'], 'assembly')
            ]
            args += [self.params['assembly_PE2']]
            logger.debug("Calling addRun for sample: %s target %s" %
                         (sample, target))
            logger.debug(" ".join(args))
            ret = subprocess.call(args, stdout=out, stderr=out)
        if 'assembly_SE' in self.params:
            args = [
                'addRun',
                os.path.join(self.params['target_dir'], 'assembly')
            ]
            args += [self.params['assembly_SE']]
            logger.debug("Calling addRun for sample: %s target %s" %
                         (sample, target))
            logger.debug(" ".join(args))
            ret = subprocess.call(args, stdout=out, stderr=out)

        #Build args for runProject
        args = ['runProject']
        args += ['-cpu', '1']
        if self.params['last_assembly'] and self.params['cdna']:
            args += ['-noace']
        else:
            args += ['-nobig']
        if self.params['urt'] and not self.params['last_assembly']:
            #only run with the -urt switch when it isn't the final assembly
            args += ['-urt']
        if self.params['rip']:
            args += ['-rip']
        args += [os.path.join(self.params['target_dir'], 'assembly')]
        try:
            start = time.time()
            logger.debug("Calling runProject for sample: %s target %s" %
                         (sample, target))
            logger.debug(" ".join(args))
            ret = subprocess.Popen(args, stdout=out, stderr=out)
            pid = ret.pid
            while ret.poll() is None:
                if time.time() - start > self.params['assemblytimeout']:
                    self.kill_process_children(pid)
                    logger.warn(
                        "Sample: %s target: %s iteration: %s Killing assembly after %s seconds"
                        % (sample, target, self.params['iteration'],
                           time.time() - start))
                    killed = True
                    break
                time.sleep(.5)
        except Exception as exc:
            txt = "Sample: %s, Target: %s: Unhandled error running Newbler assembly" % (
                self.params['sample'], self.params['target'])
            txt += '\n\t' + str(exc) + "\n" + traceback.format_exc()
            logger.warn(txt)
            failed = True
        finally:
            out.close()

        #Sometimes newbler doesn't seem to exit completely:
        self.kill_process_children(pid)

        #if ret != 0:
        #raise exceptions.RerunnableError("Newbler assembly failed.")

        if not killed and ret.poll() != 0:
            #raise exceptions.RerunnableError("Newbler assembly failed.")
            failed = True

        if failed:
            logger.info(
                "Sample: %s target: %s iteration: %s Assembly failed after %s seconds"
                % (sample, target, self.params['iteration'],
                   time.time() - start))
            outf = open(os.path.join(self.params['target_dir'], "finished"),
                        'w')
            outf.write("assembly_failed\t" + str(time.time() - start))
            outf.close()
        elif killed:
            logger.info(
                "Sample: %s target: %s iteration: %s Assembly killed after %s seconds"
                % (sample, target, self.params['iteration'],
                   time.time() - start))
            outf = open(os.path.join(self.params['target_dir'], "finished"),
                        'w')
            outf.write("assembly_killed\t" + str(time.time() - start))
            outf.close()
        else:
            #Run finished without error
            logger.info(
                "Sample: %s target: %s iteration: %s Assembly finished in %s seconds"
                % (sample, target, self.params['iteration'],
                   time.time() - start))
            outf = open(os.path.join(self.params['target_dir'], "finished"),
                        'w')
            outf.write("assembly_complete\t" + str(time.time() - start))
            outf.close()
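
Taken together, the subprocess calls above amount to roughly the following Newbler command sequence for a paired-end, non-final iteration; the bracketed paths are placeholders, and the -cdna, -noace and -rip switches are added only when the corresponding cdna, last_assembly and rip params are set:

    newAssembly -force <target_dir>/assembly
    addRun <target_dir>/assembly <assembly_PE1>
    addRun <target_dir>/assembly <assembly_PE2>
    runProject -cpu 1 -nobig -urt <target_dir>/assembly
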
Example #10
    def RunSpades(self):
        """
        Several arguments can be passed to spades.py: -1 [PE1], -2 [PE2], -s [SE], and -o [target_dir]
        """
        #Check that required params are available
        if not (('assembly_PE1' in self.params
                 and 'assembly_PE2' in self.params) or
                ('assembly_SE' in self.params)):
            raise exceptions.FatalError('Missing self.params in RunSpades.')

        #Check that the files actually exist
        if 'assembly_PE1' in self.params and 'assembly_PE2' in self.params and not (
                os.path.exists(self.params['assembly_PE1'])
                and os.path.exists(self.params['assembly_PE2'])):
            raise exceptions.FatalError('Missing PE files in RunSpades.')
        if 'assembly_SE' in self.params and not (os.path.exists(
                self.params['assembly_SE'])):
            raise exceptions.FatalError('Missing SE file in RunSpades.')

        sample = self.params['sample']
        target = self.params['target']

        #Build args for assembler call
        args = ['spades.py', '-t', '1']
        if self.params['only-assembler'] and not self.params['last_assembly']:
            args.append("--only-assembler")
        if self.params['format'] == 'fasta':
            args.append(
                '--only-assembler'
            )  # spades errors on read correction if the input isn't fastq
        if 'assembly_PE1' in self.params and 'assembly_PE2' in self.params:
            args += [
                '-1', self.params['assembly_PE1'], '-2',
                self.params['assembly_PE2']
            ]
        if 'assembly_SE' in self.params:
            args += ['-s', self.params['assembly_SE']]
        args += ['-o', os.path.join(self.params['target_dir'], 'assembly')]
        if self.params['verbose']:
            out = open(os.path.join(self.params['target_dir'], "assembly.log"),
                       'w')
        else:
            out = open(os.devnull, 'w')

        logger.debug("Sample: %s target: %s Running spades assembler." %
                     (sample, target))
        logger.info(" ".join(args))
        killed = False
        failed = False
        start = time.time()
        try:
            #ret = subprocess.call(args, stderr=out, stdout=out)
            ret = subprocess.Popen(args, stdout=out, stderr=out)
            pid = ret.pid
            while ret.poll() is None:
                if time.time() - start > self.params['assemblytimeout']:
                    ret.kill()
                    killed = True
                    logger.warn(
                        "Sample: %s target: %s Assembly killed after %s seconds."
                        % (sample, target, time.time() - start))
                    break
                time.sleep(.5)
        except Exception as exc:
            txt = (
                "Sample: %s, Target: %s: Unhandled error running Spades assembly"
                % (sample, target))
            txt += '\n\t' + str(exc)
            logger.warn(txt)
            failed = True
        finally:
            out.close()

        #Ensure that assembler exits cleanly:
        self.kill_process_children(pid)

        if not killed and ret.poll() != 0:
            failed = True
        if failed:
            logger.info(
                "Sample: %s target: %s iteration: %s Assembly failed after %s seconds"
                % (sample, target, self.params['iteration'],
                   time.time() - start))
            outf = open(os.path.join(self.params['target_dir'], "finished"),
                        'w')
            outf.write("assembly_failed")
            outf.close()
        elif killed:
            logger.info(
                "Sample: %s target: %s iteration: %s Assembly killed after %s seconds"
                % (sample, target, self.params['iteration'],
                   time.time() - start))
            outf = open(os.path.join(self.params['target_dir'], "finished"),
                        'w')
            outf.write("assembly_killed")
            outf.close()
        else:
            #Run finished without error
            logger.info(
                "Sample: %s target: %s iteration: %s Assembly finished in %s seconds"
                % (sample, target, self.params['iteration'],
                   time.time() - start))
            outf = open(os.path.join(self.params['target_dir'], "finished"),
                        'w')
            outf.write("assembly_complete")
            outf.close()
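
For a paired-end sample in fastq format, the args list assembled above corresponds roughly to this command line; the bracketed paths are placeholders, and --only-assembler is appended when the 'only-assembler' param is set on a non-final iteration or when the input format is fasta:

    spades.py -t 1 -1 <target_dir>/PE1.fastq -2 <target_dir>/PE2.fastq -o <target_dir>/assembly
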
Example #11
    def writeCDNAresults(self, target, target_folder, outf, contigf):
        """
        This is ONLY called when a cDNA target is finished.

        When doing a cDNA type run, it is very useful to have both the following:
        1) All contigs that belong to a gene (isogroup)
            - It would be particularly good to re-orient these if they are in RC.
        2) Total number of reads assembled in each gene (isogroup)

        Additionally it would be excellent to some day also get the following:
        3) Transcript (isotig) structure
        4) Estimate of isotig specific reads.

        """
        if self.params['assembler'] == 'newbler':
            contigf = os.path.join(self.params['working_dir'], target_folder,
                                   "assembly", "assembly", "454AllContigs.fna")
            isotigsf = os.path.join(self.params['working_dir'], target_folder,
                                    "assembly", "assembly",
                                    "454IsotigsLayout.txt")
            readstatusf = os.path.join(self.params['working_dir'],
                                       target_folder, "assembly", "assembly",
                                       "454ReadStatus.txt")
        else:
            logger.info(
                "WARNING writeCDNAresults called when assembler was not Newbler"
            )
            return None
        if not (os.path.exists(contigf) and os.path.exists(isotigsf)
                and os.path.exists(readstatusf)):
            logger.info("CDNA WARNING MISSING FILE!! %s %s" %
                        (target, self.params['sample']))
            logger.info("%s exists: %s" % (contigf, os.path.exists(contigf)))
            logger.info("%s exists: %s" % (isotigsf, os.path.exists(isotigsf)))
            logger.info("%s exists: %s" % (readstatusf, os.path.exists(readstatusf)))
            return None
        #Storage data structures:
        isogroups = {}  # isogroup -> in-order list of the contigs it contains
        readcounts = Counter()  # isogroup -> number of assembled reads (from ReadStatus)
        contig_orientation = {}
        contig_to_isogroup = {}
        contig_idx = SeqIO.index(contigf, "fasta")
        # Parse isotigsf:
        igroup = ""
        #print self.params['sample'], target, "Parsing isotigsf: %s" % isotigsf
        for l in open(isotigsf, 'r'):
            #Handle lines with only a '\n'
            if l == '\n':
                pass
            #Handle lines for isogroup:
            elif l[0:9] == '>isogroup':
                igroup = l.strip().split()[0].strip(">")
            #Handle lines containing all contigs:
            elif l.strip().split()[0] == 'Contig':
                l2 = l.strip().split()
                contigs = map(lambda x: "contig" + x, l2[2:-1])
                isogroups[igroup] = contigs
                for contig in contigs:
                    if contig not in contig_orientation:
                        contig_orientation[contig] = '+'
                        contig_to_isogroup[contig] = igroup
                    else:
                        raise exceptions.FatalError(
                            'Contig %s in %s more than once' %
                            (contig, contigf))
            #Handle lines containing contig orientation info:
            elif l[0:6] == 'isotig':
                l2 = l[l.find(" ") + 1:l.rfind(" ") - 1]
                l3 = [l2[i:i + 6] for i in range(0, len(l2), 6)]
                for i in range(len(l3)):
                    if l3[i][0] == '<':
                        # contig is in reverse orientation
                        contig = isogroups[igroup][i]
                        contig_orientation[contig] = '-'
        #print self.params['sample'], target, "Parsed isotigsf, contigs:", len(isogroups), "contig_to_isogroup", len(contig_to_isogroup), "contig_orientation", len(contig_orientation)
        #Now parse readstatus:
        inf = open(readstatusf, 'r')
        inf.readline()  # discard first line
        for l in inf:
            l2 = l.strip().split('\t')
            #Determine if this read was assembled
            if len(l2) == 8:
                contig = l2[2]
                # Note that there are some built in limits to the number of contigs that can be in an isogroup:
                # http://contig.wordpress.com/2010/08/31/running-newbler-de-novo-transcriptome-assembly-i/
                # These won't appear in the IsotigsLayout.txt, but ARE in the ReadStatus.txt file.
                if contig in contig_to_isogroup:
                    readcounts[contig_to_isogroup[contig]] += 1
                else:
                    readcounts['ExceedsThreshold'] += 1
        #print self.params['sample'], target, "Parse read status"

        #Finally, output all of this information appropriately:
        countsf = open(
            os.path.join(self.params['finished_dir'],
                         "isogroup_read_counts.tsv"), 'a')
        sample = self.params['sample']
        #First write out readcounts: sample \t target \t isogroup \t readcount
        for isogroup in readcounts:
            countsf.write('\t'.join(
                [sample, target, isogroup,
                 str(readcounts[isogroup])]) + '\n')
        countsf.close()
        #print self.params['sample'], target, "Wrote readcounts"

        #Next write the contigs in proper order and orientation:
        ncontigs = 0
        nisogroups = 0
        for isogroup in isogroups:
            nisogroups += 1
            for contig in isogroups[isogroup]:
                ncontigs += 1
                seqrec = contig_idx[contig]
                #print self.params['sample'], target, seqrec
                if contig_orientation[contig] == '-':
                    seqrec.seq = seqrec.seq.reverse_complement()
                #print self.params['sample'], target, seqrec
                seqrec.name = seqrec.id = sample + "_:_" + target + "_:_" + isogroup + "|" + contig
                #print self.params['sample'], target, seqrec
                SeqIO.write(seqrec, outf, "fasta")
        ## TODO: add support for the ExceedsThreshold contigs
        logger.info(
            "Sample: %s target: %s iteration: %s Finished writing %s contigs, %s isogroups "
            % (self.params['sample'], target, self.params['iteration'],
               ncontigs, nisogroups))
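
The records written at the end are renamed with the same sample_:_target_:_status convention that SAM_to_dict (Example #3) splits on, with the isogroup and contig joined by '|' in the status field; a made-up example of one resulting FASTA header:

    >sample1_:_targetA_:_isogroup00001|contig00002
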
Example #12
File: mapper.py Project: kdm9/ARC
    def run_bowtie2(self):
        """
        Builds idx and runs bowtie2 -I 0 -X 1500 --local
        Expects params:
            sample, target, reference, working_dir, PE1 and PE2 and/or SE
        """
        #Check for necessary params:
        if not ('sample' in self.params and 'reference' in self.params
                and 'working_dir' in self.params and
                (('PE1' in self.params and 'PE2' in self.params)
                 or 'SE' in self.params)):
            raise exceptions.FatalError('Missing params in run_bowtie2.')
        #Check for necessary files:
        if os.path.exists(self.params['reference']) is False:
            raise exceptions.FatalError("Missing reference file for mapping")
        if 'PE1' in self.params and 'PE2' in self.params:
            if not (os.path.exists(self.params['PE1'])
                    and os.path.exists(self.params['PE2'])):
                raise exceptions.FatalError(
                    "One or both PE files can not be found for mapping.")
        if 'SE' in self.params:
            if not os.path.exists(self.params['SE']):
                raise exceptions.FatalError("SE file cannot be found.")

        #Make idx directory
        try:
            working_dir = self.params['working_dir']
            idx_dir = os.path.realpath(os.path.join(working_dir, 'idx'))
            os.mkdir(idx_dir)
        except Exception as exc:
            txt = "Sample: %s Error creating working directory." % (
                self.params['sample']) + '\n\t' + str(exc)
            raise exceptions.FatalError(txt)

        #Check whether to log to temporary file, or default to os.devnull
        if 'verbose' in self.params:
            out = open(os.path.join(working_dir, "mapping_log.txt"), 'w')
        else:
            out = open(os.devnull, 'w')

        #Set up a path to the index
        base = os.path.join(idx_dir, 'idx')

        #Build index
        #The idea is to map against both the finished contigs and the in-progress
        # contigs, so that the -k parameter (or best-map behavior) is respected
        # and reads which mapped to a now-finished target are not later mapped to
        # an in-progress target.
        fin_outf = os.path.join(self.params['finished_dir'], 'contigs.fasta')
        args = ['bowtie2-build', '-f']
        if os.path.exists(fin_outf) and os.path.getsize(fin_outf) > 0:
            args.append(','.join((fin_outf, self.params['reference'])))
        else:
            args.append(self.params['reference'])
        args.append(base)
        logger.info("Sample: %s Calling bowtie2-build." %
                    self.params['sample'])
        logger.info(" ".join(args))
        try:
            ret = subprocess.call(args, stdout=out, stderr=out)
        except Exception as exc:
            txt = ("Sample %s: Unhandeled error running bowtie2-build" %
                   self.params['sample']) + '\n\t' + str(exc)
            # make sure that out is closed before throwing exception
            out.close()
            raise exceptions.FatalError(txt)

        if ret != 0:
            out.close()
            raise exceptions.FatalError(
                "Sample: %s Error creating bowtie2 index, check log file." %
                self.params['sample'])

        #Do bowtie2 mapping:
        n_bowtieprocs = int(
            round(
                max(
                    float(self.params['nprocs']) / len(self.params['Samples']),
                    1)))
        args = ['bowtie2', '-I', '0', '-X', '1500', '--no-unal']

        #Tune the sensitivity so that on the first iteration the mapper is
        # very sensitive. On later iterations the mapper is very specific.
        if self.params['iteration'] == 0 and self.params['sloppymapping']:
            args.append("--very-sensitive-local")
        else:
            args += [
                "--very-fast-local", "--mp", "12", "--rdg", "12,6", "--rfg",
                "12,6"
            ]

        args += ['-p', str(n_bowtieprocs), '-x', base]
        if self.params['bowtie2_k'] > 1:
            args += ['-k', str(self.params['bowtie2_k'])]
        if self.params['format'] == 'fasta':
            args += ['-f']
        if 'PE1' in self.params and 'PE2' in self.params:
            args += ['-1', self.params['PE1'], '-2', self.params['PE2']]
        if 'SE' in self.params:
            args += ['-U', self.params['SE']]
        args += ['-S', os.path.join(working_dir, 'mapping.sam')]
        logger.info("Sample: %s Calling bowtie2 mapper" %
                    self.params['sample'])
        logger.info(" ".join(args))

        try:
            ret = subprocess.call(args, stdout=out, stderr=out)
        except Exception as exc:
            txt = ("Sample %s: Unhandled error running bowtie2 mapping" %
                   self.params['sample']) + '\n\t' + str(exc)
            raise exceptions.FatalError(txt)
        finally:
            out.close()

        if ret != 0:
            raise exceptions.FatalError(
                "Sample %s: Bowtie2 mapping returned an error, check log file."
                % self.params['sample'])

        #Extract the SAM to a dict
        self.params['mapping_dict'] = self.SAM_to_dict(
            os.path.join(working_dir, 'mapping.sam'))
        #clean up intermediary files:
        os.remove(os.path.join(working_dir, 'mapping.sam'))
        os.system("rm -rf %s" % idx_dir)
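
On the first iteration with sloppymapping enabled, the mapper invocation built above looks roughly like this; <N>, <base> and the read paths are placeholders:

    bowtie2 -I 0 -X 1500 --no-unal --very-sensitive-local -p <N> -x <base> \
        -1 <PE1> -2 <PE2> -U <SE> -S <working_dir>/mapping.sam

On later iterations --very-fast-local plus the stricter --mp/--rdg/--rfg penalties are substituted, and -k and -f are appended only when bowtie2_k > 1 or the input format is fasta.
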
Example #13
File: mapper.py Project: kdm9/ARC
    def splitreads(self):
        """ Split reads and then kick off assemblies once the reads are split for a target, use safe_targets for names"""
        self.params['iteration'] += 1

        # Write out statistics for any/all targets which failed to recruit reads:
        for target in self.params['summary_stats'].keys():
            # print "Target", target
            if target not in self.params['mapping_dict']:
                writeTargetStats(finished_dir=self.params['finished_dir'],
                                 sample=self.params['sample'],
                                 target=target,
                                 targetLength=self.params['summary_stats']
                                 [target]['targetLength'],
                                 status='NoReads',
                                 iteration=self.params['iteration'],
                                 readcount=0,
                                 num_contigs=0,
                                 contig_length=0)
                del self.params['summary_stats'][target]

        checker_params = {}
        for k in self.params:
            checker_params[k] = self.params[k]
        del checker_params['mapping_dict']
        checker_params['targets'] = {}
        iteration = self.params['iteration']
        # open previously created indexes:
        if 'PE1' in self.params and 'PE2' in self.params:
            idx_PE1 = SeqIO.index_db(
                os.path.join(self.params['working_dir'], "PE1.idx"),
                key_function=keyfunction(self.params['sra']))
            idx_PE2 = SeqIO.index_db(
                os.path.join(self.params['working_dir'], "PE2.idx"),
                key_function=keyfunction(self.params['sra']))
        if 'SE' in self.params:
            idx_SE = SeqIO.index_db(
                os.path.join(self.params['working_dir'], "SE.idx"),
                key_function=keyfunction(self.params['sra']))
        if 'readcounts' not in checker_params:
            checker_params['readcounts'] = {}
        # if 'contigcounts' not in checker_params:
        #    checker_params['contigcounts'] = {}
        statsf = open(
            os.path.join(self.params['finished_dir'], 'mapping_stats.tsv'),
            'a')
        for target in self.params['mapping_dict']:
            startT = time.time()
            # logger.info("Running splitreads for Sample: %s target: %s" % (self.params['sample'], target))
            target_dir = os.path.join(self.params['working_dir'],
                                      self.params['safe_targets'][target])
            if target not in checker_params['readcounts']:
                checker_params['readcounts'][target] = Counter()
            # if target not in checker_params['contigcounts']:
            #    checker_params['contigcounts'] = Counter()
            if os.path.exists(target_dir):
                os.system("rm -rf %s" % target_dir)
            os.mkdir(target_dir)

            reads = self.params['mapping_dict'][target]
            # track how many total reads were added for this cycle
            checker_params['readcounts'][target][iteration] = len(reads)
            statsf.write('\t'.join([
                self.params['sample'], target,
                str(iteration),
                str(len(reads))
            ]) + '\n')

            SEs = PEs = 0
            if 'PE1' in self.params and 'PE2' in self.params:
                outf_PE1 = open(
                    os.path.join(target_dir, "PE1." + self.params['format']),
                    'w')
                outf_PE2 = open(
                    os.path.join(target_dir, "PE2." + self.params['format']),
                    'w')
            if 'SE' in self.params:
                outf_SE = open(
                    os.path.join(target_dir, "SE." + self.params['format']),
                    'w')

            for readID in reads:
                if self.params['subsample'] < 1 and randint(
                        0, 100) > self.params['subsample'] * 100:
                    continue
                if 'PE1' in self.params and readID in idx_PE1:
                    # read1 = idx_PE1[readID]
                    # read2 = idx_PE2[readID]
                    read1 = idx_PE1.get(readID, None)
                    read2 = idx_PE2.get(readID, None)
                    if read2 is None:
                        raise exceptions.FatalError(
                            "ERROR: ReadID %s was found in PE1 file but not PE2"
                            % readID)
                    new_readID = readID.replace(":", "_") + ":0:0:0:0#0/"
                    read1.id = read1.name = new_readID + "1"
                    read2.id = read2.name = new_readID + "2"
                    SeqIO.write(read1, outf_PE1, self.params['format'])
                    SeqIO.write(read2, outf_PE2, self.params['format'])
                    PEs += 1
                elif 'SE' in self.params and readID in idx_SE:
                    read1 = idx_SE[readID]
                    read1.id = read1.name = readID.replace(":",
                                                           "_") + ":0:0:0:0#0/"
                    SeqIO.write(read1, outf_SE, self.params['format'])
                    SEs += 1
            if 'PE1' in self.params and 'PE2' in self.params:
                outf_PE1.close()
                outf_PE2.close()
            if 'SE' in self.params:
                outf_SE.close()

            #Build assembly job:
            assembly_params = {}
            assembly_params['target'] = target
            assembly_params['target_dir'] = target_dir
            assembly_params['iteration'] = iteration
            assembly_params['last_assembly'] = False
            assembler_keys = [
                'assembler', 'sample', 'verbose', 'format', 'assemblytimeout',
                'map_against_reads', 'urt', 'numcycles', 'cdna', 'rip',
                'only-assembler'
            ]
            for k in assembler_keys:
                assembly_params[k] = self.params[k]
            cur_reads = checker_params['readcounts'][target][
                iteration]  # note that this is a counter, so no key errors can occur
            previous_reads = checker_params['readcounts'][target][iteration -
                                                                  1]

            #Turn off URT in situations where this will be the last iteration due to readcounts:

            if ((cur_reads <= previous_reads and iteration > 2)
                    or iteration >= self.params['numcycles']):
                logger.info(
                    "Sample: %s target: %s iteration: %s Setting last_assembly to True"
                    %
                    (self.params['sample'], target, self.params['iteration']))
                assembly_params['last_assembly'] = True

            #properly handle the case where no reads ended up mapping for the PE or SE inputs:
            if PEs > 0:
                assembly_params['assembly_PE1'] = os.path.join(
                    target_dir, "PE1." + self.params['format'])
                assembly_params['assembly_PE2'] = os.path.join(
                    target_dir, "PE2." + self.params['format'])
            if SEs > 0:
                assembly_params['assembly_SE'] = os.path.join(
                    target_dir, "SE." + self.params['format'])

            #All reads have been written at this point, add an assembly to the queue:
            logger.info(
                "Sample: %s target: %s iteration: %s Split %s reads in %s seconds"
                % (self.params['sample'], target, self.params['iteration'],
                   len(reads), time.time() - startT))

            #Only add an assembly job and AssemblyChecker target if there are >0 reads:
            if PEs + SEs > 0:
                checker_params['targets'][target_dir] = False
                self.submit(Assembler.to_job(assembly_params))

        statsf.close()
        logger.info("------------------------------------")
        logger.info("| Sample: %s Iteration %s of numcycles %s" %
                    (checker_params['sample'], checker_params['iteration'],
                     checker_params['numcycles']))
        logger.info("------------------------------------")
        if 'PE1' in self.params and 'PE2' in self.params:
            idx_PE1.close()
            idx_PE2.close()
            del idx_PE1
            del idx_PE2
        if 'SE' in self.params:
            idx_SE.close()
            del idx_SE

        #Kick off a job which checks if all assemblies are done, and if not adds a copy of itself to the job queue
        if len(checker_params['targets']) > 0:
            # checker = AssemblyChecker(checker_params)
            self.submit(AssemblyChecker.to_job(checker_params))
        else:
            logger.info("Sample: %s No reads mapped, no more work to do." %
                        checker_params['sample'])
Example #14
File: mapper.py Project: kdm9/ARC
    def run_blat(self):
        #Check for necessary params:
        if not ('sample' in self.params and 'reference' in self.params
                and 'working_dir' in self.params and
                (('PE1' in self.params and 'PE2' in self.params)
                 or 'SE' in self.params)):
            raise exceptions.FatalError('Missing self.params in run_blat.')
        #Check for necessary files:
        if os.path.exists(self.params['reference']) is False:
            raise exceptions.FatalError("Missing reference file for mapping")
        if 'PE1' in self.params and 'PE2' in self.params:
            if not (os.path.exists(self.params['PE1'])
                    and os.path.exists(self.params['PE2'])):
                raise exceptions.FatalError(
                    "One or both PE files can not be found for mapping.")
        if 'SE' in self.params:
            if not os.path.exists(self.params['SE']):
                raise exceptions.FatalError("SE file cannot be found.")

        #Blat doesn't need an index
        working_dir = self.params['working_dir']

        #Check whether to log to temporary file, or default to os.devnull
        if 'verbose' in self.params:
            out = open(os.path.join(working_dir, "mapping_log.txt"), 'w')
        else:
            out = open(os.devnull, 'w')

        #Build a temporary txt file with all of the reads:
        allreads_outf = open(os.path.join(working_dir, 'reads.txt'), 'w')
        if 'PE1' in self.params and 'PE2' in self.params:
            allreads_outf.write(self.params['PE1'] + '\n')
            allreads_outf.write(self.params['PE2'] + '\n')
        if 'SE' in self.params:
            allreads_outf.write(self.params['SE'] + '\n')
        allreads_outf.close()

        #Do blat mapping
        args = [
            'blat', self.params['reference'],
            os.path.join(working_dir, 'reads.txt')
        ]
        if self.params['format'] == 'fastq':
            args.append('-fastq')
        if self.params['fastmap']:
            args.append('-fastMap')
        #Some new experimental params to increase specificity after the first iteration:
        if self.params['maskrepeats']:
            args.append("-mask=lower")
        if self.params['iteration'] > 0 or not self.params['sloppymapping']:
            args.append("-minIdentity=98")
            args.append("-minScore=40")
        args.append(os.path.join(working_dir, 'mapping.psl'))

        logger.info("Sample: %s Calling blat mapper" % self.params['sample'])
        logger.debug(" ".join(args))
        try:
            ret = subprocess.call(args, stdout=out, stderr=out)
        except Exception as exc:
            txt = (
                "Sample %s: Unhandled error running blat mapping, check log file."
                % self.params['sample']) + '\n\t' + str(exc)
            raise exceptions.FatalError(txt)
        finally:
            out.close()
        if ret != 0:
            raise exceptions.FatalError(
                'Sample: %s Error running blat mapping, check log file. \n\t %s'
                % (self.params['sample'], " ".join(args)))

        #Extract the PSL to a dict
        self.params['mapping_dict'] = self.PSL_to_dict(
            os.path.join(working_dir, 'mapping.psl'))

        #Cleanup (out was already closed in the finally block above)
        os.remove(os.path.join(working_dir, 'mapping.psl'))
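
With fastq input on a later iteration, the blat invocation assembled above comes out roughly as follows (the bracketed paths are placeholders; -fastMap and -mask=lower are added only when the fastmap and maskrepeats params are set):

    blat <reference> <working_dir>/reads.txt -fastq -minIdentity=98 -minScore=40 <working_dir>/mapping.psl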