Example #1
File: mapper.py  Project: kdm9/ARC
    def PSL_to_dict(self, filename):
        """Process a PSL file to the dict format """
        try:
            inf = open(filename, 'r')
        except Exception as inst:
            if type(inst) == IOError:
                logger.error("Failed to open mapping dictionary %s." %
                             filename)
            raise inst
        read_map = {}
        i = 0
        startT = time.time()

        psl_header = False

        for l in inf:
            i += 1
            # Check for PSL header and skip 5 lines if it exists
            if i == 1 and l.split()[0] == 'psLayout':
                psl_header = True
            if psl_header and i <= 5:
                continue
            l2 = l.strip().split("\t")
            readid = keyfunction(self.params['sra'])(
                l2[9])  # .split("/")[0]  # remove unique part of PE reads
            target = l2[13]
            # handle references built using assembled contigs:
            if len(target.split("_:_")) > 1:
                target = target.split("_:_")[1]
            if target not in read_map:
                read_map[target] = {}
            read_map[target][readid] = 1
        logger.info("Sample: %s, Processed %s lines from PSL in %s seconds." %
                    (self.params['sample'], i, time.time() - startT))
        return read_map
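For reference, the mapping dict built above is keyed first by target, then by read ID. A minimal standalone sketch of the same parsing logic against a synthetic PSL line (the line and names below are illustrative, not ARC output; ARC additionally strips the PE suffix via keyfunction):

    # Synthetic 21-column PSL line; fields 9 (qName) and 13 (tName) matter here.
    line = ("60\t0\t0\t0\t0\t0\t0\t0\t+\tread_001\t100\t0\t60\t"
            "sample_:_targetA_:_x\t500\t10\t70\t1\t60,\t0,\t10,")
    l2 = line.strip().split("\t")
    readid = l2[9]
    target = l2[13]
    if len(target.split("_:_")) > 1:    # reference built from assembled contigs
        target = target.split("_:_")[1]
    read_map = {}
    read_map.setdefault(target, {})[readid] = 1
    print(read_map)                     # {'targetA': {'read_001': 1}}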
Example #2
File: mapper.py  Project: ibest/ARC
    def PSL_to_dict(self, filename):
        """Process a PSL file to the dict format """
        try:
            inf = open(filename, 'r')
        except Exception as inst:
            if type(inst) == IOError:
                logger.error("Failed to open mapping dictionary %s." % filename)
            raise inst
        read_map = {}
        i = 0
        startT = time.time()

        psl_header = False

        for l in inf:
            i += 1
            # Check for PSL header and skip 5 lines if it exists
            if i == 1 and l.split()[0] == 'psLayout':
                psl_header = True
            if psl_header and i <= 5:
                continue
            l2 = l.strip().split("\t")
            readid = keyfunction(self.params['sra'])(l2[9])  # .split("/")[0]  # remove unique part of PE reads
            target = l2[13]
            # handle references built using assembled contigs:
            if len(target.split("_:_")) > 1:
                target = target.split("_:_")[1]
            if target not in read_map:
                read_map[target] = {}
            read_map[target][readid] = 1
        logger.info("Sample: %s, Processed %s lines from PSL in %s seconds." % (self.params['sample'], i, time.time() - startT))
        return read_map
Example #3
 def SAM_to_dict(self, filename):
     """ Read a SAM file to a mapping dict and return it """
     #Check for necessary files:
     if os.path.exists(filename) is False:
         raise exceptions.FatalError("Missing SAM file")
     try:
         inf = open(filename, 'r')
     except Exception as exc:
         txt = "Failed to open SAM file %s" % filename
         txt += '\n\t' + str(exc)
         raise exceptions.FatalError(txt)
     read_map = {}  # target:{read} dictionary of dictionaries
     i = 0
     startT = time.time()
     for l in inf:
         i += 1
         if l[0] != "@":  # skip header lines
             l2 = l.strip().split()
             if l2[2] == "*":  # skip unmapped
                 continue
             readid = l2[0].split("/")[0]
             target = l2[2]
             #handle references built using assembled contigs:
             if len(target.split("_:_")) == 3:
                 target, status = target.split("_:_")[1:]
                 # This keeps ARC from writing reads which mapped to finished contigs
                 if status.startswith("Contig") or status.startswith("isogroup"):
                     continue
             if target not in read_map:
                 read_map[target] = {}
             read_map[target][readid] = 1
     inf.close()
     #Report total time:
     logger.info("Sample: %s, Processed %s lines from SAM in %s seconds." % (self.params['sample'], i, time.time() - startT))
     return read_map
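The SAM variant builds the same target:{read} structure, with two extra filters: header lines and unmapped reads are skipped, and reads that hit already-finished contigs are dropped. A minimal sketch against a synthetic alignment record (the line below is made up):

    # Synthetic SAM alignment line; only QNAME (field 0) and RNAME (field 2) are used here.
    sam = "read_42/1\t0\tsample1_:_targetB_:_Contig001\t100\t60\t4M\t*\t0\t0\tACGT\tIIII"
    l2 = sam.strip().split()
    if l2[2] != "*":                          # skip unmapped reads
        readid = l2[0].split("/")[0]          # 'read_42'
        target, status = l2[2].split("_:_")[1:]
        finished = status.startswith("Contig") or status.startswith("isogroup")
        print("%s -> %s (maps to finished contig: %s)" % (readid, target, finished))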
Example #4
 def RunMapAgainstReads(self):
     """
     A pseudo-assembler for cases where we don't actually assemble reads and instead just write them out as contigs.
     """
     #print "Creating finished file: " + os.path.join(self.params['target_dir'], 'finished')
     start = time.time()
     outf = open(os.path.join(self.params['target_dir'], 'finished'), 'w')
     outf.write("map_against_reads")
     sample = self.params['sample']
     target = self.params['target']
     logger.info("Sample: %s target: %s iteration: %s Assembly finished in %s seconds" % (sample, target, self.params['iteration'], time.time() - start))
     outf.close()
Example #5
 def start(self):
     """ run through list of targets, check any that haven't finished already """
     sample = self.params['sample']
     completed = sum(self.params['targets'].values())
     logger.info("Sample: %s AssemblyChecker started with %s of %s targets completed" % (sample, completed, len(self.params['targets'])))
     for target_folder in self.params['targets']:
         if not self.params['targets'][target_folder]:
             f = os.path.join(target_folder, 'finished')
             if os.path.exists(f):
                 self.params['targets'][target_folder] = True
                 logger.info("%s exists" % f)
                 completed += 1
     #Now check whether all have finished, if not, add a new AssemblyChecker to the queue
     if len(self.params['targets']) > sum(self.params['targets'].values()):
         #some jobs haven't completed yet
         checker_params = {}
         for k in self.params:
             checker_params[k] = self.params[k]
         #checker_params = deepcopy(self.params)
         # checker = AssemblyChecker(checker_params)
         time.sleep(5)  # sleep 5 seconds before putting a checker back on the job_q
         self.submit(AssemblyChecker.to_job(checker_params))
         logger.info("Sample: %s Assemblies not finished: %s of %s targets completed" % (sample, completed, len(self.params['targets'])))
     else:
         params = {}
         for k in self.params:
             params[k] = self.params[k]
         # params = deepcopy(self.params)
         # finisher = Finisher(params)
         logger.debug("Sample: %s, iteration %s, Submitting finisher job to queue." % (sample, self.params['iteration']))
         self.submit(Finisher.to_job(params))
         logger.info("Sample: %s Assemblies finished: %s of %s targets completed" % (sample, completed, len(self.params['targets'])))
Example #6
File: app.py  Project: samhunter/ARC
    def start(self, loglevel, configfile='ARC_config.txt'):
        try:
            logger.setup(loglevel=loglevel)

            logger.info("Reading config file...")
            config = Config(configfile)
            values = config.get()

            logger.info(
                "Setting up working directories and building indexes...")
            self.setup(values)

            spawn = Spawn(values)

            logger.info("Running ARC.")
            spawn.submit()
            spawn.run()

            logger.info("Cleaning up.")
            self.clean()

            return 0
        except FatalError as e:
            logger.error("A fatal error was encountered. \n\t%s" % str(e))
            return 1
        except (KeyboardInterrupt, SystemExit):
            self.clean()
            logger.error("%s unexpectedly terminated" % (__name__))
            return 1
Example #7
 def RunMapAgainstReads(self):
     """
     A pseudo-assembler for cases where we don't actually assemble reads and instead just write them out as contigs.
     """
     #print "Creating finished file: " + os.path.join(self.params['target_dir'], 'finished')
     start = time.time()
     outf = open(os.path.join(self.params['target_dir'], 'finished'), 'w')
     outf.write("map_against_reads")
     sample = self.params['sample']
     target = self.params['target']
     logger.info(
         "Sample: %s target: %s iteration: %s Assembly finished in %s seconds"
         % (sample, target, self.params['iteration'], time.time() - start))
     outf.close()
Example #8
    def start(self, loglevel, configfile='ARC_config.txt'):
        try:
            logger.setup(loglevel=loglevel)

            logger.info("Reading config file...")
            config = Config(configfile)
            values = config.get()

            logger.info(
                "Setting up working directories and building indexes...")
            self.setup(values)

            spawn = Spawn(values)

            logger.info("Running ARC.")
            spawn.submit()
            spawn.run()

            logger.info("Cleaning up.")
            self.clean()

            return 0
        except FatalError as e:
            logger.error("A fatal error was encountered. \n\t%s" % str(e))
            return 1
        except (KeyboardInterrupt, SystemExit):
            self.clean()
            logger.error("%s unexpectedly terminated" % (__name__))
            return 1
Example #9
File: config.py  Project: ibest/ARC
 def set_defaults(self):
     for key, value in self.OPTIONS.iteritems():
         if key not in self.config:
             if value is None:
                 raise exceptions.FatalError(
                     "Error, %s required but not specificed in "
                     "ARC_self.config.txt" % key)
             else:
                 logger.info(
                     "%s not specified in ARC_config.txt, defaulting to "
                     "%s" % (key, value))
                 self.config[key] = value
     # Anything listed below here is not expected to be in the config but
     # needs to be initialized.
     self.config['iteration'] = 0
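Stripped of the ARC specifics, the rule in set_defaults is: a default of None marks a required key, anything else fills in silently when absent. A minimal sketch (note that iteritems() above is Python 2; items() is the portable spelling):

    OPTIONS = {'mapper': 'bowtie2', 'numcycles': 1, 'reference': None}
    config = {'reference': 'targets.fasta'}

    for key, value in OPTIONS.items():
        if key not in config:
            if value is None:
                raise ValueError("%s required but not specified" % key)
            config[key] = value      # fall back to the documented default

    print(config)   # reference, plus the mapper and numcycles defaults (key order may vary)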
Example #10
 def set_defaults(self):
     for key, value in self.OPTIONS.iteritems():
         if key not in self.config:
             if value is None:
                 raise exceptions.FatalError(
                     "Error, %s required but not specificed in "
                     "ARC_self.config.txt" % key)
             else:
                 logger.info(
                     "%s not specified in ARC_config.txt, defaulting to "
                     "%s" % (key, value))
                 self.config[key] = value
     # Anything listed below here is not expected to be in the config but
     # needs to be initialized.
     self.config['iteration'] = 0
Example #11
File: mapper.py  Project: kdm9/ARC
 def SAM_to_dict(self, filename):
     """ Read a SAM file to a mapping dict and return it """
     #Check for necessary files:
     if os.path.exists(filename) is False:
         raise exceptions.FatalError("Missing SAM file")
     try:
         inf = open(filename, 'r')
     except Exception as exc:
         txt = "Failed to open SAM file %s" % filename
         txt += '\n\t' + str(exc)
         raise exceptions.FatalError(txt)
     read_map = {}  # target:{read} dictionary of dictionaries
     i = 0
     discards = 0
     startT = time.time()
     for l in inf:
         i += 1
         if l[0] != "@":  # skip header lines
             l2 = l.strip().split()
             if l2[2] == "*":  # skip unmapped
                 continue
             readid = keyfunction(self.params['sra'])(
                 l2[0])  # .split("/")[0]
             target = l2[2]
             # handle references built using assembled contigs:
             if len(target.split("_:_")) == 3:
                 target, status = target.split("_:_")[1:]
                 # This keeps ARC from writing reads which mapped to finished contigs
                 if status.startswith("Contig") or status.startswith(
                         "isogroup"):
                     discards += 1
                     continue
             if target not in read_map:
                 read_map[target] = {}
             read_map[target][readid] = 1
     inf.close()
     # Report total time:
     logger.info("Sample: %s, Processed %s lines from SAM in %s seconds." %
                 (self.params['sample'], i, time.time() - startT))
     if discards > 0:
         logger.info(
             "%s out of %s reads mapped to finished contigs and were not recruited for assembly."
             % (discards, i))
     return read_map
Example #12
File: spawn.py  Project: ibest/ARC
    def submit(self):
        # Get the number of samples from the configuration
        logger.info("Submitting initial mapping runs.")

        for sample in self.config['Samples']:
            s = self.config['Samples'][sample]
            params = {}
            for k in self.config:
                params[k] = self.config[k]
            params['working_dir'] = s['working_dir']
            params['finished_dir'] = s['finished_dir']
            #params['reference'] = s['reference']
            params['reference'] = os.path.join(s['working_dir'], 'I000_contigs.fasta')
            params['sample'] = sample

            if 'PE1' in s and 'PE2' in s:
                params['PE1'] = s['PE1']
                params['PE2'] = s['PE2']
            if 'SE' in s:
                params['SE'] = s['SE']

            # mapper = Mapper(params)
            self.q.put(Mapper.to_job(params))
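The submit pattern here — shallow-copy the shared config into per-sample params, then enqueue a serializable job — can be sketched without the ARC classes (the tuple below is a stand-in for Mapper.to_job, and the config values are illustrative):

    try:
        import Queue as queue    # Python 2
    except ImportError:
        import queue             # Python 3

    config = {'nprocs': 4, 'Samples': {'s1': {'working_dir': 'working_s1'}}}
    q = queue.Queue()

    for sample, s in config['Samples'].items():
        params = dict(config)            # shallow copy, as the k-loop above does
        params['sample'] = sample
        params['working_dir'] = s['working_dir']
        q.put(('mapper', params))        # ARC enqueues Mapper.to_job(params)

    print(q.qsize())                     # 1 job queued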
Example #13
    def submit(self):
        # Get the number of samples from the configuration
        logger.info("Submitting initial mapping runs.")

        for sample in self.config['Samples']:
            s = self.config['Samples'][sample]
            params = {}
            for k in self.config:
                params[k] = self.config[k]
            params['working_dir'] = s['working_dir']
            params['finished_dir'] = s['finished_dir']
            #params['reference'] = s['reference']
            params['reference'] = os.path.join(s['working_dir'],
                                               'I000_contigs.fasta')
            params['sample'] = sample

            if 'PE1' in s and 'PE2' in s:
                params['PE1'] = s['PE1']
                params['PE2'] = s['PE2']
            if 'SE' in s:
                params['SE'] = s['SE']

            # mapper = Mapper(params)
            self.q.put(Mapper.to_job(params))
Example #14
File: mapper.py  Project: ibest/ARC
 def start(self):
     if not('mapper' in self.params):
         raise exceptions.FatalError("mapper not defined in params")
     if self.params['mapper'] == 'bowtie2':
         logger.info("Sample: %s Running bowtie2." % self.params['sample'])
         self.run_bowtie2()
     if self.params['mapper'] == 'blat':
         logger.info("Sample: %s Running blat." % self.params['sample'])
         self.run_blat()
     #Mapping is done, run splitreads:
     logger.info("Sample: %s Running splitreads." % self.params['sample'])
     self.splitreads()
Example #15
File: mapper.py  Project: kdm9/ARC
 def start(self):
     if not ('mapper' in self.params):
         raise exceptions.FatalError("mapper not defined in params")
     if self.params['mapper'] == 'bowtie2':
         logger.info("Sample: %s Running bowtie2." % self.params['sample'])
         self.run_bowtie2()
     if self.params['mapper'] == 'blat':
         logger.info("Sample: %s Running blat." % self.params['sample'])
         self.run_blat()
     #Mapping is done, run splitreads:
     logger.info("Sample: %s Running splitreads." % self.params['sample'])
     self.splitreads()
Example #16
 def start(self):
     """ run through list of targets, check any that haven't finished already """
     sample = self.params['sample']
     completed = sum(self.params['targets'].values())
     logger.info(
         "Sample: %s AssemblyChecker started with %s of %s targets completed"
         % (sample, completed, len(self.params['targets'])))
     for target_folder in self.params['targets']:
         if not self.params['targets'][target_folder]:
             f = os.path.join(target_folder, 'finished')
             if os.path.exists(f):
                 self.params['targets'][target_folder] = True
                 logger.info("%s exists" % f)
                 completed += 1
     #Now check whether all have finished, if not, add a new AssemblyChecker to the queue
     if len(self.params['targets']) > sum(self.params['targets'].values()):
         #some jobs haven't completed yet
         checker_params = {}
         for k in self.params:
             checker_params[k] = self.params[k]
         #checker_params = deepcopy(self.params)
         # checker = AssemblyChecker(checker_params)
         time.sleep(5)  # sleep 5 seconds before putting a checker back on the job_q
         self.submit(AssemblyChecker.to_job(checker_params))
         logger.info(
             "Sample: %s Assemblies not finished: %s of %s targets completed"
             % (sample, completed, len(self.params['targets'])))
     else:
         params = {}
         for k in self.params:
             params[k] = self.params[k]
         # params = deepcopy(self.params)
         # finisher = Finisher(params)
         logger.debug(
             "Sample: %s, iteration %s, Submitting finisher job to queue." %
             (sample, self.params['iteration']))
         self.submit(Finisher.to_job(params))
         logger.info(
             "Sample: %s Assemblies finished: %s of %s targets completed" %
             (sample, completed, len(self.params['targets'])))
Example #17
    def RunSpades(self):
        """
        Several arguments can be passed to spades.py: -1 [PE1], -2 [PE2], -s [SE], and -o [target_dir]
        """
        #Check that required params are available
        if not (('assembly_PE1' in self.params and 'assembly_PE2' in self.params) or ('assembly_SE' in self.params)):
            raise exceptions.FatalError('Missing self.params in RunSpades.')

        #Check that the files actually exist
        if 'assembly_PE1' in self.params and 'assembly_PE2' in self.params and not (os.path.exists(self.params['assembly_PE1']) and os.path.exists(self.params['assembly_PE2'])):
            raise exceptions.FatalError('Missing PE files in RunSpades.')
        if 'assembly_SE' in self.params and not(os.path.exists(self.params['assembly_SE'])):
            raise exceptions.FatalError('Missing SE file in RunSpades.')

        sample = self.params['sample']
        target = self.params['target']

        #Build args for assembler call
        args = ['spades.py', '-t', '1']
        if self.params['only-assembler'] and not self.params['last_assembly']:
            args.append("--only-assembler")
        if self.params['format'] == 'fasta':
            args.append('--only-assembler')  # spades errors on read correction if the input isn't fastq
        if 'assembly_PE1' in self.params and 'assembly_PE2' in self.params:
            args += ['-1', self.params['assembly_PE1'], '-2', self.params['assembly_PE2']]
        if 'assembly_SE' in self.params:
            args += ['-s', self.params['assembly_SE']]
        args += ['-o', os.path.join(self.params['target_dir'], 'assembly')]
        if self.params['verbose']:
            out = open(os.path.join(self.params['target_dir'], "assembly.log"), 'w')
        else:
            out = open(os.devnull, 'w')

        logger.debug("Sample: %s target: %s Running spades assembler." % (sample, target))
        logger.info(" ".join(args))
        killed = False
        failed = False
        start = time.time()
        try:
            #ret = subprocess.call(args, stderr=out, stdout=out)
            ret = subprocess.Popen(args, stdout=out, stderr=out)
            pid = ret.pid
            while ret.poll() is None:
                if time.time() - start > self.params['assemblytimeout']:
                    ret.kill()
                    killed = True
                    logger.warn("Sample: %s target: %s Assembly killed after %s seconds." % (sample, target, time.time() - start))
                    break
                time.sleep(.5)
        except Exception as exc:
            txt = ("Sample: %s, Target: %s: Unhandeled error running Spades assembly" % (sample, target))
            txt += '\n\t' + str(exc)
            logger.warn(txt)
            failed = True
            pass
        finally:
            out.close()

        #Ensure that assembler exits cleanly:
        self.kill_process_children(pid)

        if not killed and ret.poll() != 0:
            failed = True
        if failed:
            logger.info("Sample: %s target: %s iteration: %s Assembly failed after %s seconds" % (sample, target, self.params['iteration'], time.time() - start))
            outf = open(os.path.join(self.params['target_dir'], "finished"), 'w')
            outf.write("assembly_failed")
            outf.close()
        elif killed:
            logger.info("Sample: %s target: %s iteration: %s Assembly killed after %s seconds" % (sample, target, self.params['iteration'], time.time() - start))
            outf = open(os.path.join(self.params['target_dir'], "finished"), 'w')
            outf.write("assembly_killed")
            outf.close()
        else:
            #Run finished without error
            logger.info("Sample: %s target: %s iteration: %s Assembly finished in %s seconds" % (sample, target, self.params['iteration'], time.time() - start))
            outf = open(os.path.join(self.params['target_dir'], "finished"), 'w')
            outf.write("assembly_complete")
            outf.close()
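The timeout handling in RunSpades is a reusable pattern: launch with Popen, poll twice a second, and kill the process once the deadline passes. A standalone sketch of that pattern (the sleep command is just a placeholder workload, not an ARC call):

    import os
    import subprocess
    import time

    def run_with_timeout(args, timeout, out):
        # Returns (returncode or None, killed flag).
        proc = subprocess.Popen(args, stdout=out, stderr=out)
        start = time.time()
        while proc.poll() is None:
            if time.time() - start > timeout:
                proc.kill()
                return None, True
            time.sleep(0.5)
        return proc.returncode, False

    with open(os.devnull, 'w') as devnull:
        print(run_with_timeout(['sleep', '1'], 10, devnull))   # (0, False)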
Example #18
File: mapper.py  Project: ibest/ARC
    def run_bowtie2(self):
        """
        Builds idx and runs bowtie2 -I 0 -X 1500 --local
        Expects params:
            sample, target, reference, working_dir, PE1 and PE2 and/or SE
        """
        #Check for necessary params:
        if not ('sample' in self.params and 'reference' in self.params and
                'working_dir' in self.params and (('PE1' in self.params and
                              'PE2' in self.params) or 'SE' in self.params)):
            raise exceptions.FatalError('Missing params in run_bowtie2.')
        #Check for necessary files:
        if os.path.exists(self.params['reference']) is False:
            raise exceptions.FatalError("Missing reference file for mapping")
        if 'PE1' in self.params and 'PE2' in self.params:
            if not (os.path.exists(self.params['PE1']) and
                    os.path.exists(self.params['PE2'])):
                raise exceptions.FatalError(
                    "One or both PE files can not be found for mapping.")
        if 'SE' in self.params:
            if not os.path.exists(self.params['SE']):
                raise exceptions.FatalError("SE file cannot be found.")

        #Make idx directory
        try:
            working_dir = self.params['working_dir']
            idx_dir = os.path.realpath(os.path.join(working_dir, 'idx'))
            os.mkdir(idx_dir)
        except Exception as exc:
            txt = "Sample: %s Error creating working directory." % (
                self.params['sample']) + '\n\t' + str(exc)
            raise exceptions.FatalError(txt)

        #Check whether to log to temporary file, or default to os.devnull
        if 'verbose' in self.params:
            out = open(os.path.join(working_dir, "mapping_log.txt"), 'w')
        else:
            out = open(os.devnull, 'w')

        #Set up a path to the index
        base = os.path.join(idx_dir, 'idx')

        #Build index
        #The idea is to map against the finished contigs and in-progress
        # contigs, thereby ensuring that the -k parameter (or best map)
        # are respected properly, and avoid the situation where reads which
        # were mapped to a now-finished target might later be mapped to an
        # in-progress target.
        fin_outf = os.path.join(self.params['finished_dir'], 'contigs.fasta')
        args = ['bowtie2-build', '-f']
        if os.path.exists(fin_outf) and os.path.getsize(fin_outf) > 0:
            args.append(','.join((fin_outf, self.params['reference'])))
        else:
            args.append(self.params['reference'])
        args.append(base)
        logger.info("Sample: %s Calling bowtie2-build." %
                    self.params['sample'])
        logger.info(" ".join(args))
        try:
            ret = subprocess.call(args, stdout=out, stderr=out)
        except Exception as exc:
            txt = ("Sample %s: Unhandeled error running bowtie2-build"
                   % self.params['sample']) + '\n\t' + str(exc)
            # make sure that out is closed before throwing exception
            out.close()
            raise exceptions.FatalError(txt)

        if ret != 0:
            out.close()
            raise exceptions.FatalError(
                "Sample: %s Error creating bowtie2 index, check log file."
                % self.params['sample'])

        #Do bowtie2 mapping:
        n_bowtieprocs = int(round(max(float(self.params['nprocs'])/len(self.params['Samples']), 1)))
        args = ['bowtie2', '-I', '0', '-X', '1500', '--no-unal']

        #Tune the sensitivity so that on the first iteration the mapper is
        # very sensitive. On later iterations the mapper is very specific.
        if self.params['iteration'] == 0 and self.params['sloppymapping']:
            args.append("--very-sensitive-local")
        else:
            args += ["--very-fast-local", "--mp", "12", "--rdg", "12,6",
                     "--rfg", "12,6"]

        args += ['-p', str(n_bowtieprocs), '-x', base]
        if self.params['bowtie2_k'] > 1:
            args += ['-k', str(self.params['bowtie2_k'])]
        if self.params['format'] == 'fasta':
            args += ['-f']
        if 'PE1' in self.params and 'PE2' in self.params:
            args += ['-1', self.params['PE1'], '-2', self.params['PE2']]
        if 'SE' in self.params:
            args += ['-U', self.params['SE']]
        args += ['-S', os.path.join(working_dir, 'mapping.sam')]
        logger.info(
            "Sample: %s Calling bowtie2 mapper" % self.params['sample'])
        logger.info(" ".join(args))

        try:
            ret = subprocess.call(args, stdout=out, stderr=out)
        except Exception as exc:
            txt = ("Sample %s: Unhandled error running bowtie2 mapping" %
                   self.params['sample']) + '\n\t' + str(exc)
            raise exceptions.FatalError(txt)
        finally:
            out.close()

        if ret != 0:
            raise exceptions.FatalError(
                "Sample %s: Bowtie2 mapping returned an error, check log file."
                % self.params['sample'])

        #Extract the SAM to a dict
        self.params['mapping_dict'] = self.SAM_to_dict(
            os.path.join(working_dir, 'mapping.sam'))
        #clean up intermediary files:
        os.remove(os.path.join(working_dir, 'mapping.sam'))
        os.system("rm -rf %s" % idx_dir)
Example #19
    def RunNewbler(self):
        #Code for running newbler
        """
        Expects params keys:
            assembly_PE1 and assembly_PE2, and/or assembly_SE
            target_dir
            urt
        """
        #Check for necessary params:
        if not (
            ('assembly_PE1' in self.params and 'assembly_PE2' in self.params)
                or 'assembly_SE' in self.params):
            raise exceptions.FatalError('Missing self.params in RunNewbler.')

        #Check for necessary files:
        if 'assembly_PE1' in self.params and 'assembly_PE2' in self.params and not (
                os.path.exists(self.params['assembly_PE1'])
                and os.path.exists(self.params['assembly_PE2'])):
            raise exceptions.FatalError('Missing PE files in RunNewbler.')

        if 'assembly_SE' in self.params and not (os.path.exists(
                self.params['assembly_SE'])):
            raise exceptions.FatalError('Missing SE file in RunNewbler.')

        sample = self.params['sample']
        target = self.params['target']
        killed = False
        failed = False

        #determine whether to pipe output to a file or /dev/null
        if self.params['verbose']:
            out = open(os.path.join(self.params['target_dir'], "assembly.log"),
                       'w')
        else:
            out = open(os.devnull, 'w')

        #Build args for newAssembly:
        args = ['newAssembly', '-force']
        if self.params['last_assembly'] and self.params['cdna']:
            #only run with cdna switch on the final assembly
            args += ['-cdna']
        args += [os.path.join(self.params['target_dir'], 'assembly')]
        logger.debug("Calling newAssembly for sample: %s target %s" %
                     (sample, target))
        logger.info(" ".join(args))
        ret = subprocess.call(args, stdout=out, stderr=out)
        #Build args for addRun:
        if 'assembly_PE1' in self.params and 'assembly_PE2' in self.params:
            args = [
                'addRun',
                os.path.join(self.params['target_dir'], 'assembly')
            ]
            args += [self.params['assembly_PE1']]
            logger.debug("Calling addRun for sample: %s target %s" %
                         (sample, target))
            logger.debug(" ".join(args))
            ret = subprocess.call(args, stdout=out, stderr=out)

            args = [
                'addRun',
                os.path.join(self.params['target_dir'], 'assembly')
            ]
            args += [self.params['assembly_PE2']]
            logger.debug("Calling addRun for sample: %s target %s" %
                         (sample, target))
            logger.debug(" ".join(args))
            ret = subprocess.call(args, stdout=out, stderr=out)
        if 'assembly_SE' in self.params:
            args = [
                'addRun',
                os.path.join(self.params['target_dir'], 'assembly')
            ]
            args += [self.params['assembly_SE']]
            logger.debug("Calling addRun for sample: %s target %s" %
                         (sample, target))
            logger.debug(" ".join(args))
            ret = subprocess.call(args, stdout=out, stderr=out)

        #Build args for runProject
        args = ['runProject']
        args += ['-cpu', '1']
        if self.params['last_assembly'] and self.params['cdna']:
            args += ['-noace']
        else:
            args += ['-nobig']
        if self.params['urt'] and not self.params['last_assembly']:
            #only run with the -urt switch when it isn't the final assembly
            args += ['-urt']
        if self.params['rip']:
            args += ['-rip']
        args += [os.path.join(self.params['target_dir'], 'assembly')]
        try:
            start = time.time()
            logger.debug("Calling runProject for sample: %s target %s" %
                         (sample, target))
            logger.debug(" ".join(args))
            ret = subprocess.Popen(args, stdout=out, stderr=out)
            pid = ret.pid
            while ret.poll() is None:
                if time.time() - start > self.params['assemblytimeout']:
                    self.kill_process_children(pid)
                    logger.warn(
                        "Sample: %s target: %s iteration: %s Killing assembly after %s seconds"
                        % (sample, target, self.params['iteration'],
                           time.time() - start))
                    killed = True
                    break
                time.sleep(.5)
        except Exception as exc:
            txt = "Sample: %s, Target: %s: Unhandled error running Newbler assembly" % (
                self.params['sample'], self.params['target'])
            txt += '\n\t' + str(exc) + '\n' + traceback.format_exc()
            logger.warn(txt)
            failed = True
        finally:
            out.close()

        #Sometimes newbler doesn't seem to exit completely:
        self.kill_process_children(pid)

        #if ret != 0:
        #raise exceptions.RerunnableError("Newbler assembly failed.")

        if not killed and ret.poll() != 0:
            #raise exceptions.RerunnableError("Newbler assembly failed.")
            failed = True

        if failed:
            logger.info(
                "Sample: %s target: %s iteration: %s Assembly failed after %s seconds"
                % (sample, target, self.params['iteration'],
                   time.time() - start))
            outf = open(os.path.join(self.params['target_dir'], "finished"),
                        'w')
            outf.write("assembly_failed\t" + str(time.time() - start))
            outf.close()
        elif killed:
            logger.info(
                "Sample: %s target: %s iteration: %s Assembly killed after %s seconds"
                % (sample, target, self.params['iteration'],
                   time.time() - start))
            outf = open(os.path.join(self.params['target_dir'], "finished"),
                        'w')
            outf.write("assembly_killed\t" + str(time.time() - start))
            outf.close()
        else:
            #Run finished without error
            logger.info(
                "Sample: %s target: %s iteration: %s Assembly finished in %s seconds"
                % (sample, target, self.params['iteration'],
                   time.time() - start))
            outf = open(os.path.join(self.params['target_dir'], "finished"),
                        'w')
            outf.write("assembly_complete\t" + str(time.time() - start))
            outf.close()
Example #20
File: app.py  Project: samhunter/ARC
    def setup(self, config):
        """
            Set up working folder for each sample. Also assign a "safe_target"
            name to each target so that folder creation works. This is a little
            bit tricky because if the user has targets with the _:_ separator
            in the name it messes up the splitter and SAM_to_dict. This code is
            therefore written with the assumption that the user has put the _:_
            in the name purposely so that multiple entries in the reference
            fasta will be treated as a single target.
        """
        format = config['format']
        for sample in config['Samples']:
            s = config['Samples'][sample]
            working_dir = os.path.realpath(config['workingdirectory'] + '/working_' + sample)
            #working_dir = os.path.realpath('./working_' + sample)
            finished_dir = os.path.realpath('./finished_' + sample)
            config['Samples'][sample]['working_dir'] = working_dir
            config['Samples'][sample]['finished_dir'] = finished_dir
            if os.path.exists(working_dir):
                logger.info(
                    "WARNING working directory already exists for "
                    "sample %s, deleting old results if any." % (sample))
                os.system('rm -rf %s' % finished_dir)
                os.system('rm -rf %s/t__*' % working_dir)
                os.system('rm -rf %s/*.psl' % working_dir)
                os.system('rm %s/I*_contigs.fasta' % working_dir)
                if os.path.exists('%s/idx' % working_dir):
                    os.system('rm -rf %s/idx' % working_dir)
                os.mkdir(finished_dir)
            else:
                os.mkdir(working_dir)
                os.mkdir(finished_dir)

            # Create stats file:
            statsf = open(os.path.join(finished_dir, "mapping_stats.tsv"), 'w')
            statsf.write('\t'.join(
                ['Sample', 'Target', 'Iteration', 'Reads']) + '\n')
            statsf.close()

            # Create Target Summary Table
            tstf = open(os.path.join(finished_dir, "target_summary_table.tsv"), 'w')
            tstf.write('\t'.join(
                ['Sample', 'Target', 'RefLen', 'Status', 'Iteration', 'Reads', 'Contigs', 'ContigLength']) + '\n')
            tstf.close()

            #Create a stats file for cdna
            if config['cdna']:
                countsf = open(os.path.join(finished_dir, "isogroup_read_counts.tsv"), 'a')
                countsf.write('\t'.join(['Sample', 'Target', 'isogroup', 'readcount']) + '\n')
                countsf.close()

            # Build a separate index for each read file in the input, put them
            # in working_dir
            #Consider parallelizing this?
            start = time.time()
            if 'PE1' in s:
                if not os.path.exists(os.path.join(working_dir, "PE1.idx")):
                    print s['PE1']
                    p1 = SeqIO.index_db(
                        os.path.join(working_dir, "PE1.idx"),
                        s['PE1'],
                        format,
                        key_function=lambda x: x.split("/")[0])
            if 'PE2' in s:
                if not os.path.exists(os.path.join(working_dir, "PE2.idx")):
                    print s['PE2']
                    p2 = SeqIO.index_db(
                        os.path.join(working_dir, "PE2.idx"),
                        s['PE2'],
                        format,
                        key_function=lambda x: x.split("/")[0])
                    if len(p1) != len(p2):
                        logger.error("The number of reads in %s and %s do not match, "
                                     "check the config for errors" % (s['PE1'], s['PE2']))
            if 'SE' in s:
                if not os.path.exists(os.path.join(working_dir, "SE.idx")):
                    print s['SE']
                    SeqIO.index_db(
                        os.path.join(working_dir, "SE.idx"),
                        s['SE'],
                        format,
                        key_function=lambda x: x.split("/")[0])

            logger.info(
                "Sample: %s, indexed reads in %s seconds." % (
                    sample, time.time() - start))

            #Read through the references, mask them if necessary

            #mapper_params['reference'] = os.path.join(self.params['working_dir'], 'I%03d' % self.params['iteration'] + '_contigs.fasta')

        # Read through the reference, set up a set of safe names for the targets.
        # Also create the Target Summary Table which is indexed by original target name (following ARC conventions)
        # Also mask sequences and write them to a new set of output files
        #safe_targets is a two-way lookup, meaning it has both the safe target ID and the contig ID.
        summary_stats = {}
        safe_targets = {}
        new_refsf = {}
        for sample in config['Samples']:
            s = config['Samples'][sample]
            new_refsf[sample] = open(os.path.join(s['working_dir'], 'I000_contigs.fasta'), 'w')

        i = 0
        for t in SeqIO.parse(config['reference'], "fasta"):
            if len(t.name.split("_:_")) == 1:
                target = t.name
            else:
                target = t.name.split("_:_")[1]

            safe_targets[target] = "t__%06d" % i
            safe_targets["t__%06d" % i] = target
            i += 1
            if target not in summary_stats:
                summary_stats[target] = {'targetLength': len(t)}
            else:
                summary_stats[target]['targetLength'] = (summary_stats[target]['targetLength'] + len(t))

            #Write contigs:
            if config['maskrepeats']:
                #t.seq = Seq(str(mask_seq(t.seq.tostring(), config['mapper'])))
                t.seq = Seq(str(mask_seq(str(t.seq), config['mapper'])))
            #Bowtie2 crashes if a contig is all 'n' so only write it out if it isn't
            if len(t) != t.seq.count('n'):
                for outf in new_refsf.values():
                    SeqIO.write(t, outf, "fasta")
            else:
                writeTargetStats(finished_dir=s['finished_dir'],
                                 sample=sample,
                                 target=target,
                                 targetLength=summary_stats[target]['targetLength'],
                                 status='MaskedOut',
                                 iteration=0,
                                 readcount=0,
                                 num_contigs=0, contig_length=0)
                del summary_stats[target]

        config['safe_targets'] = safe_targets
        config['summary_stats'] = summary_stats
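safe_targets deserves a closer look: it is deliberately a two-way mapping, so the same dict resolves an original target name to its filesystem-safe ID and back. A minimal sketch with made-up target names:

    safe_targets = {}
    for i, target in enumerate(['gene-A', 'gene B/long']):
        safe = "t__%06d" % i
        safe_targets[target] = safe      # forward: name -> safe folder ID
        safe_targets[safe] = target      # reverse: safe folder ID -> name

    print(safe_targets['gene-A'])        # t__000000
    print(safe_targets['t__000001'])     # gene B/long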
Example #21
    def RunSpades(self):
        """
        Several arguments can be passed to spades.py: -1 [PE1], -2 [PE2], -s [SE], and -o [target_dir]
        """
        #Check that required params are available
        if not (('assembly_PE1' in self.params
                 and 'assembly_PE2' in self.params) or
                ('assembly_SE' in self.params)):
            raise exceptions.FatalError('Missing self.params in RunSpades.')

        #Check that the files actually exist
        if 'assembly_PE1' in self.params and 'assembly_PE2' in self.params and not (
                os.path.exists(self.params['assembly_PE1'])
                and os.path.exists(self.params['assembly_PE2'])):
            raise exceptions.FatalError('Missing PE files in RunSpades.')
        if 'assembly_SE' in self.params and not (os.path.exists(
                self.params['assembly_SE'])):
            raise exceptions.FatalError('Missing SE file in RunSpades.')

        sample = self.params['sample']
        target = self.params['target']

        #Build args for assembler call
        args = ['spades.py', '-t', '1']
        if self.params['only-assembler'] and not self.params['last_assembly']:
            args.append("--only-assembler")
        if self.params['format'] == 'fasta':
            args.append('--only-assembler')  # spades errors on read correction if the input isn't fastq
        if 'assembly_PE1' in self.params and 'assembly_PE2' in self.params:
            args += [
                '-1', self.params['assembly_PE1'], '-2',
                self.params['assembly_PE2']
            ]
        if 'assembly_SE' in self.params:
            args += ['-s', self.params['assembly_SE']]
        args += ['-o', os.path.join(self.params['target_dir'], 'assembly')]
        if self.params['verbose']:
            out = open(os.path.join(self.params['target_dir'], "assembly.log"),
                       'w')
        else:
            out = open(os.devnull, 'w')

        logger.debug("Sample: %s target: %s Running spades assembler." %
                     (sample, target))
        logger.info(" ".join(args))
        killed = False
        failed = False
        start = time.time()
        try:
            #ret = subprocess.call(args, stderr=out, stdout=out)
            ret = subprocess.Popen(args, stdout=out, stderr=out)
            pid = ret.pid
            while ret.poll() is None:
                if time.time() - start > self.params['assemblytimeout']:
                    ret.kill()
                    killed = True
                    logger.warn(
                        "Sample: %s target: %s Assembly killed after %s seconds."
                        % (sample, target, time.time() - start))
                    break
                time.sleep(.5)
        except Exception as exc:
            txt = (
                "Sample: %s, Target: %s: Unhandled error running Spades assembly"
                % (sample, target))
            txt += '\n\t' + str(exc)
            logger.warn(txt)
            failed = True
        finally:
            out.close()

        #Ensure that assembler exits cleanly:
        self.kill_process_children(pid)

        if not killed and ret.poll() != 0:
            failed = True
        if failed:
            logger.info(
                "Sample: %s target: %s iteration: %s Assembly failed after %s seconds"
                % (sample, target, self.params['iteration'],
                   time.time() - start))
            outf = open(os.path.join(self.params['target_dir'], "finished"),
                        'w')
            outf.write("assembly_failed")
            outf.close()
        elif killed:
            logger.info(
                "Sample: %s target: %s iteration: %s Assembly killed after %s seconds"
                % (sample, target, self.params['iteration'],
                   time.time() - start))
            outf = open(os.path.join(self.params['target_dir'], "finished"),
                        'w')
            outf.write("assembly_killed")
            outf.close()
        else:
            #Run finished without error
            logger.info(
                "Sample: %s target: %s iteration: %s Assembly finished in %s seconds"
                % (sample, target, self.params['iteration'],
                   time.time() - start))
            outf = open(os.path.join(self.params['target_dir'], "finished"),
                        'w')
            outf.write("assembly_complete")
            outf.close()
Example #22
File: mapper.py  Project: ibest/ARC
    def splitreads(self):
        """ Split reads and then kick off assemblies once the reads are split for a target, use safe_targets for names"""
        self.params['iteration'] += 1

        # Write out statistics for any/all targets which failed to recruit reads:
        for target in self.params['summary_stats'].keys():
            # print "Target", target
            if target not in self.params['mapping_dict']:
                writeTargetStats(finished_dir=self.params['finished_dir'],
                                 sample=self.params['sample'],
                                 target=target,
                                 targetLength=self.params['summary_stats'][target]['targetLength'],
                                 status='NoReads',
                                 iteration=self.params['iteration'],
                                 readcount=0,
                                 num_contigs=0, contig_length=0)
                del self.params['summary_stats'][target]

        checker_params = {}
        for k in self.params:
            checker_params[k] = self.params[k]
        del checker_params['mapping_dict']
        checker_params['targets'] = {}
        iteration = self.params['iteration']
        # open previously created indexes:
        if 'PE1' in self.params and 'PE2' in self.params:
            idx_PE1 = SeqIO.index_db(os.path.join(self.params['working_dir'], "PE1.idx"), key_function=keyfunction(self.params['sra']))
            idx_PE2 = SeqIO.index_db(os.path.join(self.params['working_dir'], "PE2.idx"), key_function=keyfunction(self.params['sra']))
        if 'SE' in self.params:
            idx_SE = SeqIO.index_db(os.path.join(self.params['working_dir'], "SE.idx"), key_function=keyfunction(self.params['sra']))
        if 'readcounts' not in checker_params:
            checker_params['readcounts'] = {}
        # if 'contigcounts' not in checker_params:
        #    checker_params['contigcounts'] = {}
        statsf = open(os.path.join(self.params['finished_dir'], 'mapping_stats.tsv'), 'a')
        for target in self.params['mapping_dict']:
            startT = time.time()
            # logger.info("Running splitreads for Sample: %s target: %s" % (self.params['sample'], target))
            target_dir = os.path.join(self.params['working_dir'], self.params['safe_targets'][target])
            if target not in checker_params['readcounts']:
                checker_params['readcounts'][target] = Counter()
            # if target not in checker_params['contigcounts']:
            #    checker_params['contigcounts'] = Counter()
            if os.path.exists(target_dir):
                os.system("rm -rf %s" % target_dir)
            os.mkdir(target_dir)

            reads = self.params['mapping_dict'][target]
            # track how many total reads were added for this cycle
            checker_params['readcounts'][target][iteration] = len(reads)
            statsf.write('\t'.join([self.params['sample'], target, str(iteration), str(len(reads))]) + '\n')

            SEs = PEs = 0
            if 'PE1' in self.params and 'PE2' in self.params:
                outf_PE1 = open(os.path.join(target_dir, "PE1." + self.params['format']), 'w')
                outf_PE2 = open(os.path.join(target_dir, "PE2." + self.params['format']), 'w')
            if 'SE' in self.params:
                outf_SE = open(os.path.join(target_dir, "SE." + self.params['format']), 'w')

            for readID in reads:
                if self.params['subsample'] < 1 and randint(0, 100) > self.params['subsample'] * 100:
                    continue
                if 'PE1' in self.params and readID in idx_PE1:
                    # read1 = idx_PE1[readID]
                    # read2 = idx_PE2[readID]
                    read1 = idx_PE1.get(readID, None)
                    read2 = idx_PE2.get(readID, None)
                    if read2 is None:
                        raise exceptions.FatalError("ERROR: ReadID %s was found in PE1 file but not PE2" % readID)
                    new_readID = readID.replace(":", "_") + ":0:0:0:0#0/"
                    read1.id = read1.name = new_readID + "1"
                    read2.id = read2.name = new_readID + "2"
                    SeqIO.write(read1, outf_PE1, self.params['format'])
                    SeqIO.write(read2, outf_PE2, self.params['format'])
                    PEs += 1
                elif 'SE' in self.params and readID in idx_SE:
                    read1 = idx_SE[readID]
                    read1.id = read1.name = readID.replace(":", "_") + ":0:0:0:0#0/"
                    SeqIO.write(read1, outf_SE, self.params['format'])
                    SEs += 1
            if 'PE1' in self.params and 'PE2' in self.params:
                outf_PE1.close()
                outf_PE2.close()
            if 'SE' in self.params:
                outf_SE.close()

            #Build assembly job:
            assembly_params = {}
            assembly_params['target'] = target
            assembly_params['target_dir'] = target_dir
            assembly_params['iteration'] = iteration
            assembly_params['last_assembly'] = False
            assembler_keys = ['assembler', 'sample', 'verbose', 'format', 'assemblytimeout', 'map_against_reads', 'urt', 'numcycles', 'cdna', 'rip', 'only-assembler']
            for k in assembler_keys:
                assembly_params[k] = self.params[k]
            cur_reads = checker_params['readcounts'][target][iteration]  # note that this is a counter, so no key errors can occur
            previous_reads = checker_params['readcounts'][target][iteration - 1]

            #Turn off URT in situations where this will be the last iteration due to readcounts:

            if (cur_reads <= previous_reads and iteration > 2) or iteration >= self.params['numcycles']:
                logger.info("Sample: %s target: %s iteration: %s Setting last_assembly to True" % (self.params['sample'], target, self.params['iteration']))
                assembly_params['last_assembly'] = True

            #properly handle the case where no reads ended up mapping for the PE or SE inputs:
            if PEs > 0:
                assembly_params['assembly_PE1'] = os.path.join(target_dir, "PE1." + self.params['format'])
                assembly_params['assembly_PE2'] = os.path.join(target_dir, "PE2." + self.params['format'])
            if SEs > 0:
                assembly_params['assembly_SE'] = os.path.join(target_dir, "SE." + self.params['format'])

            #All reads have been written at this point, add an assembly to the queue:
            logger.info("Sample: %s target: %s iteration: %s Split %s reads in %s seconds" % (self.params['sample'], target, self.params['iteration'], len(reads), time.time() - startT))

            #Only add an assembly job and AssemblyChecker target if is there are >0 reads:
            if PEs + SEs > 0:
                checker_params['targets'][target_dir] = False
                self.submit(Assembler.to_job(assembly_params))

        statsf.close()
        logger.info("------------------------------------")
        logger.info("| Sample: %s Iteration %s of numcycles %s" % (checker_params['sample'], checker_params['iteration'], checker_params['numcycles']))
        logger.info("------------------------------------")
        if 'PE1' in self.params and 'PE2' in self.params:
            idx_PE1.close()
            idx_PE2.close()
            del idx_PE1
            del idx_PE2
        if 'SE' in self.params:
            idx_SE.close()
            del idx_SE

        #Kick off a job which checks if all assemblies are done, and if not adds a copy of itself to the job queue
        if len(checker_params['targets']) > 0:
            # checker = AssemblyChecker(checker_params)
            self.submit(AssemblyChecker.to_job(checker_params))
        else:
            logger.info("Sample: %s No reads mapped, no more work to do." % checker_params['sample'])
Example #23
File: mapper.py  Project: kdm9/ARC
    def run_blat(self):
        #Check for necessary params:
        if not ('sample' in self.params and 'reference' in self.params
                and 'working_dir' in self.params and
                (('PE1' in self.params and 'PE2' in self.params)
                 or 'SE' in self.params)):
            raise exceptions.FatalError('Missing self.params in run_blat.')
        #Check for necessary files:
        if os.path.exists(self.params['reference']) is False:
            raise exceptions.FatalError("Missing reference file for mapping")
        if 'PE1' in self.params and 'PE2' in self.params:
            if not (os.path.exists(self.params['PE1'])
                    and os.path.exists(self.params['PE2'])):
                raise exceptions.FatalError(
                    "One or both PE files can not be found for mapping.")
        if 'SE' in self.params:
            if not os.path.exists(self.params['SE']):
                raise exceptions.FatalError("SE file cannot be found.")

        #Blat doesn't need an index
        working_dir = self.params['working_dir']

        #Check whether to log to temporary file, or default to os.devnull
        if 'verbose' in self.params:
            out = open(os.path.join(working_dir, "mapping_log.txt"), 'w')
        else:
            out = open(os.devnull, 'w')

        #Build a temporary txt file with all of the reads:
        allreads_outf = open(os.path.join(working_dir, 'reads.txt'), 'w')
        if 'PE1' in self.params and 'PE2' in self.params:
            allreads_outf.write(self.params['PE1'] + '\n')
            allreads_outf.write(self.params['PE2'] + '\n')
        if 'SE' in self.params:
            allreads_outf.write(self.params['SE'] + '\n')
        allreads_outf.close()

        #Do blat mapping
        args = [
            'blat', self.params['reference'],
            os.path.join(working_dir, 'reads.txt')
        ]
        if self.params['format'] == 'fastq':
            args.append('-fastq')
        if self.params['fastmap']:
            args.append('-fastMap')
        #Some new experimental params to increase specificity after the first iteration:
        if self.params['maskrepeats']:
            args.append("-mask=lower")
        if self.params['iteration'] > 0 or not self.params['sloppymapping']:
            args.append("-minIdentity=98")
            args.append("-minScore=40")
        args.append(os.path.join(working_dir, 'mapping.psl'))

        logger.info("Sample: %s Calling blat mapper" % self.params['sample'])
        logger.debug(" ".join(args))
        try:
            ret = subprocess.call(args, stdout=out, stderr=out)
        except Exception as exc:
            txt = (
                "Sample %s: Unhandled error running blat mapping, check log file."
                % self.params['sample']) + '\n\t' + str(exc)
            raise exceptions.FatalError(txt)
        finally:
            out.close()
        if ret != 0:
            raise exceptions.FatalError(
                'Sample: %s Error running blat mapping, check log file. \n\t %s'
                % (self.params['sample'], " ".join(args)))

        #Extract the PSL to a dict
        self.params['mapping_dict'] = self.PSL_to_dict(
            os.path.join(working_dir, 'mapping.psl'))

        #Cleanup
        os.remove(os.path.join(working_dir, 'mapping.psl'))
Example #24
    def write_target(self,
                     target,
                     target_folder,
                     outf,
                     finished=False,
                     map_against_reads=False,
                     killed=False,
                     status=None):
        # either map_against_reads was passed in, or
        # no contigs were assembled and target isn't finished, or
        # assembler crashed and no contig file was created
        # --> write reads as contigs
        num_contigs = 0  # store how many contigs were written out
        contig_length = 0
        if finished and status is None:
            status = 'Finished'
        if killed:
            status = 'Killed'
        if map_against_reads is False and killed is False:
            if self.params['assembler'] == 'newbler':
                contigf = os.path.join(self.params['working_dir'],
                                       target_folder, "assembly", "assembly",
                                       "454AllContigs.fna")
            elif self.params['assembler'] == 'spades':
                contigf = os.path.join(self.params['working_dir'],
                                       target_folder, "assembly",
                                       "contigs.fasta")
            else:
                #Guard against contigf being undefined below:
                raise exceptions.FatalError("Unknown assembler: %s" %
                                            self.params['assembler'])
            #add support for a special output if this is the final assembly and newbler -cdna was used:
            if finished and self.params['cdna'] and self.params[
                    'assembler'] == 'newbler':
                self.writeCDNAresults(target, target_folder, outf, contigf)
            elif os.path.exists(contigf):
                i = 0
                contig_inf = open(contigf, 'r')
                for contig in SeqIO.parse(contig_inf, 'fasta'):
                    i += 1
                    if finished:
                        contig.name = contig.id = self.params[
                            'sample'] + "_:_" + target + "_:_" + "Contig%03d" % i
                    else:
                        contig.name = contig.id = self.params[
                            'sample'] + "_:_" + target + "_:_" + "Unfinished%03d" % i
                    contig = contig.upper()
                    #Only mask repeats on intermediate iterations.
                    if self.params['maskrepeats'] and not finished:
                        #contig.seq = Seq(str(mask_seq(contig.seq.tostring(), self.params['mapper'])))
                        contig.seq = Seq(
                            str(
                                mask_seq(str(contig.seq),
                                         self.params['mapper'])))
                    #Bowtie2 crashes if a contig is all 'n' so only write it out if it isn't
                    if len(contig.seq) != contig.seq.count('n'):
                        SeqIO.write(contig, outf, "fasta")
                        contig_length += len(contig.seq)
                contig_inf.close()
                logger.info(
                    "Sample: %s target: %s iteration: %s Finished writing %s contigs "
                    % (self.params['sample'], target, self.params['iteration'],
                       i))
                num_contigs += i
                #if i == 0 and finished is False and self.params['iteration'] < 2:
                #    map_against_reads = True

        if map_against_reads:
            i = 0
            logger.info("Sample %s target %s: Writing reads as contigs." %
                        (self.params['sample'], target))
            if 'PE1' in self.params and 'PE2' in self.params:
                inf_PE1n = os.path.join(target_folder,
                                        "PE1." + self.params['format'])
                inf_PE2n = os.path.join(target_folder,
                                        "PE2." + self.params['format'])
                if os.path.exists(inf_PE1n) and os.path.exists(inf_PE2n):
                    inf_PE1 = open(inf_PE1n, 'r')
                    inf_PE2 = open(inf_PE2n, 'r')
                    for r in SeqIO.parse(inf_PE1, self.params['format']):
                        i += 1
                        r.name = r.id = self.params[
                            'sample'] + "_:_" + target + "_:_" + "Read%04d" % i
                        SeqIO.write(r, outf, "fasta")
                    for r in SeqIO.parse(inf_PE2, self.params['format']):
                        i += 1
                        r.name = r.id = self.params[
                            'sample'] + "_:_" + target + "_:_" + "Read%04d" % i
                        SeqIO.write(r, outf, "fasta")
                    inf_PE1.close()
                    inf_PE2.close()

            if 'SE' in self.params:
                inf_SEn = os.path.join(target_folder,
                                       "SE." + self.params['format'])
                if os.path.exists(inf_SEn):
                    inf_SE = open(inf_SEn, 'r')
                    for r in SeqIO.parse(inf_SE, self.params['format']):
                        i += 1
                        r.name = r.id = self.params[
                            'sample'] + "_:_" + target + "_:_" + "Read%04d" % i
                        SeqIO.write(r, outf, "fasta")
                    inf_SE.close()
            num_contigs += i

        if finished or killed:
            #Write reads:
            if 'PE1' in self.params and 'PE2' in self.params:
                inf_PE1n = os.path.join(target_folder,
                                        "PE1." + self.params['format'])
                inf_PE2n = os.path.join(target_folder,
                                        "PE2." + self.params['format'])
                if os.path.exists(inf_PE1n) and os.path.exists(inf_PE2n):
                    inf_PE1 = open(inf_PE1n, 'r')
                    inf_PE2 = open(inf_PE2n, 'r')

                    outf_PE1 = open(
                        os.path.join(self.params['finished_dir'],
                                     "PE1." + self.params['format']), 'a')
                    outf_PE2 = open(
                        os.path.join(self.params['finished_dir'],
                                     "PE2." + self.params['format']), 'a')

                    for r in SeqIO.parse(inf_PE1, self.params['format']):
                        r.description = self.params['sample'] + "_:_" + target
                        SeqIO.write(r, outf_PE1, self.params['format'])
                    for r in SeqIO.parse(inf_PE2, self.params['format']):
                        r.description = self.params['sample'] + "_:_" + target
                        SeqIO.write(r, outf_PE2, self.params['format'])
                    outf_PE1.close()
                    outf_PE2.close()

            if 'SE' in self.params:
                inf_SEn = os.path.join(target_folder,
                                       "SE." + self.params['format'])
                if os.path.exists(inf_SEn):
                    inf_SE = open(inf_SEn, 'r')
                    outf_SE = open(
                        os.path.join(self.params['finished_dir'],
                                     "SE." + self.params['format']), 'a')
                    for r in SeqIO.parse(inf_SE, self.params['format']):
                        r.description = self.params['sample'] + "_:_" + target
                        SeqIO.write(r, outf_SE, self.params['format'])
                    outf_SE.close()

        # Finally a special case for situations where assembly of a target is killed, but contigs exist from
        # a previous assembly. Note that we only do this when not running in cDNA mode.
        if killed and self.params['iteration'] > 1 and not self.params['cdna']:
            #No contigs will be available, however contigs from the previous iteration will be present in
            # I00N_contigs.fasta, grab these and write them out instead
            logger.info(
                "Sample: %s target: %s iteration: %s Writing contigs from previous iteration."
                % (self.params['sample'], target, self.params['iteration']))
            contigf = os.path.join(
                self.params['working_dir'],
                'I%03d' % (self.params['iteration'] - 1) + '_contigs.fasta')
            if os.path.exists(contigf):
                for contig in SeqIO.parse(contigf, 'fasta'):
                    if contig.id.split("_:_")[1] == target:
                        contig.name = contig.id = contig.id.replace(
                            "Unfinished", "Contig")
                        SeqIO.write(contig, outf, "fasta")
                        num_contigs += 1
                        contig_length += len(contig.seq)
        #Cleanup temporary assembly, and reads:
        if not self.params['keepassemblies']:
            os.system("rm -rf %s" % target_folder)

        #write out target stats:
        if finished or killed:
            writeTargetStats(finished_dir=self.params['finished_dir'],
                             sample=self.params['sample'],
                             target=target,
                             targetLength=self.params['summary_stats'][target]
                             ['targetLength'],
                             status=status,
                             iteration=self.params['iteration'],
                             readcount=self.params['readcounts'][target][
                                 self.params['iteration']],
                             num_contigs=num_contigs,
                             contig_length=contig_length)
            del self.params['summary_stats'][target]

        #writeTargetStats(target, status, num_contigs, contig_length, self.params)

#summary_stats[target] = {'RefLen': len(t), 'Status': 'NA', 'Iteration': None,
#                                         'Reads': None, 'Contigs': None, 'ContigLength': None}
#[self.params['Sample'], target, 'TargetLength', 'Status', 'Iteration', 'Reads', 'Contigs', 'ContigLength']) + '\n')
        if finished or killed:
            return 0
        else:
            return num_contigs
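
The sample_:_target_:_ContigNNN naming that write_target produces is what downstream steps split on. A minimal sketch of building and parsing such an ID (all values hypothetical):

sample, target, i = 'sampleA', 'geneX', 1  # hypothetical values
contig_id = sample + "_:_" + target + "_:_" + "Contig%03d" % i
assert contig_id == 'sampleA_:_geneX_:_Contig001'

# Downstream code recovers the target with a simple split, as the
# killed-target recovery loop above does with contig.id.split("_:_")[1]:
recovered_sample, recovered_target, label = contig_id.split("_:_")
print(recovered_target + " " + label)  # geneX Contig001
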
Example #25
File: mapper.py Project: kdm9/ARC
    def run_bowtie2(self):
        """
        Builds idx and runs bowtie2 -I 0 -X 1500 --local
        Expects params:
            sample, target, reference, working_dir, PE1 and PE2 and/or SE
        """
        #Check for necessary params:
        if not ('sample' in self.params and 'reference' in self.params
                and 'working_dir' in self.params and
                (('PE1' in self.params and 'PE2' in self.params)
                 or 'SE' in self.params)):
            raise exceptions.FatalError('Missing params in run_bowtie2.')
        #Check for necessary files:
        if os.path.exists(self.params['reference']) is False:
            raise exceptions.FatalError("Missing reference file for mapping")
        if 'PE1' in self.params and 'PE2' in self.params:
            if not (os.path.exists(self.params['PE1'])
                    and os.path.exists(self.params['PE2'])):
                raise exceptions.FatalError(
                    "One or both PE files can not be found for mapping.")
        if 'SE' in self.params:
            if not os.path.exists(self.params['SE']):
                raise exceptions.FatalError("SE file cannot be found.")

        #Make idx directory
        try:
            working_dir = self.params['working_dir']
            idx_dir = os.path.realpath(os.path.join(working_dir, 'idx'))
            os.mkdir(idx_dir)
        except Exception as exc:
            txt = "Sample: %s Error creating working directory." % (
                self.params['sample']) + '\n\t' + str(exc)
            raise exceptions.FatalError(txt)

        #Check whether to log to temporary file, or default to os.devnull
        if 'verbose' in self.params:
            out = open(os.path.join(working_dir, "mapping_log.txt"), 'w')
        else:
            out = open(os.devnull, 'w')

        #Set up a path to the index
        base = os.path.join(idx_dir, 'idx')

        #Build index
        #The idea is to map against the finished contigs and in-progress
        # contigs, thereby ensuring that the -k parameter (or best map)
        # is respected properly, and avoiding the situation where reads which
        # were mapped to a now-finished target might later be mapped to an
        # in-progress target.
        fin_outf = os.path.join(self.params['finished_dir'], 'contigs.fasta')
        args = ['bowtie2-build', '-f']
        if os.path.exists(fin_outf) and os.path.getsize(fin_outf) > 0:
            args.append(','.join((fin_outf, self.params['reference'])))
        else:
            args.append(self.params['reference'])
        args.append(base)
        logger.info("Sample: %s Calling bowtie2-build." %
                    self.params['sample'])
        logger.info(" ".join(args))
        try:
            ret = subprocess.call(args, stdout=out, stderr=out)
        except Exception as exc:
            txt = ("Sample %s: Unhandeled error running bowtie2-build" %
                   self.params['sample']) + '\n\t' + str(exc)
            # make sure that out is closed before throwing exception
            out.close()
            raise exceptions.FatalError(txt)

        if ret != 0:
            out.close()
            raise exceptions.FatalError(
                "Sample: %s Error creating bowtie2 index, check log file." %
                self.params['sample'])

        #Do bowtie2 mapping:
        n_bowtieprocs = int(
            round(
                max(
                    float(self.params['nprocs']) / len(self.params['Samples']),
                    1)))
        args = ['bowtie2', '-I', '0', '-X', '1500', '--no-unal']

        #Tune the sensitivity so that on the first iteration the mapper is
        # very sensitive. On later iterations the mapper is very specific.
        if self.params['iteration'] == 0 and self.params['sloppymapping']:
            args.append("--very-sensitive-local")
        else:
            args += [
                "--very-fast-local", "--mp", "12", "--rdg", "12,6", "--rfg",
                "12,6"
            ]

        args += ['-p', str(n_bowtieprocs), '-x', base]
        if self.params['bowtie2_k'] > 1:
            args += ['-k', str(self.params['bowtie2_k'])]
        if self.params['format'] == 'fasta':
            args += ['-f']
        if 'PE1' in self.params and 'PE2' in self.params:
            args += ['-1', self.params['PE1'], '-2', self.params['PE2']]
        if 'SE' in self.params:
            args += ['-U', self.params['SE']]
        args += ['-S', os.path.join(working_dir, 'mapping.sam')]
        logger.info("Sample: %s Calling bowtie2 mapper" %
                    self.params['sample'])
        logger.info(" ".join(args))

        try:
            ret = subprocess.call(args, stdout=out, stderr=out)
        except Exception as exc:
            txt = ("Sample %s: Unhandled error running bowtie2 mapping" %
                   self.params['sample']) + '\n\t' + str(exc)
            raise exceptions.FatalError(txt)
        finally:
            out.close()
        if ret != 0:
            raise exceptions.FatalError(
                "Sample %s: Bowtie2 mapping returned an error, check log file."
                % self.params['sample'])

        #Extract the SAM to a dict
        self.params['mapping_dict'] = self.SAM_to_dict(
            os.path.join(working_dir, 'mapping.sam'))
        #clean up intermediary files:
        os.remove(os.path.join(working_dir, 'mapping.sam'))
        os.system("rm -rf %s" % idx_dir)
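
The -p value above splits the global CPU budget evenly across samples, never dropping below one thread. A minimal sketch of that calculation with hypothetical numbers:

nprocs = 8     # total CPUs configured for the run (hypothetical)
n_samples = 3  # len(self.params['Samples']) in the code above

# Round to the nearest whole thread count, with a floor of 1:
n_bowtieprocs = int(round(max(float(nprocs) / n_samples, 1)))
print(n_bowtieprocs)  # 3
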
Example #26
File: spawn.py Project: ibest/ARC
    def run(self):
        logger.info("Starting...")
        logger.debug("Setting up workers.")

        for i in range(self.nprocs):
            worker = ProcessRunner(
                i,
                self.q,
                self.status,
                self.stats,
                self.pid)
            self.workers.append(worker)
            worker.daemon = False
            worker.start()

        while True:
            try:
                self.q.join()

                # This shouldn't be needed but we will check just in case
                if self.all_workers_waiting():
                    logger.debug("Workers are all waiting and the queue is empty.  Exiting")
                    break
                else:
                    logger.debug("Workers are not in a waiting state.  Waiting for more.")
                    time.sleep(5)

            except exceptions.FatalError:
                logger.error("A fatal error was encountered.")
                self.killall()
                raise
            except (KeyboardInterrupt, SystemExit):
                logger.error("Terminating processes")
                self.killall()
                raise
            except Exception as e:
                ex_type, ex, tb = sys.exc_info()
                logger.error("\n".join(traceback.format_exception(ex_type, ex, tb)))
                logger.error("An unhandled exception occurred")
                self.killall()
                raise
            finally:
                # Kill 'em all!
                self.killall()

        logger.info("-----")
        logger.info("%d processes returned ok." % (self.stats[0]))
        logger.info("%d processes had to be rerun." % (self.stats[1]))
        logger.info("-----")
        logger.info("%d Mapper jobs run." % (self.stats[2]))
        logger.info("%d Assembly jobs run." % (self.stats[3]))
        logger.info("%d Checker jobs run." % (self.stats[4]))
        logger.info("%d Finisher jobs run." % (self.stats[5]))
        logger.info("-----")
Example #27
    def writeCDNAresults(self, target, target_folder, outf, contigf):
        """
        This is ONLY called when a cDNA target is finished.

        When doing a cDNA type run, it is very useful to have both of the following:
        1) All contigs that belong to a gene (isogroup)
            - It would be particularly good to re-orient these if they are in RC.
        2) Total number of reads assembled in each gene (isogroup)

        Additionally it would be excellent to some day also get the following:
        3) Transcript (isotig) structure
        4) Estimate of isotig specific reads.

        """
        if self.params['assembler'] == 'newbler':
            contigf = os.path.join(self.params['working_dir'], target_folder, "assembly", "assembly", "454AllContigs.fna")
            isotigsf = os.path.join(self.params['working_dir'], target_folder, "assembly", "assembly", "454IsotigsLayout.txt")
            readstatusf = os.path.join(self.params['working_dir'], target_folder, "assembly", "assembly", "454ReadStatus.txt")
        else:
            logger.info("WARNING writeCDNAresults called when assembler was not Newbler")
            return None
        if not (os.path.exists(contigf) and os.path.exists(isotigsf) and os.path.exists(readstatusf)):
            logger.info("CDNA WARNING MISSING FILE!! %s %s" % (target, self.params['sample']))
            logger.info("%s exists: %s" % (contigf, os.path.exists(contigf)))
            logger.info("%s exists: %s" % (isotigsf, os.path.exists(isotigsf)))
            logger.info("%s exists: %s" % (readstatusf, os.path.exists(readstatusf)))
            return None
        #Storage data structures:
        isogroups = {}  # A dict of isogroups which each contain an in-order list of contigs
        readcounts = Counter()  # A dict of all contigs, these contain read counts (from ReadStatus)
        contig_orientation = {}
        contig_to_isogroup = {}
        contig_idx = SeqIO.index(contigf, "fasta")
        # Parse isotigsf:
        igroup = ""
        #print self.params['sample'], target, "Parsing isotigsf: %s" % isotigsf
        for l in open(isotigsf, 'r'):
            #Handle lines with only a '\n'
            if l == '\n':
                pass
            #Handle lines for isogroup:
            elif l[0:9] == '>isogroup':
                igroup = l.strip().split()[0].strip(">")
            #Handle lines containing all contigs:
            elif l.strip().split()[0] == 'Contig':
                l2 = l.strip().split()
                contigs = ["contig" + x for x in l2[2:-1]]
                isogroups[igroup] = contigs
                for contig in contigs:
                    if contig not in contig_orientation:
                        contig_orientation[contig] = '+'
                        contig_to_isogroup[contig] = igroup
                    else:
                        raise exceptions.FatalError('Contig %s in %s more than once' % (contig, contigf))
            #Handle lines containing contig orientation info:
            elif l[0:6] == 'isotig':
                l2 = l[l.find(" ") + 1: l.rfind(" ") - 1]
                l3 = [l2[i:i+6] for i in range(0, len(l2), 6)]
                for i in range(len(l3)):
                    if l3[i][0] == '<':
                        # contig is in reverse orientation
                        contig = isogroups[igroup][i]
                        contig_orientation[contig] = '-'
        #print self.params['sample'], target, "Parsed isotigsf, contigs:", len(isogroups), "contig_to_isogroup", len(contig_to_isogroup), "contig_orientation", len(contig_orientation)
        #Now parse readstatus:
        inf = open(readstatusf, 'r')
        inf.readline()  # discard first line
        for l in inf:
            l2 = l.strip().split('\t')
            #Determine if this read was assembled
            if len(l2) == 8:
                contig = l2[2]
                # Note that there are some built in limits to the number of contigs that can be in an isogroup:
                # http://contig.wordpress.com/2010/08/31/running-newbler-de-novo-transcriptome-assembly-i/
                # These won't appear in the IsotigsLayout.txt, but ARE in the ReadStatus.txt file.
                if contig in contig_to_isogroup:
                    readcounts[contig_to_isogroup[contig]] += 1
                else:
                    readcounts['ExceedsThreshold'] += 1
        #print self.params['sample'], target, "Parse read status"

        #Finally, output all of this information appropriately:
        countsf = open(os.path.join(self.params['finished_dir'], "isogroup_read_counts.tsv"), 'a')
        sample = self.params['sample']
        #First write out readcounts: sample \t target \t isogroup \t readcount
        for isogroup in readcounts:
            countsf.write('\t'.join([sample, target, isogroup, str(readcounts[isogroup])]) + '\n')
        countsf.close()
        #print self.params['sample'], target, "Wrote readcounts"

        #Next write the contigs in proper order and orientation:
        ncontigs = 0
        nisogroups = 0
        for isogroup in isogroups:
            nisogroups += 1
            for contig in isogroups[isogroup]:
                ncontigs += 1
                seqrec = contig_idx[contig]
                #print self.params['sample'], target, seqrec
                if contig_orientation[contig] == '-':
                    seqrec.seq = seqrec.seq.reverse_complement()
                #print self.params['sample'], target, seqrec
                seqrec.name = seqrec.id = sample + "_:_" + target + "_:_" + isogroup + "|" + contig
                #print self.params['sample'], target, seqrec
                SeqIO.write(seqrec, outf, "fasta")
        ## TODO: add support for the ExceedsThreshold contigs
        logger.info("Sample: %s target: %s iteration: %s Finished writing %s contigs, %s isogroups " % (self.params['sample'],
                    target, self.params['iteration'], ncontigs, nisogroups))
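
The orientation pass above slices each isotig row into fixed six-character columns and flags a contig as reverse-complemented when its column starts with '<'. A minimal sketch of that slicing on a made-up pair of columns (the real 454IsotigsLayout.txt rows carry more columns, but the logic is identical):

row = "<00001>00002"  # two made-up six-character layout columns
cols = [row[i:i + 6] for i in range(0, len(row), 6)]
print(cols)                         # ['<00001', '>00002']
print([c[0] == '<' for c in cols])  # [True, False]: first contig gets reverse-complemented
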
Example #28
    def start(self):
        sample = self.params['sample']
        logger.info("Sample: %s Starting finisher" % self.params['sample'])
        finished_dir = self.params['finished_dir']
        sample_finished = False
        targets_written = 0
        iteration = self.params['iteration']

        #Set up output for both finished and additional mapping outputs
        fin_outf = open(os.path.join(finished_dir, 'contigs.fasta'), 'a')
        remap_outf = open(os.path.join(self.params['working_dir'], 'I%03d' % self.params['iteration'] + '_contigs.fasta'), 'w')

        #check whether the sample is globally finished
        if self.params['iteration'] >= self.params['numcycles']:
            sample_finished = True

        #loop over the current set of targets_folders
        for target_folder in self.params['targets']:
            #Extract target specific details:
            target_map_against_reads = False
            safe_target = target_folder.split("/")[-1]  # get last element of path name
            target = self.params['safe_targets'][safe_target]
            cur_reads = self.params['readcounts'][target][iteration]  # note that this is a counter, so no key errors can occur
            previous_reads = self.params['readcounts'][target][iteration - 1]

            #Get finished assembly status:
            with open(os.path.join(target_folder, 'finished'), 'r') as finishedf:
                l = finishedf.readline().strip().split()[0]

            logger.info("Sample: %s target: %s finishing target.." % (self.params['sample'], target))
            logger.info("Sample: %s target: %s iteration: %s Assembly reports status: %s." % (sample, target, self.params['iteration'], l))

            if l in ('assembly_failed', 'map_against_reads'):
                target_map_against_reads = True

            if l == 'assembly_killed':
                #only write out the reads, assembly won't have contigs
                self.write_target(target, target_folder, outf=fin_outf, finished=False, map_against_reads=False, killed=True)
            elif sample_finished:  # everything goes into the final file/folders.
                self.write_target(target, target_folder, outf=fin_outf, finished=True)
            elif target_map_against_reads and cur_reads > previous_reads and iteration < 3:
                #Only map against reads if we have improvement in mapping and we haven't been mapping for multiple iterations
                targets_written += self.write_target(target, target_folder, outf=remap_outf, finished=False, map_against_reads=True)
            else:
                #Check read counts and retire target, or send it back for re-mapping depending on mapped reads
                if iteration > 1 and cur_reads != 0 and previous_reads != 0:
                    if cur_reads / previous_reads > self.params['max_incorporation']:
                        logger.info("Sample %s target %s hit a repetitive region, no more mapping will be done" % (self.params['sample'], target))
                        self.write_target(target, target_folder, outf=fin_outf, finished=True, status='Repeat')
                    elif cur_reads <= previous_reads and iteration > 2:
                        #Give the mapper a couple extra iterations in case the first mapping got a lot of reads which didn't assemble
                        logger.info("Sample %s target %s did not incorporate any more reads, no more mapping will be done" % (self.params['sample'], target))
                        self.write_target(target, target_folder, outf=fin_outf, finished=True)
                    else:
                        #nothing fancy is going on, just write the contigs out for remapping
                        targets_written += self.write_target(target, target_folder, outf=remap_outf, finished=False)
                else:
                    #nothing fancy is going on, just write the contigs out for remapping
                    targets_written += self.write_target(target, target_folder, outf=remap_outf, finished=False)

        fin_outf.flush()
        remap_outf.flush()
        fin_outf.close()
        remap_outf.close()

        if targets_written > 0:
            # Build a new mapper and put it on the queue
            from ARC.runners import Mapper
            mapper_params = {}
            for k in self.params:
                mapper_params[k] = self.params[k]
            del mapper_params['targets']
            mapper_params['reference'] = os.path.join(self.params['working_dir'], 'I%03d' % self.params['iteration'] + '_contigs.fasta')
            self.submit(Mapper.to_job(mapper_params))
            logger.info("Sample: %s Added new mapper to queue: iteration %s" % (self.params['sample'], self.params['iteration']))

        else:
            logger.info("Sample: %s Mapper not added to queue. Work finished." % self.params['sample'])
Example #29
File: mapper.py Project: ibest/ARC
    def run_blat(self):
        #Check for necessary params:
        if not ('sample' in self.params and 'reference' in self.params
                and 'working_dir' in self.params
                and (('PE1' in self.params and 'PE2' in self.params)
                     or 'SE' in self.params)):
            raise exceptions.FatalError('Missing self.params in run_blat.')
        #Check for necessary files:
        if os.path.exists(self.params['reference']) is False:
            raise exceptions.FatalError("Missing reference file for mapping")
        if 'PE1' in self.params and 'PE2' in self.params:
            if not (os.path.exists(self.params['PE1']) and os.path.exists(self.params['PE2'])):
                raise exceptions.FatalError(
                    "One or both PE files can not be found for mapping.")
        if 'SE' in self.params:
            if not os.path.exists(self.params['SE']):
                raise exceptions.FatalError("SE file cannot be found.")

        #Blat doesn't need an index
        working_dir = self.params['working_dir']

        #Check whether to log to temporary file, or default to os.devnull
        if 'verbose' in self.params:
            out = open(os.path.join(working_dir, "mapping_log.txt"), 'w')
        else:
            out = open(os.devnull, 'w')

        #Build a temporary txt file with all of the reads:
        allreads_outf = open(os.path.join(working_dir, 'reads.txt'), 'w')
        if 'PE1' in self.params and 'PE2' in self.params:
            allreads_outf.write(self.params['PE1'] + '\n')
            allreads_outf.write(self.params['PE2'] + '\n')
        if 'SE' in self.params:
            allreads_outf.write(self.params['SE'] + '\n')
        allreads_outf.close()

        #Do blat mapping
        args = ['blat', self.params['reference'], os.path.join(working_dir, 'reads.txt')]
        if self.params['format'] == 'fastq':
            args.append('-fastq')
        if self.params['fastmap']:
            args.append('-fastMap')
        #Some new experimental params to increase specificity after the first iteration:
        if self.params['maskrepeats']:
            args.append("-mask=lower")
        if self.params['iteration'] > 0 or not self.params['sloppymapping']:
            args.append("-minIdentity=98")
            args.append("-minScore=40")
        args.append(os.path.join(working_dir, 'mapping.psl'))

        logger.info("Sample: %s Calling blat mapper" % self.params['sample'])
        logger.debug(" ".join(args))
        try:
            ret = subprocess.call(args, stdout=out, stderr=out)
        except Exception as exc:
            txt = ("Sample %s: Unhandeled error running blat mapping, check log file." % self.params['sample']) + '\n\t' + str(exc)
            raise exceptions.FatalError(txt)
        finally:
            out.close()
        if ret != 0:
            raise exceptions.FatalError('Sample: %s Error running blat mapping, check log file. \n\t %s' % (self.params['sample'], " ".join(args)))

        #Extract the PSL to a dict
        self.params['mapping_dict'] = self.PSL_to_dict(os.path.join(working_dir, 'mapping.psl'))

        #Cleanup (the log handle was already closed in the finally block above)
        os.remove(os.path.join(working_dir, 'mapping.psl'))
Example #30
    def log(self, msg):
        if logger.level() == logging.DEBUG:
            name = self.name
        else:
            name = self.__class__.__name__
        logger.info("%-12s| %s" % (name, msg))
Example #31
File: mapper.py Project: kdm9/ARC
    def splitreads(self):
        """ Split reads and then kick off assemblies once the reads are split for a target, use safe_targets for names"""
        self.params['iteration'] += 1

        # Write out statistics for any/all targets which failed to recruit reads:
        # iterate over a copy of the keys since entries are deleted inside the loop:
        for target in list(self.params['summary_stats'].keys()):
            # print "Target", target
            if target not in self.params['mapping_dict']:
                writeTargetStats(finished_dir=self.params['finished_dir'],
                                 sample=self.params['sample'],
                                 target=target,
                                 targetLength=self.params['summary_stats']
                                 [target]['targetLength'],
                                 status='NoReads',
                                 iteration=self.params['iteration'],
                                 readcount=0,
                                 num_contigs=0,
                                 contig_length=0)
                del self.params['summary_stats'][target]

        checker_params = {}
        for k in self.params:
            checker_params[k] = self.params[k]
        del checker_params['mapping_dict']
        checker_params['targets'] = {}
        iteration = self.params['iteration']
        # open previously created indexes:
        if 'PE1' in self.params and 'PE2' in self.params:
            idx_PE1 = SeqIO.index_db(
                os.path.join(self.params['working_dir'], "PE1.idx"),
                key_function=keyfunction(self.params['sra']))
            idx_PE2 = SeqIO.index_db(
                os.path.join(self.params['working_dir'], "PE2.idx"),
                key_function=keyfunction(self.params['sra']))
        if 'SE' in self.params:
            idx_SE = SeqIO.index_db(
                os.path.join(self.params['working_dir'], "SE.idx"),
                key_function=keyfunction(self.params['sra']))
        if 'readcounts' not in checker_params:
            checker_params['readcounts'] = {}
        # if 'contigcounts' not in checker_params:
        #    checker_params['contigcounts'] = {}
        statsf = open(
            os.path.join(self.params['finished_dir'], 'mapping_stats.tsv'),
            'a')
        for target in self.params['mapping_dict']:
            startT = time.time()
            # logger.info("Running splitreads for Sample: %s target: %s" % (self.params['sample'], target))
            target_dir = os.path.join(self.params['working_dir'],
                                      self.params['safe_targets'][target])
            if target not in checker_params['readcounts']:
                checker_params['readcounts'][target] = Counter()
            # if target not in checker_params['contigcounts']:
            #    checker_params['contigcounts'] = Counter()
            if os.path.exists(target_dir):
                os.system("rm -rf %s" % target_dir)
            os.mkdir(target_dir)

            reads = self.params['mapping_dict'][target]
            # track how many total reads were added for this cycle
            checker_params['readcounts'][target][iteration] = len(reads)
            statsf.write('\t'.join([
                self.params['sample'], target,
                str(iteration),
                str(len(reads))
            ]) + '\n')

            SEs = PEs = 0
            if 'PE1' in self.params and 'PE2' in self.params:
                outf_PE1 = open(
                    os.path.join(target_dir, "PE1." + self.params['format']),
                    'w')
                outf_PE2 = open(
                    os.path.join(target_dir, "PE2." + self.params['format']),
                    'w')
            if 'SE' in self.params:
                outf_SE = open(
                    os.path.join(target_dir, "SE." + self.params['format']),
                    'w')

            for readID in reads:
                if self.params['subsample'] < 1 and randint(
                        0, 100) > self.params['subsample'] * 100:
                    continue
                if 'PE1' in self.params and readID in idx_PE1:
                    # read1 = idx_PE1[readID]
                    # read2 = idx_PE2[readID]
                    read1 = idx_PE1.get(readID, None)
                    read2 = idx_PE2.get(readID, None)
                    if read2 is None:
                        raise exceptions.FatalError(
                            "ERROR: ReadID %s was found in PE1 file but not PE2"
                            % readID)
                    new_readID = readID.replace(":", "_") + ":0:0:0:0#0/"
                    read1.id = read1.name = new_readID + "1"
                    read2.id = read2.name = new_readID + "2"
                    SeqIO.write(read1, outf_PE1, self.params['format'])
                    SeqIO.write(read2, outf_PE2, self.params['format'])
                    PEs += 1
                elif 'SE' in self.params and readID in idx_SE:
                    read1 = idx_SE[readID]
                    read1.id = read1.name = readID.replace(":",
                                                           "_") + ":0:0:0:0#0/"
                    SeqIO.write(read1, outf_SE, self.params['format'])
                    SEs += 1
            if 'PE1' in self.params and 'PE2' in self.params:
                outf_PE1.close()
                outf_PE2.close()
            if 'SE' in self.params:
                outf_SE.close()

            #Build assembly job:
            assembly_params = {}
            assembly_params['target'] = target
            assembly_params['target_dir'] = target_dir
            assembly_params['iteration'] = iteration
            assembly_params['last_assembly'] = False
            assembler_keys = [
                'assembler', 'sample', 'verbose', 'format', 'assemblytimeout',
                'map_against_reads', 'urt', 'numcycles', 'cdna', 'rip',
                'only-assembler'
            ]
            for k in assembler_keys:
                assembly_params[k] = self.params[k]
            cur_reads = checker_params['readcounts'][target][iteration]  # a Counter, so no key errors can occur
            previous_reads = checker_params['readcounts'][target][iteration - 1]

            #Turn off URT in situations where this will be the last iteration due to readcounts:

            if (cur_reads <= previous_reads and iteration > 2) \
                    or iteration >= self.params['numcycles']:
                logger.info(
                    "Sample: %s target: %s iteration: %s Setting last_assembly to True"
                    %
                    (self.params['sample'], target, self.params['iteration']))
                assembly_params['last_assembly'] = True

            #properly handle the case where no reads ended up mapping for the PE or SE inputs:
            if PEs > 0:
                assembly_params['assembly_PE1'] = os.path.join(
                    target_dir, "PE1." + self.params['format'])
                assembly_params['assembly_PE2'] = os.path.join(
                    target_dir, "PE2." + self.params['format'])
            if SEs > 0:
                assembly_params['assembly_SE'] = os.path.join(
                    target_dir, "SE." + self.params['format'])

            #All reads have been written at this point, add an assembly to the queue:
            logger.info(
                "Sample: %s target: %s iteration: %s Split %s reads in %s seconds"
                % (self.params['sample'], target, self.params['iteration'],
                   len(reads), time.time() - startT))

            #Only add an assembly job and AssemblyChecker target if there are >0 reads:
            if PEs + SEs > 0:
                checker_params['targets'][target_dir] = False
                self.submit(Assembler.to_job(assembly_params))

        statsf.close()
        logger.info("------------------------------------")
        logger.info("| Sample: %s Iteration %s of numcycles %s" %
                    (checker_params['sample'], checker_params['iteration'],
                     checker_params['numcycles']))
        logger.info("------------------------------------")
        if 'PE1' in self.params and 'PE2' in self.params:
            idx_PE1.close()
            idx_PE2.close()
            del idx_PE1
            del idx_PE2
        if 'SE' in self.params:
            idx_SE.close()
            del idx_SE

        #Kick off a job which checks if all assemblies are done, and if not adds a copy of itself to the job queue
        if len(checker_params['targets']) > 0:
            # checker = AssemblyChecker(checker_params)
            self.submit(AssemblyChecker.to_job(checker_params))
        else:
            logger.info("Sample: %s No reads mapped, no more work to do." %
                        checker_params['sample'])
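
Subsampling above keeps a read with probability roughly equal to the subsample fraction by drawing a random integer per read. A minimal standalone sketch (hypothetical read IDs):

from random import randint

subsample = 0.25  # hypothetical: keep roughly a quarter of the mapped reads
reads = ['read%05d' % n for n in range(1000)]  # hypothetical read IDs

kept = [rid for rid in reads
        if not (subsample < 1 and randint(0, 100) > subsample * 100)]
print(len(kept))  # ~250 on average; note randint(0, 100) is inclusive on both ends
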
Example #32
    def start(self):
        sample = self.params['sample']
        logger.info("Sample: %s Starting finisher" % self.params['sample'])
        finished_dir = self.params['finished_dir']
        sample_finished = False
        targets_written = 0
        iteration = self.params['iteration']

        #Set up output for both finished and additional mapping outputs
        fin_outf = open(os.path.join(finished_dir, 'contigs.fasta'), 'a')
        remap_outf = open(
            os.path.join(self.params['working_dir'],
                         'I%03d' % self.params['iteration'] +
                         '_contigs.fasta'), 'w')

        #check whether the sample is globally finished
        if self.params['iteration'] >= self.params['numcycles']:
            sample_finished = True

        #loop over the current set of targets_folders
        for target_folder in self.params['targets']:
            #Extract target specific details:
            target_map_against_reads = False
            safe_target = target_folder.split("/")[
                -1]  # get last element of path name
            target = self.params['safe_targets'][safe_target]
            cur_reads = self.params['readcounts'][target][
                iteration]  # note that this is a counter, so no key errors can occur
            previous_reads = self.params['readcounts'][target][iteration - 1]

            #Get finished assembly status:
            with open(os.path.join(target_folder, 'finished'),
                      'r') as finishedf:
                l = finishedf.readline().strip().split()[0]

            logger.info("Sample: %s target: %s finishing target.." %
                        (self.params['sample'], target))
            logger.info(
                "Sample: %s target: %s iteration: %s Assembly reports status: %s."
                % (sample, target, self.params['iteration'], l))

            if l in ('assembly_failed', 'map_against_reads'):
                target_map_against_reads = True

            if l == 'assembly_killed':
                #only write out the reads, assembly won't have contigs
                self.write_target(target,
                                  target_folder,
                                  outf=fin_outf,
                                  finished=False,
                                  map_against_reads=False,
                                  killed=True)
            elif sample_finished:  # everything goes into the final file/folders.
                self.write_target(target,
                                  target_folder,
                                  outf=fin_outf,
                                  finished=True)
            elif target_map_against_reads and cur_reads > previous_reads and iteration < 3:
                #Only map against reads if we have improvement in mapping and we haven't been mapping for multiple iterations
                targets_written += self.write_target(target,
                                                     target_folder,
                                                     outf=remap_outf,
                                                     finished=False,
                                                     map_against_reads=True)
            else:
                #Check read counts and retire target, or send it back for re-mapping depending on mapped reads
                if iteration > 1 and cur_reads != 0 and previous_reads != 0:
                    if cur_reads / previous_reads > self.params[
                            'max_incorporation']:
                        logger.info(
                            "Sample %s target %s hit a repetitive region, no more mapping will be done"
                            % (self.params['sample'], target))
                        self.write_target(target,
                                          target_folder,
                                          outf=fin_outf,
                                          finished=True,
                                          status='Repeat')
                    elif cur_reads <= previous_reads and iteration > 2:
                        #Give the mapper a couple extra iterations in case the first mapping got a lot of reads which didn't assemble
                        logger.info(
                            "Sample %s target %s did not incorporate any more reads, no more mapping will be done"
                            % (self.params['sample'], target))
                        self.write_target(target,
                                          target_folder,
                                          outf=fin_outf,
                                          finished=True)
                    else:
                        #nothing fancy is going on, just write the contigs out for remapping
                        targets_written += self.write_target(target,
                                                             target_folder,
                                                             outf=remap_outf,
                                                             finished=False)
                else:
                    #nothing fancy is going on, just write the contigs out for remapping
                    targets_written += self.write_target(target,
                                                         target_folder,
                                                         outf=remap_outf,
                                                         finished=False)

        fin_outf.flush()
        remap_outf.flush()
        fin_outf.close()
        remap_outf.close()

        if targets_written > 0:
            # Build a new mapper and put it on the queue
            from ARC.runners import Mapper
            mapper_params = {}
            for k in self.params:
                mapper_params[k] = self.params[k]
            del mapper_params['targets']
            mapper_params['reference'] = os.path.join(
                self.params['working_dir'],
                'I%03d' % self.params['iteration'] + '_contigs.fasta')
            self.submit(Mapper.to_job(mapper_params))
            logger.info("Sample: %s Added new mapper to queue: iteration %s" %
                        (self.params['sample'], self.params['iteration']))

        else:
            logger.info(
                "Sample: %s Mapper not added to queue. Work finished." %
                self.params['sample'])
Example #33
    def RunNewbler(self):
        #Code for running newbler
        """
        Expects params keys:
            assembly_PE1 and assembly_PE2 and/or assembly_SE
            target_dir
            urt
        """
        #Check for necessary params:
        if not (('assembly_PE1' in self.params and 'assembly_PE2' in self.params) or 'assembly_SE' in self.params):
            raise exceptions.FatalError('Missing self.params in RunNewbler.')

        #Check for necessary files:
        if 'assembly_PE1' in self.params and 'assembly_PE2' in self.params and not (os.path.exists(self.params['assembly_PE1']) and os.path.exists(self.params['assembly_PE2'])):
            raise exceptions.FatalError('Missing PE files in RunNewbler.')

        if 'assembly_SE' in self.params and not(os.path.exists(self.params['assembly_SE'])):
            raise exceptions.FatalError('Missing SE file in RunNewbler.')

        sample = self.params['sample']
        target = self.params['target']
        killed = False
        failed = False

        #determine whether to pipe output to a file or /dev/null
        if self.params['verbose']:
            out = open(os.path.join(self.params['target_dir'], "assembly.log"), 'w')
        else:
            out = open(os.devnull, 'w')

        #Build args for newAssembly:
        args = ['newAssembly', '-force']
        if self.params['last_assembly'] and self.params['cdna']:
            #only run with cdna switch on the final assembly
            args += ['-cdna']
        args += [os.path.join(self.params['target_dir'], 'assembly')]
        logger.debug("Calling newAssembly for sample: %s target %s" % (sample, target))
        logger.info(" ".join(args))
        ret = subprocess.call(args, stdout=out, stderr=out)
        #Build args for addRun:
        if 'assembly_PE1' in self.params and 'assembly_PE2' in self.params:
            args = ['addRun', os.path.join(self.params['target_dir'], 'assembly')]
            args += [self.params['assembly_PE1']]
            logger.debug("Calling addRun for sample: %s target %s" % (sample, target))
            logger.debug(" ".join(args))
            ret = subprocess.call(args, stdout=out, stderr=out)

            args = ['addRun', os.path.join(self.params['target_dir'], 'assembly')]
            args += [self.params['assembly_PE2']]
            logger.debug("Calling addRun for sample: %s target %s" % (sample, target))
            logger.debug(" ".join(args))
            ret = subprocess.call(args, stdout=out, stderr=out)
        if 'assembly_SE' in self.params:
            args = ['addRun', os.path.join(self.params['target_dir'], 'assembly')]
            args += [self.params['assembly_SE']]
            logger.debug("Calling addRun for sample: %s target %s" % (sample, target))
            logger.debug(" ".join(args))
            ret = subprocess.call(args, stdout=out, stderr=out)

        #Build args for runProject
        args = ['runProject']
        args += ['-cpu', '1']
        if self.params['last_assembly'] and self.params['cdna']:
            args += ['-noace']
        else:
            args += ['-nobig']
        if self.params['urt'] and not self.params['last_assembly']:
            #only run with the -urt switch when it isn't the final assembly
            args += ['-urt']
        if self.params['rip']:
            args += ['-rip']
        args += [os.path.join(self.params['target_dir'], 'assembly')]
        pid = None
        try:
            start = time.time()
            logger.debug("Calling runProject for sample: %s target %s" % (sample, target))
            logger.debug(" ".join(args))
            ret = subprocess.Popen(args, stdout=out, stderr=out)
            pid = ret.pid
            while ret.poll() is None:
                if time.time() - start > self.params['assemblytimeout']:
                    self.kill_process_children(pid)
                    logger.warn("Sample: %s target: %s iteration: %s Killing assembly after %s seconds" % (sample, target, self.params['iteration'], time.time() - start))
                    killed = True
                    break
                time.sleep(.5)
        except Exception as exc:
            txt = "Sample: %s, Target: %s: Unhandled error running Newbler assembly" % (self.params['sample'], self.params['target'])
            txt += '\n\t' + str(exc) + '\n' + traceback.format_exc()
            logger.warn(txt)
            failed = True
        finally:
            out.close()

        #Sometimes newbler doesn't seem to exit completely:
        if pid is not None:
            self.kill_process_children(pid)

        #if ret != 0:
            #raise exceptions.RerunnableError("Newbler assembly failed.")

        if not killed and not failed and ret.poll() != 0:
            #raise exceptions.RerunnableError("Newbler assembly failed.")
            failed = True

        if failed:
            logger.info("Sample: %s target: %s iteration: %s Assembly failed after %s seconds" % (sample, target, self.params['iteration'], time.time() - start))
            outf = open(os.path.join(self.params['target_dir'], "finished"), 'w')
            outf.write("assembly_failed\t" + str(time.time() - start))
            outf.close()
        elif killed:
            logger.info("Sample: %s target: %s iteration: %s Assembly killed after %s seconds" % (sample, target, self.params['iteration'], time.time() - start))
            outf = open(os.path.join(self.params['target_dir'], "finished"), 'w')
            outf.write("assembly_killed\t" + str(time.time() - start))
            outf.close()
        else:
            #Run finished without error
            logger.info("Sample: %s target: %s iteration: %s Assembly finished in %s seconds" % (sample, target, self.params['iteration'], time.time() - start))
            outf = open(os.path.join(self.params['target_dir'], "finished"), 'w')
            outf.write("assembly_complete\t" + str(time.time() - start))
            outf.close()
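
The Popen/poll loop above is a generic kill-on-timeout pattern. A minimal standalone sketch of it (Unix-only; hypothetical command and timeout, with os.killpg standing in for ARC's kill_process_children):

import os
import signal
import subprocess
import time

# Start the child in its own process group so the whole tree can be signalled:
proc = subprocess.Popen(['sleep', '60'], preexec_fn=os.setsid)
start, timeout, killed = time.time(), 2, False
while proc.poll() is None:  # None means still running
    if time.time() - start > timeout:
        os.killpg(os.getpgid(proc.pid), signal.SIGTERM)
        killed = True
        break
    time.sleep(0.5)
print('killed' if killed else 'exited with %s' % proc.returncode)
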
Example #34
    def write_target(self, target, target_folder, outf, finished=False, map_against_reads=False, killed=False, status=None):
        # either map_against_reads was passed in, or
        # no contigs were assembled and target isn't finished, or
        # assembler crashed and no contig file was created
        # --> write reads as contigs
        num_contigs = 0  # store how many contigs were written out
        contig_length = 0
        if finished and status is None:
            status = 'Finished'
        if killed:
            status = 'Killed'
        if map_against_reads is False and killed is False:
            if self.params['assembler'] == 'newbler':
                contigf = os.path.join(self.params['working_dir'], target_folder, "assembly", "assembly", "454AllContigs.fna")
            elif self.params['assembler'] == 'spades':
                contigf = os.path.join(self.params['working_dir'], target_folder, "assembly", "contigs.fasta")
            else:
                #Guard against contigf being undefined below:
                raise exceptions.FatalError("Unknown assembler: %s" % self.params['assembler'])
            #add support for a special output if this is the final assembly and newbler -cdna was used:
            if finished and self.params['cdna'] and self.params['assembler'] == 'newbler':
                self.writeCDNAresults(target, target_folder, outf, contigf)
            elif os.path.exists(contigf):
                i = 0
                contig_inf = open(contigf, 'r')
                for contig in SeqIO.parse(contig_inf, 'fasta'):
                    i += 1
                    if finished:
                        contig.name = contig.id = self.params['sample'] + "_:_" + target + "_:_" + "Contig%03d" % i
                    else:
                        contig.name = contig.id = self.params['sample'] + "_:_" + target + "_:_" + "Unfinished%03d" % i
                    contig = contig.upper()
                    #Only mask repeats on intermediate iterations.
                    if self.params['maskrepeats'] and not finished:
                        #contig.seq = Seq(str(mask_seq(contig.seq.tostring(), self.params['mapper'])))
                        contig.seq = Seq(str(mask_seq(str(contig.seq), self.params['mapper'])))
                    #Bowtie2 crashes if a contig is all 'n' so only write it out if it isn't
                    if len(contig.seq) != contig.seq.count('n'):
                        SeqIO.write(contig, outf, "fasta")
                        contig_length += len(contig.seq)
                contig_inf.close()
                logger.info("Sample: %s target: %s iteration: %s Finished writing %s contigs " % (self.params['sample'], target, self.params['iteration'], i))
                num_contigs += i
                #if i == 0 and finished is False and self.params['iteration'] < 2:
                #    map_against_reads = True

        if map_against_reads:
            i = 0
            logger.info("Sample %s target %s: Writing reads as contigs." % (self.params['sample'], target))
            if 'PE1' in self.params and 'PE2' in self.params:
                inf_PE1n = os.path.join(target_folder, "PE1." + self.params['format'])
                inf_PE2n = os.path.join(target_folder, "PE2." + self.params['format'])
                if os.path.exists(inf_PE1n) and os.path.exists(inf_PE2n):
                    inf_PE1 = open(inf_PE1n, 'r')
                    inf_PE2 = open(inf_PE2n, 'r')
                    for r in SeqIO.parse(inf_PE1, self.params['format']):
                        i += 1
                        r.name = r.id = self.params['sample'] + "_:_" + target + "_:_" + "Read%04d" % i
                        SeqIO.write(r, outf, "fasta")
                    for r in SeqIO.parse(inf_PE2, self.params['format']):
                        i += 1
                        r.name = r.id = self.params['sample'] + "_:_" + target + "_:_" + "Read%04d" % i
                        SeqIO.write(r, outf, "fasta")
                    inf_PE1.close()
                    inf_PE2.close()

            if 'SE' in self.params:
                inf_SEn = os.path.join(target_folder, "SE." + self.params['format'])
                if os.path.exists(inf_SEn):
                    inf_SE = open(inf_SEn, 'r')
                    for r in SeqIO.parse(inf_SE, self.params['format']):
                        i += 1
                        r.name = r.id = self.params['sample'] + "_:_" + target + "_:_" + "Read%04d" % i
                        SeqIO.write(r, outf, "fasta")
                    inf_SE.close()
            num_contigs += i

        if finished or killed:
            #Write reads:
            if 'PE1' in self.params and 'PE2' in self.params:
                inf_PE1n = os.path.join(target_folder, "PE1." + self.params['format'])
                inf_PE2n = os.path.join(target_folder, "PE2." + self.params['format'])
                if os.path.exists(inf_PE1n) and os.path.exists(inf_PE2n):
                    inf_PE1 = open(inf_PE1n, 'r')
                    inf_PE2 = open(inf_PE2n, 'r')

                    outf_PE1 = open(os.path.join(self.params['finished_dir'], "PE1." + self.params['format']), 'a')
                    outf_PE2 = open(os.path.join(self.params['finished_dir'], "PE2." + self.params['format']), 'a')

                    for r in SeqIO.parse(inf_PE1, self.params['format']):
                        r.description = self.params['sample'] + "_:_" + target
                        SeqIO.write(r, outf_PE1, self.params['format'])
                    for r in SeqIO.parse(inf_PE2, self.params['format']):
                        r.description = self.params['sample'] + "_:_" + target
                        SeqIO.write(r, outf_PE2, self.params['format'])
                    outf_PE1.close()
                    outf_PE2.close()

            if 'SE' in self.params:
                inf_SEn = os.path.join(target_folder, "SE." + self.params['format'])
                if os.path.exists(inf_SEn):
                    inf_SE = open(inf_SEn, 'r')
                    outf_SE = open(os.path.join(self.params['finished_dir'], "SE." + self.params['format']), 'a')
                    for r in SeqIO.parse(inf_SE, self.params['format']):
                        r.description = self.params['sample'] + "_:_" + target
                        SeqIO.write(r, outf_SE, self.params['format'])
                    outf_SE.close()

        # Finally, handle the special case where assembly of a target was killed but
        # contigs exist from a previous assembly. Note that we only do this when not
        # running in cDNA mode.
        if killed and self.params['iteration'] > 1 and not self.params['cdna']:
            #No new contigs are available; however, contigs from the previous iteration
            # will be present in I00N_contigs.fasta, so grab those and write them out instead.
            logger.info("Sample: %s target: %s iteration: %s Writing contigs from previous iteration."
                        % (self.params['sample'], target, self.params['iteration']))
            contigf = os.path.join(self.params['working_dir'], 'I%03d' % (self.params['iteration'] - 1) + '_contigs.fasta')
            if os.path.exists(contigf):
                for contig in SeqIO.parse(contigf, 'fasta'):
                    if contig.id.split("_:_")[1] == target:
                        contig.name = contig.id = contig.id.replace("Unfinished", "Contig")
                        SeqIO.write(contig, outf, "fasta")
                        num_contigs += 1
                        contig_length += len(contig.seq)
        #Clean up temporary assembly files and reads:
        if not self.params['keepassemblies']:
            os.system("rm -rf %s" % target_folder)

        #write out target stats:
        if finished or killed:
            writeTargetStats(finished_dir=self.params['finished_dir'],
                             sample=self.params['sample'],
                             target=target,
                             targetLength=self.params['summary_stats'][target]['targetLength'],
                             status=status,
                             iteration=self.params['iteration'],
                             readcount=self.params['readcounts'][target][self.params['iteration']],
                             num_contigs=num_contigs, contig_length=contig_length)
            del self.params['summary_stats'][target]

        if finished or killed:
            return 0
        else:
            return num_contigs
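
The "_:_"-delimited IDs written above (for example sample_:_target_:_Contig001) are what later steps depend on; note how the killed-target branch recovers contigs by testing contig.id.split("_:_")[1] == target. Below is a minimal sketch of taking such an ID apart; parse_arc_id is an illustrative helper, not part of ARC:

    def parse_arc_id(record_id):
        """Split an ARC-style 'sample_:_target_:_ContigNNN' ID into parts."""
        parts = record_id.split("_:_")
        if len(parts) != 3:
            raise ValueError("not an ARC-formatted ID: %s" % record_id)
        return tuple(parts)  # (sample, target, contig)

    print(parse_arc_id("S1_:_geneA_:_Contig001"))  # ('S1', 'geneA', 'Contig001')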
Exemplo n.º 35
0
    def writeCDNAresults(self, target, target_folder, outf, contigf):
        """
        This is ONLY called when a cDNA target is finished.

        When doing a cDNA-type run, it is very useful to have both of the following:
        1) All contigs that belong to a gene (isogroup)
            - It is particularly helpful to re-orient any that are in
              reverse-complement (RC) orientation.
        2) The total number of reads assembled into each gene (isogroup)

        Additionally, it would be excellent to someday also report:
        3) Transcript (isotig) structure
        4) An estimate of isotig-specific read counts.

        """
        if self.params['assembler'] == 'newbler':
            contigf = os.path.join(self.params['working_dir'], target_folder,
                                   "assembly", "assembly", "454AllContigs.fna")
            isotigsf = os.path.join(self.params['working_dir'], target_folder,
                                    "assembly", "assembly",
                                    "454IsotigsLayout.txt")
            readstatusf = os.path.join(self.params['working_dir'],
                                       target_folder, "assembly", "assembly",
                                       "454ReadStatus.txt")
        else:
            logger.info(
                "WARNING writeCDNAresults called when assembler was not Newbler"
            )
            return None
        if not (os.path.exists(contigf) and os.path.exists(isotigsf)
                and os.path.exists(readstatusf)):
            logger.info("CDNA WARNING MISSING FILE!! %s %s" %
                        (target, self.params['sample']))
            logger.info("%s exists: %s" % (contigf, os.path.exists(contigf)))
            logger.info("%s exists: %s" % (isotigsf, os.path.exists(isotigsf)))
            logger.info("%s exists: %s" % (readstatusf, os.path.exists(readstatusf)))
            return None
        #Storage data structures:
        isogroups = {}  # isogroup -> in-order list of its contigs
        readcounts = Counter()  # reads assembled per isogroup (from ReadStatus)
        contig_orientation = {}
        contig_to_isogroup = {}
        contig_idx = SeqIO.index(contigf, "fasta")
        # Parse isotigsf:
        igroup = ""
        #print self.params['sample'], target, "Parsing isotigsf: %s" % isotigsf
        for l in open(isotigsf, 'r'):
            #Handle lines with only a '\n'
            if l == '\n':
                pass
            #Handle lines for isogroup:
            elif l[0:9] == '>isogroup':
                igroup = l.strip().split()[0].strip(">")
            #Handle lines containing all contigs:
            elif l.strip().split()[0] == 'Contig':
                l2 = l.strip().split()
                contigs = map(lambda x: "contig" + x, l2[2:-1])
                isogroups[igroup] = contigs
                for contig in contigs:
                    if contig not in contig_orientation:
                        contig_orientation[contig] = '+'
                        contig_to_isogroup[contig] = igroup
                    else:
                        raise exceptions.FatalError(
                            'Contig %s in %s more than once' %
                            (contig, contigf))
            #Handle lines containing contig orientation info:
            elif l[0:6] == 'isotig':
                # Keep only the fixed-width orientation field: one 6-character
                # column per contig in this isogroup.
                l2 = l[l.find(" ") + 1:l.rfind(" ") - 1]
                l3 = [l2[i:i + 6] for i in range(0, len(l2), 6)]
                for i in range(len(l3)):
                    if l3[i][0] == '<':
                        # contig is in reverse orientation
                        contig = isogroups[igroup][i]
                        contig_orientation[contig] = '-'
        #print self.params['sample'], target, "Parsed isotigsf, contigs:", len(isogroups), "contig_to_isogroup", len(contig_to_isogroup), "contig_orientation", len(contig_orientation)
        #Now parse readstatus:
        inf = open(readstatusf, 'r')
        inf.readline()  # discard first line
        for l in inf:
            l2 = l.strip().split('\t')
            #Determine if this read was assembled
            if len(l2) == 8:
                contig = l2[2]
                # Note that there are some built in limits to the number of contigs that can be in an isogroup:
                # http://contig.wordpress.com/2010/08/31/running-newbler-de-novo-transcriptome-assembly-i/
                # These won't appear in the IsotigsLayout.txt, but ARE in the ReadStatus.txt file.
                if contig in contig_to_isogroup:
                    readcounts[contig_to_isogroup[contig]] += 1
                else:
                    readcounts['ExceedsThreshold'] += 1
        #print self.params['sample'], target, "Parse read status"

        #Finally, output all of this information appropriately:
        countsf = open(
            os.path.join(self.params['finished_dir'],
                         "isogroup_read_counts.tsv"), 'a')
        sample = self.params['sample']
        #First write out readcounts: sample \t target \t isogroup \t readcount
        for isogroup in readcounts:
            countsf.write('\t'.join(
                [sample, target, isogroup,
                 str(readcounts[isogroup])]) + '\n')
        countsf.close()
        #print self.params['sample'], target, "Wrote readcounts"

        #Next write the contigs in proper order and orientation:
        ncontigs = 0
        nisogroups = 0
        for isogroup in isogroups:
            nisogroups += 1
            for contig in isogroups[isogroup]:
                ncontigs += 1
                seqrec = contig_idx[contig]
                #print self.params['sample'], target, seqrec
                if contig_orientation[contig] == '-':
                    seqrec.seq = seqrec.seq.reverse_complement()
                #print self.params['sample'], target, seqrec
                seqrec.name = seqrec.id = sample + "_:_" + target + "_:_" + isogroup + "|" + contig
                #print self.params['sample'], target, seqrec
                SeqIO.write(seqrec, outf, "fasta")
        ## TODO: add support for the ExceedsThreshold contigs
        logger.info(
            "Sample: %s target: %s iteration: %s Finished writing %s contigs, %s isogroups "
            % (self.params['sample'], target, self.params['iteration'],
               ncontigs, nisogroups))
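
The trickiest part of the isotig parsing above is the orientation field, which the slicing treats as fixed-width: six characters per contig, with a leading '<' marking a reverse-complemented contig. Here is a small sketch of that decoding step using a made-up three-contig field; the exact 454IsotigsLayout.txt format is assumed from the code above rather than from the Newbler documentation:

    # Hypothetical fixed-width field: one 6-character cell per contig.
    field = ">>>>>><<<<<<>>>>>>"
    cells = [field[i:i + 6] for i in range(0, len(field), 6)]
    orientation = ['-' if c[0] == '<' else '+' for c in cells]
    print(orientation)  # ['+', '-', '+']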
Exemplo n.º 36
0
    def setup(self, config):
        """
            Set up a working folder for each sample. Also assign a "safe_target"
            name to each target so that folder creation works. This is a little
            tricky because a target name containing the _:_ separator would
            confuse the splitter and SAM_to_dict. This code therefore assumes
            that the user put the _:_ in the name on purpose, so that multiple
            entries in the reference fasta will be treated as a single target.
        """
        format = config['format']
        for sample in config['Samples']:
            s = config['Samples'][sample]
            working_dir = os.path.realpath(config['workingdirectory'] +
                                           '/working_' + sample)
            #working_dir = os.path.realpath('./working_' + sample)
            finished_dir = os.path.realpath('./finished_' + sample)
            config['Samples'][sample]['working_dir'] = working_dir
            config['Samples'][sample]['finished_dir'] = finished_dir
            if os.path.exists(working_dir):
                logger.info("WARNING working directory already exists for "
                            "sample %s, deleting old results if any." %
                            (sample))
                os.system('rm -rf %s' % finished_dir)
                os.system('rm -rf %s/t__*' % working_dir)
                os.system('rm -rf %s/*.psl' % working_dir)
                os.system('rm %s/I*_contigs.fasta' % working_dir)
                if os.path.exists('%s/idx' % working_dir):
                    os.system('rm -rf %s/idx' % working_dir)
                os.mkdir(finished_dir)
            else:
                os.mkdir(working_dir)
                os.mkdir(finished_dir)

            # Create stats file:
            statsf = open(os.path.join(finished_dir, "mapping_stats.tsv"), 'w')
            statsf.write(
                '\t'.join(['Sample', 'Target', 'Iteration', 'Reads']) + '\n')
            statsf.close()

            # Create Target Summary Table
            tstf = open(os.path.join(finished_dir, "target_summary_table.tsv"),
                        'w')
            tstf.write('\t'.join([
                'Sample', 'Target', 'RefLen', 'Status', 'Iteration', 'Reads',
                'Contigs', 'ContigLength'
            ]) + '\n')
            tstf.close()

            # Create a stats file for cdna
            if config['cdna']:
                countsf = open(
                    os.path.join(finished_dir, "isogroup_read_counts.tsv"),
                    'a')
                countsf.write(
                    '\t'.join(['Sample', 'Target', 'isogroup', 'readcount']) +
                    '\n')
                countsf.close()

            # Build a separate index for each read file in the input, put them
            # in working_dir
            # Consider parallelizing this?
            try:
                start = time.time()
                if 'PE1' in s:
                    index_file = os.path.join(working_dir, "PE1.idx")
                    if not os.path.exists(index_file):
                        print s['PE1']
                    # index_db reuses an existing index file, so p1 is always
                    # defined and the PE1/PE2 read-count check below is safe
                    # even when PE1.idx was built on a previous run:
                    p1 = SeqIO.index_db(index_file,
                                        s['PE1'],
                                        format,
                                        key_function=keyfunction(
                                            config['sra']))
                if 'PE2' in s:
                    index_file = os.path.join(working_dir, "PE2.idx")
                    if not os.path.exists(index_file):
                        print s['PE2']
                    p2 = SeqIO.index_db(index_file,
                                        s['PE2'],
                                        format,
                                        key_function=keyfunction(
                                            config['sra']))
                    if 'PE1' in s and len(p1) != len(p2):
                        logger.error(
                            "The number of reads in %s and %s do not match, "
                            "check the config for errors" %
                            (s['PE1'], s['PE2']))
                if 'SE' in s:
                    if not os.path.exists(os.path.join(working_dir, "SE.idx")):
                        print s['SE']
                        index_file = os.path.join(working_dir, "SE.idx")
                        SeqIO.index_db(index_file,
                                       s['SE'],
                                       format,
                                       key_function=keyfunction(config['sra']))
            except (KeyboardInterrupt, SystemExit):
                print "Removing partial index: %s" % index_file
                os.unlink(index_file)
                raise
            logger.info("Sample: %s, indexed reads in %s seconds." %
                        (sample, time.time() - start))


        # Read through the reference, set up a set of safe names for the targets.
        # Also create the Target Summary Table which is indexed by original target name (following ARC conventions)
        # Also mask sequences and write them to a new set of output files
        # safe_targets is a two-way lookup, meaning it has both the safe target ID and the contig ID.
        summary_stats = {}
        safe_targets = {}
        new_refsf = {}
        for sample in config['Samples']:
            s = config['Samples'][sample]
            new_refsf[sample] = open(
                os.path.join(s['working_dir'], 'I000_contigs.fasta'), 'w')

        i = 0
        for t in SeqIO.parse(config['reference'], "fasta"):
            if len(t.name.split("_:_")) == 1:
                target = t.name
            else:
                target = t.name.split("_:_")[1]

            safe_targets[target] = "t__%06d" % i
            safe_targets["t__%06d" % i] = target
            i += 1
            if target not in summary_stats:
                summary_stats[target] = {'targetLength': len(t)}
            else:
                summary_stats[target]['targetLength'] = (
                    summary_stats[target]['targetLength'] + len(t))

            # Write contigs:
            if config['maskrepeats']:
                t.seq = Seq(str(mask_seq(str(t.seq), config['mapper'])))
            # Bowtie2 crashes if a contig is all 'n' so only write it out if it isn't
            if len(t) != t.seq.count('n'):
                for outf in new_refsf.values():
                    SeqIO.write(t, outf, "fasta")
            else:
                writeTargetStats(
                    finished_dir=s['finished_dir'],
                    sample=sample,
                    target=target,
                    targetLength=summary_stats[target]['targetLength'],
                    status='MaskedOut',
                    iteration=0,
                    readcount=0,
                    num_contigs=0,
                    contig_length=0)
                del summary_stats[target]

        config['safe_targets'] = safe_targets
        config['summary_stats'] = summary_stats
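
Note that safe_targets is deliberately a two-way lookup: each original target name maps to a filesystem-safe folder token (t__NNNNNN), and each token maps back to the original name. A small sketch of the round trip, with made-up target names:

    safe_targets = {}
    for i, target in enumerate(["geneA", "tricky/name with spaces"]):
        safe = "t__%06d" % i
        safe_targets[target] = safe   # original name -> safe folder token
        safe_targets[safe] = target   # safe folder token -> original name
    print(safe_targets["geneA"])      # 't__000000'
    print(safe_targets["t__000001"])  # 'tricky/name with spaces'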
Exemplo n.º 37
0
    def info(self, msg):
        if self.loglevel == logging.DEBUG:
            name = self.name
        else:
            name = self.__class__.__name__
        logger.info("%-12s| %s" % (name, msg))
Exemplo n.º 38
0
    def run(self):
        logger.info("Starting...")
        logger.debug("Setting up workers.")

        for i in range(self.nprocs):
            worker = ProcessRunner(i, self.q, self.status, self.stats,
                                   self.pid)
            self.workers.append(worker)
            worker.daemon = False
            worker.start()

        while True:
            try:
                self.q.join()

                # This shouldn't be needed but we will check just in case
                if self.all_workers_waiting():
                    logger.debug(
                        "Workers are all waiting and the queue is empty.  Exiting"
                    )
                    break
                else:
                    logger.debug(
                        "Workers are not in a waiting state.  Waiting for more."
                    )
                    time.sleep(5)

            except exceptions.FatalError:
                logger.error("A fatal error was encountered.")
                self.killall()
                raise
            except (KeyboardInterrupt, SystemExit):
                logger.error("Terminating processes")
                self.killall()
                raise
            except Exception as e:
                ex_type, ex, tb = sys.exc_info()
                logger.error("\n".join(
                    traceback.format_exception(ex_type, ex, tb)))
                logger.error("An unhandled exception occurred")
                self.killall()
                raise

        # The queue is empty and every worker is idle; shut them down. Error
        # paths are handled by the killall() calls in the except branches
        # above.
        self.killall()

        logger.info("-----")
        logger.info("%d processes returned ok." % (self.stats[0]))
        logger.info("%d processes had to be rerun." % (self.stats[1]))
        logger.info("-----")
        logger.info("%d Mapper jobs run." % (self.stats[2]))
        logger.info("%d Assembly jobs run." % (self.stats[3]))
        logger.info("%d Checker jobs run." % (self.stats[4]))
        logger.info("%d Finisher jobs run." % (self.stats[5]))
        logger.info("-----")