예제 #1
0
파일: run.py 프로젝트: eliasOnAWS/graftM
    def setattributes(self, args):
        """Create the helper objects required by the selected subcommand.

        A HouseKeeping and a Stats_And_Summary instance are always created.
        For ``graft`` the sequence searcher, the checked sequence pair list
        and (when a reference package was supplied) a Pplacer are attached to
        ``self``; for ``create`` a Create instance is attached instead.

        NOTE(review): ``self.args`` is read here but never assigned in this
        method - presumably it is set by the constructor; confirm.
        """
        self.hk = HouseKeeping()
        self.s = Stats_And_Summary()

        if args.subparser_name == 'graft':
            # The suite object is not used again in this branch.
            graft_programs = ExternalProgramSuite([
                'orfm', 'nhmmer', 'hmmsearch', 'mfqe', 'pplacer',
                'ktImportText', 'diamond'
            ])
            self.hk.set_attributes(self.args)
            self.hk.set_euk_hmm(self.args)
            if args.euk_check:
                # Search with the eukaryote HMM alongside the other HMMs.
                self.args.search_hmm_files.append(self.args.euk_hmm_file)

            search_hmms = self.args.search_hmm_files
            if self.args.search_only:
                alignment_hmm = None
            else:
                alignment_hmm = self.args.aln_hmm_file
            self.ss = SequenceSearcher(search_hmms, alignment_hmm)

            self.sequence_pair_list = self.hk.parameter_checks(args)
            if hasattr(args, 'reference_package'):
                self.p = Pplacer(self.args.reference_package)
            return

        if self.args.subparser_name == "create":
            create_programs = ExternalProgramSuite(
                ['taxit', 'FastTreeMP', 'hmmalign', 'mafft'])
            self.create = Create(create_programs)
class KronaBuilder:
    """Build a Krona chart from tab-separated OTU tables using ktImportText."""

    def __init__(self):
        self.hk = HouseKeeping()

    def otuTablePathListToKrona(self, otuTablePaths, outputName, cmd_log):
        """Parse every OTU table in ``otuTablePaths`` and render one Krona file.

        otuTablePaths: iterable of paths to tab-separated OTU tables.
        outputName: path handed to ``ktImportText -o``.
        cmd_log: command log passed through to ``HouseKeeping.add_cmd``.
        """
        otuTables = []
        for path in otuTablePaths:
            otuTables.extend(self.parseOtuTable(path))
        self.runKrona(otuTables, outputName, cmd_log)

    def parseOtuTable(self, otuTablePath):
        """Read an OTU table and return one OtuTable per sample column.

        The header row names the samples: the first column is ignored, the
        last column holds the taxonomy string and every column in between is
        a sample. Each data row stores its per-sample value under the row's
        taxonomy in the matching table's ``sampleCounts``.

        Raises ValueError when the file is empty or the header has fewer
        than three columns.
        """
        with open(otuTablePath) as handle:
            data = csv.reader(handle, delimiter="\t")

            # Parse headers (sample names)
            fields = next(data, None)
            if fields is None or len(fields) < 3:
                raise ValueError("Badly formed OTU table %s" % otuTablePath)
            tables = [OtuTable(sample_name) for sample_name in fields[1:-1]]

            # Parse the data in
            taxonomy_column = len(fields) - 1
            for row in data:
                if not row:
                    # csv yields an empty list for blank lines; skip them
                    # instead of failing on the column lookup below.
                    continue
                taxonomy = row[taxonomy_column]
                for offset, table in enumerate(tables):
                    table.sampleCounts[taxonomy] = row[offset + 1]

        return tables

    def runKrona(self, otuTables, outputName, cmd_log):
        """Write each table to a temporary file and run ktImportText on them.

        Every table becomes a ``count<TAB>rank1<TAB>rank2...`` text file (the
        taxonomy string is split on ';'). The files are passed to
        ktImportText as ``path,sample_name`` and removed once it has
        finished, whether or not it succeeded.

        Raises subprocess.CalledProcessError if ktImportText exits non-zero.
        """
        import os  # local import: only this method needs it

        tempfile_paths = []
        try:
            # write out the tables to files
            for table in otuTables:
                descriptor, tmp = tempfile.mkstemp('', 'CommunityMkrona')
                tempfile_paths.append(tmp)
                # Wrap the descriptor returned by mkstemp so it is closed
                # together with the file object rather than leaked.
                with os.fdopen(descriptor, 'w') as out:
                    for taxonomy, count in table.sampleCounts.items():
                        tax = "\t".join(taxonomy.split(';'))
                        out.write("%s\t%s\n" % (count, tax))

            cmd = ["ktImportText", '-o', outputName]
            for tmp, table in zip(tempfile_paths, otuTables):
                cmd.append(','.join([tmp, table.sample_name]))

            # run the actual krona
            shell_cmd = ' '.join(cmd) + ' 1>/dev/null '
            self.hk.add_cmd(cmd_log, shell_cmd)
            subprocess.check_call(shell_cmd, shell=True)
        finally:
            # The temporary inputs are not needed after ktImportText returns.
            for tmp in tempfile_paths:
                try:
                    os.remove(tmp)
                except OSError:
                    pass
예제 #3
0
파일: run.py 프로젝트: wwood/graftM
 def setattributes(self, args):
     """Attach the helper objects used by the pipeline to ``self``.

     The Krona builder, housekeeping, summary, taxonomy grouping and
     extraction helpers are always created. The HMMER wrapper, the checked
     sequence pairs / input format and (if a reference package was given)
     the Pplacer wrapper are only created for the ``graft`` subcommand.

     NOTE(review): ``self.args`` is read but not assigned in this method -
     presumably set elsewhere on the instance; confirm.
     """
     self.kb = KronaBuilder()
     self.hk = HouseKeeping()
     self.s = Stats_And_Summary()
     self.tg = TaxoGroup()
     self.e = Extract()

     if args.subparser_name != 'graft':
         return

     self.hk.set_attributes(self.args)
     self.h = Hmmer(self.args.search_hmm_files, self.args.aln_hmm_file)
     checked = self.hk.parameter_checks(args)
     self.sequence_pair_list, self.input_file_format = checked
     if hasattr(args, 'reference_package'):
         self.p = Pplacer(self.args.reference_package)
예제 #4
0
파일: pplacer.py 프로젝트: wwood/graftM
 def __init__(self, refpkg):
     """Store the reference package and create a HouseKeeping helper.

     refpkg: value kept on ``self.refpkg`` for later use by the instance.
     """
     self.refpkg = refpkg
     self.hk = HouseKeeping()
예제 #5
0
파일: pplacer.py 프로젝트: wwood/graftM
class Pplacer:
    """Run pplacer / guppy on aligned reads and distribute the results.

    Contains functions for turning alignment files into jplace files,
    splitting the combined results back out per input file, and comparing
    forward and reverse reads when reverse reads are provided.
    """

    def __init__(self, refpkg):
        """Keep the reference package path and create a HouseKeeping helper."""
        self.refpkg = refpkg
        self.hk = HouseKeeping()

    def pplacer(self, output_file, output_path, input_path, threads, cmd_log):
        """Run pplacer on the concatenated alignment file.

        output_file: unused by this method.
        output_path: directory passed to ``--out-dir``.
        input_path: alignment file to place.
        threads: value passed to ``-j``.
        cmd_log: command log passed to ``HouseKeeping.add_cmd``.

        Returns ``input_path`` with its last dot-separated component
        replaced by ``jplace``.
        Raises subprocess.CalledProcessError if pplacer exits non-zero.
        """
        cmd = "pplacer -j %s --verbosity 0 --out-dir %s -c %s %s" % (
            threads, output_path, self.refpkg, input_path)
        self.hk.add_cmd(cmd_log, cmd)  # Log it
        subprocess.check_call(cmd, shell=True)  # Run it
        return '.'.join(input_path.split('.')[:-1]) + '.jplace'

    def alignment_merger(self, alignment_files, output_alignment_path):
        """Concatenate aligned read files into one FASTA file.

        Every record id gets ``_<n>`` appended, where ``n`` is the position
        of its source file in ``alignment_files``, so reads can be traced
        back to their origin later.

        Returns a dict keyed by ``str(n)``; each value holds the
        ``output_path`` (``placements.jplace`` next to the source alignment)
        and an empty ``place`` list to be filled by ``jplace_split``.
        """
        alias_hash = {}
        with open(output_alignment_path, 'w') as output:
            for file_number, alignment_file in enumerate(alignment_files):
                tag = '_' + str(file_number)
                with open(alignment_file, 'r') as alignment_handle:
                    alignments = list(SeqIO.parse(alignment_handle, 'fasta'))
                for record in alignments:
                    record.id = record.id + tag
                SeqIO.write(alignments, output, "fasta")
                alias_hash[str(file_number)] = {
                    'output_path': os.path.join(
                        os.path.dirname(alignment_file), 'placements.jplace'),
                    'place': []}
        return alias_hash

    def guppy_class(self, main_guppy_path, jplace_list, cmd_log):
        """Run guppy classify and write one ``placements.guppy`` per jplace.

        guppy is run once over every file in ``jplace_list`` with its output
        redirected to ``main_guppy_path``. That output is cut into sections,
        a new section starting at each line containing ``name`` (the first
        such line opens the first section). Section ``i`` is written,
        without empty lines, to ``placements.guppy`` in the directory of
        ``jplace_list[i]``. The combined file is deleted afterwards.

        Raises subprocess.CalledProcessError if guppy exits non-zero.
        """
        cmd = 'guppy classify -c %s %s > %s' % (
            self.refpkg, ' '.join(jplace_list), main_guppy_path)
        self.hk.add_cmd(cmd_log, cmd)
        subprocess.check_call(cmd, shell=True)

        with open(main_guppy_path, 'r') as main_guppy:
            all_lines = [line.rstrip() for line in main_guppy]

        # Split the combined output into per-file sections.
        guppys = []
        current = []
        for line in all_lines:
            if 'name' in line and current:
                guppys.append(current)
                current = [line]
            else:
                current.append(line)
        guppys.append(current)

        # Write each section next to the jplace file it belongs to.
        for idx, section in enumerate(guppys):
            out = os.path.join(os.path.dirname(jplace_list[idx]),
                               'placements.guppy')
            with open(out, 'w') as out_guppy:
                for line in section:
                    if line:
                        out_guppy.write(line + '\n')
        self.hk.delete([main_guppy_path])

        return

    def jplace_split(self, jplace_file, alias_hash, summary_dict):
        """Split a combined jplace file into one jplace file per alias.

        For every placement and every alias in ``alias_hash`` an entry is
        appended to that alias' ``place`` list containing the placement's
        ``p`` value and only those ``nm`` items whose name ends in
        ``_<alias>``. An entry is appended even when no name matches.

        Each alias' entries are then written, with the original ``fields``,
        ``version``, ``tree`` and ``metadata``, to its ``output_path``.

        Returns ``summary_dict`` with ``jplace_path_list`` set to the paths
        written.
        """
        with open(jplace_file) as jplace_handle:
            placement_file = json.load(jplace_handle)

        # Distribute placements using the identifier appended to each read.
        for placement in placement_file['placements']:
            for alias in alias_hash:
                alias_hash[alias]['place'].append({
                    'p': placement['p'],
                    'nm': [nm for nm in placement['nm']
                           if nm[0].split('_')[-1] == alias]})

        # Write the jplace files to their respective paths.
        jplace_path_list = []
        for alias in alias_hash:
            output = {'fields': placement_file['fields'],
                      'version': placement_file['version'],
                      'tree': placement_file['tree'],
                      'placements': alias_hash[alias]['place'],
                      'metadata': placement_file['metadata']}
            alias_path = alias_hash[alias]['output_path']
            with open(alias_path, 'w') as alias_handle:
                json.dump(output, alias_handle, ensure_ascii=False)
            jplace_path_list.append(alias_path)
        summary_dict['jplace_path_list'] = jplace_path_list
        return summary_dict

    def place(self, summary_dict, files, args):
        """Place all alignments at once and record the trusted placements.

        The alignments in ``summary_dict['seqs_list']`` are merged, placed
        with pplacer (the elapsed whole seconds are stored as a string in
        ``summary_dict['place_t']``), split per input file and classified
        with guppy. For each name in ``summary_dict['base_list']`` the
        forward and reverse results are compared when
        ``summary_dict['reverse_pipe']`` is true; otherwise the placements
        returned by ``TaxoGroup.guppy_splitter`` are stored under
        ``trusted_placements``.

        Returns the updated ``summary_dict``.
        """
        start = timeit.default_timer()  # Start placement timer

        # Merge the alignments so they can all be placed at once.
        alias_hash = self.alignment_merger(summary_dict['seqs_list'],
                                           files.comb_aln_fa())

        # Run pplacer on merged file
        jplace = self.pplacer(files.jplace_output_path(),
                              args.output_directory,
                              files.comb_aln_fa(),
                              args.threads,
                              files.command_log_path())

        stop = timeit.default_timer()  # stop placement timer and log
        summary_dict['place_t'] = str(int(round((stop - start), 0)))

        # Split the jplace file
        summary_dict = self.jplace_split(jplace, alias_hash, summary_dict)
        self.hk.delete([jplace])
        # Run guppy classify and parse the output
        self.guppy_class(files.main_guppy_path(),
                         summary_dict['jplace_path_list'],
                         files.command_log_path())

        for base in summary_dict['base_list']:
            if summary_dict['reverse_pipe']:
                # Compare the forward and reverse pipelines.
                summary_dict[base] = Compare().compare_hits(summary_dict[base], base)
                summary_dict[base] = Compare().compare_placements(
                    os.path.join(args.output_directory, base, 'forward', 'placements.guppy'),
                    os.path.join(args.output_directory, base, 'reverse', 'placements.guppy'),
                    summary_dict[base],
                    args.placements_cutoff)
            else:
                # Forward only: take the placements straight from guppy.
                trusted = {}
                summary_dict[base]['trusted_placements'] = trusted
                tc = TaxoGroup().guppy_splitter(
                    os.path.join(args.output_directory, base, 'placements.guppy'),
                    args.placements_cutoff)
                for read, entry in tc.items():
                    trusted[read] = entry['placement']
        return summary_dict
예제 #6
0
파일: hmmer.py 프로젝트: dparks1134/graftM
 def __init__(self, search_hmm, aln_hmm=None):
     """Store the search and alignment HMMs and create a HouseKeeping helper.

     search_hmm: kept on ``self.search_hmm``.
     aln_hmm: kept on ``self.aln_hmm``; defaults to None.
     """
     self.search_hmm = search_hmm
     self.aln_hmm = aln_hmm
     self.hk = HouseKeeping()
예제 #7
0
파일: hmmer.py 프로젝트: dparks1134/graftM
class Hmmer:

    def __init__(self, search_hmm, aln_hmm=None):
        """Store the search and alignment HMMs and create a HouseKeeping helper.

        search_hmm: HMM paths used for searching; the search methods take
            ``len()`` of it and index it, so it is expected to be a sequence.
        aln_hmm: HMM passed to hmmalign by ``hmmalign``; defaults to None.
        """
        self.search_hmm = search_hmm
        self.aln_hmm = aln_hmm
        self.hk = HouseKeeping()

    def hmmalign(self, input_path, run_stats, cmd_log, for_file, rev_file, for_sto_file, rev_sto_file, for_conv_file, rev_conv_file):
        """Align reads to ``self.aln_hmm`` with hmmalign, then convert with seqmagick.

        When ``run_stats['rev_true']`` is false, ``input_path`` is aligned as
        is into ``for_sto_file`` and converted to ``for_conv_file``.

        Otherwise the reads are split by the ``direction`` recorded for them
        in ``run_stats['reads']``: '+' reads are written unchanged to
        ``for_file`` and '-' reads are written reverse-complemented to
        ``rev_file``. Both files are then aligned and converted separately.

        Every command is logged through ``self.hk.add_cmd`` before it runs.

        NOTE(review): hmmalign and seqmagick are joined with ';', so
        check_call only reflects seqmagick's exit status.

        Raises Exception for a read whose direction is neither '+' nor '-',
        and subprocess.CalledProcessError if a command exits non-zero.
        """
        if not run_stats['rev_true']:
            # If there are only forward reads, just hmmalign and be done with it.
            cmd = 'hmmalign --trim -o %s %s %s ; seqmagick convert %s %s' % (
                for_sto_file, self.aln_hmm, input_path, for_sto_file, for_conv_file)
            self.hk.add_cmd(cmd_log, cmd)
            subprocess.check_call(cmd, shell=True)
            return

        read_info = run_stats['reads']
        forward = []
        reverse = []

        # Split the reads into reverse and forward lists
        with open(input_path) as input_handle:
            for record in SeqIO.parse(input_handle, 'fasta'):
                direction = read_info[record.id]['direction']
                if direction == '+':
                    forward.append(record)
                elif direction == '-':
                    reverse.append(record)
                else:
                    raise Exception(Messenger().error_message('Programming error: hmmalign'))

        # Write forward reads and reverse-complemented reverse reads to files.
        # Records missing an ID or sequence are skipped: HMMalign will
        # segfault on them.
        with open(for_file, 'w') as for_aln:
            for record in forward:
                if record.id and record.seq:
                    for_aln.write('>' + record.id + '\n')
                    for_aln.write(str(record.seq) + '\n')

        with open(rev_file, 'w') as rev_aln:
            for record in reverse:
                if record.id and record.seq:
                    rev_aln.write('>' + record.id + '\n')
                    rev_aln.write(str(record.seq.reverse_complement()) + '\n')

        # HMMalign and convert to fasta format, forward first then reverse.
        jobs = ((for_file, for_sto_file, for_conv_file),
                (rev_file, rev_sto_file, rev_conv_file))
        for reads_file, sto_file, conv_file in jobs:
            cmd = 'hmmalign --trim -o %s %s %s 2>/dev/null; seqmagick convert %s %s' % (
                sto_file, self.aln_hmm, reads_file, sto_file, conv_file)
            self.hk.add_cmd(cmd_log, cmd)
            subprocess.check_call(cmd, shell=True)


    def hmmsearch(self, output_path, input_path, input_file_format, seq_type, threads, eval, min_orf_length, restrict_read_length, cmd_log):
        '''Run a hmmsearch on the input_path raw reads, and return the name
        of the output table. Keep a log of the commands.'''
        # Define the base hmmsearch command.
        output_table_list = []
        tee = ' | tee'
        hmm_number = len(self.search_hmm)
        if hmm_number > 1:
            for idx, hmm in enumerate(self.search_hmm):
                out = os.path.join(os.path.split(output_path)[0], os.path.basename(hmm).split('.')[0] +'_'+ os.path.split(output_path)[1])
                output_table_list.append(out)
                if idx + 1 == hmm_number:
                    tee += " | hmmsearch %s --cpu %s --domtblout %s %s - >/dev/null " % (eval, threads, out, hmm)
                elif idx + 1 < hmm_number:
                    tee += " >(hmmsearch %s --cpu %s --domtblout %s %s - >/dev/null) " % (eval, threads, out, hmm)
                else:
                    raise Exception("Programming Error.") 

        elif hmm_number == 1:
            tee = ' | hmmsearch %s --cpu %s --domtblout %s %s - >/dev/null' % (eval, threads, output_path, self.search_hmm[0])
            output_table_list.append(output_path)
        
        # Choose an input to this base command based off the file format found.
        if seq_type == 'nucleotide': # If the input is nucleotide sequence
            orfm_cmdline = self.orfm_command_line(min_orf_length, restrict_read_length)
            cmd = '%s %s %s ' % (orfm_cmdline, input_path, tee)
            self.hk.add_cmd(cmd_log, cmd)
            subprocess.check_call(["/bin/bash", "-c", cmd])
        
        elif seq_type == 'protein': # If the input is amino acid sequence
            if input_file_format == FORMAT_FASTQ_GZ: # If its gzipped
                cmd = "awk '{print \">\" substr($0,2);getline;print;getline;getline}' <(zcat %s) %s" % (input_path, tee) # Unzip it and feed it into the base command
                self.hk.add_cmd(cmd_log, cmd)
                subprocess.check_call(["/bin/bash", "-c", cmd])
            elif input_file_format == FORMAT_FASTA: # If it is in fasta format
                cmd = "cat %s %s" % (input_path, tee) # It can be searched directly, no manpulation required
                self.hk.add_cmd(cmd_log, cmd)
                subprocess.check_call(["/bin/bash", "-c", cmd])
            else:
                raise Exception('Programming Error: error guessing input file format')
        else:
            raise Exception('Programming Error: error guessing input sequence type')
        return output_table_list

    def nhmmer(self, output_path, input_path, input_file_format, threads, eval, cmd_log):
        """Search input_path with every HMM in self.search_hmm using nhmmer.

        Returns the list of tblout tables written: output_path itself for a
        single HMM, otherwise one table per HMM named
        <hmm basename up to the first dot>_<output file name> in
        output_path's directory. FASTA input is streamed with cat, gzipped
        FASTQ is converted to FASTA on the fly. The command is logged before
        it is run.

        Raises Exception when input_file_format is neither of the two
        supported formats.
        """
        tables = []
        total = len(self.search_hmm)
        pieces = []
        if total > 1:
            out_dir, out_name = os.path.split(output_path)
            final = total - 1
            for position, hmm in enumerate(self.search_hmm):
                table = os.path.join(out_dir, os.path.basename(hmm).split('.')[0] + '_' + out_name)
                tables.append(table)
                if position == final:
                    template = " | nhmmer %s --cpu %s --tblout %s %s - >/dev/null "
                else:
                    template = " >(nhmmer %s --cpu %s --tblout %s %s - >/dev/null) "
                pieces.append(template % (eval, threads, table, hmm))
        elif total == 1:
            pieces.append(' | nhmmer %s --cpu %s --tblout %s %s - >/dev/null' % (eval, threads, output_path, self.search_hmm[0]))
            tables.append(output_path)
        searches = ''.join(pieces)

        if input_file_format == FORMAT_FASTA:
            cmd = "cat %s | tee %s" % (input_path, searches)
        elif input_file_format == FORMAT_FASTQ_GZ:
            cmd = "awk '{print \">\" substr($0,2);getline;print;getline;getline}' <(zcat %s) | tee %s " % (input_path, searches)
        else:
            raise Exception(Messenger().message('ERROR: Suffix on %s not familiar. Please submit an .fq.gz or .fa file\n' % (input_path)))

        self.hk.add_cmd(cmd_log, cmd)
        subprocess.check_call(["/bin/bash", "-c", cmd])
        return tables

    def hmmtable_reader(self, hmmtable):
        hash = {}
        seen = {}
        
        def buildHash(hit, program):
            if program == 'hmmsearch':
                if float(hit[17]) - float(hit[18]) > 0:
                    len = float(hit[17]) - float(hit[18])
                elif float(hit[17]) - float(hit[18]) < 0:
                    len = float(hit[18]) - float(hit[17])
                read_hash= {'len': len,
                            'bit': float(hit[7]),
                            'hmmfrom':float(hit[16]),
                            'hmmto':float(hit[15]),
                            'alifrom':hit[17],
                            'alito':hit[18]}
            elif program == 'nhmmer':
                if float(hit[6]) - float(hit[7]) > 0:
                    len = float(hit[6]) - float(hit[7])
                elif float(hit[6]) - float(hit[7]) < 0:
                    len = float(hit[7]) - float(hit[6])
                read_hash = {'len':len,
                             'bit':float(hit[13]),
                             'direction':hit[11],
                             'hmmfrom':hit[4],
                             'hmmto':hit[5],
                             'alifrom':hit[6],
                             'alito':hit[7]}
            return read_hash
        
        for idx, table in enumerate(hmmtable):
            program = [line.rstrip().split()[2] for line in open(table).readlines() if line.startswith('# Program:')][0]
            for hit in [line.rstrip().split() for line in open(table).readlines() if not line.startswith('#')]:                
                read_name = hit[0]
                
                if read_name in seen.keys(): # If the read name has been seen before.. 
                    if seen[read_name]==idx:
                        hash[read_name].append(buildHash(hit, program))
                else:
                    hash[read_name]=[buildHash(hit, program)]
                seen[read_name]=idx
        return hash
        
    def check_euk_contamination(self, output_path, euk_free_output_path, input_path, run_stats, input_file_format, threads, evalue, raw_reads, base, cmd_log, euk_hmm):
        """Search the raw reads with a eukaryote HMM and filter out better hits.

        nhmmer is run on ``raw_reads`` with ``euk_hmm``, writing its table to
        ``output_path``. A read counts as contaminating when it is also in
        ``run_stats['reads']``, the bit score of its first eukaryotic hit is
        at least its recorded bit score, and that hit is longer than 90% of
        ``run_stats['read_length']``. Records from ``input_path`` that are
        not contaminating are written to ``euk_free_output_path``.

        ``run_stats['euk_uniq']`` receives the number of reads hit only by
        the eukaryote HMM and ``run_stats['euk_contamination']`` the number
        of contaminating reads. ``base`` is not used.

        NOTE(review): ``2>&1 > /dev/null`` discards stdout only; stderr is
        still sent to the caller's stdout. Confirm that is intended.

        Returns ``(run_stats, euk_free_output_path)``.
        Raises Exception for an unsupported ``input_file_format`` and
        subprocess.CalledProcessError if nhmmer exits non-zero.
        """
        cutoff = float(0.9*run_stats['read_length'])
        # do a nhmmer using a Euk specific hmm
        nhmmer_cmd = "nhmmer --cpu %s %s --tblout %s %s" % (threads, evalue, output_path, euk_hmm)

        if input_file_format == FORMAT_FASTA:
            cmd = "%s %s 2>&1 > /dev/null" % (nhmmer_cmd, raw_reads)
            self.hk.add_cmd(cmd_log, cmd)
            subprocess.check_call(cmd, shell = True)

        elif input_file_format == FORMAT_FASTQ_GZ:
            cmd = "%s <(awk '{print \">\" substr($0,2);getline;print;getline;getline}' <(zcat %s )) 2>&1 > /dev/null" %  (nhmmer_cmd, raw_reads)
            self.hk.add_cmd(cmd_log, cmd)
            subprocess.check_call(["/bin/bash", "-c", cmd])

        else:
            raise Exception(Messenger().error_message('Suffix on %s not familiar. Please submit an .fq.gz or .fa file\n' % (raw_reads)))

        # Compare the eukaryotic hits with the hits already recorded,
        # ignoring eukaryotic hits no longer than the cutoff.
        euk_reads = self.hmmtable_reader([output_path])
        known_reads = run_stats['reads']
        reads_unique_to_eukaryotes = [name for name in euk_reads if name not in known_reads]
        reads_with_better_euk_hit = [
            name for name in euk_reads
            if name in known_reads
            and euk_reads[name][0]['bit'] >= float(known_reads[name]['bit'])
            and euk_reads[name][0]['len'] > cutoff]

        # Return Euk contamination
        if len(reads_with_better_euk_hit) == 0:
            Messenger().message("No contaminating eukaryotic reads detected in %s" % (os.path.basename(raw_reads)))
        else:
            Messenger().message("Found %s read(s) that may be eukaryotic" % len(reads_with_better_euk_hit + reads_unique_to_eukaryotes))

        # Write a file with the Euk free reads.
        contaminated = set(reads_with_better_euk_hit)
        with open(euk_free_output_path, 'w') as euk_free_output:
            with open(input_path, 'r') as input_handle:
                for record in SeqIO.parse(input_handle, 'fasta'):
                    if record.id not in contaminated:
                        SeqIO.write(record, euk_free_output, "fasta")
        run_stats['euk_uniq'] = len(reads_unique_to_eukaryotes)
        run_stats['euk_contamination'] = len(reads_with_better_euk_hit)
        return run_stats, euk_free_output_path

    def filter_hmmsearch(self, output_hash, contents, args, input_file_format, cmd_log):
        """Run ``orfm | hmmsearch`` per sequence file and return the table titles.

        NOTE(review): as written this method reads ``sequence_file_list``,
        ``suffix``, ``table_title_list`` and ``threads``, none of which are
        parameters or assigned here, so it fails with NameError unless they
        exist as module-level names. ``eval`` is likewise not a parameter of
        this method, and ``self.hmm`` is not set by the ``__init__`` of this
        class. ``output_hash`` and ``args`` are unused. This looks like
        stale code - confirm before relying on it.
        """
        for seq_file in sequence_file_list:
            # The next unused title becomes this file's domtblout path.
            hmmout_table_title = suffix[0]
            table_title_list.append(hmmout_table_title)
            hmmsearch_cmd = " hmmsearch --cpu %s %s -o /dev/null --domtblout %s %s " % (threads, eval, hmmout_table_title, self.hmm)
            # TODO: capture stderr and report if the check_call fails
            if input_file_format == FORMAT_FASTA or input_file_format == FORMAT_FASTQ_GZ:
                # Only the 'P' pipe runs anything; other pipes are skipped silently.
                if contents.pipe == 'P':
                    cmd = 'orfm %s | %s /dev/stdin' % (seq_file, hmmsearch_cmd)
                    self.hk.add_cmd(cmd_log, cmd)
                    subprocess.check_call(["/bin/bash", "-c", cmd])
            else:
                Messenger().message('ERROR: Suffix on %s not recegnised\n' % (seq_file))
                exit(1)
            # Consume the title that was just used.
            del suffix[0]
        return table_title_list

    def csv_to_titles(self, output_path, input_path, run_stats):
        """Parse hmmsearch/nhmmer tables and write the hit read names to a file.

        input_path: list of table paths handed to ``hmmtable_reader``.
        output_path: file that receives one read name per line.
        run_stats: dict updated in place with ``reads`` (the parsed hits) and
            ``rev_true`` (True when any hit has direction '-'; False when
            none has, or when a hit carries no direction at all).

        Names that look like ``<read>_<digits>_<digit>_<digits>`` are
        written as ``<read>`` only; all other names are written unchanged.

        Returns ``(run_stats, output_path)``, or ``(run_stats, False)``
        without writing anything when no hits were found.
        """
        run_stats['reads'] = self.hmmtable_reader(input_path)
        all_hits = [hit for hits in run_stats['reads'].values() for hit in hits]
        count = len(all_hits)

        # See if there are any reads in the reverse direction. Every hit is
        # inspected, so a single hit without a direction gives False.
        try:
            run_stats['rev_true'] = '-' in [hit['direction'] for hit in all_hits]
        except KeyError:
            run_stats['rev_true'] = False

        if count == 0:
            # Nothing was found: report it and stop here.
            Messenger().message('%s reads found, cannot continue with no information' % (len(run_stats['reads'].keys())))
            return run_stats, False
        Messenger().message('%s read(s) found' % (count))

        # And write the read names to output
        orfm_regex = re.compile(r'^(\S+)_(\d+)_(\d)_(\d+)')
        with open(output_path, 'w') as output_file:
            for record in run_stats['reads']:
                regex_match = orfm_regex.match(record)
                if regex_match is None:
                    output_file.write(record + '\n')
                else:
                    output_file.write(regex_match.group(1) + '\n')
        return run_stats, output_path

    def extract_from_raw_reads(self, output_path, input_path, raw_sequences_path, input_file_format, cmd_log, read_stats):
        """Extract the named reads from the raw sequence file into FASTA.

        input_path: file of read names passed to ``fxtract -f``.
        raw_sequences_path: sequence file the reads are extracted from.
        output_path: FASTA file the extracted reads are written to.
        read_stats: dict mapping read name to a list of hit dicts.

        When any read has more than one hit, the hit regions are written as
        separate sequences to a second file. Otherwise each read's hit list
        is replaced by its first hit.

        Returns ``(read_stats, output_path)``, where ``read_stats`` maps
        each (possibly renamed) read to a single hit dict and
        ``output_path`` is the file that finally holds the sequences.
        Raises Exception for an unsupported ``input_file_format``.
        """
        # Use the readnames specified to extract from the original sequence
        # file to a fasta formatted file.        
        def removeOverlaps(item):
            # Drop hits whose alignment ranges overlap. ``item`` is modified
            # in place, so the caller's list changes too.
            for a, b in itertools.combinations(item, 2):
                fromto_a=[int(a['alifrom']),int(a['alito'])]
                fromto_b=[int(b['alifrom']),int(b['alito'])]
                range_a=range(min(fromto_a), max(fromto_a))
                range_b=range(min(fromto_b), max(fromto_b))
                intersect_length=len(set(range_a).intersection(set(range_b)))
                if intersect_length > 0:
                    # NOTE(review): this compares the two range() results
                    # directly rather than their lengths; presumably the
                    # intent is to keep the longer hit - confirm.
                    # NOTE(review): ``b`` is removed without checking it is
                    # still in ``item``, unlike ``a`` below.
                    if range_a > range_b:
                        item.remove(b)
                    elif a in item:
                        item.remove(a)
                else:
                    continue
            return item
                    
        def extractMultipleHits(reads_path, stats):
            # Extra function that reads in hits and splits out the regions 
            # (usually in a contig) that hit the HMM as a distinct match.
            reads=SeqIO.to_dict(SeqIO.parse(reads_path, "fasta"))
            new_stats={}
            out_reads={}
            for key,item in stats.iteritems():
                item=removeOverlaps(item)

                if len(item)>1:
                    # Several distinct hits: emit each region as <read>_<n>.
                    counter=0
                    for entry in item:
                        # NOTE(review): the slice [f:t] stops before index
                        # alito-1 - confirm the last aligned position is
                        # meant to be left out.
                        f=int(entry['alifrom'])-1
                        t=int(entry['alito'])-1
                        read_rename=key + '_%s' % str(counter)
                        out_reads[read_rename]=str(reads[key].seq)[f:t]
                        new_stats[read_rename]=entry
                        counter+=1
                else:
                    # A single hit keeps the whole sequence under its own name.
                    out_reads[key]=str(reads[key].seq)
                    new_stats[key]=item[0]
            # The last three characters of the path are replaced; presumably
            # they are the '.fa' extension.
            out_path = reads_path[:-3]+'_split.fa'
            with open(out_path, 'w') as out:
                for key,item in out_reads.iteritems():
                    out.write(">%s\n" % (str(key)))
                    out.write("%s\n" % (str(item)))
                    
            return new_stats, out_path
        # Run fxtract to obtain reads from the original sequence file
        fxtract_cmd = "fxtract -H -X -f %s " % input_path
        if input_file_format == FORMAT_FASTA:
            cmd = "%s %s > %s" % (fxtract_cmd, raw_sequences_path, output_path)
            self.hk.add_cmd(cmd_log, cmd)
            subprocess.check_call(cmd, shell=True)
        elif input_file_format == FORMAT_FASTQ_GZ:
            # Gzipped FASTQ: fxtract gets -z and awk rewrites each
            # four-line record as a two-line FASTA record.
            cmd = "%s -z %s | awk '{print \">\" substr($0,2);getline;print;getline;getline}' > %s" % (fxtract_cmd, raw_sequences_path, output_path)
            self.hk.add_cmd(cmd_log, cmd)
            subprocess.check_call(cmd, shell=True)
        else:
            raise Exception("Programming error")
        # Check if there are reads that need splitting
        if any([x for x in read_stats if len(read_stats[x])>1]):
            read_stats, output_path=extractMultipleHits(output_path, read_stats) 
        else:
            # Every read has exactly one hit: unwrap the one-element lists.
            new_stats={}
            for key, item in read_stats.iteritems():
                new_stats[key]=item[0]
            read_stats=new_stats
        return read_stats, output_path

    def check_read_length(self, reads, pipe):
        """Return the mean sequence length of the FASTA file ``reads``.

        pipe: "D" returns the mean length as is; "P" returns the mean
            length divided by three. Any other value returns None.

        Raises ZeroDivisionError when ``pipe`` is "P" or "D" and the file
        contains no records.
        """
        with open(reads, 'r') as reads_handle:
            lengths = [len(record.seq) for record in SeqIO.parse(reads_handle, 'fasta')]
        if pipe == "P":
            return (sum(lengths) / float(len(lengths))) / 3
        elif pipe == "D":
            return sum(lengths) / float(len(lengths))
        return None
        
    def alignment_correcter(self, alignment_file_list, output_file_name):
        """Remove insert columns from alignments and write them to one file.

        For each alignment file, every position at which any sequence has a
        lower-case character is deleted from all sequences of that file.
        The corrected sequences of all files are then written to
        ``output_file_name`` in FASTA format. Sequences are keyed by id, so
        an id occurring more than once is written once, with the sequence
        processed last.

        Raises IndexError if a sequence is too short to contain a position
        that has to be removed.
        """
        corrected_sequences = {}
        for alignment_file in alignment_file_list:
            with open(alignment_file, 'r') as alignment_handle:
                sequence_list = list(SeqIO.parse(alignment_handle, 'fasta'))

            # Positions holding a lower-case character in any sequence.
            insert_positions = set()
            for sequence in sequence_list:
                for idx, nt in enumerate(sequence.seq):
                    if nt.islower():
                        insert_positions.add(idx)

            # Delete from the highest position down so that the remaining
            # indices stay valid.
            descending_positions = sorted(insert_positions, reverse=True)
            for sequence in sequence_list:
                new_seq = list(sequence.seq)
                for position in descending_positions:
                    del new_seq[position]
                corrected_sequences['>'+sequence.id+'\n'] = ''.join(new_seq)+'\n'

        with open(output_file_name, 'w') as output_file:
            for fasta_id, fasta_seq in corrected_sequences.items():
                output_file.write(fasta_id)
                output_file.write(fasta_seq)
      
    def orfm_command_line(self, min_orf_length, restrict_read_length):
        '''Return the OrfM command string for the given minimum ORF length.

        A " -l <restrict_read_length>" option is included only when
        restrict_read_length is truthy. The string ends with a space so
        further arguments can be appended directly.'''
        length_option = (" -l %d" % restrict_read_length) if restrict_read_length else ''
        return 'orfm -m %d %s ' % (min_orf_length, length_option)

    def extract_orfs(self, input_path, raw_orf_path, hmmsearch_out_path, orf_titles_path, min_orf_length, restrict_read_length, orf_out_path, cmd_log):
        '''Extract only the ORFs that hit the search HMM(s).

        OrfM is run on input_path and its output stored in raw_orf_path.
        Those ORFs are then piped through hmmsearch once per HMM in
        self.search_hmm, the first column of every non-comment line of the
        resulting domtblout tables is written (each name once, in first-seen
        order) to orf_titles_path, and fxtract uses that file to copy the
        matching ORFs from raw_orf_path into orf_out_path.

        Every shell command is logged to cmd_log before it is run.
        Returns orf_out_path.
        '''
        # Build the hmmsearch part of the command
        output_table_list = []
        tee = ' | tee'
        hmm_number = len(self.search_hmm)
        if hmm_number > 1:
            out_dir, out_name = os.path.split(hmmsearch_out_path)
            for idx, hmm in enumerate(self.search_hmm):
                # One table per HMM, prefixed with the HMM file's base name
                out = os.path.join(out_dir, os.path.basename(hmm).split('.')[0] +'_'+ out_name)
                output_table_list.append(out)
                if idx + 1 == hmm_number:
                    # The last HMM reads tee's stdout directly
                    tee += " | hmmsearch --domtblout %s %s - >/dev/null " % (out, hmm)
                else:
                    # All earlier HMMs are fed through >(...) process
                    # substitution, which is why the command runs under bash
                    tee += " >(hmmsearch --domtblout %s %s - >/dev/null) " % (out, hmm)
        elif hmm_number == 1:
            tee = ' | hmmsearch --domtblout %s %s - >/dev/null' % (hmmsearch_out_path, self.search_hmm[0])
            output_table_list.append(hmmsearch_out_path)
        # NOTE(review): with no search HMMs the command below degenerates to
        # "cat <orfs> | tee" and no table is read -- confirm callers never
        # pass an empty HMM list.

        # Call orfs on the sequences
        orfm_cmd = self.orfm_command_line(min_orf_length, restrict_read_length)
        cmd = '%s %s > %s' % (orfm_cmd, input_path, raw_orf_path)
        self.hk.add_cmd(cmd_log, cmd)
        subprocess.check_call(cmd, shell=True)

        cmd = 'cat %s %s' % (raw_orf_path, tee)
        self.hk.add_cmd(cmd_log, cmd)
        subprocess.check_call(['bash','-c',cmd])

        # Write each hit name once; a set keeps the membership test O(1) and
        # every table handle is closed as soon as it has been read.
        with open(orf_titles_path, 'w') as output:
            seen = set()
            for table in output_table_list:
                with open(table) as table_handle:
                    for line in table_handle:
                        if line.startswith('#'):
                            continue
                        title = line.split(' ')[0]
                        if title not in seen:
                            seen.add(title)
                            output.write(str(title) + '\n')

        # Extract the reads using the titles.
        cmd = 'fxtract -H -X -f %s %s > %s' % (orf_titles_path, raw_orf_path, orf_out_path)
        self.hk.add_cmd(cmd_log, cmd)
        subprocess.check_call(cmd, shell=True)

        # Return name of output file
        return orf_out_path

    def p_search(self, files, args, run_stats, base, input_file_format, raw_reads):
        '''Search step of the protein pipeline: receives reads, returns hits.

        The raw reads are searched with hmmsearch, the names of the hits are
        taken from the output table and the hits are extracted from
        raw_reads. For nucleotide input the ORFs of those reads that hit the
        search are extracted as well; for protein input the reads themselves
        are used.

        Returns (hit sequence file, run_stats), or (False, run_stats) when
        no hit read names were found.
        '''
        # Start search timer
        search_began = timeit.default_timer()

        # Search the raw reads with the HMM
        search_table = self.hmmsearch(
            files.hmmsearch_output_path(base),
            raw_reads,
            input_file_format,
            args.input_sequence_type,
            args.threads,
            args.eval,
            args.min_orf_length,
            args.restrict_read_length,
            files.command_log_path(),
        )

        # Turn the output table into the read names of the hits
        run_stats, hit_names = self.csv_to_titles(
            files.readnames_output_path(base),
            search_table,
            run_stats,
        )
        if not hit_names:
            return False, run_stats

        # Pull the hits out of the original raw read file
        run_stats['reads'], matched_reads = self.extract_from_raw_reads(
            files.fa_output_path(base),
            hit_names,
            raw_reads,
            input_file_format,
            files.command_log_path(),
            run_stats['reads'],
        )

        sequence_type = args.input_sequence_type
        if sequence_type == 'protein':
            hit_orfs = matched_reads
        elif sequence_type == 'nucleotide':
            # Keep only the ORFs of these reads that hit the original search
            hit_orfs = self.extract_orfs(
                matched_reads,
                files.orf_output_path(base),
                files.orf_hmmsearch_output_path(base),
                files.orf_titles_output_path(base),
                args.min_orf_length,
                args.restrict_read_length,
                files.orf_fasta_output_path(base),
                files.command_log_path(),
            )
        else:
            raise Exception('Programming Error')

        # Average read length of the hits
        run_stats['read_length'] = self.check_read_length(hit_orfs, "P")

        # Stop and log the search timer
        search_ended = timeit.default_timer()
        run_stats['search_t'] = str(int(round((search_ended - search_began), 0)))

        # The eukaryote entries are filled with placeholders here
        for summary_key in ('euk_contamination', 'euk_uniq', 'euk_check_t'):
            run_stats[summary_key] = 'N/A'

        # Hit reads and summary hash
        return hit_orfs, run_stats

    def d_search(self, files, args, run_stats, base, input_file_format, raw_reads, euk_check):
        '''Search step of the nucleotide pipeline: receives reads, returns hits.

        The reads are searched with nhmmer, the names of the hits are taken
        from the output table and the hits are extracted from raw_reads.
        When euk_check is truthy the hits are additionally passed through
        check_euk_contamination. Search and euk-check durations are stored
        in run_stats; the euk-check time is recorded even when the check is
        skipped.

        Returns (hit reads, run_stats), or (False, run_stats) when no hit
        read names were found.
        '''
        # Start search timer
        search_began = timeit.default_timer()

        # Search the reads using the HMM
        nhmmer_table = self.nhmmer(
            files.hmmsearch_output_path(base),
            raw_reads,
            input_file_format,
            args.threads,
            args.eval,
            files.command_log_path(),
        )

        # Get the list of read names that hit
        run_stats, hit_names = self.csv_to_titles(
            files.readnames_output_path(base),
            nhmmer_table,
            run_stats,
        )
        if not hit_names:
            return False, run_stats

        # Extract those reads from the original sequence file
        run_stats['reads'], hit_reads = self.extract_from_raw_reads(
            files.fa_output_path(base),
            hit_names,
            raw_reads,
            input_file_format,
            files.command_log_path(),
            run_stats['reads'],
        )

        # Read length of the hits
        run_stats['read_length'] = self.check_read_length(hit_reads, "D")

        # Close the search timer and open the one for the euk check
        search_ended = timeit.default_timer()
        run_stats['search_t'] = str(int(round((search_ended - search_began), 0)))
        euk_began = timeit.default_timer()

        if euk_check:
            Messenger().message("Checking for Eukaryotic contamination")
            run_stats, hit_reads = self.check_euk_contamination(
                files.euk_contam_path(base),
                files.euk_free_path(base),
                hit_reads,
                run_stats,
                input_file_format,
                args.threads,
                args.eval,
                raw_reads,
                base,
                files.command_log_path(),
                args.euk_hmm_file,
            )

        # Close the eukaryote check timer
        euk_ended = timeit.default_timer()
        run_stats['euk_check_t'] = str(int(round((euk_ended - euk_began), 0)))

        return hit_reads, run_stats

    def align(self, files, args, run_stats, base, reads):
        '''Align unaligned reads against a hmm, regardless of direction.

        hmmalign is run for the forward and reverse complement reads, then
        base insertions are removed by alignment_correcter. The reverse
        output is only corrected when run_stats['rev_true'] is truthy. The
        elapsed time is stored in run_stats['aln_t'].

        Returns (path of the aligned fasta file, run_stats).
        '''
        began = timeit.default_timer()

        # HMMalign the forward reads, and reverse complement reads.
        self.hmmalign(
            reads,
            run_stats,
            files.command_log_path(),
            files.output_for_path(base),
            files.output_rev_path(base),
            files.sto_for_output_path(base),
            files.sto_rev_output_path(base),
            files.conv_output_for_path(base),
            files.conv_output_rev_path(base),
        )

        # Correct the alignment for base insertions; the forward file is
        # always included, the reverse one only on request.
        include_reverse = run_stats['rev_true']
        files_to_correct = [files.conv_output_for_path(base)]
        if include_reverse:
            files_to_correct.append(files.conv_output_rev_path(base))
        self.alignment_correcter(files_to_correct,
                                 files.aligned_fasta_output_path(base))

        ended = timeit.default_timer()
        run_stats['aln_t'] = str(int(round((ended - began), 0)))

        return files.aligned_fasta_output_path(base), run_stats
예제 #8
0
파일: run.py 프로젝트: wwood/graftM
class Run:
    ### Functions that make up pipelines in GraftM

    def __init__(self, args):
        '''Store the parsed arguments and set up the helper objects.'''
        self.args = args
        # self.args and args are the same object at this point
        self.setattributes(args)

    def setattributes(self, args):
        '''Create the helper objects the pipelines rely on.

        The Krona, housekeeping, summary, TaxoGroup and Extract helpers are
        always created. For the 'graft' subcommand the Hmmer wrapper, the
        checked sequence pairs / input format and, when a reference package
        attribute is present, the Pplacer wrapper are set up as well.
        '''
        self.kb = KronaBuilder()
        self.hk = HouseKeeping()
        self.s = Stats_And_Summary()
        self.tg = TaxoGroup()
        self.e = Extract()

        # Everything below is only needed by the graft subcommand
        if args.subparser_name != 'graft':
            return

        self.hk.set_attributes(self.args)
        self.h = Hmmer(self.args.search_hmm_files, self.args.aln_hmm_file)
        checked = self.hk.parameter_checks(args)
        self.sequence_pair_list, self.input_file_format = checked
        if hasattr(args, 'reference_package'):
            self.p = Pplacer(self.args.reference_package)

    def protein_pipeline(self, base, summary_dict, sequence_file, direction):
        '''The main pipeline for GraftM searching for protein sequence.

        Searches sequence_file with self.h.p_search and aligns the hits with
        self.h.align. The run statistics are read from and stored back into
        summary_dict[base][direction] when direction is truthy, otherwise
        summary_dict[base].

        Returns (summary_dict, path of the aligned hits), or
        (summary_dict, False) when the search found nothing; in that case
        the stats entry in summary_dict is not reassigned.
        '''
        # Pick the run statistics hash for this file. "if / else" covers
        # every value of direction, so no further fallback branch is needed.
        if direction:
            run_stats = summary_dict[base][direction]
        else:
            run_stats = summary_dict[base]

        # Tell user what is being searched with what
        Messenger().message('Searching %s' % (os.path.basename(sequence_file)))
        # Search for reads using hmmsearch
        hit_reads, run_stats = self.h.p_search(self.gmf,
                                               self.args,
                                               run_stats,
                                               base,
                                               self.input_file_format,
                                               sequence_file)
        if not hit_reads:
            return summary_dict, False

        # Align the reads.
        Messenger().message('Aligning reads to reference package database')
        hit_aligned_reads, run_stats = self.h.align(self.gmf,
                                                    self.args,
                                                    run_stats,
                                                    base,
                                                    hit_reads)

        # Set these parameters as N/A because they don't apply to the
        # protein pipeline
        run_stats['n_contamin_euks'] = 'N/A'
        run_stats['n_uniq_euks'] = 'N/A'
        run_stats['euk_check_t'] = 'N/A'

        # Log the statistics back where they came from
        if direction:
            summary_dict[base][direction] = run_stats
        else:
            summary_dict[base] = run_stats

        return summary_dict, hit_aligned_reads

    def dna_pipeline(self, base, summary_dict, sequence_file, direction):
        '''The main pipeline for GraftM searching for DNA sequence.

        Searches sequence_file with self.h.d_search and aligns the hits with
        self.h.align. The run statistics are read from and stored back into
        summary_dict[base][direction] when direction is truthy, otherwise
        summary_dict[base].

        Returns (summary_dict, path of the aligned hits), or
        (summary_dict, False) when the search found nothing; in that case
        the stats entry in summary_dict is not reassigned.
        '''
        # Pick the run statistics hash for this file. "if / else" covers
        # every value of direction, so no further fallback branch is needed.
        if direction:
            run_stats = summary_dict[base][direction]
        else:
            run_stats = summary_dict[base]

        # Search for reads using nhmmer
        Messenger().message('Searching %s' % os.path.basename(sequence_file))
        hit_reads, run_stats = self.h.d_search(self.gmf,
                                               self.args,
                                               run_stats,
                                               base,
                                               self.input_file_format,
                                               sequence_file)

        if not hit_reads:
            return summary_dict, False

        # Otherwise, run through the alignment
        Messenger().message('Aligning reads to reference package database')
        hit_aligned_reads, run_stats = self.h.align(self.gmf,
                                                    self.args,
                                                    run_stats,
                                                    base,
                                                    hit_reads)

        # Log the statistics back where they came from
        if direction:
            summary_dict[base][direction] = run_stats
        else:
            summary_dict[base] = run_stats
        return summary_dict, hit_aligned_reads

    def placement(self, summary_dict):
        '''Place aligned reads into phylogenetic trees and summarise them.

        summary_dict is handed to self.p.place. Then, for every base in
        summary_dict['base_list'], a summary (OTU) table and a coverage
        table are written; one krona plot is built over all summary tables;
        timings and the basic statistics are recorded; and finally the
        intermediate files of every base are deleted. If reverse reads are
        used, the placements summarised are the ones stored under the
        base's 'comparison_hash', and the read length is the mean of the
        forward and reverse read lengths.
        '''
        # Concatenate alignment files, place in tree, split output guppy
        # and .jplace file for the output
        summary_dict = self.p.place(summary_dict,
                                    self.gmf,
                                    self.args)
        # Summary steps.
        start = timeit.default_timer()
        otu_tables = []
        for base in summary_dict['base_list']:
            # First assign the hash that contains all of the trusted
            # placements to a variable so it can be passed to otu_builder,
            # to be written to a file.
            if summary_dict['reverse_pipe']:
                placements = summary_dict[base]['comparison_hash']['trusted_placements']
                summary_dict[base]['read_length'] = (summary_dict[base]['forward']['read_length'] + summary_dict[base]['reverse']['read_length'])/2
            else:
                placements = summary_dict[base]['trusted_placements']

            self.gmf = GraftMFiles(base, self.args.output_directory, False) # Assign the output directory to place output in
            Messenger().message('Building summary table for %s' % base)
            self.s.otu_builder(placements,
                               self.gmf.summary_table_output_path(base),
                               base)
            otu_tables.append(self.gmf.summary_table_output_path(base))

            # Generate coverage table
            Messenger().message('Building coverage table for %s' % base)
            self.s.coverage_of_hmm(self.args.aln_hmm_file,
                                   self.gmf.summary_table_output_path(base),
                                   self.gmf.coverage_table_path(base),
                                   summary_dict[base]['read_length'])

        Messenger().message('Building summary krona plot')
        self.kb.otuTablePathListToKrona(otu_tables,
                                        self.gmf.krona_output_path(),
                                        self.gmf.command_log_path())
        stop = timeit.default_timer()
        summary_dict['summary_t'] = str(int(round((stop - start), 0)) )

        # Compile basic run statistics if they are wanted
        summary_dict['stop_all'] = timeit.default_timer()
        summary_dict['all_t'] = str(int(round((summary_dict['stop_all'] - summary_dict['start_all']), 0)) )
        self.s.build_basic_statistics(summary_dict, self.gmf.basic_stats_path(), self.args.type)

        # Delete unnecessary files
        Messenger().message('Cleaning up')
        for base in summary_dict['base_list']:
            # With reverse reads the intermediates are removed once per
            # direction; otherwise once with direction False.
            if summary_dict['reverse_pipe']:
                directions = ['forward', 'reverse']
            else:
                directions = [False]
            for direction in directions:
                self.gmf = GraftMFiles(base, self.args.output_directory, direction)
                self.hk.delete([self.gmf.for_aln_path(base),
                                self.gmf.rev_aln_path(base),
                                self.gmf.sto_for_output_path(base),
                                self.gmf.sto_rev_output_path(base),
                                self.gmf.conv_output_rev_path(base),
                                self.gmf.conv_output_for_path(base),
                                self.gmf.euk_free_path(base),
                                self.gmf.euk_contam_path(base),
                                self.gmf.readnames_output_path(base),
                                self.gmf.sto_output_path(base),
                                self.gmf.orf_titles_output_path(base),
                                self.gmf.orf_hmmsearch_output_path(base),
                                self.gmf.hmmsearch_output_path(base),
                                self.gmf.orf_output_path(base),
                                self.gmf.comb_aln_fa()])

        Messenger().message('Done, thanks for using graftM!\n')

    def graft(self):
        '''Run the Graft pipeline.

        Searches for reads using hmmer, and places them in phylogenetic
        trees to derive a community structure. Every entry of
        self.sequence_pair_list is processed file by file through the
        protein ('P') or DNA ('D') pipeline, chosen from the alignment HMM.
        Unless search only was specified, the hits are then placed into the
        tree via self.placement.

        Raises Exception when the pipeline type is neither 'P' nor 'D', and
        SystemExit(0) after the search stage when self.args.search_only is
        set.
        '''
        # A parenthesised single string prints identically with the Python 2
        # print statement and the Python 3 print function.
        print('''
                                GRAFT
        
                       Joel Boyd, Ben Woodcroft
                                                         __/__
                                                  ______|
          _- - _                         ________|      |_____/
           - -            -             |        |____/_
           - _     --->  -   --->   ____|
          - _-  -         -             |      ______
             - _                        |_____|
           -                                  |______
            ''')
        # Set up a dictionary that will record stats as the pipeline is running
        summary_table = {'euks_checked': self.args.check_total_euks,
                         'base_list': [],
                         'seqs_list': [],
                         'start_all': timeit.default_timer(),
                         'reverse_pipe': False}

        # Set the output directory if not specified and create that directory
        if not hasattr(self.args, 'output_directory'):
            self.args.output_directory = "GraftM_proc"
        self.hk.make_working_directory(self.args.output_directory,
                                       self.args.force)

        # For each pair (or single file passed to GraftM)
        for pair in self.sequence_pair_list:

            # Set the basename, and make an entry to the summary table.
            base = os.path.basename(pair[0]).split('.')[0]

            # Set reverse pipe if the arguments carry a 'reverse' attribute
            if hasattr(self.args, 'reverse'):
                summary_table['reverse_pipe'] = True
                summary_table[base] = {'reverse':{}, 'forward':{}}
                pair_direction = ['forward', 'reverse']
            else:
                summary_table[base] = {}

            # Set pipeline and evalue by checking HMM format
            hmm_type, hmm_tc = self.hk.setpipe(self.args.aln_hmm_file)
            setattr(self.args, 'type', hmm_type)
            if hmm_tc:
                setattr(self.args, 'eval', '--cut_tc')

            # Guess the sequence file type, if not already specified to GraftM
            if not hasattr(self.args, 'input_sequence_type'):
                setattr(self.args, 'input_sequence_type',
                        self.hk.guess_sequence_type(pair[0],
                                                    self.input_file_format))
            # Make the working base directory
            self.hk.make_working_directory(os.path.join(self.args.output_directory,
                                                        base),
                                           self.args.force)

            # tell the user which file/s is being processed
            Messenger().header("Working on %s" % base)

            # for each of the paired end read files
            for read_file in pair:
                # Set the output file_name
                if summary_table['reverse_pipe']:
                    direction = pair_direction.pop(0)
                    Messenger().header("Working on %s reads" % direction)
                    self.gmf = GraftMFiles(base,
                                           self.args.output_directory,
                                           direction)
                    self.hk.make_working_directory(os.path.join(self.args.output_directory,
                                                                base,
                                                                direction),
                                                   self.args.force)
                else:
                    direction = False
                    self.gmf = GraftMFiles(base,
                                           self.args.output_directory,
                                           direction)

                # Run the protein pipeline
                if self.args.type == 'P':
                    summary_table, hit_aligned_reads = self.protein_pipeline(base,
                                                                            summary_table,
                                                                            read_file,
                                                                            direction)
                # Or the DNA pipeline
                elif self.args.type == 'D':
                    self.hk.set_euk_hmm(self.args)
                    summary_table, hit_aligned_reads = self.dna_pipeline(base,
                                                                        summary_table,
                                                                        read_file,
                                                                        direction)
                else:
                    # Without this branch hit_aligned_reads would be unbound
                    # (or left over from the previous file) below.
                    raise Exception('Programming Error: Unknown pipeline type %s' % self.args.type)

                if not hit_aligned_reads:
                    continue

                # Add the run stats and the completed run to the summary table
                summary_table['seqs_list'].append(hit_aligned_reads)
                if base not in summary_table['base_list']:
                    summary_table['base_list'].append(base)

        # Leave the pipeline if search only was specified
        if self.args.search_only:
            Messenger().header('Stopping before placement\n')
            # Raise directly instead of relying on the exit() helper that
            # the site module installs for interactive use.
            raise SystemExit(0)
        # Tell the user we're on to placing the sequences into the tree.
        self.gmf = GraftMFiles('',
                               self.args.output_directory,
                               False)
        Messenger().header("Placing reads into phylogenetic tree")
        self.placement(summary_table)


    def manage(self):
        print '''
                            MANAGE

                   Joel Boyd, Ben Woodcroft

'''

        if self.args.seq:
            self.e.extract(self.args)

    def assemble(self):
        print '''
                           ASSEMBLE

                   Joel Boyd, Ben Woodcroft


          _- - _               ___            __/
           -                  /___\____      /\/
           - _     --->   ___/       \_\     \/
          - _-           /_/            \    /
             - _        /                \__/
                       /
'''
        self.tg.main(self.args)

    def main(self):
        '''Run the pipeline selected by self.args.subparser_name.

        'graft', 'assemble' and 'manage' are recognised; any other value
        does nothing.
        '''
        pipelines = {
            'graft': self.graft,
            'assemble': self.assemble,
            'manage': self.manage,
        }
        selected = pipelines.get(self.args.subparser_name)
        if selected is not None:
            selected()
예제 #9
0
 def __init__(self, refpkg):
     '''Create a HouseKeeping helper and remember the reference package.'''
     self.hk = HouseKeeping()
     self.refpkg = refpkg
예제 #10
0
class Pplacer:
    ### Contains functions related to processing alignment files into jplace
    ### files and running comparisons between forward and reverse reads if
    ### reverse reads are provided.

    def __init__(self, refpkg):
        '''
        Parameters
        ----------
        refpkg : str
            reference package handed to pplacer through its -c option
            (presumably a path; confirm against the caller)
        '''
        self.refpkg = refpkg
        # Helper used by place() to delete intermediate files.
        self.hk = HouseKeeping()

    # Run pplacer
    def pplacer(self, output_file, output_path, input_path, threads):
        '''
        Run pplacer on the concatenated alignment file.

        Parameters
        ----------
        output_file : str
            accepted but not used by this method
        output_path : str
            directory given to pplacer as --out-dir
        input_path : str
            alignment file to be placed
        threads : int
            value given to pplacer as -j

        Returns
        -------
        input_path with everything from its last '.' onwards replaced by
        '.jplace'. The result is built from input_path, not from output_path.
        '''
        command_fields = (str(threads), output_path, self.refpkg, input_path)
        extern.run(
            "pplacer -j %s --verbosity 0 --out-dir %s -c %s %s" %
            command_fields)
        # Text before the final '.'; empty when input_path contains no '.'.
        stem = input_path.rpartition('.')[0]
        return stem + '.jplace'

    def alignment_merger(self, alignment_files, output_alignment_path):
        '''
        Concatenate aligned read files into one fasta file, suffixing each
        record id with '_<index>', where index is the position of the source
        file in alignment_files.

        Parameters
        ----------
        alignment_files : list
            paths to the fasta alignment files to concatenate
        output_alignment_path : str
            path the combined alignment is written to

        Returns
        -------
        dict keyed by the file index as a string. Each value is a dict whose
        'output_path' entry is 'placements.jplace' inside the directory of
        the corresponding alignment file.
        '''
        alias_hash = {}
        with open(output_alignment_path, 'w') as output:
            for file_number, alignment_file in enumerate(alignment_files):
                alias = str(file_number)  # unique identifier of this file
                # Read inside a context manager so the input handle is
                # closed as soon as its records have been loaded.
                with open(alignment_file, 'r') as alignment_io:
                    alignments = list(SeqIO.parse(alignment_io, 'fasta'))
                for record in alignments:
                    # Tag the read with the file it originated from.
                    record.id = record.id + '_' + alias
                SeqIO.write(alignments, output, "fasta")
                alias_hash[alias] = {
                    'output_path':
                    os.path.join(os.path.dirname(alignment_file),
                                 'placements.jplace')
                }
        return alias_hash

    def convert_cluster_dict_keys_to_aliases(self, cluster_dict, alias_hash):
        '''
        Re-key cluster_dict by file alias index.

        A cluster_dict key is matched to an alias by comparing the directory
        part of the key with the directory part of the alias' 'output_path'.

        Parameters
        ----------
        cluster_dict : dict
            pre-placement clustering information, keyed by a file path
        alias_hash : dict
            maps each file alias index to a dict that has an 'output_path'
            entry

        Returns
        -------
        dict with the values of cluster_dict, keyed by alias index

        Raises
        ------
        KeyError
            if the directory of a cluster_dict key matches no alias
        '''
        # items() rather than iteritems(): works on Python 2 and Python 3.
        directory_to_index_dict = {
            os.path.dirname(item["output_path"]): key
            for key, item in alias_hash.items()
        }
        return {
            directory_to_index_dict[os.path.dirname(cluster_path)]: clusters
            for cluster_path, clusters in cluster_dict.items()
        }

    def jplace_split(self, original_jplace, cluster_dict):
        '''
        To make GraftM more efficient, reads are dereplicated and merged into
        one file prior to placement using pplacer. This function separates
        the placements of that single jplace file per input file and
        backfills the clustered reads, so every read of a cluster receives
        the placement of its representative.

        Parameters
        ----------
        original_jplace : dict (json)
            parsed .jplace file from the pplacer step. Each placement needs a
            'p' entry and either an 'nm' entry (list of [name, value] pairs)
            or an 'n' entry (a name or list of names, each given the value 1).
        cluster_dict : dict
            cluster_dict[alias][read_name] is the collection of reads
            represented by read_name in the file with that alias; each read
            must have a ``name`` attribute.

        Returns
        -------
        dict keyed by file alias. Each value is a list of placement dicts
        ({'p': ..., 'nm': [[read name, value], ...]}) for that file.

        Raises
        ------
        Exception
            if a placement has neither an 'nm' nor an 'n' entry
        '''
        output_hash = {}

        for placement in original_jplace['placements']:
            p = placement['p']
            if 'nm' in placement:
                nm = placement['nm']
            elif 'n' in placement:
                # 'n' holds bare names, not [name, value] pairs, so it is
                # normalised here with a value of 1 per name.
                names = placement['n']
                if not isinstance(names, (list, tuple)):
                    names = [names]
                nm = [[name, 1] for name in names]
            else:
                raise Exception(
                    "Unexpected jplace format: Either 'nm' or 'n' are expected as keys in placement jplace .JSON file"
                )

            nm_dict = {}
            for placement_read_name, plval in nm:
                # The text after the last '_' is the alias index of the input
                # file the read came from; the rest is the read name.
                read_name, _, read_alias_idx = \
                    placement_read_name.rpartition('_')
                read_cluster = cluster_dict[read_alias_idx][read_name]
                # Accumulate the re-replicated reads. The original extended
                # the list with the raw [name, value] entry when an alias
                # was seen a second time, which flattened and corrupted it.
                nm_dict.setdefault(read_alias_idx, []).extend(
                    [read.name, plval] for read in read_cluster)

            for alias_idx, nm_list in nm_dict.items():
                output_hash.setdefault(alias_idx, []).append({
                    'p': p,
                    'nm': nm_list
                })
        return output_hash

    def write_jplace(self, original_jplace, alias_hash):
        '''
        Write one jplace file per file alias.

        Parameters
        ----------
        original_jplace : dict (json)
            parsed combined .jplace file; its 'fields', 'version', 'tree' and
            'metadata' entries are copied into every output file
        alias_hash : dict
            maps each alias to a dict with the 'output_path' to write to and,
            optionally, the 'place' list of placements for that file
        '''
        for alias_entry in alias_hash.values():
            output = {
                'fields': original_jplace['fields'],
                'version': original_jplace['version'],
                'tree': original_jplace['tree'],
                # An input file that received no placements has no 'place'
                # entry; write an empty placement list for it instead of
                # failing with a KeyError part-way through the outputs.
                'placements': alias_entry.get('place', []),
                'metadata': original_jplace['metadata']
            }
            with open(alias_entry['output_path'], 'w') as output_io:
                json.dump(output,
                          output_io,
                          ensure_ascii=False,
                          indent=3,
                          separators=(',', ': '))

    @T.timeit
    def place(self, reverse_pipe, seqs_list, resolve_placements, files, args,
              slash_endings, tax_descr, clusterer):
        '''
        placement - This is the placement pipeline in GraftM, in which aligned
                    reads are placed into phylogenetic trees, and the results
                    interpreted. If reverse reads are used, this is where the
                    comparisons are made between placements, for the summary
                    tables to be built in the next stage.

        Parameters
        ----------
        reverse_pipe : bool
            True: reverse reads are placed separately
            False: no reverse reads to place.
        seqs_list : list
            list of paths to alignment fastas to be placed into the tree.
            NOTE: when reverse_pipe is True the reverse entries are popped
            from this list in place.
        resolve_placements : bool
            True: resolve placements to their most trusted taxonomy
            False: classify reads to their most trusted taxonomy, until the
                   confidence cutoff is reached.
        files : obj
            graftM output file name object (comb_aln_fa() and
            jplace_output_path() are used)
        args : obj
            argparse object (output_directory, threads and placements_cutoff
            are read)
        slash_endings : obj
            forwarded unchanged to Compare().compare_placements
        tax_descr : obj
            forwarded unchanged to Classify
        clusterer : obj
            its seq_library holds the pre-placement clustering information

        Returns
        -------
        trusted_placements : dict
            dictionary of reads and their trusted placements, keyed by the
            base name of each input file. (The method is wrapped by T.timeit;
            what that decorator adds to the return value is not visible here.)
        '''
        trusted_placements = {}
        files_to_delete = []
        # Merge the alignments so they can all be placed at once.
        alias_hash = self.alignment_merger(seqs_list, files.comb_aln_fa())
        files_to_delete += seqs_list
        files_to_delete.append(files.comb_aln_fa())

        # Run pplacer on merged file
        jplace = self.pplacer(files.jplace_output_path(),
                              args.output_directory, files.comb_aln_fa(),
                              args.threads)
        files_to_delete.append(jplace)
        logging.info("Placements finished")

        # Read the json of refpkg
        logging.info("Reading classifications")
        classifications = Classify(tax_descr).assignPlacement(
            jplace, args.placements_cutoff, resolve_placements)
        logging.info("Reads classified")

        # If the reverse pipe has been specified, run the comparisons between
        # the two pipelines. If not then just collect the placements.
        for idx, alignment_path in enumerate(seqs_list):

            if reverse_pipe:
                base_file = os.path.basename(alignment_path).replace(
                    '_forward_hits.aln.fa', '')
                # The keys are file alias indices stored as strings (the
                # single-end branch below looks them up with str(idx)), so
                # they must be ordered numerically: a plain string sort puts
                # '10' before '2' and mispairs forward and reverse files once
                # there are ten or more of them.
                forward_gup = classifications.pop(
                    sorted(classifications.keys(), key=int)[0])
                reverse_gup = classifications.pop(
                    sorted(classifications.keys(), key=int)[0])
                # Drop the reverse file so the next iteration lands on the
                # next forward file.
                seqs_list.pop(idx + 1)
                placements_hash = Compare().compare_placements(
                    forward_gup, reverse_gup, args.placements_cutoff,
                    slash_endings, base_file)
                trusted_placements[base_file] = placements_hash[
                    'trusted_placements']

            else:  # Set the trusted placements as-is
                base_file = os.path.basename(alignment_path).replace(
                    '_hits.aln.fa', '')
                trusted_placements[base_file] = {}
                for read, entry in classifications[str(idx)].items():
                    trusted_placements[base_file][read] = entry['placement']
        # Split the original jplace file
        # and write split jplaces to separate file directories
        with open(jplace) as f:
            jplace_json = json.load(f)
        cluster_dict = self.convert_cluster_dict_keys_to_aliases(
            clusterer.seq_library, alias_hash)
        hash_with_placements = self.jplace_split(jplace_json, cluster_dict)

        for file_alias, placement_entries_list in hash_with_placements.items():
            alias_hash[file_alias]['place'] = placement_entries_list

        self.write_jplace(jplace_json, alias_hash)

        self.hk.delete(
            files_to_delete)  # Remove combined split, not really useful

        return trusted_placements
 def __init__(self):
     '''Create the HouseKeeping helper kept on this instance.'''
     self.hk = HouseKeeping()
예제 #12
0
파일: run.py 프로젝트: eliasOnAWS/graftM
class Run:

    PIPELINE_AA = "P"
    PIPELINE_NT = "D"

    _MIN_VERBOSITY_FOR_ART = 3  # with 2 then, only errors are printed

    PPLACER_TAXONOMIC_ASSIGNMENT = 'pplacer'
    DIAMOND_TAXONOMIC_ASSIGNMENT = 'diamond'

    MIN_ALIGNED_FILTER_FOR_NUCLEOTIDE_PACKAGES = 95
    MIN_ALIGNED_FILTER_FOR_AMINO_ACID_PACKAGES = 30

    DEFAULT_MAX_SAMPLES_FOR_KRONA = 100

    NO_ORFS_EXITSTATUS = 128

    def __init__(self, args):
        '''
        Parameters
        ----------
        args : obj
            parsed command line arguments; stored on the instance and then
            passed to setattributes()
        '''
        self.args = args
        self.setattributes(self.args)

    def setattributes(self, args):
        '''
        Create the helper objects required by the selected subcommand.

        A HouseKeeping and a Stats_And_Summary instance are always created.
        For 'graft' a SequenceSearcher, the sequence pair list and, when
        args has a reference_package attribute, a Pplacer are added. For
        'create' a Create instance is added.

        Parameters
        ----------
        args : obj
            argparse object. Most values are read through self.args, so this
            is expected to be the same object as self.args.
        '''
        self.hk = HouseKeeping()
        self.s = Stats_And_Summary()

        if args.subparser_name == 'graft':
            graft_programs = [
                'orfm', 'nhmmer', 'hmmsearch', 'mfqe', 'pplacer',
                'ktImportText', 'diamond'
            ]
            # The suite is built but not stored for this subcommand.
            commands = ExternalProgramSuite(graft_programs)
            self.hk.set_attributes(self.args)
            self.hk.set_euk_hmm(self.args)
            if args.euk_check:
                self.args.search_hmm_files.append(self.args.euk_hmm_file)

            search_hmms = self.args.search_hmm_files
            # No alignment HMM is handed over in search-only mode.
            if self.args.search_only:
                alignment_hmm = None
            else:
                alignment_hmm = self.args.aln_hmm_file
            self.ss = SequenceSearcher(search_hmms, alignment_hmm)
            self.sequence_pair_list = self.hk.parameter_checks(args)
            if hasattr(args, 'reference_package'):
                self.p = Pplacer(self.args.reference_package)
            return

        if self.args.subparser_name == "create":
            create_programs = ['taxit', 'FastTreeMP', 'hmmalign', 'mafft']
            commands = ExternalProgramSuite(create_programs)
            self.create = Create(commands)

    def summarise(self, base_list, trusted_placements, reverse_pipe, times,
                  hit_read_count_list, max_samples_for_krona):
        '''
        summarise - write summary information to file, including otu table,
                    biom file, krona plot, and timing information, then
                    delete the intermediate files of every sample.

        Relies on self.gmf having been set by the caller; self.gmf is
        reassigned during the clean-up step.

        Parameters
        ----------
        base_list : array
            list of each of the files processed by graftm, with the path and
            suffixes removed
        trusted_placements : dict
            dictionary keyed by base name; each value holds the placements of
            that sample, with entry as the key and a taxonomy string as the
            value
        reverse_pipe : bool
            True = run reverse pipe, False = run normal pipeline
        times : array
            list of the recorded times for each step in the pipeline in the
            format: [search_step_time, alignment_step_time, placement_step_time]
        hit_read_count_list : array
            list containing sublists, one for each file run through the GraftM
            pipeline, each two entries, the first being the number of putative
            eukaryotic reads (when searching 16S), the second being the number
            of hits aligned and placed in the tree.
        max_samples_for_krona: int
            If the number of files processed is greater than this number, then
            do not generate a krona diagram.

        Returns
        -------
        None
        '''

        def intermediate_paths(gmf, base):
            # Per-sample working files that are removed once the summaries
            # have been written.
            return [
                gmf.for_aln_path(base),
                gmf.rev_aln_path(base),
                gmf.conv_output_rev_path(base),
                gmf.conv_output_for_path(base),
                gmf.euk_free_path(base),
                gmf.euk_contam_path(base),
                gmf.readnames_output_path(base),
                gmf.sto_output_path(base),
                gmf.orf_titles_output_path(base),
                gmf.orf_output_path(base),
                gmf.output_for_path(base),
                gmf.output_rev_path(base)
            ]

        # Summary steps.
        placements_list = []
        for base in base_list:
            # Write the per-sample read taxonomy file and keep the placements
            # so they can be passed on to the combined writers below.
            placements = trusted_placements[base]
            self.s.readTax(
                placements,
                GraftMFiles(base, self.args.output_directory,
                            False).read_tax_output_path(base))
            placements_list.append(placements)

        logging.info('Writing summary table')
        with open(self.gmf.combined_summary_table_output_path(), 'w') as f:
            self.s.write_tabular_otu_table(base_list, placements_list, f)

        logging.info('Writing biom file')
        with biom_open(self.gmf.combined_biom_output_path(), 'w') as f:
            biom_successful = self.s.write_biom(base_list, placements_list, f)
        if not biom_successful:
            # Do not leave an unusable biom file behind.
            os.remove(self.gmf.combined_biom_output_path())

        logging.info('Building summary krona plot')
        if len(base_list) > max_samples_for_krona:
            # logging.warn is a deprecated alias of logging.warning.
            logging.warning(
                "Skipping creation of Krona diagram since there are too many input files. The maximum can be overridden using --max_samples_for_krona"
            )
        else:
            self.s.write_krona_plot(base_list, placements_list,
                                    self.gmf.krona_output_path())

        # Basic statistics
        placed_reads = [len(trusted_placements[base]) for base in base_list]
        self.s.build_basic_statistics(times, hit_read_count_list, placed_reads,
                                      base_list, self.gmf.basic_stats_path())

        # Delete unnecessary files
        logging.info('Cleaning up')
        # The reverse pipe cleans up once per read direction; otherwise a
        # single pass is made with no direction (False).
        directions = ['forward', 'reverse'] if reverse_pipe else [False]
        for base in base_list:
            for direction in directions:
                self.gmf = GraftMFiles(base, self.args.output_directory,
                                       direction)
                self.hk.delete(intermediate_paths(self.gmf, base))

        logging.info('Done, thanks for using graftM!\n')

    def graft(self):
        # The Graft pipeline:
        # Searches for reads using hmmer, and places them in phylogenetic
        # trees to derive a community structure.
        if self.args.graftm_package:
            gpkg = GraftMPackage.acquire(self.args.graftm_package)
        else:
            gpkg = None

        REVERSE_PIPE = (True if self.args.reverse else False)
        INTERLEAVED = (True if self.args.interleaved else False)
        base_list = []
        seqs_list = []
        search_results = []
        hit_read_count_list = []
        db_search_results = []

        if gpkg:
            maximum_range = gpkg.maximum_range()

            if self.args.search_diamond_file:
                self.args.search_method = self.hk.DIAMOND_SEARCH_METHOD
                diamond_db = self.args.search_diamond_file[0]
            else:
                diamond_db = gpkg.diamond_database_path()
                if self.args.search_method == self.hk.DIAMOND_SEARCH_METHOD:
                    if not diamond_db:
                        logging.error(
                            "%s search method selected, but no diamond database specified. \
                        Please either provide a gpkg to the --graftm_package flag, or a diamond \
                        database to the --search_diamond_file flag." %
                            self.args.search_method)
                        raise Exception()
        else:
            # Get the maximum range, if none exists, make one from the HMM profile
            if self.args.maximum_range:
                maximum_range = self.args.maximum_range
            else:
                if self.args.search_method == self.hk.HMMSEARCH_SEARCH_METHOD:
                    if not self.args.search_only:
                        maximum_range = self.hk.get_maximum_range(
                            self.args.aln_hmm_file)
                    else:
                        logging.debug(
                            "Running search only pipeline. maximum_range not configured."
                        )
                        maximum_range = None
                else:
                    logging.warning(
                        'Cannot determine maximum range when using %s pipeline and with no GraftM package specified'
                        % self.args.search_method)
                    logging.warning(
                        'Setting maximum_range to None (linked hits will not be detected)'
                    )
                    maximum_range = None
            if self.args.search_diamond_file:
                diamond_db = self.args.search_diamond_file
            else:
                if self.args.search_method == self.hk.HMMSEARCH_SEARCH_METHOD:
                    diamond_db = None
                else:
                    logging.error(
                        "%s search method selected, but no gpkg or diamond database selected"
                        % self.args.search_method)

        if self.args.assignment_method == Run.DIAMOND_TAXONOMIC_ASSIGNMENT:
            if self.args.reverse:
                logging.warn(
                    "--reverse reads specified with --assignment_method diamond. Reverse reads will be ignored."
                )
                self.args.reverse = None

        # If merge reads is specified, check that there are reverse reads to merge with
        if self.args.merge_reads and not hasattr(self.args, 'reverse'):
            raise Exception("Programming error")

        # Set the output directory if not specified and create that directory
        logging.debug('Creating working directory: %s' %
                      self.args.output_directory)
        self.hk.make_working_directory(self.args.output_directory,
                                       self.args.force)

        # Set pipeline and evalue by checking HMM format
        if self.args.search_only:
            if self.args.search_method == self.hk.HMMSEARCH_SEARCH_METHOD:
                hmm_type, hmm_tc = self.hk.setpipe(
                    self.args.search_hmm_files[0])
                logging.debug("HMM type: %s Trusted Cutoff: %s" %
                              (hmm_type, hmm_tc))
        else:
            hmm_type, hmm_tc = self.hk.setpipe(self.args.aln_hmm_file)
            logging.debug("HMM type: %s Trusted Cutoff: %s" %
                          (hmm_type, hmm_tc))

        if self.args.search_method == self.hk.HMMSEARCH_SEARCH_METHOD:
            setattr(self.args, 'type', hmm_type)
            if hmm_tc:
                setattr(self.args, 'evalue', '--cut_tc')
        else:
            setattr(self.args, 'type', self.PIPELINE_AA)

        if self.args.filter_minimum is not None:
            filter_minimum = self.args.filter_minimum
        else:
            if self.args.type == self.PIPELINE_NT:
                filter_minimum = Run.MIN_ALIGNED_FILTER_FOR_NUCLEOTIDE_PACKAGES
            else:
                filter_minimum = Run.MIN_ALIGNED_FILTER_FOR_AMINO_ACID_PACKAGES

        # Generate expand_search database if required
        if self.args.expand_search_contigs:
            if self.args.graftm_package:
                pkg = GraftMPackage.acquire(self.args.graftm_package)
            else:
                pkg = None
            boots = ExpandSearcher(search_hmm_files=self.args.search_hmm_files,
                                   maximum_range=self.args.maximum_range,
                                   threads=self.args.threads,
                                   evalue=self.args.evalue,
                                   min_orf_length=self.args.min_orf_length,
                                   graftm_package=pkg)

            # this is a hack, it should really use GraftMFiles but that class isn't currently flexible enough
            new_database = (os.path.join(self.args.output_directory, "expand_search.hmm") \
                            if self.args.search_method == self.hk.HMMSEARCH_SEARCH_METHOD \
                            else os.path.join(self.args.output_directory, "expand_search")
                            )

            if boots.generate_expand_search_database_from_contigs(
                    self.args.expand_search_contigs, new_database,
                    self.args.search_method):
                if self.args.search_method == self.hk.HMMSEARCH_SEARCH_METHOD:
                    self.ss.search_hmm.append(new_database)
                else:
                    diamond_db = new_database

        first_search_method = self.args.search_method
        if self.args.decoy_database:
            decoy_filter = DecoyFilter(
                Diamond(diamond_db, threads=self.args.threads),
                Diamond(self.args.decoy_database, threads=self.args.threads))
            doing_decoy_search = True
        elif self.args.search_method == self.hk.HMMSEARCH_AND_DIAMOND_SEARCH_METHOD:
            decoy_filter = DecoyFilter(
                Diamond(diamond_db, threads=self.args.threads))
            doing_decoy_search = True
            first_search_method = self.hk.HMMSEARCH_SEARCH_METHOD
        else:
            doing_decoy_search = False

        # For each pair (or single file passed to GraftM)
        logging.debug('Working with %i file(s)' % len(self.sequence_pair_list))
        for pair in self.sequence_pair_list:
            # Guess the sequence file type, if not already specified to GraftM
            unpack = UnpackRawReads(pair[0], self.args.input_sequence_type,
                                    INTERLEAVED)

            # Set the basename, and make an entry to the summary table.
            base = unpack.basename()
            pair_direction = ['forward', 'reverse']
            logging.info("Working on %s" % base)

            # Make the working base subdirectory
            self.hk.make_working_directory(
                os.path.join(self.args.output_directory, base),
                self.args.force)

            # for each of the paired end read files
            for read_file in pair:
                unpack = UnpackRawReads(read_file,
                                        self.args.input_sequence_type,
                                        INTERLEAVED)
                if read_file is None:
                    # placeholder for interleaved (second file is None)
                    continue

                if not os.path.isfile(read_file):  # Check file exists
                    logging.info('%s does not exist! Skipping this file..' %
                                 read_file)
                    continue

                # Set the output file_name
                if len(pair) == 2:
                    direction = 'interleaved' if pair[1] is None \
                                              else pair_direction.pop(0)
                    logging.info("Working on %s reads" % direction)
                    self.gmf = GraftMFiles(base, self.args.output_directory,
                                           direction)
                    self.hk.make_working_directory(
                        os.path.join(self.args.output_directory, base,
                                     direction), self.args.force)
                else:
                    direction = False
                    self.gmf = GraftMFiles(base, self.args.output_directory,
                                           direction)

                if self.args.type == self.PIPELINE_AA:
                    logging.debug("Running protein pipeline")
                    try:
                        search_time, (
                            result,
                            complement_information) = self.ss.aa_db_search(
                                self.gmf,
                                base,
                                unpack,
                                first_search_method,
                                maximum_range,
                                self.args.threads,
                                self.args.evalue,
                                self.args.min_orf_length,
                                self.args.restrict_read_length,
                                diamond_db,
                                self.args.diamond_performance_parameters,
                            )
                    except NoInputSequencesException as e:
                        logging.error(
                            "No sufficiently long open reading frames were found, indicating"
                            " either the input sequences are too short or the min orf length"
                            " cutoff is too high. Cannot continue sorry. Alternatively, there"
                            " is something amiss with the installation of OrfM. The specific"
                            " command that failed was: %s" % e.command)
                        exit(Run.NO_ORFS_EXITSTATUS)

                # Or the DNA pipeline
                elif self.args.type == self.PIPELINE_NT:
                    logging.debug("Running nucleotide pipeline")
                    search_time, (
                        result, complement_information) = self.ss.nt_db_search(
                            self.gmf, base, unpack, self.args.euk_check,
                            self.args.search_method, maximum_range,
                            self.args.threads, self.args.evalue)

                reads_detected = True
                if not result.hit_fasta() or os.path.getsize(
                        result.hit_fasta()) == 0:
                    logging.info('No reads found in %s' % base)
                    reads_detected = False

                if self.args.search_only:
                    db_search_results.append(result)
                    base_list.append(base)
                    continue

                # Filter out decoys if specified
                if reads_detected and doing_decoy_search:
                    with tempfile.NamedTemporaryFile(prefix="graftm_decoy",
                                                     suffix='.fa') as f:
                        tmpname = f.name
                    any_remaining = decoy_filter.filter(
                        result.hit_fasta(), tmpname)
                    if any_remaining:
                        shutil.move(tmpname, result.hit_fasta())
                    else:
                        # No hits remain after decoy filtering.
                        os.remove(result.hit_fasta())
                        continue

                if self.args.assignment_method == Run.PPLACER_TAXONOMIC_ASSIGNMENT:
                    logging.info(
                        'aligning reads to reference package database')
                    hit_aligned_reads = self.gmf.aligned_fasta_output_path(
                        base)

                    if reads_detected:
                        aln_time, aln_result = self.ss.align(
                            result.hit_fasta(), hit_aligned_reads,
                            complement_information, self.args.type,
                            filter_minimum)
                    else:
                        aln_time = 'n/a'
                    if not os.path.exists(
                            hit_aligned_reads
                    ):  # If all were filtered out, or there just was none..
                        with open(hit_aligned_reads, 'w') as f:
                            pass  # just touch the file, nothing else
                    seqs_list.append(hit_aligned_reads)

                db_search_results.append(result)
                base_list.append(base)
                search_results.append(result.search_result)
                hit_read_count_list.append(result.hit_count)

        # Write summary table
        srchtw = SearchTableWriter()
        srchtw.build_search_otu_table(
            [x.search_objects for x in db_search_results], base_list,
            self.gmf.search_otu_table())

        if self.args.search_only:
            logging.info(
                'Stopping before alignment and taxonomic assignment phase\n')
            exit(0)

        if self.args.merge_reads:  # not run when diamond is the assignment mode- enforced by argparse grokking
            logging.debug("Running merge reads output")
            if self.args.interleaved:
                fwd_seqs = seqs_list
                rev_seqs = []
            else:
                base_list = base_list[0::2]
                fwd_seqs = seqs_list[0::2]
                rev_seqs = seqs_list[1::2]
            merged_output=[GraftMFiles(base, self.args.output_directory, False).aligned_fasta_output_path(base) \
                           for base in base_list]
            logging.debug("merged reads to %s", merged_output)
            self.ss.merge_forev_aln(fwd_seqs, rev_seqs, merged_output)
            seqs_list = merged_output
            REVERSE_PIPE = False

        elif REVERSE_PIPE:
            base_list = base_list[0::2]

        # Leave the pipeline if search only was specified
        if self.args.search_and_align_only:
            logging.info('Stopping before taxonomic assignment phase\n')
            exit(0)
        elif not any(base_list):
            logging.error(
                'No hits in any of the provided files. Cannot continue with no reads to assign taxonomy to.\n'
            )
            exit(0)
        self.gmf = GraftMFiles('', self.args.output_directory, False)

        if self.args.assignment_method == Run.PPLACER_TAXONOMIC_ASSIGNMENT:
            clusterer = Clusterer()
            # Classification steps
            seqs_list = clusterer.cluster(seqs_list, REVERSE_PIPE)
            logging.info("Placing reads into phylogenetic tree")
            taxonomic_assignment_time, assignments = self.p.place(
                REVERSE_PIPE, seqs_list, self.args.resolve_placements,
                self.gmf, self.args, result.slash_endings,
                gpkg.taxtastic_taxonomy_path(), clusterer)
            assignments = clusterer.uncluster_annotations(
                assignments, REVERSE_PIPE)

        elif self.args.assignment_method == Run.DIAMOND_TAXONOMIC_ASSIGNMENT:
            logging.info("Assigning taxonomy with diamond")
            taxonomic_assignment_time, assignments = self._assign_taxonomy_with_diamond(\
                        base_list,
                        db_search_results,
                        gpkg,
                        self.gmf,
                        self.args.diamond_performance_parameters)
            aln_time = 'n/a'
        else:
            raise Exception("Unexpected assignment method encountered: %s" %
                            self.args.placement_method)

        self.summarise(base_list, assignments, REVERSE_PIPE,
                       [search_time, aln_time, taxonomic_assignment_time],
                       hit_read_count_list, self.args.max_samples_for_krona)

    @T.timeit
    def _assign_taxonomy_with_diamond(self, base_list, db_search_results,
                                      graftm_package, graftm_files,
                                      diamond_performance_parameters):
        '''Run diamond to assign taxonomy to the hit sequences of each search
        result.

        Parameters
        ----------
        base_list: list of str
            list of sequence block names, indexed in step with
            db_search_results
        db_search_results: list of DBSearchResult
            the result of running hmmsearches
        graftm_package: GraftMPackage object
            Diamond is run against this database
        graftm_files: GraftMFiles object
            Result files are written here
        diamond_performance_parameters : str
            extra args for DIAMOND

        Returns
        -------
        dict of base_list entry to dict of read name to taxonomy, where each
        taxonomy is a list starting with 'Root'. A search result with no hit
        fasta maps to an empty dict.

        NOTE(review): the caller unpacks a (time, assignments) pair, so the
        T.timeit decorator presumably prepends the elapsed time - confirm
        against T.timeit.

        Raises
        ------
        Exception
            if diamond reports two different hits for one query sequence.
        '''
        runner = Diamond(graftm_package.diamond_database_path(),
                         self.args.threads, self.args.evalue)
        # Close both definition files as soon as they have been parsed rather
        # than leaving the handles to the garbage collector.
        with open(graftm_package.taxtastic_taxonomy_path()) as taxonomy_io, \
                open(graftm_package.taxtastic_seqinfo_path()) as seqinfo_io:
            taxonomy_definition = Getaxnseq(
            ).read_taxtastic_taxonomy_and_seqinfo(taxonomy_io, seqinfo_io)
        results = {}

        # For each of the search results,
        for i, search_result in enumerate(db_search_results):
            hit_fasta = search_result.hit_fasta()
            sequence_id_to_taxonomy = {}
            if hit_fasta is not None:
                sequence_id_to_hit = {}
                # Run diamond
                logging.debug("Running diamond on %s", hit_fasta)
                diamond_result = runner.run(
                    hit_fasta,
                    UnpackRawReads.PROTEIN_SEQUENCE_TYPE,
                    daa_file_basename=graftm_files.
                    diamond_assignment_output_basename(base_list[i]),
                    extra_args=diamond_performance_parameters)
                for res in diamond_result.each([
                        SequenceSearchResult.QUERY_ID_FIELD,
                        SequenceSearchResult.HIT_ID_FIELD
                ]):
                    query_id, hit_id = res[0], res[1]
                    if query_id not in sequence_id_to_hit:
                        sequence_id_to_hit[query_id] = hit_id
                    elif sequence_id_to_hit[query_id] != hit_id:
                        # A repeat of the same hit is tolerated; a different
                        # hit for the same query is not.
                        raise Exception(
                            "Diamond unexpectedly gave two hits for a single query sequence for %s"
                            % query_id)

                # Extract taxonomy of the best hit, and add in the no hits
                for seqio in SequenceIO().read_fasta_file(hit_fasta):
                    name = seqio.name
                    if name in sequence_id_to_hit:
                        # Add Root; to be in line with pplacer assignment method
                        sequence_id_to_taxonomy[name] = [
                            'Root'
                        ] + taxonomy_definition[sequence_id_to_hit[name]]
                    else:
                        # picked up in the initial search (by hmmsearch, say), but diamond misses it
                        sequence_id_to_taxonomy[name] = ['Root']

            results[base_list[i]] = sequence_id_to_taxonomy
        return results

    def main(self):

        if self.args.subparser_name == 'graft':
            if self.args.verbosity >= self._MIN_VERBOSITY_FOR_ART:
                print('''
                                GRAFT

                       Joel Boyd, Ben Woodcroft

                                                         __/__
                                                  ______|
          _- - _                         ________|      |_____/
           - -            -             |        |____/_
           - _     >>>>  -   >>>>   ____|
          - _-  -         -             |      ______
             - _                        |_____|
           -                                  |______
            ''')
            self.graft()

        elif self.args.subparser_name == 'create':
            if self.args.verbosity >= self._MIN_VERBOSITY_FOR_ART:
                print('''
                            CREATE

                   Joel Boyd, Ben Woodcroft

                                                    /
              >a                                   /
              -------------                       /
              >b                        |        |
              --------          >>>     |  GPKG  |
              >c                        |________|
              ----------
''')
            if self.args.dereplication_level < 0:
                logging.error(
                    "Invalid dereplication level selected! please enter a positive integer"
                )
                exit(1)

            else:
                if not self.args.sequences:
                    if not self.args.alignment and not self.args.rerooted_annotated_tree \
                                               and not self.args.rerooted_tree:
                        logging.error(
                            "Some sort of sequence data must be provided to run graftM create"
                        )
                        exit(1)
                if self.args.taxonomy:
                    if self.args.rerooted_annotated_tree:
                        logging.error(
                            "--taxonomy is incompatible with --rerooted_annotated_tree"
                        )
                        exit(1)
                    if self.args.taxtastic_taxonomy or self.args.taxtastic_seqinfo:
                        logging.error(
                            "--taxtastic_taxonomy and --taxtastic_seqinfo are incompatible with --taxonomy"
                        )
                        exit(1)
                elif self.args.rerooted_annotated_tree:
                    if self.args.taxtastic_taxonomy or self.args.taxtastic_seqinfo:
                        logging.error(
                            "--taxtastic_taxonomy and --taxtastic_seqinfo are incompatible with --rerooted_annotated_tree"
                        )
                        exit(1)
                else:
                    if not self.args.taxtastic_taxonomy or not self.args.taxtastic_seqinfo:
                        logging.error(
                            "--taxonomy, --rerooted_annotated_tree or --taxtastic_taxonomy/--taxtastic_seqinfo is required"
                        )
                        exit(1)
                if bool(self.args.taxtastic_taxonomy) ^ bool(
                        self.args.taxtastic_seqinfo):
                    logging.error(
                        "Both or neither of --taxtastic_taxonomy and --taxtastic_seqinfo must be defined"
                    )
                    exit(1)
                if self.args.alignment and self.args.hmm:
                    logging.warn(
                        "Using both --alignment and --hmm is rarely useful, but proceding on the assumption you understand."
                    )
                if len([
                        _f for _f in [
                            self.args.rerooted_tree,
                            self.args.rerooted_annotated_tree, self.args.tree
                        ] if _f
                ]) > 1:
                    logging.error("Only 1 input tree can be specified")
                    exit(1)

                self.create.main(
                    dereplication_level=self.args.dereplication_level,
                    sequences=self.args.sequences,
                    alignment=self.args.alignment,
                    taxonomy=self.args.taxonomy,
                    rerooted_tree=self.args.rerooted_tree,
                    unrooted_tree=self.args.tree,
                    tree_log=self.args.tree_log,
                    prefix=self.args.output,
                    rerooted_annotated_tree=self.args.rerooted_annotated_tree,
                    min_aligned_percent=float(self.args.min_aligned_percent) /
                    100,
                    taxtastic_taxonomy=self.args.taxtastic_taxonomy,
                    taxtastic_seqinfo=self.args.taxtastic_seqinfo,
                    hmm=self.args.hmm,
                    search_hmm_files=self.args.search_hmm_files,
                    force=self.args.force,
                    threads=self.args.threads)

        elif self.args.subparser_name == 'update':
            logging.info(
                "GraftM package %s specified to update with sequences in %s" %
                (self.args.graftm_package, self.args.sequences))
            if self.args.regenerate_diamond_db:
                gpkg = GraftMPackage.acquire(self.args.graftm_package)
                logging.info("Regenerating diamond DB..")
                gpkg.create_diamond_db()
                logging.info("Diamond database regenerated.")
                return
            elif not self.args.sequences:
                logging.error(
                    "--sequences is required unless regenerating the diamond DB"
                )
                exit(1)

            if not self.args.output:
                if self.args.graftm_package.endswith(".gpkg"):
                    self.args.output = self.args.graftm_package.replace(
                        ".gpkg", "-updated.gpkg")
                else:
                    self.args.output = self.args.graftm_package + '-update.gpkg'

            Update(
                ExternalProgramSuite([
                    'taxit', 'FastTreeMP', 'hmmalign', 'mafft'
                ])).update(input_sequence_path=self.args.sequences,
                           input_taxonomy_path=self.args.taxonomy,
                           input_graftm_package_path=self.args.graftm_package,
                           output_graftm_package_path=self.args.output)

        elif self.args.subparser_name == 'expand_search':
            args = self.args
            if not args.graftm_package and not args.search_hmm_files:
                logging.error(
                    "expand_search mode requires either --graftm_package or --search_hmm_files"
                )
                exit(1)

            if args.graftm_package:
                pkg = GraftMPackage.acquire(args.graftm_package)
            else:
                pkg = None

            expandsearcher = ExpandSearcher(
                search_hmm_files=args.search_hmm_files,
                maximum_range=args.maximum_range,
                threads=args.threads,
                evalue=args.evalue,
                min_orf_length=args.min_orf_length,
                graftm_package=pkg)
            expandsearcher.generate_expand_search_database_from_contigs(
                args.contigs,
                args.output_hmm,
                search_method=ExpandSearcher.HMM_SEARCH_METHOD)

        elif self.args.subparser_name == 'tree':
            if self.args.graftm_package:
                # shim in the paths from the graftm package, not overwriting
                # any of the provided paths.
                gpkg = GraftMPackage.acquire(self.args.graftm_package)
                if not self.args.rooted_tree:
                    self.args.rooted_tree = gpkg.reference_package_tree_path()
                if not self.args.input_greengenes_taxonomy:
                    if not self.args.input_taxtastic_seqinfo:
                        self.args.input_taxtastic_seqinfo = gpkg.taxtastic_seqinfo_path(
                        )
                    if not self.args.input_taxtastic_taxonomy:
                        self.args.input_taxtastic_taxonomy = gpkg.taxtastic_taxonomy_path(
                        )

            if self.args.rooted_tree:
                if self.args.unrooted_tree:
                    logging.error(
                        "Both a rooted tree and an un-rooted tree were provided, so it's unclear what you are asking GraftM to do. \
If you're unsure see graftM tree -h")
                    exit(1)
                elif self.args.reference_tree:
                    logging.error(
                        "Both a rooted tree and reference tree were provided, so it's unclear what you are asking GraftM to do. \
If you're unsure see graftM tree -h")
                    exit(1)

                if not self.args.decorate:
                    logging.error(
                        "It seems a rooted tree has been provided, but --decorate has not been specified so it is unclear what you are asking graftM to do."
                    )
                    exit(1)

                dec = Decorator(tree_path=self.args.rooted_tree)

            elif self.args.unrooted_tree and self.args.reference_tree:
                logging.debug(
                    "Using provided reference tree %s to reroot %s" %
                    (self.args.reference_tree, self.args.unrooted_tree))
                dec = Decorator(reference_tree_path=self.args.reference_tree,
                                tree_path=self.args.unrooted_tree)
            else:
                logging.error(
                    "Some tree(s) must be provided, either a rooted tree or both an unrooted tree and a reference tree"
                )
                exit(1)

            if self.args.output_taxonomy is None and self.args.output_tree is None:
                logging.error(
                    "Either an output tree or taxonomy must be provided")
                exit(1)
            if self.args.input_greengenes_taxonomy:
                if self.args.input_taxtastic_seqinfo or self.args.input_taxtastic_taxonomy:
                    logging.error(
                        "Both taxtastic and greengenes taxonomy were provided, so its unclear what taxonomy you want graftM to decorate with"
                    )
                    exit(1)
                logging.debug("Using input GreenGenes style taxonomy file")
                dec.main(self.args.input_greengenes_taxonomy,
                         self.args.output_tree, self.args.output_taxonomy,
                         self.args.no_unique_tax, self.args.decorate, None)
            elif self.args.input_taxtastic_seqinfo and self.args.input_taxtastic_taxonomy:
                logging.debug("Using input taxtastic style taxonomy/seqinfo")
                dec.main(self.args.input_taxtastic_taxonomy,
                         self.args.output_tree, self.args.output_taxonomy,
                         self.args.no_unique_tax, self.args.decorate,
                         self.args.input_taxtastic_seqinfo)
            else:
                logging.error(
                    "Either a taxtastic taxonomy or seqinfo file was provided. GraftM cannot continue without both."
                )
                exit(1)

        elif self.args.subparser_name == 'archive':
            # Back slashes in the ASCII art are escaped.
            if self.args.verbosity >= self._MIN_VERBOSITY_FOR_ART:
                print("""
                               ARCHIVE

                        Joel Boyd, Ben Woodcroft

                  ____.----.
        ____.----'          \\
        \\                    \\
         \\                    \\
          \\                    \\
           \\          ____.----'`--.__
            \\___.----'          |     `--.____
           /`-._                |       __.-' \\
          /     `-._            ___.---'       \\
         /          `-.____.---'                \\           +------+
        /            / | \\                       \\          |`.    |`.
       /            /  |  \\                   _.--'  <===>  |  `+--+---+
       `-.         /   |   \\            __.--'              |   |  |   |
          `-._    /    |    \\     __.--'     |              |   |  |   |
            | `-./     |     \\_.-'           |              +---+--+   |
            |          |                     |               `. |   `. |
            |          |                     |                 `+------+
            |          |                     |
            |          |                     |
            |          |                     |
            |          |                     |
            |          |                     |
            `-.        |                  _.-'
               `-.     |           __..--'
                  `-.  |      __.-'
                     `-|__.--'
            """)
            if self.args.create:
                if self.args.extract:
                    logging.error(
                        "Please specify whether to either create or export a GraftM package"
                    )
                    exit(1)
                if not self.args.graftm_package:
                    logging.error(
                        "Creating a GraftM package archive requires an package to be specified"
                    )
                    exit(1)
                if not self.args.archive:
                    logging.error(
                        "Creating a GraftM package archive requires an output archive path to be specified"
                    )
                    exit(1)

                archive = Archive()
                archive.create(self.args.graftm_package,
                               self.args.archive,
                               force=self.args.force)

            elif self.args.extract:
                archive = Archive()
                archive.extract(self.args.archive,
                                self.args.graftm_package,
                                force=self.args.force)
            else:
                logging.error(
                    "Please specify whether to either create or export a GraftM package"
                )
                exit(1)

        else:
            raise Exception("Unexpected subparser name %s" %
                            self.args.subparser_name)
예제 #13
0
class Pplacer:
    """Run pplacer on alignment files and post-process its jplace output.

    Alignments from several input files are merged and placed in a single
    pplacer run, the combined jplace file is split back out per input file,
    and, when the reverse pipeline is selected, forward and reverse read
    placements are compared.
    """

    def __init__(self, refpkg):
        """Remember the reference package path given to pplacer with -c."""
        self.refpkg = refpkg
        self.hk = HouseKeeping()

    # Run pplacer
    def pplacer(self, output_file, output_path, input_path, threads, cmd_log):
        """Run pplacer on the concatenated alignment file.

        output_file is accepted for interface compatibility but is not used.
        The command is recorded in cmd_log before it is run.

        Returns the path of input_path with its extension swapped for
        '.jplace'. Raises subprocess.CalledProcessError if pplacer exits
        with a non-zero status.
        """
        cmd = "pplacer -j %s --verbosity 0 --out-dir %s -c %s %s" % (threads, output_path, self.refpkg, input_path) # Set command
        self.hk.add_cmd(cmd_log, cmd) # Log it
        # Run it as an argument list rather than through a shell, so that
        # paths containing spaces or shell metacharacters are passed intact.
        subprocess.check_call(["pplacer",
                               "-j", str(threads),
                               "--verbosity", "0",
                               "--out-dir", str(output_path),
                               "-c", str(self.refpkg),
                               str(input_path)])
        # splitext only looks at the final path component, so a dot in a
        # directory name cannot truncate the path.
        return os.path.splitext(input_path)[0] + '.jplace'

    def alignment_merger(self, alignment_files, output_alignment_path):
        """Concatenate aligned read files into output_alignment_path.

        Each record id gets '_<n>' appended, where n is the zero-based
        position of its source file in alignment_files.

        Returns a dict keyed by str(n) whose values hold the 'output_path'
        ('placements.jplace' next to the source alignment) and an empty
        'place' list to be filled in by jplace_split.
        """
        alias_hash = {} # file number (as a string) -> output path and placements
        with open(output_alignment_path, 'w') as output:
            for file_number, alignment_file in enumerate(alignment_files):
                alias = str(file_number) # unique identifier of this file
                with open(alignment_file, 'r') as alignment_io:
                    alignments = list(SeqIO.parse(alignment_io, 'fasta')) # read list
                for record in alignments:
                    record.id = record.id + '_' + alias # tag the read with its origin file
                SeqIO.write(alignments, output, "fasta")
                alias_hash[alias] = {'output_path': os.path.join(os.path.dirname(alignment_file), 'placements.jplace'),
                                     'place': []}
        return alias_hash

    def jplace_split(self, jplace_file, alias_hash, summary_dict):
        """Split a combined jplace file into one jplace file per alias.

        A name in a placement's 'nm' list belongs to an alias when the text
        after its last underscore equals that alias. Every alias receives an
        entry for every placement, with an empty 'nm' list when none of the
        names belong to it.

        The per-alias files are written to the 'output_path' recorded in
        alias_hash, and the list of those paths is stored in
        summary_dict['jplace_path_list']. Returns summary_dict.
        """
        # Load the placement file
        with open(jplace_file) as jplace_io:
            placement_file = json.load(jplace_io)

        # Parse the placements based on the unique identifier appended to the
        # end of each read name
        for placement in placement_file['placements']:
            for alias, entry in alias_hash.items():
                entry['place'].append(
                    {'p': placement['p'],
                     'nm': [nm for nm in placement['nm'] if nm[0].split('_')[-1] == alias]})

        # Write the jplace files to their respective file paths.
        jplace_path_list = []
        for alias, entry in alias_hash.items():
            output = {'fields': placement_file['fields'],
                      'version': placement_file['version'],
                      'tree':  placement_file['tree'],
                      'placements': entry['place'],
                      'metadata': placement_file['metadata']}
            with open(entry['output_path'], 'w') as output_io:
                json.dump(output, output_io, ensure_ascii=False)
            jplace_path_list.append(entry['output_path'])
        summary_dict['jplace_path_list'] = jplace_path_list
        return summary_dict

    def place(self, summary_dict, files, args):
        """Place all alignments in one pplacer run and record the results.

        Reads 'seqs_list', 'base_list', 'reverse_pipe' and
        'resolve_placements' from summary_dict. Stores the placement time in
        summary_dict['place_t'], the split jplace paths in
        summary_dict['jplace_path_list'], and per-base results in
        summary_dict[base]. Returns summary_dict.
        """
        start = timeit.default_timer() # Start placement timer

        # Merge the alignments so they can all be placed at once.
        alias_hash = self.alignment_merger(summary_dict['seqs_list'], files.comb_aln_fa())

        # Run pplacer on merged file
        jplace = self.pplacer(files.jplace_output_path(), args.output_directory, files.comb_aln_fa(), args.threads, files.command_log_path())
        Messenger().message("Placements finished")

        stop = timeit.default_timer() # stop placement timer and log
        summary_dict['place_t'] = str( int(round((stop - start), 0)) )

        # Split the jplace file
        summary_dict = self.jplace_split(jplace, alias_hash, summary_dict)

        # Read the json of refpkg
        Messenger().message("Reading classifications")
        with open(os.path.join(self.refpkg, 'CONTENTS.json')) as contents_io:
            tax_descr = json.load(contents_io)['files']['taxonomy']
        classifications = Classify(os.path.join(self.refpkg, tax_descr)).assignPlacement(jplace, args.placements_cutoff, 'reads', summary_dict['resolve_placements'])
        self.hk.delete([jplace]) # Remove combined split, not really useful
        Messenger().message("Reads classified.")

        # If the reverse pipe has been specified, run the comparisons between
        # the two pipelines. If not then just record the placements.
        for idx, base in enumerate(summary_dict['base_list']):
            if summary_dict['reverse_pipe']:
                summary_dict[base] = Compare().compare_hits(summary_dict[base], base)

                # Keys are file numbers stored as strings, so order them
                # numerically: a plain string sort puts '10' before '2' and
                # would pair up the wrong forward/reverse files.
                forward_gup = classifications.pop(sorted(classifications, key=int)[0])
                reverse_gup = classifications.pop(sorted(classifications, key=int)[0])
                summary_dict[base] = Compare().compare_placements(forward_gup,
                                                                  reverse_gup,
                                                                  summary_dict[base],
                                                                  args.placements_cutoff)
            else:
                # Without a reverse pipeline every placement is trusted as is.
                summary_dict[base]['trusted_placements'] = {}
                for read, entry in classifications[str(idx)].items():
                    summary_dict[base]['trusted_placements'][read] = entry['placement']
        return summary_dict