Примеры использования PipelneUtils на Python

Язык программирования: Python

Пространство имен/Пакет: pipeline.utils

Класс/Тип: PipelneUtils

Примеров на hotexamples.com: 24

Python PipelneUtils — найдено 24 примера. Это лучшие примеры кода на Python для pipeline.utils.PipelneUtils, взятые из проектов с открытым исходным кодом. Вы можете оценить каждый пример, чтобы помочь нам улучшить качество подборки.

Основные методы

Показать Скрыть

PipelneUtils(4)

is_local(4)

get_all_files(3)

print_both(3)

call_sh_script(2)

find_in_nested_dict(1)

open_write_close(1)

read_file(1)

write_seq_frequencies_in_file(1)

Пример #1

Показать файл

Файл: db_upload.py Проект: icefoxx/py_mbl_sequencing_pipeline

 def get_fasta_file_names(self):
     """Return the full names of the unique-fasta files under ``self.fasta_dir``.

     A file is kept when its entry from ``PipelneUtils.get_all_files`` has
     ``".unique"`` as second element and a first element whose last
     dot-separated part is ``fa`` or whose last underscore-separated part
     is ``FILTERED``.
     NOTE(review): the entries look like (base name, extension) pairs -
     confirm against ``get_all_files``.
     """
     all_files = PipelneUtils().get_all_files(self.fasta_dir)

     def is_wanted(entry):
         # Only "*.unique" files are candidates at all.
         if entry[1] != ".unique":
             return False
         if entry[0].split(".")[-1].strip() == "fa":
             return True
         return entry[0].split("_")[-1] == "FILTERED"

     return [name for name in all_files.keys() if is_wanted(all_files[name])]

Пример #2

Показать файл

 def get_fasta_file_names(self):
     """Return (and print) the ``*.fa.unique`` files under ``self.in_file_path``.

     A file is kept when its entry from ``PipelneUtils.get_all_files`` has
     ``".unique"`` as second element and a first element whose last
     dot-separated part is ``fa``. Every kept name is printed to stdout.
     """
     selected = []
     all_files = PipelneUtils().get_all_files(self.in_file_path)
     for name in all_files.keys():
         entry = all_files[name]
         if entry[1] != ".unique":
             continue
         if entry[0].split(".")[-1].strip() != "fa":
             continue
         # Single-argument form prints the same under Python 2 and 3.
         print(name)
         selected.append(name)
     return selected

Пример #3

Показать файл

Файл: metadata.py Проект: avoorhis/py_mbl_sequencing_pipeline

 def __init__(self, command_line_args=None, configuration_dictionary=None):
     """Keep the run arguments/configuration and load lookup tables from ``C``.

     Args:
         command_line_args: stored as ``self.args``.
         configuration_dictionary: stored as ``self.general_config_dict``.
     """
     self.args = command_line_args
     self.general_config_dict = configuration_dictionary

     # Reference data taken from the constants module.
     self.known_header_list = C.csv_header_list
     self.pipeline_run_items = C.pipeline_run_items
     self.primer_suites = self.convert_primer_suites(C.primer_suites)
     self.dna_regions = C.dna_regions

     # Result container, pre-seeded with an empty 'general' section.
     self.data_object = {'general': {}}
     self.warn_msg = ("\n\tThe config File seems to be okay. If the items above look correct\n"
                      "     then press 'c' to continue the pipeline\n")
     self.res_headers = []
     self.env = {}
     self.utils = PipelneUtils()

Пример #4

Показать файл

    def __init__(self, host="bpcweb7", db="test"):
        """Open a MySQL connection and a cursor on it.

        Credentials come from ``~/.my.cnf`` and the port is 3306. When
        ``self.utils.is_local()`` is true the connection is redirected to
        127.0.0.1: database "env454" then uses port 3308 with
        ``~/.my.cnf_server``, and any other database name is replaced by
        "test_env454".

        Args:
            host: MySQL host name to connect to.
            db: database name.

        Raises:
            MySQLdb.Error: logged through ``print_both`` and re-raised.
        """
        self.utils = PipelneUtils()
        self.conn = None
        self.cursor = None
        self.rows = 0
        self.new_id = None
        self.lastrowid = None

        try:
            # The banner shows the requested host/db, i.e. the values before
            # the local-machine override below.
            self.utils.print_both("=" * 40)
            self.utils.print_both("host = " + str(host) + ", db = " + str(db))
            self.utils.print_both("=" * 40)
            read_default_file = os.path.expanduser("~/.my.cnf")
            port_env = 3306

            if self.utils.is_local():
                host = "127.0.0.1"
                if db == "env454":
                    port_env = 3308
                    read_default_file = os.path.expanduser("~/.my.cnf_server")
                else:
                    db = "test_env454"
            self.conn = MySQLdb.connect(host=host, db=db, read_default_file=read_default_file, port=port_env)
            self.cursor = self.conn.cursor()

        except MySQLdb.Error as e:
            # Not every MySQLdb error carries (code, message); fall back to
            # str(e) so logging cannot mask the original exception.
            if len(e.args) >= 2:
                self.utils.print_both("Error %d: %s" % (e.args[0], e.args[1]))
            else:
                self.utils.print_both("Error: %s" % (e,))
            raise

Пример #5

Показать файл

    def __init__(self, runobj = None):
        """Prepare suffixes, directories and usearch settings for chimera checking.

        Args:
            runobj: run description object. Despite the ``None`` default it is
                dereferenced immediately, so a real object must be passed.
        """
        self.utils      = PipelneUtils()
        self.runobj     = runobj
        self.run_keys   = self.runobj.run_keys
        self.rundate    = self.runobj.run
        # Cluster use defaults to True when the run object does not provide it.
        try:
            self.use_cluster = self.runobj.use_cluster
        except Exception:
            self.use_cluster = True
        self.chg_suffix         = ".chg"
        self.chimeras_suffix    = ".chimeras"
        self.ref_suffix         = ".db"
        self.denovo_suffix      = ".txt"
        self.nonchimeric_suffix = "." + C.nonchimeric_suffix
        self.chimeric_suffix    = ".chimeric.fa"
        self.base_suffix        = "unique" + self.chimeras_suffix

        # A missing or empty lane name becomes ''.
        try:
            lane_name = self.runobj.lane_name or ''
        except Exception:
            lane_name = ''

        if self.runobj.vamps_user_upload:
            site       = self.runobj.site
            dir_prefix = self.runobj.user + '_' + self.runobj.run
            self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site)
            # Read the trim status JSON through a context manager so the
            # file handle is closed right away.
            with open(self.runobj.trim_status_file_name, "r") as trim_status_file:
                trim_status = json.loads(trim_status_file.read())
            self.idx_keys = convert_unicode_dictionary_to_str(trim_status)["new_lane_keys"]
            self.indir  = self.dirs.check_dir(self.dirs.trimming_dir)
            self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)
        else:
            site = ''
            dir_prefix = self.runobj.run
            self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site)
            self.indir  = self.dirs.check_dir(self.dirs.reads_overlap_dir)
            self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)

        # usearch 6 command and reference databases from the constants module.
        self.usearch_cmd = C.usearch6_cmd
        self.refdb       = C.chimera_checking_refdb_6
        self.its_refdb   = C.chimera_checking_its_refdb_6
        self.input_file_names  = self.make_chimera_input_illumina_file_names()

Пример #6

Показать файл

Файл: db_upload.py Проект: msGenDev/py_mbl_sequencing_pipeline

    def get_fasta_file_names(self):
        """Return (and print) the fasta files in ``self.fasta_dir`` to upload.

        A file is selected when its full name ends with
        ``self.nonchimeric_suffix`` or, failing that, with
        ``self.fa_unique_suffix``. ``self.suffix_used`` is set to the suffix
        of the most recently selected file.

        Returns:
            list of the selected full file names.
        """
        fa_files = []
        files = PipelneUtils().get_all_files(self.fasta_dir)

        for full_name in files.keys():
            # The nonchimeric suffix takes precedence over the plain unique one.
            if full_name.endswith(self.nonchimeric_suffix):
                matched_suffix = self.nonchimeric_suffix
            elif full_name.endswith(self.fa_unique_suffix):
                matched_suffix = self.fa_unique_suffix
            else:
                continue
            fa_files.append(full_name)
            print(full_name)
            self.suffix_used = matched_suffix
        return fa_files

Пример #7

Показать файл

    def __init__(self, runobj = None):
        """Prepare directories, the database connection and file suffixes.

        Args:
            runobj: run description object. Despite the ``None`` default it is
                dereferenced immediately, so a real object must be passed.
        """
        self.utils = PipelneUtils()
        self.runobj = runobj
        self.rundate = self.runobj.run
        self.use_cluster = 1
        self.unique_fasta_files = []

        # User uploads get a per-site directory prefixed with the user name.
        if not self.runobj.vamps_user_upload:
            site, dir_prefix = '', self.runobj.run
        else:
            site = self.runobj.site
            dir_prefix = self.runobj.user + '_' + self.runobj.run
        lane_name = self.runobj.lane_name or ''

        self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform,
                         lane_name=lane_name, site=site)

        self.analysis_dir = self.dirs.check_dir(self.dirs.analysis_dir)
        self.fasta_dir = self.dirs.check_dir(self.dirs.reads_overlap_dir)
        self.gast_dir = self.dirs.check_dir(self.dirs.gast_dir)

        # Read from the run object but not used below: the connection target
        # is hard-coded.
        host_name = runobj.database_host
        database_name = runobj.database_name

        self.filenames = []
        self.my_conn = MyConnection(host='newbpcdb2.jbpc-np.mbl.edu', db="env454")
        self.sequence_table_name = "sequence_ill"
        self.sequence_field_name = "sequence_comp"
        self.my_csv = None

        # Start from a clean counts file.
        self.unique_file_counts = self.dirs.unique_file_counts
        self.dirs.delete_file(self.unique_file_counts)

        self.seq_id_dict = {}
        self.tax_id_dict = {}
        self.run_id = None

        # File-name suffixes recognised as upload candidates.
        self.nonchimeric_suffix = "." + C.nonchimeric_suffix
        self.fa_unique_suffix = ".fa." + C.unique_suffix
        self.v6_unique_suffix = "MERGED_V6_PRIMERS_REMOVED." + C.unique_suffix
        self.suff_list = [self.nonchimeric_suffix, self.fa_unique_suffix, self.v6_unique_suffix]
        self.suffix_used = ""

Пример #8

Показать файл

    def __init__(self, runobj = None):
        """Prepare suffixes, directories and usearch settings for chimera checking.

        For user uploads the SGE environment variables are set and the SGE
        binaries directory is prepended to ``PATH``.

        Args:
            runobj: run description object. Despite the ``None`` default it is
                dereferenced immediately, so a real object must be passed.
        """
        self.utils      = PipelneUtils()
        self.runobj     = runobj
        self.run_keys   = self.runobj.run_keys
        self.rundate    = self.runobj.run
        # Cluster use defaults to True when the run object does not provide it.
        try:
            self.use_cluster = self.runobj.use_cluster
        except Exception:
            self.use_cluster = True
        self.chg_suffix         = ".chg"
        self.chimeras_suffix    = ".chimeras"
        self.ref_suffix         = ".db"
        self.denovo_suffix      = ".txt"
        self.nonchimeric_suffix = "." + C.nonchimeric_suffix
        self.chimeric_suffix    = ".chimeric.fa"
        self.base_suffix        = "unique" + self.chimeras_suffix

        # A missing or empty lane name becomes ''.
        try:
            lane_name = self.runobj.lane_name or ''
        except Exception:
            lane_name = ''

        if self.runobj.vamps_user_upload:
            os.environ['SGE_ROOT'] = '/opt/sge'
            os.environ['SGE_CELL'] = 'grendel'
            path                   = os.environ['PATH']
            os.environ['PATH']     = '/opt/sge/bin/lx24-amd64:' + path
            site       = self.runobj.site
            dir_prefix = self.runobj.user + '_' + self.runobj.run
            self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site)
            # Read the trim status JSON through a context manager so the
            # file handle is closed right away.
            with open(self.runobj.trim_status_file_name, "r") as trim_status_file:
                trim_status = json.loads(trim_status_file.read())
            self.idx_keys = convert_unicode_dictionary_to_str(trim_status)["new_lane_keys"]
            self.analysis_dir = self.dirs.check_dir(self.dirs.analysis_dir)
            self.indir  = self.dirs.check_dir(self.dirs.trimming_dir)
            self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)
            self.gast_dir = self.dirs.check_dir(self.dirs.gast_dir)

        else:
            site = ''
            dir_prefix = self.runobj.run
            self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site)
            self.indir  = self.dirs.check_dir(self.dirs.reads_overlap_dir)
            self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)

        # usearch 6 command and reference databases from the constants module.
        self.usearch_cmd = C.usearch6_cmd
        self.refdb       = C.chimera_checking_refdb_6
        self.its_refdb   = C.chimera_checking_its_refdb_6
        self.input_file_names  = self.make_chimera_input_illumina_file_names()

Пример #9

Показать файл

Файл: illumina_files.py Проект: avoorhis/py_mbl_sequencing_pipeline

    def __init__(self, runobj):
        """Remember the run object and resolve the working directories.

        Args:
            runobj: run description object; its ``input_dir``,
                ``vamps_user_upload``, ``run``, ``lane_name`` and ``platform``
                attributes are read here (plus ``site`` and ``user`` for
                user uploads).
        """
        self.utils = PipelneUtils()
        self.runobj = runobj
        self.out_files = {}
        self.id_dataset_idx = {}
        self.in_file_path = self.runobj.input_dir

        # User uploads get a per-site directory prefixed with the user name.
        if not self.runobj.vamps_user_upload:
            site, dir_prefix = '', self.runobj.run
        else:
            site = self.runobj.site
            dir_prefix = self.runobj.user + '_' + self.runobj.run
        lane_name = self.runobj.lane_name or ''

        self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform,
                         lane_name=lane_name, site=site)
        self.out_file_path = self.dirs.check_dir(self.dirs.analysis_dir)
        self.results_path = self.dirs.check_dir(self.dirs.reads_overlap_dir)
        self.platform = self.runobj.platform

Пример #10

Показать файл

class MyConnection:
    """
    Connection to env454
    Takes parameters from ~/.my.cnf, default host = "bpcweb7", db="test"
    if different use my_conn = MyConnection(host, db)

    On a local machine (self.utils.is_local()) the connection goes to
    127.0.0.1 instead: db "env454" uses port 3308 and ~/.my.cnf_server,
    any other db name is replaced by "test_env454".
    """
    def __init__(self, host="bpcweb7", db="test"):
# , read_default_file=os.path.expanduser("~/.my.cnf"), port = 3306
        
        self.utils  = PipelneUtils()        
        self.conn   = None
        self.cursor = None
        self.rows   = 0
        self.new_id = None
        self.lastrowid = None
        
        try:           
            # The banner shows the requested host/db, before any local override.
            self.utils.print_both("=" * 40)
            self.utils.print_both("host = " + str(host) + ", db = "  + str(db))
            self.utils.print_both("=" * 40)
            read_default_file = os.path.expanduser("~/.my.cnf")
            port_env = 3306
            
            if self.utils.is_local():
                host = "127.0.0.1"
                if db == "env454":
                    port_env = 3308
                    read_default_file = os.path.expanduser("~/.my.cnf_server")
                else:
                    db = "test_env454"
            self.conn   = MySQLdb.connect(host = host, db = db, read_default_file = read_default_file, port = port_env)
            self.cursor = self.conn.cursor()
            # self.escape = self.conn.escape()
                   
        except MySQLdb.Error, e:
            # Log the MySQL error code and message, then let the error propagate.
            self.utils.print_both("Error %d: %s" % (e.args[0], e.args[1]))
            raise
        except:                       # catch everything

Пример #11

Показать файл

Файл: chimera.py Проект: msGenDev/py_mbl_sequencing_pipeline

    def __init__(self, runobj = None):
        """Prepare suffixes, directories and usearch settings for chimera checking.

        Args:
            runobj: run description object. Despite the ``None`` default it is
                dereferenced immediately, so a real object must be passed.
        """
        self.utils      = PipelneUtils()
        self.runobj     = runobj
        self.run_keys   = self.runobj.run_keys
        self.rundate    = self.runobj.run

        self.chg_suffix         = ".chg"
        self.chimeras_suffix    = ".chimeras"
        self.ref_suffix         = ".db"
        self.denovo_suffix      = ".txt"
        self.nonchimeric_suffix = "." + C.nonchimeric_suffix
        self.chimeric_suffix    = ".chimeric.fa"
        self.base_suffix        = "unique" + self.chimeras_suffix

        # A missing or empty lane name becomes ''.
        try:
            lane_name = self.runobj.lane_name or ''
        except Exception:
            lane_name = ''

        if self.runobj.vamps_user_upload:
            site       = self.runobj.site
            dir_prefix = self.runobj.user + '_' + self.runobj.run
            self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site)
            # Read the trim status JSON through a context manager so the
            # file handle is closed right away.
            with open(self.runobj.trim_status_file_name, "r") as trim_status_file:
                trim_status = json.loads(trim_status_file.read())
            self.idx_keys = convert_unicode_dictionary_to_str(trim_status)["new_lane_keys"]
            self.indir  = self.dirs.check_dir(self.dirs.trimming_dir)
            self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)
        else:
            site = ''
            dir_prefix = self.runobj.run
            self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site)
            self.indir  = self.dirs.check_dir(self.dirs.reads_overlap_dir)
            self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)

        # usearch 6 command and reference databases from the constants module.
        self.usearch_cmd = C.usearch6_cmd
        self.refdb       = C.chimera_checking_refdb_6
        self.its_refdb   = C.chimera_checking_its_refdb_6
        self.input_file_names  = self.make_chimera_input_illumina_file_names()

Пример #12

Показать файл

 def __init__(self, runobj):
     """Remember the run object and resolve the working directories.

     Args:
         runobj: run description object; its ``input_dir``,
             ``vamps_user_upload``, ``run``, ``lane_name`` and ``platform``
             attributes are read here (plus ``site`` and ``user`` for
             user uploads).
     """
     self.utils = PipelneUtils()
     self.runobj = runobj
     self.out_files = {}
     self.id_dataset_idx = {}
     self.in_file_path = self.runobj.input_dir

     # User uploads get a per-site directory prefixed with the user name.
     if not self.runobj.vamps_user_upload:
         site, dir_prefix = '', self.runobj.run
     else:
         site = self.runobj.site
         dir_prefix = self.runobj.user + '_' + self.runobj.run
     lane_name = self.runobj.lane_name or ''

     self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform,
                      lane_name=lane_name, site=site)
     self.out_file_path = self.dirs.check_dir(self.dirs.analysis_dir)
     self.results_path = self.dirs.check_dir(self.dirs.reads_overlap_dir)

Пример #13

Показать файл

from pipeline.get_ini import readCSV
from pipeline.pipelinelogging import logger
from pipeline.utils import Dirs, PipelneUtils
import IlluminaUtils.lib.fastalib as fastalib

# Import MySQLdb, printing a hint about the required environment modules when
# it is not available.
try:
    import MySQLdb
except ImportError as e:
    # When the import fails the name ``MySQLdb`` is not bound, so the handler
    # must catch ImportError rather than ``MySQLdb.Error``.
    message = """
    MySQLdb ERROR
      To load the correct module, try running these commands before running the pipeline:
       
source /xraid/bioware/Modules/etc/profile.modules
module load bioware
    """
    # print_both is used through an instance everywhere else in the pipeline.
    PipelneUtils().print_both(message)
    PipelneUtils().print_both("Error: %s" % (e,))
    raise
except Exception:
    # Anything else: report the exception type and let it propagate.
    PipelneUtils().print_both("Unexpected:")
    PipelneUtils().print_both(sys.exc_info()[0])
    raise

#     sys.exit("""
#     MySQLdb ERROR
#       To load the correct module, try running these commands before running the pipeline:
#       
# source /xraid/bioware/Modules/etc/profile.modules
# module load bioware

Пример #14

Показать файл

class IlluminaFiles:
    """
    0) from run create all dataset_lines names files in output dir
    1) split fastq files from casava into files with dataset_names
    2) create ini files 
    3) process them through Meren's script
    4) result - files dataset_lane-PERFECT_reads.fa.unique with frequencies - to process with env454upload()    
    
    """
    def __init__(self, runobj):
        """Remember the run object and resolve the working directories.

        Args:
            runobj: run description object; its ``input_dir``,
                ``vamps_user_upload``, ``run``, ``lane_name`` and ``platform``
                attributes are read here (plus ``site`` and ``user`` for
                user uploads).
        """
        self.utils = PipelneUtils()
        self.runobj = runobj
        self.out_files = {}
        self.id_dataset_idx = {}
        self.in_file_path = self.runobj.input_dir

        # User uploads get a per-site directory prefixed with the user name.
        if not self.runobj.vamps_user_upload:
            site, dir_prefix = '', self.runobj.run
        else:
            site = self.runobj.site
            dir_prefix = self.runobj.user + '_' + self.runobj.run
        lane_name = self.runobj.lane_name or ''

        self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform,
                         lane_name=lane_name, site=site)
        self.out_file_path = self.dirs.check_dir(self.dirs.analysis_dir)
        self.results_path = self.dirs.check_dir(self.dirs.reads_overlap_dir)
        
    def split_files(self, compressed = False):
        """Split the input fastq files per dataset and create the ini files.

        When no R1 files are found an error is reported through
        ``print_both`` instead. The dataset files are closed in both cases.

        TODO: *) fastq_file_names method to collect all file_names with full path or directories_names (see get_all_files()?)

        Args:
            compressed: passed through to ``read1`` and ``read2``.
        """
        r1_files, r2_files = self.get_fastq_file_names(self.in_file_path)
        if len(r1_files) > 0:
            self.read1(r1_files, compressed)
            self.read2(r2_files, compressed)
            self.create_inis()
        else:
            self.utils.print_both("ERROR: There is something wrong with fastq file names. Please check if they start with correct indexes.")
        self.close_dataset_files()

            

#        self.perfect_reads()
#        self.uniq_fa()

    def open_dataset_files(self):
        """Open one fastq output per sample and read direction, plus "unknown".

        The outputs are stored in ``self.out_files`` under ``<sample>_R1`` /
        ``<sample>_R2`` keys; all R1 outputs are created before the R2 ones.
        """
        sample_names = self.runobj.samples.keys()
        for read_tag in ("_R1", "_R2"):
            for sample_name in sample_names:
                key = sample_name + read_tag
                fastq_path = os.path.join(self.out_file_path, key + ".fastq")
                self.out_files[key] = fq.FastQOutput(fastq_path)
        # Catch-all output, stored under the "unknown" key.
        unknown_path = os.path.join(self.out_file_path, "unknown.fastq")
        self.out_files["unknown"] = fq.FastQOutput(unknown_path)

    def close_dataset_files(self):
        """Close every output object stored in ``self.out_files``."""
        # ``values()`` works on both Python 2 and 3; a plain loop avoids
        # building a throwaway list just for the side effect.
        for out_file in self.out_files.values():
            out_file.close()
        return
      
#     def perfect_reads(self):
#         self.utils.print_both("Extract perfect V6 reads:")
#         for idx_key in self.runobj.samples.keys():
#             file_name = os.path.join(self.out_file_path, idx_key + ".ini")
#             program_name = C.perfect_overlap_cmd
#             if self.utils.is_local():
#                 program_name = C.perfect_overlap_cmd_local       
#             try:
#                 if self.runobj.samples[idx_key].primer_suite.lower().startswith('archaeal'):
#                     call([program_name, file_name, "--archaea"]) 
#                 else: 
#                     call([program_name, file_name])
#             except:
#                 self.utils.print_both("Problems with program_name = %s, file_name = %s" % (program_name, file_name))
#                 raise  
#     
    def call_sh_script(self, script_name_w_path, where_to_run):
        """Make a script group-executable and run it.

        On a local machine (``self.utils.is_local()``) the script is run
        directly with bash; otherwise it is submitted with ``qsub``.

        Args:
            script_name_w_path: full path of the script.
            where_to_run: working directory for the bash/qsub call.

        Raises:
            Exception: any error from the calls is logged and re-raised.
        """
        try:
            call(['chmod', '0774', script_name_w_path])
            if self.utils.is_local():
                # Log the command that is really executed (bash, not qsub).
                self.utils.print_both("call(['bash', script_name_w_path], cwd=(where_to_run))")
                call(['bash', script_name_w_path], cwd=(where_to_run))
            else:
                call(['qsub', script_name_w_path], cwd=(where_to_run))
        except Exception:
            self.utils.print_both("Problems with script_name = %s or qsub" % (script_name_w_path))
            raise
        
#     todo: combine and DRY with partial - it's the same command, different arguments
    def merge_perfect(self):
        """Write and launch the job-array script that merges perfect V6 reads.

        The merge program is run with "--marker-gene-stringent
        --retain-only-overlap --max-num-mismatches 0" on every "ini" file
        found for ``self.out_file_path``.

        Returns:
            the name of the generated script (without directory).
        """
        self.utils.print_both("merge perfect V6 reads:")
        merge_cmd = C.perfect_overlap_cmd
        if self.utils.is_local():
            merge_cmd = C.perfect_overlap_cmd_local
        merge_cmd = merge_cmd + " --marker-gene-stringent --retain-only-overlap --max-num-mismatches 0"

        run_dir = self.dirs.analysis_dir
        ini_files = self.dirs.get_all_files_by_ext(self.out_file_path, "ini")
        script_file_name = self.create_job_array_script(merge_cmd, run_dir, ini_files)
        self.call_sh_script(os.path.join(run_dir, script_file_name), run_dir)
        return script_file_name
    
    def trim_primers_perfect(self):
        """Write and launch the job-array script that trims primers.

        The trim program is run on every "_MERGED" file of the reads-overlap
        directory; " --archaea" is appended when any configured primer suite
        starts with "archaeal" (case-insensitive).

        Returns:
            the name of the generated script (without directory).
        """
        self.utils.print_both("trim primers from perfect V6 reads:")

        overlap_dir = self.dirs.reads_overlap_dir
        merged_files = self.dirs.get_all_files_by_ext(overlap_dir, "_MERGED")
        archaeal_flags = [suite.lower().startswith("archaeal")
                          for suite in self.get_config_values('primer_suite')]
        extra_arg = " --archaea" if any(archaeal_flags) else ""
        trim_cmd = C.trim_primers_cmd + extra_arg

        script_file_name = self.create_job_array_script(trim_cmd, overlap_dir, merged_files)
        self.call_sh_script(os.path.join(overlap_dir, script_file_name), overlap_dir)
        return script_file_name

    """    
    def perfect_reads_cluster(self):
        '''
        iu-merge-pairs anna.ini --marker-gene-stringent --retain-only-overlap --max-num-mismatches 0
            Each flag is critical. marker-gene-stringent looks complete overlaps, retain-only-overlap gets rid of adapters, max-num-mismatches retains only perfect overlaps. 
            This generates the test_MERGED file with all complete overlaps without any mismatches. But it has all the primers. 
            Then we process this file with the new and shiny iu-analyze-v6-complete-overlaps script:
        iu-trim-V6-primers test_MERGED

        '''
        self.utils.print_both("Extract perfect V6 reads:")
        script_file_name      = self.merge_perfect()
        trim_script_file_name = self.trim_primers_perfect()

        return (script_file_name, trim_script_file_name)    
    """          
                              
    def partial_overlap_reads_cluster(self):
        """Write and launch the job-array script for partial-overlap merging.

        The program always gets "--enforce-Q30-check"; when "ITS1" is among
        the configured dna regions "--marker-gene-stringent" is added as
        well. Afterwards ``chmod_all`` is applied to the analysis directory.

        Returns:
            the name of the generated script (without directory).
        """
        self.utils.print_both("Extract partial_overlap V4V5 reads:")
        overlap_cmd = C.partial_overlap_cmd
        if self.utils.is_local():
            overlap_cmd = C.partial_overlap_cmd_local

        dna_regions = self.get_config_values('dna_region')
        stringent_arg = "--marker-gene-stringent" if "ITS1" in list(dna_regions) else ""
        command_line = overlap_cmd + " --enforce-Q30-check " + stringent_arg

        run_dir = self.dirs.analysis_dir
        ini_files = self.dirs.get_all_files_by_ext(self.out_file_path, "ini")
        script_file_name = self.create_job_array_script(command_line, run_dir, ini_files)
        self.call_sh_script(os.path.join(run_dir, script_file_name), run_dir)
        self.utils.print_both("self.dirs.chmod_all(%s)" % (run_dir))
        self.dirs.chmod_all(run_dir)

        return script_file_name
                    
    def partial_overlap_reads(self):
        """Run the partial-overlap merge program once per sample, in-process.

        Each sample's ``<idx_key>.ini`` file is processed with
        "--enforce-Q30-check"; samples whose dna region is "ITS1" also get
        "--marker-gene-stringent".

        Raises:
            Exception: the traceback and program name are logged, then the
                error is re-raised.
        """
        self.utils.print_both("Extract partial_overlap V4V5 reads:")
        for idx_key in self.runobj.samples.keys():
            ini_file_name = os.path.join(self.out_file_path, idx_key + ".ini")
            program_name = C.partial_overlap_cmd
            if self.utils.is_local():
                program_name = C.partial_overlap_cmd_local

            cmd_args = [program_name, "--enforce-Q30-check"]
            try:
                if self.runobj.samples[idx_key].dna_region == "ITS1":
                    cmd_args.append("--marker-gene-stringent")
                cmd_args.append(ini_file_name)
                call(cmd_args)
            except Exception:
                self.utils.print_both(traceback.format_exc())
                self.utils.print_both("Problems with program_name = %s" % (program_name))
                raise
                
#             print "HERE: program_name = " % (program_name)   
#             call([program_name, "--fast-merge", "--compute-qual-dicts", ini_file_name, idx_key])
            
    def get_config_values(self, key):
        """Return the set of values stored under ``key`` across the config.

        Every value of ``self.runobj.configPath`` is inspected; those that
        do not have ``key`` are skipped.
        """
        found = set()
        for section in self.runobj.configPath.values():
            if key in section.keys():
                found.add(section[key])
        return found
        
    def make_users_email(self):
        """Return ``<login name>@mbl.edu`` for the user running the pipeline."""
        return getpass.getuser() + "@mbl.edu"
                
    def create_job_array_script(self, command_line, dir_to_run, files_list):
        """Write an SGE job-array bash script that runs a command once per file.

        The script is named ``<command basename>_<run>_<lane_name>.sh`` and
        is written into ``dir_to_run`` through ``self.open_write_close``.
        It declares one array task per entry of ``files_list``; each task
        echoes and then runs ``command_line`` on its own file.

        Args:
            command_line: program plus arguments; the first space-separated
                token provides the script's base name.
            dir_to_run: directory the script is written to.
            files_list: file names, one per array task.

        Returns:
            the script file name (without directory).
        """
        task_count = len(files_list)
        program_base_name = os.path.basename(command_line.split(" ")[0])
        script_file_name = "%s_%s_%s.sh" % (program_base_name, self.runobj.run, self.runobj.lane_name)
        log_file_name = script_file_name + ".sge_script.sh.log"
        email_mbl = self.make_users_email()

        # One entry per script line so that significant trailing blanks stay
        # visible; the final empty entry yields the closing newline.
        script_lines = [
            '#!/bin/bash',
            '#$ -cwd',
            '#$ -S /bin/bash',
            '#$ -N %s',
            '# Giving the name of the output log file',
            '#$ -o %s',
            '# Combining output/error messages into one file',
            '#$ -j y',
            '# Send mail to these users',
            '#$ -M %s',
            '# Send mail at job end; -m eas sends on end, abort, suspend.',
            '#$ -m eas',
            '#$ -t 1-%s',
            '# Now the script will iterate %s times.',
            '',
            '  file_list=(%s)',
            '  ',
            '  i=$(expr $SGE_TASK_ID - 1)',
            '#   echo "i = $i"',
            '  # . /etc/profile.d/modules.sh',
            '  # . /xraid/bioware/bioware-loader.sh',
            '  . /xraid/bioware/Modules/etc/profile.modules',
            '  module load bioware',
            '    ',
            '  echo "%s ${file_list[$i]}"  ',
            '  %s ${file_list[$i]}  ',
            '',
        ]
        text = "\n".join(script_lines) % (
            script_file_name,
            log_file_name,
            email_mbl,
            task_count,
            task_count,
            " ".join(files_list),
            command_line,
            command_line,
        )
        self.open_write_close(os.path.join(dir_to_run, script_file_name), text)
        return script_file_name

    def filter_mismatches_cluster(self, max_mismatch = 3):
        """Write and launch the job-array script that filters mismatches.

        The filter program is run on every "_MERGED" file of the
        reads-overlap directory, after which ``chmod_all`` is applied to
        that directory.

        Args:
            max_mismatch: only used in the log message here; it is not
                passed to the filter program.

        Returns:
            the name of the generated script (without directory).
        """
        self.utils.print_both("Filter mismatches if more then %s" % (max_mismatch))
        filter_cmd = C.filter_mismatch_cmd
        if self.utils.is_local():
            filter_cmd = C.filter_mismatch_cmd_local

        overlap_dir = self.dirs.reads_overlap_dir
        merged_files = self.dirs.get_all_files_by_ext(overlap_dir, "_MERGED")
        script_file_name = self.create_job_array_script(filter_cmd, overlap_dir, merged_files)
        self.call_sh_script(os.path.join(overlap_dir, script_file_name), overlap_dir)
        self.utils.print_both("self.dirs.chmod_all(%s)" % (overlap_dir))
        self.dirs.chmod_all(overlap_dir)

        return script_file_name

    def filter_mismatches(self, max_mismatch = 3):
        """Run the mismatch filter in-process on every "_MERGED" file.

        The files come from ``self.dirs.get_all_files()``; an entry is
        processed when its first element ends with "_MERGED".

        Args:
            max_mismatch: only used in the log message here; it is not
                passed to the filter program.
        """
        self.utils.print_both("Filter mismatches if more then %s" % (max_mismatch))
        all_files = self.dirs.get_all_files()
        for path in all_files.keys():
            if not all_files[path][0].endswith('_MERGED'):
                continue
            filter_cmd = C.filter_mismatch_cmd
            if self.utils.is_local():
                filter_cmd = C.filter_mismatch_cmd_local
            call([filter_cmd, path])

    def uniq_fa_cluster(self):
        """Write and launch the job-array script that uniques fasta files.

        Input files are looked up in the reads-overlap directory by
        ``C.filtered_suffix`` first, then ".fa", then
        "MERGED_V6_PRIMERS_REMOVED"; the first non-empty result is used.
        Afterwards ``chmod_all`` is applied to that directory.

        Returns:
            the name of the generated script (without directory).
        """
        self.utils.print_both("Uniqueing fasta files")
        unique_cmd = C.fastaunique_cmd
        if self.utils.is_local():
            unique_cmd = C.fastaunique_cmd_local
        overlap_dir = self.dirs.reads_overlap_dir

        # Try the extensions in order of preference until one matches.
        for extension in (C.filtered_suffix, ".fa", "MERGED_V6_PRIMERS_REMOVED"):
            file_list = self.dirs.get_all_files_by_ext(overlap_dir, extension)
            if len(file_list) != 0:
                break

        script_file_name = self.create_job_array_script(unique_cmd, overlap_dir, file_list)
        self.call_sh_script(os.path.join(overlap_dir, script_file_name), overlap_dir)
        self.utils.print_both("self.dirs.chmod_all(%s)" % (overlap_dir))
        self.dirs.chmod_all(overlap_dir)
        return script_file_name
                                       
    def uniq_fa(self):
        """
        Run the fasta "unique" command on every fasta / filtered file.

        A file from self.dirs.get_all_files() qualifies when its second element
        (the extension) equals ".fa" or its first element (the base name) ends
        with C.filtered_suffix. Each qualifying file is passed on its own to
        C.fastaunique_cmd (C.fastaunique_cmd_local when running locally).
        """
        self.utils.print_both("Uniqueing fasta files")

        # The command does not depend on the file, so it is chosen once.
        program_name = C.fastaunique_cmd
        if self.utils.is_local():
            program_name = C.fastaunique_cmd_local

        files = self.dirs.get_all_files()
        for full_name in files:
            file_base = files[full_name][0]
            file_extension = files[full_name][1]
            if file_extension == ".fa" or file_base.endswith(C.filtered_suffix):
                call([program_name, full_name])

    def get_primers(self):
        """
        Map every sample key to its (proximal_primer, distal_primer) pair.

        The primer suite name of each sample is lower-cased and looked up in
        C.primers_dict. An unknown suite is reported through print_both and
        that sample gets a pair of empty strings.

        Returns a dictionary {idx_key: (proximal_primer, distal_primer)}.
        """
        primers = {}
        for idx_key in self.runobj.samples.keys():
            primer_suite = self.runobj.samples[idx_key].primer_suite.lower()

            # Reset for each sample so that an unknown suite never inherits
            # the primers found for a previously processed sample.
            proximal_primer = ""
            distal_primer   = ""
            if primer_suite in C.primers_dict:
                proximal_primer = C.primers_dict[primer_suite]["proximal_primer"]
                distal_primer   = C.primers_dict[primer_suite]["distal_primer"]
            else:
                self.utils.print_both("ERROR! Something wrong with the primer suite name: %s. NB: For v6mod it suppose to be 'Archaeal V6mod Suite'\n" % (primer_suite))
            primers[idx_key] = (proximal_primer, distal_primer)

        return primers
        
    def create_inis(self):
        """
        Write one "<idx_key>.ini" file per sample into self.out_file_path.

        Each file gets a [general] section (project name, researcher email,
        input and output directories) and a [files] section naming the
        "<idx_key>_R1.fastq" / "<idx_key>_R2.fastq" pair. When
        self.runobj.do_perfect is false, a [prefixes] section built from the
        run key and the sample's primers is appended (partial overlap,
        v4v5 miseq illumina).
        """
        # The primers are the same for every iteration, so they are looked up
        # once, and only when the [prefixes] section is actually needed.
        needs_prefixes = not self.runobj.do_perfect
        primers = self.get_primers() if needs_prefixes else {}

        for idx_key in self.runobj.samples.keys():
            # todo: check if works w/o NNNN when there is a proper csv
            # The second "_"-separated part of the key is the run key; "N"
            # becomes "." so it can be used inside the prefix pattern.
            run_key = idx_key.split('_')[1].replace("N", ".")
            email = self.runobj.samples[idx_key].email
            text = (
                "[general]\n"
                "project_name = %s\n"
                "researcher_email = %s\n"
                "input_directory = %s\n"
                "output_directory = %s\n"
                "\n"
                "[files]\n"
                "pair_1 = %s\n"
                "pair_2 = %s\n"
            ) % (idx_key, email, self.out_file_path, self.results_path, idx_key + "_R1.fastq", idx_key + "_R2.fastq")

            if needs_prefixes:
                text += (
                    "\n"
                    "# following section is optional\n"
                    "[prefixes]\n"
                    "pair_1_prefix = ^" + run_key + primers[idx_key][0]
                    + "\npair_2_prefix = ^" + primers[idx_key][1]
                )

            ini_file_name = os.path.join(self.out_file_path, idx_key + ".ini")
            self.open_write_close(ini_file_name, text)

    def open_write_close(self, script_file_name, text):
        """
        Write text to script_file_name, replacing any previous content.

        The file is closed even when the write fails.
        """
        with open(script_file_name, "w") as out_file:
            out_file.write(text)
 
    def get_fastq_file_names(self, f_input_file_path):
        """
        Walk f_input_file_path and collect the fastq files of both reads.

        Only the names accepted by get_correct_file_names are considered, in
        sorted order within each directory. A name goes to the read 1 list
        when '_R1_' occurs after its first character, to the read 2 list when
        '_R2_' does; any other name is reported on stderr.

        Returns a tuple (in_files_r1, in_files_r2) of full paths.
        """
        # TODO: exclude dir with new created files from the loop
        read1_paths = []
        read2_paths = []
        for current_dir, _subdirs, names in os.walk(f_input_file_path):
            for name in sorted(self.get_correct_file_names(names)):
                if name.find('_R1_') > 0:
                    read1_paths.append(os.path.join(current_dir, name))
                elif name.find('_R2_') > 0:
                    read2_paths.append(os.path.join(current_dir, name))
                else:
                    sys.stderr.write("No read number in the file name: %s\n" % name)
        self.utils.print_both("FFF0: in_files_r1 %s\n, in_files_r2 %s" % (read1_paths, read2_paths))
        return (read1_paths, read2_paths)
    
    def get_correct_file_names(self, filenames):
        """
        Keep only the file names that belong to this run.

        A name is kept when at least one entry of self.runobj.run_keys starts
        with the index that get_index extracts from the name.

        Returns the kept names as a set.
        """
        def belongs_to_run(file_name):
            index_sequence = self.get_index(file_name)
            return any(run_key.startswith(index_sequence) for run_key in self.runobj.run_keys)

        return set(file_name for file_name in filenames if belongs_to_run(file_name))
        
    def read1(self, files_r1, compressed):
        """ loop through the fastq_file_names
            1) e.pair_no = 1, find run_key -> dataset name
            2) collect the relevant part of id

            Every entry is read with trim_to = C.trimming_length. An entry
            with pair number 1 whose "<index>_<run key>_<lane>_R1" name is
            present in self.out_files is stored there, and the header line
            expected for its read 2 mate is remembered in
            self.id_dataset_idx. Entries with any other pair number go to
            the "unknown" output.
        """
        # Whether the run keys carry the "NNNN" prefix depends only on the
        # run, so it is decided once instead of once per entry.
        # todo: a fork with or without NNNN, add an argument
        has_ns = any("NNNN" in s for s in self.runobj.run_keys)

        for file_r1 in files_r1:
            self.utils.print_both("====\nFFF1: file %s" % file_r1)
            f_input  = fq.FastQSource(file_r1, compressed)
            index_sequence = self.get_index(file_r1)
            while f_input.next(trim_to = C.trimming_length):
                e = f_input.entry
                ini_run_key = index_sequence + "_" + self.get_run_key(e.sequence, has_ns) + "_" + e.lane_number
                if int(e.pair_no) != 1:
                    self.out_files["unknown"].store_entry(e)
                    continue

                dataset_file_name_base_r1 = ini_run_key + "_R1"
                # NOTE(review): a pair 1 entry whose name is not in
                # self.out_files is dropped, not stored in "unknown" -
                # confirm this is intended.
                if dataset_file_name_base_r1 in self.out_files:
                    self.out_files[dataset_file_name_base_r1].store_entry(e)
                    # TODO: make a method:
                    # Build the header line of the mate: same first field,
                    # " 2:" and the rest of the second field after its
                    # first ":"-separated part.
                    header_parts = e.header_line.split()
                    short_id1 = header_parts[0]
                    short_id2 = ":".join(header_parts[1].split(":")[1:])
                    id2 = short_id1 + " 2:" + short_id2
                    self.id_dataset_idx[id2] = ini_run_key
                    
    # def truncate_seq(self, seq):
    #     return seq[:C.trimming_length]
    
                    
    def get_run_key(self, e_sequence, has_ns = "True"):
        """
        Extract the run key from the beginning of a read sequence.

        With a true has_ns the key is "NNNN" followed by the characters at
        positions 4-8 of the sequence; otherwise it is the first five
        characters.
        """
        if not has_ns:
            return e_sequence[0:5]
        return "NNNN" + e_sequence[4:9]
    
    def remove_end_ns_strip(self, e_sequence):
        """
        Return e_sequence without its trailing run of 'N' characters.

        A sequence that does not end with 'N' is returned unchanged.
        """
        return e_sequence.rstrip('N')
        
    def read2(self, files_r2, compressed):
        """
        3) e.pair_no = 2, find id from 2), assign dataset_name

        Every entry is read with trim_to = C.trimming_length. An entry with
        pair number 2 whose header line is a key of self.id_dataset_idx is
        stored in the "<run key>_R2" output; every other entry goes to the
        "unknown" output.
        """
        for file_r2 in files_r2:
            self.utils.print_both("FFF2: file %s" % file_r2)
            reader = fq.FastQSource(file_r2, compressed)
            while reader.next(trim_to = C.trimming_length):
                entry = reader.entry
                if (int(entry.pair_no) == 2) and (entry.header_line in self.id_dataset_idx):
                    target = self.id_dataset_idx[entry.header_line] + "_R2"
                else:
                    target = "unknown"
                self.out_files[target].store_entry(entry)

    def get_index(self, file_r1):
        """
        Return the index part of a fastq file name.

        The base name is split on "_". The index is the first part, unless
        that part starts with "IDX", in which case the second part is the
        actual index.
        """
        parts = os.path.basename(file_r1).split("_")
        return parts[1] if parts[0].startswith("IDX") else parts[0]

Пример #15

Показать файл

Файл: illumina_files.py Проект: icefoxx/py_mbl_sequencing_pipeline

class IlluminaFiles:
    """
    Split CASAVA fastq output into per-dataset files and run the overlap tools.

    0) from run create all dataset_lines names files in output dir
    1) split fastq files from casava into files with dataset_names
    2) create ini files
    3) process them through Meren's script
    4) result - files dataset_lane-PERFECT_reads.fa.unique with frequencies - to process with env454upload()
    """

    def __init__(self, runobj):
        """
        Store the run object and resolve the working directories.

        runobj -- run description; input_dir, vamps_user_upload, run,
                  platform and lane_name are read here (plus site and user
                  for VAMPS user uploads).
        """
        self.utils          = PipelneUtils()
        self.runobj         = runobj
        self.out_files      = {}    # file name base ("<key>_R1", "unknown", ...) -> fq.FastQOutput
        self.id_dataset_idx = {}    # expected read 2 header line -> key of the matching read 1
        self.in_file_path   = self.runobj.input_dir

        if self.runobj.vamps_user_upload:
            site       = self.runobj.site
            dir_prefix = self.runobj.user + '_' + self.runobj.run
        else:
            site       = ''
            dir_prefix = self.runobj.run
        lane_name = self.runobj.lane_name or ''

        dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site)

        self.out_file_path = dirs.check_dir(dirs.analysis_dir)
        self.results_path  = dirs.check_dir(dirs.reads_overlap_dir)

    def split_files(self, compressed = False):
        """
        Distribute the input fastq entries over the dataset files, write the
        ini files and close the dataset files.

        TODO: *) fastq_file_names method to collect all file_names with full path or directories_names (see get_all_files()?)
        """
        (in_files_r1, in_files_r2) = self.get_fastq_file_names(self.in_file_path)
        self.read1(in_files_r1, compressed)
        self.read2(in_files_r2, compressed)
        self.create_inis()
        self.close_dataset_files()

    def open_dataset_files(self):
        """
        Open one "<key>_R1.fastq" and one "<key>_R2.fastq" output per sample,
        plus "unknown.fastq", all in self.out_file_path.
        """
        sample_keys = self.runobj.samples.keys()
        file_name_base = [key + "_R1" for key in sample_keys] + [key + "_R2" for key in sample_keys]
        for f_name in file_name_base:
            output_file = os.path.join(self.out_file_path, f_name + ".fastq")
            self.out_files[f_name] = fq.FastQOutput(output_file)
        self.out_files["unknown"] = fq.FastQOutput(os.path.join(self.out_file_path, "unknown" + ".fastq"))

    def close_dataset_files(self):
        """Close every output registered in self.out_files."""
        for out_file in self.out_files.values():
            out_file.close()

    def get_all_files(self):
        """
        Walk self.out_file_path recursively.

        Returns a dictionary {full_name: (full name without extension, extension)}.
        """
        files = {}
        for dirname, _dirnames, filenames in os.walk(self.out_file_path):
            for file_name in filenames:
                full_name = os.path.join(dirname, file_name)
                files[full_name] = os.path.splitext(full_name)
        return files

    def perfect_reads(self):
        """
        Run the perfect overlap command on the ini file of every sample.

        "--archaea" is added for samples whose primer suite starts with
        'Archaeal'. A failure is reported and re-raised.
        """
        print("Extract perfect V6 reads:")
        # The command does not depend on the sample, so it is chosen once.
        program_name = C.perfect_overlap_cmd
        if self.utils.is_local():
            program_name = C.perfect_overlap_cmd_local
        for idx_key in self.runobj.samples.keys():
            file_name = os.path.join(self.out_file_path, idx_key + ".ini")
            try:
                command = [program_name, file_name]
                if self.runobj.samples[idx_key].primer_suite.startswith('Archaeal'):
                    command.append("--archaea")
                call(command)
            except:
                print("Problems with program_name = %s, file_name = %s" % (program_name, file_name))
                raise

    def partial_overlap_reads(self):
        """
        Run the partial overlap command, with "--fast-merge" and
        "--compute-qual-dicts", on the ini file of every sample.
        """
        print("Extract partial_overlap V4V5 reads:")
        program_name = C.partial_overlap_cmd
        if self.utils.is_local():
            program_name = C.partial_overlap_cmd_local
        for idx_key in self.runobj.samples.keys():
            ini_file_name = os.path.join(self.out_file_path, idx_key + ".ini")
            call([program_name, "--fast-merge", "--compute-qual-dicts", ini_file_name, idx_key])

    def filter_mismatches(self, max_mismatch = 3):
        """
        Run the mismatch filter command on every file below self.out_file_path
        whose name without extension ends with '_MERGED'.

        max_mismatch -- only printed; it is not forwarded to the command.
        """
        print("Filter mismatches if more then %s" % (max_mismatch))
        program_name = C.filter_mismatch_cmd
        if self.utils.is_local():
            program_name = C.filter_mismatch_cmd_local
        files = self.get_all_files()
        for full_name in files:
            if files[full_name][0].endswith('_MERGED'):
                # NOTE(review): the flag and its value travel as ONE argv
                # element ("--output <path>"); confirm the command accepts
                # that form.
                output_flag = "--output " + full_name + "_FILTERED"
                call([program_name, full_name, output_flag])

    def uniq_fa(self):
        """
        Run the fasta "unique" command on every file below self.out_file_path
        that has the ".fa" extension or whose name without extension ends with
        '_MERGED_FILTERED'.
        """
        print("Uniqueing fasta files")
        program_name = C.fastaunique_cmd
        if self.utils.is_local():
            program_name = C.fastaunique_cmd_local
        files = self.get_all_files()
        for full_name in files:
            if files[full_name][1] == ".fa" or files[full_name][0].endswith('_MERGED_FILTERED'):
                call([program_name, full_name])

    def create_inis(self):
        """
        Write one "<idx_key>.ini" file per sample into self.out_file_path.

        Each file gets a [general] and a [files] section. When
        self.runobj.do_perfect is false, a [prefixes] section with fixed
        primer patterns is appended (v4v5 miseq illumina).
        """
        for idx_key in self.runobj.samples.keys():
            # The second "_"-separated part of the key is the run key; "N"
            # becomes "." so it can be used inside the prefix pattern.
            run_key = idx_key.split('_')[1].replace("N", ".")
            email = self.runobj.samples[idx_key].email
            text = (
                "[general]\n"
                "project_name = %s\n"
                "researcher_email = %s\n"
                "input_directory = %s\n"
                "output_directory = %s\n"
                "\n"
                "[files]\n"
                "pair_1 = %s\n"
                "pair_2 = %s\n"
            ) % (idx_key, email, self.out_file_path, self.results_path, idx_key + "_R1.fastq", idx_key + "_R2.fastq")

            # That's for v4v5 miseq illumina
            if not self.runobj.do_perfect:
                text += (
                    "\n"
                    "# following section is optional\n"
                    "[prefixes]\n"
                    "pair_1_prefix = ^" + run_key + "CCAGCAGC[C,T]GCGGTAA.\n"
                    "pair_2_prefix = ^CCGTC[A,T]ATT[C,T].TTT[G,A]A.T\n"
                    "                "
                )

            ini_file_name = os.path.join(self.out_file_path, idx_key + ".ini")
            self.open_write_close(ini_file_name, text)

    def open_write_close(self, ini_file_name, text):
        """
        Write text to ini_file_name, replacing any previous content.

        The file is closed even when the write fails.
        """
        with open(ini_file_name, "w") as ini_file:
            ini_file.write(text)

    def get_fastq_file_names(self, f_input_file_path):
        """
        Walk f_input_file_path and collect the fastq files of both reads.

        A name goes to the read 1 list when '_R1_' occurs after its first
        character, to the read 2 list when '_R2_' does; any other name is
        reported on stderr.

        Returns a tuple (in_files_r1, in_files_r2) of full paths.
        """
        # TODO: exclude dir with new created files from the loop
        in_files_r1 = []
        in_files_r2 = []
        for dirname, _dirnames, filenames in os.walk(f_input_file_path):
            for filename in filenames:
                if filename.find('_R1_') > 0:
                    in_files_r1.append(os.path.join(dirname, filename))
                elif filename.find('_R2_') > 0:
                    in_files_r2.append(os.path.join(dirname, filename))
                else:
                    sys.stderr.write("No read number in the file name: %s\n" % filename)
        return (in_files_r1, in_files_r2)

    def read1(self, files_r1, compressed):
        """ loop through the fastq_file_names
            1) e.pair_no = 1, find run_key -> dataset name
            2) collect the relevant part of id

            An entry with pair number 1 whose "<index>_NNNN<5 bases>_<lane>"
            key is a sample key is stored in the "<key>_R1" output when that
            output exists, and the header line expected for its read 2 mate
            is remembered in self.id_dataset_idx. All other entries go to the
            "unknown" output.
        """
        for file_r1 in files_r1:
            print("FFF1: file %s" % file_r1)
            index_sequence = self.get_index(file_r1)
            f_input  = fq.FastQSource(file_r1, compressed)
            while f_input.next():
                e = f_input.entry
                ini_run_key = index_sequence + "_" + "NNNN" + e.sequence[4:9] + "_" + e.lane_number
                # Plain dict membership: no key list is built per entry.
                if ini_run_key in self.runobj.samples and int(e.pair_no) == 1:
                    dataset_file_name_base_r1 = ini_run_key + "_R1"
                    if dataset_file_name_base_r1 in self.out_files:
                        self.out_files[dataset_file_name_base_r1].store_entry(e)
                        # TODO: make a method:
                        header_parts = e.header_line.split()
                        short_id1 = header_parts[0]
                        short_id2 = ":".join(header_parts[1].split(":")[1:])
                        id2 = short_id1 + " 2:" + short_id2
                        self.id_dataset_idx[id2] = ini_run_key
                else:
                    self.out_files["unknown"].store_entry(e)

    def read2(self, files_r2, compressed):
        """
        3) e.pair_no = 2, find id from 2), assign dataset_name

        An entry with pair number 2 whose header line is a key of
        self.id_dataset_idx is stored in the "<key>_R2" output; every other
        entry goes to the "unknown" output.
        """
        for file_r2 in files_r2:
            print("FFF2: file %s" % file_r2)
            f_input  = fq.FastQSource(file_r2, compressed)
            while f_input.next():
                e = f_input.entry
                if (int(e.pair_no) == 2) and (e.header_line in self.id_dataset_idx):
                    file_name = self.id_dataset_idx[e.header_line] + "_R2"
                    self.out_files[file_name].store_entry(e)
                else:
                    self.out_files["unknown"].store_entry(e)

    def get_index(self, file_r1):
        """
        Return the index part of a fastq file name.

        The base name is split on "_". The index is the first part, unless
        that part starts with "IDX", in which case the second part is the
        actual index.
        """
        file_name_parts = os.path.basename(file_r1).split("_")
        index = file_name_parts[0]
        if file_name_parts[0].startswith("IDX"):
            index = file_name_parts[1]
        return index

Пример #16

Показать файл

Файл: chimera.py Проект: avoorhis/py_mbl_sequencing_pipeline

    def __init__(self, runobj=None):
        """
        Set up suffixes, cluster settings, directories and tool paths for the
        chimera checking of one run.

        runobj -- run object; despite the None default it is dereferenced
                  right away, so a real object is required.
        """
        self.utils = PipelneUtils()
        self.runobj = runobj
        self.run_keys = self.runobj.run_keys
        self.rundate = self.runobj.run
        # A run object that cannot provide use_cluster defaults to True.
        try:
            self.use_cluster = self.runobj.use_cluster
        except Exception:
            self.use_cluster = True
        self.chg_suffix = ".chg"
        self.chimeras_suffix = ".chimeras"
        self.ref_suffix = ".db"
        self.denovo_suffix = ".txt"
        self.nonchimeric_suffix = "." + C.nonchimeric_suffix  #".nonchimeric.fa"
        self.chimeric_suffix = ".chimeric.fa"
        self.base_suffix = "unique" + self.chimeras_suffix

        self.cluster_slots = {
            "grendel": [12, 8],
            "cricket": [40],
            "cluster5": [32]
        }

        # A missing or empty lane name becomes ''.
        try:
            lane_name = self.runobj.lane_name or ''
        except Exception:
            lane_name = ''

        vamps_user_upload = self.runobj.vamps_user_upload
        if vamps_user_upload:
            # Environment for the SGE command line tools.
            os.environ['SGE_ROOT'] = '/opt/sge'
            os.environ['SGE_CELL'] = 'grendel'
            path = os.environ['PATH']
            os.environ['PATH'] = '/opt/sge/bin/lx24-amd64:' + path
            site = self.runobj.site
            dir_prefix = self.runobj.user + '_' + self.runobj.run
        else:
            site = ''
            dir_prefix = self.runobj.run

        self.dirs = Dirs(vamps_user_upload,
                         dir_prefix,
                         self.runobj.platform,
                         lane_name=lane_name,
                         site=site)

        if vamps_user_upload:
            # Read the trim status file through a context manager so the
            # handle is closed even when parsing fails.
            with open(self.runobj.trim_status_file_name, "r") as status_file:
                trim_status = json.load(status_file)
            self.idx_keys = convert_unicode_dictionary_to_str(trim_status)["new_lane_keys"]
            self.analysis_dir = self.dirs.check_dir(self.dirs.analysis_dir)
            self.indir = self.dirs.check_dir(self.dirs.trimming_dir)
            self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)
            self.gast_dir = self.dirs.check_dir(self.dirs.gast_dir)
        else:
            self.indir = self.dirs.check_dir(self.dirs.reads_overlap_dir)
            self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)

        is_local = self.utils.is_local()
        self.usearch_cmd = C.usearch6_cmd_local if is_local else C.usearch6_cmd
        self.refdb = C.chimera_checking_refdb
        if is_local:
            # refdb_local only exists on local runs.
            self.refdb_local = C.chimera_checking_refdb_local
        self.its_refdb = C.chimera_checking_its_refdb
        self.input_file_names = self.make_chimera_input_illumina_file_names()

Пример #17

Показать файл

Файл: illumina_files.py Проект: msGenDev/py_mbl_sequencing_pipeline

class IlluminaFiles:
    """
    0) from run create all dataset_lines names files in output dir
    1) split fastq files from casava into files with dataset_names
    2) create ini files 
    3) process them through Meren's script
    4) result - files dataset_lane-PERFECT_reads.fa.unique with frequencies - to process with env454upload()    
    
    """
    def __init__(self, runobj):
        """
        Store the run object and resolve the working directories.

        runobj -- run description; input_dir, vamps_user_upload, run,
                  platform and lane_name are read here (plus site and user
                  for VAMPS user uploads).
        """
        self.utils = PipelneUtils()
        self.runobj = runobj
        self.out_files = {}
        self.id_dataset_idx = {}
        self.in_file_path = self.runobj.input_dir

        # User uploads get a "<user>_<run>" directory prefix and a site.
        if self.runobj.vamps_user_upload:
            upload_site = self.runobj.site
            prefix = self.runobj.user + '_' + self.runobj.run
        else:
            upload_site = ''
            prefix = self.runobj.run
        lane = self.runobj.lane_name or ''

        self.dirs = Dirs(self.runobj.vamps_user_upload, prefix, self.runobj.platform, lane_name = lane, site = upload_site)
        self.out_file_path = self.dirs.check_dir(self.dirs.analysis_dir)
        self.results_path = self.dirs.check_dir(self.dirs.reads_overlap_dir)
        
    def split_files(self, compressed = False):
        """
        Distribute the input fastq entries over the dataset files.

        Collects the read 1 / read 2 file names below self.in_file_path,
        processes read 1 and then read 2, writes the ini files and finally
        closes the dataset files.

        TODO: *) fastq_file_names method to collect all file_names with full path or directories_names (see get_all_files()?)
        """
        r1_paths, r2_paths = self.get_fastq_file_names(self.in_file_path)
        self.read1(r1_paths, compressed)
        self.read2(r2_paths, compressed)
        self.create_inis()
        self.close_dataset_files()

#        self.perfect_reads()
#        self.uniq_fa()

    def open_dataset_files(self):
        """
        Open the fastq outputs in self.out_file_path and register them in
        self.out_files: "<key>_R1" for every sample, then "<key>_R2" for
        every sample, then "unknown".
        """
        for read_tag in ("_R1", "_R2"):
            for sample_key in self.runobj.samples.keys():
                base_name = sample_key + read_tag
                self.out_files[base_name] = fq.FastQOutput(os.path.join(self.out_file_path, base_name + ".fastq"))
        unknown_path = os.path.join(self.out_file_path, "unknown" + ".fastq")
        self.out_files["unknown"] = fq.FastQOutput(unknown_path)

    def close_dataset_files(self):
        """Close every output registered in self.out_files."""
        # A plain loop over the values: no throw-away list of None results,
        # and no dependency on the Python 2 only iteritems().
        for out_file in self.out_files.values():
            out_file.close()
   
    def get_all_files(self):
        """
        Walk self.out_file_path recursively.

        Returns a dictionary {full_name: (full name without extension, extension)}.
        """
        found = {}
        for current_dir, _subdirs, names in os.walk(self.out_file_path):
            for name in names:
                path = os.path.join(current_dir, name)
                found[path] = os.path.splitext(path)
        return found
    
    def perfect_reads(self):
        """
        Run the perfect overlap command on the ini file of every sample.

        "--archaea" is added for samples whose primer suite starts with
        'Archaeal'. A failure is reported on stdout and re-raised.
        """
        print("Extract perfect V6 reads:")
        for idx_key in self.runobj.samples.keys():
            file_name = os.path.join(self.out_file_path, idx_key + ".ini")
            program_name = C.perfect_overlap_cmd
            if self.utils.is_local():
                program_name = C.perfect_overlap_cmd_local
            try:
                command = [program_name, file_name]
                if self.runobj.samples[idx_key].primer_suite.startswith('Archaeal'):
                    command.append("--archaea")
                call(command)
            except:
                print("Problems with program_name = %s, file_name = %s" % (program_name, file_name))
                raise
    
    def call_sh_script(self, script_name_w_path, where_to_run):
        """
        Make the script group-writable/executable (mode 0774) and submit it
        with qsub, using where_to_run as the working directory.

        A failure to launch either command is reported on stdout and
        re-raised; the exit codes of the commands are not inspected.
        """
        chmod_command = ['chmod', '0774', script_name_w_path]
        qsub_command = ['qsub', script_name_w_path]
        try:
            call(chmod_command)
            call(qsub_command, cwd=where_to_run)
        except:
            print("Problems with script_name = %s" % (script_name_w_path))
            raise
        
    def perfect_reads_cluster(self):
        """
        Submit a job array script that runs the perfect overlap command on
        every "ini" file found for self.out_file_path.

        " --archaea" is appended to the command when any configured primer
        suite contains "Archaeal".

        Returns the name (without path) of the generated script.
        """
        print("Extract perfect V6 reads:")
        overlap_cmd = C.perfect_overlap_cmd
        if self.utils.is_local():
            overlap_cmd = C.perfect_overlap_cmd_local

        suites = self.get_config_values('primer_suite')
        extra_arg = " --archaea" if any("Archaeal" in s for s in suites) else ""
        full_command = overlap_cmd + extra_arg

        ini_files = self.dirs.get_all_files_by_ext(self.out_file_path, "ini")
        script_name = self.create_job_array_script(full_command, self.dirs.analysis_dir, ini_files)
        script_path = os.path.join(self.dirs.analysis_dir, script_name)
        self.call_sh_script(script_path, self.dirs.analysis_dir)
        return script_name
                          
    def partial_overlap_reads_cluster(self):
        """
        Submit a job array script that runs the partial overlap command, with
        "--enforce-Q30-check", on every "ini" file found for
        self.out_file_path.

        "--marker-gene-stringent" is appended when "ITS1" is among the
        configured dna regions.

        Returns the name (without path) of the generated script.
        """
        print("Extract partial_overlap V4V5 reads:")
        overlap_cmd = C.partial_overlap_cmd
        if self.utils.is_local():
            overlap_cmd = C.partial_overlap_cmd_local

        regions = self.get_config_values('dna_region')
        extra_arg = "--marker-gene-stringent" if "ITS1" in list(regions) else ""

        # TODO: this part is the same in perfect overlap - move into a method
        full_command = overlap_cmd + " --enforce-Q30-check " + extra_arg
        ini_files = self.dirs.get_all_files_by_ext(self.out_file_path, "ini")
        script_name = self.create_job_array_script(full_command, self.dirs.analysis_dir, ini_files)
        script_path = os.path.join(self.dirs.analysis_dir, script_name)
        self.call_sh_script(script_path, self.dirs.analysis_dir)
        return script_name
                    
    def partial_overlap_reads(self):
        """
        Run the partial overlap command, with "--enforce-Q30-check", on the
        ini file of every sample.

        "--marker-gene-stringent" is added for samples whose dna region is
        "ITS1". On an exception the traceback and the program name are
        printed and the exception is re-raised.
        """
        print("Extract partial_overlap V4V5 reads:")
        for idx_key in self.runobj.samples.keys():
            ini_file_name = os.path.join(self.out_file_path, idx_key + ".ini")
            program_name = C.partial_overlap_cmd
            if self.utils.is_local():
                program_name = C.partial_overlap_cmd_local
            try:
                command = [program_name, "--enforce-Q30-check"]
                if (self.runobj.samples[idx_key].dna_region == "ITS1"):
                    command.append("--marker-gene-stringent")
                command.append(ini_file_name)
                call(command)
            except Exception:
                print(traceback.format_exc())
                print("Problems with program_name = %s" % (program_name))
                raise
                
#             print "HERE: program_name = " % (program_name)   
#             call([program_name, "--fast-merge", "--compute-qual-dicts", ini_file_name, idx_key])
            
    def get_config_values(self, key):
        """
        Collect the distinct values stored under key in the entries of
        self.runobj.configPath.

        Entries that do not have the key are ignored. Returns a set.
        """
        values = set()
        for section in self.runobj.configPath.values():
            if key in section.keys():
                values.add(section[key])
        return values
        
    def make_users_email(self):
        """Return "<login name>@mbl.edu" for the user running the process."""
        return getpass.getuser() + "@mbl.edu"
                
    def create_job_array_script(self, command_line, dir_to_run, files_list):
        """
        Write an SGE job array script that runs command_line once per file.

        command_line -- command (with its arguments) to run; the base name of
                        its first word becomes part of the script name
        dir_to_run   -- directory in which the script file is created
        files_list   -- files to process; task i of the array handles
                        element i - 1 of the list

        The script is called "<command>_<run>_<lane name>.sh"; a missing lane
        name is treated as an empty string. The file names are inserted
        unquoted, space separated, into a bash array.

        Returns the script file name (without the directory).
        """
        files_string    = " ".join(files_list)
        files_list_size = len(files_list)
        command_file_name = os.path.basename(command_line.split(" ")[0])
        # lane_name may be unset (None); concatenating None would raise.
        lane_name = self.runobj.lane_name or ""
        script_file_name  = command_file_name + "_" + self.runobj.run + "_" + lane_name + ".sh"
        script_file_name_full = os.path.join(dir_to_run, script_file_name)
        log_file_name     = script_file_name + ".sge_script.sh.log"
        email_mbl         = self.make_users_email()
        text = (
            '#!/bin/bash\n'
            '#$ -cwd\n'
            '#$ -S /bin/bash\n'
            '#$ -N %s\n'
            '# Giving the name of the output log file\n'
            '#$ -o %s\n'
            '# Combining output/error messages into one file\n'
            '#$ -j y\n'
            '# Send mail to these users\n'
            '#$ -M %s\n'
            '# Send mail at job end; -m eas sends on end, abort, suspend.\n'
            '#$ -m eas\n'
            '#$ -t 1-%s\n'
            '# Now the script will iterate %s times.\n'
            '\n'
            '  file_list=(%s)\n'
            '  \n'
            '  i=$(expr $SGE_TASK_ID - 1)\n'
            '#   echo "i = $i"\n'
            '  source ~/.bashrc\n'
            '  module load bioware\n'
            '    \n'
            '  echo "%s ${file_list[$i]}"  \n'
            '  %s ${file_list[$i]}  \n'
        ) % (script_file_name, log_file_name, email_mbl, files_list_size, files_list_size, files_string, command_line, command_line)
        self.open_write_close(script_file_name_full, text)
        return script_file_name

    def filter_mismatches_cluster(self, max_mismatch = 3):
        """
        Submit a job array script that runs the mismatch filter command on
        the "_MERGED" files of the reads overlap directory.

        max_mismatch -- only printed; it is not forwarded to the command.

        Returns the name (without path) of the generated script.
        """
        print("Filter mismatches if more then %s" % (max_mismatch))
        filter_cmd = C.filter_mismatch_cmd
        if self.utils.is_local():
            filter_cmd = C.filter_mismatch_cmd_local
        work_dir = self.dirs.reads_overlap_dir

        merged_files = self.dirs.get_all_files_by_ext(work_dir, "_MERGED")
        script_name = self.create_job_array_script(filter_cmd, work_dir, merged_files)
        self.call_sh_script(os.path.join(work_dir, script_name), work_dir)
        return script_name

    def filter_mismatches(self, max_mismatch = 3):
        """
        Run the mismatch filter command on every merged reads file.

        Every file from self.get_all_files() whose name without extension
        ends with '_MERGED' is handed, one file per call, to
        C.filter_mismatch_cmd (C.filter_mismatch_cmd_local when
        self.utils.is_local() is true).

        max_mismatch -- only printed; it is not forwarded to the command.
        """
        print("Filter mismatches if more then %s" % (max_mismatch))

        # The command does not depend on the file, so it is chosen once
        # instead of once per matching file.
        program_name = C.filter_mismatch_cmd
        if self.utils.is_local():
            program_name = C.filter_mismatch_cmd_local

        files = self.get_all_files()
        for full_name in files:
            if files[full_name][0].endswith('_MERGED'):
                call([program_name, full_name])

    def uniq_fa_cluster(self):
        """Create and launch a job-array script that uniques the fasta files.

        The files with the C.filtered_suffix extension in the reads-overlap
        directory are used; when there are none, the ".fa" files are used
        instead. Returns the name (without directory) of the generated script.
        """
        print("Uniqueing fasta files")
        default_cmd = C.fastaunique_cmd
        unique_cmd = C.fastaunique_cmd_local if self.utils.is_local() else default_cmd
        overlap_dir = self.dirs.reads_overlap_dir

        fasta_files = self.dirs.get_all_files_by_ext(overlap_dir, C.filtered_suffix)
        if len(fasta_files) == 0:
            # Nothing carries the filtered suffix: fall back to plain ".fa" files.
            fasta_files = self.dirs.get_all_files_by_ext(overlap_dir, ".fa")
        script_file_name = self.create_job_array_script(unique_cmd, overlap_dir, fasta_files)
        self.call_sh_script(os.path.join(overlap_dir, script_file_name), overlap_dir)
        return script_file_name
                                       
    def uniq_fa(self):
        """Run the fasta-unique program directly on every matching file.

        A file from self.get_all_files() is processed when the second element
        of its entry equals ".fa" or the first element ends with
        C.filtered_suffix (presumably extension and name without extension --
        confirm against get_all_files()).
        """
        print("Uniqueing fasta files")
        files = self.get_all_files()
        for full_name in files.keys():
            is_fasta = files[full_name][1] == ".fa"
            if not (is_fasta or files[full_name][0].endswith(C.filtered_suffix)):
                continue
            program_name = C.fastaunique_cmd
            if self.utils.is_local():
                program_name = C.fastaunique_cmd_local
            call([program_name, full_name])

    def get_primers(self):
        """Map every sample key to its (proximal_primer, distal_primer) pair.

        Primers are looked up in C.primers_dict by the sample's primer suite.
        For an unknown suite an error is printed and the sample gets a pair of
        empty strings.

        Returns a dict {idx_key: (proximal_primer, distal_primer)}.
        """
        primers = {}
        for idx_key in self.runobj.samples.keys():
            primer_suite = self.runobj.samples[idx_key].primer_suite
            # Start from empty primers for every sample, so that an unknown
            # suite cannot inherit the primers of the previous sample.
            proximal_primer = ""
            distal_primer   = ""
            if primer_suite in C.primers_dict:
                proximal_primer = C.primers_dict[primer_suite]["proximal_primer"]
                distal_primer   = C.primers_dict[primer_suite]["distal_primer"]
            else:
                print("ERROR! Something wrong with the primer suite name: %s. NB: For v6mod it suppose to be 'Archaeal V6mod Suite'" % (primer_suite))
            primers[idx_key] = (proximal_primer, distal_primer)

        return primers
        
    def create_inis(self):
        """Write one "<idx_key>.ini" file per sample into self.out_file_path.

        Every file gets a [general] and a [files] section. Unless
        self.runobj.do_perfect is set, a [prefixes] section is appended, built
        from the run key (second "_"-separated field of idx_key, with each "N"
        replaced by ".") and the sample's primers.
        """
        primers = self.get_primers()
        for idx_key in self.runobj.samples.keys():
            run_key = idx_key.split('_')[1].replace("N", ".")
            email = self.runobj.samples[idx_key].email
            ini_values = (
                idx_key,
                email,
                self.out_file_path,
                self.results_path,
                idx_key + "_R1.fastq",
                idx_key + "_R2.fastq",
            )
            text = (
                "[general]\n"
                "project_name = %s\n"
                "researcher_email = %s\n"
                "input_directory = %s\n"
                "output_directory = %s\n"
                "\n"
                "[files]\n"
                "pair_1 = %s\n"
                "pair_2 = %s\n"
            ) % ini_values

            # That's for v4v5 miseq illumina
            if not self.runobj.do_perfect:
                proximal_primer, distal_primer = primers[idx_key][0], primers[idx_key][1]
                text += (
                    "\n# following section is optional\n"
                    "[prefixes]\n"
                    "pair_1_prefix = ^"
                ) + run_key + proximal_primer + "\npair_2_prefix = ^" + distal_primer

            self.open_write_close(os.path.join(self.out_file_path, idx_key + ".ini"), text)

    def open_write_close(self, script_file_name, text):
        """Write text to script_file_name, replacing any existing content.

        The file is opened in "w" mode inside a with-block, so the handle is
        closed even when the write raises.
        """
        with open(script_file_name, "w") as out_file:
            out_file.write(text)
 
    def get_fastq_file_names(self, f_input_file_path):
        """Collect the read-1 and read-2 file paths below f_input_file_path.

        The directory tree is walked with os.walk. A file goes to the first
        list when '_R1_' occurs in its name after the first character, to the
        second list when '_R2_' does; any other file name is reported on
        stderr.

        Returns a tuple (in_files_r1, in_files_r2) of full paths.
        """
        r1_paths = []
        r2_paths = []
        # TODO: exclude dir with new created files from the loop
        for dirname, _subdirs, filenames in os.walk(f_input_file_path):
            for filename in filenames:
                if filename.find('_R1_') > 0:
                    target = r1_paths
                elif filename.find('_R2_') > 0:
                    target = r2_paths
                else:
                    sys.stderr.write("No read number in the file name: %s\n" % filename)
                    continue
                target.append(os.path.join(dirname, filename))
        return (r1_paths, r2_paths)
        
    def read1(self, files_r1, compressed):
        """Distribute read-1 entries into the per-dataset "_R1" output files.

        For each entry a key "<index>_NNNN<sequence[4:9]>_<lane_number>" is
        built. When the key is a known sample and the pair number is 1, the
        entry is stored in the "<key>_R1" output (if such an output is open)
        and the id expected on the matching read-2 header is remembered in
        self.id_dataset_idx. Every other entry goes to the "unknown" output.
        """
        for r1_path in files_r1:
            print("FFF1: file %s" % r1_path)
            index_sequence = self.get_index(r1_path)
            source = fq.FastQSource(r1_path, compressed)
            while source.next():
                e = source.entry
                ini_run_key = index_sequence + "_" + "NNNN" + e.sequence[4:9] + "_" + e.lane_number
                known_r1 = ini_run_key in self.runobj.samples.keys() and int(e.pair_no) == 1
                if not known_r1:
                    self.out_files["unknown"].store_entry(e)
                    continue
                r1_name = ini_run_key + "_R1"
                if r1_name in self.out_files.keys():
                    self.out_files[r1_name].store_entry(e)
                    # TODO: make a method:
                    header_fields = e.header_line.split()
                    tail = ":".join(header_fields[1].split(":")[1:])
                    self.id_dataset_idx[header_fields[0] + " 2:" + tail] = ini_run_key
                    
    def read2(self, files_r2, compressed):
        """Distribute read-2 entries using the ids recorded by read1().

        An entry with pair number 2 whose header line is a key of
        self.id_dataset_idx is stored in the "<dataset key>_R2" output; every
        other entry goes to the "unknown" output.
        """
        for r2_path in files_r2:
            print("FFF2: file %s" % r2_path)
            source = fq.FastQSource(r2_path, compressed)
            while source.next():
                e = source.entry
                if (int(e.pair_no) == 2) and (e.header_line in self.id_dataset_idx):
                    target_name = self.id_dataset_idx[e.header_line] + "_R2"
                else:
                    target_name = "unknown"
                self.out_files[target_name].store_entry(e)

    def get_index(self, file_r1):
        """Return the index part of a fastq file name.

        The base name is split on "_". The first field is the index, unless it
        starts with "IDX"; in that case the second field is returned.
        """
        name_fields = os.path.basename(file_r1).split("_")
        if name_fields[0].startswith("IDX"):
            return name_fields[1]
        return name_fields[0]

Пример #18

Показать файл

Файл: illumina_files.py Проект: avoorhis/py_mbl_sequencing_pipeline

class IlluminaFiles:
    """
    0) from run create all dataset_lines names files in output dir
    1) split fastq files from casava into files with dataset_names
    2) create ini files
    3) process them through Meren's script
    4) result - files dataset_lane-PERFECT_reads.fa.unique with frequencies - to process with env454upload()
    """
    def __init__(self, runobj):
        """Bind the run object and resolve the directories used for splitting.

        For a VAMPS user upload the directory prefix is "<user>_<run>" and the
        site is taken from the run object; otherwise the prefix is the run and
        the site is ''. A falsy lane name is replaced by ''.
        """
        self.utils          = PipelneUtils()
        self.runobj         = runobj
        self.out_files      = {}
        self.id_dataset_idx = {}
        self.in_file_path   = runobj.input_dir

        if runobj.vamps_user_upload:
            site       = runobj.site
            dir_prefix = runobj.user + '_' + runobj.run
        else:
            site       = ''
            dir_prefix = runobj.run
        lane_name = runobj.lane_name if runobj.lane_name else ''

        self.dirs = Dirs(runobj.vamps_user_upload, dir_prefix, runobj.platform, lane_name = lane_name, site = site)
        self.out_file_path = self.dirs.check_dir(self.dirs.analysis_dir)
        self.results_path  = self.dirs.check_dir(self.dirs.reads_overlap_dir)
        self.platform      = runobj.platform

    def split_files(self, compressed = False):
        """
        TODO: *) fastq_file_names method to collect all file_names with full path or directories_names (see get_all_files()?)
        """
#        logger.debug("compressed = %s" %       compressed)
#        compressed = ast.literal_eval(compressed)
        (in_files_r1, in_files_r2) = self.get_fastq_file_names(self.in_file_path)
#         correct_file_names = self.get_correct_file_names(in_files_r1)
        if (len(in_files_r1) > 0):
            self.read1(in_files_r1, compressed)
            self.read2(in_files_r2, compressed)
            self.create_inis()
        else:
#             logger.debug("ERROR: There is something wrong with fastq file names. Please check if they start with correct indexes.")
#             logger.debug("ERROR: There is something wrong with fastq file names. Please check if they start with correct indexes.")
            self.utils.print_both("ERROR: There is something wrong with fastq file names. Please check if they start with correct indexes.")
        self.close_dataset_files()



#        self.perfect_reads()
#        self.uniq_fa()

    def open_dataset_files(self):
        """Open one fastq output per sample and read direction, plus "unknown".

        Outputs are stored in self.out_files under "<sample key>_R1" and
        "<sample key>_R2" (all R1 outputs are created before the R2 ones) and
        under "unknown"; the files live in self.out_file_path.
        """
        for suffix in ("_R1", "_R2"):
            for sample_key in self.runobj.samples.keys():
                f_name = sample_key + suffix
                self.out_files[f_name] = fq.FastQOutput(os.path.join(self.out_file_path, f_name + ".fastq"))
        unknown_path = os.path.join(self.out_file_path, "unknown" + ".fastq")
        self.out_files["unknown"] = fq.FastQOutput(unknown_path)

    def close_dataset_files(self):
        [o_file[1].close() for o_file in self.out_files.items()]
        return

#     def perfect_reads(self):
#         self.utils.print_both("Extract perfect V6 reads:")
#         for idx_key in self.runobj.samples.keys():
#             file_name = os.path.join(self.out_file_path, idx_key + ".ini")
#             program_name = C.perfect_overlap_cmd
#             if self.utils.is_local():
#                 program_name = C.perfect_overlap_cmd_local
#             try:
#                 if self.runobj.samples[idx_key].primer_suite.lower().startswith('archaeal'):
#                     call([program_name, file_name, "--archaea"])
#                 else:
#                     call([program_name, file_name])
#             except:
#                 self.utils.print_both("Problems with program_name = %s, file_name = %s" % (program_name, file_name))
#                 raise
#
#     TODO: use from util
    def call_sh_script(self, script_name_w_path, where_to_run):
        try:
            call(['chmod', '0774', script_name_w_path])
            if self.utils.is_local():
                self.utils.print_both("call(['qsub', script_name_w_path], cwd=(where_to_run))")
                call(['bash', script_name_w_path], cwd=(where_to_run))
            else:
                call(['qsub', script_name_w_path], cwd=(where_to_run))
#             pass
        except:
            self.utils.print_both("Problems with script_name = %s or qsub" % (script_name_w_path))
            raise

#     todo: combine and DRY with partial - it's the same command, different arguments
    def merge_perfect(self):
        """Create and launch the job-array script that merges perfect overlaps.

        The perfect-overlap command (C.perfect_overlap_cmd, or its "_local"
        variant on a local machine) is run with
        "--marker-gene-stringent --retain-only-overlap --max-num-mismatches 0"
        on every "ini" file found in self.out_file_path.

        Returns the name (without directory) of the generated script.
        """
        self.utils.print_both("merge perfect V6 reads:")
        default_cmd = C.perfect_overlap_cmd
        merge_cmd = C.perfect_overlap_cmd_local if self.utils.is_local() else default_cmd
        command_line = merge_cmd + " --marker-gene-stringent --retain-only-overlap --max-num-mismatches 0"
        ini_files = self.dirs.get_all_files_by_ext(self.out_file_path, "ini")
        script_file_name = self.create_job_array_script(command_line, self.dirs.analysis_dir, ini_files)
        self.call_sh_script(os.path.join(self.dirs.analysis_dir, script_file_name), self.dirs.analysis_dir)
        return script_file_name

    def trim_primers_perfect(self):
        """Create and launch the job-array script that trims V6 primers.

        C.trim_primers_cmd is run on every "_MERGED" file of the reads-overlap
        directory. " --archaea" is appended when any configured primer suite
        starts with "archaeal" (case-insensitive).

        Returns the name (without directory) of the generated script.
        """
        self.utils.print_both("trim primers from perfect V6 reads:")

        overlap_dir_files = self.dirs.get_all_files_by_ext(self.dirs.reads_overlap_dir, "_MERGED")
        suite_names = self.get_config_values('primer_suite')
        archaeal_flags = [name.lower().startswith("archaeal") for name in suite_names]
        trim_cmd = C.trim_primers_cmd + (" --archaea" if any(archaeal_flags) else "")
        script_file_name = self.create_job_array_script(trim_cmd, self.dirs.reads_overlap_dir, overlap_dir_files)
        self.call_sh_script(os.path.join(self.dirs.reads_overlap_dir, script_file_name), self.dirs.reads_overlap_dir)
        return script_file_name

    """
    def perfect_reads_cluster(self):
        '''
        iu-merge-pairs anna.ini --marker-gene-stringent --retain-only-overlap --max-num-mismatches 0
            Each flag is critical. marker-gene-stringent looks complete overlaps, retain-only-overlap gets rid of adapters, max-num-mismatches retains only perfect overlaps.
            This generates the test_MERGED file with all complete overlaps without any mismatches. But it has all the primers.
            Then we process this file with the new and shiny iu-analyze-v6-complete-overlaps script:
        iu-trim-V6-primers test_MERGED

        '''
        self.utils.print_both("Extract perfect V6 reads:")
        script_file_name      = self.merge_perfect()
        trim_script_file_name = self.trim_primers_perfect()

        return (script_file_name, trim_script_file_name)
    """

    def partial_overlap_reads_cluster(self):
        """Create and launch the job-array script for partial-overlap merging.

        The partial-overlap command is run with "--enforce-Q30-check" on every
        "ini" file of self.out_file_path; "--marker-gene-stringent" is added
        when one of the configured dna regions is listed in
        C.marker_gene_stringent_regions. Afterwards the analysis directory is
        passed to self.dirs.chmod_all().

        Returns the name (without directory) of the generated script.
        """
        self.utils.print_both("Extract partial_overlap reads (from partial_overlap_reads_cluster):")
        default_cmd = C.partial_overlap_cmd
        overlap_cmd = C.partial_overlap_cmd_local if self.utils.is_local() else default_cmd
        configured_regions = self.get_config_values('dna_region')
        is_stringent = set(C.marker_gene_stringent_regions) & set(list(configured_regions))
        stringent_arg = "--marker-gene-stringent" if is_stringent else ""
        # TODO: this part is the same in perfect overlap - move into a method
        command_line = overlap_cmd + " --enforce-Q30-check " + stringent_arg
        ini_files = self.dirs.get_all_files_by_ext(self.out_file_path, "ini")
        script_file_name = self.create_job_array_script(command_line, self.dirs.analysis_dir, ini_files)
        self.call_sh_script(os.path.join(self.dirs.analysis_dir, script_file_name), self.dirs.analysis_dir)
        self.utils.print_both("self.dirs.chmod_all(%s)" % (self.dirs.analysis_dir))
        self.dirs.chmod_all(self.dirs.analysis_dir)

        return script_file_name

    def partial_overlap_reads(self):
        """Run the partial-overlap program directly on every sample's ini file.

        The program is always called with "--enforce-Q30-check";
        "--marker-gene-stringent" is added when the sample's dna region is
        listed in C.marker_gene_stringent_regions.

        Any exception is reported (traceback and program name) through
        self.utils.print_both and re-raised.
        """
        self.utils.print_both("Extract partial_overlap reads (from partial_overlap_reads):")
        for idx_key in self.runobj.samples.keys():
            ini_file_name = os.path.join(self.out_file_path, idx_key + ".ini")
            program_name = C.partial_overlap_cmd
            if self.utils.is_local():
                program_name = C.partial_overlap_cmd_local
            try:
                dna_region = self.runobj.samples[idx_key].dna_region
                # A region given as one string must be compared as a whole;
                # set(list("ITS1")) would split it into single characters and
                # never match a region name.
                if isinstance(dna_region, (list, tuple, set, frozenset)):
                    sample_regions = set(dna_region)
                else:
                    sample_regions = set([dna_region])
                if set(C.marker_gene_stringent_regions) & sample_regions:
                    call([program_name, "--enforce-Q30-check", "--marker-gene-stringent", ini_file_name])
                else:
                    call([program_name, "--enforce-Q30-check", ini_file_name])
            except Exception:
                self.utils.print_both(traceback.format_exc())
                self.utils.print_both("Problems with program_name = %s" % (program_name))
                raise

#             logger.debug("HERE: program_name = " % (program_name))
#             call([program_name, "--fast-merge", "--compute-qual-dicts", ini_file_name, idx_key])

    def get_config_values(self, key):
        config_path_data = [v for k, v in self.runobj.configPath.items()]
        return set([a[key] for a in config_path_data if key in a.keys()])

#     TODO: use from util
    def make_users_email(self):
        """Return "<login name>@mbl.edu" for the user reported by getpass."""
        return getpass.getuser() + "@mbl.edu"

#     TODO: use from util
#     Removed by Hilary's request:
#     # Send mail at job end (e); -m as sends abort, suspend.
#     #$ -m as
    def create_job_array_script(self, command_line, dir_to_run, files_list):
        files_string         = " ".join(files_list)
        files_list_size         = len(files_list)
        command_file_name = os.path.basename(command_line.split(" ")[0])
        script_file_name  = command_file_name + "_" + self.runobj.run + "_" + self.runobj.lane_name + ".sh"
        script_file_name_full = os.path.join(dir_to_run, script_file_name)
        log_file_name     = script_file_name + ".sge_script.sh.log"
        # email_mbl         = self.make_users_email()
        email_mbl = C.email_mbl
        text = (
                '''#!/bin/bash
#$ -cwd
#$ -S /bin/bash
#$ -N %s
# Giving the name of the output log file
#$ -o %s
# Combining output/error messages into one file
#$ -j y
# Send mail to these users
#$ -M %s
#$ -t 1-%s
# Now the script will iterate %s times.

  file_list=(%s)

  i=$(expr $SGE_TASK_ID - 1)
  # echo "i = $i"
  # . /etc/profile.d/modules.sh
  # . /xraid/bioware/bioware-loader.sh

  shopt -s expand_aliases # It will expand aliases that are loaded via modules
  . /xraid/bioware/Modules/etc/profile.modules
  module load bioware

  echo "%s ${file_list[$i]}"
  %s ${file_list[$i]}
''' % (script_file_name, log_file_name, email_mbl, files_list_size, files_list_size, files_string, command_line, command_line)
# ''' % (script_file_name, log_file_name, email_mbl, files_list_size, files_list_size, files_string, command_line)
                )
        self.open_write_close(script_file_name_full, text)
        return script_file_name

    def filter_mismatches_cluster(self, max_mismatch = 3):
        """Create and launch a job-array script that filters the "_MERGED" files.

        The mismatch-filter command (or its "_local" variant on a local
        machine) is applied to the "_MERGED" files of the reads-overlap
        directory. The script is started through self.utils.call_sh_script and
        the directory is then passed to self.dirs.chmod_all().

        max_mismatch is only echoed in the progress message; it is not passed
        to the command here.

        Returns the name (without directory) of the generated script.
        """
        self.utils.print_both("Filter mismatches if more then %s" % (max_mismatch))
        default_cmd = C.filter_mismatch_cmd
        filter_cmd = C.filter_mismatch_cmd_local if self.utils.is_local() else default_cmd
        overlap_dir = self.dirs.reads_overlap_dir

        merged_files = self.dirs.get_all_files_by_ext(overlap_dir, "_MERGED")
        script_file_name = self.create_job_array_script(filter_cmd, overlap_dir, merged_files)
        self.utils.call_sh_script(os.path.join(overlap_dir, script_file_name), overlap_dir)
        self.utils.print_both("self.dirs.chmod_all(%s)" % (overlap_dir))
        self.dirs.chmod_all(overlap_dir)

        return script_file_name

    def filter_mismatches(self, max_mismatch = 3):
        """Run the mismatch filter directly on every "_MERGED" file.

        Files come from self.dirs.get_all_files(); a file is processed when
        the first element of its entry ends with '_MERGED' (presumably the
        file name without extension -- confirm against get_all_files()).

        max_mismatch is only echoed in the progress message; it is not passed
        to the filter program here.
        """
        self.utils.print_both("Filter mismatches if more then %s" % (max_mismatch))
        files = self.dirs.get_all_files()
        for full_name in files.keys():
            if not files[full_name][0].endswith('_MERGED'):
                continue
            program_name = C.filter_mismatch_cmd
            if self.utils.is_local():
                program_name = C.filter_mismatch_cmd_local
            call([program_name, full_name])

    def uniq_fa_cluster(self):
        """Create and launch a job-array script that uniques the fasta files.

        Files with the C.filtered_suffix extension in the reads-overlap
        directory are preferred; when there are none, ".fa" files and then
        "MERGED_V6_PRIMERS_REMOVED" files are tried. After the script is
        started the directory is passed to self.dirs.chmod_all().

        Returns the name (without directory) of the generated script.
        """
        self.utils.print_both("Uniqueing fasta files")
        default_cmd = C.fastaunique_cmd
        unique_cmd = C.fastaunique_cmd_local if self.utils.is_local() else default_cmd
        overlap_dir = self.dirs.reads_overlap_dir

        fasta_files = self.dirs.get_all_files_by_ext(overlap_dir, C.filtered_suffix)
        # Walk through the fallback extensions until some files are found.
        for fallback_ext in (".fa", "MERGED_V6_PRIMERS_REMOVED"):
            if len(fasta_files) != 0:
                break
            fasta_files = self.dirs.get_all_files_by_ext(overlap_dir, fallback_ext)

        script_file_name = self.create_job_array_script(unique_cmd, overlap_dir, fasta_files)
        self.call_sh_script(os.path.join(overlap_dir, script_file_name), overlap_dir)
        self.utils.print_both("self.dirs.chmod_all(%s)" % (overlap_dir))
        self.dirs.chmod_all(overlap_dir)
        return script_file_name

    def uniq_fa(self):
        """Run the fasta-unique program directly on every matching file.

        A file from self.dirs.get_all_files() is processed when the second
        element of its entry equals ".fa" or the first element ends with
        C.filtered_suffix (presumably extension and name without extension --
        confirm against get_all_files()).
        """
        self.utils.print_both("Uniqueing fasta files")
        files = self.dirs.get_all_files()
        for full_name in files.keys():
            is_fasta = files[full_name][1] == ".fa"
            if not (is_fasta or files[full_name][0].endswith(C.filtered_suffix)):
                continue
            program_name = C.fastaunique_cmd
            if self.utils.is_local():
                program_name = C.fastaunique_cmd_local
            call([program_name, full_name])

    def get_primers(self):
        """Map every sample key to its (proximal_primer, distal_primer) pair.

        Primers are looked up in C.primers_dict by the lower-cased primer
        suite of the sample. For an unknown suite an error is reported through
        self.utils.print_both and the sample gets a pair of empty strings.

        Returns a dict {idx_key: (proximal_primer, distal_primer)}.
        """
        primers = {}
        for idx_key in self.runobj.samples.keys():
            primer_suite = self.runobj.samples[idx_key].primer_suite.lower()

            # Start from empty primers for every sample, so that an unknown
            # suite cannot inherit the primers of the previous sample.
            proximal_primer = ""
            distal_primer   = ""
            if primer_suite in C.primers_dict:
                proximal_primer = C.primers_dict[primer_suite]["proximal_primer"]
                distal_primer   = C.primers_dict[primer_suite]["distal_primer"]
            else:
                self.utils.print_both("ERROR! Something wrong with the primer suite name: %s. NB: For v6mod it suppose to be 'Archaeal V6mod Suite'\n" % (primer_suite))
            primers[idx_key] = (proximal_primer, distal_primer)

        return primers

    def create_inis(self):
        for idx_key in self.runobj.samples.keys():
            run_key = idx_key.split('_')[1].replace("N", ".");
            "todo: check if works w/o NNNN when there is a proper csv"
            email = self.runobj.samples[idx_key].email
            text = """[general]
project_name = %s
researcher_email = %s
input_directory = %s
output_directory = %s

[files]
pair_1 = %s
pair_2 = %s
""" % (idx_key, email, self.out_file_path, self.results_path, idx_key + "_R1.fastq", idx_key + "_R2.fastq")

            "That's for parital overlap (v4v5 and hapto miseq illumina)"
            if not self.runobj.do_perfect:
                primers = self.get_primers()
                # logger.debug("run_key = %s, idx_key = %s, primers[idx_key][0], primers[idx_key][1] = %s" (run_key, idx_key, primers[idx_key][0], primers[idx_key][1]))
                text += """
# following section is optional
[prefixes]
pair_1_prefix = ^""" + run_key + primers[idx_key][0] + "\npair_2_prefix = ^" + primers[idx_key][1]

            ini_file_name = os.path.join(self.out_file_path,  idx_key + ".ini")
            self.open_write_close(ini_file_name, text)

#     TODO: use from utils
    def open_write_close(self, script_file_name, text):
        """Write text to script_file_name, replacing any existing content.

        The file is opened in "w" mode inside a with-block, so the handle is
        closed even when the write raises.
        """
        with open(script_file_name, "w") as out_file:
            out_file.write(text)

    def get_fastq_file_names(self, f_input_file_path):
        in_files_r1 = []
        in_files_r2 = []
        "TODO: exclude dir with new created files from the loop"
        for dirname, dirnames, filenames in os.walk(f_input_file_path):
            correct_file_names = self.get_correct_file_names(filenames)

            for filename in sorted(list(correct_file_names)):
                if filename.find('_R1_') > 0:
                    in_files_r1.append(os.path.join(dirname, filename))
                elif filename.find('_R2_') > 0:
                    in_files_r2.append(os.path.join(dirname, filename))
                else:
                    sys.stderr.write("No read number in the file name: %s\n" % filename)
        self.utils.print_both("FFF0: in_files_r1 %s\n, in_files_r2 %s" % (in_files_r1, in_files_r2))
        return (in_files_r1, in_files_r2)

    def get_correct_file_names(self, filenames):
        correct_file_names = [];
        for file1 in filenames:
            index_sequence = self.get_index(file1)
#             self.runobj.run_keys
#
            good_run_key_lane_names = [x for x in self.runobj.run_keys if x.startswith(index_sequence)]
            if len(good_run_key_lane_names) > 0:
                correct_file_names.append(file1)
        return set(correct_file_names)


    def get_run_key(self, e_sequence, has_ns = "True"):
        """Extract the run key from a read sequence.

        With a truthy has_ns the key is "NNNN" followed by sequence positions
        4..8; otherwise it is the first five characters of the sequence.

        NOTE(review): the default is the string "True", which is truthy like
        any non-empty string (so would be "False") -- pass a real boolean.
        """
        return ("NNNN" + e_sequence[4:9]) if has_ns else e_sequence[0:5]

    def get_ini_run_key(self, index_sequence, e):
        has_ns = any("NNNN" in s for s in self.runobj.run_keys)

        lane_number = e.lane_number
        if self.platform == "nextseq":
            lane_number = "1"
        return index_sequence + "_" + self.get_run_key(e.sequence, has_ns) + "_" + lane_number

    def read1(self, files_r1, compressed):
        """Distribute read-1 entries into the per-dataset "_R1" output files.

        Entries are read with trim_to = C.trimming_length. For every entry
        with pair number 1 the dataset key is built with get_ini_run_key().
        When an output "<key>_R1" is open, the entry is stored there and the
        id expected on the matching read-2 header is remembered in
        self.id_dataset_idx; otherwise the entry goes to the "unknown" output.
        Entries whose pair number is not 1 are not stored at all.
        """
        for r1_path in files_r1:
            self.utils.print_both("====\nFFF1: file %s" % r1_path)
            source = fq.FastQSource(r1_path, compressed)
            index_sequence = self.get_index(r1_path)
            while source.next(trim_to = C.trimming_length):
                e = source.entry
                # todo: a fork with or without NNNN, add an argument
                ini_run_key = self.get_ini_run_key(index_sequence, e)
                if int(e.pair_no) != 1:
                    continue
                r1_name = ini_run_key + "_R1"
                if r1_name not in self.out_files.keys():
                    self.out_files["unknown"].store_entry(e)
                    continue
                self.out_files[r1_name].store_entry(e)
                # TODO: make a method:
                header_fields = e.header_line.split()
                tail = ":".join(header_fields[1].split(":")[1:])
                self.id_dataset_idx[header_fields[0] + " 2:" + tail] = ini_run_key

    # def truncate_seq(self, seq):
    #     return seq[:C.trimming_length]

    def remove_end_ns_strip(self, e_sequence):
        """Return e_sequence without its trailing run of 'N' characters."""
        # rstrip() leaves a sequence that does not end in 'N' unchanged.
        return e_sequence.rstrip('N')

    def read2(self, files_r2, compressed):
        """Distribute read-2 entries using the ids recorded by read1().

        Entries are read with trim_to = C.trimming_length. An entry with pair
        number 2 whose header line is a key of self.id_dataset_idx is stored
        in the "<dataset key>_R2" output; every other entry goes to the
        "unknown" output.
        """
        for r2_path in files_r2:
            self.utils.print_both("FFF2: file %s" % r2_path)
            source = fq.FastQSource(r2_path, compressed)
            while source.next(trim_to = C.trimming_length):
                e = source.entry
                if (int(e.pair_no) == 2) and (e.header_line in self.id_dataset_idx):
                    target_name = self.id_dataset_idx[e.header_line] + "_R2"
                else:
                    target_name = "unknown"
                self.out_files[target_name].store_entry(e)

    def get_index(self, file_r1):
        """Return the index part of a fastq file name.

        The base name is split on "_". The first field is the index, unless it
        starts with "IDX"; in that case the second field is returned.
        """
        name_fields = os.path.basename(file_r1).split("_")
        if name_fields[0].startswith("IDX"):
            return name_fields[1]
        return name_fields[0]

Пример #19

Показать файл

class Chimera:
    """ Define here """
    def __init__(self, runobj = None):
        """Collect run settings, resolve the working directories and usearch options.

        runobj must provide run_keys, run, vamps_user_upload and platform;
        use_cluster and lane_name are optional. For a VAMPS user upload it must
        also provide site, user and trim_status_file_name.
        """
        self.utils      = PipelneUtils()
        self.runobj     = runobj
        self.run_keys   = self.runobj.run_keys
        self.rundate    = self.runobj.run
        # use_cluster is optional on the run object; the default is to use the cluster
        self.use_cluster = getattr(self.runobj, "use_cluster", True)
        self.chg_suffix         = ".chg"
        self.chimeras_suffix    = ".chimeras"
        self.ref_suffix         = ".db"
        self.denovo_suffix      = ".txt"
        self.nonchimeric_suffix = "." + C.nonchimeric_suffix #".nonchimeric.fa"
        self.chimeric_suffix    = ".chimeric.fa"
        self.base_suffix        = "unique" + self.chimeras_suffix

        # a missing lane name and an empty one both mean "no lane"
        lane_name = getattr(self.runobj, "lane_name", '') or ''

        is_user_upload = self.runobj.vamps_user_upload
        if is_user_upload:
            site       = self.runobj.site
            dir_prefix = self.runobj.user + '_' + self.runobj.run
        else:
            site       = ''
            dir_prefix = self.runobj.run
        self.dirs = Dirs(is_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site)

        if is_user_upload:
            # read the trim status inside "with" so the file handle is closed right away
            with open(self.runobj.trim_status_file_name, "r") as status_file:
                trim_status = json.loads(status_file.read())
            self.idx_keys = convert_unicode_dictionary_to_str(trim_status)["new_lane_keys"]
            self.indir = self.dirs.check_dir(self.dirs.trimming_dir)
        else:
            self.indir = self.dirs.check_dir(self.dirs.reads_overlap_dir)
        self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)

        self.usearch_cmd = C.usearch6_cmd
        self.refdb       = C.chimera_checking_refdb_6
        self.its_refdb   = C.chimera_checking_its_refdb_6
        self.input_file_names  = self.make_chimera_input_illumina_file_names()
#         pprint(self.run_keys)
#         self.output_file_names = self.make_chimera_output_illumina_file_names(self.input_file_names)
        
    def make_chimera_input_illumina_file_names(self):
        """Map each run key to its uniqued input file name.

        The file name is "<run key>_<C.filtered_suffix>.unique". Only keys
        whose file exists in self.indir are included.
        """
        candidates = ((idx_key, idx_key + "_" + C.filtered_suffix + ".unique")
                      for idx_key in self.run_keys)
        return dict((idx_key, file_name)
                    for idx_key, file_name in candidates
                    if os.path.exists(os.path.join(self.indir, file_name)))
            
#     def make_chimera_output_illumina_file_names(self, input_file_names):
#         output_file_names = {} 
#         for idx_key, input_file_name in input_file_names.iteritems():
#             output_file_names[idx_key] = input_file_name
#         return output_file_names

    def get_current_dirname(self, in_or_out = ""):
        if in_or_out == "":
            cur_dirname    = self.indir 
        else:
            cur_dirname    = self.outdir
        return cur_dirname

    def is_chimera_check_file(self, filename):
        return filename.endswith((self.chimeras_suffix + self.denovo_suffix, self.chimeras_suffix + self.ref_suffix, self.chimeric_suffix, self.nonchimeric_suffix))

    def get_current_filenames(self, cur_dirname):
        cur_file_names = []
        if cur_dirname == self.indir:
            cur_file_names = self.input_file_names.values()
        elif cur_dirname == self.outdir:
            cur_file_names = self.get_chimera_file_names(self.outdir)
        return cur_file_names

    def get_chimera_file_names(self, cur_dirname):
        cur_file_names = []        
        for dirname, dirnames, filenames in os.walk(cur_dirname):
            cur_file_names = [filename for filename in filenames if (self.is_chimera_check_file(filename))]
        return cur_file_names

#     def illumina_frequency_size(self, in_or_out = "", find = "frequency:", replace = ";size="):
#         cur_dirname    = self.get_current_dirname(in_or_out)
#         cur_file_names = self.get_current_filenames(cur_dirname)
# #         print "cur_file_names: "
# #         pprint(cur_file_names)
#         change_from_suffix = ""
#         change_to_suffix   = self.chg_suffix
# #         print "find = %s, replace = %s" % (find, replace)
#         regex              = re.compile(r"%s" % find)
# 
#         for cur_file_name in cur_file_names:
#             file_name = os.path.join(cur_dirname, cur_file_name)
#             with open(file_name + change_from_suffix, "r") as sources:
#                 lines = sources.readlines()
#             with open(file_name + change_to_suffix, "w") as target:
#                 for line in lines:
#                         target.write(regex.sub(replace, line))

    def read_file(self, source_name):
        with open(source_name, "r") as sources:
            return sources.readlines()

    def illumina_sed(self, lines, target_name, regex, replace, uppercase):
        with open(target_name, "w") as target:
            for line in lines:
                if line.startswith(">"):
                    line1 = regex.sub(replace, line)
                else:
                    if (uppercase):
                        line1 = line.upper()
                    else:
                        line1 = line
                target.write(line1)  


    def call_illumina_sed(self, from_to):
        """
            from_to = from_frequency_to_size or from_size_to_frequency
        """
        sed_from_to = namedtuple('sed_from_to', 'find, replace, cur_dirname, cur_file_names, change_from_suffix, change_to_suffix, uppercase')

        from_frequency_to_size = sed_from_to(
        find               = "frequency:",
        replace            = ";size=",
        cur_dirname        = self.indir,
        cur_file_names     = self.get_current_filenames(self.indir),
        change_from_suffix = "",
        change_to_suffix   = self.chg_suffix,
        uppercase          = True
        )

        from_size_to_frequency = sed_from_to(
        find               = ";size=",
        replace            = "frequency:",
        cur_dirname        = self.outdir,
        cur_file_names     = self.get_chimera_file_names(self.outdir),
        change_from_suffix = "",
        change_to_suffix   = "",
        uppercase          = False        
        )
        
        if (from_to == "from_frequency_to_size"):
            tuple_name = from_frequency_to_size
        elif (from_to == "from_size_to_frequency"):
            tuple_name = from_size_to_frequency
        
        regex          = re.compile(r"%s" % tuple_name.find)                                
#         print "find = %s, replace = %s" % (find, replace)
        if (not tuple_name.cur_file_names) and (tuple_name == from_frequency_to_size):
            self.utils.print_both('ERROR: Did not find uniqued files (".unique") in %s, please check if the previous step has finished. Exiting.\n' % self.indir)
            sys.exit()
        for cur_file_name in tuple_name.cur_file_names:
            file_name = os.path.join(tuple_name.cur_dirname, cur_file_name)           
            source_name = file_name + tuple_name.change_from_suffix
            target_name = file_name + tuple_name.change_to_suffix 
            lines = self.read_file(source_name)
            self.illumina_sed(lines, target_name, regex, tuple_name.replace, tuple_name.uppercase)

    def illumina_freq_to_size_in_chg(self):
#         TODO: not used?
        find1    = "frequency:"
        replace1 = ";size="
        regex1   = re.compile(r"%s" % find1)        
        
#         print "cur_file_names: "
#         pprint(cur_file_names)
        cur_dirname        = self.get_current_dirname()
        cur_file_names     = self.get_current_filenames(cur_dirname)
        change_from_suffix = ""
        change_to_suffix   = self.chg_suffix
#         print "find = %s, replace = %s" % (find, replace)
 
        for cur_file_name in cur_file_names:
            file_name = os.path.join(cur_dirname, cur_file_name)
            with open(file_name + change_from_suffix, "r") as sources:
                lines = sources.readlines()
            with open(file_name + change_to_suffix, "w") as target:
#                 line2 = [regex1.sub(replace1, line) if line.startswith(">") else line.upper() for line in lines]
                for line in lines:
                    if line.startswith(">"):
                        line1 = regex1.sub(replace1, line)
                    else:
                        line1 = line.upper()
#                     print line1
                    target.write(line1)  


    def illumina_size_to_freq_in_chimer(self):
        find1           = ";size="
        replace1        = "frequency:"
        regex1          = re.compile(r"%s" % find1)        
 
        cur_file_names = self.get_chimera_file_names(self.outdir)
                    
        for file_chim in cur_file_names:
            file_chim_path = os.path.join(self.outdir, file_chim)
            with open(file_chim_path, "r") as sources:
                lines = sources.readlines()
            with open(file_chim_path, "w") as target:
                for line in lines:
                    line1 = regex1.sub(replace1, line)
                    target.write(line1)                    
              
    def illumina_rm_size_files(self):
        for idx_key in self.input_file_names:
            file_name = os.path.join(self.indir, self.input_file_names[idx_key] + self.chg_suffix)
            if os.path.exists(file_name):
                os.remove(file_name)
    
#     def illumina_chimera_size_files(self):
#     
#     import os
# [os.rename(f, f.replace('_', '-')) for f in os.listdir('.') if not f.startswith('.')]

        
          
    def check_if_cluster_is_done(self, time_before):
        """Return True when qstat lists no matching 'usearch' job.

        The qstat output is filtered by time_before and by "usearch"; the
        number of remaining lines is the job count. Any failure is reported
        through print_both and then re-raised.
        """
        qstat_cmd = "qstat | grep \"%s\" | grep usearch | wc -l" % time_before
        self.utils.print_both("check_qstat_cmd_line = %s" % qstat_cmd)

        try:
            proc        = subprocess.Popen(qstat_cmd, stdout=subprocess.PIPE, shell=True)
            stdout_data = proc.communicate()[0]
            num_proc    = int(stdout_data)
            self.utils.print_both("qstat is running %s 'usearch' processes" % num_proc)
            cluster_done = (num_proc == 0)
        except:
            self.utils.print_both("Chimera checking can be done only on a cluster.")
            raise

        return cluster_done
        
          
    def create_chimera_cmd(self, input_file_name, output_file_name, ref_or_novo, ref_db = ""):
        """
        Build the clusterized usearch UCHIME command line as one string.

        http://www.drive5.com/usearch/manual/uchime_denovo.html

        ref_or_novo = "denovo": uses -uchime_denovo, and the output name gets
            the chimeras + denovo suffixes.
        ref_or_novo = "ref": uses -uchime_ref with "-db <ref_db>" and
            "-strand plus", and the output name gets the chimeras + ref suffixes.
        Any other value is only reported through print_both; the command is
        still assembled, without a uchime mode option.

        The chimeric sequences go to <output name> + self.chimeric_suffix.
        """
        mode_option   = ""
        db_option     = ""
        strand_option = ""

        if ref_or_novo == "denovo":
            mode_option       = " -uchime_denovo "
            output_file_name += self.chimeras_suffix + self.denovo_suffix
        elif ref_or_novo == "ref":
            mode_option       = " -uchime_ref "
            output_file_name += self.chimeras_suffix + self.ref_suffix
            db_option         = " -db " + ref_db
            strand_option     = " -strand plus"
        else:
            self.utils.print_both("Incorrect method, should be \"denovo\" or \"ref\"")
        self.utils.print_both("output_file_name = %s" % output_file_name)

        # if separate nonchimeric files are needed for denovo and db, a
        # " -nonchimeras " + (output_file_name + self.nonchimeric_suffix)
        # part could be added here
        cmd_parts = [
            C.clusterize_cmd,
            " ",
            self.usearch_cmd,
            mode_option + input_file_name,
            db_option,
            " -uchimeout " + output_file_name,
            " -chimeras " + (output_file_name + self.chimeric_suffix),
            strand_option,
            " -notrunclabels",
        ]
        return "".join(cmd_parts)
        
    def get_ref_db(self, dna_region):
        """Return self.its_refdb for the ITS region (any letter case), else self.refdb."""
        if dna_region.upper() == 'ITS':
            logger.debug("got an ITS dna region so using refdb: " + self.its_refdb)
            return self.its_refdb

        logger.debug("using standard refdb: " + self.refdb)
        return self.refdb
    
    def chimera_checking(self, ref_or_novo):
        """Start one usearch chimera-check command per input file.

        Only samples whose dna_region is in C.regions_to_chimera_check are
        processed. Commands are started with subprocess.Popen and are not
        waited for.

        ref_or_novo -- "denovo" or "ref", passed on to create_chimera_cmd

        Returns ('NOREGION', 'No regions found that need checking', '') when
        no sample has a region to check, otherwise the string
        "The usearch commands were created".
        An OSError from starting a command is re-raised unless
        self.utils.is_local() is true.
        """
        chimera_region_found = False
        # NOTE(review): stdout/stderr of the started commands are piped but never
        # read here, and the Popen handles are dropped on return - confirm the
        # commands do not print enough to fill the pipes.
        processes = {}

        for idx_key in self.input_file_names:
            input_file_name  = os.path.join(self.indir,  self.input_file_names[idx_key] + self.chg_suffix)
            output_file_name = os.path.join(self.outdir, self.input_file_names[idx_key])
            dna_region       = self.runobj.samples[idx_key].dna_region
            if dna_region not in C.regions_to_chimera_check:
                logger.debug('region not checked: ' +  dna_region)
                continue
            chimera_region_found = True

            ref_db     = self.get_ref_db(dna_region)
            uchime_cmd = self.create_chimera_cmd(input_file_name, output_file_name, ref_or_novo, ref_db)
            self.utils.print_both("\n==================\n%s command: %s" % (ref_or_novo, uchime_cmd))

            try:
                logger.info("chimera checking command: " + str(uchime_cmd))
                processes[idx_key] = subprocess.Popen(uchime_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            except OSError as e:
                # "except ... as" and sys.stderr.write work on Python 2.6+ and on Python 3
                failure_msg = "Execution of %s failed: %s" % (uchime_cmd, e)
                self.utils.print_both("Problems with this command: %s" % (uchime_cmd))
                is_local = self.utils.is_local()
                sys.stderr.write(failure_msg + "\n")
                if not is_local:
                    # on the cluster the failure is also logged and is fatal
                    self.utils.print_both(failure_msg)
                    raise

        if not chimera_region_found:
            return ('NOREGION', 'No regions found that need checking', '')
        else:
            return ("The usearch commands were created")

Пример #20

Показать файл

class dbUpload:
    """db upload methods"""
    Name = "dbUpload"
    """
    TODO: add tests and test case
    TODO: change hardcoded values to args: 
        self.sequence_table_name = "sequence_ill", 
        self.sequence_field_name = "sequence_comp"  
    TODO: generalize all bulk uploads and all inserts? to not copy and paste
    TODO: add refssu_id
    TODO: change csv validaton for new fields
    Order:
        # put_run_info
        # insert_seq()
        # insert_pdr_info()
        # gast
        # insert_taxonomy()
        # insert_sequence_uniq_info_ill()

    """
    def __init__(self, runobj = None):
        """Set up directories, the database connection and upload settings for one run.

        runobj must provide run, vamps_user_upload, platform, lane_name,
        database_host and database_name, plus site and user for a VAMPS user
        upload. The file self.dirs.unique_file_counts is deleted here.
        """
        self.utils       = PipelneUtils()
        self.runobj      = runobj
        self.rundate     = self.runobj.run
        self.use_cluster = 1
        self.unique_fasta_files = []

        if self.runobj.vamps_user_upload:
            site, dir_prefix = self.runobj.site, self.runobj.user+'_'+self.runobj.run
        else:
            site, dir_prefix = '', self.runobj.run
        # an empty lane name is normalised to ''
        lane_name = self.runobj.lane_name or ''

        self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site)

        self.analysis_dir = self.dirs.check_dir(self.dirs.analysis_dir)
        self.fasta_dir    = self.dirs.check_dir(self.dirs.reads_overlap_dir)
        self.gast_dir     = self.dirs.check_dir(self.dirs.gast_dir)

        # NOTE(review): these two values are read but not used below; the
        # connection is created with a hardcoded host and database name.
        host_name     = runobj.database_host
        database_name = runobj.database_name

        self.filenames   = []
        self.my_conn     = MyConnection(host = 'newbpcdb2.jbpc-np.mbl.edu', db="env454")
        self.sequence_table_name = "sequence_ill"
        self.sequence_field_name = "sequence_comp"
        self.my_csv              = None

        # start from a clean counts file
        self.unique_file_counts = self.dirs.unique_file_counts
        self.dirs.delete_file(self.unique_file_counts)
        self.seq_id_dict = {}
        self.tax_id_dict = {}
        self.run_id      = None

        # file name endings that mark fasta files to upload
        self.nonchimeric_suffix = "." + C.nonchimeric_suffix #".nonchimeric.fa"
        self.fa_unique_suffix   = ".fa." + C.unique_suffix #.fa.unique
        self.v6_unique_suffix   = "MERGED_V6_PRIMERS_REMOVED." + C.unique_suffix
        self.suff_list = [self.nonchimeric_suffix, self.fa_unique_suffix, self.v6_unique_suffix]

        self.suffix_used        = ""
        
#        self.refdb_dir = '/xraid2-2/vampsweb/blastdbs/'
   
   
    def get_fasta_file_names(self):
        """Find the fasta files to upload in self.fasta_dir.

        Keeps the names that end with one of the suffixes in self.suff_list,
        stores them in self.unique_fasta_files and also returns them (the
        caller in the pipeline processor relies on the return value).
        """
        all_files       = self.dirs.get_all_files(self.fasta_dir)
        wanted_suffixes = tuple(self.suff_list)
        self.unique_fasta_files = [name for name in all_files.keys() if name.endswith(wanted_suffixes)]
        return self.unique_fasta_files
        

    def get_run_info_ill_id(self, filename_base):
        """Look up run_info_ill_id for the given file prefix and self.rundate.

        Returns the id as an int, or None when the query gives no rows.
        """
        # NOTE(review): values are interpolated into the SQL text unescaped.
        query = """SELECT run_info_ill_id FROM run_info_ill 
                    JOIN run using(run_id)
                    WHERE file_prefix = '%s'
                    and run = '%s'
        """ % (filename_base, self.rundate)
        rows = self.my_conn.execute_fetch_select(query)
        if not rows:
            return None
        return int(rows[0][0])
        
    def make_seq_upper(self, filename):
        """Return all sequences of the fasta file in upper case.

        Upper case is used for VAMPS compatibility. The reader is closed even
        when reading the sequences fails.
        """
        read_fasta = fastalib.ReadFasta(filename)
        try:
            return [seq.upper() for seq in read_fasta.sequences]
        finally:
            read_fasta.close()
        
    def insert_seq(self, sequences):
        """Insert all sequences, compressed, with one INSERT IGNORE statement.

        Prints the number of sequences through print_both and returns the
        result of self.my_conn.execute_no_fetch.
        """
        # every sequence becomes its own "(COMPRESS('...'))" value group
        quoted_values = ["'%s'" % seq for seq in sequences]
        values_clause = ')), (COMPRESS('.join(quoted_values)
        my_sql = "INSERT IGNORE INTO %s (%s) VALUES (COMPRESS(%s))" % (self.sequence_table_name, self.sequence_field_name, values_clause)
        seq_id = self.my_conn.execute_no_fetch(my_sql)
        self.utils.print_both("sequences in file: %s\n" % (len(sequences)))
        return seq_id
    #     try:
    #         query_tmpl = "INSERT IGNORE INTO %s (%s) VALUES (COMPRESS(%s))"
    #         val_tmpl   = "'%s'"
    #         my_sql     = query_tmpl % (self.sequence_table_name, self.sequence_field_name, ')), (COMPRESS('.join([val_tmpl % key for key in sequences]))
    #         seq_id     = self.my_conn.execute_no_fetch(my_sql)
    # #         print "sequences in file: %s" % (len(sequences))
    #         self.utils.print_both("sequences in file: %s\n" % (len(sequences)))
    #         return seq_id
    #     except self.my_conn.conn.cursor._mysql_exceptions.Error as err:
    #         if err.errno == 1582:
    #             self.utils.print_both(("ERROR: _mysql_exceptions.OperationalError: (1582, \"Incorrect parameter count in the call to native function 'COMPRESS'\"), there is an empty fasta in %s") % self.fasta_dir)
    #         else:
    #             raise
    #     except:
    #         if len(sequences) == 0:
    #             self.utils.print_both(("ERROR: There are no sequences, please check if there are correct fasta files in the directory %s") % self.fasta_dir)
    #         raise
        
    def get_seq_id_dict(self, sequences):
        """Fetch the ids of the given sequences and add them to self.seq_id_dict.

        self.seq_id_dict maps the uncompressed sequence to its integer id.
        Any failure is re-raised; when sequences is empty an explanatory
        error is printed first.
        """
        id_name    = self.sequence_table_name + "_id"
        query_tmpl = """SELECT %s, uncompress(%s) FROM %s WHERE %s in (COMPRESS(%s))"""
        try:
            compressed_list = '), COMPRESS('.join(["'%s'" % seq for seq in sequences])
            my_sql = query_tmpl % (id_name, self.sequence_field_name, self.sequence_table_name, self.sequence_field_name, compressed_list)
            rows   = self.my_conn.execute_fetch_select(my_sql)
            # rows are (id, sequence) pairs; the lookup goes from sequence to id
            self.seq_id_dict.update(dict((seq, int(seq_id)) for seq_id, seq in rows))
        except:
            if len(sequences) == 0:
                self.utils.print_both(("ERROR: There are no sequences, please check if there are correct fasta files in the directory %s") % self.fasta_dir)
            raise


    def get_id(self, table_name, value):
        """Return the integer "<table_name>_id" of the row whose "<table_name>" column equals value.

        Returns None when the query gives no rows.
        """
        id_name = table_name + '_id'
        query   = """SELECT %s FROM %s WHERE %s = '%s'""" % (id_name, table_name, table_name, value)
        rows    = self.my_conn.execute_fetch_select(query)
        if not rows:
            return None
        return int(rows[0][0])
            
    def get_sequence_id(self, seq):
        """Return the integer sequence_ill_id of seq, or None when the query gives no rows."""
        query = """SELECT sequence_ill_id FROM sequence_ill WHERE COMPRESS('%s') = sequence_comp""" % (seq)
        rows  = self.my_conn.execute_fetch_select(query)
        if not rows:
            return None
        return int(rows[0][0])
    
    def insert_pdr_info(self, fasta, run_info_ill_id):
        """Insert the per-run count of one sequence into sequence_pdr_info_ill.

        fasta           -- entry with .seq and .id; the count is the text after
                           the last ':' of the last '|'-separated part of .id
        run_info_ill_id -- id of the run info row; a false value is only
                           reported, the insert is still attempted

        Returns the result of self.my_conn.execute_no_fetch. On failure the
        query is printed and the exception re-raised.
        """
        if not run_info_ill_id:
            self.utils.print_both("ERROR: There is no run info yet, please check if it's uploaded to env454")

        # ------- insert sequence info per run/project/dataset --------
        sequence_ill_id = self.seq_id_dict[fasta.seq.upper()]
        count_field     = fasta.id.split('|')[-1]
        seq_count       = int(count_field.split(':')[-1])
        my_sql = """INSERT IGNORE INTO sequence_pdr_info_ill (run_info_ill_id, sequence_ill_id, seq_count) 
                             VALUES (%s, %s, %s)""" % (run_info_ill_id, sequence_ill_id, seq_count)

        try:
            return self.my_conn.execute_no_fetch(my_sql)
        except:
            self.utils.print_both("Offensive query: %s" % my_sql)
            raise
        
    def make_gast_files_dict(self):
        """Return what self.dirs.get_all_files gives for self.gast_dir and "gast"."""
        gast_files = self.dirs.get_all_files(self.gast_dir, "gast")
        return gast_files
        
        
    def gast_filename(self, filename):
        """Return the path of the gast file that belongs to filename.

        A path matches when any of the values stored for it ends with
        filename. Returns None when nothing matches.
        """
        # todo: if filename in make_gast_files_dict, use it full path
        for gast_path, name_parts in self.make_gast_files_dict().items():
            for part in name_parts:
                if part.endswith(filename):
                    return gast_path
        return None
    
    def get_gast_result(self, filename):
        """Read the gast file for filename into a dictionary.

        Every line is split on tabs: the first field is the key, the
        remaining fields (the last one still carrying its line ending) are
        the value.

        Returns None when no gast file name is known for filename or when the
        gast file does not exist. Any other IOError is re-raised; previously
        every IOError was swallowed, although only "No such file or
        directory" was meant to be suppressed.
        """
        import errno

        gast_file_name = self.gast_filename(filename)
        self.utils.print_both("current gast_file_name = %s." % gast_file_name)

        if gast_file_name is None:
            # checked up front instead of relying on a TypeError from open(None)
            self.utils.print_both("Check if there is a gast file under %s for %s." % (self.gast_dir, filename))
            return None

        try:
            gast_dict = {}
            with open(gast_file_name) as fd:
                for line in fd:
                    fields = line.split("\t")
                    gast_dict[fields[0]] = fields[1:]
            return gast_dict
        except IOError as e:
            logger.debug("errno = %s" % e.errno)
            if e.errno != errno.ENOENT:
                raise
            # suppress "No such file or directory" error
            return None

Пример #21

Показать файл

    def put_seq_statistics_in_file(self, filename, seq_in_file):
        """Pass filename and seq_in_file to PipelneUtils.write_seq_frequencies_in_file.

        The target is the file named by self.unique_file_counts.
        """
        PipelneUtils().write_seq_frequencies_in_file(self.unique_file_counts, filename, seq_in_file)

Пример #22

Показать файл

Файл: chimera.py Проект: msGenDev/py_mbl_sequencing_pipeline

class Chimera:
    """Prepare, launch and post-process usearch (UCHIME) chimera checks.

    The class collects the "<key>_<filtered suffix>.unique" input files of a
    run, rewrites fasta headers between the "frequency:" and ";size=" forms,
    builds the usearch command lines and starts them.
    """

    def __init__(self, runobj = None):
        """Collect suffixes, directories and usearch settings from ``runobj``.

        runobj -- run object; this constructor reads run_keys, run,
                  vamps_user_upload, platform and (when present) lane_name,
                  plus site, user and trim_status_file_name for vamps user
                  uploads.
        """
        self.utils      = PipelneUtils()
        self.runobj     = runobj
        self.run_keys   = self.runobj.run_keys
        self.rundate    = self.runobj.run

        self.chg_suffix         = ".chg"
        self.chimeras_suffix    = ".chimeras"
        self.ref_suffix         = ".db"
        self.denovo_suffix      = ".txt"
        self.nonchimeric_suffix = "." + C.nonchimeric_suffix #".nonchimeric.fa"
        self.chimeric_suffix    = ".chimeric.fa"
        self.base_suffix        = "unique" + self.chimeras_suffix

        # A missing attribute and an empty value both mean "no lane name".
        lane_name = getattr(self.runobj, "lane_name", None) or ''

        if self.runobj.vamps_user_upload:
            site       = self.runobj.site
            dir_prefix = self.runobj.user + '_' + self.runobj.run
            self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site)
            # Read through a context manager so the file handle is not leaked.
            with open(self.runobj.trim_status_file_name, "r") as trim_status_file:
                trim_status = json.loads(trim_status_file.read())
            self.idx_keys = convert_unicode_dictionary_to_str(trim_status)["new_lane_keys"]
            self.indir  = self.dirs.check_dir(self.dirs.trimming_dir)
            self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)
        else:
            site = ''
            dir_prefix = self.runobj.run
            self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site)
            self.indir  = self.dirs.check_dir(self.dirs.reads_overlap_dir)
            self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)

        self.usearch_cmd = C.usearch6_cmd
        self.refdb       = C.chimera_checking_refdb_6
        self.its_refdb   = C.chimera_checking_its_refdb_6
        self.input_file_names  = self.make_chimera_input_illumina_file_names()

    def make_chimera_input_illumina_file_names(self):
        """Map each run key to its "<key>_<filtered suffix>.unique" file name.

        Only files that exist in ``self.indir`` are included.
        """
        input_file_names = {}
        for idx_key in self.run_keys:
            file_name = idx_key + "_" + C.filtered_suffix + ".unique"
            if os.path.exists(os.path.join(self.indir, file_name)):
                input_file_names[idx_key] = file_name
        return input_file_names

    def get_current_dirname(self, in_or_out = ""):
        """Return the input directory for "" and the output directory otherwise."""
        if in_or_out == "":
            return self.indir
        return self.outdir

    def is_chimera_check_file(self, filename):
        """Tell whether ``filename`` ends with one of the chimera result suffixes."""
        result_suffixes = (
            self.chimeras_suffix + self.denovo_suffix,
            self.chimeras_suffix + self.ref_suffix,
            self.chimeric_suffix,
            self.nonchimeric_suffix,
        )
        return filename.endswith(result_suffixes)

    def get_current_filenames(self, cur_dirname):
        """Return the file names handled in ``cur_dirname``.

        Input file names for ``self.indir``, chimera result file names for
        ``self.outdir`` and an empty list for any other directory.
        """
        if cur_dirname == self.indir:
            return list(self.input_file_names.values())
        if cur_dirname == self.outdir:
            return self.get_chimera_file_names(self.outdir)
        return []

    def get_chimera_file_names(self, cur_dirname):
        """Return the chimera result file names located directly in ``cur_dirname``.

        Callers join the returned names with ``cur_dirname``, so only the top
        directory is inspected. os.walk yields the top directory first; the
        loop used to keep the listing of the *last* directory walked, which
        returned names from a sub-directory whenever one existed.
        A directory that does not exist yields an empty list.
        """
        for _dirname, _dirnames, filenames in os.walk(cur_dirname):
            return [filename for filename in filenames if self.is_chimera_check_file(filename)]
        return []

    def read_file(self, source_name):
        """Return all lines of ``source_name``."""
        with open(source_name, "r") as sources:
            return sources.readlines()

    def illumina_sed(self, lines, target_name, regex, replace, uppercase):
        """Write ``lines`` to ``target_name`` with rewritten header lines.

        Lines starting with ">" get ``regex`` substituted by ``replace``; all
        other lines are upper-cased when ``uppercase`` is true and copied
        unchanged otherwise.
        """
        with open(target_name, "w") as target:
            for line in lines:
                if line.startswith(">"):
                    target.write(regex.sub(replace, line))
                elif uppercase:
                    target.write(line.upper())
                else:
                    target.write(line)

    def call_illumina_sed(self, from_to):
        """
            from_to = from_frequency_to_size or from_size_to_frequency

            from_frequency_to_size: reads the input files in self.indir and
                writes "<file>.chg" copies with "frequency:" replaced by
                ";size=" in header lines and upper-cased sequence lines.
            from_size_to_frequency: rewrites the chimera result files in
                self.outdir in place, replacing ";size=" by "frequency:" in
                header lines.

            Raises ValueError for any other value (this used to surface as an
            unbound-local error).
        """
        # Settings are built only for the requested direction, so the other
        # directory is not listed needlessly.
        if from_to == "from_frequency_to_size":
            find             = "frequency:"
            replace          = ";size="
            cur_dirname      = self.indir
            cur_file_names   = self.get_current_filenames(self.indir)
            change_to_suffix = self.chg_suffix
            uppercase        = True
        elif from_to == "from_size_to_frequency":
            find             = ";size="
            replace          = "frequency:"
            cur_dirname      = self.outdir
            cur_file_names   = self.get_chimera_file_names(self.outdir)
            change_to_suffix = ""
            uppercase        = False
        else:
            raise ValueError(
                "from_to should be \"from_frequency_to_size\" or \"from_size_to_frequency\", got %r" % (from_to,))

        regex = re.compile(r"%s" % find)

        for cur_file_name in cur_file_names:
            source_name = os.path.join(cur_dirname, cur_file_name)
            target_name = source_name + change_to_suffix
            # The whole source is read before the target is opened, which makes
            # the in-place rewrite (empty suffix) safe.
            lines = self.read_file(source_name)
            self.illumina_sed(lines, target_name, regex, replace, uppercase)

    def illumina_freq_to_size_in_chg(self):
        """Write the ".chg" copies of the input files (";size=" headers).

        TODO: not used? Kept for callers; it performs exactly the
        "from_frequency_to_size" conversion, so it delegates to it.
        """
        self.call_illumina_sed("from_frequency_to_size")

    def illumina_size_to_freq_in_chimer(self):
        """Replace ";size=" by "frequency:" on every line of the result files.

        Unlike illumina_sed, the substitution is applied to all lines, not only
        to header lines. Files are rewritten in place in ``self.outdir``.
        """
        regex1 = re.compile(r"%s" % ";size=")

        for file_chim in self.get_chimera_file_names(self.outdir):
            file_chim_path = os.path.join(self.outdir, file_chim)
            lines = self.read_file(file_chim_path)
            with open(file_chim_path, "w") as target:
                for line in lines:
                    target.write(regex1.sub("frequency:", line))

    def illumina_rm_size_files(self):
        """Delete the ".chg" copies of the input files, when they exist."""
        for idx_key in self.input_file_names:
            file_name = os.path.join(self.indir, self.input_file_names[idx_key] + self.chg_suffix)
            if os.path.exists(file_name):
                os.remove(file_name)

    def check_if_cluster_is_done(self, time_before):
        """Return True when qstat lists no "usearch" job matching ``time_before``.

        Any failure (qstat missing, unparsable output) is reported and re-raised.
        """
        # NOTE(review): time_before is interpolated into a shell pipeline, so it
        # must never come from untrusted input.
        check_qstat_cmd_line = "qstat | grep \"%s\" | grep usearch | wc -l" % time_before

        print("check_qstat_cmd_line = %s" % check_qstat_cmd_line)

        try:
            p = subprocess.Popen(check_qstat_cmd_line, stdout=subprocess.PIPE, shell=True)
            output = p.communicate()[0]
            num_proc = int(output)
            print("qstat is running %s 'usearch' processes" % num_proc)
        except Exception:
            print("Chimera checking can be done only on a cluster.")
            raise

        return num_proc == 0

    def create_chimera_cmd(self, input_file_name, output_file_name, ref_or_novo, ref_db = ""):
        """
        Build the clusterized usearch command line for one input file.

        ref_or_novo -- "denovo" or "ref"; any other value is reported on
                       stdout and the command is built without a uchime mode.
        ref_db      -- reference database, used for "ref" only.

        http://www.drive5.com/usearch/manual/uchime_denovo.html
        from usearch -help
        Chimera detection (UCHIME ref. db. mode):
          usearch -uchime q.fasta [-db db.fasta] [-chimeras ch.fasta]
            [-nonchimeras good.fasta] [-uchimeout results.uch] [-uchimealns results.alns]

        Chimera detection (UCHIME de novo mode):
          usearch -uchime amplicons.fasta [-chimeras ch.fasta] [-nonchimeras good.fasta]
             [-uchimeout results.uch] [-uchimealns results.alns]
          Input is estimated amplicons with integer abundances specified using ";size=N".
        usearch -uchime_denovo amplicons.fasta -uchimeout results.uchime
        """
        uchime_cmd_append = ""
        db_cmd_append     = ""
        dir_cmd_append    = ""

        if (ref_or_novo == "denovo"):
            uchime_cmd_append = " -uchime_denovo "
            output_file_name  = output_file_name + self.chimeras_suffix + self.denovo_suffix
        elif (ref_or_novo == "ref"):
            uchime_cmd_append = " -uchime_ref "
            output_file_name  = output_file_name + self.chimeras_suffix + self.ref_suffix
            db_cmd_append     = " -db " + ref_db
            dir_cmd_append    = " -strand plus"
        else:
            print("Incorrect method, should be \"denovo\" or \"ref\"")
        print("output_file_name = %s" % output_file_name)

        uchime_cmd = C.clusterize_cmd
        uchime_cmd += " "
        uchime_cmd += self.usearch_cmd
        uchime_cmd += uchime_cmd_append + input_file_name
        uchime_cmd += db_cmd_append
        uchime_cmd += " -uchimeout " + output_file_name
        # If separate nonchimeric files are needed for denovo and db, they
        # could be requested here:
        #     uchime_cmd += " -nonchimeras "
        #     uchime_cmd += (output_file_name + self.nonchimeric_suffix)
        uchime_cmd += " -chimeras " + (output_file_name + self.chimeric_suffix)
        uchime_cmd += dir_cmd_append
        uchime_cmd += " -notrunclabels"

        return uchime_cmd

    def get_ref_db(self, dna_region):
        """Return the ITS reference db for an ITS region, the standard one otherwise."""
        if dna_region.upper() == 'ITS':
            logger.debug("got an ITS dna region so using refdb: " + self.its_refdb)
            return self.its_refdb
        logger.debug("using standard refdb: " + self.refdb)
        return self.refdb

    def chimera_checking(self, ref_or_novo):
        """Start one usearch command per input file whose region needs checking.

        Returns ('NOREGION', 'No regions found that need checking', '') when no
        sample region is listed in C.regions_to_chimera_check, otherwise the
        string "The usearch commands were created".

        An OSError while starting a command is reported; it is re-raised unless
        ``self.utils.is_local()`` is true.
        """
        chimera_region_found = False
        # Started processes by idx_key.
        # NOTE(review): their stdout/stderr pipes are never read in this method.
        output = {}

        for idx_key in self.input_file_names:
            input_file_name  = os.path.join(self.indir,  self.input_file_names[idx_key] + self.chg_suffix)
            output_file_name = os.path.join(self.outdir, self.input_file_names[idx_key])
            dna_region       = self.runobj.samples[idx_key].dna_region
            if dna_region not in C.regions_to_chimera_check:
                logger.debug('region not checked: ' +  dna_region)
                continue
            chimera_region_found = True

            ref_db     = self.get_ref_db(dna_region)
            uchime_cmd = self.create_chimera_cmd(input_file_name, output_file_name, ref_or_novo, ref_db)
            print("\n==================\n%s command: %s" % (ref_or_novo, uchime_cmd))

            try:
                logger.info("chimera checking command: " + str(uchime_cmd))
                output[idx_key] = subprocess.Popen(uchime_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            except OSError as e:
                print("Problems with this command: %s" % (uchime_cmd))
                sys.stderr.write("Execution of %s failed: %s\n" % (uchime_cmd, e))
                # On a local machine the failure is only reported.
                if not self.utils.is_local():
                    raise

        if not chimera_region_found:
            return ('NOREGION', 'No regions found that need checking', '')
        return ("The usearch commands were created")

Пример #23

Показать файл

Файл: metadata.py Проект: avoorhis/py_mbl_sequencing_pipeline

class MetadataUtils:
    """
    Class to read metadata files (csv and ini style)
    validate and create a dictionary from them
    Two parts:
    1) From pipeline-ui.py to validate the input args
    2) From runconfig.py to write the final ini file and create the dictionary
    that is used to create the run object
    """
    Name = "MetadataUtils"
    def __init__(self, command_line_args = None, configuration_dictionary = None):
        """Remember the inputs and set up the empty validation state.

        command_line_args        -- stored as ``self.args``
        configuration_dictionary -- stored as ``self.general_config_dict``
        """
        # Inputs handed over by the caller.
        self.args = command_line_args
        self.general_config_dict = configuration_dictionary

        # Reference values from the constants module; the primer suites are
        # stored in their normalized form.
        self.known_header_list  = C.csv_header_list
        self.pipeline_run_items = C.pipeline_run_items
        self.primer_suites      = self.convert_primer_suites(C.primer_suites)
        self.dna_regions        = C.dna_regions

        # Result container with an (initially empty) 'general' section.
        self.data_object = {'general': {}}

        # Default message; validate() replaces it for illumina platforms.
        self.warn_msg = (
            "\n\tThe config File seems to be okay. If the items above look correct\n"
            "        then press 'c' to continue the pipeline\n"
        )
        self.res_headers = []
        self.env = {}
        self.utils  = PipelneUtils()


    def convert_and_save_ini(self, analysis_dir):

        new_ini_file = os.path.join(analysis_dir, self.general_config_dict['run'] + '.ini')
        #new_ini_file = os.path.join(self.general_config_dict['output_dir'],self.general_config_dict['run'],self.general_config_dict['run'] + '.ini')
        # converts csv to ini and saves to output_dir
        if self.general_config_dict['platform'] == 'vamps':
            self.save_ini_file(new_ini_file)
        else:
            self.convert_csv_to_ini(new_ini_file)
        self.general_config_dict['configPath'] = new_ini_file

        # change path and type to new ini
        # regardless of what they were before



    def validate(self, analysis_dir):
        """Run the platform-specific validation and return ``self.data_object``.

        Illumina platforms (those in C.illumina_list) store the validator's
        message in ``self.warn_msg``; '454' and 'vamps' run their validators;
        'ion_torrent' needs nothing. Any other platform ends the program.
        """
        platform = self.general_config_dict['platform']

        if platform in C.illumina_list:
            self.warn_msg = self.validate_illumina_ini(analysis_dir)
        elif platform == '454':
            self.validate_454_ini(analysis_dir)
        elif platform == 'ion_torrent':
            pass
        elif platform == 'vamps':
            self.validate_vamps_ini(analysis_dir)
        else:
            sys.exit("Unknown platform and configFile type for validation")

        return self.data_object

    def get_general_data(self):
        """
        """
        return self.data_object['general']

    def validate_vamps_ini(self, analysis_dir):
        # configPath is the new configPath
        'todo: Andy, what should be here, just directory name or directory + number.ini?'
        self.data_object = self.configDictionaryFromFile_ini(self.general_config_dict['configPath'])
        if 'fasta_file' in self.data_object and not os.path.exists(self.data_object['fasta_file']):
            sys.exit("Fasta file path doesn't exist: "+self.data_object['fasta_file'] )
        elif 'fasta_file' in self.data_object['general'] and not os.path.exists(self.data_object['general']['fasta_file']):
            sys.exit("Fasta file path doesn't exist: "+self.data_object['general']['fasta_file'] )

    def validate_454_ini(self, analysis_dir):
        """Placeholder: validation of 454 ini files is not written yet."""
        # 454 ini file requirements: still to be defined.
        todo_note = "TODO - write validation def for 454/ini"
        print(todo_note)



    def validate_illumina_ini(self, analysis_dir):
        """
        The csv headers are checked earlier
        """

        print("Validating ini type Config File (may have been converted from csv)")
        new_ini_file = os.path.join(analysis_dir, self.general_config_dict['run'] + '.ini')
        print("New ini file location: "+new_ini_file)
        return_code = False
        error_code  = False
        warn_code   = False
        msg = ''
        error=False
        warn=False
        #print('configpath',self.general_config_dict['configPath'])
        # configPath here is the new configPath
        self.data_object = self.configDictionaryFromFile_ini(self.general_config_dict['configPath'])


        (error_code,warn_code) = self.check_for_missing_values(self.data_object)
        if error_code: error=True
        if warn_code: warn=True
        (error_code,warn_code) = self.check_for_datasets(self.data_object)
        if error_code: error=True
        if warn_code: warn=True
        (error_code,warn_code) = self.check_domain_suite_region(self.data_object)
        if error_code: error=True
        if warn_code: warn=True
        (error_code,warn_code) = self.check_project_name(self.data_object)
        if error_code: error=True
        if warn_code: warn=True
        (error_code,warn_code) = self.check_dataset_name(self.data_object)
        if error_code: error=True
        if warn_code: warn=True
        (error_code,warn_code) = self.check_projects_and_datasets(self.data_object)
        if error_code: error=True
        if warn_code: warn=True
        #print(self.data_object['input_dir'])
        #print(self.data_object['input_files'])


        if 'input_dir' not in self.data_object['general'] and 'input_files' not in self.data_object['general']:
            logger.warning("No input directory and no input files")
            warn=True
        elif not os.path.isdir(self.data_object['general']['input_dir']):
            logger.error("That is not a directory: "+self.data_object['general']['input_dir'])
            error=True
        elif self.data_object['general']['input_file_format'] == 'fastq' and self.data_object['general']['platform'] in C.illumina_list:
                file_exists = False
    #            if 'input_dir' in self.data_object['general'] and self.data_object['general']['input_dir']:
                for dirname, dirnames, filenames in os.walk(self.data_object['general']['input_dir']):
    #                if not filenames:
                    for file_name in filenames:
                        if os.path.isfile(os.path.join(dirname, file_name)):
                            file_exists = True
                            break
                if not file_exists:
                    logger.error("There are no files found in the input directory: "+self.data_object['general']['input_dir'])
                    error=True
        elif 'input_dir' in self.data_object['general'] and self.data_object['general']['input_dir'] and ('input_files' not in self.data_object['general'] or not self.data_object['general']['input_files']):
            logger.error("There are no files found in the input directory: "+self.data_object['general']['input_dir'])
            error=True

        if error:
            sys.exit( """\n\t\033[91mTHERE WERE SEVERE PROBLEMS WITH THE CSV and/or CONFIG FILE - EXITING
            PLEASE CORRECT THEM AND START OVER.\033[0m\n
            To view the errors add ' --loglevel info' to the command line.\n""")
        elif warn:
            msg = """\n\t\033[93mTHERE WERE NON-FATAL PROBLEMS WITH THE CSV and/or CONFIG FILE THAT MAY OR MAY NOT CAUSE PROBLEMS.\033[0m\n
                To view the warnings add ' --loglevel warning' to the command line.\n"""
            print("\033[92mCSV File Passed Vaidation! (with warnings)\033[0m")
        else:
            print("\033[92mCSV File Passed Vaidation!\033[0m")
        return msg

    def validate_dictionary(self, config_info):
        """
        Pass-through for configuration that arrives as a dictionary rather
        than a file (such as with vamps user uploads).

        No validation is performed yet; ``config_info`` is returned as given.
        """
        print("TODO - Validating input dictionary")
        # Open questions kept from the original author:
        #  - there must be a general section
        #  - should a dict be created here? That would render much code in
        #    runconfig useless.
        #  - are ini style config files going to be developed further if
        #    no one uses them?
        return config_info




    def populate_data_object_454(self, args):
        data = {}
        data['general'] = {}
        test_datasets = {}
        dataset_counter = {}
        headers = ''
        if self.runobj:
            infile = self.runobj.configPath
        else:
            infile = args.configPath
            data['general']['input_dir'] = args.input_dir
            #data['general']['output_dir'] = os.path.join(args.output_dir,args.run)
            data['general']['output_dir'] = args.output_dir
            data['general']['platform'] = args.platform
            data['general']['run'] = args.run
            #data['general']['run_date'] = args.run
            data['general']["input_file_format"] = args.input_file_format
            data['general']["input_file_suffix"] = args.input_file_suffix

        return data['general']




    def get_input_files(self):

        files_list = []

        if os.path.isdir(self.general_config_dict['input_dir']):

            for infile in glob.glob( os.path.join(self.general_config_dict['input_dir'], '*') ):
                if os.path.isdir(infile) == True:

                    for infile2 in glob.glob( os.path.join( infile,'*') ):
                        if os.path.isdir(infile2) == True:
                            pass
                        else:
                            sub_dir = os.path.basename(infile)

                            files_list.append(os.path.join(sub_dir,os.path.basename(infile2)))
                else:
                    files_list.append(os.path.basename(infile))
#        else:
#            if fasta_file:
#                pass
#            logger.warning("No input directory or directory permissions problem: "+self.general_config_dict['input_dir'])

        return files_list

    def check_for_input_files(self, data_object):
        """Record the input files that carry the configured suffix.

        Looks in ``data_object['general']['input_dir']`` for files ending in
        ``data_object['general']['input_file_suffix']`` and stores their base
        names in ``files_list`` and their number in ``file_count`` (both in
        the 'general' section). A missing directory or an empty result is
        logged, not raised.

        Returns the same ``data_object``.
        """
        general = data_object['general']
        input_dir = general['input_dir']
        files_list = []

        if os.path.isdir(input_dir):
            pattern = os.path.join(input_dir, '*'+general['input_file_suffix'])
            files_list = [os.path.basename(infile) for infile in glob.glob(pattern)]
        else:
            logger.info("No input directory or directory permissions problem: "+input_dir)
            print("No input directory or directory permissions problem: "+input_dir)

        file_count = len(files_list)
        if not file_count:
            logger.info("ERROR: No files were found in '"+input_dir+"' with a suffix of '"+general['input_file_suffix']+"'")

        # All the files in an illumina directory should be the same type.
        general['files_list'] = files_list
        general['file_count'] = file_count
        return data_object


    def check_for_missing_values(self, data):
        """Look for empty keys and values in every section of ``data``.

        'general' section: an empty key or a value equal to '' is a warning.
        Other sections: an empty key is a warning; an empty value is an error,
        except for 'barcode' and 'adaptor', which are only logged.

        Returns the tuple (error, warn).
        """
        error = False
        warn = False
        may_be_empty = ('barcode', 'adaptor')

        general_sections = [name for name in data if name == 'general']
        other_sections = [name for name in data if name != 'general']

        for section in general_sections:
            for key, value in data[section].items():
                if not key:
                    logger.warning("(key: "+section+") key for: '"+value+"' is missing or corrupt - Continuing")
                    warn = True
                if value == '':
                    logger.warning("(key: "+section+") value of: '"+key+"' is missing or corrupt - Continuing")
                    warn = True

        for section in other_sections:
            for key, value in data[section].items():
                if not key:
                    logger.warning("(key: "+section+") key for: '"+value+"' is missing or corrupt - Continuing")
                    warn = True
                if value:
                    continue
                if key in may_be_empty:
                    # these could be empty
                    logger.warning("(key: "+section+") value of: '"+key+"' is missing or corrupt - Continuing")
                else:
                    logger.error("(key: "+section+") value of: '"+key+"' is missing or corrupt - Continuing")
                    error = True

        return (error, warn)

    def check_for_datasets(self,data):
        """Flag every non-'general' section whose 'dataset' value is empty.

        Returns (error, warn); warn is always False here.
        """
        error = False
        for item in data:
            if item == 'general' or data[item]['dataset']:
                continue
            logger.error("Current dataset name is missing or corrupt - Exiting (key: "+item+")")
            error = True
        return (error, False)

    def check_domain_suite_region(self,data):
        error = False
        warn=False

        for item in data:

            if item != 'general':
                primer_suite = self.convert_primer_suites(data[item]['primer_suite'])
                dna_region   = self.convert_primer_suites(data[item]['dna_region'])

                # CHECK MUST MATCH: "Domain","Primer Suite","DNA Region"
                if primer_suite not in self.primer_suites:
                    logger.error("Primer Suite not found: "+primer_suite+" - Exiting (key: "+item+")")
                    error=True
                if dna_region not in self.dna_regions:
                    logger.error("DNA Region not found: "+dna_region+" - Exiting (key: "+item+")")
                    error=True
                if dna_region not in primer_suite:
                    logger.error("DNA Region ("+dna_region+") not found in Primer Suite ("+primer_suite+") - Exiting (key: "+item+")")
                    error=True
        return (error, warn)

    def convert_primer_suites(self, suite):
        """Normalize primer suite names: lower-case, without '_', ' ' and '-'.

        suite -- a string, or a list/tuple of strings.

        Returns the normalized string, or a list of normalized strings.
        Raises TypeError for any other input (this used to fail with an
        unbound local variable, also for Python 2 unicode strings).
        """
        import re

        try:
            string_types = basestring  # Python 2: covers str and unicode
        except NameError:
            string_types = str

        separators = re.compile(r'[_ -]')
        if isinstance(suite, string_types):
            return separators.sub('', suite.lower())
        if isinstance(suite, (list, tuple)):
            return [separators.sub('', item.lower()) for item in suite]
        raise TypeError("suite must be a string or a list of strings, got %s" % type(suite).__name__)

    def check_project_name(self, data):
        """
        # CHECK: project name format: 3 parts; end with Bv6,Ev9,Av6 or something similar
        """
        error   =False
        warn    =False
        for item in data:
            if item != 'general':
                try:
                    (a,b,c) = data[item]['project'].split('_')
                except:
                    logger.error("project not in correct format: ")
                    logger.error(data[item]['project'])
                    logger.error(" - Exiting (key: ")
                    logger.error(data[item])
                    error=True
                (a,b,c) = data[item]['project'].split('_')
                #if c[0] not in [i[0].upper() for i in domains]:
                #    sys.exit("ERROR : Project suffix has incorrect/non-existant domain: "+c)
                # logger.error("c[1:] = ")
                # logger.error(c[1:])
                # logger.error("c.lower() =")
                # logger.error(c.lower())
                # logger.error("self.dna_regions")
                # logger.error(self.dna_regions )

                if (c[1:].lower() not in self.dna_regions) and (c.lower() not in self.dna_regions):
                    logger.error("Project suffix has incorrect DNA region: ")
                    logger.error(c)
                    logger.error(" - Exiting (key: ")
                    logger.error(data[item])
                    error = True
        return (error, warn)

    def check_dataset_name(self,data):
        """
        # CHECK: dataset name can be ONLY alphanumeric and underscore

        The additional "cannot start with a number" rule is currently not
        enforced by this method.

        Returns (error, warn); warn is always False here.
        """
        error = False
        allowed_name = re.compile("^[A-Za-z0-9_]*$")
        for item in data:
            if item == 'general':
                continue
            dataset_name = data[item]['dataset']
            if allowed_name.match(dataset_name):
                continue
            logger.error("Dataset name has illeagal character(s): "+dataset_name+" (must be alphanumeric and underscore only)")
            error = True

        return (error, False)

    def get_my_conn(self):
        """
        Create the database connection and store it in ``self.my_conn``.

        Host and database name come from ``self.general_config_dict``
        ('database_host' and 'database_name'); a missing key raises
        ``KeyError``. When ``self.utils.is_local()`` is true, both are
        replaced by the local test settings.
        """
        settings = self.general_config_dict
        host = settings['database_host']
        db = settings['database_name']
        if self.utils.is_local():
            host, db = 'localhost', "test_env454"

        self.my_conn = MyConnection(host = host, db = db)

    def check_projects_and_datasets(self, data):
        """
        Warn about projects and datasets from ``data`` that are already in the database.

        Opens the connection via ``self.get_my_conn()``, then queries the
        ``project`` table once per distinct project and the ``dataset`` table
        once per dataset belonging to that project. Existing names are logged
        as warnings; for each project the dataset scan stops once a fourth
        existing dataset is found.

        Returns:
            tuple: ``(error, warn)``. ``error`` is never set here; ``warn`` is
            True when at least one name already exists.
        """
        self.get_my_conn()
        error = False
        warn = False
        project_of_dataset = {}
        project_names = {}
        for key in data:
            if key == 'general':
                continue
            project_of_dataset[data[key]['dataset']] = data[key]['project']
            project_names[data[key]['project']] = 1

        for project in project_names:
            project_sql = """SELECT project FROM project WHERE project = '%s'""" % (project)
            if self.my_conn.execute_fetch_select(project_sql):
                logger.warning("project '"+project+"' already exists in the database - is this okay?")
                warn = True
            else:
                logger.debug("project '"+project+"' is new")

            existing_count = 0
            for dataset in project_of_dataset:
                if project_of_dataset[dataset] != project:
                    continue
                dataset_sql = """SELECT dataset FROM dataset WHERE dataset = '%s'""" % (dataset)
                if not self.my_conn.execute_fetch_select(dataset_sql):
                    logger.debug("\tdataset '"+dataset+"' is new")
                    continue
                existing_count += 1
                if existing_count >3:
                    logger.warning("\t\tPossibly more .... - Exiting after just three")
                    break
                logger.warning("\tdataset '"+dataset+"' already exists in the database - is this okay?")
                warn = True
            logger.debug("\tDataset Count: "+str(len(project_of_dataset)))
        return (error,warn)


    def get_confirmation(self, steps, general_data):
        """
        Print the run configuration and the requested steps, then confirm.

        Every item of ``general_data`` is printed; a non-bool value longer
        than 80 characters is shortened to its first and last comma-separated
        piece. If 'validate' is one of the comma-separated ``steps`` the
        process exits here.

        Returns:
            str: always 'c' (continue); the interactive prompt is not used.
        """
        print("\n")
        for name, setting in general_data.items():
            if not isinstance(setting, bool) and len(setting) > 80:
                pieces = setting.split(',')
                print("%-20s = %s .. %s" % (name,pieces[0],pieces[-1]))
                continue
            print("%-20s = %-20s" % (name,setting))
        print("\nStep(s) to be performed: \033[1;36m",steps,'\033[0m')
        print("\n"+self.warn_msg+"\n")
        if 'validate' in steps.split(','):
            sys.exit()
        # Local and remote runs give the same answer; the is_local() call is kept as-is.
        return 'c' if self.utils.is_local() else 'c'

    def convert_csv_to_ini(self, new_ini_file):
        #print(self.args)
        from pipeline.get_ini import readCSV

        print('CSV path', self.general_config_dict['csvPath'])
        my_csv = readCSV(file_path = self.general_config_dict['csvPath'])

        content     = my_csv.read_csv()
        headers     = content[1].keys()
        headers_clean = [x.strip('"').replace(" ", "_").lower() for x in headers]
        projects = {}
        #print
        #print(content[1])
        #print
        # get list of keys
        keys_list = []
        if self.check_headers(headers_clean):
            logger.info("CSV headers okay")
            for k,values in content.items():
                keys_list.append(values['barcode_index']+"_"+values['run_key']+"_"+values['lane'])

        fh = open(new_ini_file,'w')
        # general section
        fh.write("#\n#\tCreated by MBL Pipeline for run: "+self.general_config_dict['run']+" on "+self.general_config_dict['date']+"\n#\n\n")
        fh.write("[general]\n")
        fh.write("run = "+self.general_config_dict['run']+"\n")
        fh.write("configPath = "+new_ini_file+"\n")

        fh.write("configPath_orig = " + self.general_config_dict['configPath']+"\n")
        fh.write("platform = " + self.general_config_dict['platform']+"\n")
        fh.write("output_dir = " + os.path.dirname(new_ini_file)+"\n")
        #fh.write("output_dir = "+os.path.join(self.general_config_dict['baseoutputdir'],self.general_config_dict['run'])+"\n")
        if self.general_config_dict['platform'] in C.illumina_list:
            #fh.write("input_file_suffix = "  + self.general_config_dict['input_file_suffix']+"\n")
            fh.write("input_file_format = " + self.general_config_dict['input_file_format']+"\n")
            fh.write("anchor_file = "        + self.general_config_dict['anchor_file']+"\n")
            fh.write("primer_file = "        + self.general_config_dict['primer_file']+"\n")
            fh.write("compressed = "          + str(self.general_config_dict['compressed'])+"\n")
            fh.write("do_perfect = "          + str(self.general_config_dict['do_perfect'])+"\n")
            fh.write("lane_name = "          + str(self.general_config_dict['lane_name'])+"\n")
            fh.write("database_host = "          + self.general_config_dict['database_host']+"\n")
            fh.write("database_name = "          + self.general_config_dict['database_name']+"\n")

        fh.write("input_dir = "          + self.general_config_dict['input_dir']+"\n")
        fh.write("require_distal = "     + str(self.general_config_dict['require_distal'])+"\n")
        fh.write("use_cluster = "              + str(self.general_config_dict['use_cluster'])+"\n")
        fh.write("date = "              + str(datetime.date.today())+"\n")
        fh.write("site = "              + self.general_config_dict['site']+"\n")
        fh.write("load_vamps_database = " + str(self.general_config_dict['load_vamps_database'])+"\n")
        fh.write("idx_keys = "           +','.join(keys_list)+"\n")
        if 'input_dir' in self.general_config_dict and self.general_config_dict['input_dir'] != '':
            file_list = self.get_input_files()
            fh.write("input_files = "     + ','.join(file_list)+"\n")
        else:
            fh.write("input_files = \n")
        #fh.write(getattr(args,'force_runkey', ""))

        for k, values in content.items():
            fh.write("\n")
            if self.general_config_dict['platform'] in C.illumina_list:
                fh.write("["+values['barcode_index']+"_"+values['run_key']+"_"+values['lane']+"]\n")
            elif self.general_config_dict['platform'] == '454':
                fh.write("["+values['lane']+"_"+values['run_key']+"]\n")

            for v in values:
                if v == "env_sample_source":
                    try:
                        new_val = [str(j[0]) for j in self.env if j[1] == values[v]][0]
                    except:
                        text = """There was an error in env_sample_source. Please check your metadata.
Possible values:
-----------
air
extreme habitat
host associated
human associated
human-amniotic-fluid
human-blood
human-gut
human-oral
human-skin
human-urine
human-vaginal
indoor
microbial mat/biofilm
miscellaneous_natural_or_artificial_environment
plant associated
sediment
soil/sand
unknown
wastewater/sludge
water-freshwater
water-marine
-----------
"""
                        print(text)
                        raise
                    fh.write("env_sample_source_id = "+new_val+"\n")
                else:
                    fh.write(v+" = "+values[v]+"\n")

        fh.close()

        return new_ini_file

    def save_ini_file(self,new_ini_file):
        """
        Write the general configuration to ``new_ini_file`` as a ``[general]`` section.

        Before writing, ``self.general_config_dict`` is updated in place: the
        previous 'configPath' is kept under 'configPath_original' and
        'configPath' is set to ``new_ini_file``. Every item of the dictionary
        is then written as ``key = value``, followed by the input file
        information:

        * non-empty 'fasta_file': its directory and file name are written as
          ``input_dir`` / ``input_files``; the process exits when 'input_dir'
          is configured and differs from that directory;
        * otherwise a non-empty 'input_dir': ``input_files`` is the
          comma-joined result of ``self.get_input_files()``;
        * otherwise ``input_files`` is written empty.

        The file is handled by a ``with`` block so it is closed on every
        path, including the ``sys.exit`` above.
        """
        cfg = self.general_config_dict
        with open(new_ini_file,'w') as out_fh:
            cfg['configPath_original'] = cfg['configPath']
            cfg['configPath'] = new_ini_file

            out_fh.write("#\n#\tCreated by MBL Pipeline for run: "+cfg['run']+" on "+cfg['date']+"\n#\n\n")
            out_fh.write("[general]\n")
            for item in cfg:
                out_fh.write(item+" = "+str(cfg[item]) + "\n")

            if 'fasta_file' in cfg and cfg['fasta_file'] != '':
                (path,fasta) = os.path.split(cfg['fasta_file'])
                if 'input_dir' in cfg and cfg['input_dir'] != path:
                    sys.exit("Your input_dir and fasta_file directory don't agree - Exiting\n\t"+cfg['input_dir']+" != "+cfg['fasta_file'])

                # NOTE(review): 'input_dir' may already have been written by the
                # loop above, which gives a duplicate option in the file - confirm
                # that the reader of this ini file tolerates that.
                out_fh.write("input_dir = "+path+"\n")
                out_fh.write("input_files = "+fasta+"\n")
            elif 'input_dir' in cfg and cfg['input_dir'] != '':
                file_list = self.get_input_files()
                out_fh.write("input_files = "     + ','.join(file_list)+"\n")
            else:
                out_fh.write("input_files = \n")

    def check_headers(self, headers):
        """
        Compare the CSV ``headers`` with the headers required for the configured platform.

        The required list is taken from ``self.known_header_list`` for the
        platform in ``self.general_config_dict['platform']`` (an Illumina
        platform or '454'). When "env_sample_source" is among ``headers``,
        ``self.env_source_to_id`` is called, which stores the normalised
        header list in ``self.res_headers``; that list is what gets compared
        (order is ignored).

        An unknown platform now ends the run with an explicit message instead
        of failing later on an unset local variable. The mismatch report is
        based on the same normalised list as the comparison, so a header that
        was only renamed is not listed as missing or extra.

        Returns:
            bool: True when the headers match.

        Exits the process (``sys.exit``) on an unknown platform and on
        missing or extra headers, after printing a side-by-side report.
        """
        platform = self.general_config_dict['platform']
        if platform in C.illumina_list:
            known_header_list = self.known_header_list[platform]
        elif platform == '454':
            known_header_list = self.known_header_list['454']
        else:
            logger.error("in utils: check_headers - unknown platform")
            sys.exit("ERROR : unknown platform in check_headers: " + str(platform) + "\n")

        self.res_headers = headers
        if "env_sample_source" in headers:
            # replaces self.res_headers with the env_sample_source_id variant
            self.env_source_to_id(headers)

        if sorted(known_header_list) == sorted(self.res_headers):
            return True

        print("=" * 40)
        print("csv file header problem")
        print("%-20s %-20s" % ("REQUIRED", "YOUR CSV"))
        for i in sorted(known_header_list):
            if i in self.res_headers:
                print("%-20s%-20s" % (i,i))
            else:
                print("%-20s%-20s" % (i,"----------- <--- missing"))
        for i in self.res_headers:
            if i not in known_header_list:
                print("%-20s%-20s" % (" ",i+" <--- extra"))
        print("=" * 40)
        sys.exit("ERROR : unknown or missing headers\n")

    def env_source_to_id(self, headers):
        """
        Load the env_sample_source table and rename the matching header.

        Connects to the local test database when ``self.utils.is_local()`` is
        true, otherwise to 'env454' on 'bpcdb1'. All rows of
        ``env_sample_source`` are stored in ``self.env``, and
        ``self.res_headers`` becomes a copy of ``headers`` in which
        "env_sample_source" is replaced by "env_sample_source_id".
        """
        logger.error("self.utils.is_local() LLL2 metadata")
        logger.error(self.utils.is_local())
        if self.utils.is_local():
            host, db = 'localhost', "test_env454"
        else:
            host, db = 'bpcdb1', "env454"
        self.my_conn = MyConnection(host=host, db=db)
        self.env = self.my_conn.execute_fetch_select("""SELECT * FROM env_sample_source""")

        renamed = []
        for header in headers:
            renamed.append("env_sample_source_id" if header == "env_sample_source" else header)
        self.res_headers = renamed

    def configDictionaryFromFile_ini(self, config_file_path):
        """
        Read an ini file into a nested dictionary ``{section: {option: value}}``.

        Values are the strings returned by ``configparser``, except that the
        exact strings 'True'/'true' and 'False'/'false' become the booleans
        True and False.

        Returns:
            dict: one inner dictionary per section of the file.
        """
        import configparser

        parser = configparser.ConfigParser()
        parser.read(config_file_path)

        bool_words = {'True': True, 'true': True, 'False': False, 'false': False}
        result = {}
        for section_name in parser.sections():
            options = result[section_name] = {}
            for option_name in parser.options(section_name):
                raw = parser.get(section_name, option_name)
                options[option_name] = bool_words.get(raw, raw)

        return result

    def get_values(self, args, general_config_dict=None):
        """
        Collect the run settings for ``args.platform``.

        For every item known in ``self.pipeline_run_items[args.platform]`` the
        value is chosen in this order:

        1. the attribute of ``args`` with that name, when present and not None;
        2. the non-empty value from the platform section of
           ``general_config_dict``;
        3. the default from ``self.pipeline_run_items``.

        Afterwards every item of the 'general' section of
        ``general_config_dict`` is copied in, overriding earlier values.

        Args:
            args: parsed command-line arguments; must have ``platform``.
            general_config_dict: optional ``{section: {option: value}}``
                dictionary from the ini file. A missing platform section is
                treated as empty.

        Returns:
            dict: the collected settings.
        """
        if general_config_dict is None:
            general_config_dict = {}
        platform_section = general_config_dict.get(args.platform, {})
        defaults = self.pipeline_run_items[args.platform]

        collector = {}
        for item in defaults:
            # start from the default, then let args (then ini) values replace it
            collector[item] = defaults[item]
            if item in args and getattr(args, item) is not None:
                collector[item] = getattr(args, item)
            elif item in platform_section and platform_section[item] != '':
                collector[item] = platform_section[item]

        # everything from the [general] section is taken over as is
        collector.update(general_config_dict.get('general', {}))

        return collector

    def validate_args(self):
        """
        Merge command-line arguments and the optional ini file into one settings dictionary.

        Steps:

        * with ``self.args.configPath`` the ini file is read; it must contain
          both a [general] and a [<platform>] section, otherwise the process
          exits. Values are then collected with ``self.get_values``;
        * the database host/name pair is looked up in ``C.db_cnf``; an
          unknown combination ends the process;
        * platform checks: Illumina platforms require ``self.args.csvPath``;
          for 'vamps' the first letter of an existing project name is
          capitalised and, when a fasta file is given, the fasta file is
          stored as the project and 'from_fasta' is set; any platform other
          than Illumina, 'vamps', '454' and 'ion_torrent' ends the process;
        * the strings 'True'/'true'/'False'/'false' are converted to booleans;
        * 'run', 'platform', 'configPath', 'date' (today) and, when given,
          'input_dir' are set from the arguments.

        Original design notes: VAMPS users have a single project and dataset
        and supply an ini file OR command-line values, but no csv file; the
        MBL pipeline requires both a csv file and an ini file.

        Returns:
            dict: the validated settings.
        """
        collector={}

        if self.args.configPath:
            general_config_dict = self.configDictionaryFromFile_ini(self.args.configPath)
            if self.args.platform in general_config_dict and 'general' in general_config_dict:
                collector= self.get_values( self.args, general_config_dict)
            else:
                sys.exit("The ini file needs both a [general] and ["+ self.args.platform +"] section - Exiting.")
        else:
            # no configPath
            collector= self.get_values( self.args )

        collector['current_db_host_name'] = self.utils.find_in_nested_dict(C.db_cnf, {'host': collector['database_host'], 'db': collector['database_name']})
        if not collector['current_db_host_name']:
            sys.exit("""Please check -db_host and -db_name parameters, 
            the current combination does not exist: 'db_host' = %s, 'db_name' = %s """ % (collector['database_host'], collector['database_name']))

        if self.args.platform in C.illumina_list:
            print("Starting Illumina Pipeline")
            if not self.args.csvPath:
                sys.exit("illumina requires a csv file - Exiting")

        elif self.args.platform == 'vamps':
            print("Starting VAMPS Pipeline:")

            # Capitalise only when a project name is actually there; the
            # condition used to be inverted and failed on a missing project.
            if collector.get('project'):
                collector['project'] = collector['project'][:1].capitalize() + collector['project'][1:]
            else:
                logger.debug("No project found in vamps pipeline")
            if self.args.fasta_file:
                collector['project'] = self.args.fasta_file
                collector['from_fasta'] = True
        elif self.args.platform == '454':
            print("Starting 454 Pipeline")

        elif self.args.platform == 'ion_torrent':
            print("Starting Ion Torrent Pipeline")

        else:
            sys.exit("Validate args: Unknown Platform")

        if  self.args.configPath:
            collector['configPath'] = self.args.configPath
        else:
            collector['configPath'] = ""
        # these are all the bool items in the collector
        # they need to be converted from str to bool here
        for i in collector:
            if collector[i] == 'True' or collector[i] == 'true':
                collector[i] = True
            elif collector[i] == 'False' or collector[i] == 'false':
                collector[i] = False

        collector['run'] = self.args.run
        collector['platform'] = self.args.platform
        if self.args.input_dir:
            collector['input_dir'] = self.args.input_dir

        collector['date'] = str(datetime.date.today())
        return collector

Пример #24

Показать файл

Файл: chimera.py Проект: avoorhis/py_mbl_sequencing_pipeline

class Chimera:
    """ Define here """
    def __init__(self, runobj=None):
        """
        Set up suffixes, directories and tool settings for chimera checking.

        Args:
            runobj: run description object. ``run_keys``, ``run``,
                ``vamps_user_upload`` and ``platform`` are read from it;
                ``use_cluster`` (default True) and ``lane_name`` (default '')
                are optional. For a VAMPS user upload ``site``, ``user`` and
                ``trim_status_file_name`` are read as well.

        For a VAMPS user upload the SGE environment variables are set, the
        lane keys are loaded from the trim status JSON file (the file is now
        closed after reading) and the trimming directory is the input
        directory; otherwise the reads-overlap directory is used as input.
        """
        self.utils = PipelneUtils()
        self.runobj = runobj
        self.run_keys = self.runobj.run_keys
        self.rundate = self.runobj.run
        # optional attribute: default to cluster use when the run object has none
        self.use_cluster = getattr(self.runobj, 'use_cluster', True)
        self.chg_suffix = ".chg"
        self.chimeras_suffix = ".chimeras"
        self.ref_suffix = ".db"
        self.denovo_suffix = ".txt"
        self.nonchimeric_suffix = "." + C.nonchimeric_suffix  #".nonchimeric.fa"
        self.chimeric_suffix = ".chimeric.fa"
        self.base_suffix = "unique" + self.chimeras_suffix

        self.cluster_slots = {
            "grendel": [12, 8],
            "cricket": [40],
            "cluster5": [32]
        }

        # a missing or empty lane name both end up as ''
        lane_name = getattr(self.runobj, 'lane_name', '') or ''

        if self.runobj.vamps_user_upload:
            os.environ['SGE_ROOT'] = '/opt/sge'
            os.environ['SGE_CELL'] = 'grendel'
            path = os.environ['PATH']
            os.environ['PATH'] = '/opt/sge/bin/lx24-amd64:' + path
            site = self.runobj.site
            dir_prefix = self.runobj.user + '_' + self.runobj.run
            self.dirs = Dirs(self.runobj.vamps_user_upload,
                             dir_prefix,
                             self.runobj.platform,
                             lane_name=lane_name,
                             site=site)
            with open(self.runobj.trim_status_file_name, "r") as status_file:
                trim_status = json.loads(status_file.read())
            self.idx_keys = convert_unicode_dictionary_to_str(trim_status)["new_lane_keys"]
            self.analysis_dir = self.dirs.check_dir(self.dirs.analysis_dir)
            self.indir = self.dirs.check_dir(self.dirs.trimming_dir)
            self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)
            self.gast_dir = self.dirs.check_dir(self.dirs.gast_dir)

        else:
            site = ''
            dir_prefix = self.runobj.run
            self.dirs = Dirs(self.runobj.vamps_user_upload,
                             dir_prefix,
                             self.runobj.platform,
                             lane_name=lane_name,
                             site=site)
            self.indir = self.dirs.check_dir(self.dirs.reads_overlap_dir)
            self.outdir = self.dirs.check_dir(self.dirs.chimera_dir)

        self.usearch_cmd = C.usearch6_cmd
        if self.utils.is_local():
            self.usearch_cmd = C.usearch6_cmd_local
        self.refdb = C.chimera_checking_refdb
        if self.utils.is_local():
            self.refdb_local = C.chimera_checking_refdb_local
        self.its_refdb = C.chimera_checking_its_refdb
        self.input_file_names = self.make_chimera_input_illumina_file_names()

    def get_ref_db(self, dna_region):
        """
        Return the reference database to use for ``dna_region``.

        'ITS' (any letter case) selects ``C.chimera_checking_its_refdb``.
        Every other region gets ``C.chimera_checking_refdb``, or
        ``C.chimera_checking_refdb_local`` when ``self.utils.is_local()`` is
        true. The choice is logged at debug level.
        """
        if dna_region.upper() == 'ITS':
            its_db = C.chimera_checking_its_refdb
            logger.debug("got an ITS dna region so using refdb: " + its_db)
            return its_db

        standard_db = C.chimera_checking_refdb
        if self.utils.is_local():
            standard_db = C.chimera_checking_refdb_local
        logger.debug("using standard refdb: " + standard_db)
        return standard_db

    def make_chimera_input_illumina_file_names(self):
        """
        Map each run key to its uniqued input file, for files that exist.

        The expected file name is ``<run key>_<C.filtered_suffix>.unique``
        inside ``self.indir``; run keys without such a file are left out.

        Returns:
            dict: ``{run key: file name}``.
        """
        found = {}
        for run_key in self.run_keys:
            candidate = run_key + "_" + C.filtered_suffix + ".unique"
            if not os.path.exists(os.path.join(self.indir, candidate)):
                continue
            found[run_key] = candidate

        return found

    def get_current_dirname(self, in_or_out=""):
        if in_or_out == "":
            cur_dirname = self.indir
        else:
            cur_dirname = self.outdir
        return cur_dirname

    def is_chimera_check_file(self, filename):
        return filename.endswith(
            (self.chimeras_suffix + self.denovo_suffix,
             self.chimeras_suffix + self.ref_suffix, self.chimeric_suffix,
             self.nonchimeric_suffix))

    def get_current_filenames(self, cur_dirname):
        cur_file_names = []
        if cur_dirname == self.indir:
            cur_file_names = self.input_file_names.values()
        elif cur_dirname == self.outdir:
            cur_file_names = self.get_chimera_file_names(self.outdir)
        return cur_file_names

    def get_chimera_file_names(self, cur_dirname):
        cur_file_names = []
        for dirname, dirnames, filenames in os.walk(cur_dirname):
            cur_file_names = [
                filename for filename in filenames
                if (self.is_chimera_check_file(filename))
            ]
        return cur_file_names

    def read_file(self, source_name):
        """Return the lines of the text file ``source_name`` as a list, line endings included."""
        with open(source_name, "r") as handle:
            return list(handle)

    def illumina_sed(self, lines, target_name, regex, replace, uppercase):
        """
        Write ``lines`` to ``target_name`` with fasta headers rewritten.

        Header lines (starting with ">") get every match of ``regex``
        replaced by ``replace``. All other lines are written upper-cased
        when ``uppercase`` is true and unchanged otherwise.
        """
        with open(target_name, "w") as out_fh:
            for row in lines:
                if row.startswith(">"):
                    out_fh.write(regex.sub(replace, row))
                elif uppercase:
                    out_fh.write(row.upper())
                else:
                    out_fh.write(row)

    def call_illumina_sed(self, from_to):
        """
            from_to = from_frequency_to_size or from_size_to_frequency
        """
        sed_from_to = namedtuple(
            'sed_from_to',
            'find, replace, cur_dirname, cur_file_names, change_from_suffix, change_to_suffix, uppercase'
        )

        from_frequency_to_size = sed_from_to(
            find="frequency:",
            replace=";size=",
            cur_dirname=self.indir,
            cur_file_names=self.get_current_filenames(self.indir),
            change_from_suffix="",
            change_to_suffix=self.chg_suffix,
            uppercase=True)

        from_size_to_frequency = sed_from_to(
            find=";size=",
            replace="frequency:",
            cur_dirname=self.outdir,
            cur_file_names=self.get_chimera_file_names(self.outdir),
            change_from_suffix="",
            change_to_suffix="",
            uppercase=False)

        if (from_to == "from_frequency_to_size"):
            tuple_name = from_frequency_to_size
        elif (from_to == "from_size_to_frequency"):
            tuple_name = from_size_to_frequency

        regex = re.compile(r"%s" % tuple_name.find)
        #         logger.debug("find = %s, replace = %s" % (find, replace))
        if (not tuple_name.cur_file_names) and (tuple_name
                                                == from_frequency_to_size):
            self.utils.print_both(
                'ERROR: Did not find uniqued files ("%s") in %s, please check if the previous step has finished. Exiting.\n'
                % (C.filtered_suffix + ".unique", self.indir))
            sys.exit()
        for cur_file_name in tuple_name.cur_file_names:
            file_name = os.path.join(tuple_name.cur_dirname, cur_file_name)
            source_name = file_name + tuple_name.change_from_suffix
            target_name = file_name + tuple_name.change_to_suffix
            lines = self.read_file(source_name)
            self.illumina_sed(lines, target_name, regex, tuple_name.replace,
                              tuple_name.uppercase)

    def illumina_freq_to_size_in_chg(self):
        find1 = "frequency:"
        replace1 = ";size="
        regex1 = re.compile(r"%s" % find1)

        #         logger.debug("cur_file_names: ")
        #         pprint(cur_file_names)
        cur_dirname = self.get_current_dirname()
        cur_file_names = self.get_current_filenames(cur_dirname)
        change_from_suffix = ""
        change_to_suffix = self.chg_suffix
        #         logger.debug("find = %s, replace = %s" % (find, replace))

        for cur_file_name in cur_file_names:
            file_name = os.path.join(cur_dirname, cur_file_name)
            lines = self.utils.read_file(file_name + change_from_suffix)
            with open(file_name + change_to_suffix, "w") as target:
                for line in lines:
                    if line.startswith(">"):
                        line1 = regex1.sub(replace1, line)
                    else:
                        line1 = line.upper()
#                     logger.debug(line1)
                    target.write(line1)

    def illumina_size_to_freq_in_chimer(self):
        find1 = ";size="
        replace1 = "frequency:"
        regex1 = re.compile(r"%s" % find1)

        cur_file_names = self.get_chimera_file_names(self.outdir)

        for file_chim in cur_file_names:
            file_chim_path = os.path.join(self.outdir, file_chim)
            lines = self.utils.read_file(file_chim_path)
            with open(file_chim_path, "w") as target:
                for line in lines:
                    line1 = regex1.sub(replace1, line)
                    target.write(line1)

    def illumina_rm_size_files(self):
        for idx_key in self.input_file_names:
            file_name = os.path.join(
                self.indir, self.input_file_names[idx_key] + self.chg_suffix)
            if os.path.exists(file_name):
                pass
                # os.remove(file_name)

    def check_if_chimera_dir_empty(self):
        if not os.listdir(self.outdir):
            self.utils.print_both(
                'ERROR: Did not find files in %s, something is wrong. First check if you ran the command on a cluster. Exiting.\n'
                % self.outdir)
            sys.exit()

    def check_if_cluster_is_done(self, time_before):
        """
        Tell whether the cluster has finished the chimera checking jobs.

        Runs ``qstat`` through the shell and counts the output lines that
        contain both ``time_before`` and "chimera_ch".

        Returns:
            bool: True when that count is 0.

        Any failure is reported through ``self.utils.print_both`` and then
        re-raised unchanged.
        """
        check_qstat_cmd_line = "qstat | grep \"%s\" | grep chimera_ch | wc -l" % time_before

        self.utils.print_both("check_qstat_cmd_line = %s" %
                              check_qstat_cmd_line)

        try:
            qstat_proc = subprocess.Popen(check_qstat_cmd_line,
                                          stdout=subprocess.PIPE,
                                          shell=True)
            (counted, _err) = qstat_proc.communicate()
            num_proc = int(counted)
            self.utils.print_both("qstat is running %s 'vsearch' processes" %
                                  num_proc)
        except BaseException:
            # same scope as the bare "except:" this replaces
            self.utils.print_both(
                "Chimera checking can be done only on a cluster.")
            raise

        return num_proc == 0

    def create_chimera_cmd(self, ref_db):
        """
        /usr/local/bin/vsearch
        -uchime_denovo
        /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/reads_overlap/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chg
        -uchimeout
        /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/chimera/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chimeras.txt
        -chimeras
        /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/chimera/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chimeras.txt.chimeric.fa
        -notrunclabels
        ---
        /usr/local/bin/vsearch
        -uchime_ref
        /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/reads_overlap/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chg
        -uchimeout
        /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/chimera/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chimeras.db
        -chimeras
        /Users/ashipunova/BPC/py_mbl_sequencing_pipeline/test/miseq/20150223/lane_1_B/analysis/chimera/TGACCA_NNNNCGACG_1_MERGED-MAX-MISMATCH-3.unique.chimeras.db.chimeric.fa
        -notrunclabels
        -strand
        plus
        -db
        /groups/g454/blastdbs/rRNA16S.gold.fasta

        """
        command_line = []

        ref_or_novo_options = {
            self.denovo_suffix: "-uchime_denovo",
            self.ref_suffix: "-uchime_ref"
        }
        for suff, opt in ref_or_novo_options.items():
            input_file_name = self.indir + "/$filename_base" + self.chg_suffix
            output_file_name = self.outdir + "/$filename_base" + self.chimeras_suffix + suff

            ref_add = ""
            if (opt == "-uchime_ref"):
                ref_add = "-strand plus -db %s" % ref_db

            uchime_cmd = """%s %s %s -uchimeout %s -chimeras %s%s -notrunclabels %s
            """ % (self.usearch_cmd, opt, input_file_name, output_file_name,
                   output_file_name, self.chimeric_suffix, ref_add)
            logger.debug("UUU = uchime_cmd = %s" % uchime_cmd)
            logger.debug("+++")
            command_line.append(uchime_cmd)

        return command_line

    def create_chimera_cmd_old(self,
                               input_file_name,
                               output_file_name,
                               ref_or_novo,
                               ref_db=""):
        """
        Build one chimera checking command line (older variant).

        Args:
            input_file_name: fasta file to check.
            output_file_name: base name of the result files; the chimeras
                suffix plus the de novo or reference suffix is appended.
            ref_or_novo: "denovo" or "ref". Any other value is reported via
                ``self.utils.print_both`` and the command is still built,
                without a mode option.
            ref_db: reference database, used for "ref" only.

        The command starts with ``C.clusterize_cmd`` (left out when running
        locally) and ``self.usearch_cmd``, followed by the mode option, the
        input file, "-db" (ref only), "-uchimeout", "-nonchimeras",
        "-chimeras", "-strand plus" (ref only) and "-notrunclabels".

        See http://www.drive5.com/usearch/manual/uchime_denovo.html for the
        usearch options.

        Returns:
            str: the command line.
        """
        mode_option = ""
        db_option = ""
        strand_option = ""

        if ref_or_novo == "denovo":
            mode_option = " -uchime_denovo "
            output_file_name = output_file_name + self.chimeras_suffix + self.denovo_suffix
        elif ref_or_novo == "ref":
            mode_option = " -uchime_ref "
            output_file_name = output_file_name + self.chimeras_suffix + self.ref_suffix
            db_option = " -db " + ref_db
            strand_option = " -strand plus"
        else:
            self.utils.print_both(
                "Error: Incorrect method, should be \"denovo\" or \"ref\"")
        self.utils.print_both("output_file_name = %s" % output_file_name)

        launcher = C.clusterize_cmd
        if self.utils.is_local():
            launcher = ""
        full_cmd = launcher + " " + self.usearch_cmd
        logger.debug("self.usearch_cmd FROM create_chimera_cmd = %s" %
                     (full_cmd))

        full_cmd = full_cmd + mode_option + input_file_name
        logger.debug("uchime_cmd_append FROM create_chimera_cmd = %s" %
                     (mode_option))

        full_cmd = full_cmd + db_option
        logger.debug("db_cmd_append FROM create_chimera_cmd = %s" %
                     (db_option))

        # if we need nonchimeric for denovo and db separate we might create them here
        full_cmd = (full_cmd
                    + " -uchimeout " + output_file_name
                    + " -nonchimeras " + (output_file_name + self.nonchimeric_suffix)
                    + " -chimeras " + (output_file_name + self.chimeric_suffix)
                    + strand_option
                    + " -notrunclabels")

        logger.debug("uchime_cmd FROM create_chimera_cmd = %s" % (full_cmd))
        return full_cmd

    def get_sge_cluster_name(self):
        """
        Return the cluster name derived from the output of ``qstat -F``.

        The first whitespace-separated token containing "hostname" is used:
        the text after its first "=" is cut at the first "-".
        Returns None when no such token is present.
        """
        qstat = subprocess.run(['qstat', '-F'], stdout=subprocess.PIPE)
        for token in qstat.stdout.decode('utf-8').split():
            # e.g. qf:hostname=grendel-01.bpcservers.private -> "grendel"
            if "hostname" in token:
                host_name = token.split("=")[1]
                return host_name.split("-")[0]
        return None

    def get_sge_slot_number(self):
        """
        Return the largest ``qc:slots`` value reported by ``qstat -F slots``.

        Note from the original author: doesn't work on cricket because it
        reports both qc:slots=12 and qc:slots=8.

        Raises:
            ValueError: if the output contains no ``qc:slots`` token.
        """
        qstat = subprocess.run(['qstat', '-F', 'slots'],
                               stdout=subprocess.PIPE)
        tokens = qstat.stdout.decode('utf-8').split()
        slot_counts = {
            int(token.split("=")[-1])
            for token in tokens
            if token.startswith('qc:slots')
        }
        return max(slot_counts)

    # TODO: temp! take from util. change illumina-files to use util, too
    #   create_job_array_script(self, command_line, dir_to_run, files_list, runobj)
    # feb 25 2019 removed, because didn't work on grendel:
    #  Use the allslots pe and all available slots on that cluster
    # #$ -pe allslots %s
    def create_job_array_script(self, script_file_name_base, command_line,
                                dir_to_run, files_list):
        # sge_slot_number = self.get_sge_slot_number()
        sge_cluster_name = self.get_sge_cluster_name()
        sge_slot_number = self.cluster_slots[sge_cluster_name][0]
        logger.debug("sge_slot_number FROM create_job_array_script = %s" %
                     (sge_slot_number))

        files_string = " ".join(files_list)
        files_list_size = len(files_list)
        #         command_file_name = os.path.basename(command_line.split(" ")[0])
        script_file_name = script_file_name_base + "_" + self.runobj.run + "_" + self.runobj.lane_name + ".sh"
        script_file_name_full = os.path.join(dir_to_run, script_file_name)
        log_file_name = script_file_name + ".sge_script.sh.log"
        email_mbl = C.email_mbl
        # self.utils.make_users_email()
        text = (
            '''#!/bin/bash
#$ -cwd
#$ -S /bin/bash
#$ -N %s
# Giving the name of the output log file
#$ -o %s
# Combining output/error messages into one file
#$ -j y
# Send mail to these users
#$ -M %s
# Send mail at job end (e); -m as sends abort, suspend.
#$ -m as
# max_running_tasks
#$ -tc 15
-# Use the allslots pe and all available slots on that cluster
#$ -pe allslots %s
#$ -t 1-%s
# Now the script will iterate %s times.

  file_list=(%s)

  i=$(expr $SGE_TASK_ID - 1)
  echo "i = $i"
  . /bioware/root/Modules/etc/profile.modules
  module load bioware
  module load vsearch

  INFILE=${file_list[$i]}

  filename=$(basename $INFILE)
  echo "INFILE = $INFILE"
  filename_base="${filename%%.*}"
  echo "filename_base = $filename_base"
  echo "%s"
  echo "%s"
  %s
  %s
''' % (script_file_name, log_file_name, email_mbl, sge_slot_number,
        files_list_size, files_list_size, files_string, command_line[0],
        command_line[1], command_line[0], command_line[1])
            # ''' % (script_file_name, log_file_name, email_mbl, files_list_size, files_list_size, files_string, command_line)
        )
        self.utils.open_write_close(script_file_name_full, text)
        return script_file_name

    def create_not_SGE_script(self, script_file_name_base, command_line,
                              dir_to_run, files_list):

        files_string = " ".join(files_list)
        script_file_name = script_file_name_base + "_" + self.runobj.run + "_" + self.runobj.lane_name + ".sh"
        script_file_name_full = os.path.join(dir_to_run, script_file_name)
        text = (
            '''#!/bin/bash

    file_list=(%s)

    . /bioware/root/Modules/etc/profile.modules
    module load bioware
    module load vsearch

    n=0
    for INFILE in "${file_list[@]}"
    do      
    n=$[n + 1]
    echo $n
    echo "INFILE = $INFILE"
    filename=$(basename $INFILE)
    filename_base="${filename%.*}"
    echo "filename_base = $filename_base"

    echo "%s"
    echo "%s"
    %s
    %s
    done
    ''' % (files_string, command_line[0], command_line[1], command_line[0],
           command_line[1])
            # ''' % (script_file_name, log_file_name, email_mbl, files_list_size, files_list_size, files_string, command_line)
        )
        self.utils.open_write_close(script_file_name_full, text)
        return script_file_name

    def chimera_checking(self):
        """
        Create and launch the job-array script that runs the chimera commands.

        The dna region is taken from the samples named in
        self.input_file_names (first element of the de-duplicated set).
        The script is generated and started even when that region is not
        listed in C.regions_to_chimera_check; only the return value differs.

        Returns:
            ('NOREGION', 'No regions found that need checking', '') when the
            region is not listed, otherwise the plain string
            "The vsearch commands were created".
        """
        file_list = self.dirs.get_all_files_by_ext(self.indir, self.chg_suffix)
        logger.debug("FFF = file_list = %s" % (file_list))

        # TODO: method
        sample_regions = {
            self.runobj.samples[idx_key].dna_region
            for idx_key in self.input_file_names
        }
        dna_region = list(sample_regions)[0]
        chimera_region_found = dna_region in C.regions_to_chimera_check
        if not chimera_region_found:
            logger.debug('region not checked: ' + dna_region)

        commands = self.create_chimera_cmd(self.get_ref_db(dna_region))
        sh_script_file_name = self.create_job_array_script(
            "chimera_checking", commands, self.indir, file_list)
        self.utils.call_sh_script(
            os.path.join(self.indir, sh_script_file_name), self.indir)

        self.utils.print_both("self.dirs.chmod_all(%s)" % (self.indir))
        self.dirs.chmod_all(self.indir)
        logger.debug('sh_script_file_name: ' + sh_script_file_name)

        if chimera_region_found:
            return ("The vsearch commands were created")
        return ('NOREGION', 'No regions found that need checking', '')

    def get_chimeric_ids(self):
        """
        Collect the read ids found in the selected chimeric fasta files.

        Only files in self.outdir ending with self.chimeric_suffix are
        considered, and of those only the ones ending with the suffix
        chosen by get_chimeras_suffix for that file.

        Returns:
            A set of ids.
        """
        collected_ids = set()
        candidate_names = self.get_chimera_file_names(self.outdir)
        file_ratio = self.check_chimeric_stats()

        for candidate in candidate_names:
            if not candidate.endswith(self.chimeric_suffix):
                continue
            # TODO: run ones for each
            # file_base = ".".join(file_name.split(".")[0:3]) (for txt and db)
            wanted_suffix = self.get_chimeras_suffix(file_ratio, candidate)
            if not candidate.endswith(wanted_suffix):
                continue
            candidate_path = os.path.join(self.outdir, candidate)
            self.utils.print_both("Get ids from %s" % candidate_path)
            fasta_reader = fa.ReadFasta(candidate_path)
            collected_ids.update(set(fasta_reader.ids))
        return collected_ids

    def get_chimeras_suffix(self, file_ratio, file_name):
        """
        Choose which chimeric fasta suffix should be used for file_name.

        file_ratio maps a file base name (the first three dot-separated
        parts of file_name) to a (percent_ref, ratio) pair, as produced by
        check_chimeric_stats.

        Returns:
            chimeras + denovo + chimeric suffix when ratio > 3 (use only the
            de novo results), otherwise just the chimeric suffix.
        """
        base_key = ".".join(file_name.split(".")[0:3])
        (_percent_ref, ref_to_denovo) = file_ratio[base_key]

        if ref_to_denovo > 3:
            return self.chimeras_suffix + self.denovo_suffix + self.chimeric_suffix
        return self.chimeric_suffix

    def move_out_chimeric(self):
        """
        Write a non-chimeric copy of every input fasta file.

        For each entry of self.input_file_names, sequences whose id is not
        in the set returned by get_chimeric_ids are stored into
        "<input path>" + self.nonchimeric_suffix.
        """
        ids_to_drop = self.get_chimeric_ids()
        for idx_key in self.input_file_names:
            source_path = os.path.join(self.indir,
                                       self.input_file_names[idx_key])
            # The input is opened and closed once with ReadFasta before it
            # is streamed.
            fa.ReadFasta(source_path).close()

            kept_output = fa.FastaOutput(source_path + self.nonchimeric_suffix)

            sequences = fa.SequenceSource(source_path, lazy_init=False)
            while sequences.next():
                if sequences.id not in ids_to_drop:
                    kept_output.store(sequences, store_frequencies=False)
            kept_output.close()

    def check_chimeric_stats(self):
        """
        Compute reference vs de novo chimera statistics per file base name.

        Returns:
            dict mapping file base name to a (percent_ref, ratio) pair:
              * (0, 0) when there are no reference chimeras or no lines;
              * (percent_ref, percent_ref) when there are no de novo
                chimeras;
              * (percent_ref, ref / de novo ratio) otherwise.
            Details are printed for files with percent_ref > 15.
        """
        total_suffix = self.denovo_suffix  # ".txt" or ".db", doesn't matter
        ref_fa_suffix = self.ref_suffix + self.chimeric_suffix  # ".db.chimeric.fa"
        denovo_fa_suffix = self.denovo_suffix + self.chimeric_suffix  # ".txt.chimeric.fa"
        base_names = self.get_basenames(self.get_current_filenames(self.outdir))
        stats = {}
        for base_name in base_names:
            total_path = os.path.join(self.outdir, base_name + total_suffix)
            ref_path = os.path.join(self.outdir, base_name + ref_fa_suffix)
            denovo_path = os.path.join(self.outdir,
                                       base_name + denovo_fa_suffix)

            total_count = int(self.wccount(total_path) or 0)
            ref_count = int(self.get_fa_lines_count(ref_path) or 0)
            denovo_count = int(self.get_fa_lines_count(denovo_path) or 0)

            if ref_count == 0 or total_count == 0:
                stats[base_name] = (0, 0)
                continue

            percent_ref = self.percent_count(total_count, ref_count)

            if denovo_count == 0:
                # Use ref instead of ratio, because we are actually looking
                # for a huge difference between ref and denovo
                # (ref > 15 and denovo = 0).
                stats[base_name] = (percent_ref, percent_ref)
                continue

            ratio = 0
            percent_denovo = 0
            if denovo_count > 0:
                ratio = self.count_ratio(ref_count, denovo_count)
                percent_denovo = self.percent_count(total_count, denovo_count)
            stats[base_name] = (percent_ref, ratio)

            if percent_ref > 15:
                self.utils.print_both("=" * 50)

                self.utils.print_both(base_name)
                self.utils.print_both(
                    "all_lines = %s, ref_lines = %s, denovo_lines = %s" %
                    (total_count, ref_count, denovo_count))
                self.utils.print_both("ratio = %s" % ratio)
                self.utils.print_both("percent_ref = %s, percent_denovo = %s" %
                                      (percent_ref, percent_denovo))
        return stats

    def get_basenames(self, filenames):
        """
        Return the set of distinct base names among filenames.

        A base name is the first three dot-separated parts of a file name;
        it is kept only when it ends with self.base_suffix.
        """
        candidates = (".".join(name.split(".")[0:3]) for name in filenames)
        return {
            candidate
            for candidate in candidates
            if candidate.endswith(self.base_suffix)
        }

    def wccount(self, filename):
        """
        Return the line count of filename as reported by ``wc -l``.

        The result is the first whitespace-separated field of the command
        output, exactly as returned by subprocess.check_output (not
        converted to int here).
        """
        wc_output = subprocess.check_output(['wc', '-l', filename])
        fields = wc_output.split()
        return fields[0]

    def count_ratio(self, ref_num, denovo_num):
        """
        Return ref_num / denovo_num as a float.

        Falsy arguments (None, 0, "") count as 0. Returns None when the
        denominator is zero, i.e. there are no de novo chimeras to compare
        against.
        """
        numerator = float(ref_num or 0)
        denominator = float(denovo_num or 0)
        try:
            return numerator / denominator
        except ZeroDivisionError:
            return None

    def get_fa_lines_count(self, file_name):
        """
        Count the fasta header lines (lines starting with ">") in file_name.

        The file is opened in a ``with`` block so the handle is always
        closed (the previous version leaked it), and lines are counted
        lazily instead of reading the whole file into a list.

        Returns:
            The number of header lines, or 0 when the file cannot be opened
            or read (the error is reported through print_both).
        """
        # todo: use fastalib to get cnt?
        # return fa.SequenceSource(file_name, lazy_init = False).total_seq
        try:
            with open(file_name) as fasta_file:
                return sum(1 for line in fasta_file if line.startswith('>'))
        except IOError as e:
            self.utils.print_both(e)
            return 0

    def percent_count(self, all_lines, chimeric_count):
        """
        Return chimeric_count as a percentage of all_lines (float).

        Falsy arguments (None, 0, "") count as 0. Returns None when
        all_lines is zero.
        """
        scaled_count = float(chimeric_count or 0) * 100
        total = float(all_lines or 0)
        try:
            return scaled_count / total
        except ZeroDivisionError:
            return None

    """
    -----------------------------------------------------------------------------
        For 454.
        not tested
    """

    def chimera_denovo(self):
        """
        Run de novo UCHIME chimera detection for each key in self.idx_keys.

        (For 454; marked "not tested" by the original author.)

        A key is processed only when "<indir>/<key>.abund.fa" exists and
        its dna region is listed in C.regions_to_chimera_check (any region
        is accepted when self.runobj.vamps_user_upload is set). The third
        whitespace-separated token of each command's output, when present,
        is collected as a cluster id.

        Returns:
            ('NOREGION', message, '') when no key had a region to check;
            ('SUCCESS', message, cluster_id_list) when ids were collected;
            ('ERROR', message, cluster_id_list) otherwise.

        Raises:
            OSError: re-raised when launching the command fails and the
                pipeline is not running locally.
        """
        chimera_region_found = False
        output = {}
        cluster_id_list = []

        for idx_key in self.idx_keys:
            input_file_name = os.path.join(self.indir, idx_key + '.abund.fa')
            if not os.path.isfile(input_file_name):
                continue

            output_file_name = os.path.join(self.outdir,
                                            idx_key + '.chimera.denovo')
            log_file = os.path.join(self.outdir, idx_key + ".denovo.log")

            dna_region = self.runobj.samples[idx_key].dna_region
            logger.debug("dna_region = %s" % dna_region)
            if self.runobj.vamps_user_upload:
                # VAMPS users can chimera check regardless of region chosen
                chimera_region_found = True
            elif dna_region in C.regions_to_chimera_check:
                chimera_region_found = True
            else:
                logger.debug('region not checked: ' + dna_region)
                continue

            self.utils.print_both(
                "input_file_name = %s \noutput_file_name = %s" %
                (input_file_name, output_file_name))

            uchime_cmd = ''
            if self.use_cluster:
                uchime_cmd += C.clusterize_cmd
                uchime_cmd += " "
                uchime_cmd += " -log "
                uchime_cmd += log_file
                uchime_cmd += " "
            uchime_cmd += self.usearch_cmd
            uchime_cmd += " -uchime_denovo "
            uchime_cmd += input_file_name
            uchime_cmd += " -uchimeout "
            uchime_cmd += output_file_name

            logger.debug("uchime_denovo_cmd = %s" % (uchime_cmd))

            try:
                logger.info("chimera denovo command: " + str(uchime_cmd))
                self.utils.print_both("chimera denovo command: " +
                                      str(uchime_cmd))
                output[idx_key] = subprocess.check_output(uchime_cmd,
                                                          shell=True)
                self.utils.print_both("chimera denovo result: " +
                                      str(output[idx_key]))
                items = output[idx_key].split()
                if len(items) > 2:
                    cluster_id_list.append(items[2])

            except OSError:
                e = sys.exc_info()[1]
                failure_message = "Error: Execution of %s failed: %s" % (
                    uchime_cmd, e)
                self.utils.print_both(
                    "Error: Problems with this command: %s" % (uchime_cmd))
                # sys.stderr.write replaces the Python 2 "print >>" statement,
                # which raises TypeError when evaluated under Python 3.
                sys.stderr.write(failure_message + "\n")
                if not self.utils.is_local():
                    self.utils.print_both(failure_message)
                    raise

        if not chimera_region_found:
            return ('NOREGION', 'No regions found that need checking', '')

        self.utils.print_both('Finished Chimera Denovo')
        if cluster_id_list:
            return ('SUCCESS',
                    'uchime ref seems to have been submitted successfully',
                    cluster_id_list)
        else:
            return ('ERROR', 'uchime ref returned no cluster IDs',
                    cluster_id_list)

    def chimera_reference(self):
        """
        Run reference-based UCHIME chimera detection for each key in
        self.run_keys.

        A key is processed only when its dna region is listed in
        C.regions_to_chimera_check (any region is accepted when
        self.runobj.vamps_user_upload is set) and "<indir>/<key>.abund.fa"
        exists. ITS regions use self.its_refdb, all others self.refdb.

        Returns:
            ('NOREGION', message, '') when no key had a region to check;
            ('ERROR', message, idx_key) when running on the cluster and the
            length of a command's output is outside 40..50;
            ('SUCCESS', message, cluster_id_list) otherwise.

        Raises:
            OSError: re-raised when launching the command fails.
        """
        chimera_region_found = False
        output = {}
        cluster_id_list = []
        for idx_key in self.run_keys:

            dna_region = self.runobj.samples[idx_key].dna_region
            if self.runobj.vamps_user_upload:
                # VAMPS users can chimera check regardless of region chosen
                chimera_region_found = True
            elif dna_region in C.regions_to_chimera_check:
                chimera_region_found = True
            else:
                logger.debug('region not checked: ' + dna_region)
                continue

            input_file_name = os.path.join(self.indir, idx_key + '.abund.fa')
            output_file_name = os.path.join(self.outdir,
                                            idx_key + ".chimera.ref")
            log_file = os.path.join(self.outdir, idx_key + ".ref.log")
            logger.debug("OUT FILE NAME: " + output_file_name)

            if not os.path.isfile(input_file_name):
                continue

            # which ref db to use?
            if dna_region.upper() == 'ITS':
                logger.debug("got an ITS dna region so using refdb: " +
                             self.its_refdb)
                ref_db = self.its_refdb
            else:
                logger.debug("using standard refdb: " + self.refdb)
                ref_db = self.refdb

            uchime_cmd = ''
            if self.use_cluster:
                uchime_cmd = C.clusterize_cmd
                uchime_cmd += " "
                uchime_cmd += " -log "
                uchime_cmd += log_file
                uchime_cmd += " "
            uchime_cmd += self.usearch_cmd
            uchime_cmd += " -uchime_ref "
            uchime_cmd += input_file_name
            uchime_cmd += " -uchimeout "
            uchime_cmd += output_file_name
            uchime_cmd += " -db "
            uchime_cmd += ref_db
            uchime_cmd += " -strand "
            uchime_cmd += "plus"

            logger.debug("uchime_ref_cmd = %s" % (uchime_cmd))

            try:
                # The old format string had no placeholder and raised
                # TypeError. The attribute is called when it is callable.
                # NOTE(review): presumably get_vsearch_version is a method
                # of the utils object -- confirm.
                vsearch_version = self.utils.get_vsearch_version
                if callable(vsearch_version):
                    vsearch_version = vsearch_version()
                logger.info("vsearch version: %s" % (vsearch_version, ))
                logger.info("chimera reference command: " + str(uchime_cmd))
                output[idx_key] = subprocess.check_output(uchime_cmd,
                                                          shell=True)
                # Same guard as chimera_denovo: short output has no third
                # token to collect.
                items = output[idx_key].split()
                if len(items) > 2:
                    cluster_id_list.append(items[2])
                if 40 < len(output[idx_key]) < 50:
                    logger.debug(
                        idx_key +
                        " uchime ref seems to have been submitted successfully"
                    )
                elif self.use_cluster:
                    sys.stderr.write("Error: uchime ref may be broke\n")
                    self.utils.print_both("Error: uchime ref may be broke")

            except OSError:
                e = sys.exc_info()[1]
                # Two values need two placeholders; the old one-placeholder
                # message raised TypeError inside the handler.
                failure_message = (
                    "Error: Execution of chimera_reference failed: %s (%s)" %
                    (uchime_cmd, e))
                sys.stderr.write(failure_message + "\n")
                self.utils.print_both(failure_message)
                raise

        if not chimera_region_found:
            return ('NOREGION', 'No regions found that need checking', '')

        for idx_key in output:
            if (len(output[idx_key]) > 50
                    or len(output[idx_key]) < 40) and self.use_cluster:
                return ('ERROR', 'uchime ref may have broken or empty',
                        idx_key)
        self.utils.print_both('Finished Chimera Reference')
        return ('SUCCESS',
                'uchime ref seems to have been submitted successfully',
                cluster_id_list)

    def write_chimeras_to_deleted_file(self):
        """
        Append the ids of chimeric reads to each key's "deleted" file.

        For every key in self.run_keys the files
        "<outdir>/<key>.chimera.denovo" and "<outdir>/<key>.chimera.ref" are
        read when they exist. A line is taken as chimeric when its last
        whitespace-separated field is 'Y'; the read id is the second field
        up to the first ';'. Each distinct id is appended once to
        "<indir>/<key>.deleted.txt" as "<id>\\tChimera".

        Blank lines and lines with fewer than two fields are skipped.
        """
        for idx_key in self.run_keys:
            # dict used to remove dupes while keeping first-seen order
            chimera_deleted = {}
            denovo_file = os.path.join(self.outdir,
                                       idx_key + '.chimera.denovo')
            ref_file = os.path.join(self.outdir, idx_key + ".chimera.ref")
            # deleted file is in trimming dir for vampsuser
            deleted_file = os.path.join(self.indir, idx_key + ".deleted.txt")

            for result_file in (denovo_file, ref_file):
                if not os.path.isfile(result_file):
                    continue
                with open(result_file, "r") as result_handle:
                    for line in result_handle:
                        fields = line.strip().split()
                        if len(fields) < 2:
                            # nothing to parse on a blank or truncated line
                            continue
                        if fields[-1] == 'Y':
                            read_id = fields[1].split(';')[0]
                            chimera_deleted[read_id] = 'chimera'

            # open to append as trimming deletions are already there
            with open(deleted_file, "a") as deleted_handle:
                for read_id in chimera_deleted:
                    deleted_handle.write(read_id + "\tChimera\n")