def __init__(self, runobj = None):
        self.runobj      = runobj
        self.rundate     = self.runobj.run
        self.use_cluster = 1       
#        if self.runobj.vamps_user_upload:
#            site       = self.runobj.site
#            dir_prefix = self.runobj.user + '_' + self.runobj.run
#        else:
#            site = ''
#            dir_prefix = self.runobj.run         
#        dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, site = site)

        if self.runobj.vamps_user_upload:
            site = self.runobj.site
            dir_prefix = self.runobj.user + '_' + self.runobj.run
        else:
            site = ''
            dir_prefix = self.runobj.run
        if self.runobj.lane_name:
            lane_name = self.runobj.lane_name
        else:
            lane_name = ''
        
        dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site) 
 
        
        self.analysis_dir = dirs.check_dir(dirs.analysis_dir)
        self.fasta_dir    = dirs.check_dir(dirs.reads_overlap_dir)
        self.gast_dir     = dirs.check_dir(dirs.gast_dir)

        host_name     = runobj.database_host
        database_name = runobj.database_name
        
        self.filenames   = []
        self.my_conn     = MyConnection(host = 'newbpcdb2', db="env454")
#         self.my_conn     = MyConnection()    
        self.sequence_table_name = "sequence_ill" 
        self.sequence_field_name = "sequence_comp" 
        self.my_csv              = None

        self.unique_file_counts = dirs.unique_file_counts
        dirs.delete_file(self.unique_file_counts)
        self.seq_id_dict = {}
        self.tax_id_dict = {}
        self.run_id      = None
#        self.nonchimeras_suffix = ".nonchimeric.fa"
        self.nonchimeric_suffix = "." + C.nonchimeric_suffix #".nonchimeric.fa"
        self.fa_unique_suffix   = ".fa." + C.unique_suffix #.fa.unique
#         self.merge_unique_suffix = "." + C.filtered_suffix + "." + C.unique_suffix #.MERGED-MAX-MISMATCH-3.unique
        self.suffix_used        = ""
Example #2
class dbUpload:
    """db upload methods"""
    Name = "dbUpload"
    """
    TODO: add tests and test case
    TODO: change hardcoded values to args: 
        self.sequence_table_name = "sequence_ill", 
        self.sequence_field_name = "sequence_comp"  
    TODO: generalize all bulk uploads and all inserts to avoid copy-and-paste?
    TODO: add refssu_id
    TODO: change csv validation for new fields
    Order (see the usage sketch after this docstring):
        # put_run_info
        # insert_seq()
        # insert_pdr_info()
        # gast
        # insert_taxonomy()
        # insert_sequence_uniq_info_ill()

    """
    def __init__(self, runobj = None):
        self.utils       = PipelneUtils()
        self.runobj      = runobj
        self.rundate     = self.runobj.run
        self.use_cluster = 1       
        self.unique_fasta_files = []
#        if self.runobj.vamps_user_upload:
#            site       = self.runobj.site
#            dir_prefix = self.runobj.user + '_' + self.runobj.run
#        else:
#            site = ''
#            dir_prefix = self.runobj.run         
#        dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, site = site)

        if self.runobj.vamps_user_upload:
            site = self.runobj.site
            dir_prefix = self.runobj.user + '_' + self.runobj.run
        else:
            site = ''
            dir_prefix = self.runobj.run
        if self.runobj.lane_name:
            lane_name = self.runobj.lane_name
        else:
            lane_name = ''
        
        self.dirs = Dirs(self.runobj.vamps_user_upload, dir_prefix, self.runobj.platform, lane_name = lane_name, site = site) 
 
        
        self.analysis_dir = self.dirs.check_dir(self.dirs.analysis_dir)
        self.fasta_dir    = self.dirs.check_dir(self.dirs.reads_overlap_dir)
        self.gast_dir     = self.dirs.check_dir(self.dirs.gast_dir)

        host_name     = runobj.database_host
        database_name = runobj.database_name
        
        self.filenames   = []
        self.my_conn     = MyConnection(host = 'newbpcdb2.jbpc-np.mbl.edu', db="env454")
#         self.my_conn     = MyConnection()

#         self.my_conn     = MyConnection(host = 'localhost', db="test_env454")
        self.sequence_table_name = "sequence_ill" 
        self.sequence_field_name = "sequence_comp" 
        self.my_csv              = None

        self.unique_file_counts = self.dirs.unique_file_counts
        self.dirs.delete_file(self.unique_file_counts)
        self.seq_id_dict = {}
        self.tax_id_dict = {}
        self.run_id      = None
#        self.nonchimeras_suffix = ".nonchimeric.fa"
        self.nonchimeric_suffix = "." + C.nonchimeric_suffix #".nonchimeric.fa"
        self.fa_unique_suffix   = ".fa." + C.unique_suffix #.fa.unique
        self.v6_unique_suffix   = "MERGED_V6_PRIMERS_REMOVED." + C.unique_suffix
        self.suff_list = [self.nonchimeric_suffix, self.fa_unique_suffix, self.v6_unique_suffix]

#         self.merge_unique_suffix = "." + C.filtered_suffix + "." + C.unique_suffix #.MERGED-MAX-MISMATCH-3.unique
        self.suffix_used        = ""
        
#        self.refdb_dir = '/xraid2-2/vampsweb/blastdbs/'
   
   
    def get_fasta_file_names(self):
        files_names = self.dirs.get_all_files(self.fasta_dir)
        self.unique_fasta_files = [f for f in files_names.keys() if f.endswith(tuple(self.suff_list))]
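        # e.g. with the default suffixes this keeps files such as "lane_1_sample.nonchimeric.fa"
        # or "lane_1_sample.fa.unique" (hypothetical names); the suffixes come from self.suff_list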
# needs to return the list because of how it's called from pipelineprocessor
        return self.unique_fasta_files
        

    def get_run_info_ill_id(self, filename_base):
        
        my_sql = """SELECT run_info_ill_id FROM run_info_ill 
                    JOIN run using(run_id)
                    WHERE file_prefix = '%s'
                    and run = '%s'
        """ % (filename_base, self.rundate)
        res    = self.my_conn.execute_fetch_select(my_sql)
        if res:
            return int(res[0][0])
        
    def make_seq_upper(self, filename):
        read_fasta = fastalib.ReadFasta(filename)
        sequences  = [seq.upper() for seq in read_fasta.sequences]  # make uppercase for VAMPS compatibility
        read_fasta.close()
        return sequences 
        
    def insert_seq(self, sequences):
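        # For illustration only: with a hypothetical input of ['ACGT', 'TTGA'] the join below
        # renders a single multi-row statement of the form
        #     INSERT IGNORE INTO sequence_ill (sequence_comp) VALUES (COMPRESS('ACGT')), (COMPRESS('TTGA'))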
      query_tmpl = "INSERT IGNORE INTO %s (%s) VALUES (COMPRESS(%s))"
      val_tmpl   = "'%s'"
      my_sql     = query_tmpl % (self.sequence_table_name, self.sequence_field_name, ')), (COMPRESS('.join([val_tmpl % key for key in sequences]))
      seq_id     = self.my_conn.execute_no_fetch(my_sql)
      self.utils.print_both("sequences in file: %s\n" % (len(sequences)))
      return seq_id
    #     try:
    #         query_tmpl = "INSERT IGNORE INTO %s (%s) VALUES (COMPRESS(%s))"
    #         val_tmpl   = "'%s'"
    #         my_sql     = query_tmpl % (self.sequence_table_name, self.sequence_field_name, ')), (COMPRESS('.join([val_tmpl % key for key in sequences]))
    #         seq_id     = self.my_conn.execute_no_fetch(my_sql)
    # #         print "sequences in file: %s" % (len(sequences))
    #         self.utils.print_both("sequences in file: %s\n" % (len(sequences)))
    #         return seq_id
    #     except self.my_conn.conn.cursor._mysql_exceptions.Error as err:
    #         if err.errno == 1582:
    #             self.utils.print_both(("ERROR: _mysql_exceptions.OperationalError: (1582, \"Incorrect parameter count in the call to native function 'COMPRESS'\"), there is an empty fasta in %s") % self.fasta_dir)
    #         else:
    #             raise
    #     except:
    #         if len(sequences) == 0:
    #             self.utils.print_both(("ERROR: There are no sequences, please check if there are correct fasta files in the directory %s") % self.fasta_dir)
    #         raise
        
    def get_seq_id_dict(self, sequences):
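        # For illustration only: with a hypothetical input of ['ACGT', 'TTGA'] the join below
        # renders a query of the form
        #     SELECT sequence_ill_id, uncompress(sequence_comp) FROM sequence_ill
        #     WHERE sequence_comp in (COMPRESS('ACGT'), COMPRESS('TTGA'))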
        id_name    = self.sequence_table_name + "_id" 
        query_tmpl = """SELECT %s, uncompress(%s) FROM %s WHERE %s in (COMPRESS(%s))"""
        val_tmpl   = "'%s'"
        try:
            my_sql     = query_tmpl % (id_name, self.sequence_field_name, self.sequence_table_name, self.sequence_field_name, '), COMPRESS('.join([val_tmpl % key for key in sequences]))
            res        = self.my_conn.execute_fetch_select(my_sql)
            one_seq_id_dict = dict((y, int(x)) for x, y in res)
            self.seq_id_dict.update(one_seq_id_dict)
        except:
            if len(sequences) == 0:
                self.utils.print_both(("ERROR: There are no sequences, please check if there are correct fasta files in the directory %s") % self.fasta_dir)
            raise


    def get_id(self, table_name, value):
        id_name = table_name + '_id'
        my_sql  = """SELECT %s FROM %s WHERE %s = '%s'""" % (id_name, table_name, table_name, value)
        res     = self.my_conn.execute_fetch_select(my_sql)
        if res:
            return int(res[0][0])         
            
    def get_sequence_id(self, seq):
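        # looks the sequence up by comparing COMPRESS('<seq>') against the stored compressed column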
        my_sql = """SELECT sequence_ill_id FROM sequence_ill WHERE COMPRESS('%s') = sequence_comp""" % (seq)
        res    = self.my_conn.execute_fetch_select(my_sql)
        if res:
            return int(res[0][0])     
    
    def insert_pdr_info(self, fasta, run_info_ill_id):
        res_id = ""
        if not run_info_ill_id:
            self.utils.print_both("ERROR: There is no run info yet, please check if it's uploaded to env454")
            
        # ------- insert sequence info per run/project/dataset --------
        seq_upper = fasta.seq.upper()
        sequence_ill_id = self.seq_id_dict[seq_upper]

        seq_count       = int(fasta.id.split('|')[-1].split(':')[-1])
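        # seq_count is parsed from the defline, e.g. a hypothetical id "read_123|frequency:42" yields 42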
#        print run_info_ill_id, sequence_ill_id, seq_count
        my_sql          = """INSERT IGNORE INTO sequence_pdr_info_ill (run_info_ill_id, sequence_ill_id, seq_count) 
                             VALUES (%s, %s, %s)""" % (run_info_ill_id, sequence_ill_id, seq_count)

        try:
            res_id = self.my_conn.execute_no_fetch(my_sql)
            return res_id
        except:
            self.utils.print_both("Offensive query: %s" % my_sql)
            raise
        
    def make_gast_files_dict(self):
        return self.dirs.get_all_files(self.gast_dir, "gast")
        
        
    def gast_filename(self, filename):
#         todo: if filename in make_gast_files_dict, use it full path
        gast_file_names = self.make_gast_files_dict()
        gast_file_name_path = ""
        for gast_file_name_path, tpls in gast_file_names.iteritems():
            if any(t.endswith(filename) for t in tpls):
                return gast_file_name_path 
    
    def get_gast_result(self, filename):
        gast_file_name = self.gast_filename(filename)
        self.utils.print_both("current gast_file_name = %s." % gast_file_name)
        
        try:
            with open(gast_file_name) as fd:
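                # each gast line is tab-delimited: the first field (the read/unique id) becomes the dict key,
                # and the remaining fields (the gast result columns for that read) become the value list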
                gast_dict = dict([(l.split("\t")[0], l.split("\t")[1:]) for l in fd])    
            return gast_dict
        except IOError as e:
#            print dir(e)
#            print "errno = %s" % e.errno
            logger.debug("errno = %s" % e.errno)
            if e.errno == 2:
                # suppress "No such file or directory" error
                pass            
#         except OSError, e:
        except TypeError as e:
            self.utils.print_both("Check if there is a gast file under %s for %s." % (self.gast_dir, filename))
            pass