Пример #1
0
    sp.call(['/home/timmonen/bin/stampy-1.0.27/stampy.py', '-o', fn_out, '-g', sample.get_data_foldername() + 'NL43', '-h', sample.get_data_foldername() +  'NL43', '-M', fn_in[0],fn_in[1]])

             
    n_reads = 0       
    with pysam.Samfile(fn_out, 'r') as samfile:
        for ir, read in enumerate(samfile):
            n_reads += 1
            
        # Write summary file
    summary = {'sample name': sample.name,
               'number of read pairs': (ir + 1)/2,}    
    sample.write_json(summary, fn_outs)

                

  # Script
if __name__ == '__main__':

    # Command-line interface: each entry is (flag, keyword arguments passed
    # straight through to ArgumentParser.add_argument).
    parser = argparse.ArgumentParser(description='Pre map reads')
    for flag, options in (
            ('--sample', {'required': True,
                          'help': 'MiSeq sample to analyze'}),
            ('--verbose', {'type': int,
                           'default': 0,
                           'help': 'Verbosity level [0-3]'}),
    ):
        parser.add_argument(flag, **options)
    args = parser.parse_args()

    # Build the sample named on the command line and pre-map its reads.
    sample = Sample(args.sample)
    sample.pre_map(VERBOSE=args.verbose)

    
Пример #2
0
    


# Functions
def assembly(sample, VERBOSE=0,**kwargs):
    """Export each fragment's reads to FASTQ and run the iva executable on it.

    For every file name returned by ``sample.get_fragment_output_names()``
    the reads are opened with ``pysam.Samfile``, converted with
    ``reads_to_seqrecord`` and written to
    ``<data folder><fragment name>.fastq``.  The iva executable is then
    invoked with ``--fr <that fastq>`` followed by
    ``<data folder><fragment name>``.

    Parameters:
       - sample: object providing ``get_fragment_output_names()``,
         ``get_data_foldername()`` and ``fragment_names``; the i-th fragment
         name is paired with the i-th input file name.
       - VERBOSE: accepted for interface compatibility; not used here.
       - **kwargs: accepted and ignored.

    Returns:
       None.
    """
    input_filenames = sample.get_fragment_output_names()
    data_folder = sample.get_data_foldername()
    from Bio import SeqIO

    for i, fn_reads in enumerate(input_filenames):
        # Shared prefix: "<prefix>.fastq" is the exported reads file, and the
        # bare prefix is handed to iva as its second positional argument
        # (presumably the output location -- confirm against the iva CLI).
        fragment_prefix = data_folder + sample.fragment_names[i]
        fn_fastq = fragment_prefix + ".fastq"

        # Write while the SAM file is still open, in case the converted
        # records are produced lazily from it.
        with pysam.Samfile(fn_reads, 'r') as samfile:
            SeqIO.write(reads_to_seqrecord(samfile), fn_fastq, 'fastq')

        # NOTE(review): the exit status of iva is not checked.
        sp.call(['/home/timmonen/.local/bin/iva', '--fr', fn_fastq,
                 fragment_prefix])

  # Script
if __name__ == '__main__':

    # Parse input args.  The description used to read 'Pre map reads', a
    # leftover from the script this one was copied from; this script runs
    # the assembly step, so --help now says so.
    parser = argparse.ArgumentParser(description='Assemble reads')
    parser.add_argument('--sample', required=True,
                        help='MiSeq sample to analyze')
    parser.add_argument('--verbose', type=int, default=0,
                        help='Verbosity level [0-3]')

    args = parser.parse_args()

    # Build the sample named on the command line and run its assembly step.
    sample = Sample(args.sample)
    sample.assembly(VERBOSE=args.verbose)

    def initializeFromDictionary(self, configDict):
        """Populate run-wide settings and per-sample metadata from ``configDict``.

        ``configDict`` must contain a ``'general'`` section.  Every other key
        is treated as a lane/run-key section describing one sample; a
        ``Sample`` is built for each and stored in ``self.samples``.

        ``self.run_keys`` and ``self.samples`` are appended to / indexed here
        but never created, so they must already exist on the instance.

        Raises:
            KeyError: a required entry is missing from a section.
            Exception: the input file format is not in
                ``C.input_file_formats`` (only checked when there is at least
                one input file).
        """
        # ---- general section -------------------------------------------
        general_config = configDict['general']

        self.run = general_config['run']
        self.platform = general_config.get('platform', "unknown")
        self.input_dir = general_config.get('input_dir', None)
        self.require_distal = general_config.get('require_distal', True)
        self.minimumLength = general_config.get('minimumLength', C.minimumLength)
        self.maximumLength = general_config.get('maximumLength', C.maximumLength)
        self.minAvgQual = general_config.get('minAvgQual', C.minAvgQual)
        self.force_runkey = general_config.get('force_runkey', None)
        self.use_cluster = general_config['use_cluster']
        self.idx_keys = general_config.get('idx_keys', "")

        if self.platform == 'vamps':
            self.user = general_config['user']
            self.dna_region = general_config['dna_region']
            self.input_files = general_config['input_files']
            self.project = general_config['project']
            self.dataset = general_config['dataset']
            self.site = general_config['site']
            self.env_source_id = general_config['envsource']
            self.fasta_file = general_config.get('fasta_file', None)
        if self.platform == 'illumina':
            self.compressed = general_config['compressed']
            self.database_name = general_config['database_name']
            self.database_host = general_config['database_host']

        # added gast_input_source for vamps uploads
        # so when users want to gast at a later time they will
        # look in the database and not the files (which may be missing)
        # see /xraid2-2/vampsweb/vampsdev/vamps_trim.py
        # 'files' is the value for the regular gast pipeline.
        self.gast_input_source = general_config.get('gast_input_source', 'files')

        # ---- input files -----------------------------------------------
        if 'files_list' in general_config:
            input_file_names = general_config['files_list']
            self.input_files = ','.join(input_file_names)
        else:
            raw_input_files = general_config['input_files']
            input_file_names = [name.strip() for name in raw_input_files.split(',')]
            self.input_files = raw_input_files

        # One format and one lane value apply to every input file, so look
        # them up once instead of once per file.
        file_format = general_config.get('input_file_format', 'fasta')
        file_lane = general_config.get('input_file_lane', '')

        self.input_file_info = {}
        if input_file_names and file_format not in C.input_file_formats:
            # The message used to be built from an undefined name
            # (config_dict), which raised NameError instead of this error.
            raise Exception("Invalid sequence input file format: " + str(file_format))

        # make up a hash...they are allowed to not put in any input_file_lanes...
        # could be 3 mbl fasta files which would all have lane
        # info encoded on each id/description line of the sequence record
        for input_file in input_file_names:
            self.input_file_info[input_file] = {"name": input_file,
                                                "format": file_format,
                                                "lane": file_lane}

        # ---- per-sample sections ---------------------------------------
        # Optional comma-separated entries; default to an empty list when
        # the entry is missing or is not a string.
        optional_list_fields = ('forward_primers', 'reverse_primers',
                                'stop_sequences')
        # Optional entries copied verbatim, with their defaults.
        # use_mbl_primers: should we try to trim with mbl primers as well
        # as custom ones.
        optional_fields = (('anchor', ''),
                           ('use_mbl_primers', 1),
                           ('run_key', ''),
                           ('lane', ''),
                           ('adaptor', ''),
                           ('barcode', ''),
                           ('seq_operator', ''),
                           ('amp_operator', ''),
                           ('primer_suite', ''),
                           ('tubelabel', ''),
                           ('dna_region', ''))
        # Entries every sample section must provide.
        required_fields = ('data_owner', 'first_name', 'last_name', 'email',
                           'institution', 'project_title',
                           'project_description', 'funding',
                           'env_sample_source', 'dataset_description')
        # Entries required specifically for illumina.
        illumina_fields = ('barcode_index', 'overlap', 'read_length',
                           'file_prefix', 'insert_size')

        # now deal with each lane_runkey combo (Sample) that is misnamed though
        # populate sample information for every run_key
        for lane_run_key in [s for s in configDict.keys() if s != 'general']:
            lane_run_dict = configDict[lane_run_key]
            sample = Sample(lane_run_key)

            for attr in optional_list_fields:
                try:
                    value = lane_run_dict[attr].split(',')
                except (KeyError, AttributeError):
                    value = []
                setattr(sample, attr, value)

            for attr, default in optional_fields:
                try:
                    value = lane_run_dict[attr]
                except KeyError:
                    value = default
                setattr(sample, attr, value)

            for attr in required_fields:
                setattr(sample, attr, lane_run_dict[attr])

            if self.platform == 'illumina':
                for attr in illumina_fields:
                    setattr(sample, attr, lane_run_dict[attr])
                # concatenate: barcode_index and run_key and lane
                key = '_'.join((lane_run_dict['barcode_index'],
                                lane_run_dict['run_key'],
                                lane_run_dict['lane']))
                self.run_keys.append(key)
            elif self.platform == '454':
                # required for 454
                sample.direction = lane_run_dict['direction']
                sample.taxonomic_domain = lane_run_dict['domain']
                # convert: change ':' to '_' (replaces the second character
                # of the section name -- presumably "<lane>:<run_key>").
                key = lane_run_key[:1] + '_' + lane_run_key[2:]
                self.run_keys.append(key)
            else:
                # No key was ever assigned for other platforms, so storing
                # the sample below raised UnboundLocalError.
                # NOTE(review): falling back to the section name, without
                # registering it in run_keys -- confirm this is the key
                # wanted for e.g. the 'vamps' platform.
                key = lane_run_key

            sample.project = lane_run_dict['project']
            sample.dataset = lane_run_dict['dataset']

            # a dictionary of samples
            self.samples[key] = sample
    def initializeFromDictionary(self, configDict):
        """Populate run-wide settings and per-sample metadata from ``configDict``.

        ``configDict`` must contain a ``'general'`` section.  Every other key
        is treated as a lane/run-key section describing one sample.  Which
        entries are read depends on ``self.vamps_user_upload`` (which must
        already be set on the instance) and on the configured platform.

        ``self.run_keys`` and ``self.samples`` are appended to / indexed here
        but never created, so they must already exist on the instance.
        Samples are only stored for vamps user uploads, platforms listed in
        ``C.illumina_list`` and the '454' platform.

        The general section is printed to stdout three times (debug output
        kept from the original implementation).

        Raises:
            KeyError: a required entry is missing from a section.
            Exception: the input file format is not in
                ``C.input_file_formats`` (only checked when there is at least
                one input file).
        """
        # ---- general section -------------------------------------------
        general_config = configDict['general']
        print('General Config0:', general_config)

        self.run = general_config['run']
        self.platform = general_config.get('platform', "unknown")
        self.input_dir = general_config.get('input_dir', None)
        self.require_distal = general_config.get('require_distal', True)
        self.minimumLength = general_config.get('minimumLength', C.minimumLength)
        self.maximumLength = general_config.get('maximumLength', C.maximumLength)
        self.minAvgQual = general_config.get('minAvgQual', C.minAvgQual)
        self.force_runkey = general_config.get('force_runkey', None)
        self.idx_keys = general_config.get('idx_keys', "")

        if self.vamps_user_upload:
            self.site = general_config['site']
            if self.site == 'new_vamps':
                for attr in ('project_dir', 'node_db', 'process_dir',
                             'hostname', 'ref_db_dir', 'config_file',
                             'project', 'env_source_id'):
                    setattr(self, attr, general_config[attr])

            self.user = general_config['user']
            self.input_files = general_config['input_files']
            self.dna_region = general_config['dna_region']
            self.domain = general_config['domain']
            self.load_vamps_database = general_config['load_vamps_database']

            # Optional settings.  Note that the length limits are read from
            # the snake_case keys here, overriding the camelCase lookups
            # done above.
            self.require_distal = general_config.get('require_distal', True)
            self.minimumLength = general_config.get('minimum_length', C.minimumLength)
            self.maximumLength = general_config.get('maximum_length', C.maximumLength)
            self.use_cluster = general_config.get('use_cluster', False)
            self.use64bit = general_config.get('use64bit', False)
            self.fasta_file = general_config.get('fasta_file', None)
            self.mobedac = general_config.get('mobedac', False)
            self.use_full_length = general_config.get('use_full_length', False)
            self.classifier = general_config.get('classifier', 'unknown')
        else:
            is_illumina = self.platform in C.illumina_list
            if is_illumina or self.platform == '454':
                # Database settings shared by the illumina and 454 platforms.
                self.compressed = general_config['compressed']
                self.database_name = general_config['database_name']
                self.database_host = general_config['database_host']
                self.site = general_config['site']
                self.load_vamps_database = general_config['load_vamps_database']
            if is_illumina:
                if "archaea" in general_config:
                    self.archaea = general_config['archaea']
                # The per-platform defaults are only looked up when needed:
                # the platform may have no entry in C.pipeline_run_items.
                if "do_perfect" in general_config:
                    self.do_perfect = general_config['do_perfect']
                else:
                    self.do_perfect = C.pipeline_run_items[self.platform]['do_perfect']
                if "lane_name" in general_config:
                    self.lane_name = general_config['lane_name']
                else:
                    self.lane_name = C.pipeline_run_items[self.platform]['lane_name']

        # added gast_input_source for vamps uploads
        # so when users want to gast at a later time they will
        # look in the database and not the files (which may be missing)
        # see /xraid2-2/vampsweb/vampsdev/vamps_trim.py
        if 'gast_input_source' in general_config:
            self.gast_input_source = general_config['gast_input_source']

        # ---- input files -----------------------------------------------
        print('General Config:', general_config)
        if 'files_list' in general_config:
            input_file_names = general_config['files_list']
            self.input_files = ','.join(input_file_names)
            self.files_list = input_file_names
        else:
            raw_input_files = general_config['input_files']
            input_file_names = [name.strip() for name in raw_input_files.split(',')]
            # raw_input_files is already a comma-separated string; joining
            # it again with ',' put a comma between every character.
            self.input_files = raw_input_files
            # NOTE(review): files_list is a string in this branch but the
            # configured 'files_list' value in the other -- left unchanged;
            # confirm what the consumers expect.
            self.files_list = raw_input_files

        # One format and one lane value apply to every input file, so look
        # them up once instead of once per file.
        file_format = general_config.get('input_file_format', 'fasta')
        file_lane = general_config.get('input_file_lane', '')

        self.input_file_info = {}
        print(general_config)
        if input_file_names and file_format not in C.input_file_formats:
            # Report the value that was actually checked: the config entry
            # may be absent (default format), which used to raise KeyError
            # while building this message.
            raise Exception("Invalid sequence input file format: " + str(file_format))

        # make up a hash...they are allowed to not put in any input_file_lanes...
        # could be 3 mbl fasta files which would all have lane
        # info encoded on each id/description line of the sequence record
        for input_file in input_file_names:
            self.input_file_info[input_file] = {"name": input_file,
                                                "format": file_format,
                                                "lane": file_lane}

        # ---- per-sample sections ---------------------------------------
        # Optional comma-separated entries; default to an empty list when
        # the entry is missing or is not a string.
        optional_list_fields = ('forward_primers', 'reverse_primers',
                                'stop_sequences')
        # Optional entries copied verbatim, with their defaults.
        # use_mbl_primers: should we try to trim with mbl primers as well
        # as custom ones.
        optional_fields = (('anchor', ''),
                           ('use_mbl_primers', 1),
                           ('run_key', ''),
                           ('lane', ''),
                           ('adaptor', ''),
                           ('barcode', ''),
                           ('seq_operator', ''),
                           ('amp_operator', ''),
                           ('primer_suite', ''),
                           ('tubelabel', ''),
                           ('dna_region', ''))
        # Entries every sample section must provide.
        required_fields = ('project_title', 'project_description',
                           'env_sample_source_id', 'dataset_description',
                           'project', 'dataset')
        # Contact/ownership entries required for illumina and 454 samples.
        owner_fields = ('data_owner', 'first_name', 'last_name', 'email',
                        'institution', 'funding')
        # Entries required specifically for illumina.
        illumina_fields = ('barcode_index', 'overlap', 'read_length',
                           'insert_size')

        # now deal with each lane_runkey combo (Sample) that is misnamed though
        # populate sample information for every run_key
        for lane_run_key in [s for s in configDict.keys() if s != 'general']:
            lane_run_dict = configDict[lane_run_key]
            sample = Sample(lane_run_key)

            for attr in optional_list_fields:
                try:
                    value = lane_run_dict[attr].split(',')
                except (KeyError, AttributeError):
                    value = []
                setattr(sample, attr, value)

            for attr, default in optional_fields:
                try:
                    value = lane_run_dict[attr]
                except KeyError:
                    value = default
                setattr(sample, attr, value)

            # The taxonomic domain is the first whitespace-separated word
            # of the primer suite, when one is given.
            if sample.primer_suite:
                sample.taxonomic_domain = sample.primer_suite.split()[0]
            else:
                sample.taxonomic_domain = 'unknown'

            for attr in required_fields:
                setattr(sample, attr, lane_run_dict[attr])

            if self.vamps_user_upload:
                sample.direction = lane_run_dict['direction']
                # convert: change ':' to '_' (replaces the second character
                # of the section name -- presumably "<lane>:<run_key>").
                key = lane_run_key[:1] + '_' + lane_run_key[2:]
                self.run_keys.append(key)
                # a dictionary of samples
                self.samples[key] = sample
            elif self.platform in C.illumina_list:
                for attr in owner_fields + illumina_fields:
                    setattr(sample, attr, lane_run_dict[attr])
                # concatenate: barcode_index and run_key and lane
                key = '_'.join((lane_run_dict['barcode_index'],
                                lane_run_dict['run_key'],
                                lane_run_dict['lane']))
                self.run_keys.append(key)
                # The sample is stored under the section name, not under
                # the key registered in run_keys (kept as in the original).
                self.samples[lane_run_key] = sample
            elif self.platform == '454':
                # required for 454
                sample.direction = lane_run_dict['direction']
                for attr in owner_fields:
                    setattr(sample, attr, lane_run_dict[attr])
                # convert: change ':' to '_'
                key = lane_run_key[:1] + '_' + lane_run_key[2:]
                self.run_keys.append(key)
                # The sample is stored under the section name, not under
                # the key registered in run_keys (kept as in the original).
                self.samples[lane_run_key] = sample
Пример #5
0
        print 'Discarded:', n_discarded
        
    # Write summary file
    summary = {'sample name': sample.name,
               'sequencing run': sample.run,
               'adapter': sample.adapter,
               'number of read pairs': irp + 1,
               'number of discarded': n_discarded,
               'number of good': n_good,
               'histotgram of number of trimmed bases': np.zeros(301).tolist(),
              }    
    sample.write_json(summary, fn_outs)
    return n_good
    

  # Script
if __name__ == '__main__':

    # Command-line interface: the sample to process and a verbosity level.
    parser = argparse.ArgumentParser(
        description='Trim low quality end of reads')
    parser.add_argument(
        '--sample',
        help='MiSeq sample to analyze',
        required=True)
    parser.add_argument(
        '--verbose',
        help='Verbosity level [0-3]',
        default=0,
        type=int)
    args = parser.parse_args()

    # Trim the reads of the requested sample, keeping the value returned
    # by trim_reads.
    sample = Sample(args.sample)
    n_runs = sample.trim_reads(VERBOSE=args.verbose)
    
    def initializeFromDictionary(self, configDict):
        # get the general stuff
        general_config = configDict['general']
        #if general_config['gast_data_source'] != 'database':
        self.run       = general_config['run']
        self.platform       = general_config.get('platform', "unknown")
        self.input_dir      = general_config.get('input_dir', None)
        self.require_distal = general_config.get('require_distal', True)
        self.minimumLength  = general_config.get('minimumLength', C.minimumLength)
        self.maximumLength  = general_config.get('maximumLength', C.maximumLength)
        self.minAvgQual     = general_config.get('minAvgQual',    C.minAvgQual)
        self.force_runkey   = general_config.get('force_runkey', None)
        self.use_cluster    = general_config['use_cluster']
        try:
            self.idx_keys           = general_config['idx_keys']
        except:
            self.idx_keys = ""
 
 
        if self.vamps_user_upload:
            self.user           = general_config['user']           
            
            self.input_files    = general_config['input_files'] 
            #self.project        = general_config['project'] 
            #self.dataset        = general_config['dataset']
            self.dna_region     = general_config['dna_region']
            self.domain         = general_config['domain']
            
            
            self.site               = general_config['site']
            
            self.load_vamps_database = general_config['load_vamps_database']
            try:
                self.fasta_file     = general_config['fasta_file'] 
            except:
                self.fasta_file     = None
            try:
                self.mobedac        = general_config['mobedac'] 
            except:
                self.mobedac        = False
            try:
                self.use_full_length= general_config['use_full_length']
            except:
                self.use_full_length= False
            try:
                self.classifier     = general_config['classifier']
            except:
                self.classifier= 'unknown'
        else:
            if self.platform == 'illumina':
                self.compressed     = general_config['compressed']                 
                self.database_name  = general_config['database_name'] 
                self.database_host  = general_config['database_host'] 
                self.site           = general_config['site']
                self.load_vamps_database = general_config['load_vamps_database']
                if general_config.has_key("archaea"):
                    self.archaea    = general_config['archaea'] 
                if general_config.has_key("do_perfect"):
                    self.do_perfect = general_config['do_perfect']
                else:
                    self.do_perfect = C.pipeline_run_items['illumina']['do_perfect']        
                if general_config.has_key("lane_name"):
                    self.lane_name = general_config['lane_name']
                else:
                    self.lane_name = C.pipeline_run_items['illumina']['lane_name']                                    
                    
            elif self.platform == '454':
                self.compressed     = general_config['compressed'] 
                self.database_name  = general_config['database_name'] 
                self.database_host  = general_config['database_host'] 
                self.site           = general_config['site']
                self.load_vamps_database = general_config['load_vamps_database']
            else:
                pass
        # added gast_input_source for vamps uploads
        # so when users want to gast at a later time they will
        # look in the database and not the files (which may be missing)
        # see /xraid2-2/vampsweb/vampsdev/vamps_trim.py
        
        if 'gast_input_source' in general_config: 
            self.gast_input_source = general_config['gast_input_source']
        
        print    'General Config:',general_config
        if 'files_list' in general_config:
            input_file_names = general_config['files_list']
            self.input_files = ','.join(general_config['files_list'])
            self.files_list = general_config['files_list']
        else:
            input_file_names  = [input_str.strip() for input_str in general_config['input_files'].split(',')]
            self.input_files = ','.join(general_config['input_files'])
            self.files_list = general_config['input_files']
        

 
        
        
        self.input_file_info = {}
        print general_config
        for idx,input_file in enumerate(input_file_names):
            
            if "input_file_format" in general_config:
                file_format = general_config['input_file_format']
            else:
                # default
                file_format = 'fasta'
            
            
            if file_format not in C.input_file_formats:
                raise Exception("Invalid sequence input file format: " + general_config['input_file_format'])
                
            if "input_file_lane" in general_config:
                file_lane = general_config['input_file_lane']
            else:
                # default
                file_lane = ''    
                
            # make up a hash...they are allowed to not put in any input_file_lanes...could be 3 mbl fasta files which would all have lane
            # info encoded on each id/description line of the sequence record
            
            self.input_file_info[input_file] =  {  "name" : input_file, 
                                                   "format" : file_format, 
                                                   "lane" : file_lane
                                                }
        
        
        # now deal with each lane_runkey combo (Sample) that is misnamed though
        # populate sample information for every run_key
        
        for lane_run_key in [s for s in configDict.keys() if s != 'general']:
        	# change ':' to '_'
        	# key = lane_run_key[:1]+'_'+lane_run_key[2:]
            
            lane_run_dict = configDict[lane_run_key]
            #print 'CD ',configDict
            
            sample = Sample(lane_run_key)
            #print 'sample',sample
            
            # has defaults -not required
            try:
                sample.forward_primers = lane_run_dict['forward_primers'].split(',')
            except:
                sample.forward_primers = []
            try:
                sample.reverse_primers = lane_run_dict['reverse_primers'].split(',')
            except:
                sample.reverse_primers = []
            try:
                sample.stop_sequences = lane_run_dict['stop_sequences'].split(',')
            except:
                sample.stop_sequences = []
            try:
                sample.anchor = lane_run_dict['anchor']
            except:
                sample.anchor = ''
            # should we try to trim with mbl primers as well as custom ones
            try:
                sample.use_mbl_primers = lane_run_dict['use_mbl_primers']
            except:
                sample.use_mbl_primers = 1
#################################
            try:
                sample.run_key = lane_run_dict['run_key']
            except:
                sample.run_key = ''
            try:
                sample.lane = lane_run_dict['lane']
            except:
                sample.lane = ''
            try:
                sample.adaptor = lane_run_dict['adaptor']
            except:
                sample.adaptor = ''
            try:
                sample.barcode = lane_run_dict['barcode']
            except:
                sample.barcode = ''
            try:
                sample.seq_operator = lane_run_dict['seq_operator']
            except:
                sample.seq_operator = ''
            try:
                sample.amp_operator = lane_run_dict['amp_operator']
            except:
                sample.amp_operator = ''
            try:
                sample.primer_suite = lane_run_dict['primer_suite']
            except:
                sample.primer_suite = ''
            try:
                sample.tubelabel = lane_run_dict['tubelabel']
            except:
                sample.tubelabel = ''
            try:    
                sample.dna_region = lane_run_dict['dna_region'] 
            except:
                sample.dna_region = ''
            
            if sample.primer_suite:
                sample.taxonomic_domain = sample.primer_suite.split()[0]
            else:
                sample.taxonomic_domain = 'unknown'
                
            
            sample.project_title        = lane_run_dict['project_title']
            sample.project_description  = lane_run_dict['project_description']
            
            sample.env_sample_source_id = lane_run_dict['env_sample_source_id']
            sample.dataset_description  = lane_run_dict['dataset_description']
            sample.project              = lane_run_dict['project']
            sample.dataset              = lane_run_dict['dataset']
            print 'lane_run_key '+lane_run_key
            if self.vamps_user_upload:
                # required for 454
                sample.direction = lane_run_dict['direction'] 
                #sample.taxonomic_domain = lane_run_dict['taxonomic_domain']
                # a list of run_keys
                # convert: change ':' to '_'
                #lane_run_key = '_'.join(lane_run_key.split(':'))
                key = lane_run_key[:1]+'_'+lane_run_key[2:]
                #sample.key = key
                self.run_keys.append(key)
                # a dictionary of samples
            	self.samples[key] = sample
            else:
                if self.platform == 'illumina':
                    # req specifically for illumina
                    sample.data_owner           = lane_run_dict['data_owner']
                    sample.first_name           = lane_run_dict['first_name']
                    sample.last_name            = lane_run_dict['last_name']
                    sample.email                = lane_run_dict['email']
                    sample.institution          = lane_run_dict['institution']
                    sample.funding              = lane_run_dict['funding']
                    sample.barcode_index = lane_run_dict['barcode_index'] 
                    sample.overlap = lane_run_dict['overlap'] 
                    sample.read_length = lane_run_dict['read_length'] 
#                    sample.file_prefix = lane_run_dict['file_prefix'] 
                    sample.insert_size = lane_run_dict['insert_size']
                    #sample.taxonomic_domain = lane_run_dict['domain']
                    # concatenate: barcode_index and run_key and lane
                    key = lane_run_dict['barcode_index'] +'_'+ lane_run_dict['run_key'] +'_'+ lane_run_dict['lane'] 
                    #sample.key = key
                    self.run_keys.append(key)  
                    # a dictionary of samples
                    self.samples[lane_run_key] = sample
                    
                elif self.platform == '454':
                    # required for 454
                    sample.direction            = lane_run_dict['direction'] 
                    sample.data_owner           = lane_run_dict['data_owner']
                    sample.first_name           = lane_run_dict['first_name']
                    sample.last_name            = lane_run_dict['last_name']
                    sample.email                = lane_run_dict['email']
                    sample.institution          = lane_run_dict['institution']
                    sample.funding              = lane_run_dict['funding']
                    #sample.taxonomic_domain = lane_run_dict['domain']
                    # a list of run_keys
                    # convert: change ':' to '_'
                    key = lane_run_key[:1]+'_'+lane_run_key[2:]
                    #sample.key = key
                    self.run_keys.append(key)
                    # a dictionary of samples
                    self.samples[lane_run_key] = sample
Пример #7
0
    def configFromFile(self, config_file_path):
        """Load run settings and per-run-key samples from an INI-style file.

        The ``general`` section supplies ``run_date``, ``platform``,
        ``input_dir``, ``output_dir``, ``input_files`` (comma separated) and
        ``input_file_type``; each is stored on ``self`` under the same name.
        Every other section is treated as one run key (e.g. ``1:ACACT``) and
        is turned into a ``Sample`` that is stored in ``self.samples`` and
        whose converted key is appended to ``self.run_keys``.

        Per-sample options ``forward_primers``, ``reverse_primers``,
        ``stop_sequences`` and ``anchor`` are optional; ``direction``,
        ``project_name``, ``dataset_name``, ``dna_region`` and
        ``taxonomic_domain`` are required.

        Args:
            config_file_path: path of the configuration file to parse.

        Raises:
            ConfigParser.NoSectionError: if there is no ``general`` section
                (``ConfigParser.read`` skips files it cannot open, so an
                unreadable path surfaces here).
            ConfigParser.NoOptionError: if a required option is missing.
        """
        import ConfigParser

        user_config = ConfigParser.ConfigParser()
        user_config.read(config_file_path)

        def optional_list(section, option):
            # Comma separated option with surrounding quotes removed; any
            # config-level failure (missing option, bad interpolation) yields
            # a fresh empty list. Only ConfigParser errors are swallowed so
            # that unrelated failures are not hidden.
            try:
                raw_value = user_config.get(section, option)
            except ConfigParser.Error:
                return []
            return raw_value.strip("'").strip('"').split(',')

        # take care of the general section
        self.run_date = user_config.get('general', 'run_date')
        self.platform = user_config.get('general', 'platform')
        self.input_dir = user_config.get('general', 'input_dir')
        self.output_dir = user_config.get('general', 'output_dir')

        raw_input_files = user_config.get('general', 'input_files')
        self.input_files = [name.strip() for name in raw_input_files.split(',')]
        self.input_file_type = user_config.get('general', 'input_file_type')

        # populate sample information for every run_key
        for run_key in user_config.sections():
            if run_key == 'general':
                continue

            # run_key looks like:  1:ACACT
            sample = Sample(run_key)

            # has defaults - not required
            sample.proximal_primers = optional_list(run_key, 'forward_primers')
            sample.distal_primers = optional_list(run_key, 'reverse_primers')
            sample.stop_sequences = optional_list(run_key, 'stop_sequences')
            try:
                sample.anchor = user_config.get(run_key, 'anchor')
            except ConfigParser.Error:
                sample.anchor = ''

            # required
            sample.direction = user_config.get(run_key, 'direction')
            sample.project = user_config.get(run_key, 'project_name')
            sample.dataset = user_config.get(run_key, 'dataset_name')
            sample.dna_region = user_config.get(run_key, 'dna_region')
            sample.taxonomic_domain = user_config.get(run_key, 'taxonomic_domain')

            # Build the stored key: first character, '_', then everything from
            # the third character on. This replaces the second character
            # whatever it is, so it assumes a one-character lane followed by
            # ':' (e.g. "1:ACACT" -> "1_ACACT").
            key = run_key[:1] + '_' + run_key[2:]
            # a list of run_keys
            self.run_keys.append(key)
            # a dictionary of samples
            self.samples[key] = sample