def add_data_source(self, data_type, data_file, mapping_f=None):
    # For now, only fasta files are handled as data_path, no dirs or zips
    # with data and corresponding mapping files.
    # For now, it is also required that there is a data item for each
    # object for which we have an id.

    # read sequences from fasta file
    try:
        seqs = [s for s in file_io.read_fasta(data_file)]
        ids = [s[0] for s in seqs]
    except Exception as e:
        # TODO use proper logging instead of printing to stdout
        print('\n%s\n%s\n%s\n' % (e, type(e), e.args))
        return 'Error in fasta file.'

    # close the temporary file, not sure if this is necessary
    data_file.close()

    if len(set(ids)) != len(ids):
        return 'Fasta file contains duplicate ids.'

    fe = self.get_feature_extraction()

    if set(fe.fm_protein.object_ids) != set(ids):
        return 'Ids in provided file do not correspond to ids in project.'

    # reorder the sequences to match the project object ids
    seq_dict = dict(seqs)
    seqs = [(sid, seq_dict[sid]) for sid in fe.fm_protein.object_ids]

    # TODO create a uni_orf_mapping?
    try:
        fe.protein_data_set.set_data_source(data_type, seqs)
    except ValueError as e:
        print(traceback.format_exc())
        return str(e)

    # save feature extraction, if it all went well
    fe.save()

    return ''
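
# A minimal usage sketch for add_data_source (hypothetical names: the
# ProjectManager class and its set_user setup are assumptions, not shown in
# this module). The method returns '' on success and a user-facing error
# message otherwise, so callers can display the result directly:
#
#   pm = ProjectManager()                 # hypothetical owner class
#   pm.set_user('user0')                  # hypothetical; a user and an
#   pm.set_project('project0')            # existing project must be set
#   with open('orf_seqs.fsa') as fin:
#       msg = pm.add_data_source('orf_seq', fin)
#   if msg:
#       print('adding data source failed: %s' % msg)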
def start_example_project(self, project_id, fasta_f, seq_type, labeling_f):
    ''' Start new project without checking input data
    '''
    self.set_project(project_id)

    # check if user already has a dir and create one if needed
    if not os.path.exists(self.user_dir):
        os.mkdir(self.user_dir)

    # check if a project with the same name exists, otherwise add number
    if os.path.exists(self.project_dir):
        index = 0
        while os.path.exists(self.project_dir):
            project_id = '%s_%i' % (self.project_id.split('_')[0], index)
            self.set_project(project_id)
            index += 1

    # read data from file
    try:
        seqs = [s for s in file_io.read_fasta(fasta_f)]
        ids = [s[0] for s in seqs]
    except Exception as e:
        print(e)
        return 'Error in fasta file'

    # create sequence feature extraction object
    fe = featext.FeatureExtraction()

    # set protein data
    try:
        fe.set_protein_ids(ids)
        fe.protein_data_set.set_data_source(seq_type, seqs)

        # translate to prot seq if orf provided
        if seq_type == 'orf_seq':
            ids = [s[0] for s in seqs]
            prot_seqs = [sequtil.translate(s[1]) for s in seqs]
            # chop off translated stop codons at terminus
            prot_seqs = [s[:-1] if s[-1] == '*' else s for s in prot_seqs]
            fe.protein_data_set.set_data_source('prot_seq',
                                                list(zip(ids, prot_seqs)))
    except ValueError as e:
        print(traceback.format_exc())
        return str(e)
    except Exception:
        print(traceback.format_exc())
        return 'Error during initiation of new project'

    # add labeling to feature matrix
    try:
        labeling_name = os.path.splitext(os.path.basename(labeling_f))[0]
        fe.fm_protein.add_labeling_from_file(labeling_name, labeling_f)
    except ValueError as e:
        return str(e)

    # create data directory for this project (just to be sure, check again)
    if not os.path.exists(self.project_dir):
        os.mkdir(self.project_dir)
        # and create directories to store job status
        os.mkdir(self.job_dir)
        os.mkdir(self.job_waiting_dir)
        os.mkdir(self.job_running_dir)
        os.mkdir(self.job_done_dir)
        os.mkdir(self.job_error_dir)
        # create classification dir
        os.mkdir(self.cl_dir)
    else:
        return 'A project with the same project id already exists'

    # create project details file
    with open(self.project_details_f, 'w') as fout:
        fout.write('project_id\t%s\n' % (self.project_id))
        fout.write('project_init\t%s\n' % (self.timestamp_str()))

    # store feature extraction data
    fe.set_root_dir(self.fe_dir)
    fe.save()

    return ''
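
# The stop-codon handling above is a plain string operation on the translated
# sequences; a small self-contained illustration (no project dependencies,
# assuming sequtil.translate marks a terminal stop codon with '*'):
#
#   prot_seqs = ['MKT*', 'MSSA']
#   trimmed = [s[:-1] if s[-1] == '*' else s for s in prot_seqs]
#   assert trimmed == ['MKT', 'MSSA']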
def start_new_project(self, project_id, fasta_file, sequence_type,
                      reference_taxon=None):
    '''
    TOCHECK: what is the type of fasta_file?
    pre: sequence_type is orf_seq or prot_seq
    pre: user_id is set
    '''
    self.set_project(project_id)

    # check if user already has a dir and create one if needed
    if not os.path.exists(self.user_dir):
        os.mkdir(self.user_dir)

    # check if a project with the same name exists
    if os.path.exists(self.project_dir):
        return 'A project with the same project id already exists'

    # obtain reference fasta file, if a reference taxon is given
    ref_seqs = []
    if reference_taxon is not None:
        # TODO support ORF sequences as well?
        if sequence_type == 'orf_seq':
            return 'Reference set can only be compared to protein ' +\
                   'amino acid sequences, not to ORF sequences.'

        # paths to (reduced) reference proteome fasta files
        ref_f = os.path.join(self.ref_data_dir,
                             '%i.fsa' % (reference_taxon))
        ref_red_f = os.path.join(self.ref_data_dir,
                                 '%i_reduced.fsa' % (reference_taxon))

        # first check local dir
        if os.path.exists(ref_red_f):
            ref_seqs = [s for s in file_io.read_fasta(ref_red_f)]
        elif os.path.exists(ref_f):
            ref_seqs = [s for s in file_io.read_fasta(ref_f)]
        # otherwise fetch reference data set (currently disabled)
        else:
            pass
            '''
            url = 'http://www.uniprot.org/uniref/' +\
                  '?query=uniprot:(organism:%i+' % (reference_taxon) +\
                  'keyword:181)+identity:0.5&format=fasta'
            response = urllib2.urlopen(url)
            try:
                ref_seqs = [s for s in file_io.read_fasta(response)]
            except Exception:
                return 'There appears to be an error in the reference ' +\
                       'data fasta file'

            # check if reference data set is not too large
            max_num_seqs = 15000
            if(len(ref_seqs) > max_num_seqs):
                # randomly select max_num_seqs sequences
                indices = random.sample(range(len(ref_seqs)), max_num_seqs)
                ref_seqs = [ref_seqs[i] for i in indices]
            '''

    # estimate reference data set size
    size = len(ref_seqs) * 285  # estimate 285 bytes per seq

    # add the uploaded file size and check against the maximum
    max_size = 5243000  # bytes (5MB)
    while True:
        if size > max_size:
            return 'Sequence data exceeds the maximum allowed size (5MB)'
        data = fasta_file.file.read(8192)
        if not data:
            break
        size += len(data)

    # reset to beginning of fasta file
    fasta_file.file.seek(0)

    # read sequences from fasta file (to obtain object ids...)
    try:
        seqs = [s for s in file_io.read_fasta(fasta_file.file)]
        seqs.extend(ref_seqs)
        ids = [s[0] for s in seqs]
    except Exception as e:
        return str(e) +\
            ' Please consult the documentation (<i>file formats</i> ' +\
            'section) to learn more about the FASTA file format.'

    # reset pointer to begin of file
    # fasta_file.file.seek(0)

    # close the temporary file, not sure if this is necessary
    fasta_file.file.close()

    # create sequence feature extraction object to check input
    fe = featext.FeatureExtraction()
    try:
        fe.set_protein_ids(ids)
        fe.protein_data_set.set_data_source(sequence_type, seqs)

        # translate to prot seq if orf provided
        if sequence_type == 'orf_seq':
            ids = [s[0] for s in seqs]
            prot_seqs = [sequtil.translate(s[1]) for s in seqs]
            # chop off translated stop codons at terminus
            prot_seqs = [s[:-1] if s[-1] == '*' else s for s in prot_seqs]
            fe.protein_data_set.set_data_source('prot_seq',
                                                list(zip(ids, prot_seqs)))
    except ValueError as e:
        print(traceback.format_exc())
        return str(e)
    except Exception:
        print(traceback.format_exc())
        return 'Error during initiation of new project'

    # add labeling in case of added reference set; ref ids occur twice in
    # the label list, the later (s[0], 1) entries win when building the dict
    if len(ref_seqs) > 0:
        labels = [(s[0], 0) for s in seqs]
        labels.extend([(s[0], 1) for s in ref_seqs])
        class_names = ['dataset', 'taxon%i' % (reference_taxon)]
        fe.fm_protein.add_labeling('reference', dict(labels), class_names)

    # create data directory for this project (just to be sure, check again)
    if not os.path.exists(self.project_dir):
        os.mkdir(self.project_dir)
        # and create directories to store job status
        os.mkdir(self.job_dir)
        os.mkdir(self.job_waiting_dir)
        os.mkdir(self.job_running_dir)
        os.mkdir(self.job_done_dir)
        os.mkdir(self.job_error_dir)
        # create classification dir
        os.mkdir(self.cl_dir)
    else:
        return 'A project with the same project id already exists'

    # create project details file
    with open(self.project_details_f, 'w') as fout:
        fout.write('project_id\t%s\n' % (self.project_id))
        fout.write('project_init\t%s\n' % (self.timestamp_str()))

    # store feature extraction data
    fe.set_root_dir(self.fe_dir)
    fe.save()

    return ''
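
# A hedged usage sketch for start_new_project. The method only relies on
# fasta_file having a .file attribute that supports read/seek/close (as used
# above), so a minimal stand-in is shown here; the Upload and ProjectManager
# names are assumptions, and 9606 is just an example NCBI taxon id:
#
#   class Upload(object):
#       def __init__(self, path):
#           self.file = open(path)
#
#   pm = ProjectManager()                 # hypothetical owner class
#   msg = pm.start_new_project('project1', Upload('prot_seqs.fsa'),
#                              'prot_seq', reference_taxon=9606)
#   if msg:
#       print('project creation failed: %s' % msg)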