def read(self, files):
    """Read fasta files and store the names and seqs in self.names, self.seqs.

    Args:
        files: A single file path (str) or an iterable of file paths.

    Side effects:
        ``self.names`` and ``self.seqs`` are reset to empty lists and then
        extended, file by file, with the names and the upper-cased
        sequences returned by ``fastaread``.
    """
    # A lone path has to be wrapped; otherwise the loop below would iterate
    # over the characters of the string. isinstance also accepts str
    # subclasses, which the previous exact type comparison rejected.
    if isinstance(files, str):
        files = [files]
    self.seqs, self.names = [], []
    for fl in files:
        nms, sqs = fastaread(fl)
        self.names.extend(nms)
        self.seqs.extend([sq.upper() for sq in sqs])
def files_to_OTmap(self, map_key, curr_dic):
    """Transform a file or list of files into OT map(s) stored on ``self``.

    Args:
        map_key: Name of the attribute that receives the list of maps.
        curr_dic: Dictionary describing the map. Keys read here:
            'file' (path or list of paths), 'force_list' (default False),
            'save_file' (default None) and 'use_kmer' (default False).

    Behaviour, as decided by ``self.check_extension``:
        * fa/fasta files: sequences are read with ``fastaread`` and one
          ``OTmap`` is built per file when 'force_list' is true, otherwise
          a single ``OTmap`` is built from all sequences pooled together.
        * exactly one npz file: an ``OTmap`` is created and ``load`` is
          called on it with that file.
        * exactly one pkl file: the object is unpickled and used as is.
        * anything else: a message is printed and the attribute is NOT set.
        * no 'file' entry (or None): the attribute is set to
          ``[constant_zero_dict()]``.
    """
    print("Setting attribute: " + map_key)
    files = curr_dic.get('file', None)
    force_list = curr_dic.get('force_list', False)
    save_file = curr_dic.get('save_file', None)
    use_kmer = curr_dic.get('use_kmer', False)

    if files is None:
        print("No files")
        setattr(self, map_key, [constant_zero_dict()])
        return

    files_ = self.str_to_list(files)
    fasta_ext = ['fa', 'fasta']
    if self.check_extension(files, fasta_ext):
        seqs = []
        for fl in files_:
            _, seqs_ = fastaread(fl, force_upper=True)
            if force_list:
                # Keep one group of sequences per file -> one map per file.
                seqs.append(seqs_)
            else:
                seqs.extend(seqs_)
        if not force_list:
            # Everything pooled into a single group -> a single map.
            seqs = [seqs]
        OTmaps = [
            OTmap(seq_,
                  word_size=17,
                  use_kmer=use_kmer,
                  progress_report=False,
                  save_file=save_file) for seq_ in seqs
        ]
        setattr(self, map_key, OTmaps)
    # NOTE(review): this branch tests '.npz' (with a dot) while the others
    # test 'fa'/'fasta'/'pkl' (without); confirm which spelling
    # check_extension expects.
    elif len(files_) == 1 and self.check_extension(files, '.npz'):
        # The previous code passed an undefined name (`seq_`) here and so
        # always raised before loading. NOTE(review): an empty sequence list
        # is used on the assumption that load() fully populates the map --
        # confirm against the OTmap constructor.
        OTMap_ = OTmap([], word_size=17)
        OTMap_.load(files_[0])
        setattr(self, map_key, [OTMap_])
    elif len(files_) == 1 and self.check_extension(files, 'pkl'):
        # SECURITY: unpickling executes arbitrary code; only load pickle
        # files from trusted sources.
        with open(files_[0], 'rb') as fid:
            OTmaps = [pickle.load(fid)]
        setattr(self, map_key, OTmaps)
    else:
        print("Extension error or more than 1 npz/pkl file provided.")
def load_sequence_file_and_paramaters(self):
    """Load the input fasta file(s) and expose ``params_dic`` on ``self``.

    The path (or list of paths) is taken from ``self.sequence_dic['file']``.
    Each file is read with ``fastaread(..., force_upper=True)``; names and
    sequences are accumulated in ``self.input_names`` / ``self.input_seqs``
    and the number of records per file in ``self.input_noSeqsperFile``.

    Afterwards ``self.params_dic`` gains the entries 'sequence_lens',
    'input_noSeqsperFile' and 'input_names', and every entry of
    ``self.params_dic`` is copied onto ``self`` as an attribute of the same
    name.
    """
    sequence_file = self.sequence_dic.get('file', None)
    self.input_names, self.input_seqs = [], []
    self.input_noSeqsperFile = []

    if sequence_file is not None:
        for fasta_path in self.str_to_list(sequence_file):
            file_names, file_seqs = fastaread(fasta_path, force_upper=True)
            self.input_noSeqsperFile.append(len(file_names))
            self.input_names.extend(file_names)
            self.input_seqs.extend(file_seqs)

    # Derived parameters that describe the loaded input.
    self.params_dic['sequence_lens'] = [len(sq) for sq in self.input_seqs]
    self.params_dic['input_noSeqsperFile'] = self.input_noSeqsperFile
    self.params_dic['input_names'] = self.input_names

    # Mirror every parameter as an attribute of the instance.
    for param_name, param_value in self.params_dic.items():
        setattr(self, param_name, param_value)
def coords_to_seq(
        self,
        coord,
        genome_folder=r'/n/dulacfs2/Users/bbintu/Genomes/mouse/mm10',
        save_file=None):
    """Return the sequence for coordinates of the form ``chr*:*-*``.

    Args:
        coord: Coordinate string such as ``'chr1:100-200'``.
        genome_folder: Folder expected to hold one ``<chr>.fa`` file per
            chromosome.
        save_file: If not None, the extracted sequence is also written to
            this path with ``fastawrite``, using ``coord`` as its name.

    Returns:
        ``seq[start:end]`` of the chromosome sequence (a plain Python slice:
        0-based start, end excluded), or None when ``coord`` does not match
        the ``chr*:*-*`` pattern.

    Side effects:
        The first sequence read from a chromosome file is cached on the
        instance as ``self.seq_<chr>`` and reused on later calls.
    """
    import fnmatch

    # Explicit result for malformed coordinates instead of falling through.
    if not fnmatch.fnmatchcase(coord, 'chr*:*-*'):
        return None

    chr_ = coord.split(':')[0]
    cache_attr = 'seq_' + chr_
    if hasattr(self, cache_attr):
        seq = getattr(self, cache_attr)
    else:
        chr_file = os.path.join(genome_folder, chr_ + '.fa')
        _, seqs = fastaread(chr_file)
        seq = seqs[0]
        setattr(self, cache_attr, seq)

    start = int(coord.split(':')[-1].split('-')[0])
    end = int(coord.split('-')[-1])
    sq = seq[start:end]
    if save_file is not None:
        fastawrite(save_file, [coord], [sq])
    return sq
def load_key(self, key):
    """Load the fasta sequences configured under ``self.map_dic[key]``.

    The entry's 'file' value (a path or list of paths) must pass
    ``self.check_extension`` for the extensions ['fa', 'fasta']. On success
    the results are stored as ``self.<key>_names`` and ``self.<key>_seqs``.
    When the entry's 'force_list' is true these are lists with one sub-list
    per file; otherwise they are flat lists over all files.

    If there is no 'file' entry, or the extension check fails, a message is
    printed and no attribute is set.

    Args:
        key: Key into ``self.map_dic``; also the prefix of the attributes
            that are created.
    """
    # Look up the entry for the requested key (previously the literal
    # string 'key' was used, ignoring the argument).
    curr_dic = self.map_dic[key]
    files = curr_dic.get('file', None)
    if files is None:
        print("No files")
        return

    extensions_available = ['fa', 'fasta']
    if not self.check_extension(files, extensions_available):
        print("Error, extensions availabe: " + str(extensions_available))
        return

    force_list = curr_dic.get('force_list', False)
    names, seqs = [], []
    for fl in self.str_to_list(files):
        names_, seqs_ = fastaread(fl, force_upper=True)
        if force_list:
            names.append(names_)
            seqs.append(seqs_)
        else:
            names.extend(names_)
            seqs.extend(seqs_)
    setattr(self, key + '_names', names)
    setattr(self, key + '_seqs', seqs)
def create_isoform_fl(self,
                      isoform_fls,
                      transcr_fl=None,
                      isoform_filter_name=None,
                      isoform_filter_seq=None):
    """Select isoforms from the transcriptome and save them to fasta files.

    Given a list of filenames and either a list ``isoform_filter_name`` or a
    dictionary ``isoform_filter_seq``, look through the loaded transcriptome
    and write the selected isoforms to each file.

    Args:
        isoform_fls: Output fasta paths. They are paired in order with the
            name filters (name mode) or with ``self.seqs_RNAs`` (sequence
            mode).
        transcr_fl: Optional transcriptome fasta; when given it is read with
            ``fastaread`` into ``self.names_transcr`` / ``self.seqs_transcr``.
        isoform_filter_name: e.g. ``['*FLNA*', '*(Xist)*', ...]``. Each
            filter is wrapped in an additional ``*...*`` and matched against
            the transcript names with fnmatch.
        isoform_filter_seq: e.g.
            ``{'word_size': 17, 'n_choice': 100, 'perc_keep': 0.3}``.
            For every reference sequence, ``n_choice`` random words of
            length ``word_size`` are drawn; a transcript is kept when the
            fraction of those words it contains is greater than
            ``perc_keep``.

    Side effects:
        Stores the arguments on ``self``; fills ``self.isoform_names_list``
        and ``self.isoform_seqs_list`` (one entry per output file) and
        writes each file with ``fastawrite``. If both filters are supplied,
        the sequence filter runs second and replaces the lists (and any
        file written for the same path). If no transcriptome has been
        loaded, a warning is printed and nothing is computed.
    """
    import fnmatch

    self.isoform_filter_name = isoform_filter_name
    self.isoform_filter_seq = isoform_filter_seq
    self.isoform_fls = isoform_fls
    if transcr_fl is not None:
        self.names_transcr, self.seqs_transcr = fastaread(transcr_fl)

    if self.isoform_filter_name is not None:
        if hasattr(self, "names_transcr"):
            # Assume a list of name filters is provided.
            self.isoform_names_list, self.isoform_seqs_list = [], []
            for filter_, isoform_fl in zip(self.isoform_filter_name,
                                           isoform_fls):
                pattern = '*' + filter_ + '*'
                # Walk names and sequences together: one pass, and records
                # sharing a name each keep their own sequence (a lookup via
                # list.index would return the first record every time).
                matches = [
                    (name, seq)
                    for name, seq in zip(self.names_transcr,
                                         self.seqs_transcr)
                    if fnmatch.fnmatch(name, pattern)
                ]
                isoform_names = [name for name, _ in matches]
                isoform_sequences = [seq for _, seq in matches]
                self.isoform_names_list.append(isoform_names)
                self.isoform_seqs_list.append(isoform_sequences)
                print("Writing fasta file " + str(isoform_fl) +
                      " with number of seqs: " + str(len(isoform_names)))
                fastawrite(isoform_fl, isoform_names, isoform_sequences)
        else:
            print("Warning! No isoforms computed because the transciptome is not loaded.")

    if self.isoform_filter_seq is not None:
        if hasattr(self, "names_transcr"):
            word_size_ = self.isoform_filter_seq['word_size']
            n_choice_ = self.isoform_filter_seq['n_choice']
            perc_keep_ = self.isoform_filter_seq['perc_keep']
            self.isoform_names_list, self.isoform_seqs_list = [], []
            # Requires self.seqs_RNAs to be set by the caller beforehand.
            for seq_ref, isoform_fl in zip(self.seqs_RNAs, isoform_fls):
                # Valid word starts are 0 .. len - word_size inclusive, so
                # the last window of the reference can be sampled as well.
                n_starts = len(seq_ref) - word_size_ + 1
                i_choice = np.random.choice(n_starts, n_choice_)
                seq_choice = [
                    seq_ref[i_:i_ + word_size_] for i_ in i_choice
                ]
                # Fraction of the sampled words found in each transcript.
                specs = [
                    np.sum([sq in seq_transcr for sq in seq_choice]) /
                    float(n_choice_) for seq_transcr in self.seqs_transcr
                ]
                index_keep = np.ravel(
                    np.where(np.array(specs) > perc_keep_))
                isoform_names = [
                    self.names_transcr[i_] for i_ in index_keep
                ]
                isoform_seqs = [self.seqs_transcr[i_] for i_ in index_keep]
                self.isoform_names_list.append(isoform_names)
                self.isoform_seqs_list.append(isoform_seqs)
                fastawrite(isoform_fl, isoform_names, isoform_seqs)
        else:
            print("Warning! No isoforms computed because the transciptome is not loaded.")