def read(self, files):
    """Read FASTA file(s) and store their contents on the instance.

    ``files`` is either a single path string or an iterable of paths. Each
    path is parsed with ``fastaread`` and every sequence is upper-cased.

    Sets ``self.names`` and ``self.seqs`` to flat lists accumulated over all
    files in the order given; any previous contents are discarded.
    """
    # isinstance (instead of an exact type() comparison) so that subclasses
    # of str are also treated as one path rather than iterated per character.
    if isinstance(files, str):
        files = [files]
    self.seqs, self.names = [], []
    for path in files:
        file_names, file_seqs = fastaread(path)
        upper_seqs = [seq.upper() for seq in file_seqs]
        self.names.extend(file_names)
        self.seqs.extend(upper_seqs)
Exemplo n.º 2
0
    def files_to_OTmap(self, map_key, curr_dic):
        """Turn a file (or list of files) into OT maps stored as ``self.<map_key>``.

        ``curr_dic`` is a configuration dict; the keys read here are:

        * ``'file'``       -- path or list of paths (default ``None``).
        * ``'force_list'`` -- if true, build one map per FASTA file instead of
          a single map over all sequences (default ``False``).
        * ``'save_file'``  -- forwarded to ``OTmap`` (default ``None``).
        * ``'use_kmer'``   -- forwarded to ``OTmap`` (default ``False``).

        The attribute is always set to a *list* of maps:

        * FASTA input (``fa``/``fasta``): maps built with ``word_size=17``.
        * exactly one ``npz`` file: an ``OTmap`` populated via ``load``.
        * exactly one ``pkl`` file: the unpickled object.
        * no file: ``[constant_zero_dict()]``.

        Any other combination prints an error and leaves the attribute unset.
        """
        print("Setting attribute: " + map_key)

        files = curr_dic.get('file', None)
        force_list = curr_dic.get('force_list', False)
        save_file = curr_dic.get('save_file', None)
        use_kmer = curr_dic.get('use_kmer', False)

        if files is None:
            print("No files")
            setattr(self, map_key, [constant_zero_dict()])
            return

        files_ = self.str_to_list(files)
        fasta_ext = ['fa', 'fasta']
        # NOTE(review): the extension arguments below are inconsistent (a list,
        # '.npz' with a dot, 'pkl' without); kept as-is because the matching
        # rules of check_extension are not visible here -- confirm.
        if self.check_extension(files, fasta_ext):
            seq_groups = []
            for fl in files_:
                _, seqs_ = fastaread(fl, force_upper=True)
                if force_list:
                    # One group (hence one map) per file.
                    seq_groups.append(seqs_)
                else:
                    seq_groups.extend(seqs_)
            if not force_list:
                # A single group holding the sequences of every file.
                seq_groups = [seq_groups]
            # NOTE(review): with force_list every map receives the same
            # save_file -- confirm this is intended.
            OTmaps = [
                OTmap(seq_,
                      word_size=17,
                      use_kmer=use_kmer,
                      progress_report=False,
                      save_file=save_file) for seq_ in seq_groups
            ]
            setattr(self, map_key, OTmaps)
        elif len(files_) == 1 and self.check_extension(files, '.npz'):
            # The original passed an undefined name (`seq_`) here, which raised
            # NameError. The map is populated by load() right after, so an
            # empty sequence list is used as the placeholder.
            # NOTE(review): confirm OTmap accepts an empty sequence list.
            OTMap_ = OTmap([], word_size=17)
            OTMap_.load(files_[0])
            setattr(self, map_key, [OTMap_])
        elif len(files_) == 1 and self.check_extension(files, 'pkl'):
            # SECURITY: unpickling executes arbitrary code; only load pickle
            # files that come from a trusted source.
            with open(files_[0], 'rb') as pkl_file:
                OTmaps = [pickle.load(pkl_file)]
            setattr(self, map_key, OTmaps)
        else:
            print("Extension error or more than 1 npz/pkl file provided.")
Exemplo n.º 3
0
 def load_sequence_file_and_paramaters(self):
     """Load the input FASTA file(s) and expose all parameters as attributes.

     Reads ``self.sequence_dic['file']`` (a path or list of paths; may be
     absent) and fills:

     * ``self.input_names`` / ``self.input_seqs`` -- flat lists over all files,
       sequences upper-cased by ``fastaread(..., force_upper=True)``.
     * ``self.input_noSeqsperFile`` -- number of records read from each file.

     It then adds ``'sequence_lens'``, ``'input_noSeqsperFile'`` and
     ``'input_names'`` to ``self.params_dic`` and copies every entry of
     ``self.params_dic`` onto ``self`` as an attribute of the same name.
     """
     sequence_file = self.sequence_dic.get('file', None)
     self.input_names, self.input_seqs = [], []
     self.input_noSeqsperFile = []
     if sequence_file is not None:
         for sequence_file_ in self.str_to_list(sequence_file):
             names, seqs = fastaread(sequence_file_, force_upper=True)
             self.input_noSeqsperFile.append(len(names))
             self.input_names.extend(names)
             self.input_seqs.extend(seqs)
     # Additional derived parameters. A real list is built here: map() returns
     # a one-shot lazy iterator on Python 3, which would be exhausted after the
     # first use and has no len().
     self.params_dic['sequence_lens'] = [len(seq) for seq in self.input_seqs]
     self.params_dic['input_noSeqsperFile'] = self.input_noSeqsperFile
     self.params_dic['input_names'] = self.input_names
     # Internalize every parameter as an attribute. Iterate over a snapshot so
     # the loop is unaffected if a key happens to rebind self.params_dic.
     for key, value in list(self.params_dic.items()):
         setattr(self, key, value)
Exemplo n.º 4
0
    def coords_to_seq(
            self,
            coord,
            genome_folder=r'/n/dulacfs2/Users/bbintu/Genomes/mouse/mm10',
            save_file=None):
        """Return the sequence for a coordinate string of the form ``chr*:*-*``.

        The chromosome sequence is read from ``<genome_folder>/<chr>.fa`` (first
        record of the file) the first time a chromosome is requested and cached
        on the instance as ``self.seq_<chr>``; later calls reuse the cache.

        The numbers after ``:`` are used directly as a Python slice
        (``seq[start:end]``). If ``save_file`` is not None the slice is also
        written there with ``fastawrite`` under the name ``coord``.

        Returns None when ``coord`` does not match the expected pattern.
        """
        import fnmatch

        # Guard clause: anything that is not chr*:*-* is ignored.
        if not fnmatch.fnmatchcase(coord, 'chr*:*-*'):
            return None

        chrom = coord.partition(':')[0]
        cache_attr = 'seq_' + chrom
        if hasattr(self, cache_attr):
            chrom_seq = getattr(self, cache_attr)
        else:
            fasta_path = genome_folder + os.sep + chrom + '.fa'
            _, records = fastaread(fasta_path)
            chrom_seq = records[0]
            setattr(self, cache_attr, chrom_seq)

        # start: text between the last ':' and the following '-';
        # end: text after the last '-'.
        after_colon = coord.rpartition(':')[2]
        start = int(after_colon.partition('-')[0])
        end = int(coord.rpartition('-')[2])

        fragment = chrom_seq[start:end]
        if save_file is not None:
            fastawrite(save_file, [coord], [fragment])
        return fragment
Exemplo n.º 5
0
 def load_key(self, key):
     """Load the FASTA sequences configured under ``self.map_dic[key]``.

     Reads ``self.map_dic[key]['file']`` (a path or list of paths). If its
     extension is accepted by ``check_extension`` for ``['fa', 'fasta']``,
     the files are parsed with ``fastaread(..., force_upper=True)`` and the
     results stored as ``self.<key>_names`` and ``self.<key>_seqs``.

     When ``self.map_dic[key]['force_list']`` is true the attributes are
     lists with one entry (a list) per file; otherwise they are flat lists
     over all files.

     Prints a message and sets nothing when no file is configured or the
     extension is not accepted.
     """
     # Look up the entry for the requested key. The original indexed with
     # the literal string 'key', ignoring the argument.
     curr_dic = self.map_dic[key]
     files = curr_dic.get('file', None)
     if files is None:
         print("No files")
         return

     extensions_available = ['fa', 'fasta']
     if not self.check_extension(files, extensions_available):
         print("Error, extensions availabe: " + str(extensions_available))
         return

     force_list = curr_dic.get('force_list', False)
     names, seqs = [], []
     for fl in self.str_to_list(files):
         names_, seqs_ = fastaread(fl, force_upper=True)
         if force_list:
             names.append(names_)
             seqs.append(seqs_)
         else:
             names.extend(names_)
             seqs.extend(seqs_)
     setattr(self, key + '_names', names)
     setattr(self, key + '_seqs', seqs)
Exemplo n.º 6
0
    def create_isoform_fl(self,
                          isoform_fls,
                          transcr_fl=None,
                          isoform_filter_name=None,
                          isoform_filter_seq=None):
        """
        Select isoforms from the transcriptome and write one FASTA file per entry
        of isoform_fls.

        isoform_fls: list of output FASTA filenames.
        transcr_fl: optional transcriptome FASTA; when given it is (re)loaded into
        self.names_transcr / self.seqs_transcr. Otherwise a previously loaded
        transcriptome is used, and a warning is printed if there is none.

        isoform_filter_name = ['*FLNA*','*(Xist)*',...]
        (list paired element-wise with isoform_fls; a transcript is kept when its
        name matches '*' + filter + '*')
        isoform_filter_seq = {'word_size':17,'n_choice':100,'perc_keep':0.3}
        (for each sequence of self.seqs_RNAs, paired element-wise with isoform_fls,
        a transcript is kept if it contains more than a fraction of 0.3 of a
        random set of 100 17mers drawn from that sequence)

        The selections are stored in self.isoform_names_list and
        self.isoform_seqs_list (one entry per output file). If both filters are
        given, the sequence filter runs second and overwrites both the attributes
        and the files written by the name filter.
        """
        import fnmatch

        self.isoform_filter_name = isoform_filter_name
        self.isoform_filter_seq = isoform_filter_seq
        self.isoform_fls = isoform_fls
        if transcr_fl is not None:
            self.names_transcr, self.seqs_transcr = fastaread(transcr_fl)

        if self.isoform_filter_name is not None:
            if hasattr(self, "names_transcr"):
                self.isoform_names_list, self.isoform_seqs_list = [], []
                for filter_, isoform_fl in zip(self.isoform_filter_name,
                                               isoform_fls):
                    pattern = '*' + filter_ + '*'
                    # Pair every name with its own sequence. Looking sequences
                    # up with names.index(name) returned the first sequence for
                    # every duplicated name and was quadratic.
                    matches = [
                        (name, seq) for name, seq in zip(
                            self.names_transcr, self.seqs_transcr)
                        if fnmatch.fnmatch(name, pattern)
                    ]
                    isoform_names = [name for name, _ in matches]
                    isoform_sequences = [seq for _, seq in matches]

                    self.isoform_names_list.append(isoform_names)
                    self.isoform_seqs_list.append(isoform_sequences)
                    print("Writing fasta file " + str(isoform_fl) +
                          " with number of seqs: " + str(len(isoform_names)))
                    fastawrite(isoform_fl, isoform_names, isoform_sequences)
            else:
                print("Warning! No isoforms computed because the transciptome is not loaded.")

        if self.isoform_filter_seq is not None:
            if hasattr(self, "names_transcr"):
                word_size_ = self.isoform_filter_seq['word_size']
                n_choice_ = self.isoform_filter_seq['n_choice']
                perc_keep_ = self.isoform_filter_seq['perc_keep']

                self.isoform_names_list, self.isoform_seqs_list = [], []
                for seq_ref, isoform_fl in zip(self.seqs_RNAs, isoform_fls):
                    # Valid k-mer starts are 0 .. len - word_size inclusive;
                    # the previous range stopped one short, never sampling the
                    # last k-mer and failing when len(seq_ref) == word_size.
                    # np.random.choice still raises ValueError when the
                    # sequence is shorter than word_size.
                    n_starts = len(seq_ref) - word_size_ + 1
                    i_choice = np.random.choice(n_starts, n_choice_)
                    seq_choice = [
                        seq_ref[i_:i_ + word_size_] for i_ in i_choice
                    ]
                    # Fraction of the sampled k-mers found in each transcript.
                    specs = [
                        sum(sq in seq_transcr
                            for sq in seq_choice) / float(n_choice_)
                        for seq_transcr in self.seqs_transcr
                    ]
                    index_keep = [
                        i_ for i_, spec in enumerate(specs)
                        if spec > perc_keep_
                    ]

                    isoform_names = [
                        self.names_transcr[i_] for i_ in index_keep
                    ]
                    isoform_seqs = [self.seqs_transcr[i_] for i_ in index_keep]

                    self.isoform_names_list.append(isoform_names)
                    self.isoform_seqs_list.append(isoform_seqs)
                    fastawrite(isoform_fl, isoform_names, isoform_seqs)
            else:
                print("Warning! No isoforms computed because the transciptome is not loaded.")