Example #1
    def make_hdf5_from_array(cls, array: Union[np.ndarray, pd.Series], output_file: str, num_batches: int = 100, bptt_length: int = 75):
        '''Tokenize sequences from an array or Series and concatenate them.
        Save as hdf5, and return a Dataset with this hdf5 file as source.
        Properties of the hdf5 file:
            dataset tokenized_sequences: concatenation of all tokenized sequences (stop tokens inserted). 1D array of size total_n_tokens
            dataset starting_indices: starting index in tokenized_sequences of each sequence. 1D array of size n_sequences
        '''

        tokenizer = TAPETokenizer(vocab='iupac')
        # load and tokenize: record each sequence's starting offset in the flat token list
        startidxlist = []
        tokenlist = []
        current_start_idx = 0
        for seq in array:
            startidxlist.append(current_start_idx)
            words = tokenizer.tokenize(seq) + [tokenizer.stop_token]
            for word in words:
                tokenlist.append(tokenizer.convert_token_to_id(word))
            current_start_idx = len(tokenlist)

        data = np.array(tokenlist)
        startidx = np.array(startidxlist)
        with h5py.File(output_file, "w") as f:
            f.create_dataset('tokenized_sequences', data=data)
            f.create_dataset('starting_indices', data=startidx)

        return cls(output_file, bptt_length)
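A minimal usage sketch for Example #1. The enclosing class name HDF5Dataset and the @classmethod decorator are assumptions; only the two HDF5 datasets and the method signature come from the snippet above.

import h5py
import pandas as pd

# Hypothetical call; HDF5Dataset stands in for whatever class defines make_hdf5_from_array.
sequences = pd.Series(["MKTAYIAKQR", "GAVLIL", "MSTNPKPQRK"])
dataset = HDF5Dataset.make_hdf5_from_array(sequences, output_file="seqs.hdf5")

# Inspect what was written: one flat token array plus per-sequence start offsets.
with h5py.File("seqs.hdf5", "r") as f:
    tokens = f["tokenized_sequences"][:]   # 1D array of length total_n_tokens
    starts = f["starting_indices"][:]      # 1D array of length n_sequences
print(len(starts), "sequences,", len(tokens), "tokens")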
Example #2
    def make_hdf5_from_array(cls, array: Union[np.ndarray, pd.Series], num_batches: int, output_file: str, bptt_length: int = 75):
        '''Tokenize sequences from an array or Series, concatenate, and cut into num_batches streams.
           Save as hdf5, and return a Dataset with this hdf5 file as source.
        '''

        tokenizer = TAPETokenizer(vocab='iupac')
        # load and tokenize
        tokenlist = []
        for seq in array:
            words = tokenizer.tokenize(seq) + [tokenizer.stop_token]
            for word in words:
                tokenlist.append(tokenizer.convert_token_to_id(word))

        # split into num_batches contiguous streams
        tokensperbatch = len(tokenlist) // num_batches
        end = tokensperbatch * num_batches  # trim tokens that do not fill a full batch
        tokenlist = tokenlist[:end]
        data = np.array(tokenlist)
        # reshape to (num_batches, tokensperbatch) and transpose so that each column
        # holds a contiguous run of tokens; a plain reshape(-1, num_batches) would
        # interleave consecutive tokens across batches
        data = data.reshape(num_batches, -1).T

        with h5py.File(output_file, "w") as f:
            f.create_dataset('tokenized_sequences', data=data)

        return cls(output_file, bptt_length)
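A tiny standalone sketch of the trim-reshape-transpose step used above, with toy numbers and no tokenizer, just to show the resulting layout:

import numpy as np

tokens = np.arange(12)                     # pretend these are 12 token ids
num_batches = 3
per_batch = len(tokens) // num_batches     # 4 tokens per batch stream
batched = tokens[: per_batch * num_batches].reshape(num_batches, -1).T
print(batched.shape)                       # (4, 3): rows are positions, columns are batch streams
print(batched[:, 0])                       # [0 1 2 3] -- each column is a contiguous token run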
Example #3
    def make_hdf5_from_txt(cls, file: str, num_batches: int = 100, output_file: Optional[str] = None, bptt_length: int = 75, buffer_size: int = 1000):
        '''Tokenize sequences from a line-by-line txt file, concatenate, and cut into num_batches streams.
           Save as hdf5, and return a Dataset with this hdf5 file as source.
        '''
        if not os.path.exists(file):
            raise FileNotFoundError(file)
        tokenizer = TAPETokenizer(vocab='iupac')
        # load and tokenize: record each sequence's starting offset in the flat token list
        startidxlist = []
        tokenlist = []
        current_start_idx = 0

        with open(file, 'r') as f:
            for line in f:
                startidxlist.append(current_start_idx)
                words = tokenizer.tokenize(line.rstrip()) + [tokenizer.stop_token]
                for word in words:
                    tokenlist.append(tokenizer.convert_token_to_id(word))
                current_start_idx = len(tokenlist)


        data = np.array(tokenlist)
        startidx = np.array(startidxlist)
        if not output_file:
            output_file = file + '.hdf5'

        with h5py.File(output_file, "w") as f:
            f.create_dataset('tokenized_sequences', data=data)
            f.create_dataset('starting_indices', data=startidx)

        return cls(output_file, num_batches, bptt_length, buffer_size)
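A sketch of how the saved starting_indices can be used to recover the individual sequences from the flat token array. The file name and the splitting itself are assumptions about downstream use; only the two dataset names come from the method above.

import h5py
import numpy as np

# 'proteins.txt.hdf5' is a hypothetical name following the default output_file = file + '.hdf5'.
with h5py.File("proteins.txt.hdf5", "r") as f:
    tokens = f["tokenized_sequences"][:]
    starts = f["starting_indices"][:]

# Each sequence spans from its start index to the next start index (the last runs to the end).
bounds = np.append(starts, len(tokens))
per_sequence = [tokens[bounds[i]:bounds[i + 1]] for i in range(len(starts))]
print(len(per_sequence), "sequences; first has", len(per_sequence[0]), "tokens (stop token included)")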
Example #4
    def make_hdf5_from_txt(cls, file: str, num_batches: int, output_file: Optional[str] = None, bptt_length: int = 75):
        '''Tokenize sequences from a line-by-line txt file, concatenate, and cut into num_batches streams.
           Save as hdf5, and return a Dataset with this hdf5 file as source.
        '''
        if not os.path.exists(file):
            raise FileNotFoundError(file)
        tokenizer = TAPETokenizer(vocab='iupac')
        # load and tokenize
        tokenlist = []
        with open(file, 'r') as f:
            for line in f:
                words = tokenizer.tokenize(line.rstrip()) + [tokenizer.stop_token]
                for word in words:
                    tokenlist.append(tokenizer.convert_token_to_id(word))


        # split into num_batches contiguous streams (outside the with-block; the file is already read)
        tokensperbatch = len(tokenlist) // num_batches
        end = tokensperbatch * num_batches  # trim tokens that do not fill a full batch
        tokenlist = tokenlist[:end]
        data = np.array(tokenlist)
        # reshape to (num_batches, tokensperbatch) and transpose so that each column
        # holds a contiguous run of tokens; a plain reshape(-1, num_batches) would
        # interleave consecutive tokens across batches
        data = data.reshape(num_batches, -1).T

        if not output_file:
            output_file = file + '.hdf5'

        with h5py.File(output_file, "w") as f:
            f.create_dataset('tokenized_sequences', data=data)

        return cls(output_file, bptt_length)