def __init__(self, filename_py="train.txt", vocab_file_py='vocab_pinyin.txt',
             filename_hz="train_hanzi.txt", vocab_file_hz='vocab_hanzi.txt',
             sort=False, descent=False):
    """Load pinyin (and optionally hanzi) metadata and vocabularies.

    Args:
        filename_py: metadata file with pinyin transcripts, relative to
            hp.preprocessed_path.
        vocab_file_py: newline-separated pinyin vocabulary file.
        filename_hz: metadata file with hanzi transcripts (only read when
            hp.with_hanzi is set).
        vocab_file_hz: newline-separated hanzi vocabulary file.
        sort: when True, order samples by mel-spectrogram frame length.
        descent: when True, downstream iteration uses descending-length order.
    """
    self.basename, self.py_text = process_meta(
        os.path.join(hp.preprocessed_path, filename_py))
    self.sort = sort
    # Close the vocabulary file promptly instead of leaking the handle.
    with open(os.path.join(hp.preprocessed_path, vocab_file_py)) as f:
        self.py_vocab = f.read().split('\n')
    # The vocabulary must contain the padding and silence/pause symbols.
    assert ('pad' in self.py_vocab and 'sp1' in self.py_vocab
            and 'sil' in self.py_vocab)
    # NOTE: the original code called process_meta() on the same file a second
    # time here; that redundant call has been removed.
    self.py2idx = dict([(c, i) for i, c in enumerate(self.py_vocab)])
    if hp.with_hanzi:
        with open(os.path.join(hp.preprocessed_path, vocab_file_hz)) as f:
            self.hz_vocab = f.read().split('\n')
        assert ('pad' in self.hz_vocab and 'sp1' in self.hz_vocab
                and 'sil' in self.hz_vocab)
        _, self.hz_text = process_meta(
            os.path.join(hp.preprocessed_path, filename_hz))
        self.hz2idx = dict([(c, i) for i, c in enumerate(self.hz_vocab)])
    if sort:
        # BUG FIX: this branch referenced an undefined name `filename`
        # (NameError whenever sort=True); the parameter is `filename_py`.
        with open(os.path.join(hp.preprocessed_path, filename_py)) as f:
            names = [l.split('|')[0] for l in f.read().split('\n')[:-1]]
        # Sort sample indices by the number of mel frames per utterance.
        mel_len = [
            np.load(hp.preprocessed_path
                    + '/mel/baker-mel-{}.npy'.format(n)).shape[0]
            for n in names
        ]
        self.map_idx = np.argsort(mel_len)
    else:
        self.map_idx = [i for i in range(len(self.basename))]
    self.map_idx_rev = self.map_idx[::-1]
    self.descent = descent
def get_data_to_buffer(file='train.txt'):
    """Read every sample of *file* into memory and return it as a list.

    Each entry is a dict with keys "name", "text" (encoded as a LongTensor of
    symbol ids), "duration" and "mel_target" (both torch tensors loaded from
    the preprocessed .npy files under <data_path>/outdir).
    """
    basename, text = process_meta(
        os.path.join(hparams.data_path, 'outdir', file), [])
    outdir = os.path.join(hparams.data_path, 'outdir')
    buffer = list()
    start = time.perf_counter()
    for i in tqdm(range(len(text))):
        name = basename[i]
        # Ground-truth mel spectrogram and per-phoneme durations.
        mel_target = torch.from_numpy(np.load(os.path.join(
            outdir, 'mel',
            "{}-mel-{}.npy".format(hparams.dataset, name))))
        duration = torch.from_numpy(np.load(os.path.join(
            outdir, "alignment",
            "{}-ali-{}.npy".format(hparams.dataset, name))))
        # Encode the transcript into a sequence of symbol ids.
        sequence = np.array(
            text_to_sequence(text[i].strip(), hparams.text_cleaners))
        buffer.append({
            "name": name,
            "text": torch.from_numpy(sequence),
            "duration": duration,
            "mel_target": mel_target,
        })
    end = time.perf_counter()
    print("cost {:.2f}s to load all data into buffer.".format(end-start))
    return buffer
def __init__(self, filename="train.txt", sort=True, speaker_lookup_table=None):
    """Read the dataset metadata and remember sorting/speaker options.

    Args:
        filename: metadata file inside hparams.preprocessed_path.
        sort: whether samples should be length-sorted downstream.
        speaker_lookup_table: optional mapping from speaker name to id.
    """
    meta_path = os.path.join(hparams.preprocessed_path, filename)
    self.basename, self.text = process_meta(meta_path)
    self.speaker_lookup_table = speaker_lookup_table
    self.sort = sort
def check_text_to_sequence(fn):
    """Run text_to_sequence over every transcript in *fn* as a sanity check.

    The module-level `cur_processing` is updated before each conversion so a
    failure can be traced back to the offending utterance.
    """
    global cur_processing
    basename_list, text_list = utils.process_meta(
        os.path.join("..", hp.preprocessed_path, fn))
    for idx, name in enumerate(basename_list):
        transcript = text_list[idx]
        cur_processing = name
        text_to_sequence(transcript)
    print("check text done. fn=%s, cnt=%d" % (fn, len(basename_list)))
def __init__(self, filename="train.txt", sort=True):
    """Read the dataset metadata file and store the sort flag.

    Args:
        filename: metadata file inside hparams.preprocessed_path.
        sort: whether samples should be length-sorted downstream.
    """
    meta_path = os.path.join(hparams.preprocessed_path, filename)
    self.basename, self.text = process_meta(meta_path)
    self.sort = sort
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 18 23:25:32 2018

@author: DBLITALMK

Upload reviews File to Database
"""
import os

os.chdir('C:/Users/DBLITALMK/Documents/Legacy/Case Study/env/Analysis/')

from utils import getDFSample, process_meta, getDFStream, getDFTop, getDFStreamFilter
from sqlalchemy import create_engine

print('connecting to database')
engine = create_engine('postgresql://*****:*****@localhost:5432/amazon_reviews')

# Stream the compressed metadata file in fixed-size chunks, clean each chunk
# with process_meta, and append it to the `meta` table.
for index, chunk in enumerate(getDFStream('metadata.json.gz', splits=200000),
                              start=1):
    print('iteration %d' % index)
    processed = process_meta(chunk)
    print(' data retrieved')
    # Report the longest description in this chunk (non-strings count as 0).
    longest = max(
        processed['description'].apply(
            lambda x: len(x) if isinstance(x, str) else 0))
    print(' description %d' % longest)
    processed.to_sql('meta', engine, if_exists='append', index=False)
    print(' data uploaded')
def __init__(self, filename="train.txt", list_unuse=None, sort=True):
    """Read dataset metadata, excluding unused basenames.

    Args:
        filename: metadata file inside <data_path>/outdir.
        list_unuse: optional list of basenames to exclude; defaults to an
            empty list.
        sort: whether samples should be length-sorted downstream.
    """
    # BUG FIX: the original default was a mutable `list_unuse=[]`, which is
    # evaluated once at definition time and shared across all instances; use
    # a None sentinel and create a fresh list per call instead.
    if list_unuse is None:
        list_unuse = []
    self.basename, self.text = process_meta(
        os.path.join(hparams.data_path, 'outdir', filename), list_unuse)
    self.sort = sort