def __init__(self, params):
    """Index a dataset laid out as <path>/Mixtures/** and <path>/Sources/**.

    Builds self.samples: track-name -> {'mix': path, 'vocl': path,
    'inst': [paths]}, where the track name is each wav's parent directory.
    """
    super().__init__()
    self.samples = defaultdict(lambda: {
        'mix': None,
        'vocl': None,
        'inst': []
    })
    path = params['path']
    # NOTE(review): attribute name looks like a typo for "number_of_samples";
    # kept as-is because external code may already reference it.
    self.number_if_samples = 0
    for a in iterate_files(os.path.join(path, 'Mixtures'), '.wav'):
        key = os.path.basename(os.path.dirname(a))
        self.samples[key]['mix'] = a
        self.number_if_samples += 1
    for a in iterate_files(os.path.join(path, 'Sources'), '.wav'):
        key = os.path.basename(os.path.dirname(a))
        if a.endswith('vocals.wav'):
            self.samples[key]['vocl'] = a
            self.add_frames_num('vocl', a)
        else:
            self.samples[key]['inst'].append(a)
            self.add_frames_num('inst', a)
            # NOTE(review): placed inside the non-vocal branch per the original
            # flattened text — confirm it shouldn't count every source file.
            self.number_if_samples += 1
    # Freeze to a plain dict so unknown track keys raise KeyError from here on.
    self.samples = dict(self.samples)
def pretrain():
    """Self-supervised pretraining over Wikipedia shard files.

    Builds the 400-unit model, logs to Weights & Biases, and for each file
    under ../wikipedia/AA: loads plain text (maxlen 80), shuffles it, builds
    a 30%-masked input/target pair, fits one pass, and checkpoints.

    Relies on module-level names: self_supervized_model, load_plaintext,
    get_masked, pretrain_path, model_name, utils, wandb, keras.

    Returns:
        The trained keras model.
    """
    model = self_supervized_model(400)
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=keras.optimizers.Adam(learning_rate=8e-5),
                  metrics='accuracy')
    # Hyper-parameters mirrored into the wandb run for bookkeeping.
    config = {
        'batch_size': 128,
        'maxlen': 80,
        'units': 400,
        'model': model,
    }
    run = wandb.init(project="dotter", group="pretrain", tags=[], config=config)
    # Log metrics every 50 batches; weights are not uploaded to wandb.
    wandb_callback = wandb.keras.WandbCallback(log_batch_frequency=50,
                                               save_model=False,
                                               log_weights=False)
    with run:
        for fname in utils.iterate_files(["../wikipedia/AA"]):
            name = fname.split('/')[-1]
            raw_y = load_plaintext(fname, 80)
            utils.shuffle_in_unison(raw_y)
            # Mask 30% of the tokens; x = masked input, y = original targets.
            x, y = get_masked(raw_y, 0.3)
            model.fit(x, y, batch_size=128, validation_split=0.1,
                      callbacks=[wandb_callback])
            # Per-shard checkpoint. NOTE(review): the '.h5' extension combined
            # with save_format='tf' is inconsistent — confirm intended format.
            model.save(f'{pretrain_path}/{name}.h5', save_format='tf')
        # NOTE(review): dedent points reconstructed from flattened source —
        # final save assumed once after the loop, return after the run closes.
        model.save(model_name, save_format='tf')
    return model
def diacritize_all(sysname):
    """Run the `sysname` diacritizer over every file under `basepath`,
    writing each result to the same path with 'expected' replaced by the
    system's name (parent directories are created as needed)."""
    for src in utils.iterate_files([basepath]):
        # Trailing spaces + '\r' overwrite the progress line in place.
        print(src, end=' ' * 30 + '\r', flush=True)
        result = diacritize(sysname, src)
        dest = src.replace('expected', sysname)
        Path(dest).parent.mkdir(parents=True, exist_ok=True)
        with open(dest, 'w', encoding='utf8') as out:
            out.write(result)
def __init__(self, params):
    """Index a flat corpus where each track directory holds a 'mix.wav',
    an instrumental 'source-01.wav', and one vocal stem (anything else).

    Builds self.samples: track-name -> {'mix', 'vocl', 'inst'} paths,
    keyed by each wav's parent directory name.
    """
    super().__init__()
    self.samples = defaultdict(lambda: {'mix': None, 'vocl': None, 'inst': None})
    for wav in iterate_files(params['path'], '.wav'):
        track = os.path.basename(os.path.dirname(wav))
        entry = self.samples[track]
        if wav.endswith('mix.wav'):
            entry['mix'] = wav
        elif wav.endswith('source-01.wav'):
            entry['inst'] = wav
            self.add_frames_num('inst', wav)
        else:
            entry['vocl'] = wav
            self.add_frames_num('vocl', wav)
    # Freeze to a plain dict so unknown track keys fail loudly from here on.
    self.samples = dict(self.samples)
def __init__(self, params):
    """Index <path>/separation/**.wav into per-track mix/vocal/instrument
    lists, keyed by the grandparent directory name (lower-cased).

    Classification is by file name: vocal names first (self._is_vocal_name),
    then anything containing 'mix', everything else is an instrument.
    """
    super().__init__()
    self.samples = defaultdict(lambda: {'mix': [], 'vocl': [], 'inst': []})
    # NOTE(review): these counters are never updated (the updating code was
    # commented out in the original) — kept for interface compatibility.
    self.net_vocals = 0
    self.net_insts = 0
    for wav in iterate_files(os.path.join(params['path'], 'separation'), '.wav'):
        track = os.path.basename(os.path.dirname(os.path.dirname(wav))).lower()
        stem = os.path.basename(wav).lower()
        if self._is_vocal_name(stem):
            self.samples[track]['vocl'].append(wav)
            self.add_frames_num('vocl', wav)
        elif 'mix' in stem:
            self.samples[track]['mix'].append(wav)
        else:
            self.samples[track]['inst'].append(wav)
            self.add_frames_num('inst', wav)
    # Freeze to a plain dict so unknown track keys fail loudly from here on.
    self.samples = dict(self.samples)
def __init__(self, params):
    """Classify every .wav under params['path'] as vocal or instrumental.

    Primary signal: a sidecar '<name>.txt' file whose stripped lines may
    contain 'voi'. Fallback (sidecar missing): bracketed tags embedded in
    the wav path itself, e.g. 'track [voi].wav'.

    Fixes over the original:
    - the sidecar file handle was never closed (leak) — now uses `with`;
    - the regex pattern used a non-raw string ('\\[') — invalid-escape
      warning on modern Python — now a raw string;
    - the `try` body spanned the append/add_frames_num calls, so a
      FileNotFoundError raised there would re-classify the same file via
      the fallback path (double append) — the `try` is now narrowed to
      the sidecar read only.
    """
    super().__init__()
    path = params['path']
    self.vocl = []
    self.inst = []
    tag_re = re.compile(r'\[(.*?)\]')
    for a in iterate_files(path, '.wav'):
        try:
            txt_filename = a[:-4] + '.txt'
            # `with` guarantees the sidecar handle is closed.
            with open(txt_filename) as txt:
                is_vocal = 'voi' in map(str.strip, txt)
        except FileNotFoundError:
            # No sidecar: fall back to bracketed tags in the path.
            is_vocal = 'voi' in tag_re.findall(a)
        if is_vocal:
            self.vocl.append(a)
            self.add_frames_num('vocl', a)
        else:
            self.inst.append(a)
            self.add_frames_num('inst', a)
def collect_tokens(paths: Iterable[str]):
    """Tokenize the concatenated contents of every file found under `paths`."""
    files = utils.iterate_files(paths)
    # Lazily chain each file's items into one flat stream before tokenizing.
    stream = itertools.chain.from_iterable(iterate_file(f) for f in files)
    return tokenize(stream)
def read_corpora(base_paths):
    """Return [(filename, tokens)] for every file found under `base_paths`,
    where tokens is the fully-materialized output of hebrew.iterate_file."""
    corpora = []
    for filename in utils.iterate_files(base_paths):
        corpora.append((filename, list(hebrew.iterate_file(filename))))
    return corpora