def tofile(self, path):
    """Serialize model parameters to a plain-text file at *path*.

    Layout (one item per line): K J; THETA SIGMA; B C; W; then one
    line per row of W_ (Config.J rows).
    """
    show_status(".. put parameters to file: %s" % path)
    lines = []
    # config: K and J
    lines.append("%s %s" % (Config.K, Config.J))
    # config: THETA and SIGMA
    lines.append("%s %s" % (Config.THETA, Config.SIGMA))
    # scalar parameters B and C
    lines.append("%s %s" % (self.B, self.C))
    # vector W on a single line
    lines.append(' '.join(str(w) for w in self.W))
    # Config.J lines, one per row of W_
    for j in range(Config.J):
        lines.append(' '.join(str(w) for w in self.W_[j]))
    with open(path, 'w') as f:
        f.write('\n'.join(lines))
def input_user_features(self):
    """Load per-user feature strings, keyed by integer user id."""
    show_status(".. input user features")
    with open(TRAINSET_USER_FEATURE_PATH) as f:
        for record in f.readlines():
            fields = record.split()
            # first column is the user id, the remainder are its features
            self.user_features[int(fields[0])] = ' '.join(fields[1:])
def input_item_features(self):
    """Load per-item feature strings, keyed by integer item id."""
    show_status(".. input item features")
    with open(TRAINSET_ITEM_FEATURE_PATH) as f:
        for record in f.readlines():
            fields = record.split()
            # first column is the item id, the remainder are its features
            self.item_features[int(fields[0])] = ' '.join(fields[1:])
def input_positive_trainset(self):
    """ only input positive trainset for validate """
    show_status(".. input positive trainset")
    with open(TRAINSET_PAIR_PATH) as f:
        for record in f.readlines():
            uid, p_papers, _ = record.split(',')
            # the negative papers column is deliberately dropped here
            self.trainset.append((uid, p_papers, []))
def split_trainset(self):
    """Split the trainset into TRAIN_SET_NUM consecutive pieces.

    Bug fix: the original sliced with a constant ``index`` that was never
    advanced, so every piece was a copy of the first ``piece_len`` records.
    Each piece now covers its own contiguous slice of the trainset.
    """
    show_status(".. split dataset to %d pieces" % TRAIN_SET_NUM)
    num = len(self.trainset)
    piece_len = int(num / TRAIN_SET_NUM)
    self.trainsets = [
        self.trainset[i * piece_len: (i + 1) * piece_len]
        for i in xrange(TRAIN_SET_NUM)]
def tofile(self):
    """Write each trainset piece to its own '<i>.chun' file.

    Bug fix: the status format string used "%th", which raises
    ``ValueError: unsupported format character 't'``; it is now "%dth".
    """
    for i in range(TRAIN_SET_NUM):
        show_status("output %dth dataset" % i)
        dataset = self.dataset.get_dataset(i)
        with open(os.path.join(TRAINSET_VIRTUAL_MEM_DIR, str(i) + '.chun'), 'w') as f:
            f.write("\n".join(dataset))
def get_dataset(self, i):
    """Yield batches of comma-split records from the i-th '.chun' file.

    Fixes over the original:
    - the loop variable no longer shadows the ``i`` parameter;
    - the batch buffer is cleared after each yield (it used to grow
      forever, re-yielding earlier records inside every later batch);
    - the final partial batch is no longer silently dropped.
    """
    with open(os.path.join(TRAINSET_VIRTUAL_MEM_DIR, str(i) + '.chun')) as f:
        records = []
        for lineno, line in enumerate(f.readlines()):
            # strip the trailing newline, then split into fields
            records.append(line[:-1].split(","))
            if lineno == 0:
                continue
            if lineno % BATCH_SIZE == 0:
                show_status(".. given %dth record" % lineno)
                yield records
                records = []
        if records:
            yield records
def tofile(self):
    """Write each trainset piece to its own '<i>.chun' file.

    Bug fix: "%th" is not a valid conversion and raises ValueError at
    run time; the status format is corrected to "%dth".
    """
    for i in range(TRAIN_SET_NUM):
        show_status("output %dth dataset" % i)
        dataset = self.dataset.get_dataset(i)
        with open(os.path.join(TRAINSET_VIRTUAL_MEM_DIR, str(i) + '.chun'), 'w') as f:
            f.write("\n".join(dataset))
def get_dataset(self, i):
    """Generate record batches from the i-th '.chun' split file.

    Fixes: renamed the inner loop variable so it no longer shadows the
    ``i`` parameter; the accumulator resets after every yield instead of
    growing cumulatively; the trailing partial batch is emitted too.
    """
    with open(os.path.join(TRAINSET_VIRTUAL_MEM_DIR, str(i) + '.chun')) as f:
        batch = []
        for lineno, line in enumerate(f.readlines()):
            # drop the trailing newline before splitting into fields
            batch.append(line[:-1].split(","))
            if lineno == 0:
                continue
            if lineno % BATCH_SIZE == 0:
                show_status(".. given %dth record" % lineno)
                yield batch
                batch = []
        if batch:
            yield batch
def split_trainset(self):
    """Split the trainset into TRAIN_SET_NUM consecutive pieces.

    Bug fix: ``index`` was initialized to 0 and never incremented, so
    every piece was identical to the first. Slices are now computed per
    piece index.
    """
    show_status(".. split dataset to %d pieces" % TRAIN_SET_NUM)
    num = len(self.trainset)
    piece_len = int(num / TRAIN_SET_NUM)
    self.trainsets = [
        self.trainset[i * piece_len:(i + 1) * piece_len]
        for i in xrange(TRAIN_SET_NUM)
    ]
def run(self):
    """Cross-validate: hold out each split once, train on the rest,
    then average the per-fold MAP scores and persist the model.

    Bug fix: the final status message misspelled "average" as "avage".
    """
    val_maps = []
    for val_idx in xrange(TRAIN_SET_NUM):
        # use the val_idx-th dataset as the validation dataset
        self.val_idx = val_idx
        set_indexs = set(range(TRAIN_SET_NUM))
        set_indexs.discard(val_idx)
        self.train(set_indexs)
        val_res = self.validate()
        show_status(".. get map: " + str(val_res))
        val_maps.append(val_res)
    map_res = sum(val_maps) / TRAIN_SET_NUM
    show_status(".. get average map: " + str(map_res))
    # persist the trained model, tagged with its mean MAP score
    self.model.dataspace.tofile(data_path('models', str(map_res)))
def trans_pairs(self):
    """Expand each (uid, p_papers, n_papers) trainset line into shuffled
    (uid, positive, negative) training pairs, grouped by split index."""
    show_status(".. trains_pairs")
    for idx in range(TRAIN_SET_NUM):
        for record in self.trainsets[idx]:
            uid = int(record[0])
            positives = [int(tok) for tok in record[1].split()]
            negatives = [int(tok) for tok in record[2].split()]
            # full cross product of positives and negatives for this user
            pairs = [(uid, p, n) for p in positives for n in negatives]
            random.shuffle(pairs)
            # cap how many pairs a single line may contribute
            if len(pairs) > MAX_PAIRS_SINGLE_LINE:
                pairs = pairs[:MAX_PAIRS_SINGLE_LINE]
            self.train_pairs.setdefault(idx, []).extend(pairs)
def train(self, set_indexs): # train using the rest dataset for i in list(set_indexs): show_status(">>>" * 20) show_status(".. training %dth dataset" % i) for dataset in self.dataset.get_dataset(i): dataset_len = len(self.dataset.train_pairs[i]) show_status("dataset len: %d" % dataset_len) for i, (X1, X2) in enumerate(dataset): print "train %dth line" % i show_status(">> training data", i, dataset_len) X1 = np.array([float(i) for i in X1.split()]) X2 = np.array([float(i) for i in X2.split()]) self.model.study_line(X1, X2)
def fromfile(self, path):
    """Load model parameters written by ``tofile``.

    Line layout: 0: K J; 1: THETA SIGMA; 2: B C; 3: W; 4..: rows of W_.

    Bug fix: rows of W_ begin at line 4, so the row index is ``no - 4``.
    The original used ``no - 3``, which skipped W_[0] and indexed one
    past the end on the last line.
    """
    show_status(".. load parameters from file : %s" % path)

    def split_line_trans_type(line, _type):
        # parse a whitespace-separated line into a list of _type values
        return [_type(tok) for tok in line.split()]

    with open(path) as f:
        for no, line in enumerate(f.readlines()):
            if no == 0:
                Config.K, Config.J = split_line_trans_type(line, int)
            elif no == 1:
                Config.THETA, Config.SIGMA = split_line_trans_type(line, float)
            elif no == 2:
                self.B, self.C = split_line_trans_type(line, float)
            elif no == 3:
                Ws = split_line_trans_type(line, float)
                for i in range(Config.J):
                    self.W[i] = Ws[i]
            else:
                j = no - 4
                w_j = split_line_trans_type(line, float)
                for k in range(Config.K):
                    self.W_[j][k] = w_j[k]
def input_trainset(self):
    """Load the raw trainset: one (uid, p_papers, n_papers) tuple per line."""
    show_status(".. input trainset")
    with open(TRAINSET_PAIR_PATH) as f:
        for record in f.readlines():
            # each line holds exactly three comma-separated columns
            uid, p_papers, n_papers = record.split(',')
            self.trainset.append((uid, p_papers, n_papers))
def tofile(self):
    """Dump the accumulated feature strings to the transformed-features file."""
    show_status(".. to file: %s" % self.transed_features_path)
    content = ''.join(self.features)
    with open(self.transed_features_path, 'w') as f:
        f.write(content)
def scan_files(tracks):
    """Pipe track audio through sox into ckcdda and record AccurateRip matches.

    For every "(track, offset): crc1 crc450 [crc2]" line emitted by
    ckcdda, compares the computed CRCs against each track's AccurateRip
    entries, recording exact matches (crc1/crc2) and possible matches
    (crc450) per offset on the track objects.

    Fixes over the original: the unused ``num_lines``/``results1``/
    ``results2``/``results450`` locals are removed, the unused
    ``enumerate`` index is dropped, and the digit-matching regex is a
    raw string.
    """
    # decode every input file into one continuous raw PCM stream on stdout
    sox_args = ['sox'] + [t.path for t in tracks] + ['-t', 'raw', '-']

    # ckcdda expects the same number of CRC slots for every track, so
    # shorter entry lists are zero-padded to keep the layout rectangular
    entries_per_track = max([len(t.ar_entries) for t in tracks])
    ckcdda_args = [BIN['ckcdda'], entries_per_track]
    for track in tracks:
        ckcdda_args.append(str(track.num_sectors))
        crcs = [e.crc for e in track.ar_entries]
        crc450s = [e.crc450 for e in track.ar_entries]
        crcs += [0] * (entries_per_track - len(crcs))
        crc450s += [0] * (entries_per_track - len(crc450s))
        ckcdda_args += crcs
        ckcdda_args += crc450s
    ckcdda_args = map(str, ckcdda_args)

    tmp = TemporaryFile()
    # sox | ckcdda pipeline; ckcdda's report goes to the temp file
    PROCS.append(Popen(sox_args, stdout=PIPE))
    PROCS.append(Popen(ckcdda_args, stdin=PROCS[-1].stdout, stdout=tmp))
    p = PROCS[-1]
    while p.poll() is None:
        utils.show_status('Calculating checksums for %i files', len(tracks))
    utils.finish_status()
    out, err = p.communicate()
    tmp.seek(0)
    out = tmp.read().decode()
    for pr in PROCS:
        if pr.returncode:
            raise SubprocessError('sox had an error (returned %i)' % pr.returncode)

    for line in out.split('\n'):
        # only result lines start with a digit
        if not re.match(r'^\d', line):
            continue
        index, data = line.split(': ')
        track_index, offset = [int(x) for x in index.split(',')]
        hashes = [int(x, 16) for x in data.split()]
        crc1, crc450 = hashes[:2]
        crc2 = hashes[2] if len(hashes) > 2 else None
        track = tracks[track_index]
        if offset == 0:
            # keep the canonical (zero-offset) checksums on the track
            track.crc1 = crc1
            track.crc2 = crc2
            track.crc450 = crc450
        for entry in track.ar_entries:
            if entry.crc in (crc1, crc2):
                if offset not in track.exact_matches:
                    track.exact_matches[offset] = []
                track.exact_matches[offset].append(entry.confidence)
            elif entry.crc450 == crc450 and offset != 0:
                if offset not in track.possible_matches:
                    track.possible_matches[offset] = []
                track.possible_matches[offset].append(entry.confidence)
def scan_files(tracks):
    """Verify ripped tracks against AccurateRip checksums.

    Streams the decoded audio of all *tracks* through ``sox`` into the
    ``ckcdda`` helper, then parses its per-(track, offset) CRC report,
    recording exact matches (crc1/crc2) and possible matches (crc450)
    on each track's match dictionaries.

    NOTE(review): mutates module-level PROCS and each track's
    crc1/crc2/crc450, exact_matches and possible_matches.
    """
    # decode every input file into one continuous raw PCM stream on stdout
    sox_args = ['sox']+[t.path for t in tracks]+['-t', 'raw', '-']
    # every track must supply the same number of CRC slots to ckcdda,
    # so shorter entry lists are zero-padded below
    entries_per_track = max([len(t.ar_entries) for t in tracks])
    ckcdda_args = [BIN['ckcdda'], entries_per_track]
    for track in tracks:
        ckcdda_args.append(str(track.num_sectors))
        crcs = [e.crc for e in track.ar_entries]
        crc450s = [e.crc450 for e in track.ar_entries]
        crcs += [0]*(entries_per_track-len(crcs))
        crc450s += [0]*(entries_per_track-len(crc450s))
        ckcdda_args += crcs
        ckcdda_args += crc450s
    ckcdda_args = map(str, ckcdda_args)
    tmp = TemporaryFile()
    # sox | ckcdda pipeline; ckcdda's report goes to the temp file
    PROCS.append(Popen(sox_args, stdout=PIPE))
    PROCS.append(Popen(ckcdda_args, stdin=PROCS[-1].stdout, stdout=tmp))
    p = PROCS[-1]
    # busy-wait while updating the status line until ckcdda finishes
    while p.poll() is None:
        utils.show_status('Calculating checksums for %i files', len(tracks))
    utils.finish_status()
    out, err = p.communicate()
    tmp.seek(0)
    out = tmp.read().decode()
    for pr in PROCS:
        if pr.returncode:
            # NOTE(review): message blames sox even when ckcdda failed
            raise SubprocessError('sox had an error (returned %i)' % pr.returncode)
    lines = out.split('\n')
    num_lines = len(lines)  # NOTE(review): unused
    results1 = []           # NOTE(review): unused
    results2 = []           # NOTE(review): unused
    results450 = []         # NOTE(review): unused
    for i, line in enumerate(lines):
        # report lines look like "track,offset: crc1 crc450 [crc2]"
        if not re.match('^\d', line):
            continue
        index, data = line.split(': ')
        track_index, offset = [int(x) for x in index.split(',')]
        hashes = [int(x, 16) for x in data.split()]
        crc1, crc450 = hashes[:2]
        if len(hashes) > 2:
            crc2 = hashes[2]
        else:
            crc2 = None
        track = tracks[track_index]
        if offset == 0:
            # keep the canonical (zero-offset) checksums on the track
            track.crc1 = crc1
            track.crc2 = crc2
            track.crc450 = crc450
        for entry in track.ar_entries:
            if entry.crc in (crc1, crc2):
                if offset not in track.exact_matches:
                    track.exact_matches[offset] = []
                track.exact_matches[offset].append(entry.confidence)
            elif entry.crc450 == crc450 and offset != 0:
                if offset not in track.possible_matches:
                    track.possible_matches[offset] = []
                track.possible_matches[offset].append(entry.confidence)
def fix_offset(sources, offset, fmt='wav', verbose=False):
    """Write offset-corrected copies of *sources* into a new directory.

    Shifts the concatenated audio of all *sources* by *offset* samples
    (sox pad/trim), re-splits the stream into per-source files via the
    ``splitaudio`` helper, renames the outputs to the source basenames,
    and returns the output directory path.

    Fix: the devnull handle is now managed by a ``with`` block, so it is
    closed even if the pipeline raises (the original leaked it on any
    exception before ``devnull.close()``).
    """
    # pick a fresh 'fixedoffset_<offset>[_<n>]' directory next to the sources
    output_dir = None
    i = 0
    while not output_dir:
        suffix = '_%i' % i if i > 0 else ''
        output_dir = join(dirname(sources[0]['path']),
                          'fixedoffset_%i%s' % (offset, suffix))
        if exists(output_dir):
            output_dir = None
            i += 1
    TEMPDIRS.append(output_dir)
    os.mkdir(output_dir)
    # raw 16-bit stereo 44.1kHz signed PCM on stdout
    common_args = ['-t', 'raw', '-b16', '-c2', '-r44100',
                   '-e', 'signed-integer', '-']
    sox_args = [BIN['sox']] + [s['path'] for s in sources] + common_args
    total_samples = sum([s['num_samples'] for s in sources])
    if offset > 0:
        # positive offset: pad at the end, drop samples from the front
        sox_args += ['pad', '0', '%is' % offset,
                     'trim', '%is' % offset, '%is' % total_samples]
    else:
        # negative offset: pad at the front, keep the total length
        sox_args += ['pad', '%is' % -offset, '0',
                     'trim', '0', '%is' % total_samples]
    splitaudio_args = [BIN['splitaudio'], '1' if fmt == 'flac' else '0']
    for s in sources:
        splitaudio_args.append(str(s['num_samples']))
    if verbose:
        print('format: %s' % fmt)
        print('%s | %s' % (' '.join(sox_args), ' '.join(splitaudio_args)))
    with open(os.devnull, 'w') as devnull:
        # sox | splitaudio pipeline, writing into output_dir
        PROCS.append(Popen(sox_args, stdout=PIPE, stderr=devnull))
        PROCS.append(Popen(splitaudio_args, stdin=PROCS[-1].stdout,
                           cwd=output_dir))
        p = PROCS[-1]
        while p.poll() is None:
            utils.show_status('Fixing offset (%i samples)', offset)
        out, err = p.communicate()
    print('', file=sys.stderr, end='\n')
    for pr in PROCS:
        if pr.returncode:
            raise utils.SubprocessError('sox had an error (returned %i)'
                                        % pr.returncode)
    TEMPDIRS.remove(output_dir)
    # rename splitaudio's fixedNNN.<fmt> outputs to the source basenames
    for i, s in enumerate(sources):
        src = join(output_dir, 'fixed%03i.%s' % (i, fmt))
        outpath = join(output_dir,
                       '%s.%s' % (splitext(basename(s['path']))[0], fmt))
        os.rename(src, outpath)
    return output_dir
def fix_offset(sources, offset, fmt='wav', verbose=False):
    """Write offset-corrected copies of *sources* into a new directory.

    Shifts the concatenated audio of all *sources* by *offset* samples
    (sox pad/trim), re-splits the stream into per-source files via the
    ``splitaudio`` helper, renames the outputs to the source basenames,
    and returns the output directory path.
    """
    # pick a fresh 'fixedoffset_<offset>[_<n>]' directory next to the sources
    output_dir = None
    i = 0
    while not output_dir:
        a = '_%i' % i if i > 0 else ''
        output_dir = join(dirname(sources[0]['path']),
                          'fixedoffset_%i%s' % (offset, a))
        if exists(output_dir):
            output_dir = None
            i += 1
    TEMPDIRS.append(output_dir)
    os.mkdir(output_dir)
    # raw 16-bit stereo 44.1kHz signed PCM on stdout
    common_args = ['-t', 'raw', '-b16', '-c2', '-r44100',
                   '-e', 'signed-integer', '-', ]
    sox_args = [BIN['sox']]+[s['path'] for s in sources]+common_args
    total_samples = sum([s['num_samples'] for s in sources])
    if offset > 0:
        # positive offset: pad at the end, drop samples from the front
        sox_args += ['pad', '0', '%is' % offset,
                     'trim', '%is' % offset, '%is' % total_samples]
    else:
        # negative offset: pad at the front, keep the total length
        sox_args += ['pad', '%is' % -offset, '0',
                     'trim', '0', '%is' % total_samples]
    splitaudio_args = [BIN['splitaudio'], '1' if fmt == 'flac' else '0']
    for s in sources:
        splitaudio_args += [str(s['num_samples'])]
    if verbose:
        print('format: %s' % fmt)
        print('%s | %s' % (' '.join(sox_args), ' '.join(splitaudio_args)))
    # NOTE(review): devnull leaks if anything below raises before close()
    devnull = open(os.devnull, 'w')
    # sox | splitaudio pipeline, writing into output_dir
    PROCS.append(Popen(sox_args, stdout=PIPE, stderr=devnull))
    PROCS.append(Popen(splitaudio_args, stdin=PROCS[-1].stdout, cwd=output_dir))
    p = PROCS[-1]
    # busy-wait with a status line until splitaudio finishes
    while p.poll() is None:
        utils.show_status('Fixing offset (%i samples)', offset)
    out, err = p.communicate()
    devnull.close()
    print('', file=sys.stderr, end='\n')
    for pr in PROCS:
        if pr.returncode:
            # NOTE(review): message blames sox even when splitaudio failed
            raise utils.SubprocessError('sox had an error (returned %i)'
                                        % pr.returncode)
    TEMPDIRS.remove(output_dir)
    # rename splitaudio's fixedNNN.<fmt> outputs to the source basenames
    for i, s in enumerate(sources):
        src = join(output_dir, 'fixed%03i.%s' % (i,fmt))
        outpath = join(output_dir,
                       '%s.%s' % (splitext(basename(s['path']))[0], fmt))
        os.rename(src, outpath)
    return output_dir