Example #1
    def tofile(self, path):
        """
        write model parameters to file
        """
        show_status(".. put parameters to file: %s" % path)

        with open(path, 'w') as f:
            lines = []
            # config: K, J
            lines.append(' '.join(str(i) for i in (Config.K, Config.J)))
            # config: THETA, SIGMA
            lines.append(' '.join(str(i) for i in (Config.THETA, Config.SIGMA)))

            # parameters
            # one line for B and C
            lines.append(' '.join(str(i) for i in (self.B, self.C)))
            # one line holding all J values of W
            lines.append(' '.join(str(i) for i in self.W))
            # J lines, one per row W_[j]
            for j in range(Config.J):
                lines.append(' '.join(str(i) for i in self.W_[j]))
            f.write('\n'.join(lines))
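
For reference, a minimal sketch of the file layout tofile produces (the values are made up, assuming K=2 and J=3); fromfile below reads this same layout back:

        # 2 3            <- Config.K, Config.J
        # 0.1 0.05       <- Config.THETA, Config.SIGMA
        # 0.3 -0.2       <- self.B, self.C
        # 0.5 0.1 0.9    <- self.W (J values)
        # 1.0 2.0        <- self.W_[0] (K values per line)
        # 3.0 4.0        <- self.W_[1]
        # 5.0 6.0        <- self.W_[2]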
Example #2
 def input_user_features(self):
     show_status(".. input user features")
     with open(TRAINSET_USER_FEATURE_PATH) as f:
         for line in f:
             ws = line.split()
             userid = int(ws[0])
             features = ws[1:]
             self.user_features[userid] = ' '.join(features)
Example #3
 def input_item_features(self):
     show_status(".. input item features")
     with open(TRAINSET_ITEM_FEATURE_PATH) as f:
         for line in f:
             ws = line.split()
             itemid = int(ws[0])
             features = ws[1:]
             self.item_features[itemid] = ' '.join(features)
Example #4
 def input_positive_trainset(self):
     """
     load only the positive pairs of the trainset (used for validation)
     """
     show_status(".. input positive trainset")
     with open(TRAINSET_PAIR_PATH) as f:
         for line in f:
             uid, p_papers, _n_papers = line.split(',')
             self.trainset.append((uid, p_papers, []))
Example #5
 def split_trainset(self):
     """
     split the dataset into several pieces
     and create a validation dataset
     """
     show_status(".. split dataset to %d pieces" % TRAIN_SET_NUM)
     num = len(self.trainset)
     piece_len = int(num / TRAIN_SET_NUM)
     # piece i covers records [i * piece_len, (i + 1) * piece_len)
     self.trainsets = [self.trainset[i * piece_len:(i + 1) * piece_len]
                       for i in xrange(TRAIN_SET_NUM)]
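
A quick sanity check of the slicing with made-up numbers (a minimal sketch, assuming Python 2 as in the surrounding code):

 # hypothetical: 10 records, TRAIN_SET_NUM = 5 -> 5 pieces of 2
 trainset = range(10)
 piece_len = int(10 / 5)
 pieces = [trainset[i * piece_len:(i + 1) * piece_len] for i in xrange(5)]
 # pieces == [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]]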
Example #6
 def tofile(self):
     """
     write each trainset piece to its own file
     """
     for i in range(TRAIN_SET_NUM):
         show_status("output %dth dataset" % i)
         dataset = self.dataset.get_dataset(i)
         with open(os.path.join(TRAINSET_VIRTUAL_MEM_DIR, str(i) + '.chun'), 'w') as f:
             f.write("\n".join(dataset))
Example #7
 def get_dataset(self, i):
     """
     yield records from the ith trainset file, BATCH_SIZE records at a time
     """
     with open(os.path.join(TRAINSET_VIRTUAL_MEM_DIR, str(i) + '.chun')) as f:
         records = []
         for n, line in enumerate(f):
             records.append(line.rstrip('\n').split(","))
             if n > 0 and n % BATCH_SIZE == 0:
                 show_status(".. given %dth record" % n)
                 yield records
                 records = []
         # flush the final, possibly short, batch
         if records:
             yield records
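
get_dataset is a generator, so callers stream one batch at a time rather than loading a whole piece; a minimal usage sketch, where handle_record is a hypothetical stand-in for the caller's real work:

 # stream piece 0 in batches of BATCH_SIZE records
 for batch in self.dataset.get_dataset(0):
     for record in batch:      # record is a list of comma-separated fields
         handle_record(record)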
Example #8
    def run(self):
        """
        run leave-one-out cross validation over the trainset pieces
        """
        val_maps = []
        for val_idx in xrange(TRAIN_SET_NUM):
            # use the ith dataset as the validation dataset
            self.val_idx = val_idx
            set_indexs = set(range(TRAIN_SET_NUM))
            set_indexs.discard(val_idx)

            self.train(set_indexs)
            val_res = self.validate()
            show_status(".. get map: " + str(val_res))
            val_maps.append(val_res)
        map_res = sum(val_maps) / TRAIN_SET_NUM
        show_status(".. get average map: " + str(map_res))
        self.model.dataspace.tofile(data_path('models', str(map_res)))
Example #9
 def trans_pairs(self):
     """
     each record of the trainset is a line
         (uid, p_papers, n_papers)
     """
     show_status(".. trans_pairs")
     for idx in range(TRAIN_SET_NUM):
         dataset = self.trainsets[idx]
         for d in dataset:
             # parse one record into ids
             (uid, p_papers, n_papers) = (int(d[0]),
                         [int(i) for i in d[1].split()],
                         [int(i) for i in d[2].split()])
             # every (positive, negative) combination becomes a train pair
             pairs = [(uid, p, n) for p in p_papers for n in n_papers]
             random.shuffle(pairs)
             if len(pairs) > MAX_PAIRS_SINGLE_LINE:
                 pairs = pairs[:MAX_PAIRS_SINGLE_LINE]
             # add the pairs to train_pairs
             self.train_pairs.setdefault(idx, []).extend(pairs)
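
A tiny worked example of the pair expansion above, with made-up ids:

 # hypothetical record: uid 7, positives [1, 2], negatives [9]
 uid, p_papers, n_papers = 7, [1, 2], [9]
 pairs = [(uid, p, n) for p in p_papers for n in n_papers]
 # pairs == [(7, 1, 9), (7, 2, 9)]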
Example #10
 def train(self, set_indexs):
     # train on every dataset piece except the held-out one
     for i in list(set_indexs):
         show_status(">>>" * 20)
         show_status(".. training %dth dataset" % i)
         for dataset in self.dataset.get_dataset(i):
             dataset_len = len(self.dataset.train_pairs[i])
             show_status("dataset len: %d" % dataset_len)
             for n, (X1, X2) in enumerate(dataset):
                 print "train %dth line" % n
                 show_status(">> training data", n, dataset_len)
                 X1 = np.array([float(x) for x in X1.split()])
                 X2 = np.array([float(x) for x in X2.split()])
                 self.model.study_line(X1, X2)
Example #11
    def fromfile(self, path):
        show_status(".. load parameters from file : %s" % path)

        def split_line_trans_type(line, _type):
            return [_type(i) for i in line.split()]

        with open(path) as f:
            for no, line in enumerate(f):
                if no == 0:
                    Config.K, Config.J = split_line_trans_type(line, int)
                elif no == 1:
                    Config.THETA, Config.SIGMA = split_line_trans_type(line, float)
                elif no == 2:
                    self.B, self.C = split_line_trans_type(line, float)
                elif no == 3:
                    Ws = split_line_trans_type(line, float)
                    for i in range(Config.J):
                        self.W[i] = Ws[i]
                else:
                    # rows of W_ start on file line 4, so W_[0] is line 4
                    j = no - 4
                    w_j = split_line_trans_type(line, float)
                    for k in range(Config.K):
                        self.W_[j][k] = w_j[k]
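
fromfile mirrors the layout tofile writes, so a round trip is straightforward; a minimal sketch, where Model stands in for whichever class defines these two methods:

        # hypothetical round trip through a parameter file
        model.tofile(data_path('models', 'checkpoint'))
        restored = Model()
        restored.fromfile(data_path('models', 'checkpoint'))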
Example #12
 def input_trainset(self):
     show_status(".. input trainset")
     with open(TRAINSET_PAIR_PATH) as f:
         for line in f:
             uid, p_papers, n_papers = line.split(',')
             self.trainset.append((uid, p_papers, n_papers))
Example #13
 def tofile(self):
     show_status(".. to file: %s" % self.transed_features_path)
     with open(self.transed_features_path, 'w') as f:
         # each entry of self.features is assumed to end with its own newline
         f.write(''.join(self.features))
Example #14
def scan_files(tracks):
    sox_args = ['sox'] + [t.path for t in tracks] + ['-t', 'raw', '-']
    entries_per_track = max([len(t.ar_entries) for t in tracks])
    ckcdda_args = [BIN['ckcdda'], entries_per_track]

    for track in tracks:
        ckcdda_args.append(str(track.num_sectors))
        crcs = [e.crc for e in track.ar_entries]
        crc450s = [e.crc450 for e in track.ar_entries]
        crcs += [0] * (entries_per_track - len(crcs))
        crc450s += [0] * (entries_per_track - len(crc450s))
        ckcdda_args += crcs
        ckcdda_args += crc450s

    ckcdda_args = [str(a) for a in ckcdda_args]

    tmp = TemporaryFile()
    PROCS.append(Popen(sox_args, stdout=PIPE))
    PROCS.append(Popen(ckcdda_args, stdin=PROCS[-1].stdout, stdout=tmp))

    p = PROCS[-1]
    while p.poll() is None:
        utils.show_status('Calculating checksums for %i files', len(tracks))
    utils.finish_status()

    out, err = p.communicate()
    tmp.seek(0)
    out = tmp.read().decode()
    for pr in PROCS:
        if pr.returncode:
            raise SubprocessError('subprocess exited with an error (returned %i)'
                                  % pr.returncode)

    lines = out.split('\n')

    for line in lines:
        if not re.match(r'^\d', line):
            continue

        index, data = line.split(': ')
        track_index, offset = [int(x) for x in index.split(',')]
        hashes = [int(x, 16) for x in data.split()]

        crc1, crc450 = hashes[:2]
        if len(hashes) > 2:
            crc2 = hashes[2]
        else:
            crc2 = None

        track = tracks[track_index]

        if offset == 0:
            track.crc1 = crc1
            track.crc2 = crc2
            track.crc450 = crc450

        for entry in track.ar_entries:
            if entry.crc in (crc1, crc2):
                if offset not in track.exact_matches:
                    track.exact_matches[offset] = []
                track.exact_matches[offset].append(entry.confidence)
            elif entry.crc450 == crc450 and offset != 0:
                if offset not in track.possible_matches:
                    track.possible_matches[offset] = []
                track.possible_matches[offset].append(entry.confidence)
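
The parsing loop assumes ckcdda emits lines of the form "track,offset: crc1 crc450 [crc2]" with the hashes in hex; a minimal sketch of one made-up line going through the same steps:

    # hypothetical ckcdda output line
    line = '3,-12: 1a2b3c4d 0000beef'
    index, data = line.split(': ')
    track_index, offset = [int(x) for x in index.split(',')]  # 3, -12
    hashes = [int(x, 16) for x in data.split()]               # crc1, crc450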
Example #15
def fix_offset(sources, offset, fmt='wav', verbose=False):
    output_dir = None
    i = 0
    while not output_dir:
        a = '_%i' % i if i > 0 else ''
        output_dir = join(dirname(sources[0]['path']),
                          'fixedoffset_%i%s' % (offset, a))
        if exists(output_dir):
            output_dir = None
        i += 1
    TEMPDIRS.append(output_dir)
    os.mkdir(output_dir)
    common_args = ['-t', 'raw',
                   '-b16', '-c2', '-r44100',
                   '-e', 'signed-integer',
                   '-']
    sox_args = [BIN['sox']] + [s['path'] for s in sources] + common_args

    total_samples = sum([s['num_samples'] for s in sources])
    if offset > 0:
        sox_args += [
            'pad', '0',
            '%is' % offset, 'trim',
            '%is' % offset,
            '%is' % total_samples
        ]
    else:
        sox_args += [
            'pad',
            '%is' % -offset, '0', 'trim', '0',
            '%is' % total_samples
        ]

    splitaudio_args = [BIN['splitaudio'], '1' if fmt == 'flac' else '0']

    for s in sources:
        splitaudio_args += [str(s['num_samples'])]

    if verbose:
        print('format: %s' % fmt)
        print('%s | %s' % (' '.join(sox_args), ' '.join(splitaudio_args)))
    devnull = open(os.devnull, 'w')
    PROCS.append(Popen(sox_args, stdout=PIPE, stderr=devnull))
    PROCS.append(Popen(splitaudio_args, stdin=PROCS[-1].stdout,
                       cwd=output_dir))

    p = PROCS[-1]
    while p.poll() is None:
        utils.show_status('Fixing offset (%i samples)', offset)

    out, err = p.communicate()
    devnull.close()
    print('', file=sys.stderr, end='\n')
    for pr in PROCS:
        if pr.returncode:
            raise utils.SubprocessError('subprocess exited with an error (returned %i)'
                                        % pr.returncode)

    TEMPDIRS.remove(output_dir)
    for i, s in enumerate(sources):
        src = join(output_dir, 'fixed%03i.%s' % (i, fmt))
        outpath = join(output_dir,
                       '%s.%s' % (splitext(basename(s['path']))[0], fmt))
        os.rename(src, outpath)

    return output_dir