def psort(path, parallel=-1, field_seperator=' ', key=1, tmp_dir='/tmp/', buffer_mb=1024, output=None):
    # TODO: We need a better way to handle OS/platform compatibility.
    # A compatibility-checking routine is needed for this method.
    commands = ['sort', '-n', '-s']
    if parallel == -1:
        parallel = psutil.cpu_count()
    if parallel > 0:
        commands.extend(['--parallel', parallel])
    if not output:
        output = path
    commands.extend(['-t', '{}'.format(field_seperator)])
    commands.extend(['-k', key])
    commands.extend(['-T', tmp_dir])
    commands.extend(['-S', '%sM' % buffer_mb])
    commands.extend(['-o', output])
    commands.append(path)
    try:
        subprocess.check_output(map(str, commands), stderr=subprocess.STDOUT, env={'LC_ALL': 'C'})
    except Exception as e:
        log.get_logger().error('Unexpected error: %s for %s' % (str(e), ' '.join(list(map(str, commands)))))
        raise
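# Usage sketch (not part of the module): sorting a tab-separated ratings file
# in place, first by its fourth column and then by its first column, the way
# the dataset-preparation code below calls aux.psort. Because sort runs with
# -n -s (numeric, stable), the second call groups rows by user while keeping
# the timestamp order from the first call within each user.
psort('./ext/ml-100k/u.data', field_seperator='\t', key=4)
psort('./ext/ml-100k/u.data', field_seperator='\t', key=1)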
def __init__(self, opt_path=None, *args, **kwargs):
    Algo.__init__(self, *args, **kwargs)
    BPRMFOption.__init__(self, *args, **kwargs)
    Evaluable.__init__(self, *args, **kwargs)
    Serializable.__init__(self, *args, **kwargs)
    Optimizable.__init__(self, *args, **kwargs)
    if opt_path is None:
        opt_path = BPRMFOption().get_default_option()
    self.logger = log.get_logger('BPRMF')
    self.opt, self.opt_path = self.get_option(opt_path)
    self.obj = CyBPRMF()
    assert self.obj.init(bytes(self.opt_path, 'utf-8')), \
        'cannot parse option file: %s' % opt_path
    self.data = None
    data = kwargs.get('data')
    data_opt = self.opt.get('data_opt')
    data_opt = kwargs.get('data_opt', data_opt)
    if data_opt:
        self.data = buffalo.data.load(data_opt)
        self.data.create()
    elif isinstance(data, Data):
        self.data = data
    self.logger.info('BPRMF(%s)' % json.dumps(self.opt, indent=2))
    if self.data:
        self.logger.info(self.data.show_info())
        assert self.data.data_type in ['matrix']
def __init__(self, opt_path=None, *args, **kwargs):
    Algo.__init__(self, *args, **kwargs)
    ALSOption.__init__(self, *args, **kwargs)
    Evaluable.__init__(self, *args, **kwargs)
    Serializable.__init__(self, *args, **kwargs)
    Optimizable.__init__(self, *args, **kwargs)
    if opt_path is None:
        opt_path = ALSOption().get_default_option()
    self.logger = log.get_logger('ALS')
    self.opt, self.opt_path = self.get_option(opt_path)
    if self.opt.accelerator and not inited_CUALS:
        self.logger.error("ImportError CuALS, no cuda library exists.")
        raise RuntimeError()
    self.obj = CuALS() if self.opt.accelerator else CyALS()
    assert self.obj.init(bytes(self.opt_path, 'utf-8')), \
        'cannot parse option file: %s' % opt_path
    self.data = None
    data = kwargs.get('data')
    data_opt = self.opt.get('data_opt')
    data_opt = kwargs.get('data_opt', data_opt)
    if data_opt:
        self.data = buffalo.data.load(data_opt)
        self.data.create()
    elif isinstance(data, Data):
        self.data = data
    self.logger.info('ALS(%s)' % json.dumps(self.opt, indent=2))
    if self.data:
        self.logger.info(self.data.show_info())
        assert self.data.data_type in ['matrix']
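# Usage sketch, not part of the class definition. Option and data handling
# follow the constructor above; the import paths, MatrixMarketOptions, and the
# initialize()/train() calls are assumptions based on the public buffalo API
# rather than anything shown in this snippet.
from buffalo.algo.als import ALS
from buffalo.algo.options import ALSOption
from buffalo.data.mm import MatrixMarketOptions

opt = ALSOption().get_default_option()
data_opt = MatrixMarketOptions().get_default_option()
data_opt.input.main = './ext/ml-100k/main'   # MatrixMarket file produced by prepare_dataset()
als = ALS(opt, data_opt=data_opt)            # data_opt is picked up from kwargs, as in the constructor
als.initialize()
als.train()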
def __init__(self, opt_path=None, *args, **kwargs):
    Algo.__init__(self, *args, **kwargs)
    WARPOption.__init__(self, *args, **kwargs)
    Evaluable.__init__(self, *args, **kwargs)
    Serializable.__init__(self, *args, **kwargs)
    Optimizable.__init__(self, *args, **kwargs)
    if opt_path is None:
        opt_path = WARPOption().get_default_option()
    self.logger = log.get_logger('WARP')
    self.opt, self.opt_path = self.get_option(opt_path)
    # TODO: GPU implementation
    if self.opt.accelerator is True:
        raise NotImplementedError("GPU version WARP is not implemented yet")
    self.obj = CyWARP()
    assert self.obj.init(bytes(self.opt_path, 'utf-8')), \
        'cannot parse option file: %s' % opt_path
    self.data = None
    data = kwargs.get('data')
    data_opt = self.opt.get('data_opt')
    data_opt = kwargs.get('data_opt', data_opt)
    if data_opt:
        self.data = buffalo.data.load(data_opt)
        self.data.create()
    elif isinstance(data, Data):
        self.data = data
    self.logger.info('WARP(%s)' % json.dumps(self.opt, indent=2))
    if self.data:
        self.logger.info(self.data.show_info())
        assert self.data.data_type in ['matrix']
def __init__(self, opt_path=None, *args, **kwargs):
    Algo.__init__(self, *args, **kwargs)
    W2VOption.__init__(self, *args, **kwargs)
    Evaluable.__init__(self, *args, **kwargs)
    Serializable.__init__(self, *args, **kwargs)
    Optimizable.__init__(self, *args, **kwargs)
    if opt_path is None:
        opt_path = W2VOption().get_default_option()
    self.logger = log.get_logger('W2V')
    self.opt, self.opt_path = self.get_option(opt_path)
    self.obj = CyW2V()
    assert self.obj.init(bytes(self.opt_path, 'utf-8')), \
        'cannot parse option file: %s' % opt_path
    self.data = None
    data = kwargs.get('data')
    data_opt = self.opt.get('data_opt')
    data_opt = kwargs.get('data_opt', data_opt)
    if data_opt:
        self.data = buffalo.data.load(data_opt)
        assert self.data.data_type == 'stream'
        self.data.create()
    elif isinstance(data, Data):
        self.data = data
    self.logger.info('W2V(%s)' % json.dumps(self.opt, indent=2))
    if self.data:
        self.logger.info(self.data.show_info())
        assert self.data.data_type in ['stream']
    self._vocab = aux.Option({'size': 0,
                              'index': None,
                              'inv_index': None,
                              'scale': None,
                              'dist': None,
                              'total_word_count': 0})
def __init__(self, opt, *args, **kwargs):
    super().__init__(opt, *args, **kwargs)
    self.name = 'MatrixMarket'
    self.logger = log.get_logger('MatrixMarket')
    if isinstance(self.value_prepro, (prepro.SPPMI)):
        raise RuntimeError(f'{self.opt.data.value_prepro.name} does not support MatrixMarket')
    self.data_type = 'matrix'
def get_main_path(self):
    main = self.opt.input.main
    if isinstance(main, (str,)):
        return main
    if hasattr(self, 'temp_main'):
        return self.temp_main
    log.get_logger('MatrixMarketDataReader').debug('creating temporary matrix-market data from numpy-kind array')
    tmp_path = aux.get_temporary_file(self.opt.data.tmp_dir)
    with open(tmp_path, 'wb') as fout:
        if isinstance(main, (np.ndarray,)) and main.ndim == 2:
            main = scipy.sparse.csr_matrix(main)
        if scipy.sparse.issparse(main):
            scipy.io.mmwrite(fout, main)
            self.temp_main = tmp_path
            return tmp_path
    raise RuntimeError(f'Unexpected data type for MatrixMarketOption.input.main field: {type(main)}')
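# Standalone sketch of the conversion get_main_path performs when input.main is
# an in-memory array rather than a path: a 2-D numpy array (or scipy sparse
# matrix) is written out as a temporary MatrixMarket file. The output path here
# is a hypothetical placeholder; only numpy/scipy calls are used.
import numpy as np
import scipy.io
import scipy.sparse

ratings = np.array([[5, 0, 3],
                    [0, 1, 0]])
sparse = scipy.sparse.csr_matrix(ratings)      # dense ndarray -> CSR, as in the method
with open('/tmp/example_main.mtx', 'wb') as fout:
    scipy.io.mmwrite(fout, sparse)             # same writer call used by get_main_path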
def __init__(self, *args, **kwargs):
    Algo.__init__(self, *args, **kwargs)
    Optimizable.__init__(self, *args, **kwargs)
    TensorboardExtention.__init__(self, *args, **kwargs)
    self.logger = log.get_logger('MockAlgo')
    option = ALSOption().get_default_option()
    optimize_option = ALSOption().get_default_optimize_option()
    optimize_option.start_with_default_parameters = False
    option.optimize = optimize_option
    option.model_path = 'hello.world.bin'
    self.opt = option
    self._optimize_loss = {'loss': 987654321.0}
def __init__(self, opt_path=None, *args, **kwargs):
    Algo.__init__(self, *args, **kwargs)
    CFROption.__init__(self, *args, **kwargs)
    Evaluable.__init__(self, *args, **kwargs)
    Serializable.__init__(self, *args, **kwargs)
    Optimizable.__init__(self, *args, **kwargs)
    if opt_path is None:
        opt_path = CFROption().get_default_option()
    self.logger = log.get_logger('CFR')
    # put options into the cython class with type assertion;
    # see the comments in options.py for a description of each parameter
    self.opt, self.opt_path = self.get_option(opt_path)
    self.obj = CyCFR()
    # check the validity of the option
    self.is_valid_option(self.opt)
    assert self.obj.init(self.opt_path.encode("utf8")), \
        "putting parameter to cython object failed"
    # ensure the embedding matrix is initialized to prevent a segmentation fault
    self.is_initialized = False
    self.data = None
    data = kwargs.get('data')
    data_opt = self.opt.get('data_opt')
    data_opt = kwargs.get('data_opt', data_opt)
    if data_opt:
        assert data_opt.data.internal_data_type == "matrix", \
            f"internal data type is {data_opt.data.internal_data_type}, not matrix"
        self.data = buffalo.data.load(data_opt)
        assert self.data.data_type == 'stream'
        self.data.create()
    elif isinstance(data, Data):
        self.data = data
    self.logger.info('CFR ({})'.format(json.dumps(self.opt, indent=2)))
    if self.data:
        self.logger.info(self.data.show_info())
        assert self.data.data_type in ['stream']
def __init__(self):
    self.logger = log.get_logger('ALS')
def __init__(self):
    self.logger = log.get_logger('BufferedData')
import buffalo.data
from buffalo.misc import aux, log
from buffalo.data.base import Data
from buffalo.algo._als import CyALS
from buffalo.evaluate import Evaluable
from buffalo.algo.options import ALSOption
from buffalo.algo.optimize import Optimizable
from buffalo.algo.tensorflow._als import TFALS
from buffalo.data.buffered_data import BufferedDataMatrix
from buffalo.algo.base import Algo, Serializable, TensorboardExtention

try:
    from buffalo.algo.cuda._als import CyALS as CuALS
except Exception as e:
    log.get_logger("system").error(
        f"ImportError CuALS, no cuda library exists. error message: {e}")
    CuALS = lambda x: ()


class ALS(Algo, ALSOption, Evaluable, Serializable, Optimizable, TensorboardExtention):
    """Python implementation for C-ALS.

    Implementation of Collaborative Filtering for Implicit Feedback datasets.

    Reference: http://yifanhu.net/PUB/cf.pdf"""
    def __init__(self, opt_path=None, *args, **kwargs):
        Algo.__init__(self, *args, **kwargs)
        ALSOption.__init__(self, *args, **kwargs)
        Evaluable.__init__(self, *args, **kwargs)
        Serializable.__init__(self, *args, **kwargs)
def __init__(self, opt, *args, **kwargs):
    super(Stream, self).__init__(opt, *args, **kwargs)
    self.name = 'Stream'
    self.logger = log.get_logger('Stream')
    self.data_type = 'stream'
def prepare_dataset():
    logger = log.get_logger()
    if not os.path.isdir('ext/ml-100k/'):
        logger.warn('Cannot find the ./ext/ml-100k directory')
    else:
        if not os.path.isfile('./ext/ml-100k/main'):
            logger.info('preprocessing for matrix market format of ml-100k...')
            in_path = "./ext/ml-100k/u.data"
            stream_out_path = "./ext/ml-100k/stream"
            aux.psort(in_path, field_seperator="\t", key=4)
            aux.psort(in_path, field_seperator="\t", key=1)
            with open('./ext/ml-100k/main', 'w') as fout:
                fout.write('%%MatrixMarket matrix coordinate integer general\n%\n%\n943 1682 80000\n')
                with open(in_path) as fin:
                    for line in fin:
                        u, i, v, ts = line.strip().split('\t')
                        fout.write('%s %s %s\n' % (u, i, v))
            iids = []
            with open('./ext/ml-100k/iid', 'w') as fout:
                with open('./ext/ml-100k/u.item', encoding='ISO-8859-1') as fin:
                    iids = [line.strip().split('|')[1].replace(' ', '_') for line in fin]
                    iids = [f"{idx}.{key}" for idx, key in enumerate(iids)]
                    fout.write("\n".join(iids))
            with open('./ext/ml-100k/uid', 'w') as fout:
                for line in open('./ext/ml-100k/u.user'):
                    userid = line.strip().split('|')[0]
                    fout.write('%s\n' % userid)
            logger.info('preprocessing for stream format of ml-100k...')
            probe, bag = None, []
            with open(in_path, "r") as fin, open(stream_out_path, "w") as fout:
                for line in fin:
                    u, i, v, ts = line.strip().split("\t")
                    if not probe:
                        probe = u
                    elif probe != u:
                        fout.write(" ".join(bag) + "\n")
                        probe, bag = u, []
                    bag.append(iids[int(i) - 1])
                if bag:
                    fout.write(" ".join(bag))
    if not os.path.isdir('ext/ml-20m'):
        logger.warn('Cannot find the ./ext/ml-20m directory')
    else:
        if not os.path.isfile('./ext/ml-20m/main'):
            logger.info('preprocessing for matrix market format of ml-20m...')
            uids, iids = {}, {}
            in_path = "./ext/ml-20m/ratings.csv"
            aux.psort(in_path, field_seperator=",", key=4)
            aux.psort(in_path, field_seperator=",", key=1)
            with open(in_path) as fin:
                fin.readline()
                for line in fin:
                    uid = line.split(',')[0]
                    if uid not in uids:
                        uids[uid] = len(uids) + 1
            with open('./ext/ml-20m/uid', 'w') as fout:
                for uid, _ in sorted(uids.items(), key=lambda x: x[1]):
                    fout.write('%s\n' % uid)
            with open('./ext/ml-20m/movies.csv') as fin:
                fin.readline()
                for line in fin:
                    iid = line.split(',')[0]
                    iids[iid] = len(iids) + 1
            with open('./ext/ml-20m/iid', 'w') as fout:
                for iid, _ in sorted(iids.items(), key=lambda x: x[1]):
                    fout.write('%s\n' % iid)
            with open('./ext/ml-20m/main', 'w') as fout:
                fout.write('%%MatrixMarket matrix coordinate real general\n%\n%\n138493 27278 20000263\n')
                with open('./ext/ml-20m/ratings.csv') as fin:
                    fin.readline()
                    for line in fin:
                        uid, iid, r, *_ = line.split(',')
                        uid, iid = uids[uid], iids[iid]
                        fout.write(f'{uid} {iid} {r}\n')
            logger.info('preprocessing for stream format of ml-20m...')
            probe, bag = None, []
            stream_out_path = "./ext/ml-20m/stream"
            with open(in_path, "r") as fin, open(stream_out_path, "w") as fout:
                fin.readline()
                for line in fin:
                    u, i, v, ts = line.strip().split(",")
                    if not probe:
                        probe = u
                    elif probe != u:
                        fout.write(" ".join(bag) + "\n")
                        probe, bag = u, []
                    bag.append(i)
                if bag:
                    fout.write(" ".join(bag))
    if not os.path.isdir('ext/text8'):
        logger.warn('Cannot find the text8 directory')
    else:
        if not os.path.isfile('./ext/text8/main'):
            with open('./ext/text8/text8') as fin:
                words = fin.readline().strip().split()
            with open('./ext/text8/main', 'w') as fout:
                for i in range(0, len(words), 1000):
                    fout.write('%s\n' % ' '.join(words[i:i + 1000]))
    if not os.path.isdir('ext/brunch'):
        logger.warn('Cannot find the brunch directory')
    else:
        if not os.path.isfile('./ext/brunch/main'):
            os.makedirs('./ext/brunch/tmp', exist_ok=True)
            to_dir = './ext/brunch/tmp'
            logger.info('dividing...')
            num_chunks = 30
            fouts = {i: open(os.path.join(to_dir, str(i)), 'w') for i in range(num_chunks)}
            for path, fname in iterate_brunch_data_files('./ext/brunch'):
                for line in open(path):
                    uid = line.strip().split()[0]
                    fid = hash(uid) % num_chunks
                    fouts[fid].write(line)
            for val in fouts.values():
                val.close()
            logger.info('merging...')
            with open('./ext/brunch/main', 'w') as fout, \
                    open('./ext/brunch/uid', 'w') as fout_uid:
                for fid in fouts.keys():
                    seens = {}
                    chunk_path = os.path.join(to_dir, str(fid))
                    for line in open(chunk_path):
                        line = line.strip().split()
                        uid, seen = line[0], line[1:]
                        seens.setdefault(uid, []).extend(seen)
                    for uid, seen in seens.items():
                        fout.write(' '.join(seen) + '\n')
                        fout_uid.write(uid + '\n')
            for fid in fouts.keys():
                chunk_path = os.path.join(to_dir, str(fid))
                os.remove(chunk_path)
            make_mm_from_stream('./ext/brunch/', './ext/brunch/mm')
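# Illustration only, not part of prepare_dataset(): the "stream" files written
# above contain one space-separated line of item tokens per user, produced by
# scanning rows that psort has already grouped by user id. A minimal standalone
# sketch of that grouping, using made-up (user, item) rows:
rows = [("1", "Toy_Story"), ("1", "GoldenEye"), ("2", "Four_Rooms")]
lines, probe, bag = [], None, []
for u, item in rows:
    if probe is None:
        probe = u
    elif probe != u:
        # a new user starts: flush the previous user's bag as one line
        lines.append(" ".join(bag))
        probe, bag = u, []
    bag.append(item)
if bag:
    lines.append(" ".join(bag))
# lines == ['Toy_Story GoldenEye', 'Four_Rooms']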
def __init__(self, opt, name="tf_als"):
    self.logger = log.get_logger("tf-als")
    self.opt = opt
    self.name = name
    self.sess = tf.Session()
    self.graph = tf.get_default_graph()