def test_two_sentence_language(self):
    """A sample drawn from a two-sentence corpus must be one of its sentences."""
    corpus = ['a b c d.', 'e f g.']
    n = 2
    sentence_sampler = utilities.SentenceSamplerUtility(corpus, n)
    drawn = sentence_sampler.get_sample()
    # Echo the drawn sentence so it shows up in the captured test output.
    print(drawn)
    assert drawn in corpus
def build_samples(class_value, y_value, n, training_file, generate_html):
    """Sample ``n`` files of one class from the train csv and collect their data.

    :param class_value: Name of class (in this case unsponsored and sponsored)
    :param y_value: The y value of the class (in this case 0 - unsponsored
        and 1 - sponsored)
    :param n: Number of sample files
    :param training_file: Name of the train csv
    :param generate_html: Boolean value to specify whether we want to generate
        the HTML file for the sampled html text files
    :return: A list of file names that get sampled. This is important because
        not all files sampled are available in the data.
    """
    # Restrict the train csv to the rows carrying the requested label.
    rows_for_class = sampler.get_rows_of_class(training_file, y_value)
    # Draw n of those rows.
    chosen = sampler.get_sample(rows_for_class, n)
    collected = sampler.collect_sample_data(
        chosen, class_value, generate_html=generate_html)
    return collected
def train(train_file, parameters):
    """Train a model by streaming ``train_file`` batch by batch.

    The file is read in three phases, each through a fresh ``SvmReader``:

    1. every batch of labels is fed to the sampler (``sample.update``);
    2. ``parameters["niter"]`` passes of ``model.update`` over all batches,
       using the indices returned by ``sample.sample``;
    3. one pass that feeds the model outputs (``model.ff``) and the labels to
       the threshold selector ``model.thrsel``.

    Every reader is closed when its pass ends, including when the pass
    raises.

    :param train_file: training file handed to ``SvmReader``
    :param parameters: mapping read for ``"batch"`` and ``"niter"``; it is
        also passed to ``Model`` and ``sampler.get_sample``
    :return: the trained ``Model`` instance
    """
    model = Model(parameters)
    batch = parameters["batch"]
    niter = parameters["niter"]
    sample = sampler.get_sample(parameters)
    logger = logging.getLogger(Logger.project_name)

    # Phase 1: initialise the sampler from the labels of every batch.
    train_reader = SvmReader(train_file, batch)
    try:
        has_next = True
        while has_next:
            x, y, has_next = train_reader.read()
            sample.update(y)
    finally:
        train_reader.close()

    # Phase 2: weight updates, one full pass over the file per iteration.
    for iter1 in xrange(niter):
        train_reader = SvmReader(train_file, batch)
        try:
            has_next = True
            while has_next:
                x, y, has_next = train_reader.read()
                idx = sample.sample(y)
                model.update(x, y, idx)
        finally:
            train_reader.close()
        logger.info("The %d-th iteration completes" % (iter1 + 1))

    # Phase 3: threshold tuning.  Like the two phases above, the batch that
    # is returned together with has_next == False is still consumed; testing
    # has_next before using the batch would silently drop it (and skip tuning
    # altogether when the whole file fits into a single batch).
    train_reader = SvmReader(train_file, batch)
    try:
        has_next = True
        while has_next:
            x, y, has_next = train_reader.read()
            p = model.ff(x)
            model.thrsel.update(p, y)
    finally:
        train_reader.close()

    return model
def test_utilities_sampler_construction(self):
    """A sampler built over a single sentence hands back exactly that sentence."""
    corpus = ['a b c d e f g.']
    n = 2
    built = utilities.SentenceSamplerUtility(corpus, n)
    drawn = built.get_sample()
    assert drawn == 'a b c d e f g.'
def train_mem(train_file, parameters):
    """Train a model with the whole training set loaded at once.

    The file is read in one go with ``SvmReader.full_read``.  The label
    matrix ``y`` initialises the sampler, then ``parameters["niter"]`` passes
    of ``model.update`` are run over consecutive row slices of
    ``parameters["batch"]`` rows.  Afterwards the threshold selector
    ``model.thrsel`` is updated from ``model.ff`` on at most the first 1000
    row slices.

    The logged training time starts after data loading and sampler
    initialisation, and includes the threshold tuning.

    :param train_file: training file handed to ``SvmReader``
    :param parameters: mapping read for ``"batch"`` and ``"niter"``; it is
        also passed to ``Model`` and ``sampler.get_sample``
    :return: the trained ``Model`` instance
    """
    model = Model(parameters)
    batch = parameters["batch"]
    niter = parameters["niter"]
    sample = sampler.get_sample(parameters)
    logger = logging.getLogger(Logger.project_name)
    logger.info("Model initialization done")

    # NOTE(review): this reader is never explicitly closed here; confirm
    # whether full_read() releases the underlying file itself.
    train_reader = SvmReader(train_file)
    x, y = train_reader.full_read()
    num, _ = y.shape
    logger.info("Training data loading done")

    sample.update(y)
    logger.info("Sampling initialization done")

    def _row_spans():
        # Yield (first_row, stop_row) for consecutive mini-batches; the stop
        # of the last span is clipped to the number of rows.
        first = 0
        while first < num:
            yield first, min(first + batch, num)
            first += batch

    start_time = time.time()

    # Mini-batch weight updates, niter passes over the data.
    for iter1 in xrange(niter):
        for first, stop in _row_spans():
            batch_x = x[first:stop, :]
            batch_y = y[first:stop, :]
            batch_i = sample.sample(batch_y)
            model.update(batch_x, batch_y, batch_i)
        logger.info("The %d-th iteration completes" % (iter1 + 1))

    # Threshold tuning, capped at 1000 mini-batches.
    max_tuning_batches = 1000
    for seen, (first, stop) in enumerate(_row_spans()):
        if seen >= max_tuning_batches:
            break
        batch_x = x[first:stop, :]
        batch_y = y[first:stop, :]
        batch_p = model.ff(batch_x)
        model.thrsel.update(batch_p, batch_y)
    logger.info("The threshold tuning completes")

    elapsed = time.time() - start_time
    logger.info("The training time is %f" % elapsed)
    return model