def test_two_sentence_language(self):
     """A sample drawn from a two-sequence sampler must be one of the inputs."""
     corpus = ['a b c d.', 'e f g.']
     sentence_sampler = utilities.SentenceSamplerUtility(corpus, 2)
     drawn = sentence_sampler.get_sample()
     # Echo the drawn sample so it shows up in the test output.
     print(drawn)
     assert drawn in corpus
# Example #2
# 0
def build_samples(class_value, y_value, n, training_file, generate_html):
    '''
    Sample files of one class from the training csv and collect their data.

    The rows for ``y_value`` are looked up with ``sampler.get_rows_of_class``,
    ``n`` of them are drawn with ``sampler.get_sample`` and the draw is handed
    to ``sampler.collect_sample_data``.

    :param class_value: Name of class (in this case unsponsored and sponsored)
    :param y_value: The y value of the class (in this case 0 - unsponsored and 1 - sponsored)
    :param n: Number of sample files
    :param training_file: Name of the train csv
    :param generate_html: Boolean value to specify whether we want to generate the HTML file for the sampled html text files
    :return: The result of ``sampler.collect_sample_data``. Per the original notes this is
        a list of the file names that got sampled, which matters because not all
        sampled files are available in the data.
    '''
    rows_for_class = sampler.get_rows_of_class(training_file, y_value)
    drawn_files = sampler.get_sample(rows_for_class, n)
    collected = sampler.collect_sample_data(
        drawn_files, class_value, generate_html=generate_html)
    return collected
def build_samples(class_value, y_value, n, training_file, generate_html):
    '''
    Draw ``n`` files of the class identified by ``y_value`` and collect their data.

    :param class_value: Name of class (in this case unsponsored and sponsored)
    :param y_value: The y value of the class (in this case 0 - unsponsored and 1 - sponsored)
    :param n: Number of sample files
    :param training_file: Name of the train csv
    :param generate_html: Boolean value to specify whether we want to generate the HTML file for the sampled html text files
    :return: Whatever ``sampler.collect_sample_data`` returns. The original notes describe it
        as the list of file names that got sampled (not all sampled files are
        available in the data).
    '''
    # Pipeline: rows of the class -> sample of n -> collected sample data.
    return sampler.collect_sample_data(
        sampler.get_sample(
            sampler.get_rows_of_class(training_file, y_value),
            n,
        ),
        class_value,
        generate_html=generate_html,
    )
def train(train_file, parameters):
    """Train a model by streaming ``train_file`` in batches.

    The file is read in three phases, each with a fresh ``SvmReader``:

    1. every batch of labels is fed to ``sample.update`` to initialise the
       sampler;
    2. for each of ``niter`` iterations, every batch is passed to
       ``sample.sample`` and then to ``model.update``;
    3. batches are scored with ``model.ff`` and fed to
       ``model.thrsel.update`` to tune the threshold.

    Every reader is closed once its phase is finished, including when the
    phase raises.

    :param train_file: Passed to ``SvmReader`` as the training data source.
    :param parameters: Mapping that provides ``"batch"`` and ``"niter"``; it
        is also handed to ``Model`` and ``sampler.get_sample``.
    :return: The trained ``Model`` instance.
    """
    model = Model(parameters)
    batch = parameters["batch"]
    niter = parameters["niter"]
    sample = sampler.get_sample(parameters)
    logger = logging.getLogger(Logger.project_name)

    # Phase 1: initialise the sampler from the labels of every batch.
    train_reader = SvmReader(train_file, batch)
    try:
        has_next = True
        while has_next:
            x, y, has_next = train_reader.read()
            sample.update(y)
    finally:
        train_reader.close()

    # Phase 2: weight updates, one full pass over the file per iteration.
    # ``range`` (not ``xrange``) so the loop works on Python 2 and Python 3.
    for iter1 in range(niter):
        train_reader = SvmReader(train_file, batch)
        try:
            has_next = True
            while has_next:
                x, y, has_next = train_reader.read()
                idx = sample.sample(y)
                model.update(x, y, idx)
        finally:
            train_reader.close()

        logger.info("The %d-th iteration completes", iter1 + 1)

    # Phase 3: threshold tuning.
    # NOTE(review): unlike the loops above, the batch returned together with
    # has_next == False is not used here -- confirm against SvmReader.read()
    # whether skipping that final batch is intended.
    train_reader = SvmReader(train_file, batch)
    try:
        x, y, has_next = train_reader.read()
        while has_next:
            p = model.ff(x)
            model.thrsel.update(p, y)
            x, y, has_next = train_reader.read()
    finally:
        train_reader.close()

    return model
 def test_utilities_sampler_construction(self):
     """A sampler built from a single sequence returns that sequence."""
     only_sequence = 'a b c d e f g.'
     sentence_sampler = utilities.SentenceSamplerUtility([only_sequence], 2)
     drawn = sentence_sampler.get_sample()
     assert drawn == only_sequence
def train_mem(train_file, parameters):
    """Train a model with the whole training set held in memory.

    The data is read once with ``SvmReader.full_read`` and then sliced
    row-wise into batches of ``parameters["batch"]`` rows. After ``niter``
    passes of weight updates, the threshold is tuned on at most the first
    1000 batches. Progress and the elapsed time (weight updates plus
    threshold tuning) are written to the project logger.

    :param train_file: Passed to ``SvmReader`` as the training data source.
    :param parameters: Mapping that provides ``"batch"`` and ``"niter"``; it
        is also handed to ``Model`` and ``sampler.get_sample``.
    :return: The trained ``Model`` instance.
    """
    model = Model(parameters)
    batch = parameters["batch"]
    niter = parameters["niter"]
    sample = sampler.get_sample(parameters)
    logger = logging.getLogger(Logger.project_name)
    logger.info("Model initialization done")

    # NOTE(review): the reader is never closed here -- confirm whether
    # full_read() releases the underlying file itself.
    train_reader = SvmReader(train_file)
    x, y = train_reader.full_read()
    num, _ = y.shape  # y is two-dimensional; its first axis is sliced per batch
    logger.info("Training data loading done")

    # NOTE(review): an earlier, commented-out revision handled ``sample``
    # being None; this code assumes a sampler object is always returned.
    sample.update(y)
    logger.info("Sampling initialization done")

    start_time = time.time()
    # ``range`` (not ``xrange``) so the loop works on Python 2 and Python 3.
    for iter1 in range(niter):
        start = 0
        while start < num:
            end = min(start + batch, num)  # the last batch may be short

            batch_x = x[start:end, :]
            batch_y = y[start:end, :]
            batch_i = sample.sample(batch_y)
            model.update(batch_x, batch_y, batch_i)

            start += batch

        logger.info("The %d-th iteration completes", iter1 + 1)

    # Threshold tuning on a bounded number of leading batches.
    max_tuning_batches = 1000
    tuned_batches = 0
    start = 0
    while start < num and tuned_batches < max_tuning_batches:
        end = min(start + batch, num)
        batch_x = x[start:end, :]
        batch_y = y[start:end, :]
        batch_p = model.ff(batch_x)
        model.thrsel.update(batch_p, batch_y)
        start += batch
        tuned_batches += 1

    logger.info("The threshold tuning completes")
    end_time = time.time()
    logger.info("The training time is %f", end_time - start_time)

    return model