Example #1
def execute_generation(dist_holder,
                       options,
                       logger,
                       cl_flags,
                       schema_file=None,
                       queries_file=None,
                       results_db=None):
    """
    The heavy lifting: generate the queries, generate the rows of data,
    process the results, and write the queries and ground truth to the
    database. Note: factored out from set_up_and_execute_run, below, so that
    it can be profiled separately from learning.
    """
    query_seed = cl_flags.query_seed

    # generate queries
    (querysets, aggregators) = generate_queries(schema_file, logger,
                                                dist_holder, options,
                                                query_seed)
    for a in aggregators:
        a.set_process_limit(cl_flags.num_processes)

    # generate data
    agg_results = generate_rows(options, aggregators, logger, dist_holder)

    # process results - needs to be reseeded because it relies on the
    # overall random state to refine queries
    seed = int(query_seed)
    spar_random.seed(seed)
    process_results(querysets, agg_results, results_db, queries_file,
                    cl_flags.num_processes)

    return
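The explicit reseed before process_results is what makes query refinement repeatable regardless of how much randomness the earlier stages consumed. Below is a minimal, self-contained sketch of that pattern, using the standard library random module as a stand-in for spar_random (assumed to wrap it); the stage functions and QUERY_SEED value are hypothetical.

import random

QUERY_SEED = 12345  # stand-in for cl_flags.query_seed

def generate_stage(seed):
    # earlier stage consumes an unpredictable amount of randomness
    random.seed(seed)
    return [random.randint(0, 99) for _ in range(5)]

def process_stage(seed, artifacts):
    # reseed before processing so refinement is repeatable no matter
    # how much randomness the generation stage used up
    random.seed(seed)
    return [a + random.randint(0, 9) for a in artifacts]

artifacts = generate_stage(QUERY_SEED)
assert process_stage(QUERY_SEED, artifacts) == process_stage(QUERY_SEED, artifacts)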
Example #2
    def setUp(self):
        self.seed = int(time.time())
        self.seed_msg = "Random seed used for this test: %s" % self.seed
        self.longMessage = True
        spar_random.seed(self.seed)

        self.dist = bespoke_distributions.SSNDistribution()
Example #3
 def setUp(self):
     self.seed = int(time.time())
     self.seed_msg = "Random seed used for this test: %s" % self.seed
     self.longMessage = True
     spar_random.seed(self.seed)
     # set up initialization values
     sub_cat = 'word'
     f = s.StringIO('''Buck had accepted the rope with quiet dignity. To be sure, it was an
             unwonted performance: but he had learned to trust in men he knew, and to 
             give them credit for a wisdom that outreached his own. But when the ends 
             of the ropes were placed in the strangers hands, he growled menacingly. 
             He had merely intimated his displeasure, in his pride believing that to 
             intimate was to command. But to his surprise the rope tightened around 
             his neck, shutting off his breath. In quick rage he sprang at the man, 
             who met him halfway, grappled him close by the throat, and with a deft 
             twist threw him over on his back. Then the rope tightened mercilessly, 
             while Buck struggled in a fury, his tongue lolling out of his mouth and 
             his great chest panting futilely. Never in all his life had he been so 
             vilely treated, and never in all his life had he been so angry. But his 
             strength ebbed, his eyes glazed, and he knew nothing when the train was 
             flagged and the two men threw him into the baggage car.''')
     self._kw_dist = text_generator.TextGenerator((f,))
     fields = [sv.VARS.NOTES3]
     dists = [self._kw_dist]
     other_fields = ['no_queries', 'rss','keyword_len','type']
     other_cols = [[3, 60, 4, 'word'], [3, 60, 5, 'word'],
                   [3, 75, 4, 'stem'], [3, 60, 5, 'stem']]
     self.generator = kqg.KeywordQueryGenerator('P3',sub_cat, ["LL"],dists, fields, 1000,
                                                 100, other_fields, other_cols)
Example #4
    def setUp(self):
        self.seed = int(time.time())
        self.seed_msg = "Random seed used for this test: %s" % self.seed
        self.longMessage = True
        spar_random.seed(self.seed)
        # set up initialization values
        self.dummy_logger = logging.getLogger('dummy')
        self.dummy_logger.addHandler(logging.NullHandler())
        class Options(object):
            pass
        learner_options = Options()
        learner_options.verbose = False

        pums_files = \
            [("mock pums", 
              stringio.StringIO(mock_data_files.mock_pums_data))]
        pums_dict = \
            learn_distributions.learn_pums_dists(learner_options,
                                                 self.dummy_logger,
                                                 pums_files)
            
        names_files = \
            [('male_first_names.txt', 
              stringio.StringIO(mock_data_files.mock_male_first_names)),
             ('female_first_names.txt', 
              stringio.StringIO(mock_data_files.mock_female_first_names)),
             ('last_names.txt', 
              stringio.StringIO(mock_data_files.mock_last_names))]
        names_dict = \
            learn_distributions.learn_name_dists(learner_options,
                                                 self.dummy_logger,
                                                 names_files)

        vars = [sv.VARS.SEX,
                sv.VARS.CITIZENSHIP,
                sv.VARS.AGE,
                sv.VARS.RACE,
                sv.VARS.STATE,
                sv.VARS.FIRST_NAME,
                sv.VARS.LAST_NAME]

        var_order = vars
        var_names = [sv.VARS.to_string(x) for x in vars]
        dist_dict = { }
        dist_dict.update(pums_dict)
        dist_dict.update(names_dict)
        
        dist_holder = dh.DistributionHolder(var_order, var_names, dist_dict)
        
        fields = [sv.sql_name_to_enum('xml')]

        self._dist1 = xml_generator.XmlGenerator(dist_holder)
        dists = [self._dist1]
        other_fields = ['no_queries', 'r_lower', 'r_upper', 'path_type'] 
        other_cols_full = [[5, 1, 10, 'full']]
        other_cols_short = [[5,1, 10,'short']]
        self.full_generator = xqg.XmlQueryGenerator('P11','', ["LL"],dists, fields, 1000,
                                                    100,other_fields, other_cols_full)
        self.short_generator = xqg.XmlQueryGenerator('P11','', ["LL"],dists, fields, 1000,
                                                    100,other_fields, other_cols_short)
Example #5
    def setUp(self):
        self.seed = int(time.time())
        self.seed_msg = "Random seed used for this test: %s" % self.seed
        self.longMessage = True
        spar_random.seed(self.seed)

        self.zip_codes = []
        self.dist = \
            bespoke_distributions.AddressDistribution()

        streets_raw = mock_data_files.mock_street_names
        mock_file = stringio.StringIO(streets_raw)
        csv_dict_reader = csv.DictReader(mock_file)

        self.zip_codes = []
        for d in csv_dict_reader:

            zip_str = d['zip']
            street_str = d['fullname']
            self.dist.add(street_str, 1, zip_str)

            ind_var = {sv.VARS.ZIP_CODE: d['zip']}
            self.zip_codes.append(ind_var)

        self.expected_re = re.compile(r'\d+ [^,]+(, APT \d+)?')
Example #6
    def setUp(self):

        self.seed = int(time.time())
        self.seed_msg = "Random seed used for this test: %s" % self.seed
        self.longMessage = True
        spar_random.seed(self.seed)

        pums_data_raw = mock_data_files.mock_pums_data
        mock_file = stringio.StringIO(pums_data_raw)
        pums_files = [("mock file", mock_file)]

        self.log = stringio.StringIO()
        dummy_logger = logging.getLogger('dummy')
        dummy_logger.addHandler(logging.StreamHandler(self.log))
        dummy_logger.setLevel(logging.DEBUG)

        class Options(object):
            pass

        learner_options = Options()
        learner_options.verbose = True
        pums_dict = \
            learn_distributions.learn_pums_dists(learner_options,
                                                 dummy_logger,
                                                 pums_files)

        age_dist = pums_dict[sv.VARS.AGE]
        dob_dist = \
             bespoke_distributions.DOBDistribution(age_dist)

        self.dist = bespoke_distributions.LastUpdatedDistribution(dob_dist)
Example #7
    def setUp(self):

        self.seed = int(time.time())
        self.seed_msg = "Random seed used for this test: %s" % self.seed
        self.longMessage = True
        spar_random.seed(self.seed)

        names_files = \
            [('male_first_names.txt', 
              stringio.StringIO(mock_data_files.mock_male_first_names)),
             ('female_first_names.txt', 
              stringio.StringIO(mock_data_files.mock_female_first_names)),
             ('last_names.txt', 
              stringio.StringIO(mock_data_files.mock_last_names))]
        
        
        self.log = stringio.StringIO()
        dummy_logger = logging.getLogger('dummy')
        dummy_logger.addHandler(logging.StreamHandler(self.log))
        dummy_logger.setLevel(logging.DEBUG)
        
        learner_options = Options()
        learner_options.verbose = True
        

        self.names_dict = \
            learn_distributions.learn_name_dists(learner_options,
                                                 dummy_logger,
                                                 names_files)
 
        (_, manipulator) = sv.PUMS_VARS_DICT[sv.VARS.SEX ]
        self.male_ind_var = {sv.VARS.SEX :
                             manipulator.to_string(sv.SEX.Male) }
        self.female_ind_var =  {sv.VARS.SEX :
                                manipulator.to_string(sv.SEX.Female)}
Example #8
    def setUp(self):
        self.seed = int(time.time())
        self.seed_msg = "Random seed used for this test: %s" % self.seed
        self.longMessage = True
        spar_random.seed(self.seed)
        self.dummy_logger = logging.getLogger('dummy')

        # set up initialization values
        self._dist1 = base_distribution.CompactIndependentDistribution()
        self._dist1.add('hello', 1)
        self._dist1.add('there', 999)
        self._dist2 = base_distribution.SimpleConditionalDistribution(
            VARS.dist1)
        self._dist2.add('hello', 1, 'hello')
        self._dist2.add('hi', 300, 'there')
        self._dist2.add('there', 400, 'there')
        self._dist2.add('person', 300, 'there')
        fields = [sv.VARS.FIRST_NAME, sv.VARS.LAST_NAME]
        dists = [self._dist1, self._dist2]
        other_fields = ['no_queries', 'r_lower', 'r_upper']
        other_cols = [[5, 1, 7]]
        query_object = [
            eqg.EqualityQueryGenerator('EQ', 'eq', ["LL"], [self._dist1],
                                       [sv.VARS.FIRST_NAME], 100, 10000,
                                       other_fields, other_cols),
            eqg.EqualityQueryGenerator('EQ', 'eq', ["LL"], [self._dist2],
                                       [sv.VARS.LAST_NAME], 10000, 100,
                                       other_fields, other_cols),
            eqg.EqualityQueryGenerator('EQ', 'eq', ["LL"], dists, fields,
                                       10000, 100, other_fields, other_cols)
        ]
        self.handler = handler.QueryHandler(query_object)
Example #9
 def setUp(self):
     """
     Records the randomness used.
     """
     # record the randomness used in case the test fails:
     self.rand_seed = int(time.time())
     sr.seed(self.rand_seed)
     print("seed for this test: " + str(self.rand_seed))
Example #10
    def setUp(self):
        self.seed = int(time.time())
        self.seed_msg = "Random seed used for this test: %s" % self.seed
        self.longMessage = True
        spar_random.seed(self.seed)

        self.upper_bound = 10**5
        self.dist = \
             bespoke_distributions.RandIntWithoutReplacement(self.upper_bound)
Example #11
    def setUp(self):

        self.seed = int(time.time())
        self.seed_msg = "Random seed used for this test: %s" % self.seed
        self.longMessage = True
        spar_random.seed(self.seed)

        self.simple = base_distributions.SimpleIndependentDistribution
        self.compact = base_distributions.CompactIndependentDistribution
        self.compact_dist = base_distributions.CompactIndependentDistribution()
        self.simple_dist = base_distributions.SimpleIndependentDistribution()
Example #12
    def _make_id_seed_generator(self, start_seed, num_rows):
        '''
        Make a generator for all the (row_id, seed) pairs for this run.
        '''
        seed = int(start_seed)
        # Seed the RNG. Will need this when generating row IDs. Note: if
        # the user does not set this, it defaults to 0.
        spar_random.seed(seed)

        id_seed_generator = \
            itertools.izip(self._row_id_generator(num_rows),
                           self._seed_generator(seed))
        return itertools.islice(id_seed_generator, num_rows)
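A self-contained sketch of the same (row_id, seed) pairing idea using only the standard library; row_id_generator and seed_generator below are hypothetical stand-ins, since the real _row_id_generator and _seed_generator are not shown. The snippet above is Python 2 (itertools.izip); in Python 3 the built-in zip is already lazy.

import itertools

def row_id_generator(num_rows):
    # hypothetical stand-in: sequential row IDs
    for row_id in range(num_rows):
        yield row_id

def seed_generator(start_seed):
    # hypothetical stand-in: one derived seed per row
    seed = int(start_seed)
    while True:
        yield seed
        seed += 1

def make_id_seed_pairs(start_seed, num_rows):
    pairs = zip(row_id_generator(num_rows), seed_generator(start_seed))
    # islice caps the stream at num_rows, mirroring the snippet above
    return itertools.islice(pairs, num_rows)

print(list(make_id_seed_pairs(100, 3)))  # [(0, 100), (1, 101), (2, 102)]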
Example #13
 def setUp(self):
     self.seed = int(time.time())
     self.seed_msg = "Random seed used for this test: %s" % self.seed
     self.longMessage = True
     spar_random.seed(self.seed)
     # set up initialization values
     sub_cat = 'foo-range'
     self._foo_dist = bespoke_distribution.FooDistribution()
     fields = [sv.sql_name_to_enum('foo')]
     dists = [self._foo_dist]
     other_fields = ['no_queries', 'r_lower', 'r_upper','r_exp_lower','r_exp_upper','type']
     other_cols = [[2, 1, 100, 21, 21, 'range'], [2,1, 100,32, 32,'range'],
                   [2, 1, 200, 21, 21,'greater'],[2,1, 200,25, 25,'greater']]
     self.generator = frqg.FooRangeQueryGenerator('P2',sub_cat, ["LL"],dists, fields, 50000,
                                                  100,other_fields, other_cols)
Example #14
    def setUp(self):

        self.seed = int(time.time())
        self.seed_msg = "Random seed used for this test: %s" % self.seed
        self.longMessage = True
        spar_random.seed(self.seed)

        self.compact_dist = \
                base_distributions.CompactConditionalDistribution(VARS.FOO)
        self.simple_dist = \
                base_distributions.SimpleConditionalDistribution(VARS.FOO)
        self.compact_dist_2d = \
                base_distributions.CompactConditionalDistribution(VARS.FOO, VARS.BAR)
        self.simple_dist_2d = \
                base_distributions.SimpleConditionalDistribution(VARS.FOO, VARS.BAR)
Example #15
 def generate_row_dict(self, row_id_seed_pair):
     """
      Given a (row_id, seed) pair, generates a row with the given ID from the
      given seed, and returns it as a dictionary.
     """
     (row_id, seed) = row_id_seed_pair
     spar_random.seed(seed)
     row_dict = generated_row.GeneratedRow()
     dist_dict = self.dist_holder.dist_dict
     for var in self.fields_to_gen:
         dist = dist_dict[var]
         v = dist.generate(row_dict)
         row_dict[var] = v
     row_dict[sv.VARS.ID] = row_id
     return row_dict
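A minimal sketch of the per-row pattern generate_row_dict implements: seed the RNG with the row's own seed, then draw each field from its distribution, so any single row can be regenerated independently of the others. The distributions and field names below are toy stand-ins, not the project's.

import random

# toy stand-in distributions: each draws one value from the current RNG state
DIST_DICT = {
    'age': lambda: random.randint(0, 90),
    'sex': lambda: random.choice(['M', 'F']),
}

def generate_row(row_id_seed_pair):
    (row_id, seed) = row_id_seed_pair
    random.seed(seed)  # per-row seed makes each row independently reproducible
    row = {'id': row_id}
    for field in sorted(DIST_DICT):  # fixed field order, like fields_to_gen
        row[field] = DIST_DICT[field]()
    return row

assert generate_row((7, 42)) == generate_row((7, 42))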
Example #16
    def setUp(self):
        self.seed = int(time.time())
        self.seed_msg = "Random seed used for this test: %s" % self.seed
        self.longMessage = True
        spar_random.seed(self.seed)

        self.dummy_logger = logging.getLogger('dummy')
        self.dummy_logger.addHandler(logging.NullHandler())

        learner_options = Options()
        learner_options.verbose = False

        pums_files = \
            [("mock pums",
              stringio.StringIO(mock_data_files.mock_pums_data))]
        pums_dict = \
            learn_distributions.learn_pums_dists(learner_options,
                                                 self.dummy_logger,
                                                 pums_files)

        names_files = \
            [('male_first_names.txt',
              stringio.StringIO(mock_data_files.mock_male_first_names)),
             ('female_first_names.txt',
              stringio.StringIO(mock_data_files.mock_female_first_names)),
             ('last_names.txt',
              stringio.StringIO(mock_data_files.mock_last_names))]
        names_dict = \
            learn_distributions.learn_name_dists(learner_options,
                                                 self.dummy_logger,
                                                 names_files)

        vars = [
            sv.VARS.SEX, sv.VARS.CITIZENSHIP, sv.VARS.AGE, sv.VARS.RACE,
            sv.VARS.STATE, sv.VARS.FIRST_NAME, sv.VARS.LAST_NAME
        ]

        var_order = vars
        var_names = [sv.VARS.to_string(x) for x in vars]
        dist_dict = {}
        dist_dict.update(pums_dict)
        dist_dict.update(names_dict)

        self.dist_holder = dh.DistributionHolder(var_order, var_names,
                                                 dist_dict)
Example #17
def query_generation(schema_file, logger, dist_holder, db_size, row_width,
                     start_seed):

    start_time = time.time()
    #Create query_types around the distributions
    logger.info("Creating query generators.")
    seed = int(start_seed)
    spar_random.seed(seed)
    my_learner = lqt.Learner(dist_holder, schema_file, db_size, row_width)
    #feed them into a handler
    logger.info("Creating queries.")
    handler = qh.QueryHandler(my_learner.generate_query_objects())
    #runs all of the queries and outputs them
    query_sets = handler.run(logger)
    print "LEN OF QUERY_SETS is ", len(query_sets)
    elapsed_time = time.time() - start_time
    logger.info('Done generating queries. %d seconds elapsed' % elapsed_time)
    return query_sets
Example #18
 def test_f3_circuit_maker(self):
     """
     Tests that the family 3 circuit makers function as desired.
     """
     fho = tfho.TestFileHandleObject()
     W = 5
     D = 6
     gate_maker = g.TYPE_TO_FAM3_GATE_GEN[g.TEST_TYPES.RANDOM]
     # family 3 files:
     circuit_file_name = "circuit_file"
     circuit_file = fho.get_file_object(circuit_file_name, 'w')
     input_file_name = "input_file"
     input_file = fho.get_file_object(input_file_name, 'w')
     output_file_name = "output_file"
     output_file = fho.get_file_object(output_file_name, 'w')
     F = 3
     # make a family 3 circuit:
     sr.seed(self.rand_seed)
     gen = g.f3_circuit_maker(W, D, circuit_file, input_file, output_file,
                                gate_maker)
     gen.generate()
     # obtain strings representing the contents of all the resulting files:
     circuit_string = fho.get_file(circuit_file_name).getvalue()
     input_string = fho.get_file(input_file_name).getvalue()
     output_string = fho.get_file(output_file_name).getvalue()
     # make sure that the input begins and ends with a bracket:
     self.assertEqual("[", input_string[0])
     self.assertEqual("]", input_string[-1])
     # make sure that each input element is a bit:
     for bit in input_string[1:-1]:
         self.assertTrue((bit == '0') or (bit == '1'))
     # make sure that the output is a bit:
     self.assertTrue((output_string == '0') or (output_string == '1'))
     # make sure that the circuit header contains the correct values:
     circuit_header = circuit_string.split("\n")[0]
     (W_string, D_string, F_string) = circuit_header.split(",")
     W_value = int(W_string.split("=")[-1])
     D_value = int(D_string.split("=")[-1])
     F_value = int(F_string.split("=")[-1])
     self.assertEqual(W, W_value)
     self.assertEqual(D, D_value)
     self.assertEqual(F, F_value)
Example #19
    def setUp(self):

        self.seed = int(time.time())
        self.seed_msg = "Random seed used for this test: %s" % self.seed
        self.longMessage = True
        spar_random.seed(self.seed)
        self.dummy_logger = logging.getLogger('dummy')

        #distribution holder
        dist1 = base_distribution.CompactIndependentDistribution()
        dist1.add('hello', 1)
        dist1.add('there', 99)
        vars = [VARS.FIRST_NAME]
        dists = [dist1]
        dist_dict = dict(zip(vars, dists))
        self._holder = dh.DistributionHolder(vars, ['fname'], dist_dict)

        self._file_name = StringIO.StringIO('cat,sub_cat,perf,fields,\"[\'no_queries' \
                          '\',\'r_lower\',\'r_upper\']\"\nEQ,eq,"[\'LL\']","[\'fname\']","[10, 1, 100]\"')

        self._db_size = 1000
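The StringIO above stands in for a query-schema CSV whose trailing columns are Python-list literals wrapped in quoted cells. Below is a small Python 3 sketch (assuming that same column layout) of reading such a row back with csv and ast.literal_eval; the original snippet is Python 2, hence StringIO.StringIO.

import ast
import csv
import io

csv_text = (
    "cat,sub_cat,perf,fields,\"['no_queries','r_lower','r_upper']\"\n"
    "EQ,eq,\"['LL']\",\"['fname']\",\"[10, 1, 100]\"\n"
)

reader = csv.reader(io.StringIO(csv_text))
header = next(reader)
other_fields = ast.literal_eval(header[4])   # ['no_queries', 'r_lower', 'r_upper']
for row in reader:
    cat, sub_cat = row[0], row[1]
    perf = ast.literal_eval(row[2])          # ['LL']
    fields = ast.literal_eval(row[3])        # ['fname']
    other_cols = ast.literal_eval(row[4])    # [10, 1, 100]
    print(cat, sub_cat, perf, fields, dict(zip(other_fields, other_cols)))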
Example #20
def learn_distributions(options, logger):
    """
    Learn the distributions of demographic data, suitable for creating
    random rows of data. Returns the resulting distribution holder.
    """
    # this seed is hard-coded so that the random elements of learning are
    # repeatable
    seed = int(81991)
    spar_random.seed(seed)

    logger.info("Learning PUMS distributions")
    pums_files = _files_generator('PUMS', options, logger)
    pums_dict = learn_pums_dists(options, logger, pums_files)

    logger.info("Learning name distributions data")
    names_files = _files_generator('names', options, logger)
    names_dict = learn_name_dists(options, logger, names_files)

    logger.info("Learning zip-code distributions")
    zipcode_files = _files_generator('zipcodes', options, logger)
    zipcode_dict = learn_zipcode_dists(options, logger, zipcode_files)

    logger.info("Learning street-address distributions")
    streets_files = _files_generator('streets', options, logger)
    address_dict = learn_street_address_dists(options, logger, streets_files)

    logger.info("Training on text corpus")
    # Get the text files
    texts_key = 'texts'
    text_files = _files_generator(texts_key, options, logger)
    text_engine = train_text_engine(options, logger, text_files)

    dist_holder = make_distribution_holder(options, logger, pums_dict,
                                           names_dict, zipcode_dict,
                                           address_dict, text_engine)

    sanitization.sanitize_distribution(dist_holder)
    return dist_holder
Example #21
    def setUp(self):
 
        self.seed = int(time.time())
        self.seed_msg = "Random seed used for this test: %s" % self.seed
        self.longMessage = True
        spar_random.seed(self.seed)
        
        pums_data_raw = mock_data_files.mock_pums_data
        mock_file = stringio.StringIO(pums_data_raw)
        pums_files = [("mock file", mock_file)]
        
        self.log = stringio.StringIO()
        dummy_logger = logging.getLogger('dummy')
        dummy_logger.addHandler(logging.StreamHandler(self.log))
        dummy_logger.setLevel(logging.DEBUG)
        
        learner_options = Options()
        learner_options.verbose = True
 
        self.pums_dict = \
            learn_distributions.learn_pums_dists(learner_options,
                                                 dummy_logger,
                                                 pums_files)
Example #22
    def setUp(self):
                
        self.seed = int(time.time())
        self.seed_msg = "Random seed used for this test: %s" % self.seed
        self.longMessage = True
        spar_random.seed(self.seed)

        zipcode_files = \
            [('mock_zipcodes', 
              stringio.StringIO(mock_data_files.mock_zipcodes))]
        
        
        self.log = stringio.StringIO()
        dummy_logger = logging.getLogger('dummy')
        dummy_logger.addHandler(logging.StreamHandler(self.log))
        dummy_logger.setLevel(logging.DEBUG)
        
        learner_options = Options()
        learner_options.verbose = True
        
        self.zipcode_dict = \
            learn_distributions.learn_zipcode_dists(learner_options,
                                                    dummy_logger,
                                                    zipcode_files)
Example #23
    def make_engine_options(self):
        self.seed = int(time.time())
        self.seed_msg = "Random seed used for this test: %s" % self.seed
        spar_random.seed(self.seed)

        self.num_rows = 100

        engine_options = gw.DataGeneratorOptions()
        engine_options.named_pipes = False
        engine_options.random_seed = self.seed
        engine_options.num_processes = 1
        engine_options.num_rows = self.num_rows
        engine_options.row_width = 100

        self.line_raw_file = tempfile.NamedTemporaryFile(delete=False)
        lra_options_dict = {
            'file_obj'    : self.line_raw_file,
            'schema_file' : \
                os.path.join(base_dir,
                            'spar_python/data_generation/test_schema.csv')}
        lra_aggregator = lra.LineRawHandleAggregator(**lra_options_dict)
        engine_options.aggregators = [lra_aggregator]

        self.engine_options = engine_options
Example #24
    def setUp(self):
        
        self.seed = int(time.time())
        self.seed_msg = "Random seed used for this test: %s" % self.seed
        self.longMessage = True
        spar_random.seed(self.seed)
 
        learner_options = Options()
        learner_options.verbose = False
        self.learner_options = learner_options


        engine_options = gw.DataGeneratorOptions()
        
        counts_agg = ca.CountsAggregator()
        
        engine_options.aggregators = [counts_agg]

        self.engine_options = engine_options

        dummy_logger = logging.getLogger('dummy')
        dummy_logger.addHandler(logging.NullHandler())

        pums_files = \
            [("mock pums", 
              stringio.StringIO(mock_data_files.mock_pums_data))]
        pums_dict = \
            learn_distributions.learn_pums_dists(learner_options,
                                                 dummy_logger,
                                                 pums_files)
            
        names_files = \
            [('male_first_names.txt', 
              stringio.StringIO(mock_data_files.mock_male_first_names)),
             ('female_first_names.txt', 
              stringio.StringIO(mock_data_files.mock_female_first_names)),
             ('last_names.txt', 
              stringio.StringIO(mock_data_files.mock_last_names))]
        names_dict = \
            learn_distributions.learn_name_dists(learner_options,
                                                 dummy_logger,
                                                 names_files)

        zipcode_files = \
            [('mock_zipcodes', 
              stringio.StringIO(mock_data_files.mock_zipcodes))]
        zipcode_dict = \
            learn_distributions.learn_zipcode_dists(learner_options,
                                                    dummy_logger,
                                                    zipcode_files)
        
        text_files = \
            [('mock_text', 
              stringio.StringIO(mock_data_files.mock_text_files))]
        text_engine = \
            learn_distributions.train_text_engine(learner_options, 
                                                  dummy_logger, 
                                                  text_files)

        streets_files = \
            [('mock street file', 
              stringio.StringIO(mock_data_files.mock_street_names))]
        address_dict = \
                learn_distributions.learn_street_address_dists(learner_options, 
                                                               dummy_logger, 
                                                               streets_files)
        
        dist_holder = \
            learn_distributions.make_distribution_holder(learner_options,
                                                         dummy_logger,
                                                         pums_dict,
                                                         names_dict,
                                                         zipcode_dict,
                                                         address_dict,
                                                         text_engine)
        self.dist_holder = dist_holder
        self.data_generator_engine = \
            data_generator_engine.DataGeneratorEngine(engine_options,    
                                                      dist_holder)    
Example #25
    def test_generated_greater_than(self):
        '''
        Greater than: tests, at a much smaller scale (10^4 generated values of
        foo), whether generate_greater_than can generate ranges for all sane
        combinations of record_set_sizes.

        It does this by generating each range and counting how many of the
        generated foos fall in it. It then compares that count to the desired
        record set size to see whether the two are 'close enough', meaning
        within a factor of 10.
        '''
        db_size = 10**4
        RECORD_SET_SIZES = [10, 500, 1000]

        def close_enough(actual_density, desired_density):
            'checks that the two densities are within a factor of ten of each other'
            density_ratio = actual_density / desired_density
            upper_bound = 10
            lower_bound = 0.1
            return ((density_ratio >= lower_bound)
                    and (density_ratio <= upper_bound))

        #generate a bunch of foos to test
        foo_dist = bespoke_distributions.FooDistribution()
        foos = []
        for _ in xrange(db_size):
            x = foo_dist.generate()
            foos.append(x)

        #keep track of what combos work and what don't
        failed_combos = []
        passed_combos = []

        for record_set_size in RECORD_SET_SIZES:
            spar_random.seed(int(time.time() + 1))
            try:
                min = foo_dist.generate_greater_than(record_set_size, db_size)
            except bespoke_distributions.FooInputs:
                #foo distribution says that it is impossible to generate
                #a range for that record size, let's add it to failed combos
                failed_combos.append((db_size, record_set_size, 0, 0, 0))
                continue
            #check to see if what is generated actually works
            #with the exception that even if count_between for 10 is zero
            #we treat it as a success for the sake of passing unit tests
            range_size = min.bit_length()
            count_between = len([foo for foo in foos if foo >= min])
            if close_enough(count_between, record_set_size) or \
               (record_set_size==10 and count_between==0):
                passed_combos.append((db_size,record_set_size,range_size, \
                                      min, count_between))
            else:
                failed_combos.append((db_size,record_set_size,range_size,\
                                      min, count_between))
        #create and see if fail message must be used
        fail_msg = ''
        if len(failed_combos) != 0:
            (db, rss, rs, _, cb) = failed_combos[0]
            fail_msg = "Runs into the law of small numbers, if this fails more"\
                   " than 1 time in 20 then things have gotten fishy."\
                   "The generated ranges did not support all of desired " \
                   "combinations. For example database size: %d, record set "\
                   "size : %d, and range size: %d had %d records returned."\
                   " There are %d other unmatched sets."\
                    % (db,rss,rs,cb, len(failed_combos)-1)
        # Allow 0 failures because we are generating a few of these in actuality
        # and they need to match, most of the time.
        for x in failed_combos:
            print "failed: ", x
        for x in passed_combos:
            print "passed: ", x
        self.assertTrue(len(failed_combos) == 0, fail_msg)
Example #26
    def setUp(self):

        self.seed = int(time.time())
        self.seed_msg = "Random seed used for this test: %s" % self.seed
        self.longMessage = True
        spar_random.seed(self.seed)
Example #27
    def setUp(self):
        self.seed = int(time.time())
        self.seed_msg = "Random seed used for this test: %s" % self.seed
        self.longMessage = True
        spar_random.seed(self.seed)

        # set up initialization values
        class Object(object):
            pass

        self.learner_options = Object()
        self.dummy_logger = logging.getLogger('dummy')
        self.dummy_logger.addHandler(logging.NullHandler())
        self.dummy_object = Object()
        pums_files = \
            [("mock pums",
              stringio.StringIO(mock_data_files.mock_pums_data))]
        pums_dict = \
            learn_distributions.learn_pums_dists(self.learner_options,
                                                 self.dummy_logger,
                                                 pums_files)
        names_files = \
            [('male_first_names.txt',
              stringio.StringIO(mock_data_files.mock_male_first_names)),
             ('female_first_names.txt',
              stringio.StringIO(mock_data_files.mock_female_first_names)),
             ('last_names.txt',
              stringio.StringIO(mock_data_files.mock_last_names))]
        names_dict = \
            learn_distributions.learn_name_dists(self.learner_options,
                                                 self.dummy_logger,
                                                 names_files)
        zipcode_files = \
            [('mock_zipcodes',
              stringio.StringIO(mock_data_files.mock_zipcodes))]
        zipcode_dict = \
            learn_distributions.learn_zipcode_dists(self.learner_options,
                                                    self.dummy_logger,
                                                    zipcode_files)

        text_files = \
            [('mock_text',
              stringio.StringIO(mock_data_files.mock_text_files))]
        text_engine = \
            learn_distributions.train_text_engine(self.learner_options,
                                                  self.dummy_logger,
                                                  text_files)
        streets_files = \
            [('mock street file',
              stringio.StringIO(mock_data_files.mock_street_names))]
        address_dict = \
                learn_distributions.learn_street_address_dists(self.learner_options,
                                                               self.dummy_logger,
                                                               streets_files)
        self.dist_holder = \
            learn_distributions.make_distribution_holder(self.learner_options,
                                                         self.dummy_logger,
                                                         pums_dict,
                                                         names_dict,
                                                         zipcode_dict,
                                                         address_dict,
                                                         text_engine)
        self.fields_to_gen = [
            sv.VARS.SEX, sv.VARS.FIRST_NAME, sv.VARS.LAST_NAME
        ]
        sub_cat = 'eq'
        fields = [sv.sql_name_to_enum('fname'), sv.sql_name_to_enum('lname')]
        dists1 = [
            self.dist_holder.dist_dict[sv.VARS.FIRST_NAME],
            self.dist_holder.dist_dict[sv.VARS.LAST_NAME]
        ]
        other_fields = ['no_queries', 'r_lower', 'r_upper']
        other_cols = [[5, 1, 10], [5, 11, 100]]
        self.generator = eqg.EqualityQueryGenerator('EQ', sub_cat, ["LL"],
                                                    dists1, fields, 1000, 100,
                                                    other_fields, other_cols)
Example #28
    def setUp(self):
        self.seed = int(time.time())
        self.seed_msg = "Random seed used for this test: %s" % self.seed
        self.longMessage = True
        spar_random.seed(self.seed)

        # set up initialization values
        class Object(object):
            pass

        self.learner_options = Object()
        self.dummy_logger = logging.getLogger('dummy')
        self.dummy_logger.addHandler(logging.NullHandler())
        self.dummy_object = Object()
        pums_files = \
            [("mock pums",
              stringio.StringIO(mock_data_files.mock_pums_data))]
        pums_dict = \
            learn_distributions.learn_pums_dists(self.learner_options,
                                                 self.dummy_logger,
                                                 pums_files)
        names_files = \
            [('male_first_names.txt',
              stringio.StringIO(mock_data_files.mock_male_first_names)),
             ('female_first_names.txt',
              stringio.StringIO(mock_data_files.mock_female_first_names)),
             ('last_names.txt',
              stringio.StringIO(mock_data_files.mock_last_names))]
        names_dict = \
            learn_distributions.learn_name_dists(self.learner_options,
                                                 self.dummy_logger,
                                                 names_files)
        zipcode_files = \
            [('mock_zipcodes',
              stringio.StringIO(mock_data_files.mock_zipcodes))]
        zipcode_dict = \
            learn_distributions.learn_zipcode_dists(self.learner_options,
                                                    self.dummy_logger,
                                                    zipcode_files)

        text_files = \
            [('mock_text',
              stringio.StringIO(mock_data_files.mock_text_files))]
        text_engine = \
            learn_distributions.train_text_engine(self.learner_options,
                                                  self.dummy_logger,
                                                  text_files)
        streets_files = \
            [('mock street file',
              stringio.StringIO(mock_data_files.mock_street_names))]
        address_dict = \
                learn_distributions.learn_street_address_dists(self.learner_options,
                                                               self.dummy_logger,
                                                               streets_files)
        self.dist_holder = \
            learn_distributions.make_distribution_holder(self.learner_options,
                                                         self.dummy_logger,
                                                         pums_dict,
                                                         names_dict,
                                                         zipcode_dict,
                                                         address_dict,
                                                         text_engine)
        self.fields_to_gen = [
            sv.VARS.SEX, sv.VARS.FOO, sv.VARS.LAST_NAME, sv.VARS.CITIZENSHIP,
            sv.VARS.AGE, sv.VARS.INCOME, sv.VARS.RACE, sv.VARS.STATE,
            sv.VARS.WEEKS_WORKED, sv.VARS.HOURS_WORKED,
            sv.VARS.MILITARY_SERVICE, sv.VARS.MARITAL_STATUS,
            sv.VARS.GRADE_ENROLLED, sv.VARS.LANGUAGE, sv.VARS.FIRST_NAME,
            sv.VARS.ZIP_CODE, sv.VARS.CITY, sv.VARS.STREET_ADDRESS, sv.VARS.DOB
        ]
        other_fields = [
            'no_queries', 'r_lower', 'r_upper', 'num_clauses', 'tm_lower',
            'tm_upper'
        ]
        other_cols = [[5, 10, 100, 2, 100, 1000], [5, 1, 10, 2, 10, 100]]
        self.generator = aqg.AndQueryGenerator(
            "P1", 'and-eq', ["LL"], self.dist_holder.dist_dict.values(),
            self.dist_holder.dist_dict.keys(), 1000, 100, other_fields,
            other_cols)
Example #29
    def setUp(self):
        self.seed = int(time.time())
        self.seed_msg = "Random seed used for this test: %s" % self.seed
        self.longMessage = True
        spar_random.seed(self.seed)

        # set up initialization values
        class Object(object):
            pass

        self.learner_options = Object()
        self.dummy_logger = logging.getLogger('dummy')
        self.dummy_logger.addHandler(logging.NullHandler())
        self.dummy_object = Object()
        pums_files = \
            [("mock pums",
              stringio.StringIO(mock_data_files.mock_pums_data))]
        pums_dict = \
            learn_distributions.learn_pums_dists(self.learner_options,
                                                 self.dummy_logger,
                                                 pums_files)
        names_files = \
            [('male_first_names.txt',
              stringio.StringIO(mock_data_files.mock_male_first_names)),
             ('female_first_names.txt',
              stringio.StringIO(mock_data_files.mock_female_first_names)),
             ('last_names.txt',
              stringio.StringIO(mock_data_files.mock_last_names))]
        names_dict = \
            learn_distributions.learn_name_dists(self.learner_options,
                                                 self.dummy_logger,
                                                 names_files)
        zipcode_files = \
            [('mock_zipcodes',
              stringio.StringIO(mock_data_files.mock_zipcodes))]
        zipcode_dict = \
            learn_distributions.learn_zipcode_dists(self.learner_options,
                                                    self.dummy_logger,
                                                    zipcode_files)

        text_files = \
            [('mock_text',
              stringio.StringIO(mock_data_files.mock_text_files))]
        text_engine = \
            learn_distributions.train_text_engine(self.learner_options,
                                                  self.dummy_logger,
                                                  text_files)
        streets_files = \
            [('mock street file',
              stringio.StringIO(mock_data_files.mock_street_names))]
        address_dict = \
                learn_distributions.learn_street_address_dists(self.learner_options,
                                                               self.dummy_logger,
                                                               streets_files)
        self.dist_holder = \
            learn_distributions.make_distribution_holder(self.learner_options,
                                                         self.dummy_logger,
                                                         pums_dict,
                                                         names_dict,
                                                         zipcode_dict,
                                                         address_dict,
                                                         text_engine)
        self.fields_to_gen = [
            sv.VARS.SEX, sv.VARS.FIRST_NAME, sv.VARS.CITIZENSHIP, sv.VARS.RACE,
            sv.VARS.STATE, sv.VARS.ZIP_CODE, sv.VARS.AGE, sv.VARS.DOB,
            sv.VARS.SSN, sv.VARS.LAST_UPDATED
        ]

        fields = [
            sv.VARS.FIRST_NAME, sv.VARS.ZIP_CODE, sv.VARS.SSN,
            sv.VARS.LAST_UPDATED
        ]
        dist_dict = self.dist_holder.dist_dict
        dists = [
            dist_dict[sv.VARS.FIRST_NAME], dist_dict[sv.VARS.ZIP_CODE],
            dist_dict[sv.VARS.SSN], dist_dict[sv.VARS.LAST_UPDATED]
        ]
        other_fields = ['no_queries', 'r_lower', 'r_upper', 'type']
        other_cols = [[3, 10, 100, 'less'], [3, 10, 100, 'range'],
                      [3, 10, 100, 'greater']]
        self.generator = rqg.RangeQueryGenerator("P2", 'range', ["LL"], dists,
                                                 fields, 1000, 100,
                                                 other_fields, other_cols)
Example #30
 def setUp(self):
     SEED = 7  #sr.randint(0, 1000000000)
     sr.seed(SEED)