예제 #1
0
 def __iter__(self):
     """Yield transformed documents parsed from the lines of ``self.generator``.

     Each line is decoded with ``json.loads``. When ``self.filter`` is
     truthy, the decoded document is replaced by ``self.filter(document)``
     and dropped if that result is falsy. For every document that is kept,
     ``safe_get(document, *self.key)`` is stored in ``self.lookup`` under
     the current counter ``self.k``, ``self._transform(document)`` is
     yielded, and the counter is advanced once the consumer resumes the
     generator.
     """
     for raw_line in self.generator:
         parsed = json.loads(raw_line)
         if self.filter:
             parsed = self.filter(parsed)
             if not parsed:
                 # Rejected by the filter: nothing is recorded or yielded,
                 # and ``self.k`` is not advanced.
                 continue
         self.lookup[self.k] = safe_get(parsed, *self.key)
         yield self._transform(parsed)
         self.k += 1
예제 #2
0
def test_nested_dict():
    """Check that ``safe_get`` returns the leaf of a four-level nested dict."""
    expected = 'this is layer 4'
    path = ('layer1', 'layer2', 'layer3', 'layer4')
    nested = {'layer1': {'layer2': {'layer3': {'layer4': expected}}}}
    assert safe_get(nested, *path) == expected
예제 #3
0
    def test_state(self):
        """Check the mean number of occurrences per sampled address region.

        A ``JobSampler`` built over ``self.fake_corpus_train`` with
        ``k=self.sample_size`` is iterated ``self.num_loops`` times. The
        ``jobLocation -> address -> addressRegion`` value of every item is
        collected, and the mean of the per-value counts must equal
        ``num_loops * sample_size / len(self.states)``.
        """
        def transformer(job):
            return safe_get(job, 'jobLocation', 'address', 'addressRegion')

        sampler = JobSampler(
            job_posting_generator=self.fake_corpus_train,
            k=self.sample_size,
        )

        result = []
        for _ in range(self.num_loops):
            # Materialise one full pass over the sampler before appending.
            result.extend([transformer(sampled) for sampled in sampler])

        counts = dict(Counter(result))
        observed_mean = np.mean(np.array(list(counts.values())))
        expected_mean = self.num_loops * self.sample_size / len(self.states)
        assert observed_mean == expected_mean
예제 #4
0
 def _transform_generator(self, job_posting_generator):
     """Pair each job from ``job_posting_generator`` with a derived value.

     The shape of the yielded tuples depends on the instance configuration,
     checked in this order:

     * ``self.keys`` is a list: ``(job, safe_get(job, *self.keys))``
     * ``self.keys`` is a str: ``(job, job[self.keys])``
     * ``self.major_group`` is truthy: ``(job, job['onet_soc_code'][:2])``,
       or ``(job, None)`` when that lookup raises ``TypeError``
     * otherwise: the one-element tuple ``(job,)``
     """
     if isinstance(self.keys, list):
         for posting in job_posting_generator:
             yield posting, safe_get(posting, *self.keys)
         return

     if isinstance(self.keys, str):
         for posting in job_posting_generator:
             yield posting, posting[self.keys]
         return

     if not self.major_group:
         # No key and no major-group grouping: emit the job on its own.
         for posting in job_posting_generator:
             yield (posting,)
         return

     for posting in job_posting_generator:
         try:
             yield posting, posting['onet_soc_code'][:2]
         except TypeError:
             yield posting, None
예제 #5
0
 def __iter__(self):
     """Yield ``self._transform(document)`` for each job posting.

     Before each yield, ``safe_get(document, *self.key)`` is stored in
     ``self.lookup`` under the current counter ``self.k``; the counter is
     incremented once the consumer resumes the generator.
     """
     for posting in self.job_posting_generator:
         looked_up = safe_get(posting, *self.key)
         self.lookup[self.k] = looked_up
         yield self._transform(posting)
         self.k += 1