def __iter__(self):
    for line in self.generator:
        document = json.loads(line)
        if self.filter:
            # Apply the optional filter; skip any document it rejects.
            document = self.filter(document)
            if not document:
                continue
        # Record the tracked field for this document before yielding the
        # transformed version, then advance the running index.
        self.lookup[self.k] = safe_get(document, *self.key)
        yield self._transform(document)
        self.k += 1
def test_nested_dict():
    test_dict = {
        'layer1': {
            'layer2': {
                'layer3': {
                    'layer4': 'this is layer 4'
                }
            }
        }
    }
    assert safe_get(test_dict, 'layer1', 'layer2', 'layer3', 'layer4') == 'this is layer 4'
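# The code in this section calls safe_get but its definition is not shown. A
# minimal sketch consistent with how it is used here (walk a nested dict along
# a sequence of keys, returning None as soon as any key is missing) might look
# like the following; this is an illustrative assumption, not the project's
# actual implementation.
def safe_get(dictionary, *keys):
    """Follow `keys` through nested dicts, returning None on any miss."""
    current = dictionary
    for key in keys:
        if not isinstance(current, dict) or key not in current:
            return None
        current = current[key]
    return current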
def test_state(self):
    transformer = lambda job: safe_get(job, 'jobLocation', 'address', 'addressRegion')
    js = JobSampler(
        job_posting_generator=self.fake_corpus_train,
        k=self.sample_size,
    )
    result = []
    for i in range(self.num_loops):
        result.extend(map(transformer, js))
    counts = dict(Counter(result))
    # Across repeated sampling loops, each state should appear equally often.
    assert np.mean(np.array(list(counts.values()))) == self.num_loops * self.sample_size / len(self.states)
def _transform_generator(self, job_posting_generator):
    if isinstance(self.keys, list):
        # A list of keys addresses a nested field via safe_get.
        for job in job_posting_generator:
            yield (job, safe_get(job, *self.keys))
    elif isinstance(self.keys, str):
        # A single string key is a direct top-level lookup.
        for job in job_posting_generator:
            yield (job, job[self.keys])
    elif self.major_group:
        # Group by the two-digit O*NET SOC major group; postings with a
        # missing code yield None instead of raising.
        for job in job_posting_generator:
            try:
                yield (job, job['onet_soc_code'][:2])
            except TypeError:
                yield (job, None)
    else:
        # No key configured: yield the posting alone as a 1-tuple.
        for job in job_posting_generator:
            yield (job, )
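# Illustrative only (the field names and values below are assumed, not taken
# from a real posting): how the three key-handling branches above would label
# the same document, shown directly rather than through the sampler class.
posting = {
    'onet_soc_code': '11-1011.00',
    'jobLocation': {'address': {'addressRegion': 'IL'}},
}

# keys as a list -> nested lookup through safe_get
print(safe_get(posting, 'jobLocation', 'address', 'addressRegion'))  # 'IL'
# keys as a string -> direct top-level lookup
print(posting['onet_soc_code'])                                      # '11-1011.00'
# major_group -> first two characters of the O*NET SOC code
print(posting['onet_soc_code'][:2])                                  # '11'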
def __iter__(self):
    for document in self.job_posting_generator:
        self.lookup[self.k] = safe_get(document, *self.key)
        yield self._transform(document)
        self.k += 1
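# A minimal, self-contained sketch of the lookup-while-iterating pattern used
# by both __iter__ methods above. The class name, constructor arguments, and
# identity _transform are illustrative assumptions, not the library's actual
# interface.
class LookupCorpus:
    def __init__(self, documents, key):
        self.documents = documents    # iterable of already-parsed postings
        self.key = key                # path to the field tracked in the lookup
        self.lookup = {}              # running index -> tracked field value
        self.k = 0

    def _transform(self, document):
        # Real corpora transform postings into text; identity is enough here.
        return document

    def __iter__(self):
        for document in self.documents:
            self.lookup[self.k] = safe_get(document, *self.key)
            yield self._transform(document)
            self.k += 1

# Example: iterate once, then consult the lookup to map sample indices back
# to the tracked field (here, the posting's state).
corpus = LookupCorpus(
    documents=[{'jobLocation': {'address': {'addressRegion': 'IL'}}}],
    key=('jobLocation', 'address', 'addressRegion'),
)
list(corpus)
print(corpus.lookup)  # {0: 'IL'}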