def _get_current_records(records: List[PartitionedDatasetRecord],
                         from_timestamp: Optional[datetime],
                         to_timestamp: datetime):
    # Keep only the records up to to_timestamp, newest first
    records = (Query
               .en(records)
               .where(lambda z: z.timestamp <= to_timestamp)
               .order_by_descending(lambda z: z.timestamp)
               .to_list())
    # Find the most recent major revision; older records are not needed
    first_major = (Query
                   .en(records)
                   .with_indices()
                   .where(lambda z: z.value.is_major)
                   .select(lambda z: z.key)
                   .first_or_default())
    if first_major is None:
        raise ValueError(f"There are no major revisions before {to_timestamp}")
    records = records[:first_major + 1]
    if from_timestamp is not None:
        records = [r for r in records if r.timestamp >= from_timestamp]
    return records
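# Usage sketch (not part of the original module): DemoRecord is a hypothetical
# stand-in for PartitionedDatasetRecord, exposing only the two fields the
# function reads (timestamp, is_major). It shows the selection rule: records
# newer than to_timestamp are dropped, the rest are walked newest-first, and
# the cut happens at the latest major revision, inclusive.
from dataclasses import dataclass
from datetime import datetime


@dataclass
class DemoRecord:
    timestamp: datetime
    is_major: bool


history = [
    DemoRecord(datetime(2021, 1, 1), True),    # an older major revision
    DemoRecord(datetime(2021, 1, 10), False),
    DemoRecord(datetime(2021, 2, 1), True),    # latest major revision before the cutoff
    DemoRecord(datetime(2021, 2, 15), False),
    DemoRecord(datetime(2021, 3, 1), False),   # newer than the cutoff, ignored
]

current = _get_current_records(history, None, datetime(2021, 2, 20))
print([r.timestamp for r in current])  # 2021-02-15 and 2021-02-01, newest first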
def test_files(self):
    folder = Path(__file__).parent / "temp"
    os.makedirs(folder, exist_ok=True)
    if os.path.isdir(folder):  # pragma: no cover
        shutil.rmtree(folder)
    plan = self.get_default_plan()
    self.kraken_run(kraken_simple_method, plan, cache_to_folder=folder, special_iterations=[3])
    file = Query.folder(folder).single()
    self.assertEqual('3.kraken.pkl', file.name)
    results = self.kraken_run(kraken_simple_method, plan, cache_to_folder=folder)  # type: List[kraken.IterationResult]
    for index, result in enumerate(results):
        self.assertEqual(
            kraken.IterationStatus.Skipped if index == 3 else kraken.IterationStatus.Success,
            result.status)
        self.assertIsNone(result.condition)
        self.assertIsNone(result.result)
    loaded_results = Query.en(kraken.Kraken.load(folder, None)).order_by(lambda z: z.result).to_list()
    self.assertResult([11, 12, 21, 22], loaded_results)
    shutil.rmtree(folder)
def get_splits(self, query_template):
    if self.custom_shards is not None:
        shards = self.custom_shards
    else:
        shards = list(range(self.shard_count))
    return (Query
            .en(shards)
            .select(lambda z: dict(shard=z, shard_count=self.shard_count))
            .select(lambda z: query_template.format(**z))
            .to_list())
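# Illustration (not part of the original class): how a shard template expands
# into one query per shard. The template text below is made up; real templates
# come from the caller's configuration, but any string with {shard} and
# {shard_count} placeholders behaves the same way.
query_template = "SELECT * FROM events WHERE MOD(id, {shard_count}) = {shard}"
shard_count = 3
splits = [query_template.format(shard=s, shard_count=shard_count)
          for s in range(shard_count)]
# ['SELECT * FROM events WHERE MOD(id, 3) = 0',
#  'SELECT * FROM events WHERE MOD(id, 3) = 1',
#  'SELECT * FROM events WHERE MOD(id, 3) = 2']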
def get_data(self):
    splits = list(self.get_splits(self.query_template))
    query = Query.en(splits)
    if self.with_progress_bar:
        query = query.feed(fluq.with_progress_bar())
    query = query.select_many(lambda z: self.downloader_factory(z).get_data())
    return query
def test_extracting_warnings(self):
    df = Query.en(range(10)).select(lambda z: (z, z)).to_dataframe(columns=['x', 'y'])
    pipe = make_pipeline(
        DataFrameTransformer([ContinousTransformer(['x'])]),
        LinearRegression()
    )
    pipe.fit(df[['x']], df.y)
    pipe.predict(pd.DataFrame(dict(x=[None])))
    warnings = TGWarningStorage.get_report()
    self.assertEqual(1, len(warnings))
    TGWarningStorage.clear()
def _get_module_name_and_version(path: Path):
    try:
        file = tarfile.open(path, 'r:gz')
        properties = (Query
                      .en(file.getmembers())
                      .where(lambda z: z.name.endswith('properties.json'))
                      .order_by(lambda z: len(z.name))
                      .first())
        stream = file.extractfile(properties).read()
        props = json.loads(stream)
        return props['full_module_name'], props['version']
    except:
        # Fall back to parsing the archive name when properties.json cannot be read
        module_name, version = re.match(r'([^-/]+)-(.+)\.tar\.gz$', path.name).groups()
        return module_name, version
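# Quick check of the fallback branch (the archive name below is invented):
# the pattern splits "<module>-<version>.tar.gz" into its two parts.
import re

module_name, version = re.match(r'([^-/]+)-(.+)\.tar\.gz$',
                                'my_module-0.1.3.tar.gz').groups()
print(module_name, version)  # my_module 0.1.3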
def _read_bundle(path: Path):
    index_frame = pd.read_parquet(path.joinpath('index.parquet'))
    files = (Query
             .folder(path)
             .where(lambda z: z.name != 'index.parquet')
             .where(lambda z: z.name.endswith('.parquet'))
             .to_list())
    data_frames = Query.en(files).to_dictionary(
        lambda z: z.name.split('.')[0],
        lambda z: pd.read_parquet(z))
    return DataBundle(index_frame, data_frames)
def download_folder(bucket: str, s3_path: str, folder: Path, report=None):
    # Recreate the local folder so the download starts from a clean state
    if os.path.exists(folder.__str__()):
        shutil.rmtree(folder.__str__())
    os.makedirs(folder.__str__())
    s3_resource = boto3.resource('s3')
    bucket_obj = s3_resource.Bucket(bucket)
    keys = [z.key for z in bucket_obj.objects.filter(Prefix=s3_path)]
    keys = Query.en(keys)
    if report == 'tqdm':
        keys = keys.feed(fluq.with_progress_bar())
    for key in keys:
        # Strip the prefix so files keep their relative layout locally
        proper_key = key[len(s3_path):]
        if proper_key.startswith('/'):
            proper_key = proper_key[1:]
        filename = folder.joinpath(proper_key)
        S3Handler.download_file(bucket, key, filename)
def _get_data_iter(self, start_date: datetime.datetime, end_date: datetime.datetime):
    start_date_str = str(start_date)
    end_date_str = str(end_date)
    logger.info(f"Retrieving updated ids from {start_date_str} to {end_date_str}")
    sql = self.id_retrieve_sql_template.format(start_date=start_date_str, end_date=end_date_str)
    id_src = self.source_factory(sql)
    ids = id_src.get_data().select(lambda z: z['id']).select(str).to_list()
    # Split the ids into fixed-size partitions; each partition is downloaded with its own query
    partitions = Query.en(ids).feed(fluq.partition_by_count(self.partition_size)).to_list()
    logger.info(f'Retrieving {len(ids)} records, {len(partitions)} partitions')
    for index, partition in enumerate(partitions):
        id_list = ','.join(partition)
        sql = self.download_sql_template.format(id_list=id_list)
        src = self.source_factory(sql)
        for item in src.get_data():
            yield item
        logger.info(f"Partition {index} is processed")
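# Hedged illustration of the partitioning step above, reusing the Query/fluq
# helpers the surrounding module imports. It assumes partition_by_count yields
# consecutive chunks of at most partition_size elements, as its use above
# suggests; the exact chunk shapes are an assumption, not verified output.
ids = [str(i) for i in range(10)]
partitions = Query.en(ids).feed(fluq.partition_by_count(4)).to_list()
# expected: [['0', '1', '2', '3'], ['4', '5', '6', '7'], ['8', '9']]
id_lists = [','.join(p) for p in partitions]
# expected: ['0,1,2,3', '4,5,6,7', '8,9']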
def test_zip_file(self):
    src = Query.en(range(10))
    path = Path(__file__).parent.joinpath('test_cache')
    cache = ZippedFileDataSource(path, buffer_size=4)
    self.assertEqual(False, cache.is_available())
    cache.cache_from(src, 7)
    self.assertEqual(True, cache.is_available())
    self.assertEqual("7", FileIO.read_text(path.__str__() + '.pkllines.zip.length'))
    stored = Query.file.zipped_folder(path.__str__() + '.pkllines.zip').to_dictionary()
    self.assertEqual(2, len(stored))
    self.assertListEqual([0, 1, 2, 3], stored['0'])
    self.assertListEqual([4, 5, 6], stored['1'])
    result = cache.get_data().to_list()
    self.assertListEqual(list(range(7)), result)
    os.unlink(path.__str__() + '.pkllines.zip.length')
    os.unlink(path.__str__() + '.pkllines.zip')
    def _postprocess(self, df: pd.DataFrame) -> pd.DataFrame:
        return df.set_index('a')


class MyFeaturizerFailing(DataframeFeaturizer):
    def __init__(self):
        super(MyFeaturizerFailing, self).__init__()

    def _featurize(self, item: Any) -> List[Any]:
        return [item]

    def _validate(self):
        raise ValueError()


data = Query.en(range(5)).select(lambda z: dict(a=z)).to_dataframe()


class BatchJobTestCase(TestCase):
    def test_simple(self):
        mem = InMemoryJobDestination()
        job = FeaturizationJob(
            'test',
            'test',
            MockDfDataSource(data),
            {'def': MyFeaturizerSimple()},
            mem,
            None,
            None
def get_data(self):
    return Query.en(self._get_data_iter(self.start_date, self.end_date))
def parse(self) -> Queryable[CorpusFragment]:
    return Query.en(self._parse_iter())
def get_data(self):
    if MockUpdateSource.state == 1:
        return Query.en([2, 1, 0, 3, 4, 5]).select(lambda z: dict(id=z))
    else:
        return Query.en([])
def get_data(self, **kwargs):
    return Query.en([1])