def test_parquet_saver(self):
    """Round-trip check: ParquetSaver output must be readable by ParquetLoader.

    Fixes two cleanup defects in the original version:

    * the second saver wrote to ``tmpdir + "2"`` — a *sibling* of the
      temporary directory — which leaked on disk after the test; both
      output directories now live inside ``tmpdir``;
    * the explicit ``shutil.rmtree`` in a ``finally`` clause deleted the
      directory that ``TemporaryDirectory.__exit__`` was about to remove
      itself, breaking the context manager's own cleanup.  The context
      manager alone is now responsible for removal.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        # Anchor outputs inside tmpdir so "<out>" and "<out>2" are both
        # removed automatically when the context manager exits.
        dirname = os.path.join(tmpdir, "out")
        # load and save data
        rows = [("Alice", 1)]
        df = self.spark.createDataFrame(rows, ["name", "age"])
        ParquetSaver(dirname + "/", explain=True)(df.rdd)
        ParquetSaver(dirname + "2/")(df.rdd)
        # read saved data and check it — the loader only reads the first
        # output directory, so exactly one row is expected
        data = ParquetLoader(session=self.spark, paths=dirname).execute()
        self.assertEqual(data.count(), 1)
def preprocess_source(args):
    """Extract UAST fields from the source dataset and save them as batches.

    Refuses to run when ``args.batches`` already exists (returns 1 so the
    caller can use it as an exit code).
    """
    log = logging.getLogger("preprocess_source")
    # Never clobber a previous run's output.
    if os.path.exists(args.batches):
        log.critical("%s must not exist", args.batches)
        return 1
    # Normalize an unset/falsy config to an empty list.
    args.config = args.config or []
    engine = create_engine("source2bags-%s" % uuid4(), args.repositories, args)
    # Fluent pipeline: select revisions, extract UASTs for the requested
    # language, keep the wanted fields, and write parquet batches.
    Engine(engine, explain=args.explain) \
        .link(DzhigurdaFiles(args.dzhigurda)) \
        .link(UastExtractor(languages=[args.language])) \
        .link(FieldsSelector(fields=args.fields)) \
        .link(ParquetSaver(save_loc=args.batches)) \
        .explode()
def preprocess_source(args):
    """Extract UAST fields from the source dataset and write them to parquet.

    Returns 1 (without doing any work) when ``args.output`` already exists,
    so stale results are never overwritten.
    """
    log = logging.getLogger("preprocess_source")
    if os.path.exists(args.output):
        log.critical("%s must not exist", args.output)
        return 1
    if not args.config:
        args.config = []
    engine = create_engine("source2bags-%s" % uuid4(), **args.__dict__)
    ignition = Ignition(engine, explain=args.explain)
    # Build the pipeline step by step; each link() returns the next stage.
    files = ignition.link(DzhigurdaFiles(args.dzhigurda))
    uasts = files.link(UastExtractor(languages=args.languages))
    fields = uasts.link(FieldsSelector(fields=args.fields))
    saver = fields.link(ParquetSaver(save_loc=args.output))
    saver.execute()
    # Optionally dump the pipeline structure for debugging/visualization.
    pipeline_graph(args, log, ignition)
def preprocess_repos(args):
    """Convert a repository UAST source into parquet batches of fields.

    Returns 1 (without doing any work) when ``args.output`` already exists.
    """
    log = logging.getLogger("preprocess_repos")
    session_name = "preprocess_repos-%s" % uuid4()
    if os.path.exists(args.output):
        log.critical("%s must not exist", args.output)
        return 1
    if not args.config:
        args.config = []
    # root is the pipeline head (for graph dumping); start_point is where
    # further stages are attached.
    root, start_point = create_uast_source(args, session_name)
    moded = start_point.link(Moder(args.mode))
    selected = moded.link(FieldsSelector(fields=args.fields))
    saver = selected.link(ParquetSaver(save_loc=args.output))
    saver.execute()
    # Optionally dump the pipeline structure for debugging/visualization.
    pipeline_graph(args, log, root)