def write_evidence_strings(evidence_df: DataFrame, output_file: str) -> None:
    """
    Exports the table to a compressed JSON dataset containing the evidence strings.

    Coalescing to a single partition yields one gzipped part file, but Spark
    still writes it inside the `output_file` directory.
    """
    evidence_df.coalesce(1).write.format('json').mode('overwrite').option(
        'compression', 'gzip').save(output_file)
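# A minimal usage sketch, assuming an active SparkSession; the column names
# and output path below are illustrative only.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('evidence-export').getOrCreate()
evidence = spark.createDataFrame([('ENSG00000141510', 0.9)], ['targetId', 'score'])

# Produces a directory 'evidence_output' holding one gzipped JSON part file.
write_evidence_strings(evidence, 'evidence_output')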
def repartitionDF(self, df: DataFrame, partitions: int = 0):
    '''
    Repartition the input dataframe.

    params:
        df -> dataframe
        partitions -> new partition count, defaulted to 0, i.e. don't repartition

    logic:
        if partitions == 0, don't repartition
        if partitions == -1, repartition to the default number
            (NumOfExecutors * ExecutorCores * 2)
        if partitions > 0, repartition/coalesce to the requested number
    '''
    # getNumPartitions must be called; the bare attribute is a method object.
    curParts = df.rdd.getNumPartitions()
    finalParts = min(curParts, partitions)
    if curParts == partitions or partitions == 0:
        finalParts = -1  # sentinel: leave the dataframe untouched
    elif partitions == -1:
        finalParts = self.__dfltRDDParts
    elif partitions > 0:
        finalParts = partitions
    else:
        pass  # finalParts is pre-populated

    self.log("Current Partitions: %d, Requested: %d, Final: %d"
             % (curParts, partitions, finalParts))

    if finalParts == -1:
        return df
    elif curParts > finalParts:
        # Shrinking the partition count: coalesce avoids a full shuffle.
        return df.coalesce(finalParts)
    else:
        return df.repartition(finalParts)
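# A hedged sketch of the host class this method assumes: the class name,
# executor/core defaults, and print-based logger below are all hypothetical.
from pyspark.sql import SparkSession

class SparkUtils:
    def __init__(self, executors: int = 2, cores: int = 2):
        # Assumed default partition count: NumOfExecutors * ExecutorCores * 2.
        self.__dfltRDDParts = executors * cores * 2

    def log(self, msg: str) -> None:
        print(msg)  # stand-in for the real logger

    # repartitionDF (above) belongs here, inside the class body, so the
    # name-mangled self.__dfltRDDParts attribute resolves correctly.

spark = SparkSession.builder.getOrCreate()
df = spark.range(1_000_000)
utils = SparkUtils()
# utils.repartitionDF(df, partitions=4)   # coalesce down to 4 partitions
# utils.repartitionDF(df, partitions=-1)  # expand to the configured default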
def write(self, feature_set: FeatureSet, dataframe: DataFrame,
          spark_client: SparkClient) -> Any:
    """Write the output to a single-file CSV dataset."""
    path = f"data/datasets/{feature_set.name}"
    spark_client.write_dataframe(
        dataframe=dataframe.coalesce(1),
        format_="csv",
        mode="overwrite",
        path=path,
        header=True,
    )
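# A hedged usage sketch: FeatureSet and SparkClient are stubbed out below to
# mirror only the interface the method relies on; the real classes carry far
# more behaviour, and the stub bodies here are assumptions.
from dataclasses import dataclass
from pyspark.sql import SparkSession

@dataclass
class FeatureSet:
    name: str  # minimal stand-in

class SparkClient:
    def write_dataframe(self, dataframe, format_, mode, path, **options):
        # Delegate to the plain Spark writer; extra options such as
        # header=True are forwarded as writer options.
        dataframe.write.save(path=path, format=format_, mode=mode, **options)

spark = SparkSession.builder.getOrCreate()
features = spark.createDataFrame([(1, 0.5)], ['id', 'feature'])

# Assuming `writer` is an instance of the class defining write():
# writer.write(FeatureSet(name='users'), features, SparkClient())
# -> one CSV part file under data/datasets/users/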
def writeFile(df: DataFrame, filepath: str, filetype: str) -> None:
    if isinstance(df, DataFrame):
        # Resolve the writer method by name (e.g. 'csv') and emit a single
        # file with a header row. Note: the header option is CSV-specific;
        # other writer methods may not accept it.
        getattr(df.coalesce(1).write, filetype)(filepath, mode="overwrite", header="true")
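# A minimal usage sketch; the output path is illustrative. 'csv' resolves
# to df.write.csv, which accepts the header option used above.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sample = spark.createDataFrame([(1, 'a'), (2, 'b')], ['id', 'val'])

writeFile(sample, '/tmp/single_csv_out', 'csv')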
import glob
from typing import Sequence

# Spark_df is assumed to alias the Spark DataFrame type used in the annotation.
from pyspark.sql import DataFrame as Spark_df

def _save_parquet_local(spark_df: Spark_df, fpath: str) -> Sequence[str]:
    # One partition -> one part file; glob returns its concrete path(s).
    spark_df.coalesce(1).write.parquet(fpath)
    return glob.glob(f"{fpath}/*.parquet")
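# A minimal usage sketch; the local path is hypothetical, and the write
# fails if the directory already exists (no mode is set above).
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
numbers = spark.range(10)

paths = _save_parquet_local(numbers, '/tmp/parquet_out')
print(paths)  # e.g. ['/tmp/parquet_out/part-00000-<uuid>.parquet']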