def get_column_spec(self, source_df: Optional[DataFrame], current_column: Optional[Column]) -> Column:
    return filter(
        self.column.get_column_spec(
            source_df=source_df, current_column=current_column
        ),
        self.func,
    )
def get_column_spec(
    self,
    source_df: Optional[DataFrame],
    current_column: Optional[Column],
    parent_columns: Optional[List[Column]],
) -> Column:
    if parent_columns is None:
        parent_columns = []
    if current_column is not None:
        parent_columns.append(current_column)
    return filter(
        self.array_field.get_column_spec(
            source_df=source_df,
            current_column=current_column,
            parent_columns=parent_columns,
        ),
        lambda y: exists(
            self.inner_array_field.get_column_spec(
                source_df=source_df,
                current_column=y,
                parent_columns=parent_columns,
            ),
            lambda x: x[self.match_property]
            == self.match_value.get_column_spec(
                source_df=source_df,
                current_column=y,
                parent_columns=parent_columns,
            ),
        ),
    )
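# Illustration-only sketch (not from the library) of the filter + exists combination
# that the nested_array_filter column spec above builds, using plain pyspark
# higher-order functions. Assumes Spark >= 3.1; the data and column names are made up.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, exists, filter as array_filter, lit

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.sql(
    "SELECT array("
    "  named_struct('array2', array(named_struct('reference', 'foo'))),"
    "  named_struct('array2', array(named_struct('reference', 'bar')))"
    ") AS array1"
)

# Keep only the outer elements whose inner array2 contains a matching reference,
# mirroring filter(array_field, lambda y: exists(inner_array_field, ...)).
df.select(
    array_filter(
        col("array1"),
        lambda y: exists(y["array2"], lambda x: x["reference"] == lit("bar")),
    ).alias("matched")
).show(truncate=False)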
def test_automapper_filter(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file), multiLine=True)
    source_df.createOrReplaceTempView("patients")
    source_df.show(truncate=False)

    # Act
    mapper = AutoMapper(view="members", source_view="patients").columns(
        age=A.filter(
            column=A.column("identifier"),
            func=lambda x: x["use"] == lit("usual"),
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"],
        filter("b.identifier", lambda x: x["use"] == lit("usual")).alias("age"),
    )

    result_df: DataFrame = mapper.transform(df=source_df)
    result_df.show(truncate=False)
def get_column_spec(self, source_df: Optional[DataFrame], current_column: Optional[Column]) -> Column:
    return flatten(
        filter(
            self.column.get_column_spec(
                source_df=source_df, current_column=current_column
            ),
            lambda x: x.isNotNull(),
        )
    )
def test_auto_mapper_array_multiple_items_with_null(
    spark_session: SparkSession,
) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df: DataFrame = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(dst2=AutoMapperList(["address1", "address2", None]))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["dst2"],
        when(
            array(lit("address1"), lit("address2"), lit(None)).isNotNull(),
            filter(
                coalesce(array(lit("address1"), lit("address2"), lit(None)), array()),
                lambda x: x.isNotNull(),
            ),
        ).alias("dst2"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select("dst2").collect()[0][0][0] == "address1"
    assert result_df.where("member_id == 1").select("dst2").collect()[0][0][1] == "address2"
    assert result_df.where("member_id == 2").select("dst2").collect()[0][0][0] == "address1"
    assert result_df.where("member_id == 2").select("dst2").collect()[0][0][1] == "address2"
def get_column_spec(
    self,
    source_df: Optional[DataFrame],
    current_column: Optional[Column],
    parent_columns: Optional[List[Column]],
) -> Column:
    return filter(
        self.column.get_column_spec(
            source_df=source_df,
            current_column=current_column,
            parent_columns=parent_columns,
        ),
        self.func,
    )
def test_automapper_nested_array_filter_simple_with_array(
    spark_session: SparkSession,
) -> None:
    clean_spark_session(spark_session)

    data_dir: Path = Path(__file__).parent.joinpath("./")
    environ["LOGLEVEL"] = "DEBUG"
    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file), multiLine=True)
    source_df.createOrReplaceTempView("patients")
    source_df.show(truncate=False)

    # Act
    mapper = AutoMapper(view="members", source_view="patients").columns(
        age=A.nested_array_filter(
            array_field=A.column("array1"),
            inner_array_field=A.field("array2"),
            match_property="reference",
            match_value=A.text("bar"),
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"],
        filter(
            col("b.array1"),
            lambda y: exists(
                y["array2"],
                lambda x: x["reference"] == lit("bar").cast("string"),
            ),
        ).alias("age"),
    )

    result_df: DataFrame = mapper.transform(df=source_df)
    result_df.printSchema()
    result_df.show(truncate=False)

    assert result_df.count() == 2
    assert result_df.select("age").collect()[0][0] == []
    assert result_df.select("age").collect()[1][0][0]["array2"][0]["reference"] == "bar"
def get_column_spec(
    self,
    source_df: Optional[DataFrame],
    current_column: Optional[Column],
    parent_columns: Optional[List[Column]],
) -> Column:
    if isinstance(self.value, list):  # if the src column is a list then iterate
        inner_array = array(
            *[
                self.get_value(
                    item,
                    source_df=source_df,
                    current_column=current_column,
                    parent_columns=parent_columns,
                )
                for item in self.value
            ]
        )
        return when(
            inner_array.isNotNull(),
            filter(inner_array, lambda x: x.isNotNull() & ~x.eqNullSafe("")),
        )

    # if value is an AutoMapper then ask it for its column spec
    if isinstance(self.value, AutoMapperDataTypeBase):
        child: AutoMapperDataTypeBase = self.value
        inner_child_spec = child.get_column_spec(
            source_df=source_df,
            current_column=current_column,
            parent_columns=parent_columns,
        )
        return when(
            inner_child_spec.isNotNull(),
            filter(inner_child_spec, lambda x: x.isNotNull() & ~x.eqNullSafe("")),
        )

    raise ValueError(f"value: {self.value} is neither list nor AutoMapper")
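# Illustration-only sketch (not from the library) of the element predicate used in
# get_column_spec above: keep array elements that are neither null nor the empty
# string via x.isNotNull() & ~x.eqNullSafe(""). Assumes plain pyspark >= 3.1; the
# data below is made up.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, filter as array_filter

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(["a", "", None, "b"],)], ["values"])

df.select(
    array_filter(col("values"), lambda x: x.isNotNull() & ~x.eqNullSafe("")).alias("cleaned")
).show()
# +-------+
# |cleaned|
# +-------+
# | [a, b]|
# +-------+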
def test_automapper_select_one(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")
    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file), multiLine=True)
    source_df.createOrReplaceTempView("patients")
    source_df.show(truncate=False)

    # Act
    mapper = AutoMapper(view="members", source_view="patients").columns(
        age=A.column("identifier")
        .filter(lambda x: x["system"] == "http://hl7.org/fhir/sid/us-npi")
        .select_one(A.field("_.value"))
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"],
        transform(
            filter(
                "b.identifier",
                lambda x: x["system"] == lit("http://hl7.org/fhir/sid/us-npi"),
            ),
            lambda x: x["value"],
        )[0].alias("age"),
    )

    result_df: DataFrame = mapper.transform(df=source_df)
    result_df.show(truncate=False)

    assert result_df.select("age").collect()[0][0] == "1730325416"
    assert result_df.select("age").collect()[1][0] == "1467734301"
def get_column_spec(
    self, source_df: Optional[DataFrame], current_column: Optional[Column]
) -> Column:
    if isinstance(
        self.value, str
    ):  # if the src column is just a string then consider it a sql expression
        return array(lit(self.value))

    if isinstance(self.value, list):  # if the src column is a list then iterate
        return (
            filter(
                array(
                    *[
                        self.get_value(
                            item, source_df=source_df, current_column=current_column
                        )
                        for item in self.value
                    ]
                ),
                lambda x: x.isNotNull(),
            )
            if self.remove_nulls
            else array(
                *[
                    self.get_value(
                        item, source_df=source_df, current_column=current_column
                    )
                    for item in self.value
                ]
            )
        )

    # if value is an AutoMapper then ask it for its column spec
    if isinstance(self.value, AutoMapperDataTypeBase):
        child: AutoMapperDataTypeBase = self.value
        return child.get_column_spec(
            source_df=source_df, current_column=current_column
        )

    raise ValueError(f"value: {self.value} is neither str nor AutoMapper")
def test_auto_mapper_columns(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(
        dst1="src1",
        dst2=AutoMapperList(["address1"]),
        dst3=AutoMapperList(["address1", "address2"]),
        dst4=AutoMapperList([A.complex(use="usual", family=A.column("last_name"))]),
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    # Assert
    assert len(sql_expressions) == 4
    assert_compare_expressions(sql_expressions["dst1"], lit("src1").alias("dst1"))
    assert_compare_expressions(
        sql_expressions["dst2"],
        when(
            array(lit("address1")).isNotNull(),
            filter(coalesce(array(lit("address1")), array()), lambda x: x.isNotNull()),
        ).alias("dst2"),
    )
    assert_compare_expressions(
        sql_expressions["dst3"],
        when(
            array(lit("address1"), lit("address2")).isNotNull(),
            filter(
                coalesce(array(lit("address1"), lit("address2")), array()),
                lambda x: x.isNotNull(),
            ),
        ).alias("dst3"),
    )
    assert_compare_expressions(
        sql_expressions["dst4"],
        when(
            array(
                struct(lit("usual").alias("use"), col("b.last_name").alias("family"))
            ).isNotNull(),
            filter(
                coalesce(
                    array(
                        struct(
                            lit("usual").alias("use"),
                            col("b.last_name").alias("family"),
                        )
                    ),
                    array(),
                ),
                lambda x: x.isNotNull(),
            ),
        ).alias("dst4"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert len(result_df.columns) == 5
    assert result_df.where("member_id == 1").select("dst1").collect()[0][0] == "src1"
    assert result_df.where("member_id == 1").select("dst2").collect()[0][0][0] == "address1"
    assert result_df.where("member_id == 1").select("dst3").collect()[0][0][0] == "address1"
    assert result_df.where("member_id == 1").select("dst3").collect()[0][0][1] == "address2"
    assert result_df.where("member_id == 1").select("dst4").collect()[0][0][0][0] == "usual"
    assert result_df.where("member_id == 1").select("dst4").collect()[0][0][0][1] == "Qureshi"
def test_auto_mapper_list_addition_multiple_items_structs_different_elements_with_schema(
    spark_session: SparkSession,
) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, None, "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df: DataFrame = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    schema: StructType = StructType(
        [
            StructField("id", StringType(), True),
            StructField("c", StringType(), True),
            StructField("b", StringType(), True),
        ]
    )

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(
        dst2=AutoMapperList(
            [
                AutoMapperDataTypeComplexBase(
                    id_=A.column("first_name"), b=A.column("last_name")
                ),
            ],
            include_null_properties=True,
            children_schema=schema,
        )
        + AutoMapperList(
            [
                AutoMapperDataTypeComplexBase(
                    id_=A.column("first_name"), c=A.column("last_name")
                ),
            ],
            include_null_properties=True,
            children_schema=schema,
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    struct1 = struct(
        col("b.first_name").alias("id"),
        lit(None).alias("c"),
        col("b.last_name").alias("b"),
    )
    struct2 = struct(
        col("b.first_name").alias("id"),
        col("b.last_name").alias("c"),
        lit(None).alias("b"),
    )
    array1 = when(
        array(struct1, struct2).isNotNull(),
        filter(coalesce(array(struct1, struct2), array()), lambda x: x.isNotNull()),
    )
    assert_compare_expressions(sql_expressions["dst2"], array1.alias("dst2"))

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select("dst2").collect()[0][0][0][0] == "Imran"
    assert result_df.where("member_id == 1").select("dst2").collect()[0][0][0][2] == "Qureshi"
    assert result_df.where("member_id == 2").select("dst2").collect()[0][0][0][0] == "Michael"
    assert result_df.where("member_id == 2").select("dst2").collect()[0][0][0][1] is None
def main(argv):
    mem_bytes = os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES")  # e.g. 4015976448
    mem_gib = int((mem_bytes / (1024.0**3)) * 0.9)
    tar_jar = os.path.join(find_runfiles(), "__main__/galvasr2/spark/tar_spark_datasource.jar")
    spark = (
        pyspark.sql.SparkSession.builder.master(f"local[{os.cpu_count() - 1}]")
        .config("spark.eventLog.enabled", "true")
        .config("spark.eventLog.dir", "/spark-events")
        .config("spark.sql.execution.arrow.pyspark.enabled", "true")
        .config("spark.driver.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true")
        .config("spark.executor.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true")
        .config("spark.driver.memory", f"{mem_gib}g")
        .config("spark.history.fs.logDirectory", "/spark-events")
        .config("spark.sql.execution.arrow.maxRecordsPerBatch", "1")
        .config("spark.jars", tar_jar)
        .config("spark.local.dir", "/mnt/disks/spark-scratch/")
        .getOrCreate()
    )
    spark.sparkContext.setLogLevel("INFO")  # "ALL" for very verbose logging
    logging.getLogger("py4j").setLevel(logging.ERROR)

    catalogue_df = load_audio_id_text_id_mapping(spark, FLAGS.input_catalogue)
    _, licenseurl_df = load_audio_and_text_dfs(spark, FLAGS.input_catalogue)
    licenseurl_df = licenseurl_df.select(
        [F.col("identifier"), F.col("text_document_id"), F.col("licenseurl")]
    )

    # Kaldi's wav.scp format does not support space characters in the key field of a wav.scp file.
    # We write the transcript to a file called "{kaldi_normalized_uttid}.ctm", so we also need to
    # change all instances of "/" to "_".
    catalogue_df = catalogue_df.withColumn(
        "kaldi_normalized_uttid",
        F.concat_ws(
            "-",
            F.translate(catalogue_df.identifier, " /", "__"),
            F.translate(catalogue_df.audio_document_id, " /", "__"),
        ),
    )
    # key_int_mapping = os.path.join(FLAGS.work_dir, "key_int_mapping_csv")
    if not FLAGS.work_dir.startswith("gs://"):
        os.makedirs(FLAGS.work_dir, exist_ok=True)
    wav_scp = os.path.join(FLAGS.work_dir, "wav.scp")
    ctm_out_dir = os.path.join(FLAGS.work_dir, "decoder_ctm_dir")

    if FLAGS.stage <= 0:
        catalogue_df = catalogue_df.cache()
        # catalogue_df.write.mode("overwrite").format("csv").options(header="true").save(key_int_mapping)
        training_sample_rows = catalogue_df.collect()
        catalogue_df.unpersist()

        with TemporaryMountDirectory(
            mount_cmd=[
                "gcsfuse",
                "--implicit-dirs",
                FLAGS.input_gcs_bucket.lstrip("gs://"),
            ],
            unmount_cmd=["fusermount", "-u"],
        ) as temp_dir_name:
            posix_wav_scp = re.sub(
                r"^{0}".format(FLAGS.input_gcs_bucket), temp_dir_name, wav_scp
            )
            create_wav_scp(posix_wav_scp, training_sample_rows, FLAGS.input_dir, ctm_out_dir)

    # /development/lingvo-source/output_ctm_dir/
    # nvprof --analysis-metrics -o decoder-analysis.nvprof \
    # We want only the best path, so we set lattice-beam to 0.1
    # --main-q-capacity=35000 \
    # Can get 266x RTF with this configuration. Keep it?
    # batch size of 100 and num channels of 100 works just fine
    if FLAGS.stage <= 1:
        if not FLAGS.work_dir.startswith("gs://"):
            os.makedirs(ctm_out_dir, exist_ok=True)
        with TemporaryMountDirectory(
            mount_cmd=[
                "gcsfuse",
                "--implicit-dirs",
                FLAGS.input_gcs_bucket.lstrip("gs://"),
            ],
            unmount_cmd=["fusermount", "-u"],
        ) as temp_dir_name:
            posix_ctm_out_dir = re.sub(
                r"^{0}".format(FLAGS.input_gcs_bucket), temp_dir_name, ctm_out_dir
            )
            posix_wav_scp = re.sub(
                r"^{0}".format(FLAGS.input_gcs_bucket), temp_dir_name, wav_scp
            )
            posix_work_dir = re.sub(
                r"^{0}".format(FLAGS.input_gcs_bucket), temp_dir_name, FLAGS.work_dir
            )
            num_gpus = 4
            posix_wav_scp_shards = split_wav_scp(posix_wav_scp, posix_work_dir, num_gpus)

            executor = ThreadPoolExecutor(max_workers=num_gpus)

            def run_gpu(posix_wav_scp_shard, gpu_number):
                cmd = f"""\
/opt/kaldi/src/cudadecoderbin/batched-wav-nnet3-cuda3 \
--frame-subsampling-factor=3 \
--config=/opt/kaldi/egs/aspire/s5/exp/tdnn_7b_chain_online/conf/online.conf \
--max-active=7000 \
--beam=15.0 \
--lattice-beam=0.1 \
--acoustic-scale=1.0 \
--cuda-decoder-copy-threads=2 \
--cuda-worker-threads={os.cpu_count() // num_gpus} \
--segmentation=true \
--cuda-use-tensor-cores=true \
--max-batch-size=150 \
--num-channels=250 \
--lattice-postprocessor-rxfilename=/development/lingvo-source/lattice_postprocess.conf \
--word-symbol-table=/opt/kaldi/egs/aspire/s5/exp/tdnn_7b_chain_online/graph_pp/words.txt \
/opt/kaldi/egs/aspire/s5/exp/chain/tdnn_7b/final.mdl \
/opt/kaldi/egs/aspire/s5/exp/tdnn_7b_chain_online/graph_pp/HCLG.fst \
scp,p:{posix_wav_scp_shard} \
{posix_ctm_out_dir}
"""
                env = deepcopy(os.environ)
                env["CUDA_VISIBLE_DEVICES"] = f"{gpu_number}"
                subprocess.check_call(shlex.split(cmd), env=env)

            for i, shard in enumerate(posix_wav_scp_shards):
                executor.submit(run_gpu, shard, i)
            executor.shutdown(wait=True)

    alignments_dir = os.path.join(FLAGS.alignments_work_dir, "alignments_json_jul_28")
    if FLAGS.stage <= 2:
        # TODO: Add options to DSAlign here
        dsalign_args = dsalign_main.parse_args(
            ["--output-wer", "--output-cer"]
        )  # , "--output-sws", "--output-levenshtein"

        alphabet_normalized_path = (
            "/development/lingvo-source/galvasr2/align/spark/alphabet2.txt"
        )
        align_udf = prepare_align_udf(dsalign_args, alphabet_normalized_path, 15_000, 3_000)

        ctm_df = (
            spark.read.format("binaryFile")
            .option("pathGlobFilter", "*.ctm")
            .load(ctm_out_dir)
        )
        ctm_df = ctm_df.withColumn(
            "kaldi_normalized_uttid",
            F.regexp_replace(F.reverse(F.split(ctm_df.path, "/"))[0], r"[.]ctm$", ""),
        )
        ctm_df = ctm_df.withColumn("ctm_content", fix_text_udf(F.col("content"))).drop(
            "path", "length", "modificationTime", "content"
        )

        ctm_df = ctm_df.join(catalogue_df, "kaldi_normalized_uttid")
        downsampled_catalogue_df = ctm_df.drop("ctm_content")

        training_sample_rows = downsampled_catalogue_df.collect()
        transcripts_df = load_transcripts(spark, FLAGS.input_gcs_path, training_sample_rows)
        transcripts_df = transcripts_df.withColumn(
            "transcript", normalize_english_text_udf(transcripts_df.transcript)
        )
        ctm_df = ctm_df.join(transcripts_df, ["identifier", "text_document_id"])
        ctm_df = ctm_df.repartition(960)

        # alignments_df = ctm_df.select(align_udf(F.concat(ctm_df.identifier, F.lit("/"), ctm_df.text_document_id),
        #                                         F.concat(ctm_df.identifier, F.lit("/"), ctm_df.audio_document_id),
        #                                         ctm_df.transcript, ctm_df.ctm_content))
        alignments_df = ctm_df.withColumn(
            "alignments",
            align_udf(
                F.concat(ctm_df.identifier, F.lit("/"), ctm_df.text_document_id),
                F.concat(ctm_df.identifier, F.lit("/"), ctm_df.audio_document_id),
                ctm_df.transcript,
                ctm_df.ctm_content,
            ),
        ).drop("ctm_content")
        print("GALVEZ:schema")
        alignments_df.printSchema()
        sys.stdout.flush()

        alignments_df.write.mode("overwrite").format("json").save(alignments_dir)

    manifest_dir = os.path.join(FLAGS.work_dir, "dataset_manifest")
    tars_dir = os.path.join(FLAGS.work_dir, "dataset_tars")
    if FLAGS.stage <= 3:
        duplicate_data_path = "gs://the-peoples-speech-west-europe/forced-aligner/data_deduplication/data_deduplication_v2_lines.json"
        duplicates_df = spark.read.format("json").load(duplicate_data_path)

        alignments_df = spark.read.json(alignments_dir)

        alignments_df = alignments_df.join(
            duplicates_df,
            on=(alignments_df.identifier == duplicates_df.identifier)
            & (alignments_df.text_document_id == duplicates_df.text_document_id),
            how="anti",
        )

        if FLAGS.license_filter == "":
            pass
        else:
            if FLAGS.license_filter == "Not CC-BY-SA":
                filtered_licenseurl_df = licenseurl_df.filter(
                    ~is_cc_by_sa(F.col("licenseurl"))
                )
            elif FLAGS.license_filter == "CC-BY-SA":
                filtered_licenseurl_df = licenseurl_df.filter(
                    is_cc_by_sa(F.col("licenseurl"))
                )
            else:
                raise Exception("Unknown license_filter provided.")
            filtered_licenseurl_df = filtered_licenseurl_df.drop("licenseurl")

            alignments_df = alignments_df.join(
                filtered_licenseurl_df,
                on=(alignments_df.identifier == filtered_licenseurl_df.identifier)
                & (
                    alignments_df.text_document_id
                    == filtered_licenseurl_df.text_document_id
                ),
                how="inner",
            )
            alignments_df = alignments_df.drop(filtered_licenseurl_df.identifier).drop(
                filtered_licenseurl_df.text_document_id
            )

        # We would like the number of partitions to be some large multiple
        # of the number of executors. Not every audio file is the same
        # length, so this helps with load balancing.
        alignments_df = alignments_df.withColumn(
            "duration_ms",
            F.expr(
                "transform(arrays_zip(alignments.end_ms, alignments.start_ms), x -> x.end_ms - x.start_ms)"
            ),
        )
        alignments_df = alignments_df.withColumn(
            "alignments",
            F.arrays_zip(
                alignments_df.alignments.cer,
                alignments_df.alignments.end_ms,
                alignments_df.alignments.label,
                alignments_df.alignments.start_ms,
                alignments_df.alignments.wer,
                alignments_df.duration_ms,
            ).cast(
                T.ArrayType(
                    T.StructType(
                        [
                            T.StructField("cer", T.FloatType()),
                            T.StructField("end_ms", T.LongType()),
                            T.StructField("label", T.StringType()),
                            T.StructField("start_ms", T.LongType()),
                            T.StructField("wer", T.FloatType()),
                            T.StructField("duration_ms", T.LongType()),
                        ]
                    )
                )
            ),
        )
        alignments_df = alignments_df.drop("duration_ms")

        alignments_df = alignments_df.withColumn(
            "alignments",
            F.filter(
                alignments_df.alignments,
                # Need to select this filter such that total number of
                # hours is 31,400
                lambda alignment: (alignment.duration_ms < FLAGS.max_duration_ms)
                & (alignment.duration_ms >= FLAGS.min_duration_ms)
                & (alignment.cer < FLAGS.max_cer)
                & (alignment.cer >= FLAGS.min_cer),
            ),
        )
        alignments_df = alignments_df.withColumn(
            "alignments",
            F.struct(
                alignments_df.alignments.cer,
                alignments_df.alignments.end_ms,
                alignments_df.alignments.label,
                alignments_df.alignments.start_ms,
                alignments_df.alignments.wer,
                alignments_df.alignments.duration_ms,
            ).cast(
                T.StructType(
                    [
                        T.StructField("cer", T.ArrayType(T.FloatType())),
                        T.StructField("end_ms", T.ArrayType(T.LongType())),
                        T.StructField("label", T.ArrayType(T.StringType())),
                        T.StructField("start_ms", T.ArrayType(T.LongType())),
                        T.StructField("wer", T.ArrayType(T.FloatType())),
                        T.StructField("duration_ms", T.ArrayType(T.LongType())),
                    ]
                )
            ),
        )

        alignments_df = alignments_df.repartition(960)

        abc = alignments_df.select(
            F.sum(F.expr("aggregate(alignments.duration_ms, 0L, (x, acc) -> acc + x)"))
            / 1000.0
            / 60.0
            / 60.0
        ).collect()
        print("GALVEZ:total number of hours=", abc)
        sys.stdout.flush()

        alignments_df = alignments_df.select(
            alignments_df.identifier,
            alignments_df.audio_document_id,
            alignments_df.text_document_id,
            alignments_df.alignments,
        )

        alignments_df = F.broadcast(alignments_df)

        audio_paths = F.concat(
            F.lit(FLAGS.input_gcs_path),
            F.lit("/"),
            F.col("identifier"),
            F.lit("/"),
            F.col("audio_document_id"),
        )
        rows = alignments_df.select(audio_paths).collect()
        paths = [row[0] for row in rows]  # [:1]  # GALVEZ: WARNING test!
        # print(f"number of paths = {len(paths)}")
        audio_df = (
            spark.read.format("binaryFile")
            .load(paths)
            .drop("modificationTime", "length")
        )

        alignments_audio_df = alignments_df.join(audio_df, audio_paths == audio_df.path)
        # from IPython import embed; embed()

        # Remove "/" so that, if someone untars the tar files, everything will be dumped into one directory.
        # Remove "." because it has special meaning in the webdataset format.
        # Remove " " because kaldi keys may not contain " " (this is not strictly necessary, but convenient).
        name = F.concat(F.col("identifier"), F.lit("/"), F.col("audio_document_id"))
        # name = F.regexp_replace(name, r"/", "_SLASH_")
        name = F.regexp_replace(name, r"\.", "_DOT_")
        name = F.regexp_replace(name, r" ", "_SPACE_")
        # glob.glob("**/*.flac")

        # Sanity-check that the generated names are usable as paths.
        pdf = alignments_audio_df.select(name.alias("name")).collect()
        for row in pdf:
            assert len(row.name) < 4096
            for chunk in row.name.split("/"):
                assert len(chunk) < 256

        # name = F.regexp_replace(F.concat(F.col("identifier"),
        #                                  F.lit("-"),
        #                                  F.col("audio_document_id")),
        #                         r"(\.|/)",
        #                         "_"
        #                         )

        # The name of each thing in the tar file. May not exceed 100 characters in length
        # substr indexes from 1!
        # name = name.substr(
        #     F.length(name) - F.least(F.length(name), F.lit(88)) + 1,
        #     F.least(F.length(name), F.lit(88))
        # )

        alignments_audio_df = alignments_audio_df.withColumn(
            "aligned_chunks",
            create_audio_segments_udf(
                alignments_audio_df.content,
                F.lit("mp3"),
                name,
                alignments_audio_df.alignments.start_ms,
                alignments_audio_df.alignments.end_ms,
                F.lit("flac"),
            ),
        )
        a = alignments_audio_df.select(
            F.explode(F.arrays_zip("aligned_chunks.audio_name", "aligned_chunks.audio"))
        ).select("col.0", "col.1")
        a.write.mode("overwrite").format("tar").save(tars_dir)

        output_df = alignments_audio_df.select(
            alignments_audio_df.identifier,
            alignments_audio_df.audio_document_id,
            alignments_audio_df.text_document_id,
            F.struct(
                alignments_audio_df.alignments.label.alias("label"),
                create_audio_segment_names_udf(
                    # Is F.size right here?
                    name,
                    F.size(alignments_audio_df.alignments.start_ms),
                    F.lit("flac"),
                ).alias("name"),
                alignments_audio_df.alignments.duration_ms.alias("duration_ms"),
            ).alias("training_data"),
        )
        output_df = output_df.coalesce(960)

        # coalesce(1) seems to make the create_audio_segments_udf function run serially
        output_df.write.mode("overwrite").json(manifest_dir)

    repartitioned_tars_dir = os.path.join(FLAGS.work_dir, "repartitioned_dataset_tars")
    tmp_tars_dir = os.path.join(FLAGS.work_dir, "repartitioned_dataset_tmp_dir")
    if FLAGS.stage <= 4:
        tars_df = spark.read.format("tar").load(tars_dir)  # .limit(100)
        number_of_rows = tars_df.count()

        spark2 = spark.newSession()
        spark2.conf.set(
            "spark.sql.execution.rangeExchange.sampleSizePerPartition", number_of_rows
        )
        spark2.conf.set("spark.sql.files.minPartitionNum", FLAGS.number_of_shards)
        # tars_df = spark2.read.format("tar").load(tars_dir)  # .limit(100)
        # print("GALVEZ:", tars_df.select(F.col("key")).collect())
        # import sys; sys.exit()
        tars_df = spark2.read.format("tar").load(tars_dir)  # .limit(100)
        tars_df = tars_df.repartitionByRange(FLAGS.number_of_shards, F.col("key"))
        # May need to write this out to GCS, and then delete it, to prevent different behavior between runs.
        # tars_df = tars_df.persist()
        tars_df.write.mode("overwrite").format("tar").save(tmp_tars_dir)
        tars_df = spark2.read.format("tar").load(tmp_tars_dir)  # .repartitionByRange()  # coalesce(1024)

        # counts_df = (
        #     tars_df.withColumn("partitionId", F.spark_partition_id())
        #     .groupBy("partitionId")
        #     .count()
        # )
        # num_rows_to_keep = counts_df.select(F.min(F.col("count"))).collect()[0][0]
        # # Consider doing this in java
        # def drop_final_rows(rows):
        #     for _ in range(num_rows_to_keep):
        #         yield next(rows)
        #     for _ in rows:
        #         pass
        #     return
        # print("GALVEZ:before=", tars_df.rdd.getNumPartitions())
        # # , preservesPartitioning=True
        # tars_df = spark2.createDataFrame(
        #     tars_df.rdd.mapPartitions(drop_final_rows), schema=tars_df.schema
        # )
        # print("GALVEZ:after=", tars_df.rdd.getNumPartitions())
        # import sys
        # sys.stdout.flush()
        # Don't actually write this out right now. It doesn't benefit us unless
        # we are doing nemo training in a specific mode.
        # tars_df.write.mode("overwrite").format("tar").save(repartitioned_tars_dir)

        # manifest_df = spark2.read.json(manifest_dir)
        # number_of_utterances = manifest_df.select(F.explode(F.col("training_data.name"))).count()
        # print(f"GALVEZ:number_of_utterances={number_of_utterances}")
        # utterances_per_shard = number_of_utterances // FLAGS.number_of_shards
        # repartition_tar_files(os.path.join(tars_dir, "*.tar"), repartitioned_tars_dir, utterances_per_shard)

    nemo_manifest_dir = os.path.join(FLAGS.work_dir, "dataset_manifest_nemo")
    nemo_single_manifest_dir = os.path.join(FLAGS.work_dir, "dataset_manifest_nemo_single")

    if FLAGS.stage <= 5:
        json_df = spark.read.format("json").load(manifest_dir)
        nemo_df = json_df.select(
            F.explode(
                F.arrays_zip(
                    F.col("training_data.name").alias("audio_filepath"),
                    F.col("training_data.label").alias("text"),
                    F.col("training_data.duration_ms").alias("duration_ms"),
                )
            )
        )
        nemo_df = nemo_df.select(
            F.col("col.name").alias("audio_filepath"),
            F.col("col.label").alias("text"),
            (F.col("col.duration_ms").cast(T.DoubleType()) / 1000.0).alias("duration"),
            F.lit(-1).alias("shard_id"),
        )
        if False:
            tars_df = spark.read.format("tar").load(repartitioned_tars_dir)
            tars_df = tars_df.select(tars_df.key)
            nemo_df = F.broadcast(nemo_df)
            nemo_df = nemo_df.join(
                tars_df, F.col("audio_filepath") == F.col("key")
            ).drop(F.col("key"))

        # TODO: Join against tar files that have been made to contain the
        # same number of files to filter out removed files
        nemo_df.write.mode("overwrite").format("json").save(nemo_manifest_dir)

        nemo_single_df = spark.read.format("json").load(nemo_manifest_dir)
        nemo_single_df.coalesce(1).write.mode("overwrite").format("json").save(
            nemo_single_manifest_dir
        )

    single_manifest_dir = os.path.join(FLAGS.work_dir, "dataset_manifest_single")
    single_tar_dir = os.path.join(FLAGS.work_dir, "dataset_tars_single")

    # Create a single tar file and a single json file
    if FLAGS.stage <= 6:
        json_df = spark.read.format("json").load(manifest_dir)
        json_df.coalesce(1).write.format("json").mode("overwrite").save(single_manifest_dir)
        tars_df = spark.read.format("tar").load(tmp_tars_dir)
        tars_df.coalesce(1).write.format("tar").mode("overwrite").save(single_tar_dir)
def test_auto_mapper_concat_multiple_items_structs_different_elements(
    spark_session: SparkSession,
) -> None:
    # Arrange
    clean_spark_session(spark_session)
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, None, "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", enable_schema_pruning=True
    ).columns(
        dst2=AutoMapperList(
            [
                AutoMapperDataTypeComplexBase(
                    a=A.column("first_name"), b=A.column("last_name")
                )
            ],
        ).concat(
            AutoMapperList(
                [
                    AutoMapperDataTypeComplexBase(
                        a=A.column("first_name"), c=A.column("last_name")
                    ),
                ],
            )
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    array1 = when(
        array(
            struct(
                col("b.first_name").alias("a"),
                col("b.last_name").alias("b"),
                lit(None).alias("c"),
            ),
        ).isNotNull(),
        filter(
            coalesce(
                array(
                    struct(
                        col("b.first_name").alias("a"),
                        col("b.last_name").alias("b"),
                        lit(None).alias("c"),
                    ),
                ),
                array(),
            ),
            lambda x: x.isNotNull(),
        ),
    )
    array2 = when(
        array(
            struct(
                col("b.first_name").alias("a"),
                lit(None).alias("b"),
                col("b.last_name").alias("c"),
            ),
        ).isNotNull(),
        filter(
            coalesce(
                array(
                    struct(
                        col("b.first_name").alias("a"),
                        lit(None).alias("b"),
                        col("b.last_name").alias("c"),
                    ),
                ),
                array(),
            ),
            lambda x: x.isNotNull(),
        ),
    )
    assert_compare_expressions(
        sql_expressions["dst2"], concat(array1, array2).alias("dst2")
    )

    result_df: DataFrame = mapper.transform(df=source_df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert result_df.where("member_id == 1").select("dst2").collect()[0][0][0][0] == "Imran"
    assert result_df.where("member_id == 1").select("dst2").collect()[0][0][0][1] == "Qureshi"
    assert result_df.where("member_id == 2").select("dst2").collect()[0][0][0][0] == "Michael"
    assert result_df.where("member_id == 2").select("dst2").collect()[0][0][0][1] is None
def main(self, sc: SparkContext, *args: Any):
    observations_parquet_path = args[0]
    genotype_phenotype_parquet_path = args[1]
    impc_images_parquet_path = args[2]
    product_parquet_path = args[3]
    gene_core_parquet_path = args[4]
    output_path = args[5]

    spark = SparkSession(sc)
    product_df = spark.read.parquet(product_parquet_path)
    gene_df = spark.read.parquet(gene_core_parquet_path)

    order_df = (
        product_df.groupBy(
            *[
                col_name
                for col_name in product_df.columns
                if col_name not in ["type", "tissue_enquiry_links"]
            ]
        )
        .agg(
            collect_set("type").alias("available_products"),
            collect_set("tissue_enquiry_links").alias("tissue_enquiry_links"),
        )
        .withColumn(
            "available_products",
            when(
                size("tissue_enquiry_links") > 0,
                concat("available_products", array(lit("tissue"))),
            ).otherwise(col("available_products")),
        )
    )
    gene_id_symbol = gene_df.select("mgi_accession_id", "marker_symbol").distinct()
    order_df = order_df.join(gene_id_symbol, "marker_symbol")

    gene_order_df = order_df.select(
        "marker_symbol", "allele_name", "allele_description", "available_products"
    ).distinct()
    gene_order_df = gene_order_df.withColumn(
        "_class", lit("org.mousephenotype.web.models.gene.Order")
    )
    gene_order_df.write.format("mongo").mode("append").option(
        "spark.mongodb.output.uri",
        f"{self.mongodb_connection_uri}/admin?replicaSet={self.mongodb_replica_set}",
    ).option("database", str(self.mongodb_database)).option(
        "collection", "gene-order"
    ).save()

    links_fields = [
        "genbank_file",
        "allele_image",
        "allele_simple_image",
        "vector_genbank_file",
        "vector_allele_image",
    ]
    for link_field in links_fields:
        allele_summary_df = order_df.withColumn(
            f"{link_field}_url",
            filter("other_links", lambda x: x.startswith(f"{link_field}:")),
        )
        allele_summary_df = allele_summary_df.withColumn(
            f"{link_field}_url",
            when(
                size(f"{link_field}_url") > 0,
                regexp_extract(
                    col(f"{link_field}_url").getItem(0), f"{link_field}:(.*)", 1
                ),
            ).otherwise(lit(None)),
        )

    genetic_info_fields = [
        "strain",
        "cassette",
        "cassette_type",
        "parent_es_cell_line",
    ]
    for genetic_info_field in genetic_info_fields:
        allele_summary_df = order_df.withColumn(
            genetic_info_field,
            filter("genetic_info", lambda x: x.startswith(f"{genetic_info_field}:")),
        )
        allele_summary_df = allele_summary_df.withColumn(
            genetic_info_field,
            when(
                size(genetic_info_field) > 0,
                regexp_extract(
                    col(genetic_info_field).getItem(0),
                    f"{genetic_info_field}:(.*)",
                    1,
                ),
            ).otherwise(lit(None)),
        )

    ## process by type and then join with the metadata dataframe
    mice_df = (
        allele_summary_df.where(col("type") == "mouse")
        .select(
            col("mgi_accession_id"),
            col("allele_name"),
            col("product_id"),
            col("name").alias("colony_name"),
            col("background_colony_strain"),
            col("production_centre"),
            col("qc_data"),
            col("associated_product_es_cell_name").alias("es_cell_parent_mouse_colony"),
        )
        .distinct()
    )
def get_column_spec(
    self,
    source_df: Optional[DataFrame],
    current_column: Optional[Column],
    parent_columns: Optional[List[Column]],
) -> Column:
    """
    returns a Spark Column definition
    """
    self.ensure_children_have_same_properties(
        skip_null_properties=self.skip_null_properties
    )
    if isinstance(
        self.value, str
    ):  # if the src column is just a string then consider it a sql expression
        return array(lit(self.value))

    if isinstance(self.value, list):  # if the src column is a list then iterate
        inner_array = array(
            *[
                self.get_value(
                    item,
                    source_df=source_df,
                    current_column=current_column,
                    parent_columns=parent_columns,
                )
                for item in self.value
            ]
        )
        return (
            when(
                inner_array.isNotNull(),
                # coalesce is needed otherwise Spark complains:
                # pyspark.sql.utils.AnalysisException: cannot resolve
                # 'filter(NULL, lambdafunction((x IS NOT NULL), x))' due to argument data type mismatch:
                # argument 1 requires array type, however, 'NULL' is of null type.;
                filter(coalesce(inner_array, array()), lambda x: x.isNotNull()),
            )
            if self.remove_nulls
            else inner_array
        )

    # if value is an AutoMapper then ask it for its column spec
    if isinstance(self.value, AutoMapperDataTypeBase):
        child: AutoMapperDataTypeBase = self.value
        inner_child_spec = child.get_column_spec(
            source_df=source_df,
            current_column=current_column,
            parent_columns=parent_columns,
        )
        return (
            when(
                inner_child_spec.isNotNull(),
                filter(
                    # coalesce is needed otherwise Spark complains:
                    # pyspark.sql.utils.AnalysisException: cannot resolve
                    # 'filter(NULL, lambdafunction((x IS NOT NULL), x))' due to argument data type mismatch:
                    # argument 1 requires array type, however, 'NULL' is of null type.;
                    coalesce(inner_child_spec, array()),
                    lambda x: x.isNotNull(),
                ),
            )
            if self.remove_nulls
            else inner_child_spec
        )

    raise ValueError(f"value: {self.value} is neither str nor AutoMapper")
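# Minimal sketch (illustration only, not part of the mapper) of why the
# coalesce(..., array()) guard in get_column_spec above is needed: filtering a NULL
# literal directly fails analysis with the error quoted in the comments, while
# coalescing it to an empty array first lets the expression resolve. Assumes
# pyspark >= 3.1; the cast to array<string> only gives the NULL a concrete type.
from pyspark.sql import SparkSession
from pyspark.sql.functions import array, coalesce, filter as array_filter, lit

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.range(1)

# Unguarded form (would raise pyspark.sql.utils.AnalysisException:
# "argument 1 requires array type, however, 'NULL' is of null type"):
# df.select(array_filter(lit(None), lambda x: x.isNotNull()))

# Guarded form resolves and simply yields an empty array.
df.select(
    array_filter(
        coalesce(lit(None).cast("array<string>"), array()),
        lambda x: x.isNotNull(),
    ).alias("safe")
).show()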
def test_auto_mapper_fhir_patient_resource_include_null_properties(
    spark_session: SparkSession,
) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "1970-01-01", "female"),
            (2, "Vidal", "Michael", "1970-02-02", None),
        ],
        ["member_id", "last_name", "first_name", "date_of_birth", "my_gender"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).complex(
        Patient(
            id_=FhirId(A.column("member_id")),
            birthDate=A.date(A.column("date_of_birth")),
            name=FhirList(
                [HumanName(use=NameUseCode("usual"), family=A.column("last_name"))],
                include_null_properties=True,
            ),
            gender=A.if_not_null(
                A.column("my_gender"), AdministrativeGenderCode(A.column("my_gender"))
            ),
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    assert len(sql_expressions) == 21
    assert str(sql_expressions["id"]) == str(
        substring(
            regexp_replace(col("b.member_id"), r"[^A-Za-z0-9\-\.]", "_"), 0, 63
        ).alias("id")
    )
    assert str(sql_expressions["resourceType"]) == str(
        lit("Patient").alias("resourceType")
    )
    assert str(sql_expressions["birthDate"]) == str(
        coalesce(
            to_date(col("b.date_of_birth"), "y-M-d"),
            to_date(col("b.date_of_birth"), "yyyyMMdd"),
            to_date(col("b.date_of_birth"), "M/d/y"),
        ).alias("birthDate")
    )
    assert str(sql_expressions["name"]) == str(
        filter(
            array(
                struct(
                    lit("usual").alias("use"),
                    lit(None).alias("text"),
                    col("b.last_name").alias("family"),
                    lit(None).alias("given"),
                    lit(None).alias("prefix"),
                    lit(None).alias("suffix"),
                    lit(None).alias("period"),
                )
            ),
            lambda x: x.isNotNull(),
        ).alias("name")
    )
    assert str(sql_expressions["gender"]) == str(
        when(col("b.my_gender").isNull(), None)
        .otherwise(col("b.my_gender"))
        .alias("gender")
    )

    result_df.printSchema()
    result_df.show()

    assert (
        result_df.where("member_id == 1").selectExpr("name[0].use").collect()[0][0]
        == "usual"
    )
    assert (
        result_df.where("member_id == 1").selectExpr("name[0].family").collect()[0][0]
        == "Qureshi"
    )
    assert (
        result_df.where("member_id == 2").selectExpr("name[0].use").collect()[0][0]
        == "usual"
    )
    assert (
        result_df.where("member_id == 2").selectExpr("name[0].family").collect()[0][0]
        == "Vidal"
    )
def test_alignments_filter(self):
    work_dir = "gs://the-peoples-speech-west-europe/forced-aligner/cuda-forced-aligner/output_work_dir_5b/output_work_dir_5b"
    alignments_dir = os.path.join(work_dir, "alignments_json_jul_28")
    spark = self.spark
    alignments_df = spark.read.json(alignments_dir)

    alignments_df = alignments_df.withColumn(
        "duration_ms",
        F.expr(
            "transform(arrays_zip(alignments.end_ms, alignments.start_ms), x -> x.end_ms - x.start_ms)"
        ),
    )
    alignments_df = alignments_df.withColumn(
        "alignments",
        F.arrays_zip(
            alignments_df.alignments.cer,
            alignments_df.alignments.end_ms,
            alignments_df.alignments.label,
            alignments_df.alignments.start_ms,
            alignments_df.alignments.wer,
            alignments_df.duration_ms,
        ).cast(
            T.ArrayType(
                T.StructType(
                    [
                        T.StructField("cer", T.FloatType()),
                        T.StructField("end_ms", T.LongType()),
                        T.StructField("label", T.StringType()),
                        T.StructField("start_ms", T.LongType()),
                        T.StructField("wer", T.FloatType()),
                        T.StructField("duration_ms", T.LongType()),
                    ]
                )
            )
        ),
    )
    alignments_df = alignments_df.drop("duration_ms")

    max_duration_ms = 20_000
    max_cer = 36.0
    min_duration_ms = 1_000

    alignments_df = alignments_df.withColumn(
        "alignments",
        F.filter(
            alignments_df.alignments,
            # Need to select this filter such that total number of
            # hours is 31,400
            lambda alignment: (alignment.duration_ms < max_duration_ms)
            & (alignment.cer < max_cer)
            & (alignment.duration_ms > min_duration_ms),
        ),
    )
    alignments_df = alignments_df.withColumn(
        "alignments",
        F.struct(
            alignments_df.alignments.cer,
            alignments_df.alignments.end_ms,
            alignments_df.alignments.label,
            alignments_df.alignments.start_ms,
            alignments_df.alignments.wer,
            # Is this the fix?
            alignments_df.alignments.duration_ms,
        ).cast(
            T.StructType(
                [
                    T.StructField("cer", T.ArrayType(T.FloatType())),
                    T.StructField("end_ms", T.ArrayType(T.LongType())),
                    T.StructField("label", T.ArrayType(T.StringType())),
                    T.StructField("start_ms", T.ArrayType(T.LongType())),
                    T.StructField("wer", T.ArrayType(T.FloatType())),
                    T.StructField("duration_ms", T.ArrayType(T.LongType())),
                ]
            )
        ),
    )

    abc = alignments_df.select(
        F.sum(F.expr("aggregate(alignments.duration_ms, 0L, (x, acc) -> acc + x)"))
        / 1000.0
        / 60.0
        / 60.0
    ).collect()

    print("GALVEZ:max_duration_ms=", max_duration_ms)
    print("GALVEZ:max_cer=", max_cer)
    print("GALVEZ:min_duration_ms=", min_duration_ms)
    print("GALVEZ:total number of hours=", abc)
def test_automapper_nested_array_filter_with_parent_column(
    spark_session: SparkSession,
) -> None:
    schema = StructType(
        [
            StructField("row_id", dataType=IntegerType(), nullable=False),
            StructField(
                "location",
                dataType=ArrayType(
                    StructType(
                        [
                            StructField("name", StringType(), True),
                        ]
                    )
                ),
            ),
            StructField(
                "schedule",
                dataType=ArrayType(
                    StructType(
                        [
                            StructField("name", StringType(), True),
                            StructField(
                                "actor",
                                ArrayType(
                                    StructType(
                                        [StructField("reference", StringType(), True)]
                                    ),
                                    True,
                                ),
                            ),
                        ]
                    )
                ),
            ),
            StructField(
                "single_level",
                dataType=ArrayType(
                    StructType(
                        [
                            StructField("reference", StringType(), True),
                        ]
                    )
                ),
            ),
        ]
    )
    spark_session.createDataFrame(
        [
            (
                1,
                [{"name": "location-100"}, {"name": "location-200"}],
                [
                    {
                        "name": "schedule-1",
                        "actor": [
                            {"reference": "location-100"},
                            {"reference": "practitioner-role-100"},
                        ],
                    },
                    {
                        "name": "schedule-2",
                        "actor": [
                            {"reference": "location-200"},
                            {"reference": "practitioner-role-200"},
                        ],
                    },
                ],
                [{"reference": "location-100"}, {"reference": "location-200"}],
            )
        ],
        schema,
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    mapper = AutoMapper(
        view="schedule", source_view="patients", keys=["row_id"]
    ).columns(
        location=A.column("location").select(
            AutoMapperElasticSearchLocation(
                name=A.field("name"),
                scheduling=A.nested_array_filter(
                    array_field=A.column("schedule"),
                    inner_array_field=A.field("actor"),
                    match_property="reference",
                    match_value=A.field("{parent}.name"),
                ).select_one(AutoMapperElasticSearchSchedule(name=A.field("name"))),
            )
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    print("------COLUMN SPECS------")
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["location"],
        transform(
            col("b.location"),
            lambda l: (
                struct(
                    l["name"].alias("name"),
                    transform(
                        filter(
                            col("b.schedule"),
                            lambda s: exists(
                                s["actor"],
                                lambda a: a["reference"] == l["name"],  # type: ignore
                            ),
                        ),
                        lambda s: struct(s["name"].alias("name")),
                    )[0].alias("scheduling"),
                )
            ),
        ).alias("___location"),
    )
    result_df: DataFrame = mapper.transform(df=source_df)

    # Assert
    # result_df.printSchema()
    # result_df.show(truncate=False)

    location_row = result_df.collect()[0].location
    for index, location in enumerate(location_row):
        location_name = location.name
        location_scheduling = location.scheduling
        assert location_name == f"location-{index + 1}00"
        assert len(location_scheduling) == 1
        assert location_scheduling.name == f"schedule-{index + 1}"