def spk_med(data):
    spk_df = spark.read.json(data)
    # Selects data from JSON to normalize
    filtered = spk_df.select(spk_df.animals.id.alias("id"),
                             spk_df.animals.attributes.house_trained.alias("house_trained"),
                             spk_df.animals.attributes.declawed.alias("declawed"),
                             spk_df.animals.attributes.spayed_neutered.alias("spayed_neutered"),
                             spk_df.animals.attributes.special_needs.alias("special_needs"),
                             spk_df.animals.attributes.shots_current.alias("shots_current"))
    # Result is cleaned dataframe
    cleaned = filtered.withColumn("filtered", F.explode(F.arrays_zip("id", "house_trained", "declawed",
                                                                     "spayed_neutered", "special_needs",
                                                                     "shots_current"))) \
        .select("filtered.id", "filtered.house_trained", "filtered.declawed",
                "filtered.spayed_neutered", "filtered.special_needs", "filtered.shots_current")
    # Casting values to correct datatype
    cleaned_cast = cleaned.select(cleaned.id.cast(IntegerType()),
                                  cleaned.house_trained.cast(BooleanType()),
                                  cleaned.declawed.cast(BooleanType()),
                                  cleaned.spayed_neutered.cast(BooleanType()),
                                  cleaned.special_needs.cast(BooleanType()),
                                  cleaned.shots_current.cast(BooleanType()))
    return cleaned_cast
def spk_temp(data):
    spk_df = spark.read.json(data)
    # Selects data from JSON to normalize
    filtered = spk_df.select(spk_df.animals.id.alias("id"),
                             spk_df.animals.environment.cats.alias("cats"),
                             spk_df.animals.environment.dogs.alias("dogs"),
                             spk_df.animals.environment.children.alias("children"))
    # Result is cleaned dataframe
    cleaned = filtered.withColumn("filtered", F.explode(F.arrays_zip("id", "cats", "dogs", "children"))) \
        .select("filtered.id", "filtered.cats", "filtered.dogs", "filtered.children")
    # Casting values to correct datatype
    cleaned_cast = cleaned.select(cleaned.id.cast(IntegerType()),
                                  cleaned.children.cast(BooleanType()),
                                  cleaned.dogs.cast(BooleanType()),
                                  cleaned.cats.cast(BooleanType()))
    return cleaned_cast
def transfLineitems(ds, config):
    invoiceDS: DataFrame = ds
    invoiceDS = invoiceDS.withColumn("TypeOfService", col("_LineItems._Description")) \
        .withColumn("ServiceAmount", col("_LineItems._TotalPriceNetto"))
    if invoiceDS.schema["TypeOfService"].dataType == StringType():
        invoiceDS = invoiceDS.withColumn("TypeOfService", array(col("_LineItems._Description"))) \
            .withColumn("ServiceAmount", array(col("_LineItems._TotalPriceNetto")))
    invoiceDS = invoiceDS.withColumn("ServiceNRate", arrays_zip(col("TypeOfService"), col("ServiceAmount"))) \
        .withColumn("ServiceNRate", explode_outer(col("ServiceNRate")))

    # udf_service = udf(get_genesis_TOS, StringType())
    # same code as line 56; remove costTypeList
    billPath = config["Master"]["billPath"]
    CostTypeList = createDatasetFromCSVFile("", billPath) \
        .filter(col("VENDOR_NAME").rlike("GENESIS MARINE, LLC")) \
        .rdd.map(lambda x: x.COST_LINE_ITEM_TYPE).collect()

    def udf_service(costType):
        return udf(lambda l: get_genesis_TOS_search(l, costType), StringType())

    invoiceDS = invoiceDS.withColumn(
        "TypeOfService",
        udf_service(CostTypeList)(regexp_replace(col("ServiceNRate.TypeOfService"), "\n", " "))) \
        .withColumn("ServiceAmount", col("ServiceNRate.ServiceAmount"))
    return invoiceDS
def benchmark_extract_top_keywords(posts, n_keywords=10):
    """Given TF-IDF output (as "features" column), extracts the vocabulary indices
    of the n_keywords terms with the highest TF-IDF score for each post."""
    def extract_keys_from_vector(vector):
        return vector.indices.tolist()

    def extract_values_from_vector(vector):
        return vector.values.tolist()

    extract_keys_from_vector_udf = udf(
        lambda vector: extract_keys_from_vector(vector), ArrayType(IntegerType()))
    extract_values_from_vector_udf = udf(
        lambda vector: extract_values_from_vector(vector), ArrayType(DoubleType()))

    posts = posts.withColumn("extracted_keys", extract_keys_from_vector_udf("features"))
    posts = posts.withColumn("extracted_values", extract_values_from_vector_udf("features"))
    posts = posts.withColumn(
        "zipped_truncated",
        slice(sort_array(arrays_zip("extracted_values", "extracted_keys"), asc=False), 1, n_keywords))

    take_second = udf(lambda rows: [row[1] for row in rows], ArrayType(IntegerType()))
    posts = posts.withColumn("top_indices", take_second("zipped_truncated"))
    return posts
def predict(model, data):
    print('making predictions on data...')
    predictions = model.transform(data)
    print('Converting predictions to pandas df...')
    df = predictions.select(F.explode(F.arrays_zip('token.result', 'label.result', 'ner.result')).alias("cols")) \
        .select(F.expr("cols['0']").alias("token"),
                F.expr("cols['1']").alias("ground_truth"),
                F.expr("cols['2']").alias("prediction")).toPandas()
    print('Evaluation report')
    print(classification_report(df.ground_truth, df.prediction))
    print(accuracy_score(df.ground_truth, df.prediction))
def logVendorDetails(config, overAllXMLCnt, vendorList, vendorInitialCnt, vendorProcessedCnt, StartTime, EndTime):
    RunID = singleSession().getSparkSession(config).sparkContext.applicationId
    masterURL = config["SparkSubmit"]["master"]
    xmlPath = config['InvoicePath']['abbyXmlPath']
    selectedColumns = config['Logging']['selectedColumns']
    summaryColumns = config['Logging']['summaryColumns']
    detailsColumns = config['Logging']['detailsColumns']
    loggingPath = config['Logging']['loggingPath']

    overAllXMLProcessedCnt = 0
    for cnt in vendorProcessedCnt:
        overAllXMLProcessedCnt += cnt

    row = [RunID, xmlPath, masterURL, overAllXMLCnt, overAllXMLProcessedCnt,
           StartTime, EndTime, vendorList, vendorInitialCnt, vendorProcessedCnt]
    dataSet = singleSession().getSparkSession(config).createDataFrame([row], selectedColumns)
    summary = dataSet.select(*summaryColumns)

    # details part
    details = dataSet.withColumn(
        "Message",
        arrays_zip(col("vendorList"), col("vendorInitialCnt"), col("vendorProcessedCnt")))
    details = details.select(*detailsColumns).withColumn("Message", explode(col("Message"))) \
        .withColumn("Message", concat(col("Message.vendorList"), lit(" vendor had "),
                                      col("Message.vendorInitialCnt"), lit(" XMLs and generated "),
                                      col("Message.vendorProcessedCnt"), lit(" records")))

    # summary.show(truncate=False)
    # details.show(truncate=False)

    # csv
    writeIntoServingLayer(summary, loggingPath + "summary", mode="Append")
    writeIntoServingLayer(details, loggingPath + "details", mode="Append")

    logSummaryOfRun = config['LoggingTale']['logSummaryOfRun']
    logDetailsOfRun = config['LoggingTale']['logDetailsOfRun']
    # Hive
    # writeIntoHiveServingLayer(summary, logSummaryOfRun)
    # writeIntoHiveServingLayer(details, logDetailsOfRun)
    return None
def df_array_zip(df, b, c, d):
    from pyspark.sql.functions import arrays_zip, col, explode
    # https://stackoverflow.com/questions/41027315/pyspark-split-multiple-array-columns-into-rows
    df = df.withColumn("tmp", arrays_zip(b, c, d))
    df = df.withColumn("tmp", explode("tmp"))
    df = df.select(col("tmp")[b], col("tmp")[c], col("tmp")[d])
    df = df.withColumnRenamed('tmp.{}'.format(d), d)
    df = df.withColumnRenamed('tmp.{}'.format(b), b)
    df = df.withColumnRenamed('tmp.{}'.format(c), c)
    return df
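# Hedged usage sketch (not part of the snippet above): the DataFrame below is made up,
# and it assumes a SparkSession named `spark` plus three equally-sized array columns
# matching the b/c/d arguments of df_array_zip.
example_df = spark.createDataFrame(
    [(1, [1, 2], [3, 4], [5, 6])],
    ["a", "b", "c", "d"])
# Each zipped element of b/c/d becomes its own output row, e.g. (1, 3, 5) and (2, 4, 6).
df_array_zip(example_df, "b", "c", "d").show()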
def get_metrics(data, model):
    print("Getting embeddings...")
    data = bert_annotator.transform(data)
    predictions = model.transform(data)
    df = predictions.select(F.explode(F.arrays_zip('token.result', 'label.result', 'ner.result')).alias("cols")) \
        .select(F.expr("cols['1']").alias("target"),
                F.expr("cols['2']").alias("prediction"))
    correct = df.filter(df.target == df.prediction).count()
    total = df.select("target").count()
    accuracy = 100 * correct / total
    print("Accuracy = {}".format(accuracy))
def explode_list(df):
    """Explode a list of products into separated rows

    :param df: Input DataFrame
    :return: Output DataFrame
    """
    return (df.withColumn('tmp', arrays_zip('productCode', 'productName'))
            .withColumn('tmp', explode_outer('tmp'))
            .select('maid', 'siteseq', 'userid', 'transaction_date', 'transaction_time',
                    'logtype', 'tmp.productCode', 'tmp.productName')
            .withColumnRenamed('tmp.productCode', 'productCode')
            .withColumnRenamed('tmp.productName', 'productName'))
def _transform_rec_info(self, rec_info, item_ids_dataset):
    import pyspark.sql.functions as F
    if self.output_user_embeddings:
        user_embeddings = rec_info.select(
            self.increasing_id_column_name,
            self.recommendation_info_column_name + '.user_embedding')
    rec_info = rec_info.withColumn(
        self.recommendation_info_column_name,
        F.arrays_zip(self.recommendation_info_column_name + '.indices',
                     self.recommendation_info_column_name + '.distances'))
    rec_info = rec_info.select(
        self.increasing_id_column_name,
        (F.posexplode(self.recommendation_info_column_name).alias(
            'pos', self.recommendation_info_column_name)))
    rec_info = rec_info.select(
        self.increasing_id_column_name, 'pos',
        F.col(self.recommendation_info_column_name + '.0').alias('index'),
        F.col(self.recommendation_info_column_name + '.1').alias('distance'))
    rec_info = rec_info.join(item_ids_dataset, F.col('index') == F.col('id'))
    w = pyspark.sql.Window.partitionBy(
        self.increasing_id_column_name).orderBy('pos')
    if self.output_item_embeddings:
        rec_info = rec_info.select(
            self.increasing_id_column_name, 'pos',
            (F.struct('name', 'distance', 'item_embedding').alias(
                self.recommendation_info_column_name)))
    else:
        rec_info = rec_info.select(
            self.increasing_id_column_name, 'pos',
            (F.struct('name', 'distance').alias(
                self.recommendation_info_column_name)))
    rec_info = rec_info.select(
        self.increasing_id_column_name, 'pos',
        (F.collect_list(self.recommendation_info_column_name).over(w).alias(
            self.recommendation_info_column_name)))
    rec_info = rec_info.groupBy(self.increasing_id_column_name).agg(
        F.max(self.recommendation_info_column_name).alias(
            self.recommendation_info_column_name))
    if self.output_user_embeddings:
        rec_info = rec_info.join(user_embeddings, self.increasing_id_column_name)
    return rec_info
def filter_by_pos(self, pos_col, lemma_col):
    self.data = self.data.select(
        F.expr("*"),
        F.explode(
            F.arrays_zip(
                pos_col + '.result',
                pos_col + '.begin',
                lemma_col + '.begin',
            )
        ).alias("cols")
    ) \
        .select(
            F.expr('*'),
            F.expr("cols['0']").alias("pos_result"),
            F.expr("cols['1']").alias("pos_begin"),
            F.expr("cols['2']").alias("token_begin")
        ) \
        .filter((F.col('pos_result') == 'NN') & (F.col('pos_begin') == F.col('token_begin'))) \
        .drop("cols", "pos_result", "pos_begin", "token_begin")
def filter_by_ner(self, ner_col, lemma_col):
    # The alias belongs on the exploded column (as in filter_by_pos above),
    # not on the arrays_zip expression, otherwise "cols" does not exist downstream.
    self.data = self.data.select(
        F.expr("*"),
        F.explode(
            F.arrays_zip(
                ner_col + '.result',
                ner_col + '.begin',
                lemma_col + '.begin',
            )
        ).alias("cols")
    ) \
        .select(
            F.expr('*'),
            F.expr("cols['0']").alias("ner_result"),
            F.expr("cols['1']").alias("ner_begin"),
            F.expr("cols['2']").alias("token_begin")
        ) \
        .filter((F.col('ner_result') == 'O') & (F.col('ner_begin') == F.col('token_begin'))) \
        .drop("cols", "ner_result", "ner_begin", "token_begin")
def spk_descr(data):
    spk_df = spark.read.json(data)
    # Selects data from JSON to normalize
    filtered = spk_df.select(spk_df.animals.id.alias("id"),
                             spk_df.animals.description.alias("description"))
    # Result is cleaned dataframe
    cleaned = filtered.withColumn("filtered", F.explode(F.arrays_zip("id", "description"))) \
        .select("filtered.id", "filtered.description")
    # Casting values to correct datatype
    cleaned_cast = cleaned.select(cleaned.id.cast(IntegerType()),
                                  cleaned.description.cast(StringType()))
    return cleaned_cast
def zip_explode_cols(df: pyspark.sql.dataframe.DataFrame, cols: list, result_name: str,
                     rename_fields: Dict[str, str] = None):
    """
    Explode multiple equally-sized arrays into one struct by zipping all arrays
    into one `ArrayType[StructType]`

    Args:
        df: The input Spark DataFrame
        cols: The array columns that should be zipped
        result_name: The name of the column that will contain the newly created struct
        rename_fields: dictionary mapping column names to new struct field names.
            Used to rename columns in the newly created struct.

    Returns:
        `df.withColumn(result_name, zip(explode(cols)))`
    """
    df = df.withColumn(result_name, f.explode(f.arrays_zip(*cols)))

    if rename_fields:
        # create schema of new struct by simply renaming the top-level struct fields
        old_schema: t.StructType = df.schema[result_name].dataType

        # rename field if field is in `old_schema.fieldNames()`
        new_field_names = [
            rename_fields[field] if field in rename_fields else field
            for field in old_schema.fieldNames()
        ]
        new_schema = t.StructType([
            t.StructField(name, field.dataType)
            for name, field in zip(new_field_names, old_schema.fields)
        ])

        df = df.withColumn(result_name, f.col(result_name).cast(new_schema))

    # # old method using withColumn and a new struct; breaks with PySpark 3.0
    # df = df.withColumn(target_struct, f.struct(*[
    #     f.col(target_struct + "." + actualName).alias(targetName)
    #     for targetName, actualName in zip(target_colnames, df.schema[target_struct].dataType.fieldNames())
    # ]))

    return df
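# Hedged usage sketch (not from the original module): assumes a SparkSession named `spark`,
# two equally-sized array columns, and Spark 3.x field naming for arrays_zip (fields keep the
# source column names, so the rename_fields mapping below matches them).
example_df = spark.createDataFrame(
    [(1, [10, 20], ["a", "b"])],
    ["id", "values", "labels"])
exploded = zip_explode_cols(example_df, cols=["values", "labels"], result_name="pair",
                            rename_fields={"values": "value", "labels": "label"})
# Each output row carries one struct, e.g. pair.value = 10, pair.label = "a".
exploded.select("id", "pair.value", "pair.label").show()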
def read_and_clean_json(spark, input_json_path):
    '''
    Reads json file with products data
    Explodes nested columns
    Selects main columns to dataframe
    Replaces wrongly encoded ampersand ('&amp;') characters
    Removes the '$' character so the price column can be cast to double
    Replaces wrong null values with None
    Casts price column to double
    '''
    df = spark.read.json(input_json_path)
    df = df.withColumn("tmp", arrays_zip("category", "description", "image")) \
        .withColumn("tmp", explode("tmp")) \
        .select("asin", col("tmp.category"), col("tmp.description"), col("tmp.image"),
                "title", "brand", "main_cat", "price") \
        .withColumn('brand', translate('brand', '&amp;', '&')) \
        .withColumn('category', translate('category', '&amp;', '&')) \
        .withColumn('main_cat', translate('main_cat', '&amp;', '&')) \
        .withColumn('price', regexp_replace('price', '\$', '')) \
        .replace(['null', '', 'None'], None) \
        .withColumn('price', col('price').cast("double"))
    return df
def extract_top_keywords(posts, vocabulary, n_keywords=10):
    """Given word count (CountVectorizer) output (as "features" column), extracts
    the n_keywords keywords with the highest TF-IDF for each post."""
    def extract_keys_from_vector(vector):
        return vector.indices.tolist()

    def extract_values_from_vector(vector):
        return vector.values.tolist()

    extract_keys_from_vector_udf = udf(
        lambda vector: extract_keys_from_vector(vector), ArrayType(IntegerType()))
    extract_values_from_vector_udf = udf(
        lambda vector: extract_values_from_vector(vector), ArrayType(DoubleType()))
    idf_udf = array_transform(idf_wiki)

    vocab_dict = {k: v for k, v in enumerate(vocabulary)}

    def ix_to_word(ix):
        return vocab_dict[ix]

    vocab_udf = array_transform(ix_to_word)

    posts = posts.withColumn("word_ix", extract_keys_from_vector_udf("features"))
    posts = posts.withColumn("word_count", extract_values_from_vector_udf("features"))
    posts = posts.withColumn('words', vocab_udf(col('word_ix')))
    posts = posts.withColumn("idf", idf_udf(col("words")))
    posts = posts.withColumn(
        "zipped_truncated",
        slice(sort_array(arrays_zip("idf", "words"), asc=False), 1, n_keywords))

    take_second = udf(lambda rows: [row[1] for row in rows], ArrayType(StringType()))
    posts = posts.withColumn("top_keywords", take_second("zipped_truncated"))
    return posts['CreationDate', 'top_keywords', 'Tags', 'ParentId']
def get_metrics(data):
    print("Getting labels...")
    data = bert_annotator.transform(data)
    print("Loading model...")
    model = NerDLModel.load(model_path)\
        .setInputCols(["sentence", "token", 'embeddings'])\
        .setOutputCol("ner")
    data.show()
    predictions = model.transform(data)
    df = predictions.select(F.explode(F.arrays_zip('token.result', 'label.result', 'ner.result')).alias("cols")) \
        .select(F.expr("cols['1']").alias("target"),
                F.expr("cols['2']").alias("prediction"))
    correct = df.filter(df.target == df.prediction).count()
    total = df.select("target").count()
    accuracy = 100 * correct / total
    print("Accuracy = {}".format(accuracy))
def spk_status(data):
    spk_df = spark.read.json(data)
    # Selects data from JSON to normalize
    filtered = spk_df.select(spk_df.animals.id.alias("id"),
                             spk_df.animals.status.alias("status"),
                             spk_df.animals.status_changed_at.alias("status_changed_at"))
    # Result is cleaned dataframe
    cleaned = filtered.withColumn("filtered", F.explode(F.arrays_zip("id", "status", "status_changed_at"))) \
        .select("filtered.id", "filtered.status", "filtered.status_changed_at")
    # Casting values to correct datatype
    cleaned_cast = cleaned.select(cleaned.id.cast(IntegerType()),
                                  cleaned.status.cast(StringType()),
                                  cleaned.status_changed_at.cast(TimestampType()))
    return cleaned_cast
def transfLineitems(ds):
    invoiceDS: DataFrame = ds
    invoiceDS = invoiceDS.withColumn("TypeOfService", col("_LineItems._Description")) \
        .withColumn("ServiceAmount", col("_LineItems._TotalPriceNetto"))
    if invoiceDS.schema["TypeOfService"].dataType == StringType():
        invoiceDS = invoiceDS.withColumn("TypeOfService", split(col("TypeOfService"), ",")) \
            .withColumn("ServiceAmount", split(col("ServiceAmount"), ","))
    # invoiceDS.printSchema()
    # import sys
    # sys.exit(3)
    invoiceDS = invoiceDS.withColumn("ServiceNPrice", arrays_zip(col("TypeOfService"), col("ServiceAmount"))) \
        .withColumn("ServiceNPrice", explode_outer(col("ServiceNPrice")))
    invoiceDS = invoiceDS.withColumn(
        "TypeOfService",
        when(col("ServiceNPrice.TypeOfService").isNotNull(),
             regexp_replace(col("ServiceNPrice.TypeOfService"), "[:;]", "")).otherwise("")) \
        .withColumn("ServiceAmount", col("ServiceNPrice.ServiceAmount")) \
        .filter(~(col('TypeOfService').like("Total Amount%")))
    return invoiceDS
"ETL Demo").enableHiveSupport().getOrCreate() spark.sql("create database if not exists spotify_demo;") spark.sql("use spotify_demo;") """ Notice that the following ETL flow just serves as an example and is not optimal. Also, better execution time can be achieved by changing the order of different logical steps. """ # Load track data and merge tracks = merge_tracks_sources(tracks_160k_etl(args), tracks_1200k_etl(args)) # preview # print("tracks", tracks.count()) # tracks.show(10, truncate=10) # Build artist table track_artists = tracks.select("id", from_json("artists", ArrayType(StringType(), containsNull=False), options={"allowBackslashEscapingAnyCharacter": True}).alias("artists"), from_json("artist_ids", ArrayType(StringType(), containsNull=False)).alias("artist_ids"))\ .select("id", explode(arrays_zip("artists", "artist_ids")).alias("tuple"))\ .select("id", col("tuple.artists").alias("a_name"), col("tuple.artist_ids").alias("a_spotify_id")) artists = build_dictionary_table(track_artists, "a_spotify_id", "a_name", index_column="a_id") artist_genres = genre_etl(args) artists = artists.join(artist_genres, artists.a_name == artist_genres.artists, 'left').select("a_id", "a_spotify_id", "a_name", col("genre_list").alias("genres")) # preview print("artists", artists.count()) artists.show() artists.write.saveAsTable("artist", format="orc", mode="error") # write_mode="ignore"
def main(argv):
    mem_bytes = os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES")  # e.g. 4015976448
    mem_gib = int((mem_bytes / (1024.0**3)) * 0.9)
    tar_jar = os.path.join(find_runfiles(), "__main__/galvasr2/spark/tar_spark_datasource.jar")
    spark = (
        pyspark.sql.SparkSession.builder
        .master(f"local[{os.cpu_count() - 1}]")
        .config("spark.eventLog.enabled", "true")
        .config("spark.eventLog.dir", "/spark-events")
        .config("spark.sql.execution.arrow.pyspark.enabled", "true")
        .config("spark.driver.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true")
        .config("spark.executor.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true")
        .config("spark.driver.memory", f"{mem_gib}g")
        .config("spark.history.fs.logDirectory", "/spark-events")
        .config("spark.sql.execution.arrow.maxRecordsPerBatch", "1")
        .config("spark.jars", tar_jar)
        .config("spark.local.dir", "/mnt/disks/spark-scratch/")
        .getOrCreate())
    spark.sparkContext.setLogLevel("INFO")  # "ALL" for very verbose logging
    logging.getLogger("py4j").setLevel(logging.ERROR)

    catalogue_df = load_audio_id_text_id_mapping(spark, FLAGS.input_catalogue)
    _, licenseurl_df = load_audio_and_text_dfs(spark, FLAGS.input_catalogue)
    licenseurl_df = licenseurl_df.select(
        [F.col("identifier"), F.col("text_document_id"), F.col("licenseurl")])

    # Kaldi's wav.scp format does not support space characters in the key field of a wav.scp file.
    # We write the transcript to a file called "{kaldi_normalized_uttid}.ctm", so we also need to
    # change all instances of "/" to "_".
    catalogue_df = catalogue_df.withColumn(
        "kaldi_normalized_uttid",
        F.concat_ws(
            "-",
            F.translate(catalogue_df.identifier, " /", "__"),
            F.translate(catalogue_df.audio_document_id, " /", "__"),
        ),
    )
    # key_int_mapping = os.path.join(FLAGS.work_dir, "key_int_mapping_csv")
    if not FLAGS.work_dir.startswith("gs://"):
        os.makedirs(FLAGS.work_dir, exist_ok=True)
    wav_scp = os.path.join(FLAGS.work_dir, "wav.scp")
    ctm_out_dir = os.path.join(FLAGS.work_dir, "decoder_ctm_dir")
    if FLAGS.stage <= 0:
        catalogue_df = catalogue_df.cache()
        # catalogue_df.write.mode("overwrite").format("csv").options(header="true").save(key_int_mapping)
        training_sample_rows = catalogue_df.collect()
        catalogue_df.unpersist()

        with TemporaryMountDirectory(
                mount_cmd=[
                    "gcsfuse",
                    "--implicit-dirs",
                    FLAGS.input_gcs_bucket.lstrip("gs://"),
                ],
                unmount_cmd=["fusermount", "-u"],
        ) as temp_dir_name:
            posix_wav_scp = re.sub(r"^{0}".format(FLAGS.input_gcs_bucket),
                                   temp_dir_name, wav_scp)
            create_wav_scp(posix_wav_scp, training_sample_rows, FLAGS.input_dir, ctm_out_dir)

    # /development/lingvo-source/output_ctm_dir/

    # nvprof --analysis-metrics -o decoder-analysis.nvprof \
    # We want only the best path, so we set lattice-beam to 0.1
    # --main-q-capacity=35000 \
    # Can get 266x RTF with this configuration. Keep it?
    # batch size of 100 and num channels of 100 works just fine
    if FLAGS.stage <= 1:
        if not FLAGS.work_dir.startswith("gs://"):
            os.makedirs(ctm_out_dir, exist_ok=True)
        with TemporaryMountDirectory(
                mount_cmd=[
                    "gcsfuse",
                    "--implicit-dirs",
                    FLAGS.input_gcs_bucket.lstrip("gs://"),
                ],
                unmount_cmd=["fusermount", "-u"],
        ) as temp_dir_name:
            posix_ctm_out_dir = re.sub(r"^{0}".format(FLAGS.input_gcs_bucket),
                                       temp_dir_name, ctm_out_dir)
            posix_wav_scp = re.sub(r"^{0}".format(FLAGS.input_gcs_bucket),
                                   temp_dir_name, wav_scp)
            posix_work_dir = re.sub(r"^{0}".format(FLAGS.input_gcs_bucket),
                                    temp_dir_name, FLAGS.work_dir)
            num_gpus = 4
            posix_wav_scp_shards = split_wav_scp(posix_wav_scp, posix_work_dir, num_gpus)

            executor = ThreadPoolExecutor(max_workers=num_gpus)

            def run_gpu(posix_wav_scp_shard, gpu_number):
                cmd = f"""\
                /opt/kaldi/src/cudadecoderbin/batched-wav-nnet3-cuda3 \
                --frame-subsampling-factor=3 \
                --config=/opt/kaldi/egs/aspire/s5/exp/tdnn_7b_chain_online/conf/online.conf \
                --max-active=7000 \
                --beam=15.0 \
                --lattice-beam=0.1 \
                --acoustic-scale=1.0 \
                --cuda-decoder-copy-threads=2 \
                --cuda-worker-threads={os.cpu_count() // num_gpus} \
                --segmentation=true \
                --cuda-use-tensor-cores=true \
                --max-batch-size=150 \
                --num-channels=250 \
                --lattice-postprocessor-rxfilename=/development/lingvo-source/lattice_postprocess.conf \
                --word-symbol-table=/opt/kaldi/egs/aspire/s5/exp/tdnn_7b_chain_online/graph_pp/words.txt \
                /opt/kaldi/egs/aspire/s5/exp/chain/tdnn_7b/final.mdl \
                /opt/kaldi/egs/aspire/s5/exp/tdnn_7b_chain_online/graph_pp/HCLG.fst \
                scp,p:{posix_wav_scp_shard} \
                {posix_ctm_out_dir}
                """
                env = deepcopy(os.environ)
                env["CUDA_VISIBLE_DEVICES"] = f"{gpu_number}"
                subprocess.check_call(shlex.split(cmd), env=env)

            for i, shard in enumerate(posix_wav_scp_shards):
                executor.submit(run_gpu, shard, i)
            executor.shutdown(wait=True)

    alignments_dir = os.path.join(FLAGS.alignments_work_dir, "alignments_json_jul_28")
    if FLAGS.stage <= 2:
        # TODO: Add options to DSAlign here
        dsalign_args = dsalign_main.parse_args(
            ["--output-wer", "--output-cer"])  # , "--output-sws", "--output-levenshtein"]

        alphabet_normalized_path = (
            "/development/lingvo-source/galvasr2/align/spark/alphabet2.txt")
        align_udf = prepare_align_udf(dsalign_args, alphabet_normalized_path,
                                      15_000, 3_000)

        ctm_df = (spark.read.format("binaryFile")
                  .option("pathGlobFilter", "*.ctm")
                  .load(ctm_out_dir))
        ctm_df = ctm_df.withColumn(
            "kaldi_normalized_uttid",
            F.regexp_replace(F.reverse(F.split(ctm_df.path, "/"))[0], r"[.]ctm$", ""),
        )
        ctm_df = ctm_df.withColumn("ctm_content", fix_text_udf(F.col("content"))) \
            .drop("path", "length", "modificationTime", "content")

        ctm_df = ctm_df.join(catalogue_df, "kaldi_normalized_uttid")
        downsampled_catalogue_df = ctm_df.drop("ctm_content")

        training_sample_rows = downsampled_catalogue_df.collect()
        transcripts_df = load_transcripts(spark, FLAGS.input_gcs_path, training_sample_rows)
        transcripts_df = transcripts_df.withColumn(
            "transcript", normalize_english_text_udf(transcripts_df.transcript))
        ctm_df = ctm_df.join(transcripts_df, ["identifier", "text_document_id"])
        ctm_df = ctm_df.repartition(960)

        # alignments_df = ctm_df.select(align_udf(F.concat(ctm_df.identifier, F.lit("/"), ctm_df.text_document_id),
        #                                         F.concat(ctm_df.identifier, F.lit("/"), ctm_df.audio_document_id),
        #                                         ctm_df.transcript, ctm_df.ctm_content))
        alignments_df = ctm_df.withColumn(
            "alignments",
            align_udf(
                F.concat(ctm_df.identifier, F.lit("/"), ctm_df.text_document_id),
                F.concat(ctm_df.identifier, F.lit("/"), ctm_df.audio_document_id),
                ctm_df.transcript,
                ctm_df.ctm_content,
            ),
).drop("ctm_content") print("GALVEZ:schema") alignments_df.printSchema() sys.stdout.flush() alignments_df.write.mode("overwrite").format("json").save( alignments_dir) manifest_dir = os.path.join(FLAGS.work_dir, "dataset_manifest") tars_dir = os.path.join(FLAGS.work_dir, "dataset_tars") if FLAGS.stage <= 3: duplicate_data_path = "gs://the-peoples-speech-west-europe/forced-aligner/data_deduplication/data_deduplication_v2_lines.json" duplicates_df = spark.read.format("json").load(duplicate_data_path) alignments_df = spark.read.json(alignments_dir) alignments_df = alignments_df.join( duplicates_df, on=(alignments_df.identifier == duplicates_df.identifier) & (alignments_df.text_document_id == duplicates_df.text_document_id), how="anti", ) if FLAGS.license_filter == "": pass else: if FLAGS.license_filter == "Not CC-BY-SA": filtered_licenseurl_df = licenseurl_df.filter( ~is_cc_by_sa(F.col("licenseurl"))) elif FLAGS.license_filter == "CC-BY-SA": filtered_licenseurl_df = licenseurl_df.filter( is_cc_by_sa(F.col("licenseurl"))) else: raise Exception("Unknown license_filter provided.") filtered_licenseurl_df = filtered_licenseurl_df.drop("licenseurl") alignments_df = alignments_df.join( filtered_licenseurl_df, on=(alignments_df.identifier == filtered_licenseurl_df.identifier) & (alignments_df.text_document_id == filtered_licenseurl_df.text_document_id), how="inner", ) alignments_df = alignments_df.drop( filtered_licenseurl_df.identifier).drop( filtered_licenseurl_df.text_document_id) # We would like the number of partitions to be some large multiple # of the number of executors. Not every audio file is the same # length, so this helps with load balancing. alignments_df = alignments_df.withColumn( "duration_ms", F.expr( "transform(arrays_zip(alignments.end_ms, alignments.start_ms), x -> x.end_ms - x.start_ms)" ), ) alignments_df = alignments_df.withColumn( "alignments", F.arrays_zip( alignments_df.alignments.cer, alignments_df.alignments.end_ms, alignments_df.alignments.label, alignments_df.alignments.start_ms, alignments_df.alignments.wer, alignments_df.duration_ms, ).cast( T.ArrayType( T.StructType([ T.StructField("cer", T.FloatType()), T.StructField("end_ms", T.LongType()), T.StructField("label", T.StringType()), T.StructField("start_ms", T.LongType()), T.StructField("wer", T.FloatType()), T.StructField("duration_ms", T.LongType()), ]))), ) alignments_df = alignments_df.drop("duration_ms") alignments_df = alignments_df.withColumn( "alignments", F.filter( alignments_df.alignments, # Need to select this filter such that total number of # hours is 31,400 lambda alignment: (alignment.duration_ms < FLAGS.max_duration_ms) & (alignment.duration_ms >= FLAGS.min_duration_ms) & (alignment.cer < FLAGS.max_cer) & (alignment.cer >= FLAGS.min_cer), ), ) alignments_df = alignments_df.withColumn( "alignments", F.struct( alignments_df.alignments.cer, alignments_df.alignments.end_ms, alignments_df.alignments.label, alignments_df.alignments.start_ms, alignments_df.alignments.wer, alignments_df.alignments.duration_ms, ).cast( T.StructType([ T.StructField("cer", T.ArrayType(T.FloatType())), T.StructField("end_ms", T.ArrayType(T.LongType())), T.StructField("label", T.ArrayType(T.StringType())), T.StructField("start_ms", T.ArrayType(T.LongType())), T.StructField("wer", T.ArrayType(T.FloatType())), T.StructField("duration_ms", T.ArrayType(T.LongType())), ])), ) alignments_df = alignments_df.repartition(960) abc = alignments_df.select( F.sum( F.expr( "aggregate(alignments.duration_ms, 0L, (x, acc) -> acc + x)" )) / 1000.0 / 
            60.0 / 60.0).collect()
        print("GALVEZ:total number of hours=", abc)
        sys.stdout.flush()

        alignments_df = alignments_df.select(
            alignments_df.identifier,
            alignments_df.audio_document_id,
            alignments_df.text_document_id,
            alignments_df.alignments,
        )
        alignments_df = F.broadcast(alignments_df)

        audio_paths = F.concat(
            F.lit(FLAGS.input_gcs_path),
            F.lit("/"),
            F.col("identifier"),
            F.lit("/"),
            F.col("audio_document_id"),
        )
        rows = alignments_df.select(audio_paths).collect()
        paths = [row[0] for row in rows]  # [:1]  # GALVEZ: WARNING test!
        # print(f"number of paths = {len(paths)}")
        audio_df = (spark.read.format("binaryFile").load(paths)
                    .drop("modificationTime", "length"))

        alignments_audio_df = alignments_df.join(audio_df, audio_paths == audio_df.path)
        # from IPython import embed; embed()

        # Remove "/" so that, if someone untars the tar files, everything will be dumped into one directory.
        # Remove "." because it has special meaning in webdataset format.
        # Remove " " because kaldi keys may not contain " " (this is not strictly necessary, but convenient).
        name = F.concat(F.col("identifier"), F.lit("/"), F.col("audio_document_id"))
        # name = F.regexp_replace(name, r"/", "_SLASH_")
        name = F.regexp_replace(name, r"\.", "_DOT_")
        name = F.regexp_replace(name, r" ", "_SPACE_")
        # glob.glob("**/*.flac")

        pdf = df.select(name).collect()
        for name in pdf.name:
            assert len(name) < 4096
            for chunk in name.split("/"):
                assert len(chunk) < 256

        # name = F.regexp_replace(F.concat(F.col("identifier"),
        #                                  F.lit("-"),
        #                                  F.col("audio_document_id")),
        #                         r"(\.|/)",
        #                         "_"
        #                         )

        # The name of each thing in the tar file. May not exceed 100 characters in length
        # substr indexes from 1!
        # name = name.substr(
        #     F.length(name) - F.least(F.length(name), F.lit(88)) + 1,
        #     F.least(F.length(name), F.lit(88))
        # )

        alignments_audio_df = alignments_audio_df.withColumn(
            "aligned_chunks",
            create_audio_segments_udf(
                alignments_audio_df.content,
                F.lit("mp3"),
                name,
                alignments_audio_df.alignments.start_ms,
                alignments_audio_df.alignments.end_ms,
                F.lit("flac"),
            ),
        )
        a = alignments_audio_df.select(
            F.explode(F.arrays_zip("aligned_chunks.audio_name", "aligned_chunks.audio"))
        ).select("col.0", "col.1")
        a.write.mode("overwrite").format("tar").save(tars_dir)

        output_df = alignments_audio_df.select(
            alignments_audio_df.identifier,
            alignments_audio_df.audio_document_id,
            alignments_audio_df.text_document_id,
            F.struct(
                alignments_audio_df.alignments.label.alias("label"),
                create_audio_segment_names_udf(
                    # Is F.size right here?
                    name,
                    F.size(alignments_audio_df.alignments.start_ms),
                    F.lit("flac"),
                ).alias("name"),
                alignments_audio_df.alignments.duration_ms.alias("duration_ms"),
            ).alias("training_data"),
        )
        output_df = output_df.coalesce(960)

        # coalesce(1) seems to make the create_audio_segments_udf function run serially
        output_df.write.mode("overwrite").json(manifest_dir)

    repartitioned_tars_dir = os.path.join(FLAGS.work_dir, "repartitioned_dataset_tars")
    tmp_tars_dir = os.path.join(FLAGS.work_dir, "repartitioned_dataset_tmp_dir")
    if FLAGS.stage <= 4:
        tars_df = spark.read.format("tar").load(tars_dir)  # .limit(100)
        number_of_rows = tars_df.count()

        spark2 = spark.newSession()
        spark2.conf.set(
            "spark.sql.execution.rangeExchange.sampleSizePerPartition", number_of_rows)
        spark2.conf.set("spark.sql.files.minPartitionNum", FLAGS.number_of_shards)
        # tars_df = spark2.read.format("tar").load(tars_dir)  # .limit(100)
        # print("GALVEZ:", tars_df.select(F.col("key")).collect())
        # import sys; sys.exit()
        tars_df = spark2.read.format("tar").load(tars_dir)  # .limit(100)
        tars_df = tars_df.repartitionByRange(FLAGS.number_of_shards, F.col("key"))
        # May need to write this out to GCS, and then delete it, to prevent different behavior between runs.
        # tars_df = tars_df.persist()
        tars_df.write.mode("overwrite").format("tar").save(tmp_tars_dir)
        tars_df = spark2.read.format("tar").load(tmp_tars_dir)  # .repartitionByRange()  # coalesce(1024)

        # counts_df = (
        #     tars_df.withColumn("partitionId", F.spark_partition_id())
        #     .groupBy("partitionId")
        #     .count()
        # )
        # num_rows_to_keep = counts_df.select(F.min(F.col("count"))).collect()[0][0]

        # # Consider doing this in java
        # def drop_final_rows(rows):
        #     for _ in range(num_rows_to_keep):
        #         yield next(rows)
        #     for _ in rows:
        #         pass
        #     return

        # print("GALVEZ:before=", tars_df.rdd.getNumPartitions())
        # # , preservesPartitioning=True
        # tars_df = spark2.createDataFrame(
        #     tars_df.rdd.mapPartitions(drop_final_rows), schema=tars_df.schema
        # )
        # print("GALVEZ:after=", tars_df.rdd.getNumPartitions())
        # import sys
        # sys.stdout.flush()

        # Don't actually write this out right now. It doesn't benefit us unless we are
        # doing nemo training in a specific mode.
        # tars_df.write.mode("overwrite").format("tar").save(repartitioned_tars_dir)

        # manifest_df = spark2.read.json(manifest_dir)
        # number_of_utterances = manifest_df.select(F.explode(F.col("training_data.name"))).count()
        # print(f"GALVEZ:number_of_utterances={number_of_utterances}")
        # utterances_per_shard = number_of_utterances // FLAGS.number_of_shards
        # repartition_tar_files(os.path.join(tars_dir, "*.tar"), repartitioned_tars_dir, utterances_per_shard)

    nemo_manifest_dir = os.path.join(FLAGS.work_dir, "dataset_manifest_nemo")
    nemo_single_manifest_dir = os.path.join(FLAGS.work_dir, "dataset_manifest_nemo_single")
    if FLAGS.stage <= 5:
        json_df = spark.read.format("json").load(manifest_dir)
        nemo_df = json_df.select(
            F.explode(
                F.arrays_zip(
                    F.col("training_data.name").alias("audio_filepath"),
                    F.col("training_data.label").alias("text"),
                    F.col("training_data.duration_ms").alias("duration_ms"),
                )))
        nemo_df = nemo_df.select(
            F.col("col.name").alias("audio_filepath"),
            F.col("col.label").alias("text"),
            (F.col("col.duration_ms").cast(T.DoubleType()) / 1000.0).alias("duration"),
            F.lit(-1).alias("shard_id"),
        )
        if False:
            tars_df = spark.read.format("tar").load(repartitioned_tars_dir)
            tars_df = tars_df.select(tars_df.key)
            nemo_df = F.broadcast(nemo_df)
            nemo_df = nemo_df.join(
                tars_df, F.col("audio_filepath") == F.col("key")).drop(F.col("key"))
        # TODO: Join against tar files that have been made to contain the
        # same number of files to filter out removed files
        nemo_df.write.mode("overwrite").format("json").save(nemo_manifest_dir)

        nemo_single_df = spark.read.format("json").load(nemo_manifest_dir)
        nemo_single_df.coalesce(1).write.mode("overwrite").format("json").save(
            nemo_single_manifest_dir)

    single_manifest_dir = os.path.join(FLAGS.work_dir, "dataset_manifest_single")
    single_tar_dir = os.path.join(FLAGS.work_dir, "dataset_tars_single")
    # Create single tar file and single json file
    if FLAGS.stage <= 6:
        json_df = spark.read.format("json").load(manifest_dir)
        json_df.coalesce(1).write.format("json").mode("overwrite").save(single_manifest_dir)
        tars_df = spark.read.format("tar").load(tmp_tars_dir)
        tars_df.coalesce(1).write.format("tar").mode("overwrite").save(single_tar_dir)
df.show(truncate=False)

df_explode = df.select(df.name, explode(df.subjects).alias("Exploded_Subjects"))
df_explode.printSchema()
df_explode.show(truncate=False)

df_flatten = df.select(df.name, flatten(df.subjects).alias("Flattened_Subjects"))
df_flatten.printSchema()
df_flatten.show(truncate=False)

df_flatten_zip = df_flatten \
    .withColumn("tmp", arrays_zip("Flattened_Subjects")) \
    .withColumn("tmp", explode("tmp")) \
    .select("name", col("tmp.Flattened_Subjects"))

df_flatten_zip.printSchema()
df_flatten_zip.show(truncate=False)

'''Above is not performant, hence the solution below if the array size is known'''

# Length of array
n = 5

# For legacy Python you'll need a separate function
# in place of method accessor
reduce(DataFrame.unionAll,
       (df_flatten.select("name", col("Flattened_Subjects").getItem(i)) for i in range(n))
       ).toDF("name", "Subjects").show()
def getWeightedAgg(all_data, key=['hotelid'], prefix='hotel'):
    """Calculate weighted ctr, br and dtob for a dataframe."""
    global multiplication_array, sum_weights, n_dates

    col_list = ['searchdate'] + key
    df_hotel = all_data.groupby(col_list).agg(
        F.sum('bookingFlag').alias('booking_count'),
        F.sum('detailsFlag').alias('details_count'),
        F.count('fprice').alias('impression_count'))

    # calculate the metrics to be weighted
    col_list_1 = col_list + [
        (F.col('details_count') / F.col('impression_count')).alias('raw_ctr'),
        (F.col('booking_count') / F.col('impression_count')).alias('raw_br'),
        (F.col('booking_count') / F.col('details_count')).alias('raw_dtob')
    ]
    df_hotel = df_hotel.select(*col_list_1)

    # fill missing dates for all keys
    dates_df = df_hotel.select('searchdate').drop_duplicates()
    hotel_dates_df = df_hotel.select(*key).drop_duplicates().crossJoin(dates_df)
    n_dates = dates_df.count()
    df_hotel = hotel_dates_df.join(df_hotel, on=col_list, how='left').fillna(0)

    # collect metrics as a list to pass through udf
    df_hotel = df_hotel.groupby(key).agg(
        F.collect_list('raw_ctr').alias('raw_ctr'),
        F.collect_list('raw_br').alias('raw_br'),
        F.collect_list('raw_dtob').alias('raw_dtob'),
        F.collect_list('searchdate').alias('searchdate'))

    # compute the global weighting matrix and sum across rows
    multiplication_array = np.arange(1, n_dates + 1)
    multiplication_array = np.array(
        [multiplication_array - i for i in range(0, n_dates)])
    multiplication_array = 8 - multiplication_array
    multiplication_array = np.where(multiplication_array <= 7, multiplication_array, 0)
    multiplication_array = np.where(multiplication_array >= 1, multiplication_array, 0)
    sum_weights = multiplication_array.sum(axis=0)

    # calculate the rolling weighted average of metrics
    select_col = [F.col(i) for i in key] + [
        weightedAverageUdf(F.col('searchdate'), F.col('raw_ctr')).alias('ctr_array'),
        weightedAverageUdf(F.col('searchdate'), F.col('raw_br')).alias('br_array'),
        weightedAverageUdf(F.col('searchdate'), F.col('raw_dtob')).alias('dtob_array')
    ]
    weighted_average_df = df_hotel.select(*select_col)

    select_col = [F.col(i) for i in key] + [
        F.arrays_zip(
            F.col('ctr_array').getItem(0).alias('searchdate'),
            F.col('ctr_array').getItem(1).alias('ctr'),
            F.col('br_array').getItem(1).alias('br'),
            F.col('dtob_array').getItem(1).alias('dtob')).alias('zip_averages')
    ]
    weighted_average_df = weighted_average_df.select(*select_col)

    select_col = [F.col(i) for i in key] + [F.explode('zip_averages').alias('averages')]
    weighted_average_df = weighted_average_df.select(*select_col)

    select_col = [F.col(i) for i in key] + [
        F.col('averages').getItem('0').alias('searchdate'),
        F.col('averages').getItem('1').alias(prefix + '_ctr'),
        F.col('averages').getItem('2').alias(prefix + '_br'),
        F.col('averages').getItem('3').alias(prefix + '_dtob')
    ]
    weighted_average_df = weighted_average_df.select(*select_col)

    return weighted_average_df
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="index_archival_{}_{}".format(args.index_table, args.night),
        shuffle_partitions=2
    )

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Connect to the aggregated science database
    path = '{}/science/year={}/month={}/day={}'.format(
        args.agg_data_prefix,
        args.night[:4],
        args.night[4:6],
        args.night[6:8]
    )
    df = load_parquet_files(path)

    # construct the index view
    index_row_key_name = args.index_table
    columns = index_row_key_name.split('_')
    names = [col(i) for i in columns]
    index_name = '.' + columns[0]

    # Drop partitioning columns
    df = df.drop('year').drop('month').drop('day')

    # Load column names to use in the science portal
    cols_i, cols_d, cols_b = load_science_portal_column_names()

    # Assign each column to a specific column family
    cf = assign_column_family_names(df, cols_i, cols_d, cols_b)

    # Restrict the input DataFrame to the subset of wanted columns.
    if 'upper' in args.index_table:
        df = df.select(
            'objectId',
            'prv_candidates.jd',
            'prv_candidates.fid',
            'prv_candidates.magpsf',
            'prv_candidates.sigmapsf',
            'prv_candidates.diffmaglim'
        )
    else:
        df = df.select(cols_i + cols_d + cols_b)

    # Create and attach the rowkey
    df, _ = attach_rowkey(df)

    common_cols = [
        'objectId', 'candid', 'publisher', 'rcid', 'chipsf', 'distnr',
        'ra', 'dec', 'jd', 'fid', 'nid', 'field', 'xpos', 'ypos', 'rb',
        'ssdistnr', 'ssmagnr', 'ssnamenr', 'jdstarthist', 'jdendhist', 'tooflag',
        'sgscore1', 'distpsnr1', 'neargaia', 'maggaia', 'nmtchps', 'diffmaglim',
        'magpsf', 'sigmapsf', 'magnr', 'sigmagnr', 'magzpsci', 'isdiffpos',
        'cdsxmatch', 'roid', 'mulens',
        'snn_snia_vs_nonia', 'snn_sn_vs_all', 'rf_snia_vs_nonia',
        'classtar', 'drb', 'ndethist', 'rf_kn_vs_nonkn', 'tracklet'
    ]

    if columns[0].startswith('pixel'):
        nside = int(columns[0].split('pixel')[1])

        df_index = df.withColumn(
            columns[0],
            ang2pix(
                df['ra'],
                df['dec'],
                lit(nside)
            )
        ).select(
            [
                concat_ws('_', *names).alias(index_row_key_name)
            ] + ['objectId']
        )
    elif columns[0] == 'class':
        df_index = df.withColumn(
            'class',
            extract_fink_classification(
                df['cdsxmatch'],
                df['roid'],
                df['mulens'],
                df['snn_snia_vs_nonia'],
                df['snn_sn_vs_all'],
                df['rf_snia_vs_nonia'],
                df['ndethist'],
                df['drb'],
                df['classtar'],
                df['jd'],
                df['jdstarthist'],
                df['rf_kn_vs_nonkn'],
                df['tracklet']
            )
        ).select(
            [
                concat_ws('_', *names).alias(index_row_key_name)
            ] + common_cols
        )
    elif columns[0] == 'ssnamenr':
        # Flag only objects with likely counterpart in MPC
        df_index = df\
            .filter(df['roid'] == 3)\
            .select(
                [
                    concat_ws('_', *names).alias(index_row_key_name)
                ] + common_cols
            )
    elif columns[0] == 'tracklet':
        # For data < 2021-08-10, no tracklet means ''
        # For data >= 2021-08-10, no tracklet means 'null'
        df_index = df\
            .filter(df['tracklet'] != 'null')\
            .filter(df['tracklet'] != '')\
            .select(
                [
                    concat_ws('_', *names).alias(index_row_key_name)
                ] + common_cols
            )
    elif columns[0] == 'upper':
        # This case is the same as the main table
        # but we keep only upper limit measurements.
        index_row_key_name = 'objectId_jd'
        # explode
        df_ex = df.withColumn(
            "tmp",
            arrays_zip("magpsf", "sigmapsf", "diffmaglim", "jd", "fid")
        ).withColumn("tmp", explode("tmp")).select(
            concat_ws('_', 'objectId', 'tmp.jd').alias(index_row_key_name),
            "objectId",
            col("tmp.jd"),
            col("tmp.fid"),
            col("tmp.magpsf"),
            col("tmp.sigmapsf"),
            col("tmp.diffmaglim")
        )

        # take only upper limits
        df_index = df_ex.filter(~df_ex['magpsf'].isNotNull())
        # drop NaN columns
        df_index = df_index.drop(*['magpsf', 'sigmapsf'])
    elif columns[0] == 'uppervalid':
        # This case is the same as the main table
        # but we keep only upper limit measurements.
        index_row_key_name = 'objectId_jd'
        # explode
        df_ex = df.withColumn(
            "tmp",
            arrays_zip("magpsf", "sigmapsf", "diffmaglim", "jd", "fid")
        ).withColumn("tmp", explode("tmp")).select(
            concat_ws('_', 'objectId', 'tmp.jd').alias(index_row_key_name),
            "objectId",
            col("tmp.jd"),
            col("tmp.fid"),
            col("tmp.magpsf"),
            col("tmp.sigmapsf"),
            col("tmp.diffmaglim")
        )

        # take only valid measurements from the history
        df_index = df_ex.filter(df_ex['magpsf'].isNotNull())
    elif columns[0] == 'tns':
        with open('{}/tns_marker.txt'.format(args.tns_folder)) as f:
            tns_marker = f.read().replace('\n', '')

        pdf_tns = download_catalog(os.environ['TNS_API_KEY'], tns_marker)

        # Filter TNS confirmed data
        f1 = ~pdf_tns['type'].isna()
        pdf_tns_filt = pdf_tns[f1]

        pdf_tns_filt_b = spark.sparkContext.broadcast(pdf_tns_filt)

        @pandas_udf(StringType(), PandasUDFType.SCALAR)
        def crossmatch_with_tns(objectid, ra, dec):
            # TNS
            pdf = pdf_tns_filt_b.value
            ra2, dec2, type2 = pdf['ra'], pdf['declination'], pdf['type']

            # create catalogs
            catalog_ztf = SkyCoord(
                ra=np.array(ra, dtype=np.float) * u.degree,
                dec=np.array(dec, dtype=np.float) * u.degree
            )
            catalog_tns = SkyCoord(
                ra=np.array(ra2, dtype=np.float) * u.degree,
                dec=np.array(dec2, dtype=np.float) * u.degree
            )

            # cross-match
            idx, d2d, d3d = catalog_tns.match_to_catalog_sky(catalog_ztf)

            sub_pdf = pd.DataFrame({
                'objectId': objectid.values[idx],
                'ra': ra.values[idx],
                'dec': dec.values[idx],
            })

            # cross-match
            idx2, d2d2, d3d2 = catalog_ztf.match_to_catalog_sky(catalog_tns)

            # set separation length
            sep_constraint2 = d2d2.degree < 1.5 / 3600

            sub_pdf['TNS'] = [''] * len(sub_pdf)
            sub_pdf['TNS'][idx2[sep_constraint2]] = type2.values[idx2[sep_constraint2]]

            to_return = objectid.apply(
                lambda x: '' if x not in sub_pdf['objectId'].values
                else sub_pdf['TNS'][sub_pdf['objectId'] == x].values[0]
            )

            return to_return

        df = df.withColumn(
            'tns',
            crossmatch_with_tns(
                df['objectId'],
                df['ra'],
                df['dec']
            )
        ).select(
            [
                concat_ws('_', *names).alias(index_row_key_name)
            ] + common_cols + ['tns']
        ).cache()
        df_index = df.filter(df['tns'] != '').drop('tns')
        # trigger the cache - note: the cache might be a killer for LSST...
        n = df_index.count()
        print('TNS objects: {}'.format(n))
    else:
        df_index = df.select(
            [
                concat_ws('_', *names).alias(index_row_key_name)
            ] + common_cols
        )

    # construct the time catalog
    hbcatalog_index = construct_hbase_catalog_from_flatten_schema(
        df_index.schema,
        args.science_db_name + index_name,
        rowkeyname=index_row_key_name,
        cf=cf
    )

    # Push index table
    df_index.write\
        .options(catalog=hbcatalog_index, newtable=50)\
        .format("org.apache.spark.sql.execution.datasources.hbase")\
        .save()

    # Construct the schema row - inplace replacement
    schema_row_key_name = 'schema_version'
    df_index = df_index.withColumnRenamed(
        index_row_key_name,
        schema_row_key_name
    )

    df_index_schema = construct_schema_row(
        df_index,
        rowkeyname=schema_row_key_name,
        version='schema_{}_{}'.format(fbvsn, fsvsn))

    # construct the hbase catalog for the schema
    hbcatalog_index_schema = construct_hbase_catalog_from_flatten_schema(
        df_index_schema.schema,
        args.science_db_name + index_name,
        rowkeyname=schema_row_key_name,
        cf=cf)

    # Push the data using the shc connector
    df_index_schema.write\
        .options(catalog=hbcatalog_index_schema, newtable=50)\
        .format("org.apache.spark.sql.execution.datasources.hbase")\
        .save()
def main(sc):
    """
    Main processing function
    Read in data from PostgreSQL transaction table
    Perform reverse lookup for vin transactions and return input Bitcoin values and addresses
    Perform disjoint set (i.e., union find) algorithm using GraphFrames
    Write out address clustering results to PostgreSQL
    """
    # ---READ IN TRANSACTION DATA AND PERFORM REVERSE TX LOOKUP USING JOINS---

    # create initial SQL query
    # tx_query = "SELECT txid, height, time, ntx, vin_coinbase, vin_txid, vin_vout, vout_value, vout_n, vout_addresses FROM {} WHERE height <= 400000 LIMIT 5000000"\
    tx_query = "SELECT txid, height, time, ntx, vin_coinbase, vin_txid, vin_vout, vout_value, vout_n, vout_addresses FROM {}"\
        .format(config.SPARK_CONFIG['PG_TX_TABLE'])

    # read in data from PostgreSQL
    tx_df = spark.read \
        .format(config.SPARK_CONFIG['PG_FORMAT']) \
        .option("url", config.SPARK_CONFIG['PG_URL'] + config.SPARK_CONFIG['PG_PORT'] + "/" + config.SPARK_CONFIG['PG_DB']) \
        .option("user", config.SPARK_CONFIG['PG_USER']) \
        .option("password", config.SPARK_CONFIG['PG_PASSWORD']) \
        .option("query", tx_query) \
        .option("numPartitions", '10000') \
        .load()
    # display_df(tx_df)

    # select priority columns, convert array columns, and zip vin and vout fields
    clean_df = tx_df.withColumn("vin_txid_arr", split(col("vin_txid"), ",\s*")) \
        .withColumn("vin_vout_arr", split(col("vin_vout"), ",\s*")) \
        .withColumn("vin_txid_vout_zip", arrays_zip("vin_txid_arr", "vin_vout_arr")) \
        .withColumn("vout_value_arr", split(col("vout_value"), ",\s*")) \
        .withColumn("vout_n_arr", split(col("vout_n"), ",\s*")) \
        .withColumn("vout_addresses_arr", split(col("vout_addresses"), ",\s*")) \
        .withColumn("vout_value_n_addr_zip", arrays_zip("vout_value_arr", "vout_n_arr", "vout_addresses_arr"))
    # display_df(clean_df)

    # create left side DataFrame
    vin_cols = ['txid', 'height', 'time', 'ntx', 'vin_coinbase', 'vin_txid_vout_zip']
    vin_df = clean_df.select(vin_cols) \
        .withColumn("vin_txid_vout_tup", explode("vin_txid_vout_zip")) \
        .withColumn("vin_txid", col("vin_txid_vout_tup").vin_txid_arr) \
        .withColumn("vin_vout", col("vin_txid_vout_tup").vin_vout_arr) \
        .drop("vin_txid_vout_zip") \
        .drop("vin_txid_vout_tup") \
        .withColumn("left_key", concat(col("vin_txid"), lit("-"), col("vin_vout")))
    # display_df(vin_df)

    # create right side DataFrame
    vout_cols = ['txid', 'vout_value_n_addr_zip']
    vout_df = clean_df.select(vout_cols) \
        .withColumn("vout_value_n_addr_tup", explode("vout_value_n_addr_zip")) \
        .withColumn("vout_value", col("vout_value_n_addr_tup").vout_value_arr) \
        .withColumn("vout_n", col("vout_value_n_addr_tup").vout_n_arr) \
        .withColumn("vout_addr_pre", col("vout_value_n_addr_tup").vout_addresses_arr) \
        .withColumn("vout_addr", translate(col("vout_addr_pre"), '[]', '')) \
        .drop("vout_value_n_addr_zip") \
        .drop("vout_value_n_addr_tup") \
        .drop("vout_addr_pre") \
        .withColumnRenamed("txid", "txid2") \
        .withColumn("right_key", concat(col("txid2"), lit("-"), col("vout_n"))) \
        .drop("txid2")
    # display_df(vout_df)

    # join DataFrames
    join_df = vin_df.join(vout_df, vin_df.left_key == vout_df.right_key, 'left') \
        .drop("left_key") \
        .drop("right_key")
    # display_df(join_df)

    # create temporary table for GraphFrames
    join_df.registerTempTable("join_result")

    # ---CREATING GRAPHFRAME FOR CONNECTED COMPONENTS ALGORITHM---

    # create vertices DataFrame
    vertices = spark.sql("SELECT DISTINCT(vout_addr) FROM join_result") \
        .withColumnRenamed("vout_addr", "id")

    # generate DataFrame with single address connection for all addresses in a given txid group
    w = Window.partitionBy("txid").orderBy("vout_addr")
Window.partitionBy("txid").orderBy("vout_addr") first_by_txid_df = join_df.withColumn("rn", row_number().over(w)).where(col("rn") == 1) \ .withColumnRenamed("txid", "txid2") \ .withColumnRenamed("vout_addr", "vout_addr_first") \ .drop("rn") \ .drop("height") # first_by_txid_df.show(100) # join DataFrames interim_df = join_df.join(first_by_txid_df, join_df.txid == first_by_txid_df.txid2, 'left') # create edges DataFrame edges = interim_df.select("vout_addr", "vout_addr_first") \ .withColumnRenamed("vout_addr", "src") \ .withColumnRenamed("vout_addr_first", "dst") \ .na.drop() # create GraphFrame g = GraphFrame(vertices, edges) # set checkpoint directory in S3 sc.setCheckpointDir(config.SPARK_CONFIG['S3_CHECKPOINT']) # run connected components clst_result = g.connectedComponents() clst_result.show(100, truncate=False) # # ---FOR TESTING ONLY--- show result DataFrame for a specific block to verify correct results # clst_result.registerTempTable("clst_table") # view_df = spark.sql("SELECT * FROM clst_table ORDER BY clst_table.component") # view_df.show(1000, truncate=False) # write out to PostgreSQL write_clst_to_pg(clst_result)
# +---+---+----+------+
# | 1 | 1 | 7  | foo  |
# | 1 | 2 | 8  | foo  |
# | 1 | 3 | 9  | foo  |
# +---+---+----+------+
# If we go with this, we can not simply explode both columns independently:
df_exploded = df.withColumn('b', explode('b')) \
    .withColumn("c", explode(col("c")))
# It creates curr_row_count x no_of_items_in_array rows; here 9 rows, since the two
# arrays have 3 items each.
df_exploded.show(truncate=False)

# way 1
from pyspark.sql.functions import arrays_zip, array

df = df.withColumn("merged_col", arrays_zip("b", "c"))
df.show(truncate=False)
df.withColumn("added_array", concat(col("b"), col("c"))).show()

flattened_df = df.withColumn("exploded_merged_col", explode(col("merged_col")))
flattened_df.show(truncate=False)
flattened_df.printSchema()
flattened_df.select(col("a"), col("exploded_merged_col.b"), col("exploded_merged_col.c"), col("d")).show()

# way 2: useful to manipulate any column in a DF, for an arbitrary number of columns
# and items in them; otherwise use arrays_zip(c1, c2) as seen before
df = spark.createDataFrame([Row(a=1, b=[1, 2, 3], c=[7, 8, 9], d='foo')])


def zip_and_explode(*colnames, n):
col("seatRows.features")).withColumn("seat", explode("seatRows.seats")) # COMMAND ---------- seats.count() # COMMAND ---------- seats.select("seat.seatId").take(1) # COMMAND ---------- seat_info = seats.withColumn( "info", explode( F.arrays_zip(col("seat.seatId"), col("seat.features"), col("seat.pricingReferences")))) # COMMAND ---------- seat_info.select("info").take(1) # COMMAND ---------- seat_info.count() # COMMAND ---------- # MAGIC %md # MAGIC ##### # COMMAND ----------
def test_alignments_filter(self):
    work_dir = "gs://the-peoples-speech-west-europe/forced-aligner/cuda-forced-aligner/output_work_dir_5b/output_work_dir_5b"
    alignments_dir = os.path.join(work_dir, "alignments_json_jul_28")

    spark = self.spark
    alignments_df = spark.read.json(alignments_dir)

    alignments_df = alignments_df.withColumn(
        "duration_ms",
        F.expr(
            "transform(arrays_zip(alignments.end_ms, alignments.start_ms), x -> x.end_ms - x.start_ms)"
        ),
    )
    alignments_df = alignments_df.withColumn(
        "alignments",
        F.arrays_zip(
            alignments_df.alignments.cer,
            alignments_df.alignments.end_ms,
            alignments_df.alignments.label,
            alignments_df.alignments.start_ms,
            alignments_df.alignments.wer,
            alignments_df.duration_ms,
        ).cast(
            T.ArrayType(
                T.StructType([
                    T.StructField("cer", T.FloatType()),
                    T.StructField("end_ms", T.LongType()),
                    T.StructField("label", T.StringType()),
                    T.StructField("start_ms", T.LongType()),
                    T.StructField("wer", T.FloatType()),
                    T.StructField("duration_ms", T.LongType()),
                ]))),
    )
    alignments_df = alignments_df.drop("duration_ms")

    max_duration_ms = 20_000
    max_cer = 36.0
    min_duration_ms = 1_000

    alignments_df = alignments_df.withColumn(
        "alignments",
        F.filter(
            alignments_df.alignments,
            # Need to select this filter such that total number of
            # hours is 31,400
            lambda alignment: (alignment.duration_ms < max_duration_ms)
            & (alignment.cer < max_cer)
            & (alignment.duration_ms > min_duration_ms),
        ),
    )
    alignments_df = alignments_df.withColumn(
        "alignments",
        F.struct(
            alignments_df.alignments.cer,
            alignments_df.alignments.end_ms,
            alignments_df.alignments.label,
            alignments_df.alignments.start_ms,
            alignments_df.alignments.wer,
            # Is this the fix?
            alignments_df.alignments.duration_ms,
        ).cast(
            T.StructType([
                T.StructField("cer", T.ArrayType(T.FloatType())),
                T.StructField("end_ms", T.ArrayType(T.LongType())),
                T.StructField("label", T.ArrayType(T.StringType())),
                T.StructField("start_ms", T.ArrayType(T.LongType())),
                T.StructField("wer", T.ArrayType(T.FloatType())),
                T.StructField("duration_ms", T.ArrayType(T.LongType())),
            ])),
    )

    abc = alignments_df.select(
        F.sum(F.expr("aggregate(alignments.duration_ms, 0L, (x, acc) -> acc + x)"))
        / 1000.0 / 60.0 / 60.0
    ).collect()
    print("GALVEZ:max_duration_ms=", max_duration_ms)
    print("GALVEZ:max_cer=", max_cer)
    print("GALVEZ:min_duration_ms=", min_duration_ms)
    print("GALVEZ:total number of hours=", abc)
# fitting data to schema
df = dsraw.select(F.from_json("value", schema).alias("data")).select("data.*")

# creates offset and adds columns latitude2, longitude2 and time2 to each row,
# which is required to calculate speed and distance
d = df.withColumn("arr_size", F.size("latitude"))\
    .withColumn("lat2", F.expr("slice(latitude,1,arr_size-1)"))\
    .withColumn("long2", F.expr("slice(longitude,1,arr_size-1)"))\
    .withColumn("time2", F.expr("slice(timestamp,1,arr_size-1)"))\
    .withColumn("null_col", F.array([F.lit(None)]))\
    .withColumn("lat2", F.concat(F.col("null_col"), F.col("lat2")))\
    .withColumn("long2", F.concat(F.col("null_col"), F.col("long2")))\
    .withColumn("time2", F.concat(F.col("null_col"), F.col("time2")))

# explodes all array columns
d1 = d.withColumn("new", F.arrays_zip("heart_rate", "timestamp", "latitude", "longitude", "lat2", "long2", "time2"))\
    .withColumn("new", F.explode("new"))\
    .select("userId", "id",
            F.col("new.heart_rate").alias("heart_rate"),
            F.col("new.timestamp").alias("timestamp"),
            F.col("new.latitude").alias("lat"),
            F.col("new.longitude").alias("long"),
            F.col("new.lat2").alias("lat2"),
            F.col("new.long2").alias("long2"),
            F.col("new.time2").alias("time2"))

# haversine formula, calculates distance and speed between two points
d2 = d1.withColumn("distance", 3956 * (2 * F.asin(F.sqrt(
        F.sin((F.radians("lat") - F.radians("lat2")) / 2) ** 2
        + F.cos(F.radians("lat")) * F.cos(F.radians("lat2"))
        * F.sin((F.radians("long") - F.radians("long2")) / 2) ** 2))))\
    .withColumn("speed", F.col("distance") / ((F.col("timestamp") - F.col("time2")) / 3600))
# merge columns
df = df.withColumn('col', concat('col1', 'col2'))
# concat_ws can use a separator, concat can't
df = claim_group_info.withColumn('col', concat_ws('-', 'col1', 'col2'))

##########################
# explode multiple columns
##########################
# only works in spark > 2.4
from pyspark.sql.functions import arrays_zip, col

(df
 .withColumn("tmp", arrays_zip("b", "c"))
 .withColumn("tmp", explode("tmp"))
 .select("a", col("tmp.b"), col("tmp.c"), "d"))

# works for spark <= 2.3
from pyspark.sql.types import ArrayType, StructType, StructField, IntegerType
from pyspark.sql.functions import col, udf, explode

zip_ = udf(
    lambda x, y: list(zip(x, y)),
    ArrayType(StructType([
        StructField("first", IntegerType()),
        StructField("second", IntegerType())
    ]))
)
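# The note above stops at the UDF definition. A plausible continuation for Spark <= 2.3,
# mirroring the arrays_zip version above (this usage is an added sketch, not part of the
# original note), would be:
(df
 .withColumn("tmp", zip_("b", "c"))
 .withColumn("tmp", explode("tmp"))
 .select("a", col("tmp.first").alias("b"), col("tmp.second").alias("c"), "d"))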