def run_featurize_patents(spark: SparkSession):
    logger.info("Starting execution")
    full = read(spark=spark,
                storage_name=FILTERED_STORAGE_NAME,
                containter_name=FILTERED_CONTAINER_NAME,
                output_folder=FILTERED_OUTPUT_FOLDER,
                logger=logger)
    full = process_full(full)
    text = read(spark=spark,
                storage_name=PROCESSED_TEXT_STORAGE_NAME,
                containter_name=PROCESSED_TEXT_CONTAINER_NAME,
                output_folder=PROCESSED_TEXT_OUTPUT_FOLDER,
                logger=logger)
    text = process_text(text)
    result = full.join(text, ["_file"], "inner")
    save(spark=spark,
         df=result,
         num_files=NUM_OUTPUT_FILES,
         containter_name=FEATURES_CONTAINER_NAME,
         storage_name=FEATURES_STORAGE_NAME,
         output_folder=FEATURES_OUTPUT_FOLDER,
         logger=logger)
    logger.info("Process finished!")
    return result
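# The shared read/save helpers used by the run_* entry points are defined
# elsewhere in the project. A minimal sketch of what they plausibly look like,
# inferred from the call sites above (the Parquet format, coalesce strategy,
# and path layout are assumptions; the "containter_name" spelling is kept to
# match the existing keyword arguments):
def read(spark: SparkSession, storage_name: str, containter_name: str,
         output_folder: str, logger) -> DataFrame:
    input_path = (f"wasbs://{containter_name}@{storage_name}"
                  f".blob.core.windows.net/{output_folder}/")
    logger.info(f"Reading from: {input_path}")
    return spark.read.parquet(input_path)


def save(spark: SparkSession, df: DataFrame, num_files: int,
         storage_name: str, containter_name: str, output_folder: str,
         logger) -> None:
    output_path = (f"wasbs://{containter_name}@{storage_name}"
                   f".blob.core.windows.net/{output_folder}/")
    logger.info(f"Saving into: {output_path}")
    df.coalesce(num_files).write.mode("overwrite").parquet(output_path)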
def run_parquetizer(spark: SparkSession):
    logger.info("Starting execution")
    df = read(spark)
    result = process(df)
    save(spark=spark,
         df=result,
         num_files=NUM_OUTPUT_FILES,
         containter_name=PARQUET_CONTAINER_NAME,
         storage_name=PARQUET_STORAGE_NAME,
         output_folder=PARQUET_OUTPUT_FOLDER,
         logger=logger)
    logger.info("Process finished!")
    return result
def process(input_folder: str, output_file: str):
    with open(output_file, "w", encoding="utf-8") as f:
        for root, _directories, file_names in os.walk(input_folder):
            for file in file_names:
                if file.endswith(".xml"):  # To exclude xsd and other files
                    with open(os.path.join(root, file), "r",
                              encoding="utf-8") as input_file:
                        data = input_file.read()
                        output = sanitize_xml(data)
                        f.write(output)
    logger.info("Process completed!")
def log_language_distribution(df: DataFrame, field_name: str):
    """Generates a log with the distribution of languages"""
    logger.info(f"Getting language distribution for: {field_name}")
    if isinstance(df.select(field_name).schema.fields[0].dataType, ArrayType):
        languages = df.select(
            sf.explode_outer(field_name).alias("target_field"))
    else:
        languages = df.select(sf.col(field_name).alias("target_field"))
    languages_p = languages.groupBy("target_field._lang").count().toPandas()
    logger.info(
        f"Distribution of languages in {field_name}:\n{languages_p.to_string()}"
    )
def run_frequent_words(spark: SparkSession):
    logger.info("Starting execution")
    df = read(spark=spark,
              storage_name=PROCESSED_TEXT_STORAGE_NAME,
              containter_name=PROCESSED_TEXT_CONTAINER_NAME,
              output_folder=PROCESSED_TEXT_OUTPUT_FOLDER,
              logger=logger)
    result_p = process(df)
    save(result_p)
    logger.info("Process finished!")
    return result_p
def save_ml_model(spark, model, storage_name: str, container_name: str,
                  output_folder, output_suffix):
    """Saves a Spark model to blob storage"""
    key = spark.conf.get(
        f"spark.hadoop.fs.azure.account.key.{storage_name}.blob.core.windows.net"
    )
    create_if_not_exists_container(storage_name,
                                   container_name=container_name,
                                   key=key,
                                   logger=logger)
    output_path = (f"wasbs://{container_name}@{storage_name}"
                   f".blob.core.windows.net/{output_folder}_{output_suffix}/")
    model.write().overwrite().save(output_path)
    logger.info(f"Model saved in: {output_path}")
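# create_if_not_exists_container is referenced above but not defined in this
# section. A minimal sketch under the assumption that it wraps the
# azure-storage-blob SDK in the same way run_sanitize_data does below:
def create_if_not_exists_container(storage_name: str, container_name: str,
                                   key: str, logger):
    service = BlobServiceClient(account_url=get_account_url(storage_name),
                                credential=key)
    try:
        service.create_container(container_name)
        logger.info(f"Created container: {container_name}")
    except ResourceExistsError:
        logger.warning(f"Container already exists: {container_name}")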
def run_energy_classifier(spark: SparkSession):
    logger.info("Starting execution")
    df = read(spark=spark,
              storage_name=FEATURES_STORAGE_NAME,
              containter_name=FEATURES_CONTAINER_NAME,
              output_folder=FEATURES_OUTPUT_FOLDER,
              logger=logger)
    result = process(df)
    save(spark=spark,
         df=result,
         num_files=NUM_OUTPUT_FILES,
         containter_name=ENERGY_PATENTS_CONTAINER_NAME,
         storage_name=ENERGY_PATENTS_STORAGE_NAME,
         output_folder=ENERGY_PATENTS_OUTPUT_FOLDER,
         logger=logger)
    logger.info("Process finished!")
    return result
def unzip_data(target_file: str, output_folder: str) -> Tuple[str, int]:
    num_files = 0
    if target_file.endswith("tgz"):
        with tarfile.open(target_file, "r:gz") as tar:
            for member in tar.getmembers():
                if member.isreg():  # Skip TarInfo entries that are not regular files
                    # Remove the path by resetting the member name
                    member.name = os.path.basename(member.name)
                    logger.debug(f"Extracting {member.name} into {output_folder}")
                    tar.extract(member, output_folder)
                    num_files += 1
                else:
                    logger.info(f"Omitting file {member.name}")
    else:
        logger.error(f"Unsupported file extension: {target_file}")
    logger.info(f"Unzip completed! {num_files} files extracted from {target_file}")
    return target_file, num_files
def process(df: DataFrame) -> DataFrame:
    """
    Processes text columns, generating two columns per input column with the
    result after stop-word removal and lemmatization
    """
    cols = [
        OUTPUT_COL_ENGLISH_TEXT, OUTPUT_COL_ENGLISH_ABSTRACT_TEXT,
        OUTPUT_COL_ENGLISH_TITLE_TEXT, OUTPUT_COL_ENGLISH_CLAIMS_TEXT
    ]
    df = df.select("_file", *cols)
    # Initialized only once because it downloads data on each call
    lemma = LemmatizerModel.pretrained(
        name="lemma_antbnc",
        lang="en").setInputCols(["stopwords"]).setOutputCol("lemma")
    for col in cols:
        logger.info(f"Processing column: {col}")
        df = process_col(df=df, input_col=col, lemma=lemma)
    return df
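# process_col is referenced above but not defined in this section. A minimal
# sketch of one possible implementation with Spark NLP, matching the
# docstring (two extra columns per input column, after stop-word removal and
# lemmatization); the stage wiring and output column names are assumptions.
# Assumes: from pyspark.ml import Pipeline; from sparknlp.base import
# DocumentAssembler, Finisher; from sparknlp.annotator import Tokenizer,
# StopWordsCleaner
def process_col(df: DataFrame, input_col: str,
                lemma: LemmatizerModel) -> DataFrame:
    document = DocumentAssembler().setInputCol(input_col).setOutputCol("document")
    tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
    # The output column "stopwords" matches the input the pretrained lemmatizer expects
    stopwords = StopWordsCleaner().setInputCols(["token"]).setOutputCol("stopwords")
    finisher = Finisher().setInputCols(["stopwords", "lemma"]).setOutputCols(
        [f"{input_col}_stopwords", f"{input_col}_lemma"])
    pipeline = Pipeline(stages=[document, tokenizer, stopwords, lemma, finisher])
    return pipeline.fit(df).transform(df)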
def run_text_processor(spark: SparkSession):
    logger.info("Starting execution")
    df = read(spark=spark,
              storage_name=FILTERED_STORAGE_NAME,
              containter_name=FILTERED_CONTAINER_NAME,
              output_folder=FILTERED_OUTPUT_FOLDER,
              logger=logger)
    result = process(df)
    save(spark=spark,
         df=result,
         num_files=NUM_OUTPUT_FILES,
         containter_name=PROCESSED_TEXT_CONTAINER_NAME,
         storage_name=PROCESSED_TEXT_STORAGE_NAME,
         output_folder=PROCESSED_TEXT_OUTPUT_FOLDER,
         logger=logger)
    logger.info("Process finished!")
    return result
def process(df: DataFrame) -> DataFrame:
    container_path = f"wasbs://{ENERGY_CLASSIFIER_CONTAINER_NAME}@{ENERGY_CLASSIFIER_STORAGE_NAME}.blob.core.windows.net"
    blob_folder = f"{container_path}/{ENERGY_CLASSIFIER_OUTPUT_FOLDER}/"
    model = PipelineModel.load(blob_folder)
    result = model.transform(df)
    result = result.cache()
    # Count the rows; comparing the filtered DataFrames themselves to 0 would always be False
    num_pos = result.filter(sf.col("prediction") == 1).count()
    num_neg = result.filter(sf.col("prediction") == 0).count()
    if num_pos == 0:  # TODO parametrize. Maybe min percentage?
        logger.warning(f"There are {num_pos} positives")
    else:
        logger.info(f"There are {num_pos} positives")
    if num_neg == 0:  # TODO parametrize. Maybe min percentage?
        logger.warning(f"There are {num_neg} negatives")
    else:
        logger.info(f"There are {num_neg} negatives")
    return result
def save_results_lda(df_p: pd.DataFrame, key: str, list_num_topics: List[int]):
    """Saves a CSV file with the results for all topic counts to blob storage"""
    output_file = TOPIC_CLUSTERING_OUTPUT_LDA_RESULT_PREFIX + "_".join(
        [str(n) for n in list_num_topics]) + ".csv"
    logger.info(f"Saving local data into {output_file}")
    df_p.to_csv(output_file,
                header=True,
                index=False,
                sep=";",
                encoding="utf-8")
    logger.info("Uploading data...")
    output_url = get_account_url(TOPIC_CLUSTERING_STORAGE_NAME)
    output_service = BlobServiceClient(account_url=output_url, credential=key)
    output_container = output_service.get_container_client(
        TOPIC_CLUSTERING_CONTAINER_NAME)
    upload_blob_client = output_container.get_blob_client(output_file)
    with open(output_file, "rb") as data:
        upload_blob_client.upload_blob(data,
                                       blob_type="BlockBlob",
                                       overwrite=True)
    logger.info("Upload completed!")
def save(df_p: pd.DataFrame):
    logger.info(f"Saving local data into {FREQUENT_WORDS_OUTPUT_FILE_NAME}")
    assert FREQUENT_WORDS_OUTPUT_FILE_NAME.endswith(".csv")
    df_p.to_csv(FREQUENT_WORDS_OUTPUT_FILE_NAME,
                header=True,
                index=False,
                sep=",",
                encoding="utf-8")
    logger.info("Uploading data...")
    output_url = get_account_url(FREQUENT_WORDS_STORAGE_NAME)
    output_service = BlobServiceClient(account_url=output_url,
                                       credential=FREQUENT_WORDS_STORAGE_KEY)
    output_container = output_service.get_container_client(
        FREQUENT_WORDS_CONTAINER_NAME)
    upload_blob_client = output_container.get_blob_client(
        FREQUENT_WORDS_OUTPUT_FILE_NAME)
    with open(FREQUENT_WORDS_OUTPUT_FILE_NAME, "rb") as data:
        upload_blob_client.upload_blob(data,
                                       blob_type="BlockBlob",
                                       overwrite=True)
    logger.info("Upload completed!")
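# get_account_url is used by the upload helpers but not defined in this
# section; given the standard Azure Blob Storage endpoint format, it is
# presumably equivalent to:
def get_account_url(storage_name: str) -> str:
    return f"https://{storage_name}.blob.core.windows.net"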
def run_sanitize_data():
    logger.info("Starting execution")
    input_url = get_account_url(INPUT_STORAGE_NAME)
    service = BlobServiceClient(account_url=input_url,
                                credential=INPUT_STORAGE_KEY)
    container = service.get_container_client(INPUT_CONTAINER_NAME)
    output_url = get_account_url(SANITIZED_STORAGE_NAME)
    output_service = BlobServiceClient(account_url=output_url,
                                       credential=SANITIZED_STORAGE_KEY)
    output_container = output_service.get_container_client(
        SANITIZED_CONTAINER_NAME)
    try:
        logger.info(f"Creating container: {SANITIZED_CONTAINER_NAME}")
        output_container.create_container()
    except ResourceExistsError:
        logger.warning("Output container already exists")
    blobs = list(container.list_blobs())
    logger.info(f"There are {len(blobs)} blobs to process")
    info_unzip_num_files = []
    for n, blob in enumerate(blobs):
        try:
            blob_name = blob["name"]
            logger.info(
                f"Processing blob {n + 1} of {len(blobs)}: {blob_name}")
        except KeyError:
            logger.error(f"Omitting blob, it doesn't have a name: {blob}")
            continue
        blob_client = container.get_blob_client(blob=blob_name)
        init_local_directories()
        # Download
        target_blob_file = os.path.join(output_tmp_folder_blob, blob_name)
        logger.info(f"Downloading {blob_name} into {target_blob_file}")
        download_data(blob=blob_client, target_file=target_blob_file)
        # Process
        unzip_info = unzip_data(target_file=target_blob_file,
                                output_folder=output_tmp_folder_xmls)
        info_unzip_num_files.append(unzip_info)
        output_xml_file = os.path.splitext(
            os.path.basename(blob_name))[0] + ".xml"
        process(input_folder=output_tmp_folder_xmls,
                output_file=output_xml_file)
        # Upload
        upload_blob_client = output_container.get_blob_client(output_xml_file)
        with open(output_xml_file, "rb") as data:
            upload_blob_client.upload_blob(data,
                                           blob_type="BlockBlob",
                                           overwrite=True)
        logger.info("Upload completed!")
        os.remove(output_xml_file)
        logger.info("Local file deleted!")
    logger.info("Uploaded info:")
    total_num_registers = 0
    for name, num_files in info_unzip_num_files:
        logger.info(f"File {name} -> {num_files} registers")
        total_num_registers += num_files
    logger.info(f"Total registers uploaded: {total_num_registers}")
    logger.info("Process finished!")
def download_data(blob, target_file: str):
    with open(target_file, "wb") as my_blob:
        blob_data = blob.download_blob()
        blob_data.readinto(my_blob)
    logger.info("Download completed!")
def read(spark: SparkSession) -> DataFrame:
    input_path = f"wasbs://{SANITIZED_CONTAINER_NAME}@{SANITIZED_STORAGE_NAME}.blob.core.windows.net/"
    logger.info(f"Reading from: {input_path}")
    df = (spark.read.format("com.databricks.spark.xml")
          .option("rowTag", "questel-patent-document")
          .option("mode", "DROPMALFORMED")
          .load(input_path))
    return df
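# The reader above relies on the external spark-xml data source, so the Spark
# session must be created with that package available. A sketch of the
# create_spark_session helper used by the driver scripts (the helper itself
# is not shown here; the artifact version is an assumption):
def create_spark_session(app_name: str) -> SparkSession:
    return (SparkSession.builder
            .appName(app_name)
            .config("spark.jars.packages",
                    "com.databricks:spark-xml_2.12:0.15.0")
            .getOrCreate())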
def run_energy_clustering(spark: SparkSession,
                          list_num_topics: List[int]) -> pd.DataFrame:
    logger.info("Starting execution")
    df = read(spark=spark,
              storage_name=ENERGY_PATENTS_STORAGE_NAME,
              containter_name=ENERGY_PATENTS_CONTAINER_NAME,
              output_folder=ENERGY_PATENTS_OUTPUT_FOLDER,
              logger=logger)
    min_df = 0.05
    df = df.select("_file", "prediction", "english_text_features")
    df = df.filter(sf.col("prediction") == 1)
    cv = CountVectorizer(inputCol="english_text_features",
                         outputCol="features",
                         minDF=min_df)
    cv_model = cv.fit(df)
    df_vectorized = cv_model.transform(df)
    df_vectorized.persist(StorageLevel.DISK_ONLY)
    logger.info(f"Vocabulary size: {len(cv_model.vocabulary)}")
    save_ml_model(spark=spark,
                  model=cv_model,
                  storage_name=TOPIC_CLUSTERING_STORAGE_NAME,
                  container_name=TOPIC_CLUSTERING_CONTAINER_NAME,
                  output_folder=TOPIC_CLUSTERING_OUTPUT_CV,
                  output_suffix=min_df)
    results_log_likelihood = []
    results_log_perplexity = []
    for n in list_num_topics:
        lda = LDA(k=n, maxIter=100, seed=18)
        model = lda.fit(df_vectorized)
        ll = model.logLikelihood(df_vectorized)
        lp = model.logPerplexity(df_vectorized)
        logger.info(f"Num topics: {n}")
        logger.info(
            f"The lower bound on the log likelihood of the entire corpus: {ll}"
        )
        logger.info(f"The upper bound on perplexity: {lp}")
        results_log_likelihood.append(ll)
        results_log_perplexity.append(lp)
        # Save the fitted LDA model (not the CountVectorizer model again)
        save_ml_model(spark=spark,
                      model=model,
                      storage_name=TOPIC_CLUSTERING_STORAGE_NAME,
                      container_name=TOPIC_CLUSTERING_CONTAINER_NAME,
                      output_folder=TOPIC_CLUSTERING_OUTPUT_LDA,
                      output_suffix=n)
    data = {
        "num_topics": list_num_topics,
        "log_likelihood": results_log_likelihood,
        "log_perplexity": results_log_perplexity,
    }
    result_p = pd.DataFrame(data)
    key = spark.conf.get(
        f"spark.hadoop.fs.azure.account.key.{TOPIC_CLUSTERING_STORAGE_NAME}.blob.core.windows.net"
    )
    save_results_lda(result_p, key=key, list_num_topics=list_num_topics)
    logger.info("Process finished!")
    return result_p
sep=";", encoding="utf-8") logger.info(f"Uploading data...") output_url = get_account_url(TOPIC_CLUSTERING_STORAGE_NAME) output_service = BlobServiceClient(account_url=output_url, credential=key) output_container = output_service.get_container_client( TOPIC_CLUSTERING_CONTAINER_NAME) upload_blob_client = output_container.get_blob_client(output_file) with open(output_file, "rb") as data: upload_blob_client.upload_blob(data, blob_type="BlockBlob", overwrite=True) logger.info("Upload completed!") if __name__ == '__main__': if len(sys.argv) > 1: list_num_topics = [] for n in range(1, len(sys.argv)): list_num_topics.append(int(sys.argv[n])) else: list_num_topics = [ 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 16, 18, 20, 30, 50, 100 ] logger.info( f"Executing with the following list of number of topics: {list_num_topics}" ) spark_session = create_spark_session("energy_clustering") run_energy_clustering(spark_session, list_num_topics)