def main(args):
    """Module entry point: embed sequences using a pre-trained SGT model.

    Loads a fitted SGT model and an input data frame, validates that the
    sequence column has no missing values, scores the sequences into an
    embedding data frame and saves it to the output directory.
    """
    seq_col = args.sequence_column
    id_col = args.identifier_column
    logger.debug(f'input-dir {args.input_dir}')
    logger.debug(f'model input dir {args.model_input_dir}')
    logger.debug(f'sequence-column {seq_col}')
    logger.debug(f'identifier-column {id_col}')
    logger.debug(f'output-dir {args.output_dir}')
    sgt = load_model_from_directory(args.model_input_dir,
                                    model_loader=joblib_loader).data
    input_df = load_data_frame_from_directory(args.input_dir).data
    if input_df[seq_col].isnull().sum().sum() > 0:
        # Missing sequence values cannot be embedded; fail fast.
        # BUG FIX: message previously read "column<name>" with no space.
        print(f'column {seq_col} contains missing values ')
        sys.exit(1)
    embedding_df = score(input_df, sgt, seq_col, id_col)
    # BUG FIX: original was print('f embedding shape{...}') which printed the
    # literal text because the f-prefix was inside the quotes.
    print(f'embedding shape {embedding_df.shape}')
    print(embedding_df.head())
    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=embedding_df,
        schema=DataFrameSchema.data_frame_to_dict(embedding_df))
def predict(args):
    """Score the input dataset with a trained AutoML model and persist results."""
    # Dataset that needs to be scored.
    df = load_data_frame_from_directory(args.input_data).data
    # Resolve the workspace and the originating AutoML run.
    ws = automl_helper.get_workspace()
    automl_run = automl_helper.get_automl_run(ws, args.experiment, args.run_id)
    properties = automl_run.properties
    # Rehydrate the trained model from the run artifacts.
    model = automl_helper.load_automl_model(automl_run)
    print("Using model to score input data...")
    forecasting = isinstance(
        model,
        azureml.automl.runtime.shared.model_wrappers.ForecastingPipelineWrapper)
    if forecasting:
        # Forecasting models optionally consume a 'y_query' column.
        y_query = df.pop('y_query').values if 'y_query' in df.columns else None
        results = model.forecast(df, y_query)[0]
    else:
        results = model.predict(df)
    results_df = pd.DataFrame(results, columns=['Predictions'])
    print(f"This is how your prediction data looks like:\n{results_df.head()}")
    # Persist predictions for downstream pipeline steps.
    automl_helper.write_prediction_dataframe(args.predictions_data, results_df)
def main(args):
    """Module entry function: apply a previously fitted PCA model.

    args: list, user parameters
    """
    for message in (f'input-dir {args.input_dir}',
                    f'model input dir {args.model_input_dir}',
                    f'output-dir {args.output_dir}'):
        logger.debug(message)
    input_df = load_data_frame_from_directory(args.input_dir).data
    logger.debug(f'{input_df.describe()}\n shape{input_df.shape} ')
    # Load the fitted PCA module and project the input through it.
    pca_module = load_model_from_directory(args.model_input_dir,
                                           model_loader=pcamodule_loader).data
    logger.debug(pca_module.pca_instance)
    projected_df = score(pca_module, input_df)
    logger.debug(f'output shape {projected_df.shape}')
    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=projected_df,
        schema=DataFrameSchema.data_frame_to_dict(projected_df))
def process_data(args, file_name):
    """Build vocabulary and label mappings from a labelled text data frame.

    :return: word2id: map word to id
             id2word: map id to word
             label2id: map label to id
             id2label: map id to label
             max_len: max length of text (in tokens)
    """
    df = load_data_frame_from_directory(file_name).data
    label_set = set()
    word_set = set()
    max_len = 0
    for _, row in df.iterrows():
        label_set.add(row[args.label_column])
        tokens = word_tokenize(row[args.text_column])
        max_len = max(max_len, len(tokens))
        word_set.update(tokens)
    # Reserve ids 0 and 1 for the unknown-word and end-of-sentence markers.
    word2id = {'<UNK>': 0, '<EOS>': 1}
    id2word = {0: '<UNK>', 1: '<EOS>'}
    for i, word in enumerate(word_set):
        word2id[word] = i + 2
        id2word[i + 2] = word
    label2id = {}
    id2label = {}
    for i, label in enumerate(label_set):
        label2id[label] = i
        id2label[i] = label
    return word2id, id2word, label2id, id2label, max_len
def main(args):
    """Module entry function: fit a PCA transformer and save data + model.

    args: list, transformer parameters requested by user.
    """
    for message in (f'input-dir {args.input_dir}',
                    f'output-dir {args.output_dir}',
                    f'model output dir {args.model_output_dir}'):
        logger.debug(message)
    input_df = load_data_frame_from_directory(args.input_dir).data
    logger.debug(f'{input_df.describe()}\n shape{input_df.shape} ')
    # Fit the PCA transformer on the input and project it in one step.
    pca_module = PCAModule(args)
    logger.debug(pca_module.pca_instance)
    transformed_df = pca_module.fit_transform(input_df)
    pca_module.log_metrics(input_df.columns)
    logger.debug(f'output shape {transformed_df.shape}')
    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=transformed_df,
        schema=DataFrameSchema.data_frame_to_dict(transformed_df))
    save_model_to_directory(save_to=args.model_output_dir,
                            model_dumper=pca_module_dumper(data=pca_module))
def load(cls, load_from: str):
    """Construct an instance from a directory path or a DataFrameDirectory.

    Raises NotImplementedError for any other input type.
    """
    if isinstance(load_from, str):
        # A path: materialize the DataFrameDirectory from disk first.
        dfd = load_data_frame_from_directory(load_from_dir=load_from)
    elif isinstance(load_from, DataFrameDirectory):
        dfd = load_from
    else:
        raise NotImplementedError(
            f"Cannot load data from {load_from} of type {type(load_from)}")
    return cls(df=dfd.data,
               column_attributes=dfd.schema_instance.column_attributes)
def invoke(input_path, detect_mode, timestamp_column, value_column, batch_size,
           threshold, sensitivity, appendMode, compute_stats_in_visualization,
           output_path):
    """Run spectral-residual anomaly detection over the selected series columns.

    Loads a DataFrameDirectory, validates the timestamp column and each value
    column (parseable floats, finite, within bounds), runs sr_detector.detect
    and saves the result (optionally merged with the input) to output_path.

    Raises UserError for any input validation failure.
    """
    data_frame_directory = load_data_frame_from_directory(input_path)
    logger.debug(f"Shape of loaded DataFrame: {data_frame_directory.data.shape}")
    if data_frame_directory.data.shape[0] < MIN_POINTS:
        raise UserError(NotEnoughPoints.format(MIN_POINTS))
    if 0 < batch_size < MIN_POINTS:
        raise UserError(InvalidBatchSize.format(MIN_POINTS))
    # Column selectors arrive URL-encoded; decode before building the query.
    timestamp_column_selector = ColumnSelection(unquote(timestamp_column))
    timestamp = timestamp_column_selector.select_dataframe_directory(
        data_frame_directory).data
    timestamps = pd.to_datetime(timestamp.iloc[:, 0].values)
    if np.any(np.isnat(timestamps)):
        raise UserError(InvalidTimestamps)
    res = is_timestamp_ascending(timestamps)
    if res == -1:
        raise UserError(InvalidSeriesOrder)
    elif res == -2:
        raise UserError(DuplicateSeriesTimestamp)
    data_column_selector = ColumnSelection(unquote(value_column))
    data_columns = data_column_selector.select_dataframe_directory(
        data_frame_directory).data
    for col in data_columns.columns:
        try:
            float_data = data_columns[col].apply(float)
        except Exception as e:
            # BUG FIX: chain the underlying conversion error so the root
            # cause is visible in the traceback (was `raise ...` without
            # `from e`, discarding the original exception context).
            raise UserError(InvalidValueFormat.format(col)) from e
        if not np.all(np.isfinite(float_data)):
            raise UserError(InvalidSeriesValue.format(col))
        if np.any(np.less(float_data, VALUE_LOWER_BOUND)) or \
                np.any(np.greater(float_data, VALUE_UPPER_BOUND)):
            raise UserError(ValueOverflow.format(col))
        data_columns[col] = float_data
    result = sr_detector.detect(timestamps, data_columns,
                                detect_mode=detect_mode,
                                batch_size=batch_size,
                                threshold=threshold,
                                sensitivity=sensitivity)
    if appendMode is True:
        # Append detection results to the original columns, row-aligned.
        result = pd.merge(data_frame_directory.data, result,
                          left_index=True, right_index=True)
    save_data_frame_to_directory(
        output_path, result,
        compute_stats_in_visualization=compute_stats_in_visualization)
def gdal_sample(
        # Module interface (inputs, outputs, parameters) is declared here.
        output_dir1: OutputDirectory(),
        output_dir2: OutputDirectory(),
        input_dir1: InputDirectory(),
        input_dir2: InputDirectory()):
    """Sample module: echo the input paths and preview the first input frame."""
    print('I am in module definition')
    for label, directory in (('input_dir1', input_dir1),
                             ('input_dir2', input_dir2)):
        print(f'{label}: {Path(directory).resolve()}')
    # Custom logic: read the first input as a DataFrameDirectory and peek.
    dfd1 = load_data_frame_from_directory(input_dir1)
    data_frame1 = dfd1.data
    print(data_frame1.head(10))
def testAnomalyAndMargin(self):
    """End-to-end check of AnomalyAndMargin mode on a smooth sine series."""
    # Synthesize 200 daily points of a sine wave as the input series.
    frame = pd.DataFrame()
    frame['timestamp'] = pd.date_range(start='2020-01-01', periods=200,
                                       freq='1D')
    frame['value'] = np.sin(np.linspace(1, 20, 200))
    save_data_frame_to_directory(self.__input_path, frame)
    invoker.invoke(self.__input_path, "AnomalyAndMargin",
                   self.__timestamp_column, self.__value_column,
                   self.__batch_size, self.__threshold, self.__sensitivity,
                   self.__append_mode, self.compute_stats_in_visualization,
                   self.__output_path)
    result = load_data_frame_from_directory(self.__output_path).data
    self.assertEqual(result.shape[0], 200)
    # Every expected output column must be present in the result frame.
    for column in ('value', 'isAnomaly', 'score', 'expectedValue',
                   'upperBoundary', 'lowerBoundary'):
        self.assertTrue(column in result.columns)
def __init__(self, file, word2id, label2id, args, transform=sentence2idlist,
             max_len=-1):
    """Dataset wrapper: encode each row's text and label.

    Each item is a (token-id array, label-id) pair built from the text and
    label columns named in *args*.
    """
    self.transform = transform
    self.max_len = max_len
    df = load_data_frame_from_directory(file).data
    # Encode every row up front; items are (np.array(token_ids), label_id).
    self.data = [
        (np.array(self.transform(row[args.text_column], word2id)),
         label2id[row[args.label_column]])
        for _, row in df.iterrows()
    ]
    self.len = len(self.data)
def main(args=None):
    """Module entry function: compute a correlation matrix of the input frame."""
    in_dir = args.input_dir
    method = args.correlation_method
    logger.debug(f'input-dir {in_dir}')
    logger.debug(f'correlation-method {method}')
    logger.debug(f'output-dir {args.output_dir}')
    frame = load_data_frame_from_directory(args.input_dir).data
    # Delegate the actual correlation computation to the module class.
    corr_df = ComputeCorrelationModule(method).compute(frame)
    logger.debug(f'correlation matrix shape {corr_df.shape}')
    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=corr_df,
        schema=DataFrameSchema.data_frame_to_dict(corr_df))
def main(args):
    """Module entry function: compute textual similarity for one text column.

    Validates the column has no missing values, fits the selected transformer,
    and saves both the similarity matrix and the embeddings.
    """
    transformer = SUPPORTED_TRANSFORMERS[args.transformer]
    logger.debug(f'input-dir {args.input_dir}')
    logger.debug(f'column {args.column_name}')
    logger.debug(f'distance {args.distance}')
    logger.debug(f'transformer {transformer}')
    logger.debug(f'sim-dir {args.sim_dir}')
    input_df = load_data_frame_from_directory(args.input_dir).data
    if input_df[args.column_name].isnull().sum().sum() > 0:
        # BUG FIX: message previously read "column<name>" with no space.
        logger.debug(f'column {args.column_name} contains missing values ')
        sys.exit(1)
    sts = TextualSimilarity(transformer=transformer,
                            distance_func=args.distance)
    embedding_df, sim_df = sts.fit_transform(input_df[args.column_name].values)
    # Prepend the original text column so the matrix rows are identifiable.
    sim_df.insert(0, args.column_name, input_df[args.column_name])
    logger.debug(f'similarity matrix shape {sim_df.shape}')
    logger.debug(f'embedding shape {embedding_df.shape}')
    save_data_frame_to_directory(
        save_to=args.sim_dir,
        data=sim_df,
        schema=DataFrameSchema.data_frame_to_dict(sim_df))
    save_data_frame_to_directory(
        save_to=args.embedding_dir,
        data=embedding_df,
        schema=DataFrameSchema.data_frame_to_dict(embedding_df))
def main(args=None):
    """Module entry point: fit an SGT embedder, save embeddings and the model."""
    seq_col = args.sequence_column
    id_col = args.identifier_column
    length_sensitive = args.length_sensitive
    kappa = args.kappa
    for message in (f'input-dir {args.input_dir}',
                    f'sequence-column {seq_col}',
                    f'identifier-column {id_col}',
                    f'length-sensitive {length_sensitive}',
                    f'kappa {kappa}',
                    f'output-dir {args.output_dir}',
                    f'model output dir {args.model_output_dir}'):
        logger.debug(message)
    input_df = load_data_frame_from_directory(args.input_dir).data
    if input_df[seq_col].isnull().sum().sum() > 0:
        # Missing sequence values cannot be embedded; abort the module run.
        logger.debug(f'column {seq_col} contains missing values ')
        sys.exit(1)
    embedding_df, sgt = compute_embeddings(input_df, seq_col, kappa,
                                           length_sensitive, id_col)
    logger.debug(f'embedding shape {embedding_df.shape}')
    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=embedding_df,
        schema=DataFrameSchema.data_frame_to_dict(embedding_df))
    save_model_to_directory(save_to=args.model_output_dir,
                            model_dumper=sgt_dumper(data=sgt))
# Normalize step: replace undefined values and rename columns.
import argparse  # BUG FIX: argparse was used below but never imported
import os

import pandas as pd

from azureml.studio.core.io.data_frame_directory import load_data_frame_from_directory, save_data_frame_to_directory

print(
    "Replace undefined values to relevant values and rename columns to meaningful names"
)

parser = argparse.ArgumentParser("normalize")
parser.add_argument("--filtered_data", type=str, help="filtered taxi data")
parser.add_argument("--output_normalize", type=str,
                    help="replaced undefined values and renamed columns")

args = parser.parse_args()

combined_converted_df = load_data_frame_from_directory(args.filtered_data).data

print("Argument (output normalized taxi data path): %s" % args.output_normalize)

# These functions replace undefined values and rename to use meaningful names.
# In 'store_forward', "0" means "not stored and forwarded" -> normalize to "N",
# and missing values default to "N" as well.
replaced_stfor_vals_df = (combined_converted_df.replace({
    "store_forward": "0"
}, {
    "store_forward": "N"
}).fillna({"store_forward": "N"}))

# In 'distance', the literal string ".00" means zero distance; missing
# distances also default to 0.
replaced_distance_vals_df = (replaced_stfor_vals_df.replace({
    "distance": ".00"
}, {
    "distance": 0
}).fillna({"distance": 0}))
from textclscnn.args_util import preprocess_args

nltk.download('punkt')


class DataPreprocessor(object):
    """Tokenizes a text column and maps each token to its vocabulary id."""

    def __init__(self, vocab_path, text_column):
        self.vocab_path = vocab_path
        self.text_column = text_column
        # Matches any character outside the CJK unified-ideograph range.
        self.rule = re.compile(r"[^\u4e00-\u9fa5]")
        self.cut = word_tokenize
        # Vocabulary is a pickled {word: id} mapping stored alongside the model.
        with open(self.vocab_path + '/' + 'word2id.pkl', 'rb') as f:
            self.word2id = pickle.load(f)

    def process(self, data_frame: pd.DataFrame):
        """Return a copy of *data_frame* with an added 'text_id' id-list column."""
        vocab = self.word2id

        def encode(text):
            # NUL characters and out-of-vocabulary tokens map to id 0 (<UNK>).
            return [
                vocab[word] if word != '\x00' and word in vocab else 0
                for word in word_tokenize(text)
            ]

        out_df = data_frame.copy()
        out_df['text_id'] = data_frame[self.text_column].apply(encode)
        print(f'first 5 lines of processed df: {out_df.head()}')
        return out_df


if __name__ == '__main__':
    args = preprocess_args()
    processor = DataPreprocessor(args.input_vocab, args.text_column)
    data_frame = load_data_frame_from_directory(args.input_data).data
    save_data_frame_to_directory(args.output_data,
                                 data=processor.process(data_frame))
# NOTE(review): the statements below are the tail of a plotting helper whose
# `def` lies before this chunk; they finish styling a figure and return it.
        plt.ylim([0, 1.1])
        plt.ylabel('score')
        plt.title('Scores')
        return f2_plt

    def evaluation(self, df_true, df_predict, df_prob, output_eval_dir):
        """Log and save precision/recall and score plots for the current run.

        df_true: ground-truth labels; df_predict: predicted labels;
        df_prob: predicted probabilities; output_eval_dir: where PNGs go.
        """
        run = Run.get_context()
        # Precision/recall curve, logged to the run and saved to disk.
        f1_plt = self.prcurve(df_true, df_predict, df_prob)
        run.log_image("precision/recall curve", plot=f1_plt)
        f1_plt.savefig(os.path.join(output_eval_dir, 'precision_recall.png'))
        # Aggregate score plot, likewise logged and saved.
        f2_plt = self.scores(df_true, df_predict)
        run.log_image("scores", plot=f2_plt)
        f2_plt.savefig(os.path.join(output_eval_dir, 'scores.png'))


if __name__ == '__main__':
    args = predict_args()
    predictor = Predictor(args.trained_model)
    df = load_data_frame_from_directory(args.predict_path).data
    out_df = predictor.predict(df)
    save_data_frame_to_directory(args.predict_result_path, data=out_df)
    label_column = predictor.label_column
    print(f'label column {label_column}')
    if label_column in df.columns:
        # Ground-truth labels are present, so evaluation can also run.
        print(f"Got actual label column {label_column}, evaluating:")
        predictor.evaluation(df[label_column], out_df['Scored Label'],
                             out_df['Scored Prob'], args.predict_result_path)
# NOTE(review): this chunk is truncated at both ends — the opening
# parser.add_argument( call for '--remove-seen-items' and the closing
# arguments of predict_ratings(...) lie outside this view.
    '--remove-seen-items',
    type=str,
    help='Remove items seen in training from recommendation')
parser.add_argument('--score-result', help='Ratings or items to output')
args, _ = parser.parse_known_args()
logger.info(f"Arguments: {args}")

# Optional string flags are parsed with strtobool; None when not supplied.
sort_top_k = strtobool(args.sort_top_k) if args.sort_top_k else None
remove_seen_items = strtobool(
    args.remove_seen_items) if args.remove_seen_items else None
normalize = strtobool(args.normalize) if args.normalize else None

sar_model = load_model_from_directory(args.trained_model,
                                      model_loader=joblib_loader).data
dataset_to_score = load_data_frame_from_directory(
    args.dataset_to_score).data
logger.debug(f"Shape of loaded DataFrame: {dataset_to_score.shape}")

score_sar_module = ScoreSARModule(model=sar_model,
                                  input_data=dataset_to_score)

score_type = ScoreType(args.score_type)
if score_type == ScoreType.ITEM_RECOMMENDATION:
    # NOTE(review): remove_seen receives the RAW string args.remove_seen_items
    # rather than the parsed remove_seen_items boolean computed above —
    # looks like a bug; confirm with the module owner before changing.
    score_result = score_sar_module.recommend_items(
        ranking_metric=RankingMetric(args.ranking_metric),
        top_k=args.top_k,
        sort_top_k=sort_top_k,
        remove_seen=args.remove_seen_items,
        normalize=normalize)
elif score_type == ScoreType.RATING_PREDICTION:
    score_result = score_sar_module.predict_ratings(
def invoke(input_path, detect_mode, timestamp_column, value_column, batch_size,
           threshold, sensitivity, appendMode, compute_stats_in_visualization,
           output_path):
    """Run spectral-residual anomaly detection on one timestamp/value column pair.

    Loads the input frame, validates both named columns (presence, parseable
    timestamps in ascending order without duplicates, finite in-bounds float
    values), runs sr_detector.detect and saves the result — optionally merged
    with the input — to output_path.

    Raises Exception with a module-specific message for validation failures.
    """
    df = load_data_frame_from_directory(input_path).data
    logging.info(f"Shape of loaded DataFrame: {df.shape}")
    if df.shape[0] < MIN_POINTS:
        raise Exception(NotEnoughPoints.format(MIN_POINTS))
    if 0 < batch_size < MIN_POINTS:
        raise Exception(InvalidBatchSize.format(MIN_POINTS))
    # Both named columns must exist (no redundant list() materialization).
    if timestamp_column not in df.columns:
        raise Exception(ColumnNotFoundError.format(timestamp_column))
    if value_column not in df.columns:
        raise Exception(ColumnNotFoundError.format(value_column))
    timestamp = pd.DataFrame(df, columns=[timestamp_column])
    timestamps = pd.to_datetime(timestamp.iloc[:, 0].values)
    if np.any(np.isnat(timestamps)):
        raise Exception(InvalidTimestamps)
    res = is_timestamp_ascending(timestamps)
    if res == -1:
        raise Exception(InvalidSeriesOrder)
    elif res == -2:
        raise Exception(DuplicateSeriesTimestamp)
    data_columns = pd.DataFrame(df, columns=[value_column])
    for col in data_columns:
        try:
            float_data = data_columns[col].apply(float)
        except Exception as e:
            # BUG FIX: chain the underlying conversion error so the root
            # cause is visible (was raised without `from e`).
            raise Exception(InvalidValueFormat.format(col)) from e
        if not np.all(np.isfinite(float_data)):
            raise Exception(InvalidSeriesValue.format(col))
        if np.any(np.less(float_data, VALUE_LOWER_BOUND)) or np.any(
                np.greater(float_data, VALUE_UPPER_BOUND)):
            raise Exception(ValueOverflow.format(col))
        data_columns[col] = float_data
    result = sr_detector.detect(timestamps, data_columns,
                                detect_mode=detect_mode,
                                batch_size=batch_size,
                                threshold=threshold,
                                sensitivity=sensitivity)
    if appendMode is True:
        # Append detection output to the original columns, row-aligned.
        result = pd.merge(df, result, left_index=True, right_index=True)
    save_data_frame_to_directory(
        output_path, result,
        compute_stats_in_visualization=compute_stats_in_visualization)
parser = argparse.ArgumentParser("XGBRegressorEvaluation")
parser.add_argument("--Evaluation_Data", type=str, help="Evaluation dataset.")
# NOTE: the flag itself is spelled "--Lable_Col" (sic); it is kept unchanged
# for backward compatibility with existing pipelines. Only the user-facing
# help text spelling is fixed.
parser.add_argument("--Lable_Col", type=str,
                    help="Label column in the evaluation dataset.")
parser.add_argument("--Model_Path", type=str,
                    help="Path where contains model file.")
parser.add_argument("--Model_FileName", type=str,
                    help="Name of the model file.")
parser.add_argument("--Evaluation_Output", type=str, help="Evaluation result")

args = parser.parse_args()

## Load data from DataFrameDirectory to Pandas DataFrame
evaluation_df = load_data_frame_from_directory(args.Evaluation_Data).data

## Prepare evaluation data: all columns except the label are features.
evaluation_df_features = evaluation_df[[
    c for c in evaluation_df.columns if c != args.Lable_Col
]]
evaluation_df_label = evaluation_df[args.Lable_Col]

## Load the trained regressor from its JSON model file.
xg_reg = xgb.XGBRegressor()
xg_reg.load_model(args.Model_Path + "/" + args.Model_FileName)

## Evaluation: root-mean-squared error of predictions vs. true labels.
preds = xg_reg.predict(evaluation_df_features)
rmse = np.sqrt(mean_squared_error(evaluation_df_label, preds))
print("RMSE: %f" % (rmse))
def inference(self, data_path, save_path):
    """Load a frame from *data_path*, run the model, save results to *save_path*."""
    os.makedirs(save_path, exist_ok=True)
    # Renamed local: the original used `input`, shadowing the builtin.
    frame = load_data_frame_from_directory(data_path).data
    predictions = self.run(frame)
    save_data_frame_to_directory(save_path, data=predictions)
int_param = args.int_parameter
bool_param = args.boolean_parameter
enum_param = args.enum_parameter

# Echo every received parameter for traceability.
logger.debug(f"Received parameters:")
for value in (str_param, int_param, bool_param, enum_param):
    logger.debug(f" {value}")

if rank > 0:
    # Worker ranks block until rank 0 sends them the data shape.
    logger.debug(f"I'm rank {rank}/{size}, wait for data.")
    data = comm.recv(source=0, tag=rank)
    logger.debug(f"Received shape of loaded DataFrame: {data} ")
else:
    # Rank 0 loads the input, writes it straight back out, then fans the
    # frame's shape out to every other rank.
    logger.debug(f"I'm rank 0/{size}, load and dump.")
    logger.debug(f"Input path: {args.input_path}")
    data_frame_directory = load_data_frame_from_directory(args.input_path)
    logger.debug(f"Shape of loaded DataFrame: {data_frame_directory.data.shape}")
    logger.debug(f"Output path: {args.output_path}")
    save_data_frame_to_directory(args.output_path, data_frame_directory.data)
    for i in range(1, size):
        data = data_frame_directory.data.shape
        logger.debug(f"Send shape to rank {i}")
        comm.send(data, dest=i, tag=i)
parser.add_argument("--Learning_rate", type=float,
                    help="Boosting learning rate.")
parser.add_argument("--Max_depth", type=int,
                    help="Maximum tree depth for base learners.")
parser.add_argument("--Model_FileName", type=str,
                    help="Name of the model file.")
parser.add_argument("--Model_Path", type=str,
                    help="Path to store XGBoost model file in Json format.")

args = parser.parse_args()

## Load data from DataFrameDirectory to Pandas DataFrame
training_df = load_data_frame_from_directory(args.Training_Data).data

## Prepare training data
# All columns except the label column (flag spelled "Lable_Col", sic) are
# treated as features.
training_df_features = training_df[[
    c for c in training_df.columns if c != args.Lable_Col
]]
training_df_lable = training_df[args.Lable_Col]

## Training
# NOTE(review): objective 'reg:linear' is deprecated in modern XGBoost in
# favor of 'reg:squarederror'; confirm the pinned xgboost version before
# changing, since the alias affects only the warning, not the loss.
xg_reg = xgb.XGBRegressor(objective='reg:linear',
                          colsample_bytree=0.3,
                          alpha=10,
                          n_estimators=10,
                          learning_rate=args.Learning_rate,
                          max_depth=args.Max_depth)
parser.add_argument(
    "--relevancy-method",
    type=str,
    help="method for determining relevancy ['top_k', 'by_threshold'].",
)
parser.add_argument("--k", type=int, help="number of top k items per user.")
parser.add_argument("--threshold", type=float,
                    help="threshold of top items per user.")
parser.add_argument("--score-result", help="Result of the computation.")
args, _ = parser.parse_known_args()

# Ground-truth and predicted ratings arrive as DataFrameDirectory inputs.
rating_true = load_data_frame_from_directory(args.rating_true).data
rating_pred = load_data_frame_from_directory(args.rating_pred).data

col_user = args.col_user
col_item = args.col_item
col_rating = args.col_rating
col_prediction = args.col_prediction
relevancy_method = args.relevancy_method
k = args.k
threshold = args.threshold

# Echo the column-mapping parameters for traceability.
logger.debug(f"Received parameters:")
logger.debug(f"User: {col_user}")
logger.debug(f"Item: {col_item}")
logger.debug(f"Rating: {col_rating}")
logger.debug(f"Prediction: {col_prediction}")
# Merge step: concatenate the cleansed green and yellow taxi datasets.
import argparse
import os

import pandas as pd

from azureml.studio.core.io.data_frame_directory import load_data_frame_from_directory, save_data_frame_to_directory

print("Merge Green and Yellow taxi data")

parser = argparse.ArgumentParser("merge")
parser.add_argument("--cleansed_green_data", type=str,
                    help="cleansed green data")
parser.add_argument("--cleansed_yellow_data", type=str,
                    help="cleansed yellow data")
parser.add_argument("--output_merge", type=str,
                    help="green and yellow taxi data merged")

args = parser.parse_args()

green_df = load_data_frame_from_directory(args.cleansed_green_data).data
yellow_df = load_data_frame_from_directory(args.cleansed_yellow_data).data

print("Argument (output merge taxi data path): %s" % args.output_merge)

# BUG FIX: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat with ignore_index=True is the supported equivalent.
combined_df = pd.concat([green_df, yellow_df], ignore_index=True)
combined_df.reset_index(inplace=True, drop=True)

if args.output_merge is not None:
    os.makedirs(args.output_merge, exist_ok=True)
    print("%s created" % args.output_merge)

save_data_frame_to_directory(args.output_merge, combined_df)