def main(args):
    ################ CONFIGURATIONS #################
    source_file = os.path.join(args.data_path, args.from_file_name)
    # Strip any leading directories and the extension to get the bare source file name.
    source_root_file_name = args.from_file_name.rpartition(os.path.sep)[-1].rpartition('.')[0]
    ################ ALGOS #################

    """
    ******************************************************************************************************************
    START: PARSING FILE
    ******************************************************************************************************************
    """
    tokenized_questions, tokenized_paragraphs, questions_nontokenized, paragraphs_nontokenized = \
        UTIL.prepare_squad_objects(source_file, source_root_file_name)
    """
    ******************************************************************************************************************
    END: PARSING FILE
    ******************************************************************************************************************
    """

    """
    ******************************************************************************************************************
    START: SLIDING WINDOW
    ******************************************************************************************************************
    """
    get_slideed_tokenizations_and_dump(tokenized_questions,
                                       UTIL.create_dir(os.path.join(args.data_path, 'questions_windowed')),
                                       args.truncate_length,
                                       args.window_length)
    get_slideed_tokenizations_and_dump(tokenized_paragraphs,
                                       UTIL.create_dir(os.path.join(args.data_path, 'paragraphs_windowed')),
                                       args.truncate_length,
                                       args.window_length)
    """
    ******************************************************************************************************************
    END: SLIDING WINDOW
    ******************************************************************************************************************
    """
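
# A minimal, hypothetical entry point for the sliding-window script above. The argument names
# mirror the attributes main() reads (data_path, from_file_name, truncate_length, window_length);
# the defaults below are placeholders, not values taken from the project.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Window SQuAD questions/paragraphs and dump the tokenizations.')
    parser.add_argument('--data_path', required=True, help='Folder holding the source file; windowed outputs are written here.')
    parser.add_argument('--from_file_name', required=True, help='Name of the SQuAD-style source file inside data_path.')
    parser.add_argument('--truncate_length', type=int, default=512, help='Assumed maximum token length before windowing.')
    parser.add_argument('--window_length', type=int, default=128, help='Assumed number of tokens per sliding window.')
    main(parser.parse_args())
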
def main(args):
    # Split the pre-computed question/paragraph embeddings and labels into partitioned
    # train/test dumps under <embedding_path>/splitted_train_test.
    path = UTIL.create_dir(os.path.join(args.embedding_path, 'splitted_train_test'))

    test_question_embeddings, test_paragraph_embeddings, test_labels = load_data(args.embedding_path,
                                                                                 args.label_path,
                                                                                 'test')
    dump_splitted_train_test(test_question_embeddings,
                             test_paragraph_embeddings,
                             test_labels,
                             'test',
                             path,
                             args.partition_size)
    print('Test data is ready')

    train_question_embeddings, train_paragraph_embeddings, train_labels = load_data(args.embedding_path,
                                                                                    args.label_path,
                                                                                    'train')
    dump_splitted_train_test(train_question_embeddings,
                             train_paragraph_embeddings,
                             train_labels,
                             'train',
                             path,
                             args.partition_size)
    print('Train data is ready')
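
# Hypothetical invocation sketch: the splitter above only reads args.embedding_path,
# args.label_path and args.partition_size, so a simple namespace with those three fields
# is enough. The paths and partition size below are placeholders, not project values.
if __name__ == '__main__':
    from argparse import Namespace

    main(Namespace(embedding_path='data/embeddings',
                   label_path='data/labels',
                   partition_size=100000))
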
"batch_question": 250, "batch_paragraph": 20, } resource = titanX is_dump_during_execution = False is_inject_idf = True is_filtered_by_answers_from_rnet = False ################ CONFIGURATIONS ################# _basepath = os.path.abspath(__file__).rpartition(os.sep)[0] datadir = os.path.join(_basepath, dataset_type) paragraphs_dir = UTIL.create_dir(os.path.join(datadir, 'ELMO', 'paragraphs')) questions_dir = UTIL.create_dir(os.path.join(datadir, 'ELMO', 'questions')) _paragraphs_file_name = '{}_paragraphs.txt' paragraphs_file = os.path.join(paragraphs_dir, _paragraphs_file_name) _questions_file_name = '{}_questions.txt' questions_file = os.path.join(questions_dir, _questions_file_name) _mapping_file_name = '{}_q_to_p_mappings.csv' mapping_file = os.path.join(questions_dir, _mapping_file_name) _paragraph_embeddings_file_name = '{}[email protected]'.format( dataset_type) paragraph_embedding_file = os.path.join(paragraphs_dir, _paragraph_embeddings_file_name)
def cluster_helper(role, sagemaker_session, bucket, local_data_folder, prefix, ticker):
    A_df = pd.read_pickle(local_data_folder + ticker + '.pkl')
    A_df.dropna(inplace=True)
    A_df.drop(columns=["Date"], inplace=True)

    # Normalize features and label to [0, 1]
    scaler = MinMaxScaler()
    Y_df = pd.DataFrame(A_df["Label"]).astype('float64')
    X_df = A_df.drop(columns=["Label"]).astype('float64')
    X = scaler.fit_transform(X_df)
    Y = scaler.fit_transform(Y_df)

    # Split data
    print("Splitting data")
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=.33, random_state=1, shuffle=True)

    # Clustering
    s3_output_folder = "s3://{}/{}/output".format(bucket, prefix)
    print("Clustering")
    kmeans = KMeans(role=role,
                    train_instance_count=1,
                    train_instance_type="ml.m4.xlarge",
                    output_path=s3_output_folder,
                    k=3)
    kmeans.fit(kmeans.record_set(pd.DataFrame(x_train).astype('float32').values))

    # Deploy
    print("Deploying model", kmeans.model_data)
    kmeans_predictor = kmeans.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge")

    create_dir('{}s3/{}'.format(local_data_folder, ticker))

    # Attach the predicted cluster to each training row and upload the train data to S3
    dataset_with_cluster = pd.concat([pd.DataFrame(y_train, columns=["label"]).astype("float32"),
                                      pd.DataFrame(x_train).astype("float32"),
                                      clustering(x_train, kmeans_predictor)], axis=1)
    dataset_with_cluster.to_csv('{}s3/{}/all-train.csv'.format(local_data_folder, ticker), header=False, index=False)

    # Prepare per-cluster training data sets
    create_dir('{}s3/{}/train'.format(local_data_folder, ticker))
    save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 0], "{}/train/cluster-0".format(ticker), True, local_data_folder)
    save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 1], "{}/train/cluster-1".format(ticker), True, local_data_folder)
    save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 2], "{}/train/cluster-2".format(ticker), True, local_data_folder)

    # Predict the cluster for each test row as well, so the assignments can be reused when
    # evaluating the next model.
    dataset_with_cluster = pd.concat([pd.DataFrame(y_test, columns=["label"]).astype("float32"),
                                      pd.DataFrame(x_test).astype("float32"),
                                      clustering(x_test, kmeans_predictor)], axis=1)
    dataset_with_cluster.to_csv('{}s3/{}/all-test.csv'.format(local_data_folder, ticker), header=False, index=False)

    # # prepare per-cluster test data sets
    # create_dir('{}s3/{}/test'.format(local_data_folder, ticker))
    # save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 0], "{}/test/cluster-0".format(ticker), False, local_data_folder)
    # save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 1], "{}/test/cluster-1".format(ticker), False, local_data_folder)
    # save_data(dataset_with_cluster[dataset_with_cluster["cat"] == 2], "{}/test/cluster-2".format(ticker), False, local_data_folder)

    # Delete the endpoint so it does not keep accruing cost
    kmeans_predictor.delete_endpoint(kmeans_predictor.endpoint)
    print('Completed clustering for', ticker)
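
# Hypothetical driver for cluster_helper(); the bucket, prefix and ticker values are
# placeholders, not project values. It assumes the surrounding script already imports
# sagemaker and that it runs where an execution role is attached (e.g. a SageMaker
# notebook instance).
if __name__ == '__main__':
    import sagemaker

    session = sagemaker.Session()
    cluster_helper(role=sagemaker.get_execution_role(),
                   sagemaker_session=session,
                   bucket=session.default_bucket(),
                   local_data_folder='data/',
                   prefix='stock-clustering',
                   ticker='AAPL')
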
import os
import sys

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
import helper.utils as UTIL

TRAIN = 'train'
DEV = 'dev'

################ CONFIGURATIONS #################
dataset_type = DEV
is_dump_during_execution = False
is_inject_idf = False
is_tf_style = False
################ CONFIGURATIONS #################
_basepath = os.path.abspath(__file__).rpartition(os.sep)[0]
datadir = os.path.join(_basepath, dataset_type)
pre_trained_dir = UTIL.create_dir(os.path.join(_basepath, 'GLOVE', 'data'))

paragraphs_dir = UTIL.create_dir(os.path.join(datadir, 'GLOVE', 'paragraphs'))
questions_dir = UTIL.create_dir(os.path.join(datadir, 'GLOVE', 'questions'))

_paragraphs_file_name = '{}_paragraphs.txt'
paragraphs_file = os.path.join(paragraphs_dir, _paragraphs_file_name)

_questions_file_name = '{}_questions.txt'
questions_file = os.path.join(questions_dir, _questions_file_name)

_mapping_file_name = '{}_q_to_p_mappings.csv'
mapping_file = os.path.join(questions_dir, _mapping_file_name)

_paragraph_embeddings_file_name = '{}[email protected]'.format(dataset_type)
paragraph_embedding_file = os.path.join(paragraphs_dir, _paragraph_embeddings_file_name)