# Train two D2V estimators on security-labelled rows only:
#   * one on the security rows of the training split,
#   * one on the security rows of the training and test splits combined,
# and save both underlying models under datasets/rq3_d2v/.
import pandas as pd
import sanalytics.estimators.d2vestimator as sed
import logging

# Logging is configured before the remaining imports, as in the original
# statement order.
logging.basicConfig(level=logging.DEBUG)

from time import time
from progressbar import progressbar
import sanalytics.evaluation.evaluation_metric as see
import sanalytics.algorithms.utils as sau

## Import Data
X_train = pd.read_parquet("datasets/rq3_data/sec1.0_train.parquet")
X_test = pd.read_parquet("datasets/rq3_data/sec1.0_test.parquet")
X_all = pd.concat([X_train, X_test], sort=False)

# Keep only the rows whose label is "security".
X_train_90 = X_train[X_train.label == "security"]
X_train_100 = X_all[X_all.label == "security"]

## Train D2V
d2v_90 = sed.D2VEstimator().fit(X_train_90)
# Fit on the combined train+test security rows. Previously this was fitted on
# X_train_90 again, which made the "all" model a duplicate of the train-only
# model and left X_train_100 unused.
d2v_100 = sed.D2VEstimator().fit(X_train_100)

d2v_90.model.save("datasets/rq3_d2v/sec1.0_posonly.model")
d2v_100.model.save("datasets/rq3_d2v/sec1.0_all_posonly.model")
# Train a D2V estimator on the training split named by the first command-line
# argument, save the model, and record the wall-clock training time as a CSV.
#
# Usage: <script> <filename>
#   reads   datasets/rq3_data/<filename>_train.parquet
#   writes  datasets/rq3_d2v/<filename>.model
#   writes  outputcsvs/d2v_training_times/<filename>.csv
import sys  # required for sys.argv below; it was missing and caused a NameError

import pandas as pd
import sanalytics.estimators.d2vestimator as sed
import logging

# Logging is configured before the remaining import, as in the original
# statement order.
logging.basicConfig(level=logging.DEBUG)

from time import time

## Read the training split named on the command line
filename = sys.argv[1]
X = pd.read_parquet("datasets/rq3_data/{}_train.parquet".format(filename))
print(filename)
print(len(X))

## Train Doc2Vec for each split
start = time()
d2v = sed.D2VEstimator().fit(X)
end = time()

model = d2v.model
model.save("datasets/rq3_d2v/{}.model".format(filename))

# One-row table: which set was trained and how many seconds fit() took.
pd.DataFrame(
    [["{}".format(filename), end - start]],
    columns=["set", "training_time"],
).to_csv(
    "outputcsvs/d2v_training_times/{}.csv".format(filename),
    index=False,
)

## Read the training and test splits together
# NOTE(review): in the visible portion the combined frame is only printed;
# the script may continue beyond this view. Confirm before relying on it.
X = pd.concat([
    pd.read_parquet("datasets/rq3_data/{}_train.parquet".format(filename)),
    pd.read_parquet("datasets/rq3_data/{}_test.parquet".format(filename)),
])
print(filename)
print(len(X))
# Train a D2V estimator on the X100 model-selection data. The first
# command-line argument is an integer selecting the job:
#   1 -> X100 train + test parquet files
#   2 -> X100 train parquet file only
import sys  # required for sys.argv below; it was missing and caused a NameError

import dask.dataframe as dd
import sanalytics.estimators.d2vestimator as sed
import logging

# Logging is configured before the remaining imports, as in the original
# statement order.
logging.basicConfig(level=logging.DEBUG)

from time import time
import pandas as pd

# Parse the job selector once; a non-integer argument raises ValueError.
job = int(sys.argv[1])

if job == 1:
    filename = "X100_train.parquet+X100_test.parquet"
    files = [
        "datasets/model_selection_CV/{}".format(i) for i in filename.split("+")
    ]
    # generated using analysis/job_array_processing
    # Missing values are replaced with empty strings before materialising the
    # dask frame into pandas.
    df = dd.read_parquet(files).fillna('').compute()

    start = time()
    d2vest = sed.D2VEstimator().fit(df)
    end = time()
    d2vest.model.save("datasets/rq3_d2v/sec1.0R100_all.model")

    # Reuse `filename` as the set name for the timing record.
    filename = "sec1.0R100_all"
    pd.DataFrame(
        [["{}".format(filename), end - start]],
        columns=["set", "training_time"],
    ).to_csv(
        "outputcsvs/d2v_training_times/{}.csv".format(filename),
        index=False,
    )

if job == 2:
    filename = "X100_train.parquet"
    files = [
        "datasets/model_selection_CV/{}".format(i) for i in filename.split("+")
    ]
    # generated using analysis/job_array_processing
    # NOTE(review): in the visible portion this branch only loads the data;
    # the training/saving steps may follow beyond this view. Confirm.
    df = dd.read_parquet(files).fillna('').compute()