def test_no_missing(self):
    config = deepLegisConfig('distilbert_128.json')
    df, encoder = createDeepLegisDataFrame(config)
    for col in df.columns:
        print("testing no missing in: " + col)
        self.assertEqual(sum(df[col].isna()), 0)
def test_all_configs(self):
    print("Testing all configs in ./src/configs/.")
    models_to_test = os.listdir('src/configs/')
    for file in models_to_test:
        # Every config file should parse into a valid config without raising.
        config = deepLegisConfig(file)
def run_model(json_file):
    """ Build, train, evaluate, and save the model specified by a JSON config. """
    config = deepLegisConfig(json_file)

    # The model class is specified in the config, selected by the json.
    deep_legis_model = config.model_class(config)

    print("Import and process the dataset")
    deep_legis_model.load_data()
    pp.pprint(vars(config))

    print("Build the model and show the structure.")
    deep_legis_model.build()
    deep_legis_model.deep_legis_model.summary()

    print("Train the model!")
    deep_legis_model.train()

    print("Save the model.")
    deep_legis_model.deep_legis_model.save(config.model_location)

    print("Evaluation on the Test set:")
    deep_legis_model.evaluate()

    print("Cache predictions on all observations for later use.")
    deep_legis_model.full_dataset_prediction()
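# Entry-point sketch (an assumption: the original driver is not shown here).
# "bert_128.json" is one of the configs referenced elsewhere in this repo.
if __name__ == "__main__":
    run_model("bert_128.json")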
def test_model(json_file):
    """ Smoke-test a model: 1/1000th of the data, a single epoch. """
    config = deepLegisConfig(json_file)
    config.epochs = 1

    # The model class is specified in the config, selected by the json.
    deep_legis_model = config.model_class(config)

    print("Import and process the dataset")
    deep_legis_model.load_data(reduce_by_factor=1000)
    pp.pprint(vars(config))

    print("Build the model and show the structure.")
    deep_legis_model.build()
    deep_legis_model.deep_legis_model.summary()

    print("Train the model!")
    deep_legis_model.train()

    print("Evaluation on the Test set:")
    deep_legis_model.evaluate()
    deep_legis_model.deep_legis_model.save(config.model_location)

    print("Cache predictions on all observations for later use.")
    deep_legis_model.full_dataset_prediction()
def create_pretokenized_dataset():
    logger = logging.getLogger(__name__)
    logger.info('making final data set from raw data')

    # Use every core on the machine.
    pandarallel.initialize(use_memory_fs=False)

    config = deepLegisConfig("bert_128.json")

    # Create a dataframe from ml_data.csv and attach the bill text to it.
    df, _ = createDeepLegisDataFrame(config, read_cached=False)

    # Tokenize the text into the final form the model expects.
    tokenizer = config.tokenizer

    def tokenizer_wrapper(text):
        d = tokenizer(text, truncation=True, padding='max_length',
                      max_length=config.max_length)
        return d['input_ids']

    tic = time.perf_counter()
    df['tokens'] = df.text.parallel_apply(tokenizer_wrapper)
    toc = time.perf_counter()
    logger.info(f"Tokenized in {(toc - tic)/60.0:0.2f} min ({toc - tic:0.4f} seconds)")

    print(df.head())

    # Save it for later use.
    pickle_file = config.data_vol + "preprocessed_df_128.pkl"
    pickle.dump(df, open(pickle_file, "wb"))
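# Invocation sketch (an assumed entry point; the original is not shown):
# logging must be configured before the module logger's info messages appear.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    create_pretokenized_dataset()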
def test_loading_df(self):
    config = deepLegisConfig('distilbert_128.json')
    df, encoder = createDeepLegisDataFrame(config, reduce_by_factor=1000)
    cols = df.columns
    expected_columns = ['passed', 'signed', 'text', 'id', 'version_number',
                        'bill_id', 'partisan_lean', 'sc_id_cat', 'sc_id']
    for col in cols:
        self.assertIn(col, expected_columns)
def test_text_only_batches(self):
    config = deepLegisConfig('distilbert_128.json')
    text_only_dataset = legislationDatasetText(config)
    df, encoder = createDeepLegisDataFrame(config, reduce_by_factor=1000)

    train_data, val_data, test_data, full_data, split_data = \
        text_only_dataset.create_batch_stream(df)

    for elem in train_data.take(1):
        x, y = elem
        self.assertEqual(x['input_ids'].shape,
                         (config.batch_size, config.max_length))
        self.assertEqual(y.shape, (config.batch_size,))
def test_no_text_batches(self):
    config = deepLegisConfig('no_text.json')
    all_dataset = legislationDatasetNoText(config)
    df, encoder = createDeepLegisDataFrame(config, reduce_by_factor=1000)
    all_dataset.config.n_sc_id_classes = len(encoder.classes_)

    train_data, val_data, test_data, full_data, split_data = \
        all_dataset.create_batch_stream(df)

    for elem in train_data.take(1):
        x, y = elem
        self.assertEqual(x['version_number'].shape, (config.batch_size,))
        self.assertEqual(x['partisan_lean'].shape, (config.batch_size,))
        self.assertEqual(x['sc_id'].shape,
                         (config.batch_size, len(encoder.classes_)))
        self.assertEqual(y.shape, (config.batch_size,))
def test_rev_cat_batches(self):
    config = deepLegisConfig('distilbert_128.json')
    all_dataset = legislationDatasetRevCat(config)
    df, encoder = createDeepLegisDataFrame(config, reduce_by_factor=1000)
    all_dataset.config.n_sc_id_classes = len(encoder.classes_)

    train_data, val_data, test_data, full_data, split_data = \
        all_dataset.create_batch_stream(df)

    for elem in train_data.take(1):
        x, y = elem
        self.assertEqual(x['input_ids'].shape,
                         (config.batch_size, config.max_length))
        self.assertEqual(x['version_number'].shape, (config.batch_size,))
        self.assertEqual(x['sc_id'].shape,
                         (config.batch_size, len(encoder.classes_)))
        self.assertEqual(y.shape, (config.batch_size,))
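# Runner sketch for the tests above (an assumption: they are methods of a
# unittest.TestCase subclass, as the `self` parameter and assert* calls imply).
import unittest

if __name__ == '__main__':
    unittest.main()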
def create_hidden_states(self, df):
    config = deepLegisConfig("distilbert_feature_extractor_128.json")
    deep_legis_model = config.model_class(config)

    # Batch the data.
    deep_legis_model.batch_df(df,
                              n_sc_id_classes=len(self.label_encoder.classes_),
                              only_full=True)

    # Load the transformer.
    #deep_legis_model.deep_legis_model = tf.keras.models.load_model('models/transformer_production')
    deep_legis_model.build()

    # Run the transformer over the full dataset and keep its hidden states.
    hidden_states = deep_legis_model.deep_legis_model.predict(
        deep_legis_model.full_batches)

    return pd.DataFrame(hidden_states)
from src.models.data_loader import createDeepLegisDataFrame
from src.models.configurationClasses import deepLegisConfig
from src.models.predict_model import DeepLegisCatboost

config = deepLegisConfig("distilbert_feature_extractor_128.json")
df, encoder = createDeepLegisDataFrame(config, read_cached=True)
print(df.head())
print(df.shape)

prod_model = DeepLegisCatboost()
prod_model.train_catboost(df)
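# For context, a minimal sketch of the kind of fit train_catboost presumably
# performs: a CatBoost classifier over the metadata features (and, per
# create_hidden_states above, the transformer's hidden states). Everything
# below is an illustrative assumption, not that method's actual code; the
# feature/label columns are guesses based on the dataframe columns listed in
# the tests ('passed' assumed to be the label).
from catboost import CatBoostClassifier

features = df[['partisan_lean', 'version_number']]  # assumed feature subset
clf = CatBoostClassifier(iterations=500, verbose=100)
clf.fit(features, df['passed'])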
# Run script for no_text
import pprint

from src.models.deeplegis import *
from src.models.data_loader import *
from src.models.configurationClasses import deepLegisConfig

pp = pprint.PrettyPrinter()  # for printing the config

config = deepLegisConfig("no_text.json")
config.build_from_scratch = True
config.epochs = 1

# The model class is specified in the config, selected by the json.
deep_legis_model = config.model_class(config)

print("Import and process the dataset")
deep_legis_model.load_data()
pp.pprint(vars(config))

print("Build the model and show the structure.")
deep_legis_model.build()
deep_legis_model.deep_legis_model.summary()

print("Train the model!")
deep_legis_model.train()

print("Evaluation on the Test set:")
deep_legis_model.evaluate()

print("Cache predictions on all observations for later use.")
deep_legis_model.full_dataset_prediction()