Example #1
    def test_no_missing(self):

        config = deepLegisConfig('distilbert_128.json')
        df, encoder = createDeepLegisDataFrame(config)
        for col in df.columns:
            print("testing no missing in:" + col)
            self.assertEqual(sum(df[col].isna()), 0)
Example #2
    def test_all_configs(self):

        print("Testing all configs in ./src/configs/.")
        models_to_test = os.listdir('src/configs/')
        for file in models_to_test:

            config = deepLegisConfig(file)
Example #3
def run_model(json_file):
    """
    """

    config = deepLegisConfig(json_file)

    # The class code is in the config, specified by the json
    deep_legis_model = config.model_class(config)

    print("Import and process the dataset")
    deep_legis_model.load_data()

    pp.pprint(vars(config))

    print("Build the model and show the strucutre.")
    deep_legis_model.build()
    deep_legis_model.deep_legis_model.summary()

    print("Train the model!")
    deep_legis_model.train()

    print("Save the model.")
    deep_legis_model.deep_legis_model.save(config.model_location)

    print("Evaluation on the Test set:")
    deep_legis_model.evaluate()

    print("Cache predictions on all observations for later use.")
    deep_legis_model.full_dataset_prediction()
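
A minimal usage sketch (the __main__ guard is an assumption; the config name is taken from the other examples):

if __name__ == "__main__":
    # Any JSON file under src/configs/ can be passed as the argument.
    run_model("distilbert_128.json")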
Example #4
def test_model(json_file):
    """
    Run a model on 1/1000th of the data for a single epoch (smoke test)
    """

    config = deepLegisConfig(json_file)
    config.epochs = 1

    # The class code is in the config, specified by the json
    deep_legis_model = config.model_class(config)

    print("Import and process the dataset")
    deep_legis_model.load_data(reduce_by_factor=1000)

    pp.pprint(vars(config))

    print("Build the model and show the strucutre.")
    deep_legis_model.build()
    deep_legis_model.deep_legis_model.summary()

    print("Train the model!")
    deep_legis_model.train()

    print("Evaluation on the Test set:")
    deep_legis_model.evaluate()

    deep_legis_model.deep_legis_model.save(config.model_location)

    print("Cache predictions on all observations for later use.")
    deep_legis_model.full_dataset_prediction()
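
A hypothetical driver that smoke-tests every config, mirroring the directory scan in Example #2:

import os

for file in os.listdir('src/configs/'):
    # One cheap epoch per config catches wiring errors early.
    test_model(file)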
Example #5
def create_pretokenized_dataset():

    logger = logging.getLogger(__name__)
    logger.info('making final data set from raw data')

    # Use every core on the machine.
    pandarallel.initialize(use_memory_fs=False)

    config = deepLegisConfig("bert_128.json")

    # Create a dataframe out of the ml_data.csv by adding the text to it.
    df, _ = createDeepLegisDataFrame(config, read_cached=False)

    # Take the text and tokenize it into the final product the model wants to see.
    tokenizer = config.tokenizer

    def tokenizer_wrapper(text):
        d = tokenizer(text,
                      truncation=True,
                      padding='max_length',
                      max_length=config.max_length)
        return d['input_ids']

    tic = time.perf_counter()
    df['tokens'] = df.text.parallel_apply(tokenizer_wrapper)
    toc = time.perf_counter()

    logger.info(
        f"Tokenized in {(toc - tic)/60.0:0.2f} min ({toc - tic:0.4f} seconds)")

    print(df.head())

    # Save it for later use
    pickle_file = config.data_vol + "preprocessed_df_128.pkl"
    pickle.dump(df, open(pickle_file, "wb"))
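
The cached frame can later be reloaded with the standard pickle API; a sketch, with the path mirroring the one written above:

import pickle

pickle_file = config.data_vol + "preprocessed_df_128.pkl"
with open(pickle_file, "rb") as f:
    df = pickle.load(f)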
Example #6
    def test_loading_df(self):

        config = deepLegisConfig('distilbert_128.json')

        df, encoder = createDeepLegisDataFrame(config, reduce_by_factor=1000)
        cols = df.columns
        #print(df.head())
        expected_columns = ['passed', 'signed', 'text', 'id', 'version_number',
                            'bill_id', 'partisan_lean', 'sc_id_cat', 'sc_id']
        for col in cols:
            self.assertIn(col, expected_columns)
Example #7
    def test_text_only_batches(self):

        config = deepLegisConfig('distilbert_128.json')
        text_only_dataset = legislationDatasetText(config)
        df, encoder = createDeepLegisDataFrame(config, reduce_by_factor=1000)

        train_data, val_data, test_data, full_data, split_data = \
            text_only_dataset.create_batch_stream(df)

        for elem in train_data.take(1):
            x, y = elem
            self.assertEqual(x['input_ids'].shape,
                             (config.batch_size, config.max_length))
            self.assertEqual(y.shape, (config.batch_size, ))
Example #8
    def test_no_text_batches(self):

        config = deepLegisConfig('no_text.json')
        all_dataset = legislationDatasetNoText(config)
        df, encoder = createDeepLegisDataFrame(config, reduce_by_factor=1000)
        all_dataset.config.n_sc_id_classes = len(encoder.classes_)

        train_data, val_data, test_data, full_data, split_data = \
            all_dataset.create_batch_stream(df)

        for elem in train_data.take(1):
            x, y = elem
            self.assertEqual(x['version_number'].shape, (config.batch_size, ))
            self.assertEqual(x['partisan_lean'].shape, (config.batch_size, ))
            self.assertEqual(x['sc_id'].shape,
                             (config.batch_size, len(encoder.classes_)))
            self.assertEqual(y.shape, (config.batch_size, ))
Example #9
    def test_rev_cat_batches(self):

        config = deepLegisConfig('distilbert_128.json')
        all_dataset = legislationDatasetRevCat(config)
        df, encoder = createDeepLegisDataFrame(config, reduce_by_factor=1000)
        all_dataset.config.n_sc_id_classes = len(encoder.classes_)

        train_data, val_data, test_data, full_data, split_data = \
            all_dataset.create_batch_stream(df)

        for elem in train_data.take(1):
            x, y = elem
            self.assertEqual(x['input_ids'].shape,
                             (config.batch_size, config.max_length))
            self.assertEqual(x['version_number'].shape, (config.batch_size, ))
            self.assertEqual(x['sc_id'].shape,
                             (config.batch_size, len(encoder.classes_)))
            self.assertEqual(y.shape, (config.batch_size, ))
Example #10
    def create_hidden_states(self, df):

        config = deepLegisConfig("distilbert_feature_extractor_128.json")
        deep_legis_model = config.model_class(config)

        # Batch the data
        deep_legis_model.batch_df(df,
                                  n_sc_id_classes=len(
                                      self.label_encoder.classes_),
                                  only_full=True)

        # Load the transformer
        #deep_legis_model.deep_legis_model = tf.keras.models.load_model('models/transformer_production')
        deep_legis_model.build()

        # Do prediction with the transformer on the full dataset.
        hidden_states = deep_legis_model.deep_legis_model.predict(
            deep_legis_model.full_batches)

        return pd.DataFrame(hidden_states)
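
One plausible way to consume the returned frame, assuming create_hidden_states is a method on the DeepLegisCatboost instance from Example #11 and that a positional join is appropriate (both are assumptions, not shown in the source):

import pandas as pd

hidden_df = prod_model.create_hidden_states(df)
# Assumed join: rows align positionally; append the hidden-state columns.
combined = pd.concat([df.reset_index(drop=True), hidden_df], axis=1)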
Example #11
#from src.models.deeplegis import *
from src.models.data_loader import createDeepLegisDataFrame
from src.models.configurationClasses import deepLegisConfig
from src.models.predict_model import DeepLegisCatboost

config = deepLegisConfig("distilbert_feature_extractor_128.json")
df, encoder = createDeepLegisDataFrame(config, read_cached=True)

print(df.head())
print(df.shape)
prod_model = DeepLegisCatboost()
prod_model.train_catboost(df)
Example #12
# Run script for no_text
import pprint
from src.models.deeplegis import *
from src.models.data_loader import *
from src.models.configurationClasses import deepLegisConfig

pp = pprint.PrettyPrinter()  # for the config

config = deepLegisConfig("no_text.json")
config.build_from_scratch = True
config.epochs = 1

# The class code is in the config, specified by the json
deep_legis_model = config.model_class(config)

print("Import and process the dataset")
deep_legis_model.load_data()

pp.pprint(vars(config))

print("Build the model and show the strucutre.")
deep_legis_model.build()
deep_legis_model.deep_legis_model.summary()

print("Train the model!")
deep_legis_model.train()

print("Evaluation on the Test set:")
deep_legis_model.evaluate()

print("Cache predictions on all observations for later use.")