def main(args):
    address_parser = AddressParser(model_type=args.model_type, device=0)

    train_container = PickleDatasetContainer(args.train_dataset_path)

    lr_scheduler = StepLR(step_size=20)

    address_parser.retrain(
        train_container,
        0.8,
        epochs=args.epochs,
        batch_size=args.batch_size,
        num_workers=6,
        learning_rate=args.learning_rate,
        callbacks=[lr_scheduler],
        logging_path=f"./checkpoints/{args.model_type}",
    )

    test_container = PickleDatasetContainer(args.test_dataset_path)

    checkpoint = "best"

    address_parser.test(
        test_container,
        batch_size=args.batch_size,
        num_workers=4,
        logging_path=f"./checkpoints/{args.model_type}",
        checkpoint=checkpoint,
    )
def main(args):
    address_parser = AddressParser(model_type=args.model_type, device=0)

    if args.mode in ("train", "both"):
        train_container = PickleDatasetContainer(args.train_dataset_path)

        lr_scheduler = StepLR(step_size=20)

        address_parser.retrain(
            train_container,
            0.8,
            epochs=100,
            batch_size=1024,
            num_workers=6,
            learning_rate=0.001,
            callbacks=[lr_scheduler],
            logging_path=f"./checkpoints/{args.model_type}",
        )

    if args.mode in ("test", "both"):
        test_container = PickleDatasetContainer(args.test_dataset_path)

        if args.mode == "test":
            checkpoint = handle_pre_trained_checkpoint(args.model_type)
        else:
            checkpoint = "best"

        address_parser.test(
            test_container,
            batch_size=2048,
            num_workers=4,
            logging_path=f"./checkpoints/{args.model_type}",
            checkpoint=checkpoint,
        )
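# A minimal sketch of the CLI wiring that the main(args) functions above assume. The
# argument names mirror the attributes main() reads (model_type, mode, train_dataset_path,
# test_dataset_path); the actual script's parser may define more options, such as the
# epochs, batch size, and learning rate used by the first variant.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("model_type", choices=["fasttext", "bpemb"])
    parser.add_argument("mode", choices=["train", "test", "both"])
    parser.add_argument("train_dataset_path")
    parser.add_argument("test_dataset_path")
    main(parser.parse_args())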
def test_givenAPickleDatasetContainer_whenGetOneItem_thenReturnTheCorrectItem(self):
    create_pickle_file(self.a_pickle_data_container_path)

    pickle_dataset_container = PickleDatasetContainer(self.a_pickle_data_container_path)

    # Check the first three data points.
    for idx in range(3):
        expected_address = base_string.format(idx)
        expected_tags_idx = a_tags_sequence

        actual_address, actual_tags_idx = pickle_dataset_container[idx]
        self.assertEqual(expected_address, actual_address)
        self.assertListEqual(expected_tags_idx, actual_tags_idx)
def setUpClass(cls):
    cls.temp_dir_obj = TemporaryDirectory()
    cls.a_data_saving_dir = os.path.join(cls.temp_dir_obj.name, "data")
    os.makedirs(cls.a_data_saving_dir, exist_ok=True)

    file_extension = "p"
    training_dataset_name = "sample_incomplete_data"
    test_dataset_name = "test_sample_data"
    download_from_url(training_dataset_name, cls.a_data_saving_dir, file_extension=file_extension)
    download_from_url(test_dataset_name, cls.a_data_saving_dir, file_extension=file_extension)

    cls.training_container = PickleDatasetContainer(
        os.path.join(cls.a_data_saving_dir, training_dataset_name + "." + file_extension)
    )
    cls.test_container = PickleDatasetContainer(
        os.path.join(cls.a_data_saving_dir, test_dataset_name + "." + file_extension)
    )

    cls.a_fasttext_model_type = "fasttext"
    cls.a_fasttext_light_model_type = "fasttext-light"
    cls.a_bpemb_model_type = "bpemb"

    cls.verbose = False

    # training constants
    cls.a_single_epoch = 1
    cls.a_three_epoch = 3
    cls.a_train_ratio = 0.8
    cls.a_batch_size = 128
    cls.a_number_of_workers = 2
    cls.a_learning_rate = 0.001

    cls.a_torch_device = torch.device("cuda:0")
    cls.a_cpu_device = "cpu"

    cls.a_zero_number_of_workers = 0

    cls.fasttext_local_path = os.path.join(CACHE_PATH, "fasttext.ckpt")
    cls.bpemb_local_path = os.path.join(CACHE_PATH, "bpemb.ckpt")

    cls.with_new_prediction_tags = {
        "ALastTag": 0,
        "ATag": 1,
        "AnotherTag": 2,
        "EOS": 3,
    }
def test_given_list_of_tuple_data_when_predict_container_raise_data_error(self):
    number_of_data_points = 4
    create_pickle_file(
        self.a_pickle_data_container_path,
        number_of_data_points=number_of_data_points,
        predict_container=False,
    )

    with self.assertRaises(DataError):
        PickleDatasetContainer(self.a_pickle_data_container_path, is_training_container=False)
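# For context, a sketch of the two pickle payload shapes involved in the test above:
# a training container expects a list of (address, tags) tuples, while a predict
# container expects bare address strings, so passing tuple data with
# is_training_container=False is what raises the DataError. The file names and the
# sample address are illustrative.
import pickle

training_data = [("350 rue des lilas o", ["StreetNumber", "StreetName", "StreetName", "StreetName", "Orientation"])]
predict_data = ["350 rue des lilas o"]

with open("a_training_dataset.p", "wb") as file:
    pickle.dump(training_data, file)  # valid for is_training_container=True
with open("a_predict_dataset.p", "wb") as file:
    pickle.dump(predict_data, file)  # valid for is_training_container=False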
def setUpClass(cls):
    super(AddressParserIntegrationTestNewTags, cls).setUpClass()

    file_extension = "p"
    training_dataset_name = "test_sample_data_new_prediction_tags"
    download_from_url(training_dataset_name, cls.a_data_saving_dir, file_extension=file_extension)

    cls.new_prediction_data_container = PickleDatasetContainer(
        os.path.join(cls.a_data_saving_dir, training_dataset_name + "." + file_extension)
    )
def test_givenAPickleDatasetContainer_whenInstantiate_thenDataIsPickleContent(self):
    # Recreating the pickle file with a different number of data points should be
    # reflected in the container's length.
    for number_of_data_points in (4, 5):
        create_pickle_file(self.a_pickle_data_container_path, number_of_data_points=number_of_data_points)

        pickle_dataset_container = PickleDatasetContainer(self.a_pickle_data_container_path)

        expected = number_of_data_points
        self.assertEqual(expected, len(pickle_dataset_container))
def test_integration_predict_container(self):
    for number_of_data_points in (4, 5):
        create_pickle_file(
            self.a_pickle_data_container_path,
            number_of_data_points=number_of_data_points,
            predict_container=True,
        )

        pickle_dataset_container = PickleDatasetContainer(
            self.a_pickle_data_container_path, is_training_container=False
        )

        expected = number_of_data_points
        self.assertEqual(expected, len(pickle_dataset_container))
def test_integration(self):
    for number_of_data_points in (4, 5):
        create_pickle_file(
            self.a_pickle_data_container_path,
            number_of_data_points=number_of_data_points,
        )

        pickle_dataset_container = PickleDatasetContainer(self.a_pickle_data_container_path)

        expected = number_of_data_points
        self.assertEqual(expected, len(pickle_dataset_container))
def setUpClass(cls):
    cls.a_data_saving_dir = "./data"
    file_extension = "p"
    training_dataset_name = "sample_incomplete_data"
    test_dataset_name = "test_sample_data"
    download_from_url(training_dataset_name, cls.a_data_saving_dir, file_extension=file_extension)
    download_from_url(test_dataset_name, cls.a_data_saving_dir, file_extension=file_extension)

    cls.training_container = PickleDatasetContainer(
        os.path.join(cls.a_data_saving_dir, training_dataset_name + "." + file_extension)
    )
    cls.test_container = PickleDatasetContainer(
        os.path.join(cls.a_data_saving_dir, test_dataset_name + "." + file_extension)
    )

    cls.a_fasttext_model_type = "fasttext"
    cls.a_fasttext_light_model_type = "fasttext-light"
    cls.a_bpemb_model_type = "bpemb"

    cls.verbose = False

    # training constants
    cls.a_single_epoch = 1
    cls.a_three_epoch = 3
    cls.a_train_ratio = 0.8
    cls.a_batch_size = 128
    cls.a_number_of_workers = 2
    cls.a_learning_rate = 0.001

    cls.a_checkpoints_saving_dir = "./checkpoints"

    cls.fasttext_local_path = os.path.join(CACHE_PATH, "fasttext.ckpt")
    cls.bpemb_local_path = os.path.join(CACHE_PATH, "bpemb.ckpt")
def test_givenAPickleDatasetContainer_whenGetSlice_thenReturnTheCorrectItems(self):
    create_pickle_file(self.a_pickle_data_container_path)

    pickle_dataset_container = PickleDatasetContainer(self.a_pickle_data_container_path)

    # first and second data points
    expected = [np.array(range(0, 10)), np.array(range(10, 20))]
    actual = pickle_dataset_container[0:2]
    self.assertListOfArraysEqual(expected, actual)

    # third and fourth data points
    expected = [np.array(range(20, 30)), np.array(range(30, 40))]
    actual = pickle_dataset_container[2:4]
    self.assertListOfArraysEqual(expected, actual)
def test_givenAPickleDatasetContainer_whenGetOneItem_thenReturnTheCorrectItem(self):
    create_pickle_file(self.a_pickle_data_container_path)

    pickle_dataset_container = PickleDatasetContainer(self.a_pickle_data_container_path)

    # first data point
    expected = list(range(0, 10))
    actual = pickle_dataset_container[0]
    self.assertEqual(expected, actual.tolist())

    # second data point
    expected = list(range(10, 20))
    actual = pickle_dataset_container[1]
    self.assertEqual(expected, actual.tolist())

    # third data point
    expected = list(range(20, 30))
    actual = pickle_dataset_container[2]
    self.assertEqual(expected, actual.tolist())
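# A sketch of the create_pickle_file fixture the two tests above imply: data point
# idx is a NumPy array of the ten integers [idx * 10, (idx + 1) * 10). This is an
# inference from the assertions, not the repository's actual fixture.
import pickle

import numpy as np

def create_pickle_file(path, number_of_data_points=4):
    data = [np.array(range(idx * 10, (idx + 1) * 10)) for idx in range(number_of_data_points)]
    with open(path, "wb") as file:
        pickle.dump(data, file)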
def test_on_country_data(address_parser: AddressParser, file: str, directory_path: str, args) -> tuple:
    """
    Compute the results over a country's data.
    """
    country = pycountry.countries.get(alpha_2=file.replace(".p", "").upper()).name
    country = clean_up_name(country)
    print(f"Testing on the test file for {country}")

    test_file_path = os.path.join(directory_path, file)
    test_container = PickleDatasetContainer(test_file_path)

    results = address_parser.test(
        test_container,
        batch_size=4096,
        num_workers=4,
        logging_path=f"./checkpoints/{args.model_type}",
        checkpoint=args.model_path,
    )
    return results, country
def test_on_country_data(address_parser: AddressParser, file: str, directory_path: str, args) -> tuple:
    """
    Compute the results over a country's data.
    """
    country = convert_2_letters_name_into_country_name(file)
    print(f"Testing on the test file for {country}")

    test_file_path = os.path.join(directory_path, file)
    test_container = PickleDatasetContainer(test_file_path, is_training_container=False)

    results = address_parser.test(
        test_container,
        batch_size=args.batch_size,
        num_workers=4,
        logging_path=f"./checkpoints/{args.model_type}",
        checkpoint=args.model_path,
    )
    return results, country
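# A sketch of the convert_2_letters_name_into_country_name helper assumed above; it is
# not shown in this section, but the earlier variant of test_on_country_data inlined
# the same pycountry lookup plus the clean_up_name post-processing, so this mirrors
# that logic.
import pycountry

def convert_2_letters_name_into_country_name(file: str) -> str:
    # Map an "<alpha-2>.p" file name (e.g. "ca.p") to a country name (e.g. "Canada").
    country = pycountry.countries.get(alpha_2=file.replace(".p", "").upper()).name
    return clean_up_name(country)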
def test_givenAPickleDatasetContainer_whenGetSlice_thenReturnTheCorrectItems(self):
    create_pickle_file(self.a_pickle_data_container_path)

    pickle_dataset_container = PickleDatasetContainer(self.a_pickle_data_container_path)

    # Check the first two slices of two data points each.
    for start_idx, end_idx in ((0, 2), (2, 4)):
        expected_addresses = [base_string.format(idx) for idx in range(start_idx, end_idx)]
        expected_tags_idxs = [a_tags_sequence] * (end_idx - start_idx)

        sliced_addresses = pickle_dataset_container[start_idx:end_idx]
        self.assertIsInstance(sliced_addresses, list)

        for actual_address_tuple, expected_address, expected_tags_idx in zip(
            sliced_addresses, expected_addresses, expected_tags_idxs
        ):
            actual_address, actual_tags_idx = actual_address_tuple[0], actual_address_tuple[1]
            self.assertEqual(expected_address, actual_address)
            self.assertListEqual(expected_tags_idx, actual_tags_idx)
def setUpClass(cls):
    cls.an_address_to_parse = "350 rue des lilas o"

    cls.temp_dir_obj = TemporaryDirectory()
    cls.a_data_saving_dir = os.path.join(cls.temp_dir_obj.name, "data")
    os.makedirs(cls.a_data_saving_dir, exist_ok=True)

    file_extension = "p"
    training_dataset_name = "sample_incomplete_data"
    download_from_url(training_dataset_name, cls.a_data_saving_dir, file_extension=file_extension)

    cls.training_container = PickleDatasetContainer(
        os.path.join(cls.a_data_saving_dir, training_dataset_name + "." + file_extension)
    )

    cls.a_fasttext_model_type = "fasttext"
    cls.a_bpemb_model_type = "bpemb"

    cls.verbose = False

    # training constants
    cls.a_single_epoch = 1
    cls.a_train_ratio = 0.8
    cls.a_batch_size = 128
    cls.a_number_of_workers = 2
    cls.a_learning_rate = 0.001

    cls.a_torch_device = torch.device("cuda:0")
    cls.a_cpu_device = "cpu"

    cls.seq2seq_params = {"encoder_hidden_size": 512, "decoder_hidden_size": 512}

    cls.retrain_file_name_format = "retrained_{}_address_parser"
import os

from deepparse import download_from_url
from deepparse.dataset_container import PickleDatasetContainer
from deepparse.parser import AddressParser

# Here is an example of how to parse multiple addresses.

# First, let's download the data to parse from the public repository.
saving_dir = "./data"
file_extension = "p"
test_dataset_name = "predict"
download_from_url(test_dataset_name, saving_dir, file_extension=file_extension)

# Now let's load the dataset using one of our dataset containers.
addresses_to_parse = PickleDatasetContainer("./data/predict.p", is_training_container=False)

# Let's download a BPEmb retrained model created just for this example, but you can also use one of yours.
retrained_model_name = "retrained_light_bpemb_address_parser"
model_file_extension = "ckpt"
download_from_url(retrained_model_name, saving_dir, file_extension=model_file_extension)

address_parser = AddressParser(
    model_type="bpemb",
    device=0,
    path_to_retrained_model=os.path.join(saving_dir, retrained_model_name + "." + model_file_extension),
)

# We can now parse some addresses.
parsed_addresses = address_parser(addresses_to_parse[0:300])
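# A small follow-up to the example above: each element the parser returns is a
# FormattedParsedAddress. Printing one shows the address with its parsed tags, and
# individual components can be read by tag name, per the deepparse documentation.
first_parsed_address = parsed_addresses[0]
print(first_parsed_address)
print(first_parsed_address.StreetNumber)  # a single component of the address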
def main(args=None) -> None:  # pylint: disable=too-many-locals, too-many-branches
    """
    CLI function to rapidly parse an address dataset and output it in another file.

    Examples of usage:

    .. code-block:: sh

        parse fasttext ./dataset_path.p parsed_address.pickle

    Using a GPU device

    .. code-block:: sh

        parse fasttext ./dataset_path.p parsed_address.pickle --device 0

    Using a retrained model

    .. code-block:: sh

        parse fasttext ./dataset.csv parsed_address.pickle --path_to_retrained_model ./path

    """
    if args is None:  # pragma: no cover
        args = sys.argv[1:]

    parsed_args = get_args(args)

    dataset_path = parsed_args.dataset_path
    csv_column_separator = parsed_args.csv_column_separator
    if is_csv_path(dataset_path):
        csv_column_name = parsed_args.csv_column_name
        if csv_column_name is None:
            raise ValueError(
                "For a CSV dataset path, you need to specify the 'csv_column_name' argument to provide the"
                " column name to extract the addresses."
            )
        addresses_to_parse = CSVDatasetContainer(
            dataset_path,
            column_names=[csv_column_name],
            separator=csv_column_separator,
            is_training_container=False,
        )
    elif is_pickle_path(dataset_path):
        addresses_to_parse = PickleDatasetContainer(dataset_path, is_training_container=False)
    else:
        raise ValueError("The dataset path argument is not a CSV or pickle file.")

    export_filename = parsed_args.export_filename
    export_path = generate_export_path(dataset_path, export_filename)

    if is_csv_path(export_filename):
        export_fn = partial(to_csv, export_path=export_path, sep=csv_column_separator)
    elif is_pickle_path(export_filename):
        export_fn = partial(to_pickle, export_path=export_path)
    elif is_json_path(export_filename):
        export_fn = partial(to_json, export_path=export_path)
    else:
        raise ValueError("We do not support this type of export.")

    parsing_model = parsed_args.parsing_model
    device = parsed_args.device
    path_to_retrained_model = parsed_args.path_to_retrained_model

    if "cpu" not in device:
        device = int(device)
    parser_args = {"device": device}

    if "-attention" in parsing_model:
        parser_args.update({"attention_mechanism": True})
        parsing_model = parsing_model.replace("-attention", "")
    parser_args.update({"model_type": parsing_model})

    if path_to_retrained_model is not None:
        parser_args.update({"path_to_retrained_model": path_to_retrained_model})

    address_parser = AddressParser(**parser_args)

    if parsed_args.log:
        logging_export_path = replace_path_extension(export_path, ".log")
        logging.basicConfig(
            filename=logging_export_path,
            format="%(asctime)s : %(levelname)s : %(message)s",
            level=logging.INFO,
        )

        text_to_log = f"Parsing dataset file {dataset_path} using the parser {address_parser}"
        logging.info(text_to_log)

    parsed_address = address_parser(addresses_to_parse)

    export_fn(parsed_address)

    print(f"{len(addresses_to_parse)} addresses have been parsed.")

    if parsed_args.log:
        text_to_log = (
            f"{len(addresses_to_parse)} addresses have been parsed.\n"
            f"The parsed addresses are outputted here: {export_path}"
        )
        logging.info(text_to_log)
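# A usage sketch for the CLI above, inferred from its branches rather than taken from
# the project's docs: a pickle dataset parses without the CSV-specific flags, the
# export format follows the export file extension (CSV, pickle, or JSON), and the
# device and log flags correspond to parsed_args.device and parsed_args.log. The
# file names are illustrative.
#
#     parse fasttext ./dataset.p parsed_address.json --device cpu --log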
import os

import poutyne

from deepparse import download_from_url
from deepparse.dataset_container import PickleDatasetContainer
from deepparse.parser import AddressParser

# First, let's download the train and test data from the public repository.
saving_dir = "./data"
file_extension = "p"
training_dataset_name = "sample_noisy_data"
test_dataset_name = "test_sample_data"
download_from_url(training_dataset_name, saving_dir, file_extension=file_extension)
download_from_url(test_dataset_name, saving_dir, file_extension=file_extension)

# Now let's create a training and a test container.
training_container = PickleDatasetContainer(
    os.path.join(saving_dir, training_dataset_name + "." + file_extension)
)
test_container = PickleDatasetContainer(
    os.path.join(saving_dir, test_dataset_name + "." + file_extension)
)

# We will retrain the fasttext version of our pretrained model.
address_parser = AddressParser(model_type="fasttext", device=0)

# Now let's retrain for 5 epochs using a batch size of 8, since the data is really small for this example.
# Let's start with the default learning rate of 0.01 and use a learning rate scheduler to lower the learning
# rate as we progress.
lr_scheduler = poutyne.StepLR(step_size=1, gamma=0.1)  # reduce LR by a factor of 10 each epoch

# The checkpoints (ckpt) are saved in the default "./checkpoints" directory.
address_parser.retrain(training_container, 0.8, epochs=5, batch_size=8, callbacks=[lr_scheduler])
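# After retraining, the test container created above would typically be used to
# evaluate the fine-tuned model; a minimal sketch, reusing the same small batch size:
address_parser.test(test_container, batch_size=8)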