Exemplo n.º 1
0
    def test_remove_stop_words(self, setup_stopwords_test):
        """Expect nothing to survive stop-word removal on the fixture string.

        The fixture value is split on whitespace, handed to
        ``InstancePreprocessing.remove_stop_words`` and the returned
        collection must have length zero.
        """
        tokens = setup_stopwords_test.split()
        preprocessor = InstancePreprocessing()

        remaining = preprocessor.remove_stop_words(tokens)
        assert len(remaining) == 0
Exemplo n.º 2
0
    def test_preprocessing_lower(self, instances):
        """Check that the lowercase pipeline step yields lowercase tokens.

        A ``Vocab`` is built from the ``"single_instance"`` entry of the
        fixture with ``lowercase`` as its only preprocessing step; every
        token found in ``Vocab.instances`` must then satisfy
        ``str.islower``.
        """
        sample = instances["single_instance"]
        preprocessor = InstancePreprocessing()
        vocab = Vocab(
            instances=sample,
            max_num_tokens=1000,
            preprocessing_pipeline=[preprocessor.lowercase],
        )

        # Walk the instances as stored on the vocab after construction.
        for processed in vocab.instances:
            for token in processed:
                assert token.islower()
Exemplo n.º 3
0
    def test_conll_dataset_manager(self, test_file):
        """Check that the ``tokens`` vocabulary only holds lowercase keys.

        The same file is used for the train, dev and test splits. The
        ``tokens`` namespace is configured with the ``lowercase``
        preprocessing step and with ``include_special_vocab`` set to
        ``False``; every key of the resulting token-to-index mapping
        must then satisfy ``str.islower``.
        """
        preprocessor = InstancePreprocessing()
        token_options = {
            "preprocessing_pipeline": [preprocessor.lowercase],
            "include_special_vocab": False,
        }
        dataset_manager = CoNLLDatasetManager(
            train_filename=test_file,
            dev_filename=test_file,
            test_filename=test_file,
            namespace_vocab_options={"tokens": token_options},
        )

        tokens_vocab = dataset_manager.namespace_to_vocab["tokens"]
        token2idx = tokens_vocab.get_token2idx_mapping()

        for token in token2idx.keys():
            assert token.islower()
def get_preprocessed_instances(get_tokenized_data):
    """Return the tokenized instances after ``lowercase``, with their labels.

    ``get_tokenized_data`` is unpacked as an ``(instances, labels)`` pair.
    Each instance is passed through ``InstancePreprocessing.lowercase`` and
    the results are collected into a list; ``labels`` is returned untouched.
    """
    raw_instances, labels = get_tokenized_data
    lowercase = InstancePreprocessing().lowercase
    processed = [lowercase(instance) for instance in raw_instances]
    return processed, labels
Exemplo n.º 5
0
                        type=int)
    parser.add_argument(
        "--add_projection_layer",
        help="If set to true, then projection layer will be added "
        "after lstm2seq encoder with an activation function",
        action="store_true",
    )

    args = parser.parse_args()
    msg_printer = wasabi.Printer()
    data_dir = pathlib.Path(DATA_DIR)
    train_filename = data_dir.joinpath("conll_2003_small.train")
    dev_filename = data_dir.joinpath("conll_2003_small.dev")
    test_filename = data_dir.joinpath("conll_2003_small.test")

    instance_preprocessing = InstancePreprocessing()

    data_manager = CoNLLDatasetManager(
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=test_filename,
        column_names=["POS", "DEP", "NER"],
        train_only="ner",
        namespace_vocab_options={
            "tokens": {
                "preprocessing_pipeline": [instance_preprocessing.lowercase]
            }
        },
    )

    word_embedder = WordEmbedder(embedding_type=args.emb_type,
Exemplo n.º 6
0
def setup_lowercase_tests():
    """Provide the mixed-case sample used by the lowercase tests.

    Returns a 3-tuple: the raw sentence, the sentence split on single
    spaces, and a new ``InstancePreprocessing`` object.
    """
    sentence = "I LIKE TO MAKE THIS INTO LoWER CASE"
    tokens = sentence.split(" ")
    return sentence, tokens, InstancePreprocessing()
Exemplo n.º 7
0
def setup_tests_indicate_capitalization(request):
    """Unpack one parametrized case for the capitalization tests.

    ``request.param[0]`` is the input instance and ``request.param[1]``
    the expected instance; both are returned together with a new
    ``InstancePreprocessing`` object.
    """
    case = request.param
    # Index explicitly rather than unpack so extra trailing items are tolerated.
    instance, expected_instance = case[0], case[1]
    return instance, expected_instance, InstancePreprocessing()