예제 #1
0
def simple_extract(
    model: LanguageModel,
    activations_dir: str,
    corpus: Corpus,
    activation_names: ActivationNames,
    selection_func: SelectFunc = lambda sen_id, pos, item: True,
) -> None:
    """Run a plain extraction over ``corpus`` with a fresh ``Extractor``.

    The extractor is built from ``model``, ``corpus``, ``activations_dir`` and
    ``activation_names`` (in that positional order) and invoked once with
    ``BATCH_SIZE`` as batch size and dynamic dumping switched off.

    Args:
        model: Model handed to the extractor.
        activations_dir: Directory argument handed to the extractor.
        corpus: Corpus handed to the extractor.
        activation_names: Activation names handed to the extractor.
        selection_func: Predicate forwarded to ``Extractor.extract``; the
            default accepts every ``(sen_id, pos, item)`` triple.
    """
    Extractor(model, corpus, activations_dir, activation_names).extract(
        batch_size=BATCH_SIZE,
        dynamic_dumping=False,
        selection_func=selection_func,
    )
예제 #2
0
파일: lm.py 프로젝트: so2jia/diagnnose
    def _create_init_states_from_corpus(
        self,
        init_states_corpus: str,
        vocab_path: str,
        save_init_states_to: Optional[str],
    ) -> ActivationTensors:
        """Compute initial states by running an extraction over a corpus.

        The corpus is imported first, ``self.init_states`` is then reset to
        the result of ``self.create_zero_state()``, and finally an
        ``Extractor`` built around this model is run with
        ``create_avg_eos=True``.

        Args:
            init_states_corpus: Corpus location handed to ``import_corpus``.
            vocab_path: Vocabulary path handed to ``import_corpus``.
            save_init_states_to: Third positional argument of the
                ``Extractor``. When it is ``None`` the extraction is run with
                ``only_return_avg_eos=True``.

        Returns:
            Whatever ``Extractor.extract`` returned (asserted to be non-None).
        """
        states_corpus: Corpus = import_corpus(
            init_states_corpus, vocab_path=vocab_path
        )

        # Reset the model's states before the extractor is constructed.
        self.init_states = self.create_zero_state()

        nothing_to_save = save_init_states_to is None
        extracted_states = Extractor(
            self, states_corpus, save_init_states_to
        ).extract(
            create_avg_eos=True,
            only_return_avg_eos=nothing_to_save,
        )
        assert extracted_states is not None

        return extracted_states
예제 #3
0
    def setUpClass(cls) -> None:
        """Build the class-level fixtures shared by the tests.

        Writes a three-sentence, tab-separated corpus to ``corpus.txt`` inside
        ``ACTIVATIONS_DIR``, imports it with the header
        ``["sen", "labels", "quality"]``, creates dummy activations for every
        sentence, and wires a ``MockLanguageModel`` and an ``Extractor``
        around them.
        """
        # exist_ok=True replaces the exists()-then-makedirs() pair, which can
        # fail if another process creates the directory in between.
        os.makedirs(ACTIVATIONS_DIR, exist_ok=True)

        # NOTE(review): the second and third lines of this literal carry their
        # source indentation into the written file; presumably the corpus
        # reader ignores leading whitespace -- confirm.
        test_corpus = """The ripe taste improves .\t0 0 1 0 0\tdelicious
        The hog crawled .\t0 1 0 0\thairy
        Move the vat .\t0 0 1 0\tok"""

        corpus_path = os.path.join(ACTIVATIONS_DIR, "corpus.txt")
        with open(corpus_path, "w") as f:
            f.write(test_corpus)

        cls.corpus = import_corpus(
            corpus_path,
            header=["sen", "labels", "quality"],
            vocab_from_corpus=True,
        )
        cls.examples = cls.corpus.examples
        cls.iterator = create_iterator(cls.corpus, batch_size=1)

        # Mock the activations the model produces: flatten all sentences into
        # one word list and map every word to its vocabulary index.
        cls.all_words = list(
            itertools.chain.from_iterable(item.sen for item in cls.corpus))
        cls.all_tokens = [cls.corpus.vocab.stoi[w] for w in cls.all_words]
        cls.all_labels = cls._merge_labels(
            [example.labels for example in cls.corpus])

        # Each sentence receives the number of tokens seen so far as its
        # identifier value.
        test_sentence_activations = []
        identifier_value = 0
        for example in cls.corpus:
            test_sentence_activations.append(
                create_sentence_dummy_activations(len(example.sen),
                                                  ACTIVATION_DIM,
                                                  identifier_value))
            identifier_value += len(example.sen)

        cls.all_activations = torch.cat(test_sentence_activations)

        # Prepare Mock Model
        cls.model = MockLanguageModel(
            num_layers=1,
            hidden_size=ACTIVATION_DIM,
            all_tokens=cls.all_tokens,
            all_activations=cls.all_activations,
        )
        cls.model.set_init_states()

        # Init extractor
        cls.extractor = Extractor(
            cls.model,
            cls.corpus,
            activations_dir=ACTIVATIONS_DIR,
            activation_names=ACTIVATION_NAMES,
        )
        # NOTE(review): looks redundant with the constructor argument above --
        # confirm before removing.
        cls.extractor.activation_names = ACTIVATION_NAMES
예제 #4
0
    def setUpClass(cls) -> None:
        """Build the class-level fixtures shared by the tests.

        Three ``MagicMock`` sentences (with ``sen``, ``labels`` and
        ``misc_info`` attributes) are collected into a dict keyed by their
        index, dummy activations are created for each of them, and a
        ``MockLanguageModel`` plus an ``Extractor`` are set up around them.
        """
        # exist_ok=True replaces the exists()-then-makedirs() pair, which can
        # fail if another process creates the directory in between.
        os.makedirs(ACTIVATIONS_DIR, exist_ok=True)

        # Prepare Mock sentences
        cls.test_sentences = [MagicMock(), MagicMock(), MagicMock()]
        cls.test_sentences[0].sen = ["The", "ripe", "taste", "improves", "."]
        cls.test_sentences[0].labels = [0, 0, 1, 0, 0]
        cls.test_sentences[0].misc_info = {"quality": "delicious"}

        cls.test_sentences[1].sen = ["The", "hog", "crawled", "."]
        cls.test_sentences[1].labels = [0, 1, 0, 0]
        cls.test_sentences[1].misc_info = {"quality": "hairy"}

        cls.test_sentences[2].sen = ["Move", "the", "vat", "."]
        cls.test_sentences[2].labels = [0, 0, 1, 0]
        cls.test_sentences[2].misc_info = {"quality": "ok"}

        # The corpus maps each sentence's list index to the sentence itself.
        cls.corpus = dict(enumerate(cls.test_sentences))

        # Mock the activations the model produces: the token list is the
        # concatenation of all sentences' words.
        cls.all_tokens = list(
            itertools.chain.from_iterable(
                sentence.sen for sentence in cls.test_sentences))
        cls.all_labels = cls._merge_labels(
            [sentence.labels for sentence in cls.corpus.values()])

        # Each sentence receives the number of tokens seen so far as its
        # identifier value.
        cls.test_sentence_activations = []
        identifier_value = 0
        for sentence in cls.corpus.values():
            cls.test_sentence_activations.append(
                create_sentence_dummy_activations(len(sentence.sen),
                                                  ACTIVATION_DIM,
                                                  identifier_value))
            identifier_value += len(sentence.sen)

        cls.all_activations = torch.cat(cls.test_sentence_activations)

        # Prepare Mock Model
        cls.model = MockLanguageModel(num_layers=1,
                                      hidden_size=ACTIVATION_DIM,
                                      all_tokens=cls.all_tokens,
                                      all_activations=cls.all_activations)

        # Init extractor
        cls.extractor = Extractor(cls.model,
                                  cls.corpus,
                                  ACTIVATION_NAMES,
                                  output_dir=ACTIVATIONS_DIR)
예제 #5
0
from diagnnose.config.arg_parser import create_arg_parser
from diagnnose.config.setup import ConfigSetup
from diagnnose.corpora.import_corpus import import_corpus_from_path
from diagnnose.extractors.base_extractor import Extractor
from diagnnose.models.import_model import import_model_from_json
from diagnnose.models.language_model import LanguageModel
from diagnnose.typedefs.corpus import Corpus

if __name__ == '__main__':
    # The same set of group names drives both the argument parser and the
    # config setup.
    arg_groups = {'model', 'activations', 'corpus', 'extract'}
    arg_parser, required_args = create_arg_parser(arg_groups)

    config_setup = ConfigSetup(arg_parser, required_args, arg_groups)
    config_dict = config_setup.config_dict

    # Build the model and the corpus from their respective config sections.
    model: LanguageModel = import_model_from_json(config_dict['model'])
    corpus: Corpus = import_corpus_from_path(**config_dict['corpus'])

    # Construct the extractor and run it with the 'extract' options.
    extractor = Extractor(
        model,
        corpus,
        **config_dict['activations'],
    )
    extractor.extract(
        **config_dict['extract'],
    )
예제 #6
0
    """ Select activations only when they occur on the subject's position. """
    return pos == sentence.misc_info["subj_pos"]


def pos_4_selection_func(pos: int, token: str, sentence: LabeledSentence):
    """Selection predicate that is true only for position 4.

    ``token`` and ``sentence`` are accepted but not used.
    """
    target_position = 4
    return pos == target_position


if __name__ == "__main__":
    # Arguments that must be supplied.
    required_args = {
        'model',
        'vocab',
        'lm_module',
        'corpus_path',
        'activation_names',
        'output_dir',
    }
    # Mapping from a config section to the argument names it contains.
    arg_groups = {
        'model': {'model', 'vocab', 'lm_module', 'device'},
        'corpus': {'corpus_path'},
        'init_extract': {
            'activation_names',
            'output_dir',
            'init_lstm_states_path',
        },
        'extract': {'cutoff', 'print_every'},
    }

    config_dict = ConfigSetup(
        init_argparser(), required_args, arg_groups
    ).config_dict

    # Build the model and the labeled corpus from their config sections.
    model: LanguageModel = import_model_from_json(**config_dict['model'])
    corpus: LabeledCorpus = convert_to_labeled_corpus(**config_dict['corpus'])

    # Extract with the position-4 selection function.
    extractor = Extractor(model, corpus, **config_dict['init_extract'])
    extractor.extract(
        **config_dict['extract'],
        selection_func=pos_4_selection_func,
    )

    # To also extract average eos activations, uncomment the following line:
    # extractor.extract_average_eos_activations(print_every=config_dict['extract']['print_every'])
예제 #7
0
from diagnnose.config.arg_parser import create_arg_parser
from diagnnose.config.setup import create_config_dict
from diagnnose.corpus.import_corpus import import_corpus
from diagnnose.extractors.base_extractor import Extractor
from diagnnose.models.import_model import import_model
from diagnnose.models.lm import LanguageModel
from diagnnose.typedefs.corpus import Corpus
from diagnnose.vocab import get_vocab_from_config

if __name__ == "__main__":
    # The same set of group names drives both the argument parser and the
    # config dict creation.
    arg_groups = {
        "model",
        "activations",
        "corpus",
        "extract",
        "init_states",
        "vocab",
    }
    arg_parser, required_args = create_arg_parser(arg_groups)
    config_dict = create_config_dict(arg_parser, required_args, arg_groups)

    # The model is built from the full config dict.
    model: LanguageModel = import_model(config_dict)

    # The corpus needs the vocab path in addition to its own config section.
    vocab_path = get_vocab_from_config(config_dict)
    corpus: Corpus = import_corpus(
        vocab_path=vocab_path,
        **config_dict["corpus"],
    )

    # Construct the extractor and run it with the "extract" options.
    extractor = Extractor(
        model,
        corpus,
        **config_dict["activations"],
    )
    extractor.extract(
        **config_dict["extract"],
    )