Example #1
    def test_extend_from_vocab(self):
        vocab1 = Vocabulary(non_padded_namespaces={"1", "2"})
        vocab2 = Vocabulary(non_padded_namespaces={"3"})

        vocab1.add_tokens_to_namespace(["a", "b", "c"], namespace="1")
        vocab1.add_tokens_to_namespace(["d", "e", "f"], namespace="2")

        vocab2.add_tokens_to_namespace(["c", "d", "e"], namespace="1")
        vocab2.add_tokens_to_namespace(["g", "h", "i"], namespace="3")

        vocab1.extend_from_vocab(vocab2)
        assert vocab1.get_namespaces() == {"1", "2", "3"}
        assert vocab1._non_padded_namespaces == {"1", "2", "3"}
        assert vocab1.get_token_to_index_vocabulary("1") == {
            "a": 0,
            "b": 1,
            "c": 2,
            "@@PADDING@@": 3,
            "@@UNKNOWN@@": 4,
            "d": 5,
            "e": 6,
        }
        assert vocab1.get_token_to_index_vocabulary("2") == {
            "d": 0,
            "e": 1,
            "f": 2,
        }
        assert vocab1.get_token_to_index_vocabulary("3") == {
            "g": 0,
            "h": 1,
            "i": 2,
        }
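A quick way to see the indexing conventions the assertions above rely on: in a padded namespace AllenNLP reserves index 0 for the padding token and index 1 for the unknown token, while non-padded namespaces start at 0. A minimal sketch (the special-token strings are the library defaults; everything else is made up for illustration):

from allennlp.data.vocabulary import Vocabulary

vocab = Vocabulary()  # "tokens" is a padded namespace by default
vocab.add_tokens_to_namespace(["cat", "dog"], namespace="tokens")
print(vocab.get_token_to_index_vocabulary("tokens"))
# {'@@PADDING@@': 0, '@@UNKNOWN@@': 1, 'cat': 2, 'dog': 3}
print(vocab.get_token_index("dog"))      # 3
print(vocab.get_token_from_index(2))     # 'cat'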
Example #2
    def __init__(self,
                 vocab: Vocabulary,
                 openai_model_path: str,
                 n_ctx: int = 512,
                 tokens_to_add: List[str] = None,
                 requires_grad: bool = True,
                 clf_token: str = '__clf__',
                 dropout: float = .1,
                 entity_dropout: float = 0.0,
                 language_model_weight: float = .5,
                 selector: str = 'average',
                 label_namespace='labels') -> None:

        super().__init__(vocab)

        n_special = len(tokens_to_add) if tokens_to_add is not None else -1

        transformer = OpenaiTransformer(model_path=openai_model_path,
                                        n_special=n_special,
                                        requires_grad=requires_grad,
                                        n_ctx=n_ctx)

        self.embedder = OpenaiTransformerEmbedder(transformer=transformer,
                                                  top_layer_only=True)

        self.clf_head = BagClassificationHead(
            model=transformer,
            encoder_vocab=vocab.get_token_to_index_vocabulary(
                'openai_transformer'),
            n_class=vocab.get_vocab_size(label_namespace),
            clf_token=clf_token + '</w>',
            selector=selector,
            dropout=dropout)

        self.lm_head = LanguageModelHead(transformer)
        self.language_model_weight = language_model_weight

        self.entity_dropout = entity_dropout

        self.encoder_vocab = vocab.get_token_to_index_vocabulary(
            'openai_transformer')
        self.del1_token = '__del1__</w>'
        self.del2_token = '__del2__</w>'
        self.mask_token = '__mask__</w>'
        self.na_idx = self.vocab.get_token_to_index_vocabulary('labels')['NA']

        self.metrics = {
            "accuracy": CategoricalAccuracy(),
            "not_na_accuracy": CategoricalAccuracy()
        }
Example #3
def generate_neighbours(vocab, file_name, measure='euc', topk=8, rho=0.6):
    if vocab is None:
        tokens = _read_pretrained_tokens(WORD2VECS['counter'])
        vocab = Vocabulary(tokens_to_add={"tokens": tokens})

    embed = read_weight(vocab, "counter", None)
    emb_util = EmbeddingNbrUtil(embed, vocab.get_token_index,
                                vocab.get_token_from_index)
    if rho is None:
        emb_util.pre_search(measure, topk + 1, None)

    nbr_num = []
    ret = {}
    tokens = list(vocab.get_token_to_index_vocabulary("tokens").keys())
    if file_name is None:
        tokens = random.choices(tokens, k=100)
    for ele in tqdm(tokens):
        nbrs = emb_util.find_neighbours(ele,
                                        measure,
                                        topk + 1,
                                        rho,
                                        return_words=True)
        if ele in nbrs:
            nbrs.remove(ele)
        ret[ele] = nbrs
        nbr_num.append(len(nbrs))
    print(nbr_num)
    print('Average neighbour num:', np.mean(nbr_num))
    if file_name is None:
        return
    with open(f"external_data/{file_name}", "w") as fp:
        json.dump(ret, fp)
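A hypothetical invocation of the function above (the output file name is made up; passing vocab=None lets the function build a Vocabulary from the pretrained counter-fitted vectors via the project-specific helpers it calls):

generate_neighbours(vocab=None, file_name="counter_nbrs.json", measure='euc', topk=8, rho=0.6)
# -> writes external_data/counter_nbrs.json, mapping each token to its neighbour list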
Example #4
def plot_weight(weight_path):
    weight = torch.load(weight_path)
    weight = weight.numpy()
    vocab = Vocabulary.from_files("data/vocabulary")
    xlabels = ylabels = list(vocab.get_token_to_index_vocabulary())
    if not os.path.isdir("figures"):
        os.makedirs("figures")
    fn = os.path.basename(weight_path)
    fn = os.path.splitext(fn)[0]
    save_confusion_matrix_figure("figures", fn, weight, xlabels, ylabels)
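A hypothetical call (the weight file path is an assumption; the function expects a 2-D tensor saved with torch.save and the vocabulary files under data/vocabulary from the snippet):

plot_weight("checkpoints/label_weight.pt")
# -> saves a confusion-matrix-style figure named "label_weight" under figures/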
Example #5
def load_neighbour_words(
        vocab: Vocabulary,
        file_name='external_data/counter_fitted_neighbors.json'):
    with open(file_name) as fp:
        nbr_dct = json.load(fp)
    tokens = vocab.get_token_to_index_vocabulary()
    ret = {}
    for k in nbr_dct:
        if k in tokens:
            ret[k] = []
            for v in nbr_dct[k]:
                if v in tokens:
                    ret[k].append(v)
    ret = defaultdict(lambda: [], ret)
    return ret
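A small usage sketch (it assumes the default counter_fitted_neighbors.json file exists at the path above; the tokens are made up):

from allennlp.data.vocabulary import Vocabulary

vocab = Vocabulary()
vocab.add_tokens_to_namespace(["good", "great", "nice"])
nbrs = load_neighbour_words(vocab)
print(nbrs["good"])     # neighbours of "good" that are also in the vocabulary
print(nbrs["unseen"])   # [] -- the defaultdict falls back to an empty list for unknown keys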
Example #6
    def __init__(self,
                 vocab: Vocabulary,
                 ngram_orders: Union[int, List[int]],
                 max_sentences: Optional[int] = None,
                 max_words: Optional[int] = None,
                 max_bytes: Optional[int] = None,
                 use_porter_stemmer: bool = True,
                 remove_stopwords: bool = False,
                 namespace: str = 'tokens') -> None:
        super().__init__()
        if isinstance(ngram_orders, int):
            ngram_orders = [ngram_orders]
        self.ngram_orders = ngram_orders
        self.max_sentences = max_sentences
        self.max_words = max_words
        self.max_bytes = max_bytes
        self.use_porter_stemmer = use_porter_stemmer
        self.remove_stopwords = remove_stopwords
        self.python_rouge = PythonRouge()

        self.vocab = vocab
        self.namespace = namespace
        vocab_tokens = vocab.get_token_to_index_vocabulary(namespace)

        # Extract the special tokens from the vocabulary. We have to check that
        # each one actually exists; otherwise the lookup would return the OOV index,
        # and we would then wrongly skip genuine OOV tokens when converting from
        # indices back to strings.
        self.start_index = None
        if START_SYMBOL in vocab_tokens:
            self.start_index = vocab_tokens[START_SYMBOL]
        self.end_index = None
        if END_SYMBOL in vocab_tokens:
            self.end_index = vocab_tokens[END_SYMBOL]
        self.pad_index = None
        if DEFAULT_PADDING_TOKEN in vocab_tokens:
            self.pad_index = vocab_tokens[DEFAULT_PADDING_TOKEN]
        self.sent_start_index = None
        if SENT_START_SYMBOL in vocab_tokens:
            self.sent_start_index = vocab_tokens[SENT_START_SYMBOL]
        self.sent_end_index = None
        if SENT_END_SYMBOL in vocab_tokens:
            self.sent_end_index = vocab_tokens[SENT_END_SYMBOL]

        self.count = 0
        self.totals = {}
Example #7
    def __init__(
        self,
        vocab: Vocabulary,
        vocab_namespace: str = "tokens",
        projection_dim: int = None,
        ignore_oov: bool = False,
    ) -> None:
        super().__init__()
        self.vocab = vocab
        self.vocab_size = vocab.get_vocab_size(vocab_namespace)
        if projection_dim:
            self._projection = torch.nn.Linear(self.vocab_size, projection_dim)
        else:
            self._projection = None
        self._ignore_oov = ignore_oov
        oov_token = vocab._oov_token
        self._oov_idx = vocab.get_token_to_index_vocabulary(vocab_namespace).get(oov_token)
        if self._oov_idx is None:
            raise ConfigurationError(
                "OOV token does not exist in vocabulary namespace {}".format(vocab_namespace)
            )
        self.output_dim = projection_dim or self.vocab_size
Example #8
    def __init__(self, word_embeddings: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder, dropout_p: float,
                 vocab: Vocabulary) -> None:
        super().__init__(vocab)

        self.word_embeddings = word_embeddings

        self.embedding2input = FeedForward(
            input_dim=word_embeddings.get_output_dim(),
            num_layers=1,
            hidden_dims=encoder.get_input_dim(),
            activations=Activation.by_name('relu')(),
            dropout=dropout_p)

        self.encoder = encoder

        self.hidden2intermediate = FeedForward(
            input_dim=encoder.get_output_dim(),
            num_layers=1,
            hidden_dims=int(encoder.get_output_dim() / 2),
            activations=Activation.by_name('relu')(),
            dropout=dropout_p)

        self.intermediate2tag = nn.Linear(
            in_features=int(encoder.get_output_dim() / 2),
            out_features=vocab.get_vocab_size('labels'))

        # self.accuracy = CategoricalAccuracy()

        label_vocab = vocab.get_token_to_index_vocabulary('labels').copy()
        # print("label_vocab: ", label_vocab)
        for x in ['O', 'OR']:
            label_vocab.pop(x)
        labels_for_metric = list(label_vocab.values())
        # print("labels_for_metric: ", labels_for_metric)
        self.accuracy = CustomFBetaMeasure(beta=1.0,
                                           average='micro',
                                           labels=labels_for_metric)
Example #9
def read_dataset(dataset_fp: Path, incl_labels: bool,
                 vocab: Vocabulary) -> List[Dict[str, Any]]:
    '''
    :param dataset_fp: File Path to a list of JSON formatted data
    :param incl_labels: Whether to add the extra `label_array` key/value
    :param vocab: Vocab of the model that is going to predict on the data
    :returns: The data from the dataset, optionally with the extra `label_array`
              key that contains the labels in one-hot format.
    '''
    samples = []
    token_to_index = vocab.get_token_to_index_vocabulary(namespace='labels')
    num_labels = vocab.get_vocab_size('labels')
    with dataset_fp.open('r') as dataset_file:
        for line in dataset_file:
            sample = json.loads(line)
            if incl_labels:
                labels = sample['labels']
                label_array = [0] * num_labels
                for label in labels:
                    label_index = token_to_index[label]
                    label_array[label_index] = 1
                sample['label_array'] = label_array
            samples.append(sample)
    return samples
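A usage sketch (the label names and the one-JSON-object-per-line sample file are invented; because "labels" is a non-padded namespace by default, label indices start at 0):

from pathlib import Path
from allennlp.data.vocabulary import Vocabulary

vocab = Vocabulary()
vocab.add_tokens_to_namespace(["positive", "negative"], namespace="labels")

Path("sample.jsonl").write_text('{"text": "great film", "labels": ["positive"]}\n')
samples = read_dataset(Path("sample.jsonl"), incl_labels=True, vocab=vocab)
print(samples[0]["label_array"])  # [1, 0] -- "positive" was assigned index 0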
Example #10
    def __init__(self,
                 vocab: Vocabulary,
                 text_field_embedder: TextFieldEmbedder,
                 text_encoder: Seq2SeqEncoder,
                 variational_autoencoder: FeedForward = None,
                 sentiment_classifier: FeedForward = None,
                 topic_dim: int = 20,
                 freeze_feature_extraction: bool = False,
                 classification_mode: bool = False,
                 pretrained_file: str = None,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: Optional[RegularizerApplicator] = None) -> None:
        super(TopicRNN, self).__init__(vocab, regularizer)

        self.metrics = {
            'cross_entropy': Average(),
            'negative_kl_divergence': Average(),
            'stopword_loss': Average()
        }

        self.classification_mode = classification_mode
        if classification_mode:
            self.metrics['sentiment'] = CategoricalAccuracy()

        if pretrained_file:
            archive = load_archive(pretrained_file)
            pretrained_model = archive.model
            self._init_from_archive(pretrained_model)
        else:
            # Model parameter definition.
            #
            # Defaults reflect Dieng et al.'s decisions when training their semi-unsupervised
            # IMDB sentiment classifier.
            self.text_field_embedder = text_field_embedder
            self.vocab_size = self.vocab.get_vocab_size("tokens")
            self.text_encoder = text_encoder
            self.topic_dim = topic_dim
            self.vocabulary_projection_layer = TimeDistributed(
                Linear(text_encoder.get_output_dim(), self.vocab_size))

            # Parameter gamma from the paper; projects hidden states into binary logits for whether a
            # word is a stopword.
            self.stopword_projection_layer = TimeDistributed(
                Linear(text_encoder.get_output_dim(), 2))

            self.tokens_to_index = vocab.get_token_to_index_vocabulary()

            # This step should only ever be performed ONCE.
            # When running allennlp train, the vocabulary will be constructed before the model instantiation, but
            # we can't create the stopless namespace until we get here.
            # Check if there already exists a stopless namespace: if so refrain from altering it.
            if "stopless" not in vocab._token_to_index.keys():
                assert self.tokens_to_index[DEFAULT_PADDING_TOKEN] == 0 and \
                       self.tokens_to_index[DEFAULT_OOV_TOKEN] == 1
                for token, _ in self.tokens_to_index.items():
                    if token not in STOP_WORDS:
                        vocab.add_token_to_namespace(token, "stopless")

                # Since a vocabulary with the stopless namespace hasn't been saved, save one for convenience.
                vocab.save_to_files("vocabulary")

            # Compute stop indices in the normal vocab space to prevent stop words
            # from contributing to the topic additions.
            self.stop_indices = torch.LongTensor(
                [vocab.get_token_index(stop) for stop in STOP_WORDS])

            # Learnable topics.
            # TODO: How should these be initialized?
            self.beta = nn.Parameter(torch.rand(topic_dim, self.vocab_size))

            # mu: The mean of the variational distribution.
            self.mu_linear = nn.Linear(topic_dim, topic_dim)

            # sigma: The root standard deviation of the variational distribution.
            self.sigma_linear = nn.Linear(topic_dim, topic_dim)

            # noise: used when sampling.
            self.noise = MultivariateNormal(torch.zeros(topic_dim),
                                            torch.eye(topic_dim))

            stopless_dim = vocab.get_vocab_size("stopless")
            self.variational_autoencoder = variational_autoencoder or FeedForward(
                # Takes as input the word frequencies in the stopless dimension and projects
                # the word frequencies into a latent topic representation.
                #
                # Each latent representation will help tune the variational dist.'s parameters.
                stopless_dim,
                3,
                [500, 500, topic_dim],
                torch.nn.ReLU(),
            )

            # The shape for the feature vector for sentiment classification.
            # (RNN Hidden Size + Inference Network output dimension).
            sentiment_input_size = text_encoder.get_output_dim() + topic_dim
            self.sentiment_classifier = sentiment_classifier or FeedForward(
                # As in the paper: a single layer with 50 hidden units
                # and sigmoid activation for sentiment classification.
                sentiment_input_size,
                2,
                [50, 2],
                torch.nn.Sigmoid(),
            )

        if freeze_feature_extraction:
            # Freeze the RNN and VAE pipeline so that only the classifier is trained.
            for name, param in self.named_parameters():
                if "sentiment_classifier" not in name:
                    param.requires_grad = False

        self.sentiment_criterion = nn.CrossEntropyLoss()

        self.num_samples = 50

        initializer(self)
Example #11
class KNNPredictor(Predictor):
    def __init__(self, 
                 model: Model, 
                 dataset_reader: DatasetReader,
                 vocab_path: str = 'resources/vocab',
                 df_path: str = 'https://storage.googleapis.com/jacobdanovitch/spotify_lyrics/spotify_with_genius.csv',
                 annoy_index_path: str = 'https://storage.googleapis.com/jacobdanovitch/spotify_lyrics/index.tree'
                ) -> None:
        super().__init__(model.eval(), dataset_reader)
        
        self.vocab = Vocabulary.from_files(vocab_path)
        self.df = pd.read_csv(df_path).set_index("track_id")
        
        self.index = None
        if annoy_index_path:
            self.build_index(annoy_index_path)
    
    def build_index(self, path: str, tracks: List[Tuple[str, np.ndarray]] = None):
        features = self._model.classifier_feedforward.get_output_dim()
        if tracks is None:
            if not os.path.exists(path):
                path = urlretrieve(path)[0]
            self.index = AnnoyIndex(features, metric='angular')
            self.index.load(path)
            return
        
        index = AnnoyIndex(features, metric='angular')
        for track, vector in tqdm(tracks):
            i = self.vocab.get_token_to_index_vocabulary("labels")[track]
            index.add_item(i, vector)
        
        index.build(-1)
        index.save(path)
        
        self.index = index
    
    def neighbors_to_tracks(self, nns):
        tracks = [self.vocab.get_token_from_index(i, "labels") for i in nns]
        return self.df.loc[tracks].reset_index(drop=True).to_dict(orient='records')
    
    def predict_json(self, inputs: JsonDict) -> JsonDict:
        n = inputs.pop('n', 10)
        if 'track_id' in inputs:
            if self.index is None:
                raise AttributeError("Please build an index before searching by track.")
            idx = self.vocab.get_token_to_index_vocabulary("labels")[inputs['track_id']]
            nns = self.index.get_nns_by_item(idx, n+1)[1:]
            #scores = self.index.get_item_vector(idx) 
            tracks = self.neighbors_to_tracks(nns)
            return tracks
            #return {'tracks': tracks, 'scores': scores}
            
            
        instance = self._json_to_instance(inputs)
        output_dict = self.predict_instance(instance)
        output_dict['inputs'] = inputs
        if self.index:
            logits = output_dict.get('logits')
            nns = self.index.get_nns_by_vector(logits, n)
            return self.neighbors_to_tracks(nns)
            #output_dict['tracks'] = self.neighbors_to_tracks(nns)
        return output_dict

    @overrides
    def _json_to_instance(self, json_dict: JsonDict) -> Instance:
        return self._dataset_reader.text_to_instance(text=json_dict['query'])
Example #12
    def __init__(self,
                 vocab: Vocabulary,
                 document_token_embedder: TextFieldEmbedder,
                 encoder: RNN,
                 attention: MatrixAttention,
                 attention_layer: FeedForward,
                 decoder: RNN,
                 bridge: Bridge,
                 beam_search: BeamSearch,
                 run_beam_search: bool = True,
                 summary_token_embedder: Optional[TokenEmbedder] = None,
                 summary_namespace: str = 'tokens',
                 use_input_feeding: bool = False,
                 input_feeding_projection_layer: Optional[FeedForward] = None,
                 instance_loss_normalization: str = 'sum',
                 batch_loss_normalization: str = 'average',
                 metrics: Optional[List[Metric]] = None,
                 initializer: InitializerApplicator = InitializerApplicator(),
                 regularizer: RegularizerApplicator = None) -> None:
        super().__init__(vocab, regularizer)
        self.document_token_embedder = document_token_embedder
        self.encoder = encoder
        self.attention = attention
        self.attention_layer = attention_layer
        self.decoder = decoder
        self.bridge = bridge
        self.beam_search = beam_search
        self.run_beam_search = run_beam_search
        self.summary_token_embedder = summary_token_embedder or document_token_embedder._token_embedders[
            'tokens']
        self.summary_namespace = summary_namespace
        self.use_input_feeding = use_input_feeding
        self.input_feeding_projection_layer = input_feeding_projection_layer
        self.instance_loss_normalization = instance_loss_normalization
        self.batch_loss_normalization = batch_loss_normalization
        # The ``output_layer`` is applied after the attention context and decoder
        # hidden state are combined. It is used to calculate the softmax over the
        # summary vocabulary.
        self.output_layer = torch.nn.Linear(
            decoder.get_output_dim(), vocab.get_vocab_size(summary_namespace))

        # Retrieve some special vocabulary token indices. Some of them are
        # required to exist.
        token_to_index = vocab.get_token_to_index_vocabulary(summary_namespace)
        assert START_SYMBOL in token_to_index
        self.start_index = token_to_index[START_SYMBOL]
        assert END_SYMBOL in token_to_index
        self.end_index = token_to_index[END_SYMBOL]
        assert DEFAULT_PADDING_TOKEN in token_to_index
        self.pad_index = token_to_index[DEFAULT_PADDING_TOKEN]
        self.sent_start_index = None
        if SENT_START_SYMBOL in token_to_index:
            self.sent_start_index = token_to_index[SENT_START_SYMBOL]
        self.sent_end_index = None
        if SENT_END_SYMBOL in token_to_index:
            self.sent_end_index = token_to_index[SENT_END_SYMBOL]

        self.loss = torch.nn.CrossEntropyLoss(ignore_index=self.pad_index,
                                              reduction='none')

        # Define the metrics that will be computed
        self.metrics = metrics
        self.cross_entropy_metric = CrossEntropyMetric()

        initializer(self)