Example #1
def _create_embeddings_from_file(embed_file, embed_dsz, embed_sha1,
                                 data_download_cache, vocab, unif,
                                 keep_unused):
    # Download the embedding file (or reuse the cached copy), verifying
    # its SHA1 when one is given
    embed_file = EmbeddingDownloader(embed_file, embed_dsz, embed_sha1,
                                     data_download_cache).download()
    # Plain-text files are treated as GloVe-style vectors; anything else
    # is assumed to be a binary word2vec model
    if mime_type(embed_file) == 'text/plain':
        EmbeddingT = baseline.GloVeModel
    else:
        EmbeddingT = baseline.Word2VecModel
    return EmbeddingT(embed_file,
                      vocab,
                      unif_weight=unif,
                      keep_unused=keep_unused)
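
A minimal usage sketch, assuming `EmbeddingDownloader`, `mime_type`, and `baseline` are importable as in the snippet above; the file name, dimension, SHA1, cache directory, and vocabulary are all placeholders, not real values:

# All values below are hypothetical, for illustration only
vocab = {'the': 1042, 'cat': 7, 'sat': 3}
word_embeddings = _create_embeddings_from_file(
    'glove.6B.100d.txt',    # placeholder embedding file or URL
    100,                    # embedding dimensionality (dsz)
    None,                   # skip the SHA1 integrity check
    './embed-cache',        # local download cache directory
    vocab,
    unif=0.1,               # half-width of the uniform init for unseen tokens
    keep_unused=False)      # drop pretrained vectors absent from vocab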
Example #2
    def _create_embeddings(self, embeddings_set, vocabs, features):
        """Creates a set of arbitrary sub-graph, DL-framework-specific embeddings by delegating to wired sub-module.

        As part of this process, we take in an index of embeddings by name, a ``dict`` of ``Counter`` objects (keyed by
        feature name), containing the number of times each token has been seen, and a `features` list which is a
        sub-section of the mead config containing the `embeddings` section for each feature.
        This method's job is either to create a sub-graph from a pretrained model, or to create a new randomly
        initialized sub-graph, taking into account the input vocabulary counters.  The embeddings model controls
        the actual word indices and the sub-graph for the embeddings, both of which are returned from this
        method.  Any feature selection that is required, such as low-count removal, is performed via the
        delegated methods.

        :param embeddings_set: The embeddings index passed to mead driver
        :param vocabs: A set of known ``Counter``s for each vocabulary consisting of a token key and count for each
        :param features: The `features` sub-section of the mead config
        :return: A ``tuple`` of a ``dict`` of (`feature name`, `Embedding`) pairs and an updated vocab
        """
        # Global defaults that individual feature sections may override
        unif = self.config_params.get('unif', 0.1)
        keep_unused = self.config_params.get('keep_unused', False)

        embeddings_map = {}
        out_vocabs = {}
        for feature in features:
            embeddings_section = feature['embeddings']
            name = feature['name']
            embed_label = embeddings_section.get('label', None)
            embed_type = embeddings_section.get('type', 'default')
            # Allow local overrides of the uniform initializer and
            # keep_unused settings, falling back to the global defaults
            embeddings_section['unif'] = embeddings_section.get('unif', unif)
            embeddings_section['keep_unused'] = embeddings_section.get('keep_unused', keep_unused)
            # Propagate any backend-specific parameters into this section
            if self.backend.params is not None:
                for k, v in self.backend.params.items():
                    embeddings_section[k] = v
            if embed_label is not None:
                # A label refers to a pretrained embedding registered in the
                # index: look up its file and download it (or use the cache)
                embed_file = embeddings_set[embed_label]['file']
                embed_dsz = embeddings_set[embed_label]['dsz']
                embed_sha1 = embeddings_set[embed_label].get('sha1', None)
                embed_file = EmbeddingDownloader(embed_file, embed_dsz, embed_sha1,
                                                 self.data_download_cache).download()

                embedding_bundle = baseline.embeddings.load_embeddings(
                    name, embed_file=embed_file, known_vocab=vocabs[name],
                    embed_type=embed_type, **embeddings_section)

                embeddings_map[name] = embedding_bundle['embeddings']
                out_vocabs[name] = embedding_bundle['vocab']
            else:
                # No label: create a randomly initialized embedding of the
                # requested dimension rather than loading a pretrained file
                dsz = embeddings_section.pop('dsz')
                embedding_bundle = baseline.embeddings.load_embeddings(
                    name, dsz=dsz, known_vocab=vocabs[name],
                    embed_type=embed_type, **embeddings_section)
                embeddings_map[name] = embedding_bundle['embeddings']
                out_vocabs[name] = embedding_bundle['vocab']

        return embeddings_map, out_vocabs
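
For context, here is a sketch of the inputs this method consumes; the feature names, label, and sizes below are illustrative placeholders rather than values from a real mead config:

from collections import Counter

# Hypothetical inputs, for illustration only
vocabs = {'word': Counter({'the': 1042, 'cat': 7}),
          'char': Counter({'t': 3000, 'h': 1500})}
features = [
    # A 'label' selects the pretrained branch: the file registered under
    # that label in the embeddings index is downloaded and loaded
    {'name': 'word', 'embeddings': {'label': 'glove-6B-100'}},
    # No 'label' selects the random-init branch, sized by 'dsz'
    {'name': 'char', 'embeddings': {'dsz': 30}},
]
# embeddings_map, out_vocabs = task._create_embeddings(embeddings_set, vocabs, features)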
Example #3
import argparse

# Imports assumed from mead-baseline; exact module paths may vary by version
from baseline.utils import read_json
from mead.downloader import DataDownloader, EmbeddingDownloader
from mead.utils import convert_path, index_by_label

parser = argparse.ArgumentParser(description='Download all datasets and embeddings')
# The code below reads args.cache, so a --cache flag is assumed here
parser.add_argument('--cache',
                    help='directory used to cache downloads',
                    default='~/.bl-data',
                    type=convert_path)
parser.add_argument('--datasets',
                    help='JSON library of dataset labels',
                    default='config/datasets.json',
                    type=convert_path)
parser.add_argument('--embeddings',
                    help='JSON library of embeddings',
                    default='config/embeddings.json',
                    type=convert_path)
args = parser.parse_args()

datasets = read_json(args.datasets)
datasets = index_by_label(datasets)

# Download every dataset, reporting failures without stopping
for name, d in datasets.items():
    print(name)
    try:
        DataDownloader(d, args.cache).download()
    except Exception as e:
        print(e)

emb = read_json(args.embeddings)
emb = index_by_label(emb)

# Download every embedding, reporting failures without stopping
for name, info in emb.items():
    print(name)
    try:
        EmbeddingDownloader(info['file'], info['dsz'], info.get('sha1'),
                            args.cache).download()
    except Exception as e:
        print(e)
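
The JSON libraries read above are lists of labeled records that `index_by_label` keys by their `label` field; a sketch of what one `embeddings.json` entry might look like, shown as the Python structure `read_json` would produce (all values are placeholders):

# Hypothetical content of config/embeddings.json, for illustration only
emb_library = [
    {'label': 'glove-6B-100',
     'file': 'http://example.com/glove.6B.100d.txt',  # download URL or local path
     'dsz': 100,                                      # vector dimensionality
     'sha1': None},                                   # optional integrity check
]
# index_by_label(emb_library) -> {'glove-6B-100': {...}}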