def __init__(self, name, **kwargs):
    super(CharConvEmbeddings, self).__init__()
    self.vsz = kwargs.get('vsz')
    self.dsz = kwargs.get('dsz')
    self.finetune = kwargs.get('finetune', True)
    weights = kwargs.get('weights')
    if weights is None:
        self.embeddings = nn.Embedding(self.vsz, self.dsz, padding_idx=0)
    else:
        self.embeddings = pytorch_embedding(weights)
    char_filtsz = kwargs.get('cfiltsz', [3])
    if is_sequence(char_filtsz[0]):
        char_hsz = [pair[1] for pair in char_filtsz]
        char_filtsz = [pair[0] for pair in char_filtsz]
    else:
        char_hsz = kwargs.get('wsz', 30)
    activation_type = kwargs.get('activation', 'tanh')
    pdrop = kwargs.get('pdrop', 0.5)
    self.char_comp = ParallelConv(self.dsz, char_hsz, char_filtsz, activation_type, pdrop)
    wchsz = self.char_comp.outsz
    self.linear = pytorch_linear(wchsz, wchsz)
    gating = kwargs.get('gating', 'skip')
    GatingConnection = SkipConnection if gating == 'skip' else Highway
    num_gates = kwargs.get('num_gates', 1)
    self.gating_seq = nn.Sequential(
        OrderedDict([('gate-{}'.format(i), GatingConnection(wchsz)) for i in range(num_gates)])
    )
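# A minimal standalone sketch (not part of the class above) of how `cfiltsz` is
# interpreted: a list of (filter size, hidden size) pairs is split into per-filter
# hidden sizes, while a flat list of ints shares the single `wsz` value. Values here
# are hypothetical.
cfiltsz = [(1, 32), (2, 32), (3, 64)]
char_filtsz = [pair[0] for pair in cfiltsz]   # [1, 2, 3]
char_hsz = [pair[1] for pair in cfiltsz]      # [32, 32, 64]
assert char_filtsz == [1, 2, 3]
assert char_hsz == [32, 32, 64]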
def batch_input(self, tokens): """Convert the input into a consistent format. :return: List[List[dict[str] -> str]] """ mxlen = 0 mxwlen = 0 # Input is a list of strings. (assume strings are tokens) if isinstance(tokens[0], six.string_types): mxlen = len(tokens) tokens_seq = [] for t in tokens: mxwlen = max(mxwlen, len(t)) tokens_seq.append({'text': t}) tokens_seq = [tokens_seq] else: # Better be a sequence, but it could be pre-batched, [[],[]] # But what kind of object is at the first one then? if is_sequence(tokens[0]): tokens_seq = [] # Then what we have is [['The', 'dog',...], ['I', 'cannot']] # [[{'text': 'The', 'pos': 'DT'}, ... # For each of the utterances, we need to make a dictionary if isinstance(tokens[0][0], six.string_types): for utt in tokens: utt_dict_seq = [] mxlen = max(mxlen, len(utt)) for t in utt: mxwlen = max(mxwlen, len(t)) utt_dict_seq += [dict({'text': t})] tokens_seq += [utt_dict_seq] # Its already in dict form so we dont need to do anything elif isinstance(tokens[0][0], dict): for utt in tokens: mxlen = max(mxlen, len(utt)) for t in utt['text']: mxwlen = max(mxwlen, len(t)) # If its a dict, we just wrap it up elif isinstance(tokens[0], dict): mxlen = len(tokens) for t in tokens: mxwlen = max(mxwlen, len(t)) tokens_seq = [tokens] else: raise Exception('Unknown input format') if len(tokens_seq) == 0: return [] return tokens_seq, mxlen, mxwlen
def format_output(output) -> Union[Dict, List]:
    """Convert the outputs into a consistent format.

    The outputs are dicts. When functions return lists or scalars, they are
    converted into dicts keyed by the stringified index.

    :param output: Output to convert
    :return: The formatted output
    """
    if is_sequence(output):
        result = {}
        for i, out in enumerate(listify(output)):
            result[str(i)] = out
        output = result
    return output
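# A standalone illustration of the convention enforced above: list outputs become
# dicts keyed by the stringified index (assuming `is_sequence` and `listify` behave
# like the helpers used in this module).
outputs = [0.2, 0.7, 0.1]
formatted = {str(i): out for i, out in enumerate(outputs)}
assert formatted == {'0': 0.2, '1': 0.7, '2': 0.1}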
def _get_filtsz(self):
    # If this is a list of sequences, then each entry is a (filtsz, nfeats) pair
    if is_sequence(self.cfiltsz[0]):
        filtsz = [filter_and_size[0] for filter_and_size in self.cfiltsz]
        nfeats = [filter_and_size[1] for filter_and_size in self.cfiltsz]
    # If we get an nfeat factor, we multiply it by each filter size and clamp at max_feat
    elif self.nfeat_factor:
        max_feat = self.max_feat
        filtsz = self.cfiltsz
        nfeats = [min(self.nfeat_factor * fsz, max_feat) for fsz in filtsz]
    # Otherwise it's just a scalar
    else:
        nfeats = self.wsz
        filtsz = self.cfiltsz
    return filtsz, nfeats
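# A standalone sketch of the `nfeat_factor` branch above: each filter size is scaled
# by the factor and clamped at `max_feat` (hypothetical values).
nfeat_factor, max_feat = 50, 200
cfiltsz = [1, 2, 3, 4, 5, 6, 7]
nfeats = [min(nfeat_factor * fsz, max_feat) for fsz in cfiltsz]
assert nfeats == [50, 100, 150, 200, 200, 200, 200]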
def vectorize(self, tokens_batch):
    """Turn the input into the batch dict for prediction.

    :param tokens_batch: `List[List[str]]`: The input text batch.
    :returns: dict[str] -> np.ndarray: The vectorized batch.
    """
    examples = defaultdict(list)
    keys = self.vectorizers.keys()

    for i, tokens in enumerate(tokens_batch):
        if is_sequence(tokens[0]):
            if len(tokens) != 2:
                raise Exception("We currently only accept dual inputs for multi-encoder")
            keys = []
            for k, vectorizer in self.vectorizers.items():
                vec0, length0 = vectorizer.run(tokens[0], self.vocabs[k])
                vec1, length1 = vectorizer.run(tokens[1], self.vocabs[k])
                # It's paired data
                key0 = f'{k}[0]'
                key1 = f'{k}[1]'
                keys.append(key0)
                keys.append(key1)
                examples[key0].append(vec0)
                examples[key1].append(vec1)
                if length0 is not None:
                    lengths_key = f'{key0}_lengths'
                    examples[lengths_key].append(length0)
                if length1 is not None:
                    lengths_key = f'{key1}_lengths'
                    examples[lengths_key].append(length1)
        else:
            for k, vectorizer in self.vectorizers.items():
                vec, length = vectorizer.run(tokens, self.vocabs[k])
                examples[k].append(vec)
                if length is not None:
                    lengths_key = f'{k}_lengths'
                    examples[lengths_key].append(length)

    for k in keys:
        examples[k] = np.stack(examples[k])
        lengths_key = f'{k}_lengths'
        if lengths_key in examples:
            examples[lengths_key] = np.stack(examples[lengths_key])
    return examples
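# A stripped-down sketch of the batching pattern above: per-example vectors (and
# optional lengths) are accumulated under each feature key and then stacked into
# batch arrays. The 'word' key and the shapes are hypothetical.
import numpy as np
from collections import defaultdict

examples = defaultdict(list)
for vec, length in [(np.array([1, 2, 3, 0]), 3), (np.array([4, 5, 0, 0]), 2)]:
    examples['word'].append(vec)
    examples['word_lengths'].append(length)
examples['word'] = np.stack(examples['word'])                    # shape (2, 4)
examples['word_lengths'] = np.stack(examples['word_lengths'])    # shape (2,)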
def wire_inputs(inputs: Dict, results: Dict, chore: 'Chore') -> Dict:
    """Replace the reference inputs with the output files.

    References are assumed to be of the form `^X.Y` or just `X`. In the former case,
    the named sub-field of that chore's entry in the results Dict is returned; in the
    latter case, the whole Dict associated with the results is returned.

    :param inputs: A dictionary of inputs
    :param results: A dictionary of upstream results
    :param chore: The chore function that will be called.
    :returns: The substituted input dictionary
    """
    for key, values in inputs.items():
        if is_sequence(values):
            new_vs = []
            for value in values:
                if is_reference(value):
                    new_vs.append(extract_outputs(parse_reference(value), results))
                else:
                    new_vs.append(value)
            inputs[key] = new_vs
        else:
            if is_reference(values):
                inputs[key] = extract_outputs(parse_reference(values), results)
    # Get the signature of the function
    sig = inspect.signature(chore)
    # Bind the args we populated with inputs
    bound = sig.bind_partial(**inputs)
    for param in sig.parameters.values():
        # Look at all params and, if they haven't been bound (they are not present in
        # the bound args and therefore were not in inputs), default them to `None` in
        # inputs. If a param has a default value we don't need to add it to inputs.
        if param.name not in bound.arguments and param.default is param.empty:
            inputs[param.name] = None
    return inputs
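# A small standalone sketch of the defaulting step above: any parameter that was not
# supplied in `inputs` and has no default gets filled in with None. The `chore`
# function here is hypothetical.
import inspect

def chore(a, b, c=3):
    return a, b, c

inputs = {'a': 1}
sig = inspect.signature(chore)
bound = sig.bind_partial(**inputs)
for param in sig.parameters.values():
    if param.name not in bound.arguments and param.default is param.empty:
        inputs[param.name] = None
assert inputs == {'a': 1, 'b': None}   # `c` is skipped because it has a default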
def batch_input(self, tokens): """Convert the input into a consistent format. :return: List[List[dict[str] -> str]] """ # Input is a list of strings. (assume strings are tokens) if isinstance(tokens[0], str): tokens_batch = [] for t in tokens: tokens_batch.append({'text': t}) tokens_batch = [tokens_batch] else: # Better be a sequence, but it could be pre-batched, [[],[]] # But what kind of object is at the first one then? if is_sequence(tokens[0]): tokens_batch = [] # Then what we have is [['The', 'dog',...], ['I', 'cannot']] # [[{'text': 'The', 'pos': 'DT'}, ... # For each of the utterances, we need to make a dictionary if isinstance(tokens[0][0], str): for utt in tokens: utt_dict_seq = [] for t in utt: utt_dict_seq += [dict({'text': t})] tokens_batch += [utt_dict_seq] # Its already in List[List[dict]] form, do nothing elif isinstance(tokens[0][0], dict): tokens_batch = tokens # If its a dict, we just wrap it up elif isinstance(tokens[0], dict): tokens_batch = [tokens] else: raise Exception('Unknown input format') if len(tokens_batch) == 0: return [] return tokens_batch
def _create_embeddings(self, embeddings_set, vocabs, features):
    """Creates a set of arbitrary sub-graph, DL-framework-specific embeddings by delegating to a wired sub-module.

    As part of this process, we take in an index of embeddings by name, a ``dict`` of ``Counter`` objects
    (keyed by feature name) containing the number of times each token has been seen, and a `features` list,
    which is the sub-section of the mead config containing the `embeddings` section for each feature.
    This method's job is to either create a sub-graph from a pretrained model, or to create a new randomly
    initialized sub-graph, taking into account the input vocabulary counters. The embeddings model has
    control over the actual word indices and the sub-graph for the embeddings, both of which are returned
    from this method. If some sort of feature selection is performed, such as low-count removal, it must be
    done via the delegated methods.

    :param embeddings_set: The embeddings index passed to the mead driver
    :param vocabs: A set of known ``Counter``s for each vocabulary consisting of a token key and count for each
    :param features: The `features` sub-section of the mead config
    :return: Returns a ``tuple`` comprised of a ``dict`` of (`feature name`, `Embedding`) and an updated vocab
    """
    embeddings_map = {}
    out_vocabs = {}

    for feature in features:
        # Get the block from the features section with key `embeddings`
        embeddings_section = feature['embeddings']

        # The name is at the top level for the feature block of mead config
        name = feature['name']

        # Get the label out of the embeddings section in the features block of mead config
        embed_label = embeddings_section.get('label', embeddings_section.get('labels'))

        # Get the type of embedding out of the embeddings section in the features block of mead config
        embed_type = embeddings_section.get('type', 'default')
        is_stacked = is_sequence(embed_label)
        if is_stacked:
            if embed_type != 'default':
                logger.warning("You have requested a stack of pretrained embeddings but didn't request the 'default' representation")

        # Backwards compat, copy from main block if not present locally
        embeddings_section['unif'] = embeddings_section.get('unif', self.config_params.get('unif', 0.1))

        # Backwards compat, copy from main block if not present locally
        embeddings_section['keep_unused'] = embeddings_section.get('keep_unused',
                                                                   self.config_params.get('keep_unused', False))

        # Overlay any backend parameters
        # Also, if we are in eager mode, we might have to place the embeddings explicitly on the CPU
        embeddings_section['cpu_placement'] = bool(embeddings_section.get('cpu_placement', False))
        if self.backend.params is not None:
            # If we are in eager mode
            train_block = self.config_params['train']
            optimizer_type = train_block.get('optim', 'sgd')
            # If the optimizer cannot handle embeddings on GPU
            if optimizer_type not in ['sgd', 'adam', 'adamw']:
                embeddings_section['cpu_placement'] = True
            elif optimizer_type == 'sgd' and float(train_block.get('mom', 0.0)) > 0:
                embeddings_section['cpu_placement'] = True
            for k, v in self.backend.params.items():
                embeddings_section[k] = v

        if embed_label is not None:
            # Allow local overrides to uniform initializer
            embed_labels = listify(embed_label)

            embed_files = []
            for embed_label in embed_labels:
                embeddings_global_config_i = embeddings_set[embed_label]
                if 'type' in embeddings_global_config_i:
                    embed_type_i = embeddings_global_config_i['type']
                    embed_type = embed_type_i
                    if embed_type_i != 'default' and is_stacked:
                        raise Exception("Stacking embeddings only works for 'default' pretrained word embeddings")

                embed_file = embeddings_global_config_i.get('file')
                unzip_file = embeddings_global_config_i.get('unzip', True)
                embed_dsz = embeddings_global_config_i['dsz']
                embed_sha1 = embeddings_global_config_i.get('sha1')
                # Should we grab vocab here too?
                embed_model = embeddings_global_config_i.get('model', {})
                if 'dsz' not in embed_model and not is_stacked:
                    embed_model['dsz'] = embed_dsz

                embeddings_section = {**embed_model, **embeddings_section}
                try:
                    # We aren't necessarily going to get an `embed_file`. For instance, using the HuggingFace
                    # models in the Hub addon, the `embed_file` should be downloaded using HuggingFace's library,
                    # not by us. In this case we want it to be None and we don't want to download it
                    if embed_file:
                        embed_file = EmbeddingDownloader(embed_file, embed_dsz, embed_sha1,
                                                         self.data_download_cache, unzip_file=unzip_file).download()
                        embed_files.append(embed_file)
                    else:
                        embed_files.append(None)
                except Exception as e:
                    if is_stacked:
                        raise e
                    logger.warning(f"We were not able to download {embed_file}, passing to the addon")
                    embed_files.append(embed_file)

            # If we have stacked embeddings (which only works with the `default` model), we need to pass the list.
            # If not, grab the first item
            embed_file = embed_files if is_stacked else embed_files[0]
            embedding_bundle = baseline.embeddings.load_embeddings(name,
                                                                   embed_file=embed_file,
                                                                   known_vocab=vocabs.get(name),
                                                                   embed_type=embed_type,
                                                                   data_download_cache=self.data_download_cache,
                                                                   **embeddings_section)

            embeddings_map[name] = embedding_bundle['embeddings']
            out_vocabs[name] = embedding_bundle['vocab']
        else:  # if there is no label given, assume we need random initialization vectors
            dsz = embeddings_section.pop('dsz')
            embedding_bundle = baseline.embeddings.load_embeddings(name,
                                                                   dsz=dsz,
                                                                   known_vocab=vocabs[name],
                                                                   embed_type=embed_type,
                                                                   data_download_cache=self.data_download_cache,
                                                                   **embeddings_section)
            embeddings_map[name] = embedding_bundle['embeddings']
            out_vocabs[name] = embedding_bundle['vocab']

    return embeddings_map, out_vocabs
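# An illustrative (hypothetical) shape for the `features` argument consumed above;
# only keys referenced in the method are shown, and the label/type values are made
# up for the example.
features = [
    {
        'name': 'word',
        'embeddings': {'label': 'glove-42B', 'type': 'default'},   # pretrained, looked up in embeddings_set
    },
    {
        'name': 'char',
        'embeddings': {'dsz': 30, 'type': 'char-conv'},            # no label: randomly initialized
    },
]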
def batch_input(self, tokens): """Convert the input into a consistent format. :return: List[List[dict[str] -> str]] """ mxlen = 0 mxwlen = 0 vmxlen, vmxwlen = self.get_vectorizer_lens() # Input is a list of strings. (assume strings are tokens) if isinstance(tokens[0], six.string_types): if vmxlen != -1: tokens = tokens[:vmxlen] mxlen = len(tokens) tokens_seq = [] for t in tokens: mxwlen = max(mxwlen, len(t)) if vmxwlen != -1: t = t[:vmxwlen] tokens_seq.append({'text': t}) tokens_seq = [tokens_seq] else: # Better be a sequence, but it could be pre-batched, [[],[]] # But what kind of object is at the first one then? if is_sequence(tokens[0]): tokens_seq = [] # Then what we have is [['The', 'dog',...], ['I', 'cannot']] # [[{'text': 'The', 'pos': 'DT'}, ... # For each of the utterances, we need to make a dictionary if isinstance(tokens[0][0], six.string_types): for utt in tokens: utt_dict_seq = [] if vmxlen != -1: utt = utt[:vmxlen] mxlen = max(mxlen, len(utt)) for t in utt: if vmxwlen != -1: t = t[:vmxwlen] mxwlen = max(mxwlen, len(t)) utt_dict_seq += [dict({'text': t})] tokens_seq += [utt_dict_seq] # Its already in List[List[dict]] form so just iterate to get mxlen and mxwlen elif isinstance(tokens[0][0], dict): for utt_dict_seq in tokens: if vmxlen != -1: utt_dict_seq = utt_dict_seq[:vmxlen] mxlen = max(mxlen, len(utt_dict_seq)) for token_dict in utt_dict_seq: text = token_dict['text'] if vmxwlen != -1: text = text[:vmxwlen] token_dict['text'] = text mxwlen = max(mxwlen, len(text)) tokens_seq += [utt_dict_seq] # If its a dict, we just wrap it up elif isinstance(tokens[0], dict): if vmxlen != -1: tokens = tokens[:vmxlen] mxlen = len(tokens) for t in tokens: text = t['text'] if vmxwlen != -1: text = text[:vmxwlen] t['text'] = text mxwlen = max(mxwlen, len(text)) tokens_seq = [tokens] else: raise Exception('Unknown input format') if len(tokens_seq) == 0: return [] return tokens_seq, mxlen, mxwlen
def pool_chars(x_char, Wch, ce0, char_dsz,
               nfeat_factor=None, cfiltsz=[3], max_feat=200,
               gating='skip', num_gates=1, activation='tanh', wsz=30):
    """Take in a tensor of characters (B x maxs x maxw) and do character convolution

    :param x_char: TF tensor for input characters, (B x maxs x maxw)
    :param Wch: A character embeddings matrix
    :param ce0: A control dependency for the embeddings that keeps the <PAD> value 0
    :param char_dsz: The character embedding dsz
    :param nfeat_factor: (``int``) A factor multiplied by each filter size to decide the number of hidden units
    :param cfiltsz: (``list``) A list of filter sizes, or a list of tuples of (filter size, num filts)
    :param max_feat: (``int``) The maximum number of hidden units per filter
    :param gating: (``str``) `skip` or `highway` supported, yielding a residual or highway connection, respectively
    :param num_gates: (``int``) How many gating functions to apply
    :param activation: (``str``) A string name of an activation (e.g. `tanh`)
    :param wsz: (``int``) The number of hidden units per filter when neither a factor nor a tuple list is given
    :return: The character compositional embedding and the number of hidden units as a tuple
    """
    if is_sequence(cfiltsz[0]):
        filtsz = [filter_and_size[0] for filter_and_size in cfiltsz]
        nfeats = [filter_and_size[1] for filter_and_size in cfiltsz]
    elif nfeat_factor:
        filtsz = cfiltsz
        nfeats = [min(nfeat_factor * fsz, max_feat) for fsz in filtsz]
    else:
        filtsz = cfiltsz
        nfeats = wsz
    mxlen = tf.shape(x_char)[1]

    gating_fn = highway_conns if gating.startswith('highway') else skip_conns

    with tf.variable_scope("Chars2Word"):
        with tf.control_dependencies([ce0]):
            mxwlen = tf.shape(x_char)[-1]
            char_bt_x_w = tf.reshape(x_char, [-1, mxwlen])
            cembed = tf.nn.embedding_lookup(Wch, char_bt_x_w, name="embeddings")
            cmot, num_filts = char_word_conv_embeddings(cembed, filtsz, char_dsz, nfeats,
                                                        activation_fn=tf_activation(activation),
                                                        gating=gating_fn,
                                                        num_gates=num_gates)
            word_char = tf.reshape(cmot, [-1, mxlen, num_filts])

    return word_char, num_filts
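# A shape walkthrough of the graph built above (B = batch size, T = max sentence
# length, W = max word length, D = char_dsz, H = total number of conv features):
#   x_char: (B, T, W) -> reshape -> (B*T, W)
#   embedding_lookup  -> (B*T, W, D)
#   char_word_conv_embeddings -> (B*T, H)
#   reshape -> (B, T, H), returned along with H (num_filts)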