def endElement(self, name):
    # print("endElement '" + name + "'")
    if name == "p":
        # end of sentence
        if self.accumulated != '':
            localTokens = tokenizeAndFilterSimple(self.accumulated)
            for token in localTokens:
                self.tokens.append(token)
                self.labels.append('O')
        self.sents.append(self.tokens)
        self.allLabels.append(self.labels)
        self.tokens = []
        self.labels = []
    if name == "rs":
        # end of entity
        localTokens = tokenizeAndFilterSimple(self.accumulated)
        begin = True
        if self.currentLabel is None:
            self.currentLabel = 'O'
        for token in localTokens:
            self.tokens.append(token)
            if begin:
                self.labels.append('B-' + self.currentLabel)
                begin = False
            else:
                self.labels.append('I-' + self.currentLabel)
        self.currentLabel = None
    self.accumulated = ''

def startElement(self, name, attrs):
    if self.accumulated != '':
        localTokens = tokenizeAndFilterSimple(self.accumulated)
        for token in localTokens:
            self.tokens.append(token)
            self.labels.append('O')
    if name == 'TEI' or name == 'tei':
        # beginning of a document
        self.tokens = []
        self.labels = []
        self.sents = []
        self.allLabels = []
    if name == "p":
        # beginning of sentence
        self.tokens = []
        self.labels = []
        self.currentLabel = 'O'
    if name == "rs":
        # beginning of entity
        if attrs.getLength() != 0:
            if attrs.getValue("type") != 'insult' and attrs.getValue("type") != 'threat':
                print("Invalid entity type:", attrs.getValue("type"))
            self.currentLabel = '<' + attrs.getValue("type") + '>'
    self.accumulated = ''

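# --- Usage sketch (not part of the original code) ---
# A minimal illustration of how the startElement/endElement callbacks above could be
# driven through the standard xml.sax API. The class name, the constructor and the
# characters() callback are assumptions added for this example; the two callbacks
# shown above would be pasted into the class unchanged. The expected output in the
# trailing comments assumes tokenizeAndFilterSimple splits on whitespace and punctuation.
import xml.sax
from io import StringIO

class ToxicCommentHandler(xml.sax.ContentHandler):
    def __init__(self):
        super().__init__()
        self.accumulated = ''
        self.tokens, self.labels = [], []
        self.sents, self.allLabels = [], []
        self.currentLabel = None

    def characters(self, content):
        # standard SAX callback: buffer the text found between tags
        self.accumulated += content

    # startElement(...) and endElement(...) as defined above go here

sample = "<tei><p>You are an <rs type='insult'>utter fool</rs> .</p></tei>"
handler = ToxicCommentHandler()
xml.sax.parse(StringIO(sample), handler)
print(handler.sents)      # expected: [['You', 'are', 'an', 'utter', 'fool', '.']]
print(handler.allLabels)  # expected: [['O', 'O', 'O', 'B-<insult>', 'I-<insult>', 'O']]
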
def to_vector_single(text, embeddings, maxlen=300):
    """
    Given a string, tokenize it, then convert it to a sequence of word embedding
    vectors with the provided embeddings, introducing <PAD> and <UNK> padding token
    vectors when appropriate
    """
    tokens = tokenizeAndFilterSimple(clean_text(text))
    window = tokens[-maxlen:]

    # TBD: use better initializers (uniform, etc.)
    x = np.zeros((maxlen, embeddings.embed_size), )

    # TBD: padding should be left and which vector do we use for padding?
    # and what about masking padding later for RNN?
    for i, word in enumerate(window):
        x[i, :] = embeddings.get_word_vector(word).astype('float32')

    return x

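# --- Usage sketch (not part of the original code) ---
# Illustrates the expected output shape of to_vector_single. DummyEmbeddings is a
# made-up stand-in exposing only the two members the function actually touches
# (embed_size and get_word_vector); the real embeddings object would return
# pre-trained vectors instead of the arbitrary ones generated here.
import numpy as np

class DummyEmbeddings:
    embed_size = 4

    def get_word_vector(self, word):
        # arbitrary pseudo-vector derived from the word's hash, instead of a real lookup
        rng = np.random.RandomState(abs(hash(word)) % (2 ** 32))
        return rng.rand(self.embed_size).astype('float32')

vec = to_vector_single("This is a short example .", DummyEmbeddings(), maxlen=10)
print(vec.shape)      # (10, 4): always (maxlen, embed_size)
print(vec[6:].sum())  # 0.0: positions beyond the 6 tokens keep the zero initialisation
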
def startElement(self, name, attrs):
    if self.accumulated != '':
        localTokens = tokenizeAndFilterSimple(self.accumulated)
        for token in localTokens:
            self.tokens.append(token)
            self.labels.append('O')
    if name == 'corpus' or name == 'DOC':
        # beginning of a document
        self.tokens = []
        self.labels = []
        self.sents = []
        self.allLabels = []
    if name == "sentence":
        # beginning of sentence
        self.tokens = []
        self.labels = []
        self.currentLabel = 'O'
    if name == "ENAMEX":
        # beginning of entity
        if attrs.getLength() != 0:
            #if attrs.getValue("type") != 'insult' and attrs.getValue("type") != 'threat':
            #    print("Invalid entity type:", attrs.getValue("type"))
            attribute_names = attrs.getNames()
            mainType = None
            if "type" in attrs:
                mainType = attrs.getValue("type")
            if "TYPE" in attrs:
                mainType = attrs.getValue("TYPE")
            if mainType is None:
                print('ENAMEX element without type attribute!')
            if "sub_type" in attrs:
                subType = attrs.getValue("sub_type")
            else:
                subType = ''
            if self.corpus_type == 'lemonde':
                self.currentLabel = '<' + self.translate_fr_labels(mainType, subType) + '>'
            else:
                self.currentLabel = '<' + mainType + '>'
    self.accumulated = ''

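# --- Input format sketch (not part of the original code) ---
# Hypothetical fragment in the ENAMEX XML layout this handler walks; the sentence
# content is invented for illustration. Assuming the matching endElement mirrors the
# <rs> handling shown earlier, "Ekeus" would be labelled B-<PERSON> and "Baghdad"
# B-<LOCATION> (for corpus_type != 'lemonde'), while every other token stays 'O'.
sample_enamex = """<corpus>
  <DOC>
    <sentence>U.N. official <ENAMEX type="PERSON">Ekeus</ENAMEX> heads for
      <ENAMEX type="LOCATION">Baghdad</ENAMEX> .</sentence>
  </DOC>
</corpus>"""
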
def __data_generation(self, index):
    'Generates data containing batch_size samples'
    max_iter = min(self.batch_size, len(self.x) - self.batch_size * index)

    # restrict data to index window
    sub_x = self.x[(index * self.batch_size):(index * self.batch_size) + max_iter]

    # tokenize texts in self.x if not already done
    max_length_x = 0
    if self.tokenize:
        x_tokenized = []
        for i in range(0, max_iter):
            tokens = tokenizeAndFilterSimple(sub_x[i])
            if len(tokens) > max_length_x:
                max_length_x = len(tokens)
            x_tokenized.append(tokens)
    else:
        for tokens in sub_x:
            if len(tokens) > max_length_x:
                max_length_x = len(tokens)
        x_tokenized = sub_x

    batch_x = np.zeros((max_iter, max_length_x, self.embeddings.embed_size), dtype='float32')
    if self.preprocessor.return_casing:
        batch_a = np.zeros((max_iter, max_length_x), dtype='float32')

    batch_y = None
    max_length_y = max_length_x
    if self.y is not None:
        # note: tags are always already "tokenized"
        batch_y = np.zeros((max_iter, max_length_y), dtype='float32')

    if self.embeddings.use_ELMo:
        #batch_x = to_vector_elmo(x_tokenized, self.embeddings, max_length_x)
        batch_x = to_vector_simple_with_elmo(x_tokenized, self.embeddings, max_length_x)

    # generate data
    for i in range(0, max_iter):
        # store sample embeddings
        if not self.embeddings.use_ELMo:
            batch_x[i] = to_vector_single(x_tokenized[i], self.embeddings, max_length_x)

        if self.preprocessor.return_casing:
            batch_a[i] = to_casing_single(x_tokenized[i], max_length_x)

        # store tag embeddings
        if self.y is not None:
            batch_y = self.y[(index * self.batch_size):(index * self.batch_size) + max_iter]

    if self.y is not None:
        batches, batch_y = self.preprocessor.transform(x_tokenized, batch_y)
    else:
        batches = self.preprocessor.transform(x_tokenized)

    batch_c = np.asarray(batches[0])
    batch_l = batches[1]

    if self.preprocessor.return_casing:
        return batch_x, batch_c, batch_a, batch_l, batch_y
    else:
        return batch_x, batch_c, batch_l, batch_y

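# --- Wiring sketch (not part of the original code) ---
# A minimal sketch of how __data_generation is typically exposed through a
# keras.utils.Sequence subclass so it can feed training batch by batch. The class
# name, the use of tensorflow.keras rather than standalone Keras, and the bare
# delegation in __getitem__ are assumptions; the actual generator class in the
# codebase may unpack and reorder the returned arrays differently.
import numpy as np
from tensorflow import keras

class BatchGenerator(keras.utils.Sequence):
    # __init__ setting self.x, self.y, self.batch_size, self.embeddings,
    # self.preprocessor and self.tokenize is omitted; __data_generation is the
    # method shown above and would live in this class.

    def __len__(self):
        # number of batches per epoch
        return int(np.ceil(len(self.x) / self.batch_size))

    def __getitem__(self, index):
        # Keras requests one batch at a time; delegate assembly to __data_generation
        return self.__data_generation(index)
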