def for_model_char_side(data, limit=1000, pad_len=60) -> List:
    """Encode every word of every sentence in ``data[0]`` as a padded
    character-embedding matrix.

    Each word becomes a (10, 300) matrix (character vectors padded to 10
    positions with zero vectors), and each sentence is then padded to
    *pad_len* words with all-zero (10, 300) matrices.

    NOTE(review): rewrites the word entries of ``data[0]`` in place —
    confirm callers do not need the original token structure afterwards.
    """
    char_matrix = load_char_embeddings(
        os.path.relpath('./data/char_vectors.txt'), limit=limit)
    encoded = []
    for sentence in data[0]:
        for idx in range(len(sentence)):
            sentence[idx] = t.padding(
                char_vectorize(sentence[idx][0], char_matrix),
                pad_len=10,
                pad_char=np.zeros(300).tolist())
        padded_sentence = t.padding(
            sentence,
            pad_len=pad_len,
            pad_char=np.zeros(shape=(10, 300)).tolist())
        encoded.append(padded_sentence)
    return encoded
def _run_noun_pred(self):
    """Run the session noun model over the processed tokens.

    Pads ``self.processed[0]`` to 60 rows of 301 zeros, feeds it to the
    model, and returns the argmax class index for each position.
    """
    model = cxsession.cortecx_sess.noun_model
    padded = t.padding(self.processed[0], 60, pad_char=np.zeros(301))
    scores = model.predict([[padded]])[0]
    return list(map(np.argmax, scores))
def _encode_smile(smile: str, embedding_matrix: Dict, padding: bool=True, pad_len: int=None, pad_char: int=0) -> List[int]:
    """Map each character of *smile* to its value in *embedding_matrix*,
    optionally padding the result to *pad_len* with *pad_char*.

    Raises:
        ValueError: if *padding* is True but *pad_len* is None.
        KeyError: if a character of *smile* is not in the matrix.
    """
    indices = [embedding_matrix[symbol] for symbol in smile]
    if not padding:
        return indices
    if pad_len is None:
        raise ValueError('pad_len must not be None')
    return t.padding(indices, pad_len=pad_len, pad_char=pad_char)
def _run_pos_pred(self):
    """Write the noun predictions into the last slot of each processed
    token, then run the POS model and return the argmax class index for
    each position."""
    noun_tags = self._run_noun_pred()
    for idx in range(len(self.processed[0])):
        self.processed[0][idx][-1] = noun_tags[idx]
    model = cxsession.cortecx_sess.pos_model
    padded = t.padding(self.processed[0], 60, pad_char=np.zeros(301))
    scores = model.predict([[padded]])[0]
    return list(map(np.argmax, scores))
def for_model_word_side(data, limit=1000000, pad_len=60) -> List:
    """Encode every sentence in ``data[0]`` as a sequence of 300-d word
    vectors padded to *pad_len* entries.

    Words missing from the embedding matrix are encoded as zero vectors.
    """
    vector_matrix = load_word_embeddings(
        os.path.relpath('./data/word_vectors.txt'), limit=limit)
    encoded = []
    for sentence in data[0]:
        vectors = [
            vector_matrix[token[0]] if token[0] in vector_matrix
            else np.zeros(300).tolist()
            for token in sentence
        ]
        vectors = t.padding(vectors, pad_len=pad_len,
                            pad_char=np.zeros(300).tolist())
        encoded.append(vectors)
    return encoded
def _run_ner_pred(self):
    """Convert the POS predictions to NER encoding, write them into the
    last slot of each processed token, then run the NER model and return
    the argmax class index for each position."""
    pos_tags = t.convert_encoding_pos_to_ner(self._run_pos_pred())
    for idx in range(len(self.processed[0])):
        self.processed[0][idx][-1] = pos_tags[idx]
    model = cxsession.cortecx_sess.ner_model
    padded = t.padding(self.processed[0], 60, pad_char=np.zeros(301))
    scores = model.predict([[padded]])[0]
    return list(map(np.argmax, scores))
def tag_output(data, tags_dict: Dict, num_classes=45, pad_len=60) -> List:
    """Integer-encode the tag sequences in ``data[1]`` via *tags_dict*,
    pad each to *pad_len* with 0, and return their one-hot encodings
    with *num_classes* categories.

    NOTE(review): rewrites ``data[1]`` in place.
    """
    sequences = data[1]
    for idx in range(len(sequences)):
        mapped = [tags_dict[tag] for tag in sequences[idx]]
        sequences[idx] = t.padding(mapped, pad_len=pad_len, pad_char=0)
    return [to_categorical(seq, num_classes=num_classes) for seq in sequences]
def encode(self):
    """Encode ``self.data`` according to ``self.kwargs['encode_type']``.

    Supported encodings:
        'binary'    - one-hot vector per token.
        'integer'   - 1-based integer id per token (first-seen order).
        'conll'     - (word-vector + POS id) rows per sentence, padded to
                      60 x 301, plus integer-encoded tag sequences;
                      expects ``self.data`` to be ``[sentences, tag_lists]``.
        'frequency' - each token replaced by its frequency in the data.

    Returns:
        The transformed data; for 'conll' a ``(matrices, tags)`` tuple.

    Raises:
        ValueError: for an unrecognized encoding type.
    """
    # Unwrap container objects down to raw tokens.
    if isinstance(self.data, DataObj):
        self.data = self.data.data
    if isinstance(self.data, Encoder):
        self.data = self.data.data
    if isinstance(self.data, str):
        self.data = t.Tokenizer(self.data).tokenize().tokens

    self.encode_type = self.kwargs['encode_type']

    # Unique tokens in first-seen order. List membership (not a set) on
    # purpose: for 'conll' input the elements are lists and unhashable.
    worker = []
    for element in self.data:
        if element not in worker:
            worker.append(element)

    if self.encode_type not in ['integer', 'binary', 'frequency', 'conll']:
        # BUG FIX: the message previously omitted "frequency" even though
        # it is an accepted encoding type.
        raise ValueError('Encoding type must be either "integer", "binary", '
                         '"frequency", or "conll"')

    if self.encode_type == 'binary':
        # One distinct one-hot vector per unique token.
        keys = {}
        for i, token in enumerate(worker):
            one_hot = [0] * len(worker)
            one_hot[i] = 1
            keys[token] = one_hot
        self.token_map = keys
        for element in self.data:
            self.transformed_data.append(keys[element])
        return self.transformed_data

    if self.encode_type == 'integer':
        # 1-based ids in first-seen order.
        keys = {token: i + 1 for i, token in enumerate(worker)}
        self.token_map = keys
        for element in self.data:
            self.transformed_data.append(keys[element])
        return self.transformed_data

    if self.encode_type == 'conll':
        # self.data is [sentences, tag_lists]; each sentence is a list of
        # records whose first slot is the word and second slot the POS tag
        # (presumably — TODO confirm against the CoNLL loader).
        ner = []

        # Integer-encode the tag sequences, each padded to length 60.
        tag_tokens = []
        for tag_seq in self.data[1]:
            tag_tokens.extend(tag_seq)
        tag_encoder = Encoder(data=tag_tokens, encode_type='integer')
        tag_encoder.encode()
        tag_map = tag_encoder.token_map
        for tag_seq in self.data[1]:
            encoded_seq = [tag_map[value] for value in tag_seq]
            ner.append(t.padding(encoded_seq, pad_len=60, pad_char=0))
        # As in the original, token_map keeps the TAG map, not the POS map.
        self.token_map = tag_map

        # Integer-encode the POS column (word[1]) of the sentences.
        self.data = self.data[0]
        pos_tokens = []
        for sentence in self.data:
            for word in sentence:
                if word[1] not in pos_tokens:
                    pos_tokens.append(word[1])
        pos_encoder = Encoder(data=pos_tokens, encode_type='integer')
        pos_encoder.encode()
        pos_map = pos_encoder.token_map

        # Build (word-vector + POS id) rows, padded to 60 rows of 301 zeros.
        # (The old unused max-sentence-length scan was removed: padding is
        # fixed at 60.)
        for sentence in self.data:
            rows = []
            for word in sentence:
                row = list(Word(word[0]).vector)
                row.append(float(pos_map[word[1]]))
                rows.append(row)
            self.transformed_data.append(
                t.padding(rows, pad_len=60, pad_char=list(np.zeros(301))))
        self.data = self.transformed_data
        return self.data, ner

    if self.encode_type == 'frequency':
        tokens = self.data
        # BUG FIX: counts must be taken over the ORIGINAL token list. The
        # old code replaced tokens in place while still calling
        # .index()/.count() on the partially-rewritten list, producing
        # wrong frequencies for repeated tokens
        # (e.g. ['a', 'b', 'a'] -> [2, 1, 1] instead of [2, 1, 2]).
        frequencies = [tokens.count(token) for token in tokens]
        tokens[:] = frequencies  # keep the original in-place mutation
        self.transformed_data = tokens
        return self.transformed_data