예제 #1
0
def for_model_char_side(data, limit=1000, pad_len=60) -> List:
    """Build padded character-level input for every sentence in ``data[0]``.

    Each word is turned into a list of character vectors via
    ``char_vectorize`` and padded to a fixed number of characters; each
    sentence (a list of those per-word matrices) is then padded to
    ``pad_len`` words.

    Unlike the earlier version, the sentences inside ``data`` are not
    overwritten: new lists are built, so the same ``data`` can still be
    passed to other encoders that read ``word[0]`` afterwards.

    Args:
        data: Indexable whose first item is an iterable of sentences; each
            sentence is an iterable of items whose element ``[0]`` is passed
            to ``char_vectorize`` (presumably the word string -- confirm
            against the caller).
        limit: Forwarded to ``load_char_embeddings``.
        pad_len: Target sentence length handed to ``t.padding``.

    Returns:
        One padded list of per-word character matrices per sentence.
    """
    # NOTE(review): 10 and 300 were inline literals; they look like the
    # characters-per-word cap and the embedding width -- confirm.
    chars_per_word = 10
    embedding_dim = 300

    char_matrix = load_char_embeddings(os.path.relpath('./data/char_vectors.txt'), limit=limit)
    encoded = []
    for sentence in data[0]:
        # Build a fresh list instead of assigning into ``sentence`` so the
        # caller's data keeps its original words.
        word_matrices = [
            t.padding(char_vectorize(word[0], char_matrix),
                      pad_len=chars_per_word,
                      pad_char=np.zeros(embedding_dim).tolist())
            for word in sentence
        ]
        encoded.append(
            t.padding(word_matrices, pad_len=pad_len,
                      pad_char=np.zeros(shape=(chars_per_word, embedding_dim)).tolist()))
    return encoded
예제 #2
0
    def _run_noun_pred(self):
        """Run the session's noun model on ``self.processed[0]``.

        The input is padded with ``t.padding`` (length argument 60, padding
        element ``np.zeros(301)``), wrapped as ``[[padded]]`` and passed to
        ``predict``. Only the first item of the prediction is used.

        Returns:
            A list holding ``np.argmax`` of every element of that first
            prediction item.
        """
        model = cxsession.cortecx_sess.noun_model
        padded = t.padding(self.processed[0], 60, pad_char=np.zeros(301))

        scores = model.predict([[padded]])[0]
        labels = []
        for row in scores:
            labels.append(np.argmax(row))
        return labels
예제 #3
0
def _encode_smile(smile: str, embedding_matrix: Dict, padding: bool=True,
                 pad_len: int=None, pad_char: int=0) -> List[int]:
    encoded = [embedding_matrix[char] for char in smile]
    if padding:
        if pad_len is None:
            raise ValueError('pad_len must not be None')
        encoded = t.padding(encoded, pad_len=pad_len, pad_char=pad_char)

    return encoded
예제 #4
0
    def _run_pos_pred(self):
        """Feed the noun predictions into ``self.processed[0]`` and run the POS model.

        The last slot of every row in ``self.processed[0]`` is overwritten
        with the noun prediction at the same index. The rows are then padded
        with ``t.padding`` (length argument 60, padding element
        ``np.zeros(301)``) and passed to the session's POS model as
        ``[[padded]]``.

        Returns:
            A list holding ``np.argmax`` of every element of the first item
            of the model's prediction.
        """
        noun_labels = self._run_noun_pred()
        for index in range(len(self.processed[0])):
            self.processed[0][index][-1] = noun_labels[index]

        model = cxsession.cortecx_sess.pos_model
        padded = t.padding(self.processed[0], 60, pad_char=np.zeros(301))

        scores = model.predict([[padded]])[0]
        labels = []
        for row in scores:
            labels.append(np.argmax(row))
        return labels
예제 #5
0
def for_model_word_side(data, limit=1000000, pad_len=60) -> List:
    """Look up a word vector for every token in ``data[0]`` and pad each sentence.

    Args:
        data: Indexable whose first item is an iterable of sentences; each
            sentence is an iterable of items whose element ``[0]`` is the
            lookup key (presumably the word string -- confirm against the
            caller).
        limit: Forwarded to ``load_word_embeddings``.
        pad_len: Target sentence length handed to ``t.padding``.

    Returns:
        One padded list of vectors per sentence. Keys missing from the
        loaded embeddings are represented by a 300-element zero vector.
    """
    vector_matrix = load_word_embeddings(os.path.relpath('./data/word_vectors.txt'), limit=limit)
    encoded = []
    for element in data[0]:
        vectors = []
        for point in element:
            try:
                vector = vector_matrix[point[0]]
            except KeyError:
                # Unknown key: fall back to an all-zero vector.
                vector = np.zeros(300).tolist()
            vectors.append(vector)
        # Pad once, after every word has been collected. Padding used to run
        # inside the word loop, so the sentence was padded after its first
        # word and later words were appended behind the padding; an empty
        # sentence was never padded at all.
        encoded.append(
            t.padding(vectors, pad_len=pad_len, pad_char=np.zeros(300).tolist()))
    return encoded
예제 #6
0
    def _run_ner_pred(self):
        """Feed the converted POS predictions into ``self.processed[0]`` and run the NER model.

        The POS predictions are passed through
        ``t.convert_encoding_pos_to_ner`` and written into the last slot of
        every row of ``self.processed[0]``. The rows are then padded with
        ``t.padding`` (length argument 60, padding element
        ``np.zeros(301)``) and passed to the session's NER model as
        ``[[padded]]``.

        Returns:
            A list holding ``np.argmax`` of every element of the first item
            of the model's prediction.
        """
        pos_labels = t.convert_encoding_pos_to_ner(self._run_pos_pred())

        for index in range(len(self.processed[0])):
            self.processed[0][index][-1] = pos_labels[index]

        model = cxsession.cortecx_sess.ner_model
        padded = t.padding(self.processed[0], 60, pad_char=np.zeros(301))

        scores = model.predict([[padded]])[0]
        labels = []
        for row in scores:
            labels.append(np.argmax(row))
        return labels
예제 #7
0
def tag_output(data, tags_dict: Dict, num_classes=45, pad_len=60) -> List:
    """Convert the tag sequences in ``data[1]`` to padded categorical targets.

    Every tag is mapped through ``tags_dict``, each sequence is padded with
    0 via ``t.padding``, and the padded sequences are passed to
    ``to_categorical``.

    ``data[1]`` is left untouched. The earlier version overwrote it with the
    padded integer ids, so a second call on the same ``data`` looked up
    integers in ``tags_dict``.

    Args:
        data: Indexable whose item ``[1]`` is an iterable of tag sequences.
        tags_dict: Mapping from tag to integer id.
        num_classes: Forwarded to ``to_categorical``.
        pad_len: Target sequence length handed to ``t.padding``.

    Returns:
        One ``to_categorical`` result per tag sequence.

    Raises:
        KeyError: A tag is missing from ``tags_dict``.
    """
    # Encode and pad every sequence first, so a lookup failure surfaces
    # before any categorical conversion, as it did previously.
    padded_ids = [
        t.padding([tags_dict[element] for element in tag],
                  pad_len=pad_len, pad_char=0)
        for tag in data[1]
    ]
    return [to_categorical(row, num_classes=num_classes) for row in padded_ids]
예제 #8
0
    def encode(self):
        """Encode ``self.data`` according to ``self.kwargs['encode_type']``.

        ``self.data`` is first unwrapped: a ``DataObj`` or ``Encoder`` is
        replaced by its ``.data`` attribute, and a string is tokenized with
        ``t.Tokenizer``. The encode type then selects the transformation:

        * ``'binary'``: each distinct token gets a one-hot list whose length
          is the number of distinct tokens. The mapping is stored in
          ``self.token_map``.
        * ``'integer'``: each distinct token gets an integer id, starting at
          1 in order of first appearance. The mapping is stored in
          ``self.token_map``.
        * ``'conll'``: ``self.data`` is read as ``(sentences, tag_sequences)``.
          Tag sequences are integer-encoded and padded with 0. Every word
          becomes ``list(Word(word[0]).vector)`` followed by the float
          integer id of ``word[1]``, and each sentence is padded with
          301-element zero rows. ``self.token_map`` holds the mapping built
          from the tag sequences.
        * ``'frequency'``: each token is replaced by the number of times it
          occurs in the data.

        Results for ``'binary'``, ``'integer'`` and ``'conll'`` are appended
        to ``self.transformed_data``, which must already be a list.

        Returns:
            ``self.transformed_data`` for ``'binary'``, ``'integer'`` and
            ``'frequency'``; the tuple ``(self.data, ner)`` for ``'conll'``,
            where ``ner`` is the list of padded tag-id sequences.

        Raises:
            KeyError: ``self.kwargs`` has no ``'encode_type'`` entry.
            ValueError: The encode type is not one of the four listed above.
        """
        # Sequential checks (not elif) so a DataObj wrapping an Encoder
        # wrapping a string is unwrapped all the way down.
        if isinstance(self.data, DataObj):
            self.data = self.data.data
        if isinstance(self.data, Encoder):
            self.data = self.data.data
        if isinstance(self.data, str):
            self.data = t.Tokenizer(self.data).tokenize().tokens

        self.encode_type = self.kwargs['encode_type']

        if self.encode_type not in ('integer', 'binary', 'frequency', 'conll'):
            # The message previously omitted "frequency" although it is an
            # accepted value.
            raise ValueError(
                'Encoding type must be either "integer", "binary", '
                '"frequency", or "conll"')

        if self.encode_type == 'binary':
            # Tokens become dict keys below, so they are hashable and
            # dict.fromkeys can de-duplicate while keeping first-seen order.
            vocabulary = list(dict.fromkeys(self.data))
            keys = {}
            for position, token in enumerate(vocabulary):
                one_hot = [0] * len(vocabulary)
                one_hot[position] = 1
                keys[token] = one_hot
            self.token_map = keys
            for element in self.data:
                self.transformed_data.append(keys[element])
            return self.transformed_data

        if self.encode_type == 'integer':
            vocabulary = list(dict.fromkeys(self.data))
            # Ids start at 1; the 'conll' branch pads id sequences with 0.
            keys = {token: number
                    for number, token in enumerate(vocabulary, start=1)}
            self.token_map = keys
            for element in self.data:
                self.transformed_data.append(keys[element])
            return self.transformed_data

        if self.encode_type == 'conll':
            tag_sequences = self.data[1]

            # Integer ids for every tag found in the tag sequences.
            all_tags = [value for element in tag_sequences for value in element]
            tag_encoder = Encoder(data=all_tags, encode_type='integer')
            tag_encoder.encode()
            tag_map = tag_encoder.token_map

            ner = []
            for element in tag_sequences:
                ids = [tag_map[value] for value in element]
                ner.append(t.padding(ids, pad_len=60, pad_char=0))
            self.token_map = tag_map

            self.data = self.data[0]

            # Integer ids for the second field of every word (presumably the
            # POS tag -- confirm against the data loader).
            word_tags = list(dict.fromkeys(
                word[1] for sentence in self.data for word in sentence))
            pos_encoder = Encoder(data=word_tags, encode_type='integer')
            pos_encoder.encode()
            pos_map = pos_encoder.token_map

            for sentence in self.data:
                rows = []
                for word in sentence:
                    new_word = list(Word(word[0]).vector)
                    new_word.append(float(pos_map[word[1]]))
                    rows.append(new_word)
                # NOTE(review): the 301-wide pad row suggests a 300-element
                # word vector plus the tag id -- confirm Word.vector's size.
                self.transformed_data.append(
                    t.padding(rows, pad_len=60,
                              pad_char=list(np.zeros(301))))
            self.data = self.transformed_data
            return self.data, ner

        # Validation above leaves 'frequency' as the only remaining type.
        from collections import Counter

        tokens = list(self.data)
        try:
            counts = Counter(tokens)
        except TypeError:
            # Unhashable tokens: fall back to equality-based counting.
            frequencies = [tokens.count(token) for token in tokens]
        else:
            frequencies = [counts[token] for token in tokens]
        # Counts come from an untouched snapshot. The earlier in-place
        # rewrite replaced tokens while still counting them, so repeated
        # tokens got wrong counts (['a', 'b', 'a'] produced [2, 1, 1]).
        # self.data is rebound to the result, matching the state the
        # in-place version left behind.
        self.data = frequencies
        self.transformed_data = frequencies
        return self.transformed_data