def _predict(self, doc: Document, return_float=False):
    """
    Predict the output of the Keras model for a single Document.

    The document's words are truncated to the model's sample length,
    embedded with the word2vec model, scaled with ``self.scaler`` and fed
    to ``self.keras_model`` on the CPU.

    :param doc: Document object whose ``get_all_words()`` supplies the words
    :param return_float: for multi-output models, cast the winning label to
        ``float`` before returning it
    :return: if the model has a single output unit, that raw prediction as
        a ``float``; otherwise the entry of ``self.labels`` paired with the
        highest predicted value (as ``float`` when ``return_float`` is set)
    """
    # Presumably configures TensorFlow GPU memory growth - defined elsewhere,
    # confirm against its definition.
    set_tf_growth()

    model = self.keras_model
    multi_input = isinstance(model.input, list)

    # Multi-input models report one shape per input; the first is used.
    input_shape = model.input_shape[0] if multi_input else model.input_shape
    _, sample_length, embedding_size = input_shape

    word_vectors = self.word2vec_model.wv
    words = doc.get_all_words()[:sample_length]

    # Rows of out-of-vocabulary words (and padding) are left as zeros.
    x_matrix = np.zeros((1, sample_length, embedding_size))
    for position, word in enumerate(words):
        if word in word_vectors:
            word_vector = word_vectors[word].reshape(1, -1)
            x_matrix[0, position] = self.scaler.transform(
                word_vector, copy=True)[0]

    # The same matrix is fed to every input of a multi-input model.
    x = [x_matrix] * len(model.input) if multi_input else [x_matrix]

    with tf.device('/cpu:0'):
        y_predicted = model.predict(x)

    # Single output unit: hand back the raw prediction.
    if model.output_shape[1] == 1:
        return float(y_predicted[0][0])

    # TODO make this return weighted avg or max prob a param
    # Max probability, corresponding to standard keras methodology.
    # max() returns the first maximal pair, which is the same element a
    # stable descending sort would place first.
    best_label, _ = max(
        zip(self.labels, y_predicted[0]),
        key=lambda elem: elem[1],
    )
    return float(best_label) if return_float else best_label
def build_x_and_y(data: DataList, **kwargs):
    """
    Build (X, y) data matrices from a collection of labelled examples.

    :param data: sized iterable of examples; each example is indexed with
        ``'text'`` (passed to ``Document``) and ``'label'``
    :param kwargs: additional necessary data for matrix building:
        ``label_indices`` - mapping from label to its column in y,
        ``word2vec_model`` - embedding model exposing ``wv`` and
        ``vector_size``,
        ``scaler`` - object whose ``transform`` is applied to each vector,
        ``nn_model`` - the network the data is built for (may be falsy),
        ``regression`` - optional, defaults to False; when true y holds
        one float per example instead of a one-hot boolean row
    :return: a tuple (X, y) where X is a list holding the input matrix once
        per model input (once if the model is not multi-input)
    """
    label_indices = kwargs['label_indices']
    word2vec_model = kwargs['word2vec_model']
    scaler = kwargs['scaler']
    nn_model = kwargs['nn_model']
    regression = kwargs.get('regression', False)

    n_examples = len(data)
    word_vectors = word2vec_model.wv

    x_matrix = np.zeros(
        (n_examples, SAMPLE_LENGTH, word2vec_model.vector_size))
    if regression:
        # np.float_ was removed in NumPy 2.0; float64 is the same dtype.
        y_matrix = np.zeros((n_examples, 1), dtype=np.float64)
    else:
        y_matrix = np.zeros((n_examples, len(label_indices)), dtype=np.bool_)

    for doc_id, example in enumerate(data):
        words = Document(example['text']).get_all_words()[:SAMPLE_LENGTH]

        # Rows of out-of-vocabulary words (and padding) are left as zeros.
        for i, w in enumerate(words):
            if w in word_vectors:
                word_vector = word_vectors[w].reshape(1, -1)
                x_matrix[doc_id, i] = scaler.transform(
                    word_vector, copy=True)[0]

        label = example['label']
        if regression:
            y_matrix[doc_id] = float(label)
        else:
            y_matrix[doc_id, label_indices[label]] = True

    # The same matrix is repeated for every input of a multi-input model.
    if nn_model and isinstance(nn_model.input, list):
        return [x_matrix] * len(nn_model.input), y_matrix
    return [x_matrix], y_matrix
def build_x_and_y(filenames, file_directory, **kwargs):
    """
    Given file names and their directory, build (X, y) data matrices

    :param filenames: sized iterable of strings showing file ids
        (no extension); ``'.txt'`` is appended to locate each file
    :param file_directory: path to a directory where those files lie
    :param kwargs: additional necessary data for matrix building:
        ``label_indices`` - mapping from label to its column in y,
        ``word2vec_model`` - word embedding lookup,
        ``scaler`` - object whose ``transform`` is applied to each vector,
        ``nn_model`` - the network the data is built for (may be falsy)
    :return: a tuple (X, y), or a dict ``{'input': X, 'output': y}`` when
        ``nn_model`` is a ``Graph``; X is a list holding the input matrix
        once per model input
    """
    label_indices = kwargs['label_indices']
    word2vec_model = kwargs['word2vec_model']
    scaler = kwargs['scaler']
    nn_model = kwargs['nn_model']

    # Newer gensim models expose their vectors on `.wv` and no longer
    # support lookups on the model itself; fall back to the object itself
    # for older models or plain mappings.
    word_vectors = getattr(word2vec_model, 'wv', word2vec_model)

    # Loop-invariant: the labels the answers are filtered by.
    known_labels = set(label_indices)

    x_matrix = np.zeros((len(filenames), SAMPLE_LENGTH, EMBEDDING_SIZE))
    y_matrix = np.zeros((len(filenames), len(label_indices)), dtype=np.bool_)

    for doc_id, fname in enumerate(filenames):
        doc_file = fname + '.txt'
        doc = Document(doc_id, os.path.join(file_directory, doc_file))
        words = doc.get_all_words()[:SAMPLE_LENGTH]

        # Rows of out-of-vocabulary words (and padding) are left as zeros.
        for i, w in enumerate(words):
            if w in word_vectors:
                word_vector = word_vectors[w].reshape(1, -1)
                x_matrix[doc_id, i] = scaler.transform(
                    word_vector, copy=True)[0]

        labels = get_answers_for_doc(
            doc_file,
            file_directory,
            filtered_by=known_labels,
        )
        for lab in labels:
            y_matrix[doc_id, label_indices[lab]] = True

    # The same matrix is repeated for every input of a multi-input model.
    if nn_model and isinstance(nn_model.input, list):
        x_data = [x_matrix] * len(nn_model.input)
    else:
        x_data = [x_matrix]

    # Graph models take their data as a dict rather than a tuple.
    if isinstance(nn_model, Graph):
        return {'input': x_data, 'output': y_matrix}
    return x_data, y_matrix