Example #1
  def _predict(self, doc: Document, return_float=False):
    """
    Predict labels for a given Document object
    :param doc: Document object
    :return: list of labels with corresponding confidence intervals
    """
    set_tf_growth()
    # Infer (sample_length, embedding_size) from the model's input shape;
    # multi-input models expose a list of input shapes.
    if isinstance(self.keras_model.input, list):
      _, sample_length, embedding_size = self.keras_model.input_shape[0]
    else:
      _, sample_length, embedding_size = self.keras_model.input_shape
    words = doc.get_all_words()[:sample_length]
    x_matrix = np.zeros((1, sample_length, embedding_size))

    # Embed each known word and scale it with the scaler fitted at training time.
    for i, w in enumerate(words):
      if w in self.word2vec_model.wv:
        word_vector = self.word2vec_model.wv[w].reshape(1, -1)
        scaled_vector = self.scaler.transform(word_vector, copy=True)[0]
        x_matrix[0][i] = scaled_vector

    # Multi-input models receive the same matrix on every input branch.
    if isinstance(self.keras_model.input, list):
      x = [x_matrix] * len(self.keras_model.input)
    else:
      x = [x_matrix]

    with tf.device('/cpu:0'):
      y_predicted = self.keras_model.predict(x)
    # TODO: make the choice between a weighted average of labels and the
    # max-probability label a parameter; max probability matches standard
    # Keras methodology.
    if self.keras_model.output_shape[1] == 1:
      # Single-output (regression) model: return the raw prediction.
      return float(y_predicted[0][0])
    elif return_float:
      # Labels are assumed to be numeric strings; cast the winner to float.
      zipped = zip(self.labels, y_predicted[0])
      return float(max(zipped, key=lambda elem: elem[1])[0])
    else:
      # Return the label with the highest predicted probability.
      zipped = zip(self.labels, y_predicted[0])
      return max(zipped, key=lambda elem: elem[1])[0]
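
The helper set_tf_growth() called at the top of _predict is not shown in the example. A minimal sketch of what it might do, assuming it only turns on TensorFlow GPU memory growth (the name comes from the snippet; the body below is an assumption, not the project's actual implementation):

import tensorflow as tf

def set_tf_growth():
  # Assumed behavior: let TensorFlow allocate GPU memory on demand
  # instead of reserving all of it up front.
  for gpu in tf.config.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)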
Example #2
def build_x_and_y(data: DataList, **kwargs):
  """
  Given file names and their directory, build (X, y) data matrices
  :param filenames: iterable of strings showing file ids (no extension)
  :param file_directory: path to a directory where those files lie
  :param kwargs: additional necessary data for matrix building e.g. scaler

  :return: a tuple (X, y)
  """
  label_indices = kwargs['label_indices']
  word2vec_model = kwargs['word2vec_model']
  scaler = kwargs['scaler']
  nn_model = kwargs['nn_model']
  regression = kwargs.get('regression', False)

  x_matrix = np.zeros(
      (len(data),
       SAMPLE_LENGTH,
       word2vec_model.vector_size))
  if regression:
    # Regression targets: a single float per example.
    y_matrix = np.zeros((len(data), 1), dtype=np.float64)
  else:
    # Classification targets: a boolean indicator column per label.
    y_matrix = np.zeros((len(data), len(label_indices)), dtype=np.bool_)

  for doc_id, example in enumerate(data):
    doc = Document(example['text'])
    words = doc.get_all_words()[:SAMPLE_LENGTH]

    # Embed each known word and scale it with the fitted scaler.
    for i, w in enumerate(words):
      if w in word2vec_model.wv:
        word_vector = word2vec_model.wv[w].reshape(1, -1)
        x_matrix[doc_id][i] = scaler.transform(word_vector, copy=True)[0]

    labels = [example['label']]

    for lab in labels:
      if regression:
        y_matrix[doc_id] = float(lab)
      else:
        index = label_indices[lab]
        y_matrix[doc_id][index] = True

  if nn_model and isinstance(nn_model.input, list):
    return [x_matrix] * len(nn_model.input), y_matrix
  else:
    return [x_matrix], y_matrix
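
A hedged usage sketch for this variant: the scaler is assumed to be a scikit-learn StandardScaler fitted on the word2vec vectors, and every name except build_x_and_y itself is illustrative:

from sklearn.preprocessing import StandardScaler

# Assumption: inputs are scaled with a StandardScaler fitted on all word vectors.
scaler = StandardScaler().fit(word2vec_model.wv.vectors)

x, y = build_x_and_y(
    data,                                 # list of {'text': ..., 'label': ...} dicts
    label_indices={'neg': 0, 'pos': 1},   # hypothetical label -> column map
    word2vec_model=word2vec_model,        # a trained gensim Word2Vec model
    scaler=scaler,
    nn_model=None,                        # no model yet: returns ([x_matrix], y)
    regression=False,
)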
Example #3
def build_x_and_y(filenames, file_directory, **kwargs):
    """
    Given file names and their directory, build (X, y) data matrices
    :param filenames: iterable of strings showing file ids (no extension)
    :param file_directory: path to a directory where those files lie
    :param kwargs: additional necessary data for matrix building e.g. scaler

    :return: a tuple (X, y)
    """
    label_indices = kwargs['label_indices']
    word2vec_model = kwargs['word2vec_model']
    scaler = kwargs['scaler']
    nn_model = kwargs['nn_model']

    x_matrix = np.zeros((len(filenames), SAMPLE_LENGTH, EMBEDDING_SIZE))
    y_matrix = np.zeros((len(filenames), len(label_indices)), dtype=np.bool_)

    for doc_id, fname in enumerate(filenames):
        doc = Document(doc_id, os.path.join(file_directory, fname + '.txt'))
        words = doc.get_all_words()[:SAMPLE_LENGTH]

        for i, w in enumerate(words):
            if w in word2vec_model:
                word_vector = word2vec_model[w].reshape(1, -1)
                x_matrix[doc_id][i] = scaler.transform(word_vector,
                                                       copy=True)[0]

        labels = get_answers_for_doc(
            fname + '.txt',
            file_directory,
            filtered_by=set(label_indices.keys()),
        )

        for lab in labels:
            index = label_indices[lab]
            y_matrix[doc_id][index] = True

    if nn_model and isinstance(nn_model.input, list):
        return_data = [x_matrix] * len(nn_model.input), y_matrix
    else:
        return_data = [x_matrix], y_matrix

    # The legacy Keras Graph API trains on a dict of named inputs/outputs.
    if isinstance(nn_model, Graph):
        return {'input': return_data[0], 'output': return_data[1]}
    else:
        return return_data
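
Note that this older variant adapts its return value to the model type: the legacy Keras Graph API trained on a dict of named arrays rather than an (X, y) pair. A hedged consumption sketch, assuming Keras-0.x-era fit() signatures (nb_epoch and the dict keys are assumptions about that old API):

train_data = build_x_and_y(filenames, 'data/', label_indices=label_indices,
                           word2vec_model=word2vec_model, scaler=scaler,
                           nn_model=nn_model)

if isinstance(train_data, dict):
    # Legacy Graph model: train on the {'input': ..., 'output': ...} dict.
    nn_model.fit(train_data, nb_epoch=10)
else:
    x, y = train_data
    nn_model.fit(x, y, nb_epoch=10)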