Example #1
 def __iter__(self):
     files = {filename[:-4] for filename in os.listdir(self.dirname)}
     for doc_id, fname in enumerate(files):
         d = Document(doc_id, os.path.join(self.dirname,
                                           fname + '.txt'))
         for sentence in d.read_sentences():
             yield sentence
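These `__iter__` methods are sentence streams meant to be handed to gensim's Word2Vec, which needs a re-iterable object (not a one-shot generator) because it passes over the corpus once for vocabulary building and again for each training epoch. A minimal hookup sketch, assuming a hypothetical wrapper class name, gensim >= 4.0 argument names, and that magpie's Document class is importable; this is not the project's actual training code:

import os
from gensim.models import Word2Vec

class SentenceCorpus:
    """Hypothetical wrapper holding the __iter__ shown above."""
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        files = {filename[:-4] for filename in os.listdir(self.dirname)}
        for doc_id, fname in enumerate(files):
            d = Document(doc_id, os.path.join(self.dirname, fname + '.txt'))
            for sentence in d.read_sentences():
                yield sentence

# Word2Vec re-iterates the corpus several times, so it is given the object,
# not the exhausted generator it would get from calling __iter__ once.
w2v = Word2Vec(SentenceCorpus('data/corpus'), vector_size=100, min_count=5)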
Example #2
 def __iter__(self):
     files = {filename[:-4] for filename in os.listdir(self.dirname)}
     for doc_id, fname in enumerate(files):
         print(str(doc_id) + '||||' + fname)  # I added this line to show file-reading progress; otherwise the wait gets nerve-racking
         d = Document(doc_id, os.path.join(self.dirname,
                                           fname + '.txt'))
         for sentence in d.read_sentences():
             yield sentence
Example #3
def build_x_and_y(data: DataList, **kwargs):
  """
  Given file names and their directory, build (X, y) data matrices
  :param filenames: iterable of strings showing file ids (no extension)
  :param file_directory: path to a directory where those files lie
  :param kwargs: additional necessary data for matrix building e.g. scaler

  :return: a tuple (X, y)
  """
  label_indices = kwargs['label_indices']
  word2vec_model = kwargs['word2vec_model']
  scaler = kwargs['scaler']
  nn_model = kwargs['nn_model']
  regression = kwargs.get('regression', False)

  x_matrix = np.zeros(
      (len(data),
       SAMPLE_LENGTH,
       word2vec_model.vector_size))
  if regression:
    # print('YES REGRESSION')
    y_matrix = np.zeros((len(data), 1), dtype=np.float_)
    # print(y_matrix)
  else:
    # print('NOT REGRESSION')
    y_matrix = np.zeros((len(data), len(label_indices)), dtype=np.bool_)

  for doc_id, example in enumerate(data):
    doc = Document(example['text'])
    words = doc.get_all_words()[:SAMPLE_LENGTH]

    for i, w in enumerate(words):
      if w in word2vec_model.wv:
        word_vector = word2vec_model.wv[w].reshape(1, -1)
        x_matrix[doc_id][i] = scaler.transform(word_vector, copy=True)[0]

    labels = [example['label']]

    for lab in labels:
      if regression:
        y_matrix[doc_id] = float(lab)
      else:
        index = label_indices[lab]
        y_matrix[doc_id][index] = True

  if nn_model and isinstance(nn_model.input, list):
    return [x_matrix] * len(nn_model.input), y_matrix
  else:
    return [x_matrix], y_matrix
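A hedged call-site sketch for this variant, assuming hypothetical objects (`w2v`, `scaler`, `keras_model`) and that each element of `data` is a dict with 'text' and 'label' keys, as the loop above reads them; this is not the project's actual training script:

# All names below are assumptions for illustration, not magpie's own code.
data = [
    {'text': 'stochastic gradient descent for text models', 'label': 'ml'},
    {'text': 'quarterly inflation and interest rates', 'label': 'econ'},
]
label_indices = {'ml': 0, 'econ': 1}

x, y = build_x_and_y(
    data,
    label_indices=label_indices,
    word2vec_model=w2v,      # a trained gensim Word2Vec model
    scaler=scaler,           # a StandardScaler already fitted on the word vectors
    nn_model=keras_model,    # the Keras model that will consume the matrices
)
# x is a list with one matrix per model input; y has shape (len(data), len(label_indices)).
keras_model.fit(x, y, epochs=1, batch_size=2)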
Example #4
def get_documents_from_mongo(ids,
                             mongo_collection,
                             as_generator=True,
                             shuffle=False):
    """
    Extract documents from *.txt files in a given directory
    :param data_dir: path to the directory with .txt files
    :param as_generator: flag whether to return a document generator or a list
    :param shuffle: flag whether to return the documents
    in a shuffled vs sorted order

    :return: generator or a list of Document objects
    """
    print("get document from mongo!")
    if shuffle:
        random.shuffle(ids)

    docs_step = 500000
    steps_times = len(ids) // docs_step
    steps = [docs_step * i for i in range(steps_times + 1)] + [len(ids)]
    cursors = [
        mongo_collection.find({"_id": {
            "$in": ids[steps[i - 1]:steps[i]]
        }}) for i in range(1, len(steps))
    ]
    all_docs = (x for c in cursors for x in c)
    generator = (Document(doc_id, None, text=d["full_text"])
                 for doc_id, d in enumerate(all_docs))
    return generator if as_generator else list(generator)
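The ids are cut into 500 000-element slices so that no single `$in` query has to carry the whole id list, and the per-slice cursors are chained into one lazy generator. A usage sketch, assuming a local MongoDB whose documents store their text under a `full_text` field; the database and collection names are made up:

from pymongo import MongoClient

# Illustrative connection details only.
collection = MongoClient('localhost', 27017)['corpus_db']['articles']
doc_ids = [d['_id'] for d in collection.find({}, {'_id': 1})]

for doc in get_documents_from_mongo(doc_ids, collection, shuffle=True):
    pass  # each item is a magpie Document built from the 'full_text' field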
Example #5
 def predict_from_text(self, text):
     """
     Predict labels for a given string of text
     :param text: string or unicode with the text
     :return: list of labels with corresponding confidence intervals
     """
     doc = Document(0, None, text=text)
     return self._predict(doc)
Example #6
def build_x_and_y(filenames, file_directory, **kwargs):
    """
    Given file names and their directory, build (X, y) data matrices
    :param filenames: iterable of strings showing file ids (no extension)
    :param file_directory: path to a directory where those files lie
    :param kwargs: additional necessary data for matrix building e.g. scaler

    :return: a tuple (X, y)
    """
    label_indices = kwargs['label_indices']
    word2vec_model = kwargs['word2vec_model']
    scaler = kwargs['scaler']
    nn_model = kwargs['nn_model']

    x_matrix = np.zeros((len(filenames), SAMPLE_LENGTH, EMBEDDING_SIZE))
    y_matrix = np.zeros((len(filenames), len(label_indices)), dtype=np.bool_)

    for doc_id, fname in enumerate(filenames):
        doc = Document(doc_id, os.path.join(file_directory, fname + '.txt'))
        words = doc.get_all_words()[:SAMPLE_LENGTH]

        for i, w in enumerate(words):
            if w in word2vec_model:
                word_vector = word2vec_model[w].reshape(1, -1)
                x_matrix[doc_id][i] = scaler.transform(word_vector,
                                                       copy=True)[0]

        labels = get_answers_for_doc(
            fname + '.txt',
            file_directory,
            filtered_by=set(label_indices.keys()),
        )

        for lab in labels:
            index = label_indices[lab]
            y_matrix[doc_id][index] = True

    if nn_model and type(nn_model.input) == list:
        return_data = [x_matrix] * len(nn_model.input), y_matrix
    else:
        return_data = [x_matrix], y_matrix

    if type(nn_model) == Graph:
        return {'input': return_data[0], 'output': return_data[1]}
    else:
        return return_data
Example #7
def build_x_and_y(filenames, file_directory, **kwargs):
    """
    Given file names and their directory, build (X, y) data matrices
    :param filenames: iterable of strings showing file ids (no extension)
    :param file_directory: path to a directory where those files lie
    :param kwargs: additional necessary data for matrix building e.g. scaler

    :return: a tuple (X, y)
    """
    label_indices = kwargs['label_indices']
    word2vec_model = kwargs['word2vec_model']
    scaler = kwargs['scaler']
    nn_model = kwargs['nn_model']

    x_matrix = np.zeros((len(filenames), SAMPLE_LENGTH, EMBEDDING_SIZE))
    y_matrix = np.zeros((len(filenames), len(label_indices)), dtype=np.bool_)

    for doc_id, fname in enumerate(filenames):
        doc = Document(doc_id, os.path.join(file_directory, fname + '.txt'))
        words = doc.get_all_words()[:SAMPLE_LENGTH]

        for i, w in enumerate(words):
            if w in word2vec_model:
                word_vector = word2vec_model[w].reshape(1, -1)
                x_matrix[doc_id][i] = scaler.transform(word_vector, copy=True)[0]

        labels = get_answers_for_doc(
            fname + '.txt',
            file_directory,
            filtered_by=set(label_indices.keys()),
        )

        for lab in labels:
            index = label_indices[lab]
            y_matrix[doc_id][index] = True

    if nn_model and type(nn_model.input) == list:
        return_data = [x_matrix] * len(nn_model.input), y_matrix
    else:
        return_data = [x_matrix], y_matrix

    if type(nn_model) == Graph:
        return {'input': return_data[0], 'output': return_data[1]}
    else:
        return return_data
Example #8
    def predict_from_file(self, filepath):
        """
        Predict labels for a txt file
        :param filepath: path to the file

        :return: list of labels with corresponding confidence intervals
        """
        doc = Document(0, filepath)
        return self._predict(doc)
Example #9
  def _predict(self, doc: Document, return_float=False):
    """
    Predict labels for a given Document object
    :param doc: Document object
    :return: list of labels with corresponding confidence intervals
    """
    set_tf_growth()
    if isinstance(self.keras_model.input, list):
      _, sample_length, embedding_size = self.keras_model.input_shape[0]
    else:
      _, sample_length, embedding_size = self.keras_model.input_shape
    words = doc.get_all_words()[:sample_length]
    x_matrix = np.zeros((1, sample_length, embedding_size))

    for i, w in enumerate(words):
      if w in self.word2vec_model.wv:
        word_vector = self.word2vec_model.wv[w].reshape(1, -1)
        scaled_vector = self.scaler.transform(word_vector, copy=True)[0]
        x_matrix[0][i] = scaled_vector

    if isinstance(self.keras_model.input, list):
      x = [x_matrix] * len(self.keras_model.input)
    else:
      x = [x_matrix]

    with tf.device('/cpu:0'):
      y_predicted = self.keras_model.predict(x)
    # return weighted avg of labels
    # return reduce(lambda acc, x: acc + (x[0] * x[1]), zipped, 1) #weighted avg
    # TODO make this return weighted avg or max prob a param
    # max probability, corresponding to standard keras methodology
    # print(f'model output shape {self.keras_model.output_shape}')
    if self.keras_model.output_shape[1] == 1:
      # print(f'returning {y_predicted[0][0]}')
      float_y_pred = float(y_predicted[0][0])
      # if not isinstance(y_predicted[0][0], float):
      #   print(type(y_predicted[0][0]))
      #   print(y_predicted, y_predicted[0][0])
      assert(isinstance(float_y_pred, float))
      # print(float_y_pred)
      return float_y_pred
    elif return_float:
      zipped = zip(self.labels, y_predicted[0])
      return float(
          sorted(
              zipped,
              key=lambda elem: elem[1],
              reverse=True)[0][0])
    else:
      zipped = zip(self.labels, y_predicted[0])
      return sorted(zipped, key=lambda elem: elem[1], reverse=True)[0][0]
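For the multi-label branch, sorting the (label, probability) pairs and taking the first element is simply an argmax over the output vector (ties aside). The equivalence is shown in this small sketch with made-up labels and probabilities, not tied to any particular model:

import numpy as np

labels = ['ml', 'econ', 'bio']                 # illustrative label set
y_predicted = np.array([[0.1, 0.7, 0.2]])      # fake single-sample model output

top_by_sort = sorted(zip(labels, y_predicted[0]),
                     key=lambda elem: elem[1], reverse=True)[0][0]
top_by_argmax = labels[int(np.argmax(y_predicted[0]))]
assert top_by_sort == top_by_argmax            # both pick the most probable label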
Example #10
 def predict_from_text(self, text, test=False, return_float=False):
   """
   Predict labels for a given string of text
   :param text: string or unicode with the text
   :return: list of labels with corresponding confidence intervals
   """
   if hasattr(self, 'training_set') and not test:
     if text in self.training_set:
       print(f'found text in training set: {text}')
     # assert(not test or (text not in self.training_set))
   # else:
   #   # print("pretrained model not checking for test train split")
   doc = Document(text)
   return self._predict(doc, return_float=return_float)
Example #11
File: main.py  Project: zhang45258/magpie
    def predict_from_file(self, filepath):
        """
        Predict labels for a txt file
        :param filepath: path to the file

        :return: list of labels with corresponding confidence intervals
        """
        doc = Document(0, filepath)
        return self._predict(doc)
Example #12
def get_documents(data_dir, as_generator=True, shuffle=False):
    """
    Extract documents from *.txt files in a given directory
    :param data_dir: path to the directory with .txt files
    :param as_generator: flag whether to return a document generator or a list
    :param shuffle: flag whether to return the documents
    in a shuffled vs sorted order

    :return: generator or a list of Document objects
    """
    files = list({filename[:-4] for filename in os.listdir(data_dir)})
    files.sort()
    if shuffle:
        random.shuffle(files)

    generator = (Document(doc_id, os.path.join(data_dir, f + '.txt'))
                 for doc_id, f in enumerate(files))
    return generator if as_generator else list(generator)
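A short usage sketch; the directory path is illustrative, and the generator form lets a large corpus be consumed lazily rather than loaded into memory at once:

# Iterate lazily over a directory of .txt files; the path is an assumption.
for doc in get_documents('data/corpus', as_generator=True, shuffle=False):
    words = doc.get_all_words()   # used the same way as in build_x_and_y above
    print(len(words))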
Example #13
def fit_scaler(data: DataList, word2vec_model,
               batch_size=1024, persist_to_path=None):
  """ Get all the word2vec vectors in a 2D matrix and fit the scaler on it.
   This scaler can be used afterwards for normalizing feature matrices. """
  if isinstance(word2vec_model, str):
    word2vec_model = Word2Vec.load(word2vec_model)

  # TODO add other non-text features here
  doc_generator = iter([Document(example['text']) for example in data])
  scaler = StandardScaler(copy=False)

  no_more_samples = False
  while not no_more_samples:
    batch = []
    for i in range(batch_size):
      try:
        batch.append(six.next(doc_generator))
      except StopIteration:
        no_more_samples = True
        break

    vectors = []
    for doc in batch:
      for word in doc.get_all_words():
        if word in word2vec_model.wv:
          vectors.append(word2vec_model.wv[word])

    matrix = np.array(vectors)
    print("Fitted to {} vectors".format(matrix.shape[0]))

    scaler.partial_fit(matrix)

  if persist_to_path:
    save_to_disk(persist_to_path, scaler)

  return scaler
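fit_scaler streams the corpus in batches and calls StandardScaler.partial_fit on each batch's word vectors, so the full embedding matrix never has to fit in memory. A pipeline sketch tying it to the build_x_and_y variant from Example #3; every object name and path below is an assumption for illustration, not magpie's actual entry point:

from gensim.models import Word2Vec

# Assumed inputs: train_data, label_indices and keras_model are defined elsewhere.
w2v = Word2Vec.load('models/word2vec.bin')           # path is an assumption
scaler = fit_scaler(train_data, w2v,
                    persist_to_path='models/scaler.pickle')

x, y = build_x_and_y(
    train_data,
    label_indices=label_indices,
    word2vec_model=w2v,
    scaler=scaler,                                    # the scaler fitted above
    nn_model=keras_model,
)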
Example #14
 def __iter__(self):
     files = {filename[:-4] for filename in os.listdir(self.dirname)}
     for doc_id, fname in enumerate(files):
         d = Document(doc_id, os.path.join(self.dirname, fname + '.txt'))
         for sentence in d.read_sentences():
             yield sentence