def preprocess_data(self, dataset, y_dataset):
    """Project documents through the count vectorizer and the model.

    The documents are passed to ``process_dataset``; each resulting token
    sequence is joined into a single space-separated string, vectorized
    with ``self.count_vectorizer.transform`` (no refitting happens here)
    and finally handed to ``self.model.transform``.

    Args:
        dataset: Raw documents, forwarded unchanged to ``process_dataset``.
            The result must expose ``.map`` and ``.values`` (presumably a
            pandas Series -- confirm against callers).
        y_dataset: Not used by this method.

    Returns:
        The value returned by ``self.model.transform`` for the
        document-term matrix.
    """
    class_name = self.__class__.__name__
    logging.info("Transform data on " + class_name)

    def join_tokens(tokens):
        # The vectorizer consumes plain strings, not token sequences.
        return ' '.join(tokens)

    token_series = process_dataset(dataset)
    sentences = token_series.map(join_tokens)
    # Cast to unicode so every entry reaches the vectorizer as text.
    unicode_docs = sentences.values.astype('U')
    doc_term_matrix = self.count_vectorizer.transform(unicode_docs)
    return self.model.transform(doc_term_matrix)
def preprocess_data(self, dataset, y_dataset):
    """Project documents through the tf-idf vectorizer and the model.

    The documents are passed to ``process_dataset``; each resulting token
    sequence is joined into a single space-separated string, vectorized
    with ``self.tfidf_vectorizer.transform`` (no refitting happens here)
    and finally handed to ``self.model.transform``.

    Args:
        dataset: Raw documents, forwarded unchanged to ``process_dataset``.
            The result must expose ``.map`` and ``.values`` (presumably a
            pandas Series -- confirm against callers).
        y_dataset: Not used by this method.

    Returns:
        The value returned by ``self.model.transform`` for the tf-idf
        matrix.
    """
    class_name = self.__class__.__name__
    logging.info("Transforming data on " + class_name)

    def join_tokens(tokens):
        # The vectorizer consumes plain strings, not token sequences.
        return ' '.join(tokens)

    token_series = process_dataset(dataset)
    sentences = token_series.map(join_tokens)
    # Cast to unicode so every entry reaches the vectorizer as text.
    unicode_docs = sentences.values.astype('U')
    tfidf_matrix = self.tfidf_vectorizer.transform(unicode_docs)
    return self.model.transform(tfidf_matrix)
def preprocess_data(self, dataset, y_dataset):
    """Infer one vector per document with ``self.model.infer_vector``.

    Args:
        dataset: Raw documents, forwarded unchanged to ``process_dataset``.
            The result must expose ``.tolist()``.
        y_dataset: Not used by this method.

    Returns:
        A list holding the result of ``self.model.infer_vector`` for each
        processed document, in input order.
    """
    logging.info("Transforming data on " + self.__class__.__name__)
    tokenized_docs = process_dataset(dataset).tolist()
    # Walk the documents directly; positions are not needed.
    return [self.model.infer_vector(tokens) for tokens in tokenized_docs]
def train(self, x, y=None):
    """Fit the count vectorizer and then the model on ``x``.

    The documents are passed to ``process_dataset``; each resulting token
    sequence is joined into a single space-separated string, the count
    vectorizer is fitted on those strings, and the resulting
    document-term matrix is used to fit ``self.model``. The elapsed
    wall-clock time is logged at the end.

    Args:
        x: Raw documents, forwarded unchanged to ``process_dataset``. The
            result must expose ``.map`` and ``.values`` (presumably a
            pandas Series -- confirm against callers).
        y: Not used by this method.
    """
    class_name = self.__class__.__name__
    logging.info("Building vocabulary on " + class_name)
    started_at = time.time()

    def join_tokens(tokens):
        # The vectorizer consumes plain strings, not token sequences.
        return ' '.join(tokens)

    token_series = process_dataset(x)
    sentences = token_series.map(join_tokens)
    # Cast to unicode so every entry reaches the vectorizer as text.
    unicode_docs = sentences.values.astype('U')
    doc_term_matrix = self.count_vectorizer.fit_transform(unicode_docs)
    self.model.fit(doc_term_matrix)

    logging.info("Done in %.3fsec" % (time.time() - started_at))
def train(self, x, y=None):
    """Fit the tf-idf vectorizer and then the model on ``x``.

    The documents are passed to ``process_dataset``; each resulting token
    sequence is joined into a single space-separated string, the tf-idf
    vectorizer is fitted on those strings, and the resulting matrix is
    used to fit ``self.model``. The elapsed wall-clock time is logged at
    the end.

    Args:
        x: Raw documents, forwarded unchanged to ``process_dataset``. The
            result must expose ``.map`` and ``.values`` (presumably a
            pandas Series -- confirm against callers).
        y: Not used by this method.
    """
    class_name = self.__class__.__name__
    logging.info("Building vectorizer on " + class_name)
    started_at = time.time()

    def join_tokens(tokens):
        # The vectorizer consumes plain strings, not token sequences.
        return ' '.join(tokens)

    token_series = process_dataset(x)
    sentences = token_series.map(join_tokens)
    # Cast to unicode so every entry reaches the vectorizer as text.
    unicode_docs = sentences.values.astype('U')
    tfidf_matrix = self.tfidf_vectorizer.fit_transform(unicode_docs)
    self.model.fit(tfidf_matrix)

    logging.info("Done in %.3fsec" % (time.time() - started_at))
def train(self, x, y):
    """Build the vocabulary and train ``self.model`` on tagged documents.

    Each processed document from ``x`` is paired positionally with the
    matching entry of ``y`` and wrapped in a ``TaggedDocument`` whose tag
    list contains that single entry. The elapsed wall-clock time is
    logged at the end.

    Args:
        x: Raw documents, forwarded unchanged to ``process_dataset``.
        y: Iterable of tags, one per document. Pairing uses ``zip``, so
            surplus items on either side are ignored.
    """
    logging.info("Training " + self.__class__.__name__)
    started_at = time.time()

    tokenized_docs = process_dataset(x)
    documents = []
    for tokens, label in zip(tokenized_docs, y):
        documents.append(TaggedDocument(tokens, [label]))

    model = self.model
    model.build_vocab(documents)
    # corpus_count and epochs are read from the model after build_vocab.
    model.train(
        documents,
        total_examples=model.corpus_count,
        epochs=model.epochs,
    )

    logging.info("Done in %.3fsec" % (time.time() - started_at))