Example #1
import warnings

from scipy import sparse
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import MultiLabelBinarizer


class MultihotEncoder(BaseEstimator, TransformerMixin):
    """
    Wraps `MultiLabelBinarizer` in a pipeline safe transformer

    Args:
        sparse_output (bool): convert output to sparse matrix
    """
    def __init__(self, sparse_output=False):
        self.transformer = MultiLabelBinarizer()
        self.sparse_output = sparse_output

    def fit(self, X, y=None):
        """ Fit MultiLabelBinarizer """
        self.transformer.fit(X)
        return self

    def transform(self, X, y=None):
        """ Transform X with the fitted MultiLabelBinarizer """
        # ignore the warning raised for labels unseen during fit
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            X_t = self.transformer.transform(X)
        if self.sparse_output:
            return sparse.csr_matrix(X_t)
        else:
            return X_t
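
A minimal sanity-check sketch, using made-up genre labels (not from the original): since the wrapper exposes the standard fit/transform interface on X, it can be dropped into a sklearn Pipeline, unlike the bare MultiLabelBinarizer, whose fit expects the label sets as its only argument.

# Toy multi-label data, invented for illustration
genres = [['action', 'comedy'], ['comedy'], ['action', 'drama']]

encoder = MultihotEncoder()
encoder.fit(genres)
# 'horror' was unseen during fit; the warning is suppressed and the
# unknown label is simply dropped
print(encoder.transform([['comedy', 'horror']]))  # -> [[0 1 0]]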
Example #2
from sklearn.preprocessing import MultiLabelBinarizer as SKLModel


class MultiLabelBinarizerImpl():
    def __init__(self, classes=None, sparse_output=False):
        self._hyperparams = {
            'classes': classes,
            'sparse_output': sparse_output
        }

    def fit(self, X, y=None):
        self._sklearn_model = SKLModel(**self._hyperparams)
        # MultiLabelBinarizer.fit takes the label sets as its only
        # argument, so y is accepted for API compatibility but ignored
        self._sklearn_model.fit(X)
        return self

    def transform(self, X):
        return self._sklearn_model.transform(X)
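
A minimal usage sketch with invented tag data (the original does not show a call site):

# Hypothetical call: binarize tag sets, requesting sparse output
binarizer = MultiLabelBinarizerImpl(sparse_output=True)
binarizer.fit([['python', 'ml'], ['ml'], ['python', 'nlp']])
print(binarizer.transform([['python', 'ml']]).toarray())  # -> [[1 0 1]]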
Example #3

from random import shuffle

import numpy
from gensim.models.doc2vec import Doc2Vec
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.optimizers import Adam
from nltk.corpus import reuters
from nltk.tokenize import word_tokenize
from sklearn.preprocessing import MultiLabelBinarizer

# Load the previously trained doc2vec model
doc2vec = Doc2Vec.load(doc2vec_model_location)

# Convert the categories to multi-hot encoded vectors
labelBinarizer = MultiLabelBinarizer()
labelBinarizer.fit([reuters.categories(fileId) for fileId in reuters.fileids()])

# Load the articles with their corresponding categories
train_articles = [{'raw': reuters.raw(fileId), 'categories': reuters.categories(fileId)}
                  for fileId in reuters.fileids() if fileId.startswith('training/')]
test_articles = [{'raw': reuters.raw(fileId), 'categories': reuters.categories(fileId)}
                 for fileId in reuters.fileids() if fileId.startswith('test/')]
shuffle(train_articles)
shuffle(test_articles)

# Convert the articles to document vectors using the doc2vec model
train_data = [doc2vec.infer_vector(word_tokenize(article['raw'])) for article in train_articles]
test_data = [doc2vec.infer_vector(word_tokenize(article['raw'])) for article in test_articles]
train_labels = labelBinarizer.transform([article['categories'] for article in train_articles])
test_labels = labelBinarizer.transform([article['categories'] for article in test_articles])
train_data = numpy.asarray(train_data)
test_data = numpy.asarray(test_data)
train_labels = numpy.asarray(train_labels)
test_labels = numpy.asarray(test_labels)

# Initialize the neural network
model = Sequential()
model.add(Dense(500, input_dim=doc2vec_dimensions, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1200, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(400, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(600, activation='relu'))
model.add(Dropout(0.3))
# sigmoid + binary crossentropy: independent per-label probabilities,
# as needed for multi-label classification
model.add(Dense(train_labels.shape[1], activation='sigmoid'))
model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
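
The snippet stops at compile; a plausible continuation for training and evaluating the network might look like the sketch below. The epoch and batch-size values are illustrative placeholders, not from the original.

# Hypothetical training/evaluation step; hyperparameters are placeholders
model.fit(train_data, train_labels,
          epochs=10, batch_size=128,
          validation_data=(test_data, test_labels))
loss, accuracy = model.evaluate(test_data, test_labels)
print('test loss: %.4f, test accuracy: %.4f' % (loss, accuracy))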