def train(self, classdict, nb_topics, *args, **kwargs):
        """ Train the autoencoder.

        Generates the corpus from the training data, builds a single-hidden-layer
        autoencoder whose bottleneck has ``nb_topics`` units, fits it on normalized
        bag-of-words vectors of all the short texts, and precomputes one topic
        vector per class label.

        :param classdict: training data (class label -> list of short texts)
        :param nb_topics: number of topics, i.e., the number of encoding dimensions
        :param args: arguments to be passed to keras model fitting
        :param kwargs: arguments to be passed to keras model fitting
        :return: None
        :type classdict: dict
        :type nb_topics: int
        """
        # NOTE(review): re-running CompactIOMachine.__init__ here resets the
        # compact-IO metadata on every train() call — presumably intentional
        # so the suffices match the trained state; confirm.
        CompactIOMachine.__init__(self, {'classifier': 'kerasautoencoder'},
                                  'kerasautoencoder', autoencoder_suffices)
        self.nb_topics = nb_topics
        self.generate_corpus(classdict)
        # input/output dimension equals the vocabulary size of the generated corpus
        vecsize = len(self.dictionary)

        # define all the layers of the autoencoder
        input_vec = Input(shape=(vecsize, ))
        encoded = Dense(self.nb_topics, activation='relu')(input_vec)
        decoded = Dense(vecsize, activation='sigmoid')(encoded)

        # define the autoencoder model
        # NOTE(review): `input=` / `output=` are Keras 1.x keyword names;
        # Keras 2+ renamed them to `inputs=` / `outputs=` — confirm the
        # pinned Keras version before upgrading.
        autoencoder = Model(input=input_vec, output=decoded)

        # define the encoder (shares layers, hence weights, with the autoencoder)
        encoder = Model(input=input_vec, output=encoded)

        # define the decoder by reusing the autoencoder's last (decoding) layer
        encoded_input = Input(shape=(self.nb_topics, ))
        decoder_layer = autoencoder.layers[-1]
        decoder = Model(input=encoded_input,
                        output=decoder_layer(encoded_input))

        # compile the autoencoder
        autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')

        # process training data: one normalized bag-of-words vector per short
        # text, flattened across all class labels into a single matrix
        embedvecs = np.array(
            reduce(add, [[
                self.retrieve_bow_vector(shorttext, normalize=True)
                for shorttext in classdict[classtype]
            ] for classtype in classdict]))

        # fit the model (autoencoder reconstructs its own input)
        autoencoder.fit(embedvecs, embedvecs, *args, **kwargs)

        # store the autoencoder models
        self.autoencoder = autoencoder
        self.encoder = encoder
        self.decoder = decoder

        # flag setting
        self.trained = True

        # classes topic vector precomputation
        self.classtopicvecs = {}
        for label in classdict:
            self.classtopicvecs[label] = self.precalculate_liststr_topicvec(
                classdict[label])
 def __init__(self, intermediate_classifiers=None):
     """ Initialize the stacked-logistic-regression classifier.

     :param intermediate_classifiers: dictionary of intermediate classifiers
         to be stacked (Default: None, treated as an empty dict)
     :type intermediate_classifiers: dict
     """
     # Avoid a mutable default argument: a `{}` literal in the signature is
     # created once at definition time and shared by every call/instance.
     if intermediate_classifiers is None:
         intermediate_classifiers = {}
     CompactIOMachine.__init__(
         self, {'classifier': 'stacked_logistics'}, 'stacked_logistics', [
             '_stackedlogistics.pkl', '_stackedlogistics.h5',
             '_stackedlogistics.json'
         ])
     StackedGeneralization.__init__(
         self, intermediate_classifiers=intermediate_classifiers)
 def __init__(self,
              preprocessor=None,
              toweigh=True,
              normalize=True):
     """ Initialize the random-projection (RP) topic modeler.

     :param preprocessor: text preprocessor (Default: None, in which case
         `textpreprocess.standard_text_preprocessor_1()` is built per instance)
     :param toweigh: passed through to GensimTopicModeler (Default: True)
     :param normalize: passed through to GensimTopicModeler (Default: True)
     :type preprocessor: function
     :type toweigh: bool
     :type normalize: bool
     """
     # Build the default preprocessor per instance rather than once at import
     # time: a call in the default-argument position is evaluated only once,
     # so all instances would silently share a single preprocessor object.
     if preprocessor is None:
         preprocessor = textpreprocess.standard_text_preprocessor_1()
     GensimTopicModeler.__init__(self,
                                 preprocessor=preprocessor,
                                 algorithm='rp',
                                 toweigh=toweigh,
                                 normalize=normalize)
     CompactIOMachine.__init__(self, {'classifier': 'rptopic'}, 'rptopic', rp_suffices)
# Example 4
    def __init__(self, preprocessor=lambda s: s.lower()):
        """ Initializer.

        Registers the compact-IO metadata for this classifier and stores the
        text preprocessor; the model starts in the untrained state.

        :param preprocessor: text preprocessor (Default: lowercasing)
        :type preprocessor: function
        """
        suffices = ['_classlabels.txt', '.json', '.h5', '_labelidx.pkl', '_dictionary.dict']
        CompactIOMachine.__init__(self, {'classifier': 'maxent'}, 'maxent', suffices)
        self.preprocessor = preprocessor
        self.trained = False
    def __init__(self, wvmodel, vecsize=None, maxlen=15):
        """ Initialize the classifier.

        :param wvmodel: Word2Vec model
        :param vecsize: length of embedded vectors in the model (Default: None, extracted directly from the word-embedding model)
        :param maxlen: maximum number of words in a sentence (Default: 15)
        :type wvmodel: gensim.models.word2vec.Word2Vec
        :type vecsize: int
        :type maxlen: int
        """
        CompactIOMachine.__init__(self, {'classifier': 'sumnnlibvec'}, 'sumnnlibvec', ['_classlabels.txt', '.json', '.h5'])
        self.wvmodel = wvmodel
        # PEP 8: compare against None with `is`, not `==` (== can misfire on
        # objects that override __eq__)
        self.vecsize = self.wvmodel.vector_size if vecsize is None else vecsize
        self.maxlen = maxlen
        self.trained = False
    def __init__(self, wvmodel, vecsize=None, maxlen=15, with_gensim=False):
        """ Initialize the classifier.

        :param wvmodel: Word2Vec model
        :param vecsize: length of the embedded vectors in the model (Default: None, directly extracted from word-embedding model)
        :param maxlen: maximum number of words in a sentence (Default: 15)
        :param with_gensim: whether to use the gensim-based code path (Default: False)
        :type wvmodel: gensim.models.keyedvectors.KeyedVectors
        :type vecsize: int
        :type maxlen: int
        :type with_gensim: bool
        """
        CompactIOMachine.__init__(
            self, {'classifier': 'nnlibvec'}, 'nnlibvec',
            ['_classlabels.txt', '.json', '.h5', '_config.json'])
        self.wvmodel = wvmodel
        # PEP 8: identity comparison with None, not equality
        self.vecsize = self.wvmodel.vector_size if vecsize is None else vecsize
        self.maxlen = maxlen
        # normalize the flag to a proper bool (the original
        # `False if not with_gensim else with_gensim` stored arbitrary truthy
        # objects; only the truth value is used)
        self.with_gensim = bool(with_gensim)
        self.trained = False
# Example 7
    def __init__(self,
                 wvmodel,
                 vecsize=None,
                 simfcn=lambda u, v: 1 - cosine(u, v)):
        """ Initialize the classifier.

        :param wvmodel: Word2Vec model
        :param vecsize: length of the embedded vectors in the model (Default: None, directly extracted from word-embedding model)
        :param simfcn: similarity function (Default: cosine similarity)
        :type wvmodel: gensim.models.keyedvectors.KeyedVectors
        :type vecsize: int
        :type simfcn: function
        """
        CompactIOMachine.__init__(self, {'classifier': 'sumvec'}, 'sumvec',
                                  ['_embedvecdict.pkl'])
        self.wvmodel = wvmodel
        # PEP 8: identity comparison with None, not equality
        self.vecsize = self.wvmodel.vector_size if vecsize is None else vecsize
        self.simfcn = simfcn
        self.trained = False
# Example 8
    def __init__(self,
                 operation,
                 alph=default_alph,
                 specialsignals=default_specialsignals,
                 concatcharvec_encoder=None,
                 batchsize=1,
                 nb_hiddenunits=650):
        """ Instantiate the scRNN spell corrector.

        :param operation: types of distortion of words in training (options: "NOISE-INSERT", "NOISE-DELETE", "NOISE-REPLACE", "JUMBLE-WHOLE", "JUMBLE-BEG", "JUMBLE-END", and "JUMBLE-INT")
        :param alph: default string of characters (Default: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz.,:;'*!?`$%&(){}[]-/\@_#")
        :param specialsignals: dictionary of special signals (Default built-in)
        :param concatcharvec_encoder: one-hot encoder for characters, initialize if None. (Default: None)
        :param batchsize: batch size. (Default: 1)
        :param nb_hiddenunits: number of hidden units. (Default: 650)
        :type operation: str
        :type alph: str
        :type specialsignals: dict
        :type concatcharvec_encoder: shorttext.spell.binarize.SpellingToConcatCharVecEncoder
        :type batchsize: int
        :type nb_hiddenunits: int
        """
        CompactIOMachine.__init__(
            self, {'classifier': 'scrnn_spell'}, 'scrnn_spell',
            ['_config.json', '_vocabs.gensimdict', '.h5', '.json'])
        self.operation = operation
        self.alph = alph
        self.specialsignals = specialsignals
        self.binarizer = SCRNNBinarizer(self.alph, self.specialsignals)
        # PEP 8: identity comparison with None; build a default character
        # encoder over the alphabet only when none was supplied
        self.concatcharvec_encoder = SpellingToConcatCharVecEncoder(
            self.alph
        ) if concatcharvec_encoder is None else concatcharvec_encoder
        self.onehotencoder = OneHotEncoder()
        self.trained = False
        self.batchsize = batchsize
        self.nb_hiddenunits = nb_hiddenunits