def train(self, classdict, nb_topics, *args, **kwargs):
    """ Train the autoencoder.

    :param classdict: training data
    :param nb_topics: number of topics, i.e., the number of encoding dimensions
    :param args: arguments to be passed to keras model fitting
    :param kwargs: arguments to be passed to keras model fitting
    :return: None
    :type classdict: dict
    :type nb_topics: int
    """
    CompactIOMachine.__init__(self,
                              {'classifier': 'kerasautoencoder'},
                              'kerasautoencoder',
                              autoencoder_suffices)
    self.nb_topics = nb_topics
    self.generate_corpus(classdict)
    vecsize = len(self.dictionary)

    # define all the layers of the autoencoder
    input_vec = Input(shape=(vecsize,))
    encoded = Dense(self.nb_topics, activation='relu')(input_vec)
    decoded = Dense(vecsize, activation='sigmoid')(encoded)

    # define the autoencoder model
    # (keras 2 takes the keywords `inputs` / `outputs`; the original
    # keras-1 keywords `input` / `output` no longer work)
    autoencoder = Model(inputs=input_vec, outputs=decoded)

    # define the encoder
    encoder = Model(inputs=input_vec, outputs=encoded)

    # define the decoder
    encoded_input = Input(shape=(self.nb_topics,))
    decoder_layer = autoencoder.layers[-1]
    decoder = Model(inputs=encoded_input, outputs=decoder_layer(encoded_input))

    # compile the autoencoder
    autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')

    # process training data: one normalized bag-of-words vector per short text
    # (`reduce` is functools.reduce and `add` is operator.add, imported at module level)
    embedvecs = np.array(reduce(add,
                                [[self.retrieve_bow_vector(shorttext, normalize=True)
                                  for shorttext in classdict[classtype]]
                                 for classtype in classdict]))

    # fit the model
    autoencoder.fit(embedvecs, embedvecs, *args, **kwargs)

    # store the autoencoder models
    self.autoencoder = autoencoder
    self.encoder = encoder
    self.decoder = decoder

    # flag setting
    self.trained = True

    # precompute the topic vector for each class
    self.classtopicvecs = {}
    for label in classdict:
        self.classtopicvecs[label] = self.precalculate_liststr_topicvec(classdict[label])
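# A minimal, hedged usage sketch for the method above. It assumes the class is
# shorttext's AutoencodingTopicModeler (the class name is not shown in this
# excerpt), and `toy_classdict` is a made-up training dict.
from shorttext.generators import AutoencodingTopicModeler

toy_classdict = {'physics': ['quantum mechanics', 'statistical mechanics'],
                 'biology': ['cell division', 'protein folding']}

modeler = AutoencodingTopicModeler()
modeler.train(toy_classdict, 4, epochs=100, verbose=0)    # extra args go to keras fit()
print(modeler.retrieve_topicvec('quantum field theory'))  # 4-dimensional encoding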
def __init__(self, intermediate_classifiers=None):
    """ Initialize the stacked logistic classifier.

    :param intermediate_classifiers: dictionary of the intermediate classifiers to be stacked (Default: {})
    :type intermediate_classifiers: dict
    """
    # avoid the mutable default argument `{}` of the original signature
    if intermediate_classifiers is None:
        intermediate_classifiers = {}
    CompactIOMachine.__init__(self,
                              {'classifier': 'stacked_logistics'},
                              'stacked_logistics',
                              ['_stackedlogistics.pkl', '_stackedlogistics.h5', '_stackedlogistics.json'])
    StackedGeneralization.__init__(self, intermediate_classifiers=intermediate_classifiers)
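# Hedged sketch: assumes this is shorttext's LogisticStackedGeneralization.
# `maxent_classifier` and `sumvec_classifier` stand for already-trained
# intermediate classifiers (illustrative names; see the later sketches).
from shorttext.stack import LogisticStackedGeneralization

stacker = LogisticStackedGeneralization(intermediate_classifiers={'maxent': maxent_classifier,
                                                                  'sumvec': sumvec_classifier})
stacker.train(toy_classdict)             # fit the logistic stacking layer
print(stacker.score('quantum physics'))  # maps class labels to scores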
def __init__(self,
             preprocessor=textpreprocess.standard_text_preprocessor_1(),
             toweigh=True,
             normalize=True):
    """ Initialize the random-projection (RP) topic modeler.

    :param preprocessor: text preprocessor
    :param toweigh: whether to weigh the words using tf-idf (Default: True)
    :param normalize: whether the retrieved topic vectors are normalized (Default: True)
    :type preprocessor: function
    :type toweigh: bool
    :type normalize: bool
    """
    GensimTopicModeler.__init__(self,
                                preprocessor=preprocessor,
                                algorithm='rp',
                                toweigh=toweigh,
                                normalize=normalize)
    CompactIOMachine.__init__(self, {'classifier': 'rptopic'}, 'rptopic', rp_suffices)
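# Hedged sketch: assumes this is shorttext's RPModeler (gensim random
# projections, per algorithm='rp' above); reuses `toy_classdict` from the
# first sketch.
from shorttext.generators import RPModeler

rpmodeler = RPModeler()
rpmodeler.train(toy_classdict, 4)                   # project into 4 dimensions
print(rpmodeler.retrieve_topicvec('cell biology'))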
def __init__(self, preprocessor=lambda s: s.lower()):
    """ Initializer.

    :param preprocessor: text preprocessor
    :type preprocessor: function
    """
    CompactIOMachine.__init__(self,
                              {'classifier': 'maxent'},
                              'maxent',
                              ['_classlabels.txt', '.json', '.h5', '_labelidx.pkl', '_dictionary.dict'])
    self.preprocessor = preprocessor
    self.trained = False
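# Hedged sketch: assumes shorttext's MaxEntClassifier and reuses `toy_classdict`
# from the first sketch above.
from shorttext.classifiers import MaxEntClassifier

maxent_classifier = MaxEntClassifier()
maxent_classifier.train(toy_classdict)
print(maxent_classifier.score('protein folding'))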
def __init__(self, wvmodel, vecsize=None, maxlen=15):
    """ Initialize the classifier.

    :param wvmodel: Word2Vec model
    :param vecsize: length of embedded vectors in the model (Default: None, extracted directly from the word-embedding model)
    :param maxlen: maximum number of words in a sentence (Default: 15)
    :type wvmodel: gensim.models.word2vec.Word2Vec
    :type vecsize: int
    :type maxlen: int
    """
    CompactIOMachine.__init__(self,
                              {'classifier': 'sumnnlibvec'},
                              'sumnnlibvec',
                              ['_classlabels.txt', '.json', '.h5'])
    self.wvmodel = wvmodel
    self.vecsize = self.wvmodel.vector_size if vecsize is None else vecsize
    self.maxlen = maxlen
    self.trained = False
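# The class name behind 'sumnnlibvec' is not visible in this excerpt, so this
# hedged sketch only shows obtaining the `wvmodel` argument the constructor
# expects, using shorttext's loader for pre-trained binary word2vec models:
from shorttext.utils import load_word2vec_model

wvmodel = load_word2vec_model('/path/to/GoogleNews-vectors-negative300.bin.gz')
# classifier = SumNNEmbeddedVecClassifier(wvmodel, maxlen=15)  # hypothetical class name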
def __init__(self, wvmodel, vecsize=None, maxlen=15, with_gensim=False):
    """ Initialize the classifier.

    :param wvmodel: Word2Vec model
    :param vecsize: length of the embedded vectors in the model (Default: None, directly extracted from word-embedding model)
    :param maxlen: maximum number of words in a sentence (Default: 15)
    :param with_gensim: whether to use gensim's word-embedding layer (Default: False)
    :type wvmodel: gensim.models.keyedvectors.KeyedVectors
    :type vecsize: int
    :type maxlen: int
    :type with_gensim: bool
    """
    CompactIOMachine.__init__(self,
                              {'classifier': 'nnlibvec'},
                              'nnlibvec',
                              ['_classlabels.txt', '.json', '.h5', '_config.json'])
    self.wvmodel = wvmodel
    self.vecsize = self.wvmodel.vector_size if vecsize is None else vecsize
    self.maxlen = maxlen
    self.with_gensim = with_gensim
    self.trained = False
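# Hedged sketch: 'nnlibvec' matches shorttext's VarNNEmbeddedVecClassifier (an
# assumption from the suffix list). It reuses `wvmodel` and `toy_classdict`
# from the sketches above; the keras model comes from shorttext's frameworks
# helpers.
from shorttext.classifiers import VarNNEmbeddedVecClassifier, frameworks

kmodel = frameworks.CNNWordEmbed(len(toy_classdict.keys()), vecsize=wvmodel.vector_size)
classifier = VarNNEmbeddedVecClassifier(wvmodel)
classifier.train(toy_classdict, kmodel)
print(classifier.score('quantum mechanics'))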
def __init__(self, wvmodel, vecsize=None, simfcn=lambda u, v: 1 - cosine(u, v)):
    """ Initialize the classifier.

    :param wvmodel: Word2Vec model
    :param vecsize: length of the embedded vectors in the model (Default: None, directly extracted from word-embedding model)
    :param simfcn: similarity function (Default: cosine similarity)
    :type wvmodel: gensim.models.keyedvectors.KeyedVectors
    :type vecsize: int
    :type simfcn: function
    """
    CompactIOMachine.__init__(self, {'classifier': 'sumvec'}, 'sumvec', ['_embedvecdict.pkl'])
    self.wvmodel = wvmodel
    self.vecsize = self.wvmodel.vector_size if vecsize is None else vecsize
    self.simfcn = simfcn
    self.trained = False
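# Hedged sketch: 'sumvec' matches shorttext's SumEmbeddedVecClassifier, which
# scores a short text by `simfcn` between summed word vectors. Reuses `wvmodel`
# and `toy_classdict` from the sketches above.
from shorttext.classifiers import SumEmbeddedVecClassifier

sumvec_classifier = SumEmbeddedVecClassifier(wvmodel)
sumvec_classifier.train(toy_classdict)
print(sumvec_classifier.score('cell membrane'))  # maps class labels to similarity scores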
def __init__(self,
             operation,
             alph=default_alph,
             specialsignals=default_specialsignals,
             concatcharvec_encoder=None,
             batchsize=1,
             nb_hiddenunits=650):
    """ Instantiate the scRNN spell corrector.

    :param operation: type of distortion of words in training (options: "NOISE-INSERT", "NOISE-DELETE", "NOISE-REPLACE", "JUMBLE-WHOLE", "JUMBLE-BEG", "JUMBLE-END", and "JUMBLE-INT")
    :param alph: default string of characters (Default: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz.,:;'*!?`$%&(){}[]-/\@_#")
    :param specialsignals: dictionary of special signals (Default: built-in)
    :param concatcharvec_encoder: one-hot encoder for characters, initialized internally if None (Default: None)
    :param batchsize: batch size (Default: 1)
    :param nb_hiddenunits: number of hidden units (Default: 650)
    :type operation: str
    :type alph: str
    :type specialsignals: dict
    :type concatcharvec_encoder: shorttext.spell.binarize.SpellingToConcatCharVecEncoder
    :type batchsize: int
    :type nb_hiddenunits: int
    """
    CompactIOMachine.__init__(self,
                              {'classifier': 'scrnn_spell'},
                              'scrnn_spell',
                              ['_config.json', '_vocabs.gensimdict', '.h5', '.json'])
    self.operation = operation
    self.alph = alph
    self.specialsignals = specialsignals
    self.binarizer = SCRNNBinarizer(self.alph, self.specialsignals)
    self.concatcharvec_encoder = (SpellingToConcatCharVecEncoder(self.alph)
                                  if concatcharvec_encoder is None
                                  else concatcharvec_encoder)
    self.onehotencoder = OneHotEncoder()
    self.trained = False
    self.batchsize = batchsize
    self.nb_hiddenunits = nb_hiddenunits
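# Hedged sketch: assumes shorttext's SCRNNSpellCorrector; the training text and
# the misspelled query are toys, and 'JUMBLE-WHOLE' is one of the operations
# listed in the docstring above.
from shorttext.spell import SCRNNSpellCorrector

corrector = SCRNNSpellCorrector('JUMBLE-WHOLE')
corrector.train('The quick brown fox jumps over the lazy dog .')
print(corrector.correct('quikc'))   # expect something close to 'quick'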