def _load_models(self):
    """Load the persisted feature and classifier models from disk.

    Paths are built from ``self.params_path`` (a format string with one
    ``{}`` placeholder) and the loaded models are attached to the
    instance as ``hashingTF``, ``idfmodel`` and ``rf``.
    """
    # Each model lives under its own short name beneath params_path.
    self.hashingTF = HashingTF.load(self.params_path.format('hf'))
    self.idfmodel = IDFModel.load(self.params_path.format('idfmodel'))
    self.rf = RandomForestClassificationModel.load(self.params_path.format('rf'))
def update_models():
    """Reload the tokenizer, hashing-TF, IDF and Naive Bayes models from disk.

    Each model is loaded into a temporary first; the live models are only
    swapped in at the end, so a failed load leaves the previous models in
    place (the final swap then fails with a NameError on the missing
    temporary and is logged as an update failure).

    BUG FIX: the original assigned plain locals in the swap step, so the
    module-level models were never actually replaced. ``global`` makes the
    swap take effect. Bare ``except:`` clauses were narrowed to
    ``except Exception`` so SystemExit/KeyboardInterrupt still propagate.
    """
    # The swap below must rebind the module-level model references.
    global tokenizer, hashing_tf, idf_model, nb_model

    # Load in idf_model, nb_model, hashing_tf, idf_model and tag_catId map
    logger.debug(
        '===================================================Starting load models==================================================='
    )
    try:
        logger.debug('Loading tokenizer model')
        new_tokenizer = Tokenizer.load(tokenizer_file)
        logger.debug('Load tokenizer model successfully')
    except Exception:
        logger.debug('Fail to load tokenizer')
    try:
        logger.debug('Loading hashing_tf model')
        new_hashing_tf = HashingTF.load(hashing_tf_file)
        logger.debug('Load hashing_tf model successfully')
    except Exception:
        logger.debug('Fail to load hashing_tf')
    try:
        logger.debug('Loading idf_model')
        new_idf_model = IDFModel.load(idf_model_file)
        logger.debug('Load IDFModel successfully')
    except Exception:
        logger.debug('Fail to load IDFModel')
    try:
        logger.debug('Loading nb_model')
        new_nb_model = NaiveBayesModel.load(nb_model_file)
        logger.debug('Load NaiveBayesModel successfully')
    except Exception:
        logger.debug('Fail to load NaiveBayesModel')
    try:
        logger.debug('Updating models')
        # All-or-nothing swap: if any load above failed, the missing
        # temporary raises NameError here and nothing is replaced.
        tokenizer = new_tokenizer
        hashing_tf = new_hashing_tf
        idf_model = new_idf_model
        nb_model = new_nb_model
        logger.debug('update model successfully')
    except Exception:
        logger.debug('Fail to update models')
    logger.debug(
        '===================================================Stopped load models==================================================='
    )
# Drop tweets flagged as ads, then normalise the remaining text.
ads_filter = udf(filter_ads, BooleanType())
ads_free = df.filter(ads_filter(df.Text))

# Strip punctuation / tokenize via the preprocess UDF.
pp_udf = udf(preprocess, ArrayType(StringType()))
words = ads_free.withColumn('Words', pp_udf(ads_free.Text))

# Remove stop words from the token list.
remover = StopWordsRemover(inputCol="Words", outputCol="filtered")
removed = remover.transform(words)

# All persisted models live under this directory, one per short name.
params_path = '../tmp/{}'

# Apply the trained hashing term-frequency model.
hashingTF = HashingTF.load(params_path.format('hf'))
featureized = hashingTF.transform(removed)

# Apply the trained IDF model to the term-frequency features.
idfmodel = IDFModel.load(params_path.format('idfmodel'))
result = idfmodel.transform(featureized)

# Score with the trained random-forest classifier and persist the result.
rf = RandomForestClassificationModel.load(params_path.format('rf'))
prediction = rf.transform(result)

path_to_save = '../tmp/twitterstream_test_prediction.json'
prediction.write.json(path_to_save)
# NOTE(review): this line is the tail of an except-block whose `try`/
# `except ... as ke` header lies before this chunk — `ke` is bound there.
logger.debug('Fail to start kafka producer, caused by %s' % ke.message)

try:
    # Create dstream from kafka topic.
    # BUG FIX: the broker config must be a dict literal using ':' —
    # the original `{'metadata.broker.list' = broker_ip}` is a SyntaxError.
    directKafkaStream = KafkaUtils.createDirectStream(
        ssc, kafka_topic, {'metadata.broker.list': broker_ip})
    logger.debug('Create direct dstream from kafka successfully')
except Exception:
    logger.debug('Unable to create dstream from kafka')

# Ensure producer and spark context are torn down on interpreter exit.
atexit.register(shutdown_hook, kafka_producer, spark)

# Load in idf_model, nb_model, hashing_tf, idf_model and tag_catId map
try:
    logger.debug('Loading models')
    tokenizer = Tokenizer.load(tokenizer_file)
    hashing_tf = HashingTF.load(hashing_tf_file)
    idf_model = IDFModel.load(idf_model_file)
    nb_model = NaiveBayesModel.load(nb_model_file)

    # CSV of selected tag names; row index doubles as the category id.
    selected_tags = pd.read_csv(selected_tags_file, header=None)
    local_catId_to_tags = dict(zip(list(selected_tags.index), selected_tags[0]))
    local_tags_to_catId = dict(zip(selected_tags[0], list(selected_tags.index)))

    # Broadcast both lookup directions so executors can translate
    # tag <-> category id without shipping the dicts per task.
    catId_to_tags = sc.broadcast(local_catId_to_tags)
    tags_to_catId = sc.broadcast(local_tags_to_catId)
    tags_to_catId_transform = udf(
        lambda tag: float(tags_to_catId.value[tag]), FloatType())
    catId_to_tags_transform = udf(
        lambda catId: catId_to_tags.value[catId], StringType())
    logger.debug('loaded models successfully')
except Exception:
    logger.debug('Fail to load models')

logger.debug('Start to process data')