def index(model_dir, rawfile, encodeIndexFile, batchsize=10000):
    if not os.path.exists(model_dir):
        print('Error! Model folder does not exist!! : %s' % model_dir)
        exit(-1)
    if not os.path.exists(os.path.join(model_dir, 'vocabulary.txt')):
        print('Error!! Could not find vocabulary file for encoder in folder: %s' % model_dir)
        exit(-1)
    encoder = text_encoder.SubwordTextEncoder(filename=os.path.join(model_dir, 'vocabulary.txt'))
    print("Loaded vocab size is: %d" % encoder.vocab_size)
    cfg = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    with tf.Session(config=cfg) as sess:
        # load model
        modelConfigs = data_utils.load_model_configs(model_dir)
        model = sse_model.SSEModel(modelConfigs)
        ckpt = tf.train.get_checkpoint_state(model_dir)
        if ckpt:
            print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print('Error!!! Could not load any model from specified folder: %s' % model_dir)
            exit(-1)
        # start indexing
        createIndexFile(model, encoder, rawfile, int(modelConfigs['max_seq_length']),
                        encodeIndexFile, sess, batchsize)
def create_model(session, targetSpaceSize, vocabsize, forward_only):
    """Create SSE model and initialize or load parameters in session."""
    modelParams = {'max_seq_length': FLAGS.max_seq_length, 'vocab_size': vocabsize,
                   'embedding_size': FLAGS.embedding_size, 'encoding_size': FLAGS.encoding_size,
                   'learning_rate': FLAGS.learning_rate,
                   'learning_rate_decay_factor': FLAGS.learning_rate_decay_factor,
                   'src_cell_size': FLAGS.src_cell_size, 'tgt_cell_size': FLAGS.tgt_cell_size,
                   'network_mode': FLAGS.network_mode, 'predict_nbest': FLAGS.predict_nbest,
                   'targetSpaceSize': targetSpaceSize, 'forward_only': forward_only}
    data_utils.save_model_configs(FLAGS.model_dir, modelParams)
    model = sse_model.SSEModel(modelParams)
    ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
    if ckpt:
        print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
        model.saver.restore(session, ckpt.model_checkpoint_path)
    else:
        if forward_only:
            print('Error!!! Could not load any model from specified folder: %s' % FLAGS.model_dir)
            exit(-1)
        else:
            print("Created model with fresh parameters.")
            session.run(tf.global_variables_initializer())
    return model
def create_model(session, targetSpaceSize, forward_only):
    """Create SSE model and initialize or load parameters in session."""
    model = sse_model.SSEModel(FLAGS.max_seq_length, FLAGS.max_gradient_norm,
                               FLAGS.src_vocab_size, FLAGS.tgt_vocab_size,
                               FLAGS.embedding_size, FLAGS.encoding_size,
                               FLAGS.src_cell_size, FLAGS.tgt_cell_size,
                               FLAGS.num_layers, FLAGS.learning_rate,
                               FLAGS.learning_rate_decay_factor, targetSpaceSize,
                               network_mode=FLAGS.network_mode,
                               forward_only=forward_only, TOP_N=FLAGS.predict_nbest)
    ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
    if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path):
        print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
        model.saver.restore(session, ckpt.model_checkpoint_path)
    else:
        if forward_only:
            print('Error!!! Could not load model from specified folder: %s' % FLAGS.model_dir)
            exit(-1)
        else:
            print("Created model with fresh parameters.")
            # tf.initialize_all_variables() is the older, deprecated name for
            # tf.global_variables_initializer(); kept here as in this older variant.
            session.run(tf.initialize_all_variables())
    return model
def create_model(session, targetSpaceSize, vocabsize, forward_only):
    """Create SSE model and initialize or load parameters in session."""
    modelConfigs = (FLAGS.max_seq_length, FLAGS.max_gradient_norm, vocabsize,
                    FLAGS.embedding_size, FLAGS.encoding_size,
                    FLAGS.src_cell_size, FLAGS.tgt_cell_size, FLAGS.num_layers,
                    FLAGS.learning_rate, FLAGS.learning_rate_decay_factor,
                    targetSpaceSize, FLAGS.network_mode, FLAGS.predict_nbest,
                    FLAGS.alpha, FLAGS.neg_samples)
    data_utils.save_model_configs(FLAGS.model_dir, modelConfigs)
    model = sse_model.SSEModel(FLAGS.max_seq_length, FLAGS.max_gradient_norm, vocabsize,
                               FLAGS.embedding_size, FLAGS.encoding_size,
                               FLAGS.src_cell_size, FLAGS.tgt_cell_size, FLAGS.num_layers,
                               FLAGS.learning_rate, FLAGS.learning_rate_decay_factor,
                               targetSpaceSize,
                               network_mode=FLAGS.network_mode, forward_only=forward_only,
                               TOP_N=FLAGS.predict_nbest, alpha=FLAGS.alpha,
                               neg_samples=FLAGS.neg_samples)
    ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
    if ckpt:
        print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
        model.saver.restore(session, ckpt.model_checkpoint_path)
    else:
        if forward_only:
            print('Error!!! Could not load any model from specified folder: %s' % FLAGS.model_dir)
            exit(-1)
        else:
            print("Created model with fresh parameters.")
            session.run(tf.global_variables_initializer())
    return model
def index(model_dir, rawfile, encodeIndexFile, batchsize=10000):
    if not os.path.exists(model_dir):
        print('Error! Model folder does not exist!! : %s' % model_dir)
        exit(-1)
    if not os.path.exists(os.path.join(model_dir, 'vocabulary.txt')):
        print('Error!! Could not find vocabulary file for encoder in folder: %s' % model_dir)
        exit(-1)
    encoder = text_encoder.SubwordTextEncoder(filename=os.path.join(model_dir, 'vocabulary.txt'))
    print("Loaded vocab size is: %d" % encoder.vocab_size)
    cfg = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    with tf.Session(config=cfg) as sess:
        # load model
        modelConfigs = data_utils.load_model_configs(model_dir)
        model = sse_model.SSEModel(int(modelConfigs['max_seq_length']),
                                   float(modelConfigs['max_gradient_norm']),
                                   int(modelConfigs['vocabsize']),
                                   int(modelConfigs['embedding_size']),
                                   int(modelConfigs['encoding_size']),
                                   int(modelConfigs['src_cell_size']),
                                   int(modelConfigs['tgt_cell_size']),
                                   int(modelConfigs['num_layers']),
                                   float(modelConfigs['learning_rate']),
                                   float(modelConfigs['learning_rate_decay_factor']),
                                   int(modelConfigs['targetSpaceSize']),
                                   network_mode=modelConfigs['network_mode'],
                                   forward_only=True,
                                   TOP_N=int(modelConfigs['TOP_N']))
        ckpt = tf.train.get_checkpoint_state(model_dir)
        if ckpt:
            print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print('Error!!! Could not load any model from specified folder: %s' % model_dir)
            exit(-1)
        # start indexing
        createIndexFile(model, encoder, rawfile, int(modelConfigs['max_seq_length']),
                        encodeIndexFile, sess, batchsize)
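# --- Illustration (not part of the original source): the SSEModel constructor call above
# casts every value pulled from modelConfigs with int()/float(). This self-contained sketch
# shows the assumed reason: configs saved to and reloaded from a plain-text file come back
# as strings. The file format below is a guess for illustration only, not necessarily what
# data_utils.save_model_configs/load_model_configs actually write.
def _demo_config_roundtrip(path='/tmp/modelConfig.demo.txt'):
    configs = {'max_seq_length': 50, 'learning_rate': 0.3}
    with open(path, 'w') as f:
        for k, v in configs.items():
            f.write('%s\t%s\n' % (k, v))            # every value is serialized as text
    loaded = {}
    with open(path) as f:
        for line in f:
            k, v = line.rstrip('\n').split('\t')
            loaded[k] = v                           # values come back as str
    assert loaded['max_seq_length'] == '50'         # hence the int(...)/float(...) casts above
    return int(loaded['max_seq_length']), float(loaded['learning_rate'])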
def __init__(self, *args, **kwargs):
    super(FlaskApp, self).__init__(*args, **kwargs)
    self.model = 'Do my initialization work here, loading model and index ....'
    self.model_type = os.environ.get("MODEL_TYPE", "classification")
    self.model_dir = "models-" + self.model_type
    self.indexFile = os.environ.get("INDEX_FILE", "targetEncodingIndex.tsv")
    print("In app class: Received flask appconfig is: " + os.environ.get('MODEL_TYPE', 'Default_classification'))
    if not os.path.exists(self.model_dir):
        print('Model folder %s does not exist!!' % self.model_dir)
        exit(-1)
    if not os.path.exists(os.path.join(self.model_dir, self.indexFile)):
        print('Index File does not exist!!')
        exit(-1)
    # load full set targetSeqID data
    if not os.path.exists(os.path.join(self.model_dir, 'vocabulary.txt')):
        print('Error!! Could not find vocabulary file for encoder in model folder.')
        exit(-1)
    self.encoder = text_encoder.SubwordTextEncoder(filename=os.path.join(self.model_dir, 'vocabulary.txt'))
    # load full set target Index data
    self.targetEncodings = []
    self.targetIDs = []
    self.targetIDNameMap = {}
    idx = 0
    for line in codecs.open(os.path.join(self.model_dir, self.indexFile), 'r', 'utf-8').readlines():
        info = line.strip().split('\t')
        if len(info) != 3:
            print('Error in targetIndexFile! %s' % line)
            continue
        tgtid, tgtseq, tgtEncoding = info[0], info[1], info[2]
        self.targetIDs.append(tgtid)
        self.targetEncodings.append([float(f) for f in tgtEncoding.strip().split(',')])
        self.targetIDNameMap[tgtid] = tgtseq
        idx += 1
    self.targetEncodings = np.array(self.targetEncodings)
    cfg = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    self.sess = tf.Session(config=cfg)
    # load model
    self.modelConfigs = data_utils.load_model_configs(self.model_dir)
    self.model = sse_model.SSEModel(int(self.modelConfigs['max_seq_length']),
                                    float(self.modelConfigs['max_gradient_norm']),
                                    int(self.modelConfigs['vocabsize']),
                                    int(self.modelConfigs['embedding_size']),
                                    int(self.modelConfigs['encoding_size']),
                                    int(self.modelConfigs['src_cell_size']),
                                    int(self.modelConfigs['tgt_cell_size']),
                                    int(self.modelConfigs['num_layers']),
                                    float(self.modelConfigs['learning_rate']),
                                    float(self.modelConfigs['learning_rate_decay_factor']),
                                    int(self.modelConfigs['targetSpaceSize']),
                                    network_mode=self.modelConfigs['network_mode'],
                                    forward_only=True,
                                    TOP_N=int(self.modelConfigs['TOP_N']))
    ckpt = tf.train.get_checkpoint_state(self.model_dir)
    if ckpt:
        print("loading model from %s" % ckpt.model_checkpoint_path)
        self.model.saver.restore(self.sess, ckpt.model_checkpoint_path)
    else:
        print('Error!!! Could not load any model from specified folder: %s' % self.model_dir)
        exit(-1)
def __init__(self, *args, **kwargs):
    super(FlaskApp, self).__init__(*args, **kwargs)
    self.catreco_model = 'Do my initialization work here'
    if not os.path.exists(FLAGS.model_dir):
        print('Model folder does not exist!!')
        exit(-1)
    encodedFullTargetSpace_path = os.path.join(FLAGS.model_dir, "encoded.FullTargetSpace")
    if not os.path.exists(encodedFullTargetSpace_path):
        print('Encoded full target space file does not exist!!')
        exit(-1)
    # load full set targetSeqID data (pass the encoded target space path to the loader)
    self.tgtID_Name_Map, self.tgtID_EncodingMap, self.tgtID_FullLableMap, \
        self.fullLabel_tgtID_Map, self.target_inputs, self.target_lens = \
        load_encodedTargetSpace(encodedFullTargetSpace_path)
    cfg = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    self.sess = tf.Session(config=cfg)
    # check and load tensorflow models and related target space files
    self.model = sse_model.SSEModel(FLAGS.max_seq_length, FLAGS.max_gradient_norm,
                                    FLAGS.src_vocab_size, FLAGS.tgt_vocab_size,
                                    FLAGS.embedding_size, FLAGS.encoding_size,
                                    FLAGS.src_cell_size, FLAGS.tgt_cell_size,
                                    FLAGS.num_layers, FLAGS.learning_rate,
                                    FLAGS.learning_rate_decay_factor,
                                    len(self.tgtID_FullLableMap),
                                    network_mode=FLAGS.network_mode,
                                    forward_only=True, TOP_N=FLAGS.predict_nbest)
    # Load vocabularies.
    ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
    if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path):
        print("Loading model parameters from %s" % ckpt.model_checkpoint_path)
        self.model.saver.restore(self.sess, ckpt.model_checkpoint_path)
    else:
        print('Error!!! Could not load model from specified folder: %s' % FLAGS.model_dir)
        self.sess.close()
        exit(-1)
    src_vocab_path = os.path.join(FLAGS.model_dir, "vocab.src")
    tgt_vocab_path = os.path.join(FLAGS.model_dir, "vocab.tgt")
    self.src_vocab, _ = data_utils.initialize_vocabulary(src_vocab_path)
    _, self.rev_tgt_vocab = data_utils.initialize_vocabulary(tgt_vocab_path)
def __init__(self, *args, **kwargs):
    super(FlaskApp, self).__init__(*args, **kwargs)
    self.model = 'Do my initialization work here, loading model and index ....'
    self.model_type = os.environ.get("MODEL_TYPE", "classification")
    self.model_dir = "models-" + self.model_type
    self.indexFile = os.environ.get("INDEX_FILE", "targetEncodingIndex.tsv")
    if not os.path.exists("./logs"):
        os.makedirs("./logs", exist_ok=True)
    # set up console and rotating-file logging
    log = logging.getLogger('')
    log.setLevel(logging.DEBUG)
    format = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s",
                               datefmt='%m/%d/%Y %I:%M:%S %p')
    ch = logging.StreamHandler(sys.stdout)
    ch.setFormatter(format)
    log.addHandler(ch)
    fh = handlers.RotatingFileHandler('./logs/WebServerLog.txt', maxBytes=(1048576 * 20), backupCount=7)
    fh.setFormatter(format)
    log.addHandler(fh)
    logging.info("In app class: Received flask appconfig is: " + os.environ.get('MODEL_TYPE', 'Default_classification'))
    if not os.path.exists(self.model_dir):
        logging.error('Model folder %s does not exist!!' % self.model_dir)
        exit(-1)
    if not os.path.exists(os.path.join(self.model_dir, self.indexFile)):
        logging.error('Index File does not exist!!')
        exit(-1)
    # load full set targetSeqID data
    if not os.path.exists(os.path.join(self.model_dir, 'vocabulary.txt')):
        logging.error('Error!! Could not find vocabulary file for encoder in model folder.')
        exit(-1)
    self.encoder = text_encoder.SubwordTextEncoder(filename=os.path.join(self.model_dir, 'vocabulary.txt'))
    # load full set target Index data
    self.targetEncodings = []
    self.targetIDs = []
    self.targetIDNameMap = {}
    idx = 0
    for line in codecs.open(os.path.join(self.model_dir, self.indexFile), 'r', 'utf-8').readlines():
        info = line.strip().split('\t')
        if len(info) != 3:
            logging.info('Error in targetIndexFile! %s' % line)
            continue
        tgtid, tgtseq, tgtEncoding = info[0], info[1], info[2]
        self.targetIDs.append(tgtid)
        self.targetEncodings.append([float(f) for f in tgtEncoding.strip().split(',')])
        self.targetIDNameMap[tgtid] = tgtseq
        idx += 1
    self.targetEncodings = np.array(self.targetEncodings)
    cfg = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    self.sess = tf.Session(config=cfg)
    # load model
    self.modelConfigs = data_utils.load_model_configs(self.model_dir)
    self.model = sse_model.SSEModel(self.modelConfigs)
    ckpt = tf.train.get_checkpoint_state(self.model_dir)
    if ckpt:
        logging.info("loading model from %s" % ckpt.model_checkpoint_path)
        self.model.saver.restore(self.sess, ckpt.model_checkpoint_path)
    else:
        logging.error('Error!!! Could not load any model from specified folder: %s' % self.model_dir)
        exit(-1)
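# --- Illustration (not part of the original source): the index-loading loops above expect
# each line of targetEncodingIndex.tsv to hold three tab-separated fields: a target ID, the
# target sequence text, and a comma-separated encoding vector. The sample line and values
# below are made up for demonstration.
import numpy as np

def _parse_index_line(line):
    info = line.strip().split('\t')
    if len(info) != 3:
        return None                                  # malformed lines are skipped, as above
    tgtid, tgtseq, tgtEncoding = info
    vector = [float(f) for f in tgtEncoding.split(',')]
    return tgtid, tgtseq, np.array(vector)

_sample = 'T123\tmens running shoes\t0.12,-0.48,0.33'   # hypothetical row
print(_parse_index_line(_sample))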
def demo(nbest):
    if not os.path.exists(FLAGS.model_dir):
        print('Model folder does not exist!!')
        exit(-1)
    if not os.path.exists(os.path.join(FLAGS.model_dir, 'vocabulary.txt')):
        print('Error!! Could not find vocabulary file for encoder in model folder.')
        exit(-1)
    encoder = text_encoder.SubwordTextEncoder(filename=os.path.join(FLAGS.model_dir, 'vocabulary.txt'))
    if not os.path.exists(os.path.join(FLAGS.model_dir, FLAGS.indexFile)):
        print('Index file does not exist!!!')
        exit(-1)
    # load full set target Index data
    targetEncodings = []
    targetIDs = []
    idLabelMap = {}
    targetIDNameMap = {}
    idx = 0
    for line in codecs.open(os.path.join(FLAGS.model_dir, FLAGS.indexFile), 'rt', 'utf-8').readlines():
        info = line.strip().split('\t')
        if len(info) != 3:
            print('Error in targetIndexFile! %s' % line)
            continue
        tgtid, tgtseq, tgtEncoding = info[0], info[1], info[2]
        targetIDs.append(tgtid)
        targetEncodings.append([float(f) for f in tgtEncoding.strip().split(',')])
        idLabelMap[tgtid] = idx
        targetIDNameMap[tgtid] = tgtseq
        idx += 1
    targetEncodings = np.array(targetEncodings)
    cfg = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    with tf.Session(config=cfg) as sess:
        # TODO: improve here later
        # load model
        modelConfigs = data_utils.load_model_configs(FLAGS.model_dir)
        model = sse_model.SSEModel(modelConfigs)
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        if ckpt:
            print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print('Error!!! Could not load any model from specified folder: %s' % FLAGS.model_dir)
            exit(-1)
        # Decode from standard input.
        sys.stdout.write("\n\nPlease type some keywords to get related task results.\nType 'exit' to quit demo.\n > ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence and sentence.strip().lower() != 'exit':
            # Get token-ids for the input sentence.
            source_tokens = encoder.encode(tf.compat.as_str(sentence).lower())
            srclen = len(source_tokens)
            max_seq_length = int(modelConfigs['max_seq_length'])
            if srclen > max_seq_length - 2:
                print('Input sentence too long, max allowed is %d. Try to increase limit!!!!' % (max_seq_length))
                source_tokens = [text_encoder.PAD_ID] + source_tokens[:max_seq_length - 2] + [text_encoder.EOS_ID]
            else:
                source_tokens = [text_encoder.PAD_ID] * (max_seq_length - srclen - 1) + source_tokens + [text_encoder.EOS_ID]
            feed_dict = model.get_source_encoding_feed_dict(np.array([source_tokens]))
            model.set_forward_only(True)
            sourceEncodings = sess.run([model.src_seq_embedding], feed_dict=feed_dict)
            #sourceEncodings = sess.run([model.norm_src_seq_embedding], feed_dict=feed_dict)
            sourceEncodings = np.vstack(sourceEncodings)
            distances = np.dot(sourceEncodings, targetEncodings.T)
            rankedScore, rankedIdx = data_utils.getSortedResults(distances)
            top_confs = rankedScore[0][:nbest]
            top_tgtIDs = [targetIDs[lbl] for lbl in rankedIdx[0][:nbest]]
            top_tgtNames = [targetIDNameMap[id] for id in top_tgtIDs]
            print('Top %s Prediction results are:\n' % nbest)
            for idx in range(nbest):
                print('top%d: %s , %f , %s ' % (idx + 1, top_tgtIDs[idx], top_confs[idx], top_tgtNames[idx]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
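# --- Illustration (not part of the original source): the retrieval step in demo() scores a
# source encoding against every target encoding with one matrix product, then ranks targets
# by that score. data_utils.getSortedResults is reimplemented here with np.argsort purely as
# an assumption about its behavior; the vectors are random toy data.
import numpy as np

def _rank_targets(source_encoding, target_encodings, nbest=3):
    distances = np.dot(source_encoding, target_encodings.T)   # shape: (1, num_targets)
    ranked_idx = np.argsort(-distances, axis=1)                # indices, descending by score
    ranked_score = -np.sort(-distances, axis=1)                # scores, descending
    return ranked_score[0][:nbest], ranked_idx[0][:nbest]

_src = np.random.rand(1, 8)        # one toy source encoding of dimension 8
_tgts = np.random.rand(100, 8)     # 100 toy target encodings
print(_rank_targets(_src, _tgts))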
def demo():
    if not os.path.exists(FLAGS.model_dir):
        print('Model folder does not exist!!')
        exit(-1)
    encodedFullTargetSpace_path = os.path.join(FLAGS.model_dir, "encoded.FullTargetSpace")
    if not os.path.exists(encodedFullTargetSpace_path):
        print('Encoded full target space file does not exist. Please retrain the model to get it!!')
        exit(-1)
    # load full set targetSeqID data
    encoder, encodedTgtSpace, tgtID_Name_Map = data_utils.load_encodedTargetSpace(FLAGS.model_dir)
    fullTgtIdList = encodedTgtSpace.keys()
    tgtLabel_IDMap = {idx: tgtid for (idx, tgtid) in enumerate(fullTgtIdList)}
    tgtInput_batches = [encodedTgtSpace[tgtid] for tgtid in fullTgtIdList]
    tgtLen_batches = [encodedTgtSpace[tgtid].index(text_encoder.PAD_ID) + 1 for tgtid in fullTgtIdList]
    cfg = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    with tf.Session(config=cfg) as sess:
        # TODO: improve here later
        # load model
        modelConfigs = data_utils.load_model_configs(FLAGS.model_dir)
        model = sse_model.SSEModel(int(modelConfigs['max_seq_length']),
                                   float(modelConfigs['max_gradient_norm']),
                                   int(modelConfigs['vocabsize']),
                                   int(modelConfigs['embedding_size']),
                                   int(modelConfigs['encoding_size']),
                                   int(modelConfigs['src_cell_size']),
                                   int(modelConfigs['tgt_cell_size']),
                                   int(modelConfigs['num_layers']),
                                   float(modelConfigs['learning_rate']),
                                   float(modelConfigs['learning_rate_decay_factor']),
                                   int(modelConfigs['targetSpaceSize']),
                                   network_mode=modelConfigs['network_mode'],
                                   forward_only=True,
                                   TOP_N=int(modelConfigs['TOP_N']))
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        if ckpt:
            print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print('Error!!! Could not load any model from specified folder: %s' % FLAGS.model_dir)
            exit(-1)
        # Decode from standard input.
        sys.stdout.write("\n\nPlease type some keywords to get related task results.\nType 'exit' to quit demo.\n > ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence and sentence.strip().lower() != 'exit':
            # Get token-ids for the input sentence.
            source_tokens = encoder.encode(tf.compat.as_str(sentence).lower())
            srclen = len(source_tokens)
            if srclen > int(modelConfigs['max_seq_length']) - 1:
                print('Max number of supported keywords is %d \n Please try again!!!!' % (int(modelConfigs['max_seq_length'])))
                # read a fresh sentence before continuing, otherwise the loop would repeat forever
                sentence = sys.stdin.readline()
                continue
            source_tokens = source_tokens + [text_encoder.EOS_ID] + [text_encoder.PAD_ID] * (int(modelConfigs['max_seq_length']) - srclen - 1)
            print("")
            feed_dict = model.get_predict_feed_dict(np.array([source_tokens]), tgtInput_batches,
                                                    np.array([srclen]), tgtLen_batches)
            pred_conf, pred_labels = sess.run([model.predicted_tgts_score, model.predicted_labels],
                                              feed_dict=feed_dict)
            pred_labels = np.vstack(pred_labels)
            pred_conf = np.vstack(pred_conf)
            top5_confs = pred_conf[0][:5]
            top5_tgtIDs = [tgtLabel_IDMap[lbl] for lbl in pred_labels[0][:5]]
            top5_tgtNames = [tgtID_Name_Map[id] for id in top5_tgtIDs]
            print('Top 5 Prediction results are:\n')
            for idx in range(5):
                print('top%d: %s , %f , %s ' % (idx + 1, top5_tgtIDs[idx], top5_confs[idx], top5_tgtNames[idx]))
            print("> ", end="")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
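# --- Illustration (not part of the original source): the two demo variants pad the token
# sequence differently before feeding it to the model. demo(nbest) above left-pads with
# PAD_ID and ends with EOS_ID, while this demo() variant appends EOS_ID and then right-pads
# with PAD_ID. PAD_ID=0 and EOS_ID=1 are assumptions following the common text_encoder
# convention, not verified from the source.
PAD_ID, EOS_ID = 0, 1

def _pad_left(tokens, max_seq_length):
    return [PAD_ID] * (max_seq_length - len(tokens) - 1) + tokens + [EOS_ID]

def _pad_right(tokens, max_seq_length):
    return tokens + [EOS_ID] + [PAD_ID] * (max_seq_length - len(tokens) - 1)

print(_pad_left([7, 8, 9], 8))    # [0, 0, 0, 0, 7, 8, 9, 1]
print(_pad_right([7, 8, 9], 8))   # [7, 8, 9, 1, 0, 0, 0, 0]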