def index(model_dir, rawfile, encodeIndexFile, batchsize=10000):
    """Restore the SSE model stored in ``model_dir`` and run ``createIndexFile``.

    The function checks that ``model_dir`` and its ``vocabulary.txt`` exist,
    loads the subword encoder, rebuilds the model from the saved model
    configs, restores the latest checkpoint and then hands everything to
    ``createIndexFile``. A missing prerequisite prints an error message and
    terminates through ``exit(-1)``.

    Args:
        model_dir: Folder holding ``vocabulary.txt``, the model configs and
            the TensorFlow checkpoint.
        rawfile: Forwarded unchanged to ``createIndexFile`` (presumably the
            raw file to be indexed -- confirm against that helper).
        encodeIndexFile: Forwarded unchanged to ``createIndexFile``
            (presumably the output index path -- confirm against that helper).
        batchsize: Forwarded unchanged to ``createIndexFile``.
    """
    vocab_path = os.path.join(model_dir, 'vocabulary.txt')

    # Each entry: (path that must exist, message template printed otherwise).
    # Both templates are filled with model_dir, as the messages refer to the
    # folder rather than to the individual file.
    required_paths = (
        (model_dir, 'Error! Model folder does not exist!! : %s'),
        (vocab_path,
         'Error!! Could not find vocabulary file for encoder in folder :%s'),
    )
    for required, complaint in required_paths:
        if not os.path.exists(required):
            print(complaint % model_dir)
            exit(-1)

    encoder = text_encoder.SubwordTextEncoder(filename=vocab_path)
    print("Loaded  vocab size is: %d" % encoder.vocab_size)

    session_config = tf.ConfigProto(log_device_placement=False,
                                    allow_soft_placement=True)
    with tf.Session(config=session_config) as sess:
        # Rebuild the graph from the stored configs, then load the weights.
        modelConfigs = data_utils.load_model_configs(model_dir)
        model = sse_model.SSEModel(modelConfigs)

        checkpoint = tf.train.get_checkpoint_state(model_dir)
        if not checkpoint:
            print(
                'Error!!!Could not load any model from specified folder: %s' %
                model_dir)
            exit(-1)
        checkpoint_file = checkpoint.model_checkpoint_path
        print("Reading model parameters from %s" % checkpoint_file)
        model.saver.restore(sess, checkpoint_file)

        # Run the actual indexing with the restored model.
        max_seq_length = int(modelConfigs['max_seq_length'])
        createIndexFile(model, encoder, rawfile, max_seq_length,
                        encodeIndexFile, sess, batchsize)
示例#2
0
def index(model_dir, rawfile, encodeIndexFile, batchsize=10000):
  """Load the trained SSE model from ``model_dir`` and call ``createIndexFile``.

  Prerequisites (the model folder, ``vocabulary.txt`` inside it, and a
  TensorFlow checkpoint) are verified in that order; if one is missing an
  error is printed and the process terminates through ``exit(-1)``.

  Args:
    model_dir: Folder containing the vocabulary, model configs and checkpoint.
    rawfile: Passed through to ``createIndexFile`` (presumably the raw input
      to encode -- confirm against that helper).
    encodeIndexFile: Passed through to ``createIndexFile`` (presumably the
      output index path -- confirm against that helper).
    batchsize: Passed through to ``createIndexFile``.
  """
  vocab_file = os.path.join(model_dir, 'vocabulary.txt')
  for needed_path, error_template in (
      (model_dir, 'Error! Model folder does not exist!! : %s'),
      (vocab_file, 'Error!! Could not find vocabulary file for encoder in folder :%s'),
  ):
    if not os.path.exists(needed_path):
      print(error_template % model_dir)
      exit(-1)

  encoder = text_encoder.SubwordTextEncoder(filename=vocab_file)
  print("Loaded  vocab size is: %d" % encoder.vocab_size)

  session_config = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
  with tf.Session(config=session_config) as sess:
    modelConfigs = data_utils.load_model_configs(model_dir)

    # Config values are cast on read; these helpers keep the call below short.
    def cfg_int(key):
      return int(modelConfigs[key])

    def cfg_float(key):
      return float(modelConfigs[key])

    # Positional order must match the SSEModel constructor signature.
    model = sse_model.SSEModel(
        cfg_int('max_seq_length'),
        cfg_float('max_gradient_norm'),
        cfg_int('vocabsize'),
        cfg_int('embedding_size'),
        cfg_int('encoding_size'),
        cfg_int('src_cell_size'),
        cfg_int('tgt_cell_size'),
        cfg_int('num_layers'),
        cfg_float('learning_rate'),
        cfg_float('learning_rate_decay_factor'),
        cfg_int('targetSpaceSize'),
        network_mode=modelConfigs['network_mode'],
        forward_only=True,
        TOP_N=cfg_int('TOP_N'))

    checkpoint = tf.train.get_checkpoint_state(model_dir)
    if not checkpoint:
      print('Error!!!Could not load any model from specified folder: %s' % model_dir)
      exit(-1)
    checkpoint_file = checkpoint.model_checkpoint_path
    print("Reading model parameters from %s" % checkpoint_file)
    model.saver.restore(sess, checkpoint_file)

    # Run the actual indexing with the restored model.
    createIndexFile(model, encoder, rawfile, cfg_int('max_seq_length'), encodeIndexFile, sess, batchsize)
示例#3
0
  def __init__(self, *args, **kwargs):
    """Set up the app: load the encoder, the target index and the SSE model.

    Configuration is read from environment variables:
      MODEL_TYPE: suffix of the model folder ``models-<MODEL_TYPE>``
        (defaults to "classification").
      INDEX_FILE: name of the target-encoding index file inside the model
        folder (defaults to "targetEncodingIndex.tsv").

    The process terminates through ``exit(-1)`` when the model folder, the
    index file, ``vocabulary.txt`` or a TensorFlow checkpoint is missing.

    Args:
      *args: Forwarded to the parent class constructor.
      **kwargs: Forwarded to the parent class constructor.
    """
    super(FlaskApp, self).__init__(*args, **kwargs)

    # Placeholder value; replaced by the real model further down.
    self.model = 'Do my initialization work here, loading model and index ....'
    self.model_type = os.environ.get("MODEL_TYPE", "classification")
    self.model_dir = "models-" + self.model_type
    self.indexFile = os.environ.get("INDEX_FILE", "targetEncodingIndex.tsv")
    print("In app class: Received flask appconfig is: " + os.environ.get('MODEL_TYPE', 'Default_classification'))

    if not os.path.exists(self.model_dir):
      print('Model folder %s does not exist!!' % self.model_dir)
      exit(-1)

    index_path = os.path.join(self.model_dir, self.indexFile)
    if not os.path.exists(index_path):
      print('Index File does not exist!!')
      exit(-1)

    # Load the subword encoder from the model folder.
    vocab_path = os.path.join(self.model_dir, 'vocabulary.txt')
    if not os.path.exists(vocab_path):
      print('Error!! Could not find vocabulary file for encoder in model folder.')
      exit(-1)
    self.encoder = text_encoder.SubwordTextEncoder(filename=vocab_path)

    # Load the full target index. Each valid row has exactly three
    # tab-separated fields: target id, target sequence text, and a
    # comma-separated list of floats (the target encoding).
    self.targetEncodings = []
    self.targetIDs = []
    self.targetIDNameMap = {}
    # The context manager closes the file handle deterministically; the
    # previous ``codecs.open(...).readlines()`` never closed it.
    with codecs.open(index_path, 'r', 'utf-8') as index_fh:
      for line in index_fh:
        info = line.strip().split('\t')
        if len(info) != 3:
          # Malformed rows are reported and skipped, not treated as fatal.
          print('Error in targetIndexFile! %s' % line)
          continue
        tgtid, tgtseq, tgtEncoding = info
        self.targetIDs.append(tgtid)
        self.targetEncodings.append([float(f) for f in tgtEncoding.strip().split(',')])
        self.targetIDNameMap[tgtid] = tgtseq
    self.targetEncodings = np.array(self.targetEncodings)

    cfg = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    # The session is kept on the instance so later calls can reuse it.
    self.sess = tf.Session(config=cfg)

    # Rebuild the model graph from the stored configs (inference mode).
    self.modelConfigs = data_utils.load_model_configs(self.model_dir)
    self.model = sse_model.SSEModel(
        int(self.modelConfigs['max_seq_length']),
        float(self.modelConfigs['max_gradient_norm']),
        int(self.modelConfigs['vocabsize']),
        int(self.modelConfigs['embedding_size']),
        int(self.modelConfigs['encoding_size']),
        int(self.modelConfigs['src_cell_size']),
        int(self.modelConfigs['tgt_cell_size']),
        int(self.modelConfigs['num_layers']),
        float(self.modelConfigs['learning_rate']),
        float(self.modelConfigs['learning_rate_decay_factor']),
        int(self.modelConfigs['targetSpaceSize']),
        network_mode=self.modelConfigs['network_mode'],
        forward_only=True,
        TOP_N=int(self.modelConfigs['TOP_N']))

    # Restore the trained weights from the latest checkpoint.
    ckpt = tf.train.get_checkpoint_state(self.model_dir)
    if ckpt:
      print("loading model from %s" % ckpt.model_checkpoint_path)
      self.model.saver.restore(self.sess, ckpt.model_checkpoint_path)
    else:
      print('Error!!!Could not load any model from specified folder: %s' % self.model_dir)
      exit(-1)
    def __init__(self, *args, **kwargs):
        """Set up the app: logging, encoder, target index and the SSE model.

        Configuration is read from environment variables:
            MODEL_TYPE: suffix of the model folder ``models-<MODEL_TYPE>``
                (defaults to "classification").
            INDEX_FILE: name of the target-encoding index file inside the
                model folder (defaults to "targetEncodingIndex.tsv").

        Log records go to stdout and to a rotating file
        ``./logs/WebServerLog.txt`` (20 MiB per file, 7 backups).

        The process terminates through ``exit(-1)`` when the model folder,
        the index file, ``vocabulary.txt`` or a TensorFlow checkpoint is
        missing.

        Args:
            *args: Forwarded to the parent class constructor.
            **kwargs: Forwarded to the parent class constructor.
        """
        super(FlaskApp, self).__init__(*args, **kwargs)

        # Placeholder value; replaced by the real model further down.
        self.model = 'Do my initialization work here, loading model and index ....'
        self.model_type = os.environ.get("MODEL_TYPE", "classification")
        self.model_dir = "models-" + self.model_type
        self.indexFile = os.environ.get("INDEX_FILE",
                                        "targetEncodingIndex.tsv")

        # exist_ok=True already tolerates an existing folder, so no separate
        # (race-prone) existence check is needed.
        os.makedirs("./logs", exist_ok=True)

        # Configure the root logger: console + rotating file, same format.
        # NOTE(review): handlers are added on every construction; creating
        # more than one app instance per process duplicates log lines.
        log = logging.getLogger('')
        log.setLevel(logging.DEBUG)
        formatter = logging.Formatter(
            "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
            datefmt='%m/%d/%Y %I:%M:%S %p')
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setFormatter(formatter)
        log.addHandler(console_handler)
        file_handler = handlers.RotatingFileHandler(
            './logs/WebServerLog.txt',
            maxBytes=(1048576 * 20),
            backupCount=7)
        file_handler.setFormatter(formatter)
        log.addHandler(file_handler)

        logging.info("In app class: Received flask appconfig is: " +
                     os.environ.get('MODEL_TYPE', 'Default_classification'))

        if not os.path.exists(self.model_dir):
            logging.error('Model folder %s does not exist!!', self.model_dir)
            exit(-1)

        index_path = os.path.join(self.model_dir, self.indexFile)
        if not os.path.exists(index_path):
            logging.error('Index File does not exist!!')
            exit(-1)

        # Load the subword encoder from the model folder.
        vocab_path = os.path.join(self.model_dir, 'vocabulary.txt')
        if not os.path.exists(vocab_path):
            logging.error(
                'Error!! Could not find vocabulary file for encoder in model folder.'
            )
            exit(-1)
        self.encoder = text_encoder.SubwordTextEncoder(filename=vocab_path)

        # Load the full target index. Each valid row has exactly three
        # tab-separated fields: target id, target sequence text, and a
        # comma-separated list of floats (the target encoding).
        self.targetEncodings = []
        self.targetIDs = []
        self.targetIDNameMap = {}
        # The context manager closes the file handle deterministically; the
        # previous ``codecs.open(...).readlines()`` never closed it.
        with codecs.open(index_path, 'r', 'utf-8') as index_fh:
            for line in index_fh:
                info = line.strip().split('\t')
                if len(info) != 3:
                    # Malformed rows are skipped; logged as a warning so they
                    # stand out from routine informational messages.
                    logging.warning('Error in targetIndexFile! %s', line)
                    continue
                tgtid, tgtseq, tgtEncoding = info
                self.targetIDs.append(tgtid)
                self.targetEncodings.append(
                    [float(f) for f in tgtEncoding.strip().split(',')])
                self.targetIDNameMap[tgtid] = tgtseq
        self.targetEncodings = np.array(self.targetEncodings)

        cfg = tf.ConfigProto(log_device_placement=False,
                             allow_soft_placement=True)
        # The session is kept on the instance so later calls can reuse it.
        self.sess = tf.Session(config=cfg)

        # Rebuild the model graph from the stored configs, then load weights
        # from the latest checkpoint.
        self.modelConfigs = data_utils.load_model_configs(self.model_dir)
        self.model = sse_model.SSEModel(self.modelConfigs)
        ckpt = tf.train.get_checkpoint_state(self.model_dir)
        if ckpt:
            logging.info("loading model from %s", ckpt.model_checkpoint_path)
            self.model.saver.restore(self.sess, ckpt.model_checkpoint_path)
        else:
            logging.error(
                'Error!!!Could not load any model from specified folder: %s',
                self.model_dir)
            exit(-1)
示例#5
0
def demo(nbest):
    """Run an interactive console demo that ranks targets for typed keywords.

    The model folder (``FLAGS.model_dir``) must contain ``vocabulary.txt``,
    the index file named by ``FLAGS.indexFile`` and a TensorFlow checkpoint;
    otherwise an error is printed and the process ends through ``exit(-1)``.

    For every line read from stdin (until EOF or the word ``exit``) the text
    is lower-cased, encoded, padded/truncated to the model's
    ``max_seq_length``, embedded by the model, and scored against all target
    encodings with a dot product. The best matches are printed.

    Args:
        nbest: Maximum number of ranked results printed per query. Fewer are
            printed when the index holds fewer targets.
    """
    if not os.path.exists(FLAGS.model_dir):
        print('Model folder does not exist!!')
        exit(-1)

    vocab_path = os.path.join(FLAGS.model_dir, 'vocabulary.txt')
    if not os.path.exists(vocab_path):
        print(
            'Error!! Could not find vocabulary file for encoder in model folder.'
        )
        exit(-1)
    encoder = text_encoder.SubwordTextEncoder(filename=vocab_path)

    index_path = os.path.join(FLAGS.model_dir, FLAGS.indexFile)
    if not os.path.exists(index_path):
        print('Index file does not exist!!!')
        exit(-1)

    # Load the full target index. Each valid row has exactly three
    # tab-separated fields: target id, target sequence text, and a
    # comma-separated list of floats (the target encoding).
    targetEncodings = []
    targetIDs = []
    targetIDNameMap = {}
    # Mode 'r' (not 'rt'): codecs.open appends 'b' when an encoding is given,
    # and Python 3 rejects the resulting 'rtb' mode with a ValueError.
    # The context manager also guarantees the handle is closed.
    with codecs.open(index_path, 'r', 'utf-8') as index_fh:
        for line in index_fh:
            info = line.strip().split('\t')
            if len(info) != 3:
                # Malformed rows are reported and skipped.
                print('Error in targetIndexFile! %s' % line)
                continue
            tgtid, tgtseq, tgtEncoding = info
            targetIDs.append(tgtid)
            targetEncodings.append(
                [float(f) for f in tgtEncoding.strip().split(',')])
            targetIDNameMap[tgtid] = tgtseq
    targetEncodings = np.array(targetEncodings)

    cfg = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    with tf.Session(config=cfg) as sess:
        # TODO: improve here later
        # Rebuild the model from its stored configs and restore the weights.
        modelConfigs = data_utils.load_model_configs(FLAGS.model_dir)
        model = sse_model.SSEModel(modelConfigs)
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        if ckpt:
            print("Reading model parameters from %s" %
                  ckpt.model_checkpoint_path)
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print(
                'Error!!!Could not load any model from specified folder: %s' %
                FLAGS.model_dir)
            exit(-1)

        # Loop-invariant: parse the configured sequence length once.
        max_seq_length = int(modelConfigs['max_seq_length'])

        # Decode from standard input.
        sys.stdout.write(
            "\n\nPlease type some keywords to get related task results.\nType 'exit' to quit demo.\n > "
        )
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence and sentence.strip().lower() != 'exit':
            # Get token-ids for the input sentence.
            source_tokens = encoder.encode(tf.compat.as_str(sentence).lower())
            srclen = len(source_tokens)
            if srclen > max_seq_length - 2:
                # Too long: warn, then truncate so that PAD + tokens + EOS
                # is exactly max_seq_length entries.
                print(
                    'Input sentence too long, max allowed is %d. Try to increase limit!!!!'
                    % (max_seq_length))
                source_tokens = [
                    text_encoder.PAD_ID
                ] + source_tokens[:max_seq_length - 2] + [text_encoder.EOS_ID]
            else:
                # Left-pad with PAD and terminate with EOS up to
                # max_seq_length entries.
                source_tokens = [text_encoder.PAD_ID] * (
                    max_seq_length - srclen - 1) + source_tokens + [
                        text_encoder.EOS_ID
                    ]

            feed_dict = model.get_source_encoding_feed_dict(
                np.array([source_tokens]))
            model.set_forward_only(True)
            sourceEncodings = sess.run([model.src_seq_embedding],
                                       feed_dict=feed_dict)
            sourceEncodings = np.vstack(sourceEncodings)
            # Score the query against every target encoding.
            distances = np.dot(sourceEncodings, targetEncodings.T)
            rankedScore, rankedIdx = data_utils.getSortedResults(distances)
            top_confs = rankedScore[0][:nbest]
            top_tgtIDs = [targetIDs[lbl] for lbl in rankedIdx[0][:nbest]]
            top_tgtNames = [targetIDNameMap[tgtid] for tgtid in top_tgtIDs]

            print('Top %s Prediction results are:\n' % nbest)
            # zip() stops at the shortest list, so an index with fewer than
            # nbest targets no longer raises IndexError.
            ranked = zip(top_tgtIDs, top_confs, top_tgtNames)
            for rank, (tgtid, conf, name) in enumerate(ranked, 1):
                print('top%d:  %s , %f ,  %s ' % (rank, tgtid, conf, name))
            print("> ", end="")

            sys.stdout.flush()
            sentence = sys.stdin.readline()
def demo():
    """Run an interactive console demo printing the top-5 predicted targets.

    The model folder (``FLAGS.model_dir``) must contain the
    ``encoded.FullTargetSpace`` file and a TensorFlow checkpoint; otherwise
    an error is printed and the process ends through ``exit(-1)``.

    For every line read from stdin (until EOF or the word ``exit``) the text
    is lower-cased, encoded, terminated with EOS and right-padded with PAD to
    the model's ``max_seq_length``, then fed to the model together with the
    full encoded target space. Up to five predictions are printed. Input that
    encodes to more tokens than the model supports is rejected with a message
    and the user is prompted again.
    """
    if not os.path.exists(FLAGS.model_dir):
        print('Model folder does not exist!!')
        exit(-1)
    encodedFullTargetSpace_path = os.path.join(FLAGS.model_dir,
                                               "encoded.FullTargetSpace")
    if not os.path.exists(encodedFullTargetSpace_path):
        print(
            'Encoded full target space file not exist. Please ReTrain the model to get it!!'
        )
        exit(-1)

    # Load the full encoded target space.
    encoder, encodedTgtSpace, tgtID_Name_Map = data_utils.load_encodedTargetSpace(
        FLAGS.model_dir)
    # Snapshot the ids once so every derived structure shares one ordering.
    fullTgtIdList = list(encodedTgtSpace)
    tgtLabel_IDMap = dict(enumerate(fullTgtIdList))
    tgtInput_batches = [encodedTgtSpace[tgtid] for tgtid in fullTgtIdList]
    # Length of each target up to and including its first PAD entry.
    tgtLen_batches = [
        encodedTgtSpace[tgtid].index(text_encoder.PAD_ID) + 1
        for tgtid in fullTgtIdList
    ]

    cfg = tf.ConfigProto(log_device_placement=False, allow_soft_placement=True)
    with tf.Session(config=cfg) as sess:
        # TODO: improve here later
        # Rebuild the model from its stored configs (inference mode).
        modelConfigs = data_utils.load_model_configs(FLAGS.model_dir)
        # Loop-invariant: parse the configured sequence length once.
        max_seq_length = int(modelConfigs['max_seq_length'])
        model = sse_model.SSEModel(
            max_seq_length,
            float(modelConfigs['max_gradient_norm']),
            int(modelConfigs['vocabsize']),
            int(modelConfigs['embedding_size']),
            int(modelConfigs['encoding_size']),
            int(modelConfigs['src_cell_size']),
            int(modelConfigs['tgt_cell_size']),
            int(modelConfigs['num_layers']),
            float(modelConfigs['learning_rate']),
            float(modelConfigs['learning_rate_decay_factor']),
            int(modelConfigs['targetSpaceSize']),
            network_mode=modelConfigs['network_mode'],
            forward_only=True,
            TOP_N=int(modelConfigs['TOP_N']))
        ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
        if ckpt:
            print("Reading model parameters from %s" %
                  ckpt.model_checkpoint_path)
            model.saver.restore(sess, ckpt.model_checkpoint_path)
        else:
            print(
                'Error!!!Could not load any model from specified folder: %s' %
                FLAGS.model_dir)
            exit(-1)

        # Decode from standard input.
        sys.stdout.write(
            "\n\nPlease type some keywords to get related task results.\nType 'exit' to quit demo.\n > "
        )
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence and sentence.strip().lower() != 'exit':
            # Get token-ids for the input sentence.
            source_tokens = encoder.encode(tf.compat.as_str(sentence).lower())
            srclen = len(source_tokens)
            if srclen > max_seq_length - 1:
                # Reject over-long input. The shared prompt/read step at the
                # bottom of the loop still runs; a bare ``continue`` here
                # would re-process the same sentence forever.
                print(
                    'Max number of supported keywords is  %d \n Please try againt!!!!'
                    % (max_seq_length))
            else:
                # Append EOS, then right-pad with PAD to max_seq_length.
                source_tokens = source_tokens + [
                    text_encoder.EOS_ID
                ] + [text_encoder.PAD_ID] * (max_seq_length - srclen - 1)

                print("")

                feed_dict = model.get_predict_feed_dict(
                    np.array([source_tokens]), tgtInput_batches,
                    np.array([srclen]), tgtLen_batches)
                pred_conf, pred_labels = sess.run(
                    [model.predicted_tgts_score, model.predicted_labels],
                    feed_dict=feed_dict)
                pred_labels = np.vstack(pred_labels)
                pred_conf = np.vstack(pred_conf)
                top5_confs = pred_conf[0][:5]
                top5_tgtIDs = [
                    tgtLabel_IDMap[lbl] for lbl in pred_labels[0][:5]
                ]
                top5_tgtNames = [tgtID_Name_Map[tgtid] for tgtid in top5_tgtIDs]

                print('Top 5 Prediction results are:\n')
                # zip() stops at the shortest list, so fewer than five
                # predictions no longer raise IndexError.
                ranked = zip(top5_tgtIDs, top5_confs, top5_tgtNames)
                for rank, (tgtid, conf, name) in enumerate(ranked, 1):
                    print('top%d:  %s , %f ,  %s ' % (rank, tgtid, conf, name))
            print("> ", end="")

            sys.stdout.flush()
            sentence = sys.stdin.readline()