def train_in_ids_lm(train_data, vocab_path, out_dir):
    """Convert a raw text corpus into word-id sequences for LM training.

    Each input line is tokenized (punctuation split into separate tokens),
    mapped to ids via DataUtility, and written to ``vocab_path/out_dir``
    as ``<ids>#<ids>`` (the same id string on both sides of the '#').

    Args:
        train_data: path to the raw corpus, one sentence per line.
        vocab_path: directory holding the vocab files; created if missing.
        out_dir: name of the output file written inside ``vocab_path``.
    """
    if not os.path.exists(vocab_path):
        os.mkdir(vocab_path)
    vocab_file_in_words = os.path.join(vocab_path, "vocab_in_words")
    vocab_file_out = os.path.join(vocab_path, "vocab_out")

    data_ut = DataUtility(vocab_file_in_words=vocab_file_in_words,
                          vocab_file_out=vocab_file_out)

    # Insert spaces so punctuation becomes separate tokens after split().
    replacements = (('.', ' .'), (',', ' ,'), ("'", "' "), ('"', '" '))
    with codecs.open(train_data, "r") as fin:
        with codecs.open(os.path.join(vocab_path, out_dir), "w") as fout:
            for line in fin:  # stream instead of readlines() on a large corpus
                words = line.strip()
                for old, new in replacements:
                    words = words.replace(old, new)
                word_ids = data_ut.words2ids(words.split())
                # Avoid shadowing the builtin `id` while stringifying.
                ids_str = ' '.join(str(word_id) for word_id in word_ids)
                fout.write(ids_str + '#' + ids_str + '\n')
示例#2
0
    def __init__(self, model_path, config_name):
        """Load vocabularies/config and the frozen sparse graph, then open a session.

        Args:
            model_path: directory containing the vocab files, the config file
                and the frozen ``.pb`` graph.
            config_name: config file name inside ``model_path``; also embedded
                in the frozen graph's file name.
        """
        # Resolve all model artifacts relative to model_path.
        vocab_file_in_words = os.path.join(model_path, "vocab_in_words")
        vocab_file_in_letters = os.path.join(model_path, "vocab_in_letters")
        vocab_file_out = os.path.join(model_path, "vocab_out")
        config_file = os.path.join(model_path, config_name)

        cfg = Config()
        cfg.get_config(config_file)
        self._data_utility = DataUtility(
            vocab_file_in_words=vocab_file_in_words,
            vocab_file_in_letters=vocab_file_in_letters,
            vocab_file_out=vocab_file_out,
            max_sentence_length=cfg.num_steps)
        self.sparsity = cfg.sparsity

        # Tensor names inside the imported (frozen) graph.
        prefix = "import/"
        self.top_k_name = prefix + "Online/Model/top_k:0"
        self.state_in_name = prefix + "Online/Model/state:0"
        self.input_name = prefix + "Online/Model/batched_input_word_ids:0"
        self.top_k_prediction_name = prefix + "Online/Model/top_k_prediction:1"
        self.output_name = prefix + "Online/Model/probabilities:0"
        self.state_out_name = prefix + "Online/Model/state_out:0"

        # Deserialize and import the frozen graph definition.
        saved_model_path = os.path.join(
            model_path, 'sparse_graph-finetune-' + config_name + '.pb')
        with open(saved_model_path, 'rb') as graph_fh:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(graph_fh.read())
            tf.import_graph_def(graph_def)

        # Cap GPU memory usage according to the loaded config.
        sess_config = tf.ConfigProto()
        sess_config.gpu_options.per_process_gpu_memory_fraction = cfg.gpu_fraction
        self._sess = tf.Session(config=sess_config)
    def __init__(self,
                 graph_file,
                 vocab_path,
                 full_vocab,
                 config_name,
                 use_phrase=False):
        """Load vocabularies/config, record tensor names, and import the
        frozen word+letter graph into a new TF session.

        Args:
            graph_file: path to the frozen ``.pb`` graph.
            vocab_path: directory holding vocab and config files.
            full_vocab: full input-word vocabulary file.
            config_name: config file name inside ``vocab_path``.
            use_phrase: whether phrase-prediction outputs are used.
        """
        vocab_file_in_words = os.path.join(vocab_path, "vocab_in_words")
        vocab_file_in_letters = os.path.join(vocab_path, "vocab_in_letters")
        vocab_file_out = os.path.join(vocab_path, "vocab_out")
        vocab_file_phrase = os.path.join(vocab_path, "vocab_phrase")

        self.use_phrase = use_phrase
        self._config = Config()
        self._config.get_config(vocab_path, config_name)
        self._data_utility = DataUtility(
            vocab_file_in_words=vocab_file_in_words,
            vocab_file_in_letters=vocab_file_in_letters,
            vocab_file_out=vocab_file_out,
            vocab_file_phrase=vocab_file_phrase,
            full_vocab_file_in_words=full_vocab)
        print(
            "in words vocabulary size = %d\nout words vocabulary size = %d\nin letters vocabulary size = %d"
            "\nphrase vocabulary size = %d" %
            (self._config.vocab_size_in, self._config.vocab_size_out,
             self._config.vocab_size_letter, self._config.vocab_size_phrase))

        # Tensor names inside the imported graph ("<op_name>:<output_index>").
        prefix = "import/"
        self.lm_state_in_name = prefix + "Online/WordModel/state:0"
        self.lm_input_name = prefix + "Online/WordModel/batched_input_word_ids:0"
        self.lm_state_out_name = prefix + "Online/WordModel/state_out:0"

        # BUG FIX: these five names contained a stray space after the colon
        # (e.g. "phrase_p_prediction: 1"); TensorFlow parses tensor names as
        # "<op>:<index>" with no whitespace and rejects " 1" as an index.
        self.phrase_p_name = prefix + "Online/WordModel/phrase_p_prediction:1"
        self.phrase_p_probability = prefix + "Online/WordModel/phrase_p_probabilities:0"
        self.phrase_top_k_name = prefix + "Online/WordModel/phrase_top_k_prediction:1"
        self.phrase_top_k_probability = prefix + "Online/WordModel/phrase_probabilities:0"
        self.phrase_logits = prefix + "Online/WordModel/logits_phrase:0"

        self.kc_top_k_name = prefix + "Online/LetterModel/top_k:0"
        self.key_length = prefix + "Online/LetterModel/batched_input_sequence_length:0"
        self.kc_state_in_name = prefix + "Online/LetterModel/state:0"
        self.kc_lm_state_in_name = prefix + "Online/LetterModel/lm_state_in:0"
        self.kc_input_name = prefix + "Online/LetterModel/batched_input_word_ids:0"
        self.kc_top_k_prediction_name = prefix + "Online/LetterModel/top_k_prediction:1"
        self.kc_output_name = prefix + "Online/LetterModel/probabilities:0"
        self.kc_state_out_name = prefix + "Online/LetterModel/state_out:0"

        # Import the frozen graph and open a GPU-memory-capped session.
        with open(graph_file, 'rb') as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())
            tf.import_graph_def(graph_def)

        gpu_config = tf.ConfigProto()
        gpu_config.gpu_options.per_process_gpu_memory_fraction = self._config.gpu_fraction
        self._sess = tf.Session(config=gpu_config)
    def render_PUT_advanced(self, request, response):
        """Handle an advanced PUT: parse the payload and dispatch it to the
        data handler registered for the request URI.

        Returns:
            (self, response) when the handler result should be sent back,
            (self, None) when the handler defers the response, or
            None when no handler matched or the payload was empty.
        """
        request_text = str(request.payload)
        uri = request.uri_path
        # The handler key is derived from the first two URI segments.
        first_part_uri = uri.split('/')[0]
        second_part_uri = uri.split('/')[1]
        local_uri = first_part_uri + '/' + second_part_uri
        key = self.get_data_handlers_key_byuri(local_uri)
        if key is not None and request_text is not None and "" != request_text:
            listener = self._handler.get(key)
            if listener is not None:
                fmt = request.content_type
                from data_utility import DataUtility
                parser = DataUtility().get_parser(fmt)
                if parser is None:
                    # No format-specific parser: wrap the raw payload text.
                    from ..model.resource_data_general import ResourceDataGeneral
                    data = ResourceDataGeneral(request_text)
                else:
                    data = parser.parse(request_text)
                data.set_format(fmt)  # ResourceDataOCF/LMW2M/GENERAL
                paths = uri.split("/")
                # BUG FIX: accessing paths[3] requires len(paths) > 3; the
                # previous "> 2" guard raised IndexError on 3-segment URIs.
                device_id = paths[3] if len(paths) > 3 else ""

                # Everything after the device segment is the resource URI,
                # normalized to start with "/".
                resource_uri = ""
                if len(paths[4:]) > 0:
                    resource_uri = "/".join(paths[4:])
                    if not resource_uri.startswith("/"):
                        resource_uri = "/" + resource_uri

                rt = listener(device_id, resource_uri,
                              data)  # listener is a function
                if key.process:
                    if rt is True:
                        response.code = defines.Codes.CHANGED.number
                    else:
                        response.code = defines.Codes.FORBIDDEN.number
                    response.content_type = fmt
                    response.payload = data.to_json()
                    return self, response
                else:
                    return self, None

        # No matching handler or empty payload: nothing to send (was an
        # implicit fall-through returning None).
        return None
示例#5
0
    def __init__(self,
                 model_path,
                 model_name,
                 config_name,
                 full_vocab_path=None):
        """Build the test-time PTB language model and restore its weights.

        Args:
            model_path: directory containing the vocab files, the checkpoint
                and the config file.
            model_name: checkpoint file name inside ``model_path``.
            config_name: config file name inside ``model_path``.
            full_vocab_path: optional full input-word vocabulary file.
        """
        vocab_file_in_words = os.path.join(model_path, "vocab_in_words")
        vocab_file_in_letters = os.path.join(model_path, "vocab_in_letters")
        vocab_file_out = os.path.join(model_path, "vocab_out")
        model_file = os.path.join(model_path, model_name)
        config_file = os.path.join(model_path, config_name)

        self._config = Config()
        self._config.get_config(config_file)
        # NOTE: DataUtility must see the configured num_steps BEFORE it is
        # overwritten below — keep this ordering.
        self._data_utility = DataUtility(
            vocab_file_in_words=vocab_file_in_words,
            vocab_file_in_letters=vocab_file_in_letters,
            vocab_file_out=vocab_file_out,
            max_sentence_length=self._config.num_steps,
            full_vocab_file_in_words=full_vocab_path)
        # Step-by-step inference: one sample, one step at a time.
        self._config.batch_size = 1
        self._config.num_steps = 1

        with tf.Graph().as_default():
            with tf.variable_scope("Model"):
                self._language_model_test = PTBModel(is_training=False,
                                                     config=self._config,
                                                     bucket=1)

            # Cap GPU memory usage according to the loaded config.
            gpu_config = tf.ConfigProto()
            gpu_config.gpu_options.per_process_gpu_memory_fraction = self._config.gpu_fraction
            self._sess = tf.Session(config=gpu_config)
            with self._sess.as_default():
                # Do not restore sparse weights from pretrain phase
                restore_variables = dict()
                for v in tf.trainable_variables():
                    if v.name.startswith("Model/Softmax/softmax_sp_trainable_weights") \
                            or v.name.startswith("Model/Embedding/embedding_sp_trainable_weights"):
                        continue
                    print("restore:", v.name)
                    restore_variables[v.name] = v
                saver = tf.train.Saver(restore_variables)
                saver.restore(self._sess, model_file)

            # Tensors fetched at inference time (top-k ids, their
            # probabilities, and the RNN state to carry forward).
            self._fetches = {
                "topk": self._language_model_test._top_k_prediction,
                "probability": self._language_model_test._probabilities,
                "final_state": self._language_model_test.final_state
            }
示例#6
0
def main():
    """Create a template AVHRR FCDR dataset, fill a few sample variables and
    write it to disk as netCDF4."""
    writer = FCDRWriter()

    # Get a template for sensor name in FULL format, supply product height;
    # the scan-width is set automatically.
    dataset = writer.createTemplateFull("AVHRR", 128)

    # Mandatory CF global attributes — writing fails if any is missing.
    # CF version and FIDUCEO license are set automatically.
    dataset.attrs["institution"] = "Brockmann Consult GmbH"
    dataset.attrs["title"] = "FIDUCEO test dataset"
    # BUG FIX: corrected the "arbitray" typo in the written metadata.
    dataset.attrs["source"] = "arbitrary stuff"
    dataset.attrs["history"] = "none"
    dataset.attrs["references"] = "CDR_FCDR sensor reference documentation"
    dataset.attrs[
        "comment"] = "just to show how things are intended to be used"

    # Write real data to the variables. All variables initially contain
    # "_FillValue"; partially filled arrays are completely OK.
    Time = dataset.variables["Time"]
    Time.data[44] = 0.456
    Time.data[45] = 0.457

    raa = dataset.variables["relative_azimuth_angle"]
    raa.data[3, 0] = 0.567
    raa.data[3, 1] = 0.568

    # Ensure not to generate over/underflows.
    DataUtility.check_scaling_ranges(raa)

    # Create a standardized file name.
    start = datetime.datetime(2006, 8, 23, 14, 24, 52)
    end = datetime.datetime(2006, 8, 23, 15, 25, 53)
    file_name = writer.create_file_name_FCDR_full("AVHRR", "NOAA12", start,
                                                  end, "01.2")

    # Dump it to disk: netcdf4, medium compression, overwrite existing file.
    # TODO(review): output directory is hard-coded to a Windows path —
    # consider making it a parameter.
    writer.write(dataset,
                 "D:\\Satellite\\DELETE\\" + file_name,
                 overwrite=True)
def train_in_ids_letters(train_data, vocab_path, emoji_data):
    """Convert the letter (keystroke) side of the training corpus to ids.

    Reads ``train_data`` (tab-separated; the first field is a '#'-joined
    word sequence), maps each word's letters to letter ids — emojis map to
    the single id '1' — and writes '#'-joined id strings, one line per
    sentence, to ``vocab_path/train_in_ids_letters``.

    Args:
        train_data: path to the tab-separated training corpus.
        vocab_path: directory holding the vocab files; created if missing.
        emoji_data: tab-separated file whose first column lists emojis.
    """
    if not os.path.exists(vocab_path):
        os.mkdir(vocab_path)
    vocab_file_in_words = os.path.join(vocab_path, "vocab_in_words")
    vocab_file_in_letters = os.path.join(vocab_path, "vocab_in_letters")
    vocab_file_out = os.path.join(vocab_path, "vocab_out")

    data_ut = DataUtility(vocab_file_in_words=vocab_file_in_words,
                          vocab_file_in_letters=vocab_file_in_letters,
                          vocab_file_out=vocab_file_out)

    # Use a set for O(1) emoji membership tests (was a list: O(n) per word).
    emojis = set()
    with codecs.open(emoji_data, "r") as f:
        for line in f:
            emoji, _ = line.strip().split('\t')
            emojis.add(emoji)

    with codecs.open(train_data, "r") as f:
        with codecs.open(os.path.join(vocab_path, "train_in_ids_letters"),
                         "w") as f1:
            for line in f:  # stream the corpus instead of readlines()
                letters, _ = line.strip().split('\t')
                # e.g. ['where', 'so', 'you', 'want', 'me', ...]
                words = letters.split('#')
                letters_ids = ['1']  # leading sentence marker id
                for word in words:
                    if word in emojis:
                        # Emojis are represented by the single id '1'.
                        letters_ids.append('1')
                        continue
                    # 'where' -> 'w h e r e' -> e.g. [1, 25, 10, 7, 20, 7]
                    letter_ids = data_ut.letters2ids(' '.join(word))
                    letters_ids.append(
                        ' '.join(str(lid) for lid in letter_ids))
                f1.write('#'.join(letters_ids) + '\n')
示例#8
0
    def get_config(self, vocab_path, config_filename=None):
        """Load vocabularies from ``vocab_path`` and, optionally, numeric
        hyper-parameters from a config file.

        Vocabulary sizes are always refreshed from the vocab files.  When
        ``config_filename`` is given, each non-comment line is parsed as
        "<param> <value>"; lines starting with '#' are skipped and unknown
        parameter names are silently ignored (as before).
        """
        vocab_file_in_words = os.path.join(vocab_path, "vocab_in_words")
        vocab_file_in_letters = os.path.join(vocab_path, "vocab_in_letters")
        vocab_file_out = os.path.join(vocab_path, "vocab_out")
        vocab_file_phrase = os.path.join(vocab_path, "vocab_phrase")

        print ("the data file path is:", vocab_path)

        self.data_utility = DataUtility(vocab_file_in_words=vocab_file_in_words,
                                        vocab_file_in_letters=vocab_file_in_letters,
                                        vocab_file_out=vocab_file_out,
                                        vocab_file_phrase=vocab_file_phrase)

        self.vocab_size_letter = self.data_utility.in_letters_count
        self.vocab_size_in = self.data_utility.in_words_count
        self.vocab_size_out = self.data_utility.out_words_count
        self.vocab_size_phrase = self.data_utility.phrase_count

        if config_filename is not None:
            # Parameter name -> parser type; attribute names match config
            # keys one-to-one.  Replaces the previous 16-branch if/elif chain.
            param_types = {
                "init_scale": float,
                "learning_rate": float,
                "max_grad_norm": float,
                "num_layers": int,
                "num_steps": int,
                "max_word_length": int,
                "word_embedding_size": int,
                "letter_embedding_size": int,
                "word_hidden_size": int,
                "letter_hidden_size": int,
                "max_epoch": int,
                "max_max_epoch": int,
                "keep_prob": float,
                "lr_decay": float,
                "batch_size": int,
                "gpu_fraction": float,
            }
            with open(config_filename) as f:
                for line in f:
                    if line.startswith('#'):
                        continue
                    param, value = line.split()
                    cast = param_types.get(param)
                    if cast is not None:
                        setattr(self, param, cast(value))
示例#9
0
    def __init__(self, config, vocab_file_in_words="resource/vocab/vocab_in_words",
                 vocab_file_in_letters="resource/vocab/vocab_in_letters",
                 vocab_file_out="resource/vocab/vocab_out",
                 corpus_file_in_words="resource/train_data/train_in_ids_words",
                 corpus_file_in_letters="resource/train_data/train_in_ids_letters",
                 corpus_file_out="resource/train_data/train_out_ids"):
        """Load the word/letter/output corpora and distribute every sample
        into length buckets for padded batching.

        Samples whose combined word+letter length exceeds the largest bucket
        are truncated from the word side (letters are always kept in full).
        """
        # Use bucketing to reduce padding
        self.PAD_ID = 0
        # Bucket lengths from config — presumably sorted ascending so the
        # enumerate() below finds the smallest fitting bucket; TODO confirm.
        self.Buckets = config.buckets
        self.data_utility = DataUtility(vocab_file_in_words=vocab_file_in_words,
                                        vocab_file_in_letters=vocab_file_in_letters,
                                        vocab_file_out=vocab_file_out, max_sentence_length=0)

        corpus_in_words = self.load_corpus(corpus_file_in_words)
        corpus_in_letters = self.load_corpus(corpus_file_in_letters)
        corpus_out = self.load_corpus(corpus_file_out)
        self.all_data = [[] for _ in self.Buckets]  # all_data which is divided into different bukets
        for i in range(len(corpus_in_words)):
            in_words_array = corpus_in_words[i].strip().split()
            in_letters_array = corpus_in_letters[i].strip().split()
            # Skip completely empty samples.
            if len(in_letters_array) + len(in_words_array) == 0:
                continue
            # Only samples whose letter part fits the largest bucket are kept.
            if len(in_letters_array) <= self.Buckets[-1]:
                for bucketid, bucketlength in enumerate(self.Buckets):
                    if len(in_letters_array) + len(in_words_array) <= bucketlength:
                        # Fits this bucket: pad the combined sequence up to
                        # the bucket length.
                        # NOTE(review): pads with int 0 into a list of id
                        # *strings* — presumably normalized downstream; verify.
                        in_data = in_words_array + in_letters_array + [self.PAD_ID] * (bucketlength - len(in_words_array) - len(in_letters_array))
                        words_num = len(in_words_array)
                        letters_num = len(in_letters_array)
                        out_data = corpus_out[i].strip()
                        data = Data(in_data=in_data, words_num=words_num, letters_num=letters_num, out_data=out_data)
                        self.all_data[bucketid].append(data)
                        break
                    # Too long even for the largest bucket: keep all letters
                    # and only the trailing words that fit.
                    if len(in_letters_array) + len(in_words_array) > self.Buckets[-1]:
                        if len(in_letters_array) < self.Buckets[-1]:
                            in_data = in_words_array[-(self.Buckets[-1] - len(in_letters_array)):] + in_letters_array
                        else:
                            in_data = in_letters_array
                        words_num = self.Buckets[-1] - len(in_letters_array)
                        letters_num = len(in_letters_array)
                        out_data = corpus_out[i].strip()
                        data = Data(in_data=in_data, words_num=words_num, letters_num=letters_num, out_data=out_data)
                        # NOTE(review): index(self.Buckets[-1]) assumes bucket
                        # lengths are unique (equals len(self.Buckets)-1 then).
                        self.all_data[self.Buckets.index(self.Buckets[-1])].append(data)
                        break

        # Per-bucket sample counts and the cumulative distribution used to
        # pick a bucket when sampling training batches.
        self.train_bucket_sizes = [len(self.all_data[b]) for b in range(len(self.Buckets))]
        print ("bucket size = " + str(self.train_bucket_sizes))
        self.num_samples = float(sum(self.train_bucket_sizes))
        self.train_buckets_scale = [sum(self.train_bucket_sizes[:i + 1]) / self.num_samples for i in range(len(self.train_bucket_sizes))]
        print ("bucket_scale = " + str(self.train_buckets_scale))
        print ("samples num = " + str(self.num_samples))
        self.current_batch_index = [0 for i in range(len(self.Buckets))]
        # Working copies consumed during iteration over an epoch.
        self.tmp_bucket_sizes = [len(self.all_data[b]) for b in range(len(self.Buckets))]
        self.tmp_bucket_scale = [sum(self.train_bucket_sizes[:i + 1]) / self.num_samples for i in range(len(self.train_bucket_sizes))]
def train_in_ids_lm(train_data, vocab_path):
    """Build the LM-side training ids file.

    Each line of ``train_data`` is tab-separated; the second field is a
    '#'-joined word sequence.  The words are mapped to ids and written to
    ``vocab_path/train_in_ids_lm`` as "<ids>#<ids>".
    """
    if not os.path.exists(vocab_path):
        os.mkdir(vocab_path)
    vocab_file_in_words = os.path.join(vocab_path, "vocab_in_words")
    vocab_file_in_letters = os.path.join(vocab_path, "vocab_in_letters")
    vocab_file_out = os.path.join(vocab_path, "vocab_out")

    data_ut = DataUtility(vocab_file_in_words=vocab_file_in_words,
                          vocab_file_in_letters=vocab_file_in_letters,
                          vocab_file_out=vocab_file_out)

    out_path = os.path.join(vocab_path, "train_in_ids_lm")
    with codecs.open(train_data, "r") as src:
        with codecs.open(out_path, "w") as dst:
            for line in src:
                _, words_field = line.strip().split('\t')
                id_list = data_ut.words2ids(words_field.split('#'))
                ids_text = ' '.join([str(wid) for wid in id_list])
                # The id string appears on both sides of the '#'.
                dst.write(ids_text + '#' + ids_text + '\n')
示例#11
0
    def __init__(self, config, is_train=True, vocab_path="../lang-8_process/user_data/",
                 data_path="../lang-8_process/user_data/"):
        """Load the LM and letter corpora, bucket every sample via
        ``gen_data``, and precompute per-bucket batching statistics."""
        # Vocabulary files live under vocab_path.
        vocab_file_in_words = os.path.join(vocab_path, "vocab_in_words")
        vocab_file_in_letters = os.path.join(vocab_path, "vocab_in_letters")
        vocab_file_out = os.path.join(vocab_path, "vocab_out")

        # Pick the train or dev split of the preprocessed corpora.
        phase = "train" if is_train else "dev"
        corpus_file_in_letters = os.path.join(data_path, phase + "_in_ids_letters")
        corpus_file_in_lm = os.path.join(data_path, phase + "_in_ids_lm")

        self.PAD_ID = 0
        self.Buckets = config.buckets
        self.num_steps = config.num_steps

        self.data_utility = DataUtility(vocab_file_in_words=vocab_file_in_words,
                                        vocab_file_in_letters=vocab_file_in_letters,
                                        vocab_file_out=vocab_file_out)
        # One sample list per bucket; gen_data fills these in.
        self.all_data = [[] for _ in self.Buckets]

        lm_lines = self.load_corpus(corpus_file_in_lm)
        letter_lines = self.load_corpus(corpus_file_in_letters)
        assert len(lm_lines) == len(letter_lines)

        for lm_line, letter_line in zip(lm_lines, letter_lines):
            fields = lm_line.strip().split("#")
            lemma_words = fields[0].split()
            words = fields[1].split()
            # A third '#'-field, when present, carries the lemma indices.
            lemma_index = fields[2].split() if len(fields) == 3 else [0]
            letters = [chunk.split() for chunk in letter_line.strip().split("#")]
            self.gen_data(words, lemma_words, lemma_index, letters)

        # Per-bucket sizes and the cumulative sampling distribution.
        self.train_bucket_sizes = [len(bucket) for bucket in self.all_data]
        print ("bucket size = " + str(self.train_bucket_sizes))
        self.num_samples = float(sum(self.train_bucket_sizes))
        self.train_buckets_scale = [sum(self.train_bucket_sizes[:i + 1]) / self.num_samples
                                    for i in range(len(self.train_bucket_sizes))]
        print ("bucket_scale = " + str(self.train_buckets_scale))
        print ("samples num = " + str(self.num_samples))
        self.current_batch_index = [0] * len(self.Buckets)
        # Working copies consumed while iterating an epoch.
        self.tmp_bucket_sizes = [len(bucket) for bucket in self.all_data]
        self.tmp_bucket_scale = [sum(self.train_bucket_sizes[:i + 1]) / self.num_samples
                                 for i in range(len(self.train_bucket_sizes))]
示例#12
0
    def __init__(self, vocab_file_in_words="resource/vocab/vocab_in_words",
                 vocab_file_in_letters="resource/vocab/vocab_in_letters",
                 vocab_file_out="resource/vocab/vocab_out",
                 corpus_file_in_words="resource/train_data/train_in_ids_words",
                 corpus_file_in_letters="resource/train_data/train_in_ids_letters",
                 corpus_file_out="resource/train_data/train_out_ids",
                 max_sentence_length=30):
        """Load the word/letter/output corpora and pad or truncate every
        sample to a fixed sentence length."""
        # Use bucketing to reduce padding
        self.PAD_ID = 0

        self.data_utility = DataUtility(vocab_file_in_words=vocab_file_in_words,
                                         vocab_file_in_letters=vocab_file_in_letters,
                                         vocab_file_out=vocab_file_out, max_sentence_length=max_sentence_length)

        words_lines = self.load_corpus(corpus_file_in_words)
        letters_lines = self.load_corpus(corpus_file_in_letters)
        out_lines = self.load_corpus(corpus_file_out)
        self.all_data = []
        for i in range(len(words_lines)):
            word_tokens = words_lines[i].strip().split()
            letter_tokens = letters_lines[i].strip().split()
            if len(letter_tokens) > max_sentence_length:
                # The letter part alone cannot fit: drop the sample.
                continue
            total = len(word_tokens) + len(letter_tokens)
            if total <= max_sentence_length:
                # Pad the combined sequence up to the fixed length.
                in_data = word_tokens + letter_tokens + [self.PAD_ID] * (max_sentence_length - total)
                words_num = len(word_tokens)
            else:
                # Keep all letters; keep only as many trailing words as fit.
                if len(letter_tokens) < max_sentence_length:
                    in_data = word_tokens[-(max_sentence_length - len(letter_tokens)):] + letter_tokens
                else:
                    in_data = letter_tokens
                words_num = max_sentence_length - len(letter_tokens)
            self.all_data.append(Data(in_data=in_data, words_num=words_num,
                                      letters_num=len(letter_tokens),
                                      out_data=out_lines[i].strip()))
        self.num_samples = len(self.all_data)
        print ("samples num = " + str(self.num_samples))
        self.current_batch_index = 0
        self.max_sentence_length = max_sentence_length
class InputEngineRnn:
    """Inference engine driving a frozen word-level RNN language model.

    The imported GraphDef contains both a word model ("Online/WordModel/...")
    and a letter-level key-correction model ("Online/LetterModel/..."); in
    this variant only the word model is actually run — the letter-model and
    phrase-prediction paths are retained below as commented-out code.
    """

    def __init__(self, graph_file, vocab_path, config_name):
        """Load vocabularies and config from vocab_path and import graph_file
        (a serialized GraphDef .pb) into a fresh tf.Session."""
        vocab_file_in_words = os.path.join(vocab_path, "vocab_in_words")
        vocab_file_in_letters = os.path.join(vocab_path, "vocab_in_letters")
        vocab_file_out = os.path.join(vocab_path, "vocab_out")
        vocab_file_phrase = os.path.join(vocab_path, "vocab_phrase")

        self._config = Config()
        self._config.get_config(vocab_path, config_name)
        self._data_utility = DataUtility(
            vocab_file_in_words=vocab_file_in_words,
            vocab_file_in_letters=vocab_file_in_letters,
            vocab_file_out=vocab_file_out,
            vocab_file_phrase=vocab_file_phrase)
        print(
            "in words vocabulary size = %d\nout words vocabulary size = %d\nin letters vocabulary size = %d"
            "\nphrase vocabulary size = %d" %
            (self._config.vocab_size_in, self._config.vocab_size_out,
             self._config.vocab_size_letter, self._config.vocab_size_phrase))

        # Tensor names inside the imported graph ("import/" is the default
        # scope prefix added by tf.import_graph_def).
        prefix = "import/"
        self.lm_state_in_name = prefix + "Online/WordModel/state:0"
        self.lm_input_name = prefix + "Online/WordModel/batched_input_word_ids:0"
        self.lm_state_out_name = prefix + "Online/WordModel/state_out:0"
        self.lm_output_top_k_name = prefix + "Online/WordModel/top_k_prediction:1"
        self.lm_output_top_k_probability = prefix + "Online/WordModel/probabilities:0"
        self.lm_top_k_name = prefix + "Online/WordModel/top_k:0"

        # NOTE(review): these phrase tensor names embed a space before the
        # output index (": 1"); they are only referenced from commented-out
        # code below — verify the names before re-enabling the phrase path.
        self.phrase_p_name = prefix + "Online/WordModel/phrase_p_prediction: 1"
        self.phrase_p_probability = prefix + "Online/WordModel/phrase_p_probabilities: 0"
        self.phrase_top_k_name = prefix + "Online/WordModel/phrase_top_k_prediction: 1"
        self.phrase_top_k_probability = prefix + "Online/WordModel/phrase_probabilities: 0"
        self.phrase_logits = prefix + "Online/WordModel/logits_phrase: 0"

        # Letter (key-correction) model tensors — unused on the live path here.
        self.kc_top_k_name = prefix + "Online/LetterModel/top_k:0"
        self.key_length = prefix + "Online/LetterModel/batched_input_sequence_length:0"
        self.kc_state_in_name = prefix + "Online/LetterModel/state:0"
        self.kc_lm_state_in_name = prefix + "Online/LetterModel/lm_state_in:0"
        self.kc_input_name = prefix + "Online/LetterModel/batched_input_word_ids:0"
        self.kc_top_k_prediction_name = prefix + "Online/LetterModel/top_k_prediction:1"
        self.kc_output_name = prefix + "Online/LetterModel/probabilities:0"
        self.kc_state_out_name = prefix + "Online/LetterModel/state_out:0"

        with open(graph_file, 'rb') as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())
            tf.import_graph_def(graph_def)

        gpu_config = tf.ConfigProto()
        gpu_config.gpu_options.per_process_gpu_memory_fraction = self._config.gpu_fraction
        self._sess = tf.Session(config=gpu_config)

    def predict(self, sentence, k):
        """Return the top-k next-word candidates for *sentence*.

        Each element is a {'word', 'probability'} dict; an '<unk>' candidate
        is replaced by the last partial word wrapped in angle brackets.
        Returns [] when the sentence yields no word ids.
        """
        # NOTE(review): these globals are never read before assignment here;
        # the declaration looks like leftover scaffolding.
        global probabilities, top_k_predictions, probability_topk, probability_p_topk, phrase_p_top_k
        inputs, inputs_key, word_letters = self._data_utility.sentence2ids(
            sentence)  # word_letters is the last (partial) word
        # print(inputs)
        # print(inputs_key)
        # Zero LSTM initial states, shaped [num_layers, 2 (c,h), batch=1, hidden].
        lm_state_out = np.zeros(
            [self._config.num_layers, 2, 1, self._config.word_hidden_size],
            dtype=np.float32)
        kc_state_out = np.zeros(
            [self._config.num_layers, 2, 1, self._config.letter_hidden_size],
            dtype=np.float32)
        words_out = list()
        phrase_logits = None
        if len(inputs) > 0:  # loop over every word of the input sentence
            for i in range(len(inputs)):
                feed_values = {
                    self.lm_input_name: [[inputs[i]]],
                    self.lm_top_k_name: k
                }  # the extra list nesting supplies the batch dimension (batch size 1)
                if i > 0:
                    # Thread the previous step's final LSTM state back in.
                    feed_values[self.lm_state_in_name] = lm_state_out
                # lm_state_out, phrase_p_top_k, phrase_p_prob, phrase_logits = self._sess.run([self.lm_state_out_name,
                #                                                                                self.phrase_p_name,
                #                                                                                self.phrase_p_probability,
                #                                                                                self.phrase_logits],
                #                                                                               feed_dict=feed_values)
                lm_state_out, lm_prob, lm_top_k = self._sess.run(
                    [
                        self.lm_state_out_name,
                        self.lm_output_top_k_probability,
                        self.lm_output_top_k_name
                    ],
                    feed_dict=feed_values)

                # phrase_p_top_k = [id for id in phrase_p_top_k[0]]  # [0] selects the first (and only) batch entry
                # probability_p_topk = [phrase_p_prob[0][id] for id in phrase_p_top_k]  # corresponding normalized probabilities

                lm_top_k = [id for id in lm_top_k[0]]
                lm_probability_topk = [lm_prob[0][id] for id in lm_top_k]
                words_out = self._data_utility.ids2outwords(lm_top_k)

        # for i in range(len(inputs_key)):  # loop over the letters of the last word
        #     feed_values = {self.kc_input_name: [[inputs_key[i]]],
        #                    self.kc_top_k_name: k}
        #     if i == 0 and len(inputs) > 0:
        #         feed_values[self.kc_lm_state_in_name] = lm_state_out
        #     else:
        #         feed_values[self.kc_state_in_name] = kc_state_out
        #     probabilities, top_k_predictions, kc_state_out = self._sess.run([self.kc_output_name, self.kc_top_k_prediction_name,
        #                                                                   self.kc_state_out_name], feed_dict=feed_values)
        #     probability_topk = [probabilities[0][id] for id in top_k_predictions[0]]  # top-k softmax-normalized probabilities (probabilities[0])
        #     words_out = self._data_utility.ids2outwords(top_k_predictions[0])  # convert the top-k ids to words
        #     if i == 0 and len(inputs) > 0:
        #         top_word = words_out[0]  # the most probable word
        #         top_phrase = self._data_utility.get_top_phrase(phrase_logits, top_word)  # most probable phrase starting with that word, plus its probability (2-tuple)
        #         if top_phrase[0] is not None:
        #             is_phrase_p, phrase_p = self.calculate_phrase_p(top_phrase, probability_p_topk, phrase_p_top_k)
        #             words_out, probability_topk = self.final_words_out(words_out, top_phrase, phrase_p, probability_topk)  # swap in the phrase where it beats a single-word probability

        # NOTE(review): lm_probability_topk is only bound inside the loop
        # above; this is safe because words_out is non-empty only when the
        # loop ran at least once.
        return [{
            'word': word,
            'probability': float(probability)
        } if word != '<unk>' else {
            'word': '<' + word_letters + '>',
            'probability': float(probability)
        } for word, probability in zip(words_out, lm_probability_topk)
                ] if len(words_out) > 0 else []

    def predict_data(self, sentence, k):
        """Run the word model over one pre-formatted data line.

        Returns (words_line, letters_line, out_str_list, probability_topk_list)
        where out_str_list[i] holds top-k word lists for every letter position
        of word i (the same word-model prediction repeated per letter, since
        the letter model is disabled here).
        """
        global probabilities, top_k_predictions, probability_topk, probability_p_topk, phrase_p_top_k
        sentence = sentence.rstrip()
        words_line, letters_line, words_ids, letters_ids, words_num, letters_num = self._data_utility.data2ids_line(
            sentence)  # split one line into: words, letters, word ids, letter ids, word count, letters per word
        print('!!!!!', words_ids)
        print('!!!!!', letters_ids)
        out_str_list = []
        probability_topk_list = []
        # print(words_ids)
        # print(letters_ids)
        lm_state_out = np.zeros(
            [self._config.num_layers, 2, 1, self._config.word_hidden_size],
            dtype=np.float32)
        kc_state_out = np.zeros(
            [self._config.num_layers, 2, 1, self._config.letter_hidden_size],
            dtype=np.float32)

        for i in range(len(words_ids)):  # loop over every word
            words_out = []
            probs_out = []
            feed_values = {
                self.lm_input_name: [[words_ids[i]]],
                self.lm_top_k_name: k
            }
            if i > 0:
                feed_values[self.lm_state_in_name] = lm_state_out

            # lm_state_out, phrase_p_top_k, phrase_p_prob, phrase_logits = self._sess.run(
            #     [self.lm_state_out_name, self.phrase_p_name, self.phrase_p_probability,
            #      self.phrase_logits], feed_dict=feed_values)
            # phrase_p_top_k = [id for id in phrase_p_top_k[0]]
            # probability_p_topk = [phrase_p_prob[0][id] for id in phrase_p_top_k]

            lm_state_out, lm_prob, lm_top_k = self._sess.run(
                [
                    self.lm_state_out_name, self.lm_output_top_k_probability,
                    self.lm_output_top_k_name
                ],
                feed_dict=feed_values)

            lm_top_k = [id for id in lm_top_k[0]]
            lm_probability_topk = [lm_prob[0][id] for id in lm_top_k]
            words = self._data_utility.ids2outwords(lm_top_k)

            # letters_ids may have one fewer entry than words_ids.
            if i == len(letters_ids):
                break
            for j in range(len(letters_ids[i])):  # loop over each letter inside this word
                #     feed_values = {self.kc_input_name: [[letters_ids[i][j]]],
                #                    self.kc_top_k_name: k, self.key_length:[1]}
                #
                #     if j == 0 and len(words_ids) > 0:  # the first letter's initial state comes from the language model; later letters reuse the previous letter state
                #         feed_values[self.kc_lm_state_in_name] = lm_state_out
                #     else:
                #         feed_values[self.kc_state_in_name] = kc_state_out
                #     probabilities, top_k_predictions, kc_state_out = self._sess.run([self.kc_output_name, self.kc_top_k_prediction_name,
                #                                                                   self.kc_state_out_name], feed_dict=feed_values)
                #     probability_topk = [probabilities[0][id] for id in top_k_predictions[0]]
                #     words = self._data_utility.ids2outwords(top_k_predictions[0])
                #
                #     if j == 0 and i > 0:
                #         top_word = words[0]
                #         top_phrase = self._data_utility.get_top_phrase(phrase_logits, top_word)
                #         if top_phrase[0] is not None:
                #             is_phrase_p, phrase_p = self.calculate_phrase_p(top_phrase, probability_p_topk, phrase_p_top_k)
                #             words, probability_topk = self.final_words_out(words, top_phrase, phrase_p, probability_topk)
                words_out.append(words)
                probs_out.append(lm_probability_topk)
            # For the very first word there is no left context: blank out the
            # first prediction slot.
            out_str = words_out if i > 0 else [['', '', '']] + words_out[1:]
            out_str_list.append(out_str)
            probability_topk_list.append(probs_out)

        return words_line, letters_line, out_str_list, probability_topk_list

    def calculate_phrase_p(self, top_phrase, probability_p_topk,
                           phrase_p_top_k):
        """Return (is_phrase_p, phrase_p) for a candidate phrase.

        NOTE(review): raises ValueError if 1 is absent from phrase_p_top_k —
        presumably the top-k always contains class 1; confirm upstream.
        """
        is_phrase_p = probability_p_topk[phrase_p_top_k.index(1)]
        phrase_p = is_phrase_p * top_phrase[
            1]  # scales the phrase weight down: phrase weight times the probability that a phrase applies here
        return is_phrase_p, phrase_p

    def final_words_out(self, words, top_phrase, phrase_p, probability_topk):
        """Splice the phrase into the first ranked slot its probability beats.

        Mutates and returns *words* and *probability_topk* in place.
        """
        for i in range(len(probability_topk)):
            if phrase_p >= probability_topk[i]:
                probability_topk[i] = phrase_p
                words[i] = top_phrase[0]
                break
        return words, probability_topk

    def result_print(self, out_string, out_prob):
        """Format parallel word/probability lists as 'w1:p1|w2:p2|...';
        empty words get probability '0.0'."""
        string = ""
        for (word, prob) in zip(out_string, out_prob):
            prob = str(prob) if word != "" else "0.0"
            string = string + word + ":" + prob + "|"
        string = string[:-1]
        return string

    def predict_file(self, test_file_in, test_file_out, k):
        """Run predict_data over every line of test_file_in, writing one
        '|#|'-separated result line per prediction to test_file_out."""
        testfilein = open(test_file_in, "r")
        testfileout = open(test_file_out, 'w')
        t1 = time.time()
        jj = 0

        for sentence in testfilein:
            print(jj)
            jj += 1
            sentence = sentence.rstrip()
            result = self.predict_data(sentence, k)

            if result is not None:
                words_line, letters_line, out_words_list, out_prob_list = result

                for i in range(len(out_words_list)):
                    print("\t".join(words_line[:i]) + "|#|" + letters_line[i] +
                          "|#|" + "\t".join(words_line[i:]) + "|#|" +
                          '\t'.join([
                              self.result_print(out_words, out_prob)
                              for (out_words, out_prob) in zip(
                                  out_words_list[i], out_prob_list[i])
                          ]) + "\n")
                    testfileout.write(
                        "\t".join(words_line[:i]) + "|#|" + letters_line[i] +
                        "|#|" + "\t".join(words_line[i:]) + "|#|" + '\t'.join([
                            self.result_print(out_words, out_prob)
                            for (out_words, out_prob
                                 ) in zip(out_words_list[i], out_prob_list[i])
                        ]) + "\n")

        t2 = time.time()
        print(t2 - t1)
        testfilein.close()
        testfileout.close()
# ----- Example #14 (scrape-artifact separator; original read "示例#14" / "0") -----
class InputEngineSparse(object):
    """Inference engine for a single sparse (pruned/fine-tuned) RNN model.

    Loads 'sparse_graph-finetune-<config_name>.pb' from model_path into a
    tf.Session and feeds sentences word by word, threading the LSTM state
    between steps.
    """

    def __init__(self, model_path, config_name):
        """Load vocabularies, config, and the sparse frozen graph from
        model_path."""
        vocab_file_in_words = os.path.join(model_path, "vocab_in_words")
        vocab_file_in_letters = os.path.join(model_path, "vocab_in_letters")
        vocab_file_out = os.path.join(model_path, "vocab_out")
        config_file = os.path.join(model_path, config_name)

        config = Config()
        config.get_config(config_file)
        self._data_utility = DataUtility(
            vocab_file_in_words=vocab_file_in_words,
            vocab_file_in_letters=vocab_file_in_letters,
            vocab_file_out=vocab_file_out,
            max_sentence_length=config.num_steps)

        self.sparsity = config.sparsity
        # Tensor names inside the imported graph ("import/" is the default
        # scope prefix added by tf.import_graph_def).
        prefix = "import/"
        self.top_k_name = prefix + "Online/Model/top_k:0"
        self.state_in_name = prefix + "Online/Model/state:0"
        self.input_name = prefix + "Online/Model/batched_input_word_ids:0"

        self.top_k_prediction_name = prefix + "Online/Model/top_k_prediction:1"
        self.output_name = prefix + "Online/Model/probabilities:0"
        self.state_out_name = prefix + "Online/Model/state_out:0"

        saved_model_path = os.path.join(
            model_path, 'sparse_graph-finetune-' + config_name + '.pb')
        with open(saved_model_path, 'rb') as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())
            tf.import_graph_def(graph_def)

        gpu_config = tf.ConfigProto()
        gpu_config.gpu_options.per_process_gpu_memory_fraction = config.gpu_fraction
        self._sess = tf.Session(config=gpu_config)

    def predict(self, sentence, k):
        """Feed a sentence (str) and perform inference on this sentence.

        Returns top-k {'word', 'probability'} dicts; '<unk>' is replaced by
        the last partial word wrapped in angle brackets.
        NOTE(review): if sentence2ids yields no ids, the trailing use of
        `probabilities` relies on these module-level globals having been set
        by a previous call — likely unintended; confirm callers.
        """
        global probabilities, top_k_predictions

        sentence_ids, word_letters = self._data_utility.sentence2ids(sentence)

        # Feed input sentence word by word.
        state_out = None
        for i in range(len(sentence_ids)):
            feed_values = {
                self.input_name: [[sentence_ids[i]]],
                self.top_k_name: k
            }
            if i > 0:
                # Thread the previous step's final LSTM state back in.
                feed_values[self.state_in_name] = state_out
            # probabilities is an ndarray of shape (batch_size * time_step) * vocab_size
            # For inference, batch_size = num_step = 1, thus probabilities.shape = 1 * vocab_size
            probabilities, top_k_predictions, state_out = self._sess.run(
                [
                    self.output_name, self.top_k_prediction_name,
                    self.state_out_name
                ],
                feed_dict=feed_values)

        probability_topk = [
            probabilities[0][id] for id in top_k_predictions[0]
        ]
        words_out = self._data_utility.ids2outwords(top_k_predictions[0])
        return [{
            'word': word,
            'probability': float(probability)
        } if word != '<unk>' else {
            'word': '<' + word_letters + '>',
            'probability': float(probability)
        } for word, probability in zip(words_out, probability_topk)
                ] if len(words_out) > 0 else []

    def predict_data(self, sentence):
        """Run top-3 predictions over one pre-formatted data line and return
        the predicted-word lists for the letter positions as a str, or None
        when the line cannot be converted to ids."""
        sentence = sentence.rstrip()
        inputs, words_num, letters_num = self._data_utility.data2ids_line(
            sentence)
        if inputs == None:
            return None
        words_out = []
        state_out = None
        for i in range(len(inputs)):
            feed_values = {self.input_name: [[inputs[i]]], self.top_k_name: 3}
            if i > 0:
                feed_values[self.state_in_name] = state_out
            probabilities, top_k_predictions, state_out = self._sess.run(
                [
                    self.output_name, self.top_k_prediction_name,
                    self.state_out_name
                ],
                feed_dict=feed_values)
            words = self._data_utility.ids2outwords(top_k_predictions[0])
            words_out.append(words)
        # With left context, slice the window covering the last word's
        # letters; otherwise blank the first slot (no context yet).
        out_str = str(
            words_out[words_num - 1:words_num +
                      letters_num] if words_num > 0 else [['', '', '']] +
            words_out[0:letters_num])
        return out_str

    def predict_file(self, test_file_in, test_file_out):
        """Run predict_data over every line of test_file_in and append
        'sentence |#| result' lines to test_file_out."""
        testfilein = open(test_file_in, "r")
        testfileout = open(test_file_out, 'w')
        t1 = time.time()
        for sentence in testfilein:
            sentence = sentence.rstrip()
            out_str = self.predict_data(sentence)
            if (out_str):
                print(sentence + " |#| " + out_str)
                testfileout.write(sentence + " |#| " + out_str + "\n")
            else:
                print("predict error : " + sentence)
        t2 = time.time()
        print(t2 - t1)
        testfilein.close()
        testfileout.close()

    def predict_data_probability(self, sentence):
        """Like predict_data, but formats each top-3 word together with its
        probability as 'w # p' fields joined by ' | ' and ' || '."""
        sentence = sentence.rstrip()
        inputs, words_num, letters_num = self._data_utility.data2ids_line(
            sentence)
        if inputs == None:
            return None
        words_out = []
        probability_out = []
        state_out = None
        for i in range(len(inputs)):
            feed_values = {self.input_name: [[inputs[i]]], self.top_k_name: 3}
            if i > 0:
                feed_values[self.state_in_name] = state_out
            probabilities, top_k_predictions, state_out = self._sess.run(
                [
                    self.output_name, self.top_k_prediction_name,
                    self.state_out_name
                ],
                feed_dict=feed_values)
            top3 = top_k_predictions[0]
            probability_top3 = [probabilities[0][id] for id in top3]
            words = self._data_utility.ids2outwords(top3)
            words_out.append(words)
            probability_out.append(probability_top3)

        out_str = ''
        if words_num > 0:
            words_out_use = words_out[words_num - 1:words_num + letters_num]
            probability_out_use = probability_out[words_num - 1:words_num +
                                                  letters_num]
            for words, probabilities in zip(words_out_use,
                                            probability_out_use):
                out_str_line = ''
                for word, probability in zip(words, probabilities):
                    out_str_line = out_str_line + " | " + word + ' # ' + '{:.8f}'.format(
                        probability)
                out_str_line = out_str_line[3:-1]
                out_str = out_str + " || " + out_str_line
            out_str = out_str[4:-1]
        else:
            words_out_use = words_out[0:letters_num]
            probability_out_use = probability_out[0:letters_num]
            for words, probabilities in zip(words_out_use,
                                            probability_out_use):
                out_str_line = ''
                for word, probability in zip(words, probabilities):
                    out_str_line = out_str_line + " | " + word + ' # ' + '{:.8f}'.format(
                        probability)
                out_str_line = out_str_line[3:-1]
                out_str = out_str + " || " + out_str_line
        return out_str

    def predict_file_probability(self, test_file_in, test_file_out):
        """Run predict_data_probability over every line of test_file_in and
        append 'sentence |#| result' lines to test_file_out."""
        testfilein = open(test_file_in, "r")
        testfileout = open(test_file_out, 'w')
        t1 = time.time()
        for sentence in testfilein:
            sentence = sentence.rstrip()
            out_str = self.predict_data_probability(sentence)
            if (out_str):
                print(sentence + " |#| " + out_str)
                testfileout.write(sentence + " |#| " + out_str + "\n")
            else:
                print("predict error : " + sentence)
        t2 = time.time()
        print(t2 - t1)
        testfilein.close()
        testfileout.close()
# --- Keras audio-classification training setup (script-level) ---
nb_train_samples = 49700
nb_validation_samples = 2000
epochs = 10
batch_size = 32  # Note:  Must be less than or equal to the nb_validation_samples size.
img_width, img_height = 26, 99

# Keras backends differ on channel placement: Theano-style puts the single
# channel first, TensorFlow-style puts it last.
if K.image_data_format() == 'channels_first':
    input_shape = (1, img_width, img_height)
else:
    input_shape = (img_width, img_height, 1)

m = models.Models()
#model = m.get_cifar_model(input_shape, 10)
#model = m.get_cifar_model_2(input_shape, 10)
# +1 output unit for the catch-all "other" category.
model = m.get_covn2d_six_layer_model(input_shape, len(training_categories) + 1)
du = DataUtility(bucket_id='kaggle_voice_data', root_folder='/')

X, Y = du.load_data_local('../../data/npz', training_categories,
                          other_categories)
#X, Y = du.du.load_local_binary_data('../../data/npz', target)

# BUG FIX: sklearn's train_test_split returns (X_train, X_test, y_train,
# y_test); the original unpacked it as (x_train, y_train, x_test, y_test),
# silently swapping the test features with the training labels downstream.
x_train, x_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.33,
                                                    random_state=42)

# x_train / y_train -> training features and labels
# x_test  / y_test  -> held-out evaluation features and labels
#
# ----- Example #16 (scrape-artifact separator; original read "示例#16" / "0") -----
class InputEngineRnn:
    """Two-stage inference engine: word-level language model plus a
    letter-level key-correction model, with optional phrase prediction.

    Loads a frozen GraphDef and runs it through a tf.Session. Context words
    are fed to the word model first; the word model's final LSTM state then
    seeds the letter model, which is fed the current word's letters
    one-by-one to produce top-k completions. When ``use_phrase`` is True the
    word model additionally emits phrase logits that can promote a
    multi-word phrase above single-word candidates.
    """

    def __init__(self, graph_file, vocab_path, config_name, use_phrase=False):
        """Load vocabularies and config from vocab_path and import graph_file
        (a serialized GraphDef .pb) into a fresh tf.Session.

        use_phrase -- enable the phrase-prediction branch of the graph.
        """
        vocab_file_in_words = os.path.join(vocab_path, "vocab_in_words")
        vocab_file_in_letters = os.path.join(vocab_path, "vocab_in_letters")
        vocab_file_out = os.path.join(vocab_path, "vocab_out")
        vocab_file_phrase = os.path.join(vocab_path, "vocab_phrase")

        self.use_phrase = use_phrase
        self._config = Config()
        self._config.get_config(vocab_path, config_name)
        self._data_utility = DataUtility(vocab_file_in_words=vocab_file_in_words, vocab_file_in_letters=vocab_file_in_letters,
                                         vocab_file_out=vocab_file_out, vocab_file_phrase=vocab_file_phrase)
        print("in words vocabulary size = %d\nout words vocabulary size = %d\nin letters vocabulary size = %d"
              "\nphrase vocabulary size = %d" % (
                self._config.vocab_size_in, self._config.vocab_size_out, self._config.vocab_size_letter,
                self._config.vocab_size_phrase))

        # Tensor names inside the imported graph ("import/" is the default
        # scope prefix added by tf.import_graph_def).
        prefix = "import/"
        self.lm_state_in_name = prefix + "Online/WordModel/state:0"
        self.lm_input_name = prefix + "Online/WordModel/batched_input_word_ids:0"
        self.lm_state_out_name = prefix + "Online/WordModel/state_out:0"

        # NOTE(review): these names embed a space before the output index
        # (": 1") and ARE used on the live path when use_phrase=True —
        # verify TF resolves such names before enabling the phrase branch.
        self.phrase_p_name = prefix + "Online/WordModel/phrase_p_prediction: 1"
        self.phrase_p_probability = prefix + "Online/WordModel/phrase_p_probabilities: 0"
        self.phrase_top_k_name = prefix + "Online/WordModel/phrase_top_k_prediction: 1"
        self.phrase_top_k_probability = prefix + "Online/WordModel/phrase_probabilities: 0"
        self.phrase_logits = prefix + "Online/WordModel/logits_phrase: 0"

        self.kc_top_k_name = prefix + "Online/LetterModel/top_k:0"
        self.key_length = prefix + "Online/LetterModel/batched_input_sequence_length:0"
        self.kc_state_in_name = prefix + "Online/LetterModel/state:0"
        self.kc_lm_state_in_name = prefix + "Online/LetterModel/lm_state_in:0"
        self.kc_input_name = prefix + "Online/LetterModel/batched_input_word_ids:0"
        self.kc_top_k_prediction_name = prefix + "Online/LetterModel/top_k_prediction:1"
        self.kc_output_name = prefix + "Online/LetterModel/probabilities:0"
        self.kc_state_out_name = prefix + "Online/LetterModel/state_out:0"
        # Cap on how many lines predict_file processes.
        self.max_test_line = 10000

        with open(graph_file, 'rb') as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())
            tf.import_graph_def(graph_def)

        gpu_config = tf.ConfigProto()
        gpu_config.gpu_options.per_process_gpu_memory_fraction = self._config.gpu_fraction
        self._sess = tf.Session(config=gpu_config)

    def predict(self, sentence, k):
        """Return top-k completions for the partial last word of *sentence*.

        Each element is a {'word', 'probability'} dict; '<unk>' is replaced
        by the partial word wrapped in angle brackets. Returns [] when no
        letters were fed (words_out stays empty).
        """
        # NOTE(review): these globals are never read before assignment here;
        # the declaration looks like leftover scaffolding.
        global probabilities, top_k_predictions, probability_topk, probability_p_topk, phrase_p_top_k
        inputs, inputs_key, word_letters = self._data_utility.sentence2ids(sentence)

        # Zero LSTM initial states, shaped [num_layers, 2 (c,h), batch=1, hidden].
        lm_state_out = np.zeros([self._config.num_layers, 2, 1, self._config.word_hidden_size], dtype=np.float32)
        kc_state_out = np.zeros([self._config.num_layers, 2, 1, self._config.letter_hidden_size], dtype=np.float32)
        words_out = list()
        phrase_logits = None
        # Phase I: read contexts.
        if len(inputs) > 0:
            for i in range(len(inputs)):
                feed_values = {self.lm_input_name: [[inputs[i]]]}
                if i > 0:
                    feed_values[self.lm_state_in_name] = lm_state_out
                    # Use previous language model's final state as language model's initial state.
                if self.use_phrase:
                    lm_state_out, phrase_p_top_k, phrase_p_prob, phrase_logits = self._sess.run([self.lm_state_out_name,
                                                                                                 self.phrase_p_name,
                                                                                                 self.phrase_p_probability,
                                                                                                 self.phrase_logits],
                                                                                                 feed_dict=feed_values)
                    phrase_p_top_k = [id for id in phrase_p_top_k[0]]
                    probability_p_topk = [phrase_p_prob[0][id] for id in phrase_p_top_k]
                else:
                    lm_state_out = self._sess.run([self.lm_state_out_name], feed_dict=feed_values)[0]

        # Phase II: read letters, predict by feed the letters one-by-one.
        for i in range(len(inputs_key)):
            feed_values = {self.kc_input_name: [[inputs_key[i]]],
                           self.kc_top_k_name: k}
            if i == 0 and len(inputs) > 0:
                feed_values[self.kc_lm_state_in_name] = lm_state_out
                # Use language model's final state to letter model's initial state when the letters haven't been feed.
            else:
                feed_values[self.kc_state_in_name] = kc_state_out
                # Use letter model's final state to letter model's initial state when feed the letters one-by-one.
            probabilities, top_k_predictions, kc_state_out = self._sess.run([self.kc_output_name, self.kc_top_k_prediction_name,
                                                                             self.kc_state_out_name], feed_dict=feed_values)
            probability_topk = [probabilities[0][id] for id in top_k_predictions[0]]
            words_out = self._data_utility.ids2outwords(top_k_predictions[0])
            # Predict phrase
            if self.use_phrase:
                if i == 0 and len(inputs) > 0:
                    top_word = words_out[0]
                    top_phrase = self._data_utility.get_top_phrase(phrase_logits, top_word)
                    if top_phrase[0] is not None:
                        is_phrase_p, phrase_p = self.calculate_phrase_p(top_phrase, probability_p_topk, phrase_p_top_k)
                        words_out, probability_topk = self.final_words_out(words_out, top_phrase, phrase_p, probability_topk)

        return [{'word': word, 'probability': float(probability)}
                if word != '<unk>' else {'word': '<' + word_letters + '>', 'probability': float(probability)}
                for word, probability in zip(words_out, probability_topk)] if len(words_out) > 0 else []

    def predict_data(self, sentence, k):
        """Run both phases over one pre-formatted data line.

        Returns (words_line, letters_line, out_str_list, probability_topk_list)
        — one top-k list per letter position per word — or None when the
        line cannot be converted to ids.
        """
        global probabilities, top_k_predictions, probability_topk, probability_p_topk, phrase_p_top_k
        sentence = sentence.rstrip()
        res = self._data_utility.data2ids_line(sentence)
        if res is None:
            return None
        words_line, letters_line, words_ids, letters_ids, words_num, letters_num = res

        out_str_list = []
        probability_topk_list = []
        phrase_logits = None

        # Zero LSTM initial states, shaped [num_layers, 2 (c,h), batch=1, hidden].
        lm_state_out = np.zeros([self._config.num_layers, 2, 1, self._config.word_hidden_size], dtype=np.float32)
        kc_state_out = np.zeros([self._config.num_layers, 2, 1, self._config.letter_hidden_size], dtype=np.float32)

        for i in range(len(words_ids)):
            words_out = []
            probs_out = []
            # Phase I: read contexts.
            feed_values = {self.lm_input_name: [[words_ids[i]]]}
            if i > 0:
                feed_values[self.lm_state_in_name] = lm_state_out
                # Use previous language model's final state as language model's initial state.
            if self.use_phrase:
                lm_state_out, phrase_p_top_k, phrase_p_prob, phrase_logits = self._sess.run(
                    [self.lm_state_out_name, self.phrase_p_name, self.phrase_p_probability,
                     self.phrase_logits], feed_dict=feed_values)
                phrase_p_top_k = [id for id in phrase_p_top_k[0]]
                probability_p_topk = [phrase_p_prob[0][id] for id in phrase_p_top_k]
            else:
                lm_state_out = self._sess.run([self.lm_state_out_name], feed_dict=feed_values)[0]

            # letters_ids may have one fewer entry than words_ids.
            if i == len(letters_ids):
                break
            # Phase II: read letters, predict by feed the letters one-by-one.
            for j in range(len(letters_ids[i])):
                feed_values = {self.kc_input_name: [[letters_ids[i][j]]],
                               self.kc_top_k_name: k, self.key_length: [1]}

                if j == 0 and len(words_ids) > 0:
                    feed_values[self.kc_lm_state_in_name] = lm_state_out
                    # Use language model's final state to letter model's initial state when letters haven't been feed.
                else:
                    feed_values[self.kc_state_in_name] = kc_state_out
                    # Use letter model's final state to letter model's initial state when feed the letters one-by-one.
                probabilities, top_k_predictions, kc_state_out = self._sess.run([self.kc_output_name, self.kc_top_k_prediction_name,
                                                                                 self.kc_state_out_name], feed_dict=feed_values)
                probability_topk = [probabilities[0][id] for id in top_k_predictions[0]]
                words = self._data_utility.ids2outwords(top_k_predictions[0])

                # Predict phrase
                if self.use_phrase:
                    if j == 0 and i > 0:
                        top_word = words[0]
                        top_phrase = self._data_utility.get_top_phrase(phrase_logits, top_word)
                        if top_phrase[0] is not None:
                            is_phrase_p, phrase_p = self.calculate_phrase_p(top_phrase, probability_p_topk, phrase_p_top_k)
                            words, probability_topk = self.final_words_out(words, top_phrase, phrase_p, probability_topk)
                words_out.append(words)
                probs_out.append(probability_topk)
            # For the very first word there is no left context: blank out the
            # first prediction slot.
            out_str = words_out if i > 0 else [['','','']] + words_out[1: ]
            out_str_list.append(out_str)
            probability_topk_list.append(probs_out)

        return words_line, letters_line, out_str_list, probability_topk_list

    def calculate_phrase_p(self, top_phrase, probability_p_topk, phrase_p_top_k):
        """Return (is_phrase_p, phrase_p): the phrase weight scaled by the
        probability that a phrase applies at this position.

        NOTE(review): raises ValueError if 1 is absent from phrase_p_top_k —
        presumably the top-k always contains class 1; confirm upstream.
        """
        is_phrase_p = probability_p_topk[phrase_p_top_k.index(1)]
        phrase_p = is_phrase_p * top_phrase[1]
        return is_phrase_p, phrase_p

    def final_words_out(self, words, top_phrase, phrase_p, probability_topk):
        """Splice the phrase into the first ranked slot its probability beats.

        Mutates and returns *words* and *probability_topk* in place.
        """
        for i in range(len(probability_topk)):
            if phrase_p >= probability_topk[i]:
                probability_topk[i] = phrase_p
                words[i] = top_phrase[0]
                break
        return words, probability_topk

    def result_print(self, out_string, out_prob):
        """Format parallel word/probability lists as 'w1:p1|w2:p2|...';
        empty words get probability '0.0'."""
        string = ""
        for (word, prob) in zip(out_string, out_prob):
            prob = str(prob) if word != "" else "0.0"
            string = string + word + ":" + prob + "|"
        string = string[:-1]
        return string

    def predict_file(self, test_file_in, test_file_out, k):
        """Run predict_data over up to self.max_test_line lines of
        test_file_in, writing '|#|'-separated result lines to test_file_out."""
        testfilein = open(test_file_in, "r")
        testfileout = open(test_file_out, 'w')
        t1 = time.time()

        line_count = 0
        for sentence in testfilein:
            line_count += 1
            if line_count > self.max_test_line:
                break
            sentence = sentence.rstrip()
            result = self.predict_data(sentence, k)

            if result is not None:
                words_line, letters_line, out_words_list, out_prob_list = result

                for i in range(len(out_words_list)):
                    print("\t".join(words_line[:i])
                         + "|#|" + " ".join(letters_line[i])
                         + "|#|" + "\t".join(words_line[i:]) + "|#|"
                          + '\t'.join([self.result_print(out_words, out_prob)
                                       for (out_words, out_prob) in zip(out_words_list[i], out_prob_list[i])])
                          + "\n")
                    testfileout.write("\t".join(words_line[:i])
                                      + "|#|" + " ".join(letters_line[i])
                                      + "|#|" + "\t".join(words_line[i:]) + "|#|"
                                      + '\t'.join([self.result_print(out_words, out_prob)
                                      for (out_words, out_prob) in zip(out_words_list[i], out_prob_list[i])])
                                      + "\n")

        t2 = time.time()
        print(t2 - t1)
        testfilein.close()
        testfileout.close()
# ----- Example #17 (scrape-artifact separator; original read "示例#17" / "0") -----
class InputEngineRnn:
    """RNN input-method engine backed by a frozen TensorFlow graph.

    Combines two sub-models imported from one GraphDef: a word-level
    language model ("WordModel") that consumes committed context words and
    also predicts whether a multi-word phrase should be emitted, and a
    letter-level keystroke model ("LetterModel") that consumes the
    keystrokes of the word currently being typed and yields top-k word
    candidates.
    """

    def __init__(self, graph_file, vocab_path, config_name):
        """Load vocabularies and config, import the frozen graph, open a session.

        Args:
            graph_file: path to a frozen GraphDef (.pb) file.
            vocab_path: directory holding vocab_in_words / vocab_in_letters /
                vocab_out / vocab_phrase and the config file.
            config_name: config file name inside ``vocab_path``.
        """

        vocab_file_in_words = os.path.join(vocab_path, "vocab_in_words")
        vocab_file_in_letters = os.path.join(vocab_path, "vocab_in_letters")
        vocab_file_out = os.path.join(vocab_path, "vocab_out")
        vocab_file_phrase = os.path.join(vocab_path, "vocab_phrase")

        self._config = Config()
        # NOTE(review): this Config variant is called as (path, name);
        # sibling engines call get_config(config_file) with one argument —
        # confirm the signature matches the Config class used here.
        self._config.get_config(vocab_path, config_name)
        self._data_utility = DataUtility(
            vocab_file_in_words=vocab_file_in_words,
            vocab_file_in_letters=vocab_file_in_letters,
            vocab_file_out=vocab_file_out,
            vocab_file_phrase=vocab_file_phrase)
        print(
            "in words vocabulary size = %d\nout words vocabulary size = %d\nin letters vocabulary size = %d"
            "\nphrase vocabulary size = %d" %
            (self._config.vocab_size_in, self._config.vocab_size_out,
             self._config.vocab_size_letter, self._config.vocab_size_phrase))

        # Tensor names inside the imported (frozen) graph.
        prefix = "import/"
        self.lm_state_in_name = prefix + "Online/WordModel/state:0"
        self.lm_input_name = prefix + "Online/WordModel/batched_input_word_ids:0"
        self.lm_state_out_name = prefix + "Online/WordModel/state_out:0"
        # self.lm_top_k_name = prefix + "Online/WordModel/top_k:0"

        # NOTE(review): the five names below contain a space after ':'
        # (e.g. "...prediction: 1") unlike every other tensor name in this
        # class; it may parse, but it looks unintentional — verify against
        # the exported graph.
        self.phrase_p_name = prefix + "Online/WordModel/phrase_p_prediction: 1"
        self.phrase_p_probability = prefix + "Online/WordModel/phrase_p_probabilities: 0"
        self.phrase_top_k_name = prefix + "Online/WordModel/phrase_top_k_prediction: 1"
        self.phrase_top_k_probability = prefix + "Online/WordModel/phrase_probabilities: 0"
        self.phrase_logits = prefix + "Online/WordModel/logits_phrase: 0"

        self.kc_top_k_name = prefix + "Online/LetterModel/top_k:0"
        self.key_length = prefix + "Online/LetterModel/batched_input_sequence_length:0"
        self.kc_state_in_name = prefix + "Online/LetterModel/state:0"
        self.kc_lm_state_in_name = prefix + "Online/LetterModel/lm_state_in:0"
        self.kc_input_name = prefix + "Online/LetterModel/batched_input_word_ids:0"
        self.kc_top_k_prediction_name = prefix + "Online/LetterModel/top_k_prediction:1"
        self.kc_output_name = prefix + "Online/LetterModel/probabilities:0"
        self.kc_state_out_name = prefix + "Online/LetterModel/state_out:0"

        # Import the frozen graph into the default graph under "import/".
        with open(graph_file, 'rb') as f:
            graph_def = tf.GraphDef()
            graph_def.ParseFromString(f.read())
            tf.import_graph_def(graph_def)

        gpu_config = tf.ConfigProto()
        gpu_config.gpu_options.per_process_gpu_memory_fraction = self._config.gpu_fraction
        self._sess = tf.Session(config=gpu_config)

    def predict(self, sentence, k):
        """Return top-k candidates for the word being typed at the end of ``sentence``.

        The committed context words are fed one step at a time through the
        word LM (threading LSTM state and collecting phrase predictions);
        the trailing partial word's keystrokes are then fed through the
        letter model.  After the first keystroke, if the top candidate
        starts a known phrase whose combined probability beats a candidate,
        the phrase replaces that candidate.

        Returns:
            A list of ``{'word': ..., 'probability': ...}`` dicts where an
            ``<unk>`` candidate is replaced by the typed letters wrapped in
            angle brackets, or an empty list when there is nothing to
            predict.
        """
        # ``global`` works around "referenced before assignment" when the
        # loops below execute zero iterations.
        global probabilities, top_k_predictions, probability_topk, probability_p_topk, phrase_p_top_k
        inputs, inputs_key, word_letters = self._data_utility.sentence2ids(
            sentence)
        # print(inputs)
        # print(inputs_key)
        # Zero initial LSTM states: [num_layers, 2 (c,h), batch=1, hidden].
        lm_state_out = np.zeros(
            [self._config.num_layers, 2, 1, self._config.word_hidden_size],
            dtype=np.float32)
        kc_state_out = np.zeros(
            [self._config.num_layers, 2, 1, self._config.letter_hidden_size],
            dtype=np.float32)
        words_out = list()
        phrase_logits = None
        if len(inputs) > 0:
            # Run the word LM over the context, one word id per step.
            for i in range(len(inputs)):
                feed_values = {self.lm_input_name: [[inputs[i]]]}
                if i > 0:
                    feed_values[self.lm_state_in_name] = lm_state_out
                lm_state_out, phrase_p_top_k, phrase_p_prob, phrase_logits = self._sess.run(
                    [
                        self.lm_state_out_name, self.phrase_p_name,
                        self.phrase_p_probability, self.phrase_logits
                    ],
                    feed_dict=feed_values)
                phrase_p_top_k = [id for id in phrase_p_top_k[0]]
                probability_p_topk = [
                    phrase_p_prob[0][id] for id in phrase_p_top_k
                ]

        # Run the letter model over the keystrokes of the current word.
        for i in range(len(inputs_key)):
            feed_values = {
                self.kc_input_name: [[inputs_key[i]]],
                self.kc_top_k_name: k
            }
            if i == 0 and len(inputs) > 0:
                # First keystroke continues from the word-LM state.
                feed_values[self.kc_lm_state_in_name] = lm_state_out
            else:
                feed_values[self.kc_state_in_name] = kc_state_out
            probabilities, top_k_predictions, kc_state_out = self._sess.run(
                [
                    self.kc_output_name, self.kc_top_k_prediction_name,
                    self.kc_state_out_name
                ],
                feed_dict=feed_values)
            probability_topk = [
                probabilities[0][id] for id in top_k_predictions[0]
            ]
            words_out = self._data_utility.ids2outwords(top_k_predictions[0])
            if i == 0 and len(inputs) > 0:
                # Possibly substitute a phrase continuation of the top word.
                top_word = words_out[0]
                top_phrase = self._data_utility.get_top_phrase(
                    phrase_logits, top_word)
                if top_phrase[0] is not None:
                    is_phrase_p, phrase_p = self.calculate_phrase_p(
                        top_phrase, probability_p_topk, phrase_p_top_k)
                    words_out, probability_topk = self.final_words_out(
                        words_out, top_phrase, phrase_p, probability_topk)

        return [{
            'word': word,
            'probability': float(probability)
        } if word != '<unk>' else {
            'word': '<' + word_letters + '>',
            'probability': float(probability)
        } for word, probability in zip(words_out, probability_topk)
                ] if len(words_out) > 0 else []

    # def predict_data(self, sentence, k):
    #     global probabilities, top_k_predictions, probability_topk, probability_p_topk, phrase_p_top_k
    #     sentence = sentence.rstrip()
    #     words_line, letters_line, words_ids, letters_ids, words_num, letters_num = self._data_utility.data2ids_line(sentence)
    #     out_str_list = []
    #     probability_topk_list = []
    #     # print(words_ids)
    #     # print(letters_ids)
    #     lm_state_out = np.zeros([self._config.num_layers, 2, 1, self._config.word_hidden_size], dtype=np.float32)
    #     kc_state_out = np.zeros([self._config.num_layers, 2, 1, self._config.letter_hidden_size], dtype=np.float32)
    #
    #     for i in range(len(words_ids)):
    #         words_out = []
    #         probs_out = []
    #         feed_values = {self.lm_input_name: [[words_ids[i]]]}
    #         if i > 0:
    #             feed_values[self.lm_state_in_name] = lm_state_out
    #
    #         lm_state_out, phrase_p_top_k, phrase_p_prob, phrase_logits = self._sess.run(
    #             [self.lm_state_out_name, self.phrase_p_name, self.phrase_p_probability,
    #              self.phrase_logits], feed_dict=feed_values)
    #         phrase_p_top_k = [id for id in phrase_p_top_k[0]]
    #         probability_p_topk = [phrase_p_prob[0][id] for id in phrase_p_top_k]
    #
    #         if i == len(letters_ids):
    #             break
    #         for j in range(len(letters_ids[i])):
    #             feed_values = {self.kc_input_name: [[letters_ids[i][j]]],
    #                            self.kc_top_k_name: k, self.key_length:[1]}
    #
    #             if j == 0 and len(words_ids) > 0:
    #                 feed_values[self.kc_lm_state_in_name] = lm_state_out
    #             else:
    #                 feed_values[self.kc_state_in_name] = kc_state_out
    #             probabilities, top_k_predictions, kc_state_out = self._sess.run([self.kc_output_name, self.kc_top_k_prediction_name,
    #                                                                           self.kc_state_out_name], feed_dict=feed_values)
    #             probability_topk = [probabilities[0][id] for id in top_k_predictions[0]]
    #             words = self._data_utility.ids2outwords(top_k_predictions[0])
    #
    #             if j == 0 and i > 0:
    #                 top_word = words[0]
    #                 top_phrase = self._data_utility.get_top_phrase(phrase_logits, top_word)
    #                 if top_phrase[0] is not None:
    #                     is_phrase_p, phrase_p = self.calculate_phrase_p(top_phrase, probability_p_topk, phrase_p_top_k)
    #                     words, probability_topk = self.final_words_out(words, top_phrase, phrase_p, probability_topk)
    #             words_out.append(words)
    #             probs_out.append(probability_topk)
    #         out_str = words_out if i > 0 else [['','','']] + words_out[1: ]
    #         out_str_list.append(out_str)
    #         probability_topk_list.append(probs_out)
    #
    #     return words_line, letters_line, out_str_list, probability_topk_list

    def calculate_phrase_p(self, top_phrase, probability_p_topk,
                           phrase_p_top_k):
        """Combine the "emit a phrase" probability with the top phrase's score.

        ``phrase_p_top_k.index(1)`` locates class 1 ("a phrase should be
        emitted") in the binary phrase-gate prediction; its probability is
        multiplied by the phrase's own probability ``top_phrase[1]``.
        Returns ``(gate_probability, combined_probability)``.
        """
        is_phrase_p = probability_p_topk[phrase_p_top_k.index(1)]
        phrase_p = is_phrase_p * top_phrase[1]
        return is_phrase_p, phrase_p

    def final_words_out(self, words, top_phrase, phrase_p, probability_topk):
        """Insert the top phrase at the first candidate slot it beats.

        Scans candidates in rank order and overwrites the first one whose
        probability is <= ``phrase_p``; lists are modified in place and
        also returned.
        """
        for i in range(len(probability_topk)):
            if phrase_p >= probability_topk[i]:
                probability_topk[i] = phrase_p
                words[i] = top_phrase[0]
                break
        return words, probability_topk

    def result_print(self, out_string, out_prob):
        """Format words/probabilities as 'word:prob' fields joined by '|'.

        An empty word keeps its slot with the fixed probability "0.0".
        """
        string = ""
        for (word, prob) in zip(out_string, out_prob):
            prob = str(prob) if word != "" else "0.0"
            string = string + word + ":" + prob + "|"
        string = string[:-1]
        return string

    # def predict_file(self, test_file_in, test_file_out, k):
    #     testfilein = open(test_file_in, "r")
    #     testfileout = open(test_file_out, 'w')
    #     t1 = time.time()
    #
    #     for sentence in testfilein:
    #         sentence = sentence.rstrip()
    #         result = self.predict_data(sentence, k)
    #
    #         if result is not None:
    #             words_line, letters_line, out_words_list, out_prob_list = result
    #
    #             for i in range(len(out_words_list)):
    #                 print("\t".join(words_line[:i])
    #                      + "|#|" + letters_line[i]
    #                      + "|#|" + "\t".join(words_line[i:]) + "|#|"
    #                       + '\t'.join([self.result_print(out_words, out_prob)
    #                                    for (out_words, out_prob) in zip(out_words_list[i], out_prob_list[i])])
    #                       + "\n")
    #                 testfileout.write("\t".join(words_line[:i])
    #                                   + "|#|" + letters_line[i]
    #                                   + "|#|" + "\t".join(words_line[i:]) + "|#|"
    #                                   + '\t'.join([self.result_print(out_words, out_prob)
    #                                         for (out_words, out_prob) in zip(out_words_list[i], out_prob_list[i])])
    #                                   + "\n")
    #
    #     t2 = time.time()
    #     print(t2 - t1)
    #     testfilein.close()
    #     testfileout.close()

    def predict_data(self, sentence, k):
        """Predict top-k words after each keystroke of the word to complete.

        Feeds the context through the word LM for state only (no phrase
        substitution here), then runs the letter model per keystroke and
        collects the top-k output words.  Returns the result list rendered
        with ``str``.
        """
        sentence = sentence.rstrip()
        inputs, inputs_key, words_num, letters_num = self._data_utility.data2ids_line(
            sentence)  # context word ids, keystroke ids of the word to predict, context word count, letter count of the word to predict
        words_out = []
        lm_state = np.zeros(
            [self._config.num_layers, 2, 1, self._config.word_hidden_size],
            dtype=np.float32)
        kc_state = np.zeros(
            [self._config.num_layers, 2, 1, self._config.letter_hidden_size],
            dtype=np.float32)
        if len(inputs) > 0:
            for i in range(len(inputs)):
                feed_values = {self.lm_input_name: [[inputs[i]]]}
                if i > 0:
                    feed_values[self.lm_state_in_name] = lm_state
                # probabilities is an ndarray of shape (batch_size * time_step) * vocab_size
                # For inference, batch_size = num_step = 1, thus probabilities.shape = 1 * vocab_size
                result = self._sess.run([self.lm_state_out_name],
                                        feed_dict=feed_values)
                lm_state = result[0]
                #probability_topk = [probabilities[0][id] for id in top_k_predictions[0]]
                #words = self._data_utility.ids2outwords(top_k_predictions[0])
                #words_out.append(words)

        for i in range(len(inputs_key)):
            feed_values = {
                self.kc_input_name: [[inputs_key[i]]],
                self.kc_top_k_name: k
            }
            if i > 0 or len(inputs) == 0:
                feed_values[self.kc_state_in_name] = kc_state
            else:
                # First keystroke continues from the word-LM state.
                feed_values[self.kc_lm_state_in_name] = lm_state
            #print (state_out)
            probabilities, top_k_predictions, kc_state = self._sess.run(
                [
                    self.kc_output_name, self.kc_top_k_prediction_name,
                    self.kc_state_out_name
                ],
                feed_dict=feed_values)
            probability_topk = [
                probabilities[0][id] for id in top_k_predictions[0]
            ]
            words = self._data_utility.ids2outwords(top_k_predictions[0])
            words_out.append(words)
        # With no context, the first keystroke's candidates are blanked out.
        out_str = str(words_out if words_num > 0 else [['', '', '']] +
                      words_out[1:])
        return out_str

    def predict_file(self, test_file_in, test_file_out, k):
        """Predict every (lower-cased) line of ``test_file_in`` and log results.

        Writes "original | predictions" lines to ``test_file_out`` and
        prints total elapsed time.
        """
        testfilein = open(test_file_in, "r")
        testfileout = open(test_file_out, 'w')
        t1 = time.time()
        topk = k
        for sentence in testfilein:
            sentence = sentence.rstrip()
            sentence_in = sentence.lower()
            out_str = self.predict_data(sentence_in, topk)
            if (out_str):
                print(sentence + " | " + out_str)
                testfileout.write(sentence + " | " + out_str + "\n")
            else:
                print("predict error : " + sentence)
        t2 = time.time()
        print(t2 - t1)
        testfilein.close()
        testfileout.close()
示例#18
0
def main(_):
    """Train, finetune, or learn sparse bases for the PTB-style model.

    The action is selected by ``FLAGS.mode``:
      * "pretrain":    train from scratch (resuming from a checkpoint when
                       one exists), export the inference graph, then
                       evaluate on the test feeder.
      * "learn_basis": restore the pretrained model, learn sparse embedding
                       and softmax bases, then exit.
      * "finetune":    continue training from the pretrained checkpoint,
                       exporting the graph every epoch, then evaluate.

    Raises:
        ValueError: if ``--data_path`` is not set.
    """
    if not FLAGS.data_path:
        raise ValueError("Must set --data_path to PTB data directory")

    logfile = open(FLAGS.mode + '-' + FLAGS.model_config + '.log', 'w')
    # logfile = sys.stdout

    if not os.path.isdir(FLAGS.save_path):
        os.mkdir(FLAGS.save_path)
    if not os.path.isdir(FLAGS.graph_save_path):
        os.mkdir(FLAGS.graph_save_path)

    config = Config()
    config.get_config(FLAGS.model_config)

    # Online/export configuration: batch size and step count of 1 so the
    # exported graph can run one token at a time on device.
    test_config = Config()
    test_config.get_config(FLAGS.model_config)
    test_config.batch_size = 1
    test_config.num_steps = 1

    vocab_file_in_words = os.path.join(FLAGS.vocab_path, "vocab_in_words")
    vocab_file_in_letters = os.path.join(FLAGS.vocab_path, "vocab_in_letters")
    vocab_file_out = os.path.join(FLAGS.vocab_path, "vocab_out")

    train_file_in_words = os.path.join(FLAGS.data_path, "train_in_ids_words")
    train_file_in_letters = os.path.join(FLAGS.data_path,
                                         "train_in_ids_letters")
    train_file_out = os.path.join(FLAGS.data_path, "train_out_ids")
    dev_file_in_words = os.path.join(FLAGS.data_path, "dev_in_ids_words")
    dev_file_in_letters = os.path.join(FLAGS.data_path, "dev_in_ids_letters")
    dev_file_out = os.path.join(FLAGS.data_path, "dev_out_ids")

    data_utility = DataUtility(vocab_file_in_words=vocab_file_in_words,
                               vocab_file_in_letters=vocab_file_in_letters,
                               vocab_file_out=vocab_file_out,
                               max_sentence_length=config.num_steps)

    with tf.Graph().as_default():
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        gpu_config = tf.ConfigProto()
        gpu_config.gpu_options.per_process_gpu_memory_fraction = config.gpu_fraction
        with tf.Session(config=gpu_config) as session:
            with tf.name_scope("Train"):
                train_feeder = DataFeederContext(
                    vocab_file_in_words=vocab_file_in_words,
                    vocab_file_in_letters=vocab_file_in_letters,
                    vocab_file_out=vocab_file_out,
                    corpus_file_in_words=train_file_in_words,
                    corpus_file_in_letters=train_file_in_letters,
                    corpus_file_out=train_file_out,
                    max_sentence_length=config.num_steps)

                with tf.variable_scope("Model",
                                       reuse=None,
                                       initializer=initializer):
                    mtrain = PTBModel(is_training=True, config=config)
                tf.summary.scalar("Training Loss", mtrain.cost)
                tf.summary.scalar("Learning Rate", mtrain.lr)

            # Validation model shares the training variables (reuse=True).
            with tf.name_scope("Valid"):
                valid_feeder = DataFeederContext(
                    vocab_file_in_words=vocab_file_in_words,
                    vocab_file_in_letters=vocab_file_in_letters,
                    vocab_file_out=vocab_file_out,
                    corpus_file_in_words=dev_file_in_words,
                    corpus_file_in_letters=dev_file_in_letters,
                    corpus_file_out=dev_file_out,
                    max_sentence_length=config.num_steps)
                with tf.variable_scope("Model",
                                       reuse=True,
                                       initializer=initializer):
                    mvalid = PTBModel(is_training=False, config=config)
                tf.summary.scalar("Validation Loss", mvalid.cost)

            # Evaluate on test data
            # NOTE(review): the "Test" feeder reads the dev_* files, not a
            # separate test set -- confirm this is intentional.
            with tf.name_scope("Test"):
                test_feeder = DataFeederContext(
                    vocab_file_in_words=vocab_file_in_words,
                    vocab_file_in_letters=vocab_file_in_letters,
                    vocab_file_out=vocab_file_out,
                    corpus_file_in_words=dev_file_in_words,
                    corpus_file_in_letters=dev_file_in_letters,
                    corpus_file_out=dev_file_out,
                    max_sentence_length=config.num_steps)
                with tf.variable_scope("Model",
                                       reuse=True,
                                       initializer=initializer):
                    mtest = PTBModel(is_training=False, config=config)

            # Model to be saved and exported
            # Note: it's beneficial to distinguish between test model and save model,
            # because when evaluating on test set, a large batch size is more GPU-friendly and faster.
            # But when running on cellphone, it can accept a batch size of 1 only, this is why monline exists.
            with tf.name_scope("Online"):
                with tf.variable_scope("Model",
                                       reuse=True,
                                       initializer=initializer):
                    monline = PTBModel(is_training=False, config=test_config)

            # Do not restore sparse weights from pretrain phase
            restore_variables = dict()
            for v in tf.trainable_variables():
                if v.name.startswith("Model/Softmax/softmax_sp_trainable_weights") \
                        or v.name.startswith("Model/Embedding/embedding_sp_trainable_weights"):
                    continue
                print("store:", v.name)
                restore_variables[v.name] = v

            sv = tf.train.Saver(restore_variables)
            if not FLAGS.model_name.endswith(".ckpt"):
                FLAGS.model_name += ".ckpt"

            session.run(tf.global_variables_initializer())
            if FLAGS.mode == "pretrain":
                # restore previously trained model
                check_point_dir = os.path.join(FLAGS.save_path, "pretrain")
                ckpt = tf.train.get_checkpoint_state(check_point_dir)
                if ckpt and tf.train.checkpoint_exists(
                        ckpt.model_checkpoint_path):
                    print("Reading model parameters from %s" %
                          ckpt.model_checkpoint_path)
                    sv.restore(session, ckpt.model_checkpoint_path)
                else:
                    print("Created model with fresh parameters.")
                for i in range(config.max_max_epoch // FLAGS.laptop_discount):
                    # Decay the learning rate once past config.max_epoch.
                    lr_decay = config.lr_decay**max(i + 1 - config.max_epoch,
                                                    0)
                    mtrain.assign_lr(session, config.learning_rate * lr_decay)

                    print(time.strftime('%Y-%m-%d %H:%M:%S'), file=logfile)
                    print("Epoch: %d Learning rate: %.3f" %
                          (i + 1, session.run(mtrain.lr)),
                          file=logfile)
                    train_perplexity = run_epoch(session,
                                                 mtrain,
                                                 eval_op=mtrain.train_op,
                                                 data_feeder=train_feeder,
                                                 verbose=True)

                    print(time.strftime('%Y-%m-%d %H:%M:%S'), file=logfile)
                    print("Epoch: %d Train Perplexity: %.3f" %
                          (i + 1, train_perplexity),
                          file=logfile)
                    logfile.flush()

                    valid_perplexity = run_epoch(session,
                                                 mvalid,
                                                 data_feeder=valid_feeder)
                    print(time.strftime('%Y-%m-%d %H:%M:%S'), file=logfile)
                    print("Epoch: %d Valid Perplexity: %.3f" %
                          (i + 1, valid_perplexity),
                          file=logfile)
                    logfile.flush()

                    print("save path:", FLAGS.save_path)

                    # Save model if FLAGS.mode == "pretrain" or "finetune"
                    if FLAGS.save_path:
                        print("Saving model to %s." % FLAGS.save_path,
                              file=logfile)
                        step = mtrain.get_global_step(session)
                        pretrain_save_path = os.path.join(
                            FLAGS.save_path, "pretrain")
                        if not os.path.isdir(pretrain_save_path):
                            os.mkdir(pretrain_save_path)
                        model_save_path = os.path.join(pretrain_save_path,
                                                       FLAGS.model_name)
                        sv.save(session, model_save_path, global_step=step)

                print("[" + time.strftime('%Y-%m-%d %H:%M:%S') +
                      "] Begin exporting graph!")
                export_graph(session)  # Export dense graph
                print("[" + time.strftime('%Y-%m-%d %H:%M:%S') +
                      "] Finish exporting graph!")

                # Evaluate on test data for {"pretrain", "finetune",} phase
                # (a duplicated "Begin exporting graph!" + export_graph()
                # copy-paste block that re-exported the same graph was
                # removed here)
                print("[" + time.strftime('%Y-%m-%d %H:%M:%S') +
                      "] Begin test epoch!")
                sys.stdout.flush()
                print("=" * 30 + FLAGS.mode + "=" * 30, file=logfile)
                test_perplexity = run_evaluate_epoch(
                    session,
                    mtest,
                    logfile,
                    word_dict=data_utility.id2token_out,
                    data_feeder=test_feeder)
                print("Test Perplexity: %.3f" % test_perplexity, file=logfile)
                print("[" + time.strftime('%Y-%m-%d %H:%M:%S') +
                      "] Finish test epoch!")
                print("Test Perplexity: %.3f" %
                      test_perplexity)  # print to stdout
                logfile.close()

            elif FLAGS.mode == "learn_basis":
                sv.restore(
                    session,
                    tf.train.latest_checkpoint(
                        os.path.join(FLAGS.save_path, "pretrain")))

                print("[" + time.strftime('%Y-%m-%d %H:%M:%S') +
                      "] Begin learning embedding basis!")
                learn_sparse_embedding(session, mtrain, verbose=True)
                print("[" + time.strftime('%Y-%m-%d %H:%M:%S') +
                      "] Finish learning embedding basis!")

                print("[" + time.strftime('%Y-%m-%d %H:%M:%S') +
                      "] Begin learning softmax basis!")
                learn_sparse_softmax(session, mtrain, verbose=True)
                print("[" + time.strftime('%Y-%m-%d %H:%M:%S') +
                      "] Finish learning softmax basis!")
                sys.exit(0)

            elif FLAGS.mode == "finetune":
                # Restore pre-trained model
                sv.restore(
                    session,
                    tf.train.latest_checkpoint(
                        os.path.join(FLAGS.save_path, "pretrain")))
                for i in range(config.finetune_epoch // FLAGS.laptop_discount):
                    lr_decay = config.lr_decay**(i // config.max_epoch)
                    mtrain.assign_lr(session,
                                     config.finetune_learning_rate * lr_decay)

                    print(time.strftime('%Y-%m-%d %H:%M:%S'), file=logfile)
                    print("Epoch: %d Learning rate: %.3f" %
                          (i + 1, session.run(mtrain.lr)),
                          file=logfile)
                    train_perplexity = run_epoch(session,
                                                 mtrain,
                                                 eval_op=mtrain.train_op,
                                                 data_feeder=train_feeder,
                                                 verbose=True)

                    print(time.strftime('%Y-%m-%d %H:%M:%S'), file=logfile)
                    print("Epoch: %d Train Perplexity: %.3f" %
                          (i + 1, train_perplexity),
                          file=logfile)
                    logfile.flush()

                    print(time.strftime('%Y-%m-%d %H:%M:%S'), file=logfile)
                    valid_perplexity = run_epoch(session,
                                                 mvalid,
                                                 data_feeder=valid_feeder)
                    print("Epoch: %d Valid Perplexity: %.3f" %
                          (i + 1, valid_perplexity),
                          file=logfile)
                    logfile.flush()

                    # Save model if FLAGS.mode == "pretrain" or "finetune"
                    if FLAGS.save_path:
                        print("Saving model to %s." % FLAGS.save_path,
                              file=logfile)
                        step = mtrain.get_global_step(session)
                        finetune_save_path = os.path.join(
                            FLAGS.save_path, "finetune-" + FLAGS.model_config)
                        if not os.path.isdir(finetune_save_path):
                            os.mkdir(finetune_save_path)
                        model_save_path = os.path.join(finetune_save_path,
                                                       FLAGS.model_name)
                        sv.save(session, model_save_path, global_step=step)

                    # Export sparse graph at every iteration
                    print("[" + time.strftime('%Y-%m-%d %H:%M:%S') +
                          "] Begin exporting graph!")
                    export_graph(session)  # Export dense graph
                    print("[" + time.strftime('%Y-%m-%d %H:%M:%S') +
                          "] Finish exporting graph!")

                # Evaluate on test data for {"pretrain", "finetune",} phase
                print("[" + time.strftime('%Y-%m-%d %H:%M:%S') +
                      "] Begin test epoch!")
                sys.stdout.flush()
                print("=" * 30 + FLAGS.mode + "=" * 30, file=logfile)
                test_perplexity = run_evaluate_epoch(
                    session,
                    mtest,
                    logfile,
                    word_dict=data_utility.id2token_out,
                    data_feeder=test_feeder)
                print("[" + time.strftime('%Y-%m-%d %H:%M:%S') +
                      "] Finish test epoch!")
                print("Test Perplexity: %.3f" % test_perplexity, file=logfile)
                print("Test Perplexity: %.3f" %
                      test_perplexity)  # print to stdout
                logfile.close()
示例#19
0
    def train(self, target):
        """Train a six-layer Conv2D binary classifier for *target*.

        Loads the local .npz spectrogram data for the target, holds out 10%
        for validation, trains for 20 epochs with TensorBoard logging,
        saves the model under ``self.save_dir`` and returns the per-epoch
        training accuracy history.
        """
        start_time = time()

        # Fixed spectrogram geometry and training hyper-parameters.
        img_width, img_height = 26, 99
        epochs = 20
        batch_size = 32

        tb_callback = CB.TensorBoard(log_dir='./logs',
                                     histogram_freq=0,
                                     batch_size=1,
                                     write_graph=True,
                                     write_grads=True,
                                     write_images=True,
                                     embeddings_freq=0,
                                     embeddings_layer_names=None,
                                     embeddings_metadata=None)

        m = models.Models()
        print('Training with target "{0}".'.format(target))
        du = DataUtility(bucket_id='kaggle_voice_data', root_folder='/')

        # Keras backends disagree on where the channel axis lives.
        if K.image_data_format() == 'channels_first':
            input_shape = (1, img_width, img_height)
        else:
            input_shape = (img_width, img_height, 1)

        model = m.get_covn2d_six_layer_model(input_shape, 1)

        X, Y = du.load_local_binary_data('../../data/npz', target)
        # sklearn's train_test_split returns (X_train, X_test, y_train,
        # y_test); name the pieces truthfully.
        inputs_train, inputs_val, labels_train, labels_val = train_test_split(
            X, Y, test_size=0.1, random_state=42)

        # Append the trailing single-channel axis expected by Conv2D.
        inputs_train = np.expand_dims(inputs_train, axis=3)
        inputs_val = np.expand_dims(inputs_val, axis=3)

        history = model.fit(x=inputs_train,
                            y=labels_train,
                            validation_data=(inputs_val, labels_val),
                            batch_size=batch_size,
                            epochs=epochs,
                            verbose=0,
                            callbacks=[tb_callback])

        stop_time = time()
        print("Total training time:  {0} seconds.".format(
            int(stop_time - start_time)))
        du.save_multi_model(self.save_dir, '{0}'.format(target), model)
        print("Model saved as {0}.h5".format(target))
        return {"name": target, "accuracy": history.history['acc']}
示例#20
0
class InputEngineRnn:
    def __init__(self,
                 model_path,
                 model_name,
                 config_name,
                 full_vocab_path=None):
        """Load vocabularies, build the inference-mode RNN graph and restore
        checkpoint weights into a dedicated TensorFlow session.

        Args:
            model_path: directory containing the vocab files, checkpoint
                and config file.
            model_name: checkpoint file name inside model_path.
            config_name: config file name inside model_path.
            full_vocab_path: optional path to an extended input-word
                vocabulary passed through to DataUtility.
        """
        vocab_file_in_words = os.path.join(model_path, "vocab_in_words")
        vocab_file_in_letters = os.path.join(model_path, "vocab_in_letters")
        vocab_file_out = os.path.join(model_path, "vocab_out")
        model_file = os.path.join(model_path, model_name)
        config_file = os.path.join(model_path, config_name)

        self._config = Config()
        self._config.get_config(config_file)
        self._data_utility = DataUtility(
            vocab_file_in_words=vocab_file_in_words,
            vocab_file_in_letters=vocab_file_in_letters,
            vocab_file_out=vocab_file_out,
            max_sentence_length=self._config.num_steps,
            full_vocab_file_in_words=full_vocab_path)
        # Online inference feeds one token at a time, so force a 1x1 input
        # shape (note: DataUtility above was built with the original num_steps).
        self._config.batch_size = 1
        self._config.num_steps = 1

        with tf.Graph().as_default():
            with tf.variable_scope("Model"):
                self._language_model_test = PTBModel(is_training=False,
                                                     config=self._config,
                                                     bucket=1)

            gpu_config = tf.ConfigProto()
            gpu_config.gpu_options.per_process_gpu_memory_fraction = self._config.gpu_fraction
            self._sess = tf.Session(config=gpu_config)
            with self._sess.as_default():
                # Do not restore sparse weights from pretrain phase
                restore_variables = dict()
                for v in tf.trainable_variables():
                    # The sparse softmax/embedding weights are excluded from
                    # the checkpoint, so restoring them would fail.
                    if v.name.startswith("Model/Softmax/softmax_sp_trainable_weights") \
                            or v.name.startswith("Model/Embedding/embedding_sp_trainable_weights"):
                        continue
                    print("restore:", v.name)
                    restore_variables[v.name] = v
                saver = tf.train.Saver(restore_variables)
                saver.restore(self._sess, model_file)

            # Tensors fetched on every inference step: top-k prediction ids,
            # the full output distribution, and the recurrent state to carry
            # into the next step.
            self._fetches = {
                "topk": self._language_model_test._top_k_prediction,
                "probability": self._language_model_test._probabilities,
                "final_state": self._language_model_test.final_state
            }

    def predict(self, sentence, k):
        """Feed *sentence* through the model token by token and return the
        top-k candidates for the next word.

        Each candidate is a dict with 'word' and 'probability' keys; an
        '<unk>' prediction is replaced by the raw input letters wrapped in
        angle brackets. Returns [] when no output words are produced.
        """
        model = self._language_model_test
        state = self._sess.run(model.initial_state)
        input_ids, word_letters = self._data_utility.sentence2ids(sentence)
        # Step the RNN over the whole prefix, threading the recurrent state;
        # only the distribution after the final token matters.
        for token_id in input_ids:
            feed = {
                model.initial_state: state,
                model.input_data: [[token_id]],
                model.target_data: [[0]],
                model.output_masks: [[0.0]],
                model.top_k: k,
            }
            vals = self._sess.run(self._fetches, feed_dict=feed)
            state = vals["final_state"]
        top_ids = vals["topk"][0]
        distribution = vals["probability"][0]
        top_probs = [distribution[i] for i in top_ids]
        words_out = self._data_utility.ids2outwords(top_ids)
        if not words_out:
            return []
        predictions = []
        for word, prob in zip(words_out, top_probs):
            label = word if word != '<unk>' else '<' + word_letters + '>'
            predictions.append({'word': label, 'probability': float(prob)})
        return predictions

    def predict_data(self, sentence):
        """Run top-3 next-word prediction over one data line.

        Returns the str() of the list of per-position top-3 word lists for
        the positions selected by (words_num, letters_num), or None when the
        line cannot be converted to ids.
        """
        sentence = sentence.rstrip()
        state = self._sess.run(self._language_model_test.initial_state)
        inputs, words_num, letters_num = self._data_utility.data2ids_line(
            sentence)
        # Fix: identity comparison with None ('is', not '==').
        if inputs is None:
            return None
        words_out = []
        for token_id in inputs:
            vals = self._sess.run(self._fetches,
                                  feed_dict={
                                      self._language_model_test.initial_state:
                                      state,
                                      self._language_model_test.input_data:
                                      [[token_id]],
                                      self._language_model_test.target_data:
                                      [[0]],
                                      self._language_model_test.output_masks:
                                      [[0.0]],
                                      self._language_model_test.top_k:
                                      3
                                  })
            state = vals["final_state"]
            words_out.append(self._data_utility.ids2outwords(vals["topk"][0]))
        # The original packed this into one conditional expression whose '+'
        # bound inside the else-branch only; made explicit here: with no
        # leading words, an empty triple is prepended to the letter slice.
        if words_num > 0:
            selected = words_out[words_num - 1:words_num + letters_num]
        else:
            selected = [['', '', '']] + words_out[0:letters_num]
        return str(selected)

    def predict_file(self, test_file_in, test_file_out):
        testfilein = open(test_file_in, "r")
        testfileout = open(test_file_out, 'w')
        t1 = time.time()
        for sentence in testfilein:
            sentence = sentence.rstrip()
            out_str = self.predict_data(sentence)
            if (out_str):
                print(sentence + " |#| " + out_str)
                testfileout.write(sentence + " |#| " + out_str + "\n")
            else:
                print("predict error : " + sentence)
        t2 = time.time()
        print(t2 - t1)
        testfilein.close()
        testfileout.close()

    def predict_data_probability(self, sentence):
        """Like predict_data, but pairs each top-3 word with its probability.

        Output format: 'word # p' fields joined by ' | ' within a position
        and ' || ' between positions. Returns None when the line cannot be
        converted to ids.
        """
        sentence = sentence.rstrip()
        state = self._sess.run(self._language_model_test.initial_state)
        inputs, words_num, letters_num = self._data_utility.data2ids_line(
            sentence)
        # Fix: identity comparison with None ('is', not '==').
        if inputs is None:
            return None
        words_out = []
        probability_out = []
        for token_id in inputs:
            vals = self._sess.run(self._fetches,
                                  feed_dict={
                                      self._language_model_test.initial_state:
                                      state,
                                      self._language_model_test.input_data:
                                      [[token_id]],
                                      self._language_model_test.target_data:
                                      [[0]],
                                      self._language_model_test.output_masks:
                                      [[0.0]],
                                      self._language_model_test.top_k:
                                      3
                                  })
            state = vals["final_state"]
            top3 = vals["topk"][0]
            distribution = vals["probability"][0]
            words_out.append(self._data_utility.ids2outwords(top3))
            probability_out.append([distribution[i] for i in top3])

        def join_lines(words_slice, probs_slice):
            # Shared formatting pass (the original duplicated this loop
            # verbatim in both branches).
            joined = ''
            for words, probabilities in zip(words_slice, probs_slice):
                line = ''
                for word, probability in zip(words, probabilities):
                    line = line + " | " + word + ' # ' + '{:.8f}'.format(
                        probability)
                # NOTE(review): [3:-1] strips the leading ' | ' but also the
                # last digit of the final probability; preserved as-is since
                # downstream parsers may depend on this exact format — confirm.
                line = line[3:-1]
                joined = joined + " || " + line
            return joined

        if words_num > 0:
            out_str = join_lines(
                words_out[words_num - 1:words_num + letters_num],
                probability_out[words_num - 1:words_num + letters_num])
            # Original trims the leading ' || ' and the trailing char only
            # in this branch; kept byte-compatible.
            out_str = out_str[4:-1]
        else:
            out_str = join_lines(words_out[0:letters_num],
                                 probability_out[0:letters_num])
        return out_str

    def predict_file_probability(self, test_file_in, test_file_out):
        testfilein = open(test_file_in, "r")
        testfileout = open(test_file_out, 'w')
        t1 = time.time()
        for sentence in testfilein:
            sentence = sentence.rstrip()
            out_str = self.predict_data_probability(sentence)
            if (out_str):
                print(sentence + " |#| " + out_str)
                testfileout.write(sentence + " |#| " + out_str + "\n")
            else:
                print("predict error : " + sentence)
        t2 = time.time()
        print(t2 - t1)
        testfilein.close()
        testfileout.close()

    def save_model(self, out_path):
        """Serialize the session's GraphDef to <out_path>/graph_rnn.pb
        (as_text=False, i.e. binary protobuf). Weights are not included —
        write_graph exports only the graph structure."""
        tf.train.write_graph(self._sess.graph_def, out_path, "graph_rnn.pb",
                             False)