示例#1
0
def batch_generator(descs, batch_size, train_vocab, word_ngrams, sort_ngrams, shuffle=False,
                    show_progress=True):
    """
    Yield padded batches of vocabulary ids together with their averaging weights.

    Every description is encoded as ``[0] + ids`` where ``ids`` are the
    ``train_vocab`` ids of the phrases produced by ``get_all`` that are present
    in ``train_vocab``. Encoded descriptions are memoised in the module-level
    ``cache`` under the key ``hash(str(desc))``.

    NOTE(review): the cache key depends on the description only, not on
    ``train_vocab`` / ``word_ngrams`` / ``sort_ngrams`` -- entries written with
    other settings would be reused as-is; confirm this is intended.

    :param descs: indexable collection of descriptions (each is passed to ``get_all``)
    :param batch_size: batch size, forwarded to ``next_batch``
    :param train_vocab: dict mapping phrase -> dict with an ``"id"`` entry
    :param word_ngrams: forwarded to ``get_all``
    :param sort_ngrams: forwarded to ``get_all``
    :param shuffle: forwarded to ``next_batch``
    :param show_progress: bool, display a tqdm progress bar
    :return: generator of ``(batch, batch_weights)``; ``batch`` is a list of
        equal-length id lists (right-padded with 0), ``batch_weights`` is the
        matching weight matrix with an extra trailing axis
    """
    global cache
    rem_inds, batch_inds = next_batch(np.arange(len(descs)), batch_size, shuffle)

    progress_bar = None
    if show_progress:
        progress_bar = tqdm(total=int(np.ceil(len(descs) / batch_size)))

    # try/finally guarantees the bar is closed even when the consumer stops
    # iterating early (generator closed / garbage collected) or an error occurs.
    try:
        while len(batch_inds) > 0:
            batch_descs = [descs[i] for i in batch_inds]
            desc_hashes = [hash(str(desc)) for desc in batch_descs]
            batch = [[0] + [train_vocab[phrase]["id"] for phrase in get_all(desc, word_ngrams, sort_ngrams) if
                            phrase in train_vocab] if h not in cache else cache[h] for
                     desc, h in zip(batch_descs, desc_hashes)]

            # Remember freshly encoded descriptions for later epochs / calls.
            for h, desc_inds in zip(desc_hashes, batch):
                if h not in cache:
                    cache[h] = desc_inds

            # Uniform weights so that the weighted sum is a mean over the real ids.
            batch_weights = [[1 / len(i) for _ in range(len(i))] for i in batch]

            # Right-pad ids and weights with zeros up to the longest row.
            # "+" builds new lists, so the cached lists are never mutated.
            mx_len = max(len(i) for i in batch)
            batch = [i + [0] * (mx_len - len(i)) for i in batch]
            batch_weights = [w + [0] * (mx_len - len(w)) for w in batch_weights]

            rem_inds, batch_inds = next_batch(rem_inds, batch_size, shuffle)
            if progress_bar is not None:
                progress_bar.update()
            yield batch, np.expand_dims(batch_weights, axis=2)
    finally:
        if progress_bar is not None:
            progress_bar.close()
示例#2
0
def batch_generator(descs,
                    labels,
                    batch_size,
                    train_vocab,
                    labels_lookup,
                    word_ngrams,
                    shuffle=False):
    """
    Yield padded id batches, their averaging weights and the mapped labels.

    A description becomes ``[0] + ids`` (ids of the ``get_all`` phrases found
    in ``train_vocab``); encodings are memoised in the module-level ``cache``
    keyed by ``hash(str(desc))``. Labels are translated through
    ``labels_lookup``.

    :param descs: indexable collection of descriptions
    :param labels: indexable collection of labels, aligned with ``descs``
    :param batch_size: forwarded to ``next_batch``
    :param train_vocab: dict mapping phrase -> dict with an ``"id"`` entry
    :param labels_lookup: mapping from raw label to the value yielded per item
    :param word_ngrams: forwarded to ``get_all``
    :param shuffle: forwarded to ``next_batch``
    :return: generator of ``(batch, batch_weights, batch_labels)``
    """
    global cache
    remaining, current = next_batch(np.arange(len(descs)), batch_size, shuffle)

    while len(current) > 0:
        texts = [descs[idx] for idx in current]
        keys = [hash(str(text)) for text in texts]

        # Phase 1: encode every description, reusing cached encodings.
        encoded = []
        for text, key in zip(texts, keys):
            if key in cache:
                encoded.append(cache[key])
                continue
            ids = [0]
            for phrase in get_all(text, word_ngrams):
                if phrase in train_vocab:
                    ids.append(train_vocab[phrase]["id"])
            encoded.append(ids)

        # Phase 2: store the encodings that were not cached yet.
        for key, ids in zip(keys, encoded):
            if key not in cache:
                cache[key] = ids

        # Uniform weights: each real id contributes 1 / (number of ids).
        weights = []
        for ids in encoded:
            count = len(ids)
            weights.append([1 / count for _ in range(count)])

        mapped_labels = [labels_lookup[labels[idx]] for idx in current]

        # Right-pad with zeros to the longest row; "+" never mutates cached lists.
        longest = max(len(ids) for ids in encoded)
        padded_ids = [ids + [0] * (longest - len(ids)) for ids in encoded]
        padded_weights = [row + [0] * (longest - len(row)) for row in weights]

        # Fetch the next batch before yielding, as the indices are consumed eagerly.
        remaining, current = next_batch(remaining, batch_size, shuffle)
        yield padded_ids, np.expand_dims(padded_weights, axis=2), mapped_labels
示例#3
0
    def _batch_generator(self, list_of_texts, batch_size):
        """
        Yield padded batches of word indices and their weights for the given texts.

        Texts are converted with ``str`` (and ``self.preprocessing_function``
        when it is set), split on whitespace and expanded with ``get_all``.
        Phrases missing from ``self.train_vocab`` are dropped; every row starts
        with index 0 and is right-padded with zeros.

        :param list_of_texts: list/array
        :param batch_size: int
        :return: generator of (batch word indices, batch word weights)
        """
        preprocess = self.preprocessing_function
        if preprocess:
            texts = [preprocess(str(text)) for text in list_of_texts]
        else:
            texts = [str(text) for text in list_of_texts]

        remaining, current = next_batch(np.arange(len(texts)), batch_size)

        while len(current) > 0:
            phrases_per_text = []
            for position in current:
                tokens = texts[position].split()
                phrases_per_text.append(
                    list(get_all(tokens, self.info["word_ngrams"], self.info["sort_ngrams"])))

            # +1 accounts for the leading 0 index prepended to every row.
            row_width = max(len(phrases) for phrases in phrases_per_text) + 1

            index_rows, weight_rows = [], []
            for phrases in phrases_per_text:
                known = [0]
                for phrase in phrases:
                    if phrase in self.train_vocab:
                        known.append(self.train_vocab[phrase]["id"])
                n_known = len(known)

                # Only the real (non-padding) positions receive a weight.
                weights = np.zeros(row_width, dtype=float)
                weights[:n_known] = 1. / n_known

                index_rows.append(known + [0] * (row_width - n_known))
                weight_rows.append(weights)

            remaining, current = next_batch(remaining, batch_size)

            yield np.array(index_rows), np.expand_dims(weight_rows, 2)
示例#4
0
    def _batch_generator(self, list_of_texts, batch_size, show_progress=False):
        """
        Generate batch from list of texts

        Texts are converted with ``str`` (and ``self.preprocessing_function``
        when it is set), split on whitespace and expanded with ``get_all``.
        Phrases missing from ``self.word_dict`` are dropped; every row starts
        with index 0 and is right-padded with zeros. The progress bar is
        suppressed when all texts fit into a single batch, and it is always
        closed, even if the caller stops iterating early.

        :param list_of_texts: list/array
        :param batch_size: int
        :param show_progress: bool, show progress bar
        :return: batch word indices, batch word weights
        """
        if self.preprocessing_function:
            list_of_texts = [
                self.preprocessing_function(str(text))
                for text in list_of_texts
            ]
        else:
            list_of_texts = [str(text) for text in list_of_texts]
        indices = np.arange(len(list_of_texts))
        remaining_indices, batch_indices = next_batch(indices, batch_size)

        # A single batch needs no progress reporting.
        if len(list_of_texts) <= batch_size:
            show_progress = False

        progress_bar = tqdm(total=int(np.ceil(len(list_of_texts) /
                                              batch_size)),
                            disable=not show_progress)

        # try/finally closes the bar when the generator is abandoned or fails,
        # not only when it runs to exhaustion.
        try:
            while len(batch_indices) > 0:
                batch, batch_weights = [], []

                batch_descriptions = [
                    list(
                        get_all(list_of_texts[index].split(),
                                self.info["word_ngrams"],
                                self.info["sort_ngrams"]))
                    for index in batch_indices
                ]
                # +1 accounts for the leading 0 index prepended to every row.
                num_max_words = max(
                    len(batch_description)
                    for batch_description in batch_descriptions) + 1

                for batch_description in batch_descriptions:
                    initial_indices = [0] + [
                        self.word_dict[phrase]["id"]
                        for phrase in batch_description
                        if phrase in self.word_dict
                    ]

                    description_indices = np.array(
                        initial_indices +
                        [0] * (num_max_words - len(initial_indices)))
                    # Only the real (non-padding) positions receive a weight.
                    description_weights = np.zeros_like(description_indices,
                                                        dtype=np.float32)
                    description_weights[:len(initial_indices)] = \
                        1. / len(initial_indices)

                    batch.append(description_indices)
                    batch_weights.append(description_weights)
                remaining_indices, batch_indices = next_batch(
                    remaining_indices, batch_size)

                progress_bar.update()
                yield batch, batch_weights
        finally:
            progress_bar.close()
示例#5
0
def _str2bool(value):
    """
    Parse a command-line boolean.

    ``argparse`` with ``type=bool`` treats every non-empty string (including
    "False") as True; this converter understands the usual spellings instead.

    :param value: str (or bool) taken from the command line
    :return: bool
    :raises argparse.ArgumentTypeError: if the value is not a recognised boolean
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    # The empty string was the only falsy input of the old ``type=bool``.
    if lowered in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError("boolean value expected, got {!r}".format(value))


def _load_json_with_fallback(primary_path, fallback_path):
    """Load JSON from ``primary_path`` if it is a file, otherwise from ``fallback_path``."""
    path = primary_path if os.path.isfile(primary_path) else fallback_path
    with open(path, "r") as infile:
        return json.load(infile)


def main():
    """
    Evaluate a frozen model on a test file, or interactively on typed queries.

    Reads ``model_params.json`` and ``model_best.pb`` from ``--model_dir``; the
    label and word dictionaries are taken from the paths stored in the model
    params, falling back to ``label_dict.json`` / ``word_dict.json`` inside the
    model directory.

    With ``--hand_check`` each typed line must contain a token starting with
    the label prefix; that token is the expected label and the remaining tokens
    form the description. Otherwise the test file is scored in batches and
    top-1 / top-k accuracy is printed. Datapoints whose label is unknown to the
    model are skipped for inference but still count in the accuracy
    denominator, i.e. they are scored as misses.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-md", "--model_dir", type=str, required=True,
                        help="path where model_best.pb and model_params.json are")
    parser.add_argument("-tp", "--test_path", type=str, help="path to test file")
    parser.add_argument("-lp", "--label_prefix", type=str, help="label prefix", default="__label__")
    parser.add_argument("-bs", "--batch_size", type=int, default=1024, help="batch size for inference")
    parser.add_argument("-k", "--top_k", type=int, default=1, help="calculate accuracy on top k predictions")
    parser.add_argument("-hc", "--hand_check", type=_str2bool, default=False,
                        help="test on manually inputted data")
    parser.add_argument("-gpu", "--use_gpu", type=_str2bool, default=True, help="use gpu for inference")
    parser.add_argument("-gpu_fr", "--gpu_fraction", type=float, default=0.4, help="what fraction of gpu to allocate")
    args = parser.parse_args()

    model_dir = args.model_dir
    model_params_path = os.path.join(model_dir, "model_params.json")
    model_path = os.path.join(model_dir, "model_best.pb")
    label_prefix = args.label_prefix

    if args.use_gpu:
        device = "/gpu:0"
        config = tf.ConfigProto(allow_soft_placement=True,
                                gpu_options=tf.GPUOptions(per_process_gpu_memory_fraction=args.gpu_fraction,
                                                          allow_growth=True))
    else:
        device = "/cpu:0"
        # Hide all CUDA devices so TensorFlow cannot grab GPU memory.
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
        config = tf.ConfigProto(allow_soft_placement=True)

    with open(model_params_path, "r") as infile:
        model_params = json.load(infile)
    label_dict = _load_json_with_fallback(model_params["label_dict_path"],
                                          os.path.join(model_dir, "label_dict.json"))
    word_dict = _load_json_with_fallback(model_params["word_dict_path"],
                                         os.path.join(model_dir, "word_dict.json"))
    word_ngrams = model_params["word_ngrams"]
    sort_ngrams = model_params["sort_ngrams"]

    # class id -> label string, to decode the argmax of the prediction.
    labels_dict_inverse = {label_info["id"]: label for label, label_info in label_dict.items()}

    with tf.device(device):
        with tf.Session(config=config) as sess:
            run_arg = load_graph(model_path, ["input:0", "input_weights:0", "prediction:0"])
            if args.hand_check:
                while True:
                    try:
                        query_description = input("Enter the description: ")
                    except EOFError:
                        # End of input (Ctrl-D / closed pipe) ends the session.
                        break

                    tokens = query_description.split()
                    label_tokens = [token for token in tokens if token.startswith(label_prefix)]
                    if not label_tokens:
                        print("No label found: expected a token starting with {!r}".format(label_prefix))
                        continue
                    label = label_tokens[0].split(label_prefix)[-1]
                    if label not in label_dict:
                        print("New label")
                        continue

                    # Drop the label token(s) themselves instead of cutting a
                    # fixed number of characters off the front of the query.
                    words = [token for token in tokens if not token.startswith(label_prefix)]
                    test_description_indices = \
                        np.expand_dims([0] + [word_dict[phrase]["id"] for phrase in
                                              get_all(words, word_ngrams, sort_ngrams)
                                              if phrase in word_dict], axis=0)

                    # Single query, no padding: every position gets the same weight.
                    test_desc_weights = np.full(test_description_indices.shape,
                                                1. / test_description_indices.shape[1],
                                                dtype=np.float32)

                    probabilities = np.squeeze(sess.run(run_arg[-1], feed_dict={run_arg[0]: test_description_indices,
                                                                                run_arg[1]: test_desc_weights}))

                    max_index = np.argmax(probabilities)
                    max_prob = probabilities[max_index]
                    predicted_label = labels_dict_inverse[max_index]
                    print(predicted_label == label, predicted_label, max_prob)
            else:
                test_descriptions, test_labels = parse_txt(args.test_path)
                num_test = len(test_descriptions)
                test_indices = np.arange(num_test)
                print("The total number of test datapoints: {}".format(num_test))

                num_thrown_for_label = 0
                accuracy_top_1, accuracy_top_k = 0, 0
                remaining_indices, batch_indices = next_batch(test_indices, args.batch_size)

                # The context manager closes the bar even if inference fails.
                with tqdm(total=int(np.ceil(num_test / args.batch_size))) as progress_bar:
                    while len(batch_indices) > 0:
                        rows, batch_labels2 = [], []
                        for i in batch_indices:
                            test_label = test_labels[i]
                            if test_label not in label_dict:
                                num_thrown_for_label += 1
                                continue
                            rows.append([0] + [word_dict[phrase]["id"] for phrase in
                                               get_all(test_descriptions[i].split(), word_ngrams, sort_ngrams)
                                               if phrase in word_dict])
                            batch_labels2.append(label_dict[test_label]["id"])

                        # A batch may be empty when every label in it is unknown;
                        # feeding an empty batch to the graph would fail.
                        if rows:
                            # Pad to the longest encoded row so the batch is
                            # always rectangular, whatever get_all returns.
                            num_max_words = max(len(row) for row in rows)
                            batch, batch_weights = [], []
                            for row in rows:
                                test_description_indices = np.array(row + [0] * (num_max_words - len(row)))
                                test_description_weights = np.zeros_like(test_description_indices,
                                                                         dtype=np.float32)
                                test_description_weights[:len(row)] = 1. / len(row)
                                batch.append(test_description_indices)
                                batch_weights.append(test_description_weights)

                            probabilities = sess.run(run_arg[-1],
                                                     feed_dict={run_arg[0]: batch, run_arg[1]: batch_weights})
                            # argsort is ascending, so the last k entries are the top k
                            # and the very last one is the top-1 prediction.
                            top_k = [np.argsort(i)[-args.top_k:] for i in probabilities]

                            accuracy_top_k += sum(i in j for i, j in zip(batch_labels2, top_k))
                            accuracy_top_1 += sum(i == j[-1] for i, j in zip(batch_labels2, top_k))

                        remaining_indices, batch_indices = next_batch(remaining_indices, args.batch_size)
                        progress_bar.update()

                # Avoid ZeroDivisionError on an empty test file (accuracy is then 0).
                denominator = max(num_test, 1)

                print("{} datapoint thrown because of label".format(num_thrown_for_label))
                print("Number of test datapoints after cleaning: {}".format(num_test - num_thrown_for_label))
                print("Number of unique labels in test after cleaning: {}".format(len(set(test_labels))))
                print("Accuracy: {}".format(round(100 * accuracy_top_1 / denominator, 2)))
                print("Accuracy top {}: {}".format(args.top_k, round(100 * accuracy_top_k / denominator, 2)))