예제 #1
0
def get_run_components(run_dir):
    """Load the trained HAN classifier saved under `run_dir`.

    Args:
        run_dir: directory containing `config.json` and `model.pt`.

    Returns:
        (model, word_map): the model moved to the selected device, and its
        word map.
    """
    from argparse import Namespace  # local import keeps this snippet self-contained

    # Load args
    # FIX: `config` was previously loaded but unused, while `args` (read
    # below for `args.cuda`) was never defined, causing a NameError.
    config = utils.load_json(os.path.join(run_dir, 'config.json'))
    args = Namespace(**config)

    # Load word_map
    # NOTE(review): `word2vec_file`, `word_map` and the hyperparameters used
    # below are free names — presumably module-level globals; verify against
    # the enclosing module.
    _, emb_size = load_word2vec_embeddings(word2vec_file, word_map)

    model = models.HierarchialAttentionNetwork(
        n_classes=n_classes,
        vocab_size=len(word_map),
        emb_size=emb_size,
        word_rnn_size=word_rnn_size,
        sentence_rnn_size=sentence_rnn_size,
        word_rnn_layers=word_rnn_layers,
        sentence_rnn_layers=sentence_rnn_layers,
        word_att_size=word_att_size,
        sentence_att_size=sentence_att_size,
        dropout=dropout)

    # Load model weights; use the GPU only if available *and* requested.
    model.load_state_dict(torch.load(os.path.join(run_dir, 'model.pt')))
    device = torch.device('cuda' if (
        torch.cuda.is_available() and args.cuda) else 'cpu')
    model = model.to(device)

    return model, word_map
예제 #2
0
def get_run_components(run_dir):
    """Reconstruct the training artifacts stored in `run_dir`.

    Returns:
        (args, model, X_tokenizer, y_tokenizer): the saved hyperparameters,
        the trained TextCNN on the selected device, and both tokenizers.
    """
    # Hyperparameters saved at train time
    args = Namespace(**utils.load_json(os.path.join(run_dir, 'config.json')))

    # Input / label tokenizers
    X_tokenizer = data.Tokenizer.load(
        fp=os.path.join(run_dir, 'X_tokenizer.json'))
    y_tokenizer = data.LabelEncoder.load(
        fp=os.path.join(run_dir, 'y_tokenizer.json'))

    # Rebuild the architecture, then restore the trained weights
    model = models.TextCNN(
        embedding_dim=args.embedding_dim,
        vocab_size=len(X_tokenizer) + 1,
        num_filters=args.num_filters,
        filter_sizes=args.filter_sizes,
        hidden_dim=args.hidden_dim,
        dropout_p=args.dropout_p,
        num_classes=len(y_tokenizer.classes))
    model.load_state_dict(torch.load(os.path.join(run_dir, 'model.pt')))

    # Use the GPU only when it is available *and* was requested
    use_cuda = torch.cuda.is_available() and args.cuda
    device = torch.device('cuda' if use_cuda else 'cpu')
    model = model.to(device)

    return args, model, X_tokenizer, y_tokenizer
예제 #3
0
async def _experiment_details(experiment_id: str = Path(
    default='latest', title="ID of experiment")):
    """Return the config, classes and performance of one experiment.

    The special id 'latest' resolves to the lexicographically greatest
    directory under `config.EXPERIMENTS_DIR`.
    """
    if experiment_id == 'latest':
        experiment_id = max(os.listdir(config.EXPERIMENTS_DIR))
    experiment_dir = os.path.join(config.EXPERIMENTS_DIR, experiment_id)

    # Assemble the payload straight from the experiment's saved artifacts
    response = {
        'message': HTTPStatus.OK.phrase,
        'status-code': HTTPStatus.OK,
        'data': {
            "classes": data.LabelEncoder.load(
                fp=os.path.join(experiment_dir, 'y_tokenizer.json')).classes,
            "args": utils.load_json(
                filepath=os.path.join(experiment_dir, 'config.json')),
            "performance": utils.load_json(
                filepath=os.path.join(experiment_dir, 'performance.json')),
        }
    }
    config.logger.info(json.dumps(response, indent=2))
    return response
예제 #4
0
def get_run_components(run_dir):
    """Load args, tokenizers, the trained Keras TextCNN, and a probe model
    exposing the conv-layer outputs, all from `run_dir`.

    Returns:
        (args, model, conv_outputs_model, X_tokenizer, y_tokenizer)
    """
    # Load args
    config = utils.load_json(os.path.join(run_dir, 'config.json'))
    args = Namespace(**config)

    # Load tokenizers.
    # FIX: only the JSON read needs the open file handle — everything below
    # was previously (and needlessly) nested inside the `with` block, keeping
    # the file open for the whole model build.
    with open(os.path.join(run_dir, 'X_tokenizer.json'), 'r') as fp:
        X_tokenizer = tokenizer_from_json(json.load(fp))
    y_tokenizer = LabelEncoder()
    y_tokenizer.classes_ = np.load(os.path.join(run_dir, 'y_tokenizer.npy'),
                                   allow_pickle=True)

    # Load model
    model = models.TextCNN(embedding_dim=args.embedding_dim,
                           vocab_size=len(X_tokenizer.word_index) + 1,
                           num_filters=args.num_filters,
                           filter_sizes=args.filter_sizes,
                           hidden_dim=args.hidden_dim,
                           dropout_p=args.dropout_p,
                           num_classes=len(y_tokenizer.classes_))
    model.summary(input_shape=(10, ))  # build it
    model_path = os.path.join(run_dir, 'model/cp.ckpt')
    model.load_weights(model_path)

    # Conv output model
    conv_outputs_model = models.ConvOutputsModel(
        vocab_size=len(X_tokenizer.word_index) + 1,
        embedding_dim=args.embedding_dim,
        filter_sizes=args.filter_sizes,
        num_filters=args.num_filters)
    conv_outputs_model.summary(input_shape=(10, ))  # build it

    # Copy the trained embedding + conv weights into the probe model.
    # NOTE(review): relies on layer 0 being the embedding and layers
    # 1..len(filter_sizes) being the conv layers, in matching order — verify
    # against models.TextCNN / models.ConvOutputsModel.
    conv_outputs_model.layers[0].set_weights(model.layers[0].get_weights())
    conv_layer_start_num = 1
    for layer_num in range(conv_layer_start_num,
                           conv_layer_start_num + len(args.filter_sizes)):
        conv_outputs_model.layers[layer_num].set_weights(
            model.layers[layer_num].get_weights())

    return args, model, conv_outputs_model, X_tokenizer, y_tokenizer
예제 #5
0
def predict(experiment_id, inputs):
    """Predict the class for a text using
    a trained model from an experiment."""
    # Resolve 'latest' to the lexicographically greatest experiment dir
    if experiment_id == 'latest':
        experiment_id = max(os.listdir(config.EXPERIMENTS_DIR))
    experiment_dir = os.path.join(config.EXPERIMENTS_DIR, experiment_id)
    experiment_config = utils.load_json(
        os.path.join(experiment_dir, 'config.json'))
    args = Namespace(**experiment_config)

    # Preprocess the raw texts with the same settings used at train time
    texts = [sample['text'] for sample in inputs]
    X_tokenizer = data.Tokenizer.load(
        fp=os.path.join(experiment_dir, 'X_tokenizer.json'))
    y_tokenizer = data.LabelEncoder.load(
        fp=os.path.join(experiment_dir, 'y_tokenizer.json'))
    preprocessed_texts = data.preprocess_texts(
        texts, lower=args.lower, filters=args.filters)

    # Dataset over the tokenized inputs (labels are dummy zeros)
    X_infer = np.array(X_tokenizer.texts_to_sequences(preprocessed_texts))
    y_filler = np.array([0] * len(X_infer))
    infer_set = data.TextDataset(
        X=X_infer,
        y=y_filler,
        batch_size=args.batch_size,
        max_filter_size=max(args.filter_sizes))

    # Rebuild the architecture and restore the trained weights
    model = models.TextCNN(
        embedding_dim=args.embedding_dim,
        vocab_size=len(X_tokenizer),
        num_filters=args.num_filters,
        filter_sizes=args.filter_sizes,
        hidden_dim=args.hidden_dim,
        dropout_p=args.dropout_p,
        num_classes=len(y_tokenizer.classes))
    model.load_state_dict(torch.load(os.path.join(experiment_dir, 'model.h5')))
    use_cuda = torch.cuda.is_available() and args.cuda
    device = torch.device('cuda' if use_cuda else 'cpu')
    model = model.to(device)

    # Run inference, then assemble one result dict per input sample
    y_prob, conv_outputs = predict_step(model=model,
                                        dataset=infer_set,
                                        filter_sizes=args.filter_sizes,
                                        device=device)
    results = []
    for i, raw_text in enumerate(texts):
        sample_conv_outputs = {k: v[i] for k, v in conv_outputs.items()}
        results.append({
            'raw_input': raw_text,
            'preprocessed_input':
                X_tokenizer.sequences_to_texts([X_infer[i]])[0],
            'probabilities':
                get_probability_distribution(y_prob[i], y_tokenizer.classes),
            'top_n_grams':
                get_top_n_grams(tokens=preprocessed_texts[i].split(' '),
                                conv_outputs=sample_conv_outputs,
                                filter_sizes=args.filter_sizes),
        })
    return results
# NOTE(review): dangling `elif` — the matching `if page == ...` branch is
# outside this view. This fragment renders a Streamlit "Model details" page.
elif page == 'Model details':

    st.header("All Experiments")
    # Link to the W&B project dashboard
    st.write(
        f'[https://app.wandb.ai/{project}](https://app.wandb.ai/{project})')

    st.header("Best Run")

    # Run details — read from the W&B run's private `_attrs` dict
    # (presumably a wandb.apis.public.Run; verify against caller).
    st.write(
        f"**Name**: {best_run._attrs['displayName']} ({best_run._attrs['name']})"
    )
    st.write("**Timestamp**:", best_run._attrs['createdAt'])
    st.write(
        f"**Runtime**: {best_run._attrs['summaryMetrics']['_runtime']:.1f} seconds"
    )

    # Performance metrics saved alongside the run artifacts
    st.write("**Performance**:")
    performance = utils.load_json(
        os.path.join(best_run_dir, 'performance.json'))
    st.json(performance)

    # Confusion matrix image produced at train time
    st.image(os.path.join(best_run_dir, 'confusion_matrix.png'))

    # Raw hyperparameter config of the run
    st.write("**Config**:")
    st.json(best_run._attrs['config'])
예제 #7
0
# App-level configuration: directories, their creation, and logging setup.
# Importing this module has side effects: it appends the CWD to sys.path,
# creates the logs/embeddings directories, and configures the logger.
import os
import sys

# Make the project root importable before the project import below.
sys.path.append(".")

import logging
import logging.config

from text_classification import utils


# Directories
BASE_DIR = os.getcwd()  # project root
APP_DIR = os.path.dirname(__file__)  # app root

LOGS_DIR = os.path.join(BASE_DIR, 'logs')
EMBEDDINGS_DIR = os.path.join(BASE_DIR, 'embeddings')

# Create dirs
utils.create_dirs(LOGS_DIR)
utils.create_dirs(EMBEDDINGS_DIR)

# Loggers: configure from logging.json (dictConfig schema), then fetch the
# logger named 'logger' that the config is expected to define.
log_config = utils.load_json(
    filepath=os.path.join(BASE_DIR, 'logging.json'))

logging.config.dictConfig(log_config)
logger = logging.getLogger('logger')
예제 #8
0
def predict(experiment_id, inputs):
    """Predict the class for a text using
    a trained model from an experiment.

    Args:
        experiment_id: experiment directory name, or 'latest' for the
            lexicographically greatest one under config.EXPERIMENTS_DIR.
        inputs: iterable of dicts, each with a 'text' key.

    Returns:
        A list of result dicts (raw input, preprocessed input,
        class probabilities, top n-grams) — one per input sample.
    """
    # Get experiment config
    if experiment_id == 'latest':
        experiment_id = max(os.listdir(config.EXPERIMENTS_DIR))
    experiment_dir = os.path.join(config.EXPERIMENTS_DIR, experiment_id)
    experiment_config = utils.load_json(
        os.path.join(experiment_dir, 'config.json'))
    args = Namespace(**experiment_config)

    # Tokenizers (Keras tokenizer from JSON; sklearn-style label classes from .npy)
    texts = [sample['text'] for sample in inputs]
    with open(os.path.join(experiment_dir, 'X_tokenizer.json'), 'r') as fp:
        X_tokenizer = tokenizer_from_json(json.load(fp))
    y_tokenizer = LabelEncoder()
    y_tokenizer.classes_ = np.load(os.path.join(experiment_dir,
                                                'y_tokenizer.npy'),
                                   allow_pickle=True)

    # Create dataset generator
    # NOTE(review): np.array over variable-length sequences yields an object
    # array — presumably data.DataGenerator handles padding; confirm.
    X_infer = np.array(X_tokenizer.texts_to_sequences(texts))
    preprocessed_texts = X_tokenizer.sequences_to_texts(X_infer)
    y_filler = np.array([0] * len(X_infer))  # dummy labels for inference
    inference_generator = data.DataGenerator(X=X_infer,
                                             y=y_filler,
                                             batch_size=args.batch_size,
                                             max_filter_size=max(
                                                 args.filter_sizes))

    # Load model (rebuild the architecture, then restore checkpoint weights)
    model = models.TextCNN(embedding_dim=args.embedding_dim,
                           vocab_size=len(X_tokenizer.word_index) + 1,
                           num_filters=args.num_filters,
                           filter_sizes=args.filter_sizes,
                           hidden_dim=args.hidden_dim,
                           dropout_p=args.dropout_p,
                           num_classes=len(y_tokenizer.classes_))
    model.summary(input_shape=(10, ))  # build it
    model_path = os.path.join(experiment_dir, 'model/cp.ckpt')
    model.load_weights(model_path)

    # Conv output model — probe exposing the conv-layer activations
    conv_outputs_model = models.ConvOutputsModel(
        vocab_size=len(X_tokenizer.word_index) + 1,
        embedding_dim=args.embedding_dim,
        filter_sizes=args.filter_sizes,
        num_filters=args.num_filters)
    conv_outputs_model.summary(input_shape=(10, ))  # build it

    # Set weights — copy embedding (layer 0) then each conv layer; relies on
    # the two models' layer ordering matching exactly.
    conv_outputs_model.layers[0].set_weights(model.layers[0].get_weights())
    conv_layer_start_num = 1
    for layer_num in range(conv_layer_start_num,
                           conv_layer_start_num + len(args.filter_sizes)):
        conv_outputs_model.layers[layer_num].set_weights(
            model.layers[layer_num].get_weights())

    # Predict
    results = []
    y_prob = model.predict(x=inference_generator, verbose=1)
    conv_outputs = conv_outputs_model.predict(x=inference_generator, verbose=1)
    for index in range(len(X_infer)):
        results.append({
            'raw_input':
            texts[index],
            'preprocessed_input':
            preprocessed_texts[index],
            'probabilities':
            get_probability_distribution(y_prob[index], y_tokenizer.classes_),
            'top_n_grams':
            # NOTE(review): `conv_outputs` is passed whole here rather than
            # indexed per sample (e.g. conv_outputs[index]) — verify that
            # get_top_n_grams expects the full batch output.
            get_top_n_grams(tokens=preprocessed_texts[index].split(' '),
                            conv_outputs=conv_outputs,
                            filter_sizes=args.filter_sizes)
        })

    return results