Пример #1
0
    # checkpoint to use (from models/)
    parser.add_argument('--checkpoint', type=str, required=False)
    # max len of sequences
    parser.add_argument('--maxlen', type=int, required=False)
    args = parser.parse_args()
    args = args.__dict__
    print("Passed args:")
    print(args)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("Found device: %s" % device)

    print("loading model...")
    model = SentimentClassifier()
    model.to(device)  # Enable gpu support for the model

    checkpoint = torch.load(args["checkpoint"], map_location=device)
    model.load_state_dict(checkpoint["model_state_dict"])
    model.eval()
    tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
    print("model loaded")

    if not args["maxlen"]:
        args["maxlen"] = checkpoint["args"]["maxlen"]
        print("using maxlen from import checkpoint, new args:")
        print(args)

    print("connecting to redis...")
    redis_connection = redis.Redis(host=os.environ["REDIS_HOST"], port=os.environ["REDIS_PORT"], charset="utf-8")
    print("connected to redis")
Пример #2
0
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import pandas as pd
from model import SentimentClassifier
from dataset import SSTDataset

#Create validation set
val_set = SSTDataset(filename='data/dev.tsv', maxlen=30)
#Create validation dataloader
val_loader = DataLoader(val_set, batch_size=64, num_workers=5)
#Create the network
net = SentimentClassifier()
#CPU or GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#Put the network to the GPU if available
net = net.to(device)
#Load the state dictionary of the network
net.load_state_dict(torch.load('./models/model', map_location=device))
#Takes as the input the logits of the positive class and computes the binary cross-entropy
criterion = nn.BCEWithLogitsLoss()


def get_accuracy_from_logits(logits, labels):
    #Get a tensor of shape [B, 1, 1] with probabilities that the sentiment is positive
    probs = torch.sigmoid(logits.unsqueeze(-1))
    #Convert probabilities to predictions, 1 being positive and 0 being negative
    soft_probs = (probs > 0.5).long()
    #Check which predictions are the same as the ground truth and calculate the accuracy
    acc = (soft_probs.squeeze() == labels).float().mean()
    #Return the accuracy
    return acc
Пример #3
0
    neg_pred, neut_pred, pos_pred = predict(raw_tweet)
    response = {}

    response["response"] = {
        'sentiment': {
            "positive": str(pos_pred),
            "neutral": str(neut_pred),
            "negative": str(neg_pred),
        },
        "tweet": str(raw_tweet),
        "time_taken": str(time.time() - start_time),
    }

    return flask.jsonify(response)


@app.route('/')
def index():
    return app.send_static_file('index.html')


if __name__ == "__main__":
    MODEL = SentimentClassifier(len(CLASS_NAMES))
    TOKENIZER = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
    MODEL = nn.DataParallel(MODEL)
    MODEL.load_state_dict(
        torch.load(MODEL_PATH, map_location=torch.device(DEVICE)))
    MODEL.to(DEVICE)
    MODEL.eval()
    app.run(host='0.0.0.0')
Пример #4
0
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_train, df_val = train_test_split(df_train, test_size=0.25)

BATCH_SIZE = 8
MAX_LEN = 512

train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN,
                                       BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = SentimentClassifier(n_classes=3)
model = model.to(device)

EPOCHS = 10

optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)

loss_fn = nn.CrossEntropyLoss().to(device)

history = defaultdict(list)
best_accuracy = 0
Пример #5
0
class ModelHandler(BaseHandler):
    """
    A custom model handler implementation.
    """
    def __init__(self):
        self.model = None
        self._context = None
        self.initialized = False
        self.explain = False
        self.target = 0

    def initialize(self, ctx):
        """
        Initialize model. This will be called during model loading time
        :param context: Initial context contains model server system properties.
        :return:
        """
        self.properties = ctx.system_properties
        self.initialized = True
        #  load the model, refer 'c     ustom handler class' above for details
        self.device = torch.device("cuda:" +
                                   str(self.properties.get("gpu_id")) if torch.
                                   cuda.is_available() else "cpu")

        model_dir = self.properties.get("model_dir")

        # Read model serialize/pt file
        model_pt_path = os.path.join(model_dir, "model.bin")
        # # Read model definition file
        # model_def_path = os.path.join(model_dir, "model.py")
        # if not os.path.isfile(model_def_path):
        #     raise RuntimeError("Missing the model definition file")
        PRE_TRAINED_MODEL_NAME = 'dccuchile/bert-base-spanish-wwm-cased'

        from model import SentimentClassifier

        self.model = SentimentClassifier(2)
        self.model.to(self.device)
        self.tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

        self.model.load_state_dict(
            torch.load(model_pt_path, map_location=torch.device(self.device)))

        self.model = self.model.eval()

        self.initialized = True

        logger.debug(
            'Transformer model from path {0} loaded successfully'.format(
                model_dir))

    def preprocess(self, data):
        """
        Transform raw input into model input data.
        :param batch: list of raw requests, should match batch size
        :return: list of preprocessed model input data
        """
        # # Take the input data and make it inference ready
        # text = data[0].get("data")
        # if text is None:
        try:
            reclamo = data[0].get("body").get("data")
        except:
            reclamo = data[0].get("body")

        # logger.debug(data)
        # logger.debug(str(data))

        MAX_LEN = 450
        inputs = self.tokenizer.encode_plus(
            reclamo,
            add_special_tokens=True,
            max_length=MAX_LEN,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'review_text': reclamo,
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten()
        }

    def inference(self, model_input):
        """
        Internal inference methods
        :param model_input: transformed model input data
        :return: list of inference output in NDArray
        """
        # Do some inference call to engine here and return output

        input_ids = model_input["input_ids"].to(self.device)
        attention_mask = model_input["attention_mask"].to(self.device)

        model_output, pooled_output = self.model(
            input_ids=input_ids.unsqueeze(0),
            attention_mask=attention_mask.unsqueeze(0))

        _, preds = torch.max(model_output, dim=1)

        probs = F.softmax(model_output, dim=1)

        predicted_idx = str(preds.item())

        out_dic = {
            'idx': [predicted_idx],
            'probs': probs.detach().numpy().tolist(),
            'pooled': pooled_output.detach().numpy().tolist()
        }

        out_js = json.dumps(out_dic)

        return [out_js]

    def postprocess(self, inference_output):
        """
        Return inference result.
        :param inference_output: list of inference output
        :return: list of predict results
        """
        # Take output from network and post-process to desired format
        postprocess_output = inference_output
        return postprocess_output
def run():
    df = pd.read_csv("inputs/reviews.csv")
    df["sentiment"] = df.score.apply(rating_to_sentiment)
    df_train, df_rem = train_test_split(df,
                                        test_size=0.1,
                                        random_state=config.RANDOM_SEED)
    df_val, df_test = train_test_split(df_rem,
                                       test_size=0.5,
                                       random_state=config.RANDOM_SEED)
    train_data_loader = create_data_loader(df_train, config.TOKENIZER,
                                           config.MAX_LEN, config.BATCH_SIZE)
    val_data_loader = create_data_loader(df_val, config.TOKENIZER,
                                         config.MAX_LEN, config.BATCH_SIZE)
    test_data_loader = create_data_loader(df_test, config.TOKENIZER,
                                          config.MAX_LEN, config.BATCH_SIZE)

    # data = next(iter(val_data_loader))
    # input_ids = data["input_ids"].to(config.DEVICE)
    # attention_mask = data["attention_mask"].to(config.DEVICE)
    # bert_model = BertModel.from_pretrained(config.BERT_NAME)

    model = SentimentClassifier(num_classes=len(class_labels))
    if config.LOAD_MODEL == True:
        model.load_state_dict(torch.load("best_model_state.bin"))
    model = model.to(config.DEVICE)

    optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
    total_steps = len(train_data_loader) * config.EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)
    loss_fn = nn.CrossEntropyLoss().to(config.DEVICE)

    history = defaultdict(list)
    best_accuracy = 0

    for epoch in range(config.EPOCHS):
        print(f"Epoch {epoch + 1}/{config.EPOCHS}")
        print("-" * 10)

        train_acc, train_loss = train_fn(
            model,
            train_data_loader,
            loss_fn,
            optimizer,
            config.DEVICE,
            scheduler,
            len(df_train),
        )

        print(f"Train loss {train_loss} accuracy {train_acc}")

        val_acc, val_loss = eval_fn(model, val_data_loader, loss_fn,
                                    config.DEVICE, len(df_val))

        print(f"Val   loss {val_loss} accuracy {val_acc}")
        print()

        history["train_acc"].append(train_acc)
        history["train_loss"].append(train_loss)
        history["val_acc"].append(val_acc)
        history["val_loss"].append(val_loss)

        if val_acc > best_accuracy:
            torch.save(model.state_dict(), "best_model_state.bin")
            best_accuracy = val_acc