예제 #1
0
def cbet_data(file_path='data/CBET.csv', remove_stop_words=True, get_text=True, preprocess=True, multi=False, vector=False):
    """Load the CBET emotion dataset from a CSV file.

    Every row is expected to provide a ``text`` column plus one indicator
    column per emotion named in ``emo_list``.

    :param file_path: path of the CSV file handed to ``pd.read_csv``.
    :param remove_stop_words: drop NLTK English stop words from each text
        (case-sensitive match on whitespace-separated tokens).
    :param get_text: when False no text is collected and the returned text
        list is empty.
    :param preprocess: run ``tweet_process`` on each text before stop-word
        removal.
    :param multi: when False, rows whose indicator columns do not sum to
        exactly 1 are skipped and the label is the index of the set emotion;
        when True every row is kept.
    :param vector: only consulted when ``multi`` is True. False yields the
        list of indices whose indicator equals 1; True yields the indicator
        array itself.
    :return: ``(train_text, label, emo_list, NUM_CLASS)``; when ``get_text``
        is True, ``train_text`` and ``label`` have one entry per kept row.
    """
    NUM_CLASS = 9
    emo_list = ["anger", "fear", "joy", "love", "sadness", "surprise", "thankfulness", "disgust", "guilt"]
    stop_words = set(stopwords.words('english'))

    if get_text:
        # Imported lazily (once, not per row) so the tweet processor is only
        # required when text is actually requested.
        from utils.tweet_processor import tweet_process

    label = []
    train_text = []
    df = pd.read_csv(file_path)
    for _, row in df.iterrows():
        emo_one_hot = np.asarray(row[emo_list])

        # Decide the label first: a skipped row must not leave its text
        # behind, otherwise train_text and label drift out of alignment.
        if not multi:
            if sum(emo_one_hot) != 1:
                continue
            emo_idx = np.argmax(emo_one_hot)
        elif vector:
            emo_idx = emo_one_hot
        else:
            emo_idx = np.where(emo_one_hot == 1)[0].tolist()

        if get_text:
            text = row['text']
            if preprocess:
                text = tweet_process(text)
            if remove_stop_words:
                text = ' '.join([x for x in text.split() if x not in stop_words])
            train_text.append(text)

        label.append(emo_idx)

    return train_text, label, emo_list, NUM_CLASS
def isear_data(file_path='data/ISEAR.csv',
               remove_stop_words=True,
               get_text=True,
               preprocess=True):
    """Load the ISEAR dataset through ``IsearLoader``.

    The loader is asked for the ``SIT`` attribute as free text and the
    ``EMOT`` column as target.

    :param file_path: path handed to ``IsearLoader.load_isear``.
    :param remove_stop_words: drop NLTK English stop words from each text
        (case-sensitive match on whitespace-separated tokens).
    :param get_text: when False no text is collected and the returned text
        list is empty.
    :param preprocess: run ``tweet_process`` on each text before stop-word
        removal.
    :return: ``(train_text, emo, emo_list, NUM_CLASS)`` where ``emo`` is
        whatever the loaded data's ``get_target()`` returns.
    """
    english_stop_words = set(stopwords.words('english'))
    NUM_CLASS = 7
    emo_list = ["joy", "fear", "anger", "sadness", "disgust", "shame", "guilt"]

    # NOTE(review): meaning of the third positional argument (True) is
    # defined by IsearLoader and not visible here.
    dataset = IsearLoader(['SIT'], ['EMOT'], True).load_isear(file_path)

    train_text = []
    if get_text:
        for sample in dataset.get_freetext_content():
            # Deferred import: only resolved once there is text to process.
            from utils.tweet_processor import tweet_process
            cleaned = tweet_process(sample) if preprocess else sample
            if remove_stop_words:
                kept_tokens = (tok for tok in cleaned.split()
                               if tok not in english_stop_words)
                cleaned = ' '.join(kept_tokens)
            train_text.append(cleaned)

    emo = dataset.get_target()
    return train_text, emo, emo_list, NUM_CLASS
예제 #3
0
def interactive_inference(model_token=''):
    """Classify text typed on stdin into emotions until the user types "end".

    Loads the pickled tokenizer and the saved model weights whose file names
    are built from ``model_token`` and ``opt.dataset``, moves the model to the
    GPU, then repeatedly reads a line, preprocesses and encodes it, and prints
    one ``emotion:probability`` line per label.

    :param model_token: optional string inserted into the tokenizer and model
        file names (``lstm_{model_token}{opt.dataset}_...``).
    """
    # NOTE(review): unpickling executes arbitrary code; only load tokenizer
    # files that were produced by a trusted training run.
    with open(f'lstm_{model_token}{opt.dataset}_tokenizer.pkl', 'br') as f:
        tokenizer = pkl.load(f)

    def encode_seq(src):
        """Encode text into a (1, PAD_LEN) id tensor and its unpadded length."""
        src = tokenizer.encode_ids(src)
        if len(src) < PAD_LEN:
            src_len = len(src)
            src = src + [0] * (PAD_LEN - len(src))  # right-pad with id 0
        else:
            src = src[:PAD_LEN]
            src_len = PAD_LEN

        return torch.LongTensor(src).unsqueeze(0), \
               torch.LongTensor([src_len]).unsqueeze(0)

    def softmax(x):
        """Compute softmax values for each sets of scores in x."""
        # Subtracting the maximum leaves the result unchanged mathematically
        # but keeps np.exp from overflowing on large scores.
        exps = np.exp(x - np.max(x, axis=0))
        return exps / np.sum(exps, axis=0)

    model = AttentionLSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, tokenizer.get_vocab_size(),
                                    NUM_EMO, BATCH_SIZE, att_mode=opt.attention, soft_last=False)
    with open(f'lstm_{model_token}{opt.dataset}_model.pt', 'br') as f:
        model.load_state_dict(torch.load(f))

    model.cuda()
    # Inference only: switch train-time layers (e.g. dropout, if the model has
    # any) to evaluation behaviour so predictions are deterministic.
    model.eval()
    label_cols = ['anger', 'fear', 'joy', 'love', 'sadness', 'surprise', 'thankfulness', 'disgust', 'guilt']

    while True:
        print('type "end" to terminate >>> ')
        text = input()
        if text.strip().lower() == 'end':
            break
        text = tweet_process(text)

        seq, seq_len = encode_seq(text)

        # No gradients are needed for prediction; skip building the graph.
        with torch.no_grad():
            y_pred = model(seq.cuda(), seq_len)

        y_pred = y_pred[0].detach().cpu().numpy()
        y_pred = softmax(y_pred)
        # zip stops at the shorter of label_cols and the model output.
        response = ''.join(
            emo + ":" + str(prob) + '\n'
            for emo, prob in zip(label_cols, y_pred)
            if prob > 0
        )

        print(response)
예제 #4
0
def cbet_data_other(mode='small', remove_stop_words=True, get_text=True, preprocess=True, vector=False):
    """Load one of the single-label CBET subsets from a text file.

    Each non-blank line holds the text and an integer label index separated
    by two tab characters.

    :param mode: ``'small'`` reads ``data/CBET-single-small.txt``;
        ``'median'`` reads ``data/CBET-single-medium.txt``.
    :param remove_stop_words: drop NLTK English stop words from each text
        (case-sensitive match on whitespace-separated tokens).
    :param get_text: when False no text is collected and the returned text
        list is empty.
    :param preprocess: run ``tweet_process`` on each text before stop-word
        removal.
    :param vector: False appends the integer label; True appends a one-hot
        ``np.zeros(NUM_CLASS)`` array with the label position set to 1.
    :return: ``(train_text, label, emo_list, NUM_CLASS)``
    """
    assert mode in ['small', 'median']
    NUM_CLASS = 9
    emo_list = ["anger", "fear", "joy", "love", "sadness", "surprise", "thankfulness", "disgust", "guilt"]
    stop_words = set(stopwords.words('english'))

    if get_text:
        # Imported lazily (once, not per line) so the tweet processor is only
        # required when text is actually requested.
        from utils.tweet_processor import tweet_process

    label = []
    train_text = []

    if mode == 'small':
        file_path = 'data/CBET-single-small.txt'
    else:
        file_path = 'data/CBET-single-medium.txt'

    # The context manager closes the file even if a line fails to parse.
    with open(file_path, 'r') as file_reader:
        for row in file_reader:
            row = row.strip()
            if not row:
                # Blank lines carry no sample; skipping keeps text and label
                # aligned instead of failing on the missing label field.
                continue
            tokens = row.split('\t\t')
            if get_text:
                text = tokens[0]
                if preprocess:
                    text = tweet_process(text)
                if remove_stop_words:
                    text = ' '.join([x for x in text.split() if x not in stop_words])
                train_text.append(text)

            emo = int(tokens[1])
            if not vector:
                label.append(emo)
            else:
                emo_one_hot = np.zeros(NUM_CLASS)
                emo_one_hot[emo] = 1
                label.append(emo_one_hot)

    return train_text, label, emo_list, NUM_CLASS
def tec_data(file_path='data/TEC.txt', remove_stop_words=True, get_text=True):
    """Load the TEC emotion dataset from a tab-separated text file.

    Each non-blank line must contain exactly one tab: the text on the left
    and an integer label index on the right.

    :param file_path: path of the UTF-8 text file to read.
    :param remove_stop_words: drop NLTK English stop words from each text
        (case-sensitive match on whitespace-separated tokens).
    :param get_text: when False no text is collected and the returned text
        list is empty; labels are still parsed.
    :return: ``(train_text, label, emo_list, NUM_CLASS)``
    :raises ValueError: if a non-blank line does not split into exactly two
        tab-separated fields or the label is not an integer.
    """
    NUM_CLASS = 6
    emo_list = ['anger', 'fear', 'joy', 'sadness', 'surprise', 'disgust']

    stop_words = set(stopwords.words('english'))

    if get_text:
        # Imported lazily (once, not per line) so the tweet processor is only
        # required when text is actually requested.
        from utils.tweet_processor import tweet_process

    label = []
    train_text = []
    # The context manager closes the file even if reading or parsing fails.
    with open(file_path, 'r', encoding='utf8') as f:
        for line in f:
            if not line.strip():
                # Blank lines carry no sample; skip them rather than failing
                # on the two-field unpacking below.
                continue
            text, emo = line.split('\t')
            if get_text:
                text = tweet_process(text)
                if remove_stop_words:
                    text = ' '.join(
                        [x for x in text.split() if x not in stop_words])
                train_text.append(text)
            label.append(int(emo))
    return train_text, label, emo_list, NUM_CLASS