Example #1
def prepare_data_and_model(Model, args, using_gpu=True):

    if args.test:
        ## # narvi
        #train_path = "/home/zhouy/thesis/data/text_classification_data/train_try.csv"
        #test_path = "/home/zhouy/thesis/data/text_classification_data/test_try.csv"

        # tut thinkstation
        # train_path = "/media/yi/harddrive/codes/thesis_sentimentAnalysis/data/text_classification_data/train_try.csv"
        # test_path = "/media/yi/harddrive/codes/thesis_sentimentAnalysis/data/text_classification_data/test_try.csv"

        # # tripadvisor dataset
        # # xps
        test_path = "D:\\sentimentAnalysis\\data\\text_classification_data\\test_model_data\\rev_sent_5_score_train_test\\tripadvisor\\test_try.csv"
        train_path = "D:\\sentimentAnalysis\\data\\text_classification_data\\test_model_data\\rev_sent_5_score_train_test\\tripadvisor\\train_try.csv"

    else:
        # original dataset

        # # narvi
        #train_path = "/home/zhouy/thesis/data/text_classification_data/tripadvisor_train_dataset.csv"
        #test_path = "/home/zhouy/thesis/data/text_classification_data/tripadvisor_test_dataset.csv"

        # # tut thinkstation
        # train_path = "/home/yi/sentimentAnalysis/algos/5_ToxicCommentClassification-pytorch/data/train.csv"
        # test_path = "/home/yi/sentimentAnalysis/algos/5_ToxicCommentClassification-pytorch/data/test.csv"

        # # xps
        # train_path = "D:/sentimentAnalysis/algos/5_ToxicCommentClassification-pytorch/data/train.csv"
        # test_path = "D:/sentimentAnalysis/algos/5_ToxicCommentClassification-pytorch/data/test.csv"

        # tripadvisor dataset
        # xps
        train_path = "D:/sentimentAnalysis/data/text_classification_data/tripadvisor_train_dataset.csv"
        test_path = "D:/sentimentAnalysis/data/text_classification_data/tripadvisor_test_dataset.csv"

    def tokenize(text):
        filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
        trans_map = str.maketrans(filters, " " * len(filters))
        text = text.translate(trans_map)
        text = [
            tok.text for tok in spacy_en.tokenizer(text) if tok.text != ' '
        ]

        tokenized_text = []
        auxiliary_verbs = ['am', 'is', 'are', 'was', 'were', "'s"]
        for token in text:
            if token == "n't":
                tmp = 'not'
            elif token == "'ll":
                tmp = 'will'
            elif token in auxiliary_verbs:
                tmp = 'be'
            else:
                tmp = token
            tokenized_text.append(tmp)
        return tokenized_text
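
    # Illustration (assumed output with spaCy's English tokenizer): punctuation is
    # replaced by spaces, and auxiliaries/contractions are normalised, e.g.
    #   tokenize("It isn't great.") -> ['It', 'be', 'not', 'great']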

    if args.dataset == 'tripadvisor':

        TEXT = data.Field(tokenize=tokenize,
                          lower=True,
                          batch_first=True,
                          truncate_first=True)
        LABEL = data.Field(sequential=False, use_vocab=False, batch_first=True)

        test = CustomDataset(test_path,
                             text_field=TEXT,
                             label_field=LABEL,
                             test=True)

        train = CustomDataset(train_path, text_field=TEXT, label_field=LABEL)
        # should save the above train, test, these two variables.

        if args.wordembedding == "glove-6b":
            vectors = GloVe(name='6B', dim=args.embed_dim)
        elif args.wordembedding == "FastText":
            vectors = FastText(language='en')

        else:
            raise NotImplementedError

        # # FastText
        # vectors = FastText(name='6B', dim=args.embed_dim)

        vectors.unk_init = init.xavier_uniform

        # the following line raised an error:
        # TEXT.build_vocab(train, vectors=vectors, max_size=30000)

        TEXT.build_vocab(train, vectors=vectors, max_size=10000, min_freq=10)
        LABEL.build_vocab(train)
        print('train.fields', train.fields)
        print('train.name', getattr(train, 'text'))
        print('len(train)', len(train))
        print('vars(train[0])', vars(train[0]))

        # using the training corpus to create the vocabulary

        train_iter = data.Iterator(dataset=train,
                                   batch_size=args.batch_size,
                                   train=True,
                                   repeat=False,
                                   device=0 if using_gpu else -1)
        test_iter = data.Iterator(dataset=test,
                                  batch_size=args.batch_size,
                                  train=False,
                                  sort=False,
                                  device=0 if using_gpu else -1)
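        # NOTE: in the legacy torchtext API used here, device=0 selects the first
        # GPU and device=-1 keeps the batch tensors on the CPU.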

        # the number of unique words
        num_tokens = len(TEXT.vocab.itos)
        args.num_tokens = num_tokens

        dev_iter = test_iter

    elif args.dataset == 'SST':
        text_field = data.Field(batch_first=True,
                                lower=True,
                                tokenize=tokenize)
        label_field = data.Field(sequential=False, batch_first=True)

        train_data, dev_data, test_data = datasets.SST.splits(
            text_field, label_field, fine_grained=True)

        vectors = GloVe(name='6B', dim=args.embed_dim)

        text_field.build_vocab(train_data, vectors=vectors, min_freq=1)
        label_field.build_vocab(train_data)

        train_iter = data.Iterator(train_data,
                                   batch_size=args.batch_size,
                                   device=0 if using_gpu else -1,
                                   train=True,
                                   repeat=False,
                                   sort=False,
                                   shuffle=True)
        dev_iter = data.Iterator(dev_data,
                                 batch_size=args.batch_size,
                                 device=0 if using_gpu else -1,
                                 train=False,
                                 repeat=False,
                                 sort=False,
                                 shuffle=False)
        test_iter = data.Iterator(test_data,
                                  batch_size=args.batch_size,
                                  device=0 if using_gpu else -1,
                                  train=False,
                                  repeat=False,
                                  sort=False,
                                  shuffle=False)

        # train_iter, dev_iter, test_iter = sst(text_field, label_field)
        # train_iter, dev_iter, test_iter = SST.iters(batch_size=16, device=0 if using_gpu else -1, vectors="glove.6B.300d")

        # config.target_class = train_iter.dataset.NUM_CLASSES
        args.num_tokens = len(text_field.vocab)
        args.num_classes = len(label_field.vocab) - 1

        print("num_classes: ", args.num_classes)

    if args.model == "VDCNN":
        net = Model(depth=29,
                    vocabulary_size=args.num_tokens,
                    embed_size=16,
                    n_classes=args.num_classes,
                    k=2,
                    optional_shortcut=True)
    else:
        net = Model(args)
    # # copy pretrained glove word embedding into the model
    # net.embedding.weight.data.copy_(TEXT.vocab.vectors)
    if using_gpu:
        net.cuda()

    return train_iter, test_iter, net
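
A minimal usage sketch (hypothetical: it assumes an argparse-style `args` namespace and a `MyTextCNN` model class compatible with `Model(args)`, neither of which is shown above; the CSV paths are hard-coded inside the function, so those files must exist):

import argparse

# Hypothetical namespace carrying only the attributes the function reads.
args = argparse.Namespace(
    test=True,                  # use the small *_try.csv files
    dataset='tripadvisor',
    wordembedding='glove-6b',
    embed_dim=300,
    batch_size=32,
    model='TextCNN',            # anything other than "VDCNN" takes the Model(args) branch
    num_classes=5,
)

train_iter, test_iter, net = prepare_data_and_model(MyTextCNN, args, using_gpu=False)
for batch in train_iter:
    logits = net(batch.text)    # batch.text: LongTensor of token indices, (batch_size, seq_len)
    break
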
Example #2
    def __init__(self, batch_size=30, device=-1):
        self.batch_size = batch_size
        self.device = device
        #Define fields
        TEXT = data.Field(lower=True, include_lengths=False, batch_first=True)
        CHAR = data.Field(lower=True,
                          include_lengths=False,
                          batch_first=True,
                          tokenize=list)
        TEXT_C = data.NestedField(CHAR)
        LABEL = data.Field(sequential=False, use_vocab=False, batch_first=True)
        INDEX = data.Field(sequential=False, use_vocab=False, batch_first=True)
        ID = data.RawField()
        fields = [("context", TEXT), ("query", TEXT), ("label", LABEL),
                  ("context_c", TEXT_C), ("query_c", TEXT_C), ("index", INDEX)]
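        # NOTE: the order of `fields` must match the order of the values passed to
        # data.Example.fromlist() below: context, query, label, context_c, query_c, index.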
        train_data = []
        val_data = []
        dev_data = []

        #Generate examples
        print("Loading datasets...")
        print("Loading training set...")
        try:
            with open("./data/processed/train_set.data", 'rb') as f:
                train_data = pickle.load(f)
            print("Loaded training set from file.")
        except:
            print(
                "Failed to load training set from file. Processing training data..."
            )
            with open("./data/squad/train.context") as f:
                context = list(f)

            with open("./data/squad/train.question") as f:
                query = list(f)

            with open("./data/squad/train.span") as f:
                label = list(f)
                for i in range(len(label)):
                    # parse "start end" answer spans into [start, end] int pairs
                    label[i] = list(map(int, label[i].split()))

            for i in tqdm(range(len(context)), ascii=True):
                list_content = [
                    context[i], query[i], label[i], context[i], query[i], i
                ]
                train_ex = data.Example.fromlist(list_content, fields)
                train_data.append(train_ex)
            with open("./data/processed/train_set.data", 'wb') as f:
                pickle.dump(train_data, f)
        train_set = data.Dataset(train_data, fields)

        print("Loading dev set...")
        try:
            with open("./data/processed/dev_set.data", 'rb') as f:
                dev_data = pickle.load(f)
            print("Loaded dev set from file.")
        except:
            print("Failed to load dev set from file. Processing dev data...")
            with open("./data/squad/dev.context") as f:
                context = list(f)

            with open("./data/squad/dev.question") as f:
                query = list(f)

            with open("./data/squad/dev.span") as f:
                label = list(f)
                for i in range(len(label)):
                    label[i] = list(map(int, label[i].split()))

            for i in tqdm(range(len(context)), ascii=True):
                list_content = [
                    context[i], query[i], label[i], context[i], query[i], i
                ]
                dev_ex = data.Example.fromlist(list_content, fields)
                dev_data.append(dev_ex)
            with open("./data/processed/dev_set.data", 'wb') as f:
                pickle.dump(dev_data, f)
        dev_set = data.Dataset(dev_data, fields)

        print("Loading validation set...")
        try:
            with open("./data/processed/val_set.data", 'rb') as f:
                val_data = pickle.load(f)
            print("Loaded validation set from file.")
        except:
            print(
                "Failed to load validation set from file. Processing validation data..."
            )
            with open("./data/squad/val.context") as f:
                context = list(f)

            with open("./data/squad/val.question") as f:
                query = list(f)

            with open("./data/squad/val.span") as f:
                label = list(f)
                for i in range(len(label)):
                    label[i] = list(map(int, label[i].split()))

            for i in tqdm(range(len(context)), ascii=True):
                list_content = [
                    context[i], query[i], label[i], context[i], query[i], i
                ]
                val_ex = data.Example.fromlist(list_content, fields)
                val_data.append(val_ex)
            with open("./data/processed/val_set.data", 'wb') as f:
                pickle.dump(val_data, f)
        val_set = data.Dataset(val_data, fields)

        print("Loading word embeddings...")
        glove_vecs = GloVe(name='6B', dim=100)
        glove_vecs.unk_init = nn.init.xavier_uniform_

        print("Building vocabulary...")
        TEXT.build_vocab(train_set, vectors=glove_vecs)
        TEXT_C.build_vocab(train_set, min_freq=20)
        self.vocab_vec = TEXT.vocab.vectors
        print(len(self.vocab_vec), " words in word vocabulary.")
        self.char_size = len(TEXT_C.vocab)
        print(len(TEXT_C.vocab), " tokens in char vocabulary.")

        print("Generating iterator...")
        self.train_iter = iter(
            data.Iterator(train_set,
                          batch_size=self.batch_size,
                          device=self.device,
                          sort_key=lambda x: len(x.context),
                          repeat=True,
                          sort=True))
        self.dev_iter = iter(
            data.Iterator(dev_set,
                          batch_size=self.batch_size,
                          device=self.device,
                          sort_key=lambda x: len(x.context),
                          repeat=False,
                          sort=True))
        self.val_iter = iter(
            data.Iterator(val_set,
                          batch_size=self.batch_size,
                          device=self.device,
                          sort_key=lambda x: len(x.context),
                          repeat=True,
                          sort=True))
        print("DataLoader initiated.")
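
A minimal sketch of how this loader might be driven (hypothetical: the class name `SquadDataLoader` is assumed, and ./data/squad plus ./data/processed must already exist):

loader = SquadDataLoader(batch_size=32, device=-1)   # device=-1 keeps batches on the CPU

embeddings = loader.vocab_vec        # (word_vocab_size, 100) pretrained GloVe matrix
char_vocab_size = loader.char_size   # size of the character vocabulary

# train_iter repeats forever (repeat=True), so consume a fixed number of batches.
for step in range(100):
    batch = next(loader.train_iter)
    context, query = batch.context, batch.query            # word-level index tensors
    context_c, query_c = batch.context_c, batch.query_c    # character-level index tensors
    span = batch.label                                      # answer-span start/end indices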