Example #1
 def read_data(self, fn, verbose=True, column_no=-1):
     word_sequences = list()
     tag_sequences = list()
     with codecs.open(fn, 'r', 'utf-8') as f:
         lines = f.readlines()
     curr_words = list()
     curr_tags = list()
     for k in range(len(lines)):
         line = lines[k].strip()
         if len(line) == 0 or line.startswith('-DOCSTART-'):  # new sentence or new document
             if len(curr_words) > 0:
                 word_sequences.append(curr_words)
                 tag_sequences.append(curr_tags)
                 curr_words = list()
                 curr_tags = list()
             continue
         strings = line.split(' ')
         word = strings[0]
         tag = strings[column_no]  # by default, take the last column as the tag
         curr_words.append(word)
         curr_tags.append(tag)
         if k == len(lines) - 1:
             word_sequences.append(curr_words)
             tag_sequences.append(curr_tags)
     if verbose:
         print('Loading from %s: %d samples, %d words.' %
               (fn, len(word_sequences), get_words_num(word_sequences)))
     return word_sequences, tag_sequences
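Each example reports its size through a get_words_num helper that is not shown in these snippets. A minimal sketch of what it presumably does (the body below is an assumption: it simply counts tokens across all sentences):

    def get_words_num(word_sequences):
        # assumed helper: total number of tokens over all loaded sentences
        return sum(len(words) for words in word_sequences)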
Example #2
 def read_data(self, fn, verbose=True, column_no=-1):
     word_sequences = list()
     tag_sequences = list()
     curr_words = list()
     curr_tags = list()
     with codecs.open(fn, 'r', 'utf-8') as f:
         lines = f.readlines()
     for k, line in enumerate(lines):
         elements = line.strip().split('\t')
         if len(elements) < 3:  # short/blank line marks the end of a sentence or document
             if len(curr_words) > 0:
                 word_sequences.append(curr_words)
                 tag_sequences.append(curr_tags)
                 curr_words = list()
                 curr_tags = list()
             continue
         word = elements[1]
         tag = elements[2].split(':')[0]  # keep only the tag part before ':'
         curr_words.append(word)
         curr_tags.append(tag)
     if len(curr_words) > 0:  # flush the last sentence if the file lacks a trailing separator
         word_sequences.append(curr_words)
         tag_sequences.append(curr_tags)
     if verbose:
         print('Loading from %s: %d samples, %d words.' %
               (fn, len(word_sequences), get_words_num(word_sequences)))
     return word_sequences, tag_sequences
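Example #2 reads tab-separated lines: the word comes from the second field and the tag from the third, with anything after ':' in the tag dropped; lines with fewer than three fields act as sentence separators. A hypothetical call (the DataIO class name and the file path are assumptions, not part of the snippet):

    reader = DataIO()
    word_sequences, tag_sequences = reader.read_data('train.tsv')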
Example #3
    def read_train_dev_test(self, args):
        def tokenize(tokenizer, text):
            sequence = [
                token.text for token in tokenizer(text.decode('ascii'))
            ]
            return sequence

        path = args.data_dir
        data = []
        labels = []
        f_names = ['rt-polarity.neg', 'rt-polarity.pos']

        train_dir = os.path.join(args.data_dir, 'train')
        dev_dir = os.path.join(args.data_dir, 'dev')
        test_dir = os.path.join(args.data_dir, 'test')
        # create the output directories; ignore ones that already exist
        for d in (train_dir, dev_dir, test_dir):
            os.makedirs(d, exist_ok=True)

        if args.save_data:

            print("Loading tokenizer...")
            tokenizer = en_core_web_lg.load()

            for (l, f) in enumerate(f_names):
                for line in open(os.path.join(path, f), 'rb'):
                    try:
                        line.decode('utf-8')
                    except UnicodeDecodeError:
                        # skip lines that are not valid UTF-8
                        continue
                    data.append(line.strip())
                    labels.append('pos' if l else 'neg')

            # seed set in main.py
            train, test, train_labels, test_labels = sklearn.model_selection.train_test_split(
                data, labels, test_size=.2)
            train, val, train_labels, val_labels = sklearn.model_selection.train_test_split(
                train, train_labels, test_size=.1)
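            # net effect of the two splits: roughly 72% train, 8% dev, 20% test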

            word_sequences_train = [
                tokenize(tokenizer, text) for text in train
            ]
            word_sequences_dev = [tokenize(tokenizer, text) for text in val]
            word_sequences_test = [tokenize(tokenizer, text) for text in test]

            pickle.dump(
                word_sequences_train,
                open(os.path.join(train_dir, 'word_sequences') + '.pkl', 'wb'))
            pickle.dump(train_labels,
                        open(os.path.join(train_dir, 'labels') + '.pkl', 'wb'))
            pickle.dump(
                word_sequences_dev,
                open(os.path.join(dev_dir, 'word_sequences') + '.pkl', 'wb'))
            pickle.dump(val_labels,
                        open(os.path.join(dev_dir, 'labels') + '.pkl', 'wb'))
            pickle.dump(
                word_sequences_test,
                open(os.path.join(test_dir, 'word_sequences') + '.pkl', 'wb'))
            pickle.dump(test_labels,
                        open(os.path.join(test_dir, 'labels') + '.pkl', 'wb'))

        else:
            print("Loading data from .pkl's in %s" % args.data_dir)
            word_sequences_train = pickle.load(
                open(os.path.join(train_dir, 'word_sequences') + '.pkl', 'rb'))
            train_labels = pickle.load(
                open(os.path.join(train_dir, 'labels') + '.pkl', 'rb'))
            word_sequences_dev = pickle.load(
                open(os.path.join(dev_dir, 'word_sequences') + '.pkl', 'rb'))
            val_labels = pickle.load(
                open(os.path.join(dev_dir, 'labels') + '.pkl', 'rb'))
            word_sequences_test = pickle.load(
                open(os.path.join(test_dir, 'word_sequences') + '.pkl', 'rb'))
            test_labels = pickle.load(
                open(os.path.join(test_dir, 'labels') + '.pkl', 'rb'))

        if args.verbose:
            for name, word_sequences in zip(['train', 'dev', 'test'], [
                    word_sequences_train, word_sequences_dev,
                    word_sequences_test
            ]):
                print(
                    'Loading from %s: %d samples, %d words.' %
                    (name, len(word_sequences), get_words_num(word_sequences)))

        return word_sequences_train, train_labels, word_sequences_dev, val_labels, word_sequences_test, \
               test_labels
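Example #3 reads its configuration from an args object and assumes the surrounding module imports os, pickle, sklearn.model_selection, and spaCy's en_core_web_lg model. A hypothetical invocation (the DataIO class name and the argument values are assumptions; rt-polarity.neg and rt-polarity.pos must already sit in data_dir):

    from types import SimpleNamespace

    # hypothetical args exposing the fields the method reads: data_dir, save_data, verbose
    args = SimpleNamespace(data_dir='data/rt-polarity', save_data=True, verbose=True)
    reader = DataIO()
    (word_sequences_train, train_labels,
     word_sequences_dev, val_labels,
     word_sequences_test, test_labels) = reader.read_train_dev_test(args)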
Example #4
    def read_data(self, args, corpus_type, fn, verbose=True, column_no=-1):
        mode = '#'
        column_no = -1  # note: this overrides the column_no argument

        src_data = []
        data = []
        label = []

        src_data_sentence = []
        data_sentence = []
        label_sentence = []

        with codecs.open(fn, 'r', 'utf-8') as f:
            lines = f.readlines()
        for line in lines:
            # print('line',line)
            # for k in range(len(lines)):
            #     line = str(line, 'utf-8')
            # strip line endings, then split columns on double spaces (via the '#' marker)
            line_t = line.replace('\n', '').replace('\r', '').replace('  ', '#').split('#')
            if len(line_t) < 3:
                if len(data_sentence) == 0:
                    continue
                src_data.append(src_data_sentence)
                data.append(data_sentence)
                label.append(label_sentence)
                src_data_sentence = []
                data_sentence = []
                label_sentence = []
                continue
            src_word = line_t[0]
            word = line_t[1]
            src_data_sentence.append(src_word)
            data_sentence.append(word)
            label_sentence.append(line_t[2].split('_')[0])  # keep only the label part before '_'
        if verbose:
            print('Loading from %s: %d samples, %d words.' %
                  (fn, len(data), get_words_num(data)))
        datas = deepcopy(data)

        # convert each sentence into window-of-words features (bigram features unless disabled)
        window_datas = []
        for sent in data:
            if not args.if_no_bigram:
                window_datas.append(
                    self.convert2window_bigram_feature(
                        sent, win_size=args.window_size))
            else:
                window_datas.append(
                    self.convert2window_noBigram_feature(
                        sent, win_size=args.window_size))

        # if args.if_bigram:
        #     for sent in data:
        #         window_datas.append(self.convert2window_bigram_feature(sent, win_size=args.window_size) )
        # else:
        #     for sent in data:
        #         window_datas.append(self.convert2window_noBigram_feature(sent, win_size=args.window_size) )

        return window_datas, datas, label
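Example #4 turns each sentence into window features via convert2window_bigram_feature / convert2window_noBigram_feature, which are not shown here. As a rough illustration only (not the project's implementation), a plain sliding window of radius win_size around each token could be built like this:

    def window_features(sent, win_size=2, pad='<PAD>'):
        # illustrative sketch: for each position, collect the tokens in a
        # symmetric window of radius win_size, padding at sentence boundaries
        padded = [pad] * win_size + sent + [pad] * win_size
        return [padded[i:i + 2 * win_size + 1] for i in range(len(sent))]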