Example #1
def read_data(source_path: str, word_alphabet: Alphabet, max_size=None, normalize_digits=True):
    data = []
    max_length = 0
    print('Reading data from %s' % source_path)
    counter = 0
    reader = SSTReader(source_path, word_alphabet)
    inst = reader.getNext(normalize_digits=normalize_digits)
    while inst is not None and (not max_size or counter < max_size):
        counter += 1
        if counter % 10000 == 0:
            print("reading data: %d" % counter)

        sent = inst.sentence
        data.append([sent.word_ids, inst.pos_ids])
        if max_length < inst.length():
            max_length = inst.length()
        inst = reader.getNext(normalize_digits=normalize_digits)
    reader.close()
    print("Total number of data: %d" % counter)

    data_size = len(data)
    wid_inputs = np.empty([data_size, max_length], dtype=np.int64)
    lid_inputs = np.empty([data_size], dtype=np.int64)

    masks = np.zeros([data_size, max_length], dtype=np.float32)
    single = np.zeros([data_size, max_length], dtype=np.int64)
    lengths = np.empty(data_size, dtype=np.int64)

    for i, inst in enumerate(data):
        wids, lid = inst
        inst_size = len(wids)
        lengths[i] = inst_size
        # word ids
        wid_inputs[i, :inst_size] = wids
        wid_inputs[i, inst_size:] = PAD_ID_WORD
        # label id
        lid_inputs[i] = lid
        # masks
        masks[i, :inst_size] = 1.0
        for j, wid in enumerate(wids):
            if word_alphabet.is_singleton(wid):
                single[i, j] = 1

    words = torch.from_numpy(wid_inputs)
    labels = torch.from_numpy(lid_inputs)
    masks = torch.from_numpy(masks)
    single = torch.from_numpy(single)
    lengths = torch.from_numpy(lengths)

    data_tensor = {'WORD': words, 'LAB': labels, 'MASK': masks, 'SINGLE': single, 'LENGTH': lengths}
    return data_tensor, data_size
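
The keyed tensors returned above can be sliced directly for mini-batching. Below is a minimal usage sketch, assuming the surrounding module provides read_data, a word_alphabet and a train_path; the batch size and the slicing loop are illustrative assumptions, not part of the original code.

# Hypothetical consumption of the tensor dictionary; only the keys come from read_data above.
data_tensor, data_size = read_data(train_path, word_alphabet)
batch_size = 32  # arbitrary choice for this sketch
for start in range(0, data_size, batch_size):
    batch = {key: tensor[start:start + batch_size] for key, tensor in data_tensor.items()}
    words, labels, masks = batch['WORD'], batch['LAB'], batch['MASK']
    # words: [batch, max_length] padded with PAD_ID_WORD; masks are 1.0 on real tokens, 0.0 on padding
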
Example #2
def read_bucketed_data(source_path: str, word_alphabet: Alphabet, char_alphabet: Alphabet,
                       pos_alphabet: Alphabet, chunk_alphabet: Alphabet, ner_alphabet: Alphabet,
                       max_size=None, normalize_digits=True):
    data = [[] for _ in _buckets]
    max_char_length = [0 for _ in _buckets]
    print('Reading data from %s' % source_path)
    counter = 0
    reader = CoNLL03Reader(source_path, word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet)
    inst = reader.getNext(normalize_digits)
    while inst is not None and (not max_size or counter < max_size):
        counter += 1
        if counter % 10000 == 0:
            print("reading data: %d" % counter)

        inst_size = inst.length()
        sent = inst.sentence
        for bucket_id, bucket_size in enumerate(_buckets):
            if inst_size < bucket_size:
                data[bucket_id].append([sent.word_ids, sent.char_id_seqs, inst.pos_ids, inst.chunk_ids, inst.ner_ids])
                max_len = max([len(char_seq) for char_seq in sent.char_seqs])
                if max_char_length[bucket_id] < max_len:
                    max_char_length[bucket_id] = max_len
                break

        inst = reader.getNext(normalize_digits)
    reader.close()
    print("Total number of data: %d" % counter)

    bucket_sizes = [len(data[b]) for b in range(len(_buckets))]
    data_tensors = []
    for bucket_id in range(len(_buckets)):
        bucket_size = bucket_sizes[bucket_id]
        if bucket_size == 0:
            data_tensors.append((1, 1))
            continue

        bucket_length = _buckets[bucket_id]
        char_length = min(MAX_CHAR_LENGTH, max_char_length[bucket_id])
        wid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64)
        cid_inputs = np.empty([bucket_size, bucket_length, char_length], dtype=np.int64)
        pid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64)
        chid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64)
        nid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64)

        masks = np.zeros([bucket_size, bucket_length], dtype=np.float32)
        single = np.zeros([bucket_size, bucket_length], dtype=np.int64)
        lengths = np.empty(bucket_size, dtype=np.int64)

        for i, inst in enumerate(data[bucket_id]):
            wids, cid_seqs, pids, chids, nids = inst
            inst_size = len(wids)
            lengths[i] = inst_size
            # word ids
            wid_inputs[i, :inst_size] = wids
            wid_inputs[i, inst_size:] = PAD_ID_WORD
            for c, cids in enumerate(cid_seqs):
                cid_inputs[i, c, :len(cids)] = cids
                cid_inputs[i, c, len(cids):] = PAD_ID_CHAR
            cid_inputs[i, inst_size:, :] = PAD_ID_CHAR
            # pos ids
            pid_inputs[i, :inst_size] = pids
            pid_inputs[i, inst_size:] = PAD_ID_TAG
            # chunk ids
            chid_inputs[i, :inst_size] = chids
            chid_inputs[i, inst_size:] = PAD_ID_TAG
            # ner ids
            nid_inputs[i, :inst_size] = nids
            nid_inputs[i, inst_size:] = PAD_ID_TAG
            # masks
            masks[i, :inst_size] = 1.0
            for j, wid in enumerate(wids):
                if word_alphabet.is_singleton(wid):
                    single[i, j] = 1

        words = torch.from_numpy(wid_inputs)
        chars = torch.from_numpy(cid_inputs)
        pos = torch.from_numpy(pid_inputs)
        chunks = torch.from_numpy(chid_inputs)
        ners = torch.from_numpy(nid_inputs)
        masks = torch.from_numpy(masks)
        single = torch.from_numpy(single)
        lengths = torch.from_numpy(lengths)

        data_tensor = {'WORD': words, 'CHAR': chars, 'POS': pos, 'CHUNK': chunks,
                       'NER': ners, 'MASK': masks, 'SINGLE': single, 'LENGTH': lengths}
        data_tensors.append(data_tensor)
    return data_tensors, bucket_sizes
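
Because empty buckets are stored as the placeholder tuple (1, 1), a batch sampler has to skip them. Below is a minimal sketch of drawing one batch from the bucketed tensors, assuming size-proportional bucket selection; the sampling policy and the batch size are assumptions, not part of the code above.

import torch

def sample_batch(data_tensors, bucket_sizes, batch_size=16):
    # Pick a bucket with probability proportional to its size; empty buckets get probability 0.
    probs = torch.tensor(bucket_sizes, dtype=torch.float32)
    bucket_id = int(torch.multinomial(probs / probs.sum(), 1))
    bucket = data_tensors[bucket_id]
    # Draw batch_size instances without replacement from the chosen bucket.
    index = torch.randperm(bucket_sizes[bucket_id])[:batch_size]
    return {key: tensor[index] for key, tensor in bucket.items()}
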
Example #3
def create_alphabets(alphabet_directory, train_path, data_paths=None, max_vocabulary_size=100000, embedd_dict=None,
                     min_occurrence=1, normalize_digits=True):

    def expand_vocab():
        vocab_set = set(vocab_list)
        for data_path in data_paths:
            # logger.info("Processing data: %s" % data_path)
            with open(data_path, 'r') as file:
                for line in file:
                    line = line.strip()
                    if len(line) == 0:
                        continue

                    tokens = line.split(' ')
                    word = DIGIT_RE.sub("0", tokens[0]) if normalize_digits else tokens[0]
                    pos = tokens[1]
                    chunk = tokens[2]
                    ner = tokens[3]

                    pos_alphabet.add(pos)
                    chunk_alphabet.add(chunk)
                    ner_alphabet.add(ner)

                    if word not in vocab_set and (word in embedd_dict or word.lower() in embedd_dict):
                        vocab_set.add(word)
                        vocab_list.append(word)

    logger = get_logger("Create Alphabets")
    word_alphabet = Alphabet('word', default_value=True, singleton=True)
    char_alphabet = Alphabet('character', default_value=True)
    pos_alphabet = Alphabet('pos')
    chunk_alphabet = Alphabet('chunk')
    ner_alphabet = Alphabet('ner')

    if not os.path.isdir(alphabet_directory):
        logger.info("Creating Alphabets: %s" % alphabet_directory)

        char_alphabet.add(PAD_CHAR)
        pos_alphabet.add(PAD_POS)
        chunk_alphabet.add(PAD_CHUNK)
        ner_alphabet.add(PAD_NER)

        vocab = defaultdict(int)
        with open(train_path, 'r') as file:
            for line in file:
                line = line.strip()
                if len(line) == 0:
                    continue

                tokens = line.split(' ')
                for char in tokens[0]:
                    char_alphabet.add(char)

                word = DIGIT_RE.sub("0", tokens[0]) if normalize_digits else tokens[0]
                vocab[word] += 1

                pos = tokens[1]
                pos_alphabet.add(pos)

                chunk = tokens[2]
                chunk_alphabet.add(chunk)

                ner = tokens[3]
                ner_alphabet.add(ner)

        # collect singletons
        singletons = set([word for word, count in vocab.items() if count <= min_occurrence])

        # if a word (including singletons) also appears in the pretrained embedding dict,
        # boost its count by min_occurrence so it survives the rare-word cutoff
        if embedd_dict is not None:
            assert isinstance(embedd_dict, OrderedDict)
            for word in vocab.keys():
                if word in embedd_dict or word.lower() in embedd_dict:
                    vocab[word] += min_occurrence

        vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
        logger.info("Total Vocabulary Size: %d" % len(vocab_list))
        logger.info("Total Singleton Size:  %d" % len(singletons))
        vocab_list = [word for word in vocab_list if word in _START_VOCAB or vocab[word] > min_occurrence]
        logger.info("Total Vocabulary Size (w.o rare words): %d" % len(vocab_list))

        if len(vocab_list) > max_vocabulary_size:
            vocab_list = vocab_list[:max_vocabulary_size]

        if data_paths is not None and embedd_dict is not None:
            expand_vocab()

        for word in vocab_list:
            word_alphabet.add(word)
            if word in singletons:
                word_alphabet.add_singleton(word_alphabet.get_index(word))

        word_alphabet.save(alphabet_directory)
        char_alphabet.save(alphabet_directory)
        pos_alphabet.save(alphabet_directory)
        chunk_alphabet.save(alphabet_directory)
        ner_alphabet.save(alphabet_directory)
    else:
        word_alphabet.load(alphabet_directory)
        char_alphabet.load(alphabet_directory)
        pos_alphabet.load(alphabet_directory)
        chunk_alphabet.load(alphabet_directory)
        ner_alphabet.load(alphabet_directory)

    word_alphabet.close()
    char_alphabet.close()
    pos_alphabet.close()
    chunk_alphabet.close()
    ner_alphabet.close()
    logger.info("Word Alphabet Size (Singleton): %d (%d)" % (word_alphabet.size(), word_alphabet.singleton_size()))
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
    logger.info("Chunk Alphabet Size: %d" % chunk_alphabet.size())
    logger.info("NER Alphabet Size: %d" % ner_alphabet.size())
    return word_alphabet, char_alphabet, pos_alphabet, chunk_alphabet, ner_alphabet
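
For reference, the space-separated column layout that the parsing loops above assume (tokens[0] through tokens[3], one token per line, a blank line between sentences); the concrete tokens are purely illustrative.

# Illustrative CoNLL-03-style input, columns separated by single spaces as required by line.split(' '):
#
#   U.N. NNP I-NP I-ORG
#   official NN I-NP O
#   Ekeus NNP I-NP I-PER
#   heads VBZ I-VP O
#
# tokens[0] = word, tokens[1] = POS tag, tokens[2] = chunk tag, tokens[3] = NER tag
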
Example #4
def create_alphabets(alphabet_directory, train_path, data_paths=None, max_vocabulary_size=100000, embedd_dict=None,
                     min_occurrence=1, normalize_digits=True, unk_rank=5):

    def expand_vocab():
        vocab_set = set(vocab_list)
        for data_path in data_paths:
            # logger.info("Processing data: %s" % data_path)
            with open(data_path, 'r') as file:
                for line in file:
                    line = line.strip()
                    if len(line) == 0:
                        continue

                    tokens = line.split('\t')
                    for char in tokens[1]:
                        char_alphabet.add(char)

                    word = DIGIT_RE.sub("0", tokens[1]) if normalize_digits else tokens[1]
                    pos = tokens[4]
                    type = tokens[7]

                    pos_alphabet.add(pos)
                    type_alphabet.add(type)

                    if word not in vocab_set and (word in embedd_dict or word.lower() in embedd_dict):
                        vocab_set.add(word)
                        vocab_list.append(word)

    logger = get_logger("Create Alphabets")
    word_alphabet = Alphabet('word', singleton=True)
    char_alphabet = Alphabet('character')
    pos_alphabet = Alphabet('pos')
    type_alphabet = Alphabet('type')

    logger.info("Creating Alphabets: %s" % alphabet_directory)

    char_alphabet.add(PAD_CHAR)
    pos_alphabet.add(PAD_POS)
    type_alphabet.add(PAD_TYPE)

    char_alphabet.add(ROOT_CHAR)
    pos_alphabet.add(ROOT_POS)
    type_alphabet.add(ROOT_TYPE)

    char_alphabet.add(END_CHAR)
    pos_alphabet.add(END_POS)
    type_alphabet.add(END_TYPE)

    vocab = defaultdict(int)

    # collect every word together with its position in the sentence; used later to assign UNK signatures to singletons
    word_collect = []
    with open(train_path, 'r') as file:
        words = []
        position = 0
        for line in file:
            line = line.strip()
            if len(line) == 0:
                position = 0
                word_collect.append(words)
                words = []
                continue

            tokens = line.split('\t')
            for char in tokens[1]:
                char_alphabet.add(char)

            word = DIGIT_RE.sub("0", tokens[1]) if normalize_digits else tokens[1]
            vocab[word] += 1
            words.append((word, position))
            position += 1

            pos = tokens[4]
            pos_alphabet.add(pos)

            type = tokens[7]
            type_alphabet.add(type)

        # flush the last sentence in case the file does not end with a blank line
        if words:
            word_collect.append(words)

    # collect singletons
    singletons = set([word for word, count in vocab.items() if count <= min_occurrence])

    # if a word (including singletons) also appears in the pretrained embedding dict,
    # boost its count by min_occurrence so it survives the rare-word cutoff
    if embedd_dict is not None:
        assert isinstance(embedd_dict, OrderedDict)
        for word in vocab.keys():
            if word in embedd_dict or word.lower() in embedd_dict:
                vocab[word] += min_occurrence

    vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
    logger.info("Total Vocabulary Size: %d" % len(vocab_list))
    logger.info("Total Singleton Size:  %d" % len(singletons))
    multi_vocab = [word for word in vocab_list if word in _START_VOCAB or vocab[word] > min_occurrence]
    logger.info("Total Vocabulary Size (w.o rare words): %d" % len(multi_vocab))

    if len(vocab_list) > max_vocabulary_size:
        vocab_list = vocab_list[:max_vocabulary_size]

    if data_paths is not None and embedd_dict is not None:
        expand_vocab()

    for word in vocab_list:
        if word in multi_vocab:
            word_alphabet.add(word)
        elif word in singletons:
            word_alphabet.add_singleton(word_alphabet.get_index(word))
        else:
            raise ValueError("Error word: " + word)

    # unk refiner
    unk_refiner = UNKRefiner(level=unk_rank, alphabet=word_alphabet)
    for words in word_collect:
        for word, position in words:
            if word in singletons:
                unk_signature = unk_refiner.refine(word, position)
                word_alphabet.add(unk_signature)

    word_alphabet.close()
    char_alphabet.close()
    pos_alphabet.close()
    type_alphabet.close()
    logger.info("Word Alphabet Size (Singleton): %d (%d)" % (word_alphabet.size(), word_alphabet.singleton_size()))
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("POS Alphabet Size: %d" % pos_alphabet.size())
    logger.info("Type Alphabet Size: %d" % type_alphabet.size())
    return word_alphabet, char_alphabet, pos_alphabet, type_alphabet
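
For reference, the tab-separated CoNLL-X columns this function reads (only indices 1, 4 and 7 are consumed); the concrete line is purely illustrative.

# Illustrative CoNLL-X-style line (fields separated by tabs in the real file, as required by line.split('\t')):
#
#   1  Pierre  _  NNP  NNP  _  2  NMOD  _  _
#
# tokens[1] = word form, tokens[4] = POS tag, tokens[7] = dependency relation;
# the remaining columns (id, lemma, coarse POS, feats, head, ...) are ignored by create_alphabets.
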
Example #5
def read_data(source_path: str, word_alphabet: Alphabet, char_alphabet: Alphabet, pos_alphabet: Alphabet, type_alphabet: Alphabet,
              max_size=None, normalize_digits=True, symbolic_root=False, symbolic_end=False):
    data = []
    max_length = 0
    max_char_length = 0
    print('Reading data from %s' % source_path)
    counter = 0
    reader = CoNLLXReader(source_path, word_alphabet, char_alphabet, pos_alphabet, type_alphabet)
    inst = reader.getNext(normalize_digits=normalize_digits, symbolic_root=symbolic_root, symbolic_end=symbolic_end)
    while inst is not None and (not max_size or counter < max_size):
        counter += 1
        if counter % 10000 == 0:
            print("reading data: %d" % counter)

        sent = inst.sentence
        data.append([sent.word_ids, sent.char_id_seqs, inst.pos_ids, inst.heads, inst.type_ids])
        max_len = max([len(char_seq) for char_seq in sent.char_seqs])
        if max_char_length < max_len:
            max_char_length = max_len
        if max_length < inst.length():
            max_length = inst.length()
        inst = reader.getNext(normalize_digits=normalize_digits, symbolic_root=symbolic_root, symbolic_end=symbolic_end)
    reader.close()
    print("Total number of data: %d" % counter)

    data_size = len(data)
    char_length = min(MAX_CHAR_LENGTH, max_char_length)
    wid_inputs = np.empty([data_size, max_length], dtype=np.int64)
    cid_inputs = np.empty([data_size, max_length, char_length], dtype=np.int64)
    pid_inputs = np.empty([data_size, max_length], dtype=np.int64)
    hid_inputs = np.empty([data_size, max_length], dtype=np.int64)
    tid_inputs = np.empty([data_size, max_length], dtype=np.int64)

    masks = np.zeros([data_size, max_length], dtype=np.float32)
    single = np.zeros([data_size, max_length], dtype=np.int64)
    lengths = np.empty(data_size, dtype=np.int64)

    for i, inst in enumerate(data):
        wids, cid_seqs, pids, hids, tids = inst
        inst_size = len(wids)
        lengths[i] = inst_size
        # word ids
        wid_inputs[i, :inst_size] = wids
        wid_inputs[i, inst_size:] = PAD_ID_WORD
        for c, cids in enumerate(cid_seqs):
            cid_inputs[i, c, :len(cids)] = cids
            cid_inputs[i, c, len(cids):] = PAD_ID_CHAR
        cid_inputs[i, inst_size:, :] = PAD_ID_CHAR
        # pos ids
        pid_inputs[i, :inst_size] = pids
        pid_inputs[i, inst_size:] = PAD_ID_TAG
        # type ids
        tid_inputs[i, :inst_size] = tids
        tid_inputs[i, inst_size:] = PAD_ID_TAG
        # heads
        hid_inputs[i, :inst_size] = hids
        hid_inputs[i, inst_size:] = PAD_ID_TAG
        # masks
        masks[i, :inst_size] = 1.0
        for j, wid in enumerate(wids):
            if word_alphabet.is_singleton(wid):
                single[i, j] = 1

    words = torch.from_numpy(wid_inputs)
    chars = torch.from_numpy(cid_inputs)
    pos = torch.from_numpy(pid_inputs)
    heads = torch.from_numpy(hid_inputs)
    types = torch.from_numpy(tid_inputs)
    masks = torch.from_numpy(masks)
    single = torch.from_numpy(single)
    lengths = torch.from_numpy(lengths)

    data_tensor = {'WORD': words, 'CHAR': chars, 'POS': pos, 'HEAD': heads, 'TYPE': types,
                   'MASK': masks, 'SINGLE': single, 'LENGTH': lengths}
    return data_tensor, data_size
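
A minimal usage sketch of the returned tensors, assuming alphabets built by create_alphabets and a placeholder file path; only the dictionary keys, shapes and padding values come from the function above.

# Hypothetical call; 'dev.conllx' and the alphabet variables are placeholders.
data_tensor, data_size = read_data('dev.conllx', word_alphabet, char_alphabet,
                                   pos_alphabet, type_alphabet, symbolic_root=True)
words = data_tensor['WORD']    # [data_size, max_length], padded with PAD_ID_WORD
heads = data_tensor['HEAD']    # gold head indices, padded with PAD_ID_TAG
masks = data_tensor['MASK']    # 1.0 on real tokens, 0.0 on padding
num_tokens = int(masks.sum())  # number of non-padding tokens in the data set
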
Example #6
def create_alphabets(alphabet_directory, train_path, data_paths=None, max_vocabulary_size=100000, embedd_dict=None,
                     min_occurrence=1, normalize_digits=True):
    def expand_vocab():
        vocab_set = set(vocab_list)
        for data_path in data_paths:
            with open(data_path, 'r') as file:
                for line in file:
                    line = line.strip()
                    if len(line) == 0:
                        continue
                    tokens = line.split('\t')[0].split(' ')

                    for token in tokens:
                        word = DIGIT_RE.sub("0", token) if normalize_digits else token

                        if word not in vocab_set and (word in embedd_dict or word.lower() in embedd_dict):
                            vocab_set.add(word)
                            vocab_list.append(word)

    logger = get_logger("Create Alphabets")
    word_alphabet = Alphabet('word', singleton=True)
    if not os.path.isdir(alphabet_directory):
        logger.info("Creating Alphabets: %s" % alphabet_directory)
        vocab = defaultdict(int)
        with open(train_path, 'r') as file:
            for line in file:
                line = line.strip()
                if len(line) == 0:
                    continue

                tokens = line.split('\t')[0].split(' ')
                for token in tokens:
                    word = DIGIT_RE.sub("0", token) if normalize_digits else token
                    vocab[word] += 1

        # collect singletons
        singletons = set([word for word, count in vocab.items() if count <= min_occurrence])

        # if a word (including singletons) also appears in the pretrained embedding dict,
        # boost its count by min_occurrence so it survives the rare-word cutoff
        if embedd_dict is not None:
            assert isinstance(embedd_dict, OrderedDict)
            for word in vocab.keys():
                if word in embedd_dict or word.lower() in embedd_dict:
                    vocab[word] += min_occurrence

        vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
        logger.info("Total Vocabulary Size: %d" % len(vocab_list))
        logger.info("Total Singleton Size:  %d" % len(singletons))
        multi_vocab = [word for word in vocab_list if word in _START_VOCAB or vocab[word] > min_occurrence]
        logger.info("Total Vocabulary Size (w.o rare words): %d" % len(multi_vocab))

        if len(vocab_list) > max_vocabulary_size:
            vocab_list = vocab_list[:max_vocabulary_size]

        if data_paths is not None and embedd_dict is not None:
            expand_vocab()
        for word in vocab_list:
            if word in multi_vocab:
                word_alphabet.add(word)
            elif word in singletons:
                word_alphabet.add_singleton(word_alphabet.get_index(word))
            else:
                raise ValueError("Error word: " + word)
        refiner = UNKRefiner(0, word_alphabet)
        # TODO: the token position is hard-coded to 0 here; the real position should be passed to refine()
        for word in singletons:
            unk_signature = refiner.refine(word, 0)
            word_alphabet.add(unk_signature)

        word_alphabet.save(alphabet_directory)
    else:
        word_alphabet.load(alphabet_directory)

    word_alphabet.close()
    logger.info("Word Alphabet Size (Singleton): %d (%d)" % (word_alphabet.size(), word_alphabet.singleton_size()))
    return word_alphabet
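
For reference, the line format the vocabulary loop above assumes: the first tab-separated field is the space-tokenized sentence, and anything after the first tab (e.g. a label) is ignored by this function; the concrete line is purely illustrative.

# Illustrative input line (a single tab separates the two fields):
#
#   the movie was surprisingly good<TAB>3
#
# line.split('\t')[0].split(' ') yields the word tokens; the trailing field is not read here.
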
Example #7
def read_bucketed_data(source_path: str, word_alphabet: Alphabet,
                       max_size=None, normalize_digits=True):
    data = [[] for _ in _buckets]
    print('Reading data from %s' % source_path)
    counter = 0
    reader = SSTReader(source_path, word_alphabet)
    inst = reader.getNext(normalize_digits=normalize_digits)
    while inst is not None and (not max_size or counter < max_size):
        counter += 1
        if counter % 10000 == 0:
            print("reading data: %d" % counter)

        inst_size = inst.length()
        sent = inst.sentence
        for bucket_id, bucket_size in enumerate(_buckets):
            if inst_size < bucket_size:
                data[bucket_id].append([sent.word_ids, inst.pos_ids])
                break

        inst = reader.getNext(normalize_digits=normalize_digits)
    reader.close()
    print("Total number of data: %d" % counter)

    bucket_sizes = [len(data[b]) for b in range(len(_buckets))]
    data_tensors = []
    for bucket_id in range(len(_buckets)):
        bucket_size = bucket_sizes[bucket_id]
        if bucket_size == 0:
            data_tensors.append((1, 1))
            continue

        bucket_length = _buckets[bucket_id]
        wid_inputs = np.empty([bucket_size, bucket_length], dtype=np.int64)
        lid_inputs = np.empty([bucket_size], dtype=np.int64)

        masks = np.zeros([bucket_size, bucket_length], dtype=np.float32)
        single = np.zeros([bucket_size, bucket_length], dtype=np.int64)
        lengths = np.empty(bucket_size, dtype=np.int64)

        for i, inst in enumerate(data[bucket_id]):
            wids, lid = inst
            inst_size = len(wids)
            lengths[i] = inst_size
            # word ids
            wid_inputs[i, :inst_size] = wids
            wid_inputs[i, inst_size:] = PAD_ID_WORD
            # label ids
            lid_inputs[i] = lid
            # masks
            masks[i, :inst_size] = 1.0
            for j, wid in enumerate(wids):
                if word_alphabet.is_singleton(wid):
                    single[i, j] = 1

        words = torch.from_numpy(wid_inputs)
        labels = torch.from_numpy(lid_inputs)
        masks = torch.from_numpy(masks)
        single = torch.from_numpy(single)
        lengths = torch.from_numpy(lengths)

        data_tensor = {'WORD': words, 'LAB': labels, 'MASK': masks, 'SINGLE': single, 'LENGTH': lengths}
        data_tensors.append(data_tensor)
    return data_tensors, bucket_sizes