Example #1
def read_data_set(file_path, vocab):
    """
    Reads the data set from one of the pre-processed CSVs composed
    of columns `label` and `sentence`.

    Parameters
    ----------
    file_path : str
        Path to the CSV file.
    vocab : torchtext.Vocab
        Vocabulary to use.

    Returns
    -------
    X : torch.Tensor[num_labels x num_examples x sen_length]
        Sentences on the dataset grouped by labels.
    y : torch.Tensor[num_labels]
        Labels for each group of sentences.
    """
    sentence = Field(batch_first=True,
                     sequential=True,
                     tokenize=simple_tokenizer)
    sentence.vocab = vocab

    label = Field(is_target=True)
    label.vocab = vocab

    data_set = TabularDataset(path=file_path,
                              format='csv',
                              skip_header=True,
                              fields=[('label', label),
                                      ('sentence', sentence)])

    sentences_tensor = sentence.process(data_set.sentence)
    labels_tensor = label.process(data_set.label).squeeze()

    # Infer num_labels and group sentences by label
    num_labels = labels_tensor.unique().shape[0]
    num_examples = labels_tensor.shape[0] // num_labels
    y = labels_tensor[::num_examples]
    sen_length = sentences_tensor.shape[-1]
    X = sentences_tensor.view(num_labels, num_examples, sen_length)

    return X, y
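
A hypothetical usage sketch for the function above (not from the original source); build_vocab_from_csv stands in for however the surrounding project actually builds its torchtext Vocab, and the shapes follow the docstring.

# Hypothetical driver code.
vocab = build_vocab_from_csv('data/train.csv')      # assumed helper, defined elsewhere
X, y = read_data_set('data/train.csv', vocab)
print(X.shape)   # (num_labels, num_examples, sen_length)
print(y.shape)   # (num_labels,)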
Example #2
class EmbeddingTextVectorizer(TextVectorizer):
    """Word embeddings text vectorizer on top of torchtext"""

    __UNKNOWN__ = '<unk>'
    __PADDING__ = '<pad>'

    def __init__(self,
                 types: List[str],
                 embeddings: np.ndarray,
                 tokenizer: TextTokenizer,
                 seq_len: int = 128,
                 device: str = None):

        self.seq_len = seq_len
        self.tokenizer = tokenizer
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = device

        special_types = [
            EmbeddingTextVectorizer.__UNKNOWN__,
            EmbeddingTextVectorizer.__PADDING__
        ]
        self.types_ = special_types + types
        # Initialize embeddings for special types with random values
        special_embeddings = np.random.random(size=(len(special_types),
                                                    embeddings.shape[1]))
        embeddings = np.concatenate([special_embeddings, embeddings])
        # torch.tensor builds the fixed lookup table directly on the target
        # device; the deprecated Variable wrapper is not needed.
        embeddings = torch.tensor(embeddings, dtype=torch.float32,
                                  device=device, requires_grad=False)
        self.embeddings_ = embeddings
        self.text_field_ = Field(fix_length=seq_len,
                                 pad_token=EmbeddingTextVectorizer.__PADDING__,
                                 pad_first=True)
        # Create vocab from fake counts, reverse types for torchtext to preserve their order
        freqs = Counter(
            {t: i
             for i, t in enumerate(special_types + types[::-1])})
        self.text_field_.vocab = Vocab(counter=freqs, specials=special_types)

    def transform(self, texts: List[str]) -> torch.Tensor:
        """Transform batch of texts into (seq_len, batch_size, dim) tensor of embeddings"""
        texts = [self.tokenizer.tokenize(text) for text in texts]
        texts = self.text_field_.process(batch=texts)
        return self.embeddings_[texts]
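
A hypothetical driver for the vectorizer above (not from the original project); any object with a tokenize(str) -> List[str] method can stand in for the TextTokenizer the class expects.

class _WhitespaceTokenizer:
    def tokenize(self, text):
        return text.split()

types = ['hello', 'world']
embeddings = np.random.random((len(types), 300))
vectorizer = EmbeddingTextVectorizer(types, embeddings,
                                     tokenizer=_WhitespaceTokenizer(), seq_len=16)
batch = vectorizer.transform(['hello world', 'world'])
print(batch.shape)   # expected: torch.Size([16, 2, 300]) -> (seq_len, batch_size, dim)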
Example #3
        text = [text]
    return text    


if __name__ == '__main__':
    txt_field = Field(tokenize=lambda x: [tok.text for tok in nlp.tokenizer(x)],
                      init_token='<sos>', eos_token='<eos>', lower=True, batch_first=True)
    txt_field.build_vocab()
    save_model_path = Path(Path.cwd(), 'saved_models/')
    with open(Path(save_model_path, 'vocab_stoi.pkl'), 'rb') as file:
        txt_field.vocab.stoi = pickle.load(file)
    with open(Path(save_model_path, 'vocab_itos.pkl'), 'rb') as file:
        txt_field.vocab.itos = pickle.load(file)


    text_paths = os.listdir(Path(Path.cwd(), 'sample_docs/'))
    print(text_paths)

    batch = []
    for no, text_path in enumerate(text_paths):
        text = preprocess_text(text_path)        
        batch.extend(text)
        if len(batch) == 2 or no == len(text_paths) - 1:
            batch = txt_field.process(batch)
            print(f'How many OOV tokens per text: {[sum([oov == 0 for oov in txt]) for txt in batch]}')
            print(f'{batch.shape} shape')
            batch = []
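
In legacy torchtext, Field.process is simply padding followed by numericalization, so the txt_field.process(batch) call above is equivalent to the following sketch (same txt_field and batch as in the script):

padded = txt_field.pad(batch)            # pad/truncate and add the <sos>/<eos> tokens
tensor = txt_field.numericalize(padded)  # look the tokens up in txt_field.vocab
# tensor is exactly what txt_field.process(batch) returns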



Example #4
class Data(object):
    def __init__(self,
                 batch_size=128,
                 fix_length=32,
                 singer=None,
                 target_vocab_size=5000,
                 vector_path=VEC_PATH,
                 device=None):
        """
        用于生成歌词生成任务的数据预处理和Batch生成
        每次输入网络的数据包括:
            encoder_input:  编码器输入, shape: (batch_size, time_step, word_id)
            encoder_length: 编码器输入文本有效长度, shape: (batch_size, )
            decoder_input:  解码器输入, shape: (batch_size, time_step, word_id)
            decoder_length: 解码器输入文本有效长度, shape: (batch_size, )
            target: 解码器输出目标, 用于计算Loss, shape: (batch_size, time_step, word_id)
        :param batch_size: 每个batch的大小. 默认: 128
        :param fix_length: 每个序列的最大长度, 长度不足的句子会用"<pad>"补齐, 超过的句子会被截断. 默认: 32
        :param singer: 为None时读取所有歌曲; 否则只读取对应歌手的歌曲. 默认: None
        :param target_vocab_size: 目标词典(解码器输出)的长度, 在输出端(目标)只保留词频最高的前 target_vocab_size 个词语,
                            其它词语都会被"<unk>"替换. 默认: 5000
        :param vector_path: word2vec模型的路径. PS: 必须是.txt格式的文件
        :param device: 设备, "cuda"或"cpu". 默认: None, 自动选择"cuda"或"cpu"
        """
        self.batch_size = batch_size
        self.fix_length = fix_length
        self.singer = singer
        self.target_vocab_size = target_vocab_size
        self.vector_path = vector_path
        self.DEVICE = device or torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.tokenize = lambda x: jieba.lcut(x, HMM=False)  # word segmentation with jieba

        # Define the three torchtext Field objects used for text preprocessing. The ENCODER does not
        # actually need the start token "<go>" or the end token "<eos>", but to keep text encoding and
        # decoding consistent across the three Fields they are declared here and removed again once the
        # vocabulary mappings have been built; see self._build_vocab for details.
        self.ENCODER = Field(
            sequential=True,
            tokenize=self.tokenize,
            batch_first=True,  # the first dimension of the data is the batch (default is time_step)
            fix_length=self.fix_length,  # fixed sentence length: shorter sentences are padded with "<pad>", longer ones truncated
            include_lengths=True,  # besides the encoded text, also return the sentence lengths
            init_token="<go>",  # "<go>" is automatically prepended to each sentence
            eos_token="<eos>")  # "<eos>" is automatically appended to each sentence
        self.DECODER = Field(sequential=True,
                             tokenize=self.tokenize,
                             batch_first=True,
                             fix_length=self.fix_length,
                             include_lengths=True,
                             init_token="<go>",
                             eos_token="<eos>")
        self.TARGET = Field(
            sequential=True,
            tokenize=self.tokenize,
            batch_first=True,
            fix_length=self.fix_length,
            eos_token="<eos>")  # 由于`target`是`decoder`左移一位的结果, 所以不需要句首符"<go>"

        # Data processing
        self._proprecess()  # read the corpus and convert it into a .json format that torchtext can parse
        self.dataset = self._build_dataset()  # load the processed data and build a torchtext Dataset object
        self.vectors = Vectors(name=self.vector_path,
                               cache=FILE_PATH + "/temp")  # load the word2vec vectors
        self._build_vocab()  # build the vocabulary mappings
        self._build_vector()  # build the word-vector mapping
        self.stoi = self.ENCODER.vocab.stoi  # word -> id mapping
        self.itos = self.ENCODER.vocab.itos  # id -> word mapping
        self.vocab_size = len(self.ENCODER.vocab)  # vocabulary size
        self.vector_dim = self.vectors.dim  # word-vector dimensionality
        self.vector_weights = self.ENCODER.vocab.vectors  # word-vector weights
        self.target_vocab_size = len(
            self.TARGET.vocab
        )  # reassigned, because with markers such as "<eos>" the actual vocabulary is larger than the original target_vocab_size

        # Iterator used to generate batches during training
        self.data_iter = BucketIterator(
            self.dataset,
            batch_size=self.batch_size,
            shuffle=True,  # shuffle the original order of the data
            device=self.DEVICE)

    def process(self, text, return_length=False, go=None, eos=None):
        """
        文本数据预处理(编码), 用于生成可以输入pytorch神经网络的Tensor格式数据
        :param text: 原始文本, str格式
        :param return_length: 是否返回句子长度, 默认: False
        :param go: 是否添加句首符"<go>"
        :param eos: 是否添加句末符"<eos>"
        :return: Tensor格式, 编码后的文本(和句子长度)
        """
        tokens = self.ENCODER.preprocess(text)  # 用ENCODER Field对text进行分词
        if go:
            tokens.insert(0, "<go>")  # 在句首添加"<go>"
        if eos:
            tokens.append("<eos>")  # 在句末添加"<eos>"
        encoder_input, encoder_length = self.ENCODER.process([tokens])  # 将句子编码
        encoder_input = encoder_input.to(self.DEVICE)
        encoder_length = encoder_length.to(self.DEVICE)
        if return_length:
            return encoder_input, encoder_length
        else:
            return encoder_input

    def logist2word(self, logist, topn=1):
        """
        将pytorch神经网络输出的logist转换为对应的文本, 用于在test阶段处理batch_size=1的数据
        :param logist: 神经网络输出, shape: [1, self.target_vocab_size]
        :param topn: 保留概率最大的前topn个词语, 默认为1
        :return:
        """
        ids = logist.view(-1).argsort(descending=True)  # 将下标按输出值进行排序
        word = [self.itos[i] for i in ids[:topn]]  # 将下标id转为对应词语
        return word

    def _build_dataset(self):
        """
        读取由self._proprecess方法处理后的数据, 生成torchtext的DataSet对象
        """
        fields = {
            'encoder': ('encoder', self.ENCODER),
            'decoder': ('decoder', self.DECODER),
            'target': ('target', self.TARGET)
        }
        dataset = TabularDataset(path=FILE_PATH + "/temp/data.json",
                                 format="json",
                                 fields=fields)
        return dataset

    def _build_vocab(self):
        """
        为之前定义的三个Field对象构建词典映射.
        由于`encoder`/`decoder`/`target`都涉及到对文本的编码解码, 但语料库却不完全一致, 若分别由前面定义的
        三个Field对象分别处理对应的部分, 那么对同一个词的编码会在decoder和encoder端出现不一致, 例如`晴天`在
        ENCODER词典中的id是8764, 在DECODER中的id是8892. 这样在网络的ENCODER和DECODER端要分别使用两个不同
        的Embedding层, 增加了网络参数量.
        为了保证词表的一致性, 我们在实例化ENCODER对象的时候也一并声明了其并不需要的句首符"<go>"和末尾符"<eos>".
        因此在构建词典时我们全部用ENCODER构建, 然后再将其编码解码映射表(itos和stoi)赋予DECODER和TARGET.
        """
        self.ENCODER.build_vocab(
            self.dataset,
            max_size=self.target_vocab_size)  # 定义max_size是因为本次构建的词表是给TARGET使用的
        self.DECODER.build_vocab()
        self.TARGET.build_vocab()

        self.TARGET.vocab.stoi = self.ENCODER.vocab.stoi  # 将word到id的词典赋予TARGET
        self.TARGET.vocab.itos = self.ENCODER.vocab.itos  # 将id到word的词典赋予TARGET

        self.ENCODER.build_vocab(
            self.dataset,  # 再次构建词典, 这次不带max_size参数
            vectors=self.vectors)  # 加上词向量参数, 因为网络的输入要涉及到word->id->vector的映射
        self.DECODER.vocab.stoi = self.ENCODER.vocab.stoi  # 将word到id的词典赋予DECODER
        self.DECODER.vocab.itos = self.ENCODER.vocab.itos  # 将id到word的词典赋予DECODER
        self.ENCODER.init_token = None  # 清除ENCODER的句首符"<go>"
        self.ENCODER.eos_token = None  # 清除ENCODER的末尾符"<eos>"

    def _build_vector(self):
        """
        构建词向量映射, 将word2vec中的词向量按ENCODER和DECODER词典id顺序排列, 用于网络的Embedding层
        """
        if not os.path.exists(FILE_PATH + "/temp"):  # 如果./temp文件不存在, 就创建
            os.mkdir(FILE_PATH + "/temp")  # 将word2vec的缓存文件放在temp文件夹
        self.ENCODER.vocab.set_vectors(self.vectors.stoi, self.vectors.vectors,
                                       self.vectors.dim)

    def _proprecess(self):
        """
        对语料库进行读取, 并转化维torchtext能识别的.json文件格式, 形如:
            {"encoder": "鱼", "decoder": "我坐在椅子上看日出复活", "target": "我坐在椅子上看日出复活"}
            {"encoder": "我坐在椅子上看日出复活", "decoder": "我坐在夕阳里看城市的衰弱", "target": "我坐在夕阳里看城市的衰弱"}
            {"encoder": "我坐在夕阳里看城市的衰弱", "decoder": "我摘下一片叶子让它代替我", "target": "我摘下一片叶子让它代替我"}
            ...
        """
        data = []
        with open(FILE_PATH + "/data/songs.json") as f:
            songs = json.loads(f.read())
        for song in songs:
            if not self.singer or song["singer"] == self.singer:  # keep only the requested singer, if one is given
                # (title, first line) forms the first example
                data.append({
                    "encoder": song["title"],
                    "decoder": song["lyric"][0],
                    "target": song["lyric"][0]
                })
                for i in range(len(song["lyric"]) - 1):
                    # (previous line, next line) forms one example
                    encoder = song["lyric"][i]  # encoder input text: the previous line
                    decoder = song["lyric"][i + 1]  # decoder input text: the next line
                    target = song["lyric"][i + 1]  # decoder output target: the next line, shifted left by one token
                    data.append({
                        "encoder": encoder,
                        "decoder": decoder,
                        "target": target
                    })

        if not os.path.exists(FILE_PATH + "/temp"):  # create ./temp if it does not exist
            os.mkdir(FILE_PATH + "/temp")

        with open(FILE_PATH + "/temp/data.json",
                  "w") as f:  # save as a temporary file so torchtext can read it
            f.writelines(
                [json.dumps(d, ensure_ascii=False) + "\n" for d in data])
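
A hypothetical usage sketch of the Data class above (not part of the original source); it assumes the songs corpus at FILE_PATH + "/data/songs.json" and the .txt word2vec file pointed to by VEC_PATH are in place.

# Hypothetical driver code for Data.
data = Data(batch_size=64, fix_length=32, singer=None)
enc_input, enc_length = data.process("我坐在椅子上看日出复活", return_length=True)
for batch in data.data_iter:
    encoder_input, encoder_length = batch.encoder  # include_lengths=True yields (tensor, lengths)
    decoder_input, decoder_length = batch.decoder
    target = batch.target                          # TARGET has no lengths, so this is a plain tensor
    break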
Example #5

criterion_test = nn.CrossEntropyLoss(ignore_index=pad_idx)

test_losses = evaluate(test_iter, model, criterion_test)
losses["test"].append(test_losses)
test_loss = torch.tensor(sum(test_losses) / len(test_losses))
print(test_loss)
print('Perplexity:', torch.exp(test_loss))

# sentence = [SRC.preprocess("eine gruppe von menschen steht vor einem iglu .")]
# real_translation = TRG.preprocess("a man in a blue shirt is standing on a ladder and cleaning a window")
# sentence = [SRC.preprocess("eine gruppe von menschen steht vor einem iglu .")]
# real_translation = TRG.preprocess("a group of people stands in front of an igloo.")
sentence = [SRC.preprocess("ein mann mit kariertem hut in einer schwarzen jacke und einer schwarz-weiß gestreiften hose spielt auf einer bühne mit einem sänger und einem weiteren gitarristen im hintergrund auf einer e-gitarre .")]
real_translation = TRG.preprocess("a man in a black jacket and checkered hat wearing black and white striped pants plays an electric guitar on a stage with a singer and another guitar player in the background .")

src = SRC.process(sentence).to(device).T
src_mask = (src != SRC.vocab.stoi["<pad>"]).unsqueeze(-2)
model.eval()
out = greedy_decode(model, src, src_mask, max_len=60, start_symbol=TRG.vocab.stoi["<sos>"])
translation = ["<sos>"]
for i in range(1, out.size(1)):
    sym = TRG.vocab.itos[out[0, i]]
    translation.append(sym)
    if sym == "<eos>":
        break
print(' '.join(translation))
print(' '.join(real_translation))

# plot_loss_curves(losses["train"], losses["val"])

visualise_attention(translation, ["<sos>"] + sentence[0] + ["<eos>"])
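
The script above relies on a greedy_decode helper that is not shown. A minimal sketch, assuming an Annotated-Transformer-style interface (model.encode, model.decode, model.generator and a subsequent_mask helper, all defined elsewhere in the project), might look roughly like this:

def greedy_decode(model, src, src_mask, max_len, start_symbol):
    # Encode the source once, then repeatedly decode the growing target
    # prefix and append the arg-max token at each step.
    memory = model.encode(src, src_mask)
    ys = torch.ones(1, 1).fill_(start_symbol).type_as(src.data)
    for _ in range(max_len - 1):
        out = model.decode(memory, src_mask, ys,
                           subsequent_mask(ys.size(1)).type_as(src.data))
        prob = model.generator(out[:, -1])
        next_word = prob.argmax(dim=1).item()
        ys = torch.cat([ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)],
                       dim=1)
    return ys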
Example #6
class Document:
    def __init__(self, boxes_and_transcripts_file: Path, image_file: Path, label_file: Path,
                 entities_list: List[str], resized_image_size: Tuple[int, int] = (480, 960),
                 iob_tagging_type: str = 'box_level', entities_file: Path = None, training: bool = True,
                 image_index=None):
        '''
        An item returned by dataset.

        :param boxes_and_transcripts_file: gt or ocr results file
        :param image_file: whole image file
        :param entities_list: list of entities
        :param resized_image_size: size the whole image is resized to, (w, h)
        :param iob_tagging_type: 'box_level', 'document_level', 'box_and_within_box_level'
        :param entities_file: exact entity types and entity values of the document, json file
        :param training: True for train and validation mode, False for test mode. True will also load labels,
        and entities_file must be set.
        :param image_index: image index, used to get the image file name
        '''
        # text string label converter
        self.text_segments_field = Field(sequential=True, use_vocab=True, include_lengths=True, batch_first=True)
        self.text_segments_field.vocab = vocab_cls['keys']
        # iob string label converter
        self.iob_tags_field = Field(sequential=True, is_target=True, use_vocab=True, batch_first=True)
        self.iob_tags_field.vocab = vocab_cls['iob_labels']
        self.resized_image_size = resized_image_size
        self.training = training
        assert iob_tagging_type in ['box_level', 'document_level', 'box_and_within_box_level'], \
            'iob tagging type {} is not supported'.format(iob_tagging_type)
        self.iob_tagging_type = iob_tagging_type

        # For easier debugging:
        # record which image we are running on.
        self.image_filename = image_file.as_posix()

        try:
            # read the boxes, transcripts, and entity types of the boxes in one document from the boxes_and_transcripts file
            # matching the regex pattern: index,x1,y1,x2,y2,x3,y3,x4,y4,transcript,type from the boxes_and_transcripts tsv file
            # data format: [(index, points, transcription, entity_type), ...]
            if self.training:
                # boxes_and_transcripts_data = [(index, [x1, y1, ...], transcript, entity_type), ...]
                boxes_and_transcripts_data = read_gt_file_with_box_entity_type(boxes_and_transcripts_file.as_posix())
            else:
                boxes_and_transcripts_data = read_ocr_file_without_box_entity_type(
                    boxes_and_transcripts_file.as_posix())

            # Sort the box based on the position.
            # boxes_and_transcripts_data = sort_box_with_list(boxes_and_transcripts_data)

            # read image
            image = cv2.imread(image_file.as_posix())
            label = pd.read_csv(label_file.as_posix(),sep='\n',header=None)[0].to_list()
        except Exception as e:
            raise IOError('Error occurs in image {}: {}'.format(image_file.stem, e.args))

        boxes, transcripts, box_entity_types = [], [], []
        if self.training:
            for index, points, transcript, entity_type in boxes_and_transcripts_data:
                if len(transcript) == 0:
                    transcript = ' '
                boxes.append(points)
                transcripts.append(transcript)
                box_entity_types.append(entity_type)
        else:
            for index, points, transcript in boxes_and_transcripts_data:
                if len(transcript) == 0:
                    transcript = ' '
                boxes.append(points)
                transcripts.append(transcript)

        # Limit the number of boxes and number of transcripts to process.
        boxes_num = min(len(boxes), MAX_BOXES_NUM)
        transcript_len = min(max([len(t) for t in transcripts[:boxes_num]]), MAX_TRANSCRIPT_LEN)
        mask = np.zeros((boxes_num, transcript_len), dtype=int)

        relation_features = np.zeros((boxes_num, boxes_num, 6))

        try:

            height, width, _ = image.shape

            # resize image
            image = cv2.resize(image, self.resized_image_size, interpolation=cv2.INTER_LINEAR)
            x_scale = self.resized_image_size[0] / width
            y_scale = self.resized_image_size[1] / height

            # get the min-area box for each (original) box, used to calculate the initial relation features
            min_area_boxes = [cv2.minAreaRect(np.array(box, dtype=np.float32).reshape(4, 2)) for box in
                              boxes[:boxes_num]]

            # calculate resized image box coordinate, and initial relation features between boxes (nodes)
            resized_boxes = []
            for i in range(boxes_num):
                box_i = boxes[i]
                transcript_i = transcripts[i]

                # get the resized image's box coordinates, used for ROIAlign in the Encoder layer
                resized_box_i = [int(np.round(pos * x_scale)) if i % 2 == 0 else int(np.round(pos * y_scale))
                                 for i, pos in enumerate(box_i)]

                # resized_rect_output_i = cv2.minAreaRect(np.array(resized_box_i, dtype=np.float32).reshape(4, 2))
                # resized_box_i = cv2.boxPoints(resized_rect_output_i)
                resized_box_i = np.array(resized_box_i).reshape((8,))
                resized_boxes.append(resized_box_i)

                # enumerate each box, calculate relation features between i and other nodes.
                # formula (9)
                self.relation_features_between_ij_nodes(boxes_num, i, min_area_boxes, relation_features, transcript_i,
                                                        transcripts)

            relation_features = normalize_relation_features(relation_features, width=width, height=height)
            # The length of texts of each segment.
            text_segments = [list(trans) for trans in transcripts[:boxes_num]]

            if self.training:
                # assign IOB labels to the input text by exact matching; this step needs entity-level labels
                if self.iob_tagging_type != 'box_level':
                    with entities_file.open() as f:
                        entities = json.load(f)

                if self.iob_tagging_type == 'box_level':
                    # convert the transcript of every box to IOB labels, using the entity type of the corresponding box
                    iob_tags_label = text2iob_label_with_box_level_match(box_entity_types[:boxes_num],
                                                                         transcripts[:boxes_num],
                                                                         entities_list=entities_list)
                elif self.iob_tagging_type == 'document_level':
                    # convert the transcripts to IOB labels using the document-level tagging match method;
                    # all transcripts are concatenated into one sequence
                    iob_tags_label = text2iob_label_with_document_level_exactly_match(transcripts[:boxes_num], entities,
                                                                                      entities_list=entities_list)

                elif self.iob_tagging_type == 'box_and_within_box_level':
                    # perform exact tagging within each specific box; the box_level_entities parameter triggers box-level tagging.
                    iob_tags_label = text2iob_label_with_box_and_within_box_exactly_level(box_entity_types[:boxes_num],
                                                                                          transcripts[:boxes_num],
                                                                                          entities, ['address'],
                                                                                          entities_list=entities_list)

                iob_tags_label = self.iob_tags_field.process(iob_tags_label)[:, :transcript_len].numpy()
                box_entity_types = [vocab_cls['entities'].stoi[t] for t in box_entity_types[:boxes_num]]

            # texts shape is (num_texts, max_texts_len), texts_len shape is (num_texts,)
            texts, texts_len = self.text_segments_field.process(text_segments)
            texts = texts[:, :transcript_len].numpy()
            texts_len = np.clip(texts_len.numpy(), 0, transcript_len)
            text_segments = (texts, texts_len)

            for i in range(boxes_num):
                mask[i, :texts_len[i]] = 1

            self.whole_image = RawField().preprocess(image)
            self.text_segments = self.text_segments_field.preprocess(text_segments)  # (text, texts_len)
            self.boxes_coordinate = RawField().preprocess(resized_boxes)
            self.relation_features = RawField().preprocess(relation_features)
            self.mask = RawField().preprocess(mask)
            self.boxes_num = RawField().preprocess(boxes_num)
            self.transcript_len = RawField().preprocess(transcript_len)  # max transcript len of current document
            if self.training:
                self.iob_tags_label = self.iob_tags_field.preprocess(iob_tags_label)
            else:
                self.image_index = RawField().preprocess(image_index)
                self.label = RawField().preprocess(label)

        except Exception as e:
            raise RuntimeError('Error occurs in image {}: {}'.format(boxes_and_transcripts_file.stem, e.args))

    def relation_features_between_ij_nodes(self, boxes_num, i, min_area_boxes, relation_features, transcript_i,
                                           transcripts):
        '''
        calculate node i and other nodes' initial relation features.
        :param boxes_num:
        :param i:
        :param min_area_boxes: the min rectangle of (original) points.
        :param relation_features: np.array, boxes_num x boxes_num x 6
        :param transcript_i:  transcripts[i]
        :param transcripts:
        :return:
        '''
        w,h = (480,960)
        for j in range(boxes_num):
            transcript_j = transcripts[j]

            rect_output_i = min_area_boxes[i]
            rect_output_j = min_area_boxes[j]

            # Centers of rect_of_box_i and rect_of_box_j.
            center_i = rect_output_i[0]
            center_j = rect_output_j[0]

            width_i, height_i = rect_output_i[1]
            width_j, height_j = rect_output_j[1]

            # Center distances of boxes on x-axis.
            relation_features[i, j, 0] = np.abs(center_i[0] - center_j[0]) \
                if np.abs(center_i[0] - center_j[0])/w is not None else -1  # x_ij

            # Center distances of boxes on y-axis.
            relation_features[i, j, 1] = np.abs(center_i[1] - center_j[1]) \
                if np.abs(center_i[1] - center_j[1]) is not None else -1  # y_ij

            relation_features[i, j, 2] = width_i / (height_i) \
                if height_i != 0 and width_i / (height_i) is not None else -1  # w_i/h_i

            relation_features[i, j, 3] = height_j / (height_i) \
                if height_i != 0 and height_j / (height_i) is not None else -1  # h_j/h_i

            relation_features[i, j, 4] = width_j / (height_i) \
                if height_i != 0 and width_j / (height_i) is not None else -1  # w_j/h_i

            relation_features[i, j, 5] = len(transcript_j) / (len(transcript_i)) \
                if len(transcript_j) / (len(transcript_i)) is not None else -1  # T_j/T_i
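
As a reading aid (not part of the original example), the six raw relation features filled in by relation_features_between_ij_nodes can be worked out on a toy pair of boxes; normalize_relation_features, defined elsewhere in the project, is applied to the result afterwards.

# Toy illustration of the six raw relation features for one (i, j) pair.
import numpy as np

center_i, (width_i, height_i), transcript_i = (100.0, 50.0), (80.0, 20.0), "TOTAL"
center_j, (width_j, height_j), transcript_j = (300.0, 55.0), (60.0, 20.0), "12.50"

features_ij = np.array([
    abs(center_i[0] - center_j[0]),         # x_ij: horizontal center distance -> 200.0
    abs(center_i[1] - center_j[1]),         # y_ij: vertical center distance   -> 5.0
    width_i / height_i,                     # w_i / h_i -> 4.0
    height_j / height_i,                    # h_j / h_i -> 1.0
    width_j / height_i,                     # w_j / h_i -> 3.0
    len(transcript_j) / len(transcript_i),  # T_j / T_i -> 1.0
])
print(features_ij)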
Example #7
class TestingDocument:
    def __init__(self, image, boxes_and_transcripts_data, iob_tagging_type: str = 'box_level',
                    image_index=None, resized_image_size: Tuple[int, int] = (480, 960)):
        '''
        An item returned by dataset.
        :param iob_tagging_type: 'box_level', 'document_level', 'box_and_within_box_level'
        :param entities_file: exactly entity type and entity value of documents, json file
        :param image_index: image index, used to get image file name
        '''
        # text string label converter
        self.text_segments_field = Field(sequential=True, use_vocab=True, include_lengths=True, batch_first=True)
        self.text_segments_field.vocab = vocab_cls['keys']
        # iob string label converter
        self.iob_tags_field = Field(sequential=True, is_target=True, use_vocab=True, batch_first=True)
        self.iob_tags_field.vocab = vocab_cls['iob_labels']
        assert iob_tagging_type in ['box_level', 'document_level', 'box_and_within_box_level'], \
            'iob tagging type {} is not supported'.format(iob_tagging_type)
        self.iob_tagging_type = iob_tagging_type
        self.resized_image_size = resized_image_size

        # read the boxes, transcripts, and entity types of the boxes in one document from the boxes_and_transcripts file
        # matching the regex pattern: index,x1,y1,x2,y2,x3,y3,x4,y4,transcript,type from the boxes_and_transcripts tsv file
        # data format: [(index, points, transcription, entity_type), ...]
        # label = pd.read_csv(label_file.as_posix(),sep='\n',header=None)[0].to_list()

        boxes, transcripts, box_entity_types = [], [], []
        for index, points, transcript, _ in boxes_and_transcripts_data:
            if len(transcript) == 0:
                transcript = ' '
            boxes.append(points)
            transcripts.append(transcript)

        # Limit the number of boxes and number of transcripts to process.
        boxes_num = min(len(boxes), MAX_BOXES_NUM)
        transcript_len = min(max([len(t) for t in transcripts[:boxes_num]]), MAX_TRANSCRIPT_LEN)
        mask = np.zeros((boxes_num, transcript_len), dtype=int)

        relation_features = np.zeros((boxes_num, boxes_num, 6))

        height, width, _ = image.shape

        image = cv2.resize(image, self.resized_image_size, interpolation=cv2.INTER_LINEAR)
        x_scale = self.resized_image_size[0] / width
        y_scale = self.resized_image_size[1] / height

        # get the min-area box for each (original) box, used to calculate the initial relation features
        min_area_boxes = [cv2.minAreaRect(np.array(box, dtype=np.float32).reshape(4, 2)) for box in
                          boxes[:boxes_num]]

        # calculate resized image box coordinate, and initial relation features between boxes (nodes)
        resized_boxes = []
        for i in range(boxes_num):
            box_i = boxes[i]
            transcript_i = transcripts[i]

            # get the resized image's box coordinates, used for ROIAlign in the Encoder layer
            resized_box_i = [int(np.round(pos * x_scale)) if i % 2 == 0 else int(np.round(pos * y_scale))
                             for i, pos in enumerate(box_i)]
            resized_box_i = np.array(resized_box_i).reshape((8,))
            resized_boxes.append(resized_box_i)

            # enumerate each box, calculate relation features between i and other nodes.
            # formula (9)
            self.relation_features_between_ij_nodes(boxes_num, i, min_area_boxes, relation_features, transcript_i,
                                                    transcripts)

        relation_features = normalize_relation_features(relation_features, width=width, height=height)

        # The length of texts of each segment.
        text_segments = [list(trans) for trans in transcripts[:boxes_num]]

        # texts shape is (num_texts, max_texts_len), texts_len shape is (num_texts,)
        texts, texts_len = self.text_segments_field.process(text_segments)
        texts = texts[:, :transcript_len].numpy()
        texts_len = np.clip(texts_len.numpy(), 0, transcript_len)
        text_segments = (texts, texts_len)

        for i in range(boxes_num):
            mask[i, :texts_len[i]] = 1

        self.whole_image = RawField().preprocess(image)
        self.text_segments = self.text_segments_field.preprocess(text_segments)  # (text, texts_len)
        self.boxes_coordinate = RawField().preprocess(resized_boxes)
        self.relation_features = RawField().preprocess(relation_features)
        self.mask = RawField().preprocess(mask)
        self.boxes_num = RawField().preprocess(boxes_num)
        self.transcript_len = RawField().preprocess(transcript_len)  # max transcript len of current document
        self.image_index = RawField().preprocess(image_index)


    def relation_features_between_ij_nodes(self, boxes_num, i, min_area_boxes, relation_features, transcript_i,
                                           transcripts):
        '''
        calculate node i and other nodes' initial relation features.
        :param boxes_num:
        :param i:
        :param min_area_boxes: the min rectangle of (original) points.
        :param relation_features: np.array, boxes_num x boxes_num x 6
        :param transcript_i:  transcripts[i]
        :param transcripts:
        :return:
        '''
        for j in range(boxes_num):
            transcript_j = transcripts[j]

            rect_output_i = min_area_boxes[i]
            rect_output_j = min_area_boxes[j]

            # Centers of rect_of_box_i and rect_of_box_j.
            center_i = rect_output_i[0]
            center_j = rect_output_j[0]

            width_i, height_i = rect_output_i[1]
            width_j, height_j = rect_output_j[1]

            # Center distances of boxes on x-axis.
            relation_features[i, j, 0] = np.abs(center_i[0] - center_j[0]) \
                if np.abs(center_i[0] - center_j[0]) is not None else -1  # x_ij

            # Center distances of boxes on y-axis.
            relation_features[i, j, 1] = np.abs(center_i[1] - center_j[1]) \
                if np.abs(center_i[1] - center_j[1]) is not None else -1  # y_ij

            relation_features[i, j, 2] = width_i / (height_i) \
                if height_i != 0 and width_i / (height_i) is not None else -1  # w_i/h_i

            relation_features[i, j, 3] = height_j / (height_i) \
                if height_i != 0 and height_j / (height_i) is not None else -1  # h_j/h_i

            relation_features[i, j, 4] = width_j / (height_i) \
                if height_i != 0 and width_j / (height_i) is not None else -1  # w_j/h_i

            relation_features[i, j, 5] = len(transcript_j) / (len(transcript_i)) \
                if len(transcript_j) / (len(transcript_i)) is not None else -1  # T_j/T_i
Example #8
from typing import List, Tuple
import re

from torchtext.data import Example, Field


def regex_tokenizer(text, pattern=r"(?u)(?:\b\w\w+\b|\S)") -> List[str]:
    return [m.group() for m in re.finditer(pattern, text)]


if __name__ == '__main__':
    texts = [
        '{"intents": [], "is_terminal_state": false, "last_sys_acts": null, "slots_filled": ["chairname"], "slot_queries": {}, "requested_slot": "", "user_acts": [{"name": "dialogue_act", "funcName": null, "params": [{"slot": "chairname", "op": {}, "value": "control of convergent access networks"}], "intent": "inform"}], "system_made_offer": false, "db_matches_ratio": 0, "turn": 1, "num_dontcare": 0}',
        '{"intents": [], "is_terminal_state": false, "last_sys_acts": null, "slots_filled": ["chairname"], "slot_queries": {}, "requested_slot": "", "user_acts": [{"name": "dialogue_act", "funcName": null, "params": [{"slot": "chairname", "op": {}, "value": "control of convergent access networks"}], "intent": "inform"}]'
    ]
    TEXT_FIELD = Field(include_lengths=True,
                       batch_first=True,
                       tokenize=regex_tokenizer)
    examples = [
        Example.fromlist([text], [('dialog_state', TEXT_FIELD)])
        for text in texts
    ]
    TEXT_FIELD.build_vocab([example.dialog_state for example in examples])

    batch, lengths = TEXT_FIELD.process([e.dialog_state for e in examples])
    print()
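
Because TEXT_FIELD was created with include_lengths=True, process returns a (padded_batch, lengths) pair; a short, hypothetical inspection of the result could be:

print(batch.shape)   # (batch_size=2, max_token_count), since batch_first=True
print(lengths)       # 1-D tensor with the unpadded token count of each example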
Example #9
class Dataset:
    """Defines dataset composed of queries and responses.

    Provides train, test and validation splits. It can be used to create bucket
    iterators where queries with similar length are placed in same bucket.
    Also, processes raw string queries to create padded tensors suitable for
    training.

    Attributes:
        data (torchtext.TabularDataset): Dataset composed of query-response examples.
        vocab (torchtext.Vocab): Vocabulary created from dataset examples.
        train_iter (torchtext.BucketIterator): Iterator over training examples.
        val_iter (torchtext.BucketIterator): Iterator over validation examples.
        test_iter (torchtext.BucketIterator): Iterator over test examples.
        iterator (torchtext.BucketIterator): Iterator over all examples.
    """
    @verbose
    def __init__(self,
                 path=DATA_PATH,
                 device=torch.device('cpu'),
                 batch_size=DEFAULT_BATCH_SIZE,
                 train_test_val_ratio=TRAIN_TEST_VAL_RATIO):
        """Loads dataset examples and creates bucket iterators.

        Creates vocabulary from loaded examples. Train, test and validation
        splits and their iterators are created.

        Args:
            path (str, optional): Path to the dataset file. Default: constants.DATA_PATH.
            device (torch.device, optional): Torch device where tensors will be created.
                Default: torch.device('cpu').
            batch_size (int, optional): Size of batch. Default: 32.
            train_test_val_ratio (iterable, optional): Iterable of 3 elements denoting ratio of
                train, test and validation splits. Default: [0.90, 0.05, 0.05].
        """
        print(colorize('\nLoading dataset'))

        self._batch_size = batch_size
        self._device = device

        self._field = Field(tokenize='revtok', lower=True, batch_first=True)

        fields = [
            ('query', self._field),
            ('response', self._field),
        ]

        self.data = TabularDataset(path=path, format='csv', fields=fields)

        self._train, self._val, self._test = self.data.split(
            train_test_val_ratio)

        self.train_iter, self.validation_iter, self.test_iter = BucketIterator.splits(
            datasets=(self._train, self._val, self._test),
            batch_size=self._batch_size,
            repeat=False,
            sort_key=lambda ex: interleave_keys(len(ex.query),
                                                len(ex.response)),
            device=self._device)

        self.iterator = BucketIterator(dataset=self.data,
                                       batch_size=self._batch_size,
                                       repeat=False,
                                       sort_key=lambda ex: interleave_keys(
                                           len(ex.query), len(ex.response)),
                                       device=self._device)

        print(colorize(' • Building vocabulary', color='yellow'))
        self._field.build_vocab(self.data)
        self.vocab = self._field.vocab

    def process(self, batch):
        """Pads and converts list of query tokens to torch.Tensor.

        The batch of query tokens is padded with the <PAD> token so that each
        query has the same length. The padded query tokens are then converted
        to a torch.Tensor.

        Args:
            batch (list): List of lists with query tokens.

        Returns:
            torch.Tensor defined on specified device.
        """
        return self._field.process(batch, device=self._device)
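
A hypothetical usage sketch for the Dataset class above (not from the original project); it assumes the CSV at constants.DATA_PATH exists and that revtok is installed for the 'revtok' tokenizer.

# Hypothetical driver code.
ds = Dataset(batch_size=32)
tensor = ds.process([['how', 'are', 'you', '?'], ['hi']])
print(tensor.shape)   # (2, max_len), since the field uses batch_first=True
for b in ds.train_iter:
    queries, responses = b.query, b.response
    break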
# build a vocab of our training set, ignoring words with frequency less than 2
SRC.build_vocab(train_data, min_freq=2)
TRG.build_vocab(train_data, min_freq=2)

# build train/valid/test iterators, which will batch the data for us
BATCH_SIZE = 128
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), batch_size=BATCH_SIZE, device=device)

x = vars(test_data.examples[0])['src']
y = vars(test_data.examples[0])['trg']
print("Source example:", " ".join(x))
print("Target example:", " ".join(y))
print("Padded target:", TRG.pad([y]))
print("Tensorized target:", TRG.process([y]))

INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
EMB_DIM = 256
HIDDEN_DIM = 512
LAYERS = 1
DROPOUT = 0.5
BIDIRECTIONAL = True

# padding token
SRC_PAD = SRC.vocab.stoi['<pad>']
TRG_PAD = TRG.vocab.stoi['<pad>']

# build model
enc = Encoder(INPUT_DIM,