Example #1
File: utils.py  Project: ahiroto/ParlAI
def replace_unk(hypo_str, src_str, alignment, align_dict, unk):
    # Tokens are strings here
    hypo_tokens = tokenizer.tokenize_line(hypo_str)
    # TODO: Very rare cases where the replacement is '<eos>' should be handled gracefully
    src_tokens = tokenizer.tokenize_line(src_str) + ['<eos>']
    for i, ht in enumerate(hypo_tokens):
        if ht == unk:
            src_token = src_tokens[alignment[i]]
            # Either take the corresponding value in the aligned dictionary or just copy the original value.
            hypo_tokens[i] = align_dict.get(src_token, src_token)
    return ' '.join(hypo_tokens)
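A minimal usage sketch for the replace_unk above; the inputs are made up for illustration, and it assumes the function lives in a module where "from fairseq import tokenizer" has already been done. align_dict maps source words to replacement words, and alignment[i] is the source position aligned to hypothesis token i:

# hypothetical inputs
hypo_str = "the <unk> is big"
src_str = "das Haus ist gross"
alignment = [0, 1, 2, 3]          # hypothesis position -> source position
align_dict = {"Haus": "house"}    # source word -> replacement word

print(replace_unk(hypo_str, src_str, alignment, align_dict, unk="<unk>"))
# expected output: "the house is big"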
Example #2
def replace_unk(hypo_str, src_str, alignment, align_dict, unk):
    # Tokens are strings here
    hypo_tokens = tokenizer.tokenize_line(hypo_str)
    # TODO: Very rare cases where the replacement is '<eos>' should be handled gracefully
    src_tokens = tokenizer.tokenize_line(src_str) + ['<eos>']
    for i, ht in enumerate(hypo_tokens):
        if ht == unk:
            src_token = src_tokens[alignment[i]]
            # Either take the corresponding value in the aligned dictionary or just copy the original value.
            hypo_tokens[i] = align_dict.get(src_token, src_token)
    return ' '.join(hypo_tokens)
Example #3
def replace_unk(hypo_str, src_str, alignment, align_dict, unk):
    from fairseq import tokenizer
    # Tokens are strings here
    hypo_tokens = tokenizer.tokenize_line(hypo_str)
    src_tokens = tokenizer.tokenize_line(src_str) + ['<eos>']
    for i, ht in enumerate(hypo_tokens):
        if ht == unk:
            src_token = src_tokens[alignment[i]]
            # Either take the corresponding value in the aligned dictionary or just copy the original value.
            hypo_tokens[i] = align_dict.get(src_token, src_token)
    return ' '.join(hypo_tokens)
Example #4
    def parse(self, path, word_dict, char_dict, reverse_order=False, append_eos=False):
        word_array_list = []
        word_offsets = [0]
        char_array_list = []
        char_offsets = [0]
        sizes = []
        with open(path, "r") as f:
            for line in f:
                words = tokenizer.tokenize_line(line)
                if reverse_order:
                    words.reverse()
                word_inds = [word_dict.index(w) for w in words]
                if append_eos:
                    word_inds.append(word_dict.eos_index)

                word_array_list.append(np.array(word_inds, dtype=np.int32))
                word_offsets.append(word_offsets[-1] + len(word_inds))
                sizes.append(len(word_inds))

                for word in words:
                    chars = [word] if word in TAGS else list(word)
                    char_inds = [char_dict.index(c) for c in chars]
                    char_array_list.append(np.array(char_inds, dtype=np.int32))
                    char_offsets.append(char_offsets[-1] + len(char_inds))

        self.word_buffer = np.concatenate(word_array_list)
        self.word_offsets = np.array(word_offsets, dtype=np.int32)
        self.char_buffer = np.concatenate(char_array_list)
        self.char_offsets = np.array(char_offsets, dtype=np.int32)
        self.sizes = np.array(sizes, dtype=np.int32)

        del word_array_list, word_offsets, char_array_list, char_offsets, sizes
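After parse() fills the flat buffers above, the i-th sentence can be recovered by slicing between consecutive offsets. A small sketch, where dataset stands for the object whose parse() method is shown (the name is illustrative):

# word ids of sentence i live between consecutive word offsets
i = 0
start, end = dataset.word_offsets[i], dataset.word_offsets[i + 1]
sentence_word_ids = dataset.word_buffer[start:end]
assert len(sentence_word_ids) == dataset.sizes[i]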
Example #5
def encode_line(line,
                vocab,
                add_if_not_exist=True,
                consumer=None,
                append_eos=True,
                reverse_order=False):
    """
    Copied from fairseq.data.Dictionary and changed ids tensor type to Long (==int64)
    :param line:
    :param vocab:
    :param add_if_not_exist:
    :param consumer:
    :param append_eos:
    :param reverse_order:
    :return:
    """
    words = tokenize_line(line)
    if reverse_order:
        words = list(reversed(words))
    nwords = len(words)
    ids = torch.LongTensor(nwords + 1 if append_eos else nwords)

    for i, word in enumerate(words):
        if add_if_not_exist:
            idx = vocab.add_symbol(word)
        else:
            idx = vocab.index(word)
        if consumer is not None:
            consumer(word, idx)
        ids[i] = idx
    if append_eos:
        ids[nwords] = vocab.eos_index
    return ids
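A small usage sketch, assuming fairseq is installed and that the module defining this encode_line imports torch and fairseq.tokenizer.tokenize_line; it encodes a line into a fresh fairseq Dictionary, adding unseen words on the fly:

from fairseq.data import Dictionary

vocab = Dictionary()                      # only the special symbols at first
ids = encode_line("hello world", vocab)   # add_if_not_exist=True by default
print(ids.dtype)                          # torch.int64, i.e. a LongTensor
print(ids[-1].item() == vocab.eos_index)  # True, since append_eos=True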
Example #6
    def read_text(self,
                  utt_ids: List[str],
                  token_text: List[str],
                  dictionary=None):
        assert len(utt_ids) == len(token_text)
        self.utt_ids = utt_ids
        self.tokens_list = token_text
        self.tensor_list = []
        self.size = len(self.utt_ids)  # number of utterances
        self.sizes = []
        if dictionary is not None:
            for tokens in self.tokens_list:
                tensor = dictionary.encode_line(
                    tokens,
                    add_if_not_exist=False,
                    append_eos=self.append_eos,
                ).long()
                self.tensor_list.append(tensor)
                self.sizes.append(len(self.tensor_list[-1]))
        else:
            self.sizes = [
                len(tokenize_line(tokens)) for tokens in self.tokens_list
            ]

        self.sizes = np.array(self.sizes, dtype=np.int32)

        assert len(self.utt_ids) == len(self.tokens_list) and \
            (dictionary is None or len(self.utt_ids) == len(self.tensor_list)) and \
            len(self.utt_ids) == len(self.sizes)
Example #7
def parse_sentences(parser, in_tokenized_sentences, batch_size,
                    roberta_batch_size, out_amr):

    # read tokenized sentences
    sentences = read_sentences(in_tokenized_sentences)
    split_sentences = []
    for sentence in sentences:
        split_sentences.append(tokenize_line(sentence))
    print(len(split_sentences))

    # parse
    start = time.time()
    result = parser.parse_sentences(
        split_sentences,
        batch_size=batch_size,
        roberta_batch_size=roberta_batch_size,
    )
    end = time.time()
    print(len(result))
    time_secs = timedelta(seconds=float(end - start))
    print(f'Total time taken to parse sentences: {time_secs}')

    # write annotations
    if out_amr:
        with open(out_amr, 'w') as fid:
            for i in range(0, len(sentences)):
                fid.write(result[i])
Example #8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("input", help="input file; use - for stdin")
    args = parser.parse_args()
    # tokenise based on space
    for line in fileinput.input([args.input], openhook=fileinput.hook_compressed):
        line = tokenizer.tokenize_line(line)
        line = " ".join(line)
        sys.stdout.write(line+"\n")
Example #9
    def read_text(self, utt_ids: List[str], texts: List[str], dictionary=None):
        assert len(utt_ids) == len(texts)
        self.utt_ids = utt_ids
        self.texts = texts
        self.size = len(self.utt_ids)  # number of utterances
        from fairseq.tokenizer import tokenize_line

        if dictionary is not None:
            self.sizes = [
                len(tokenize_line(dictionary.wordpiece_encode(text))) +
                (1 if self.append_eos else 0) for text in texts
            ]
        else:
            self.sizes = [len(tokenize_line(text)) for text in texts]

        self.sizes = np.array(self.sizes, dtype=np.int32)

        assert len(self.utt_ids) == len(self.sizes)
Example #10
 def tokenize(self, text: str, add_start: bool = True):
     masked_text = text.replace(MASK, ROBERTA_MASK)
     text_spans = masked_text.split(ROBERTA_MASK)
     text_spans_bpe = ' {0} '.format(ROBERTA_MASK).join([
         self.bpe.encode(text_span.rstrip()) for text_span in text_spans
     ]).strip()
     if add_start:
         text_spans_bpe = ROBERTA_START_SENTENCE + ' ' + text_spans_bpe
     return tokenize_line(text_spans_bpe)
Example #11
def char_tokenize(line):
    words = tokenizer.tokenize_line(line)
    chars = []
    for word in words:
        if word in TAGS:
            chars.append(word)
        else:
            chars.extend(c for c in word)
    return chars
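A quick sketch of what char_tokenize produces; tokenizer and TAGS are assumed to be defined in the snippet's module, with TAGS being a set of markers that must be kept whole:

# with TAGS = {"<eos>"} in the snippet's module:
print(char_tokenize("cat <eos>"))
# expected: ['c', 'a', 't', '<eos>']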
Example #12
 def check_wordpiece_to_word_map(input_file, raise_error):
     num_sents = 0
     with open(input_file, 'r') as fid:
         for sentence in tqdm(fid):
             if not sentence:
                 break
             sentence = " ".join(tokenize_line(str(sentence.rstrip())))
             #print("input: ", sentence)
             word2piece = get_wordpiece_to_word_map(sentence, roberta.bpe,
                                                    raise_error)
Example #13
 def _sent_to_word_ids(self, sent, word_dict, reverse_order, prepend_inds,
                       append_inds):
     """
     Extract the word ids for words associated with the input sentence.
     """
     words = tokenizer.tokenize_line(sent)
     if reverse_order:
         words.reverse()
     word_inds = [word_dict.index(w) for w in words]
     word_inds = prepend_inds + word_inds + append_inds
     return words, word_inds
Example #14
def replace_unk(hypo_str, src_str, alignment, align_dict, unk, input_str):
    from fairseq import tokenizer

    # Tokens are strings here
    hypo_tokens = tokenizer.tokenize_line(hypo_str)
    # TODO: Very rare cases where the replacement is '<eos>' should be handled gracefully
    src_tokens = tokenizer.tokenize_line(src_str) + ["<eos>"]
    for i, ht in enumerate(hypo_tokens):
        if ht == unk:
            src_idx, tgt_index = alignment[i]
            src_token = src_tokens[src_idx]
            # Either take the corresponding value in the aligned dictionary or just copy the original value.
            hypo_tokens[i] = align_dict.get(
                src_token, src_token
            )  # the first value is searched for; the second is returned if not found
            if hypo_tokens[i] == unk and input_str is not None:
                input_tokens = tokenizer.tokenize_line(input_str) + ["<eos>"]
                # replace unk token with corresponding word from raw input string
                hypo_tokens[i] = input_tokens[src_idx]
    return " ".join(hypo_tokens)
Example #15
def noise(file, ofile_suffix):
    # `tgts` is assumed to come from the enclosing scope (Example #17 shows a
    # variant that builds it from the input file).
    noise_injector = NoiseInjector(tgts)
    # the three output files must be opened for writing
    with open(ofile_suffix + '.src', 'w') as src_ofile, \
            open(ofile_suffix + '.tgt', 'w') as tgt_ofile, \
            open(ofile_suffix + '.forward', 'w') as align_ofile:
        for line in file:
            tgt = tokenize_line(line.strip())
            src, align = noise_injector.inject_noise(tgt)
            tgt_ofile.write(' '.join(tgt) + '\n')
            src_ofile.write(' '.join(src) + '\n')
            align_ofile.write(' '.join(align) + '\n')
Example #16
 def replace_unk(hypo_str, align_str, src, unk):
     # `align_dict` below is assumed to come from the enclosing scope
     hypo_tokens = hypo_str.split()
     src_tokens = tokenizer.tokenize_line(src)
     align_idx = [int(i) for i in align_str.split()]
     for i, ht in enumerate(hypo_tokens):
         if ht == unk:
             src_token = src_tokens[align_idx[i]]
             if src_token in align_dict:
                 hypo_tokens[i] = align_dict[src_token]
             else:
                 hypo_tokens[i] = src_token
     return ' '.join(hypo_tokens)
Example #17
def noise(filename, ofile_suffix):
    lines = open(filename).readlines()
    tgts = [tokenize_line(line.strip()) for line in lines]
    noise_injector = NoiseInjector(tgts)

    srcs = []
    aligns = []
    for tgt in tgts:
        src, align = noise_injector.inject_noise(tgt)
        srcs.append(src)
        aligns.append(align)

    save_file('{}.src'.format(ofile_suffix), srcs)
    save_file('{}.tgt'.format(ofile_suffix), tgts)
    save_file('{}.forward'.format(ofile_suffix), aligns)
Example #18
 def _sent_to_word_ids(self,
                       sent,
                       word_dict,
                       reverse_order=False,
                       append_eos=False):
     """
     Extract the word ids for words associated with the input sentence.
     """
     words = tokenizer.tokenize_line(sent)
     if reverse_order:
         words.reverse()
     word_inds = [word_dict.index(w) for w in words]
     if append_eos:
         word_inds.append(word_dict.eos_index)
     return words, word_inds
Example #19
def encode_labels_line(labels_line, append_eos=True, reverse_order=False):
    """Custom helper:
    Encode a string of space-separated binary labels into an IntTensor.

    Mimics fairseq.data.dictionary.Dictionary.encode_line();
    eos always gets a zero label (no change).

    Returns a torch.IntTensor, analogous to the dictionary's encode_line() method.
    """
    labels = [int(label) for label in tokenize_line(labels_line)]
    assert all(label in (0, 1) for label in labels), \
        "encode_labels_line: token-level labels must be binary!"
    if reverse_order:
        labels = list(reversed(labels))
    if append_eos:
        labels.append(0)
    return torch.tensor(labels, dtype=torch.int)
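For example, with the defaults above (append_eos=True, reverse_order=False) the helper appends a zero label for eos:

print(encode_labels_line("0 1 1 0"))
# tensor([0, 1, 1, 0, 0], dtype=torch.int32)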
Example #20
def binarize_file(input_file,
                  out_file_pref,
                  impl,
                  dtype=np.int64,
                  tokenize=tokenize_line):
    out_file = out_file_pref + '.bin'
    index_file = out_file_pref + '.idx'
    ds = make_builder(out_file, impl=impl, dtype=dtype)
    with open(input_file, 'r') as f:
        for line in f:
            if line.strip():
                line = tokenize(line)  # use the injected tokenizer (defaults to tokenize_line)
                line = list(map(int, line))
                line = torch.tensor(line)
                ds.add_item(line)
            else:
                raise Exception('empty line')

    ds.finalize(index_file)

    return
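A hedged usage sketch: given a text file whose lines are space-separated integer ids, the call below would write out/train.bin and out/train.idx. It assumes make_builder and tokenize_line are importable in the snippet's module and that 'mmap' is an accepted dataset implementation there; the paths are illustrative:

# each line of ids.txt is expected to look like "12 7 431 2"
binarize_file('ids.txt', 'out/train', impl='mmap')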
Example #21
    def convert_sentences_to_data(self, sentences, batch_size,
                                  roberta_batch_size):

        # extract RoBERTa features
        roberta_features = \
            self.get_bert_features_batched(sentences, roberta_batch_size)

        # organize data into a fairseq batch
        data = []
        for index, sentence in enumerate(sentences):
            ids = self.get_token_ids(sentence)
            word_features, wordpieces_roberta, word2piece_scattered_indices =\
                roberta_features[index]
            data.append({
                'id': index,
                'source': ids,
                'source_fix_emb': word_features,
                'src_wordpieces': wordpieces_roberta,
                'src_wp2w': word2piece_scattered_indices,
                'src_tokens': tokenize_line(sentence)  # original source tokens
            })
        return data
Example #22
    def read_data(self, path, dictionary, ex_dict):
        with open(path, 'r') as f:
            for i, line in enumerate(f):
                # use complete words for segmentation on the target side
                if self.is_tgt:
                    s = " ".join([
                        "▁" + word
                        for word in "".join(line.strip("\n").split()).replace(
                            "▁", " ").strip().split()
                    ])
                    self.lines.append(s)
                else:
                    self.lines.append(line.strip('\n'))
                tokens = tokenize_line(line)
                if self.is_tgt:
                    tokens = [c for c in "".join(tokens)]
                if self.append_eos:
                    tokens.append(self.dictionary.eos())
                #self.tokens_list.append(tokens)
                self.sizes.append(len(tokens))

        self.sizes = np.array(self.sizes)
Example #23
def main():

    # argument handling
    args = argument_parsing()

    # read tokenized sentences
    sentences = read_sentences(args.in_tokenized_sentences)
    split_sentences = []
    for sentence in sentences:
        split_sentences.append(tokenize_line(sentence))
    print(len(split_sentences))

    # load parser
    start = time.time()
    parser = AMRParser.from_checkpoint(args.in_checkpoint)
    end = time.time()
    time_secs = timedelta(seconds=float(end - start))
    print(f'Total time taken to load parser: {time_secs}')

    # TODO: max batch sizes could be computed from max sentence length

    # parse
    start = time.time()
    result = parser.parse_sentences(
        split_sentences,
        batch_size=args.batch_size,
        roberta_batch_size=args.roberta_batch_size,
    )
    end = time.time()
    print(len(result))
    time_secs = timedelta(seconds=float(end - start))
    print(f'Total time taken to parse sentences: {time_secs}')

    # write annotations
    with open(args.out_amr, 'w') as fid:
        for i in range(0, len(sentences)):
            fid.write(result[i])
Example #24
    def parse(self, path, dict, reverse_order=False, append_eos=False):
        array_list = []
        offsets = [0]
        sizes = []
        with open(path, "r") as f:
            for line in f:
                words = tokenizer.tokenize_line(line)
                if reverse_order:
                    words.reverse()
                inds = [dict.index(w) for w in words]
                if append_eos:
                    inds.append(dict.eos_index)

                array_list.append(np.array(inds, dtype=np.int32))
                offsets.append(offsets[-1] + len(inds))
                sizes.append(len(inds))

        # +1 for Lua compatibility
        self.buffer = np.concatenate(array_list) + 1
        self.offsets = np.array(offsets, dtype=np.int32)
        self.sizes = np.array(sizes, dtype=np.int32)
        del array_list
        del offsets
        del sizes
Example #25
    def parse_multilingual(
        self,
        corpora,
        reverse_order=False,
        append_eos=False,
        prepend_language_id=True,
        already_numberized=False,
    ):
        """Add sentences from text files to the dataset.

        This method reads pairs of text files containing source and target
        sides of a bitext. Sentences are converted to integer sequences by
        tokenization and dictionary look-up. Note that this method removes all
        sentences which have been previously added to the data set.

        Example (single sentence):
            token_sequence = [123, 234, 345]
            dict.eos_idx = 2
            dialect_id = 10
            Result:
                reverse_order=False, append_eos=True, prepend_language_id=True:
                    [10, 123, 234, 345, 2]
                reverse_order=False, append_eos=True, prepend_language_id=False:
                    [123, 234, 345, 2, 10]
                reverse_order=True, append_eos=True, prepend_language_id=True:
                    [10, 345, 234, 123, 2]
                reverse_order=True, append_eos=True, prepend_language_id=False:
                    [345, 234, 123, 2, 10]

        Args:
            corpora: List of MultilingualCorpusConfig. If dialect_id is not
                None, it is added to the token sequence.
            reverse_order (bool): Whether to reverse the integer token sequence.
            append_eos (bool): Whether to add the end-of-sentence symbol to each
                sentence.
            prepend_language_id (bool): Only used if dialect_id is not None. If
                true, add the ID at the beginning of the token sequence;
                otherwise, add it at the end of the token sequence.
            already_numberized (bool): If data_file contains lines of
                numberized tokens, then already_numberized should be set to
                True. If data_file contains raw text sentences, then
                already_numberized should be False (default), in which case
                each line is tokenized with the tokenizer and then numberized
                with the dictionary before being added to the output buffer.

        """
        array_list = []
        offsets = [0]
        sizes = []
        print(corpora)
        for corpus_config in corpora:
            print(corpus_config)
            print(corpus_config.data_file)
            prepend_inds = []
            append_inds = []
            if append_eos:
                append_inds.append(corpus_config.dict.eos_index)
            if corpus_config.dialect_id is not None:
                if prepend_language_id:
                    prepend_inds.append(corpus_config.dialect_id)
                else:
                    append_inds.append(corpus_config.dialect_id)
            with open(corpus_config.data_file, "r") as f:
                for line in f:
                    if already_numberized:
                        inds = line.strip().split()
                        inds = [int(ind) for ind in inds]
                    else:
                        words = tokenizer.tokenize_line(line)
                        inds = [corpus_config.dict.index(w) for w in words]

                    if reverse_order:
                        inds.reverse()
                    inds = prepend_inds + inds + append_inds
                    for _ in range(corpus_config.oversampling):
                        array_list.append(np.array(inds, dtype=np.int32))
                        offsets.append(offsets[-1] + len(inds))
                        sizes.append(len(inds))

        self.buffer = np.concatenate(array_list)
        self.offsets = np.array(offsets, dtype=np.int64)
        self.sizes = np.array(sizes, dtype=np.int32)
        del array_list
        del offsets
        del sizes
Example #26
    def parse_multilingual(self,
                           corpora,
                           reverse_order=False,
                           append_eos=False,
                           prepend_language_id=True):
        """Add sentences from text files to the dataset.

        This method reads pairs of text files containing source and target
        sides of a bitext. Sentences are converted to integer sequences by
        tokenization and dictionary look-up. Note that this method removes all
        sentences which have been previously added to the data set.

        Example (single sentence):
            token_sequence = [123, 234, 345]
            dict.eos_idx = 2
            dialect_id = 10
            Result:
                reverse_order=False, append_eos=True, prepend_language_id=True:
                    [10, 123, 234, 345, 2]
                reverse_order=False, append_eos=True, prepend_language_id=False:
                    [123, 234, 345, 2, 10]
                reverse_order=True, append_eos=True, prepend_language_id=True:
                    [10, 345, 234, 123, 2]
                reverse_order=True, append_eos=True, prepend_language_id=False:
                    [345, 234, 123, 2, 10]

        Args:
            corpora: List of MultilingualCorpusConfig. If dialect_id is not
                None, it is added to the token sequence.
            reverse_order (bool): Whether to reverse the integer token sequence.
            append_eos (bool): Whether to add the end-of-sentence symbol to each
                sentence.
            prepend_language_id (bool): Only used if dialect_id is not None. If
                true, add the ID at the beginning of the token sequence;
                otherwise, add it at the end of the token sequence.

        """
        array_list = []
        offsets = [0]
        sizes = []
        for corpus_config in corpora:
            prepend_inds = []
            append_inds = []
            if append_eos:
                append_inds.append(corpus_config.dict.eos_index)
            if corpus_config.dialect_id is not None:
                if prepend_language_id:
                    prepend_inds.append(corpus_config.dialect_id)
                else:
                    append_inds.append(corpus_config.dialect_id)
            with open(corpus_config.data_file, "r") as f:
                for line in f:
                    words = tokenizer.tokenize_line(line)
                    if reverse_order:
                        words.reverse()
                    inds = (prepend_inds +
                            [corpus_config.dict.index(w)
                             for w in words] + append_inds)
                    for _ in range(corpus_config.oversampling):
                        array_list.append(np.array(inds, dtype=np.int32))
                        offsets.append(offsets[-1] + len(inds))
                        sizes.append(len(inds))

        # +1 for Lua compatibility
        self.buffer = np.concatenate(array_list) + 1
        self.offsets = np.array(offsets, dtype=np.int32)
        self.sizes = np.array(sizes, dtype=np.int32)
        del array_list
        del offsets
        del sizes
Example #27
def main(args):
    import_user_module(args)

    print(args)

    os.makedirs(args.destdir, exist_ok=True)
    target = not args.only_source

    task = tasks.get_task(args.task)

    def train_path(lang):
        return "{}{}".format(args.trainpref, ("." + lang) if lang else "")

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args.destdir, file_name(prefix, lang))

    def dict_path(lang):
        return dest_path("dict", lang) + ".txt"

    def build_dictionary(filenames, src=False, tgt=False):
        assert src ^ tgt
        return task.build_dictionary(
            filenames,
            workers=args.workers,
            threshold=args.thresholdsrc if src else args.thresholdtgt,
            nwords=args.nwordssrc if src else args.nwordstgt,
            padding_factor=args.padding_factor,
        )

    if not args.srcdict and os.path.exists(dict_path(args.source_lang)):
        raise FileExistsError(dict_path(args.source_lang))
    if target and not args.tgtdict and os.path.exists(dict_path(args.target_lang)):
        raise FileExistsError(dict_path(args.target_lang))

    if args.copy_ext_dict:
        assert args.joined_dictionary, \
            "--joined-dictionary must be set if --copy-extended-dictionary is specified"
        assert args.workers == 1, \
            "--workers must be set to 1 if --copy-extended-dictionary is specified"

    if args.joined_dictionary:
        assert not args.srcdict or not args.tgtdict, \
            "cannot use both --srcdict and --tgtdict with --joined-dictionary"

        if args.srcdict:
            src_dict = task.load_dictionary(args.srcdict)
        elif args.tgtdict:
            src_dict = task.load_dictionary(args.tgtdict)
        else:
            assert args.trainpref, "--trainpref must be set if --srcdict is not specified"
            src_dict = build_dictionary(
                {train_path(lang) for lang in [args.source_lang, args.target_lang]}, src=True
            )
        tgt_dict = src_dict
    else:
        if args.srcdict:
            src_dict = task.load_dictionary(args.srcdict)
        else:
            assert args.trainpref, "--trainpref must be set if --srcdict is not specified"
            src_dict = build_dictionary([train_path(args.source_lang)], src=True)

        if target:
            if args.tgtdict:
                tgt_dict = task.load_dictionary(args.tgtdict)
            else:
                assert args.trainpref, "--trainpref must be set if --tgtdict is not specified"
                tgt_dict = build_dictionary([train_path(args.target_lang)], tgt=True)
        else:
            tgt_dict = None

    src_dict.save(dict_path(args.source_lang))
    if target and tgt_dict is not None:
        tgt_dict.save(dict_path(args.target_lang))

    def make_binary_dataset(vocab, input_prefix, output_prefix, lang, num_workers, copy_src_words=None):
        print("| [{}] Dictionary: {} types".format(lang, len(vocab) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()
        copied = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            copyied.update(worker_result["copied"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        input_file = "{}{}".format(
            input_prefix, ("." + lang) if lang is not None else ""
        )
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:  # TODO: copying is not supported with multiple workers
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                pool.apply_async(
                    binarize,
                    (
                        args,
                        input_file,
                        vocab,
                        prefix,
                        lang,
                        offsets[worker_id],
                        offsets[worker_id + 1]
                    ),
                    callback=merge_result  
                )
            pool.close()

        ds = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_file(args, output_prefix, lang, "bin")
        )
        words_list = []

        def binarize_consumer(ids, words):
            ds.add_item(ids)
            words_list.append(words)

        merge_result(
            Binarizer.binarize(
                input_file, vocab, binarize_consumer,
                offset=0, end=offsets[1], copy_ext_dict=args.copy_ext_dict, copy_src_words=copy_src_words
            )
        )
        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                temp_file_path = dataset_dest_prefix(args, prefix, lang)
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))

        print(
            "| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}, {:.3}% <unk> copied from src".format(
                lang,
                input_file,
                n_seq_tok[0],
                n_seq_tok[1],
                100 * sum(replaced.values()) / n_seq_tok[1],
                vocab.unk_word,
                100 * sum(copied.values()) / n_seq_tok[1]
            )
        )

        return words_list

    def make_dataset(vocab, input_prefix, output_prefix, lang, num_workers=1, copy_src_words=None):
        if args.output_format == "binary":
            return make_binary_dataset(vocab, input_prefix, output_prefix, lang, num_workers, copy_src_words)
        elif args.output_format == "raw":
            # Copy original text file to destination folder
            output_text_file = dest_path(
                output_prefix + ".{}-{}".format(args.source_lang, args.target_lang),
                lang,
            )
            shutil.copyfile(file_name(input_prefix, lang), output_text_file)

            return None

    def make_all(lang, vocab, source_words_list_dict=defaultdict(lambda: None)):
        words_list_dict = defaultdict(lambda: None)

        if args.trainpref:
            words_list_dict["train"] = \
                make_dataset(vocab, args.trainpref, "train", lang, num_workers=args.workers,
                             copy_src_words=source_words_list_dict['train'])
        if args.validpref:
            for k, validpref in enumerate(args.validpref.split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                words_list_dict["valid"] = \
                    make_dataset(vocab, validpref, outprefix, lang, copy_src_words=source_words_list_dict['valid'])
        if args.testpref:
            for k, testpref in enumerate(args.testpref.split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                words_list_dict["test"] = \
                    make_dataset(vocab, testpref, outprefix, lang, copy_src_words=source_words_list_dict['test'])

        return words_list_dict

    source_words_list_dict = make_all(args.source_lang, src_dict)
    if target:
        target_words_list_dict = make_all(args.target_lang, tgt_dict, source_words_list_dict)

    print("| Wrote preprocessed data to {}".format(args.destdir))

    if False:  # original --alignfile handling, intentionally disabled; the active version follows below
        assert args.trainpref, "--trainpref must be set if --alignfile is specified"
        src_file_name = train_path(args.source_lang)
        tgt_file_name = train_path(args.target_lang)
        freq_map = {}
        with open(args.alignfile, "r", encoding='utf-8') as align_file:
            with open(src_file_name, "r", encoding='utf-8') as src_file:
                with open(tgt_file_name, "r", encoding='utf-8') as tgt_file:
                    for a, s, t in zip_longest(align_file, src_file, tgt_file):
                        si = src_dict.encode_line(s, add_if_not_exist=False)
                        ti = tgt_dict.encode_line(t, add_if_not_exist=False)
                        ai = list(map(lambda x: tuple(x.split("-")), a.split()))
                        for sai, tai in ai:
                            srcidx = si[int(sai)]
                            tgtidx = ti[int(tai)]
                            if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk():
                                assert srcidx != src_dict.pad()
                                assert srcidx != src_dict.eos()
                                assert tgtidx != tgt_dict.pad()
                                assert tgtidx != tgt_dict.eos()

                                if srcidx not in freq_map:
                                    freq_map[srcidx] = {}
                                if tgtidx not in freq_map[srcidx]:
                                    freq_map[srcidx][tgtidx] = 1
                                else:
                                    freq_map[srcidx][tgtidx] += 1

        align_dict = {}
        for srcidx in freq_map.keys():
            align_dict[srcidx] = max(freq_map[srcidx], key=freq_map[srcidx].get)

        with open(
                os.path.join(
                    args.destdir,
                    "alignment.{}-{}.txt".format(args.source_lang, args.target_lang),
                ),
                "w", encoding='utf-8'
        ) as f:
            for k, v in align_dict.items():
                print("{} {}".format(src_dict[k], tgt_dict[v]), file=f)


    if args.alignfile:
        from fairseq.tokenizer import tokenize_line
        import numpy as np
        assert args.trainpref, "--trainpref must be set if --alignfile is specified"
        src_file_name = train_path(args.source_lang)
        tgt_file_name = train_path(args.target_lang)
        src_labels_list = []
        tgt_labels_list = []
        with open(args.alignfile, "r", encoding='utf-8') as align_file:
            with open(src_file_name, "r", encoding='utf-8') as src_file:
                with open(tgt_file_name, "r", encoding='utf-8') as tgt_file:
                    for a, s, t in zip_longest(align_file, src_file, tgt_file):
                        src_words = tokenize_line(s)
                        tgt_words = tokenize_line(t)
                        ai = list(map(lambda x: tuple(x.split("-")), a.split()))
                        src_labels = np.ones(len(src_words), int)
                        tgt_labels = np.ones(len(tgt_words), int)
                        for sai, tai in ai:
                            if int(tai) >= len(tgt_words):
                                print('Bad case:')
                                print(tgt_words)
                                print(ai)
                                continue
                            src_word = src_words[int(sai)]
                            tgt_word = tgt_words[int(tai)]
                            if src_word == tgt_word:
                                src_labels[int(sai)] = 0
                                tgt_labels[int(tai)] = 0
                        src_labels_list.append(src_labels)
                        tgt_labels_list.append(tgt_labels)

        save_label_file(os.path.join(args.destdir, "train.label.{}.txt".format(args.source_lang)), src_labels_list)
        save_label_file(os.path.join(args.destdir, "train.label.{}.txt".format(args.target_lang)), tgt_labels_list)
Example #28
    def process_srl_row_simple(self, srl_row, word_dct: Dictionary):
        qa_pair = srl_row["qa_pair"]

        def get_padded_toks_and_lens(inp, max_l):
            return pad_words(
                word_list=word_dct.encode_line(
                    inp, add_if_not_exist=False, append_eos=False
                ).tolist(),
                max_len=max_l,
                append_eos=True,
                eos_index=word_dct.eos_index,
                pad_index=word_dct.pad_index,
            )

        max_l = 20
        if self.split_type == "train":
            qarg_lemma_out_lst = qa_pair["qarg_lemma"]
            qarg_x = qa_pair["question_type"]
            mapping = {
                "<Q-V>": ["ARG0", "ARG1", "ARG2", "ARGM-LOC"],
                "<Q-ARG0>": ["V", "ARG1", "ARG2", "ARGM-LOC"],
                "<Q-ARG1>": ["V", "ARG0", "ARG2", "ARGM-LOC"],
                "<Q-ARG2>": ["V", "ARG0", "ARG1", "ARGM-LOC"],
                "<Q-ARGM-LOC>": ["V", "ARG0", "ARG1", "ARG2"],
            }
            qphrase_lst = []
            qphrase_ix_lst = []
            for qarg_lemm1_ix, qarg_lemm1 in enumerate(qarg_lemma_out_lst):
                if qarg_x == qarg_lemm1[0]:
                    qphrase_lst.append(qarg_lemm1[3])
                    qphrase_ix_lst.append(qarg_lemm1_ix)
                else:
                    if (
                        qarg_lemm1[0]
                        in mapping[qarg_x]
                        # and len(qphrase_lst) < 3
                    ):
                        qphrase_lst.append(qarg_lemm1[3])
                        qphrase_ix_lst.append(qarg_lemm1_ix)

            if len(qphrase_lst) < 3:
                question = qa_pair["question"]
                qphrase_ix_lst = [ix for ix in range(len(qarg_lemma_out_lst))]
            else:
                question = " ".join(qphrase_lst)
                qphrase_ix_lst = qphrase_ix_lst
        else:
            question = qa_pair["question"]
            qphrase_ix_lst = qa_pair["qphrase_ix_lst"]

        question_toks, question_tok_lens = get_padded_toks_and_lens(
            question, max_l=max_l
        )

        answer_toks, answer_tok_lens = get_padded_toks_and_lens(
            qa_pair["answer"], max_l=max_l
        )
        question_type = qa_pair["question_type"]

        out_dct = {
            "question_toks": torch.tensor(question_toks).long(),
            "question_tok_len": torch.tensor(question_tok_lens).long(),
            "question_type": torch.tensor(word_dct.indices[question_type]).long(),
            "answer_toks": torch.tensor(answer_toks).long(),
            "answer_tok_lens": torch.tensor(answer_tok_lens).long(),
        }

        if self.cfg.mdl.use_phr_clf:
            ans = qa_pair["answer"]
            if ans in self.comm.awvoc.indices:
                answer_tok1 = self.comm.awvoc.indices[ans]
            else:
                answer_tok1 = self.comm.awvoc.unk_index
            aeos_ind = self.comm.awvoc.eos_index
            out_dct["answer_clf"] = torch.tensor([answer_tok1, aeos_ind]).long()
            out_dct["answer_clf_lens"] = torch.tensor(2).long()

        if self.cfg.mdl.use_srl_bounds:
            # only for VOGNET-QAP
            question_srl_bounds = []
            cur_ix = 0
            num_srls_max = 5
            num_box_per_srl = 4
            if self.cfg.ds_name == "anet":
                vid_seg_gt_box = srl_row["gt_bboxes"]
                vid_seg_gt_frms = srl_row["gt_frms"]
            gt_bbox_lst = []

            req_cls_pats_mask = srl_row["req_cls_pats_mask"]
            to_break = False
            for qarg_le1_ix in qphrase_ix_lst:
                qarg_lemma1 = qa_pair["qarg_lemma"][qarg_le1_ix]
                tok_str = qarg_lemma1[3]
                assert isinstance(tok_str, str)
                tok_out = tokenize_line(tok_str)
                tok_len = len(tok_out)
                en_ix = cur_ix + tok_len - 1
                is_groundable = qarg_lemma1[2]
                if is_groundable and self.cfg.ds_name == "anet":
                    gt_info = req_cls_pats_mask[qarg_le1_ix]
                    assert gt_info[1] == 1
                    gbox_frm = []
                    for z in gt_info[2]:
                        gbx = copy.deepcopy(vid_seg_gt_box[z])
                        gfrm = copy.deepcopy(vid_seg_gt_frms[z])
                        gbx.append(gfrm)
                        gbox_frm += [gbx]
                else:
                    gbox_frm = [[0] * 5] * num_box_per_srl

                if len(gbox_frm) < num_box_per_srl:
                    gbox_frm += [[0] * 5] * (num_box_per_srl - len(gbox_frm))
                else:
                    gbox_frm = gbox_frm[:num_box_per_srl]
                gt_bbox_lst.append(gbox_frm)
                if en_ix < max_l - 1:
                    question_srl_bounds.append([cur_ix, en_ix])
                else:
                    question_srl_bounds.append([cur_ix, max_l - 1])
                    to_break = True
                if to_break:
                    break
                cur_ix += tok_len
            num_srls_used = min(len(question_srl_bounds), num_srls_max)

            if len(question_srl_bounds) > num_srls_max:
                question_srl_bounds = question_srl_bounds[:num_srls_max]
                gt_bbox_lst = gt_bbox_lst[:num_srls_max]
            else:
                to_add_srls = num_srls_max - len(question_srl_bounds)
                question_srl_bounds += [[0, 0]] * to_add_srls
                gt_bbox_lst += [[[0] * 5] * num_box_per_srl] * to_add_srls

            assert len(question_srl_bounds) == num_srls_max

            out_dct["question_srl_bounds_idxs"] = torch.tensor(
                question_srl_bounds
            ).long()
            out_dct["num_srls_used_msk"] = torch.tensor(
                [1] * num_srls_used + [0] * (num_srls_max - num_srls_used)
            ).long()
            out_dct["num_srls_used"] = torch.tensor(num_srls_used).long()
            out_dct["gt_bbox_for_srls"] = torch.tensor(gt_bbox_lst).float()
            out_dct["gt_bbox_for_srls_msk"] = (
                out_dct["gt_bbox_for_srls"].sum(dim=-1).ne(0)
            )
        return out_dct
Example #29
def main():
    args = argument_parsing()

    sentences = read_sentences(args.in_tokenized_sentences)
    split_sentences = []
    for sentence in sentences:
        split_sentences.append(tokenize_line(sentence))
    print(len(split_sentences))

    bad_unicode = open(args.output_file, 'w')

    def load_roberta(name=None, roberta_cache_path=None):
        if not roberta_cache_path:
            roberta = torch.hub.load('pytorch/fairseq', name)
        else:
            roberta = RobertaModel.from_pretrained(roberta_cache_path,
                                                   checkpoint_file='model.pt')

        roberta.eval()
        if torch.cuda.is_available():
            roberta.cuda()
        return roberta

    def get_wordpiece_to_word_map(sentence, roberta_bpe, raise_error):
        # Get word and wordpiece tokens according to RoBERTa
        # sentence = sentence.replace(u'\x91', u' ')
        # sentence = sentence.replace(u'\x96', u' ')
        word_tokens = sentence.split()
        wordpiece_tokens = [
            roberta_bpe.decode(wordpiece)
            for wordpiece in roberta_bpe.encode(sentence).split()
        ]
        #print("wp_tokens: ", wordpiece_tokens)

        assert len(word_tokens) <= len(wordpiece_tokens)
        assert isinstance(word_tokens, list)
        assert isinstance(wordpiece_tokens, list)
        w_index = 0
        word_to_wordpiece = []
        subword_sequence = []
        bad_unicode_flag = 0

        for wp_index in range(len(wordpiece_tokens)):
            if w_index in range(len(word_tokens)):
                word = word_tokens[w_index]
                if word == wordpiece_tokens[wp_index]:
                    word_to_wordpiece.append(wp_index)
                    w_index += 1
                else:
                    subword_sequence.append(wp_index)
                    word_from_pieces = "".join([
                        # NOTE: Facebook's BPE signals start-of-word with whitespace
                        wordpiece_tokens[i].lstrip() for i in subword_sequence
                    ])
                    if word == word_from_pieces:
                        word_to_wordpiece.append(subword_sequence)
                        w_index += 1
                        subword_sequence = []
                    elif word_from_pieces not in word:
                        word_to_wordpiece.append(subword_sequence)
                        w_index += 1
                        subword_sequence = []
                        bad_unicode_flag = 1

        if bad_unicode_flag == 1:
            bad_unicode.write(sentence)
            wp = " ".join(wordpiece_tokens)
            print("\n\nsentence: ", sentence)
            print("wp: ", wp)
            print("\n")
            bad_unicode.write("\n")
            bad_unicode.write(wp)
            bad_unicode.write("\n\n")
            if raise_error:
                raise Exception('Unicode splitting failed')

        return word_to_wordpiece

    def check_wordpiece_to_word_map(input_file, raise_error):
        num_sents = 0
        with open(input_file, 'r') as fid:
            for sentence in tqdm(fid):
                if not sentence:
                    break
                sentence = " ".join(tokenize_line(str(sentence.rstrip())))
                #print("input: ", sentence)
                word2piece = get_wordpiece_to_word_map(sentence, roberta.bpe,
                                                       raise_error)

    roberta = load_roberta(name=args.pretrained_embed)
    check_wordpiece_to_word_map(args.in_tokenized_sentences, args.raise_error)