Example #1
0
class Environment(object):
    """Initialize the training environment.

    Sets up logging, random seeds, the compute place, output directories,
    and the data fields — built from the training corpus on the first run
    (or when ``args.preprocess`` is set), otherwise loaded from the pickled
    ``args.fields_path`` — then records derived vocabulary sizes and
    special-token indices back into ``args``.
    """
    def __init__(self, args):
        # `args` behaves like a dict-backed namespace: attribute access plus
        # item assignment and .update() — TODO confirm against the caller.
        self.args = args
        # init log
        if args.log_path:
            utils.init_log(args.log_path, args.local_rank, args.log_level)
        # init seed for reproducibility
        fluid.default_main_program().random_seed = args.seed
        np.random.seed(args.seed)
        # init place (device string consumed downstream)
        if args.use_cuda:
            self.place = "gpu"

        else:
            self.place = "cpu"

        os.environ["FLAGS_paddle_num_threads"] = str(args.threads)
        # exist_ok=True avoids the check-then-create race of the original
        # `if not os.path.exists(...): os.makedirs(...)` pattern.
        os.makedirs(self.args.model_files, exist_ok=True)
        if not os.path.exists(args.fields_path) or args.preprocess:
            logging.info("Preprocess the data")
            if args.encoding_model in [
                    "ernie-1.0", "ernie-tiny", "ernie-lstm"
            ]:
                # ERNIE encoders bring their own tokenizer and vocabulary.
                tokenizer = ErnieTokenizer.from_pretrained(args.encoding_model)
                args["ernie_vocabs_size"] = len(tokenizer.vocab)
                self.WORD = ErnieField(
                    "word",
                    pad=tokenizer.pad_token,
                    unk=tokenizer.unk_token,
                    bos=tokenizer.cls_token,
                    eos=tokenizer.sep_token,
                    fix_len=args.fix_len,
                    tokenizer=tokenizer,
                )
                self.WORD.vocab = tokenizer.vocab
                # ERNIE models use no auxiliary feature field.
                args.feat = None
            else:
                self.WORD = Field(
                    "word",
                    pad=utils.pad,
                    unk=utils.unk,
                    bos=utils.bos,
                    eos=utils.eos,
                    lower=True,
                )
            if args.feat == "char":
                # character-level subword features
                self.FEAT = SubwordField(
                    "chars",
                    pad=utils.pad,
                    unk=utils.unk,
                    bos=utils.bos,
                    eos=utils.eos,
                    fix_len=args.fix_len,
                    tokenize=list,
                )
            elif args.feat == "pos":
                self.FEAT = Field("postag", bos=utils.bos, eos=utils.eos)
            else:
                self.FEAT = None
            # head indices need no vocab; fn numericalizes them to ints
            self.ARC = Field(
                "head",
                bos=utils.bos,
                eos=utils.eos,
                use_vocab=False,
                fn=utils.numericalize,
            )
            self.REL = Field("deprel", bos=utils.bos, eos=utils.eos)
            if args.feat == "char":
                # char features share the FORM column with words
                self.fields = CoNLL(FORM=(self.WORD, self.FEAT),
                                    HEAD=self.ARC,
                                    DEPREL=self.REL)
            else:
                self.fields = CoNLL(FORM=self.WORD,
                                    CPOS=self.FEAT,
                                    HEAD=self.ARC,
                                    DEPREL=self.REL)

            train = Corpus.load(args.train_data_path, self.fields)

            if not args.encoding_model.startswith("ernie"):
                self.WORD.build(train, args.min_freq)
                # BUGFIX: FEAT is None when args.feat is neither "char" nor
                # "pos"; building it unconditionally raised AttributeError.
                if self.FEAT is not None:
                    self.FEAT.build(train)

            self.REL.build(train)
            if args.local_rank == 0:
                # only rank 0 writes, to avoid concurrent writers
                with open(args.fields_path, "wb") as f:
                    # typo "fileds" fixed in the log message
                    logging.info("dumping fields to disk.")
                    pickle.dump(self.fields, f, protocol=2)
        else:
            logging.info("loading the fields.")
            with open(args.fields_path, "rb") as f:
                self.fields = pickle.load(f)

            # a tuple FORM means word+char fields were bundled together
            if isinstance(self.fields.FORM, tuple):
                self.WORD, self.FEAT = self.fields.FORM
            else:
                self.WORD, self.FEAT = self.fields.FORM, self.fields.CPOS
            self.ARC, self.REL = self.fields.HEAD, self.fields.DEPREL

        # ERNIE vocabs map token->id directly; Field vocabs expose .stoi
        if args.encoding_model.startswith("ernie"):
            vocab_items = self.WORD.vocab.items()
        else:
            vocab_items = self.WORD.vocab.stoi.items()

        # indices of punctuation tokens — presumably used to mask punctuation
        # during evaluation; confirm against the evaluator
        self.puncts = np.array([i for s, i in vocab_items if utils.ispunct(s)],
                               dtype=np.int64)

        # record vocab sizes and special-token indices for model construction;
        # the `FEAT and ...` idiom yields None when FEAT is absent
        self.args.update({
            "n_words": len(self.WORD.vocab),
            "n_feats": self.FEAT and len(self.FEAT.vocab),
            "n_rels": len(self.REL.vocab),
            "pad_index": self.WORD.pad_index,
            "unk_index": self.WORD.unk_index,
            "bos_index": self.WORD.bos_index,
            "eos_index": self.WORD.eos_index,
            "feat_pad_index": self.FEAT and self.FEAT.pad_index,
        })
Example #2
0
class Environment(object):
    """Initialize the training environment.

    Sets up logging, random seeds, the Paddle execution place, output
    directories, and the data fields — built from the training corpus on
    the first run (or when ``args.preprocess`` is set), otherwise loaded
    from the pickled ``args.fields_path`` — then records derived vocabulary
    sizes and special-token indices back into ``args``.
    """
    def __init__(self, args):
        # `args` behaves like a dict-backed namespace: attribute access plus
        # item assignment and .update() — TODO confirm against the caller.
        self.args = args
        # init log
        if self.args.log_path:
            utils.init_log(self.args.log_path, self.args.local_rank,
                           self.args.log_level)
        # init seed for reproducibility
        fluid.default_main_program().random_seed = self.args.seed
        np.random.seed(self.args.seed)
        # init place
        if self.args.use_cuda:
            if self.args.use_data_parallel:
                # one GPU per process, selected by the parallel env's dev_id
                self.place = fluid.CUDAPlace(
                    fluid.dygraph.parallel.Env().dev_id)
            else:
                self.place = fluid.CUDAPlace(0)
        else:
            self.place = fluid.CPUPlace()

        os.environ['FLAGS_paddle_num_threads'] = str(self.args.threads)
        os.makedirs(self.args.model_files, exist_ok=True)

        # Build fields from scratch when no cache exists or a re-preprocess
        # was explicitly requested; otherwise load the pickled fields below.
        if not os.path.exists(self.args.fields_path) or self.args.preprocess:
            logging.info("Preprocess the data")
            self.WORD = Field('word',
                              pad=utils.pad,
                              unk=utils.unk,
                              bos=utils.bos,
                              lower=True)
            if self.args.feat == 'char':
                # character-level subword features
                self.FEAT = SubwordField('chars',
                                         pad=utils.pad,
                                         unk=utils.unk,
                                         bos=utils.bos,
                                         fix_len=self.args.fix_len,
                                         tokenize=list)
            else:
                # default: POS-tag features
                self.FEAT = Field('postag', bos=utils.bos)
            # head indices need no vocab; fn numericalizes them to ints
            self.ARC = Field('head',
                             bos=utils.bos,
                             use_vocab=False,
                             fn=utils.numericalize)
            self.REL = Field('deprel', bos=utils.bos)
            if self.args.feat == 'char':
                # char features share the FORM column with words
                self.fields = CoNLL(FORM=(self.WORD, self.FEAT),
                                    HEAD=self.ARC,
                                    DEPREL=self.REL)
            else:
                self.fields = CoNLL(FORM=self.WORD,
                                    CPOS=self.FEAT,
                                    HEAD=self.ARC,
                                    DEPREL=self.REL)

            train = Corpus.load(self.args.train_data_path, self.fields)
            if self.args.pretrained_embedding_dir:
                logging.info("loading pretrained embedding from file.")
                embed = Embedding.load(self.args.pretrained_embedding_dir,
                                       self.args.unk)
            else:
                embed = None
            # build vocabularies from the training corpus
            self.WORD.build(train, self.args.min_freq, embed)
            self.FEAT.build(train)
            self.REL.build(train)
            if self.args.local_rank == 0:
                # only rank 0 writes, to avoid concurrent writers
                with open(self.args.fields_path, "wb") as f:
                    logging.info("dumping fileds to disk.")
                    pickle.dump(self.fields, f, protocol=2)
        else:
            logging.info("loading the fields.")
            with open(self.args.fields_path, "rb") as f:
                self.fields = pickle.load(f)

            # a tuple FORM means word+char fields were bundled together
            if isinstance(self.fields.FORM, tuple):
                self.WORD, self.FEAT = self.fields.FORM
            else:
                self.WORD, self.FEAT = self.fields.FORM, self.fields.CPOS
            self.ARC, self.REL = self.fields.HEAD, self.fields.DEPREL
        # indices of punctuation tokens — presumably used to mask punctuation
        # during evaluation; confirm against the evaluator
        self.puncts = np.array(
            [i for s, i in self.WORD.vocab.stoi.items() if utils.ispunct(s)],
            dtype=np.int64)

        if self.WORD.embed is not None:
            self.args["pretrained_embed_shape"] = self.WORD.embed.shape
        else:
            self.args["pretrained_embed_shape"] = None

        # record vocab sizes and special-token indices for model construction
        self.args.update({
            'n_words': self.WORD.vocab.n_init,
            'n_feats': len(self.FEAT.vocab),
            'n_rels': len(self.REL.vocab),
            'pad_index': self.WORD.pad_index,
            'unk_index': self.WORD.unk_index,
            'bos_index': self.WORD.bos_index,
            'feat_pad_index': self.FEAT.pad_index
        })