def test_tokenization_bert(self):
        # Given
        self.base_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                                            do_lower_case=True,
                                                            cache_dir=self.test_dir)
        self.rust_tokenizer = PyBertTokenizer(
            get_from_cache(self.base_tokenizer.pretrained_vocab_files_map['vocab_file']['bert-base-uncased']),
            do_lower_case=True)
        output_baseline = []
        for example in self.examples:
            output_baseline.append(self.base_tokenizer.encode_plus(example.text_a,
                                                                   text_pair=example.text_b,
                                                                   add_special_tokens=True,
                                                                   return_overflowing_tokens=True,
                                                                   return_special_tokens_mask=True,
                                                                   max_length=128))

        # When
        output_rust = self.rust_tokenizer.encode_pair_list(
            [(example.text_a, example.text_b) for example in self.examples],
            max_len=128,
            truncation_strategy='longest_first',
            stride=0)

        # Then
        for idx, (rust, baseline) in enumerate(zip(output_rust, output_baseline)):
            assert rust.token_ids == baseline['input_ids'], \
                f'Difference in tokenization for {self.rust_tokenizer.__class__}: \n' \
                f'Sentence a: {self.examples[idx].text_a} \n' \
                f'Sentence b: {self.examples[idx].text_b} \n' \
                f'Token mismatch: {self.get_token_diff(rust.token_ids, baseline["input_ids"])} \n' \
                f'Rust: {rust.token_ids} \n' \
                f'Python: {baseline["input_ids"]}'
            assert rust.segment_ids == baseline['token_type_ids']
            assert rust.special_tokens_mask == baseline['special_tokens_mask']

    def setup_class(self):
        self.use_gpu = torch.cuda.is_available()
        self.test_dir = Path(tempfile.mkdtemp())

        self.base_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True,
                                                            cache_dir=self.test_dir)
        self.rust_tokenizer = PyBertTokenizer(
            get_from_cache(self.base_tokenizer.pretrained_vocab_files_map['vocab_file']['bert-base-uncased']))
        self.model = BertForSequenceClassification.from_pretrained('bert-base-uncased', output_attentions=False).eval()
        if self.use_gpu:
            self.model.cuda()
        self.sentence_list = ['For instance, on the planet Earth, man had always assumed that he was more intelligent '
                              'than dolphins because he had achieved so much—the wheel, New York, wars and so on—whilst'
                              ' all the dolphins had ever done was muck about in the water having a good time. But '
                              'conversely, the dolphins had always believed that they were far more intelligent than '
                              'man—for precisely the same reasons.'] * 64

        # Pre-allocate GPU memory
        tokens_list = [self.base_tokenizer.tokenize(sentence) for sentence in self.sentence_list]
        features = [self.base_tokenizer.convert_tokens_to_ids(tokens) for tokens in tokens_list]
        features = [self.base_tokenizer.prepare_for_model(input_ids, None, add_special_tokens=True, max_length=128)
                    for input_ids in features]
        all_input_ids = torch.tensor([f['input_ids'] for f in features], dtype=torch.long)

        if self.use_gpu:
            all_input_ids = all_input_ids.cuda()

        with torch.no_grad():
            _ = self.model(all_input_ids)[0].cpu().numpy()
Example #3
    def __init__(
        self,
        pretrained_model: str,
        use_starting_offsets: bool = False,
        do_lowercase: bool = True,
        never_lowercase: List[str] = None,
        max_pieces: int = 512,
        truncate_long_sequences: bool = True,
    ) -> None:
        if pretrained_model.endswith("-cased") and do_lowercase:
            logger.warning(
                "Your BERT model appears to be cased, but your indexer is lowercasing tokens."
            )
        elif pretrained_model.endswith("-uncased") and not do_lowercase:
            logger.warning("Your BERT model appears to be uncased, "
                           "but your indexer is not lowercasing tokens.")

        bert_tokenizer = BertTokenizer.from_pretrained(
            pretrained_model, do_lower_case=do_lowercase)
        super().__init__(
            vocab=bert_tokenizer.vocab,
            wordpiece_tokenizer=bert_tokenizer.wordpiece_tokenizer.tokenize,
            namespace="bert",
            use_starting_offsets=use_starting_offsets,
            max_pieces=max_pieces,
            do_lowercase=do_lowercase,
            never_lowercase=never_lowercase,
            start_tokens=["[CLS]"],
            end_tokens=["[SEP]"],
            separator_token="[SEP]",
            truncate_long_sequences=truncate_long_sequences,
        )
Example #4
    def __init__(self, args, device='cpu'):
        print(args.bert_model)
        self.tokenizer = BertTokenizer.from_pretrained(args.bert_model)
        self.data_dir = args.data_dir
        file_list = get_json_file_list(args.data_dir)
        self.data = []
        self.shortt = 0
        self.longg = 0
        #max_article_len = 0
        for file_name in file_list:
            with open(file_name, 'r') as f:
                data = json.load(f)
            data['high'] = 0
            if 'high' in file_name:
                data['high'] = 1
            self.data.append(data)
            #max_article_len = max(max_article_len, len(nltk.word_tokenize(data['article'])))
        self.data_objs = []
        high_cnt = 0
        middle_cnt = 0

        for sample in self.data:
            high_cnt += sample['high']
            middle_cnt += (1 - sample['high'])
            self.data_objs += self._create_sample(sample)
            #print(self.data_objs[-1].ph)
            #break
        print('high school sample:', high_cnt)
        print('middle school sample:', middle_cnt)
        print('<512:', self.shortt)
        print('>512:', self.longg)
        for i in range(len(self.data_objs)):
            self.data_objs[i].convert_tokens_to_ids(self.tokenizer)
            #break

        torch.save(self.data_objs, args.save_name)
Example #5
    def __init__(self) -> None:
        os.environ['CORENLP_HOME'] = '{}/stanford-corenlp-full-2018-10-05'.format(os.environ['HOME'])
        self.client: CoreNLPClient = CoreNLPClient()
        self.client.ensure_alive()
        self.do_lower_case = '-cased' not in config.bert_model
        self.basic_tokenizer: BasicTokenizer = \
            BertTokenizer.from_pretrained(config.bert_model, do_lower_case=self.do_lower_case).basic_tokenizer
Example #6
    def __init__(
        self,
        mappings: dict = None,
        instance_conll_file: str = None,
        debug: bool = None,
        singleton_replacement_ratio: float = 0.0,
        bert_use: bool = False,
        bert_voc_dir: str = None,
        bert_lowercase: bool = False,
        pretrained_use: bool = False,
        char_use: bool = False,
        elmo_use: bool = False,
    ):

        self.mappings = mappings
        self.instance_conll_file = instance_conll_file
        self.debug = debug
        self.singleton_replacement_ratio = singleton_replacement_ratio

        self.bert_use = bert_use
        self.bert_voc_dir = bert_voc_dir
        self.bert_lowercase = bert_lowercase
        self.bert_tokenizer = None

        self.pretrained_use = pretrained_use
        self.char_use = char_use
        self.elmo_use = elmo_use

        if self.bert_use:
            self.bert_tokenizer = BertTokenizer.from_pretrained(
                self.bert_voc_dir, do_lower_case=self.bert_lowercase)

        self.singletons = set(self.extract_singletons())
        self.instances = dict()
        self.load_instances()
示例#7
0
def main():
    # Parse the command-line arguments
    args = parse_argument()

    # Check the random seed
    if args.seed != -1:
        # Fix the seeds of the various random number generators
        random.seed(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)

    # Select the GPU/CPU device used by PyTorch
    device = torch.device("cuda" if torch.cuda.is_available()
                          and not args.no_cuda else "cpu")

    if device.type != "cpu":
        # Set the random seed on the GPU as well
        torch.cuda.manual_seed_all(args.seed)

    # Load the tokenizer of the pretrained BERT model
    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=False,
                                              tokenize_chinese_chars=False)

    if args.do_train:
        # Training
        train(args, tokenizer, device)
    if args.do_generate:
        # Generation
        generate(tokenizer, device, max_iter=args.max_iter,
                 length=args.seq_length, model=args.bert_model,
                 fix_word=args.fix_word, samples=args.samples)
Example #8
    def __init__(self, reader):
        task_cfg = reader
        tokenizer = BertTokenizer.from_pretrained(
            task_cfg.bert_model, do_lower_case=task_cfg.do_lower_case)

        task_feature_reader1 = {}
        task_feature_reader2 = {}
        self.task = []
        self._limit_sample_nums = task_cfg.get('limit_nums', None)
        is_train = task_cfg.get('is_train', False)

        ids = task_cfg.tasks.split('-')
        for i, task_id in enumerate(ids):
            task = 'TASK' + task_id
            self.task.append(task)
            cfg = task_cfg.TASKS[task]
            if cfg.features_h5path1 not in task_feature_reader1:
                task_feature_reader1[cfg.features_h5path1] = None
            if cfg.features_h5path2 not in task_feature_reader2:
                task_feature_reader2[cfg.features_h5path2] = None

        # initialize the feature readers
        for features_h5path in task_feature_reader1.keys():
            if features_h5path != '':
                task_feature_reader1[features_h5path] = ImageFeaturesH5Reader(
                    features_h5path, task_cfg.in_memory)
        for features_h5path in task_feature_reader2.keys():
            if features_h5path != '':
                task_feature_reader2[features_h5path] = ImageFeaturesH5Reader(
                    features_h5path, task_cfg.in_memory)

        self.task_datasets = {}
        # only one task now
        for i, task_id in enumerate(ids):
            task = 'TASK' + task_id
            cfg = task_cfg.TASKS[task]
            task_name = cfg.name

            if is_train:
                split = cfg.train_split
                annotations_jsonpath = cfg.train_annotations_jsonpath
            else:
                split = cfg.val_split
                annotations_jsonpath = cfg.val_annotations_jsonpath

            self.task_datasets[task] = DatasetMapTrain[task_name](
                task=cfg.name,
                dataroot=cfg.dataroot,
                annotations_jsonpath=annotations_jsonpath,
                split=split,
                image_features_reader=task_feature_reader1[
                    cfg.features_h5path1],
                gt_image_features_reader=task_feature_reader2[
                    cfg.features_h5path2],
                tokenizer=tokenizer,
                bert_model=task_cfg.bert_model,
                clean_datasets=task_cfg.clean_datasets,
                padding_index=0,
                max_seq_length=cfg.max_seq_length,
                max_region_num=cfg.max_region_num)
Example #9
    def test_tokenization_bert(self):
        # Given
        self.base_tokenizer = BertTokenizer.from_pretrained(
            'bert-base-uncased', do_lower_case=True, cache_dir=self.test_dir)
        self.rust_tokenizer = PyBertTokenizer(
            get_from_cache(
                self.base_tokenizer.pretrained_vocab_files_map['vocab_file']
                ['bert-base-uncased']))
        output_baseline = []
        for example in self.examples:
            output_baseline.append(
                self.base_tokenizer.encode_plus(
                    example.text_a,
                    add_special_tokens=True,
                    return_overflowing_tokens=True,
                    return_special_tokens_mask=True,
                    max_length=128))

        # When
        output_rust = self.rust_tokenizer.encode_list(
            [example.text_a for example in self.examples],
            max_len=128,
            truncation_strategy='longest_first',
            stride=0)

        # Then
        for rust, baseline in zip(output_rust, output_baseline):
            assert rust.token_ids == baseline['input_ids']
            assert rust.segment_ids == baseline['token_type_ids']
            assert rust.special_tokens_mask == baseline['special_tokens_mask']
Example #10
def main():
    # reset the output csv (opening in 'w' mode already truncates the file)
    with open('./output/tachikoma_out.csv', 'w') as fd:
        fd.write('tweet\n')

    args = parse_argument()

    if args.seed != -1:
        random.seed(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available()
                          and not args.no_cuda else "cpu")

    if device != "cpu":
        torch.cuda.manual_seed_all(args.seed)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=False,
                                              tokenize_chinese_chars=False)

    if args.do_train:
        train(args, tokenizer, device)
    if args.do_generate:
        generate(tokenizer, device, max_iter=args.max_iter,
                 length=args.seq_length, model=args.bert_model,
                 fix_word=args.fix_word, samples=args.samples)
Example #11
def load_model(model_name, data_dir):
    processors = {
        "rte": RteProcessor
    }

    output_modes = {
        "rte": "classification"
    }
    # task_name = args.task_name.lower()
    task_name = 'rte'
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]

    label_list = processor.get_labels()  # [0,1]
    num_labels = len(label_list)
    pretrain_model_dir = '{}/FineTuneOn{}'.format(data_dir, model_name)
    # pretrain_model_dir = 'please enter your pretrain models path here/FineTuneOn{}'.format(model_name)
    # Prepare model
    # cache_dir = os.path.join(str(PYTORCH_TRANSFORMERS_CACHE), '{} model distributed_{}'.format(model_name, -1))


    model = BertForSequenceClassification.from_pretrained(pretrain_model_dir, num_labels=num_labels)
    tokenizer = BertTokenizer.from_pretrained(pretrain_model_dir)
    # model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
    #           cache_dir=cache_dir,
    #           num_labels=num_labels)
    # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    # print(tokenizer)
    return model, tokenizer
Example #12
    def load(cls,
             pretrained_model_name_or_path,
             tokenizer_class=None,
             **kwargs):
        """
        Enables loading of different Tokenizer classes with a uniform interface. Either infer the class from
        `pretrained_model_name_or_path` or define it manually via `tokenizer_class`.

        :param pretrained_model_name_or_path:  The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
        :type pretrained_model_name_or_path: str
        :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
        :type tokenizer_class: str
        :param kwargs:
        :return: Tokenizer
        """

        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
        # guess tokenizer type from name
        if tokenizer_class is None:
            if "albert" in pretrained_model_name_or_path.lower():
                tokenizer_class = "AlbertTokenizer"
            elif "xlm-roberta" in pretrained_model_name_or_path.lower():
                tokenizer_class = "XLMRobertaTokenizer"
            elif "roberta" in pretrained_model_name_or_path.lower():
                tokenizer_class = "RobertaTokenizer"
            elif "distilbert" in pretrained_model_name_or_path.lower():
                tokenizer_class = "DistilBertTokenizer"
            elif "bert" in pretrained_model_name_or_path.lower():
                tokenizer_class = "BertTokenizer"
            elif "xlnet" in pretrained_model_name_or_path.lower():
                tokenizer_class = "XLNetTokenizer"
            else:
                raise ValueError(
                    f"Could not infer tokenizer_class from name '{pretrained_model_name_or_path}'. Set arg `tokenizer_class` in Tokenizer.load() to one of: 'AlbertTokenizer', 'XLMRobertaTokenizer', 'RobertaTokenizer', 'DistilBertTokenizer', 'BertTokenizer', 'XLNetTokenizer'"
                )
            logger.info(f"Loading tokenizer of type '{tokenizer_class}'")
        # return the appropriate tokenizer object
        ret = None
        if tokenizer_class == "AlbertTokenizer":
            ret = AlbertTokenizer.from_pretrained(
                pretrained_model_name_or_path, keep_accents=True, **kwargs)
        elif tokenizer_class == "XLMRobertaTokenizer":
            ret = XLMRobertaTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "RobertaTokenizer":
            ret = RobertaTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "DistilBertTokenizer":
            ret = DistilBertTokenizer.from_pretrained(
                pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "BertTokenizer":
            ret = BertTokenizer.from_pretrained(pretrained_model_name_or_path,
                                                **kwargs)
        elif tokenizer_class == "XLNetTokenizer":
            ret = XLNetTokenizer.from_pretrained(pretrained_model_name_or_path,
                                                 keep_accents=True,
                                                 **kwargs)
        if ret is None:
            raise Exception("Unable to load tokenizer")
        else:
            return ret
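
For reference, a minimal usage sketch of the uniform loading interface documented above (assuming `Tokenizer` is the class that defines `load`; the model names are standard Hugging Face identifiers, and the local path is made up):

# tokenizer class inferred from the name: BertTokenizer
bert_tok = Tokenizer.load("bert-base-uncased", do_lower_case=True)
# tokenizer class inferred from the name: XLMRobertaTokenizer
xlmr_tok = Tokenizer.load("xlm-roberta-base")
# for a local checkpoint whose path reveals nothing, set the class explicitly
custom_tok = Tokenizer.load("/path/to/checkpoint", tokenizer_class="BertTokenizer")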
Example #13
def main():
    args = parse_argument()

    if args.seed != -1:
        random.seed(args.seed)
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)

    device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")

    if device != "cpu":
        torch.cuda.manual_seed_all(args.seed)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=False,
                                              tokenize_chinese_chars=False)

    if args.do_train:
        train(args, tokenizer, device)
    if args.do_generate:
        generate(tokenizer,
                 device,
                 max_iter=args.max_iter,
                 length=args.seq_length,
                 model=args.bert_model,
                 fix_word=args.fix_word,
                 samples=args.samples)
Example #14
def main():
    parser = ArgumentParser()
    parser.add_argument('--train_corpus', type=Path, required=False)
    parser.add_argument("--output_dir", type=Path, required=False)
    parser.add_argument("--bert_model", type=str, required=False, default=BERT_PRETRAINED_MODEL,
                        choices=["bert-base-uncased", "bert-large-uncased", "bert-base-cased",
                                 "bert-base-multilingual-uncased", "bert-base-chinese", "bert-base-multilingual-cased"])
    parser.add_argument("--do_lower_case", action="store_true")
    parser.add_argument("--do_whole_word_mask", action="store_true",
                        help="Whether to use whole word masking rather than per-WordPiece masking.")
    parser.add_argument("--reduce_memory", action="store_true",
                        help="Reduce memory usage for large datasets by keeping data on disc rather than in memory")

    parser.add_argument("--num_workers", type=int, default=NUM_CPU,
                        help="The number of workers to use to write the files")
    parser.add_argument("--epochs_to_generate", type=int, default=EPOCHS,
                        help="Number of epochs of data to pregenerate")
    parser.add_argument("--max_seq_len", type=int, default=MAX_SENTIMENT_SEQ_LENGTH)
    parser.add_argument("--short_seq_prob", type=float, default=0.1,
                        help="Probability of making a short sentence as a training example")
    parser.add_argument("--masked_lm_prob", type=float, default=MLM_PROB,
                        help="Probability of masking each token for the LM task")
    parser.add_argument("--max_predictions_per_seq", type=int, default=MAX_PRED_PER_SEQ,
                        help="Maximum number of tokens to mask in each sequence")
    parser.add_argument("--masking_method", type=str, default="double_num_adj", choices=("mlm_prob", "double_num_adj"),
                        help="Method of determining num masked tokens in sentence")
    args = parser.parse_args()

    if args.num_workers > 1 and args.reduce_memory:
        raise ValueError("Cannot use multiple workers while reducing memory")

    tokenizer = BertTokenizer.from_pretrained(BERT_PRETRAINED_MODEL,
                                              do_lower_case=BERT_PRETRAINED_MODEL.endswith("uncased"))

    generate_data_for_treatment(tokenizer, args)
Example #15
    def __init__(
        self,
        token_indexers: Dict[str, TokenIndexer] = None,
        domain_identifier: str = None,
        bert_model_name: str = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        if token_indexers is not None:
            self._token_indexers = token_indexers
        elif bert_model_name is not None:
            from allennlp.data.token_indexers import PretrainedTransformerIndexer

            self._token_indexers = {
                "tokens": PretrainedTransformerIndexer(bert_model_name)
            }
        else:
            self._token_indexers = {"tokens": SingleIdTokenIndexer()}
        self._domain_identifier = domain_identifier

        if bert_model_name is not None:
            self.bert_tokenizer = BertTokenizer.from_pretrained(
                bert_model_name)
            self.lowercase_input = "uncased" in bert_model_name
        else:
            self.bert_tokenizer = None
            self.lowercase_input = False
Example #16
def generate_data_for_domain(args, domain):
    tokenizer = BertTokenizer.from_pretrained(
        BERT_PRETRAINED_MODEL,
        do_lower_case=bool(BERT_PRETRAINED_MODEL.endswith("uncased")))
    vocab_list = list(tokenizer.vocab.keys())

    with open(SENTIMENT_TOPICS_DOMAIN_TREAT_CONTROL_MAP_FILE, "r") as jsonfile:
        domain_topic_treat_dict = json.load(jsonfile)

    treatment_topic = domain_topic_treat_dict[domain]["treated_topic"]
    control_topic = domain_topic_treat_dict[domain]["control_topics"][-1]

    treatment_column = f"{treatment_topic}_bin"
    control_column = f"{control_topic}_bin"

    with DocumentDatabase(reduce_memory=args.reduce_memory) as docs:
        print(f"\nGenerating data for domain: {domain}")
        output_dir = Path(SENTIMENT_TOPICS_PRETRAIN_DATA_DIR) / domain
        output_dir.mkdir(exist_ok=True, parents=True)
        unique_ids, reviews, treatment_labels, control_labels = [], [], [], []
        for dataset in ("train", "dev"):
            DATASET_FILE = f"{SENTIMENT_TOPICS_DATASETS_DIR}/topics_{dataset}.csv"
            df = pd.read_csv(DATASET_FILE,
                             header=0,
                             encoding='utf-8',
                             usecols=["id", "review", treatment_column, control_column])
            df = df.set_index(keys="id", drop=False).sort_index()
            df = df[df[treatment_column].notnull()]
            unique_ids += df["id"].astype(int).tolist()
            reviews += df["review"].apply(tokenizer.tokenize).tolist()
            treatment_labels += df[treatment_column].astype(int).tolist()
            control_labels += df[control_column].astype(int).tolist()

        for unique_id, doc, treatment_label, control_label in tqdm(
                zip(unique_ids, reviews, treatment_labels, control_labels)):
            if doc:  # skip empty documents
                docs.add_document(unique_id, doc, treatment_label, control_label)
        if len(docs) <= 1:
            exit(
                "ERROR: No document breaks were found in the input file! These are necessary to allow the script to "
                "ensure that random NextSentences are not sampled from the same document. Please add blank lines to "
                "indicate breaks between documents in your input file. If your dataset does not contain multiple "
                "documents, blank lines can be inserted at any natural boundary, such as the ends of chapters, "
                "sections or paragraphs.")

        if args.num_workers > 1:
            writer_workers = Pool(
                min(args.num_workers, args.epochs_to_generate))
            arguments = [(docs, vocab_list, args, idx, output_dir)
                         for idx in range(args.epochs_to_generate)]
            writer_workers.starmap(create_training_file, arguments)
        else:
            for epoch in trange(args.epochs_to_generate, desc="Epoch"):
                create_training_file(docs, vocab_list, args, epoch, output_dir)
Example #17
    def __init__(self,
                 num_classes,
                 model_name='bert-base-uncased'):
        self.num_classes = num_classes
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = TFBertForSequenceClassification.from_pretrained(model_name,
                                                                     num_labels=self.num_classes)
Example #18
    def __init__(self):
        # Load the pretrained tokenizer and model published by Google
        self.tokenizer = BertTokenizer.from_pretrained(
            "bert-base-multilingual-cased", do_lower_case=False)
        self.model = BertForSequenceClassification.from_pretrained(
            "bert-base-multilingual-cased", num_labels=2)
        # Load the model fine-tuned on Google Colab
        self.model.load_state_dict(
            torch.load("bert_evaluator.bin", map_location='cpu'))
Example #19
    def init_tokens(self):
        # self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.tokenizer = BertTokenizer.from_pretrained(
            pretrained_model_name_or_path=self.tokenizer_path)
        tokens = ['[CLS]', '[MASK]', '[SEP]']
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokens)
        self.CLS = indexed_tokens[0]
        self.MASK = indexed_tokens[1]
        self.SEP = indexed_tokens[2]
Example #20
    def __init__(self, n_kws=15):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.n_kws = n_kws

        self.bert_w2i = {w: i for i, w in enumerate(self.tokenizer.vocab)}
        self.bert_vocab = self.tokenizer.vocab
        # self.dataset = h5py.File("/home/phillab/data/headliner_6M.hdf5")
        # self.dset = self.dataset['name']
        self.keyworder = None
        self.i2w = None
Example #21
    def __init__(self, device, model_file=None):
        self.model = BertForNextSentencePrediction.from_pretrained(
            'bert-base-uncased')
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.tokenizer.max_len = 10000
        self.model.to(device)
        self.device = device

        if model_file is not None:
            self.reload_model(model_file)
Example #22
    def __init__(self, bert_model: str) -> None:

        self.bert_tokenizer: BertTokenizer = BertTokenizer.from_pretrained(
            bert_model, do_lower_case='-cased' not in bert_model)

        self.subword_alphabet: Optional[Alphabet] = None
        self.label_alphabet: Optional[Alphabet] = None

        self.train: Optional[List[SentInst]] = None
        self.dev: Optional[List[SentInst]] = None
        self.test: Optional[List[SentInst]] = None
Example #23
File: pipe.py  Project: yhcc/BertForRD
    def process(self, data_bundle):
        """
        输入为
            word                definition
            测试                  这是 一个 测试

        :param data_bundle:
        :return:
        """
        tokenizer = BertTokenizer.from_pretrained(self.bert_name)
        tokenizer.do_basic_tokenize = True
        return _prepare_data_bundle(tokenizer, data_bundle, self.max_word_len)
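
For context, a minimal sketch of building such a data bundle before calling `process` (this assumes fastNLP's `DataSet` and `DataBundle`; the pipe instance and its `bert_name`/`max_word_len` configuration come from the surrounding project):

from fastNLP import DataSet
from fastNLP.io import DataBundle

# one (word, definition) pair, mirroring the docstring above
ds = DataSet({"word": ["测试"], "definition": ["这是 一个 测试"]})
bundle = DataBundle(datasets={"train": ds})
# pipe = ...  # an instance of the pipe class shown above
# processed = pipe.process(bundle)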
Example #24
    def __init__(self,
                 file_path,
                 tag2idx,
                 tokenizer_path='',
                 do_lower_case=True):
        self.tag2idx = tag2idx
        self.tokenizer = BertTokenizer.from_pretrained(
            tokenizer_path, do_lower_case=do_lower_case)
        self._file_path = file_path

        with open(file_path, 'r', encoding='utf-8') as fp:
            self._lines_count = len(fp.readlines())
Example #25
    def load_model(self):
        self.tokenizer = BertTokenizer.from_pretrained(self.args.pretrained_path,
                                                       do_lower_case=self.args.do_lower_case)
        self.config = BertConfig.from_pretrained(self.args.pretrained_path, num_labels=self.args.num_labels)
        if self.args.resume_model:
            self.model = BertForMultiLable.from_pretrained(self.args.resume_model_path, config=self.config)
            with open(self.threshold_path, 'r') as f:
                self.threshold = float(f.read())  # read the best model's threshold
        else:
            self.model = BertForMultiLable.from_pretrained(self.args.pretrained_path, config=self.config)
        if self.args.cuda:
            self.model.cuda()
            if self.args.n_gpus > 1:
                self.model = DataParallel(self.model)
Example #26
    def __init__(self, method):
        if "gpt" in method:
            from transformers.tokenization_gpt2 import GPT2Tokenizer
            self.tokenizer = GPT2Tokenizer.from_pretrained(method)
        elif "bert" in method:
            from transformers.tokenization_bert import BertTokenizer
            self.tokenizer = BertTokenizer.from_pretrained(method)
        else:
            raise ValueError(
                '`method` is an invalid value {}, it should contain "gpt" or "bert"'
                .format(method))

        self._tokenizer_class_name = self.tokenizer.__class__.__name__
Example #27
    def __init__(self, task, word2index, config):
        # self.transform = transform  # Torch operations on the input image
        # self.target_transform = target_transform
        self.task = task
        self.config = config
        # self.split = split
        self.word2index = word2index
        self.max_len = int(config["data"]["window"])
        # self.image_roots = self.task.train_roots if self.split == 'train' else self.task.test_roots
        # self.labels = self.task.train_labels if self.split == 'train' else self.task.test_labels
        self.image_roots = self.task.train_roots + self.task.test_roots
        self.labels = self.task.train_labels + self.task.test_labels
        self.tokenizer = BertTokenizer.from_pretrained(config['data']['pretrain_path'])
Example #28
    def __init__(self, data_path: str, treatment: str, subset: str, text_column: str, label_column: str,
                 bert_pretrained_model: str = BERT_PRETRAINED_MODEL, max_seq_length: int = MAX_SENTIMENT_SEQ_LENGTH):
        super().__init__()
        if subset not in ("train", "dev", "test", "train_debug", "dev_debug", "test_debug"):
            raise ValueError("subset argument must be one of: train, dev, test, train_debug, dev_debug, test_debug")
        self.dataset_file = f"{data_path}/{treatment}_{subset}.csv"
        self.subset = subset
        self.text_column = text_column
        self.label_column = label_column
        self.max_seq_length = max_seq_length
        self.tokenizer = BertTokenizer.from_pretrained(bert_pretrained_model,
                                                       do_lower_case=bert_pretrained_model.endswith("uncased"))
        self.dataset = self.preprocessing_pipeline()
Example #29
def load_embedding_tokenizer(pretrained_model_name_or_path, **kwargs):
    # if the pretrained model points to a file on deepset s3, we need to adjust the transformers dictionaries
    if pretrained_model_name_or_path in PRETRAINED_INIT_CONFIGURATION:
        BertTokenizer.pretrained_vocab_files_map["vocab_file"].update(
            {pretrained_model_name_or_path:
                 EMBEDDING_VOCAB_FILES_MAP["vocab_file"].get(pretrained_model_name_or_path, None)})
        BertTokenizer.max_model_input_sizes.update(
            {pretrained_model_name_or_path: MAX_MODEL_INPU_SIZES.get(pretrained_model_name_or_path, None)})
        BertTokenizer.pretrained_init_configuration.update(
            {pretrained_model_name_or_path: PRETRAINED_INIT_CONFIGURATION.get(pretrained_model_name_or_path, None)})
    return BertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
Example #30
def main(args):
    # For Chinese (Ro)BERT models, the best results come from RoBERTa-wwm-ext (https://github.com/ymcui/Chinese-BERT-wwm)
    # To fine-tune these models, we have to use the same tokenizer: LTP (https://github.com/HIT-SCIR/ltp)
    with open(args.file_name, "r", encoding="utf-8") as f:
        data = f.readlines()

    ltp_tokenizer = LTP(args.ltp)  # faster on a GPU device
    bert_tokenizer = BertTokenizer.from_pretrained(args.bert)

    ref_ids = prepare_ref(data, ltp_tokenizer, bert_tokenizer)

    with open(args.save_path, "w", encoding="utf-8") as f:
        data = [json.dumps(ref) + "\n" for ref in ref_ids]
        f.writelines(data)
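
A hypothetical invocation of the script above (the script name is made up; the flag names mirror the `args.*` attributes used in `main`):

# python make_chinese_ref.py --file_name corpus.txt --ltp /path/to/ltp --bert bert-base-chinese --save_path ref.json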