import math

import numpy as np
import paddle
from paddle.static import InputSpec
from paddlenlp.data import Pad, Stack, Tuple
from paddlenlp.metrics import ChunkEvaluator
# NOTE: the import path of LinearChainCrfLoss may differ across PaddleNLP versions.
from paddlenlp.layers.crf import LinearChainCrfLoss

# Project-local modules from the lexical analysis example; the file layout
# below is an assumption about where LacDataset, BiGruCrf and
# parse_lac_result are defined.
from data import LacDataset, parse_lac_result
from model import BiGruCrf


def infer(args):
    paddle.set_device(args.device)

    # create dataset.
    infer_dataset = LacDataset(args.data_dir, mode='infer')

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=0, dtype='int64'),  # word_ids
        Stack(dtype='int64'),  # length
    ): fn(samples)

    # Create sampler for dataloader
    infer_sampler = paddle.io.BatchSampler(
        dataset=infer_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        drop_last=False)
    infer_loader = paddle.io.DataLoader(
        dataset=infer_dataset,
        batch_sampler=infer_sampler,
        return_list=True,
        collate_fn=batchify_fn)

    # Define the model network
    network = BiGruCrf(args.emb_dim, args.hidden_size,
                       infer_dataset.vocab_size, infer_dataset.num_labels)
    inputs = InputSpec(shape=(-1, ), dtype="int64", name='inputs')
    lengths = InputSpec(shape=(-1, ), dtype="int64", name='lengths')
    model = paddle.Model(network, inputs=[inputs, lengths])
    model.prepare()

    # Load the checkpoint and run prediction
    model.load(args.init_checkpoint)
    emissions, lengths, crf_decodes = model.predict(
        test_data=infer_loader, batch_size=args.batch_size)

    # Post-process the lexical analysis results
    lengths = np.array([l for lens in lengths for l in lens]).reshape([-1])
    preds = np.array(
        [pred for batch_pred in crf_decodes for pred in batch_pred])

    results = parse_lac_result(infer_dataset.word_ids, preds, lengths,
                               infer_dataset.word_vocab,
                               infer_dataset.label_vocab)

    sent_tags = []
    for sent, tags in results:
        sent_tag = ['(%s, %s)' % (ch, tag) for ch, tag in zip(sent, tags)]
        sent_tags.append(''.join(sent_tag))

    file_path = "results.txt"
    with open(file_path, "w", encoding="utf8") as fout:
        fout.write("\n".join(sent_tags))

    # Print some examples
    print("The results have been saved in the file: %s, "
          "some examples are shown below:" % file_path)
    print("\n".join(sent_tags[:10]))
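
# A minimal usage sketch for `infer` (not part of the original script):
# the function only needs an argparse-style namespace, so a quick smoke test
# can build one by hand. Every value below is an illustrative assumption;
# emb_dim and hidden_size must match the checkpoint being loaded.
from argparse import Namespace

demo_infer_args = Namespace(
    device='gpu',  # or 'cpu'
    data_dir='./data',  # directory holding the LAC-format infer file
    batch_size=32,
    emb_dim=128,
    hidden_size=128,
    init_checkpoint='./model_save/final')
# infer(demo_infer_args)  # uncomment once data and checkpoint exist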
def evaluate(args):
    place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace()
    paddle.set_device("gpu" if args.use_gpu else "cpu")

    # create dataset.
    test_dataset = LacDataset(args.data_dir, mode='test')

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=0),  # word_ids
        Stack(),  # length
        Pad(axis=0, pad_val=0),  # label_ids
    ): fn(samples)

    # Create sampler for dataloader
    test_sampler = paddle.io.BatchSampler(
        dataset=test_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        drop_last=False)
    test_loader = paddle.io.DataLoader(
        dataset=test_dataset,
        batch_sampler=test_sampler,
        places=place,
        return_list=True,
        collate_fn=batchify_fn)

    # Define the model network and metric evaluator
    network = BiGruCrf(args.emb_dim, args.hidden_size,
                       test_dataset.vocab_size, test_dataset.num_labels)
    # Word ids and lengths are integer indices, so the input specs use int64.
    inputs = InputSpec(shape=(-1, ), dtype="int64", name='inputs')
    lengths = InputSpec(shape=(-1, ), dtype="int64", name='lengths')
    model = paddle.Model(network, inputs=[inputs, lengths])
    chunk_evaluator = ChunkEvaluator(
        label_list=test_dataset.label_vocab.keys(), suffix=True)
    model.prepare(None, None, chunk_evaluator)

    # Load the model and start evaluating
    model.load(args.init_checkpoint)
    model.evaluate(
        eval_data=test_loader,
        batch_size=args.batch_size,
        log_freq=100,
        verbose=2)
def evaluate(args):
    place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace()
    paddle.set_device("gpu" if args.use_gpu else "cpu")

    # create dataset.
    test_dataset = LacDataset(args.data_dir, mode='test')

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=0),  # word_ids
        Stack(),  # length
        Pad(axis=0, pad_val=0),  # label_ids
    ): fn(samples)

    # Create sampler for dataloader
    test_sampler = paddle.io.BatchSampler(
        dataset=test_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        drop_last=False)  # keep the last incomplete batch so no test sample is skipped
    test_loader = paddle.io.DataLoader(
        dataset=test_dataset,
        batch_sampler=test_sampler,
        places=place,
        return_list=True,
        collate_fn=batchify_fn)

    # Define the model network and metric evaluator
    network = BiGruCrf(args.emb_dim, args.hidden_size,
                       test_dataset.vocab_size, test_dataset.num_labels)
    model = paddle.Model(network)
    chunk_evaluator = ChunkEvaluator(
        int(math.ceil((test_dataset.num_labels + 1) / 2.0)),
        "IOB")  # + 1 for SOS and EOS
    model.prepare(None, None, chunk_evaluator)

    # Load the model and start evaluating
    model.load(args.init_checkpoint)
    model.evaluate(
        eval_data=test_loader,
        batch_size=args.batch_size,
        log_freq=100,
        verbose=2)
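
# Illustration of the Tuple/Pad/Stack collate function used above, on toy
# samples. Each sample is (word_ids, length, label_ids): Pad brings the
# variable-length id lists to a common length with pad_val=0, and Stack
# batches the scalar lengths into one array. The demo_* names are only for
# this sketch.
demo_batchify = Tuple(
    Pad(axis=0, pad_val=0),  # word_ids
    Stack(),  # length
    Pad(axis=0, pad_val=0),  # label_ids
)
demo_samples = [
    ([2, 7, 5], 3, [1, 0, 1]),
    ([4, 9], 2, [0, 1]),
]
demo_word_ids, demo_length, demo_label_ids = demo_batchify(demo_samples)
print(demo_word_ids)  # [[2 7 5] [4 9 0]]
print(demo_length)  # [3 2]
print(demo_label_ids)  # [[1 0 1] [0 1 0]]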
def train(args):
    paddle.set_device("gpu" if args.n_gpu else "cpu")

    # create dataset.
    train_dataset = LacDataset(args.data_dir, mode='train')
    test_dataset = LacDataset(args.data_dir, mode='test')

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=0),  # word_ids
        Stack(),  # length
        Pad(axis=0, pad_val=0),  # label_ids
    ): fn(samples)

    # Create samplers for the dataloaders
    train_sampler = paddle.io.DistributedBatchSampler(
        dataset=train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True)
    train_loader = paddle.io.DataLoader(
        dataset=train_dataset,
        batch_sampler=train_sampler,
        return_list=True,
        collate_fn=batchify_fn)
    test_sampler = paddle.io.BatchSampler(
        dataset=test_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        drop_last=False)
    test_loader = paddle.io.DataLoader(
        dataset=test_dataset,
        batch_sampler=test_sampler,
        return_list=True,
        collate_fn=batchify_fn)

    # Define the model network and its loss
    network = BiGruCrf(args.emb_dim, args.hidden_size,
                       train_dataset.vocab_size, train_dataset.num_labels)
    model = paddle.Model(network)

    # Prepare optimizer, loss and metric evaluator
    optimizer = paddle.optimizer.Adam(
        learning_rate=args.base_lr, parameters=model.parameters())
    crf_loss = LinearChainCrfLoss(network.crf)
    chunk_evaluator = ChunkEvaluator(
        label_list=train_dataset.label_vocab.keys(), suffix=True)
    model.prepare(optimizer, crf_loss, chunk_evaluator)
    if args.init_checkpoint:
        model.load(args.init_checkpoint)

    # Start training
    callbacks = paddle.callbacks.ProgBarLogger(
        log_freq=10, verbose=3) if args.verbose else None
    model.fit(train_data=train_loader,
              eval_data=test_loader,
              batch_size=args.batch_size,
              epochs=args.epochs,
              eval_freq=1,
              log_freq=10,
              save_dir=args.model_save_dir,
              save_freq=1,
              shuffle=True,
              callbacks=callbacks)
def train(args):
    if args.use_gpu:
        place = paddle.CUDAPlace(paddle.distributed.ParallelEnv().dev_id)
        paddle.set_device("gpu")
    else:
        place = paddle.CPUPlace()
        paddle.set_device("cpu")

    # create dataset.
    train_dataset = LacDataset(args.data_dir, mode='train')
    test_dataset = LacDataset(args.data_dir, mode='test')

    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=0),  # word_ids
        Stack(),  # length
        Pad(axis=0, pad_val=0),  # label_ids
    ): fn(samples)

    # Create samplers for the dataloaders
    train_sampler = paddle.io.DistributedBatchSampler(
        dataset=train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        drop_last=True)
    train_loader = paddle.io.DataLoader(
        dataset=train_dataset,
        batch_sampler=train_sampler,
        places=place,
        return_list=True,
        collate_fn=batchify_fn)
    test_sampler = paddle.io.BatchSampler(
        dataset=test_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        drop_last=False)  # keep the last incomplete batch so evaluation covers all samples
    test_loader = paddle.io.DataLoader(
        dataset=test_dataset,
        batch_sampler=test_sampler,
        places=place,
        return_list=True,
        collate_fn=batchify_fn)

    # Define the model network and its loss
    network = BiGruCrf(args.emb_dim, args.hidden_size,
                       train_dataset.vocab_size, train_dataset.num_labels)
    model = paddle.Model(network)

    # Prepare optimizer, loss and metric evaluator
    optimizer = paddle.optimizer.Adam(
        learning_rate=args.base_lr, parameters=model.parameters())
    crf_loss = LinearChainCrfLoss(network.crf.transitions)
    chunk_evaluator = ChunkEvaluator(
        int(math.ceil((train_dataset.num_labels + 1) / 2.0)),
        "IOB")  # + 1 for START and STOP
    model.prepare(optimizer, crf_loss, chunk_evaluator)
    if args.init_checkpoint:
        model.load(args.init_checkpoint)

    # Start training
    callback = paddle.callbacks.ProgBarLogger(log_freq=10, verbose=3)
    model.fit(train_data=train_loader,
              eval_data=test_loader,
              batch_size=args.batch_size,
              epochs=args.epochs,
              eval_freq=1,
              log_freq=10,
              save_dir=args.model_save_dir,
              save_freq=1,
              drop_last=True,
              shuffle=True,
              callbacks=callback)
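
# A sketch of a command-line entry point tying the functions above together
# (not part of the original script). The flag names mirror the `args`
# attributes the functions read; the default values are assumptions for
# illustration, not the example's shipped defaults.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--mode', choices=['train', 'eval'], default='train')
    parser.add_argument('--data_dir', type=str, default='./data')
    parser.add_argument('--init_checkpoint', type=str, default=None)
    parser.add_argument('--model_save_dir', type=str, default='./model_save')
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--base_lr', type=float, default=0.001)
    parser.add_argument('--emb_dim', type=int, default=128)
    parser.add_argument('--hidden_size', type=int, default=128)
    parser.add_argument('--use_gpu', action='store_true')
    args = parser.parse_args()

    if args.mode == 'train':
        train(args)
    else:
        # Evaluation requires --init_checkpoint to point at a trained model.
        evaluate(args)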