Пример #1
0
    logger.info('Building dictionary')
    # Obtain the vocabulary dictionary: build and save it when no file exists
    # at args.dictionary, otherwise load the previously saved one.
    # NOTE(review): the cache is looked up at args.dictionary, but a freshly
    # built dictionary is saved to config.dict_path -- confirm both refer to
    # the same file, otherwise the saved copy is never picked up by this check.
    if not os.path.exists(args.dictionary):
        logger.info("Loading data...")
        # Raw text column of the dictionary-building dataset.
        data = build_dict_dataset()['raw_words']
        if args.word:
            # Word granularity: hand the raw entries over as a plain list.
            data = data.values.tolist()
        else:
            # Character granularity: strip all whitespace from each entry,
            # then separate every character with a single space.
            data = data.apply(lambda text: " ".join("".join(text.split())))
        dictionary = Dictionary()
        dictionary.build_dictionary(data)
        del data  # drop the corpus reference once the dictionary is built
        joblib.dump(dictionary, config.dict_path)
    else:
        dictionary = joblib.load(args.dictionary)

    logger.info('Loading dataset')
    # Training set, batched and shuffled by the data loader.
    train_dataset = NewsDataset(
        config.train_path,
        dictionary=dictionary,
        word=args.word,
    )
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=config.batch_size,
        collate_fn=collate_fn,
        shuffle=True,
    )
    # Validation set, built with the same dictionary and granularity flag.
    dev_dataset = NewsDataset(
        config.valid_path,
        dictionary=dictionary,
        word=args.word,
    )