current_name = 'log/%s.txt' % time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
logging.basicConfig(filename=current_name, filemode='w',
                    format='%(asctime)s - %(message)s',
                    datefmt='%d-%b-%y %H:%M:%S', level=logging.INFO)

file_name = 'data/raw_data/train.json'
train_part, valid_part = data_manager.parse_v3(file_name=file_name, valid_num=10000)
print(len(train_part), len(valid_part))

seed_torch(2019)
t = Tokenizer(max_feature=10000, segment=False, lowercase=True)
train_dataset = entity_linking_v3(train_part, t)
valid_dataset = entity_linking_v3(valid_part, t)
batch_size = 1

# prepare the embedding data
embedding_file = 'embedding/miniembedding_baike_link.npy'
#embedding_file = 'embedding/miniembedding_engineer_qq_att.npy'
if os.path.exists(embedding_file):
    embedding_matrix = np.load(embedding_file)
else:
    #embedding = '/home/zhukaihua/Desktop/nlp/embedding/baike'
    embedding = '/home/zhu/Desktop/word_embedding/sgns.baidubaike.bigram-char'
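# The else branch above is truncated in this snippet. Below is a minimal, hedged
# sketch of how a "mini" embedding matrix restricted to the tokenizer vocabulary
# could be built from the raw word-vector text file and cached for later runs.
# The helper name build_mini_embedding and the assumption that the tokenizer
# exposes a word->index dict are illustrative, not taken from this repo.
import numpy as np

def build_mini_embedding(vector_path, word_index, dim=300):
    # Keep only vectors for words that actually appear in the tokenizer vocabulary;
    # row 0 is left as zeros for padding.
    matrix = np.zeros((len(word_index) + 1, dim), dtype='float32')
    with open(vector_path, encoding='utf-8') as fin:
        for line in fin:
            parts = line.rstrip().split(' ')
            word, values = parts[0], parts[1:]
            if word in word_index and len(values) == dim:
                matrix[word_index[word]] = np.asarray(values, dtype='float32')
    return matrix

# e.g. embedding_matrix = build_mini_embedding(embedding, t.word_index)
#      np.save(embedding_file, embedding_matrix)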
train_data = read_data('data/sentiment_XS_30k.txt')
valid_data = read_data('data/sentiment_XS_test.txt')
print(train_data.sample(100))

current_name = 'log/%s.txt' % time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
logging.basicConfig(filename=current_name, filemode='w',
                    format='%(asctime)s - %(message)s',
                    datefmt='%d-%b-%y %H:%M:%S', level=logging.INFO)

seed_torch(2019)
t = Tokenizer(max_feature=500000, segment=False)
t.fit(list(train_data['text'].values) + list(valid_data['text'].values))

# save the fitted tokenizer for reuse at inference time
import pickle
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(t, f)
print('there are %d words in total' % t.num_words)

train_data = train_data.append(valid_data).reset_index(drop=True)
train_dataset = SPO(train_data['text'].values, t, label=train_data['label'])
batch_size = 40

# prepare the embedding data
#embedding_file = 'embedding/miniembedding_engineer_baike_word.npy'
embedding_file = 'embedding/miniembedding_engineer_qq_att.npy'
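# Hedged usage sketch: reloading the pickled tokenizer later (e.g. in a prediction
# script) so that new text is indexed with exactly the same vocabulary. The
# transform call on new text is an assumption about this Tokenizer's API.
import pickle

with open('tokenizer.pkl', 'rb') as f:
    t = pickle.load(f)
# e.g. ids = t.transform(new_texts)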
logging.basicConfig(filename=current_name, filemode='w',
                    format='%(asctime)s - %(message)s',
                    datefmt='%d-%b-%y %H:%M:%S', level=logging.INFO)

seed_torch(2019)
train_X, train_pos, train_label, train_ner, dev_X, dev_pos, dev_label, dev_ner = read_data()
assert len(train_X) == len(train_label)
# train_X = [''.join(word) for word in train_X]
# train_pos = [' '.join(word) for word in train_pos]
# dev_X = [' '.join(word) for word in dev_X]
# dev_pos = [' '.join(word) for word in dev_pos]

t = Tokenizer(max_feature=500000, segment=False)
t.fit(list(train_X) + list(dev_X))
pos_t = Tokenizer(max_feature=500, segment=False)
pos_t.fit(list(train_pos) + list(dev_pos))
print('there are %d words in total' % t.num_words)
print('there are %d POS tags in total' % pos_t.num_words)

train_dataset = SPO(train_X, train_pos, train_label, t, pos_t, ner=train_ner)
valid_dataset = SPO(dev_X, dev_pos, dev_label, t, pos_t, ner=dev_ner)
batch_size = 512

# prepare the embedding data
embedding_file = 'embedding/miniembedding_engineer_baike_zi_new.npy'
#embedding_file = 'embedding/miniembedding_engineer_qq.npy'
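# Hedged sketch of wiring these datasets into PyTorch DataLoaders with the batch
# size chosen above. Whether SPO needs a custom collate_fn for variable-length
# sequences is an assumption; the default collate is shown here.
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)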
from spo_model import SPOModel, EntityLink
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
from tokenize_pkg.tokenize import Tokenizer
from tqdm import tqdm
import torch.nn as nn
from utils import seed_torch, read_data, load_glove, calc_f1, get_threshold
from pytorch_pretrained_bert import BertTokenizer, BertAdam
import logging
import time
import os
import numpy as np

file_name = 'data/raw_data/train.json'
train_X, train_pos, train_type, dev_X, dev_pos, dev_type = data_manager.parse_mention(
    file_name=file_name, valid_num=10000)

seed_torch(2019)
t = Tokenizer(max_feature=10000, segment=False, lowercase=True)
t.fit(train_X + dev_X)
print('there are %d characters in total' % t.num_words)

train_dataset = SPO_LINK(train_X, t, pos=train_pos, type=train_type)
valid_dataset = SPO_LINK(dev_X, t, pos=dev_pos, type=dev_type)
batch_size = 1

# prepare the embedding data
embedding_file = 'embedding/miniembedding_baike_link.npy'
#embedding_file = 'embedding/miniembedding_engineer_qq_att.npy'
if os.path.exists(embedding_file):
    embedding_matrix = np.load(embedding_file)
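# Hedged sketch: once embedding_matrix is available, it can be wrapped in a
# pretrained nn.Embedding layer with the standard PyTorch call. How SPOModel or
# EntityLink actually consume the matrix is an assumption not shown in this snippet.
import torch
import torch.nn as nn

embedding_layer = nn.Embedding.from_pretrained(
    torch.from_numpy(embedding_matrix).float(),
    freeze=False)  # set freeze=True to keep the pretrained vectors fixed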