# Assumed imports for this excerpt (project modules are shown in Example #4).
import logging
import os
import time

import numpy as np

# Log each run to a file named after its start time.
current_name = 'log/%s.txt' % time.strftime("%Y-%m-%d %H:%M:%S",
                                            time.localtime())
logging.basicConfig(filename=current_name,
                    filemode='w',
                    format='%(asctime)s - %(message)s',
                    datefmt='%d-%b-%y %H:%M:%S',
                    level=logging.INFO)

# Parse the raw training data and hold out 10k examples for validation.
file_name = 'data/raw_data/train.json'
train_part, valid_part = data_manager.parse_v3(file_name=file_name,
                                               valid_num=10000)
print(len(train_part), len(valid_part))
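# seed_torch (a project utility) fixes the random seeds for reproducibility.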
seed_torch(2019)

# segment=False skips word segmentation, so the vocabulary is built over
# characters, capped at 10k entries.
t = Tokenizer(max_feature=10000, segment=False, lowercase=True)

train_dataset = entity_linking_v3(train_part, t)
valid_dataset = entity_linking_v3(valid_part, t)
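# entity_linking_v3 presumably wraps the parsed splits as torch Datasets for
# a DataLoader to consume.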

batch_size = 1

# Prepare the embedding data
embedding_file = 'embedding/miniembedding_baike_link.npy'
#embedding_file = 'embedding/miniembedding_engineer_qq_att.npy'

if os.path.exists(embedding_file):
    embedding_matrix = np.load(embedding_file)
else:
    #embedding = '/home/zhukaihua/Desktop/nlp/embedding/baike'
    embedding = '/home/zhu/Desktop/word_embedding/sgns.baidubaike.bigram-char'
    # Assumed completion: build the matrix from the raw vectors and cache it.
    # load_glove is imported from utils in Example #4; its exact signature
    # here is a guess.
    embedding_matrix = load_glove(embedding, t.num_words, t)
    np.save(embedding_file, embedding_matrix)
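# Sketch (not in the original): the cached matrix would typically be handed
# to the model through an embedding layer; freezing the weights is an
# assumption.
import torch
import torch.nn as nn

embedding_layer = nn.Embedding.from_pretrained(
    torch.from_numpy(embedding_matrix).float(), freeze=True)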
Example #2
# read_data, seed_torch, Tokenizer and SPO come from the project (see
# Example #4); pandas is needed for pd.concat below.
import pandas as pd

train_data = read_data('data/sentiment_XS_30k.txt')
valid_data = read_data('data/sentiment_XS_test.txt')
print(train_data.sample(100))

current_name = 'log/%s.txt' % time.strftime("%Y-%m-%d %H:%M:%S",
                                            time.localtime())
logging.basicConfig(filename=current_name,
                    filemode='w',
                    format='%(asctime)s - %(message)s',
                    datefmt='%d-%b-%y %H:%M:%S',
                    level=logging.INFO)

seed_torch(2019)

t = Tokenizer(max_feature=500000, segment=False)
t.fit(list(train_data['text'].values) + list(valid_data['text'].values))
# Persist the fitted tokenizer so the same vocabulary can be reused later.
import pickle
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(t, f)
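# Hypothetical helper (not in the original): restore the fitted tokenizer at
# inference time so new text is encoded with the same vocabulary ids.
def load_tokenizer(path='tokenizer.pkl'):
    with open(path, 'rb') as f:
        return pickle.load(f)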

print('%d words in total' % t.num_words)
# Train on train + validation combined (pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0).
train_data = pd.concat([train_data, valid_data]).reset_index(drop=True)
train_dataset = SPO(train_data['text'].values, t, label=train_data['label'])

batch_size = 40

# Prepare the embedding data
#embedding_file = 'embedding/miniembedding_engineer_baike_word.npy'
embedding_file = 'embedding/miniembedding_engineer_qq_att.npy'
Example #3

current_name = 'log/%s.txt' % time.strftime("%Y-%m-%d %H:%M:%S",
                                            time.localtime())
logging.basicConfig(filename=current_name,
                    filemode='w',
                    format='%(asctime)s - %(message)s',
                    datefmt='%d-%b-%y %H:%M:%S',
                    level=logging.INFO)

seed_torch(2019)
# read_data (from utils) returns token, POS, label and NER sequences for the
# train and dev splits.
(train_X, train_pos, train_label, train_ner,
 dev_X, dev_pos, dev_label, dev_ner) = read_data()
assert len(train_X) == len(train_label)
# train_X = [''.join(word) for word in train_X]
# train_pos = [' '.join(word) for word in train_pos]

# dev_X = [' '.join(word) for word in dev_X]
# dev_pos = [' '.join(word) for word in dev_pos]

t = Tokenizer(max_feature=500000, segment=False)
t.fit(list(train_X) + list(dev_X))

# A second, much smaller vocabulary just for the POS tag sequences.
pos_t = Tokenizer(max_feature=500, segment=False)
pos_t.fit(list(train_pos) + list(dev_pos))
print('%d words in total' % t.num_words)
print('%d POS tags in total' % pos_t.num_words)

train_dataset = SPO(train_X, train_pos, train_label, t, pos_t, ner=train_ner)
valid_dataset = SPO(dev_X, dev_pos, dev_label, t, pos_t, ner=dev_ner)
batch_size = 512
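# Sketch (not in the original): the datasets would typically be wrapped in
# DataLoaders for training. The shuffle flags are assumptions, and a custom
# collate_fn may be needed to pad variable-length sequences.
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)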

# Prepare the embedding data
embedding_file = 'embedding/miniembedding_engineer_baike_zi_new.npy'
#embedding_file = 'embedding/miniembedding_engineer_qq.npy'
Example #4

# data_manager and SPO_LINK (used below) come from project modules whose
# import lines were not part of this excerpt.
import logging
import os
import time

import numpy as np
import torch.nn as nn
from pytorch_pretrained_bert import BertTokenizer, BertAdam
from spo_model import SPOModel, EntityLink
from tokenize_pkg.tokenize import Tokenizer
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
from tqdm import tqdm
from utils import seed_torch, read_data, load_glove, calc_f1, get_threshold

# Parse mention data and hold out 10k examples for validation.
file_name = 'data/raw_data/train.json'
train_X, train_pos, train_type, dev_X, dev_pos, dev_type = data_manager.parse_mention(
    file_name=file_name, valid_num=10000)
seed_torch(2019)

t = Tokenizer(max_feature=10000, segment=False, lowercase=True)
t.fit(train_X + dev_X)
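# Fitting on train + dev together means dev characters are never out-of-vocabulary.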

print('%d characters in total' % t.num_words)

train_dataset = SPO_LINK(train_X, t, pos=train_pos, type=train_type)
valid_dataset = SPO_LINK(dev_X, t, pos=dev_pos, type=dev_type)

batch_size = 1

# Prepare the embedding data
embedding_file = 'embedding/miniembedding_baike_link.npy'
#embedding_file = 'embedding/miniembedding_engineer_qq_att.npy'

if os.path.exists(embedding_file):
    embedding_matrix = np.load(embedding_file)