#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by lljzhiwang on 2018/11/23
import os, json, time, sys
import util_path as path
import util_common as util
import logging
from Logginger import init_logger

logger = init_logger('Preprocess', logging_path=path.logpath)

datapath = './data/data_raw'
goodpath = './data/good'
highqpath = './data/highq'
alllog_b09 = datapath + r'/log_b_09.txt'
alllog_b18 = datapath + r'/log_b_18.txt'
alllog_d09 = datapath + r'/log_d_09.json'
alllog_d18 = datapath + r'/log_d_18.json'
user_typeinter_18 = datapath + r'/userdb_intersec_18.txt'
user_typeinter_09 = datapath + r'/userdb_intersec_09.txt'
ulog_typeinter09_d = datapath + r'/ulog_typeinter09_d.json'
ulog_typeinter09_b = datapath + r'/ulog_typeinter09_b.json'
ulog_typeinter09_dbdiff = datapath + r'/ulog_typeinter09_dbdiff.json'
ulog_typeinter18_d = datapath + r'/ulog_typeinter18_d.json'
ulog_typeinter18_b = datapath + r'/ulog_typeinter18_b.json'
ulog_typeinter18_dbdiff = datapath + r'/ulog_typeinter18_dbdiff.json'
user_timeinter_b = datapath + r'/userb_intersec_0918.txt'
user_timeinter_d = datapath + r'/userd_intersec_0918.txt'
ulog_sample_18_highq_posi = highqpath + '/log18_highq_posi.txt'
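
# Minimal usage sketch (illustrative only, not part of the original pipeline):
# load the 2009 user/type intersection list and the matching per-user "d" log.
# Assumptions: util_common.load2list returns one stripped line per element (it
# is used that way elsewhere in this repo), and each *.json file holds a single
# JSON object. The helper name _demo_load_inputs is hypothetical.
def _demo_load_inputs():
    users_09 = util.load2list(user_typeinter_09)  # one user id per line
    with open(ulog_typeinter09_d) as f:
        ulog_09_d = json.load(f)  # dict: user id -> log entries
    logger.info('users: %d, users with d-logs: %d' % (len(users_09), len(ulog_09_d)))
    return users_09, ulog_09_d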
# -*- coding: utf-8 -*-
# @project:wholee_keyword
# @author:caojinlei
# @file: data_load.py
# @time: 2021/05/07
from transformers import BertTokenizer, BertModel, BertConfig
from Logginger import init_logger
import json
from utils import sim_matrix
import numpy as np

logger = init_logger('wholee_keyword', logging_path='output')


def load_bert_embedding(model_name):
    """
    Load a BERT model and its tokenizer.
    :param model_name: name of the pretrained BERT checkpoint
    :return: (tokenizer, bert_model)
    """
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model_config = BertConfig.from_pretrained(model_name)
    model_config.output_hidden_states = True
    model_config.output_attentions = True
    bert_model = BertModel.from_pretrained(model_name, config=model_config)
    logger.info('Model loaded successfully')
    return tokenizer, bert_model


def load_word_dict(label):
import argparse
from model import NER_NET
from data_loader import create_batch_iter
import torch
import common
import time
from util import time_since
from transformers import AdamW, get_linear_schedule_with_warmup
from Logginger import init_logger
from score import eval_result, eval_rel_by_condition
import sys

logger = init_logger("torch", logging_path=common.log_path)


def train_model(model, optimizer, scheduler, train_iter, test_iter, opt, len_dataset):
    print('====================== Start Training =========================')
    best_f1 = 0
    global_step = 0
    patience = opt.patience
    for e in range(opt.num_epoch):
        if patience <= 0:
            break
        total_loss = 0
        epoch_start = time.time()
        temp_start = epoch_start
        model.train()
        for step, batch in enumerate(train_iter):
            words, pieces, batch = batch
            batch = tuple(t.to(opt.device) for t in batch)
import lightgbm as lgb
import pandas as pd
import numpy as np
import util_common as uc
from sklearn.metrics import mean_squared_error
from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Perceptron, LogisticRegression, SGDClassifier
from sklearn import metrics
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import util_path as path
from Logginger import init_logger

logger = init_logger('RMODEL', logging_path=path.logpath)

params_gbdt = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss', 'l2', 'auc'},
    'num_leaves': 30,
    'max_depth': 5,
    'min_data_in_leaf': 450,
    'num_trees': 100,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
import os      # needed below: os.path.exists
import shutil
import numpy as np
import torch   # needed below: torch.device
from tqdm import tqdm
from net import Net
from utils import f1_score, get_tags, format_result, convert_tf_checkpoint_to_pytorch
import args
from model_util import save_model
from data_loader import create_batch_iter
from torch.optim.adamw import AdamW
from flyai.utils import remote_helper
from flyai.dataset import Dataset
from Logginger import init_logger

logger = init_logger("bert_ner", logging_path=args.log_path)

DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


class Instructor(object):
    """
    Feature: uses get_all_data from the flyai Dataset and does its own
    next-batch splitting.
    """

    def __init__(self, args):
        self.args = args
        self.tag_map = {label: i for i, label in enumerate(self.args.labels)}

    def train(self, train_source, train_target, dev_source, dev_target):
        if os.path.exists(self.args.output_dir) is True:
            shutil.rmtree(self.args.output_dir)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by lljzhiwang on 2018/12/6
import psutil
import time, sys, os, codecs, logging
import util_path as path
import util_common as util
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

BIANMA = 'utf8'
datapath = r'./data'
# bianma='gb18030'
from Logginger import init_logger

logger = init_logger('EmbD2V', logging_path=path.logpath)

testwl = [
    'zgjq200914010', '1011143537.nh', 'DYPJ200924005', '2010261634.nh',
    '1014310786.nh', '1012347178.nh', '1012258129.nh', 'ddyi201218094',
    'fxsy201508029'
]


class MyDocuments(object):
    '''
    Generate sentence sequences from pre-segmented files, for word2vec training.
    dirname: path to the segmented text; either a single file or a directory; files end in .txt
    start: index of the first element in a line that counts as a word; some files
           carry a user id as the first element of each line, so start=1 skips the id
    '''

    def __init__(self, dirname, start=0, subfix='.txt'):
        self.dirname = dirname
        self.start = start
import logging
import sys
# import IOTools
from tc_conversion.langconv import *
from tc_conversion.full_half_conversion import *
import time
import util_path as path
import util_common as util
import util_segment  # needed below: SentenceSegmentation / WordSegmentation

bianma = 'utf8'
basepath = r'./data'
# bianma='gb18030'

ss = util_segment.SentenceSegmentation()
ws = util_segment.WordSegmentation()

from Logginger import init_logger

logger = init_logger('EmbSeg', logging_path=path.logpath)


def segword4oneline(line, minwc=3, minwlen=0, sseg=False, convert=False):
    '''
    Segment one line of input into words; the result is a list for that line.
    :param line: one line / one sentence
    :type line: str
    :param minwc: minimum word count of the result; if segmentation yields fewer
                  than minwc words, output empty
    :type minwc: int
    :param minwlen: minimum length of a single segmented word, e.g. len('我') = 1,
                    len('我们') = 2; shorter words are excluded from the result
    :type minwlen: int
    :param convert: whether to apply full-width-to-half-width and
                    traditional-to-simplified conversion
    :type convert: bool
    :return:
    :rtype:
    '''
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by liliangjie on 2018/11/10
# Email llj : [email protected]
import codecs, pickle
import os, json, re
import logging
import numpy as np
import util_path as path
from Logginger import init_logger

logger = init_logger('UtilCom', logging_path=path.logpath)
bianma = 'utf8'

'''
Commonly used helper functions.
'''


def get_code_field(code, dic_codefield):
    '''
    Get the Chinese description for a subject sub-column code.
    :param code: sub-column code; may be several codes separated by semicolons
    :type code: str
    :param dic_codefield: dictionary mapping sub-column codes to descriptions
    :type dic_codefield: dict
    :return:
    :rtype:
    '''
    codes = code.strip(';').split(';')
    l0, l1 = [], []
    for c in codes:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Created by lljzhiwang on 2018/12/7
from gensim.models.doc2vec import Doc2Vec
from gensim.models import keyedvectors
import util_common as util
import numpy as np
import os, sys, pickle
import util_path as path
from Logginger import init_logger

logger = init_logger('MLPrepare', logging_path=path.logpath)


def get_samplevec_gensimmodel(vecpath1, vecpath2, samplefile, prefix, respath='./', stopcnt=100, progress_per=10000):
    # Build vector representations from the sample file: uid+fn ==> [uvec+fnvec]
    data, labels, realexamp = [], [], []
    logger.info('loading vecfile : %s' % vecpath1)
    # muser=Doc2Vec.load(usermodel)
    v_user = load_vec(vecpath1)
    logger.info('loading vecfile : %s' % vecpath2)
    v_file = load_vec(vecpath2)
    samples = util.load2list(samplefile)
    for cnt, exam in enumerate(samples):
        if cnt % progress_per == 0:
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import re, os
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import OneHotEncoder, LabelBinarizer, MultiLabelBinarizer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.neighbors import NearestNeighbors
import util_common as uc  # needed below: uc.load2list
import util_path as path
from Logginger import init_logger

logger = init_logger('FeaProcess', logging_path=path.logpath)


def data2csv():
    fnfeatpath = './data/highq_5w/fn18_5w_features.txt'
    fnfeas = uc.load2list(fnfeatpath)
    fns, cites, cites_w, authcodes, fundcodes, jigoucodes, productcodes, dates, pages, downs, citeds, ifs = [], [], [], [], [], [], [], [], [], [], [], []
    for i in fnfeas:
        if type(i) is str:
            iss = i.split()
            if len(iss) == 14:
                fns.append(iss[0])
                cites.append(iss[1])
                cites_w.append(iss[2])
                authcodes.append(iss[3])
                fundcodes.append(iss[4])
import os
import argparse
import warnings
import time
import torch
from flyai.dataset import Dataset
from flyai.utils import remote_helper
from Logginger import init_logger
from data_loader import create_batch_iter
from optimization import BertAdam
import args as arguments
from net import Net
from model_util import save_model

logger = init_logger("torch", logging_path=arguments.log_path)

torch.manual_seed(arguments.seed)
torch.cuda.manual_seed(arguments.seed)
torch.cuda.manual_seed_all(arguments.seed)

warnings.filterwarnings('ignore')
remote_helper.get_remote_date("https://www.flyai.com/m/chinese_base.zip")
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"


def main():
    """
    Hyper-parameters for the project.
    """
def convert_examples_to_features(examples, max_seq_length, tokenizer):
    logger = init_logger("bert_ner", logging_path=args.log_path)
    # convert labels to integer ids
    label_map = {label: i for i, label in enumerate(args.labels)}

    # load sub_vocab: all "##"-prefixed sub-word pieces from the BERT vocab
    sub_vocab = {}
    with open(args.VOCAB_FILE, 'r') as fr:
        for line in fr:
            _line = line.strip('\n')
            if "##" in _line and sub_vocab.get(_line) is None:
                sub_vocab[_line] = 1

    features = []
    for ex_index, example in enumerate(examples):
        labels = None  # reset per example so an unlabeled example never reuses stale labels
        tokens_a = tokenizer.tokenize(example.text_a)
        if example.label is not None:
            labels = example.label.split()
        if len(tokens_a) == 0:
            continue
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[:(max_seq_length - 2)]
            if labels is not None:
                labels = labels[:(max_seq_length - 2)]

        # ---------------- process source ----------------
        # add marker tokens at the start and end of the sentence
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        # convert tokens to ids
        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length

        # ---------------- process target ----------------
        if labels is not None:
            # Note: label_id does not include [CLS] and [SEP]
            label_id = [label_map[l] for l in labels]
            label_padding = [-1] * (max_seq_length - len(label_id))
            label_id += label_padding
        else:
            label_id = [-1] * max_seq_length

        # output_mask filters sub-word pieces out of the BERT output, keeping only
        # each word's first piece (as recommended by Jacob Devlin in the BERT
        # paper); it also makes the output fit the CRF layer
        output_mask = [
            0 if sub_vocab.get(t) is not None else 1 for t in tokens_a
        ]
        output_mask = [0] + output_mask + [0]
        output_mask += padding

        # ---------------- resulting layout ----------------
        # for example, in the case of max_seq_length=10:
        # raw_data:    春 秋 忽 代 谢le
        # token:       [CLS] 春 秋 忽 代 谢 #le [SEP]
        # input_ids:   101 2 12 13 16 14 15 102 0 0 0
        # input_mask:  1 1 1 1 1 1 1 1 0 0 0
        # label_id:    T T O O O
        # output_mask: 0 1 1 1 1 1 0 0 0 0 0
        # ---------------- sanity-check the output ----------------
        # if ex_index < 1:
        #     logger.info("-----------------Example-----------------")
        #     logger.info("guid: %s" % (example.guid))
        #     logger.info("text_a: %s" % example.text_a)
        #     logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
        #     logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        #     logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        #     logger.info("label: %s " % " ".join([str(x) for x in label_id]))
        #     logger.info("output_mask: %s " % " ".join([str(x) for x in output_mask]))
        # ----------------------------------------------------

        feature = InputFeature(input_ids=input_ids,
                               input_mask=input_mask,
                               label_id=label_id,
                               output_mask=output_mask)
        features.append(feature)
    return features
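

if __name__ == '__main__':
    # Minimal usage sketch (illustrative only, not part of the original module):
    # InputExample here is a hypothetical stand-in for whatever container
    # supplies the .guid/.text_a/.label attributes read above, and
    # 'bert-base-chinese' is an assumed checkpoint name. Running this also
    # requires the module-level `args` config the function reads (log_path,
    # labels, VOCAB_FILE), and every demo label must appear in args.labels.
    from collections import namedtuple
    from transformers import BertTokenizer  # assumption: transformers supplies the tokenizer, as in data_load.py

    InputExample = namedtuple('InputExample', ['guid', 'text_a', 'label'])
    tok = BertTokenizer.from_pretrained('bert-base-chinese')
    demo = [InputExample(guid='demo-0', text_a='春 秋 忽 代 谢le', label='T T O O O')]
    feats = convert_examples_to_features(demo, max_seq_length=10, tokenizer=tok)
    print(len(feats), feats[0].input_ids)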