def generate_ner(args) -> None:
    """
    Two steps in total:
        Step 1: run the model to perform named-entity recognition
        Step 2: split each article on Chinese sentence-final punctuation

    Args:
        args:
            --file_root : root path of data
    """
    file_names = scan_files(args.file_root)  # type: List[str]
    for file in file_names:
        data = load_file(args.file_root, file, "txt")

        # Part 1: compute the entity-recognition result for the current article
        prepare_data = prepare(data)  # type: np.ndarray
        result = predict(prepare_data)  # type: np.ndarray
        _, ner_result = decode_result(result=result, sent_pre=prepare_data,
                                      sent=data)
        pickle.dump(ner_result, open(args.file_root + file + "_ner.pkl", 'wb'))

        # Part 2: split the current article on (full stop / question mark /
        # exclamation mark) and record the (start, end) spans in a list
        start, end = 0, 0
        sentence_split_result = []
        stop_tokens = ["。", "!", "?"]
        for idx, c in enumerate(data):
            if c in stop_tokens:
                end = idx
                sentence_split_result.append((start, end))
                start = end + 1
        pickle.dump(sentence_split_result,
                    open(args.file_root + file + "_sentence_split.pkl", 'wb'))
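# A minimal usage sketch for generate_ner, assuming it is driven from the
# command line. The argparse wiring below is hypothetical; only the
# --file_root flag mirrors the docstring above.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--file_root", type=str, required=True,
                        help="root path of data")
    generate_ner(parser.parse_args())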
def make_predictions(pairs, encoder, decoder, char2i, outputfn,
                     batch_size=100, use_cuda=True):
    # Invert the char-to-index mapping so predicted indices can be decoded.
    i2char = {i: c for c, i in char2i.items()}
    batches = get_batches(pairs, batch_size, char2i,
                          PAD_symbol, use_cuda, test_mode=True)
    out = open(outputfn, "w")
    for batch in batches:
        preds = predict(encoder, decoder, batch,
                        list(char2i.keys()), use_cuda)
        for j in range(batch.size):
            # Position of the second EOS in column j; the prediction is
            # written up to (but not including) that position.
            eos = (preds[:, j] == EOS_index).nonzero().data[1][0]
            input_text = batch.inputs[j]
            input_enc = [i2char[int(c)] for c in batch.input_variable.t()[j]]
            pred = ''.join([i2char[int(c)] for c in preds[1:eos, j]])
            pred = replace_UNK(input_text, input_enc, pred)
            out.write(pred)
            out.write("\n")
    out.close()
def test(d, tgt_domain):
    # print('Trn Size', d.trn_X.shape, d.trn_y.shape, 'Test Size', d.test_X.shape, d.test_y.shape)
    # Sweep over ridge regularization strengths and evaluate each model.
    alphas = [0.001, 0.1, 1, 10, 100, 1000]
    # alphas = [1]
    models = [Ridge(alpha=a) for a in alphas]
    trn_mses, dev_mses, test_mses = [], [], []
    for model in models:
        trn_mse, dev_mse, test_mse = evaluate.predict(
            model, tgt_domain, d.trn_X, d.trn_y,
            d.dev_X, d.dev_y, d.test_X, d.test_y)
        trn_mses.append(trn_mse)
        dev_mses.append(dev_mse)
        test_mses.append(test_mse)
    return trn_mses, dev_mses, test_mses, models
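# A minimal sketch of model selection on top of test(): pick the alpha whose
# Ridge model has the lowest dev-set MSE. The objects d and tgt_domain are
# assumed to come from the surrounding project.
trn_mses, dev_mses, test_mses, models = test(d, tgt_domain)
best = min(range(len(models)), key=lambda i: dev_mses[i])
print("best alpha:", models[best].alpha, "test MSE:", test_mses[best])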
def scrapeBets():
    matchDF_filename = './csv/static/matchDF.csv'
    test = False
    if len(sys.argv) > 1:
        if sys.argv[1] == 'test':
            test = True
        else:
            url = sys.argv[1]
    search_regex = '.offering-games__table-row'
    time_regex = '.offering-games__link'

    def processGamesPlaying(driver):
        # Scrape the names of players currently playing on Flashscore.
        url = 'https://www.flashscore.com/table-tennis/'

        def doSomething(driver):
            button = driver.find_element_by_xpath(
                '//*[@id="live-table"]/div[1]/div/div[2]/div[1]')
            button.click()

        page_source, driver = doSomethingFetchPageSource(
            url, driver=driver, doSomething=[doSomething])
        text_file = open("./temp/gamesPlaying.txt", "w")
        text_file.write(page_source)
        text_file.close()
        search_regex = '.event__participant'
        s = BeautifulSoup(str(page_source), 'html.parser')
        players = s.select(search_regex)
        k = []
        for (index, player) in enumerate(players):
            p = player.text
            p = p[:p.find('(')].strip()
            k.append(p)
        return k

    cols = ['time', 'lTeam', 'rTeam', 'lLine', 'rLine', 'link']

    def processBetOnline(driver):
        # Scrape today's table-tennis moneylines from BetOnline.
        url = 'https://beta.betonline.ag/sportsbook/table-tennis/todaygames'
        base = 'https://beta.betonline.ag'
        page_source = fetchPS(url, test, driver)
        text_file = open("./temp/betonline.txt", "w")
        text_file.write(page_source)
        text_file.close()

        def formatTeamNames(teams):
            # "Last, First ..." -> "Last F."
            def formatSide(side):
                end = side.find(',')
                return (side[:end] + ' ' + side[end + 2:end + 3] + '.').strip()

            return [formatSide(teams[0].text), formatSide(teams[1].text)]

        s = BeautifulSoup(str(page_source), 'html.parser')
        df = pd.DataFrame([], columns=cols)
        search_regex = '.offering-today-games__table-row'
        time_regex = '.offering-today-games__link'
        matches = s.select(search_regex)
        for (index, match) in enumerate(matches):
            teams = match.select('.lines-row__team-name')
            lines = match.select('.lines-row__money')
            time = match.select(time_regex)
            teams = formatTeamNames(teams)
            link = base + time[0].get('href')
            k = pd.DataFrame([[
                time[0].text, teams[0], teams[1], lines[0].text.strip('()'),
                lines[1].text.strip('()'), link
            ]], columns=cols)
            df = df.append(k)
        df['platform'] = url
        return df.reset_index(0, drop=True)

    def processBovada(driver):
        # Scrape upcoming table-tennis moneylines from Bovada.
        url = 'https://www.bovada.lv/sports/table-tennis'
        base = 'https://www.bovada.lv'
        page_source = fetchPS(url, test, driver,
                              waitFor=['class', 'grouped-events'])
        text_file = open("./temp/bovada_ps.txt", "w")
        text_file.write(page_source)
        text_file.close()

        def formatLines(lines):
            def formatLine(line):
                l = line.strip()
                if l == 'EVEN':
                    return '+100'
                return l

            if len(lines) > 2:
                return [formatLine(lines[2].text), formatLine(lines[3].text)]
            return [formatLine(lines[0].text), formatLine(lines[1].text)]

        def formatTeamNames(teams):
            def formatSide(side):
                end = side.find(',')
                if end == -1:
                    # "First Last" -> "Last F."
                    end = side.find(' ')
                    return (side[end:] + ' ' + side[0:1] + '.').strip()
                return (side[:end] + ' ' + side[end + 2:end + 3] + '.').strip()

            return [formatSide(teams[0].text), formatSide(teams[1].text)]

        def formatTime(time):
            text = time[0].text
            e = text.find(' ', 2)
            return text[e + 1:]

        s = BeautifulSoup(str(page_source), 'html.parser')
        df = pd.DataFrame([], columns=cols)
        search_regex = '.coupon-content.more-info'
        time_regex = '.period'
        s = s.select('.next-events-bucket')
        if len(s) == 0:
            return df
        s = s[0]
        matches = s.select(search_regex)
        for (index, match) in enumerate(matches):
            teams = match.select('.competitor-name')
            link = match.select('.game-view-cta')
            link = link[0].find_all('a', href=True)[0].get('href')
            link = base + link
            lines = match.select('.bet-price')
            time = match.select(time_regex)
            lines = formatLines(lines)
            teams = formatTeamNames(teams)
            time = formatTime(time)
            k = pd.DataFrame([[
                time, teams[0], teams[1], lines[0].strip('()'),
                lines[1].strip('()'), link
            ]], columns=cols)
            df = df.append(k)
        df['platform'] = url
        return df.reset_index(0, drop=True)

    def findCorresponding(df, l, r):
        # Find the stored match whose two players are l and r (either order).
        k = df.loc[((df['lPlayer'] == r.strip()) & (df['rPlayer'] == l.strip()))
                   | ((df['lPlayer'] == l.strip()) & (df['rPlayer'] == r.strip()))]
        return k

    def getCorrespondingGames(df):
        gameDF = pd.read_csv(matchDF_filename)
        # Keep only matches without a recorded score yet.
        gameDF = gameDF.loc[gameDF['lScore'] == '-']
        d = pd.DataFrame()
        o = pd.DataFrame()
        for index, i in df.iterrows():
            k = findCorresponding(gameDF, i['lTeam'], i['rTeam'])
            if k.shape[0] != 0:
                d = d.append(k.iloc[0])
                i['merge_index'] = k.id.values[0]
                o = o.append(i)
        return d, o

    def fetchPS(url, test, driver, **kwargs):
        # In test mode, replay the last saved page source instead of fetching.
        ps = ''
        if test == False:
            if 'waitFor' in kwargs:
                ps, driver = fetchPageSource(url, waitFor=kwargs['waitFor'],
                                             driver=driver)
            else:
                ps, driver = fetchPageSource(url, driver=driver)
            text_file = open("./Debug/ps.txt", "w")
            text_file.write(ps)
            text_file.close()
        else:
            file = open('./Debug/ps.txt', 'r')
            ps = file.read()
            file.close()
        return ps

    driver = createDriver()
    gamesPlaying = processGamesPlaying(driver)
    k = processBetOnline(driver)
    l = processBovada(driver)
    k.to_csv('./temp/betOnline.csv')
    l.to_csv('./temp/bovada.csv')
    bettingSitesDF = [k, l]
    df = pd.concat(bettingSitesDF)

    def formatLines(df):
        # Canonicalize each row so the lexicographically smaller team name is
        # on the left, swapping the lines along with the teams.
        swap_mask = df['rTeam'].str.strip() > df['lTeam'].str.strip()
        df.loc[swap_mask, ['lTeam', 'rTeam', 'lLine', 'rLine']] = \
            df.loc[swap_mask][['rTeam', 'lTeam', 'rLine', 'lLine']].values
        df = df.sort_values('time')
        df.to_csv('./temp/combined.csv')
        return df

    df.to_csv('./temp/bettingSitesDF.csv')
    df = formatLines(df).reset_index()
    cdf, cbdf = getCorrespondingGames(df)
    cdf.to_csv('./temp/cdf.csv')
    cbdf.to_csv('./temp/cbdf.csv')
    ul = set(list(cdf['lPlayer'].unique()) + list(cdf['rPlayer'].unique()))
    mdf = pd.read_csv(matchDF_filename)
    udf = mdf[(mdf['lPlayer'].isin(ul)) | (mdf['rPlayer'].isin(ul))]
    udf.to_csv('./test_before.csv')
    formatted = formatter(udf, True, ignore_ids=cdf['id'])
    formatted = formatted[formatted['id'].isin(cdf['id'])]
    formatted = formatted.merge(mdf, on='id')

    def formatSequencer(df, seq):
        # Normalize the datetime strings into sortable yyyymmddhhmm integers:
        # rows without a time get a midnight placeholder, rows with a
        # "dd.mm. hh:mm" time get the year 2020 inserted.
        df.loc[df[seq].str.contains(' ') == False, seq] = df[seq] + '0000'
        df.loc[df[seq].str.contains(' '), seq] = \
            df[seq].str[0:6] + '2020' + df[seq].str[-5:]
        df[seq] = df[seq].str.replace('.', '')
        df[seq] = df[seq].str.replace(':', '')
        df[seq] = df[seq].str[4:8] + df[seq].str[2:4] + df[seq].str[0:2] \
            + df[seq].str[8:]
        df[seq] = df[seq].astype(int)
        return df

    formatted = formatSequencer(formatted, 'datetime')
    formatted.to_csv('./temp/merged.csv')
    predictions = predict(formatted)
    formatted['predictions'] = predictions.tolist()
    formatted['rWinPred'] = formatted['predictions'].apply(lambda x: x[0])
    formatted['lWinPred'] = formatted['predictions'].apply(lambda x: x[1])
    formatted = formatted.merge(cbdf, left_on='id', right_on='merge_index')
    formatted.to_csv('./temp/formatted.csv')
    formatted = formatted[formatted['lLine'] != '']
    formatted = formatted[formatted['rLine'] != '']
    formatted = swap(formatted, ['lTeam', 'Player_left'],
                     [['lTeam', 'rTeam'], ['lLine', 'rLine']])
    # Convert American moneylines to implied probabilities and compute the
    # model's edge on each side.
    formatted['lOdds'] = formatted['lLine'].astype(int).apply(americanToImplied)
    formatted['rOdds'] = formatted['rLine'].astype(int).apply(americanToImplied)
    formatted['ledge'] = round(formatted['lWinPred'] - formatted['lOdds'], 4)
    formatted['redge'] = round(formatted['rWinPred'] - formatted['rOdds'], 4)
    formatted['lOdds'] = round(formatted['lOdds'], 4)
    formatted['rOdds'] = round(formatted['rOdds'], 4)
    formatted['lWinPred'] = round(formatted['lWinPred'], 4)
    formatted['rWinPred'] = round(formatted['rWinPred'], 4)
    formatted = formatted.sort_values('datetime')
    formatted = getLargestInGroup(formatted, ['id'], 'ledge', 'redge')
    formatted = formatted.sort_values('datetime')
    formatted = filterOnlyNew(formatted, gamesPlaying, 'datetime')
    formatted = formatted.sort_values('datetime')
    cols = [
        'datetime', 'id', 'lTeam', 'rTeam', 'Player_left', 'Player_right',
        'lWinPred', 'rWinPred', 'lOdds', 'rOdds', 'lLine', 'rLine', 'ledge',
        'redge', 'platform', 'link'
    ]
    formatted[cols].to_csv('./predictions.csv')
    print("done")
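# For reference, a hypothetical sketch of the conversion americanToImplied is
# assumed to perform in scrapeBets(): American moneyline odds -> implied win
# probability. Example: +150 -> 100 / 250 = 0.4, and -150 -> 150 / 250 = 0.6.
def american_to_implied_sketch(odds: int) -> float:
    if odds > 0:
        return 100.0 / (odds + 100.0)
    return -odds / (-odds + 100.0)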
                     log_dir=args.logdir, verbose=args.verbose)

    ###############################################
    ##                  Predict                  ##
    ###############################################
    if not args.test:
        load_model(args.modeldir, cnn)
    else:
        logger.info('Testing on val set:')
        val_acc = test(cnn, val_iter, text_field, label_field,
                       cuda=args.cuda, verbose=args.verbose)
        predict(cnn, val_iter, text_field, label_field,
                os.path.join(args.predout, 'predict_val.txt'),
                cuda=args.cuda, verbose=args.verbose)
        predict(cnn, test_iter, text_field, label_field,
                os.path.join(args.predout, 'predict_test.txt'),
                cuda=args.cuda, verbose=args.verbose)
print(f"Shape of X_train: {X_train.shape}") print(f"Shape of X_test : {X_test.shape}\n") print(f"Shape of y_train: {y_train.shape}") print(f"Shape of y_test : {y_test.shape}\n") print(f"Ratio: {len(X_train)/len(X_test)}") # Training print("Start training...") model = train(X_train, y_train, model_path) print("Training done!!") # Evaluate ## Acc print("Train Acc") get_acc(model, X_train, y_train) print("\nTest Acc") get_acc(model, X_test, y_test) ## Print result print_report(model, X_test, y_test, le.classes_, report_path) ## Caculate Confidence caculate_confidence(model, X_test, y_test) ## Predict text = 'who are the actresses in the movies' predict(model, vectorizer, le, text)
import tensorflow as tf

import utilities
import visualize
import evaluate

if __name__ == '__main__':
    train_dataset, test_dataset, encoder = utilities.load_data()
    # Bidirectional LSTM text classifier over the pre-fitted encoder.
    model = tf.keras.Sequential([
        encoder,
        tf.keras.layers.Embedding(
            input_dim=len(encoder.get_vocabulary()),
            output_dim=64,
            mask_zero=True),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1)
    ])
    model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                  optimizer=tf.keras.optimizers.Adam(1e-4),
                  metrics=['accuracy'])
    history = model.fit(train_dataset,
                        epochs=1,
                        validation_data=test_dataset,
                        validation_steps=30)
    test_loss, test_acc = model.evaluate(test_dataset)
    visualize.display_results(test_loss, test_acc, history)
    evaluate.predict(model)
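    # A minimal inference sketch continuing the script above, assuming the
    # encoder is a TextVectorization layer baked into the model so raw
    # strings can be fed directly; sigmoid turns the single logit into a
    # probability. The sample sentence is illustrative only.
    sample = tf.constant(['the movie was great'])
    prob = tf.sigmoid(model.predict(sample))[0, 0]
    print("positive probability:", float(prob))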
def main():
    # Data preprocessing
    # dataList = []
    # for i in range(3, 11):
    #     dataList.append(PrepareData())
    #     args.train_file = '/Users/wangyihao/Pycharm/transformer-simple-master_new/data/data' + str(i) + '.p'
    data = PrepareData()
    args.src_vocab = len(data.en_word_dict)
    args.tgt_vocab = len(data.cn_word_dict)
    print("src_vocab %d" % args.src_vocab)
    print("tgt_vocab %d" % args.tgt_vocab)

    # Initialize the model
    model = make_model(args.src_vocab, args.tgt_vocab, args.layers,
                       args.d_model, args.d_ff, args.h_num, args.dropout)

    if args.type == 'train':
        # Train
        print(">>>>>>> start train")
        criterion = LabelSmoothing(args.tgt_vocab, padding_idx=0, smoothing=0.0)
        optimizer = NoamOpt(
            args.d_model, 1, 2000,
            torch.optim.Adam(model.parameters(), lr=0,
                             betas=(0.9, 0.98), eps=1e-9))
        train(data, model, criterion, optimizer)
        print("<<<<<<< finished train")
    elif args.type == "evaluate":
        # Evaluate: first check that a trained model exists (prerequisite)
        if os.path.exists(args.save_file):
            # Load the model
            model.load_state_dict(torch.load(args.save_file))
            # Start evaluating
            print(">>>>>>> start evaluate")
            precision = evaluate(data, model)
            TP_total = precision.sum(axis=0)[0]
            FP_total = precision.sum(axis=0)[1]
            TN_total = precision.sum(axis=0)[2]
            FN_total = precision.sum(axis=0)[3]
            TPR = TP_total / (TP_total + FN_total)  # true positive rate
            TNR = TN_total / (TN_total + FP_total)  # true negative rate
            print('total true positive amount: %.3f, total false negative amount: %.3f'
                  % (TP_total, FN_total))
            print('total true negative amount: %.3f, total false positive amount: %.3f'
                  % (TN_total, FP_total))
            print('symbol within feature TPR: %.3f, delimiter TNR: %.3f'
                  % (TPR, TNR))
            print("<<<<<<< finished evaluate")
        else:
            print("Error: please train before evaluate")
    elif args.type == "predict":
        # Take input features and predict (requires a saved model)
        if os.path.exists(args.save_file):
            # Load the model
            model.load_state_dict(torch.load(args.save_file))
            # Start predicting
            print(">>>>>>> start predict")
            translation = predict(data, model)
            print("<<<<<<< finished predict")
    else:
        print("Error: please select type within [train / evaluate / predict]")
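# For reference, the learning-rate schedule that NoamOpt(d_model, factor,
# warmup, optimizer) above is assumed to implement, following "Attention Is
# All You Need": lr = factor * d_model**-0.5 * min(step**-0.5, step * warmup**-1.5)
def noam_lr_sketch(step, d_model, factor=1, warmup=2000):
    return factor * d_model ** -0.5 * min(step ** -0.5, step * warmup ** -1.5)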
def predict(self, image_arr, x_test, t1_image, out_file):
    # Run the trained model on x_test and save the predicted mask
    # alongside the T1 image.
    y_pred = evaluate.predict(self.model, image_arr, x_test)
    save_mask(t1_image, x_test, y_pred, out_file)