def load_url(time):
    result = dict()
    num = '100'
    purl = NewsHomeURL()
    channels = ['news_china_suda', 'news_world_suda', 'news_society_suda']
    out_path = "F:/scrapy/sina_data1.1.0/news_detail_url/" + time + "/"

    # Channel index pages (related builders: finance_url, entertainment_url,
    # military_url, technology_url, sports_url)
    for c in channels:
        url = purl.news_3url(c, time, top_show_num=100)
        news_detail_url = get_news_detail_url(url, out_path)
        result.update(news_detail_url)

    in_url = purl.integrated_channel(time, num)
    sports = purl.sports_url(time, num)
    cj = purl.finance_url(time, num)         # finance
    yl = purl.entertainment_url(time, num)   # entertainment
    kj = purl.technology_url(time, num)      # technology
    jc = purl.military_url(time, num)        # military

    in_url = get_news_detail_url(in_url, out_path)
    sports = get_news_detail_url(sports, out_path)
    cj = get_news_detail_url(cj, out_path)
    yl = get_news_detail_url(yl, out_path)
    kj = get_news_detail_url(kj, out_path)
    jc = get_news_detail_url(jc, out_path)

    # result.update(in_url)
    result.update(sports)
    result.update(cj)
    result.update(yl)
    result.update(kj)
    result.update(jc)

    path = "F:/scrapy/sina_data1.1.0/news_detail_url/" + time + "/"
    name = "all_parsed.csv"

    # Save the data
    pd.Series(result).to_csv(path + name, index=False)
    pd.Series(in_url).to_csv(path + "integrated_parsed.csv", index=False)

    tmp = []
    for url, channel in result.values():
        tmp.append([url, channel])
    to_csv(path, name, tmp)

    tmp = []
    for url, channel in in_url.values():
        tmp.append([url, channel])
    to_csv(path, "integrated_parsed.csv", tmp)  # save the integrated-channel news

    return result
def make_submission():
    submission_dataset = pd.read_csv('test.csv')
    X_submission = submission_dataset.iloc[:, [i - 1 for i in selected_features]].values
    ids = submission_dataset.iloc[:, 0].values
    print("replacing missing values")
    print("number of examples in test: " + str(len(X_submission[:, 0])))

    # Values below -0.5 in the test matrix are treated as missing and replaced
    # using statistics computed on the training matrix X.
    for i in range(len(X[0, :])):
        if i <= categorical_features_count:
            # For a categorical variable, the strategy is to replace missing
            # values with the most frequent (non-negative) value.
            (values, counts) = np.unique(X[:, i], return_counts=True)
            counts = [counts[k] if values[k] >= 0 else 0 for k in range(len(values))]
            ind = np.argmax(counts)
            column_ranges.append(max(values))
            replacement_value = values[ind]
        else:
            # Otherwise we simply take the mean.
            replacement_value = np.mean(X[:, i])
        for j in range(len(X_submission[:, i])):
            if X_submission[j, i] < -0.5:
                X_submission[j, i] = replacement_value

    y_submission = gbm.predict(X_submission, num_iteration=gbm.best_iteration)

    from tools import to_csv

    # Rescale the predictions before writing the submission.
    minimum = 1
    maximum = 0
    epsilon = 0.01
    for y_i in y_submission:
        if y_i < minimum:
            minimum = y_i
        if y_i > maximum:
            maximum = y_i
    y_submission = y_submission - minimum + epsilon
    y_submission = y_submission / (maximum - minimum)
    y_submission = y_submission / 2
    to_csv(y_submission, ids)
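# The to_csv helper imported from tools above is not shown in this snippet.
# A minimal sketch of what such a helper might look like, assuming it writes
# one (id, prediction) row per example; the file name and the 'id'/'target'
# column names are assumptions, not the actual tools implementation.
import csv

def to_csv_sketch(predictions, ids, path='submission.csv'):
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['id', 'target'])  # assumed header
        for id_, y_i in zip(ids, predictions):
            writer.writerow([id_, y_i])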
def report(d, hypothesis, ppln_name, grid):
    row_range = list(range(len(list(grid.cv_results_.values())[0])))
    rows = [[] for _ in row_range]
    keys = []
    csv_name = '.'.join([hypothesis, d.__class__.__name__, ppln_name])

    # Transpose cv_results_ from column arrays into per-candidate rows.
    for key, col_array in list(grid.cv_results_.items()):
        if len(keys) < len(grid.cv_results_.keys()):
            keys.append(key)
        for i, cell in enumerate(col_array):
            rows[i].append(cell)

    rows = sorted(rows, reverse=True, key=lambda x: x[keys.index('mean_test_score')])
    to_pickle(grid, 'models/%s.p' % csv_name)
    to_csv([keys] + rows, 'reports/%s.csv' % csv_name)
def get_news_detail_url(index_url, out_path=None):
    '''
    :param index_url: URL of a news index (home) page
    :param out_path: directory in which to save the raw page and the parsed rows
    :return: dict mapping news_id -> (url, channel)
    '''
    channel = get_channel(url=index_url)     # get the news channel
    home_html = get_html(index_url).strip()  # fetch the index page for this news type
    news_data = parse_home_data(home_html)   # parse the news entries
    if out_path is not None:
        mkdir(out_path)
        save_data_txt(out_path, channel + "_resource.txt", home_html)  # save the raw page
        to_csv(out_path, channel + "_parsed.csv", news_data)
    result = dict()
    for news in news_data:
        news_id = news[0]
        url = news[2]
        # tmp = {"news_id": news_id, "url": url, "channel": channel}
        result[str(news_id)] = (url, channel)
    return result
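# Minimal usage sketch for get_news_detail_url; the date, the channel builder
# call, and the output directory below are illustrative placeholders only.
if __name__ == '__main__':
    purl = NewsHomeURL()
    index_url = purl.sports_url('2018-01-01', '100')  # hypothetical date / item count
    out_dir = "F:/scrapy/sina_data1.1.0/news_detail_url/2018-01-01/"
    detail_urls = get_news_detail_url(index_url, out_dir)
    for news_id, (url, channel) in detail_urls.items():
        print(news_id, channel, url)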
def report(k, hypothesis, classifier, rows, keys, arg_val, grid):
    cls_name = classifier.__class__.__name__
    n_candidates = len(list(grid.cv_results_.values())[0])
    _rows = [[] for _ in range(n_candidates)]
    cls_row = [cls_name for _ in range(n_candidates)]
    k_row = [k for _ in range(n_candidates)]

    for key, col_array in (
            [('k', k_row)] + [('classifier', cls_row)] + list(grid.cv_results_.items())
    ):
        if len(keys) < len(grid.cv_results_.keys()) + 2:
            keys.append(key)
        for i, cell in enumerate(col_array):
            _rows[i].append(cell)

    rows += _rows
    rows = sorted(
        rows,
        reverse=True,
        key=lambda x: x[keys.index('mean_test_score')]
    )

    best_f = rows[0][keys.index('mean_test_score')]
    if best_f > arg_val:
        print('>>>> NEW BEST:', rows[0])
        to_pickle(grid, 'models/%s.p' % hypothesis)
        arg_val = best_f

    to_csv(
        [keys] + rows,
        'reports/%s.csv' % hypothesis
    )
    to_csv(
        summarize_csv([keys] + rows, columns),
        'reports/%s.summary.csv' % hypothesis
    )
    return arg_val
_Xr = reducer.transform(_X)

# Train stacked classifier
stacked = stack()
stacked.fit(_Xr, Y)

# Predict with stacked classifier
__X, __Y = list(zip(*d.test()))
Y_ = stacked.predict(reducer.transform(vec.transform(__X)))

p = precision(__Y, Y_)
r = recall(__Y, Y_)
f = f1(__Y, Y_)
a = accuracy(__Y, Y_)

row = (d.__class__.__name__, ppln_name, vec.__class__.__name__,
       cls.__class__.__name__, stacked.__class__.__name__, p, r, f, a)
rows.append(row)
to_csv([keys] + sorted(rows, reverse=True, key=lambda x: x[-2]),
       'reports/%s.csv' % hypothesis)

if f > arg_val:
    arg_val = f
    model = {
        'grid': grid,
        'reducer': reducer,
        'stacked': stacked
    }
    to_pickle(model, 'models/%s.p' % hypothesis)
    reg_lambda=reg_lambda
    # eval_metric=eval_metric
)
classifier.fit(X_train, y_train)
t2 = time.time()
print(t2 - t1)

# Predicting the test set results
y_pred = classifier.predict_proba(X_test)[:, 1]
y_pred_train = classifier.predict_proba(X_train)[:, 1]

print("log loss (train): ")
log_score = log_loss(y_train, y_pred_train)
print(log_score)
print("log loss (test): ")
log_score = log_loss(y_test, y_pred)
print(log_score)
print("mean of y_pred")
print(np.mean(y_pred))

evaluation_dataset = pd.read_csv('testing.csv')
X_eval = evaluation_dataset.iloc[:, 2:].values  # to be changed!!
y_pred_eval = classifier.predict_proba(X_eval)[:, 1]
msno = evaluation_dataset.iloc[:, 0].values
to_csv(y_pred_eval, msno)
y_train_pred = (y_train_pred_1 + y_train_pred_2 + y_train_pred_3) / 3

print("Gini train number 1: ")
print(gini_normalized(y_train, y_train_pred_1))
print("Gini train number 2: ")
print(gini_normalized(y_train, y_train_pred_2))
print("Gini train number 3: ")
print(gini_normalized(y_train, y_train_pred_3))
print("Gini train mean on all trees: ")
print(gini_normalized(y_train, y_train_pred))

# Average the raw scores of the three boosters on the test set.
y_test_pred = (clf_1.predict(X_test, raw_score=True)
               + clf_2.predict(X_test, raw_score=True)
               + clf_3.predict(X_test, raw_score=True)) / 3

clf_1.save_model('clf_1.txt')
clf_2.save_model('clf_2.txt')
clf_3.save_model('clf_3.txt')

print(y_test_pred)
np.savetxt("y_test_pred", y_test_pred)

ids = test.iloc[:, 0].values
from tools import to_csv
to_csv(y_test_pred, ids)
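# Sketch of how the saved models could be reloaded later for prediction,
# assuming clf_1/clf_2/clf_3 are LightGBM Boosters written by save_model above;
# X_test is reused from the snippet above.
import lightgbm as lgb

boosters = [lgb.Booster(model_file=name)
            for name in ('clf_1.txt', 'clf_2.txt', 'clf_3.txt')]
y_test_pred_reloaded = sum(b.predict(X_test, raw_score=True) for b in boosters) / 3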
param_grid = param_grids[ppln_name]

# 'GridSearchCV' defaults to stratified k-fold
grid = GridSearchCV(ppln, cv=CV, n_jobs=N_JOBS, verbose=VERBOSITY,
                    param_grid=param_grid, refit=True)
grid.fit(X, Y)
# print(grid.best_estimator_.named_steps.items())

report(d, hypothesis, ppln_name, grid)

# Predict on the held-out split
vec = grid.best_estimator_['vec']
cls = grid.best_estimator_['cls']
X, Y = list(zip(*d.test()))
Y_ = cls.predict(vec.transform(X))

p = precision(Y, Y_)
r = recall(Y, Y_)
f = f1(Y, Y_)
a = accuracy(Y, Y_)

row = (d.__class__.__name__, ppln_name, vec.__class__.__name__,
       cls.__class__.__name__, p, r, f, a)
rows.append(row)
to_csv(rows, 'reports/%s.csv' % hypothesis)