def _fetch_test_data(self): """ 获取测试数据 """ session = get_db_session() objs = session.query(Pinkunhu2015).filter( Pinkunhu2015.county == '彝良县').all() X, Y = [], [] for item in objs: col_list = [] for col in [ 'tv', 'washing_machine', 'fridge', 'reason', 'is_danger_house', 'is_back_poor', 'is_debt', 'standard', 'arable_land', 'debt_total', 'living_space', 'member_count', 'person_year_total_income', 'year_total_income', 'subsidy_total', 'wood_land', 'xin_nong_he_total', 'xin_yang_lao_total', 'call_number', 'bank_name', 'bank_number', 'help_plan' ]: normalized_value = normalize(col, getattr(item, col)) col_list.append(normalized_value) X.append(col_list) normalized_value = normalize('poor_status', getattr(item, 'poor_status')) Y.append(normalized_value) return X, Y
def _fetch_data(self): """ 获取建模数据 """ session = get_db_session() objs = session.query(Pinkunhu2015).filter( Pinkunhu2015.county == 'A县', Pinkunhu2015.ny_person_income != -1, Pinkunhu2015.person_year_total_income > 0, Pinkunhu2015.person_year_total_income < 7000, ).all() X, Y = [], [] for item in objs: col_list = [] for col in self.features: normalized_value = normalize(col, getattr(item, col)) col_list.append(normalized_value) X.append(col_list) normalized_value = normalize(self.target, getattr(item, self.target)) Y.append(normalized_value) # # 筛掉可能有错误的数据 # 人均年收入除以100后,查看分布,少于5次的不纳入模型, 效果不佳,废弃 # df = pd.DataFrame(X, columns=self.features) # print '#df.shape:', df.shape # df['person_year_total_income'] = df['person_year_total_income'] / 100 # df['person_year_total_income'] = df['person_year_total_income'].astype(int) # df['person_year_total_income'] = df['person_year_total_income'] * 100 # df = df.groupby('person_year_total_income').filter(lambda x: len(x) > 5) # print '#df.shape:', df.shape # X, Y = df.loc[:, self.features[:-1]], df.loc[:, self.target] return X, Y
def _fetch_test_data(self): """ 获取测试数据 """ session = get_db_session() objs = session.query(Pinkunhu2015).filter( Pinkunhu2015.county == 'B县', Pinkunhu2015.ny_person_income != -1, Pinkunhu2015.person_year_total_income > 0, Pinkunhu2015.person_year_total_income < 7000, Pinkunhu2015.ny_person_income > 0, Pinkunhu2015.ny_person_income < 7000, ).all() X, Y = [], [] for item in objs: col_list = [] for col in self.features: normalized_value = normalize(col, getattr(item, col)) col_list.append(normalized_value) X.append(col_list) normalized_value = normalize(self.target, getattr(item, self.target)) Y.append(normalized_value) # 设置虚拟变量 df = pd.DataFrame(X, columns=self.features) for item in self.dummy_features: dummies = pd.get_dummies(df[item], prefix=item) df = df.join(dummies) # 删除已设置虚拟变量的原变量 df = df.drop(self.dummy_features, axis=1) X = df.loc[:] return X, Y
def _fetch_data(self): """ 获取建模数据 """ session = get_db_session() objs = session.query(Pinkunhu2015).filter(Pinkunhu2015.county == 'A县').all() X, Y = [], [] for item in objs: col_list = [] for col in self.features: normalized_value = normalize(col, getattr(item, col)) col_list.append(normalized_value) X.append(col_list) normalized_value = normalize(self.target, getattr(item, self.target)) Y.append(normalized_value) return X, Y
def stat_col_percent(self, col): top_num = 10 session = get_db_session() objs = session.query(getattr( Pinkunhu2015, col)).filter(Pinkunhu2015.poor_status != '已脱贫') res0 = {} for item in objs: k = (getattr(item, col) or '')[:2] cnt = res0.get(k, 0) res0[k] = cnt + 1 self.plot_col_name(self.count_to_percent(res0)[:20], title='%s-未脱贫' % col) objs = session.query(getattr(Pinkunhu2015, col)) res1 = {} for item in objs: k = (getattr(item, col) or '')[:2] cnt = res1.get(k, 0) res1[k] = cnt + 1 self.plot_col_name(self.count_to_percent(res1)[:20], title='%s-全部' % col) objs2 = session.query(getattr( Pinkunhu2015, col)).filter(Pinkunhu2015.poor_status == '已脱贫') res2 = {} for item in objs2: k = (getattr(item, col) or '')[:2] cnt = res2.get(k, 0) res2[k] = cnt + 1 self.plot_col_name(self.count_to_percent(res2)[:20], title='%s-已脱贫' % col) res = [] for k, v in res1.iteritems(): print k res.append((k, res2.get(k, 0) * 100.0 / v)) res = sorted(res, key=lambda x: -x[1]) result = [] for item in res: if res2.get(item[0]) > 200: result.append( ('%s (%s户)' % (item[0], res2.get(item[0])), item[1])) self.plot_col_name(result[:top_num], title='%s-百分比' % col)
def _fetch_test_data(self): """ 获取测试数据 """ session = get_db_session() objs = session.query(Pinkunhu2015).filter( Pinkunhu2015.county == 'B县', Pinkunhu2015.ny_person_income != -1, Pinkunhu2015.person_year_total_income > 0, Pinkunhu2015.person_year_total_income < 7000, ).all() X, Y = [], [] for item in objs: col_list = [] for col in self.features: normalized_value = normalize(col, getattr(item, col)) col_list.append(normalized_value) X.append(col_list) normalized_value = normalize(self.target, getattr(item, self.target)) Y.append(normalized_value) return X, Y
def get_normalize(): """ 根据数据库列值自动生成标量的map 文件头部的 MAPPINGS 就是用该方法生成的 """ session = get_db_session() # 字符串标量 转化成数字标量 res = OrderedDict() for col in [ 'reason', 'is_danger_house', 'is_back_poor', 'is_danger_house', 'is_debt', 'poor_status', 'standard' ]: data = OrderedDict() idx = 1 for item, in session.query(getattr(Pinkunhu2015, col)).distinct(): data[item] = idx idx += 1 res[col] = data print json.dumps(res, ensure_ascii=False, indent=2)
def chart_person_year_total_income(): # 获取数据 session = get_db_session() objs = session.query(Pinkunhu2015.person_year_total_income, Pinkunhu2015.ny_person_income).filter( Pinkunhu2015.county == 'A县', Pinkunhu2015.ny_person_income >= -1, Pinkunhu2015.ny_person_income < 7000, Pinkunhu2015.person_year_total_income > 0, Pinkunhu2015.person_year_total_income < 7000, ).all() X, Y = [], [] for item in objs: X.append(item.person_year_total_income) Y.append( item.ny_person_income if item.ny_person_income != -1 else -1000) # 绘图 fig = plt.figure(1, figsize=(10, 10)) ax1 = fig.add_subplot(221) ax2 = fig.add_subplot(222) ax3 = fig.add_subplot(223) ax4 = fig.add_subplot(224) # 画点图 ax1.plot(X, Y, 'k.', markersize=2) # 画热力图 heatmap_16, extent_16 = myplot(X, Y, nb=16) heatmap_32, extent_32 = myplot(X, Y, nb=32) heatmap_64, extent_64 = myplot(X, Y, nb=64) ax2.imshow(heatmap_16, extent=extent_16, origin='lower', aspect='auto') ax2.set_title("Smoothing over 16 neighbors") ax3.imshow(heatmap_32, extent=extent_32, origin='lower', aspect='auto') ax3.set_title("Smoothing over 32 neighbors") #Make the heatmap using a smoothing over 64 neighbors ax4.imshow(heatmap_64, extent=extent_64, origin='lower', aspect='auto') ax4.set_title("Smoothing over 64 neighbors") plt.show()
def update_next_year(): """ 更新下一年是否脱贫以及下一年收入数据 """ step = 100 # 一次取100条数据 session = get_db_session() # 2016年数据 mappings2016 = {} min_id, max_id = session.query(func.min(Pinkunhu2016.id), func.max(Pinkunhu2016.id)).one() while min_id <= max_id: objs = session.query(Pinkunhu2016).filter( Pinkunhu2016.id >= min_id, Pinkunhu2016.id <= min_id + step).all() for item in objs: mappings2016[item.card_number] = True mappings2016[item.card_number + 'year_total_income'] = item.year_total_income mappings2016[ item.card_number + 'person_year_total_income'] = item.person_year_total_income min_id += step print 'min_id:%s' % min_id # 2015年数据 mappings2015 = {} min_id, max_id = session.query(func.min(Pinkunhu2015.id), func.max(Pinkunhu2015.id)).one() while min_id <= max_id: objs = session.query(Pinkunhu2015).filter( Pinkunhu2015.id >= min_id, Pinkunhu2015.id <= min_id + step).all() for item in objs: # 存取2015年数据 mappings2015[item.card_number] = True mappings2015[item.card_number + 'year_total_income'] = item.year_total_income mappings2015[ item.card_number + 'person_year_total_income'] = item.person_year_total_income # 设置下一年数据 if mappings2016.get(item.card_number): item.ny_is_poor = 1 item.ny_total_income = mappings2016[item.card_number + 'year_total_income'] item.ny_person_income = mappings2016[ item.card_number + 'person_year_total_income'] else: print item.id, '不存在' min_id += step print 'min_id:%s' % min_id # 2014年数据 min_id, max_id = session.query(func.min(Pinkunhu2014.id), func.max(Pinkunhu2014.id)).one() while min_id <= max_id: objs = session.query(Pinkunhu2014).filter( Pinkunhu2014.id >= min_id, Pinkunhu2014.id <= min_id + step).all() for item in objs: # 设置下一年数据 if mappings2015.get(item.card_number): item.ny_is_poor = 1 item.ny_total_income = mappings2015[item.card_number + 'year_total_income'] item.ny_person_income = mappings2015[ item.card_number + 'person_year_total_income'] else: print item.id, '不存在' min_id += step print 'min_id:%s' % min_id session.flush() session.commit()