def DealData( filename, confDir1, confDir2, Cur_day, cid, bid, analysisDir, resultDir1 ): #resultDir1:sys.argv[1]+'/data/data_analysis/'+tcb+'/corrxx.csv' if os.path.exists(filename) == False: sM.run(filename + " doesn't exist at step 5_correlation") else: data = ReadData(filename) obData = pd.DataFrame(data) feature, conf2 = GetConf(confDir1, confDir2, Cur_day, cid, bid) obData.columns = conf2 obData = obData[feature] obData = np.array(obData).astype(float) rowvar, bias, ddof, corts = GetAnalysisConf(analysisDir) corr = np.corrcoef(obData, y=None, rowvar=rowvar, bias=bias, ddof=ddof) corrdf = pd.DataFrame(corr) #得到相关矩阵 corrdf.columns = feature corrdf.index = feature corrdf.to_csv(resultDir1, sep='\t') nrow = corrdf.shape[0] corrlist = [] for i in range(1, nrow): for j in range(i): corrlist.append( [corrdf.index[i], corrdf.columns[j], corrdf.iloc[i, j]]) corr2sql = pd.DataFrame(corrlist) corr2sql.columns = ['feature_name1', 'feature_name2', 'corr_value'] corr2sql['cid'] = cid corr2sql['bid'] = bid corr2sql['l_date'] = Cur_day datanum = len(corr2sql) Deletesql(Cur_day, cid, bid) for i in range(datanum): InsertSql(corr2sql.iloc[i, :]) print "corr2sql finished and the number is {0}".format(datanum)
def DealData(filename,confDir1,confDir2,Cur_day,cid,bid,analysisDir,resultDir1): #resultDir1:sys.argv[1]+'/data/data_analysis/'+tcb+'/corrxx.csv' if os.path.exists(filename)==False: sM.run(filename + " doesn't exist at step 5_correlation") else: data=ReadData(filename) obData=pd.DataFrame(data) feature, conf2 = GetConf(confDir1,confDir2,Cur_day,cid,bid) obData.columns=conf2 obData=obData[feature] obData=np.array(obData).astype(float) rowvar,bias,ddof,corts=GetAnalysisConf(analysisDir) corr=np.corrcoef(obData, y=None, rowvar=rowvar, bias=bias, ddof=ddof) corrdf=pd.DataFrame(corr) #得到相关矩阵 corrdf.columns=feature corrdf.index=feature corrdf.to_csv(resultDir1,sep='\t') nrow=corrdf.shape[0] corrlist=[] for i in range(1,nrow): for j in range(i): corrlist.append([corrdf.index[i],corrdf.columns[j],corrdf.iloc[i,j]]) corr2sql=pd.DataFrame(corrlist) corr2sql.columns=['feature_name1','feature_name2','corr_value'] corr2sql['cid']=cid corr2sql['bid']=bid corr2sql['l_date']=Cur_day datanum=len(corr2sql) Deletesql(Cur_day, cid, bid) for i in range(datanum): InsertSql(corr2sql.iloc[i,:]) print "corr2sql finished and the number is {0}".format(datanum)
def GetAnalysisConf(analysisDir):
    """Read the [correlation] section of the analysis config file.

    Returns (rowvar, bias, ddof, corts); ddof comes back as int, or None when
    the option holds the literal string 'None'. Reports via sendMessage and
    returns None when the file itself is missing.
    """
    if not os.path.exists(analysisDir):
        sM.run(analysisDir + " doesn't exist at step 5_correlation")
        return None
    parser = ConfigParser.ConfigParser()
    parser.read(analysisDir)
    section = 'correlation'
    rowvar = parser.getint(section, 'rowvar')  # non-zero: rows are variables
    bias = parser.getint(section, 'bias')      # 1: normalize by N instead of N-1
    raw_ddof = parser.get(section, 'ddof')
    ddof = None if raw_ddof == 'None' else int(raw_ddof)
    corts = parser.getfloat(section, 'corts')  # correlation standard/threshold
    return rowvar, bias, ddof, corts
def DealData(filename, confDir1, confDir2, Cur_day, cid, bid):
    """Replace 'NULL'/'NaN' placeholders in every feature column and build a
    per-feature basic-statistics table.

    Returns (obData, summary) where obData is the cleaned DataFrame and
    summary has one row per feature (the transposed statistics table);
    implicitly returns None when the input file is missing.
    """
    if os.path.exists(filename) == False:
        sM.run(filename + " doesn't exist at step 0_datareplace")
    else:
        data = ReadData(filename)
        obData = pd.DataFrame(data)
        feature, conf2, rpv, zero_effect = GetConf(confDir1, confDir2, Cur_day, cid, bid)
        obData.columns = conf2
        nrow, ncol = obData.shape
        data_basic_summary = pd.DataFrame(0, index=np.arange(16), columns=feature)
        for i in range(len(feature)):
            my_series = obData[feature[i]]
            datasummary = BasicSummary(my_series, zero_effect[i])
            # swap the missing-value placeholders for the configured replacement
            my_series = my_series.replace('NULL', rpv[i])
            my_series = my_series.replace('NaN', rpv[i])
            obData[feature[i]] = my_series
            my_series = my_series.astype(float)
            # NOTE(review): Series.append was removed in pandas 2.x — this file
            # targets the legacy pandas it was written against.
            datasummary1 = my_series.describe(
                percentiles=[.25, .5, .75, .98]).append(BasicSummary1(my_series))
            data_basic_summary.iloc[:, i] = np.array(datasummary.append(datasummary1))
        # BUG FIX: the 13th label was a duplicated 'skew'; the InsertSql column
        # sequence (skew, skewtest, kurtosis, kurtosistest, ...) shows it must
        # be 'skewtest'.
        data_basic_summary.index = ('validcount', 'covrate', 'count', 'mean', 'std',
                                    'min', '25%', '50%', '75%', '98%', 'max',
                                    'skew', 'skewtest', 'kurtosis', 'kurtosistest',
                                    'coefficient of variation')
        data_basic_summary.loc['range'] = data_basic_summary.loc['max'] - data_basic_summary.loc['min']
        data_basic_summary.loc['cid'] = cid
        data_basic_summary.loc['bid'] = bid
        return obData, data_basic_summary.T
def GetFeature(confDir):
    """Read the comma-separated feature-name list from the first line of *confDir*.

    Returns the list of names, or None (after reporting through sendMessage)
    when the file is missing.
    """
    if not os.path.exists(confDir):
        sM.run("error happend when GetFeature at step 0_datareplace")
        return None
    # FIX: close the handle deterministically (the original leaked it)
    with open(confDir) as fh:
        lines = fh.readlines()
    return lines[0].strip().split(',')
def GetAnalysisConf(analysisDir):
    """Return the dependent-variable column name ([anova] section, option 'y'),
    or None when the analysis config file is missing."""
    if not os.path.exists(analysisDir):
        sM.run(analysisDir + " doesn't exist at step 6_anova")
        return None
    parser = ConfigParser.ConfigParser()
    parser.read(analysisDir)
    return parser.get('anova', 'y')
def GetAnalysisConf(analysisDir):
    """Return the dominance threshold d ([threshold] section, float option 'd').

    Returns None (after reporting through sendMessage) when the config file
    does not exist.
    """
    # BUG FIX: the original tested os.path.exists(filename), but `filename`
    # is not defined in this function (NameError at call time); the path to
    # check is the `analysisDir` parameter.
    if not os.path.exists(analysisDir):
        sM.run(analysisDir + " doesn't exist at step 1_threshold")
        return None
    confp = ConfigParser.ConfigParser()
    confp.read(analysisDir)
    d = confp.getfloat('threshold', 'd')
    return d
def GetFeature(confDir):
    """Load the feature-name list for the datareplace step.

    The .feature file holds one comma-separated line of names; returns that
    list, or None (after reporting) when the file does not exist.
    """
    if not os.path.exists(confDir):
        sM.run("error happend when GetFeature at step 0_datareplace")
        return None
    # FIX: use a context manager so the file handle is always closed
    # (the original opened the file and never closed it)
    with open(confDir) as src:
        first = src.readlines()[0]
    return first.strip().split(',')
def GetFeature(confDir):
    """Return the comma-separated feature names from the first line of
    *confDir*, or None (after reporting) when the file is absent."""
    if not os.path.exists(confDir):
        sM.run(confDir + " doesn't exist when GetFeature at step 1_threshold")
        return None
    src = open(confDir)
    lines = src.readlines()
    names = lines[0].strip().split(',')
    src.close()
    return names
def GetFeature(confDir):
    """Load the feature list used by the correlation step: the first line of
    the .feature file, split on commas."""
    if os.path.exists(confDir):
        fh = open(confDir)
        header = fh.readlines()[0]
        result = header.strip().split(',')
        fh.close()
        return result
    sM.run(confDir + " doesn't exist when GetFeature at step 5_correlation")
def GetAnalysisConf(analysisDir):
    """Return (method, nstd, topn) from the [extrenum] section of the analysis
    config, or None when the file is missing."""
    if not os.path.exists(analysisDir):
        sM.run(analysisDir + " doesn't exist at step 4_extrenum")
        return None
    parser = ConfigParser.ConfigParser()
    parser.read(analysisDir)
    method = parser.get('extrenum', 'method')
    nstd = parser.getint('extrenum', 'nstd')
    topn = parser.getint('extrenum', 'topn')
    return method, nstd, topn
def GetAnalysisConf(analysisDir):
    """Return (logx, logy, bins, ctrbins) from the [hist] section.

    The 'bins' option is a comma-separated string; it is decoded into a
    Python list by wrapping it in brackets and parsing as JSON.
    """
    if not os.path.exists(analysisDir):
        sM.run(analysisDir + " doesn't exist at step 3_histogram")
        return None
    parser = ConfigParser.ConfigParser()
    parser.read(analysisDir)
    logx = parser.getboolean('hist', 'logx')
    logy = parser.getboolean('hist', 'logy')
    ctrbins = parser.getint('hist', 'ctrbins')
    bins = json.loads('[' + parser.get('hist', 'bins') + ']')
    return logx, logy, bins, ctrbins
def GetAnalysisConf(analysisDir):
    """Fetch the anova target column name ([anova] y) from the analysis
    configuration file; reports and returns None when it is absent."""
    if os.path.exists(analysisDir):
        cfg = ConfigParser.ConfigParser()
        cfg.read(analysisDir)
        y = cfg.get('anova', 'y')
        return y
    sM.run(analysisDir + " doesn't exist at step 6_anova")
def GetAnalysisConf(analysisDir):
    """Read the extreme-value handling settings from the [extrenum] section.

    Returns (method, nstd, topn); reports and returns None when the config
    file does not exist.
    """
    if os.path.exists(analysisDir):
        cfg = ConfigParser.ConfigParser()
        cfg.read(analysisDir)
        section = 'extrenum'
        return (cfg.get(section, 'method'),
                cfg.getint(section, 'nstd'),
                cfg.getint(section, 'topn'))
    sM.run(analysisDir + " doesn't exist at step 4_extrenum")
def OutputResult(filename, confDir1, confDir2, Cur_day, cid, bid, analysisDir, resultDir):
    """Render a value histogram and a CTR plot for every configured feature."""
    if not os.path.exists(filename):
        sM.run(filename + " doesn't exist at step 3_histogram")
        return
    frame = pd.DataFrame(ReadData(filename))
    feature, conf2 = GetConf(confDir1, confDir2, Cur_day, cid, bid)
    frame.columns = conf2
    logx, logy, bins, ctrbins = GetAnalysisConf(analysisDir)
    for idx, feature_name in enumerate(feature):
        pair = pd.DataFrame(frame[[feature_name, 'label']])
        # ConfTol normalizes the per-feature bin spec and the shared flags
        bins[idx], logx, logy, ctrbin = ConfTol(bins[idx], logx, logy, ctrbins)
        HistPlot(resultDir, pair, bins[idx], feature_name, logx, logy)
        CtrPlot(resultDir, pair, ctrbin, feature_name)
def GetAnalysisConf(analysisDir):
    """Load correlation settings ([correlation] section) from the analysis conf.

    Returns (rowvar, bias, ddof, corts). The literal string 'None' in the
    ddof option maps to Python None; anything else is cast to int.
    """
    if os.path.exists(analysisDir):
        cfg = ConfigParser.ConfigParser()
        cfg.read(analysisDir)
        rowvar = cfg.getint('correlation', 'rowvar')
        bias = cfg.getint('correlation', 'bias')
        ddof_text = cfg.get('correlation', 'ddof')
        corts = cfg.getfloat('correlation', 'corts')
        if ddof_text == 'None':
            return rowvar, bias, None, corts
        return rowvar, bias, int(ddof_text), corts
    sM.run(analysisDir + " doesn't exist at step 5_correlation")
def ReadData(filename):
    """Read a tab-separated data file for the datareplace step.

    Returns a list of numpy string arrays, one per input line with at least
    two columns; narrower lines are skipped. On failure the problem is
    reported (sendMessage + stderr) and None is returned.
    """
    line = ''  # pre-bind so the error message below never hits an unbound name
    try:
        data = []
        # FIX: context manager closes the file even when a later line raises
        # (the original left the handle open on error)
        with open(filename) as src:
            for line in src:
                line = line.strip()
                cols = line.split('\t')
                if len(cols) > 1:
                    data.append(np.array(cols))
        return data
    except Exception:  # FIX: narrowed from a bare except
        sM.run("error happend when ReadData at step 0_datareplace")
        sys.stderr.write('error happend when read the data at step 0_datareplace\t%s\n' % line)
        traceback.print_exc(file=sys.stderr)
def ReadData(filename):
    """Read the tab-separated input for the threshold step.

    Keeps only lines with two or more columns, each as a numpy string array.
    Returns the list of rows, or None after reporting any failure.
    """
    line = ''  # FIX: keep bound — the handler below references it, and it was
               # unbound whenever open() itself failed
    try:
        data = []
        with open(filename) as src:  # FIX: close the handle on every path
            for line in src:
                line = line.strip()
                cols = line.split('\t')
                if len(cols) > 1:
                    data.append(np.array(cols))
        return data
    except Exception:  # FIX: was a bare except
        sM.run("error happend when ReadData at step 1_threshold")
        sys.stderr.write('error happend when read the data as step 1_threshold\t%s\n' % line)
        traceback.print_exc(file=sys.stderr)
def OutputResult(filename, confDir1, confDir2, Cur_day, cid, bid, analysisDir, resultDir):
    """For each feature, draw its histogram and its CTR curve into *resultDir*."""
    if os.path.exists(filename):
        obData = pd.DataFrame(ReadData(filename))
        feature, conf2 = GetConf(confDir1, confDir2, Cur_day, cid, bid)
        obData.columns = conf2
        logx, logy, bins, ctrbins = GetAnalysisConf(analysisDir)
        for i, name in enumerate(feature):
            subset = pd.DataFrame(obData[[name, 'label']])
            bins[i], logx, logy, ctrbin = ConfTol(bins[i], logx, logy, ctrbins)
            HistPlot(resultDir, subset, bins[i], name, logx, logy)
            CtrPlot(resultDir, subset, ctrbin, name)
    else:
        sM.run(filename + " doesn't exist at step 3_histogram")
def DealData(filename, confDir1, confDir2, Cur_day, cid, bid, analysisDir):
    """Fit an OLS regression of the configured target column on the feature
    columns and return the statsmodels fit summary (None if input missing)."""
    if not os.path.exists(filename):
        sM.run(filename + " doesn't exist at step 6_anova")
        return
    raw = np.array(ReadData(filename))
    frame = pd.DataFrame(raw)
    feature, conf2 = GetConf(confDir1, confDir2, Cur_day, cid, bid)
    frame.columns = conf2
    target_name = GetAnalysisConf(analysisDir)
    target_col = conf2.index(target_name)  # positional index of the y column
    y = raw[:, target_col].astype(float)
    x = np.array(frame[feature]).astype(float)
    model = smi.OLS(y, x).fit()
    return model.summary()
def GetAnalysisConf(analysisDir):
    """Read the histogram settings from the [hist] section.

    Returns (logx, logy, bins, ctrbins) where bins is the JSON-decoded list
    parsed from the comma-separated 'bins' option.
    """
    if os.path.exists(analysisDir):
        cfg = ConfigParser.ConfigParser()
        cfg.read(analysisDir)
        flag_x = cfg.getboolean('hist', 'logx')
        flag_y = cfg.getboolean('hist', 'logy')
        ctr_bins = cfg.getint('hist', 'ctrbins')
        bin_spec = json.loads('[' + cfg.get('hist', 'bins') + ']')
        return flag_x, flag_y, bin_spec, ctr_bins
    sM.run(analysisDir + " doesn't exist at step 3_histogram")
def DealData(filename, confDir1, confDir2, Cur_day, cid, bid, analysisDir):
    """Anova step: run OLS of the configured y column against the feature
    columns; returns the fit summary, or None when the input file is gone."""
    if os.path.exists(filename):
        data = np.array(ReadData(filename))
        dataF = pd.DataFrame(data)
        feature, conf2 = GetConf(confDir1, confDir2, Cur_day, cid, bid)
        dataF.columns = conf2
        ycol = conf2.index(GetAnalysisConf(analysisDir))  # y column position
        y = data[:, ycol].astype(float)
        x = np.array(dataF[feature]).astype(float)
        fit1 = smi.OLS(y, x).fit()
        return fit1.summary()
    sM.run(filename + " doesn't exist at step 6_anova")
def ReadData(filename):
    """Read the datareplace output consumed by the extrenum step.

    Each tab-separated line with more than one column becomes a numpy string
    array; returns the list of rows, or None after reporting a failure.
    """
    line = ''  # FIX: define up front — the message below used `line`, which
               # was unbound if open() failed
    try:
        data = []
        with open(filename) as src:  # FIX: handle was leaked on error before
            for line in src:
                line = line.strip()
                cols = line.split('\t')
                if len(cols) > 1:
                    data.append(np.array(cols))
        return data
    except Exception:  # FIX: narrowed from a bare except clause
        sM.run("error happend when ReadData at step 4_extrenum")
        sys.stderr.write(
            'error happend when read the data as step 4_extrenum\t%s\n' % line)
        traceback.print_exc(file=sys.stderr)
def DealData(filename, confDir1, confDir2, Cur_day, cid, bid, analysisDir):
    # Cap/clip extreme values of every feature column according to the
    # [extrenum] config: method 'First' clips to mean +/- nstd*std, 'Second'
    # replaces outliers with the column mean, and any other method value falls
    # back to capping at the top-n-percent value. Returns the adjusted
    # DataFrame (implicitly None when the input file is missing).
    if os.path.exists(filename) == False:
        sM.run(filename + " doesn't exist at step 4_extrenum")
    else:
        data = ReadData(filename)
        obData = pd.DataFrame(data)
        feature, conf2 = GetConf(confDir1, confDir2, Cur_day, cid, bid)
        obData.columns = conf2
        nrow, ncol = obData.shape
        coln = len(feature)  # NOTE(review): unused, kept for compatibility
        method, nstd, topn = GetAnalysisConf(analysisDir)
        if method == 'First':
            for i in feature:  # extreme-value handling: n-sigma rule (clip to the bounds)
                ix = obData[i].astype(float)
                imean = ix.mean()
                istd = ix.std()
                imax = imean + nstd * istd
                imin = imean - nstd * istd
                ix[ix > imax] = imax
                ix[ix < imin] = imin
                obData[i] = ix
        elif method == 'Second':
            for i in feature:  # extreme-value handling: replace outliers with the mean
                ix = obData[i].astype(float)
                imean = ix.mean()
                istd = ix.std()
                imax = imean + nstd * istd
                imin = imean - nstd * istd
                ix[ix > imax] = imean
                ix[ix < imin] = imean
                # ix=ix.replace('NaN',imean)
                obData[i] = ix
        else:  # method not 'First'/'Second': default top-n-percent capping
            for i in feature:  # default handling
                ix = pd.DataFrame(obData[i].astype(float))
                # number of rows at or above the topn-percent cut
                topN = int(round((1 - topn / 100.0) * nrow))
                # smallest of the topN largest values = the cap
                imax = heapq.nlargest(topN, ix.iloc[:, 0])[-1]
                ix[ix > imax] = imax
                obData[i] = ix
        return obData
def DealData(filename, confDir1, confDir2, Cur_day, cid, bid, analysisDir):
    # Extreme-value treatment driven by the [extrenum] configuration:
    #   'First'  -> clip each feature to mean +/- nstd*std (sigma rule)
    #   'Second' -> overwrite outliers beyond mean +/- nstd*std with the mean
    #   other    -> cap values above the top-n-percent threshold
    # Returns the adjusted DataFrame; implicitly None if the input is missing.
    if os.path.exists(filename) == False:
        sM.run(filename + " doesn't exist at step 4_extrenum")
    else:
        data = ReadData(filename)
        obData = pd.DataFrame(data)
        feature, conf2 = GetConf(confDir1, confDir2, Cur_day, cid, bid)
        obData.columns = conf2
        nrow, ncol = obData.shape
        coln = len(feature)  # NOTE(review): never read afterwards
        method, nstd, topn = GetAnalysisConf(analysisDir)
        if method == 'First':
            for i in feature:  # sigma-rule clipping to [mean-nstd*std, mean+nstd*std]
                ix = obData[i].astype(float)
                imean = ix.mean()
                istd = ix.std()
                imax = imean + nstd * istd
                imin = imean - nstd * istd
                ix[ix > imax] = imax
                ix[ix < imin] = imin
                obData[i] = ix
        elif method == 'Second':
            for i in feature:  # mean substitution for out-of-band values
                ix = obData[i].astype(float)
                imean = ix.mean()
                istd = ix.std()
                imax = imean + nstd * istd
                imin = imean - nstd * istd
                ix[ix > imax] = imean
                ix[ix < imin] = imean
                # ix=ix.replace('NaN',imean)
                obData[i] = ix
        else:  # unrecognized method: default percentile capping
            for i in feature:  # default handling
                ix = pd.DataFrame(obData[i].astype(float))
                topN = int(round((1 - topn / 100.0) * nrow))  # rows in the top n percent
                imax = heapq.nlargest(topN, ix.iloc[:, 0])[-1]  # capping threshold
                ix[ix > imax] = imax
                obData[i] = ix
        return obData
def OutputResult(filename, confDir1, confDir2, Cur_day, cid, bid, analysisDir):
    # Report features dominated by a single value: for each feature, if any one
    # value accounts for at least fraction d of all rows, record that value and
    # print the offending features; otherwise print that none exist.
    if os.path.exists(filename) == False:
        sM.run(filename + " doesn't exist at step 1_threshold")
    else:
        data = ReadData(filename)
        obData = pd.DataFrame(data)
        feature, conf2 = GetConf(confDir1, confDir2, Cur_day, cid, bid)
        obData.columns = conf2
        nrow, ncol = obData.shape
        d = GetAnalysisConf(analysisDir)  # dominance threshold fraction ([threshold] d)
        valindex = nrow * d  # minimum count for a value to count as dominant
        # -100 is the sentinel for "no dominant value found"
        freq_table = pd.DataFrame(-100, index=feature, columns=['value'])
        for i in range(len(feature)):
            my_series = pd.Series(obData[feature[i]])
            counts = my_series.value_counts()  # sorted by descending frequency
            for j in range(len(counts)):
                # NOTE(review): counts[j] with an integer j relies on the
                # positional fallback of label-based Series indexing
                # (deprecated in newer pandas) — verify before upgrading.
                if counts[j] >= valindex:
                    freq_table.iloc[i] = counts.index[j]
                    break
        D_value = freq_table[freq_table['value'] != -100]  # only the dominated features
        if len(D_value) > 0:
            print 'Feature and value of ' + str(d * 100) + '%' + '\n', D_value
        else:
            print 'the feature does not exist which somevalue takes up more than ', str(d * 100) + '%'
def DealData(filename, confDir1, confDir2, Cur_day, cid, bid):
    """Datareplace step: swap 'NULL'/'NaN' placeholders for the configured
    replacement value and compute per-feature basic statistics.

    Returns (obData, summary.T): the cleaned frame plus one summary row per
    feature. Implicitly returns None when the input file does not exist.
    """
    if os.path.exists(filename) == False:
        sM.run(filename + " doesn't exist at step 0_datareplace")
    else:
        data = ReadData(filename)
        obData = pd.DataFrame(data)
        feature, conf2, rpv, zero_effect = GetConf(confDir1, confDir2, Cur_day, cid, bid)
        obData.columns = conf2
        nrow, ncol = obData.shape
        data_basic_summary = pd.DataFrame(0, index=np.arange(16), columns=feature)
        for i in range(len(feature)):
            my_series = obData[feature[i]]
            datasummary = BasicSummary(my_series, zero_effect[i])
            # placeholder replacement before numeric conversion
            my_series = my_series.replace('NULL', rpv[i])
            my_series = my_series.replace('NaN', rpv[i])
            obData[feature[i]] = my_series
            my_series = my_series.astype(float)
            datasummary1 = my_series.describe(
                percentiles=[.25, .5, .75, .98]).append(BasicSummary1(my_series))
            data_basic_summary.iloc[:, i] = np.array(
                datasummary.append(datasummary1))
        # BUG FIX: label 13 read 'skew' twice; the InsertSql column order
        # (skew, skewtest, kurtosis, kurtosistest, ...) requires 'skewtest'.
        data_basic_summary.index = ('validcount', 'covrate', 'count', 'mean',
                                    'std', 'min', '25%', '50%', '75%', '98%',
                                    'max', 'skew', 'skewtest', 'kurtosis',
                                    'kurtosistest', 'coefficient of variation')
        data_basic_summary.loc['range'] = data_basic_summary.loc[
            'max'] - data_basic_summary.loc['min']
        data_basic_summary.loc['cid'] = cid
        data_basic_summary.loc['bid'] = bid
        return obData, data_basic_summary.T
#!/usr/bin/env python # -*- coding: utf-8 -*- import pandas as pd import numpy as np import traceback import sys, os import ConfigParser import json import subprocess import ast import sendMessage as sM import MySQLdb if os.path.exists(sys.argv[1] + '/bin/run') == False: sM.run("get_conf doesn't exist at step 5_correlation") else: sys.path.append(sys.argv[1] + '/bin/run') import get_conf as gc np.set_printoptions(threshold='nan') pd.options.display.max_rows = None pd.options.display.max_columns = None def ReadData( filename ): #为读取原始数据的路径:sys.argv[1]+'/data/data_analysis/'+sys.argv[4]+'/'+sys.argv[2]+'/'+sys.argv[3]+'/datareplace' try: file = open(filename) data = [] for line in file:
import pandas as pd import numpy as np import traceback import sys import ConfigParser import json import heapq import os import subprocess import ast from scipy import stats import sendMessage as sM import MySQLdb if os.path.exists(sys.argv[5] + '/bin/run/get_conf.py') == False: sM.run("get_conf doesn't exist at step 0_datareplace") else: sys.path.append(sys.argv[5] + '/bin/run') import get_conf as gc np.set_printoptions(threshold='nan') pd.options.display.max_rows = None pd.options.display.max_columns = None def InsertSql(s): ins = "INSERT INTO rec_feature_project_data_analysis (feature_name, validcount, covrate, count, mean, std, min, twenty_five, fifty, seventy_five, ninety_eight, max, skew, skewtest, kurtosis, kurtosistest, coef_variation, rang, cid, bid, l_date) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)" conn = MySQLdb.connect(host='192.168.61.73', port=9000, user='******', passwd='qianfendian',
import ConfigParser import string, os, sys import pandas as pd import numpy as np import math import traceback import time import json import statsmodels.api as smi from statsmodels.formula.api import ols from statsmodels.stats.anova import anova_lm import subprocess import ast import sendMessage as sM if os.path.exists(sys.argv[1]+'/bin/run')==False: sM.run("get_conf doesn't exist at step 6_anova") else: sys.path.append(sys.argv[1]+'/bin/run') import get_conf as gc def isValid(s): return len(s.strip()) > 0 and s != 'null' and s != 'NULL' and s != '0' def eps(x): if x > 100: return 1 if x < -100: return 0 return 1 / (1 + math.exp(-x)) def ReadData(filename): #为读取原始数据的路径:sys.argv[1]+'/data/data_analysis/'+sys.argv[4]+'/'+sys.argv[2]+'/'+sys.argv[3]+'/datareplace'
#!/usr/bin/env python # -*- coding: utf-8 -*- import pandas as pd import numpy as np import traceback import sys, os import ConfigParser import json import heapq import subprocess import ast import sendMessage as sM if os.path.exists(sys.argv[1] + '/bin/run') == False: sM.run("get_conf doesn't exist at step 4_extrenum") else: sys.path.append(sys.argv[1] + '/bin/run') import get_conf as gc np.set_printoptions(threshold='nan') pd.options.display.max_rows = None pd.options.display.max_columns = None def ReadData( filename ): #为读取原始数据的路径:sys.argv[1]+'/data/data_analysis/'+sys.argv[4]+'/'+sys.argv[2]+'/'+sys.argv[3]+'/datareplace' try: file = open(filename) data = [] for line in file:
def sendM(filepath):
    """Ensure *filepath* exists: if it does not, report the absence through
    sendMessage and create the directory."""
    if not os.path.exists(filepath):
        # FIX: the original message had no separating space, producing e.g.
        # "/some/pathdoes not exist"
        sM.run(filepath + ' does not exist')
        os.makedirs(filepath)
import pandas as pd import numpy as np from scipy import stats import glob from pandas.io import sql import MySQLdb import traceback import shutil import threading if os.path.exists(sys.argv[1] + '/bin/run/sendMessage.py'): import sendMessage as sM else: print "sendMessage.py doesn't exist" if os.path.exists(sys.argv[1] + '/bin/run/get_conf.py') == False: sM.run("get_conf doesn't exist at step feature analysis") else: sys.path.append(sys.argv[1] + '/bin/run') import get_conf as gc def sendM(filepath): if os.path.exists(filepath) == False: sM.run(filepath + 'does not exist') os.makedirs(filepath) def PreEnv(Base_Dir, Share_Dir): dataDir = '/'.join([Share_Dir, 'data', 'feature_integration']) dataAnalysisResult = '/'.join([Base_Dir, 'data', 'data_analysis']) confDir = '/'.join([Base_Dir, 'conf', 'data_analysis'])
import ConfigParser import string, os, sys import pandas as pd import numpy as np import math import traceback import time import json import statsmodels.api as smi from statsmodels.formula.api import ols from statsmodels.stats.anova import anova_lm import subprocess import ast import sendMessage as sM if os.path.exists(sys.argv[1] + '/bin/run') == False: sM.run("get_conf doesn't exist at step 6_anova") else: sys.path.append(sys.argv[1] + '/bin/run') import get_conf as gc def isValid(s): return len(s.strip()) > 0 and s != 'null' and s != 'NULL' and s != '0' def eps(x): if x > 100: return 1 if x < -100: return 0 return 1 / (1 + math.exp(-x))
#!/usr/bin/env python # -*- coding: utf-8 -*- import pandas as pd import numpy as np import traceback import ConfigParser import json import heapq import subprocess import ast import os import sys import sendMessage as sM if os.path.exists(sys.argv[5]+'/bin/run')==False: sM.run("get_conf doesn't exist at step 1_threshold") else: sys.path.append(sys.argv[5]+'/bin/run') import get_conf as gc np.set_printoptions(threshold = 'nan') pd.set_option('display.max_rows',None) def ReadData(filename): #为读取原始数据的路径:sys.argv[1]+'/data/feature_integration/'+sys.argv[4]+'/'+sys.argv[2]+'/'+sys.argv[3]+'.integration' try: file = open(filename) data=[] for line in file: line=line.strip() cols=line.split('\t') if len(cols) >1:
import pandas as pd import numpy as np import traceback import os, sys, json, ast import ConfigParser import matplotlib.pyplot as plt from matplotlib import rcParams rcParams.update({'figure.autolayout': True}) from matplotlib.backends.backend_pdf import PdfPages import heapq import subprocess import sendMessage as sM import math if os.path.exists(sys.argv[1] + '/bin/run') == False: sM.run("get_conf doesn't exist at step 3_histogram") else: sys.path.append(sys.argv[1] + '/bin/run') import get_conf as gc np.set_printoptions(threshold='nan') pd.options.display.max_rows = None pd.options.display.max_columns = None def ReadData( filename ): #为读取原始数据的路径:sys.argv[1]+'/data/data_analysis/'+sys.argv[4]+'/'+sys.argv[2]+'/'+sys.argv[3]+'/datareplace' try: file = open(filename) data = []
import pandas as pd import numpy as np from scipy import stats import glob from pandas.io import sql import MySQLdb import traceback import shutil import threading if os.path.exists(sys.argv[1]+'/bin/run/sendMessage.py'): import sendMessage as sM else: print "sendMessage.py doesn't exist" if os.path.exists(sys.argv[1]+'/bin/run/get_conf.py')==False: sM.run("get_conf doesn't exist at step feature analysis") else: sys.path.append(sys.argv[1]+'/bin/run') import get_conf as gc def sendM(filepath): if os.path.exists(filepath)==False: sM.run(filepath+'does not exist') os.makedirs(filepath) def PreEnv(Base_Dir, Share_Dir): dataDir='/'.join([Share_Dir, 'data', 'feature_integration']) dataAnalysisResult='/'.join([Base_Dir, 'data', 'data_analysis']) confDir='/'.join([Base_Dir, 'conf', 'data_analysis'])
#!/usr/bin/env python # -*- coding: utf-8 -*- import pandas as pd import numpy as np import traceback import sys, os import ConfigParser import json import heapq import subprocess import ast import sendMessage as sM if os.path.exists(sys.argv[1]+'/bin/run')==False: sM.run("get_conf doesn't exist at step 4_extrenum") else: sys.path.append(sys.argv[1]+'/bin/run') import get_conf as gc np.set_printoptions(threshold='nan') pd.options.display.max_rows=None pd.options.display.max_columns=None def ReadData(filename): #为读取原始数据的路径:sys.argv[1]+'/data/data_analysis/'+sys.argv[4]+'/'+sys.argv[2]+'/'+sys.argv[3]+'/datareplace' try: file = open(filename) data=[] for line in file: line=line.strip() cols=line.split('\t') if len(cols) >1:
import pandas as pd import numpy as np import traceback import sys import ConfigParser import json import heapq import os import subprocess import ast from scipy import stats import sendMessage as sM import MySQLdb if os.path.exists(sys.argv[5]+'/bin/run/get_conf.py')==False: sM.run("get_conf doesn't exist at step 0_datareplace") else: sys.path.append(sys.argv[5]+'/bin/run') import get_conf as gc np.set_printoptions(threshold='nan') pd.options.display.max_rows=None pd.options.display.max_columns=None def InsertSql(s): ins="INSERT INTO rec_feature_project_data_analysis (feature_name, validcount, covrate, count, mean, std, min, twenty_five, fifty, seventy_five, ninety_eight, max, skew, skewtest, kurtosis, kurtosistest, coef_variation, rang, cid, bid, l_date) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)" conn= MySQLdb.connect(host='192.168.61.73',port = 9000, user='******',passwd='qianfendian',db ='rec_report') cursor = conn.cursor() cursor.execute(ins, tuple(s)) conn.commit()
#!/usr/bin/env python # -*- coding: utf-8 -*- import pandas as pd import numpy as np import traceback import sys, os import ConfigParser import json import subprocess import ast import sendMessage as sM import MySQLdb if os.path.exists(sys.argv[1]+'/bin/run')==False: sM.run("get_conf doesn't exist at step 5_correlation") else: sys.path.append(sys.argv[1]+'/bin/run') import get_conf as gc np.set_printoptions(threshold='nan') pd.options.display.max_rows=None pd.options.display.max_columns=None def ReadData(filename): #为读取原始数据的路径:sys.argv[1]+'/data/data_analysis/'+sys.argv[4]+'/'+sys.argv[2]+'/'+sys.argv[3]+'/datareplace' try: file = open(filename) data=[] for line in file: line=line.strip() cols=line.split('\t')
def sendM(filepath):
    """Create *filepath* when it is missing, notifying via sendMessage first."""
    if not os.path.exists(filepath):
        # FIX: insert the missing space so the alert reads
        # "<path> does not exist" instead of "<path>does not exist"
        sM.run(filepath + ' does not exist')
        os.makedirs(filepath)
import pandas as pd import numpy as np import traceback import os, sys, json, ast import ConfigParser import matplotlib.pyplot as plt from matplotlib import rcParams rcParams.update({'figure.autolayout': True}) from matplotlib.backends.backend_pdf import PdfPages import heapq import subprocess import sendMessage as sM import math if os.path.exists(sys.argv[1]+'/bin/run')==False: sM.run("get_conf doesn't exist at step 3_histogram") else: sys.path.append(sys.argv[1]+'/bin/run') import get_conf as gc np.set_printoptions(threshold='nan') pd.options.display.max_rows=None pd.options.display.max_columns=None def ReadData(filename): #为读取原始数据的路径:sys.argv[1]+'/data/data_analysis/'+sys.argv[4]+'/'+sys.argv[2]+'/'+sys.argv[3]+'/datareplace' try: file = open(filename) data=[] for line in file: line=line.strip()