Exemplo n.º 1
0
def DealData(
    filename, confDir1, confDir2, Cur_day, cid, bid, analysisDir, resultDir1
):
    """Compute the feature correlation matrix, save it and load it into SQL.

    The observations read from ``filename`` are restricted to the configured
    features and correlated with ``np.corrcoef``. The full matrix is written
    to ``resultDir1`` as a tab-separated file; every strictly-lower-triangle
    pair is then passed to ``InsertSql`` after ``Deletesql`` has been called
    for the same (Cur_day, cid, bid).

    Args:
        filename: path of the input data file.
        confDir1, confDir2: configuration paths forwarded to ``GetConf``.
        Cur_day: value stored in the ``l_date`` column of every pair.
        cid, bid: identifiers stored alongside every pair.
        analysisDir: analysis configuration path for ``GetAnalysisConf``.
        resultDir1: output path of the matrix file, e.g.
            sys.argv[1]+'/data/data_analysis/'+tcb+'/corrxx.csv'.

    Returns:
        None. When ``filename`` does not exist only an alert is sent.
    """
    if not os.path.exists(filename):
        sM.run(filename + " doesn't exist at step 5_correlation")
        return

    data = ReadData(filename)
    obData = pd.DataFrame(data)
    feature, conf2 = GetConf(confDir1, confDir2, Cur_day, cid, bid)
    obData.columns = conf2
    obData = np.array(obData[feature]).astype(float)

    # The fourth configured value (corts) is not used by this step.
    rowvar, bias, ddof, _ = GetAnalysisConf(analysisDir)
    corr = np.corrcoef(obData, y=None, rowvar=rowvar, bias=bias, ddof=ddof)
    # With a single feature np.corrcoef yields a scalar; keep it a matrix so
    # the frame below can still be built.
    corr = np.atleast_2d(corr)
    corrdf = pd.DataFrame(corr, index=feature, columns=feature)
    corrdf.to_csv(resultDir1, sep='\t')

    # Flatten the strictly-lower triangle into (row name, column name, value).
    nrow = corrdf.shape[0]
    corrlist = [
        [corrdf.index[i], corrdf.columns[j], corrdf.iloc[i, j]]
        for i in range(1, nrow)
        for j in range(i)
    ]
    # Passing the column names to the constructor keeps this valid when there
    # are no pairs at all (fewer than two features).
    corr2sql = pd.DataFrame(
        corrlist, columns=['feature_name1', 'feature_name2', 'corr_value'])
    corr2sql['cid'] = cid
    corr2sql['bid'] = bid
    corr2sql['l_date'] = Cur_day

    datanum = len(corr2sql)
    Deletesql(Cur_day, cid, bid)
    for i in range(datanum):
        InsertSql(corr2sql.iloc[i, :])
    print("corr2sql finished and the number is {0}".format(datanum))
Exemplo n.º 2
0
def DealData(filename,confDir1,confDir2,Cur_day,cid,bid,analysisDir,resultDir1):
    """Correlate the configured features, write the matrix, refresh SQL rows.

    Reads ``filename``, keeps the configured feature columns, computes their
    correlation matrix with ``np.corrcoef`` and writes it tab-separated to
    ``resultDir1`` (e.g. sys.argv[1]+'/data/data_analysis/'+tcb+'/corrxx.csv').
    Each pair below the diagonal is then handed to ``InsertSql`` once
    ``Deletesql`` has been called for the same (Cur_day, cid, bid).

    Args:
        filename: path of the input data file.
        confDir1, confDir2: configuration paths forwarded to ``GetConf``.
        Cur_day: value stored in the ``l_date`` column.
        cid, bid: identifiers stored with every pair.
        analysisDir: analysis configuration path for ``GetAnalysisConf``.
        resultDir1: output path of the correlation matrix file.

    Returns:
        None. When ``filename`` does not exist only an alert is sent.
    """
    if not os.path.exists(filename):
        sM.run(filename + " doesn't exist at step 5_correlation")
        return

    data = ReadData(filename)
    obData = pd.DataFrame(data)
    feature, conf2 = GetConf(confDir1, confDir2, Cur_day, cid, bid)
    obData.columns = conf2
    obData = np.array(obData[feature]).astype(float)

    # The fourth configured value (corts) is not needed here.
    rowvar, bias, ddof, _ = GetAnalysisConf(analysisDir)
    corr = np.corrcoef(obData, y=None, rowvar=rowvar, bias=bias, ddof=ddof)
    # np.corrcoef returns a scalar for a single feature; force a 2-D matrix.
    corr = np.atleast_2d(corr)
    corrdf = pd.DataFrame(corr, index=feature, columns=feature)
    corrdf.to_csv(resultDir1, sep='\t')

    # One record per pair below the diagonal.
    nrow = corrdf.shape[0]
    corrlist = []
    for i in range(1, nrow):
        for j in range(i):
            corrlist.append(
                [corrdf.index[i], corrdf.columns[j], corrdf.iloc[i, j]])
    # Naming the columns in the constructor also works for an empty list,
    # which happens when fewer than two features are configured.
    corr2sql = pd.DataFrame(
        corrlist, columns=['feature_name1', 'feature_name2', 'corr_value'])
    corr2sql['cid'] = cid
    corr2sql['bid'] = bid
    corr2sql['l_date'] = Cur_day

    datanum = len(corr2sql)
    Deletesql(Cur_day, cid, bid)
    for i in range(datanum):
        InsertSql(corr2sql.iloc[i, :])
    print("corr2sql finished and the number is {0}".format(datanum))
Exemplo n.º 3
0
def GetAnalysisConf(
    analysisDir
):
    """Load the [correlation] settings from the data-analysis config file.

    ``analysisDir`` is the config file path
    (Base_Dir/../conf/data_analysis/$time/$cid/$bid.conf, i.e.
    sys.argv[1]+'/conf/data_analysis/'+tcb+'.conf').

    Returns:
        (rowvar, bias, ddof, corts) where rowvar and bias are ints, ddof is
        an int or None (when the file holds the text 'None') and corts is a
        float. None is returned, after an alert, if the file is missing.
    """
    if not os.path.exists(analysisDir):
        sM.run(analysisDir + " doesn't exist at step 5_correlation")
        return None

    section = 'correlation'
    parser = ConfigParser.ConfigParser()
    parser.read(analysisDir)

    # rowvar / bias / ddof are forwarded to np.corrcoef by the caller.
    rowvar = parser.getint(section, 'rowvar')
    bias = parser.getint(section, 'bias')
    raw_ddof = parser.get(section, 'ddof')
    corts = parser.getfloat(section, 'corts')  # correlation standard

    ddof = None if raw_ddof == 'None' else int(raw_ddof)
    return rowvar, bias, ddof, corts
Exemplo n.º 4
0
def DealData(filename,confDir1,confDir2,Cur_day,cid,bid):
    """Replace missing-value markers and build per-feature summary statistics.

    For every configured feature the text markers 'NULL' and 'NaN' are
    replaced by that feature's replacement value, and a column of summary
    statistics is assembled from ``BasicSummary``, ``Series.describe`` and
    ``BasicSummary1``.

    Args:
        filename: path of the input data file.
        confDir1, confDir2, Cur_day, cid, bid: forwarded to ``GetConf``;
            ``cid`` and ``bid`` are also stored in the summary table.

    Returns:
        (obData, summary): the data frame with the markers replaced, and the
        transposed summary table (one row per feature). None is returned,
        after an alert, when ``filename`` does not exist.
    """
    if not os.path.exists(filename):
        sM.run(filename + " doesn't exist at step 0_datareplace")
        return None

    data = ReadData(filename)
    obData = pd.DataFrame(data)
    feature, conf2, rpv, zero_effect = GetConf(confDir1, confDir2, Cur_day, cid, bid)
    obData.columns = conf2

    data_basic_summary = pd.DataFrame(0, index=np.arange(16), columns=feature)
    for i, name in enumerate(feature):
        my_series = obData[name]
        # BasicSummary sees the raw column, before the markers are replaced.
        datasummary = BasicSummary(my_series, zero_effect[i])
        my_series = my_series.replace('NULL', rpv[i])
        my_series = my_series.replace('NaN', rpv[i])
        obData[name] = my_series
        my_series = my_series.astype(float)
        datasummary1 = my_series.describe(
            percentiles=[.25, .5, .75, .98]).append(BasicSummary1(my_series))
        data_basic_summary.iloc[:, i] = np.array(datasummary.append(datasummary1))

    # The 14th label used to be a second 'skew'. The INSERT statement for this
    # table names that column 'skewtest'; this assumes BasicSummary1 returns
    # skew, skewtest, kurtosis, kurtosistest, coefficient of variation in that
    # order -- TODO confirm against BasicSummary1.
    data_basic_summary.index = (
        'validcount', 'covrate', 'count', 'mean', 'std', 'min', '25%', '50%',
        '75%', '98%', 'max', 'skew', 'skewtest', 'kurtosis', 'kurtosistest',
        'coefficient of variation')
    data_basic_summary.loc['range'] = (
        data_basic_summary.loc['max'] - data_basic_summary.loc['min'])
    data_basic_summary.loc['cid'] = cid
    data_basic_summary.loc['bid'] = bid
    return obData, data_basic_summary.T
Exemplo n.º 5
0
def GetFeature(confDir):
    """Return the feature names listed on the first line of ``confDir``.

    ``confDir`` is the ``.feature`` file
    (sys.argv[5]+'/conf/default/'+sys.argv[4]+'/'+sys.argv[2]+'/'+sys.argv[3]+'.feature',
    i.e. $Base_Dir/conf/default/$time/$cid/$bid.feature). Its first line
    holds the comma-separated names of the data columns used as features.

    Returns:
        list of str, or None (after an alert) when the file does not exist.

    Raises:
        IndexError: if the file is empty.
    """
    if not os.path.exists(confDir):
        sM.run("error happend when GetFeature at step 0_datareplace")
        return None
    # The context manager closes the handle, which was previously left open.
    with open(confDir) as conf_file:
        lines = conf_file.readlines()
    return lines[0].strip().split(',')
Exemplo n.º 6
0
def GetAnalysisConf(analysisDir):
    """Return the 'y' option of the [anova] section of the analysis config.

    ``analysisDir`` is the data-analysis config file
    (Base_Dir/../conf/data_analysis/$time/$cid/$bid.conf, i.e.
    sys.argv[1]+'/conf/data_analysis/'+tcb+'.conf').

    Returns:
        The option value as read by ConfigParser, or None (after an alert)
        when the file does not exist.
    """
    if not os.path.exists(analysisDir):
        sM.run(analysisDir + " doesn't exist at step 6_anova")
        return None
    parser = ConfigParser.ConfigParser()
    parser.read(analysisDir)
    return parser.get('anova', 'y')
Exemplo n.º 7
0
def GetAnalysisConf(analysisDir):
    """Return the threshold ratio 'd' from the data-analysis config file.

    ``analysisDir`` is the config file
    (Base_Dir/../conf/data_analysis/$time/$cid/$bid.conf, i.e.
    sys.argv[5]+'/conf/data_analysis/'+sys.argv[4]+'/'+sys.argv[2]+'/'+sys.argv[3]+'.conf').

    Returns:
        float: option 'd' of section [threshold], or None (after an alert)
        when the file does not exist.
    """
    # The existence check used to test an undefined name (``filename``);
    # the path that is actually read and reported is ``analysisDir``.
    if not os.path.exists(analysisDir):
        sM.run(analysisDir + " doesn't exist at step 1_threshold")
        return None
    confp = ConfigParser.ConfigParser()
    confp.read(analysisDir)
    return confp.getfloat('threshold', 'd')
Exemplo n.º 8
0
def GetFeature(
    confDir
):
    """Read the feature names from the first line of the ``.feature`` file.

    ``confDir`` is
    sys.argv[5]+'/conf/default/'+sys.argv[4]+'/'+sys.argv[2]+'/'+sys.argv[3]+'.feature'
    ($Base_Dir/conf/default/$time/$cid/$bid.feature); the first line lists the
    data columns used as features, separated by commas.

    Returns:
        list of str, or None (after an alert) when the file does not exist.

    Raises:
        IndexError: if the file is empty.
    """
    if not os.path.exists(confDir):
        sM.run("error happend when GetFeature at step 0_datareplace")
        return None
    # Use a context manager so the handle is closed; it used to stay open.
    with open(confDir) as conf_file:
        lines = conf_file.readlines()
    return lines[0].strip().split(',')
Exemplo n.º 9
0
def GetFeature(confDir):
    """Return the feature names listed on the first line of ``confDir``.

    ``confDir`` is the ``.feature`` file
    (sys.argv[5]+'/conf/default/'+sys.argv[4]+'/'+sys.argv[2]+'/'+sys.argv[3]+'.feature',
    i.e. $Base_Dir/conf/default/$time/$cid/$bid.feature) whose first line
    holds the comma-separated names of the data columns used as features.

    Returns:
        list of str, or None (after an alert) when the file does not exist.

    Raises:
        IndexError: if the file is empty.
    """
    if not os.path.exists(confDir):
        sM.run(confDir +" doesn't exist when GetFeature at step 1_threshold")
        return None
    # ``with`` closes the file even when reading fails; the explicit close()
    # used before was skipped on any exception.
    with open(confDir) as conf_file:
        lines = conf_file.readlines()
    return lines[0].strip().split(',')
Exemplo n.º 10
0
def GetFeature(confDir):
    """Return the feature names listed on the first line of ``confDir``.

    ``confDir`` is the ``.feature`` file
    (sys.argv[5]+'/conf/default/'+sys.argv[4]+'/'+sys.argv[2]+'/'+sys.argv[3]+'.feature',
    i.e. $Base_Dir/conf/default/$time/$cid/$bid.feature) whose first line
    holds the comma-separated names of the data columns used as features.

    Returns:
        list of str, or None (after an alert) when the file does not exist.

    Raises:
        IndexError: if the file is empty.
    """
    if not os.path.exists(confDir):
        sM.run(confDir +" doesn't exist when GetFeature at step 5_correlation")
        return None
    # ``with`` guarantees the handle is released even if reading raises,
    # which the previous explicit close() did not.
    with open(confDir) as conf_file:
        lines = conf_file.readlines()
    return lines[0].strip().split(',')
Exemplo n.º 11
0
def GetAnalysisConf(analysisDir):
    """Load the [extrenum] settings from the data-analysis config file.

    ``analysisDir`` is the config file
    (Base_Dir/../conf/data_analysis/$time/$cid/$bid.conf, i.e.
    sys.argv[1]+'/conf/data_analysis/'+tcb+'.conf').

    Returns:
        (method, nstd, topn): the 'method' option as text and the 'nstd' and
        'topn' options as ints; None (after an alert) if the file is missing.
    """
    if not os.path.exists(analysisDir):
        sM.run(analysisDir + " doesn't exist at step 4_extrenum")
        return None
    section = 'extrenum'
    parser = ConfigParser.ConfigParser()
    parser.read(analysisDir)
    method = parser.get(section, 'method')
    nstd = parser.getint(section, 'nstd')
    topn = parser.getint(section, 'topn')
    return method, nstd, topn
Exemplo n.º 12
0
def GetAnalysisConf(analysisDir):
    """Load the [hist] settings from the data-analysis config file.

    ``analysisDir`` is the config file
    (Base_Dir/../conf/data_analysis/$time/$cid/$bid.conf, i.e.
    sys.argv[1]+'/conf/data_analysis/'+tcb+'.conf').

    Returns:
        (logx, logy, bins, ctrbins): two booleans, the 'bins' option parsed
        as a JSON list (the option text is wrapped in square brackets first)
        and the 'ctrbins' int. None (after an alert) if the file is missing.
    """
    if not os.path.exists(analysisDir):
        sM.run(analysisDir + " doesn't exist at step 3_histogram")
        return None
    section = 'hist'
    parser = ConfigParser.ConfigParser()
    parser.read(analysisDir)
    logx = parser.getboolean(section, 'logx')
    logy = parser.getboolean(section, 'logy')
    ctrbins = parser.getint(section, 'ctrbins')
    raw_bins = parser.get(section, 'bins')
    bins = json.loads('[' + raw_bins + ']')
    return logx, logy, bins, ctrbins
Exemplo n.º 13
0
def GetAnalysisConf(
    analysisDir
):
    """Read the 'y' option of the [anova] section of the analysis config.

    ``analysisDir`` is the data-analysis config file
    (Base_Dir/../conf/data_analysis/$time/$cid/$bid.conf, i.e.
    sys.argv[1]+'/conf/data_analysis/'+tcb+'.conf').

    Returns:
        The option value as read by ConfigParser, or None (after an alert)
        when the file does not exist.
    """
    if not os.path.exists(analysisDir):
        sM.run(analysisDir + " doesn't exist at step 6_anova")
        return None
    parser = ConfigParser.ConfigParser()
    parser.read(analysisDir)
    return parser.get('anova', 'y')
Exemplo n.º 14
0
def GetAnalysisConf(
    analysisDir
):
    """Read the [extrenum] settings from the data-analysis config file.

    ``analysisDir`` is the config file
    (Base_Dir/../conf/data_analysis/$time/$cid/$bid.conf, i.e.
    sys.argv[1]+'/conf/data_analysis/'+tcb+'.conf').

    Returns:
        (method, nstd, topn): the 'method' option as text and the 'nstd' and
        'topn' options as ints; None (after an alert) if the file is missing.
    """
    if not os.path.exists(analysisDir):
        sM.run(analysisDir + " doesn't exist at step 4_extrenum")
        return None
    section = 'extrenum'
    parser = ConfigParser.ConfigParser()
    parser.read(analysisDir)
    method = parser.get(section, 'method')
    nstd = parser.getint(section, 'nstd')
    topn = parser.getint(section, 'topn')
    return method, nstd, topn
Exemplo n.º 15
0
def OutputResult(filename,confDir1,confDir2,Cur_day,cid,bid,analysisDir,resultDir):
    """Draw the histogram and CTR plots of every configured feature.

    The data file is loaded, its columns are named from the configuration,
    and for each feature the (feature, 'label') pair of columns is passed to
    ``HistPlot`` and ``CtrPlot`` with the settings returned by ``ConfTol``.

    Args:
        filename: path of the input data file.
        confDir1, confDir2, Cur_day, cid, bid: forwarded to ``GetConf``.
        analysisDir: analysis configuration path for ``GetAnalysisConf``.
        resultDir: output location handed to the plotting helpers.

    Returns:
        None. When ``filename`` does not exist only an alert is sent.
    """
    if not os.path.exists(filename):
        sM.run(filename + " doesn't exist at step 3_histogram")
        return None

    frame = pd.DataFrame(ReadData(filename))
    feature, conf2 = GetConf(confDir1, confDir2, Cur_day, cid, bid)
    frame.columns = conf2
    logx, logy, bins, ctrbins = GetAnalysisConf(analysisDir)

    for pos, feature_name in enumerate(feature):
        pair = pd.DataFrame(frame[[feature_name, 'label']])
        # NOTE(review): logx/logy are overwritten here, so the values ConfTol
        # returns for one feature are fed into the next -- confirm intended.
        bins[pos], logx, logy, ctrbin = ConfTol(bins[pos], logx, logy, ctrbins)
        HistPlot(resultDir, pair, bins[pos], feature_name, logx, logy)
        CtrPlot(resultDir, pair, ctrbin, feature_name)
Exemplo n.º 16
0
def GetAnalysisConf(analysisDir):
    """Load the [correlation] settings from the data-analysis config file.

    ``analysisDir`` is the config file path
    (Base_Dir/../conf/data_analysis/$time/$cid/$bid.conf, i.e.
    sys.argv[1]+'/conf/data_analysis/'+tcb+'.conf').

    Returns:
        (rowvar, bias, ddof, corts) where rowvar and bias are ints, ddof is
        an int or None (when the file holds the text 'None') and corts is a
        float. None is returned, after an alert, if the file is missing.
    """
    if not os.path.exists(analysisDir):
        sM.run(analysisDir + " doesn't exist at step 5_correlation")
        return None

    section = 'correlation'
    parser = ConfigParser.ConfigParser()
    parser.read(analysisDir)

    # rowvar / bias / ddof are forwarded to np.corrcoef by the caller.
    rowvar = parser.getint(section, 'rowvar')
    bias = parser.getint(section, 'bias')
    raw_ddof = parser.get(section, 'ddof')
    corts = parser.getfloat(section, 'corts')  # correlation standard

    ddof = None if raw_ddof == 'None' else int(raw_ddof)
    return rowvar, bias, ddof, corts
Exemplo n.º 17
0
def ReadData(filename):
    """Read a tab-separated text file into a list of numpy string arrays.

    ``filename`` is the raw data path
    (sys.argv[1]+'/data/feature_integration/'+sys.argv[4]+'/'+sys.argv[2]+'/'+sys.argv[3]+'.integration').
    Every line is stripped and split on tabs; lines with fewer than two
    fields are skipped.

    Returns:
        list of np.ndarray (one per kept line), or None when reading failed;
        in that case an alert is sent and the last line read plus the
        traceback are written to stderr.
    """
    # Bound up front so the error report works even if open() itself fails.
    line = ''
    try:
        data = []
        with open(filename) as data_file:
            for line in data_file:
                line = line.strip()
                cols = line.split('\t')
                if len(cols) > 1:
                    data.append(np.array(cols))
        return data
    except Exception:
        sM.run("error happend when ReadData at step 0_datareplace")
        sys.stderr.write('error happend when read the data at step 0_datareplace\t%s\n' % line)
        traceback.print_exc(file=sys.stderr)
        return None
Exemplo n.º 18
0
def ReadData(filename):
    """Read a tab-separated text file into a list of numpy string arrays.

    ``filename`` is the raw data path
    (sys.argv[1]+'/data/feature_integration/'+sys.argv[4]+'/'+sys.argv[2]+'/'+sys.argv[3]+'.integration').
    Every line is stripped and split on tabs; lines with fewer than two
    fields are skipped.

    Returns:
        list of np.ndarray (one per kept line), or None when reading failed;
        in that case an alert is sent and the last line read plus the
        traceback are written to stderr.
    """
    # Bound up front so the error report works even if open() itself fails.
    line = ''
    try:
        data = []
        with open(filename) as data_file:
            for line in data_file:
                line = line.strip()
                cols = line.split('\t')
                if len(cols) > 1:
                    data.append(np.array(cols))
        return data
    except Exception:
        sM.run("error happend when ReadData at step 1_threshold")
        sys.stderr.write('error happend when read the data as step 1_threshold\t%s\n' % line)
        traceback.print_exc(file=sys.stderr)
        return None
Exemplo n.º 19
0
def OutputResult(filename, confDir1, confDir2, Cur_day, cid, bid, analysisDir,
                 resultDir):
    """Produce the histogram and CTR plots for each configured feature.

    Loads the data file, names its columns from the configuration, and for
    every feature passes the (feature, 'label') columns to ``HistPlot`` and
    ``CtrPlot`` using the settings returned by ``ConfTol``.

    Args:
        filename: path of the input data file.
        confDir1, confDir2, Cur_day, cid, bid: forwarded to ``GetConf``.
        analysisDir: analysis configuration path for ``GetAnalysisConf``.
        resultDir: output location handed to the plotting helpers.

    Returns:
        None. When ``filename`` does not exist only an alert is sent.
    """
    if not os.path.exists(filename):
        sM.run(filename + " doesn't exist at step 3_histogram")
        return None

    frame = pd.DataFrame(ReadData(filename))
    feature, conf2 = GetConf(confDir1, confDir2, Cur_day, cid, bid)
    frame.columns = conf2
    logx, logy, bins, ctrbins = GetAnalysisConf(analysisDir)

    for pos, feature_name in enumerate(feature):
        pair = pd.DataFrame(frame[[feature_name, 'label']])
        # NOTE(review): logx/logy are overwritten here, so the values ConfTol
        # returns for one feature are fed into the next -- confirm intended.
        bins[pos], logx, logy, ctrbin = ConfTol(bins[pos], logx, logy, ctrbins)
        HistPlot(resultDir, pair, bins[pos], feature_name, logx, logy)
        CtrPlot(resultDir, pair, ctrbin, feature_name)
Exemplo n.º 20
0
def DealData(filename, confDir1, confDir2, Cur_day, cid, bid, analysisDir):
    """Fit an OLS model of the configured target column on the features.

    The data file is loaded as a string array, the target column named in
    the analysis config is located by its position in ``conf2``, and both
    the target and the feature columns are converted to float before being
    handed to ``smi.OLS``.

    Args:
        filename: path of the input data file.
        confDir1, confDir2, Cur_day, cid, bid: forwarded to ``GetConf``.
        analysisDir: analysis configuration path for ``GetAnalysisConf``.

    Returns:
        The summary of the fitted model, or None (after an alert) when
        ``filename`` does not exist.
    """
    if not os.path.exists(filename):
        sM.run(filename + " doesn't exist at step 6_anova")
        return None

    raw = np.array(ReadData(filename))
    frame = pd.DataFrame(raw)
    feature, conf2 = GetConf(confDir1, confDir2, Cur_day, cid, bid)
    frame.columns = conf2

    target_name = GetAnalysisConf(analysisDir)
    target_pos = conf2.index(target_name)
    response = raw[:, target_pos].astype(float)
    design = np.array(frame[feature]).astype(float)

    fitted = smi.OLS(response, design).fit()
    return fitted.summary()
Exemplo n.º 21
0
def GetAnalysisConf(
    analysisDir
):
    """Read the [hist] settings from the data-analysis config file.

    ``analysisDir`` is the config file
    (Base_Dir/../conf/data_analysis/$time/$cid/$bid.conf, i.e.
    sys.argv[1]+'/conf/data_analysis/'+tcb+'.conf').

    Returns:
        (logx, logy, bins, ctrbins): two booleans, the 'bins' option parsed
        as a JSON list (the option text is wrapped in square brackets first)
        and the 'ctrbins' int. None (after an alert) if the file is missing.
    """
    if not os.path.exists(analysisDir):
        sM.run(analysisDir + " doesn't exist at step 3_histogram")
        return None
    section = 'hist'
    parser = ConfigParser.ConfigParser()
    parser.read(analysisDir)
    logx = parser.getboolean(section, 'logx')
    logy = parser.getboolean(section, 'logy')
    ctrbins = parser.getint(section, 'ctrbins')
    raw_bins = parser.get(section, 'bins')
    bins = json.loads('[' + raw_bins + ']')
    return logx, logy, bins, ctrbins
Exemplo n.º 22
0
def DealData(filename,confDir1,confDir2,Cur_day,cid,bid,analysisDir):
    """Fit an OLS model of the configured target column on the features.

    The data file is loaded as a string array, the target column named in
    the analysis config is located by its position in ``conf2``, and both
    the target and the feature columns are converted to float before being
    handed to ``smi.OLS``.

    Args:
        filename: path of the input data file.
        confDir1, confDir2, Cur_day, cid, bid: forwarded to ``GetConf``.
        analysisDir: analysis configuration path for ``GetAnalysisConf``.

    Returns:
        The summary of the fitted model, or None (after an alert) when
        ``filename`` does not exist.
    """
    if not os.path.exists(filename):
        sM.run(filename + " doesn't exist at step 6_anova")
        return None

    raw = np.array(ReadData(filename))
    frame = pd.DataFrame(raw)
    feature, conf2 = GetConf(confDir1, confDir2, Cur_day, cid, bid)
    frame.columns = conf2

    target_name = GetAnalysisConf(analysisDir)
    target_pos = conf2.index(target_name)
    response = raw[:, target_pos].astype(float)
    design = np.array(frame[feature]).astype(float)

    fitted = smi.OLS(response, design).fit()
    return fitted.summary()
Exemplo n.º 23
0
def ReadData(
    filename
):
    """Read a tab-separated text file into a list of numpy string arrays.

    ``filename`` is the input data path
    (sys.argv[1]+'/data/data_analysis/'+sys.argv[4]+'/'+sys.argv[2]+'/'+sys.argv[3]+'/datareplace').
    Every line is stripped and split on tabs; lines with fewer than two
    fields are skipped.

    Returns:
        list of np.ndarray (one per kept line), or None when reading failed;
        in that case an alert is sent and the last line read plus the
        traceback are written to stderr.
    """
    # Bound up front so the error report works even if open() itself fails.
    line = ''
    try:
        data = []
        with open(filename) as data_file:
            for line in data_file:
                line = line.strip()
                cols = line.split('\t')
                if len(cols) > 1:
                    data.append(np.array(cols))
        return data
    except Exception:
        sM.run("error happend when ReadData at step 4_extrenum")
        sys.stderr.write(
            'error happend when read the data as step 4_extrenum\t%s\n' % line)
        traceback.print_exc(file=sys.stderr)
        return None
Exemplo n.º 24
0
def DealData(filename,confDir1,confDir2,Cur_day,cid,bid,analysisDir):
    """Clamp extreme values of every configured feature column.

    The configured ``method`` selects the treatment:

    * ``'First'``  - sigma rule: values outside mean +/- nstd * std are set
      to the bound they exceed.
    * ``'Second'`` - mean method: values outside mean +/- nstd * std are set
      to the mean.
    * anything else - default treatment: values above the smallest of the
      round((1 - topn / 100) * nrow) largest values are set to that value.

    Args:
        filename: path of the input data file.
        confDir1, confDir2, Cur_day, cid, bid: forwarded to ``GetConf``.
        analysisDir: analysis configuration path for ``GetAnalysisConf``.

    Returns:
        The data frame with the feature columns converted to float and
        clamped, or None (after an alert) when ``filename`` does not exist.
    """
    if not os.path.exists(filename):
        sM.run(filename + " doesn't exist at step 4_extrenum")
        return None

    data = ReadData(filename)
    obData = pd.DataFrame(data)
    feature, conf2 = GetConf(confDir1, confDir2, Cur_day, cid, bid)
    obData.columns = conf2
    nrow = obData.shape[0]
    method, nstd, topn = GetAnalysisConf(analysisDir)

    if method in ('First', 'Second'):
        for name in feature:
            ix = obData[name].astype(float)
            imean = ix.mean()
            istd = ix.std()
            imax = imean + nstd * istd
            imin = imean - nstd * istd
            if method == 'First':
                ix[ix > imax] = imax
                ix[ix < imin] = imin
            else:
                ix[ix > imax] = imean
                ix[ix < imin] = imean
            obData[name] = ix
    else:
        # The count can round down to 0 for small inputs; asking for at least
        # one value avoids indexing into an empty list (the cap then equals
        # the column maximum, so nothing is changed).
        top_count = max(int(round((1 - topn / 100.0) * nrow)), 1)
        for name in feature:
            ix = obData[name].astype(float)
            largest = heapq.nlargest(top_count, ix)
            if largest:  # empty only when the column has no rows
                imax = largest[-1]
                ix[ix > imax] = imax
            obData[name] = ix
    return obData
Exemplo n.º 25
0
def DealData(filename, confDir1, confDir2, Cur_day, cid, bid, analysisDir):
    """Clamp extreme values of every configured feature column.

    The configured ``method`` selects the treatment:

    * ``'First'``  - sigma rule: values outside mean +/- nstd * std are set
      to the bound they exceed.
    * ``'Second'`` - mean method: values outside mean +/- nstd * std are set
      to the mean.
    * anything else - default treatment: values above the smallest of the
      round((1 - topn / 100) * nrow) largest values are set to that value.

    Args:
        filename: path of the input data file.
        confDir1, confDir2, Cur_day, cid, bid: forwarded to ``GetConf``.
        analysisDir: analysis configuration path for ``GetAnalysisConf``.

    Returns:
        The data frame with the feature columns converted to float and
        clamped, or None (after an alert) when ``filename`` does not exist.
    """
    if not os.path.exists(filename):
        sM.run(filename + " doesn't exist at step 4_extrenum")
        return None

    data = ReadData(filename)
    obData = pd.DataFrame(data)
    feature, conf2 = GetConf(confDir1, confDir2, Cur_day, cid, bid)
    obData.columns = conf2
    nrow = obData.shape[0]
    method, nstd, topn = GetAnalysisConf(analysisDir)

    if method in ('First', 'Second'):
        for name in feature:
            ix = obData[name].astype(float)
            imean = ix.mean()
            istd = ix.std()
            imax = imean + nstd * istd
            imin = imean - nstd * istd
            if method == 'First':
                ix[ix > imax] = imax
                ix[ix < imin] = imin
            else:
                ix[ix > imax] = imean
                ix[ix < imin] = imean
            obData[name] = ix
    else:
        # The count can round down to 0 for small inputs; asking for at least
        # one value avoids indexing into an empty list (the cap then equals
        # the column maximum, so nothing is changed).
        top_count = max(int(round((1 - topn / 100.0) * nrow)), 1)
        for name in feature:
            ix = obData[name].astype(float)
            largest = heapq.nlargest(top_count, ix)
            if largest:  # empty only when the column has no rows
                imax = largest[-1]
                ix[ix > imax] = imax
            obData[name] = ix
    return obData
Exemplo n.º 26
0
def OutputResult(filename,confDir1,confDir2,Cur_day,cid,bid,analysisDir):
    """Report features in which a single value reaches the configured share.

    For every configured feature the value frequencies are computed; the
    first value whose count is at least ``nrow * d`` (``d`` comes from the
    analysis config) is recorded. The recorded features are printed, or a
    message saying that none qualifies.

    Args:
        filename: path of the input data file.
        confDir1, confDir2, Cur_day, cid, bid: forwarded to ``GetConf``.
        analysisDir: analysis configuration path for ``GetAnalysisConf``.

    Returns:
        None. When ``filename`` does not exist only an alert is sent.
    """
    if not os.path.exists(filename):
        sM.run(filename + " doesn't exist at step 1_threshold")
        return None

    data = ReadData(filename)
    obData = pd.DataFrame(data)
    feature, conf2 = GetConf(confDir1, confDir2, Cur_day, cid, bid)
    obData.columns = conf2
    nrow = obData.shape[0]
    d = GetAnalysisConf(analysisDir)
    valindex = nrow * d

    # -100 marks "no value reached the threshold" for a feature.
    freq_table = pd.DataFrame(-100, index=feature, columns=['value'])
    for i, name in enumerate(feature):
        counts = pd.Series(obData[name]).value_counts()
        for j in range(len(counts)):
            # counts is indexed by the observed values, so positions must be
            # addressed with .iloc rather than plain [] (a label lookup).
            if counts.iloc[j] >= valindex:
                freq_table.iloc[i] = counts.index[j]
                break

    D_value = freq_table[freq_table['value'] != -100]
    if len(D_value) > 0:
        print('Feature and value of ' + str(d * 100) + '%' + '\n' + str(D_value))
    else:
        # The extra ' ' reproduces the separator the two-argument print
        # statement used to emit between its arguments.
        print('the feature does not exist which somevalue takes up more than '
              + ' ' + str(d * 100) + '%')
Exemplo n.º 27
0
def DealData(filename, confDir1, confDir2, Cur_day, cid, bid):
    """Replace missing-value markers and build per-feature summary statistics.

    For every configured feature the text markers 'NULL' and 'NaN' are
    replaced by that feature's replacement value, and a column of summary
    statistics is assembled from ``BasicSummary``, ``Series.describe`` and
    ``BasicSummary1``.

    Args:
        filename: path of the input data file.
        confDir1, confDir2, Cur_day, cid, bid: forwarded to ``GetConf``;
            ``cid`` and ``bid`` are also stored in the summary table.

    Returns:
        (obData, summary): the data frame with the markers replaced, and the
        transposed summary table (one row per feature). None is returned,
        after an alert, when ``filename`` does not exist.
    """
    if not os.path.exists(filename):
        sM.run(filename + " doesn't exist at step 0_datareplace")
        return None

    data = ReadData(filename)
    obData = pd.DataFrame(data)
    feature, conf2, rpv, zero_effect = GetConf(confDir1, confDir2, Cur_day,
                                               cid, bid)
    obData.columns = conf2

    data_basic_summary = pd.DataFrame(0,
                                      index=np.arange(16),
                                      columns=feature)
    for i, name in enumerate(feature):
        my_series = obData[name]
        # BasicSummary sees the raw column, before the markers are replaced.
        datasummary = BasicSummary(my_series, zero_effect[i])
        my_series = my_series.replace('NULL', rpv[i])
        my_series = my_series.replace('NaN', rpv[i])
        obData[name] = my_series
        my_series = my_series.astype(float)
        datasummary1 = my_series.describe(
            percentiles=[.25, .5, .75, .98]).append(
                BasicSummary1(my_series))
        data_basic_summary.iloc[:, i] = np.array(
            datasummary.append(datasummary1))

    # The 14th label used to be a second 'skew'. The INSERT statement for this
    # table names that column 'skewtest'; this assumes BasicSummary1 returns
    # skew, skewtest, kurtosis, kurtosistest, coefficient of variation in that
    # order -- TODO confirm against BasicSummary1.
    data_basic_summary.index = ('validcount', 'covrate', 'count', 'mean',
                                'std', 'min', '25%', '50%', '75%', '98%',
                                'max', 'skew', 'skewtest', 'kurtosis',
                                'kurtosistest', 'coefficient of variation')
    data_basic_summary.loc['range'] = (
        data_basic_summary.loc['max'] - data_basic_summary.loc['min'])
    data_basic_summary.loc['cid'] = cid
    data_basic_summary.loc['bid'] = bid
    return obData, data_basic_summary.T
Exemplo n.º 28
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
import traceback
import sys, os
import ConfigParser
import json
import subprocess
import ast
import sendMessage as sM
import MySQLdb
# Make <argv[1]>/bin/run importable so get_conf can be loaded; if that
# directory is missing an alert is sent and gc stays undefined.
if os.path.exists(sys.argv[1] + '/bin/run') == False:
    sM.run("get_conf doesn't exist at step 5_correlation")
else:
    sys.path.append(sys.argv[1] + '/bin/run')
    import get_conf as gc

# NOTE(review): threshold is given the string 'nan'; presumably meant to
# disable truncation of printed arrays -- confirm with the numpy in use.
np.set_printoptions(threshold='nan')
# No limit on the number of rows / columns pandas displays.
pd.options.display.max_rows = None
pd.options.display.max_columns = None


def ReadData(
    filename
):  #为读取原始数据的路径:sys.argv[1]+'/data/data_analysis/'+sys.argv[4]+'/'+sys.argv[2]+'/'+sys.argv[3]+'/datareplace'
    try:
        file = open(filename)
        data = []
        for line in file:
Exemplo n.º 29
0
import pandas as pd
import numpy as np
import traceback
import sys
import ConfigParser
import json
import heapq
import os
import subprocess
import ast
from scipy import stats
import sendMessage as sM
import MySQLdb
# Load get_conf from <argv[5]>/bin/run; if get_conf.py is missing an alert
# is sent and gc stays undefined.
if os.path.exists(sys.argv[5] + '/bin/run/get_conf.py') == False:
    sM.run("get_conf doesn't exist at step 0_datareplace")
else:
    sys.path.append(sys.argv[5] + '/bin/run')
    import get_conf as gc

# NOTE(review): threshold is given the string 'nan'; presumably meant to
# disable truncation of printed arrays -- confirm with the numpy in use.
np.set_printoptions(threshold='nan')
# No limit on the number of rows / columns pandas displays.
pd.options.display.max_rows = None
pd.options.display.max_columns = None


def InsertSql(s):
    ins = "INSERT INTO rec_feature_project_data_analysis (feature_name, validcount, covrate, count, mean, std, min, twenty_five, fifty, seventy_five, ninety_eight, max, skew, skewtest, kurtosis, kurtosistest, coef_variation, rang, cid, bid, l_date) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    conn = MySQLdb.connect(host='192.168.61.73',
                           port=9000,
                           user='******',
                           passwd='qianfendian',
Exemplo n.º 30
0
import ConfigParser
import string, os, sys
import pandas as pd
import numpy as np
import math
import traceback
import time
import json
import statsmodels.api as smi
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
import subprocess
import ast
import sendMessage as sM
# Make <argv[1]>/bin/run importable so get_conf can be loaded; if that
# directory is missing an alert is sent and gc stays undefined.
if os.path.exists(sys.argv[1]+'/bin/run')==False:
	sM.run("get_conf doesn't exist at step 6_anova")
else:
	sys.path.append(sys.argv[1]+'/bin/run')
	import get_conf as gc

def isValid(s):
    """Tell whether ``s`` holds a usable value.

    A string is rejected when it is empty or only whitespace, or when it is
    exactly 'null', 'NULL' or '0'.
    """
    if not s.strip():
        return False
    return s not in ('null', 'NULL', '0')

def eps(x):
    """Logistic function 1 / (1 + e**-x), saturated outside [-100, 100].

    Returns the int 1 for x > 100 and the int 0 for x < -100, so that
    math.exp is never called with a large magnitude argument.
    """
    upper, lower = 100, -100
    if x > upper:
        return 1
    elif x < lower:
        return 0
    else:
        denominator = 1 + math.exp(-x)
        return 1 / denominator

def ReadData(filename): #为读取原始数据的路径:sys.argv[1]+'/data/data_analysis/'+sys.argv[4]+'/'+sys.argv[2]+'/'+sys.argv[3]+'/datareplace'
Exemplo n.º 31
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
import traceback
import sys, os
import ConfigParser
import json
import heapq
import subprocess
import ast
import sendMessage as sM
# Make <argv[1]>/bin/run importable so get_conf can be loaded; if that
# directory is missing an alert is sent and gc stays undefined.
if os.path.exists(sys.argv[1] + '/bin/run') == False:
    sM.run("get_conf doesn't exist at step 4_extrenum")
else:
    sys.path.append(sys.argv[1] + '/bin/run')
    import get_conf as gc

# NOTE(review): threshold is given the string 'nan'; presumably meant to
# disable truncation of printed arrays -- confirm with the numpy in use.
np.set_printoptions(threshold='nan')
# No limit on the number of rows / columns pandas displays.
pd.options.display.max_rows = None
pd.options.display.max_columns = None


def ReadData(
    filename
):  #为读取原始数据的路径:sys.argv[1]+'/data/data_analysis/'+sys.argv[4]+'/'+sys.argv[2]+'/'+sys.argv[3]+'/datareplace'
    try:
        file = open(filename)
        data = []
        for line in file:
Exemplo n.º 32
0
def sendM(filepath):
    """Alert that ``filepath`` is missing and then create it.

    Nothing happens when the path already exists. Otherwise an alert is sent
    through ``sM.run`` and the directory (with any missing parents) is
    created with ``os.makedirs``.
    """
    if not os.path.exists(filepath):
        # A space now separates the path from the text of the alert.
        sM.run(filepath + ' does not exist')
        os.makedirs(filepath)
Exemplo n.º 33
0
import pandas as pd
import numpy as np
from scipy import stats
import glob
from pandas.io import sql
import MySQLdb
import traceback
import shutil
import threading
# NOTE(review): os and sys are used below but no import of them is visible
# in this part of the file -- confirm they are imported earlier.
# Import the alert helper only if sendMessage.py is present under
# <argv[1]>/bin/run.
# NOTE(review): when it is absent sM is never bound, so the sM.run call in
# the next check would raise NameError.
if os.path.exists(sys.argv[1] + '/bin/run/sendMessage.py'):
    import sendMessage as sM
else:
    print "sendMessage.py doesn't exist"

# Load get_conf from <argv[1]>/bin/run; if get_conf.py is missing an alert
# is sent and gc stays undefined.
if os.path.exists(sys.argv[1] + '/bin/run/get_conf.py') == False:
    sM.run("get_conf doesn't exist at step feature analysis")
else:
    sys.path.append(sys.argv[1] + '/bin/run')
    import get_conf as gc


def sendM(filepath):
    """Alert that ``filepath`` is missing and then create it.

    Nothing happens when the path already exists. Otherwise an alert is sent
    through ``sM.run`` and the directory (with any missing parents) is
    created with ``os.makedirs``.
    """
    if not os.path.exists(filepath):
        # A space now separates the path from the text of the alert.
        sM.run(filepath + ' does not exist')
        os.makedirs(filepath)


def PreEnv(Base_Dir, Share_Dir):
    dataDir = '/'.join([Share_Dir, 'data', 'feature_integration'])
    dataAnalysisResult = '/'.join([Base_Dir, 'data', 'data_analysis'])
    confDir = '/'.join([Base_Dir, 'conf', 'data_analysis'])
Exemplo n.º 34
0
import ConfigParser
import string, os, sys
import pandas as pd
import numpy as np
import math
import traceback
import time
import json
import statsmodels.api as smi
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
import subprocess
import ast
import sendMessage as sM
# Make <argv[1]>/bin/run importable so get_conf can be loaded; if that
# directory is missing an alert is sent and gc stays undefined.
if os.path.exists(sys.argv[1] + '/bin/run') == False:
    sM.run("get_conf doesn't exist at step 6_anova")
else:
    sys.path.append(sys.argv[1] + '/bin/run')
    import get_conf as gc


def isValid(s):
    """Tell whether ``s`` holds a usable value.

    A string is rejected when it is empty or only whitespace, or when it is
    exactly 'null', 'NULL' or '0'.
    """
    if not s.strip():
        return False
    return s not in ('null', 'NULL', '0')


def eps(x):
    """Logistic function 1 / (1 + e**-x), saturated outside [-100, 100].

    Returns the int 1 for x > 100 and the int 0 for x < -100, so that
    math.exp is never called with a large magnitude argument.
    """
    upper, lower = 100, -100
    if x > upper:
        return 1
    elif x < lower:
        return 0
    else:
        denominator = 1 + math.exp(-x)
        return 1 / denominator
Exemplo n.º 35
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
import traceback
import ConfigParser
import json
import heapq
import subprocess
import ast
import os
import sys
import sendMessage as sM
# Make <argv[5]>/bin/run importable so get_conf can be loaded; if that
# directory is missing an alert is sent and gc stays undefined.
if os.path.exists(sys.argv[5]+'/bin/run')==False:
	sM.run("get_conf doesn't exist at step 1_threshold")
else:
	sys.path.append(sys.argv[5]+'/bin/run')
	import get_conf as gc

# NOTE(review): threshold is given the string 'nan'; presumably meant to
# disable truncation of printed arrays -- confirm with the numpy in use.
np.set_printoptions(threshold = 'nan')
# No limit on the number of rows pandas displays.
pd.set_option('display.max_rows',None)

def ReadData(filename): #为读取原始数据的路径:sys.argv[1]+'/data/feature_integration/'+sys.argv[4]+'/'+sys.argv[2]+'/'+sys.argv[3]+'.integration'
	try:
		file = open(filename) 
		data=[]
		for line in file:
			line=line.strip()
			cols=line.split('\t')
			if len(cols) >1:
Exemplo n.º 36
0
import pandas as pd
import numpy as np
import traceback
import os, sys, json, ast
import ConfigParser
import matplotlib.pyplot as plt
from matplotlib import rcParams
rcParams.update({'figure.autolayout': True})
from matplotlib.backends.backend_pdf import PdfPages
import heapq
import subprocess
import sendMessage as sM
import math
# Make <argv[1]>/bin/run importable so get_conf can be loaded; if that
# directory is missing an alert is sent and gc stays undefined.
if os.path.exists(sys.argv[1] + '/bin/run') == False:
    sM.run("get_conf doesn't exist at step 3_histogram")
else:
    sys.path.append(sys.argv[1] + '/bin/run')
    import get_conf as gc

# NOTE(review): threshold is given the string 'nan'; presumably meant to
# disable truncation of printed arrays -- confirm with the numpy in use.
np.set_printoptions(threshold='nan')
# No limit on the number of rows / columns pandas displays.
pd.options.display.max_rows = None
pd.options.display.max_columns = None


def ReadData(
    filename
):  #为读取原始数据的路径:sys.argv[1]+'/data/data_analysis/'+sys.argv[4]+'/'+sys.argv[2]+'/'+sys.argv[3]+'/datareplace'
    try:
        file = open(filename)
        data = []
Exemplo n.º 37
0
import pandas as pd
import numpy as np
from scipy import stats
import glob
from pandas.io import sql
import MySQLdb
import traceback
import shutil
import threading
# NOTE(review): os and sys are used below but no import of them is visible
# in this part of the file -- confirm they are imported earlier.
# Import the alert helper only if sendMessage.py is present under
# <argv[1]>/bin/run.
# NOTE(review): when it is absent sM is never bound, so the sM.run call in
# the next check would raise NameError.
if os.path.exists(sys.argv[1]+'/bin/run/sendMessage.py'):
    import sendMessage as sM
else:
    print "sendMessage.py doesn't exist"

# Load get_conf from <argv[1]>/bin/run; if get_conf.py is missing an alert
# is sent and gc stays undefined.
if os.path.exists(sys.argv[1]+'/bin/run/get_conf.py')==False:
	sM.run("get_conf doesn't exist at step feature analysis")
else:
	sys.path.append(sys.argv[1]+'/bin/run')
	import get_conf as gc


def sendM(filepath):
    """Alert that ``filepath`` is missing and then create it.

    Nothing happens when the path already exists. Otherwise an alert is sent
    through ``sM.run`` and the directory (with any missing parents) is
    created with ``os.makedirs``.
    """
    if not os.path.exists(filepath):
        # A space now separates the path from the text of the alert.
        sM.run(filepath + ' does not exist')
        os.makedirs(filepath)


def PreEnv(Base_Dir, Share_Dir):
	dataDir='/'.join([Share_Dir, 'data', 'feature_integration']) 
	dataAnalysisResult='/'.join([Base_Dir, 'data', 'data_analysis'])
	confDir='/'.join([Base_Dir, 'conf', 'data_analysis'])
Exemplo n.º 38
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
import traceback
import sys, os
import ConfigParser
import json
import heapq
import subprocess
import ast
import sendMessage as sM
# Make <argv[1]>/bin/run importable so get_conf can be loaded; if that
# directory is missing an alert is sent and gc stays undefined.
if os.path.exists(sys.argv[1]+'/bin/run')==False:
	sM.run("get_conf doesn't exist at step 4_extrenum")
else:
	sys.path.append(sys.argv[1]+'/bin/run')
	import get_conf as gc

# NOTE(review): threshold is given the string 'nan'; presumably meant to
# disable truncation of printed arrays -- confirm with the numpy in use.
np.set_printoptions(threshold='nan')
# No limit on the number of rows / columns pandas displays.
pd.options.display.max_rows=None
pd.options.display.max_columns=None

def ReadData(filename): #为读取原始数据的路径:sys.argv[1]+'/data/data_analysis/'+sys.argv[4]+'/'+sys.argv[2]+'/'+sys.argv[3]+'/datareplace'
	try:
		file = open(filename) 
		data=[]
		for line in file:
			line=line.strip()
			cols=line.split('\t')
			if len(cols) >1:
Exemplo n.º 39
0
import pandas as pd
import numpy as np
import traceback
import sys
import ConfigParser
import json
import heapq
import os
import subprocess
import ast
from scipy import stats
import sendMessage as sM
import MySQLdb
# Load get_conf from <argv[5]>/bin/run; if get_conf.py is missing an alert
# is sent and gc stays undefined.
if os.path.exists(sys.argv[5]+'/bin/run/get_conf.py')==False:
	sM.run("get_conf doesn't exist at step 0_datareplace")
else:
	sys.path.append(sys.argv[5]+'/bin/run')
	import get_conf as gc

# NOTE(review): threshold is given the string 'nan'; presumably meant to
# disable truncation of printed arrays -- confirm with the numpy in use.
np.set_printoptions(threshold='nan')
# No limit on the number of rows / columns pandas displays.
pd.options.display.max_rows=None
pd.options.display.max_columns=None

def InsertSql(s):
	ins="INSERT INTO rec_feature_project_data_analysis (feature_name, validcount, covrate, count, mean, std, min, twenty_five, fifty, seventy_five, ninety_eight, max, skew, skewtest, kurtosis, kurtosistest, coef_variation, rang, cid, bid, l_date) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
	conn= MySQLdb.connect(host='192.168.61.73',port = 9000, user='******',passwd='qianfendian',db ='rec_report')
	cursor = conn.cursor()
	cursor.execute(ins, tuple(s))
	conn.commit()
Exemplo n.º 40
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
import traceback
import sys, os
import ConfigParser
import json
import subprocess
import ast
import sendMessage as sM
import MySQLdb
# Make <argv[1]>/bin/run importable so get_conf can be loaded; if that
# directory is missing an alert is sent and gc stays undefined.
if os.path.exists(sys.argv[1]+'/bin/run')==False:
	sM.run("get_conf doesn't exist at step 5_correlation")
else:
	sys.path.append(sys.argv[1]+'/bin/run')
	import get_conf as gc

# NOTE(review): threshold is given the string 'nan'; presumably meant to
# disable truncation of printed arrays -- confirm with the numpy in use.
np.set_printoptions(threshold='nan')
# No limit on the number of rows / columns pandas displays.
pd.options.display.max_rows=None
pd.options.display.max_columns=None


def ReadData(filename): #为读取原始数据的路径:sys.argv[1]+'/data/data_analysis/'+sys.argv[4]+'/'+sys.argv[2]+'/'+sys.argv[3]+'/datareplace'
	try:
		file = open(filename) 
		data=[]
		for line in file:
			line=line.strip()
			cols=line.split('\t')
Exemplo n.º 41
0
def sendM(filepath):
    """Alert that ``filepath`` is missing and then create it.

    Nothing happens when the path already exists. Otherwise an alert is sent
    through ``sM.run`` and the directory (with any missing parents) is
    created with ``os.makedirs``.
    """
    if not os.path.exists(filepath):
        # A space now separates the path from the text of the alert.
        sM.run(filepath + ' does not exist')
        os.makedirs(filepath)
Exemplo n.º 42
0
import pandas as pd
import numpy as np
import traceback
import os, sys, json, ast
import ConfigParser
import matplotlib.pyplot as plt
from matplotlib import rcParams
rcParams.update({'figure.autolayout': True})
from matplotlib.backends.backend_pdf import PdfPages
import heapq
import subprocess
import sendMessage as sM
import math
# Make <argv[1]>/bin/run importable so get_conf can be loaded; if that
# directory is missing an alert is sent and gc stays undefined.
if os.path.exists(sys.argv[1]+'/bin/run')==False:
	sM.run("get_conf doesn't exist at step 3_histogram")
else:
	sys.path.append(sys.argv[1]+'/bin/run')
	import get_conf as gc

# NOTE(review): threshold is given the string 'nan'; presumably meant to
# disable truncation of printed arrays -- confirm with the numpy in use.
np.set_printoptions(threshold='nan')
# No limit on the number of rows / columns pandas displays.
pd.options.display.max_rows=None
pd.options.display.max_columns=None


def ReadData(filename): #为读取原始数据的路径:sys.argv[1]+'/data/data_analysis/'+sys.argv[4]+'/'+sys.argv[2]+'/'+sys.argv[3]+'/datareplace'
	try:
		file = open(filename) 
		data=[]
		for line in file:
			line=line.strip()