def _frank_PKTE(X):
    """
    Estimates the Frank copula dependency parameter for the data set X.

    The empirical Kendall's tau of X is computed first and then inverted
    through invcopulastat to obtain the matching Frank parameter, which is
    returned.
    """
    empirical_tau = multivariate_stats.kendalls_tau(X)
    return invcopulastat('Frank', 'kendall', empirical_tau)
def cvolume(family, u1v1, u1v2, u2v1, u2v2, *args):
    """
    Computes the C-Volume of a specified copula family with dependency parameter
    defined in the args.
      family - the copula type (compared case-insensitively), must be:
                'Gaussian'
                'T'
                'Clayton'
                'Frank'
                'Gumbel'
      u1v1   - a N x 2 matrix of values between [0,1] that represents the bottom
               left coordinate of the grid for which the C-Volume is desired
      u1v2   - a N x 2 matrix of values between [0,1] that represents the top
               left coordinate of the grid for which the C-Volume is desired
      u2v1   - a N x 2 matrix of values between [0,1] that represents the bottom
               right coordinate of the grid for which the C-Volume is desired
      u2v2   - a N x 2 matrix of values between [0,1] that represents the top
               right coordinate of the grid for which the C-Volume is desired
      args   - must be at least of length 2.  The first element is a string
               which describes the dependency value being provided:
                 'kendall'  - Kendall's Tau is being provided
                 'spearman' - Spearman's rho is being provided
                 'native'   - the dependency parameter of the copula family
                              itself is being provided directly
               Any first element other than 'kendall' or 'spearman' is treated
               as native.  The second element is the value of the dependency
               type provided.  For kendall and spearman, a scalar value is
               expected.  For native, if the family is Frank, Gumbel or
               Clayton, a scalar dependency parameter is expected; if the
               family is Gaussian, a 2 x 2 numpy array representing the
               correlation matrix is expected.  If the family is T, the 2nd
               argument is the 2 x 2 correlation matrix and the 3rd argument
               is the degrees of freedom (so 3 arguments are required).

    Returns the value produced by the family specific helper (_gaussian,
    _clayton, _frank or _gumbel).

    Raises ValueError if the family is not one of the supported names, if too
    few variable arguments are given, or if Kendall's Tau / Spearman's Rho is
    supplied for the T family.
    """
    family_lc = family.lower()

    if family_lc == 'gaussian':
        if len(args) < 2:
            raise ValueError("Gaussian Family expects 2 variable arguments, the dependency type and value")
        if args[0] == 'kendall' or args[0] == 'spearman':
            # get the correlation parameter
            r = invcopulastat(family, args[0], args[1])
        else:
            r = args[1]
        cvol = _gaussian(u1v1, u1v2, u2v1, u2v2, r)

    elif family_lc == 't':
        if len(args) < 2:
            raise ValueError("T Family expects atleast 2 variable arguments, the dependency type and value")
        if args[0] == 'kendall' or args[0] == 'spearman':
            raise ValueError('T Family does not accept Kendalls Tau or Spearmans Rho, only native parameters')
        if len(args) < 3:
            # without this check the degrees of freedom lookup below fails
            # with an opaque IndexError
            raise ValueError("T Family expects 3 variable arguments for native parameters: "
                             "the dependency type, the correlation matrix and the degrees of freedom")
        r = args[1]
        nu = args[2]
        # NOTE(review): the T family is routed to _gaussian with an extra
        # degrees-of-freedom argument, exactly as before.  Confirm that
        # _gaussian accepts it, or whether a dedicated T helper was intended.
        cvol = _gaussian(u1v1, u1v2, u2v1, u2v2, r, nu)

    elif family_lc == 'clayton' or family_lc == 'frank' or family_lc == 'gumbel':
        if len(args) < 2:
            # capitalize() yields 'Clayton' / 'Frank' / 'Gumbel'
            raise ValueError(family_lc.capitalize() +
                             " Family expects 2 variable arguments, the dependency type and value")
        if args[0] == 'kendall' or args[0] == 'spearman':
            # convert the rank correlation into the family's own parameter
            alpha = invcopulastat(family, args[0], args[1])
        else:
            alpha = args[1]

        if family_lc == 'clayton':
            cvol = _clayton(u1v1, u1v2, u2v1, u2v2, alpha)
        elif family_lc == 'frank':
            cvol = _frank(u1v1, u1v2, u2v1, u2v2, alpha)
        else:
            cvol = _gumbel(u1v1, u1v2, u2v1, u2v2, alpha)

    else:
        # previously an unknown family reached "return cvol" with cvol unbound
        raise ValueError("Unsupported copula family: " + str(family))

    return cvol
def visualizeMNSig():
    """
    Visualizes the copula multinomial signature over a sweep of Kendall's tau.

    For every tau in the sweep one figure is produced containing:
      - the multinomial signature of each copula family (as computed by
        copulamnsig)
      - a scatter plot of samples drawn from the Gaussian, Clayton, Frank and
        Gumbel copulas parameterized to that tau (when a valid parameter
        exists)
      - a stacked bar chart built from the results of testHELM_parametric
    Each figure is saved as a PNG under figures/HELM_performance/ and closed.
    """
    # some tests on the copula multinomial signature
    K = 4
    M = 1000
    N = 3
    tauVec = np.arange(-0.9, 0.95, 0.05)
    # the families to test against and pick optimal copula
    families = ['Gaussian', 'Clayton', 'Gumbel', 'Frank']

    helmAccuracyResults = testHELM_parametric(K, M, N, tauVec, families)

    # signatures indexed as resultsAggregate[family][tau]
    resultsAggregate = {}
    for family in families:
        resultsAggregate[family] = {
            tau: copulamnsig(family, K, 'kendall', tau) for tau in tauVec
        }

    def dependency_or_sentinel(family, tau):
        # -1 is the marker used below for "no valid parameter at this tau"
        try:
            return invcopulastat(family, 'kendall', tau)
        except ValueError:
            return -1

    def draw_samples(param, family, *rnd_args):
        # nothing is drawn for the -1 marker; a failed draw yields zeros
        samples = None
        if param != -1:
            try:
                samples = copularnd(family, *rnd_args)
            except ValueError:
                samples = np.zeros((M, N))
        return samples

    # visualize the results
    for tau in tauVec:
        # we would also like to visualize this copula on the side, to try to
        # understand what may be a better way todo model selection
        r = dependency_or_sentinel('Gaussian', tau)
        # N x N matrix with ones on the diagonal and r everywhere else
        Rho = np.full((N, N), r, dtype=float)
        np.fill_diagonal(Rho, 1)

        alpha_clayton = dependency_or_sentinel('Clayton', tau)
        alpha_gumbel = dependency_or_sentinel('Gumbel', tau)
        alpha_frank = dependency_or_sentinel('Frank', tau)

        U_gauss = draw_samples(r, 'Gaussian', M, Rho)
        U_clayton = draw_samples(alpha_clayton, 'Clayton', M, N, alpha_clayton)
        U_frank = draw_samples(alpha_frank, 'Frank', M, N, alpha_frank)
        U_gumbel = draw_samples(alpha_gumbel, 'Gumbel', M, N, alpha_gumbel)

        # get each family's MN signature and plot it
        plt.figure(figsize=(30, 20))
        plt.subplot(231)
        signature_styles = (
            ('Gaussian', 'b.-', 'Gaussian Copula'),
            ('Clayton', 'g.-', 'Clayton Copula'),
            ('Gumbel', 'r.-', 'Gumbel Copula'),
            ('Frank', 'k.-', 'Frank Copula'),
        )
        for family, line_style, legend_label in signature_styles:
            signature = resultsAggregate[family][tau]
            # families whose signature sums to zero are left off the plot
            if np.sum(signature) > 0:
                plt.plot(np.arange(1, K * K + 1), signature, line_style, label=legend_label)
        plt.title(r'Copula Multinomial Signature $\tau$=' + "{0:.2f}".format(tau) + ' K=' + str(K))
        plt.legend()
        plt.grid()

        # one scatter panel per family; only the first two sample columns
        # are shown
        scatter_panels = (
            (232, U_gauss, r, r'Gaussian Copula, $\rho$='),
            (233, U_clayton, alpha_clayton, r'Clayton Copula, $\alpha$='),
            (235, U_frank, alpha_frank, r'Frank Copula, $\alpha$='),
            (236, U_gumbel, alpha_gumbel, r'Gumbel Copula, $\alpha$='),
        )
        for position, samples, param, title_prefix in scatter_panels:
            plt.subplot(position)
            if param != -1:
                plt.scatter(samples[:, 0], samples[:, 1])
            plt.grid()
            plt.title(title_prefix + "{0:.2f}".format(param) + r' $\tau$=' + "{0:.2f}".format(tau))

        plt.subplot(234)
        # index manually to ensure accuracy
        # NOTE(review): presumably the outer key is the family the data was
        # generated from and the inner key the family that was selected --
        # confirm against testHELM_parametric
        outer_keys = ('Clayton', 'Gaussian', 'Gumbel', 'Frank')
        cla = np.array([helmAccuracyResults[key][tau]['clayton'] for key in outer_keys])
        gau = np.array([helmAccuracyResults[key][tau]['gaussian'] for key in outer_keys])
        gum = np.array([helmAccuracyResults[key][tau]['gumbel'] for key in outer_keys])
        fra = np.array([helmAccuracyResults[key][tau]['frank'] for key in outer_keys])

        ind = np.arange(4)
        width = 0.2
        # stack the four series on top of each other
        base_gau = cla
        base_gum = base_gau + gau
        base_fra = base_gum + gum
        p1 = plt.bar(ind, cla, width, color='b')
        p2 = plt.bar(ind, gau, width, color='g', bottom=base_gau)
        p3 = plt.bar(ind, gum, width, color='k', bottom=base_gum)
        p4 = plt.bar(ind, fra, width, color='r', bottom=base_fra)

        plt.xticks(ind + width / 2., ('Clayton', 'Gaussian', 'Gumbel', 'Frank'))
        plt.legend((p1[0], p2[0], p3[0], p4[0]), ('Clayton', 'Gaussian', 'Gumbel', 'Frank'))

        plt.grid()
        figure_name = 'HELM_DIM_' + str(N) + '_tau_' + "{0:.2f}".format(tau) + ' _K_' + str(K) + '.png'
        plt.savefig(os.path.join('figures/HELM_performance/', figure_name))

        plt.close()
def testHELM(tau, M, N, familyToTest, numMCSims, copulaFamiliesToTest): results = {} for fam in copulaFamiliesToTest: results[fam.lower()] = 0 for ii in range(0,numMCSims): # generate samples of the requested copula with tau same as the # empirical signature we calculated above if(familyToTest.lower()=='gaussian'): r = invcopulastat(familyToTest, 'kendall', tau) Rho = np.empty((N,N)) for jj in range(0,N): for kk in range(0,N): if(jj==kk): Rho[jj][kk] = 1 else: Rho[jj][kk] = r try: U = copularnd(familyToTest, M, Rho) except ValueError: # copularnd will throw a ValueError if Rho is not a positive semidefinite matrix return results # return 0, which will then be ignored by tests else: # assume Clayton, Frank, or Gumbel try: alpha = invcopulastat(familyToTest, 'kendall', tau) U = copularnd(familyToTest, M, N, alpha) except ValueError: continue lst = [] for jj in range(0,N): U_conditioned = U[:,jj] # if there are any 1's, condition it U_conditioned[U_conditioned==1] = 0.99 if(jj%2==0): lst.append(norm.ppf(U_conditioned)) else: lst.append(expon.ppf(U_conditioned)) # combine X and Y into the joint distribution w/ the copula X = np.vstack(lst) X = X.T ret = optimalCopulaFamily(X, family_search=copulaFamiliesToTest) ret_family = ret[0].lower() # aggregate results results[ret_family] = results[ret_family] + 1.0 # display some progress sys.stdout.write("\rComputing " + str(familyToTest) + " Copula (DIM=%d) (tau=%f)-- %d%%" % (N,tau,ii+1)) sys.stdout.flush() sys.stdout.write("\r") # convert results to percentage for fam in copulaFamiliesToTest: results[fam.lower()] = results[fam.lower()]/float(numMCSims) * 100 return results
def optimalCopulaFamily(X, K=4, family_search=('Gaussian', 'Clayton', 'Gumbel', 'Frank')):
    """
    This function, given a multivariate data set X, computes the best copula
    family which fits the data, using the procedure described in the paper
    "Highly Efficient Learning of Mixed Copula Networks," by Gal Elidan

      X - the multivariate dataset for which we desire the copula.  Must be a
          numpy array of dimension [M x N], where M is the number of data
          points, and N is the dimensionality of the dataset
      K - the square root of the number of grid points (for now, we assume
          square gridding of the unit cube)
      family_search - the copula families to search.  Currently, what is
          supported is Gaussian, Clayton, Gumbel, and Frank.  As more
          copula's are added, the default will be expanded.

    Returns a tuple (optimalFamily, depParams, tau) where optimalFamily is the
    entry of family_search with the smallest KL divergence, depParams is the
    result of invcopulastat for that family, and tau is the Kendall's tau
    that was used for that family (the empirical estimate, adjusted as
    described below when the family is Clayton or Gumbel).

    Raises ValueError if no family could be selected (family_search is empty
    or every family was excluded).
    """
    # compute the empirical Kendall's Tau
    tau_hat = multivariate_stats.kendalls_tau(X)

    # compute empirical multinomial signature
    empirical_mnsig = empirical_copulamnsig(X, K)[0]['esig']
    # replace any 0 values w/ smallest possible float value
    empirical_mnsig[empirical_mnsig == 0] = np.spacing(1)

    # compute the multinomial signature for each of the copula families
    # specified and simultaneously compute the kullback leibler divergence
    # between the empirical and the computed, and store that info
    distances = {}
    tau_used = {}
    for family in family_search:
        # Each family works from its own copy of the estimate.  Adjusting the
        # shared tau_hat in place would leak one family's restriction into
        # the families evaluated after it and into the returned parameters.
        tau_family = tau_hat

        if family.lower() in ('clayton', 'gumbel'):
            # The Clayton and Gumbel copulas are only tested for non-negative
            # tau here.  Like any estimator, tau_hat has some variance, so a
            # value slightly below 0 is treated as 0, while a clearly
            # negative value excludes the family by assigning it an infinite
            # KL divergence.
            if tau_hat < -0.05:
                distances[family] = np.inf
                continue
            elif tau_hat < 0:
                tau_family = 0
            elif tau_hat >= 1:
                # as close to 1 as possible in our precision
                tau_family = 1 - np.spacing(1)
        # any other copula families with restrictions can go here

        mnsig = copulamnsig(family, K, 'kendall', tau_family)
        # replace any 0 values w/ smallest possible float value
        mnsig[mnsig == 0] = np.spacing(1)

        # compute KL divergence, see
        # http://docs.scipy.org/doc/scipy-dev/reference/generated/scipy.stats.entropy.html
        distances[family] = entropy(mnsig, empirical_mnsig)
        tau_used[family] = tau_family

    # search for the minimum distance, that is the optimal copula family to use
    optimalFamily = None
    minDistance = np.inf
    # items() rather than iteritems() so this runs on Python 2 and Python 3
    for family, distance in distances.items():
        if distance < minDistance:
            minDistance = distance
            optimalFamily = family

    if optimalFamily is None:
        # previously this surfaced as an unbound local variable error
        raise ValueError("No copula family could be selected from family_search "
                         "(Kendall's tau estimate: " + str(tau_hat) + ")")

    tau_optimal = tau_used[optimalFamily]
    depParams = invcopulastat(optimalFamily, 'kendall', tau_optimal)

    return (optimalFamily, depParams, tau_optimal)
def cvolume(family, u1v1, u1v2, u2v1, u2v2, *args):
    """
    Computes the C-Volume of a specified copula family with dependency parameter
    defined in the args.
      family - the copula type (compared case-insensitively), must be:
                'Gaussian'
                'T'
                'Clayton'
                'Frank'
                'Gumbel'
      u1v1   - a N x 2 matrix of values between [0,1] that represents the bottom
               left coordinate of the grid for which the C-Volume is desired
      u1v2   - a N x 2 matrix of values between [0,1] that represents the top
               left coordinate of the grid for which the C-Volume is desired
      u2v1   - a N x 2 matrix of values between [0,1] that represents the bottom
               right coordinate of the grid for which the C-Volume is desired
      u2v2   - a N x 2 matrix of values between [0,1] that represents the top
               right coordinate of the grid for which the C-Volume is desired
      args   - must be at least of length 2.  The first element is a string
               which describes the dependency value being provided:
                 'kendall'  - Kendall's Tau is being provided
                 'spearman' - Spearman's rho is being provided
                 'native'   - the dependency parameter of the copula family
                              itself is being provided directly
               Any first element other than 'kendall' or 'spearman' is treated
               as native.  The second element is the value of the dependency
               type provided.  For kendall and spearman, a scalar value is
               expected.  For native, if the family is Frank, Gumbel or
               Clayton, a scalar dependency parameter is expected; if the
               family is Gaussian, a 2 x 2 numpy array representing the
               correlation matrix is expected.  If the family is T, the 2nd
               argument is the 2 x 2 correlation matrix and the 3rd argument
               is the degrees of freedom (so 3 arguments are required).

    Returns the value produced by the family specific helper (_gaussian,
    _clayton, _frank or _gumbel).

    Raises ValueError if the family is not one of the supported names, if too
    few variable arguments are given, or if Kendall's Tau / Spearman's Rho is
    supplied for the T family.
    """
    family_lc = family.lower()

    if family_lc == 'gaussian':
        if len(args) < 2:
            raise ValueError("Gaussian Family expects 2 variable arguments, the dependency type and value")
        if args[0] == 'kendall' or args[0] == 'spearman':
            # get the correlation parameter
            r = invcopulastat(family, args[0], args[1])
        else:
            r = args[1]
        cvol = _gaussian(u1v1, u1v2, u2v1, u2v2, r)

    elif family_lc == 't':
        if len(args) < 2:
            raise ValueError("T Family expects atleast 2 variable arguments, the dependency type and value")
        if args[0] == 'kendall' or args[0] == 'spearman':
            raise ValueError('T Family does not accept Kendalls Tau or Spearmans Rho, only native parameters')
        if len(args) < 3:
            # without this check the degrees of freedom lookup below fails
            # with an opaque IndexError
            raise ValueError("T Family expects 3 variable arguments for native parameters: "
                             "the dependency type, the correlation matrix and the degrees of freedom")
        r = args[1]
        nu = args[2]
        # NOTE(review): the T family is routed to _gaussian with an extra
        # degrees-of-freedom argument, exactly as before.  Confirm that
        # _gaussian accepts it, or whether a dedicated T helper was intended.
        cvol = _gaussian(u1v1, u1v2, u2v1, u2v2, r, nu)

    elif family_lc == 'clayton' or family_lc == 'frank' or family_lc == 'gumbel':
        if len(args) < 2:
            # capitalize() yields 'Clayton' / 'Frank' / 'Gumbel'
            raise ValueError(family_lc.capitalize() +
                             " Family expects 2 variable arguments, the dependency type and value")
        if args[0] == 'kendall' or args[0] == 'spearman':
            # convert the rank correlation into the family's own parameter
            alpha = invcopulastat(family, args[0], args[1])
        else:
            alpha = args[1]

        if family_lc == 'clayton':
            cvol = _clayton(u1v1, u1v2, u2v1, u2v2, alpha)
        elif family_lc == 'frank':
            cvol = _frank(u1v1, u1v2, u2v1, u2v2, alpha)
        else:
            cvol = _gumbel(u1v1, u1v2, u2v1, u2v2, alpha)

    else:
        # previously an unknown family reached "return cvol" with cvol unbound
        raise ValueError("Unsupported copula family: " + str(family))

    return cvol
def visualizeMNSig():
    """
    Visualizes the copula multinomial signature over a sweep of Kendall's tau.

    For every tau in the sweep one figure is produced containing:
      - the multinomial signature of each copula family (as computed by
        copulamnsig)
      - a scatter plot of samples drawn from the Gaussian, Clayton, Frank and
        Gumbel copulas parameterized to that tau (when a valid parameter
        exists)
      - a stacked bar chart built from the results of testHELM_parametric
    Each figure is saved as a PNG under figures/HELM_performance/ and closed.
    """
    # some tests on the copula multinomial signature
    K = 4
    M = 1000
    N = 3
    tauVec = np.arange(-0.9, 0.95, 0.05)
    # the families to test against and pick optimal copula
    families = ['Gaussian', 'Clayton', 'Gumbel', 'Frank']

    helmAccuracyResults = testHELM_parametric(K, M, N, tauVec, families)

    # signatures indexed as resultsAggregate[family][tau]
    resultsAggregate = {}
    for family in families:
        resultsAggregate[family] = {
            tau: copulamnsig(family, K, 'kendall', tau) for tau in tauVec
        }

    def dependency_or_sentinel(family, tau):
        # -1 is the marker used below for "no valid parameter at this tau"
        try:
            return invcopulastat(family, 'kendall', tau)
        except ValueError:
            return -1

    def draw_samples(param, family, *rnd_args):
        # nothing is drawn for the -1 marker; a failed draw yields zeros
        samples = None
        if param != -1:
            try:
                samples = copularnd(family, *rnd_args)
            except ValueError:
                samples = np.zeros((M, N))
        return samples

    # visualize the results
    for tau in tauVec:
        # we would also like to visualize this copula on the side, to try to
        # understand what may be a better way todo model selection
        r = dependency_or_sentinel('Gaussian', tau)
        # N x N matrix with ones on the diagonal and r everywhere else
        Rho = np.full((N, N), r, dtype=float)
        np.fill_diagonal(Rho, 1)

        alpha_clayton = dependency_or_sentinel('Clayton', tau)
        alpha_gumbel = dependency_or_sentinel('Gumbel', tau)
        alpha_frank = dependency_or_sentinel('Frank', tau)

        U_gauss = draw_samples(r, 'Gaussian', M, Rho)
        U_clayton = draw_samples(alpha_clayton, 'Clayton', M, N, alpha_clayton)
        U_frank = draw_samples(alpha_frank, 'Frank', M, N, alpha_frank)
        U_gumbel = draw_samples(alpha_gumbel, 'Gumbel', M, N, alpha_gumbel)

        # get each family's MN signature and plot it
        plt.figure(figsize=(30, 20))
        plt.subplot(231)
        signature_styles = (
            ('Gaussian', 'b.-', 'Gaussian Copula'),
            ('Clayton', 'g.-', 'Clayton Copula'),
            ('Gumbel', 'r.-', 'Gumbel Copula'),
            ('Frank', 'k.-', 'Frank Copula'),
        )
        for family, line_style, legend_label in signature_styles:
            signature = resultsAggregate[family][tau]
            # families whose signature sums to zero are left off the plot
            if np.sum(signature) > 0:
                plt.plot(np.arange(1, K * K + 1), signature, line_style, label=legend_label)
        plt.title(r'Copula Multinomial Signature $\tau$=' + "{0:.2f}".format(tau) + ' K=' + str(K))
        plt.legend()
        plt.grid()

        # one scatter panel per family; only the first two sample columns
        # are shown
        scatter_panels = (
            (232, U_gauss, r, r'Gaussian Copula, $\rho$='),
            (233, U_clayton, alpha_clayton, r'Clayton Copula, $\alpha$='),
            (235, U_frank, alpha_frank, r'Frank Copula, $\alpha$='),
            (236, U_gumbel, alpha_gumbel, r'Gumbel Copula, $\alpha$='),
        )
        for position, samples, param, title_prefix in scatter_panels:
            plt.subplot(position)
            if param != -1:
                plt.scatter(samples[:, 0], samples[:, 1])
            plt.grid()
            plt.title(title_prefix + "{0:.2f}".format(param) + r' $\tau$=' + "{0:.2f}".format(tau))

        plt.subplot(234)
        # index manually to ensure accuracy
        # NOTE(review): presumably the outer key is the family the data was
        # generated from and the inner key the family that was selected --
        # confirm against testHELM_parametric
        outer_keys = ('Clayton', 'Gaussian', 'Gumbel', 'Frank')
        cla = np.array([helmAccuracyResults[key][tau]['clayton'] for key in outer_keys])
        gau = np.array([helmAccuracyResults[key][tau]['gaussian'] for key in outer_keys])
        gum = np.array([helmAccuracyResults[key][tau]['gumbel'] for key in outer_keys])
        fra = np.array([helmAccuracyResults[key][tau]['frank'] for key in outer_keys])

        ind = np.arange(4)
        width = 0.2
        # stack the four series on top of each other
        base_gau = cla
        base_gum = base_gau + gau
        base_fra = base_gum + gum
        p1 = plt.bar(ind, cla, width, color='b')
        p2 = plt.bar(ind, gau, width, color='g', bottom=base_gau)
        p3 = plt.bar(ind, gum, width, color='k', bottom=base_gum)
        p4 = plt.bar(ind, fra, width, color='r', bottom=base_fra)

        plt.xticks(ind + width / 2., ('Clayton', 'Gaussian', 'Gumbel', 'Frank'))
        plt.legend((p1[0], p2[0], p3[0], p4[0]), ('Clayton', 'Gaussian', 'Gumbel', 'Frank'))

        plt.grid()
        figure_name = 'HELM_DIM_' + str(N) + '_tau_' + "{0:.2f}".format(tau) + ' _K_' + str(K) + '.png'
        plt.savefig(os.path.join('figures/HELM_performance/', figure_name))

        plt.close()
def testHELM(tau, M, N, familyToTest, numMCSims, copulaFamiliesToTest): results = {} for fam in copulaFamiliesToTest: results[fam.lower()] = 0 for ii in range(0, numMCSims): # generate samples of the requested copula with tau same as the # empirical signature we calculated above if (familyToTest.lower() == 'gaussian'): r = invcopulastat(familyToTest, 'kendall', tau) Rho = np.empty((N, N)) for jj in range(0, N): for kk in range(0, N): if (jj == kk): Rho[jj][kk] = 1 else: Rho[jj][kk] = r try: U = copularnd(familyToTest, M, Rho) except ValueError: # copularnd will throw a ValueError if Rho is not a positive semidefinite matrix return results # return 0, which will then be ignored by tests else: # assume Clayton, Frank, or Gumbel try: alpha = invcopulastat(familyToTest, 'kendall', tau) U = copularnd(familyToTest, M, N, alpha) except ValueError: continue lst = [] for jj in range(0, N): U_conditioned = U[:, jj] # if there are any 1's, condition it U_conditioned[U_conditioned == 1] = 0.99 if (jj % 2 == 0): lst.append(norm.ppf(U_conditioned)) else: lst.append(expon.ppf(U_conditioned)) # combine X and Y into the joint distribution w/ the copula X = np.vstack(lst) X = X.T ret = optimalCopulaFamily(X, family_search=copulaFamiliesToTest) ret_family = ret[0].lower() # aggregate results results[ret_family] = results[ret_family] + 1.0 # display some progress sys.stdout.write("\rComputing " + str(familyToTest) + " Copula (DIM=%d) (tau=%f)-- %d%%" % (N, tau, ii + 1)) sys.stdout.flush() sys.stdout.write("\r") # convert results to percentage for fam in copulaFamiliesToTest: results[fam.lower()] = results[fam.lower()] / float(numMCSims) * 100 return results
def optimalCopulaFamily(X, K=4, family_search=('Gaussian', 'Clayton', 'Gumbel', 'Frank')):
    """
    This function, given a multivariate data set X, computes the best copula
    family which fits the data, using the procedure described in the paper
    "Highly Efficient Learning of Mixed Copula Networks," by Gal Elidan

      X - the multivariate dataset for which we desire the copula.  Must be a
          numpy array of dimension [M x N], where M is the number of data
          points, and N is the dimensionality of the dataset
      K - the square root of the number of grid points (for now, we assume
          square gridding of the unit cube)
      family_search - the copula families to search.  Currently, what is
          supported is Gaussian, Clayton, Gumbel, and Frank.  As more
          copula's are added, the default will be expanded.

    Returns a tuple (optimalFamily, depParams, tau) where optimalFamily is the
    entry of family_search with the smallest KL divergence, depParams is the
    result of invcopulastat for that family, and tau is the Kendall's tau
    that was used for that family (the empirical estimate, adjusted as
    described below when the family is Clayton or Gumbel).

    Raises ValueError if no family could be selected (family_search is empty
    or every family was excluded).
    """
    # compute the empirical Kendall's Tau
    tau_hat = multivariate_stats.kendalls_tau(X)

    # compute empirical multinomial signature
    empirical_mnsig = empirical_copulamnsig(X, K)[0]['esig']
    # replace any 0 values w/ smallest possible float value
    empirical_mnsig[empirical_mnsig == 0] = np.spacing(1)

    # compute the multinomial signature for each of the copula families
    # specified and simultaneously compute the kullback leibler divergence
    # between the empirical and the computed, and store that info
    distances = {}
    tau_used = {}
    for family in family_search:
        # Each family works from its own copy of the estimate.  Adjusting the
        # shared tau_hat in place would leak one family's restriction into
        # the families evaluated after it and into the returned parameters.
        tau_family = tau_hat

        if family.lower() in ('clayton', 'gumbel'):
            # The Clayton and Gumbel copulas are only tested for non-negative
            # tau here.  Like any estimator, tau_hat has some variance, so a
            # value slightly below 0 is treated as 0, while a clearly
            # negative value excludes the family by assigning it an infinite
            # KL divergence.
            if tau_hat < -0.05:
                distances[family] = np.inf
                continue
            elif tau_hat < 0:
                tau_family = 0
            elif tau_hat >= 1:
                # as close to 1 as possible in our precision
                tau_family = 1 - np.spacing(1)
        # any other copula families with restrictions can go here

        mnsig = copulamnsig(family, K, 'kendall', tau_family)
        # replace any 0 values w/ smallest possible float value
        mnsig[mnsig == 0] = np.spacing(1)

        # compute KL divergence, see
        # http://docs.scipy.org/doc/scipy-dev/reference/generated/scipy.stats.entropy.html
        distances[family] = entropy(mnsig, empirical_mnsig)
        tau_used[family] = tau_family

    # search for the minimum distance, that is the optimal copula family to use
    optimalFamily = None
    minDistance = np.inf
    # items() rather than iteritems() so this runs on Python 2 and Python 3
    for family, distance in distances.items():
        if distance < minDistance:
            minDistance = distance
            optimalFamily = family

    if optimalFamily is None:
        # previously this surfaced as an unbound local variable error
        raise ValueError("No copula family could be selected from family_search "
                         "(Kendall's tau estimate: " + str(tau_hat) + ")")

    tau_optimal = tau_used[optimalFamily]
    depParams = invcopulastat(optimalFamily, 'kendall', tau_optimal)

    return (optimalFamily, depParams, tau_optimal)