def _delayEmbed(series, tau):
    """Return the three tau-lagged views (t, t+tau, t+2*tau) of *series*.

    Explicit end indices are used instead of negative ones because
    ``series[:-0]`` is an empty slice, which made a lag of 0 plot nothing.
    """
    end = max(len(series) - 2 * tau, 0)
    return series[:end], series[tau:end + tau], series[2 * tau:]


def run(dataDefFile, var1, var2, datapoints):
    """Plot 3-D manifolds built from the series of a data (definition) file.

    Parameters:
        dataDefFile: path ending in ``.py`` (handed to ``synthDataGen.run`` to
            produce the data file) or ``.csv`` (read directly).  Any other
            extension prints a message and returns None without plotting.
        var1, var2: names of two series.  When ``var1`` is None, up to four
            curves are drawn, each from three consecutive series names (in
            sorted order).  Otherwise one delay-embedded curve is drawn for
            each of ``var1`` and ``var2`` using the lag ``TAU``.
        datapoints: forwarded to ``synthDataGen.run`` and used as the
            ``limit`` of ``getData.DataReader``.

    ``TAU`` and ``standard`` are read from outside this function; when
    ``standard`` is truthy every series is passed through
    ``standardize.standardize`` before plotting.
    """
    tau = TAU
    print('Generating data using: ', dataDefFile)
    fileExt = dataDefFile.split('.')[-1]
    if fileExt == 'py':
        # Data Definition File
        outFile = synthDataGen.run(dataDefFile, datapoints)
    elif fileExt == 'csv':
        # Data file
        outFile = dataDefFile
    else:
        print('*** Invalid file type = ', fileExt)
        return
    d = getData.DataReader(outFile, limit=datapoints)
    fig = plt.figure()
    # `ax` is a 3D-aware axis instance because of the projection='3d' keyword.
    ax = fig.add_axes([.1, .1, .8, .8], projection='3d')
    # If var1 and var2 are not specified, then build a master manifold out of
    # three vars at a time.  Otherwise, build shadow manifolds from the two
    # specified vars.
    if var1 is None:
        # sorted() leaves the list returned by the reader untouched.
        names = sorted(d.getSeriesNames())
        print('Vars = ', names)
        for i in range(4):
            first = i * 3
            if len(names) < first + 3:
                break
            X = d.getSeries(names[first])
            Y = d.getSeries(names[first + 1])
            Z = d.getSeries(names[first + 2])
            if standard:
                X = standardize.standardize(X)
                Y = standardize.standardize(Y)
                Z = standardize.standardize(Z)
            ax.plot(X, Y, Z)
    else:
        var1D = d.getSeries(var1)
        if standard:
            var1D = standardize.standardize(var1D)
        X1, Y1, Z1 = _delayEmbed(var1D, tau)
        var2D = d.getSeries(var2)
        if standard:
            var2D = standardize.standardize(var2D)
        X2, Y2, Z2 = _delayEmbed(var2D, tau)
        ax.plot(X1, Y1, Z1)
        ax.plot(X2, Y2, Z2, 'r')
    plt.show()
def similarity_chk(inputFile1, inputFile2):
    """Return the bigram resemblance score of two files.

    Each input is first passed through ``standardize.standardize``; the value
    it returns is opened as a UTF-8 text file (undecodable bytes are ignored).
    The score is ``len(shared bigrams) / len(all bigrams)`` after both lists
    have gone through ``uniq`` (presumably de-duplication -- confirm).

    Parameters:
        inputFile1, inputFile2: values accepted by ``standardize.standardize``.

    Returns:
        float: the resemblance score.

    Raises:
        ZeroDivisionError: when neither file yields any bigram.
        NOTE(review): decide which score two empty inputs should get.
    """
    def _read_standardized(path):
        # The context manager closes the handle; it was previously leaked.
        std_path = standardize.standardize(path)
        with codecs.open(std_path, encoding='utf-8', errors='ignore') as handle:
            return handle.read()

    file1 = _read_standardized(inputFile1)
    file2 = _read_standardized(inputFile2)

    ngram1 = ngrams(file1, 2)  # [['a', 'b'], ['b', 'c'], ['c', 'd']]
    ngram2 = ngrams(file2, 2)
    l1 = len(ngram1)
    l2 = len(ngram2)

    # Single-argument print(...) prints the same on Python 2 and Python 3.
    print("l1")
    print(l1)
    print("l2")
    print(l2)

    interList = [i for i in ngram1 if i in ngram2]
    unionList = uniq(ngram1 + ngram2)
    interList = uniq(interList)

    similar = len(interList)
    total = len(unionList)
    print("similar")
    print(similar)
    print("total")
    print(total)

    threshold = float(similar) / total
    print("resemblence score ")
    print(threshold)
    return threshold
def __init__(self, s1, s2, tau=2, dim=4):
    """Store two standardized series and build their shadow manifolds.

    Parameters:
        s1, s2: the two series; each is stored after being passed through
            ``standardize.standardize``.
        tau: lag, stored as ``self.tau``.
        dim: dimension, stored as ``self.dim``.

    ``self.sLen`` is ``len(s1) - (dim - 1) * tau``, computed from the raw
    ``s1``.  Construction ends by calling ``self._generateShadowManifolds()``.
    """
    self.s1 = standardize.standardize(s1)
    self.s2 = standardize.standardize(s2)
    self.tau, self.dim = tau, dim
    self.pointsToTest = []
    lost_to_lags = (self.dim - 1) * self.tau
    self.sLen = len(s1) - lost_to_lags
    self._generateShadowManifolds()
def translate(self):
    """Translate this object's TeX content and return ``self.origin``.

    Prints ``self.origin`` before and after ``standardize(self)``, then calls
    the handler of every TeX command found in ``self.set_of_tex`` (in the
    fixed order below), then ``mul_integer_symbol`` and ``convert_integer``.
    All helpers receive ``self``.
    """
    # The order is very important!, which still needs more consideration
    handlers_in_order = (
        ("\\frac", frac),
        ("\\int", integrate),
        ("\\partial", partial),
    )

    print(self.origin)
    standardize(self)
    print(self.origin)

    for command, handler in handlers_in_order:
        # Membership is re-read on every pass, as handlers receive self.
        if command in self.set_of_tex:
            handler(self)

    mul_integer_symbol(self)
    convert_integer(self)
    return self.origin
def get_weight_vector(feature_matrix, output, lambda_reg, p):
    """Fit a p-norm regularized regression weight vector.

    Parameters:
        feature_matrix: an n x m 2-D numpy array where n is the number of
            samples and m the feature size.  It is deep-copied before being
            standardized, so the caller's array is not passed to
            ``standardize``.
        output: an n x 1 numpy array representing the outputs for the n
            samples.
        lambda_reg: regularization parameter.
        p: p-norm for the regularized regression; must satisfy 1 <= p <= 2.

    Returns:
        (w, check): ``w`` is the value returned by ``L2`` (p == 2), ``L1``
        (p == 1) or ``pNorm`` (otherwise) -- per the original notes an m x 1
        weight vector obtained through stochastic gradient descent, such that
        the product of the feature matrix with it gives the n x 1 regression
        outputs.  ``check`` is True.  For an out-of-range ``p`` the result is
        ``([], False)`` and nothing is fitted.
    """
    if p < 1 or p > 2:
        return [], False

    x_train = standardize(copy.deepcopy(feature_matrix))

    # Train against the `output` argument; the previous code read a name
    # `y_train` that is not defined in this function.
    if p == 2:
        w = L2(x_train, output, lambda_reg)
    elif p == 1:
        w = L1(x_train, output, lambda_reg)
    else:
        w = pNorm(x_train, output, lambda_reg, p)
    return w, True
def translate(self):
    """Run the TeX translation passes over ``self`` and return ``self.origin``.

    ``self.origin`` is printed before and after ``standardize(self)``.  Each
    TeX command present in ``self.set_of_tex`` then has its handler applied,
    followed by ``mul_integer_symbol(self)`` and ``convert_integer(self)``.
    """
    directory_of_function = {
        '\\frac': frac,
        '\\int': integrate,
        '\\partial': partial,
    }
    # The order is very important!, which still needs more consideration
    ordered_commands = ('\\frac', '\\int', '\\partial')

    print(self.origin)
    standardize(self)
    print(self.origin)

    for command in ordered_commands:
        if command not in self.set_of_tex:
            continue
        apply_command = directory_of_function[command]
        apply_command(self)

    mul_integer_symbol(self)
    convert_integer(self)
    return self.origin
def direction(rvA, rvB):
    """
    Test the causal direction between variables A and B using the pairwise
    LiNGAM algorithm (Hyperbolic Tangent variant).

    Both inputs are standardized first.  The result is
    R = rho * mean(a * tanh(b) - b * tanh(a)), where rho is the correlation
    coefficient of the two standardized series.

    A positive R indicates that the causal path runs from A toward B.  A
    negative value indicates a causal path from B towards A.  Values close to
    zero (e.g. +/- 10**-5) mean that the causal direction could not be
    determined.
    """
    s1 = standardize(rvA)
    s2 = standardize(rvB)

    # Pairwise Lingam Algorithm (Hyperbolic Tangent (HT) variant)
    running_total = 0
    for idx, a in enumerate(s1):
        b = s2[idx]
        running_total += a * math.tanh(b) - b * math.tanh(a)
    mean_cumulant = running_total / float(len(s1))

    rho = np.corrcoef([s1, s2])[1, 0]
    return rho * mean_cumulant
def solve_LP(LP_matrix, var_constraints, func_constraints, verbose=False):
    """
    Solve a linear program in two phases.

    The problem is first rewritten by ``standardize``.  If that introduced
    phase-1 slack variables, a phase-1 problem (objective row = 1 on those
    variables, 0 elsewhere) is solved to test feasibility before the original
    objective row is restored and phase 2 is solved.

    :param LP_matrix: the matrix of the LP, size m+1 x n+1
    :param var_constraints: variable constraints, 1: >=0, -1: <= 0, 0: no constraint, size: n
    :param func_constraints: functional constraint, 1: >=0, -1: <= 0, 0: no constraint, size: m
    :param verbose: Bool, if true, additional message is printed
    :return: (solution, solution_value). solution is a n-dimensional array, solution_value is the Z value of the solution
    :raises InfeasibleError: when the phase-1 optimum exceeds ``epsilon``
    """
    # Number of original variables: every column except the last one.
    _, columns = LP_matrix.shape
    var_number = columns - 1
    if verbose:
        print("the input of the problem is: LP_matrix = \n{}\nvar_constraints = {}, func_constraints = {}"
              .format(LP_matrix, var_constraints, func_constraints))
    # NOTE(review): the index arrays returned here are presumably positions of
    # the variables standardize substituted or added -- confirm against it.
    LP_matrix, neg_constraint_vars, no_constraint_vars, nolimit_extra, phase1_slack_vars, bases \
        = standardize(LP_matrix, var_constraints, func_constraints, verbose)
    if phase1_slack_vars.size:
        rows, columns = LP_matrix.shape
        # Save the real objective row (all but the last column) ...
        c = LP_matrix[0, :columns-1].copy()
        # ... and replace it with the phase-1 objective.
        LP_matrix[0] = 0
        LP_matrix[0, phase1_slack_vars] = 1
        if verbose:
            print("enter phase 1, input for phase 1 is \n{}".format(LP_matrix))
        transform_canonical(LP_matrix, bases, verbose)
        solution, solution_value = solve_canonical_LP(LP_matrix, bases, verbose)
        if solution_value > epsilon:
            raise InfeasibleError("phase 1 failed. The LP is not feasible")
        # Restore the real objective and zero the phase-1 slack columns.
        LP_matrix[0, :columns-1] = c
        LP_matrix[:, phase1_slack_vars] = 0
        if verbose:
            print("phase 1 end, input for phase 2 is \n{}".format(LP_matrix))
    transform_canonical(LP_matrix, bases, verbose)
    if verbose:
        print("enter phase 2")
    solution, solution_value = solve_canonical_LP(LP_matrix, bases, verbose)
    # Map the solution back to the original variables: flip the sign of the
    # entries listed in neg_constraint_vars, subtract the nolimit_extra entries
    # from the no_constraint_vars entries, and keep the first var_number values.
    solution[neg_constraint_vars] = -solution[neg_constraint_vars]
    if no_constraint_vars.size:
        solution[no_constraint_vars] -= solution[nolimit_extra]
    solution = solution[:var_number]
    if verbose:
        print("original solution is {}, value is {}".format(solution, solution_value))
    return solution, solution_value
def __init__(self, rvList, data=None):
    """Build the causal graph and its probability spaces.

    Parameters:
        rvList: iterable of variable objects exposing ``name`` and
            ``parentNames``; one node is created per name and one edge
            ``(parent, name)`` per listed parent.
        data: mapping of variable name to its data (anything offering
            ``keys()`` and item access).  Defaults to a new empty dict.

    Raises:
        ValueError: when two entries of ``rvList`` share a name.

    Attributes set: ``g`` (networkx.DiGraph), ``rvDict`` (name -> variable),
    ``rvList`` (sorted names), ``data``, ``edgeDict`` (node -> edges touching
    it), ``prob`` (ProbSpace over the raw data, power=1), ``iProb`` (ProbSpace
    over standardized data, used for independence testing), ``bdCache`` and
    ``fdCache`` (empty dicts).
    """
    # A `{}` default would be one dict shared by every instance built
    # without data, so create a fresh one per call instead.
    if data is None:
        data = {}
    self.g = networkx.DiGraph()
    self.rvDict = {}
    for rv in rvList:
        if rv.name in self.rvDict:
            # Raising a plain string is itself a TypeError on Python 3.
            raise ValueError('Duplicate variable name = ' + rv.name)
        self.rvDict[rv.name] = rv
    self.rvList = sorted(self.rvDict)
    self.g.add_nodes_from(self.rvDict.keys())
    edges = [(pa, rv.name) for rv in rvList for pa in rv.parentNames]
    self.g.add_edges_from(edges)
    self.data = data
    # Index each edge under both of its endpoints.
    self.edgeDict = {}
    for edge in self.g.edges():
        s, d = edge
        self.edgeDict.setdefault(s, []).append(edge)
        self.edgeDict.setdefault(d, []).append(edge)
    # Create a probability space object for later use
    self.prob = Prob.ProbSpace(self.data, power=1)
    # Create a separate standardized probability space for independence testing.
    iData = {var: standardize(self.data[var]) for var in self.data.keys()}
    self.iProb = Prob.ProbSpace(iData)
    self.bdCache = {}
    self.fdCache = {}
from synth import synthDataGen from standardize import standardize import time METHOD = 'prob' #METHOD = 'fcit' POWER = 1 print('power = ', POWER) args = sys.argv test = 'Probability/Test/models/indCalibrationDat.csv' r = getData.DataReader(test) dat = r.read() vars = dat.keys() for var in vars: dat[var] = standardize(dat[var]) #print('dat = ', dat) ps = Prob.ProbSpace(dat, power=POWER) # List a variety of independent relationships indeps = [ ('L1', 'L2'), ('L2', 'L3'), ('L1', 'L3'), ('E1', 'E2'), ('N1', 'N2'), ('L4', 'L5'), ('L5', 'L6'), ('L4', 'N3'), ('B', 'D', ['A']), ('A', 'C', ['B', 'D']),
def TestModel(self, data=None, order=3):
    """Test the model against the data and return a confidence report.

    Every dependency returned by ``self.computeDependencies(order)`` is
    checked with ``independence.test`` on a standardized copy of
    ``self.data``; then the causal directions from
    ``self.testAllDirections()`` are checked.

    Test types (index into the per-type lists):
        0 -- unconditional test between two exogenous variables
        1 -- expected independence
        2 -- expected dependence
        3 -- causal direction

    A p-value below .5 is treated as evidence of dependence.  Results between
    .25 and .75 on the wrong side produce warnings, beyond that errors (type 0
    always produces an error).

    Confidence starts at 1.0, loses ``errors / tests / 4`` for each test type
    and ``.0025`` per warning, and is floored at 0.0.

    NOTE(review): ``data`` is defaulted to ``self.data`` below but is not read
    afterwards in this method; the tests always use ``self.data``.

    Returns:
        (confidence, numTotalTests, numTestsPerType, numErrsPerType, errors,
        warnings), where ``numTotalTests`` is ``len(deps)`` and the entries of
        ``errors`` / ``warnings`` are tuples
        ``(testType, [x], [y], z, isDep, pval_or_rho, message)``.
    """
    # Standardize the data before doing independence testing
    warningPenalty = .0025
    iData = {}
    for var in self.data.keys():
        iData[var] = standardize(self.data[var])
    ps = Prob.ProbSpace(iData)
    numTestTypes = 4
    errors = []
    warnings = []
    if data is None:
        data = self.data
    numTestsPerType = [0] * numTestTypes
    numErrsPerType = [0] * numTestTypes
    deps = self.computeDependencies(order)
    if VERBOSE:
        print('Testing Model for', len(deps), 'Independencies')
    for dep in deps:
        x, y, z, isDep = dep
        if z is None:
            z = []
        pval = independence.test(ps, [x], [y], z, power=1)
        # NOTE(review): printed unconditionally, unlike the VERBOSE-gated output.
        print(x, y, z)
        errStr = None
        warnStr = None
        testType = -1
        # Classify the test; see the type table in the docstring.
        if not z and self.isExogenous(x) and self.isExogenous(y):
            testType = 0
        elif not isDep:
            testType = 1
        else:
            testType = 2
        numTestsPerType[testType] += 1
        if testType == 0 and pval < .5:
            errStr = 'Error (Type 0 -- Exogenous variables not independent) -- Expected: ' + self.formatDependency(
                dep) + ' but dependence was detected. P-val = ' + str(pval)
        elif testType == 2 and pval > .5:
            if pval > .75:
                errStr = 'Error (Type 2 -- Unexpected independence) -- Expected: ' + self.formatDependency(
                    dep) + ' but no dependence detected.  P-val = ' + str(
                        pval)
            else:
                warnStr = 'Warning (Type 2 -- Unexpected independence) -- Expected: ' + self.formatDependency(
                    dep
                ) + ' but minimal dependence detected. P-val = ' + str(
                    pval)
        elif testType == 1 and pval < .5:
            if pval < .25:
                errStr = 'Error (Type 1 -- Unexpected dependence) -- Expected: ' + self.formatDependency(
                    dep) + ' but dependence was detected. P-val = ' + str(
                        pval)
            else:
                warnStr = 'Warning (Type 1 -- Unexpected dependence) -- Expected: ' + self.formatDependency(
                    dep
                ) + ' but some dependence was detected. P-val = ' + str(
                    pval)
        if errStr:
            if VERBOSE:
                print('***', errStr)
            errors.append(
                (testType, [x], [y], list(z), isDep, pval, errStr))
            numErrsPerType[testType] += 1
        if warnStr:
            if VERBOSE:
                print('*', warnStr)
            warnings.append(
                (testType, [x], [y], list(z), isDep, pval, warnStr))
        elif VERBOSE:
            print('.', )
    # Now test directionalities
    testType = 3
    dresults = self.testAllDirections()
    # NOTE(review): derrs is counted but not read again in this method.
    derrs = 0
    for dresult in dresults:
        isError, cause, effect, rho = dresult
        if isError:
            derrs += 1
            # A near-zero rho is reported as a warning, anything else as an error.
            if abs(rho) < .0001:
                resStr = 'True direction could not be verified.'
                warnStr = 'Warning (Type 3 -- Incorrect Causal Direction) between ' + cause + ' and ' + effect + '. ' + resStr + '.  Rho = ' + str(
                    rho)
                warnings.append(
                    (testType, [cause], [effect], [], False, rho, warnStr))
                if VERBOSE:
                    print('*', warnStr)
            else:
                resStr = 'Direction appears to be reversed.'
                errStr = 'Error (Type 3 -- Incorrect Causal Direction) between ' + cause + ' and ' + effect + '. ' + resStr + '.  Rho = ' + str(
                    rho)
                errors.append(
                    (testType, [cause], [effect], [], False, rho, errStr))
                if VERBOSE:
                    print('***', errStr)
                numErrsPerType[testType] += 1
        numTestsPerType[testType] += 1
    confidence = 1.0
    failurePenaltyPerType = [1, 1, 1, 1]
    # NOTE(review): errorRatios is filled in but not returned or read later.
    errorRatios = [0.0] * numTestTypes
    for i in range(numTestTypes):
        nTests = numTestsPerType[i]
        nErrs = numErrsPerType[i]
        if nTests > 0:
            ratio = nErrs / nTests
            errorRatios[i] = ratio
            confidence -= ratio * failurePenaltyPerType[i] / numTestTypes
    warnPenalty = len(warnings) * warningPenalty
    confidence -= warnPenalty
    confidence = max([confidence, 0.0])
    # Direction tests (type 3) are not included in this total.
    numTotalTests = len(deps)
    if VERBOSE:
        print('Model Testing Completed with', len(errors), 'error(s) and',
              len(warnings), ' warning(s).  Confidence = ',
              round(confidence * 100, 1), '%')
    return (confidence, numTotalTests, numTestsPerType, numErrsPerType,
            errors, warnings)
def _etl(raw, extracted, parsed, standardized):
    """Run the extract, parse and standardize stages, in that order.

    Each stage is called with two arguments, and the second argument of one
    stage is the first argument of the next:
    ``raw -> extracted -> parsed -> standardized``.
    NOTE(review): presumably each pair is (source, destination) -- confirm
    against the stage functions.  Nothing is returned.
    """
    extract(raw, extracted)
    parse(extracted, parsed)
    standardize(parsed, standardized)
Return: your best m x 1 numpy array weight vector used to predict the output for the kaggle competition. The matrix product of the feature_matrix, obtained from get_feature_matrix() call with file as test_features.csv, with this weight vector should result in you best prediction for the test dataset. """ with open('my_w_best', 'rb') as f: w_best = pickle.load(f) return w_best check = True #Imposing check condition in case invalid p is given feature_matrix = get_feature_matrix('train.csv') y_train = get_output('train.csv') lambda_reg = 3 #Change value of regularization parameter here p = 1.5 #Change value of p here #Uncomment line 94 to try for specific p and regularization parameter(lambda_reg) and comment out line 96 #w,check=get_weight_vector(feature_matrix, y_train, lambda_reg, p) if (check): w = get_my_best_weight_vector() x_test = give_features('test_features.csv') x_test = standardize(x_test) output_values = np.dot(x_test, w) y_test = (output_values).astype('int') msg = save_as_csv(y_test) print(msg) else: print('Invalid p given. 1<=p<=2')