def covariance(x, y):
    """Return the sample covariance of x and y.

    The type of x[0] selects the code path; all elements of x are assumed
    to share that type.

    * Plain (non-secure) data is delegated to statistics.covariance() when
      the interpreter is Python 3.10 or newer, and otherwise handled by an
      inline copy of that routine.
    * Secure fixed-point data is processed with runtime.sum() and
      runtime.in_prod().
    * Secure integer data avoids dividing by n: the deviations are scaled by
      n instead and a single division by n**2 * (n - 1) is done at the end,
      after adding half the divisor.

    Raises:
        statistics.StatisticsError: if x and y differ in length or contain
            fewer than two data points.
        TypeError: if x[0] is a SecureObject that is neither a
            SecureFixedPoint nor a SecureInteger.
    """
    n = len(x)
    if len(y) != n:
        raise statistics.StatisticsError('covariance requires that both inputs '
                                         'have same number of data points')
    if n < 2:
        raise statistics.StatisticsError('covariance requires at least two data points')

    sectype = type(x[0])  # all elts of x assumed of same type
    if not issubclass(sectype, SecureObject):
        # Compare the whole version tuple: looking at the minor number alone
        # gives the wrong answer for any major version other than 3.
        if sys.version_info >= (3, 10):
            return statistics.covariance(x, y)

        # inline code of statistics.covariance() copied from Python 3.10.0:
        xbar = fsum(x) / n
        ybar = fsum(y) / n
        sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
        return sxy / (n - 1)

    if issubclass(sectype, SecureFixedPoint):
        xbar = runtime.sum(x) / n
        ybar = runtime.sum(y) / n
        sxy = runtime.in_prod([xi - xbar for xi in x], [yi - ybar for yi in y])
        return sxy / (n - 1)

    if issubclass(sectype, SecureInteger):
        sx = runtime.sum(x)
        sy = runtime.sum(y)
        # xi * n - sx equals n * (xi - mean(x)), so sxy carries a factor n**2.
        sxy = runtime.in_prod([xi * n - sx for xi in x], [yi * n - sy for yi in y])
        d = n**2 * (n - 1)
        return (sxy + d//2) // d

    raise TypeError('secure fixed-point or integer type required')
def _correlation(x, y) -> float:
    """Compute Pearson's correlation coefficient *r* of two equal-length inputs.

    *r* lies between -1 and +1 and expresses the strength and direction of
    the linear relationship between the inputs: +1 is a very strong positive
    linear relationship, -1 a very strong negative one, and 0 means there is
    no linear relationship.

    Raises statistics.StatisticsError if the inputs differ in length, hold
    fewer than two data points, or if at least one of them is constant.
    """
    size = len(x)
    if size != len(y):
        raise statistics.StatisticsError(
            "correlation requires that both inputs have same number of data points"
        )
    if size < 2:
        raise statistics.StatisticsError(
            "correlation requires at least two data points")

    mean_x = fsum(x) / size
    mean_y = fsum(y) / size

    # Deviations from the mean, computed once and reused for all three sums.
    dev_x = [xi - mean_x for xi in x]
    dev_y = [yi - mean_y for yi in y]

    cross = fsum(dx * dy for dx, dy in zip(dev_x, dev_y))
    spread_x = fsum(dx**2.0 for dx in dev_x)
    spread_y = fsum(dy**2.0 for dy in dev_y)

    try:
        return cross / sqrt(spread_x * spread_y)
    except ZeroDivisionError:
        # A zero denominator means one of the inputs has no spread at all.
        raise statistics.StatisticsError(
            "at least one of the inputs is constant")
def linear_regression(x, y):
    """Return a (simple) linear regression model for x and y.

    The parameters of the model are returned as a named LinearRegression
    tuple, with two fields called "slope" and "intercept", respectively.

    A linear regression model describes the relationship between independent
    variable x and dependent variable y in terms of a linear function:

        y = slope * x + intercept + noise

    Here, slope and intercept are the regression parameters estimated using
    ordinary least squares, and noise represents the variability of the data
    not explained by the linear regression (it is equal to the difference
    between predicted and actual values of the dependent variable).

    The type of x[0] selects the code path; all elements of x are assumed to
    share that type. Plain data is delegated to
    statistics.linear_regression() on Python 3.10 or newer and otherwise
    handled by an inline adaptation of it. Secure fixed-point data is
    processed with runtime.sum() and runtime.in_prod().

    Raises:
        statistics.StatisticsError: if x and y differ in length, contain
            fewer than two data points, or (plain data only) x is constant.
        TypeError: if x[0] is a SecureObject that is not a SecureFixedPoint.
    """
    n = len(x)
    if len(y) != n:
        raise statistics.StatisticsError(
            'linear regression requires that both inputs '
            'have same number of data points')
    if n < 2:
        raise statistics.StatisticsError(
            'linear regression requires at least two data points')

    sectype = type(x[0])  # all elts of x assumed of same type
    if not issubclass(sectype, SecureObject):
        # Compare the whole version tuple: looking at the minor number alone
        # gives the wrong answer for any major version other than 3.
        if sys.version_info >= (3, 10):
            return statistics.linear_regression(x, y)

        # inline code of statistics.linear_regression() adapted from Python 3.10.0:
        xbar = fsum(x) / n
        ybar = fsum(y) / n
        sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
        sxx = fsum((xi - xbar)**2.0 for xi in x)
        try:
            slope = sxy / sxx  # equivalent to: covariance(x, y) / variance(x)
        except ZeroDivisionError:
            raise statistics.StatisticsError('x is constant') from None
        intercept = ybar - slope * xbar
        return LinearRegression(slope=slope, intercept=intercept)

    if issubclass(sectype, SecureFixedPoint):
        xbar = runtime.sum(x) / n
        ybar = runtime.sum(y) / n
        xxbar = [xi - xbar for xi in x]
        yybar = [yi - ybar for yi in y]
        sxy = runtime.in_prod(xxbar, yybar)
        sxx = runtime.in_prod(xxbar, xxbar)
        slope = sxy / sxx
        intercept = ybar - slope * xbar
        return LinearRegression(slope=slope, intercept=intercept)

    # TODO: implement for secure integers as well
    raise TypeError('secure fixed-point type required')
def mode(data):
    """Return the mode: the most common data point of discrete or nominal data.

    When several values share the highest frequency, the one encountered
    first in data is returned. Empty data raises StatisticsError.

    For secure data, the bit length of the sample range
    max(data) - min(data) is revealed to speed up the computation, provided
    this range is not too small.
    """
    # Work on a private list/copy: iterators are drained, sequences sliced.
    sample = list(data) if iter(data) is data else data[:]
    if len(sample) == 0:
        raise statistics.StatisticsError(
            'mode requires at least one data point')

    if not isinstance(sample[0], sectypes.SecureObject):
        # NB: raises StatisticsError in Python < 3.8 if sample is multimodal
        return statistics.mode(sample)

    return _mode(sample, PRIV=runtime.options.sec_param // 6)
def mean(data):
    """Return the sample mean (average) of data, a sequence or an iterable.

    For secure integers or secure fixed-point numbers the result has the
    same secure type, rounded to the nearest number. Plain data is passed on
    to statistics.mean().

    Raises StatisticsError if data is empty, and TypeError if the data
    points are secure finite field elements.
    """
    sample = list(data) if iter(data) is data else data
    size = len(sample)
    if size == 0:
        raise statistics.StatisticsError(
            'mean requires at least one data point')

    elem_type = type(sample[0])  # all elements assumed to be of the same type
    if issubclass(elem_type, sectypes.SecureFiniteField):
        raise TypeError('secure fixed-point or integer type required')

    if issubclass(elem_type, sectypes.SecureInteger):
        total = runtime.sum(sample)
        # Adding half the divisor first rounds to the nearest integer.
        return (total + size // 2) // size

    if issubclass(elem_type, sectypes.SecureFixedPoint):
        total = runtime.sum(sample)
        shift = size.bit_length() - 1  # 1/2 < 2**shift / size <= 1
        return total * (2**shift / size) * 2**-shift

    return statistics.mean(sample)
def _med(data, med=None):
    """Return a median of data, which may be a sequence or an iterator.

    Args:
        data: the data points; all are assumed to be of the type of the
            first one.
        med: for an even number of data points, 'low' selects the lower of
            the two middle values and 'high' the upper one; any other value
            (default None) averages the two.

    Plain (non-secure) data is handled by statistics.median_low(),
    statistics.median_high() or statistics.median(), according to med.
    Secure data is handled via _quickselect(); the average of two secure
    integers uses floor division by 2.

    Raises:
        statistics.StatisticsError: if data is empty.
        TypeError: if the data points are secure finite field elements.
    """
    if iter(data) is data:
        x = list(data)
    else:
        x = data[:]
    n = len(x)
    if not n:
        raise statistics.StatisticsError(
            'median requires at least one data point')

    stype = type(x[0])  # all elts of x assumed of same type
    if issubclass(stype, sectypes.SecureFiniteField):
        raise TypeError('secure fixed-point or integer type required')

    if not issubclass(stype, sectypes.SecureObject):
        # Honor med for plain data as well, so that a 'low'/'high' request
        # does not silently return the averaged median.
        if med == 'low':
            return statistics.median_low(x)
        if med == 'high':
            return statistics.median_high(x)
        return statistics.median(x)

    # NOTE(review): the ranks below come from true division and are thus
    # floats with integral values -- presumably accepted by _quickselect();
    # confirm against its implementation.
    if n % 2:
        return _quickselect(x, (n - 1) / 2)

    if med == 'low':
        return _quickselect(x, (n - 2) / 2)

    if med == 'high':
        return _quickselect(x, n / 2)

    # average two middle values
    s = _quickselect(x, (n - 2) / 2) + _quickselect(x, n / 2)
    if issubclass(stype, sectypes.SecureInteger):
        return s // 2

    return s / 2
def _std(data, m, correction):
    """Return the standard deviation of data.

    Args:
        data: the data points (sequence or iterator), all assumed to be of
            the type of the first one.
        m: value forwarded as the second argument to _var() or to
            statistics.stdev() / statistics.pstdev().
        correction: truthy (1) for the sample standard deviation, falsy (0)
            for the population standard deviation.

    Secure integers yield _isqrt() of the variance, secure fixed-point
    numbers _fsqrt() of the variance; plain data goes to the statistics
    module.

    Raises:
        statistics.StatisticsError: if there are fewer than 1 + correction
            data points.
        TypeError: if the data points are secure finite field elements.
    """
    sample = list(data) if iter(data) is data else data
    if len(sample) < 1 + correction:
        raise statistics.StatisticsError(
            'stdev requires at least two data points' if correction
            else 'pstdev requires at least one data point')

    elem_type = type(sample[0])  # all elements assumed to be of the same type
    if issubclass(elem_type, sectypes.SecureFiniteField):
        raise TypeError('secure fixed-point or integer type required')

    if issubclass(elem_type, sectypes.SecureInteger):
        return _isqrt(_var(sample, m, correction))

    if issubclass(elem_type, sectypes.SecureFixedPoint):
        return _fsqrt(_var(sample, m, correction))

    plain_std = statistics.stdev if correction else statistics.pstdev
    return plain_std(sample, m)
def correlation(x, y):
    """Return Pearson's correlation coefficient for x and y.

    Pearson's correlation coefficient takes values between -1 and +1.
    It measures the strength and direction of the linear relationship
    between x and y, where +1 means very strong, positive linear relationship,
    -1 very strong, negative linear relationship, and 0 no linear relationship.

    The type of x[0] selects the code path; all elements of x are assumed to
    share that type. Plain data is delegated to statistics.correlation() on
    Python 3.10 or newer and otherwise handled by an inline copy of it.
    Secure fixed-point data is processed with runtime.sum(),
    runtime.in_prod() and _fsqrt().

    Raises:
        statistics.StatisticsError: if x and y differ in length, contain
            fewer than two data points, or (plain data only) at least one of
            the inputs is constant.
        TypeError: if x[0] is a SecureObject that is not a SecureFixedPoint.
    """
    n = len(x)
    if len(y) != n:
        raise statistics.StatisticsError(
            'correlation requires that both inputs '
            'have same number of data points')
    if n < 2:
        raise statistics.StatisticsError(
            'correlation requires at least two data points')

    sectype = type(x[0])  # all elts of x assumed of same type
    if not issubclass(sectype, SecureObject):
        # Compare the whole version tuple: looking at the minor number alone
        # gives the wrong answer for any major version other than 3.
        if sys.version_info >= (3, 10):
            return statistics.correlation(x, y)

        # inline code of statistics.correlation() copied from Python 3.10.0:
        xbar = fsum(x) / n
        ybar = fsum(y) / n
        sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
        sxx = fsum((xi - xbar)**2.0 for xi in x)
        syy = fsum((yi - ybar)**2.0 for yi in y)
        try:
            return sxy / sqrt(sxx * syy)
        except ZeroDivisionError:
            raise statistics.StatisticsError(
                'at least one of the inputs is constant') from None

    if issubclass(sectype, SecureFixedPoint):
        xbar = runtime.sum(x) / n
        ybar = runtime.sum(y) / n
        xxbar = [xi - xbar for xi in x]
        yybar = [yi - ybar for yi in y]
        sxy = runtime.in_prod(xxbar, yybar)
        sxx = runtime.in_prod(xxbar, xxbar)
        syy = runtime.in_prod(yybar, yybar)
        return sxy / (_fsqrt(sxx) * _fsqrt(syy))

    raise TypeError('secure fixed-point type required')
def _presorted_median(data):
    """Return the median of data, which the caller must already have sorted.

    For an odd number of items the middle item is returned unchanged; for an
    even number, the two middle items are added and divided by 2.

    Raises statistics.StatisticsError if data is empty.
    """
    size = len(data)
    if not size:
        raise statistics.StatisticsError('no median for empty data')

    half, is_odd = divmod(size, 2)
    if is_odd:
        return data[half]
    return (data[half - 1] + data[half]) / 2
def singlemode(data):
    """Return the unique mode of data.

    Uses statistics.multimode() where available (new in Python 3.8) and
    falls back to statistics.mode() otherwise.

    Raises:
        statistics.StatisticsError: if data is empty or if more than one
            value shares the highest frequency.
    """
    try:
        # Only the attribute lookup is guarded, so an AttributeError raised
        # while processing data is not mistaken for a missing multimode().
        multimode = statistics.multimode  # New in Python 3.8
    except AttributeError:
        return statistics.mode(data)

    modes = multimode(data)
    if not modes:
        # multimode() returns [] for empty data; report it like mode() does
        # rather than failing with IndexError on modes[0].
        raise statistics.StatisticsError('no mode for empty data')
    if len(modes) > 1:
        raise statistics.StatisticsError('no unique mode')
    return modes[0]
def _quantiles(data, *, n=4, method="exclusive") -> List:
    """Split *data* into *n* continuous intervals of equal probability.

    The result is a list of (n - 1) cut points separating the intervals.
    Use n=4 for quartiles (the default), n=10 for deciles, and n=100 for
    percentiles, which gives the 99 cut points that separate *data* into
    100 equal sized groups.

    *data* can be any iterable containing sample data; it is sorted
    internally. Cut points are linearly interpolated between data points.

    With *method* set to "inclusive", *data* is treated as population data:
    the minimum value is the 0th percentile and the maximum value the 100th
    percentile.

    Raises statistics.StatisticsError if n < 1 or if there are fewer than two
    data points, and ValueError for an unknown *method*.
    """
    if n < 1:
        raise statistics.StatisticsError("n must be at least 1")
    ordered = sorted(data)
    size = len(ordered)
    if size < 2:
        raise statistics.StatisticsError("must have at least two data points")

    cuts = []
    if method == "inclusive":
        span = size - 1
        for step in range(1, n):
            idx, frac = divmod(step * span, n)
            lower, upper = ordered[idx], ordered[idx + 1]
            cuts.append((lower * (n - frac) + upper * frac) / n)
        return cuts

    if method == "exclusive":
        span = size + 1
        for step in range(1, n):
            # rescale step to span/n, then clamp to 1 .. size-1
            idx = min(max(step * span // n, 1), size - 1)
            frac = step * span - idx * n  # exact integer math
            lower, upper = ordered[idx - 1], ordered[idx]
            cuts.append((lower * (n - frac) + upper * frac) / n)
        return cuts

    raise ValueError(f"Unknown method: {method!r}")
def get_median(self, full_tree: bool = True, sub_tree=None) -> float:
    """
    Calculates the median of the values collected from a tree.

    Args:
        full_tree (bool): if True, self.root is used and sub_tree is ignored;
            otherwise the tree given by sub_tree is used
        sub_tree (Node|None): subtree to use when full_tree is False

    Returns:
        statistics.median() of the values returned by
        self._get_subtree_values() for the selected tree

    Raises:
        statistics.StatisticsError: when collecting the values or computing
            their median raises StatisticsError (e.g. there are no values)
    """
    target = self.root if full_tree else sub_tree
    try:
        values = self._get_subtree_values(target)
        median = statistics.median(values)
    except statistics.StatisticsError:
        raise statistics.StatisticsError(
            'Cannot calculate median from an empty Tree')
    return median
def _var(data, m, correction):
    """Return the variance of data.

    Args:
        data: the data points (sequence or iterator), all assumed to be of
            the type of the first one.
        m: the mean to use, or None to have it derived from the data. For
            plain data it is forwarded to statistics.variance() /
            statistics.pvariance().
        correction: truthy (1) for the sample variance, falsy (0) for the
            population variance; it is subtracted from the number of data
            points in the divisor.

    Raises:
        statistics.StatisticsError: if there are fewer than 1 + correction
            data points.
        TypeError: if the data points are secure finite field elements.
    """
    sample = list(data) if iter(data) is data else data
    size = len(sample)
    if size < 1 + correction:
        raise statistics.StatisticsError(
            'variance requires at least two data points' if correction
            else 'pvariance requires at least one data point')

    elem_type = type(sample[0])  # all elements assumed to be of the same type
    if issubclass(elem_type, sectypes.SecureFiniteField):
        raise TypeError('secure fixed-point or integer type required')

    if issubclass(elem_type, sectypes.SecureInteger):
        if m is not None:
            # TODO: runtime.vector_sub(x,y) for scalar y
            deviations = runtime.vector_sub(sample, [m] * size)
            divisor = size - correction
        else:
            total = runtime.sum(sample)
            # Deviations scaled by size, so no division by size is needed;
            # the divisor carries the extra size**2 factor instead.
            # TODO: runtime.scalar_mul(n,x) for public (int) n
            deviations = [a * size - total for a in sample]
            divisor = size**2 * (size - correction)
        return (runtime.in_prod(deviations, deviations) + divisor // 2) // divisor

    if issubclass(elem_type, sectypes.SecureFixedPoint):
        if m is None:
            m = mean(sample)
        deviations = runtime.vector_sub(sample, [m] * size)
        divisor = size - correction
        return runtime.in_prod(deviations, deviations) / divisor

    plain_var = statistics.variance if correction else statistics.pvariance
    return plain_var(sample, m)
def doCalc(G, dyna, do, counter, results):
    """Compute PageRank statistics for G and append a report to results.

    The PageRank scores of G (alpha=0.85) are summarized: how many scores
    lie below, at, or above the uniform average 1 / number_of_nodes(G), the
    extreme scores and how often each occurs, and a set of descriptive
    statistics from the statistics module. The summary is written to the
    open file object results.

    Before writing, the size of results/testResults<counter>.txt under the
    current working directory is checked; once it has reached 500000 bytes,
    results is closed, counter is incremented and a new file is opened.

    Args:
        G: graph passed to nx.pagerank() and number_of_nodes().
        dyna, do: labels included in the report header line.
        counter: index of the result file currently in use.
        results: writable file object for the current result file.

    Returns:
        The tuple (counter, results); both differ from the arguments if the
        output file was rotated. The caller owns the returned file object
        and is responsible for closing it.
    """
    def result_path(index):
        # Location of the result file with the given index.
        return os.getcwd() + "/results/testResults" + str(index) + ".txt"

    pr = nx.pagerank(G, alpha=0.85)
    avg = float(1) / float(number_of_nodes(G))
    items = list(pr.values())

    # The total is accumulated by plain left-to-right addition so that the
    # reported value does not depend on how built-in sum() adds floats.
    work = 0
    underavg = isavg = aboveavg = 0
    for value in items:
        work = work + value
        if value < avg:
            underavg += 1
        elif value == avg:
            isavg += 1
        else:
            aboveavg += 1

    # Take the extremes over all scores. Tracking them only among the
    # below/above-average scores leaves a sentinel in place whenever no
    # score lies strictly on that side of the average.
    minv = min(items)
    maxv = max(items)
    # Counted independently, so a score that is both the minimum and the
    # maximum (all scores equal) is included in both counts.
    onemin = items.count(minv)
    onemax = items.count(maxv)

    # All statistics are computed before any output is produced, so a
    # StatisticsError leaves the result file untouched.
    report = [
        ("average", avg),
        ("total", work),
        ("isavg", isavg),
        ("underavg", underavg),
        ("aboveavg", aboveavg),
        ("minimum", minv),
        ("maximum", maxv),
        ("oneminimum", onemin),
        ("onemaximum", onemax),
        ("median", statistics.median(items)),
        ("low median", statistics.median_low(items)),
        ("high median", statistics.median_high(items)),
        ("grouped median", statistics.median_grouped(items)),
        ("mode", statistics.mode(items)),
        ("pstdev", statistics.pstdev(items)),
        ("pvariance", statistics.pvariance(items)),
        ("stdev", statistics.stdev(items)),
        ("variance", statistics.variance(items)),
        # Raw dump of all scores (a lot of data; mainly useful when
        # testing). Same text as str(statistics.StatisticsError(items)).
        ("error", items),
    ]

    if os.stat(result_path(counter)).st_size >= 500000:
        results.close()
        counter = counter + 1
        # Deliberately not a with-block: the new file is handed back to the
        # caller for subsequent reports.
        results = open(result_path(counter), 'w')
        print(counter)

    results.write('\n')
    results.write(f'testdata without {dyna} and {do}\n')
    results.write(json.dumps(pr) + '\n')
    for label, value in report:
        results.write(f"{label}: {value!s}\n")

    print("succes writing to test " + str(counter) + " result")
    return counter, results
def quantiles(data, *, n=4, method='exclusive'):
    """Divide data into n continuous intervals with equal probability.

    Returns a list of n-1 cut points separating the intervals.

    Use n=4 for quartiles (the default), n=10 for deciles, and n=100 for
    percentiles, which gives the 99 cut points that separate data into 100
    equal sized groups.

    The data can be any iterable containing samples. The cut points are
    linearly interpolated between data points.

    With method set to 'inclusive', data is treated as population data: the
    minimum value is the 0th percentile (lowest quantile) and the maximum
    value the 100th percentile (highest quantile).

    Plain data is passed on to statistics.quantiles(). For secure data only
    the order statistics that are actually needed are obtained, through a
    single _quickselect() call.

    Raises:
        statistics.StatisticsError: if n < 1 or if there are fewer than two
            data points.
        TypeError: if the data points are secure but neither fixed-point
            numbers nor integers.
        ValueError: if method is unknown (secure data).
    """
    if n < 1:
        raise statistics.StatisticsError('n must be at least 1')

    x = list(data) if iter(data) is data else data
    ld = len(x)
    if ld < 2:
        raise statistics.StatisticsError('must have at least two data points')

    sectype = type(x[0])  # all elts of x assumed of same type
    if not issubclass(sectype, SecureObject):
        return statistics.quantiles(x, n=n, method=method)

    if issubclass(sectype, SecureFixedPoint):
        def div_n(a):
            return a / n
    elif issubclass(sectype, SecureInteger):
        def div_n(a):
            # Adding half the divisor first rounds to the nearest integer.
            return (a + n//2) // n
    else:
        raise TypeError('secure fixed-point or integer type required')

    if method == 'inclusive':
        m = ld - 1
        # (j, delta) for each of the n-1 cut points.
        steps = [divmod(i * m, n) for i in range(1, n)]

        # Determine which kth order statistics will actually be used; the
        # dict keeps them unique and in order of first use.
        wanted = {}
        for j, delta in steps:
            wanted[j] = None
            if delta:
                wanted[j+1] = None
        selected = dict(zip(wanted, _quickselect(x, list(wanted))))

        # Compute the n-1 cut points for the n quantiles.
        cuts = []
        for j, delta in steps:
            point = selected[j]
            if delta:
                point += div_n((selected[j+1] - selected[j]) * delta)
            cuts.append(point)
        return cuts

    if method == 'exclusive':
        m = ld + 1
        # (j, delta) for each of the n-1 cut points.
        steps = []
        for i in range(1, n):
            j = min(max(i * m // n, 1), ld - 1)  # clamp to 1 .. ld-1
            steps.append((j, i*m - j*n))

        # Determine which kth order statistics will actually be used; the
        # dict keeps them unique and in order of first use.
        wanted = {}
        for j, delta in steps:
            if delta != n:
                wanted[j-1] = None
            if delta:
                wanted[j] = None
        selected = dict(zip(wanted, _quickselect(x, list(wanted))))

        # Compute the n-1 cut points for the n quantiles.
        cuts = []
        for j, delta in steps:
            if delta == 0:
                point = selected[j-1]
            elif delta == n:
                point = selected[j]
            else:  # NB: possibly delta<0 or delta>n
                point = selected[j-1] + div_n((selected[j] - selected[j-1]) * delta)
            cuts.append(point)
        return cuts

    raise ValueError(f'Unknown method: {method!r}')