def probYearGivenNames(names, r=(2012 - 50, 2012 - 11)):
    """
    Given a set of names (first names), return p(year|names), i.e. the
    probability that a random user from the set was born in each year.

    Since the year is a variable, the result is an array in which each
    element holds the probability for the corresponding year. The range
    specified by 'r' is inclusive: the output array runs from r[0] through
    r[1].

    Args:
        names: iterable of first names.
        r: two-element tuple or list (first_year, last_year), inclusive.

    Returns:
        The per-name arrays from probYearGivenName combined with vector_sum
        and rescaled so that the entries sum to 1. If the combined array sums
        to 0 it is returned unscaled. If 'names' is empty, an all-zero array
        with one entry per year of 'r' is returned.

    Raises:
        AssertionError: if 'r' does not have length 2 or r[0] > r[1].
    """
    assert len(r) == 2  # r must be a tuple or a list of length 2
    assert r[0] <= r[1]

    prob_year_sum = None
    for name in names:
        prob_year = probYearGivenName(name, r[0], r[1])
        if prob_year_sum is None:
            prob_year_sum = prob_year
        else:
            prob_year_sum = vector_sum(prob_year_sum, prob_year)

    if prob_year_sum is None:
        # No names at all: there is nothing to accumulate, and sum(None)
        # would raise TypeError. Return the same kind of "no information"
        # result the zero-sum case below produces.
        # Assumes probYearGivenName yields one entry per year of the
        # inclusive range, as the docstring above states.
        return [0.0] * (r[1] - r[0] + 1)

    # Finally, normalize the array so that its sum is 1. An all-zero array
    # cannot be normalized and is returned as is.
    sum_prob = sum(prob_year_sum)
    if sum_prob != 0:
        factor = 1 / float(sum_prob)
        prob_year_sum = vector_scalar_product(prob_year_sum, factor)
    return prob_year_sum
def probYearGivenWeightedNames(wnames, r):
    """
    Compute p(year|names) for a set of names that carry weights, where year
    is between r[0] and r[1], inclusive.

    The data structure of wnames is as follows:
        [(name, weight), (name, weight), ...]

    Args:
        wnames: iterable of (name, weight) pairs.
        r: two-element tuple or list (first_year, last_year), inclusive.

    Returns:
        The per-name arrays from probYearGivenName, each scaled by its
        weight, combined with vector_sum and rescaled so that the entries sum
        to 1. If the combined array sums to 0 it is returned unscaled. If
        'wnames' is empty, an all-zero array with one entry per year of 'r'
        is returned.

    Raises:
        AssertionError: if 'r' does not have length 2 or r[0] > r[1].
    """
    assert len(r) == 2
    assert r[0] <= r[1]

    # Materialize once so that an iterator argument can be traversed twice.
    pairs = list(wnames)
    if not pairs:
        # Unpacking zip(*[]) would raise ValueError; with no names there is
        # nothing to accumulate, so return an all-zero array.
        # Assumes probYearGivenName yields one entry per year of the
        # inclusive range.
        return [0.0] * (r[1] - r[0] + 1)

    # Normalize the weights. This only rescales every term by the same
    # constant (the final normalization below fixes the overall scale), so
    # when the weights sum to zero we skip it instead of dividing by zero.
    sum_weights = float(sum(weight for _, weight in pairs))
    if sum_weights != 0:
        pairs = [(name, weight / sum_weights) for name, weight in pairs]

    # calculation part
    sum_prob_year = None
    for name, weight in pairs:
        prob_year = probYearGivenName(name, r[0], r[1])
        prob_year = vector_scalar_product(prob_year, weight)
        if sum_prob_year is None:
            sum_prob_year = prob_year
        else:
            sum_prob_year = vector_sum(sum_prob_year, prob_year)

    # Finally, normalize the array so that its sum is 1. An all-zero array
    # cannot be normalized and is returned as is.
    sum_prob = sum(sum_prob_year)
    if sum_prob != 0:
        factor = 1 / float(sum_prob)
        sum_prob_year = vector_scalar_product(sum_prob_year, factor)
    return sum_prob_year
def probYearGivenDomainInYear(start_year, end_year):
    """
    Compute y(year) = avg_i p(year|name_i), where name_i ranges over the
    names returned by the query below for births between start_year
    (inclusive) and end_year (exclusive).

    The average is a weighted one: each name's weight is the sum of its
    counts (babyname.num) over the requested period, restricted to names that
    also appear in popularname.

    Args:
        start_year: first year of the period, inclusive.
        end_year: end of the period, exclusive; must be > start_year.

    Returns:
        An array of probabilities over the fixed inclusive year range
        (2012 - 50, 2012 - 11), rescaled so that the entries sum to 1. If the
        weighted sum is all zeros it is returned unscaled.

    Raises:
        AssertionError: if start_year >= end_year, or if the query returns
            no names.
    """
    assert start_year < end_year
    return_range = (2012 - 50, 2012 - 11)  # this is inclusive

    # The bounds are coerced to int before being formatted into the SQL text,
    # so nothing but a plain integer can end up in the statement.
    q = """SELECT b.name AS name, sum(b.num) AS count
           FROM babyname b
           INNER JOIN popularname p ON b.name = p.name
           WHERE year BETWEEN %d AND %d
           GROUP BY name
           ORDER BY count
        """ % (
        int(start_year),
        int(end_year) - 1,
    )  # sql between is inclusive

    name_weights = {}  # key: name, value: weight
    con = db.con()
    with con:
        cur = con.cursor()
        cur.execute(q)
        # Iterate the fetched rows directly rather than trusting
        # cursor.rowcount, which not every driver fills in for SELECT.
        for row in cur.fetchall():
            # float() so the normalization below is true division even when
            # the driver hands back ints (Python 2 would floor to 0).
            name_weights[row[0]] = float(row[1])

    assert len(name_weights) > 0  # sanity check

    # Normalize the weights for the sake of calculation safety.
    sum_name_weights = sum(name_weights.values())

    # Weighted sum of the per-name distributions.
    sum_prob_year = None
    for name, weight in name_weights.items():
        prob_year = probYearGivenName(name, return_range[0], return_range[1])
        weighted = vector_scalar_product(prob_year, weight / sum_name_weights)
        if sum_prob_year is None:
            sum_prob_year = weighted
        else:
            sum_prob_year = vector_sum(sum_prob_year, weighted)

    # We don't need to average since the weights sum to one. However, we
    # still normalize the array so that its sum is 1; an all-zero array
    # cannot be normalized and is returned as is.
    sum_prob = sum(sum_prob_year)
    if sum_prob != 0:
        factor = 1 / float(sum_prob)
        sum_prob_year = vector_scalar_product(sum_prob_year, factor)
    return sum_prob_year