Пример #1
0
def probYearGivenWeightedNames(wnames, r):
    '''
    Return p(year|names) for a set of first names with associated weights.

    The per-year distribution of every name, as returned by
    probYearGivenName, is scaled by the normalized weight of the name. The
    scaled distributions are summed up and the sum is normalized so that its
    elements add up to 1.

    Arguments:
        wnames: iterable of (name, weight) pairs:
                    [(name, weight), (name, weight), ...]
        r:      tuple or list of length 2; the year is between r[0] and
                r[1], inclusive.

    Returns the combined probability array. When none of the names carries
    any probability mass, the all-zero array is returned unchanged because
    it cannot be normalized.

    Raises ValueError when wnames is empty and ZeroDivisionError when the
    weights sum to zero.
    '''
    assert len(r) == 2      # r must be a tuple or a list of length 2
    assert r[0] <= r[1]

    # Materialize first so that generators are accepted as well and the
    # emptiness check below is reliable.
    wnames = list(wnames)
    if not wnames:
        raise ValueError('wnames must not be empty')

    # normalize the weights
    sum_weights = float(sum(weight for _, weight in wnames))
    normalized = [(name, weight / sum_weights) for name, weight in wnames]

    # calculation part
    sum_prob_year = None
    for name, weight in normalized:
        prob_year = probYearGivenName(name, r[0], r[1])
        prob_year = vector_scalar_product(prob_year, weight)
        if sum_prob_year is None:
            sum_prob_year = prob_year
        else:
            sum_prob_year = vector_sum(sum_prob_year, prob_year)

    # Finally, we need to normalize the prob array to ensure that the sum is 1.
    total = float(sum(sum_prob_year))
    if total == 0:
        # No name contributed anything; dividing would raise.
        return sum_prob_year

    return vector_scalar_product(sum_prob_year, 1 / total)
Пример #2
0
def probYearGivenNames(names, r):
    '''
    Given a set of names (first names), we return p(year|names), i.e. the
    probability that a random user from the set was born in that year. Since
    the year is a variable, we return an array in which each element holds a
    probability for the corresponding year.

    The range specified by 'r' is inclusive. The output array is from r[0]
    through r[1].

    Arguments:
        names: iterable of first names.
        r:     tuple or list of length 2 holding the first and last year.

    Returns the sum of the per-name arrays produced by probYearGivenName,
    normalized so that its elements add up to 1. When none of the names
    carries any probability mass, the all-zero array is returned unchanged
    because it cannot be normalized.

    Raises ValueError when names is empty.
    '''
    assert len(r) == 2      # r must be a tuple or a list of length 2
    assert r[0] <= r[1]

    prob_year_sum = None

    for name in names:
        prob_year = probYearGivenName(name, r[0], r[1])
        if prob_year_sum is None:
            prob_year_sum = prob_year
        else:
            prob_year_sum = vector_sum(prob_year_sum, prob_year)

    if prob_year_sum is None:
        raise ValueError('names must not be empty')

    # Finally, we need to normalize the prob array to ensure that the sum is 1.
    total = float(sum(prob_year_sum))
    if total == 0:
        # No name contributed anything; dividing would raise.
        return prob_year_sum

    return vector_scalar_product(prob_year_sum, 1 / total)
Пример #3
0
def run_allName():
    '''
    Print the element-wise sum of p(year|name) over the most common names.

    The 1000 names with the largest total count are read from the babyname
    table. For each of them babyname.probYearGivenName is evaluated for the
    years 1950 through 2000 and accumulated into one bin per year; the bins
    are printed at the end.

    As a side effect 'debug.txt' is created (or truncated) in the current
    directory, although nothing is written to it.

    Raises RuntimeError when a returned array does not have one element per
    bin.
    '''
    first_year, last_year = 1950, 2000      # inclusive

    # NOTE(review): the credentials are hard-coded here; consider loading
    # them from configuration instead.
    con = mdb.connect('localhost', 'yongjoo', 'Fgla4Zp0', 'yongjoo')

    with con:
        cur = con.cursor()

        sql = '''SELECT name, SUM(num) AS total
                FROM babyname
                GROUP BY name
                ORDER BY total DESC
                LIMIT 1000
                '''
        cur.execute(sql)

        bins = [0] * (last_year - first_year + 1)

        # Nothing is written to the file, but opening it is kept so that it
        # is still created/truncated; 'with' closes it on the error path too.
        with open('debug.txt', 'w'):
            for row in cur.fetchall():
                name = row[0]
                firstname = name.split(' ')[0]
                prob_year = babyname.probYearGivenName(
                    firstname, first_year, last_year)

                # An explicit check instead of an assert, so that it is not
                # stripped when running with -O.
                if len(bins) != len(prob_year):
                    print('%d %d' % (len(bins), len(prob_year)))
                    raise RuntimeError(
                        'bins and prob_year differ in length')

                if sum(prob_year) != 0:
                    bins = [b + p for b, p in zip(bins, prob_year)]

        print(bins)
Пример #4
0
def probYearGivenDomainInYear(start_year, end_year,
                              return_range=(2012 - 50, 2012 - 11)):
    '''
    We compute in the function y(year) = avg_i p(year|name_i) where name_i is
    the i-th element in the set of users who were born between start_year
    (inclusive) and end_year (exclusive).

    The function avg is not a simple average since we consider the weights
    between different names, thus it's rather a weighted average. Weights are
    determined by considering the sum of the counts of the names between the
    periods specified in the arguments.

    Arguments:
        start_year:   first year of the period (inclusive).
        end_year:     end of the period (exclusive); must be greater than
                      start_year.
        return_range: (first, last) years, both inclusive, passed on to
                      probYearGivenName for the returned distribution.

    Returns the weighted sum of the per-name arrays, normalized so that its
    elements add up to 1. When none of the names carries any probability
    mass, the all-zero array is returned unchanged because it cannot be
    normalized.
    '''
    assert start_year < end_year

    # The two placeholders are filled in by the driver in execute() below
    # rather than by string formatting.
    q = '''SELECT b.name AS name, sum(b.num) AS count
           FROM babyname b
           INNER JOIN popularname p
           ON b.name = p.name
           WHERE year BETWEEN %s AND %s
           GROUP BY name
           ORDER BY count
           '''

    name_weights = {}   # key: name, value: weight

    # NOTE(review): the credentials are hard-coded here; consider loading
    # them from configuration instead.
    con = mdb.connect('localhost', 'yongjoo', 'Fgla4Zp0', 'yongjoo')
    with con:
        cur = con.cursor()
        cur.execute(q, (start_year, end_year - 1))  # sql between is inclusive
        for name, count in cur.fetchall():
            # Convert to float so that the normalization below is a true
            # division whatever numeric type the driver returns.
            name_weights[name] = float(count)

    assert len(name_weights) > 0    # sanity check

    # normalize the weights for the sake of the calculation safety
    sum_name_weights = sum(name_weights.values())
    for n in name_weights:
        name_weights[n] /= sum_name_weights

    # Get the prob distribution for every name and accumulate the weighted
    # sum of them.
    sum_prob_year = None
    for n, weight in name_weights.items():
        prob_year = probYearGivenName(n, return_range[0], return_range[1])
        weighted = vector_scalar_product(prob_year, weight)
        if sum_prob_year is None:
            sum_prob_year = weighted
        else:
            sum_prob_year = vector_sum(sum_prob_year, weighted)

    # We don't need to average since the summation of the weights is one.

    # However, we need to normalize the prob array to ensure that the sum is 1.
    total = float(sum(sum_prob_year))
    if total == 0:
        # No name contributed anything; dividing would raise.
        return sum_prob_year

    return vector_scalar_product(sum_prob_year, 1 / total)
Пример #5
0
    prob_year_sum[age] = []
    skip_count[age] = 0

# Walk through the tweets and, for every user whose age is in given_ages,
# file the per-year probability array of the user's first name under that age.
for line in datain:
    tweet = json.loads(line)
    name = tweet['user']['name'].encode('ascii', 'ignore')
    firstname = name.split(' ')[0]
    age = int(tweet['user']['age'])

    if age in given_ages:
        start_year = 2012 - 50
        end_year = 2012 - 11
        prob_year = probYearGivenName(firstname, start_year, end_year)

        if sum(prob_year) != 0:
            prob_year_sum[age].append(prob_year)
        else:
            # An all-zero array is not collected, only counted.
            skip_count[age] += 1

# Report per age how many arrays were collected and how many were skipped.
for age in sorted(prob_year_sum):
    print("For %d, collected %d number of users. Skipped %d"
          % (age, len(prob_year_sum[age]), skip_count[age]))


# Average the prob array
def elementWiseSum(x, y):
    assert(len(x) == len(y))