예제 #1
0
def getZscore(word, user, ngramTable, schema, distTable = ''):
	"""Return the z-score of `word` for `user`, based on its group_norm.

	The user's group_norm for the word is read from `schema.ngramTable`
	and standardised with the (mean, std) pair from getMeanAndStd().

	Returns 0 when the query text cannot be built (UnicodeEncodeError),
	when no row exists for this (user, word) pair, or when std is 0.
	"""
	(_conn, cursor, _dict_cursor) = mm.dbConnect(schema)

	# Double any single quote so the word can sit inside a quoted SQL literal.
	# The escaped form is also what gets handed to getMeanAndStd() below.
	word = word.replace('\'', '\'\'')

	template = 'SELECT group_norm FROM {}.{} where group_id = \'{}\' and feat = \'{}\''
	try:
		query = template.format(schema, ngramTable, user, word)
	except UnicodeEncodeError:
		return 0

	group_norm = mm.executeGetList(schema, cursor, query)
	if not group_norm:
		return 0

	# Peel off up to two levels of tuple nesting
	# (presumably the row tuple and then the column tuple -- verify against mm).
	for _ in range(2):
		if isinstance(group_norm, tuple):
			group_norm = group_norm[0]

	(mean, std) = getMeanAndStd(word, ngramTable = ngramTable, schema = schema, distTable = distTable)
	if std == 0:
		return 0
	return (group_norm - mean)/(std + 0.0)
예제 #2
0
def getFeatWithLimit(schema,
                     table,
                     group='',
                     amount=50,
                     orderBy='group_norm',
                     desc=True):
    """Return (feat, group_norm) rows from `schema.table`, sorted and limited.

    Parameters:
        schema:  schema to connect to and to qualify the table name with.
        table:   table to read from.
        group:   when not '', only rows with this group_id are returned.
        amount:  maximum number of rows; a value <= 0 disables the LIMIT.
        orderBy: column used for sorting.
        desc:    sort descending when true (the default), ascending otherwise.

    Returns whatever mm.executeGetList() yields for the query.
    """
    (dbConn, dbCursor, dictCursor) = mm.dbConnect(schema)

    if group != '':
        select_group = 'where group_id = \'{}\''.format(group)
    else:
        select_group = ''

    if amount <= 0:
        limit = ''
    else:
        limit = ' LIMIT {}'.format(int(amount))

    # Honour the `desc` flag; the direction used to be hard-coded to DESC,
    # so desc=False silently returned the largest values instead.
    direction = 'DESC' if desc else 'ASC'

    query = 'SELECT feat, group_norm FROM {}.{} {} ORDER BY {} {}{}'.format(
        schema, table, select_group, orderBy, direction, limit)
    return mm.executeGetList(schema, dbCursor, query)
예제 #3
0
def updateZscore(schema, ngramTable, user = '', use_feat_table = False, 
					distTable = ''):
	"""Write z-scores into the `z` column of `schema.ngramTable`.

	Parameters:
		schema:         schema to connect to and to qualify the table with.
		ngramTable:     feature table whose `z` column is updated.
		user:           when not '', only this group_id is processed;
		                otherwise every group_id returned by getUsers().
		use_feat_table: when true, getZscore() is called without a
		                distTable; otherwise `distTable` is passed through.
		distTable:      distribution table name forwarded to getZscore().

	One UPDATE statement is issued per (user, ngram) pair; every 1000th
	statement is printed as a progress indicator.
	"""
	(dbConn, dbCursor, dictCursor) = mm.dbConnect(schema)

	if user != '':
		users = [user]
	else:
		users = getUsers(schema, ngramTable)

	# The ngram list does not depend on the user, so fetch it once instead
	# of re-running the query for every user.
	ngrams = [row[0] for row in getNgrams(ngramTable, schema)]

	# getZscore() defaults distTable to '', so passing '' is the same as
	# omitting the argument.
	zscore_dist_table = '' if use_feat_table else distTable

	counter = 0
	for group_id in users:
		for ngram in ngrams:
			z = getZscore(ngram, group_id, ngramTable, schema, distTable = zscore_dist_table)

			# Double single quotes so the ngram fits in a quoted SQL literal.
			escaped = ngram.replace('\'', '\'\'')

			try:
				query = "UPDATE {}.{} SET z = {} where group_id = \'{}\' and feat=\'{}\'".format(schema, ngramTable, z, group_id, escaped)
			except UnicodeEncodeError:
				# Fall back to a UTF-8 encoded feature and a z value of 0.
				query = "UPDATE {}.{} SET z = 0 where group_id = \'{}\' and feat=\'{}\'".format(schema, ngramTable, group_id, escaped.encode('utf-8'))

			if counter % 1000 == 0:
				print(query)
			mm.executeGetList(schema, dbCursor, query)
			counter += 1
예제 #4
0
def getFeatValueAndZ(user, schema, ngramTable, min_value = 5, ordered = True, z_threshold = 0):
	"""Return the (feat, value, z) rows of `user` from `schema.ngramTable`.

	Only rows with value >= min_value and z > z_threshold are selected.
	When `ordered` is true the rows are sorted by z, descending.
	The query is printed before it is executed.
	"""
	(_conn, cursor, _dict_cursor) = mm.dbConnect(schema)

	z_filter = " AND z > {}".format(z_threshold)
	ordering = " ORDER BY z DESC" if ordered else ""

	query = 'SELECT feat, value, z FROM {}.{} WHERE group_id = \'{}\' and value >= {}{}{};'.format(
		schema, ngramTable, user, min_value, z_filter, ordering)
	print(query)
	return mm.executeGetList(schema, cursor, query)
예제 #5
0
def getUniqueNgrams(schema, ngramTable, user = '', max = -1):
	"""Return (feat, group_norm) rows whose z-score is 0, largest group_norm first.

	Parameters:
		schema:     schema to connect to and to qualify the table with.
		ngramTable: feature table to read from.
		user:       when not '', only rows of this group_id are returned.
		max:        maximum number of rows; -1 (the default) means no LIMIT.

	Returns whatever mm.executeGetList() yields for the query.
	"""
	(dbConn, dbCursor, dictCursor) = mm.dbConnect(schema)

	if user != '':
		# NOTE(review): the original literal on this line was corrupted
		# ('******') and did not parse. It is reconstructed here from the
		# "WHERE z = 0{}" template below and from the group_id filters used
		# by the other queries in this file -- confirm the column name.
		select_user = ' AND group_id = \'{}\''.format(user)
	else:
		select_user = ''

	if max != -1:
		limit = ' LIMIT {}'.format(max)
	else:
		limit = ''

	query = 'SELECT feat, group_norm FROM {}.{} WHERE z = 0{} ORDER BY group_norm DESC{}'.format(schema, ngramTable, select_user, limit)
	return mm.executeGetList(schema, dbCursor, query)
예제 #6
0
def getOneGram(schema, ngramTable):
	"""Return (feat, count) rows, where count is sum(value) per feat.

	The query is printed before it is executed.
	"""
	(_conn, cursor, _dict_cursor) = mm.dbConnect(schema)
	template = "SELECT feat, sum(value) as count FROM {}.{} group by feat"
	query = template.format(schema, ngramTable)
	print(query)
	rows = mm.executeGetList(schema, cursor, query)
	return rows
예제 #7
0
def createZColumn(schema, ngramTable):
	"""Add a DOUBLE column named `z` to `schema.ngramTable`."""
	(_conn, cursor, _dict_cursor) = mm.dbConnect(schema)
	alter_statement = "ALTER TABLE {}.{} ADD COLUMN z DOUBLE;".format(
		schema, ngramTable)
	mm.executeGetList(schema, cursor, alter_statement)
예제 #8
0
def getUsers(schema, ngramTable):
	"""Return the list of distinct group_id values in `schema.ngramTable`."""
	(_conn, cursor, _dict_cursor) = mm.dbConnect(schema)
	query = "SELECT distinct(group_id) FROM {}.{};".format(schema, ngramTable)
	rows = mm.executeGetList(schema, cursor, query)
	# Each row carries a single column; keep just that value.
	return [row[0] for row in rows]
예제 #9
0
def getNgrams(ngramTable, schema):
	"""Return one row per distinct feat found in `schema.ngramTable`.

	The rows are returned exactly as mm.executeGetList() yields them, so
	callers pick the feat out of each row themselves.
	"""
	(_conn, cursor, _dict_cursor) = mm.dbConnect(schema)
	template = "SELECT feat FROM {}.{} GROUP BY feat"
	rows = mm.executeGetList(schema, cursor, template.format(schema, ngramTable))
	return rows
예제 #10
0
def getMeanAndStd(word, ngramTable, schema, num_groups = -1, distTable = '', distTableSource = None):
	"""Return (mean, std) of the group_norm values recorded for `word`.

	Parameters:
		word:            feature to look up. It is placed inside a quoted
		                 SQL literal as-is, so the caller must already have
		                 escaped any single quotes.
		ngramTable:      feature table used when no distTable is given.
		schema:          schema to connect to and to qualify tables with.
		num_groups:      accepted for backward compatibility; see the note
		                 below.
		distTable:       when not '', mean and std are read from this table
		                 instead of being computed.
		distTableSource: accepted for backward compatibility; see the note
		                 below.

	Returns:
		(mean, std). When computed from ngramTable, std is the sample
		standard deviation (n - 1 denominator) over the rows that contain
		the word, and (0, 0) is returned when fewer than two such rows
		exist. When read from distTable, (0, 0) is returned if the word has
		no row there.

	NOTE(review): earlier versions ran count(distinct(group_id)) queries to
	fill num_groups, but the value was overwritten with the row count before
	it was used, and never read in the distTable branch. Those queries were
	dropped because they could not influence the result. If the mean was
	meant to be taken over all groups (treating absent words as 0), that
	logic needs to be added deliberately.
	"""
	(dbConn, dbCursor, dictCursor) = mm.dbConnect(schema)

	if distTable == '':
		# Two-pass algorithm: first the mean, then the squared deviations.
		query = u'SELECT group_norm FROM {}.{} WHERE feat = \'{}\''.format(schema, ngramTable, word)
		group_norms = mm.executeGetList(schema, dbCursor, query)
		values = [row[0] for row in group_norms] if group_norms else []
		count = len(values)

		# With zero rows the mean is undefined (this used to raise
		# ZeroDivisionError) and with one row the sample variance is
		# undefined; both cases report "no spread".
		if count < 2:
			return (0, 0)

		total = sum(values, 0.0)
		mean = float(total)/count

		diff_squared_sum = sum(((value - mean) ** 2 for value in values), 0.0)
		variance = diff_squared_sum / (count - 1) # sample variance
		std = sqrt(variance)

	else:
		query = "SELECT mean, std FROM {}.{} where feat = \'{}\'".format(schema, distTable, word)
		result = mm.executeGetList(schema, dbCursor, query)
		if not result:
			mean = 0
			std = 0
		else:
			mean = result[0][0]
			std = result[0][1]

	return (mean, std)
예제 #11
0
        labels = dataHolder.keys()
        counts = dataHolder.values()
        ro_labels = ro.StrVector(labels)
        ro_counts = ro.IntVector(counts)
        if filename:
            self.grdevices.png(file="%s_hist_cats.png"%(filename), width=self.widths, height=self.heights)
            self.graphics.par(las=2, mar=[5.1, 7.1, 4.1, 2.1])
            ro.r.barplot(ro_counts, main = "Category Histogram, N=%d"%(total_count), beside=True, horiz=True, col='royalblue4', **{"names.arg":ro_labels})
            self.grdevices.dev_off()

if __name__ == "__main__":
    # Script entry point: build a plotter, open a connection to the 'fb20'
    # schema and report success. The plotting calls themselves are currently
    # disabled; they are kept below as usage examples.
    sp = StatsPlotter()
    (conn, cur, dcur) = mm.dbConnect('fb20')

    # --- disabled: column statistics on userstats_en ---
    # floatCols = [2, 3, 4] + range(6,14) + [23] + [28]
    # prefix = '600_'
    # sp.getFloatColStats("userstats_en", floatCols, "plots/%sdesc"%prefix)
    # sp.getCategoricalColStats("userstats_en", range(14, 23), "plots/%srelnbins"%prefix)
    # sp.getCategoricalColStats('fb20', cur, "userstats_en",  range(24, 28), '/data/ml/plots/fb20/age_category')
    # sp.getCategoricalColStats('fb20', cur, "userstats_en",  range(14, 23), '/data/ml/plots/fb20/reln_category')

    # --- disabled: 2D histogram demos on random samples ---
    # N=100000
    # d1 = list(rand.normal(0,2,N))
    # d2 = list(rand.normal(0,1,N))
    # d3 = list(rand.normal(0,17,N))
    # d_all = {"d1":d1, "d2":d2, "d3":d3}
    # e1 = list(rand.exponential(2,N))
    # e2 = list(rand.exponential(14,N))
    # e_all = {'e1':e1, 'e2':e2}
    # sp.plot2dHist('d1', d1, 'd2', d2)
    # sp.plot2dHistGeneralized(d_all, e_all, 'plots/samba')

    mm.warn("descStats.py exits with success :)")