예제 #1
0
def updateZscore(schema, ngramTable, user='', use_feat_table=False, distTable=''):
	"""Fill the ``z`` column of ``schema.ngramTable`` for every (group, ngram) pair.

	schema         -- database/schema name; also passed to mm.dbConnect.
	ngramTable     -- feature table that is read (via getZscore) and updated.
	user           -- when non-empty, only this group_id is processed;
	                  otherwise every group returned by getUsers is.
	use_feat_table -- when true, getZscore is called without a distTable
	                  (so its default '' applies); otherwise distTable is
	                  forwarded.
	distTable      -- table name forwarded to getZscore when use_feat_table
	                  is false.

	Every 1000th UPDATE statement is printed as a progress indicator.
	"""
	(dbConn, dbCursor, dictCursor) = mm.dbConnect(schema)

	groups = [user] if user != '' else getUsers(schema, ngramTable)

	# Only the non-feat-table mode forwards the distribution table.
	zscore_kwargs = {} if use_feat_table else {'distTable': distTable}

	executed = 0
	for group in groups:
		# The ngram list is fetched again for each group.
		for feat in [row[0] for row in getNgrams(ngramTable, schema)]:
			z = getZscore(feat, group, ngramTable, schema, **zscore_kwargs)

			# Double single quotes so the feat can sit inside a SQL literal.
			escaped = feat.replace("'", "''")

			try:
				query = "UPDATE {}.{} SET z = {} where group_id = '{}' and feat='{}'".format(
					schema, ngramTable, z, group, escaped)
			except UnicodeEncodeError:
				# Formatting failed on this feat: store z = 0 using the
				# UTF-8 encoded feat instead.
				query = "UPDATE {}.{} SET z = 0 where group_id = '{}' and feat='{}'".format(
					schema, ngramTable, group, escaped.encode('utf-8'))

			if executed % 1000 == 0:
				print(query)
			mm.executeGetList(schema, dbCursor, query)
			executed += 1
예제 #2
0
    def getCategoricalColStats(self, db, dbCursor, tableName, colsOfNote, filename=None):
        """Count, per category column, how many rows have a truthy value and plot the counts.

        db, dbCursor -- passed straight through to mm.executeGetList.
        tableName    -- table whose rows are tallied; it is filtered on user_id.
        colsOfNote   -- positional column indices; used both to look up the
                        column names and to index each fetched row.
        filename     -- when truthy, a horizontal bar plot is written to
                        "<filename>_hist_cats.png"; otherwise nothing is drawn.

        Rows with no truthy value in any of the columns are counted under
        "none specified".  An Exception is raised if a row has more than
        one truthy column.  Returns None.
        """
        # Map the requested column positions to their names.
        sql = "SELECT column_name from information_schema.columns where table_name='%s'"%tableName
        schemaRows = mm.executeGetList(db, dbCursor, sql)
        colNames = [schemaRows[idx][0] for idx in colsOfNote]

        # Restrict to the group_ids present in the 1gram feature table.
        sql = "SELECT DISTINCT group_id FROM feat$1gram$messages$user_id$16to16"
        user_ids = [r[0] for r in mm.executeGetList(db, dbCursor, sql)]
        sql_user_ids = ",".join(str(uid) for uid in user_ids)

        # N shown in the plot title is the number of distinct group_ids.
        total_count = len(user_ids)

        # One counter per category plus a bucket for rows with no category.
        tallies = dict.fromkeys(colNames, 0)
        tallies["none specified"] = 0

        sql = "SELECT * FROM %s WHERE user_id IN (%s)"%(tableName, sql_user_ids)
        for record in mm.executeGetList(db, dbCursor, sql):
            matched = False
            for name, idx in zip(colNames, colsOfNote):
                if not record[idx]:
                    continue
                tallies[name] += 1
                if matched:
                    raise Exception("incorrect assumption; more than one category allowed")
                matched = True
            if not matched:
                tallies["none specified"] += 1

        # Hand the tallies to R and, if requested, render the bar plot.
        ro_labels = ro.StrVector(tallies.keys())
        ro_counts = ro.IntVector(tallies.values())
        if not filename:
            return
        self.grdevices.png(file="%s_hist_cats.png"%(filename), width=self.widths, height=self.heights)
        self.graphics.par(las=2, mar=[5.1, 7.1, 4.1, 2.1])
        ro.r.barplot(ro_counts, main = "Category Histogram, N=%d"%(total_count), beside=True, horiz=True, col='royalblue4', **{"names.arg":ro_labels})
        self.grdevices.dev_off()
예제 #3
0
def getZscore(word, user, ngramTable, schema, distTable = ''):
	"""Return the z-score of ``word`` for group ``user`` in ``schema.ngramTable``.

	The group's group_norm is looked up, then standardised with the
	(mean, std) returned by getMeanAndStd (``distTable`` is forwarded to it).

	Returns 0 when the query cannot be formatted (UnicodeEncodeError),
	when the lookup yields nothing, or when std is 0.
	"""
	(dbConn, dbCursor, dictCursor) = mm.dbConnect(schema)

	# Double single quotes so the word can sit inside a SQL literal.
	escaped = word.replace("'", "''")

	try:
		query = "SELECT group_norm FROM {}.{} where group_id = '{}' and feat = '{}'".format(
			schema, ngramTable, user, escaped)
	except UnicodeEncodeError:
		return 0

	value = mm.executeGetList(schema, dbCursor, query)
	if not value:
		return 0

	# Peel off up to two levels of tuple wrapping around the scalar.
	for _ in range(2):
		if isinstance(value, tuple):
			value = value[0]

	# The already-escaped word is what gets forwarded.
	(mean, std) = getMeanAndStd(escaped, ngramTable = ngramTable, schema = schema, distTable = distTable)
	if std == 0:
		return 0
	return (value - mean)/(std + 0.0)
예제 #4
0
def getFeatWithLimit(schema,
                     table,
                     group='',
                     amount=50,
                     orderBy='group_norm',
                     desc=True):
    """Return (feat, group_norm) rows from ``schema.table``, sorted and limited.

    schema  -- database/schema name; also passed to mm.dbConnect.
    table   -- feature table to read.
    group   -- when non-empty, only rows with this group_id are returned.
    amount  -- maximum number of rows; a value <= 0 disables the LIMIT.
    orderBy -- column to sort on.
    desc    -- sort descending when true (the default), ascending otherwise.

    NOTE(review): every argument is interpolated directly into the SQL
    text, so callers must only pass trusted values.
    """
    (dbConn, dbCursor, dictCursor) = mm.dbConnect(schema)

    if group != '':
        select_group = 'where group_id = \'{}\''.format(group)
    else:
        select_group = ''

    if amount <= 0:
        limit = ''
    else:
        limit = ' LIMIT {}'.format(int(amount))

    # Honour the desc flag; previously the direction was hard-coded to DESC.
    direction = 'DESC' if desc else 'ASC'

    query = 'SELECT feat, group_norm FROM {}.{} {} ORDER BY {} {}{}'.format(
        schema, table, select_group, orderBy, direction, limit)
    return mm.executeGetList(schema, dbCursor, query)
예제 #5
0
    def getFloatColStats(self, db, dbCursor, tableName, colsOfNote, filename=None):
        """Collect the values of the selected columns and pass them to plotDescStats.

        db, dbCursor -- passed straight through to mm.executeGetList.
        tableName    -- table whose rows are read; it is filtered on user_id.
        colsOfNote   -- positional column indices; used both to look up the
                        column names and to index each fetched row.
        filename     -- forwarded unchanged to self.plotDescStats.

        Returns whatever self.plotDescStats(dataDict, total_count, filename)
        returns, where dataDict maps column name -> list of values.
        """
        # Map the requested column positions to their names.
        sql = "SELECT column_name from information_schema.columns where table_name='%s'"%tableName
        schemaRows = mm.executeGetList(db, dbCursor, sql)
        colNames = [schemaRows[idx][0] for idx in colsOfNote]

        # Restrict to the group_ids present in the 1gram feature table.
        sql = "SELECT DISTINCT group_id FROM feat$1gram$messages$user_id$16to16"
        user_ids = [r[0] for r in mm.executeGetList(db, dbCursor, sql)]
        sql_user_ids = ",".join(str(uid) for uid in user_ids)

        # Each column buffer is pre-sized to the number of distinct group_ids.
        total_count = len(user_ids)
        columns = [[None]*total_count for _ in colsOfNote]

        # Fill the buffers by row position; slots beyond the fetched rows
        # stay None.
        sql = "SELECT * FROM %s WHERE user_id IN (%s)"%(tableName, sql_user_ids)
        for rowIdx, record in enumerate(mm.executeGetList(db, dbCursor, sql)):
            for slot, idx in enumerate(colsOfNote):
                columns[slot][rowIdx] = record[idx]

        # Pair every column name with its value list.
        dataDict = dict(zip(colNames, columns))

        return self.plotDescStats(dataDict, total_count, filename)
예제 #6
0
def getFeatValueAndZ(user, schema, ngramTable, min_value = 5, ordered = True, z_threshold = 0):
	"""Return the (feat, value, z) rows of ``user`` from ``schema.ngramTable``.

	Only rows with value >= min_value and z > z_threshold are selected.
	When ``ordered`` is true the rows are sorted by z, descending.
	The generated query is printed before it is executed.
	"""
	(dbConn, dbCursor, dictCursor) = mm.dbConnect(schema)

	order_by = " ORDER BY z DESC" if ordered else ""
	pos_z = " AND z > {}".format(z_threshold)

	query = "SELECT feat, value, z FROM {}.{} WHERE group_id = '{}' and value >= {}{}{};".format(
		schema, ngramTable, user, min_value, pos_z, order_by)
	print(query)
	rows = mm.executeGetList(schema, dbCursor, query)
	return rows
예제 #7
0
def getUniqueNgrams(schema, ngramTable, user = '', max = -1):
	"""Return (feat, group_norm) rows whose z equals 0, sorted by group_norm descending.

	schema     -- database/schema name; also passed to mm.dbConnect.
	ngramTable -- feature table to read.
	user       -- when non-empty, only rows of this group_id are returned.
	max        -- maximum number of rows; -1 (the default) means no LIMIT.
	"""
	(dbConn, dbCursor, dictCursor) = mm.dbConnect(schema)

	if user != '':
		# The clause is appended right after "WHERE z = 0", so it must start
		# with AND.  The previous literal was garbled and not valid Python;
		# the group_id filter mirrors the one used by the sibling queries.
		select_user = ' AND group_id = \'{}\''.format(user)
	else:
		select_user = ''

	if max != -1:
		limit = ' LIMIT {}'.format(max)
	else:
		limit = ''

	query = 'SELECT feat, group_norm FROM {}.{} WHERE z = 0{} ORDER BY group_norm DESC{}'.format(schema, ngramTable, select_user, limit)
	return mm.executeGetList(schema, dbCursor, query)
예제 #8
0
def getOneGram(schema, ngramTable):
	"""Return (feat, count) rows, where count is sum(value) per feat in ``schema.ngramTable``.

	The generated query is printed before it is executed.
	"""
	_conn, cursor, _dict_cursor = mm.dbConnect(schema)
	template = "SELECT feat, sum(value) as count FROM {}.{} group by feat"
	query = template.format(schema, ngramTable)
	print(query)
	return mm.executeGetList(schema, cursor, query)
예제 #9
0
def createZColumn(schema, ngramTable):
	"""Add a DOUBLE column named ``z`` to ``schema.ngramTable``."""
	_conn, cursor, _dict_cursor = mm.dbConnect(schema)
	statement = "ALTER TABLE {}.{} ADD COLUMN z DOUBLE;"
	mm.executeGetList(schema, cursor, statement.format(schema, ngramTable))
예제 #10
0
def getUsers(schema, ngramTable):
	"""Return the list of distinct group_id values found in ``schema.ngramTable``."""
	_conn, cursor, _dict_cursor = mm.dbConnect(schema)
	query = "SELECT distinct(group_id) FROM {}.{};".format(schema, ngramTable)
	rows = mm.executeGetList(schema, cursor, query)
	# Each row is a one-column record; keep just the id.
	return [row[0] for row in rows]
예제 #11
0
def getNgrams(ngramTable, schema):
	"""Return one row per distinct feat in ``schema.ngramTable``.

	The rows come back exactly as mm.executeGetList yields them; callers
	in this file read the feat from index 0 of each row.
	"""
	_conn, cursor, _dict_cursor = mm.dbConnect(schema)
	template = "SELECT feat FROM {}.{} GROUP BY feat"
	return mm.executeGetList(schema, cursor, template.format(schema, ngramTable))
예제 #12
0
def getMeanAndStd(word, ngramTable, schema, num_groups = -1, distTable = '', distTableSource = None):
	"""Return (mean, std) of group_norm for ``word``.

	word            -- feat to look up; interpolated into the SQL text as-is,
	                   so the caller is responsible for quoting it.
	ngramTable      -- feature table used when no distTable is given.
	schema          -- database/schema name; also passed to mm.dbConnect.
	num_groups      -- see the NOTE below; the value does not influence
	                   the result.
	distTable       -- when non-empty, mean and std are read from this table
	                   instead of being computed; a missing row gives (0, 0).
	distTableSource -- table used for the group count when num_groups is
	                   supplied (see the NOTE below).

	When distTable is '' the statistics are computed with a two-pass
	algorithm over the rows that contain the feat, using the sample
	variance (n - 1 denominator).  A feat with zero or one row yields
	(0, 0); previously the zero-row case raised ZeroDivisionError.
	"""
	(dbConn, dbCursor, dictCursor) = mm.dbConnect(schema)

	# NOTE(review): the group count resolved here is never consumed below.
	# The two-pass branch overwrites it with the number of rows for the
	# feat and the distTable branch does not read it.  The queries are kept
	# so the database interaction is unchanged; confirm whether they can go.
	if num_groups == -1:
		query = 'SELECT count(distinct(group_id)) FROM {}.{}'.format(schema, ngramTable)
		result = mm.executeGetList(schema, dbCursor, query)
		num_groups = int(result[0][0])
	elif distTableSource is not None:
		#TODO: let user specify distTableSource
		query = 'SELECT count(distinct(group_id)) FROM {}.{}'.format(schema, distTableSource)
		result = mm.executeGetList(schema, dbCursor, query)
		num_groups = int(result[0][0])

	if distTable != '':
		# Precomputed distribution: read the statistics directly.
		query = "SELECT mean, std FROM {}.{} where feat = \'{}\'".format(schema, distTable, word)
		result = mm.executeGetList(schema, dbCursor, query)
		if not result:
			return (0, 0)
		return (result[0][0], result[0][1])

	########### two pass algorithm
	query = u'SELECT group_norm FROM {}.{} WHERE feat = \'{}\''.format(schema, ngramTable, word)
	group_norms = mm.executeGetList(schema, dbCursor, query)
	num_groups = len(group_norms)

	# With fewer than two observations there is no spread to measure, and
	# with none the mean itself would divide by zero.
	if num_groups <= 1:
		return (0, 0)

	values = [row[0] for row in group_norms]

	# First pass: mean.
	mean = float(sum(values, 0.0))/num_groups

	# Second pass: sum of squared deviations, then the sample variance.
	diff_squared_sum = sum(((value - mean) ** 2 for value in values), 0.0)
	variance = diff_squared_sum / (num_groups - 1)
	std = sqrt(variance)
	########### algorithm end

	return (mean, std)