Example #1
 def yieldValuesWithZerosByGroup(self, groups = [], where = '', allFeats = None):
     """returns a dict of (group_id, feature_values)"""
     valuelist = []
     if groups: 
         gCond = " group_id in ('%s')" % "','".join(str(g) for g in groups)
         if where: valuelist = self.getValues(where+" AND "+gCond)
         else: valuelist = self.getValues(gCond)
     else: 
         valuelist = self.getValues()
     values = dict()
     for tup in valuelist:
         (gid, feat, value) = tup
         if not gid in values: values[gid] = dict()
         values[gid][feat] = value
     if not groups: groups = self.getDistinctGroups(where)
     if not allFeats:
         allFeats = self.getDistinctFeatures(where)
     #fill in zeros (this can get quite big!)
     fwc.warn("Yielding values with zeros for %d groups * %d feats." %(len(groups), len(allFeats)))
     for gid in groups:
         thisValues = dict()
         if gid in values: thisValues.update(values[gid])
         for feat in allFeats:
             if not feat in thisValues: thisValues[feat] = 0
         yield (gid, thisValues)
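A minimal usage sketch for the generator above (hedged: `fg`, the group ids, and the output path are hypothetical, assuming a FeatureGetter-like object exposing this method):

    import csv

    def dump_dense_values(fg, groups, out_path):
        # Stream (group_id, feature_values) pairs and write one dense CSV row per group.
        writer, feats = None, None
        with open(out_path, 'w') as out:
            for gid, featValues in fg.yieldValuesWithZerosByGroup(groups=groups):
                if writer is None:
                    feats = sorted(featValues.keys())
                    writer = csv.writer(out)
                    writer.writerow(['group_id'] + feats)
                writer.writerow([gid] + [featValues[f] for f in feats])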
Example #2
    def getFeatNormsWithZeros(self, groups=[], where=''):
        """returns a dict of (group_id => feature => feat_norm) """
        fnlist = []
        if groups:
            gCond = " group_id in ('%s')" % "','".join(str(g) for g in groups)
            if where: fnlist = self.getFeatNorms(where + " AND " + gCond)
            else: fnlist = self.getFeatNorms(gCond)
        else:
            fnlist = self.getFeatNorms()
        fns = dict()
        for tup in fnlist:
            (gid, feat, fn) = tup
            if not gid in fns: fns[gid] = dict()
            fns[gid][feat] = float(fn)
        if not groups: groups = self.getDistinctGroups(where)

        #fill in zeros (this can get quite big!)
        meanData = self.getFeatMeanData()  # feat : (mean, std, zero_mean)
        fwc.warn("Adding zeros to feat norms (%d groups * %d feats)." %
                 (len(groups), len(meanData.keys())))
        for gid in groups:
            if not gid in fns: fns[gid] = dict()
            for feat in meanData.iterkeys():
                if not feat in fns[gid]: fns[gid][feat] = meanData[feat][2]
        return fns, meanData.keys()
Example #3
 def getGroupNormsWithZeros(self, groups=[], where=''):
     """returns a dict of (group_id => feature => group_norm)"""
     #This function gets killed on large feature sets
     gnlist = []
     if groups:
         gCond = " group_id in ('%s')" % "','".join(str(g) for g in groups)
         if where: gnlist = self.getGroupNorms(where + " AND " + gCond)
         else: gnlist = self.getGroupNorms(gCond)
     else:
         gnlist = self.getGroupNorms()
     gns = dict()
     for tup in gnlist:
         (gid, feat, gn) = tup
         if not gid in gns: gns[gid] = dict()
         gns[gid][feat] = gn
     if not groups: groups = self.getDistinctGroups(where)
     allFeats = self.getDistinctFeatures(where)
     #fill in zeros (this can get quite big!)
     fwc.warn("Adding zeros to group norms (%d groups * %d feats)." %
              (len(groups), len(allFeats)))
     for gid in groups:
         if not gid in gns: gns[gid] = dict()
         for feat in allFeats:
             if not feat in gns[gid]: gns[gid][feat] = 0
     return gns, allFeats
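A hedged post-processing sketch (assuming `fg` is a FeatureGetter-like object and the group ids are placeholders), turning the returned nested dict into a dense list of rows:

    gns, allFeats = fg.getGroupNormsWithZeros(groups=['g1', 'g2'])
    # one row per group_id, columns ordered by allFeats; missing cells are already 0
    rows = [[gid] + [gns[gid][f] for f in allFeats] for gid in sorted(gns)]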
Example #4
    def getCollocsWithPMI(self):
        '''
        :inputs: self.featureTable
        calculates PMI for each ngram longer than one token
        :returns: a dict of colloc => [colloc, freq, pmi, num_tokens, pmi_threshold_val]
            **pmi_threshold_val is pmi/(num_tokens-1); that's what --feat_colloc_filter is based on
        '''
        featureTable = self.featureTable
        fwc.warn(featureTable)
        wordGetter = self.getWordGetter()
        tokenizer = Tokenizer(use_unicode=self.use_unicode)

        jointFreqs = self.getSumValuesByFeat()
        wordFreqs = dict(wordGetter.getSumValuesByFeat())
        allFreqs = wordGetter.getSumValue()

        keepers = set()
        collocPMIs = {}
        count = 0
        print "len(jointFreqs): " + str(len(jointFreqs))
        for (colloc, freq) in jointFreqs:
            count += 1
            if count % 50000 == 0:
                print "calculating pmi for {}th feature".format(count)
            words = [word[:fwc.VARCHAR_WORD_LENGTH] for word in tokenizer.tokenize(colloc)]
            if (len(words) > 1):
                indFreqs = [wordFreqs[w] for w in words if w in wordFreqs]
                pmi = FeatureRefiner.pmi(freq, indFreqs, allFreqs, words = words)
                collocPMIs[colloc] = [colloc, freq, pmi, len(words), pmi/(len(words)-1)]
        return collocPMIs
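For reference, a self-contained sketch of the PMI quantity the loop relies on. This assumes the standard joint-versus-independent formulation; the actual FeatureRefiner.pmi may differ in details such as smoothing or log base:

    from math import log

    def pmi_sketch(joint_freq, ind_freqs, total_freq):
        # log2( p(colloc) / (p(w1) * p(w2) * ...) ), estimated from raw counts
        p_joint = joint_freq / float(total_freq)
        p_indep = 1.0
        for f in ind_freqs:
            p_indep *= f / float(total_freq)
        return log(p_joint / p_indep, 2)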
Example #5
 def getFeatValuesAndGNs(feat):
     if gns:
         try:
             if values:
                 return (vals[feat].copy(), gns[feat].copy())
             return (None, gns[feat].copy())
         except KeyError:
             fwc.warn(
                 "Couldn't find gns for feat: %s (group_freq_thresh may be too high)"
                 % feat)
             return (None, dict())
     else:  #must query for feat
         gnDict = None
         valDict = None
         gnlist = []
         if gCond:
             if where:
                 gnlist = getGroupNormsForFeat(feat,
                                               where + " AND " + gCond)
             else:
                 gnlist = getGroupNormsForFeat(feat, gCond)
         else:
             gnlist = self.getGroupNormsForFeat(feat)
         if values:
             gnDict = dict([(g, float(gn)) for g, _, gn in gnlist])
             valDict = dict([(g, float(v)) for g, v, _ in gnlist])
         else:
             gnDict = dict([(g, float(gn)) for g, gn in gnlist])
         return (valDict, gnDict)
Example #6
    def createCollocRefinedFeatTable(self, threshold = 3.0, featNormTable=False):
        #n = the number of words in the ngrams
        #uses pmi to remove uncommon collocations:
        featureTable = self.featureTable
        fwc.warn(featureTable)
        wordGetter = self.getWordGetter()
        tokenizer = Tokenizer(use_unicode=self.use_unicode)

        jointFreqs = self.getSumValuesByFeat()
        wordFreqs = dict(wordGetter.getSumValuesByFeat())
        allFreqs = wordGetter.getSumValue()

        keepers = set()
        for (colloc, freq) in jointFreqs:
            # words = tokenizer.tokenize(colloc)
            # If words got truncated in the creation of 1grams, we need to account for that
            words = [word[:fwc.VARCHAR_WORD_LENGTH] for word in tokenizer.tokenize(colloc)]
            if (len(words) > 1):
                indFreqs = [wordFreqs[w] for w in words if w in wordFreqs]
                pmi = FeatureRefiner.pmi(freq, indFreqs, allFreqs, words = words)
                # print "%s: %.4f" % (colloc, pmi)#debug
                if pmi > (len(words)-1)*threshold: 
                    keepers.add(colloc)
            else:
                keepers.add(colloc)
        return self.createNewTableWithGivenFeats(keepers, "pmi%s"%str(threshold).replace('.', '_'), featNormTable)
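A hedged usage sketch (`fr` is assumed to be a FeatureRefiner instance with featureTable already set):

    # Keep only collocations whose PMI exceeds 3.0 per extra token; the resulting
    # table name carries a "pmi3_0" suffix, per the replace('.', '_') above.
    new_table = fr.createCollocRefinedFeatTable(threshold=3.0)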
Example #7
    def addFeatNorms(self, ReCompute = False):
        """Adds the mean normalization by feature (z-score) for each feature"""
        where = None
        if not ReCompute: where = 'feat_norm is null'
        groupNorms = self.getGroupNorms(where = where) #contains group_id, feat, group_norm
        
        fMeans = self.addFeatTableMeans(groupNorms = groupNorms) #mean, std, zero

        wsql = """UPDATE """+self.featureTable+""" SET feat_norm = %s where group_id = %s AND feat = %s"""
        featNorms = []
        num_at_time = 2000
        numWritten = 0
        for (group_id, feat, group_norm) in groupNorms:
            if fwc.LOWERCASE_ONLY: feat = feat.lower()
            if (feat):
                fn = ((group_norm - fMeans[feat][0]) / float(fMeans[feat][1]), group_id, feat)
                featNorms.append(fn)
                if len(featNorms) >= num_at_time:
                    mm.executeWriteMany(self.corpdb, self.dbCursor, wsql, featNorms, writeCursor=self.dbConn.cursor(), charset=self.encoding, use_unicode=self.use_unicode)
                    featNorms = []
                    numWritten += num_at_time
                    if numWritten % 100000 == 0: fwc.warn("%.1fm feature instances updated out of %dm" % 
                                                        ((numWritten/float(1000000)), len(groupNorms)/1000000))
                                    
        
        #write values back in 
        if featNorms: 
            mm.executeWriteMany(self.corpdb, self.dbCursor, wsql, featNorms, writeCursor=self.dbConn.cursor(), charset=self.encoding, use_unicode=self.use_unicode)

        return True
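The update above is a per-feature z-score; the same arithmetic as a standalone sketch (names hypothetical):

    def feat_norm(group_norm, feat_mean, feat_std):
        # z-score of one group's group_norm against that feature's mean and std
        return (group_norm - feat_mean) / float(feat_std)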
Example #8
 def yieldValuesWithZerosByGroup(self, groups=[], where='', allFeats=None):
     """returns a dict of (group_id, feature_values)"""
     valuelist = []
     if groups:
         gCond = " group_id in ('%s')" % "','".join(str(g) for g in groups)
         if where: valuelist = self.getValues(where + " AND " + gCond)
         else: valuelist = self.getValues(gCond)
     else:
         valuelist = self.getValues()
     values = dict()
     for tup in valuelist:
         (gid, feat, value) = tup
         if not gid in values: values[gid] = dict()
         values[gid][feat] = value
     if not groups: groups = self.getDistinctGroups(where)
     if not allFeats:
         allFeats = self.getDistinctFeatures(where)
     #fill in zeros (this can get quite big!)
     fwc.warn("Yielding values with zeros for %d groups * %d feats." %
              (len(groups), len(allFeats)))
     for gid in groups:
         thisValues = dict()
         if gid in values: thisValues.update(values[gid])
         for feat in allFeats:
             if not feat in thisValues: thisValues[feat] = 0
         yield (gid, thisValues)
Example #9
 def getGroupNormsWithZerosFeatsFirst(self, groups = [], where = '', blacklist = None):
     """returns a dict of (feature => group_id => group_norm)"""
     #This function gets killed on large feature sets
     gnlist = []
     if groups: 
         gCond = " group_id in ('%s')" % "','".join(str(g) for g in groups)
         if where: gnlist = self.getGroupNorms(where+" AND "+gCond)
         else: gnlist = self.getGroupNorms(gCond)
     else: 
         gnlist = self.getGroupNorms()
     gns = dict()
     print "USING BLACKLIST (from getgroupnorms): %s" %str(blacklist)
     for tup in gnlist:
         (gid, feat, gn) = tup
         if blacklist:
             if not any(r.match(feat) for r in blacklist):
                 if not feat in gns: gns[feat] = dict()
                 gns[feat][gid] = gn
         else:
             if not feat in gns: gns[feat] = dict()
             gns[feat][gid] = gn
     if not groups: groups = self.getDistinctGroups(where)
     allFeats = self.getDistinctFeatures(where)
     if blacklist:
         allFeats = list(set(allFeats) - set(blacklist))
     #fill in zeros (this can get quite big!)
     fwc.warn("Adding zeros to group norms (%d groups * %d feats)." %(len(groups), len(allFeats)))
     for feat in allFeats:
         if not feat in gns: gns[feat] = dict()
         thisGn = gns[feat]
         for gid in groups:
             if not gid in thisGn: thisGn[gid] = 0
     return gns, allFeats
Example #10
    def createTfIdfTable(self, ngram_table):
        '''
        Creates new feature table where group_norm = tf-idf (term frequency-inverse document frequency)
        :param ngram_table: table containing words/ngrams, collocs, etc...

        Written by Phil
        '''

        # tf-idf = tf*idf

        # tf (term frequency) is simply how frequently a term occurs in a document (group_norm for a given group_id)

        # each feat's idf = log(N/df)
        # N = number of documents in total (i.e. count(distinct group_id))
        # df (document frequency; `dt` in the loop below) = number of documents the feat appears in (i.e. count(distinct group_id) where feat = 'feat')

        # create new feature table
        feat_name_grabber = re.compile(r'^feat\$([^\$]+)\$') 
        feat_name = feat_name_grabber.match(ngram_table).group(1) # grabs feat_name (i.e. 1gram, 1to3gram)

        short_name = 'tf_idf_{}'.format(feat_name)
        idf_table = self.createFeatureTable(short_name, valueType = 'DOUBLE')

        #getting N
        sql = "SELECT COUNT(DISTINCT group_id) FROM %s" % ngram_table
        N = mm.executeGetList(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)[0][0]

        feat_counts = self.getFeatureCounts() #tuples of: feat, count (number of groups feature appears with)

        fwc.warn('Inserting idf values into new table')
        counter = 0
        for (feat, dt) in feat_counts:
            idf = log(N/float(dt))

            # get (group_id, group_norm) where feat = feat
            # clean_feat = mm.MySQLdb.escape_string(feat.encode('utf-8')) 

            sql = u"""SELECT group_id, value, group_norm from %s WHERE feat = \'%s\'"""%(ngram_table, mm.MySQLdb.escape_string(feat.encode('utf-8')).decode('utf-8'))

            group_id_freq = mm.executeGetList(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)

            for (group_id, value, tf) in group_id_freq:
                tf_idf = tf * idf

                insert_sql = u"INSERT INTO {} (group_id, feat, value, group_norm) VALUES (\'{}\', \'{}\', {}, {});".format(
                                                idf_table, 
                                                group_id, 
                                                mm.MySQLdb.escape_string(feat.encode('utf-8')).decode('utf-8'), 
                                                value, 
                                                tf_idf)
                mm.execute(self.corpdb, self.dbCursor, insert_sql)

                if (counter % 50000 == 0):
                    print '%d tf_idf values inserted!' % (counter)
                counter += 1

        fwc.warn('Finished inserting.')

        return idf_table
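A toy, self-contained illustration of the tf-idf arithmetic described in the comments above (the numbers are made up, not tied to any real table):

    from math import log

    N = 1000           # total documents, i.e. count(distinct group_id)
    df = 50            # documents containing the feature
    tf = 0.02          # the feature's group_norm within one document
    idf = log(N / float(df))   # ~3.0
    tf_idf = tf * idf          # ~0.06, the value written to group_norm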
Example #11
    def getWordGetter(self, lexicon_count_table=None):
        from featureGetter import FeatureGetter
        if lexicon_count_table: fwc.warn(lexicon_count_table)
        wordTable = self.getWordTable() if not lexicon_count_table else lexicon_count_table

        assert mm.tableExists(self.corpdb, self.dbCursor, wordTable), "Need to create word table to use current functionality: %s" % wordTable
        return FeatureGetter(self.corpdb, self.corptable, self.correl_field, self.mysql_host,
                             self.message_field, self.messageid_field, self.encoding, self.use_unicode, 
                             self.lexicondb, featureTable=wordTable, wordTable = wordTable)
Example #12
    def printJoinedFeatureLines(self, filename, delimeter = ' '):
        """prints feature table like a message table in format mallet can use"""

        f = open(filename, 'w')
        for (gid, featValues) in self.yieldValuesSparseByGroup():
            message = delimeter.join([delimeter.join([feat.replace(' ', '_')]*value) for feat, value in featValues.iteritems()])
            f.write("""%s en %s\n""" %(gid, message.encode('utf-8')))            
       
        f.close()
        fwc.warn("Wrote joined features file to: %s"%filename)
Example #13
    def getGroupsAndFeats(self, where=''):
        fwc.warn("Loading Features and Getting Groups.")
        groups = set()
        features = dict()
        featNames = set(self.featNames)

        for featName in featNames:
            features[featName] = dict(self.getGroupAndFeatureValues(featName, where))
            groups.update(features[featName].keys())

        return (groups, features)
Example #14
    def getGroupsAndFeats(self, where=''):
        fwc.warn("Loading Features and Getting Groups.")
        groups = set()
        features = dict()
        featNames = set(self.featNames)

        for featName in featNames:
            features[featName] = dict(
                self.getGroupAndFeatureValues(featName, where))
            groups.update(features[featName].keys())

        return (groups, features)
Example #15
    def printJoinedFeatureLines(self, filename, delimeter=' '):
        """prints feature table like a message table in format mallet can use"""

        f = open(filename, 'w')
        for (gid, featValues) in self.yieldValuesSparseByGroup():
            message = delimeter.join([
                delimeter.join([feat.replace(' ', '_')] * value)
                for feat, value in featValues.iteritems()
            ])
            f.write("""%s en %s\n""" % (gid, message.encode('utf-8')))

        f.close()
        fwc.warn("Wrote joined features file to: %s" % filename)
Example #16
 def getAnnotationTableAsDF(self, fields=['unit_id', 'worker_id', 'score'], where='', index=['unit_id', 'worker_id'], pivot=True, fillNA=False):
     """return a dataframe of unit_it, worker_id, score"""
     if fillNA and not pivot:
         fwc.warn("fillNA set to TRUE but pivot set to FALSE. No missing values will be filled.") 
     db_eng = get_db_engine(self.corpdb)
     sql = """SELECT %s, %s, %s from %s""" % tuple(fields + [self.outcome_table])
     if (where): sql += ' WHERE ' + where
     if pivot:
         if fillNA:
             return pd.read_sql(sql=sql, con=db_eng, index_col=index).unstack().fillna(value=0)
         else:
             return pd.read_sql(sql=sql, con=db_eng, index_col=index).unstack()
     else:
         return pd.read_sql(sql=sql, con=db_eng, index_col=index) 
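A hedged usage sketch (`og` is assumed to be the outcome-getter object exposing this method; column names follow the defaults above):

    # Build a unit_id x worker_id score matrix, filling missing pairs with 0.
    df = og.getAnnotationTableAsDF(pivot=True, fillNA=True)
    # After unstack(), rows are unit_ids and columns are a MultiIndex of
    # (score, worker_id); per-worker means are then one reduction away.
    per_worker_means = df.mean(axis=0)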
Example #17
 def getAnnotationTableAsDF(self, fields=['unit_id', 'worker_id', 'score'], where='', index=['unit_id', 'worker_id'], pivot=True, fillNA=False):
     """return a dataframe of unit_it, worker_id, score"""
     if fillNA and not pivot:
         fwc.warn("fillNA set to TRUE but pivot set to FALSE. No missing values will be filled.") 
     db_eng = get_db_engine(self.corpdb)
     sql = """SELECT %s, %s, %s from %s""" % tuple(fields + [self.outcome_table])
     if (where): sql += ' WHERE ' + where
     if pivot:
         if fillNA:
             return pd.read_sql(sql=sql, con=db_eng, index_col=index).unstack().fillna(value=0)
         else:
             return pd.read_sql(sql=sql, con=db_eng, index_col=index).unstack()
     else:
         return pd.read_sql(sql=sql, con=db_eng, index_col=index) 
Example #18
    def getWordGetter(self, lexicon_count_table=None):
        from featureGetter import FeatureGetter
        if lexicon_count_table: fwc.warn(lexicon_count_table)
        wordTable = self.getWordTable(
        ) if not lexicon_count_table else lexicon_count_table

        assert mm.tableExists(
            self.corpdb, self.dbCursor, wordTable
        ), "Need to create word table to use current functionality: %s" % wordTable
        return FeatureGetter(self.corpdb,
                             self.corptable,
                             self.correl_field,
                             self.mysql_host,
                             self.message_field,
                             self.messageid_field,
                             self.encoding,
                             self.use_unicode,
                             self.lexicondb,
                             featureTable=wordTable,
                             wordTable=wordTable)
Example #19
    def _getKeepSet(self, p, minimumFeatSum = 0, groupFreqThresh = 0):
        """creates a set of features occuring in less than p*|correl_field| rows"""
        #acquire the number of groups (need to base on corp table):
        featureTable = self.featureTable
        totalGroups = self.countGroups(groupFreqThresh)
        assert totalGroups > 0, 'NO GROUPS TO FILTER BASED ON (LIKELY group_freq_thresh IS TOO HIGH)'
        assert p <= 1, 'p_occ > 1 not implemented yet'
        threshold = int(round(p*totalGroups))
        fwc.warn (" %s [threshold: %d]" %(featureTable, threshold))

        #acquire counts per feature (each row will come from a different correl_field)

        featCounts = self.getFeatureCounts(groupFreqThresh) #tuples of: feat, count (number of groups feature appears with)
        
        #apply filter:
        toKeep = set()
        i = 0
        for (feat, count) in featCounts:
            if count >= threshold:
                if self.use_unicode:
                    toKeep.add(unicode(feat).lower())
                else:
                    toKeep.add(feat.lower())
            i += 1
            
            if (i % 1000000) == 0: print "    checked %d features" % i
        
        #apply secondary filter
        if minimumFeatSum > 1:
            featSums = self.getFeatureValueSums()
            for (feat, fsum) in featSums:
                if self.use_unicode:
                    feat = unicode(feat).lower()
                else:
                    feat = feat.lower()
                if feat in toKeep:
                    if fsum < minimumFeatSum:
                        toKeep.remove(feat)
            
        return toKeep
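A toy illustration of the same keep-set rule outside the class (counts and names are hypothetical; feature keys are assumed already lowercased):

    def keep_set_sketch(feat_counts, total_groups, p, minimum_feat_sum=0, feat_sums=None):
        # feat_counts: iterable of (feat, number_of_groups_containing_feat)
        threshold = int(round(p * total_groups))
        keep = set(f.lower() for f, c in feat_counts if c >= threshold)
        if minimum_feat_sum > 1 and feat_sums:
            # secondary filter on total feature frequency
            keep = set(f for f in keep if feat_sums.get(f, 0) >= minimum_feat_sum)
        return keep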
Example #20
 def getGroupNormsWithZerosFeatsFirst(self,
                                      groups=[],
                                      where='',
                                      blacklist=None):
     """returns a dict of (feature => group_id => group_norm)"""
     #This function gets killed on large feature sets
     gnlist = []
     if groups:
         gCond = " group_id in ('%s')" % "','".join(str(g) for g in groups)
         if where: gnlist = self.getGroupNorms(where + " AND " + gCond)
         else: gnlist = self.getGroupNorms(gCond)
     else:
         gnlist = self.getGroupNorms()
     gns = dict()
     print "USING BLACKLIST (from getgroupnorms): %s" % str(blacklist)
     for tup in gnlist:
         (gid, feat, gn) = tup
         if blacklist:
             if not any(r.match(feat) for r in blacklist):
                 if not feat in gns: gns[feat] = dict()
                 gns[feat][gid] = gn
         else:
             if not feat in gns: gns[feat] = dict()
             gns[feat][gid] = gn
     if not groups: groups = self.getDistinctGroups(where)
     allFeats = self.getDistinctFeatures(where)
     if blacklist:
         allFeats = list(set(allFeats) - set(blacklist))
     #fill in zeros (this can get quite big!)
     fwc.warn("Adding zeros to group norms (%d groups * %d feats)." %
              (len(groups), len(allFeats)))
     for feat in allFeats:
         if not feat in gns: gns[feat] = dict()
         thisGn = gns[feat]
         for gid in groups:
             if not gid in thisGn: thisGn[gid] = 0
     return gns, allFeats
Example #21
    def getFeatNormsWithZeros(self, groups = [], where = ''):
        """returns a dict of (group_id => feature => feat_norm) """
        fnlist = []
        if groups: 
            gCond = " group_id in ('%s')" % "','".join(str(g) for g in groups)
            if where: fnlist = self.getFeatNorms(where+" AND "+gCond)
            else: fnlist = self.getFeatNorms(gCond)
        else: 
            fnlist = self.getFeatNorms()
        fns = dict()
        for tup in fnlist:
            (gid, feat, fn) = tup
            if not gid in fns: fns[gid] = dict()
            fns[gid][feat] = float(fn)
        if not groups: groups = self.getDistinctGroups(where)

        #fill in zeros (this can get quite big!)
        meanData = self.getFeatMeanData() # feat : (mean, std, zero_mean)
        fwc.warn("Adding zeros to feat norms (%d groups * %d feats)." %(len(groups), len(meanData.keys())))
        for gid in groups:
            if not gid in fns: fns[gid] = dict()
            for feat in meanData.iterkeys():
                if not feat in fns[gid]: fns[gid][feat] = meanData[feat][2] 
        return fns, meanData.keys()
Example #22
 def getFeatValuesAndGNs(feat):
     if gns:
         try:
             if values: 
                 return (vals[feat].copy(), gns[feat].copy())
             return (None, gns[feat].copy())
         except KeyError:
             fwc.warn("Couldn't find gns for feat: %s (group_freq_thresh may be too high)" % feat)
             return (None, dict())
     else:#must query for feat
         gnDict = None 
         valDict = None
         gnlist = []
         if gCond: 
             if where: gnlist = getGroupNormsForFeat(feat, where+" AND "+gCond)
             else: gnlist = getGroupNormsForFeat(feat, gCond)
         else:
             gnlist = self.getGroupNormsForFeat(feat)
         if values:
             gnDict = dict([(g, float(gn)) for g, _, gn in gnlist])
             valDict = dict([(g, float(v)) for g, v, _ in gnlist])
         else:
             gnDict = dict([(g, float(gn)) for g, gn in gnlist])
         return (valDict, gnDict)
Example #23
 def getGroupNormsWithZeros(self, groups = [], where = ''):
     """returns a dict of (group_id => feature => group_norm)"""
     #This function gets killed on large feature sets
     gnlist = []
     if groups: 
         gCond = " group_id in ('%s')" % "','".join(str(g) for g in groups)
         if where: gnlist = self.getGroupNorms(where+" AND "+gCond)
         else: gnlist = self.getGroupNorms(gCond)
     else: 
         gnlist = self.getGroupNorms()
     gns = dict()
     for tup in gnlist:
         (gid, feat, gn) = tup
         if not gid in gns: gns[gid] = dict()
         gns[gid][feat] = gn
     if not groups: groups = self.getDistinctGroups(where)
     allFeats = self.getDistinctFeatures(where)
     #fill in zeros (this can get quite big!)
     fwc.warn("Adding zeros to group norms (%d groups * %d feats)." %(len(groups), len(allFeats)))
     for gid in groups:
         if not gid in gns: gns[gid] = dict()
         for feat in allFeats:
             if not feat in gns[gid]: gns[gid][feat] = 0
     return gns, allFeats
Example #24
    def createNewTableWithGivenFeats(self, toKeep, label, featNorm=False):
        """Creates a new table only containing the given features"""

        featureTable = self.featureTable
        numToKeep = len(toKeep)
        newTable = featureTable+'$'+label
        mm.execute(self.corpdb, self.dbCursor, "DROP TABLE IF EXISTS %s" % newTable, charset=self.encoding, use_unicode=self.use_unicode)
        fwc.warn(" %s <new table %s will have %d distinct features.>" %(featureTable, newTable, numToKeep))
        sql = """CREATE TABLE %s like %s""" % (newTable, featureTable)
        mm.execute(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)
        mm.disableTableKeys(self.corpdb, self.dbCursor, newTable, charset=self.encoding, use_unicode=self.use_unicode)
  
        num_at_time = 2000
        total = 0
        toWrite = []
        
        wsql = """INSERT INTO """+newTable+""" (group_id, feat, value, group_norm, feat_norm) values (%s, %s, %s, %s, %s)""" if featNorm else """INSERT INTO """+newTable+""" (group_id, feat, value, group_norm) values (%s, %s, %s, %s)"""

        #iterate through each row, deciding whether to keep it or not
        for featRow in self.getFeatAllSS(featNorm=featNorm):
            #print "%d %d" % (len(featRow), len(toWrite))
            if self.use_unicode and unicode(featRow[1]).lower() in toKeep:
                toWrite.append(featRow)
            elif not self.use_unicode and featRow[1].lower() in toKeep:
                toWrite.append(featRow)
            if len(toWrite) > num_at_time:
                #write those past the filter to the table
                mm.executeWriteMany(self.corpdb, self.dbCursor, wsql, toWrite, writeCursor=self.dbConn.cursor(), charset=self.encoding, use_unicode=self.use_unicode)
                total += num_at_time
                if total % 100000 == 0: fwc.warn("%.1fm feature instances written" % (total/float(1000000)))
                toWrite = []

        #catch rest:
        if len(toWrite) > 0:
            #write those past the filter to the table
            mm.executeWriteMany(self.corpdb, self.dbCursor, wsql, toWrite, writeCursor=self.dbConn.cursor(), charset=self.encoding, use_unicode=self.use_unicode)

        fwc.warn("Done inserting.\nEnabling keys.")
        mm.enableTableKeys(self.corpdb, self.dbCursor, newTable, charset=self.encoding, use_unicode=self.use_unicode)
        fwc.warn("done.")

        self.featureTable = newTable
        return newTable
Example #25
    def createAggregateFeatTableByGroup(self, valueFunc = lambda d: d):
        """combines feature tables, and groups by the given group field"""
        
        featureTable = self.featureTable

        (_, name, oldCorpTable, oldGroupField) = featureTable.split('$')[:4]
        theRest = featureTable.split('$')[4:]

        

        newTable = 'feat$agg_'+name[:12]+'$'+oldCorpTable+'$'+self.correl_field # +'$'+'$'.join(theRest)
        drop = """DROP TABLE IF EXISTS %s""" % (newTable)
        mm.execute(self.corpdb, self.dbCursor, drop, charset=self.encoding, use_unicode=self.use_unicode)

        sql = """CREATE TABLE %s like %s""" % (newTable, featureTable)
        mm.execute(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)
        sql = """ALTER TABLE %s MODIFY group_id VARCHAR(255)""" % (newTable)
        mm.execute(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)
        

        mm.disableTableKeys(self.corpdb, self.dbCursor, newTable, charset=self.encoding, use_unicode=self.use_unicode)
        
        fwc.warn("Inserting group_id, feat, and values")
        sql = "INSERT INTO %s SELECT m.%s, f.feat, sum(f.value), 0 FROM %s AS f, %s AS m where m.%s = f.group_id GROUP BY m.%s, f.feat" % (newTable,self.correl_field, featureTable, self.corptable, oldGroupField, self.correl_field)
        mm.execute(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)

        fwc.warn("Recalculating group_norms")
        sql = "UPDATE %s a INNER JOIN (SELECT group_id,sum(value) sum FROM %s GROUP BY group_id) b ON a.group_id=b.group_id SET a.group_norm=a.value/b.sum" % (newTable,newTable)
        
        mm.execute(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)
  
        # patrick changed this to be all SQL 7/21/15. Values and group norms were being calculated wrong before

        fwc.warn("Done inserting.\nEnabling keys.")
        mm.enableTableKeys(self.corpdb, self.dbCursor, newTable, charset=self.encoding, use_unicode=self.use_unicode)
        fwc.warn("done.")

        self.featureTable = newTable
        return newTable
Example #26
    def createFeatTableByDistinctOutcomes(self, outcomeGetter, controlValuesToAvg = [], outcomeRestriction = None, nameSuffix=None ):
        """Creates a new feature table, by combining values based on an outcome, then applies an averaging based on controls"""
        ##TODO: perform outcome restriction by using group freq thresh instead of uwt, for flexibility
        featureTable = self.featureTable
        outcomeTable = outcomeGetter.outcome_table
        assert len(outcomeGetter.outcome_value_fields) < 2, 'Currently, only allowed to specify one outcome.'
        outcomeField = outcomeGetter.outcome_value_fields[0]
        controlField = None
        if outcomeGetter.outcome_controls: 
            assert len(outcomeGetter.outcome_controls) < 2, 'Currently, only allowed to specify one control.'
            controlField = outcomeGetter.outcome_controls[0]
            if len(controlValuesToAvg) < 1:
                fwc.warn("getting distinct values for controls")
                controlValuesToAvg = outcomeGetter.getDistinctOutcomeValues(outcome = controlField, includeNull = False, where=outcomeRestriction)

        #create new table name:
        nameParts = featureTable.split('$')
        nameParts = map(lambda part: part.replace('16to', ''), nameParts)
        nameParts = map(lambda part: part.replace('messages', 'msgs'), nameParts)
        newTables = []
        nameSuffix = '' if not nameSuffix else '_%s'%(nameSuffix,)
        if controlField:
            for value in controlValuesToAvg:
                controlGroupName = outcomeField + '_' + controlField + '_' + str(value)
                newTables.append('feat_grpd'+ nameSuffix +'$' + '$'.join(nameParts[1:3]) + '$' + controlGroupName + '$' + '$'.join(nameParts[4:]))
        else: 
            newTables.append('feat_grpd'+ nameSuffix +'$' + '$'.join(nameParts[1:3]) + '$' + outcomeField + '$' + '$'.join(nameParts[4:]))

        #1. create table where outcome is group_id and insert values
        for newTable in newTables:
            drop = """DROP TABLE IF EXISTS %s""" % (newTable)
            sql = "create table %s like %s" % (newTable, featureTable)
            mm.execute(self.corpdb, self.dbCursor, drop, charset=self.encoding, use_unicode=self.use_unicode)
            mm.execute(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)
            sql = 'ALTER TABLE %s ADD COLUMN `N` int(16) not null default -1'%(newTable)
            mm.execute(self.corpdb, self.dbCursor, sql)
            sql = 'ALTER TABLE %s CHANGE feat_norm std_dev FLOAT' % newTable;
            mm.execute(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)

            
        outres = outcomeRestriction
        outres = outres + ' AND ' if outres else '' #only need and if it exists
        if controlField:
            for outcomeValue, cntrlcounts in \
                    outcomeGetter.getDistinctOutcomeAndControlValueCounts(control = controlField, includeNull = False, where=outcomeRestriction).iteritems():
                for cvalue, count in cntrlcounts.iteritems():
                    if cvalue in controlValuesToAvg:
                        newTable = 'feat_grpd'+ nameSuffix + '$' + '$'.join(nameParts[1:3]) + '$' + outcomeField + '_' + controlField + '_' + str(cvalue) + '$' + '$'.join(nameParts[4:])
                        print "on %s %s and %s %s, count: %d" % (outcomeField, str(outcomeValue), controlField, str(cvalue), count)
                        sql = "INSERT INTO %s (group_id, feat, value, group_norm, std_dev, N) SELECT age, feat, total_freq, mean_rel_freq, SQRT((N_no_zero*(POW((mean_no_zero - mean_rel_freq), 2) + std_no_zero*std_no_zero) + (N - N_no_zero)*(mean_rel_freq * mean_rel_freq)) / N) as std, N  from (SELECT b.%s, feat, SUM(value) as total_freq, SUM(group_norm)/%d as mean_rel_freq, AVG(group_norm) as mean_no_zero, std(group_norm) as std_no_zero, %d as N, count(*) as N_no_zero FROM %s AS a, %s AS b WHERE %s b.%s = '%s' AND b.%s = '%s' AND b.user_id = a.group_id group by b.%s, a.feat) as stats" % (newTable, outcomeField, count, count, featureTable, outcomeTable, outres, controlField, str(cvalue), outcomeField, str(outcomeValue), outcomeField)
#SELECT age, feat, total_freq, mean_rel_freq, SQRT((N_no_zero*(POW((mean_no_zero - mean_rel_freq), 2) + std_no_zero*std_no_zero) + (N - N_no_zero)*(mean_rel_freq * mean_rel_freq)) / N) as std, N  from (
#SELECT b.age, feat, SUM(value) as total_freq, SUM(group_norm)/390 as mean_rel_freq, AVG(group_norm) as mean_no_zero, std(group_norm) as std_no_zero, 390 as N, count(*) as N_no_zero FROM feat$1gram$messages_en$user_id$16to16$0_01 AS a, masterstats_andy AS b WHERE UWT >= 1000 AND b.age = '45' AND b.user_id = a.group_id group by b.age, a.feat) as a             
                        mm.execute(self.corpdb, self.dbCursor, sql, False, charset=self.encoding, use_unicode=self.use_unicode)
                    else:
                        print "skipping %s %s and %s %s, count: %d because control value not in list" % (outcomeField, str(outcomeValue), controlField, str(cvalue), count)
        else: #no controls to avg
            # Maarten
            correspondences = outcomeGetter.getGroupAndOutcomeValues()
            correspondences_inv = {}
            for k,v in correspondences:
                correspondences_inv[v] = correspondences_inv.get(v,[])
                correspondences_inv[v].append(k)
            correspondences = correspondences_inv
            total_sum_values = {i[0]: long(i[1]) for i in self.getSumValuesByGroup()}

            i = 0
            j = 0
            for outcomeValue, groups in correspondences.iteritems():
                i += 1
                rows = []
                groups_nonZero = [g for g in groups if g in total_sum_values]
                for feat, values, gns, Nfeats in self.yieldGroupNormsWithZerosByFeat(groups = groups, values = True):
                    if not values: continue

                    sum_value = sum(values.values())
                    total_sum_value = sum(total_sum_values[g] for g in groups_nonZero)
                    group_norm = float(sum_value)/total_sum_value
                    std_dev = std(gns.values())
                    N = len(gns)
                    rows.append([outcomeValue, feat, sum_value, group_norm, std_dev, N])
                    if len(rows) >= 10000:
                        sql = "INSERT INTO %s (group_id, feat, value, group_norm, std_dev, N) " % newTable
                        sql += "VALUES (%s)" % ', '.join('%s' for r in rows[0]) 
                        mm.executeWriteMany(self.corpdb, self.dbCursor, sql, rows, writeCursor=self.dbConn.cursor(), charset=self.encoding, use_unicode=self.use_unicode)
                        j += len(rows)
                        print "    wrote %d rows [finished %d outcome_values]" % (j, i)
                        rows = []
                    
                if rows:
                    sql = "INSERT INTO %s (group_id, feat, value, group_norm, std_dev, N) " % newTable
                    sql += "VALUES (%s)" % ', '.join('%s' for r in rows[0])
                    mm.executeWriteMany(self.corpdb, self.dbCursor, sql, rows, writeCursor=self.dbConn.cursor(), charset=self.encoding, use_unicode=self.use_unicode)
                    j += len(rows)  
                    print "    wrote %d rows [finished %d outcome_values]" % (j, i)
                print "Inserted into %s" % newTable

                        
                """
                for outcomeValue, count in outcomeGetter.getDistinctOutcomeValueCounts(includeNull = False, where=outcomeRestriction).iteritems():
                
                newTable = 'feat_grpd'+ '$' + '$'.join(nameParts[1:3]) + '$' + outcomeField + '$' + '$'.join(nameParts[4:])
                print "on %s %s, count: %d (no control)" % (outcomeField, str(outcomeValue), count)
                sql = "INSERT INTO %s (group_id, feat, value, group_norm, std_dev, N) SELECT %s, feat, total_freq, mean_rel_freq, SQRT((N_no_zero*(POW((mean_no_zero - mean_rel_freq), 2) + std_no_zero*std_no_zero) + (N - N_no_zero)*(mean_rel_freq * mean_rel_freq)) / N) as std, N  from (SELECT group_id, feat, SUM(value) as total_freq, SUM(group_norm)/%d as mean_rel_freq, AVG(group_norm) as mean_no_zero, std(group_norm) as std_no_zero, count(1) as N_no_zero, %d as N FROM %s) AS a, %s AS b WHERE %s b.%s = '%s' AND b.%s = a.group_id group by b.%s, a.feat" % (newTable, outcomeField,  count, count, featureTable, outcomeTable, outres, outcomeField, str(outcomeValue), self.correl_field ,outcomeField)
                # print "Maarten", self.correl_field, sql
                mm.execute(self.corpdb, self.dbCursor, sql, False)"""
        #2: Combine feature table to take average of controls:
        #controlGroupAvgName = outcomeField + '_' + controlField + 'avg_' + '_'.join(map(lambda v: str(v), controlValuesToAvg))
        if controlField and len(newTables) > 1:  
            controlGroupAvgName = outcomeField + '_' + controlField + 'avg'
            avgTable = 'feat_grpd'+ nameSuffix + '$' + '$'.join(nameParts[1:3]) + '$' + controlGroupAvgName + '$' + '$'.join(nameParts[4:])
            drop = """DROP TABLE IF EXISTS %s""" % (avgTable)
            sql = "create table %s like %s" % (avgTable, newTables[0])
            mm.execute(self.corpdb, self.dbCursor, drop, charset=self.encoding, use_unicode=self.use_unicode)
            mm.execute(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)

            #create insert fields:
            shortNames = map(lambda i: chr(ord('a')+i), range(len(newTables)))
            tableNames = ', '.join(map(lambda i: "%s as %s" % (newTables[i], shortNames[i]), range(len(newTables))))
            values = "(%s)" % (' + '.join(map(lambda name: "%s.value" % (name), shortNames))) + ' / ' + str(len(shortNames))
            groupNorms = "(%s)" % (' + '.join(map(lambda name: "%s.group_norm" % (name), shortNames))) + ' / ' + str(len(shortNames))
            Ns = ' + '.join(map(lambda name: "%s.N" % (name), shortNames))
            stdDev = "(%s)" % (' + '.join(map(lambda name: "POW(%s.group_norm - (%s), 2) + POW(%s.std_dev, 2)" % (name, groupNorms, name), shortNames))) + ' / ' + str(len(shortNames))
            stdDev = "SQRT(%s)" % stdDev
            #stdDev = "SQRT(%s)" % (' + '.join(map(lambda name: "%s.N*(POW(%s.group_norm - %s, 2) + POW(%s.std_dev,2))" % (name, name, groupNorms, name), shortNames))) + ' / ' + Ns
            #stdDev = "(%s)" % (' + '.join(map(lambda name: "%s.std_dev_no_zero" % (name), shortNames))) + ' / ' + str(len(shortNames))


            #create joins
            groupIds = map(lambda name: "%s.group_id" % (name), shortNames)
            feats = map(lambda name: "%s.feat" % (name), shortNames)
            groupIdJoins = []
            featJoins = []
            for i in xrange(len(groupIds) - 1):
                groupIdJoins.append('%s = %s' % (groupIds[i], groupIds[i+1]))
                featJoins.append('%s = %s' % (feats[i], feats[i+1]))
            groupIdJoins = ' AND '.join(groupIdJoins)
            featJoins = ' AND '.join(featJoins)

            #call SQL
            sql = "INSERT INTO %s (group_id, feat, value, group_norm, std_dev, N) SELECT a.group_id, a.feat, %s, %s, %s, %s FROM %s where %s AND %s" % \
                (avgTable, values, groupNorms, stdDev, Ns, tableNames, groupIdJoins, featJoins)
            print "Populating AVG table with command: %s" % sql
            mm.execute(self.corpdb, self.dbCursor, sql, False, charset=self.encoding, use_unicode=self.use_unicode)
Example #27
    def yieldGroupNormsWithZerosByFeat(self,
                                       groups=[],
                                       where='',
                                       values=False,
                                       feats=[]):
        """yields (feat, groupnorms, number of features"""
        """ or if values = True, (feat, values, groupnorms, number of features)"""
        allFeats = feats
        if not feats:
            allFeats = self.getDistinctFeatures(where)
        else:
            fwc.warn("feats restricted to %s" % feats)

        numFeats = len(allFeats)
        gCond = None
        if groups:
            gCond = " group_id in ('%s')" % "','".join(str(g) for g in groups)
        else:
            groups = self.getDistinctGroups(where)
        numGroups = len(groups)

        getGroupNorms = self.getGroupNorms
        getGroupNormsForFeat = self.getGroupNormsForFeat
        getGroupNormsForFeats = self.getGroupNormsForFeats
        if values:
            getGroupNorms = self.getValuesAndGroupNorms
            getGroupNormsForFeat = self.getValuesAndGroupNormsForFeat
            getGroupNormsForFeats = self.getValuesAndGroupNormsForFeats

        #figure out if too big for memory:
        fwc.warn("Yielding norms with zeros (%d groups * %d feats)." %
                 (len(groups), numFeats))
        gns = dict()
        vals = dict()  #only gets field if values is true
        if (numFeats * numGroups) < 12500000 * fwc.GIGS_OF_MEMORY:
            #statically acquire all gns
            gnlist = []
            if gCond:
                if where: gnlist = getGroupNorms(where + " AND " + gCond)
                else: gnlist = getGroupNorms(gCond)
            else:  #don't need to specify groups
                gnlist = getGroupNorms()
            if feats:
                if where:
                    where = " AND ".join(
                        [where, "feat IN ('" + "','".join(feats) + "')"])
                else:
                    where = " feat IN ('" + "','".join(feats) + "')"

            for tup in gnlist:
                (gid, feat) = tup[0:2]
                if not feat in gns:
                    gns[feat] = dict()
                    if values:
                        vals[feat] = dict()
                gns[feat][gid] = float(tup[-1])
                if values:
                    vals[feat][gid] = float(tup[2])
        else:
            fwc.warn(
                "Too big to keep gns in memory, querying for each feature (slower, but less memory intensive)"
            )

        def getFeatValuesAndGNs(feat):
            if gns:
                try:
                    if values:
                        return (vals[feat].copy(), gns[feat].copy())
                    return (None, gns[feat].copy())
                except KeyError:
                    fwc.warn(
                        "Couldn't find gns for feat: %s (group_freq_thresh may be too high)"
                        % feat)
                    return (None, dict())
            else:  #must query for feat
                gnDict = None
                valDict = None
                gnlist = []
                if gCond:
                    if where:
                        gnlist = getGroupNormsForFeat(feat,
                                                      where + " AND " + gCond)
                    else:
                        gnlist = getGroupNormsForFeat(feat, gCond)
                else:
                    gnlist = self.getGroupNormsForFeat(feat)
                if values:
                    gnDict = dict([(g, float(gn)) for g, _, gn in gnlist])
                    valDict = dict([(g, float(v)) for g, v, _ in gnlist])
                else:
                    gnDict = dict([(g, float(gn)) for g, gn in gnlist])
                return (valDict, gnDict)

        #fill in zeros (this can get quite big!)
        for feat in allFeats:
            (valDict, gnDict) = getFeatValuesAndGNs(feat)
            for gid in groups:
                if not gid in gnDict:  #add zeros!
                    gnDict[gid] = 0
                    if values and valDict: valDict[gid] = 0
            if values:
                yield (feat, valDict, gnDict, numFeats)
            else:
                yield (feat, gnDict, numFeats)
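A hedged usage sketch (`fg` is assumed to be a FeatureGetter-like object; the group ids are placeholders):

    # Stream one dense group_norm dict per feature without holding the full matrix.
    for feat, gns, numFeats in fg.yieldGroupNormsWithZerosByFeat(groups=['g1', 'g2']):
        nonzero = sum(1 for v in gns.values() if v != 0)
        print("%s: %d of %d groups nonzero" % (feat, nonzero, len(gns)))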
Example #28
    def getGroupsAndOutcomes(self, lexicon_count_table=None, groupsWhere = '', includeFoldLabels=False):
        if self.group_freq_thresh and self.wordTable != self.get1gramTable():
            fwc.warn("""You specified a --word_table and --group_freq_thresh is
enabled, so the total word count for your groups might be off
(remove "--word_table WT" to solve this issue)""", attention=False)
            
        groups = set()
        outcomes = dict()
        outcomeFieldList = set(self.outcome_value_fields).union(set(self.outcome_controls)).union(set(self.outcome_interaction))
        ocs = dict()
        controls = dict()
        folds = dict()

        #get outcome values:
        fwc.warn("Loading Outcomes and Getting Groups for: %s" % str(outcomeFieldList)) #debug
        if outcomeFieldList:
            for outcomeField in outcomeFieldList:
                outcomes[outcomeField] = dict(self.getGroupAndOutcomeValues(outcomeField))
                if outcomeField in self.outcome_value_fields:
                    groups.update(outcomes[outcomeField].keys())
            

            if self.group_freq_thresh:
                where = """ group_id in ('%s')""" % ("','".join(str(g) for g in groups))
                groupCnts = self.getGroupWordCounts(where, lexicon_count_table = lexicon_count_table)
                groups = set()
                for outcomeField, outcomeValues in outcomes.iteritems():
                    newOutcomes = dict()
                    for gId in outcomeValues.iterkeys():
                        if (gId in groupCnts) and (groupCnts[gId] >= self.group_freq_thresh):
                            #keep
                            # newOutcomes[gId] = float(outcomeValues[gId])
                            newOutcomes[gId] = outcomeValues[gId]
                    outcomes[outcomeField] = newOutcomes
                    if outcomeField in self.outcome_value_fields:
                        groups.update(newOutcomes.keys())

            #set groups:
            for k in self.outcome_controls + self.outcome_interaction:
                groups = groups & set(outcomes[k].keys()) #always intersect with controls
            if groupsWhere:
                outcm = groupsWhere.split()[0].strip()
                # val = groupsWhere.split('=')[1].strip()
                # # print "Maarten getGroupsAndOutcomes", [groupsWhere, outcm, val]
                # whereusers = set([i[0] for i in self.getGroupAndOutcomeValues(outcm) if str(i[1]) == val])
                whereusers = set([i[0] for i in self.getGroupAndOutcomeValues(outcm, where=groupsWhere)])
                groups = groups & whereusers

            if self.oneGroupSetForAllOutcomes:
                for k in self.outcome_value_fields:
                    groups = groups & set(outcomes[k].keys()) # only intersect if wanting all the same groups
            
            #split into outcomes and controls:
            ocs = dict()
            controls = dict()
            for k in self.outcome_controls + self.outcome_interaction:
                outcomeDict = outcomes[k]
                outcomeDict = dict([(g, v) for g, v in outcomeDict.iteritems() if g in groups])
                controls[k] = outcomeDict
            for k in self.outcome_value_fields:
                outcomeDict = outcomes[k]
                outcomeDict = dict([(g, v) for g, v in outcomeDict.iteritems() if g in groups])
                ocs[k] = outcomeDict
        elif self.group_freq_thresh:
            groupCnts = self.getGroupWordCounts(where = None, lexicon_count_table = lexicon_count_table)
            groups = set()
            for gId, cnt in groupCnts.iteritems():
                if cnt >= self.group_freq_thresh:
                    groups.add(gId)
            if groupsWhere:
                outcm = groupsWhere.split('=')[0].strip()
                val = groupsWhere.split('=')[1].strip()
                # print "Maarten getGroupsAndOutcomes", [groupsWhere, outcm, val]
                whereusers = set([i[0] for i in self.getGroupAndOutcomeValues(outcm) if str(i[1]) == val])
                groups = groups & whereusers

        if self.fold_column:
            folds = dict(self.getGroupAndOutcomeValues(self.fold_column))

        if includeFoldLabels:
            return (groups, ocs, controls, folds)
        else:
            return (groups, ocs, controls)
Example #29
    def getContingencyArrayFeatNorm(self, where = ''):
        """ returns a list of lists: each row is a group_id and each col is a feature"""
        """ the first row has a blank first entry and then a list of unique features"""
        """ the first column has a blank first entry and then a list of unique group_ids"""
        fwc.warn("running getContingencyArrayFeatNorm")

        fwc.warn("Getting distinct feature / groupId lists and (feat, featNormZero) list")
        distinctFeatureList = self.getDistinctFeatures( where )
        featureZeroList = self.getFeatureZeros( where )
        distinctGroupList = self.getDistinctGroups( where )

        fwc.warn("Converting feature / groupId lists to dictionaries (item: index) for quick insertion")
        distinctFeatureDict = {}
        counter = 0
        for feature in distinctFeatureList:
            distinctFeatureDict[feature] = counter
            counter += 1

        distinctGroupDict = {}
        counter = 0
        for group in distinctGroupList:
            distinctGroupDict[group] = counter
            counter += 1
        
        fwc.warn("Making a 2d array (matrix) with ncol = nDistinctFeatures and nrow = nDistinctGroupIds")
        fwc.warn("For each distinct feature, intializing that column with feat norm zeros' value")
        contingencyMatrix = zeros( ( len(distinctGroupList), len(distinctFeatureList) ) )
        for tup in featureZeroList:
            (feat, featNormZero) = tup
            columnIndexToZero = distinctFeatureDict[ feat ] 
            contingencyMatrix[ :, columnIndexToZero ] = featNormZero

        fwc.warn("calling getFeatNormsSS, iterating through (with SS cursor)")
        fwc.warn("for each iteration, using the index dictionaries to insert the entry into the matrix")
        ssCursor = self.getFeatNormsSS( where )
        for tup in ssCursor:
            (gid, feat, featNorm) = tup
            columnIndexForInsertion = distinctFeatureDict[ feat ]
            rowIndexForInsertion = distinctGroupDict[ gid ]
            contingencyMatrix[ rowIndexForInsertion, columnIndexForInsertion ] = featNorm

        fwc.warn("returning [contingency matrix, rownames (distinct groups), and colnames (distinct features)]")
        return [ contingencyMatrix, distinctGroupList, distinctFeatureList ]
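A small self-contained sketch of the same fill-in pattern, using toy data (names and values are hypothetical):

    from numpy import zeros

    feats = ['a', 'b', 'c']
    groups = ['g1', 'g2']
    fidx = dict((f, i) for i, f in enumerate(feats))
    gidx = dict((g, i) for i, g in enumerate(groups))
    m = zeros((len(groups), len(feats)))
    for gid, feat, featNorm in [('g1', 'a', 0.5), ('g2', 'c', -1.2)]:
        m[gidx[gid], fidx[feat]] = featNorm
    # rows follow `groups`, columns follow `feats`; untouched cells stay 0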
Example #30
    def getGroupsAndOutcomes(self, groupThresh = 0, lexicon_count_table=None, groupsWhere = ''):
        if groupThresh and self.wordTable != self.get1gramTable():
            fwc.warn("""###################################################################
WARNING: You specified a --word_table and --group_freq_thresh is
enabled, so the total word count for your groups might be off
(remove "--word_table WT" to solve this issue)
###################################################################""")
            
        groups = set()
        outcomes = dict()
        outcomeFieldList = set(self.outcome_value_fields).union(set(self.outcome_controls)).union(set(self.outcome_interaction))
        ocs = dict()
        controls = dict()

        #get outcome values:
        fwc.warn("Loading Outcomes and Getting Groups for: %s" % str(outcomeFieldList)) #debug
        if outcomeFieldList:
            for outcomeField in outcomeFieldList:
                outcomes[outcomeField] = dict(self.getGroupAndOutcomeValues(outcomeField))
                if outcomeField in self.outcome_value_fields:
                    groups.update(outcomes[outcomeField].keys())
            

            if groupThresh:
                where = """ group_id in ('%s')""" % ("','".join(str(g) for g in groups))
                groupCnts = self.getGroupWordCounts(where, lexicon_count_table = lexicon_count_table)
                groups = set()
                for outcomeField, outcomeValues in outcomes.iteritems():
                    newOutcomes = dict()
                    for gId in outcomeValues.iterkeys():
                        if (gId in groupCnts) and (groupCnts[gId] >= groupThresh):
                            #keep
                            # newOutcomes[gId] = float(outcomeValues[gId])
                            newOutcomes[gId] = outcomeValues[gId]
                    outcomes[outcomeField] = newOutcomes
                    if outcomeField in self.outcome_value_fields:
                        groups.update(newOutcomes.keys())

            #set groups:
            for k in self.outcome_controls + self.outcome_interaction:
                groups = groups & set(outcomes[k].keys()) #always intersect with controls
            if groupsWhere:
                outcm = groupsWhere.split('=')[0].strip()
                val = groupsWhere.split('=')[1].strip()
                # print "Maarten getGroupsAndOutcomes", [groupsWhere, outcm, val]
                whereusers = set([i[0] for i in self.getGroupAndOutcomeValues(outcm) if str(i[1]) == val])
                groups = groups & whereusers

            if self.oneGroupSetForAllOutcomes:
                for k in self.outcome_value_fields:
                    groups = groups & set(outcomes[k].keys()) #only intersect if wanting all the same groups
            
            #split into outcomes and controls:
            ocs = dict()
            controls = dict()
            for k in self.outcome_controls + self.outcome_interaction:
                outcomeDict = outcomes[k]
                outcomeDict = dict([(g, v) for g, v in outcomeDict.iteritems() if g in groups])
                controls[k] = outcomeDict
            for k in self.outcome_value_fields:
                outcomeDict = outcomes[k]
                outcomeDict = dict([(g, v) for g, v in outcomeDict.iteritems() if g in groups])
                ocs[k] = outcomeDict
        elif groupThresh:
            groupCnts = self.getGroupWordCounts(where = None, lexicon_count_table = lexicon_count_table)
            groups = set()
            for gId, cnt in groupCnts.iteritems():
                if cnt >= groupThresh:
                    groups.add(gId)
            if groupsWhere:
                #groupsWhere is a 'field=value' filter; split only on the first '=' so values may contain '='
                outcm, val = [s.strip() for s in groupsWhere.split('=', 1)]
                whereusers = set([i[0] for i in self.getGroupAndOutcomeValues(outcm) if str(i[1]) == val])
                groups = groups & whereusers

        return (groups, ocs, controls)
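The 'field=value' parsing of groupsWhere appears twice in the method above; a minimal sketch of a shared helper both branches could call (the name _groupsMatchingWhere is hypothetical, not part of the original class):

    def _groupsMatchingWhere(self, groupsWhere):
        """hypothetical helper: return the set of group_ids whose outcome field matches
        a 'field=value' filter such as --groups_where 'gender=1'"""
        field, val = [s.strip() for s in groupsWhere.split('=', 1)]
        return set(gid for gid, v in self.getGroupAndOutcomeValues(field) if str(v) == val)

Each branch would then intersect groups with the returned set, keeping the filter logic in one place.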
Example #31
0
    def yieldGroupNormsWithZerosByFeat(self, groups = [], where = '', values = False, feats = []):
        """yields (feat, groupnorms, number of features"""
        """ or if values = True, (feat, values, groupnorms, number of features)"""
        allFeats = feats
        if not feats: 
            allFeats = self.getDistinctFeatures(where)
        else:
            fwc.warn("feats restricted to %s" % feats)
        
        numFeats = len(allFeats)
        gCond = None
        if groups: 
            gCond = " group_id in ('%s')" % "','".join(str(g) for g in groups)
        else: 
            groups = self.getDistinctGroups(where)
        numGroups = len(groups)

        getGroupNorms = self.getGroupNorms
        getGroupNormsForFeat = self.getGroupNormsForFeat
        getGroupNormsForFeats = self.getGroupNormsForFeats
        if values:
            getGroupNorms = self.getValuesAndGroupNorms
            getGroupNormsForFeat = self.getValuesAndGroupNormsForFeat
            getGroupNormsForFeats = self.getValuesAndGroupNormsForFeats
            
        #figure out if too big for memory:
        fwc.warn("Yielding norms with zeros (%d groups * %d feats)." %(len(groups), numFeats))
        gns = dict()
        vals = dict() #only gets field if values is true
        if feats:
            #restrict queries to the requested features (this must happen before any norms are fetched)
            featCond = "feat IN ('" + "','".join(feats) + "')"
            where = " AND ".join([where, featCond]) if where else featCond
        if (numFeats * numGroups) < 12500000*fwc.GIGS_OF_MEMORY:
            #statically acquire all gns
            gnlist = []
            if gCond:
                if where: gnlist = getGroupNorms(where+" AND "+gCond)
                else: gnlist = getGroupNorms(gCond)
            elif where: #no group restriction, but where may carry the feat filter
                gnlist = getGroupNorms(where)
            else: #no restrictions needed
                gnlist = getGroupNorms()

            for tup in gnlist:
                (gid, feat) = tup[0:2]
                if not feat in gns: 
                    gns[feat] = dict()
                    if values:
                        vals[feat] = dict()
                gns[feat][gid] = float(tup[-1])
                if values:
                    vals[feat][gid] = float(tup[2])
        else:
            fwc.warn("Too big to keep gns in memory, querying for each feature (slower, but less memory intensive)")

        def getFeatValuesAndGNs(feat):
            if gns:
                try:
                    if values: 
                        return (vals[feat].copy(), gns[feat].copy())
                    return (None, gns[feat].copy())
                except KeyError:
                    fwc.warn("Couldn't find gns for feat: %s (group_freq_thresh may be too high)" % feat)
                    return (None, dict())
            else:#must query for feat
                gnDict = None 
                valDict = None
                gnlist = []
                if gCond: 
                    if where: gnlist = getGroupNormsForFeat(feat, where+" AND "+gCond)
                    else: gnlist = getGroupNormsForFeat(feat, gCond)
                elif where:
                    gnlist = getGroupNormsForFeat(feat, where)
                else:
                    gnlist = getGroupNormsForFeat(feat) #use the alias so the values flag is respected
                if values:
                    gnDict = dict([(g, float(gn)) for g, _, gn in gnlist])
                    valDict = dict([(g, float(v)) for g, v, _ in gnlist])
                else:
                    gnDict = dict([(g, float(gn)) for g, gn in gnlist])
                return (valDict, gnDict)


        #fill in zeros (this can get quite big!)
        for feat in allFeats:
            (valDict, gnDict) = getFeatValuesAndGNs(feat)
            for gid in groups:
                if not gid in gnDict: #add zeros!
                    gnDict[gid] = 0
                    if values and valDict: valDict[gid] = 0
            if values:
                yield (feat, valDict, gnDict, numFeats)
            else:
                yield (feat, gnDict, numFeats)
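
A minimal usage sketch for the generator above; the FeatureGetter-like object fg and the group ids are hypothetical. Because features are yielded one at a time, wide feature tables can be scanned without holding the full group-by-feature matrix in memory:

# hypothetical usage sketch
for feat, valueDict, gnDict, numFeats in fg.yieldGroupNormsWithZerosByFeat(
        groups=['user1', 'user2'], values=True):
    # gnDict maps every requested group_id to a group_norm (0 when the group never
    # used this feature); valueDict holds the corresponding raw values
    print feat, len(gnDict), sum(gnDict.itervalues())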
Example #32
0
# Interfaces with FeatureWorker and scikit-learn
# to perform prediction of outcomes for language features.
#
# example: predicting satisfaction with life score given language use
#
# example usage: ./featureWorker.py --outcome_fields SWL --train_regression

from fwConstants import warn
import cPickle as pickle

try:
    from rpy2.robjects.packages import importr
    import rpy2.robjects as ro
    from rpy2.rinterface import RNULLType
except ImportError:
    warn("rpy2 cannot be imported")
    pass

import pandas as pd
try:
    import pandas.rpy.common as com
except ImportError:
    warn("pandas.rpy.common cannot be imported")
    pass

from inspect import ismethod
import sys
import random
from itertools import combinations

# scikit-learn imports
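
The excerpt is cut off before the scikit-learn imports. The try/except ImportError blocks above let the module load on machines without R; a minimal sketch of how R-dependent code paths might be guarded (the HAS_RPY2 flag and require_rpy2 helper are hypothetical, not part of the original module):

try:
    import rpy2.robjects as ro
    HAS_RPY2 = True
except ImportError:
    HAS_RPY2 = False

def require_rpy2():
    """hypothetical guard: call at the top of any R-dependent method so users get a
    clear error message instead of a NameError deep inside the call"""
    if not HAS_RPY2:
        raise RuntimeError("rpy2 is not installed; R-based functionality is unavailable")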
Example #33
0
    def getContingencyArrayFeatNorm(self, where=''):
        """ returns a list of lists: each row is a group_id and each col is a feature"""
        """ the first row has a blank first entry and then a list of unique features"""
        """ the first column has a blank first entry and then a list of unique group_ids"""
        fwc.warn("running getContingencyArrayFeatNorm")

        fwc.warn(
            "Getting distinct feature / groupId lists and (feat, featNormZero) list"
        )
        distinctFeatureList = self.getDistinctFeatures(where)
        featureZeroList = self.getFeatureZeros(where)
        distinctGroupList = self.getDistinctGroups(where)

        fwc.warn(
            "Converting feature / groupId lists to dictionaries (item: index) for quick insertion"
        )
        distinctFeatureDict = dict((feature, idx) for idx, feature in enumerate(distinctFeatureList))

        distinctGroupDict = dict((group, idx) for idx, group in enumerate(distinctGroupList))

        fwc.warn(
            "Making a 2d array (matrix) with ncol = nDistinctFeatures and nrow = nDistinctGroupIds"
        )
        fwc.warn(
            "For each distinct feature, initializing that column with the feature's zero feat-norm value"
        )
        contingencyMatrix = zeros(
            (len(distinctGroupList), len(distinctFeatureList)))
        for tup in featureZeroList:
            (feat, featNormZero) = tup
            columnIndexToZero = distinctFeatureDict[feat]
            contingencyMatrix[:, columnIndexToZero] = featNormZero

        fwc.warn("calling getFeatNormsSS, iterating through (with SS cursor)")
        fwc.warn(
            "for each iteration, using the index dictionaries to insert the entry into the matrix"
        )
        ssCursor = self.getFeatNormsSS(where)
        for tup in ssCursor:
            (gid, feat, featNorm) = tup
            columnIndexForInsertion = distinctFeatureDict[feat]
            rowIndexForInsertion = distinctGroupDict[gid]
            contingencyMatrix[rowIndexForInsertion,
                              columnIndexForInsertion] = featNorm

        fwc.warn(
            "returning [contingency matrix, rownames (distinct groups), and colnames (distinct features)]"
        )
        return [contingencyMatrix, distinctGroupList, distinctFeatureList]
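
A minimal usage sketch for the returned triple; fg stands in for the object exposing this method, and the pandas wrapping is illustrative rather than part of the original code:

import pandas as pd

# hypothetical usage: label the matrix with its group and feature names
matrix, groupIds, featNames = fg.getContingencyArrayFeatNorm()
df = pd.DataFrame(matrix, index=groupIds, columns=featNames)
print df.shape  # (number of distinct groups, number of distinct features)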
Example #34
0
    def createTableWithBinnedFeats(self, num_bins, group_id_range, groupfreqthresh, valueFunc = lambda x:x, 
                                   gender=None, genderattack=False, reporting_percent=0.04, outcomeTable = fwc.DEF_OUTCOME_TABLE, skip_binning=False):
        featureTable = self.featureTable
        group_id_range = map(int, group_id_range)
        newTable = featureTable+'$'+str(num_bins)+'b_'+'_'.join(map(str,group_id_range))
        if skip_binning: return newTable

        sql = 'DROP TABLE IF EXISTS %s'%newTable
        mm.execute(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)
        sql = "CREATE TABLE %s like %s" % (newTable, featureTable)
        mm.execute(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)

        groupNs = mm.executeGetList(self.corpdb, self.dbCursor, 'SELECT group_id, N FROM %s GROUP BY group_id'%self.featureTable, charset=self.encoding, use_unicode=self.use_unicode)
        groupIdToN = dict(groupNs)
        #pprint(groupIdToN)
        total_freq = sum(map(lambda x:x[1], groupNs))
        bin_size = float(total_freq) / float(num_bins+2)

        num_groups = len(groupNs)
        reporting_int = fwc._getReportingInt(reporting_percent, num_groups)

        # figure out the bins, i.e. if group_id's 1,2,3 total value is greater than "bin_size" our first bin is 1_3.
        fwc.warn('determining the number of bins...')
        current_sum = 0
        current_lower_group = groupNs[0][0]

        current_upper_group = None
        next_group_is_lower_group = False
        bin_groups = OrderedDict()
        gg = 0
        for group, value in groupNs:
            if next_group_is_lower_group:
                current_lower_group = group
                next_group_is_lower_group = False
            current_sum += value
            current_upper_group = group
            if current_sum >= bin_size:
                current_sum = 0
                bin_groups[(current_lower_group, current_upper_group)]  = '_'.join(map(str,[current_lower_group, current_upper_group]))
                next_group_is_lower_group = True
            gg += 1
            fwc._report('group_id\'s', gg, reporting_int, num_groups)
        if current_sum > 0: #any leftover groups that never reached bin_size form a final, smaller bin
            bin_groups[(current_lower_group, current_upper_group)]  = '_'.join(map(str,[current_lower_group, current_upper_group]))

        max_label_length = max(map(len, bin_groups.values()))

        sql = 'ALTER TABLE %s MODIFY COLUMN group_id VARCHAR(%d)'%(newTable, max_label_length) #this action preserves the index
        mm.execute(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)
        sql = 'ALTER TABLE %s ADD COLUMN `bin_center` float(6) not null default -1.0'%(newTable)
        mm.execute(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)
        sql = 'ALTER TABLE %s ADD COLUMN `bin_center_w` float(6) not null default -1.0'%(newTable)
        mm.execute(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)
        sql = 'ALTER TABLE %s ADD COLUMN `bin_width` int(10) not null default -1'%(newTable)
        mm.execute(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)
        mm.disableTableKeys(self.corpdb, self.dbCursor, newTable, charset=self.encoding, use_unicode=self.use_unicode)

        # for each newly denoted bin: e.g. 1_3, 4_5, 6_6, ... get the new feature value counts / group norms; insert them into the new table
        # e.g. 1 'hi' 5, 2 'hi' 10, 3 'hi' 30 ==> 1_3 'hi' 45  (of course include group_norm also)
        fwc.warn('aggregating the newly binned feature values / group_norms into the new table...')
        isql = 'INSERT INTO %s (group_id, feat, value, group_norm, std_dev, N, bin_center, bin_center_w, bin_width) VALUES (%s)'%(newTable, '%s, %s, %s, %s, %s, %s, %s, %s, %s')
        #isql = 'INSERT INTO %s (group_id, feat, value, group_norm, N, bin_center, bin_width) VALUES (%s)'%(newTable, '%s, %s, %s, %s, %s, %s, %s')
        ii_bins = 0
        num_bins = len(bin_groups.keys())
        reporting_int = fwc._getReportingInt(reporting_percent, num_bins)
        #_warn('#############BIN NUMBER############### [[%d]] #############'%len(bin_groups))
        for (lower_group, upper_group), label in bin_groups.iteritems():
            bin_N_sum = 0
            bin_width = 0
            bin_center = sum((lower_group, upper_group)) / 2.0
            bin_center_w = 0
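            # bin_center is the unweighted midpoint of the bin's group_id range;
            # bin_center_w accumulates N-weighted group_ids below and is later divided
            # by bin_N_sum to give the N-weighted center of the bin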
            for ii in range(lower_group, upper_group+1):
                #_warn('for bin %d_%d ii:%d'%(lower_group, upper_group, ii))
                bin_width += 1
                bin_N_sum += groupIdToN.get(ii, 0)
                bin_center_w += groupIdToN.get(ii, 0) * ii
            bin_center_w = float(bin_center_w) / float(bin_N_sum)

            #_warn('number of users in range [%d, %d] is %d'%(lower_group, upper_group, bin_N_sum))
            
            # sql = 'SELECT group_id, feat, value, group_norm, N FROM %s where group_id >= %d AND group_id <= %d'%(self.featureTable, lower_group, upper_group)
            sql = 'SELECT group_id, feat, value, group_norm, std_dev FROM %s where group_id >= %d AND group_id <= %d'%(self.featureTable, lower_group, upper_group)
            groupFeatValueNorm = mm.executeGetList(self.corpdb, self.dbCursor, sql, charset=self.encoding, use_unicode=self.use_unicode)
            #pprint(groupFeatValueNorm)

            totalFeatCountForThisBin = float(0)
            featToValue = {}
            featToSummedNorm = {}
            for group_id, feat, value, norm, sd in groupFeatValueNorm:
            # for group_id, feat, value, norm, N in groupFeatValueNorm:
                if fwc.LOWERCASE_ONLY: feat = str(feat).lower()
                totalFeatCountForThisBin += value
                currentN = groupIdToN[group_id]
                try:
                    featToValue[feat] += value
                    featToSummedNorm[feat] += norm * currentN
                except KeyError:
                    featToValue[feat] = value
                    featToSummedNorm[feat] = norm * currentN

            #calculate mean and std_dev, using above info
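            # Pooled variance for the bin: for each contributing group g with size N_g,
            #   var_bin = sum_g N_g * ((mean_bin - norm_g)**2 + sd_g**2) / bin_N_sum
            # i.e. the between-group spread around the bin mean plus each group's own
            # variance, weighted by group size; its sqrt is stored as std_dev below.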
            featToMeanNorm = {}
            featToSummedVar = {}
            for group_id, feat, _, norm, sd in groupFeatValueNorm:
                currentN = groupIdToN[group_id]
                meanNorm = featToSummedNorm[feat]/bin_N_sum
                try: 
                    featToSummedVar[feat] += currentN*((meanNorm - norm)**2 + (sd*sd))
                except KeyError:
                    featToSummedVar[feat] = currentN*((meanNorm - norm)**2 + (sd*sd))
                featToMeanNorm[feat] = meanNorm

            current_batch = [ ('_'.join(map(str,(lower_group, upper_group))),  k,  v, featToMeanNorm[k], sqrt(featToSummedVar[k] / bin_N_sum),
                               bin_N_sum, bin_center, bin_center_w, bin_width) for k, v in featToValue.iteritems() ]
            mm.executeWriteMany(self.corpdb, self.dbCursor, isql, current_batch, writeCursor=self.dbConn.cursor(), charset=self.encoding, use_unicode=self.use_unicode)
            # print 'N bin sum:', bin_N_sum
            # isql = 'INSERT INTO %s (group_id, feat, value, group_norm, N, bin_center, bin_center_w, bin_width) VALUES (%s)'%(newTable, '%s, %s, %s, %s, %s, %s, %s, %s')
            ii_bins += 1
            fwc._report('group_id bins', ii_bins, reporting_int, num_bins)

        mm.enableTableKeys(self.corpdb, self.dbCursor, newTable, charset=self.encoding, use_unicode=self.use_unicode)
        fwc.warn('Done creating new group_id-binned feature table.')

        outputdata = mm.executeGetList(self.corpdb, self.dbCursor, 'select group_id, N from `%s` group by group_id'%(newTable,), charset=self.encoding, use_unicode=self.use_unicode)
        pprint(outputdata)

        # mm.execute(self.corpdb, self.dbCursor, 'drop table if exists `%s`'%(newTable,))
        return newTable
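
The bin-boundary pass above is interleaved with SQL bookkeeping; a self-contained sketch of the same greedy binning idea, with an illustrative function name and inputs:

def greedy_bins(group_ns, num_bins):
    """Greedily merge consecutive (group_id, N) pairs into bins whose summed N reaches
    roughly total_N / (num_bins + 2), mirroring the loop above."""
    total = float(sum(n for _, n in group_ns))
    bin_size = total / (num_bins + 2)
    bins = []
    current_sum = 0
    lower = group_ns[0][0]
    start_new = False
    for gid, n in group_ns:
        if start_new:
            lower = gid
            start_new = False
        current_sum += n
        if current_sum >= bin_size:
            bins.append((lower, gid))
            current_sum = 0
            start_new = True
    if current_sum > 0:  # remainder bin for any leftover groups
        bins.append((lower, group_ns[-1][0]))
    return bins

# e.g. greedy_bins([(13, 50), (14, 60), (15, 200), (16, 40)], 1)
# bin_size is about 116.7, so this returns [(13, 15), (16, 16)]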