def computeAnonFromScreenNames(self, extIntNameFileName):
    '''
    Print a CSV table that maps external IDs to anonymized screen names.

    Reads the comma-separated file extIntNameFileName, whose lines
    hold three fields: external ID, internal ID, and screen name.
    The first line is taken to be a header and is skipped. For every
    other non-blank line one 'ext_id,anon_screen_name' row goes to
    stdout. The second value is the result of
    EdXTrackLogJSONParser.makeHash() on the screen name, or the
    string NULL if the screen name consists of a backslash followed
    by a capital N.

    @param extIntNameFileName: path to the CSV file to read
    @type extIntNameFileName: String
    @raise ValueError: if a non-blank data line has fewer than three
        comma-separated fields
    '''
    with open(extIntNameFileName, 'r') as inFd:
        print('ext_id,anon_screen_name')
        # The header is discarded without being parsed, so its
        # number of columns is irrelevant:
        next(inFd, None)
        for line in inFd:
            # Blank lines (e.g. at the end of the file) carry no data:
            if not line.strip():
                continue
            # The screen name is the last field; limiting the number
            # of splits keeps commas inside the name from breaking
            # the three-way unpacking:
            (extId, _intId, screenName) = line.split(',', 2)
            extId = extId.strip('"')
            # Remove the trailing newline first, then enclosing quotes:
            screenName = screenName.strip().strip('"')
            # NOTE(review): \N is presumably the exporting database's
            # marker for a NULL value -- confirm against the producer
            # of the input file.
            if screenName == '\\N':
                print('%s,%s' % (extId, 'NULL'))
            else:
                print('%s,%s' % (extId, EdXTrackLogJSONParser.makeHash(screenName)))
Пример #2
0
    def computeAndAdd(self):
        '''
        The heavy lifting: reads all TSV rows of the file self.tsvFileName
        (the certificates_generatedcertificate table) into memory. The
        screenNamePos passed into the __init__() method is used to find
        each row's user screen name. That name is hashed via
        EdXTrackLogJSONParser.makeHash() and appended to the row as a
        new anon_screen_name column. The header row gets the matching
        new column name, and the file is rewritten in place.

        Rows that have no column at index self.screenNamePos are
        reported in the log file self.logFile and are left out of the
        rewritten file. An empty input file is reported in the log and
        left untouched.
        '''
        with open(self.logFile, 'a') as logFd:
            with open(self.tsvFileName, 'r') as tsvFd:
                allRows = tsvFd.readlines()

            # Without even a header row there is nothing to extend:
            if not allRows:
                logFd.write("File %s is empty; no anon_screen_name column added.\n" %
                            self.tsvFileName)
                logFd.flush()
                return

            # The first (header column names) row needs to
            # have the new column appended to it after
            # stripping the newline off the last column
            # name, and tagging it onto the new last col name:
            colNames = allRows[0].split('\t')
            colNames[-1] = colNames[-1].strip()
            colNames.append('anon_screen_name\n')
            # Rows are collected in a new list so that ill
            # formatted rows really are dropped, as the log
            # message promises:
            newRows = ['\t'.join(colNames)]

            for (i, row) in enumerate(allRows[1:]):
                colVals = row.split('\t')
                # Each line's last TSV value element has
                # a \n glued to it. Get rid of that:
                colVals[-1] = colVals[-1].strip()
                # Pick the screen name out of the row:
                try:
                    screenName = colVals[self.screenNamePos]
                except IndexError:
                    logMsg =  "Ill formatted row number %d in user grade file %s: '%s' (dropping this row)\n" % \
                                (int(i), self.tsvFileName, str(row).strip())
                    logFd.write(logMsg)
                    logFd.flush()
                    continue
                # Add the new last element, including
                # the trailing \n:
                colVals.append(
                    EdXTrackLogJSONParser.makeHash(screenName) + '\n')
                newRows.append('\t'.join(colVals))

            # Write the new TSV back into the file:
            with open(self.tsvFileName, 'w') as tsvFd:
                tsvFd.writelines(newRows)
 def computeAndAdd(self):
     '''
     The heavy lifting: reads all TSV rows of the file self.tsvFileName
     (the certificates_generatedcertificate table) into memory. The
     screenNamePos passed into the __init__() method is used to find
     each row's user screen name. That name is hashed via
     EdXTrackLogJSONParser.makeHash() and appended to the row as a
     new anon_screen_name column. The header row gets the matching
     new column name, and the file is rewritten in place.

     Rows that have no column at index self.screenNamePos are
     reported in the log file self.logFile and are left out of the
     rewritten file. An empty input file is reported in the log and
     left untouched.
     '''
     with open(self.logFile, 'a') as logFd:
         with open(self.tsvFileName, 'r') as tsvFd:
             allRows = tsvFd.readlines()

         # Without even a header row there is nothing to extend:
         if not allRows:
             logFd.write("File %s is empty; no anon_screen_name column added.\n" %
                         self.tsvFileName)
             logFd.flush()
             return

         # The first (header column names) row needs to
         # have the new column appended to it after
         # stripping the newline off the last column
         # name, and tagging it onto the new last col name:
         colNames = allRows[0].split('\t')
         colNames[-1] = colNames[-1].strip()
         colNames.append('anon_screen_name\n')
         # Rows are collected in a new list so that ill
         # formatted rows really are dropped, as the log
         # message promises:
         newRows = ['\t'.join(colNames)]

         for (i, row) in enumerate(allRows[1:]):
             colVals = row.split('\t')
             # Each line's last TSV value element has
             # a \n glued to it. Get rid of that:
             colVals[-1] = colVals[-1].strip()
             # Pick the screen name out of the row:
             try:
                 screenName = colVals[self.screenNamePos]
             except IndexError:
                 logMsg =  "Ill formatted row number %d in user grade file %s: '%s' (dropping this row)\n" % \
                             (int(i), self.tsvFileName, str(row).strip())
                 logFd.write(logMsg)
                 logFd.flush()
                 continue
             # Add the new last element, including
             # the trailing \n:
             colVals.append(EdXTrackLogJSONParser.makeHash(screenName) + '\n')
             newRows.append('\t'.join(colVals))

         # Write the new TSV back into the file:
         with open(self.tsvFileName, 'w') as tsvFd:
             tsvFd.writelines(newRows)
Пример #4
0
 def computeAnonFromScreenNames(self, extIntNameFileName):
     '''
     Print a CSV table that maps external IDs to anonymized screen names.

     Reads the comma-separated file extIntNameFileName, whose lines
     hold three fields: external ID, internal ID, and screen name.
     The first line is taken to be a header and is skipped. For every
     other non-blank line one 'ext_id,anon_screen_name' row goes to
     stdout. The second value is the result of
     EdXTrackLogJSONParser.makeHash() on the screen name, or the
     string NULL if the screen name consists of a backslash followed
     by a capital N.

     @param extIntNameFileName: path to the CSV file to read
     @type extIntNameFileName: String
     @raise ValueError: if a non-blank data line has fewer than three
         comma-separated fields
     '''
     with open(extIntNameFileName, 'r') as inFd:
         print('ext_id,anon_screen_name')
         # The header is discarded without being parsed, so its
         # number of columns is irrelevant:
         next(inFd, None)
         for line in inFd:
             # Blank lines (e.g. at the end of the file) carry no data:
             if not line.strip():
                 continue
             # The screen name is the last field; limiting the number
             # of splits keeps commas inside the name from breaking
             # the three-way unpacking:
             (extId, _intId, screenName) = line.split(',', 2)
             extId = extId.strip('"')
             # Remove the trailing newline first, then enclosing quotes:
             screenName = screenName.strip().strip('"')
             # NOTE(review): \N is presumably the exporting database's
             # marker for a NULL value -- confirm against the producer
             # of the input file.
             if screenName == '\\N':
                 print('%s,%s' % (extId, 'NULL'))
             else:
                 print('%s,%s' %
                       (extId, EdXTrackLogJSONParser.makeHash(screenName)))
'''
Created on Dec 18, 2013
Given any number of user screen names---simple strings---as
command line arguments or on stdin, emit the corresponding hashes
as used to generate column anon_screen_name.

@author: paepcke
'''
# Add json_to_relation source dir to the front of sys.path
# (Python's module search path) for the duration of this execution:
import os
import sys

# Build a new search path whose first entry is the json_to_relation
# directory next to this script's directory, followed by all of the
# existing entries; then make it the active module search path:
source_dir = [
    os.path.join(os.path.dirname(os.path.abspath(__file__)),
                 "../json_to_relation/")
] + sys.path
sys.path = source_dir

from edxTrackLogJSONParser import EdXTrackLogJSONParser

if __name__ == '__main__':
    # Command line entry point: print one hash per screen name.
    # Names come either from the command line arguments, or, if
    # the first argument is a dash, from stdin (one name per line).
    if len(sys.argv) < 2:
        print("Usage: makeAnonScreenName {str1 str2 ... | -} # Use dash to read input strings from stdin, e.g. from a pipe")
        sys.exit(1)
    if sys.argv[1] == '-':
        for screenName in sys.stdin:
            # Lines read from stdin still carry their line terminator.
            # Remove it, so that a name hashes to the same value
            # whether it arrives via stdin or as an argument:
            print(EdXTrackLogJSONParser.makeHash(screenName.rstrip('\r\n')))
    else:
        for screenName in sys.argv[1:]:
            print(EdXTrackLogJSONParser.makeHash(screenName))
    
Пример #6
0
anon_screen_name.

@author: paepcke
'''
# Add json_to_relation source dir to the front of sys.path
# (Python's module search path) for the duration of this execution:
import os
import sys

# Copy the current module search path, put the json_to_relation
# directory next to this script's directory in front of the copy,
# and make the copy the active module search path:
source_dir = list(sys.path)
source_dir.insert(
    0,
    os.path.join(os.path.dirname(os.path.abspath(__file__)),
                 "../json_to_relation/"))
sys.path = source_dir

from edxTrackLogJSONParser import EdXTrackLogJSONParser

if __name__ == '__main__':
    # Command line entry point: print one hash per screen name.
    # Names come either from the command line arguments, or, if
    # the first argument is a dash, from stdin (one name per line).
    if len(sys.argv) < 2:
        print(
            "Usage: makeAnonScreenName {str1 str2 ... | -} # Use dash to read input strings from stdin, e.g. from a pipe"
        )
        sys.exit(1)
    if sys.argv[1] == '-':
        for screenName in sys.stdin:
            # Lines read from stdin still carry their line terminator.
            # Remove it, so that a name hashes to the same value
            # whether it arrives via stdin or as an argument:
            print(EdXTrackLogJSONParser.makeHash(screenName.rstrip('\r\n')))
    else:
        for screenName in sys.argv[1:]:
            print(EdXTrackLogJSONParser.makeHash(screenName))