def computeAnonFromScreenNames(self, extIntNameFileName):
    '''
    Print a CSV table that maps external IDs to anonymized screen names.

    Reads the comma-separated file extIntNameFileName. Its first line
    is taken to be a column-name header and is discarded unparsed.
    Every following non-blank line must hold three fields: external
    ID, internal ID, and screen name. The internal ID is not used.

    Written to stdout: the header line 'ext_id,anon_screen_name',
    then one line per input row consisting of the external ID (with
    surrounding double quotes removed) and the value that
    EdXTrackLogJSONParser.makeHash() returns for the screen name.
    A screen name that is a backslash followed by a capital N
    produces NULL in place of a hash.

    Only the first two commas of a line act as separators, so a
    screen name that itself contains commas stays in one piece.
    Blank lines are skipped. A line with fewer than three fields
    raises ValueError.

    :param extIntNameFileName: path to the CSV file to read.
    '''
    with open(extIntNameFileName, 'r') as inFd:
        print('ext_id,anon_screen_name')
        # Throw away the column-name header without parsing it;
        # the default keeps an empty file from raising StopIteration:
        next(inFd, None)
        for line in inFd:
            if not line.strip():
                # Blank line (often the last one in an export): no data.
                continue
            # maxsplit=2: commas beyond the second belong to the
            # screen name and must not produce extra fields:
            (extId, _intId, screenName) = line.split(',', 2)
            extId = extId.strip('"')
            # Remove the line terminator first, then enclosing quotes:
            screenName = screenName.strip().strip('"')
            # \N presumably is the database export's NULL marker
            # (NOTE(review): confirm against the export tool):
            if screenName == '\\N':
                print('%s,%s' % (extId, 'NULL'))
            else:
                print('%s,%s' % (extId, EdXTrackLogJSONParser.makeHash(screenName)))
def computeAndAdd(self):
    '''
    The heavy lifting: reads all TSV rows from the
    certificates_generatedcertificate table file (self.tsvFileName)
    into memory. self.screenNamePos is used as the column index of
    each row's user screen name. That name is hashed with
    EdXTrackLogJSONParser.makeHash(), and the hash is appended to
    the row as a new last column. The header row receives the
    matching column name anon_screen_name. The file is then
    rewritten in place.

    Rows that have no column at self.screenNamePos are reported in
    the log file self.logFile (opened for appending) and are left
    out of the rewritten file.

    An entirely empty TSV file is left untouched.
    '''
    with open(self.logFile, 'a') as logFd:
        with open(self.tsvFileName, 'r') as tsvFd:
            allRows = tsvFd.readlines()

        if not allRows:
            # Not even a header line; there is nothing to extend.
            return

        # The first (header column names) row needs to
        # have the new column appended to it after
        # stripping the newline off the last
        # column name, and tagging it onto the
        # new last col name:
        colNames = allRows[0].split('\t')
        colNames[-1] = colNames[-1].strip()
        colNames.append('anon_screen_name\n')
        newRows = ['\t'.join(colNames)]

        for (i, row) in enumerate(allRows[1:]):
            colVals = row.split('\t')
            # Each line's last TSV value element has
            # a \n glued to it. Get rid of that:
            colVals[-1] = colVals[-1].strip()
            # Pick the screen name out of the row:
            try:
                screenName = colVals[self.screenNamePos]
            except IndexError:
                logMsg = "Ill formatted row number %d in user grade file %s: '%s' (dropping this row)\n" % \
                         (int(i), self.tsvFileName, str(row).strip())
                logFd.write(logMsg)
                logFd.flush()
                # Really drop the row, as the message says: it is
                # not copied to newRows, so it will not be written back.
                continue
            # Add the new last element, including
            # the trailing \n:
            colVals.append(EdXTrackLogJSONParser.makeHash(screenName) + '\n')
            # str.join works under Python 2 and 3 alike;
            # string.join() exists only in Python 2:
            newRows.append('\t'.join(colVals))

    # Write the new TSV back into the file:
    with open(self.tsvFileName, 'w') as tsvFd:
        tsvFd.writelines(newRows)
def computeAndAdd(self):
    '''
    Append an anon_screen_name column to the TSV file
    self.tsvFileName (an export of the
    certificates_generatedcertificate table).

    All rows are read into memory. For each data row the value in
    column self.screenNamePos is taken as the user's screen name,
    hashed via EdXTrackLogJSONParser.makeHash(), and the hash is
    appended as a new last column. The header row gets the column
    name anon_screen_name appended. Finally the file is overwritten
    with the extended rows.

    A row too short to have a column at self.screenNamePos is noted
    in the log file self.logFile (opened in append mode) and omitted
    from the output. If the TSV file is completely empty, it is not
    modified.
    '''
    with open(self.tsvFileName, 'r') as tsvFd:
        allRows = tsvFd.readlines()

    if len(allRows) == 0:
        # No header row, so no place to add a column name.
        return

    # Header first: strip the newline glued to the last column
    # name, then tag the newline onto the new last column name.
    headerCols = allRows[0].split('\t')
    headerCols[-1] = headerCols[-1].strip()
    headerCols.append('anon_screen_name\n')
    outRows = ['\t'.join(headerCols)]

    with open(self.logFile, 'a') as logFd:
        # i counts data rows from 0, i.e. excluding the header:
        for (i, row) in enumerate(allRows[1:]):
            colVals = row.split('\t')
            # The last value still carries the row's newline:
            colVals[-1] = colVals[-1].strip()
            try:
                screenName = colVals[self.screenNamePos]
            except IndexError:
                logMsg = "Ill formatted row number %d in user grade file %s: '%s' (dropping this row)\n" % \
                         (int(i), self.tsvFileName, str(row).strip())
                logFd.write(logMsg)
                logFd.flush()
                # The row is skipped here and therefore never
                # reaches outRows; it is truly dropped.
                continue
            # New last column, including the row-ending newline:
            colVals.append(EdXTrackLogJSONParser.makeHash(screenName) + '\n')
            # '\t'.join() instead of the Python-2-only string.join():
            outRows.append('\t'.join(colVals))

    # Replace the file's content with the extended table:
    with open(self.tsvFileName, 'w') as tsvFd:
        tsvFd.writelines(outRows)
def computeAnonFromScreenNames(self, extIntNameFileName):
    '''
    Emit, on stdout, a two-column CSV of external ID and hashed
    screen name for every data row of extIntNameFileName.

    Input format: comma-separated text; the first line is a header
    and is ignored without being parsed; each later non-blank line
    carries external ID, internal ID, and screen name, in that
    order. The internal ID is ignored.

    Output format: first the line 'ext_id,anon_screen_name', then
    per input row the external ID stripped of double quotes, a
    comma, and the result of EdXTrackLogJSONParser.makeHash() for
    the screen name. When the screen name consists of a backslash
    followed by a capital N, NULL is printed instead of a hash.

    Commas after the second one are treated as part of the screen
    name. Blank lines are ignored. A data line with fewer than three
    fields raises ValueError.

    :param extIntNameFileName: path of the CSV file to process.
    '''
    with open(extIntNameFileName, 'r') as inFd:
        print('ext_id,anon_screen_name')
        for (lineNum, line) in enumerate(inFd):
            # Line 0 is the header; blank lines carry no record:
            if lineNum == 0 or not line.strip():
                continue
            # Split on the first two commas only, so that a screen
            # name containing commas does not break the unpacking:
            fields = line.split(',', 2)
            (extId, _intId, rawName) = fields
            screenName = rawName.strip().strip('"')
            # \N presumably marks a NULL value in the exported data
            # (NOTE(review): confirm with the producer of the file):
            if screenName == '\\N':
                anonName = 'NULL'
            else:
                anonName = EdXTrackLogJSONParser.makeHash(screenName)
            print('%s,%s' % (extId.strip('"'), anonName))
'''
Created on Dec 18, 2013

Given any number of user screen names---simple strings---
either as command line arguments or, when the single argument
is a dash, one per line on stdin, emit the corresponding hashes
as used to generate column anon_screen_name.

@author: paepcke
'''

# Add json_to_relation source dir to the front of sys.path
# for the duration of this execution, so that the import
# of edxTrackLogJSONParser below can be resolved:
import os
import sys

source_dir = [os.path.join(os.path.dirname(os.path.abspath(__file__)), "../json_to_relation/")]
source_dir.extend(sys.path)
sys.path = source_dir

from edxTrackLogJSONParser import EdXTrackLogJSONParser

if __name__ == '__main__':

    if len(sys.argv) < 2:
        print("Usage: makeAnonScreenName {str1 str2 ... | -} # Use dash to read input strings from stdin, e.g. from a pipe")
        sys.exit(1)

    if sys.argv[1] == '-':
        for screenName in sys.stdin:
            # Lines iterated from stdin still end with their line
            # terminator. Remove it, so that a name arriving through
            # a pipe is hashed exactly like the same name given as
            # a command line argument:
            print(EdXTrackLogJSONParser.makeHash(screenName.rstrip('\r\n')))
    else:
        for screenName in sys.argv[1:]:
            print(EdXTrackLogJSONParser.makeHash(screenName))
anon_screen_name. @author: paepcke ''' # Add json_to_relation source dir to $PATH # for duration of this execution: import os import sys source_dir = [ os.path.join(os.path.dirname(os.path.abspath(__file__)), "../json_to_relation/") ] source_dir.extend(sys.path) sys.path = source_dir from edxTrackLogJSONParser import EdXTrackLogJSONParser if __name__ == '__main__': if len(sys.argv) < 2: print( "Usage: makeAnonScreenName {str1 str2 ... | -} # Use dash to read input strings from stdin, e.g. from a pipe" ) sys.exit(1) if sys.argv[1] == '-': for screenName in sys.stdin: print(EdXTrackLogJSONParser.makeHash(screenName)) else: for screenName in sys.argv[1:]: print(EdXTrackLogJSONParser.makeHash(screenName))