def processDayAnonymous(handler, day, month, monthsFolder, startIdx=0,
                        endIdx=sys.maxint, notifications=True):
    """Feed the anonymized queries of one day to ``handler``.

    The gzip-compressed, tab-separated file for ``day`` is looked up below
    ``monthsFolder``/``month`` using the module's anonymous folder, prefix
    and suffix constants.  For every row whose zero-based index lies in
    ``[startIdx, endIdx]`` the ``#anonymizedQuery`` column is URL-decoded,
    the row's ``Valid`` field is set to ``'VALID'`` and
    ``handler.handle(query, row)`` is called.  Reading stops at the first
    row past ``endIdx``.

    If ``notifications`` is true the file name is printed before reading.
    """
    dayFile = (utility.addMissingSlash(monthsFolder)
               + utility.addMissingSlash(month)
               + anonymousDataFolder + anonymousFilePrefix
               + "%02d" % day + anonymousFileSuffix)

    if notifications:
        print("Working on: " + dayFile)

    with gzip.open(dayFile) as zipped:
        rows = csv.DictReader(zipped, delimiter="\t")
        for index, row in enumerate(rows):
            if index > endIdx:
                # Everything after the requested window is irrelevant.
                break
            if index < startIdx:
                continue
            query = urllib.unquote_plus(row['#anonymizedQuery'])
            row['Valid'] = 'VALID'
            handler.handle(query, row)
def processMonth(handler, month, monthsFolder, anonymous=False,
                 notifications=True):
    """Run ``handler`` over every day file found for ``month``.

    Day files are discovered with a glob below ``monthsFolder``/``month``:
    the processed-data folder/prefix/suffix are used by default, the
    anonymous ones when ``anonymous`` is true.  The day number is the part
    of the file name between prefix and suffix; each file is then handed to
    ``processDay`` or ``processDayAnonymous`` respectively.

    Files are processed in sorted file-name order so that runs are
    deterministic (``glob.glob`` itself gives no ordering guarantee).

    Raises ``ValueError`` if a matching file name does not contain an
    integer between prefix and suffix.
    """
    if anonymous:
        folderToSearch = anonymousDataFolder
        prefixToSearch = anonymousFilePrefix
        suffixToSearch = anonymousFileSuffix
        processSingleDay = processDayAnonymous
    else:
        folderToSearch = processedFolder
        prefixToSearch = processedPrefix
        suffixToSearch = processedSuffix
        processSingleDay = processDay

    pattern = (utility.addMissingSlash(monthsFolder)
               + utility.addMissingSlash(month)
               + folderToSearch + prefixToSearch + "*" + suffixToSearch)

    for filename in sorted(glob.glob(pattern)):
        baseName = os.path.basename(filename)
        # Explicit end index instead of [:-len(suffix)], which would yield
        # an empty string for an empty suffix.
        day = baseName[len(prefixToSearch):len(baseName) - len(suffixToSearch)]
        processSingleDay(handler, int(day), month, monthsFolder,
                         notifications=notifications)
def processDay(handler, day, month, monthsFolder, startIdx=0,
               endIdx=sys.maxint, notifications=True):
    """Feed one day's processed rows, enriched with raw-log fields, to ``handler``.

    The processed file and the raw log file (``rawLogData/``) of ``day`` are
    read in lockstep.  For each row pair whose zero-based index lies in
    ``[startIdx, endIdx]`` the ``query`` request parameter is extracted
    from the raw ``uri_query`` column (``None`` when absent), the processed
    row receives ``hour``, ``day``, ``user_agent``, ``http_status``,
    ``timestamp`` and ``ts``, and ``handler.handle(query, processedRow)`` is
    called.  Reading stops at the first row past ``endIdx``.

    If ``notifications`` is true the processed file name is printed first.
    """
    monthPath = (utility.addMissingSlash(monthsFolder)
                 + utility.addMissingSlash(month))
    dayTag = "%02d" % day

    processedFileName = (monthPath + processedFolder + processedPrefix
                         + dayTag + processedSuffix)
    if notifications:
        print("Working on: " + processedFileName)

    rawFileName = monthPath + "rawLogData/" + sourcePrefix + dayTag + ".tsv.gz"

    with gzip.open(processedFileName) as processedFile, \
            gzip.open(rawFileName) as rawFile:
        rowPairs = izip(csv.DictReader(processedFile, delimiter="\t"),
                        csv.DictReader(rawFile, delimiter="\t"))
        for index, (processed, source) in enumerate(rowPairs):
            if index > endIdx:
                break
            if index < startIdx:
                continue

            # Semicolons are escaped before parsing so parse_qsl does not
            # treat them as parameter separators.
            queryString = urlparse.urlsplit(source['uri_query']).query
            requestParameters = dict(
                urlparse.parse_qsl(queryString.replace(';', "%3B")))
            sparqlQuery = requestParameters.get('query')

            processed['hour'] = source['hour']
            processed['day'] = day
            for column in ('user_agent', 'http_status'):
                processed[column] = source[column]
            processed['timestamp'] = source['ts']
            processed['ts'] = source['ts']

            handler.handle(sparqlQuery, processed)
def joinMonth(month, monthsFolder=config.monthsFolder, ignoreLock=False,
              outputPath=None, outputFilename=None):
    """Concatenate the daily anonymous files of ``month`` into one gzip file.

    Looks for ``anonymousRawData/AnonymousQueryCntDD.tsv.gz`` for days 01-31
    below ``monthsFolder``/``month`` and writes their lines into a single
    gzip file.  The first line (header) of the first non-empty day file is
    kept; the first line of every further day file is dropped.

    Parameters:
        month: month folder name, relative to ``monthsFolder``.
        monthsFolder: folder containing the month folders.
        ignoreLock: proceed even if a ``locked`` file exists in the month
            folder.
        outputPath: target folder; defaults to the month's
            ``anonymousRawData/`` folder.  A missing trailing slash is
            added.  The folder is created if it does not exist.
        outputFilename: target file name; defaults to
            ``<month>_Joined.tsv.gz`` with slashes in ``month`` replaced by
            underscores.

    Calls ``sys.exit()`` when the month is locked and ``ignoreLock`` is
    false.
    """
    pathBase = utility.addMissingSlash(monthsFolder) \
        + utility.addMissingSlash(month)

    if os.path.isfile(pathBase + "locked") and not ignoreLock:
        print("ERROR: The month " + month + " is being edited at the moment. Use -i or ignoreLock = True if you want to force the execution of this script.")
        sys.exit()

    anonymizedFolder = "anonymousRawData/"
    anonymizedPrefix = anonymizedFolder + "AnonymousQueryCnt"

    if outputFilename is not None:
        outputFile = outputFilename
    else:
        outputFile = month.strip("/").replace("/", "_") + "_Joined.tsv.gz"

    if outputPath is not None:
        # Without the slash the file name would be glued onto the last
        # path component of outputPath.
        targetFolder = utility.addMissingSlash(outputPath)
    else:
        targetFolder = pathBase + anonymizedFolder

    if not os.path.exists(targetFolder):
        os.makedirs(targetFolder)

    with gzip.open(targetFolder + outputFile, "w") as target:
        headerWritten = False
        for i in xrange(1, 32):
            print("Working on %02d" % i)
            sourceFile = pathBase + anonymizedPrefix + "%02d" % i + ".tsv.gz"
            if not os.path.exists(sourceFile):
                continue
            with gzip.open(sourceFile) as source:
                # Default of None avoids StopIteration on an empty file.
                header = next(source, None)
                if header is None:
                    continue
                if not headerWritten:
                    # Only mark the header as written once a real header
                    # line has been copied.
                    target.write(header)
                    headerWritten = True
                for line in source:
                    target.write(line)
def processRankedQueryType(handler, month, monthsFolder, startIdx=0,
                           endIdx=sys.maxint, notifications=True):
    """Feed the rows of a month's ranked query type file to ``handler``.

    The tab-separated ranking file is located below
    ``monthsFolder``/``month`` via the module's ranked-query-type folder and
    file constants.  For every row whose zero-based index lies in
    ``[startIdx, endIdx]``, ``handler.handle(row["ExampleQuery"], row)`` is
    called; reading stops at the first row past ``endIdx``.

    If ``notifications`` is true the file name is printed before reading.
    """
    rankingPath = (utility.addMissingSlash(monthsFolder)
                   + utility.addMissingSlash(month)
                   + rankedQueryTypeFolder + rankedQueryTypeFile)

    if notifications:
        print("Working on: " + rankingPath)

    with open(rankingPath) as rankingFile:
        rows = csv.DictReader(rankingFile, delimiter="\t")
        for index, ranked in enumerate(rows):
            if index > endIdx:
                break
            if index < startIdx:
                continue
            handler.handle(ranked["ExampleQuery"], ranked)
def fieldRanking(month, metric, monthsFolder = config.monthsFolder,
                 ignoreLock = False, outputPath = None,
                 outputFilename = None, filterParams = "",
                 nosplitting = False, writeOut = False,
                 notifications = True, anonymous = False):
    """Count how often each entry of ``metric`` occurs in ``month``.

    Every row delivered by ``processdata.processMonth`` that passes the
    filter built from ``filterParams`` contributes one count per entry
    returned by ``utility.fetchEntries`` for ``metric``.

    When ``writeOut`` is true the counts are written as a tab-separated
    ranking (highest count first, ties broken by descending key) to
    ``outputPath`` (default: ``<monthsFolder>/<month>/<metric>/``), which
    is created if missing.  The file name defaults to
    ``<month>_<metric>[_anonymous_]_Ranking.tsv`` and can be replaced with
    ``outputFilename``.

    Calls ``sys.exit()`` if the month folder contains a ``locked`` file and
    ``ignoreLock`` is false.  Returns the mapping of entry -> count.
    """
    monthPath = utility.addMissingSlash(monthsFolder) \
        + utility.addMissingSlash(month)

    if os.path.isfile(monthPath + "locked") and not ignoreLock:
        print("ERROR: The month " + month + " is being edited at the moment." + " Use -i or ignoreLock = True if you want to force the execution of this script.")
        sys.exit()

    metric = utility.argMetric(metric)

    if outputPath is not None:
        pathBase = utility.addMissingSlash(outputPath)
    else:
        pathBase = monthPath + utility.addMissingSlash(metric)

    if outputFilename is not None:
        outputFile = outputFilename
    else:
        addString = "_anonymous_" if anonymous else ""
        outputFile = month.strip("/").replace("/", "_") + "_" + metric + addString + "_Ranking.tsv"

    header = metric + "\t" + metric + "_count\n"

    lineFilter = utility.filter()
    lineFilter.setup(filterParams)

    class FieldRankingHandler:
        # Created anew on every fieldRanking() call because the class
        # statement itself is executed per call.
        totalMetricCounts = defaultdict(int)

        def handle(self, sparqlQuery, processed):
            if not lineFilter.checkLine(processed):
                return
            entries = utility.fetchEntries(processed, metric,
                                           nosplitting = nosplitting)
            for entry in entries:
                self.totalMetricCounts[entry] += 1

        def writeOut(self):
            with open(pathBase + outputFile, "w") as rankingFile:
                rankingFile.write(header)
                ranking = sorted(self.totalMetricCounts.iteritems(),
                                 key=lambda pair: (pair[1], pair[0]),
                                 reverse=True)
                for entry, count in ranking:
                    rankingFile.write(str(entry) + "\t" + str(count) + "\n")

    handler = FieldRankingHandler()

    # Only pass ``anonymous`` along when it is requested.
    options = {"notifications": notifications}
    if anonymous:
        options["anonymous"] = True
    processdata.processMonth(handler, month, monthsFolder, **options)

    if writeOut:
        if not os.path.exists(pathBase):
            os.makedirs(pathBase)
        handler.writeOut()

    return handler.totalMetricCounts
action="store_true") parser.add_argument("--monthsFolder", "-m", default=config.monthsFolder, type=str, help="The folder in which the months directory are " + "residing.") parser.add_argument("months", type=str, help="The months to be processed") if (len(sys.argv[1:]) == 0): parser.print_help() parser.exit() args = parser.parse_args() monthsFolder = utility.addMissingSlash(args.monthsFolder) statisticsSubfolder = monthsFolder + "statistics/" if not os.path.exists(statisticsSubfolder): os.makedirs(statisticsSubfolder) def fieldRankingOn(monthFolder, metric, filename): print "Working with fieldRanking " + metric + " on " + filename fieldRanking.fieldRanking(monthFolder, metric, monthsFolder=args.monthsFolder, outputPath=statisticsSubfolder + metric + "_Ranking", outputFilename=filename, writeOut=True, notifications=False)
"-o", action='store_true', help="If set " + "only valid lines are being looked at") parser.add_argument("month", type=str, help="The month from which lines " + "should be displayed.") if (len(sys.argv[1:]) == 0): parser.print_help() parser.exit() args = parser.parse_args() if os.path.isfile( utility.addMissingSlash(args.monthsFolder) + utility.addMissingSlash(args.month) + "locked") and not args.ignoreLock: print "ERROR: The month " + args.month + " is being edited at the moment. Use -i if you want to force the execution of this script." sys.exit() class CountRdfPropertiesHandler: queryCount = 0 propQueryCounts = {} def handle(self, sparqlQuery, processed): self.queryCount += 1 if args.onlyValid: if processed['#Valid'] is not 'VALID':
+ " NOTE: The day setting is ignored if query type ranking is enabled.") if (len(sys.argv[1:]) == 0): parser.print_help() parser.exit() args = parser.parse_args() startLine = args.startline endLine = args.endline if args.line != None: startLine = args.line endLine = args.line if os.path.isfile(utility.addMissingSlash(args.monthsFolder) + utility.addMissingSlash(args.month) + "locked") \ and not args.ignoreLock: print "ERROR: The month " + args.month + " is being edited at the " + "moment. Use -i if you want to force the execution of this script." sys.exit() metrics = list() metricsNotNull = list() if args.metricsToBeViewed is not "": for metric in args.metricsToBeViewed.split(","): metrics.append(utility.addMissingDoubleCross(metric)) if args.metricsNotNull is not "": for metric in args.metricsNotNull.split(","):
type=str, help="the month which we're interested in") parser.add_argument("lines", type=int, help="number of lines the testfiles should have") if (len(sys.argv[1:]) == 0): parser.print_help() parser.exit() args = parser.parse_args() monthsFolder = args.monthsFolder month = args.month if os.path.isfile(utility.addMissingSlash(monthsFolder) + utility.addMissingSlash(month) + "locked") \ and not args.ignoreLock: print "ERROR: The month " + args.month + " is being edited at the moment." + " Use -i if you want to force the execution of this script." sys.exit() # create new folder for the test data os.makedirs("testData/processedLogData") os.makedirs("testData/rawLogData") for filename in glob.glob(monthsFolder + "/" + month + "/processedLogData/" + processdata.processedPrefix + "*" + processdata.processedSuffix): day = int( os.path.basename(filename)[len(processdata.processedPrefix):]
def xyMapping(month, metricOne, metricTwo,
              monthsFolder = config.monthsFolder, ignoreLock = False,
              outputPath = None, outputFilename = None, filterParams = "",
              nosplittingOne = False, nosplittingTwo = False,
              writeOut = False, notifications = True):
    """Cross-tabulate the entries of two metrics over one month.

    For every row delivered by ``processdata.processMonth`` that passes the
    filter built from ``filterParams``, each combination of an entry of
    ``metricTwo`` with an entry of ``metricOne`` (both obtained through
    ``utility.fetchEntries``) is counted.

    When ``writeOut`` is true the table is written through
    ``writeOutMethod`` to ``outputPath`` (default:
    ``<monthsFolder>/<month>/<metricOne>_<metricTwo>/``), which is created
    if missing.  The file name defaults to
    ``<month>_<metricOne>_<metricTwo>.tsv`` and can be replaced with
    ``outputFilename``.

    Calls ``sys.exit()`` if the month folder contains a ``locked`` file and
    ``ignoreLock`` is false.  Returns a tuple of (set of all ``metricOne``
    entries seen, dict mapping ``metricTwo`` entry -> counts per
    ``metricOne`` entry).
    """
    monthPath = utility.addMissingSlash(monthsFolder) \
        + utility.addMissingSlash(month)

    if os.path.isfile(monthPath + "locked") and not ignoreLock:
        print("ERROR: The month " + month + " is being edited at the " + "moment. Use -i if you want to force the execution of this script.")
        sys.exit()

    metricOne = utility.argMetric(metricOne)
    metricTwo = utility.argMetric(metricTwo)
    folderName = metricOne + "_" + metricTwo

    if outputPath is not None:
        pathBase = utility.addMissingSlash(outputPath)
    else:
        pathBase = monthPath + utility.addMissingSlash(folderName)

    if outputFilename is not None:
        outputFile = outputFilename
    else:
        outputFile = month.strip("/").replace("/", "_") + "_" + folderName + ".tsv"

    lineFilter = utility.filter()
    lineFilter.setup(filterParams)

    class hourlyFieldValueHandler:
        # Created anew on every xyMapping() call because the class
        # statement itself is executed per call.
        monthlyFieldValues = set()
        monthlyData = dict()

        def handle(self, sparqlQuery, processed):
            if not lineFilter.checkLine(processed):
                return
            entriesOne = utility.fetchEntries(processed, metricOne,
                                              nosplittingOne)
            entriesTwo = utility.fetchEntries(processed, metricTwo,
                                              nosplittingTwo)
            for keyTwo in entriesTwo:
                counts = self.monthlyData.setdefault(keyTwo,
                                                     defaultdict(int))
                for keyOne in entriesOne:
                    self.monthlyFieldValues.add(keyOne)
                    counts[keyOne] += 1

        def writeHourlyValues(self):
            writeOutMethod(pathBase + outputFile, self.monthlyFieldValues,
                           self.monthlyData, metricTwo + "\\" + metricOne)

    handler = hourlyFieldValueHandler()

    processdata.processMonth(handler, month, monthsFolder,
                             notifications = notifications)

    if writeOut:
        if not os.path.exists(pathBase):
            os.makedirs(pathBase)
        handler.writeHourlyValues()

    return (handler.monthlyFieldValues, handler.monthlyData)
args = parser.parse_args() uri_path = {"/sparql", "/bigdata/namespace/wdq/sparql"} user_agent = { "Mozilla/5.0 (Android 4.4; Mobile; rv:41.0) Gecko/41.0 Firefox/41.0", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:53.0) Gecko/20100101 Firefox/53.0" } timestamp = { "2017-07-01 01:31:14", "2017-07-01 01:32:54", "2017-07-01 01:34:10" } agent_type = {"spider", "user"} http_status = "200" with gzip.open( utility.addMissingSlash(args.outputDirectory) + "QueryCnt01.tsv.gz", "w") as target: print("uri_query\turi_path\tuser_agent\tts\tagent_type\thour\thttp_status", file=target) exampleQueryFolder = utility.addMissingSlash(args.exampleQueryFolder) for filename in glob.glob(exampleQueryFolder + "*.exampleQuery"): with open(filename) as exampleFile: line = "?query=" + urllib.quote_plus(exampleFile.read()) + "\t" line += random.sample(uri_path, 1)[0] + "\t" line += random.sample(user_agent, 1)[0] + "\t" line += random.sample(timestamp, 1)[0] + "\t" line += random.sample(agent_type, 1)[0] + "\t" line += str(random.randint(0, 23)) + "\t" line += http_status
+ " Default filter is Valid=^VALID$." + " Enter as <metric>=<regex>,<othermetric>/<regex> (e.g." + " QueryType=wikidataLastModified,ToolName=^USER$)" + " NOTE: If you use this option you should probably also" + " set the --outputPath to some value other than the " + "default.") parser.add_argument("month", type=str, help="The month for which the ranking should be generated.") parser.add_argument("--threshold", "-t", default = 2000, type = int, help = "The threshold above which the combinations should be listed. Default is 2000.") if (len(sys.argv[1:]) == 0): parser.print_help() parser.exit() args = parser.parse_args() monthsFolder = utility.addMissingSlash(args.monthsFolder) month = utility.addMissingSlash(args.month) if os.path.isfile(monthsFolder + month + "locked") \ and not ignoreLock: print ("ERROR: The month " + args.month + " is being edited at the moment." + " Use -i or ignoreLock = True if you want to force the execution of this script.") sys.exit() subfolder = "automatedBotClassification/" pathBase = monthsFolder + month + subfolder if not os.path.exists(pathBase): os.makedirs(pathBase)
def fieldEntriesDaysApart(months, metric, days,
                          monthsFolder=config.monthsFolder,
                          ignoreLock=False, outputPath=None,
                          outputFilename=None, filterParams="",
                          nosplitting=False, writeOut=False,
                          notifications=True, anonymous=False):
    """Find entries of ``metric`` seen on timestamps at least ``days`` apart.

    ``months`` is a comma-separated list of month folders.  Every row
    delivered by ``processdata.processMonth`` for these months that passes
    the filter built from ``filterParams`` contributes its ``timestamp``
    to each entry returned by ``utility.fetchEntries`` for ``metric``.  An
    entry is part of the result when the difference between its latest and
    its earliest timestamp is at least ``days`` whole days.

    The earliest and latest timestamps are tracked as minimum and maximum,
    so the result does not depend on the order in which months, day files
    or rows are processed.

    Timestamps that cannot be parsed (``ValueError``) are reported on
    stdout, counted, and summarized after processing.

    When ``writeOut`` is true the entries are written one per line, below
    a header line holding the metric name, to ``outputPath`` (default:
    ``<monthsFolder>/<months with "/" replaced by "_">/<metric>/``), which
    is created if missing.  The default file name is derived from the last
    month in ``months``; ``outputFilename`` replaces it.

    Calls ``sys.exit()`` if any month folder contains a ``locked`` file
    and ``ignoreLock`` is false.  Returns the set of qualifying entries.
    """
    monthList = months.split(",")

    for month in monthList:
        if os.path.isfile(
                utility.addMissingSlash(monthsFolder)
                + utility.addMissingSlash(month)
                + "locked") and not ignoreLock:
            print("ERROR: The month " + month + " is being edited at the moment." + " Use -i or ignoreLock = True if you want to force the execution of this script.")
            sys.exit()

    metric = utility.argMetric(metric)

    pathBase = utility.addMissingSlash(monthsFolder) \
        + utility.addMissingSlash(months.replace("/", "_")) \
        + utility.addMissingSlash(metric)
    if outputPath is not None:
        pathBase = utility.addMissingSlash(outputPath)

    addString = ""
    if anonymous:
        addString = "_anonymous_"

    # The default file name is built from the last listed month only.
    lastMonth = monthList[-1]
    outputFile = lastMonth.strip("/").replace(
        "/", "_") + "_" + metric + addString + "_" + str(days) + "_days_apart.tsv"
    if outputFilename is not None:
        outputFile = outputFilename

    header = metric + "\n"

    lineFilter = utility.filter()
    lineFilter.setup(filterParams)

    faultyTimestamps = defaultdict(int)

    class FieldEntriesDaysApartHandler:
        # Created anew on every fieldEntriesDaysApart() call because the
        # class statement itself is executed per call.
        firstSeen = dict()
        lastSeen = dict()
        fieldEntries = set()

        def handle(self, sparqlQuery, processed):
            if not lineFilter.checkLine(processed):
                return
            for key in utility.fetchEntries(processed, metric,
                                            nosplitting=nosplitting):
                timestamp = processed["timestamp"]
                try:
                    parsedTime = dateparser.parse(timestamp)
                except ValueError:
                    print("ERROR: Faulty timestamp " + str(timestamp))
                    faultyTimestamps[timestamp] += 1
                    continue
                if key not in self.firstSeen:
                    self.firstSeen[key] = parsedTime
                    self.lastSeen[key] = parsedTime
                elif parsedTime < self.firstSeen[key]:
                    # Rows are not guaranteed to arrive chronologically,
                    # so keep the minimum, not the first one encountered.
                    self.firstSeen[key] = parsedTime
                elif parsedTime > self.lastSeen[key]:
                    self.lastSeen[key] = parsedTime

        def compute(self):
            for key, firstTS in self.firstSeen.iteritems():
                lastTS = self.lastSeen[key]
                if (lastTS - firstTS).days >= days:
                    self.fieldEntries.add(key)

        def writeOut(self):
            with open(pathBase + outputFile, "w") as resultFile:
                resultFile.write(header)
                for key in self.fieldEntries:
                    resultFile.write(str(key) + "\n")

    handler = FieldEntriesDaysApartHandler()

    for month in monthList:
        if anonymous:
            processdata.processMonth(handler, month, monthsFolder,
                                     anonymous=True,
                                     notifications=notifications)
        else:
            processdata.processMonth(handler, month, monthsFolder,
                                     notifications=notifications)

    handler.compute()

    if len(faultyTimestamps) > 0:
        print("Faulty timestamp\tcount")
        for k, v in sorted(faultyTimestamps.iteritems(),
                           key=lambda pair: (pair[1], pair[0]),
                           reverse=True):
            print(str(k) + "\t" + str(v))

    if writeOut:
        if not os.path.exists(pathBase):
            os.makedirs(pathBase)
        handler.writeOut()

    return handler.fieldEntries