def fieldRanking(month, metric, monthsFolder = config.monthsFolder, ignoreLock = False,
                 outputPath = None, outputFilename = None, filterParams = "",
                 nosplitting = False, writeOut = False, notifications = True,
                 anonymous = False):
    """Count how often every value of `metric` occurs in the data of `month`.

    The month is run through processdata.processMonth with a handler that,
    for each line accepted by the filter, increments a counter for every
    entry returned by utility.fetchEntries.

    Parameters:
        month          -- month folder name, relative to monthsFolder
        metric         -- metric name; normalised with utility.argMetric
        monthsFolder   -- folder that contains the month folders
        ignoreLock     -- run even if the month folder contains a "locked" file
        outputPath     -- if given, replaces the default output directory
                          (monthsFolder/month/metric/)
        outputFilename -- if given, replaces the generated file name
        filterParams   -- passed to the setup() of utility.filter()
        nosplitting    -- forwarded to utility.fetchEntries
        writeOut       -- if true, write the ranking as a TSV file
        notifications  -- forwarded to processdata.processMonth
        anonymous      -- if true, processMonth is called with anonymous = True
                          and "_anonymous_" is inserted in the file name

    Returns the mapping value -> count. If the month is locked and ignoreLock
    is false, an error is printed and the interpreter exits via sys.exit().
    """
    lockMarker = utility.addMissingSlash(monthsFolder) + utility.addMissingSlash(month) + "locked"
    if os.path.isfile(lockMarker) and not ignoreLock:
        print("ERROR: The month " + month + " is being edited at the moment."
              + " Use -i or ignoreLock = True if you want to force the execution of this script.")
        sys.exit()

    metric = utility.argMetric(metric)

    # Default target directory; an explicit outputPath overrides it.
    targetDir = (utility.addMissingSlash(monthsFolder)
                 + utility.addMissingSlash(month)
                 + utility.addMissingSlash(metric))
    if outputPath is not None:
        targetDir = utility.addMissingSlash(outputPath)

    anonymousTag = "_anonymous_" if anonymous else ""

    targetName = outputFilename
    if targetName is None:
        targetName = month.strip("/").replace("/", "_") + "_" + metric + anonymousTag + "_Ranking.tsv"

    tsvHeader = metric + "\t" + metric + "_count\n"

    lineFilter = utility.filter()
    lineFilter.setup(filterParams)

    class FieldRankingHandler:
        # value of the metric -> number of accepted lines it appeared in
        totalMetricCounts = defaultdict(int)

        def handle(self, sparqlQuery, processed):
            # Lines rejected by the filter do not contribute to the ranking.
            if not lineFilter.checkLine(processed):
                return

            for entry in utility.fetchEntries(processed, metric, nosplitting = nosplitting):
                self.totalMetricCounts[entry] += 1

        def writeOut(self):
            # Highest count first; ties are ordered by key, also descending.
            ranked = sorted(self.totalMetricCounts.items(),
                            key=lambda entry: (entry[1], entry[0]),
                            reverse=True)
            with open(targetDir + targetName, "w") as tsv:
                tsv.write(tsvHeader)
                for name, count in ranked:
                    tsv.write(str(name) + "\t" + str(count) + "\n")

    handler = FieldRankingHandler()

    # The anonymous keyword is only passed when it was requested.
    processArgs = {"notifications": notifications}
    if anonymous:
        processArgs["anonymous"] = True
    processdata.processMonth(handler, month, monthsFolder, **processArgs)

    if writeOut:
        if not os.path.exists(targetDir):
            os.makedirs(targetDir)
        handler.writeOut()

    return handler.totalMetricCounts
# NOTE(review): the two functions below are the handle/printResults methods of
# CountRdfPropertiesHandler (instantiated further down); the class statement
# itself is not part of this excerpt, so they appear at the excerpt's own
# indentation.
def handle(self, sparqlQuery, processed):
    """Count one processed query and tally every predicate it lists.

    Every call increments self.queryCount. When args.onlyValid is set, rows
    whose '#Valid' column is not the string 'VALID' are not tallied any
    further. Otherwise the comma-separated '#Predicates' column is split and
    each item's counter in self.propQueryCounts is incremented.
    """
    self.queryCount += 1

    # Compare by value: the previous `is not 'VALID'` tested object identity,
    # which is not guaranteed to hold for an equal string read from data.
    if args.onlyValid and processed['#Valid'] != 'VALID':
        return

    for prop in processed["#Predicates"].split(","):
        self.propQueryCounts[prop] = self.propQueryCounts.get(prop, 0) + 1


def printResults(self):
    """Print the query total, then a property/count table, highest count first."""
    print("Queries: %d" % (self.queryCount))
    print("\n\nproperty\tcount")
    ranked = sorted(self.propQueryCounts.items(), key=operator.itemgetter(1), reverse=True)
    for prop, count in ranked:
        print("%s\t%d" % (prop, count))


handler = CountRdfPropertiesHandler()
processdata.processMonth(handler, args.month, args.monthsFolder)
handler.printResults()
def xyMapping(month, metricOne, metricTwo, monthsFolder = config.monthsFolder, ignoreLock = False,
              outputPath = None, outputFilename = None, filterParams = "",
              nosplittingOne = False, nosplittingTwo = False, writeOut = False,
              notifications = True):
    """Cross-tabulate the values of two metrics over the data of `month`.

    For every line accepted by the filter, each combination of an entry of
    metricTwo and an entry of metricOne (both obtained through
    utility.fetchEntries) has its counter incremented.

    Parameters:
        month          -- month folder name, relative to monthsFolder
        metricOne      -- first metric; normalised with utility.argMetric
        metricTwo      -- second metric; normalised with utility.argMetric
        monthsFolder   -- folder that contains the month folders
        ignoreLock     -- run even if the month folder contains a "locked" file
        outputPath     -- if given, replaces the default output directory
                          (monthsFolder/month/metricOne_metricTwo/)
        outputFilename -- if given, replaces the generated file name
        filterParams   -- passed to the setup() of utility.filter()
        nosplittingOne -- forwarded to utility.fetchEntries for metricOne
        nosplittingTwo -- forwarded to utility.fetchEntries for metricTwo
        writeOut       -- if true, hand the result to writeOutMethod
        notifications  -- forwarded to processdata.processMonth

    Returns a tuple (set of metricOne values seen,
    dict metricTwo value -> {metricOne value -> count}). If the month is
    locked and ignoreLock is false, an error is printed and the interpreter
    exits via sys.exit().
    """
    lockMarker = utility.addMissingSlash(monthsFolder) + utility.addMissingSlash(month) + "locked"
    if os.path.isfile(lockMarker) and not ignoreLock:
        print("ERROR: The month " + month + " is being edited at the "
              + "moment. Use -i if you want to force the execution of this script.")
        sys.exit()

    metricOne = utility.argMetric(metricOne)
    metricTwo = utility.argMetric(metricTwo)
    pairName = metricOne + "_" + metricTwo

    # Default target directory; an explicit outputPath overrides it.
    targetDir = (utility.addMissingSlash(monthsFolder)
                 + utility.addMissingSlash(month)
                 + utility.addMissingSlash(pairName))
    if outputPath is not None:
        targetDir = utility.addMissingSlash(outputPath)

    targetName = outputFilename
    if targetName is None:
        targetName = month.strip("/").replace("/", "_") + "_" + pairName + ".tsv"

    lineFilter = utility.filter()
    lineFilter.setup(filterParams)

    class hourlyFieldValueHandler:
        # every metricOne value encountered
        monthlyFieldValues = set()
        # metricTwo value -> (metricOne value -> count)
        monthlyData = dict()

        def handle(self, sparqlQuery, processed):
            # Lines rejected by the filter are ignored.
            if not lineFilter.checkLine(processed):
                return

            firstEntries = utility.fetchEntries(processed, metricOne, nosplittingOne)
            for secondKey in utility.fetchEntries(processed, metricTwo, nosplittingTwo):
                if secondKey not in self.monthlyData:
                    self.monthlyData[secondKey] = defaultdict(int)
                row = self.monthlyData[secondKey]
                for firstKey in firstEntries:
                    self.monthlyFieldValues.add(firstKey)
                    row[firstKey] += 1

        def writeHourlyValues(self):
            cornerLabel = metricTwo + "\\" + metricOne
            writeOutMethod(targetDir + targetName, self.monthlyFieldValues,
                           self.monthlyData, cornerLabel)

    handler = hourlyFieldValueHandler()
    processdata.processMonth(handler, month, monthsFolder, notifications = notifications)

    if writeOut:
        if not os.path.exists(targetDir):
            os.makedirs(targetDir)
        handler.writeHourlyValues()

    return (handler.monthlyFieldValues, handler.monthlyData)
+ utility.addMissingSlash(args.month) + "locked") \ and not args.ignoreLock: print "ERROR: The month " + args.month + " is being edited at the " + "moment. Use -i if you want to force the execution of this script." sys.exit() ranking = defaultdict(int) class rankDataTypesHandler: def handle(self, sparqlQuery, processed): for entry in re.findall(r'\^\^(.*?)( |\)|\\n)', str(sparqlQuery)): ranking[entry[0]] += 1 handler = rankDataTypesHandler() if args.anonymous: processdata.processMonth(handler, args.month, args.monthsFolder, anonymous=True) else: processdata.processMonth(handler, args.month, args.monthsFolder) print "count\tdataType" for k, v in sorted(ranking.iteritems(), key=lambda (k, v): (v, k), reverse=True): print str(v) + "\t" + k
if len(presentOperators) == 0: self.statistic["None"] += 1 elif other: self.statistic["Other"] += 1 else: self.statistic[", ".join(sorted(presentOperators))] += 1 def printSparqlTranslation(self): result = "" i = 1 for featureName, featureCount in sorted(self.statistic.iteritems()): #print(featureName + "\t" + str(featureCount)) print(featureCount) i += 1 print("") print(str(self.totalCount)) handler = OperatorStatisticHandler() processdata.processMonth(handler, args.month, args.monthsFolder, notifications=False) print args.position handler.printSparqlTranslation()
def fieldEntriesDaysApart(months, metric, days, monthsFolder=config.monthsFolder, ignoreLock=False,
                          outputPath=None, outputFilename=None, filterParams="", nosplitting=False,
                          writeOut=False, notifications=True, anonymous=False):
    """Collect the values of `metric` seen at least `days` days apart.

    All listed months are run through processdata.processMonth. For every
    line accepted by the filter, the line's "timestamp" field is parsed and
    the earliest and latest timestamp of each entry returned by
    utility.fetchEntries are tracked. An entry is part of the result when
    (latest - earliest).days >= days.

    Parameters:
        months         -- comma-separated month folder names
        metric         -- metric name; normalised with utility.argMetric
        days           -- minimum distance in days between first and last sighting
        monthsFolder   -- folder that contains the month folders
        ignoreLock     -- run even if a month folder contains a "locked" file
        outputPath     -- if given, replaces the default output directory
        outputFilename -- if given, replaces the generated file name
        filterParams   -- passed to the setup() of utility.filter()
        nosplitting    -- forwarded to utility.fetchEntries
        writeOut       -- if true, write the resulting entries to a file
        notifications  -- forwarded to processdata.processMonth
        anonymous      -- if true, processMonth is called with anonymous=True
                          and "_anonymous_" is inserted in the file name

    Returns the set of qualifying entries. If any month is locked and
    ignoreLock is false, an error is printed and the interpreter exits via
    sys.exit(). Timestamps that cannot be parsed are skipped and reported
    in a summary table on stdout.
    """
    monthList = months.split(",")

    for month in monthList:
        lockMarker = utility.addMissingSlash(monthsFolder) + utility.addMissingSlash(month) + "locked"
        if os.path.isfile(lockMarker) and not ignoreLock:
            print("ERROR: The month " + month + " is being edited at the moment."
                  + " Use -i or ignoreLock = True if you want to force the execution of this script.")
            sys.exit()

    metric = utility.argMetric(metric)

    pathBase = utility.addMissingSlash(monthsFolder) \
        + utility.addMissingSlash(months.replace("/", "_")) \
        + utility.addMissingSlash(metric)
    if outputPath is not None:
        pathBase = utility.addMissingSlash(outputPath)

    addString = "_anonymous_" if anonymous else ""

    # The generated file name has always been based on the LAST listed month
    # (previously through the loop variable leaking out of the lock check).
    # That is kept so existing output names do not change, but is now explicit.
    outputFile = monthList[-1].strip("/").replace("/", "_") \
        + "_" + metric + addString + "_" + str(days) + "_days_apart.tsv"
    if outputFilename is not None:
        outputFile = outputFilename

    header = metric + "\n"

    lineFilter = utility.filter()
    lineFilter.setup(filterParams)

    # raw timestamp string -> number of times it failed to parse
    faultyTimestamps = defaultdict(int)

    class FieldEntriesDaysApartHandler:
        # entry -> earliest / latest parsed timestamp it was seen with
        firstSeen = dict()
        lastSeen = dict()
        # entries whose sightings span at least `days` days (filled by compute)
        fieldEntries = set()

        def handle(self, sparqlQuery, processed):
            if not lineFilter.checkLine(processed):
                return

            for key in utility.fetchEntries(processed, metric, nosplitting=nosplitting):
                timestamp = processed["timestamp"]
                try:
                    parsedTime = dateparser.parse(timestamp)
                except ValueError:
                    parsedTime = None

                # NOTE(review): a None result is treated like a parse error so
                # it can never reach the comparisons below; confirm whether the
                # parser in use can actually return None.
                if parsedTime is None:
                    print("ERROR: Faulty timestamp " + str(timestamp))
                    faultyTimestamps[timestamp] += 1
                    continue

                if key not in self.firstSeen:
                    self.firstSeen[key] = parsedTime
                    self.lastSeen[key] = parsedTime
                elif parsedTime < self.firstSeen[key]:
                    # Input is not guaranteed to arrive chronologically (e.g.
                    # months listed out of order), so track the true minimum.
                    self.firstSeen[key] = parsedTime
                elif parsedTime > self.lastSeen[key]:
                    self.lastSeen[key] = parsedTime

        def compute(self):
            for key, firstTS in self.firstSeen.items():
                if (self.lastSeen[key] - firstTS).days >= days:
                    self.fieldEntries.add(key)

        def writeOut(self):
            with open(pathBase + outputFile, "w") as outFile:
                outFile.write(header)
                for key in self.fieldEntries:
                    outFile.write(str(key) + "\n")

    handler = FieldEntriesDaysApartHandler()

    for month in monthList:
        if anonymous:
            processdata.processMonth(handler, month, monthsFolder, anonymous=True,
                                     notifications=notifications)
        else:
            processdata.processMonth(handler, month, monthsFolder, notifications=notifications)

    handler.compute()

    if len(faultyTimestamps) > 0:
        print("Faulty timestamp\tcount")
        # Most frequent first; ties ordered by timestamp string, descending.
        ranked = sorted(faultyTimestamps.items(), key=lambda entry: (entry[1], entry[0]), reverse=True)
        for rawTimestamp, count in ranked:
            print(str(rawTimestamp) + "\t" + str(count))

    if writeOut:
        if not os.path.exists(pathBase):
            os.makedirs(pathBase)
        handler.writeOut()

    return handler.fieldEntries