Пример #1
0
def fieldRanking(month, metric, monthsFolder = config.monthsFolder, ignoreLock = False, outputPath = None, outputFilename = None, filterParams = "", nosplitting = False, writeOut = False, notifications = True, anonymous = False):
    """Count how often each entry of ``metric`` occurs in the data of one month.

    Args:
        month: Month folder name, relative to ``monthsFolder``.
        metric: Metric name; passed through ``utility.argMetric`` before use.
        monthsFolder: Folder that contains the month folders.
        ignoreLock: If true, run even though a ``locked`` file exists in the
            month folder.
        outputPath: Folder to write the ranking to. Defaults to
            ``<monthsFolder>/<month>/<metric>/``.
        outputFilename: File name to use instead of the generated one.
        filterParams: Handed to ``utility.filter().setup``; lines rejected by
            the filter's ``checkLine`` are not counted.
        nosplitting: Forwarded to ``utility.fetchEntries``.
        writeOut: If true, write the ranking as a TSV file.
        notifications: Forwarded to ``processdata.processMonth``.
        anonymous: If true, ``anonymous = True`` is forwarded to
            ``processdata.processMonth`` and ``_anonymous_`` is added to the
            generated file name.

    Returns:
        A ``defaultdict(int)`` mapping each entry to its number of occurrences.

    Exits the interpreter through ``sys.exit()`` when the month is locked and
    ``ignoreLock`` is false.
    """
    lockFile = utility.addMissingSlash(monthsFolder) \
        + utility.addMissingSlash(month) + "locked"
    if os.path.isfile(lockFile) and not ignoreLock:
        # The message has to be ONE expression. Written as two statements the
        # second one is a unary plus on a string, which raises TypeError and
        # prevents sys.exit() from ever being reached.
        print("ERROR: The month " + month + " is being edited at the moment."
              + " Use -i or ignoreLock = True if you want to force the execution of this script.")
        sys.exit()

    metric = utility.argMetric(metric)

    if outputPath is not None:
        pathBase = utility.addMissingSlash(outputPath)
    else:
        pathBase = utility.addMissingSlash(monthsFolder) \
            + utility.addMissingSlash(month) \
            + utility.addMissingSlash(metric)

    addString = ""
    if anonymous:
        addString = "_anonymous_"

    if outputFilename is not None:
        outputFile = outputFilename
    else:
        outputFile = month.strip("/").replace("/", "_") + "_" + metric + addString + "_Ranking.tsv"

    header = metric + "\t" + metric + "_count\n"

    # Named lineFilter so that the builtin filter() stays usable in here.
    lineFilter = utility.filter()
    lineFilter.setup(filterParams)

    class FieldRankingHandler:
        """Handler passed to processdata.processMonth; tallies metric entries."""

        def __init__(self):
            self.totalMetricCounts = defaultdict(int)

        def handle(self, sparqlQuery, processed):
            # Lines rejected by the filter do not contribute to the ranking.
            if not lineFilter.checkLine(processed):
                return

            for key in utility.fetchEntries(processed, metric, nosplitting = nosplitting):
                self.totalMetricCounts[key] += 1

        def writeOut(self):
            # Highest count first; equal counts are ordered by key, descending.
            ranked = sorted(self.totalMetricCounts.iteritems(),
                            key=lambda entry: (entry[1], entry[0]),
                            reverse=True)
            with open(pathBase + outputFile, "w") as rankingFile:
                rankingFile.write(header)
                for key, count in ranked:
                    rankingFile.write(str(key) + "\t" + str(count) + "\n")

    handler = FieldRankingHandler()

    # The anonymous keyword is only passed when requested, so processMonth's
    # own default applies otherwise.
    if anonymous:
        processdata.processMonth(handler, month, monthsFolder, anonymous = True, notifications = notifications)
    else:
        processdata.processMonth(handler, month, monthsFolder, notifications = notifications)

    if writeOut:
        if not os.path.exists(pathBase):
            os.makedirs(pathBase)
        handler.writeOut()
    return handler.totalMetricCounts
Пример #2
0
    def handle(self, sparqlQuery, processed):
        """Count one query and tally every predicate listed for it.

        ``self.queryCount`` is incremented for every call, including calls
        that are skipped afterwards because ``args.onlyValid`` is set and the
        ``#Valid`` field is not ``VALID``.

        Args:
            sparqlQuery: Not used by this handler.
            processed: Mapping with the fields ``#Valid`` and ``#Predicates``;
                the latter is read as a comma-separated list.
        """
        self.queryCount += 1

        # Strings must be compared by value. `is not` tests object identity,
        # which is true for any 'VALID' string that was read from a file
        # rather than being the very same literal object.
        if args.onlyValid and processed['#Valid'] != 'VALID':
            return

        for prop in processed["#Predicates"].split(","):
            self.propQueryCounts[prop] = self.propQueryCounts.get(prop, 0) + 1

    def printResults(self):
        """Print the query total and a property/count table, highest count first."""
        print("Queries: %d" % self.queryCount)
        print("\n\nproperty\tcount")

        ranked = sorted(self.propQueryCounts.iteritems(),
                        key=lambda entry: entry[1],
                        reverse=True)
        for entry in ranked:
            # entry is the (property, count) pair and fills both placeholders.
            print("%s\t%d" % entry)


# Script entry: hand a fresh counting handler to processdata.processMonth for
# the month given on the command line, then print what it collected.
# NOTE(review): presumably processMonth calls handler.handle once per query;
# confirm in processdata.
handler = CountRdfPropertiesHandler()
processdata.processMonth(handler, args.month, args.monthsFolder)

handler.printResults()
Пример #3
0
def xyMapping(month, metricOne, metricTwo, monthsFolder = config.monthsFolder, ignoreLock = False, outputPath = None, outputFilename = None, filterParams = "", nosplittingOne = False, nosplittingTwo = False, writeOut = False, notifications = True):
    """Count how often entries of two metrics occur together within one month.

    Args:
        month: Month folder name, relative to ``monthsFolder``.
        metricOne: First metric; passed through ``utility.argMetric``.
        metricTwo: Second metric; passed through ``utility.argMetric``.
        monthsFolder: Folder that contains the month folders.
        ignoreLock: If true, run even though a ``locked`` file exists in the
            month folder.
        outputPath: Folder to write to. Defaults to
            ``<monthsFolder>/<month>/<metricOne>_<metricTwo>/``.
        outputFilename: File name to use instead of the generated one.
        filterParams: Handed to ``utility.filter().setup``; lines rejected by
            the filter's ``checkLine`` are skipped.
        nosplittingOne: Forwarded to ``utility.fetchEntries`` for ``metricOne``.
        nosplittingTwo: Forwarded to ``utility.fetchEntries`` for ``metricTwo``.
        writeOut: If true, the result is written through ``writeOutMethod``.
        notifications: Forwarded to ``processdata.processMonth``.

    Returns:
        A tuple ``(monthlyFieldValues, monthlyData)``: the set of all
        ``metricOne`` entries that were counted, and a dict that maps each
        ``metricTwo`` entry to a ``defaultdict(int)`` of ``metricOne`` entry
        counts.

    Exits the interpreter through ``sys.exit()`` when the month is locked and
    ``ignoreLock`` is false.
    """
    lockFile = utility.addMissingSlash(monthsFolder) \
        + utility.addMissingSlash(month) + "locked"
    if os.path.isfile(lockFile) and not ignoreLock:
        # The message has to be ONE expression. Written as two statements the
        # second one is a unary plus on a string, which raises TypeError and
        # prevents sys.exit() from ever being reached.
        print("ERROR: The month " + month + " is being edited at the "
              + "moment. Use -i if you want to force the execution of this script.")
        sys.exit()

    metricOne = utility.argMetric(metricOne)
    metricTwo = utility.argMetric(metricTwo)

    folderName = metricOne + "_" + metricTwo

    if outputPath is not None:
        pathBase = utility.addMissingSlash(outputPath)
    else:
        pathBase = utility.addMissingSlash(monthsFolder) \
            + utility.addMissingSlash(month) \
            + utility.addMissingSlash(folderName)

    if outputFilename is not None:
        outputFile = outputFilename
    else:
        outputFile = month.strip("/").replace("/", "_") + "_" + folderName + ".tsv"

    # Named lineFilter so that the builtin filter() stays usable in here.
    lineFilter = utility.filter()
    lineFilter.setup(filterParams)

    class hourlyFieldValueHandler:
        """Handler passed to processdata.processMonth; builds the co-occurrence table."""

        def __init__(self):
            self.monthlyFieldValues = set()
            self.monthlyData = dict()

        def handle(self, sparqlQuery, processed):
            if not lineFilter.checkLine(processed):
                return

            # NOTE(review): entriesOne is iterated once per keyTwo below; this
            # assumes fetchEntries returns a re-iterable collection rather
            # than a one-shot generator -- confirm in utility.
            entriesOne = utility.fetchEntries(processed, metricOne, nosplittingOne)

            for keyTwo in utility.fetchEntries(processed, metricTwo, nosplittingTwo):
                if keyTwo not in self.monthlyData:
                    self.monthlyData[keyTwo] = defaultdict(int)

                for keyOne in entriesOne:
                    self.monthlyFieldValues.add(keyOne)
                    self.monthlyData[keyTwo][keyOne] += 1

        def writeHourlyValues(self):
            # writeOutMethod is neither a parameter nor a local of xyMapping;
            # it is looked up in the enclosing module scope.
            writeOutMethod(pathBase + outputFile, self.monthlyFieldValues, self.monthlyData, metricTwo + "\\" + metricOne)

    handler = hourlyFieldValueHandler()

    processdata.processMonth(handler, month, monthsFolder, notifications = notifications)

    if writeOut:
        if not os.path.exists(pathBase):
            os.makedirs(pathBase)
        handler.writeHourlyValues()
    return (handler.monthlyFieldValues, handler.monthlyData)
Пример #4
0
                  + utility.addMissingSlash(args.month) + "locked") \
   and not args.ignoreLock:
    # The message has to be ONE expression. Written as two statements the
    # second one is a unary plus on a string, which raises TypeError and
    # prevents sys.exit() from ever being reached.
    print("ERROR: The month " + args.month + " is being edited at the "
          + "moment. Use -i if you want to force the execution of this script.")
    sys.exit()

# Module-wide tally: datatype text -> number of occurrences.
ranking = defaultdict(int)


class rankDataTypesHandler:
    """Tallies the text that follows ``^^`` in query strings into ``ranking``."""

    def handle(self, sparqlQuery, processed):
        """Add one to ``ranking`` for every ``^^<type>`` found in the query.

        The type is the shortest text after ``^^`` that is followed by a
        space, a closing parenthesis, or a literal backslash-n sequence.
        ``processed`` is not used.
        """
        queryText = str(sparqlQuery)
        for match in re.finditer(r'\^\^(.*?)( |\)|\\n)', queryText):
            ranking[match.group(1)] += 1


# Script entry: feed the requested month through the datatype handler, then
# print the tally, highest count first (equal counts by datatype, descending).
handler = rankDataTypesHandler()

# The anonymous keyword is only passed when it was requested.
if not args.anonymous:
    processdata.processMonth(handler, args.month, args.monthsFolder)
else:
    processdata.processMonth(handler, args.month, args.monthsFolder, anonymous=True)

print("count\tdataType")
for dataType, occurrences in sorted(ranking.iteritems(),
                                    key=lambda pair: (pair[1], pair[0]),
                                    reverse=True):
    print(str(occurrences) + "\t" + dataType)
            if len(presentOperators) == 0:
                self.statistic["None"] += 1
            elif other:
                self.statistic["Other"] += 1
            else:
                self.statistic[", ".join(sorted(presentOperators))] += 1

    def printSparqlTranslation(self):
        """Print the count of every entry of ``self.statistic``, ordered by name.

        One count per line (the names themselves are not printed), followed
        by an empty line and ``self.totalCount``.
        """
        # Mapping keys are unique, so ordering the names yields the same
        # sequence as ordering the (name, count) pairs.
        for featureName in sorted(self.statistic):
            print(self.statistic[featureName])

        print("")
        print(str(self.totalCount))


# Script entry: collect the operator statistic for the requested month with
# notifications switched off, echo the position argument, print the result.
handler = OperatorStatisticHandler()

processdata.processMonth(handler, args.month, args.monthsFolder, notifications=False)

print(args.position)

handler.printSparqlTranslation()
Пример #6
0
def fieldEntriesDaysApart(months,
                          metric,
                          days,
                          monthsFolder=config.monthsFolder,
                          ignoreLock=False,
                          outputPath=None,
                          outputFilename=None,
                          filterParams="",
                          nosplitting=False,
                          writeOut=False,
                          notifications=True,
                          anonymous=False):
    """Collect the entries of ``metric`` seen at least ``days`` days apart.

    For every entry the earliest and the latest timestamp over all given
    months are tracked; an entry is reported when the difference between the
    two is ``days`` whole days or more.

    Args:
        months: Comma-separated month folder names, relative to
            ``monthsFolder``.
        metric: Metric name; passed through ``utility.argMetric`` before use.
        days: Minimum number of days between earliest and latest occurrence.
        monthsFolder: Folder that contains the month folders.
        ignoreLock: If true, run even though a ``locked`` file exists in one
            of the month folders.
        outputPath: Folder to write to. Defaults to
            ``<monthsFolder>/<months with "/" replaced by "_">/<metric>/``.
        outputFilename: File name to use instead of the generated one.
        filterParams: Handed to ``utility.filter().setup``; lines rejected by
            the filter's ``checkLine`` are skipped.
        nosplitting: Forwarded to ``utility.fetchEntries``.
        writeOut: If true, write the entries to a file, one per line.
        notifications: Forwarded to ``processdata.processMonth``.
        anonymous: If true, ``anonymous=True`` is forwarded to
            ``processdata.processMonth`` and ``_anonymous_`` is added to the
            generated file name.

    Returns:
        The set of entries whose occurrences span at least ``days`` days.

    Exits the interpreter through ``sys.exit()`` when one of the months is
    locked and ``ignoreLock`` is false.
    """
    monthList = months.split(",")

    for month in monthList:
        lockFile = utility.addMissingSlash(monthsFolder) \
            + utility.addMissingSlash(month) + "locked"
        if os.path.isfile(lockFile) and not ignoreLock:
            print("ERROR: The month " + month + " is being edited at the moment."
                  + " Use -i or ignoreLock = True if you want to force the execution of this script.")
            sys.exit()

    metric = utility.argMetric(metric)

    if outputPath is not None:
        pathBase = utility.addMissingSlash(outputPath)
    else:
        pathBase = utility.addMissingSlash(monthsFolder) \
            + utility.addMissingSlash(months.replace("/", "_")) \
            + utility.addMissingSlash(metric)

    addString = ""
    if anonymous:
        addString = "_anonymous_"

    if outputFilename is not None:
        outputFile = outputFilename
    else:
        # The generated name is based on the LAST month of the list. This is
        # what the code always produced (through the leaked loop variable);
        # it is spelled out here so the file name stays unchanged.
        # NOTE(review): the folder name uses all months -- confirm whether the
        # file name was meant to as well.
        outputFile = monthList[-1].strip("/").replace("/", "_") \
            + "_" + metric + addString + "_" + str(days) + "_days_apart.tsv"

    header = metric + "\n"

    # Named lineFilter so that the builtin filter() stays usable in here.
    lineFilter = utility.filter()
    lineFilter.setup(filterParams)

    faultyTimestamps = defaultdict(int)

    class FieldEntriesDaysApartHandler:
        """Handler passed to processdata.processMonth; tracks first/last sightings."""

        def __init__(self):
            self.firstSeen = dict()
            self.lastSeen = dict()
            self.fieldEntries = set()

        def handle(self, sparqlQuery, processed):
            if not lineFilter.checkLine(processed):
                return

            for key in utility.fetchEntries(processed,
                                            metric,
                                            nosplitting=nosplitting):
                # Parsed inside the loop on purpose: a faulty timestamp is
                # reported and counted once per entry of the line, as before.
                timestamp = processed["timestamp"]
                try:
                    parsedTime = dateparser.parse(timestamp)
                except ValueError:
                    print("ERROR: Faulty timestamp " + str(timestamp))
                    faultyTimestamps[timestamp] += 1
                    continue

                if key not in self.firstSeen:
                    self.firstSeen[key] = parsedTime
                    self.lastSeen[key] = parsedTime
                elif parsedTime < self.firstSeen[key]:
                    # Data is not guaranteed to arrive in chronological order
                    # (e.g. months listed newest first), so the earliest
                    # sighting is tracked the same way as the latest one.
                    self.firstSeen[key] = parsedTime
                elif parsedTime > self.lastSeen[key]:
                    self.lastSeen[key] = parsedTime

        def compute(self):
            for key, firstTS in self.firstSeen.iteritems():
                if (self.lastSeen[key] - firstTS).days >= days:
                    self.fieldEntries.add(key)

        def writeOut(self):
            with open(pathBase + outputFile, "w") as entriesFile:
                entriesFile.write(header)
                for key in self.fieldEntries:
                    entriesFile.write(str(key) + "\n")

    handler = FieldEntriesDaysApartHandler()

    # One handler instance accumulates over all months.
    for month in monthList:
        if anonymous:
            processdata.processMonth(handler,
                                     month,
                                     monthsFolder,
                                     anonymous=True,
                                     notifications=notifications)
        else:
            processdata.processMonth(handler,
                                     month,
                                     monthsFolder,
                                     notifications=notifications)

    handler.compute()

    if len(faultyTimestamps) > 0:
        print("Faulty timestamp\tcount")
        # Most frequent first; equal counts by timestamp, descending.
        for timestamp, count in sorted(faultyTimestamps.iteritems(),
                                       key=lambda entry: (entry[1], entry[0]),
                                       reverse=True):
            print(str(timestamp) + "\t" + str(count))

    if writeOut:
        if not os.path.exists(pathBase):
            os.makedirs(pathBase)
        handler.writeOut()
    return handler.fieldEntries