def main():
    """Filter and sanitize the records read from stdin and print each one.

    Every record produced by ``stdin_reader()`` is passed through
    ``filter_dicts`` and then ``sanitize_dicts``; each surviving record is
    serialized with ``dumper`` and printed to stdout, one per line. When the
    stream is exhausted the number of records written is reported on stderr.
    """
    # Initialised up front so an empty input reports 0 instead of raising
    # NameError on the final print.
    records = 0
    stream = sanitize_dicts(filter_dicts(stdin_reader()))
    # Counting from 1 makes ``records`` the number of items written rather
    # than the zero-based index of the last one.
    for records, item in enumerate(stream, 1):
        # Keys are deliberately not sorted. Timings noted for 10 minutes of
        # data: 131 seconds with sort_keys=True (system json), 66 seconds
        # without (system json), 58 seconds without (simplejson).
        print(dumper(item))
    print("{0} records written".format(records), file=stderr)
def main():
    """Insert the records read from stdin into the ``urlhist`` collection.

    Each record's ``"timestamp"`` string is parsed with the format
    ``%Y-%m-%dT%H:%M:%S`` and replaced by the resulting ``datetime`` before
    the record is inserted into ``urlhist`` in the ``data`` database of a
    default ``MongoClient()``. The number of records read is reported on
    stderr.

    Raises:
        KeyError: if a record has no ``"timestamp"`` entry.
        ValueError: if a timestamp does not match the expected format.
    """
    date_fmt = "%Y-%m-%dT%H:%M:%S"
    converter = datetime.strptime
    # Initialised up front so an empty input reports 0 instead of raising
    # NameError on the final print.
    records = 0
    client = MongoClient()
    try:
        collection = client.data["urlhist"]
        # Counting from 1 makes ``records`` the number of documents handled
        # rather than the zero-based index of the last one.
        for records, doc in enumerate(stdin_reader(), 1):
            doc["timestamp"] = converter(doc["timestamp"], date_fmt)
            # NOTE(review): Collection.insert is deprecated since PyMongo 3.0
            # and removed in 4.0; switch to insert_one once the deployed
            # driver version is confirmed.
            collection.insert(doc)
    finally:
        # Release the connection even when parsing or inserting fails.
        client.close()
    print("{0} records read".format(records), file=stderr)
def main():
    """Print a JSON object that maps each global hash to its long URL.

    Fields read from every record:
        * g : bitly global hash identifier
        * u : long URL

    The mapping is emitted as JSON rather than CSV: no punctuation mark was
    found that does not also occur inside the long URLs, so there is no
    obvious character a CSV could be split on.
    """
    url_by_hash = {}
    for record in stdin_reader():
        global_hash = record["g"]
        # A hash seen more than once keeps the URL of its last record.
        url_by_hash[global_hash] = record["u"]
    print(simplejson.dumps(url_by_hash))
def main(countries=None):
    """Filter and sanitize the records read from stdin and print them as JSON.

    The value returned by ``load_germanwings()`` and *countries* are handed
    to ``filter_dicts`` together with the ``stdin_reader()`` stream; the
    result goes through ``sanitize_dicts`` and every remaining record is
    printed to stdout as one JSON document per line. Progress and the final
    record count are written to stderr.

    Args:
        countries: passed through unchanged to ``filter_dicts``; defaults to
            ``None``.

    Any exception is caught and reported on stderr; the function then
    returns normally, as it did before.
    """
    try:
        germanwings = load_germanwings()
        dumper = simplejson.dumps
        print("Extracting urls", file=stderr)
        records = 0
        stream = sanitize_dicts(
            filter_dicts(stdin_reader(), germanwings, countries))
        # Counting from 1 makes ``records`` the number of items written
        # rather than the zero-based index of the last one.
        for records, item in enumerate(stream, 1):
            print(dumper(item))
        print("{0} records written".format(records), file=stderr)
    except Exception as exc:
        # stdout carries the JSON records, so the error message must not be
        # mixed into it; send diagnostics to stderr like the other messages.
        print(exc, file=stderr)
def main(fields):
    """Count the records read from stdin per minute and per *fields* value.

    Every record's ``"timestamp"`` is parsed with ``dateutil`` and truncated
    to the minute. Records are grouped by the values of *fields* together
    with that minute, and one JSON document is printed per group, in sorted
    order, with the entries:

        * ``key``: the value of the field when a single field is given,
          otherwise the list of field values;
        * ``timestamp``: the start of the minute as an ISO-8601 string
          (``YYYY-MM-DDTHH:MM:SS``);
        * ``count``: the number of records in the group.

    Args:
        fields: names of the record entries to group by.

    Raises:
        KeyError: if a record lacks ``"timestamp"`` or one of *fields*.
    """
    field_names = tuple(fields)
    rows = []
    for item in stdin_reader():
        dt = dateutil.parser.parse(item["timestamp"])
        # Year and month come from the data instead of being assumed, so
        # records from different months are neither merged nor mislabelled.
        rows.append(
            tuple(item[name] for name in field_names)
            + (dt.year, dt.month, dt.day, dt.hour, dt.minute)
        )
    # groupby only merges adjacent equal rows, hence the sort.
    rows.sort()
    for row, group in itertools.groupby(rows):
        # The five time components are always the tail of the row, whatever
        # the number of grouping fields.
        field_values = row[:-5]
        minute_start = datetime(*row[-5:])
        d = {
            "key": (field_values[0] if len(field_values) == 1
                    else list(field_values)),
            # datetime objects are not JSON serializable; emit ISO-8601 text.
            "timestamp": minute_start.isoformat(),
            "count": sum(1 for _ in group),
        }
        print(simplejson.dumps(d))
def main(fields):
    """Count the records read from stdin per minute and per *fields* value.

    Every record's ``"timestamp"`` is parsed with ``dateutil`` and truncated
    to the minute. Records are grouped by the values of *fields* together
    with that minute, and one JSON document is printed per group, in sorted
    order, with the entries:

        * ``key``: the value of the field when a single field is given,
          otherwise the list of field values;
        * ``timestamp``: the start of the minute as an ISO-8601 string
          (``YYYY-MM-DDTHH:MM:SS``);
        * ``count``: the number of records in the group.

    Args:
        fields: names of the record entries to group by.

    Raises:
        KeyError: if a record lacks ``"timestamp"`` or one of *fields*.
    """
    field_names = tuple(fields)
    rows = []
    for item in stdin_reader():
        dt = dateutil.parser.parse(item["timestamp"])
        # Year and month come from the data instead of being assumed, so
        # records from different months are neither merged nor mislabelled.
        rows.append(
            tuple(item[name] for name in field_names)
            + (dt.year, dt.month, dt.day, dt.hour, dt.minute)
        )
    # groupby only merges adjacent equal rows, hence the sort.
    rows.sort()
    for row, group in itertools.groupby(rows):
        # The five time components are always the tail of the row, whatever
        # the number of grouping fields.
        field_values = row[:-5]
        minute_start = datetime(*row[-5:])
        d = {
            "key": (field_values[0] if len(field_values) == 1
                    else list(field_values)),
            # datetime objects are not JSON serializable; emit ISO-8601 text.
            "timestamp": minute_start.isoformat(),
            "count": sum(1 for _ in group),
        }
        print(simplejson.dumps(d))
def main(key):
    """Print how often each value of *key* occurs, most frequent first.

    Records are read with ``stdin_reader(stdin)``; those without *key* are
    skipped. One ``"<value>, <count>"`` line is printed per distinct value,
    ordered by descending count.

    Args:
        key: name of the record entry whose values are counted.
    """
    tally = Counter()
    for record in stdin_reader(stdin):
        if key in record:
            tally[record[key]] += 1

    ranked = list(tally.items())
    # Stable sort on the count only, so equal counts keep first-seen order.
    ranked.sort(key=itemgetter(1), reverse=True)
    for value, count in ranked:
        print("{0}, {1}".format(value, count))