def get_data(self, f, **kwargs):
    location = kwargs.get('meetlocatie', None)
    if location and isinstance(location, MeetLocatie):
        location = location.name
    params = kwargs.get('parameter', self.parm)
    if params and isinstance(params, Parameter):
        params = [params.name]
    # find data for location and one of the parameters (always one parameter per json file)
    dfs = {}
    for ts in ijson.items(f, 'results.item'):
        pname = ts['name']
        if not pname in params:
            # only 1 parameter per file
            break
        lcode = ts['location']['organisation_code']
        if not location or lcode == location:
            events = ts['events']
            data = []
            if events:
                for e in events:
                    tmin = e['min']
                    tmax = e['max']
                    tgem = (tmin + tmax) / 2 if (tmin and tmax) else None
                    t = e['timestamp']
                    data.append((datetime.datetime.fromtimestamp(t / 1000), tgem))
            if data:
                df = pd.DataFrame.from_records(data, index=['datum'], columns=['datum', pname])
                if location:
                    # requested for a single location
                    return df
                dfs[lcode] = df
    return dfs
def iter_locations(self, fil):
    ''' iterates over point locations and returns id, coords, description tuple'''
    for feature in ijson.items(fil, 'results.item.location'):
        geom = feature['geometry']
        if geom and geom['type'] == 'Point':
            x, y, z = geom['coordinates']
            coords = [float(x), float(y)]
            mcode = feature['organisation_code']
            moms = feature['name']
            yield (mcode, coords, moms)
def parse_json(self):
    """
    yields list of values, where each value corresponds to each dictionary key in keys,
    yields over all JSON objects in .json file
    """
    with open(self.filepath, 'rb') as f:
        for json_obj in ijson.items(f, 'item'):
            # since the file is a list of json objects, each json_obj currently is a dictionary
            yield [self.get_values(json_obj, keys) for keys in self.all_keys]
def _get_structure_info(self, view):
    """ """
    # get structure info command
    text = view.substr(Region(0, view.size()))
    cmd = self.get_structure_info_cmd(view, text)
    timeout = self.get_settings(view, "sourcekitten_timeout", 1.0)

    # run structure info command
    p = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT)
    structure_info = list(ijson.items(p.stdout, ''))[0]
    return structure_info
def get_entries(fp, iterative=True):
    if fp is sys.stdin:
        iterative = True
        fp = fp.buffer
    if ijson is None or not iterative:
        data = fp.read()
        if isinstance(data, bytes):
            data = data.decode('utf-8')
        data = json.loads(data)
        return data['log']['entries']
    else:
        return ijson.items(fp, 'log.entries.item')
def init_from_cosmogony(cls, cosmogony_path):
    zones_index = cls()
    print('Reading zones...')
    with open(cosmogony_path, 'rb') as f:
        zones = ijson.items(f, 'zones.item')
        for z in zones:
            z.pop('geometry', None)
            zones_index.insert(z)
    print('{} zones have been read'.format(len(zones_index)))
    zones_index.build_children()
    return zones_index
def _iter_locations_fo(fo, start, stop) -> Iterator[Location]:
    logger = get_logger()
    total = 0
    errors = 0

    try:
        from mycfg.locations import LOCATIONS as known_locations
    except ModuleNotFoundError as e:
        name = 'mycfg.locations'
        if e.name != name:
            raise e
        logger.warning("'%s' isn't found. setting known_locations to empty list", name)
        known_locations = []

    # TODO tagging should be takeout-agnostic
    def tagger(dt: datetime, point: geopy.Point) -> Tag:
        '''
        Tag points with known locations (e.g. work/home/etc)
        '''
        for lat, lon, dist, tag in known_locations:
            # TODO use something more efficient?
            if geopy.distance.distance((lat, lon), point).m < dist:
                return tag
        else:
            return None

    for j in islice(ijson.items(fo, 'locations.item'), start, stop):
        dt = datetime.utcfromtimestamp(int(j["timestampMs"]) / 1000)
        if total % 10000 == 0:
            logger.info('processing item %d %s', total, dt)
        total += 1

        dt = pytz.utc.localize(dt)
        try:
            lat = float(j["latitudeE7"] / 10000000)
            lon = float(j["longitudeE7"] / 10000000)
            point = geopy.Point(lat, lon)  # kinda sanity check that coordinates are ok
        except Exception as e:
            logger.exception(e)
            errors += 1
            if float(errors) / total > 0.01:
                raise RuntimeError('too many errors! aborting')
            else:
                continue

        alt = j.get("altitude", None)
        tag = tagger(dt, point)  # TODO take accuracy into account??
        yield Location(dt=dt, lat=lat, lon=lon, alt=alt, tag=tag)
def load_batches(filename, model, batch_size=0):
    batch_index = 0
    batches = []
    item_count = 0
    index = 0
    group = ''
    progress = utils.ProgressText()
    batch_dims = load_batches_dims(filename, model, batch_size)
    with open(filename, 'r') as jsonfile:
        items = ijson.items(jsonfile, 'item')
        for index, item in enumerate(items):
            if batch_size > 0:
                batch_index = int(index / batch_size)
            # start of batch
            if batch_index >= len(batches):
                batches.append({'labels': []})
                for group in model.io_names:
                    batches[batch_index][group] = np.zeros(
                        batch_dims[batch_index][group], dtype=np.float32)
            for group in model.io_names:
                to_array(
                    model.io_names[group],
                    item['data'][group],
                    batches[batch_index][group][item_count],
                    batch_dims[batch_index][group][1:]
                )
            if 'label' in item:
                batches[batch_index]['labels'].append(item['label'])
            else:
                batches[batch_index]['labels'].append('')
            item_count += 1
            # end of batch
            if batch_size == item_count:
                item_count = 0
            if ((index + 1) % 100) == 0:
                progress.text('import items: ' + str(index + 1) + '/' +
                              str(batch_dims[batch_index][group][0]))
        progress.text('import items: ' + str(index + 1) + '/' +
                      str(batch_dims[batch_index][group][0]))
    sys.stdout.write('\n')
    return batches
def _iter_via_ijson(fo) -> Iterator[TsLatLon]:
    # ijson version takes 25 seconds for 1M items (without processing)
    try:
        # pip3 install ijson cffi
        import ijson.backends.yajl2_cffi as ijson  # type: ignore
    except:
        import warnings
        warnings.warn("Falling back to default ijson because 'cffi' backend isn't found. It's up to 2x faster, you might want to check it out")
        import ijson  # type: ignore

    for d in ijson.items(fo, 'locations.item'):
        yield (
            int(d['timestampMs']),
            d['latitudeE7'],
            d['longitudeE7'],
        )
def _import_cosmogony_to_pg(cosmogony_path):
    _pg_execute("""
        CREATE SCHEMA IF NOT EXISTS import;
        DROP TABLE IF EXISTS import.zones;
        CREATE TABLE IF NOT EXISTS import.zones(
            id bigint NOT NULL,
            parent bigint,
            name varchar,
            admin_level int,
            zone_type varchar,
            osm_id varchar,
            wikidata varchar,
            geometry geometry,
            PRIMARY KEY (id)
        )
        WITH (OIDS=FALSE);
        CREATE INDEX ON import.zones USING gist(geometry);
        CREATE INDEX ON import.zones (parent);
        """)

    print("Importing cosmogony to pg...")
    start = time.clock()
    nb_zones = 0

    def print_timer():
        print(f"{nb_zones} zones imported in "
              f"{timedelta(seconds=(time.clock()-start))}")

    with open(cosmogony_path, "rb") as f:
        zones = ijson.items(f, "zones.item")
        with _pg_connect() as conn:
            with conn.cursor() as cur:
                for z in zones:
                    z["geometry"] = rapidjson.dumps(
                        z.pop("geometry"),
                        number_mode=NM_DECIMAL | NM_NATIVE)
                    cur.execute(SINGLE_INSERT, z)
                    nb_zones += 1
                    if nb_zones % 10000 == 0:
                        print_timer()
    print("Import done.")
    print_timer()
def getResultsStats(file_name, dest):
    import ijson.backends.yajl2_cffi as ijson
    log.debug("getResultsStats()")
    file_path = "%s%s" % (defs.DIR_RESULTS, file_name)
    distributions = []
    with open(file_path, 'rb') as results:
        i = "1"
        while True:
            results.seek(0, 0)
            tmp = [d for d in ijson.items(results, "reads-distribution-%s.item" % i)]
            if len(tmp) == 0:
                break
            else:
                distributions.append((i, tmp[0]))
            i += "0"
    dest['distribution'] = distributions
    return dest
def import_google(filename):
    logging.info('Importing from ' + filename)
    # Needs to be rb!
    with open(filename, 'rb') as f:
        data = ijson.items(f, 'locations.item')
        c = 0
        for o in data:
            c += 1
            p = (round(o['longitudeE7'] / 10000000, precision),
                 round(o['latitudeE7'] / 10000000, precision))
            d, t = tst_to_dt(int(o['timestampMs'][:-3]))
            make_history(p, d, t, False)
        f.close()
    logging.info(str(c) + ' items imported from ' + filename)
    logging.info('History size: ' + str(len(history)) + ' points')
    pickle.dump(history, open('history.pickle', 'wb'))
    write_js()
def entries() -> Iterable[Entry]:
    inps = list(inputs())

    base: List[PathIsh] = ['arbtt-dump', '--format=json']

    cmds: List[List[PathIsh]]
    if len(inps) == 0:
        cmds = [base]  # rely on default
    else:
        # otherwise, 'merge' them
        cmds = [base + ['--logfile', f] for f in inps]

    import ijson.backends.yajl2_cffi as ijson  # type: ignore
    from subprocess import Popen, PIPE
    for cmd in cmds:
        with Popen(cmd, stdout=PIPE) as p:
            out = p.stdout
            assert out is not None
            for json in ijson.items(out, 'item'):
                yield Entry(json=json)
def useAttributeAndScatter(f, att, max_nums=10000):
    psis = []
    labels = []
    item_num = 0
    for program in ijson.items(f, 'programs.item'):
        api_call = get_api(get_calls_from_ast(program['ast']['_nodes']))
        if api_call != 'N/A':
            labels.append(api_call)
            if att not in program:
                return
            psis.append(program[att])
        item_num += 1
        if item_num > max_nums:
            break
    psis = np.array(psis)
    name = "RE" if att == "b2" else att
    fitTSEandplot(psis, labels, name)
def Deserializer(stream, **options):
    """
    Deserialize a stream of JSON data using iterative ijson so we may not load
    the whole string into memory.
    """
    if isinstance(stream, (bytes, six.string_types)):
        raise TypeError(
            'Use iloaddata/ijson with streams only. For strings use plain loaddata/json.loads'
        )
    try:
        objects = ijson.items(stream, 'item')
        for obj in PythonDeserializer(objects, **options):
            yield obj
    except GeneratorExit:
        raise
    except Exception as e:
        # Map to deserializer error
        six.reraise(DeserializationError, DeserializationError(e), sys.exc_info()[2])
def get_account_stats(conf, silent=True):
    system_account_names = set(get_system_account_names(conf))
    vests = list()
    total_steem = 0
    account_names = set()

    if not silent and not YAJL2_CFFI_AVAILABLE:
        print("Warning: could not load yajl, falling back to default backend for ijson.")

    with open(conf["snapshot_file"], "rb") as f:
        for acc in ijson.items(f, "accounts.item"):
            if acc["name"] in system_account_names:
                continue
            account_names.add(acc["name"])
            vests.append(satoshis(acc["vesting_shares"]))
            total_steem += satoshis(acc["balance"])
            if not silent:
                n = len(account_names)
                if n % 100000 == 0:
                    print("Accounts read:", n)

    initial_account_stats = {
        "account_names": account_names,
        "total_vests": sum(vests),
        "total_steem": total_steem
    }
    proportions = get_proportions(initial_account_stats, conf)
    max_vests_per_account = proportions["max_vests_per_account"]
    for (i, v) in enumerate(vests):
        vests[i] = min(max_vests_per_account, v)

    return {
        "account_names": account_names,
        "total_vests": sum(vests),
        "total_steem": total_steem
    }
def load_batches_dims(filename, model, batch_size=0):
    batch_index = 0
    batch_dims = []
    item_count = 0
    index = 0
    progress = utils.ProgressText()
    with open(filename, 'r') as jsonfile:
        items = ijson.items(jsonfile, 'item')
        for index, item in enumerate(items):
            if batch_size > 0:
                batch_index = int(index / batch_size)
            # start of batch
            if batch_index >= len(batch_dims):
                batch_dims.append({})
                get_item_dims(item['data'], model, batch_dims[batch_index])
            item_count += 1
            # end of batch
            if batch_size == item_count:
                for group in model.io_names:
                    batch_dims[batch_index][group] = \
                        [item_count] + batch_dims[batch_index][group] + [get_io_len(model, group)]
                item_count = 0
            if ((index + 1) % 100) == 0:
                progress.text('count items: ' + str(index + 1))
        progress.text('count items: ' + str(index + 1))
    sys.stdout.write('\n')
    if item_count != 0:
        for group in model.io_names:
            batch_dims[batch_index][group] = \
                [item_count] + batch_dims[batch_index][group] + [get_io_len(model, group)]
    return batch_dims
def get_data(self, f, **kwargs):
    location = kwargs.get('meetlocatie', None)
    if location and isinstance(location, MeetLocatie):
        location = location.name
    params = kwargs.get('parameter', self.parm)
    if params:
        if isinstance(params, Parameter):
            params = [params.name]
        elif isinstance(params, six.string_types):
            params = [params]
    # find data for location and one of the parameters (always one parameter per json file)
    dfs = {}
    for ts in ijson.items(f, 'results.item'):
        pname = ts['name']
        if not pname in params:
            # only 1 parameter per file
            break
        lcode = ts['location']['organisation_code']
        if not location or lcode == location:
            events = ts['events']
            data = []
            if events:
                for e in events:
                    tmin = e['min']
                    tmax = e['max']
                    tgem = (tmin + tmax) / 2 if (tmin and tmax) else None
                    t = e['timestamp']
                    data.append((datetime.datetime.utcfromtimestamp(t / 1000), tgem))
            if data:
                df = pd.DataFrame.from_records(data, index=['datum'], columns=['datum', pname])
                dfs[lcode] = df
                if location:
                    # requested for a single location
                    break
    return dfs
def split(args):
    f = open(args.input_file[0], 'rb')
    assert(args.part > 0 and args.part < 100)
    start, end = (args.part - 1) * args.step, args.part * args.step
    i = 0
    split_programs = []
    for program in ijson.items(f, 'programs.item'):
        print('Split part {} of size {} #Finished {} programs'.format(args.part, args.step, i), end='\r')
        if i == end:
            break
        if i < start:
            i += 1
            continue
        else:
            split_programs.append(program)
            i += 1
    print('')
    print("Writing to File")
    with open('{}-{:02d}.json'.format(args.input_file[0][:-5], args.part), 'w') as f:
        simplejson.dump({'programs': split_programs}, f, indent=2)
def load(cls, fp, override=None):
    """Load a generator.

    Parameters
    ----------
    fp : `file` or `str`
        Input file or file path.
    override : `dict` or `None`, optional
        Changes to loaded data (default: `None`).

    Returns
    -------
    `markovchain.base.MarkovBase`
        Loaded generator.
    """
    if isinstance(fp, str):
        with open(fp, 'r') as fp2:
            return cls.load(fp2, override)

    x = fp.read(1)
    fp.seek(0)
    if isinstance(x, str):
        data = json.load(fp)
    elif ijson is not None:
        try:
            data = next(ijson.items(fp, ''))
        except StopIteration:
            data = {}
    else:
        data = json.loads(fp.read().decode('utf-8'))

    if override is not None:
        extend(data, override)
    return cls(**data)
import cv2
import decimal
import json
import ijson.backends.yajl2_cffi as ijson
from sklearn_theano.feature_extraction import OverfeatTransformer

tr = OverfeatTransformer(output_layers=[8])


class DecimalEncoder(json.JSONEncoder):
    def default(self, o):
        if isinstance(o, decimal.Decimal):
            return float(o)
        return super(DecimalEncoder, self).default(o)


with open('../workspace/ds.json') as inh:
    with open('../workspace/ds_deep.json', 'w') as outh:
        ds = ijson.items(inh, 'item')
        outh.write('[')
        for i, item in enumerate(ds):
            print 'running', i + 1
            if i > 0:
                outh.write(',')
            img = cv2.imread('set1/' + item['file'])
            img = cv2.resize(img, (231, 231))
            item['deep'] = tr.transform(img)[0].tolist()
            json.dump(item, outh, cls=DecimalEncoder)
        outh.write(']')
def calendars(self): calendars = requests.get( "http://api-tokyochallenge.odpt.org/api/v4/odpt:Calendar.json", params={"acl:consumerKey": self.apikey}, timeout=30, stream=True) calendars.raise_for_status() calendars = ijson.items(calendars.raw, "item") # Get info on specific calendars calendar_dates = {} for calendar in calendars: calendar_id = calendar["owl:sameAs"].split(":")[1] if "odpt:day" in calendar and calendar["odpt:day"] != []: dates = [ datetime.strptime(i, "%Y-%m-%d").date() for i in calendar["odpt:day"] ] dates = [ i for i in dates if self.startdate <= i <= self.enddate ] for date in dates: if date not in calendar_dates: calendar_dates[date] = set() calendar_dates[date].add(calendar_id) # Get info about holidays if self.startdate.year == self.enddate.year: holidays = _holidays(self.startdate.year) else: holidays = _holidays(self.startdate.year) | _holidays( self.enddate.year) # Open file buffer = open("gtfs/calendar_dates.txt", mode="w", encoding="utf8", newline="") writer = csv.DictWriter(buffer, GTFS_HEADERS["calendar_dates.txt"], extrasaction="ignore") writer.writeheader() # Dump data for route, services in self.used_calendars.items(): if self.verbose: print("\033[1A\033[KParsing calendars:", route) working_date = copy(self.startdate) while working_date <= self.enddate: active_services = [] if calendar_dates.get(working_date, set()).intersection(services): active_services = [ i for i in calendar_dates[working_date].intersection( services) ] elif working_date in holidays and "Holiday" in services: active_services = ["Holiday"] elif working_date.isoweekday( ) == 7 and working_date not in holidays: if "Sunday" in services: active_services = ["Sunday"] elif "Holiday" in services: active_services = ["Sunday"] elif working_date.isoweekday( ) == 6 and working_date not in holidays and "Saturday" in services: active_services = ["Saturday"] elif working_date.isoweekday( ) == 5 and working_date not in holidays and "Friday" in services: active_services = ["Friday"] elif working_date.isoweekday( ) == 4 and working_date not in holidays and "Thursday" in services: active_services = ["Thursday"] elif working_date.isoweekday( ) == 3 and working_date not in holidays and "Wednesday" in services: active_services = ["Wednesday"] elif working_date.isoweekday( ) == 2 and working_date not in holidays and "Tuesday" in services: active_services = ["Tuesday"] elif working_date.isoweekday( ) == 1 and working_date not in holidays and "Monday" in services: active_services = ["Monday"] elif (working_date.isoweekday() >= 6 or working_date in holidays) and "SaturdayHoliday" in services: active_services = ["SaturdayHoliday"] elif working_date.isoweekday( ) <= 5 and working_date not in holidays and "Weekday" in services: active_services = ["Weekday"] if active_services: for service in active_services: writer.writerow({ "service_id": route + "/" + service, "date": working_date.strftime("%Y%m%d"), "exception_type": 1 }) working_date += timedelta(days=1) calendars.close() buffer.close()
def trips(self): """Parse trips & stop_times""" # Some variables available_calendars = self._legal_calendars() # Get all trips trips = requests.get( "http://api-tokyochallenge.odpt.org/api/v4/odpt:BusTimetable.json", params={"acl:consumerKey": self.apikey}, timeout=90, stream=True) trips.raise_for_status() trips = ijson.items(trips.raw, "item") # Open GTFS trips buffer_trips = open("gtfs/trips.txt", mode="w", encoding="utf8", newline="") writer_trips = csv.DictWriter(buffer_trips, GTFS_HEADERS["trips.txt"], extrasaction="ignore") writer_trips.writeheader() buffer_times = open("gtfs/stop_times.txt", mode="w", encoding="utf8", newline="") writer_times = csv.DictWriter(buffer_times, GTFS_HEADERS["stop_times.txt"], extrasaction="ignore") writer_times.writeheader() # Iteratr over trips for trip in trips: operator = trip["odpt:operator"].split(":")[1] pattern_id = trip["odpt:busroutePattern"].split(":")[1] # Get route_id if pattern_id in self.pattern_map: route_id = self.pattern_map[pattern_id] else: if operator == "JRBusKanto": route_id = operator + "." + \ pattern_id.split(".")[1] + "." + \ pattern_id.split(".")[2] else: route_id = operator + "." + pattern_id.split(".")[1] trip_id = trip["owl:sameAs"].split(":")[1] calendar = trip["odpt:calendar"].split(":")[1] service_id = route_id + "/" + calendar if self.verbose: print("\033[1A\033[KParsing times:", trip_id) # Ignore non-parsed routes and non_active calendars if operator not in self.operators: continue if route_id not in self.parsed_routes: warn( "\033[1mno route for pattern {}\033[0m".format(pattern_id)) continue if calendar not in available_calendars: continue # Add calendar if route_id not in self.used_calendars: self.used_calendars[route_id] = set() self.used_calendars[route_id].add(calendar) # Ignore one-stop trips if len(trip["odpt:busTimetableObject"]) < 2: continue # Bus headsign headsigns = [ i["odpt:destinationSign"] for i in trip["odpt:busTimetableObject"] if i.get("odpt:destinationSign") != None ] if headsigns: trip_headsign = headsigns[0] else: last_stop_id = trip["odpt:busTimetableObject"][-1][ "odpt:busstopPole"].split(":")[1] if last_stop_id in self.stop_names: trip_headsign = self.stop_names[last_stop_id] else: trip_headsign = re.sub(r"(?!^)([A-Z][a-z]+)", r" \1", last_stop_id.split(".")[1]) warn("\033[1mno name for stop {}\033[0m".format( last_stop_id)) self.stop_names[last_stop_id] = trip_headsign trip_headsign_en = self.english_strings.get(trip_headsign, "") # Non-step bus (wheelchair accesibility) if any([ i.get("odpt:isNonStepBus") == False for i in trip["odpt:busTimetableObject"] ]): wheelchair = "2" elif any([ i.get("odpt:isNonStepBus") == True for i in trip["odpt:busTimetableObject"] ]): wheelchair = "1" else: wheelchair = "0" # Do we start after midnight? 
prev_departure = _Time(0) if trip["odpt:busTimetableObject"][0].get("odpt:isMidnight", False): first_time = trip["odpt:busTimetableObject"][0].get("odpt:departureTime") or \ trip["odpt:busTimetableObject"][0].get("odpt:arrivalTime") # If that's a night bus, and the trip starts before 6 AM # Add 24h to departure, as the trip starts "after-midnight" if int(first_time.split(":")[0]) < 6: prev_departure = _Time(86400) # Filter stops to include only active stops trip["odpt:busTimetableObject"] = sorted( [ i for i in trip["odpt:busTimetableObject"] if i["odpt:busstopPole"].split(":")[1] in self.valid_stops ], key=lambda i: i["odpt:index"]) # Ignore trips with less then 1 stop if len(trip["odpt:busTimetableObject"]) <= 1: #warn("\033[1mno correct stops in trip {}\033[0m".format(trip_id)) continue # Write to trips.txt writer_trips.writerow({ "route_id": route_id, "trip_id": trip_id, "service_id": service_id, "trip_headsign": trip_headsign, "trip_pattern_id": pattern_id, "wheelchair_accessible": wheelchair }) # Times for idx, stop_time in enumerate(trip["odpt:busTimetableObject"]): stop_id = stop_time["odpt:busstopPole"].split(":")[1] # Get time arrival = stop_time.get("odpt:arrivalTime") or stop_time.get( "odpt:departureTime") departure = stop_time.get( "odpt:departureTime") or stop_time.get("odpt:arrivalTime") if arrival: arrival = _Time.from_str(arrival) if departure: departure = _Time.from_str(departure) # Be sure arrival and departure exist if not (arrival and departure): continue # Fix for after-midnight trips. GTFS requires "24:23", while JSON data contains "00:23" if arrival < prev_departure: arrival += 86400 if departure < arrival: departure += 86400 prev_departure = copy(departure) # Can get on/off? # None → no info → fallbacks to True, but bool(None) == False, so we have to explicitly comapre the value to False pickup = "1" if stop_time.get( "odpt:CanGetOn") == False else "0" dropoff = "1" if stop_time.get( "odpt:CanGetOff") == False else "0" writer_times.writerow({ "trip_id": trip_id, "stop_sequence": idx, "stop_id": stop_id, "arrival_time": str(arrival), "departure_time": str(departure), "pickup_type": pickup, "drop_off_type": dropoff }) trips.close() buffer_trips.close() buffer_times.close()
def routes(self):
    patterns = requests.get(
        "http://api-tokyochallenge.odpt.org/api/v4/odpt:BusroutePattern.json",
        params={"acl:consumerKey": self.apikey},
        timeout=30,
        stream=True)
    patterns.raise_for_status()
    patterns = ijson.items(patterns.raw, "item")

    buffer = open("gtfs/routes.txt", mode="w", encoding="utf8", newline="")
    writer = csv.DictWriter(buffer, GTFS_HEADERS["routes.txt"], extrasaction="ignore")
    writer.writeheader()

    self.parsed_routes = set()

    for pattern in patterns:
        pattern_id = pattern["owl:sameAs"].split(":")[1]

        if type(pattern["odpt:operator"]) is list:
            operator = pattern["odpt:operator"][0].split(":")[1]
        else:
            operator = pattern["odpt:operator"].split(":")[1]

        if operator not in self.operators:
            continue

        if self.verbose:
            print("\033[1A\033[KParsing route patterns:", pattern_id)

        # Get route_id
        if "odpt:busroute" in pattern:
            route_id = pattern["odpt:busroute"].split(":")[1]
        else:
            if operator == "JRBusKanto":
                route_id = operator + "." + \
                           pattern_id.split(".")[1] + "." + \
                           pattern_id.split(".")[2]
            else:
                route_id = operator + "." + pattern_id.split(".")[1]

        # Map pattern → route_id, as BusTimetable references patterns instead of routes
        self.pattern_map[pattern_id] = route_id

        # Get color from bus_colors.csv
        route_code = pattern["dc:title"].split(" ")[0]  # Toei appends direction to BusroutePattern's dc:title
        route_color, route_text = self.operators[operator]

        # Output to GTFS
        if route_id not in self.parsed_routes:
            self.parsed_routes.add(route_id)
            writer.writerow({
                "agency_id": operator,
                "route_id": route_id,
                "route_short_name": route_code,
                "route_type": 3,
                "route_color": route_color,
                "route_text_color": route_text
            })

    patterns.close()
    buffer.close()
def load(secure,hostname,url,schema,table,postdata,condition,verbose,rowcount): show("begin "+hostname+" "+url+" "+schema+" "+table+" "+(postdata or "")+" "+(condition or "")) if secure: address = "https://"+hostname+url else: address = "http://"+hostname+url show("load from "+address) reqheaders = {'Content-Type': 'application/json'} # api credentials from env vars if os.getenv("API_USERNAME"): show("using authentication") apiuser = os.getenv("API_USERNAME") apipass = os.getenv("API_PASSWORD") reqheaders['Authorization'] = 'Basic %s' % base64.b64encode(apiuser+":"+apipass) # automatic POST with (post)data print("value used for , -r, --rowcount=", rowcount) request = urllib2.Request(address, data=postdata, headers=reqheaders) try: response = urllib2.urlopen(request) except httplib.IncompleteRead as e: show('IncompleteRead exception.') show('Received: %d'%(e.partial)) sys.exit(2) except urllib2.HTTPError as e: show('The server couldn\'t fulfill the request.') show('Error code: %d'%(e.code)) sys.exit(2) except urllib2.URLError as e: show('We failed to reach a server.') show('Reason: %s'%(e.reason)) sys.exit(2) else: # everything is fine show("api call OK") # remove data conditionally, otherwise empty # merge operation could be considered here... if condition: show("remove from %s.%s with condition '%s'"%(schema,table,condition)) dboperator.execute("DELETE FROM %s.%s WHERE %s"%(schema,table,condition)) else: show("empty %s.%s"%(schema,table)) dboperator.empty(schema,table) show("insert data") cnt=0 manycount = 0 rows = [] for row in ijson.items(response,'item'): cnt+=1 manycount+=1 # show some sign of being alive if cnt%100 == 0: sys.stdout.write('.') sys.stdout.flush() if cnt%1000 == 0: show("-- %d" % (cnt)) if verbose: show("%d -- %s"%(cnt,row)) # find out which columns to use on insert dboperator.resetcolumns(row) # flatten arrays/lists for col in row: if type(row[col]) is list: row[col] = ''.join(map(str,json.dumps(row[col]))) rows.append(row) if cnt == 1: dboperator.insert(address,schema,table,row) manycount = 0 rows = [] if cnt > 1: if manycount == rowcount: insert(address,schema,table,rows) manycount = 0 rows = [] if len(rows) <= manycount and len(rows) > 0: insert(address,schema,table,rows) rows = [] manycount = 0 show("wrote %d"%(cnt)) show("ready")
def load_json():
    objects = ijson.items(get_stdin(), "")
    ds.append(next(objects))
def ijsonLoad(filename, tagsGidRange=None, connsGidRange=None, loadTags=True, loadConns=True, tagFormat=None, connFormat=None, saveTags=None, saveConns=None): """ Function for/to <short description of `netpyne.sim.load.ijsonLoad`> Parameters ---------- filename : <type> <Short description of filename> **Default:** *required* tagsGidRange : <``None``?> <Short description of tagsGidRange> **Default:** ``None`` **Options:** ``<option>`` <description of option> connsGidRange : <``None``?> <Short description of connsGidRange> **Default:** ``None`` **Options:** ``<option>`` <description of option> loadTags : bool <Short description of loadTags> **Default:** ``True`` **Options:** ``<option>`` <description of option> loadConns : bool <Short description of loadConns> **Default:** ``True`` **Options:** ``<option>`` <description of option> tagFormat : <``None``?> <Short description of tagFormat> **Default:** ``None`` **Options:** ``<option>`` <description of option> connFormat : <``None``?> <Short description of connFormat> **Default:** ``None`` **Options:** ``<option>`` <description of option> saveTags : <``None``?> <Short description of saveTags> **Default:** ``None`` **Options:** ``<option>`` <description of option> saveConns : <``None``?> <Short description of saveConns> **Default:** ``None`` **Options:** ``<option>`` <description of option> """ # requires: 1) pip install ijson, 2) brew install yajl from .. import sim import ijson.backends.yajl2_cffi as ijson import json from time import time tags, conns = {}, {} if connFormat: conns['format'] = connFormat if tagFormat: tags['format'] = tagFormat with open(filename, 'rb') as fd: start = time() print('Loading data ...') objs = ijson.items(fd, 'net.cells.item') if loadTags and loadConns: print('Storing tags and conns ...') for cell in objs: if tagsGidRange == None or cell['gid'] in tagsGidRange: print('Cell gid: %d' % (cell['gid'])) if tagFormat: tags[int(cell['gid'])] = [ cell['tags'][param] for param in tagFormat ] else: tags[int(cell['gid'])] = cell['tags'] if connsGidRange == None or cell['gid'] in connsGidRange: if connFormat: conns[int(cell['gid'])] = [[ conn[param] for param in connFormat ] for conn in cell['conns']] else: conns[int(cell['gid'])] = cell['conns'] elif loadTags: print('Storing tags ...') if tagFormat: tags.update({ int(cell['gid']): [cell['tags'][param] for param in tagFormat] for cell in objs if tagsGidRange == None or cell['gid'] in tagsGidRange }) else: tags.update({ int(cell['gid']): cell['tags'] for cell in objs if tagsGidRange == None or cell['gid'] in tagsGidRange }) elif loadConns: print('Storing conns...') if connFormat: conns.update({ int(cell['gid']): [[conn[param] for param in connFormat] for conn in cell['conns']] for cell in objs if connsGidRange == None or cell['gid'] in connsGidRange }) else: conns.update({ int(cell['gid']): cell['conns'] for cell in objs if connsGidRange == None or cell['gid'] in connsGidRange }) print('time ellapsed (s): ', time() - start) tags = utils.decimalToFloat(tags) conns = utils.decimalToFloat(conns) if saveTags and tags: outFilename = saveTags if isinstance( saveTags, basestring) else 'filename'[:-4] + '_tags.json' print('Saving tags to %s ...' % (outFilename)) sim.saveJSON(outFilename, {'tags': tags}) if saveConns and conns: outFilename = saveConns if isinstance( saveConns, basestring) else 'filename'[:-4] + '_conns.json' print('Saving conns to %s ...' % (outFilename)) sim.saveJSON(outFilename, {'conns': conns}) return tags, conns
def update_accounts(account_stats, conf, keydb, silent=True): crea_max_authority_membership = conf.get("crea_max_authority_membership", CREA_MAX_AUTHORITY_MEMBERSHIP) crea_address_prefix = conf.get("crea_address_prefix", CREA_ADDRESS_PREFIX) system_account_names = set(get_system_account_names(conf)) account_names = account_stats["account_names"] num_accounts = len(account_names) porter_wif = keydb.get_privkey("porter") tnman = conf["accounts"]["manager"]["name"] accounts_updated = 0 with open(conf["snapshot_file"], "rb") as f: for a in ijson.items(f, "accounts.item"): if a["name"] in system_account_names: continue cur_owner_auth = a["owner"] new_owner_auth = cur_owner_auth.copy() cur_active_auth = a["active"] new_active_auth = cur_active_auth.copy() cur_posting_auth = a["posting"] new_posting_auth = cur_posting_auth.copy() # filter to only include existing accounts for aw in cur_owner_auth["account_auths"][:( crea_max_authority_membership - 1)]: if (aw[0] not in account_names) or (aw[0] in system_account_names): new_owner_auth["account_auths"].remove(aw) for aw in cur_active_auth["account_auths"][:( crea_max_authority_membership - 1)]: if (aw[0] not in account_names) or (aw[0] in system_account_names): new_active_auth["account_auths"].remove(aw) for aw in cur_posting_auth["account_auths"][:( crea_max_authority_membership - 1)]: if (aw[0] not in account_names) or (aw[0] in system_account_names): new_posting_auth["account_auths"].remove(aw) # add tnman to account_auths new_owner_auth["account_auths"].append( [tnman, cur_owner_auth["weight_threshold"]]) new_active_auth["account_auths"].append( [tnman, cur_active_auth["weight_threshold"]]) new_posting_auth["account_auths"].append( [tnman, cur_posting_auth["weight_threshold"]]) # substitute prefix for key_auths new_owner_auth["key_auths"] = [[ crea_address_prefix + k[3:], w ] for k, w in new_owner_auth["key_auths"] [:crea_max_authority_membership]] new_active_auth["key_auths"] = [[ crea_address_prefix + k[3:], w ] for k, w in new_active_auth["key_auths"] [:crea_max_authority_membership]] new_posting_auth["key_auths"] = [[ crea_address_prefix + k[3:], w ] for k, w in new_posting_auth["key_auths"] [:crea_max_authority_membership]] ops = [{ "type": "account_update_operation", "value": { "account": a["name"], "owner": new_owner_auth, "active": new_active_auth, "posting": new_posting_auth, "memo_key": "TST" + a["memo_key"][3:], "json_metadata": a["json_metadata"], } }] accounts_updated += 1 if not silent: if accounts_updated % 100000 == 0: print("Accounts updated:", accounts_updated) print( "\t", '%.2f%% complete' % (accounts_updated / num_accounts * 100.0)) yield {"operations": ops, "wif_sigs": [porter_wif]} if not silent: print("Accounts updated:", accounts_updated) print("\t100.00%% complete")
def create_accounts(account_stats, conf, keydb, silent=True): crea_address_prefix = conf.get("crea_address_prefix", CREA_ADDRESS_PREFIX) system_account_names = set(get_system_account_names(conf)) proportions = get_proportions(account_stats, conf, silent) min_vesting_per_account = proportions["min_vesting_per_account"] vest_conversion_factor = proportions["vest_conversion_factor"] crea_conversion_factor = proportions["crea_conversion_factor"] account_names = account_stats["account_names"] num_accounts = len(account_names) porter = conf["accounts"]["porter"]["name"] porter_wif = keydb.get_privkey("porter") create_auth = { "account_auths": [["porter", 1]], "key_auths": [], "weight_threshold": 1 } accounts_created = 0 with open(conf["snapshot_file"], "rb") as f: for a in ijson.items(f, "accounts.item"): if a["name"] in system_account_names: continue vesting_amount = (satoshis(a["vesting_shares"]) * vest_conversion_factor) // DENOM transfer_amount = (satoshis(a["balance"]) * crea_conversion_factor) // DENOM name = a["name"] vesting_amount = max(vesting_amount, min_vesting_per_account) ops = [{ "type": "account_create_operation", "value": { "fee": { "amount": "0", "precision": 3, "nai": "@@000000021" }, "creator": porter, "new_account_name": name, "owner": create_auth, "active": create_auth, "posting": create_auth, "memo_key": crea_address_prefix + a["memo_key"][3:], "json_metadata": "", } }, { "type": "transfer_to_vesting_operation", "value": { "from": porter, "to": name, "amount": amount(vesting_amount), } }] if transfer_amount > 0: ops.append({ "type": "transfer_operation", "value": { "from": porter, "to": name, "amount": amount(transfer_amount), "memo": "Ported balance", } }) accounts_created += 1 if not silent: if accounts_created % 100000 == 0: print("Accounts created:", accounts_created) print( "\t", '%.2f%% complete' % (accounts_created / num_accounts * 100.0)) yield {"operations": ops, "wif_sigs": [porter_wif]} if not silent: print("Accounts created:", accounts_created) print("\t100.00%% complete")
    except IOError:
        print('ERROR: could not open file ' + fingerprintsfile_old_name)
        exit()
else:
    print('INFO: no old fingerprints file name passed, starting from scratch')
    fingerprints_old = dict()

fingerprints_new = dict()
idSet = set()  # to check uniqueness. Faster than using a list or dict.
duplicateIds = list()

# CUSTOM IMPLEMENTATION FROM HERE
jsonObjects = ijson.items(fullfile_new, 'messages.item.markets.item')

deltafile.write('{"markets":[\n')

objCount = 0
deltacount = 0
marketcount = 0

# Half-streaming way: parse the complete JSON of a market and iterate over products inside that.
# (full streaming would be pretty complex concerning how to
markets = (o for o in jsonObjects)
for market in markets:
    prodcount = 0
    if marketcount > 0:
        deltafile.write('\n,')
    marketcount += 1
    marketId = str(market['wwIdent'])
import ijson.backends.yajl2_cffi as ijson
import os
import re
import time

import pandas as pd

start = time.time()

PATH = "../data"
records = list()

for filename in sorted(os.listdir(PATH)):
    data = None
    try:
        with open(os.path.join(PATH, filename)) as json_file:
            data = json_file.read()
            edges = next(ijson.items(data, "data.feedback.display_comments.edges"))
            for edge in edges:
                records.append({
                    'id': edge['node']['id'],
                    'author_id': edge['node']['author']['id'],
                    'author_name': edge['node']['author']['name'],
                    'author_gender': edge['node']['author'].get('gender'),
                    'timestamp': edge['node']['created_time'],
                    'reactions': edge['node']['feedback']['reactors']['count'],
                    'url': edge['node']['url'],
                    'comment': edge['node']['body']['text'] if edge['node']['body'] else None,
                })
    except Exception as e:
        print(f"Skipping {filename}, {e}")
        break
        continue
else:
    # everything is fine
    show("api call OK")

    # remove data conditionally, otherwise empty
    # merge operation could be considered here...
    if condition:
        show("remove from %s.%s with condition '%s'"%(schema,table,condition))
        dboperator.execute("DELETE FROM %s.%s WHERE %s"%(schema,table,condition))
    else:
        show("empty %s.%s"%(schema,table))
        dboperator.empty(schema,table)

    show("insert data")
    cnt=0
    for row in ijson.items(response,'item'):
        cnt+=1
        # show some sign of being alive
        if cnt%100 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()
        if cnt%1000 == 0:
            show("-- %d" % (cnt))
        if verbose:
            show("%d -- %s"%(cnt,row))
        # find out which columns to use on insert
        dboperator.resetcolumns(row)
        # flatten arrays/lists
        for col in row:
            if type(row[col]) is list:
def stops(self): """Parse stops""" # Get list of stops stops = requests.get( "http://api-tokyochallenge.odpt.org/api/v4/odpt:BusstopPole.json", params={"acl:consumerKey": self.apikey}, timeout=30, stream=True) stops.raise_for_status() stops = ijson.items(stops.raw, "item") # Open files buffer = open("gtfs/stops.txt", mode="w", encoding="utf8", newline="") writer = csv.DictWriter(buffer, GTFS_HEADERS["stops.txt"], extrasaction="ignore") writer.writeheader() broken_stops_buff = open("broken_stops.csv", mode="w", encoding="utf8", newline="") broken_stops_wrtr = csv.writer(broken_stops_buff) broken_stops_wrtr.writerow( ["stop_id", "stop_name", "stop_name_en", "stop_code"]) # Iterate over stops for stop in stops: stop_id = stop["owl:sameAs"].split(":")[1] stop_code = stop.get("odpt:busstopPoleNumber", "") stop_name = stop["dc:title"] stop_name_en = self.carmel_to_title(stop_id.split(".")[1]) if self.verbose: print("\033[1A\033[KParsing stops:", stop_id) self.stop_names[stop_id] = stop_name # Stop name translation if stop_name_en: self.english_strings[stop_name] = stop_name_en # Stop operators if type(stop["odpt:operator"]) is list: operators = [i.split(":")[1] for i in stop["odpt:operator"]] else: operators = [stop["odpt:operator"].split(":")[1]] # Ignore stops that belong to ignored agencies if not set(operators).intersection(self.operators): continue # Correct stop position if "geo:lat" in stop and "geo:long" in stop: stop_lat = stop["geo:lat"] stop_lon = stop["geo:long"] # Output to GTFS or to incorrect stops if stop_lat and stop_lon: self.valid_stops.add(stop_id) writer.writerow({ "stop_id": stop_id, "stop_code": stop_code, "zone_id": stop_id, "stop_name": stop_name, "stop_lat": stop_lat, "stop_lon": stop_lon, }) else: broken_stops_wrtr.writerow( [stop_id, stop_name, stop_name_en, stop_code]) stops.close() buffer.close()
totalComments = 5730430

subreddits = json.load(open('subreddits.json'))

# Keep 10 biggest subreddits
subreddits.sort(key=lambda s: s['subscribers'], reverse=True)
subreddits = subreddits[:20]

totalParsedComments = 0
parsingStartTime = time.time()

# Parsing
for subreddit in subreddits:
    subreddit['usersDict'] = {}
    print('Loading', subreddit['name'])
    with open('data/' + subreddit['name'] + '.json', 'rb') as data:
        for comment in ijson.items(data, 'item'):
            user = getUser(subreddit, comment['author'])
            user['commentsSentiments'].append(comment['sentiment']['compound'])
            user['sentimentAverage'] += comment['sentiment']['compound']
            user['scoreAverage'] += comment['score']
            user['lengthAverage'] += comment['textlen']
            user['commentCount'] += 1
            if comment['sentiment']['compound'] <= -0.5:
                user['negativeCommentCount'] += 1
            elif (comment['sentiment']['compound'] > -0.5 and comment['sentiment']['compound'] < 0.5):
                user['neutralCommentCount'] += 1
            elif comment['sentiment']['compound'] >= 0.5:
                user['positiveCommentCount'] += 1
            totalParsedComments += 1
def get_parameters(self, f):
    params = {}
    for obs in ijson.items(f, 'results.item.observation_type'):
        params[obs['code']] = {
            'description': obs['parameter_short_display_name'],
            'unit': obs['referenced_unit_short_display_name']
        }
        break  # one single parameter per file
    return params
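All of the snippets above share the same core pattern: pass a binary file-like object and a prefix to ijson.items and consume the results lazily instead of calling json.load on the whole document. Below is a minimal, self-contained sketch of that pattern; the file name and the fields printed are hypothetical, not taken from any of the examples.

# Minimal sketch (hypothetical file name): stream records from a large
# top-level JSON array one at a time instead of loading it all with json.load.
import ijson

def iter_records(path):
    with open(path, 'rb') as f:  # ijson works on a binary file object
        # 'item' selects each element of the top-level JSON array
        for record in ijson.items(f, 'item'):
            yield record

if __name__ == '__main__':
    for record in iter_records('data.json'):  # hypothetical input file
        print(record)

Because the records are yielded lazily, memory use stays flat regardless of the array's size, which is the main reason the snippets above reach for ijson over the standard json module.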