def load(self, n=1, hold=False):
    self.entities = []
    blobs = list(
        self.storage_client.list_blobs(self.bucket_name, prefix=self.PREFIX))
    if len(blobs) >= n:
        # Walk the n most recent blobs, newest first.
        rng = range(-1, 0 - int(n) - 1, -1)
        for i in rng:
            b = self.bucket.get_blob(blobs[i].name)
            if b.temporary_hold:
                now = utcnow()
                while b.temporary_hold:
                    elapse = utcnow() - now
                    if elapse.seconds >= 10:
                        # Force-release a hold that has lasted more than 10s.
                        b.temporary_hold = False
                        b.patch()
                    b = self.bucket.get_blob(blobs[i].name)
            pickle_load = b.download_as_bytes()
            e = pickle.loads(pickle_load)
            e.house_keeping()
            a = Accumulator.A(e, b)
            self.entities.append(a)
    else:
        a = self.create_and_store()
        # list.append() returns None, so do not reassign self.entities here.
        self.entities.append(a)
    last_date = self.entities[0].entity.dt
    now = utcnow()
    if last_date.day != now.day:
        a = self.create_and_store()
        self.entities.append(a)
def process_source(source, i):
    # build query
    start = utcnow()
    query = dict(provider={'$in': providers},
                 year_month={'$in': source.year_month},
                 _id={'$ne': source._id},
                 )
    if source.children_airlines[0] != '*':
        query['children_airlines'] = {'$in': source.children_airlines + ['*']}
    query_od = dict((k, {'$in': source[k] + ['*']})
                    for k in ('origin_city_airports', 'destination_city_airports')
                    if source[k][0] != '*')
    if query_od:
        query_od_return = dict((k, {'$in': source[r] + ['*']})
                               for k, r in (('origin_city_airports', 'destination_city_airports'),
                                            ('destination_city_airports', 'origin_city_airports'))
                               if source[r][0] != '*')
        if not source.both_ways:
            query_od_return['both_ways'] = True
        query['$or'] = [query_od, query_od_return]
    update = {'$addToSet': dict(overlap=source._id)}
    result = External_Segment_Tmp.update(query, update, multi=True)
    end = utcnow()
    log.info('Update overlaps %d (%ss) - %r', i, end - start, result)
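# Illustration of the query built above, for a hypothetical source with
# origin_city_airports=['JFK'], destination_city_airports=['*'] and
# both_ways=False (these field values are assumptions, not real data):
#
#   query_od        == {'origin_city_airports': {'$in': ['JFK', '*']}}
#   query_od_return == {'destination_city_airports': {'$in': ['JFK', '*']},
#                       'both_ways': True}
#   query['$or']    == [query_od, query_od_return]
#
# i.e. a candidate segment is marked as overlapping if it departs from JFK,
# or if it is a both-ways record that arrives at JFK.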
def send_messages_to_table(self):
    # One (storage, tag) pair per message category; iterating the list keeps
    # the original insertion order: commodity, shipyard, outfitting,
    # blackmarket, journal.
    storages = [
        (self.commodity_storage, "c"),
        (self.shipyard_storage, "s"),
        (self.outfitting_storage, "o"),
        (self.blackmarket_storage, "b"),
        (self.journal_storage, "j"),
    ]
    try:
        with self.info_table.batch_writer() as batch:
            for storage, tag in storages:
                for entry in storage:
                    self.insert_into_table(batch, entry, tag,
                                           storage[entry][0], storage[entry][1])
    except Exception as ex:
        logger.exception('Exception encountered in sending, sending to bad data for retry')
        with self.error_table.batch_writer() as batch:
            for storage, _ in storages:
                for entry in storage:
                    batch.put_item(Item={
                        'timestamp': utils.date_to_epoch_micro(utils.utcnow()),
                        'badData': entry,
                        'errorCause': repr(ex),
                        'source': 'DynamoRawSend'
                    })
def insert_post_2():
    iname = check_cookie()
    if iname is False:
        return 'AUTHENTICATION FAILED. PLEASE LOGIN FIRST'
    title = request.forms.get('title')
    body = request.forms.get('description')
    pid = cdb.insert_post(iname, title, body, utils.utcnow())
    redirect(HOME + '_select_post/' + str(pid))
def execute(self, now=None):
    now = now or utcnow()
    ts = now.replace(
        minute=(now.minute // 10) * 10, second=0, microsecond=0
    )

    self.log_node.check_data(now)
    all_events = self.log_node.parsed_data
    if not all_events:
        return

    interesting_events = self._get_interesting_events(all_events)
    formatted_events = set(self._get_formatted_events(interesting_events))

    with NamedTemporaryFile() as f:
        # Close the gzip member before flushing so the trailer is written.
        with GzipFile(fileobj=f) as gz_f:
            writer = csv.writer(gz_f)
            writer.writerow(OUTPUT_FIELDNAMES)
            writer.writerows(formatted_events)
        f.flush()
        remote_path = self.api.send_file(
            self.data_type,
            f.name,
            ts,
            suffix='{:04}'.format(now.minute * 60 + now.second)
        )
        if remote_path is not None:
            data = {'path': remote_path, 'log_type': self.data_type}
            self.api.send_signal('logs', data=data)

    self.log_node.parsed_data = []
def add_word():
    form = AddWordForm()
    if form.validate_on_submit():
        entity = EntityStream.query.filter_by(caption=form.name.data).first()
        if entity is None:
            entity = EntityStream()
            entity.eid = form.name.data + '-' + id_generator()
            entity.creator = current_user.username
            entity.creation_time = utcnow()
            entity.caption = form.name.data
            entity.alias = form.alias.data
            entity.description = form.description.data
            for s in prefix_suggestion(entity.caption):
                searchkey = SearchKey.query.filter_by(word=s).first()
                if searchkey is None:
                    searchkey = SearchKey(s)
                else:
                    print "found searchkey", searchkey.word, searchkey.id
                entity.suggestion.append(searchkey)
            db.session.add(entity)
            # db.session.commit()
            flash('The new word has been created.')
        # else:
        #     LOG("add_word(): entity found in db")
        return redirect(url_for('.go_entity', name=entity.eid))
    return render_template('add_word.html', form=form)
def cert_info(hostname, verbose=False):
    # http://stackoverflow.com/questions/30862099/how-can-i-get-certificate-issuer-information-in-python
    ret = {'hostname': hostname, 'results': {}}
    if not hostname:
        ret['results'] = "HOVERBOARDS DON'T WORK ON WATER."
        return ret
    timeout = 1  # seconds
    try:
        LOG.info("fetching certificate info for %r", hostname)
        ctx = ssl.create_default_context()
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE
        s = ctx.wrap_socket(socket.socket(), server_hostname=hostname)
        s.settimeout(timeout)
        s.connect((hostname, 443))
        cert = s.getpeercert()
        LOG.debug("got: %r", cert)
        if not cert:
            ret['results'] = 'no results'
            return ret

        now = utils.utcnow()
        subject = dict(x[0] for x in cert['subject'])
        issuer = dict(x[0] for x in cert['issuer'])
        starts = parse(cert['notBefore'])
        ends = parse(cert['notAfter'])
        struct = {
            'issued_to': subject['commonName'],
            'issued_by': issuer['commonName'],
            'starts': starts,
            'starts_offset': (now - starts).days,
            'ends': ends,
            'ends_offset': (ends - now).days,
        }
        if verbose:
            struct['raw'] = cert
        ret['results'] = struct
        return ret
    except socket.timeout:
        LOG.error("failed to fetch certificate, connection timed out after %s seconds", timeout)
        ret['results'] = 'timed out'
    except socket.error:
        LOG.error("failed to fetch certificate, connection was refused. possibly no SSL configured")
        ret['results'] = 'refused'
    except ssl.SSLError as err:
        LOG.error("failed to fetch certificate for %r", hostname)
        ret['results'] = err.reason
    except:
        LOG.exception("unhandled exception attempting to fetch certificate for hostname %r", hostname)
        raise
    return ret
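# Minimal usage sketch for cert_info(); illustrative only, 'example.com' is a
# placeholder hostname and the LOG/utils objects are assumed to be configured
# elsewhere in this module:
#
#   info = cert_info('example.com')
#   if isinstance(info['results'], dict):
#       # 'ends_offset' is the number of whole days until the certificate
#       # expires; a negative value means it has already expired.
#       print(info['results']['issued_by'], info['results']['ends_offset'])
#   else:
#       print('lookup failed:', info['results'])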
def expired(self):
    """
    If a position's last update happened more than a certain amount of time
    ago, that position is considered no longer valid.
    """
    # NOTE: expiration is currently disabled; this early return short-circuits
    # the age check below.
    return False
    expiration_date = self.last_updated + timedelta(minutes=TIMEDELTA)
    return utcnow() > expiration_date
def get_object(self, request, article_status, since):
    return {
        # used to conveniently generate the reverse url
        'original': {'article_status': article_status, 'since': since},
        'article_status': tuple(article_status.split('+')),
        'since': utils.utcnow() - timedelta(days=int(since)),
    }
def load_channels(self):
    """
    Loads the channels and tools given the plugin path specified

    :return: The loaded channels, including a tool channel, for the tools found.
    """
    channels = []

    # Try to get channels
    for channel_name in self.channel_names:
        channel_path = os.path.join(self.path, "channels")
        sys.path.append(self.path)
        mod = imp.load_module(
            channel_name, *imp.find_module(channel_name, [channel_path]))
        cls = getattr(mod, channel_name.title().replace("_", ""))
        channel_id = channel_name.split("_")[0]
        # TODO: what about up_to_timestamp?
        try:
            channels.append(cls(channel_id, up_to_timestamp=None))
        except TypeError:
            channels.append(cls(channel_id))

    # Try to get tools
    if self.has_tools:
        tool_path = os.path.join(self.path, "tools")
        # Create a tool channel using this path
        channel_id = self.channel_id_prefix + "_" + "tools"
        channel = ToolChannel(channel_id, tool_path, up_to_timestamp=utcnow())
        channels.append(channel)

    if self.has_assets:
        asset_path = os.path.join(os.path.abspath(self.path), "assets")
        channel_id = self.channel_id_prefix + "_" + "assets"
        channel = AssetsChannel2(channel_id, asset_path, up_to_timestamp=utcnow())
        channels.append(channel)

    # from . import TimeInterval
    # channel.streams.values()[0].window(TimeInterval.up_to_now()).items()

    return channels
def run(self):
    while not self.stop_event.is_set():
        now = utcnow()
        try:
            self.execute(now=now)
        except requests_exceptions.RequestException as e:
            # catch any exception from the requests library
            logging.exception('persistent communication problem: %s', e)

        # Before we sleep, check if the stop_event is set
        if self.stop_event.is_set():
            break

        # now sleep for the service's interval
        time_taken = (utcnow() - now).total_seconds()
        delay_sec = max(0, self.poll_seconds - time_taken)
        sleep(delay_sec)

    logging.info('Service stopped')
def check_data(self, now=None):
    # no handler -> nothing we can do
    if not HAS_PSUTIL:
        return

    # we only want to gather data once a minute
    now = now or utcnow()
    if (now - self.last_send) >= SEND_DELTA:
        logging.info('gathering stats')
        # get stats, store in data
        data = [dumps(self._gather(), sort_keys=True)]
        self.flush_data(data, now)
def execute(self, now=None):
    # We use the call time to determine query parameters and for the
    # remote storage location.
    now = now or utcnow()
    now = now.replace(tzinfo=utc)
    self.last_poll = self.last_poll or now
    ts = now.replace(minute=(now.minute // 10) * 10, second=0, microsecond=0)

    # Check the configuration and display helpful error messages
    if not self._validate_configuration():
        logging.error('Invalid configuration, could not start')
        return

    # Activate the pxGrid session
    if not self._activate():
        logging.warning('Activate request failed')
        return

    # Get the session service information
    peer_node_name, base_url = self._lookup_service()
    secret = self._get_secret(peer_node_name)

    # Do the query (starting one tick after the last poll) and save the
    # most recent timestamp for next time.
    start_dt = self.last_poll + TICK_DELTA
    sessions = self._query_sessions(base_url, start_dt, secret)
    if not sessions:
        logging.info('No sessions since %s', self.last_poll)
        return

    # Normalize the data and send it out
    normalized_sessions = self._normalize_sessions(sessions)
    with NamedTemporaryFile() as f:
        with GzipFile(fileobj=f) as gz_f:
            writer = DictWriter(gz_f, fieldnames=OUTPUT_FIELDNAMES)
            writer.writeheader()
            writer.writerows(normalized_sessions)
        f.flush()
        remote_path = self.api.send_file(
            DATA_TYPE,
            f.name,
            ts,
            suffix='{:04}'.format(now.minute * 60 + now.second))
        if remote_path is not None:
            data = {'path': remote_path, 'log_type': DATA_TYPE}
            self.api.send_signal('logs', data=data)

    # Save the last poll time
    self.last_poll = max(dt_parse(s['timestamp']) for s in sessions)
def create_and_store(self):
    entity = Accumulator_Entity()
    entity.dt = ceil_dt(utcnow(), 15)
    filename = self.get_filename(entity.dt)
    blob = self.bucket.blob(filename)
    pickle_dump = pickle.dumps(entity)
    blob.upload_from_string(data=pickle_dump)
    a = Accumulator.A(entity, blob)
    return a
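# Round-trip sketch (illustrative only; it assumes the same bucket and
# filename convention used by create_and_store above). An entity stored this
# way can be restored exactly as load() does it, by downloading the blob
# bytes and unpickling them:
#
#   blob = self.bucket.get_blob(self.get_filename(entity.dt))
#   restored = pickle.loads(blob.download_as_bytes())
#   assert restored.dt == entity.dt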
def execute(self, now=None):
    logging.info('Checking for Suricata alerts')
    self._rotate_logs()
    self._upload(now, compress=True)

    # ideally we'll update this to use the e-tag, for more responsive
    # config updates. But for now, do it at the start of every day.
    # we call utcnow() again to avoid the race condition where we miss
    # midnight.
    next_time = utcnow() + timedelta(seconds=UPDATE_INTERVAL_SECONDS)
    need_ruleset_update = (now and next_time.date() > now.date())
    if need_ruleset_update:
        self._update_rules()
def execute(self, now=None):
    logging.info('Checking for Suricata alerts')
    self._rotate_logs()
    self._upload(now, compress=True)

    # ideally we'll update this to use the e-tag, for more responsive
    # config updates. But for now, do it at the start of every day.
    # we call utcnow() again to avoid the race condition where we miss
    # midnight.
    next_time = utcnow() + timedelta(seconds=UPDATE_INTERVAL_SECONDS)
    should_update = (now and next_time.date() > now.date())
    if (not os.path.exists(SURICATA_RULE_PATH)) or should_update:
        logging.info('Updating Suricata rules')
        self._update_rules()
        logging.info('Finished updating Suricata rules')
def execute(self, now=None):
    for data_type, priority in MESSAGE_MAP.iteritems():
        try:
            params = self.state[data_type]
        except KeyError:
            params = {'time__gt': utcnow().replace(tzinfo=utc).isoformat()}
            self.state[data_type] = params

        messages = self.get_data(data_type, params)
        if not messages:
            continue

        max_time = max(msg['time'] for msg in messages)
        self.state[data_type] = {'time__gt': max_time}
        self.publish(messages, priority)
def now_minus(cls, weeks=0, days=0, hours=0, minutes=0, seconds=0, milliseconds=0):
    delta = timedelta(weeks=weeks, days=days, hours=hours, minutes=minutes,
                      seconds=seconds, milliseconds=milliseconds, microseconds=0)
    now = utcnow()
    return TimeInterval(now - delta, now)
def parse_time_tuple(start, end):
    """
    Parse a time tuple. These can be:
      relative in seconds,       e.g. (-4, 0)
      relative in timedelta,     e.g. (timedelta(seconds=-4), timedelta(0))
      absolute in date/datetime, e.g. (datetime(2016, 4, 28, 20, 0, 0, 0, UTC),
                                       datetime(2016, 4, 28, 21, 0, 0, 0, UTC))
      absolute in iso strings,   e.g. ("2016-04-28T20:00:00.000Z",
                                       "2016-04-28T20:01:00.000Z")
    Mixtures of relative and absolute are not allowed.

    :param start: Start time
    :param end: End time
    :type start: int | timedelta | datetime | str
    :type end: int | timedelta | datetime | str
    :return: TimeInterval or RelativeTimeInterval object
    """
    if isinstance(start, int):
        start_time = timedelta(seconds=start)
    elif isinstance(start, timedelta):
        start_time = start
    elif start is None:
        start_time = MIN_DATE
    elif isinstance(start, (date, datetime)):
        start_time = start.replace(tzinfo=UTC)
    else:
        start_time = ciso8601.parse_datetime(start).replace(tzinfo=UTC)

    if isinstance(end, int):
        # TODO: add check for future (negative values) and ensure that start < end
        if not isinstance(start_time, timedelta):
            raise ValueError("Can't mix relative and absolute times")
        end_time = timedelta(seconds=end)
    elif isinstance(end, timedelta):
        if not isinstance(start_time, timedelta):
            raise ValueError("Can't mix relative and absolute times")
        end_time = end
    elif end is None:
        end_time = utcnow()  # TODO: or MAX_DATE?
    elif isinstance(end, datetime):
        end_time = end.replace(tzinfo=UTC)
    else:
        end_time = ciso8601.parse_datetime(end).replace(tzinfo=UTC)

    if isinstance(start_time, timedelta):
        return RelativeTimeInterval(start=start_time, end=end_time)
    else:
        return TimeInterval(start=start_time, end=end_time)
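# Usage sketch for parse_time_tuple (illustrative; assumes the TimeInterval
# and RelativeTimeInterval classes referenced above):
#
#   parse_time_tuple(-4, 0)
#       -> RelativeTimeInterval(timedelta(seconds=-4), timedelta(0))
#   parse_time_tuple("2016-04-28T20:00:00.000Z", "2016-04-28T20:01:00.000Z")
#       -> TimeInterval(datetime(2016, 4, 28, 20, 0, tzinfo=UTC),
#                       datetime(2016, 4, 28, 20, 1, tzinfo=UTC))
#   parse_time_tuple("2016-04-28T20:00:00.000Z", -4)
#       -> raises ValueError("Can't mix relative and absolute times"),
#          because an absolute start cannot be paired with a relative end.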
def store_metric_environment():
    envelope = request.get_json()
    if not envelope:
        msg = 'no Pub/Sub message received'
        print(f'error: {msg}')
        return f'Bad Request: {msg}', 400

    if not isinstance(envelope, dict) or 'message' not in envelope:
        msg = 'invalid Pub/Sub message format'
        print(f'error: {msg}')
        return f'Bad Request: {msg}', 400

    pubsub_message = envelope['message']

    payload = ''
    if isinstance(pubsub_message, dict) and 'data' in pubsub_message:
        payload = base64.b64decode(
            pubsub_message['data']).decode('utf-8').strip()

    if "location:house.basement" in payload:
        # Raw strings avoid the invalid '\:' escape in the original patterns.
        print(re.match(r"temperature:([0-9]+\.[0-9]+)", payload))
        json_content = {
            "temperature": float(
                re.match(r".+temperature:([0-9]+\.[0-9]+)", payload).groups()[0]),
            "original_payload": payload
        }
        filename = "environment_sensor_basement-" + datetime.now().strftime(
            FORMAT_DATE_DASH)
        create_file(json.dumps(json_content), filename)

        accumulator = Accumulator(app.logger)
        n = utcnow()
        try:
            accumulator.add_temperature(
                n, temp_basement=json_content.get('temperature'))
        except ValueError as ex:
            app.logger.warn(
                "Accumulator - no value to add - content: {} --- {}".format(
                    payload, ex))

    return ('', 204)
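# Illustration of the payload parsing above, using a hypothetical message body
# (the real sensor format may differ):
#
#   payload = "location:house.basement temperature:21.50"
#   m = re.match(r".+temperature:([0-9]+\.[0-9]+)", payload)
#   m.groups()[0]         # -> '21.50'
#   float(m.groups()[0])  # -> 21.5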
def acc(j):
    accumulator = Accumulator(app.logger)
    n = utcnow()

    if j.get('temperature') is not None:
        j['temperature'] = float(j.get('temperature'))
    if j.get('humidity') is not None:
        j['humidity'] = float(j.get('humidity'))
    if j.get('stove_exhaust_temp') is not None:
        j['stove_exhaust_temp'] = float(j.get('stove_exhaust_temp'))

    try:
        accumulator.add_temperature2(n, value_dict=j)
    except ValueError as ex:
        # Log the incoming dict 'j'; the original referenced an undefined
        # 'payload' variable here.
        app.logger.warn(
            "Accumulator - no value to add - content: {} --- {}".format(j, ex))

    return accumulator
def execute(self, now=None):
    # Retrieve entries from the log file
    now = now or utcnow()
    self.log_node.check_data(now)

    # We will send data from the previous 10 minute segment
    now_segment = now.replace(minute=(now.minute // 10) * 10,
                              second=0,
                              microsecond=0)
    send_segment = now_segment - timedelta(minutes=10)

    # Remove data that came in too late to do anything about
    all_segments = sorted(self.log_node.parsed_data.iterkeys())
    for segment in all_segments:
        if segment < send_segment:
            del self.log_node.parsed_data[segment]

    self._check_point_to_csv(send_segment, now)

    super(CheckPointPusher, self).execute(now=now)
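# Worked example of the segment arithmetic above (timestamps are illustrative):
# if now is 2020-01-01 12:34:56, then
#   now_segment  = 2020-01-01 12:30:00   (minute floored to a multiple of 10)
#   send_segment = 2020-01-01 12:20:00   (the previous, fully-completed segment)
# so any parsed_data keyed earlier than 12:20:00 arrived too late and is dropped
# before the checkpoint is written.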
def execute(self, now=None):
    if not self.logger.handlers:
        return

    for data_type in self.notification_types:
        if data_type not in MESSAGE_MAP:
            continue
        endpoint = MESSAGE_MAP[data_type]['endpoint']
        priority = MESSAGE_MAP[data_type]['priority']

        try:
            params = self.state[data_type]
        except KeyError:
            params = {'time__gt': utcnow().replace(tzinfo=utc).isoformat()}
            self.state[data_type] = params

        messages = self.get_data(endpoint, params)
        if not messages:
            continue

        max_time = max(msg['time'] for msg in messages)
        self.state[data_type] = {'time__gt': max_time}
        self.publish(messages, priority)
def get_data(xlsx_files, year_months): """ Populate the database with data extract in xlsx files. One file per year_month, only one tab per file. Back/Forth routes in rows, one column per way. :param xlsx_files: dict of file names :param year_months: list of strings (YYYY-MM) :return: """ global provider, unknown_airports now = utcnow() airport_replacement = {} airport_exclusions = {} def log_bulk(self): log.info(' store external_segment: %r', self.nresult) for xlsx_f in xlsx_files: # loop through each file previous_data = pd.DataFrame(columns=['origin', 'destination', 'year_month', 'passengers']) row_nb = 0 if "domestic" in xlsx_f: perimeter = "domestic" full_provider = provider + ' - domestic' else: perimeter = "international" full_provider = provider + ' - intl' print('******************** processing Excel file:', xlsx_f) xls = format_file(xlsx_f, perimeter) all_rows = len(xls.index) row_nb = 0 with External_Segment_Tmp.unordered_bulk(1000, execute_callback=log_bulk) as bulk: for row_index, row in xls.iterrows(): # loop through each row (origin, destination) in file row_nb += 1 year_month = row['year_month'] if year_month not in year_months: continue # First the process for domestic files if perimeter == "domestic": passengers = int(row['Passengers']) airport_origin = row['From'] airport_destination = row['To'] if airport_origin in airport_exclusions or airport_destination in airport_exclusions: # skip exclusions continue if airport_origin in airport_replacement: # correct the wrong codes airport_origin = airport_replacement.get(airport_origin) if airport_destination in airport_replacement: # correct the wrong codes airport_destination = airport_replacement.get(airport_destination) if not check_airport(airport_destination, passengers, perimeter): continue if not check_airport(airport_destination, passengers, perimeter): continue if ((previous_data['origin'] == airport_origin) & (previous_data['destination'] == airport_destination) & (previous_data['year_month'] == year_month)).any(): new_row = False # Add to Excel file's total_pax the "passenger" integer you get from filtering # previous_data on other columns passengers += int(previous_data['passengers'][ (previous_data['origin'] == airport_origin) & (previous_data['destination'] == airport_destination) & (previous_data['year_month'] == year_month)]) else: new_row = True dic = dict(provider=full_provider, data_type='airport', airline=['*'], airline_ref_code=['*'], total_pax=passengers, overlap=[], origin=[airport_origin], destination=[airport_destination], year_month=[year_month], raw_rec=dict(row), both_ways=False, from_line=row_index, from_filename=xlsx_f, url=domestic_url) new_data = pd.Series({'origin': airport_origin, 'destination': airport_destination, 'year_month': year_month, 'passengers': passengers}).to_frame() if new_row: previous_data = previous_data.append(new_data.T, ignore_index=True) else: previous_data['passengers'][ (previous_data['origin'] == airport_origin) & (previous_data['destination'] == airport_destination) & (previous_data['year_month'] == year_month)] = passengers # Modify previous_data's pax query = dict((k, dic[k]) for k in ('origin', 'destination', 'year_month', 'provider', 'data_type', 'airline')) bulk.find(query).upsert().update_one({'$set': dic, '$setOnInsert': dict(inserted=now)}) if row_nb % 1000 == 0: print('{0:.3g}'.format(float(row_nb) / float(all_rows) * 100) + '%') # Now for international files else: # Handle missing data, written ".." 
in the excel files row.replace('..', np.nan, inplace=True) if pd.isnull(row['TotalPax']): continue if pd.isnull(row['PaxIn']): way_in = False else: way_in = True passengers_in = int(row['PaxIn']) if pd.isnull(row['PaxOut']): way_out = False else: way_out = True passengers_out = int(row['PaxOut']) australian_city = row['AustralianPort'] other_city = row['ForeignPort'] other_country = row['Country'] australian_airport = find_airports_by_name(australian_city, 'australian') other_airport = find_airports_by_name(other_city, 'other') # If one of the airports is not recognized by name, store and skip if not australian_airport: check_airport(airport=None, pax=int(row['TotalPax']), perimeter='international', city=australian_city, country='Australia') continue if not other_airport: check_airport(airport=None, pax=int(row['TotalPax']), perimeter='international', city=other_city, country=other_country) continue # Only store data if there was an integer in the PaxIn and/or PaxOut if way_in: dic_in = dict(provider=full_provider, data_type='airport', airline=['*'], airline_ref_code=['*'], total_pax=passengers_in, origin=sorted(other_airport), destination=sorted(australian_airport), year_month=[row['year_month']], raw_rec=dict(row), both_ways=False, from_line=row_index, from_filename=xlsx_f, url=domestic_url) query = dict((k, dic_in[k]) for k in ('origin', 'destination', 'year_month', 'provider', 'data_type')) bulk.find(query).upsert().update_one({'$set': dic_in, '$setOnInsert': dict(inserted=now)}) if way_out: dic_out = dict(provider=full_provider, data_type='airport', airline=['*'], airline_ref_code=['*'], total_pax=passengers_out, origin=sorted(australian_airport), destination=sorted(other_airport), year_month=[row['year_month']], raw_rec=dict(row), both_ways=False, from_line=row_index, from_filename=xlsx_f, url=domestic_url) query = dict((k, dic_out[k]) for k in ('origin', 'destination', 'year_month', 'provider', 'data_type')) bulk.find(query).upsert().update_one({'$set': dic_out, '$setOnInsert': dict(inserted=now)}) log.info('stored: %r', bulk.nresult)
def main(runPath, argv): try: opts, args = getopt.getopt(argv, 'ip', ['iam_role', 'profile_name']) except getopt.GetoptError as err: print(repr(err)) print('EDDNDynamoRaw.py -p <profile name> OR -r') sys.exit(2) profile_name = 'eddntest' for opt, arg in opts: if opt in ("-i", "--iam_role"): profile_name = '' if opt in ("-p", "--profile_name"): profile_name = arg logging.config.dictConfig({ 'version': 1, 'disable_existing_loggers': False, 'formatters': { 'standard': { 'format': '%(asctime)s [%(levelname)s] %(name)s: %(message)s' }, }, 'handlers': { 'default': { 'class': 'logging.StreamHandler', 'level': 'INFO', 'formatter': 'standard', 'stream': 'ext://sys.stdout' }, 'file_handler': { 'class': 'logging.handlers.RotatingFileHandler', 'formatter': 'standard', 'level': 'INFO', 'filename': 'eddn-dynamo-raw.log', 'maxBytes': 10485760, 'backupCount': 5, 'encoding': 'utf8' } }, 'loggers': { '': { 'handlers': ['default', 'file_handler'], 'level': 'INFO', 'propagate': True } } }) logger.debug('Logging configured') commodity_schema1 = requests.get('http://schemas.elite-markets.net/eddn/commodity/1', headers={'Connection': 'close'}).json() logger.info('Obtained commodity schema v1') commodity_schema2 = requests.get('https://raw.githubusercontent.com/jamesremuscat/EDDN/924c948a0233421e684145a6c40751c5a7a6bef9/schemas/commodity-v2.0.json', headers={'Connection': 'close'}).json() logger.info('Obtained commodity schema v2') commodity_schema3 = requests.get('https://raw.githubusercontent.com/jamesremuscat/EDDN/master/schemas/commodity-v3.0.json', headers={'Connection': 'close'}).json() logger.info('Obtained commodity schema v3') shipyard_schema1 = requests.get('https://raw.githubusercontent.com/jamesremuscat/EDDN/924c948a0233421e684145a6c40751c5a7a6bef9/schemas/shipyard-v1.0.json', headers={'Connection': 'close'}).json() logger.info('Obtained shipyard schema v1') shipyard_schema2 = requests.get('https://raw.githubusercontent.com/jamesremuscat/EDDN/master/schemas/shipyard-v2.0.json', headers={'Connection': 'close'}).json() logger.info('Obtained shipyard schema v2') outfitting_schema1 = requests.get('https://raw.githubusercontent.com/jamesremuscat/EDDN/924c948a0233421e684145a6c40751c5a7a6bef9/schemas/outfitting-v1.0.json', headers={'Connection': 'close'}).json() logger.info('Obtained outfitting schema v1') outfitting_schema2 = requests.get('https://raw.githubusercontent.com/jamesremuscat/EDDN/master/schemas/outfitting-v2.0.json', headers={'Connection': 'close'}).json() logger.info('Obtained outfitting schema v2') journal_schema1 = requests.get('https://raw.githubusercontent.com/jamesremuscat/EDDN/master/schemas/journal-v1.0.json', headers={'Connection': 'close'}).json() logger.info('Obtained journal schema v1') blackmarket_schema1 = requests.get('https://raw.githubusercontent.com/jamesremuscat/EDDN/master/schemas/blackmarket-v1.0.json', headers={'Connection': 'close'}).json() logger.info('Obtained blackmarket schema v1') if profile_name: boto3.setup_default_session(profile_name=profile_name) dynamodb = boto3.resource('dynamodb', region_name='eu-west-1') logger.info('Connected to Dynamo') ioloop.install() logger.info('Installed PyZMQ version of Tornado IOLoop') context = zmq.Context() message_processes = MessageProcessor(commodity_schema1, commodity_schema2, commodity_schema3, shipyard_schema1, shipyard_schema2, outfitting_schema1, outfitting_schema2, journal_schema1, blackmarket_schema1, dynamodb) # Ideally the timeout here would be coordinated with keep-alive timing from EDDN subscriber = 
Subscriber(context, random.randint(1500, 1800), message_processes.process_message, message_processes.send_messages) while not subscriber.shutdown_signalled: try: subscriber.start() except Exception as ex: logger.exception('Exception encountered in communications, listening again') bad_data_table.put_item(Item={ 'timestamp': utils.date_to_epoch_micro(utils.utcnow()), 'errorCause': repr(ex), 'source': 'DynamoRaw' }) sleep(0.001)
def run(site, lst, date, targets, testsources, calibrators, args): if date is None: date = datetime.date.today() if lst is None: utc = utils.utcnow() lst = site.utc_to_lst(utc=utc, date=date) datestr = date.strftime("%b %d, %Y") lststr = utils.deg_to_hmsstr(lst*15)[0] utc = site.lst_to_utc(lst=lst, date=date) utcstr = utils.deg_to_hmsstr(utc*15)[0] print "%s\tLST: %s\tUTC: %s\n" % (datestr, lststr, utcstr) for srclist in [calibrators, targets, testsources]: for src in srclist: ra_deg, dec_deg = src.get_posn(lst, date) rastr = "R.A. (J2000): %s" % utils.deg_to_hmsstr(ra_deg, 2)[0] decstr = "Dec. (J2000): %s" % utils.deg_to_dmsstr(dec_deg, 2)[0] print "%-20s%-27s%27s" % (src.name, rastr, decstr) try: risetime, settime = src.get_rise_set_times(site, date) except errors.SourceIsCircumpolar: srctypestr = "(%s)" % srclist.name print "%-20sSource is circumpolar." % srctypestr except errors.SourceNeverRises: srctypestr = "(%s)" % srclist.name print "%-20sSource never rises." % srctypestr except errors.MultipleRiseSets: srctypestr = "(%s)" % srclist.name print "%-20sMultiple rise/set times?!" % srctypestr except: srctypestr = "(%s)" % srclist.name print "%-20sError! Oops..." % srctypestr raise else: if src.is_visible(site, lst, date): eventstr = "Source sets in %s" % \ utils.deg_to_hmsstr(((settime-lst)%24)*15)[0] else: eventstr = "Source rises in %s" % \ utils.deg_to_hmsstr(((risetime-lst)%24)*15)[0] risetosetstr = "Rise to set time: %s" % \ utils.deg_to_hmsstr(((settime-risetime)%24)*15)[0] riselststr = "Rise (LST): %s" % \ utils.deg_to_hmsstr((risetime%24)*15)[0] riseutcstr = "Rise (UTC): %s" % \ utils.deg_to_hmsstr((site.lst_to_utc(risetime, \ date)%24)*15)[0] setlststr = "Set (LST): %s" % \ utils.deg_to_hmsstr((settime%24)*15)[0] setutcstr = "Set (UTC): %s" % \ utils.deg_to_hmsstr((site.lst_to_utc(settime, \ date)%24)*15)[0] srctypestr = "(%s)" % srclist.name print "%-20s%-27s%27s" % (srctypestr, risetosetstr, eventstr) print " "*20 + "%-22s%22s" % (riselststr, setlststr) print " "*20 + "%-22s%22s" % (riseutcstr, setutcstr) if src.notes: print "" print " "*20 + "NOTES: %s" % src.notes print "" print ""
def __init__(self, app):
    self.app = app
    # time.strftime() expects a struct_time, not a datetime, so format the
    # datetime returned by utils.utcnow() directly.
    self.expires_value = utils.utcnow().strftime("%a, %d %b %Y %H:%M:%S UTC")
def checkpoint(self, now=None):
    self.data = []
    self.last_send = now or utcnow()
def process_message(self, msg): year = utils.utcnow().year if self.info_table is None or self.year != year: self.info_table = self.dynamodb.Table('eddn-archive-{0}'.format(year)) self.year = year try: raw_json = zlib.decompress(msg).decode(encoding='UTF-8') try: msg_from_json = simplejson.loads(raw_json) logger.debug('Raw json {0}'.format(msg_from_json)) if msg_from_json['$schemaRef'] == 'http://schemas.elite-markets.net/eddn/commodity/1': jsonschema.validate(msg_from_json, self.commodity_schema1) logger.debug('Json passed commodity schema v1 validation') storage = self.commodity_storage elif msg_from_json['$schemaRef'] == 'http://schemas.elite-markets.net/eddn/commodity/2': jsonschema.validate(msg_from_json, self.commodity_schema2) logger.debug('Json passed commodity schema v2 validation') storage = self.commodity_storage elif msg_from_json['$schemaRef'] == 'http://schemas.elite-markets.net/eddn/commodity/3': jsonschema.validate(msg_from_json, self.commodity_schema3) logger.debug('Json passed commodity schema v3 validation') storage = self.commodity_storage elif msg_from_json['$schemaRef'] == 'http://schemas.elite-markets.net/eddn/shipyard/1': jsonschema.validate(msg_from_json, self.shipyard_schema1) logger.debug('Json passed shipyard schema v1 validation') storage = self.shipyard_storage elif msg_from_json['$schemaRef'] == 'http://schemas.elite-markets.net/eddn/shipyard/2': jsonschema.validate(msg_from_json, self.shipyard_schema2) logger.debug('Json passed shipyard schema v2 validation') storage = self.shipyard_storage elif msg_from_json['$schemaRef'] == 'http://schemas.elite-markets.net/eddn/outfitting/1': jsonschema.validate(msg_from_json, self.outfitting_schema1) logger.debug('Json passed outfitting schema v1 validation') storage = self.outfitting_storage elif msg_from_json['$schemaRef'] == 'http://schemas.elite-markets.net/eddn/outfitting/2': jsonschema.validate(msg_from_json, self.outfitting_schema2) logger.debug('Json passed outfitting schema v2 validation') storage = self.outfitting_storage elif msg_from_json['$schemaRef'] == 'http://schemas.elite-markets.net/eddn/blackmarket/1': jsonschema.validate(msg_from_json, self.blackmarket_schema1) logger.debug('Json passed blackmarket schema v1 validation') storage = self.blackmarket_storage elif msg_from_json['$schemaRef'] == 'http://schemas.elite-markets.net/eddn/journal/1': jsonschema.validate(msg_from_json, self.journal_schema1) logger.debug('Json passed journal schema v1 validation') storage = self.journal_storage else: logger.debug('Data returned is not commodity, shipyard, outfitting, blackmarket, journal, ignoring {0}'.format(msg_from_json)) return timestamp = utils.utcnow() if raw_json not in storage: storage[raw_json] = (timestamp, msg_from_json) if len(storage) >= 3: self.send_messages() except Exception as ex: logger.exception('Exception encountered in parsing, listening again') self.error_table.put_item(Item={ 'timestamp': utils.date_to_epoch_micro(utils.utcnow()), 'badData': raw_json, 'errorCause': repr(ex), 'source': 'DynamoRawStore' }) sleep(0.001) except Exception as ex: logger.exception('Exception encountered in communications, listening again') self.error_table.put_item(Item={ 'timestamp': utils.date_to_epoch_micro(utils.utcnow()), 'errorCause': repr(ex), 'source': 'DynamoRawDecompress' }) sleep(0.001)
    author_id = db.Column(db.Integer, db.ForeignKey('users.username'))
    stream_id = db.Column(db.Integer, db.ForeignKey('streams.eid'))
"""
sys.stdout.write('Post\tpid\ttitle\tbody\tbody_html\ttimestamp\tauthor_id\tstream_id\n')
entitymap = defaultdict(set)
with open(sys.argv[1]) as infile:
    for line in infile:
        fields = line.strip('\r\t\n').split('\t')
        if len(fields) < 2:
            continue
        caption = fields[1]
        eid = fields[0]
        entitymap[caption].add(eid)

with open(sys.argv[2]) as infile2:
    for line in infile2:
        # print line
        fields = line.strip('\r\t\n').split('\t')
        if fields[0] not in entitymap:
            continue
        for stream in entitymap[fields[0]]:
            pid = 'post-' + id_generator()
            title = fields[1]
            html = fields[2]
            author = 'wikomega'
            streamid = stream
            timestamp = str(utcnow())
            sys.stdout.write('%s\n' % '\t'.join([pid, title, html, author, streamid, timestamp]))
def get_data(csv_files): """ Populate the database with data extract in csv files :return: """ global provider airport_replacement = { "SBCD": "SSCC", "SWUY": "SBUY", "SBJI": "SWJI", "RJNN": "RJNA", "SBPM": "SBPJ", "SEQU": "SEQM", "SNQY": "SBJU", "SJDB": "SBDB", "SWJH": "SIZX", "SNNG": "SJNP", "SDFR": "SDDN", "1AON": "SDOW", "SMPB": "SMJP", "2NHT": "SBTC", "SWIQ": "SBMC", "SWKK": "SSKW", "SAIG": "SARI", "SBER": "SWEI" } airport_exclusions = {"SBNT", "SUPE", "6ASO", "SAMQ"} airline_replacements = {"VIP": "FPG", "BLC": "TAM"} def log_bulk(self): log.info(' store external_segment: %r', self.nresult) for csv_f in csv_files: # loop through each file print('******************** processed csv: ', csv_f) with open('%s/%s' % (tmp_dir, csv_f)) as csv_file: dict_reader = csv.DictReader(csv_file) row_nb = 0 previous_data = pd.DataFrame(columns=[ 'origin', 'destination', 'year_month', 'airline', 'passengers' ]) with External_Segment_Tmp.unordered_bulk( 1000, execute_callback=log_bulk) as bulk: for row in dict_reader: # loop through each row (origin, destination) in file row_nb += 1 for key, value in row.items(): if value == ':': row[key] = '' if ((row['PASSAGEIROS PAGOS'] == '0') and (row['PASSAGEIROS PAGOS'] == '0')) or \ (row['PASSAGEIROS PAGOS'] == ''): # skip rows with no pax continue total_pax = int(row['PASSAGEIROS PAGOS']) + int( row['PASSAGEIROS GRÁTIS']) row_airline = get_airline_by_icao(row['EMPRESA (SIGLA)'], row['EMPRESA (NOME)']) if row['AEROPORTO DE ORIGEM (SIGLA)'] in airport_exclusions or \ row['AEROPORTO DE DESTINO (SIGLA)'] in airport_exclusions: # skip exclusions continue airport_origin = get_airport_by_icao( row['AEROPORTO DE ORIGEM (SIGLA)'], row['AEROPORTO DE ORIGEM (NOME)']) airport_destination = get_airport_by_icao( row['AEROPORTO DE DESTINO (SIGLA)'], row['AEROPORTO DE DESTINO (NOME)']) if airport_destination is None: continue if airport_origin is None: continue if row_airline in airline_replacements: row_airline = airline_replacements.get(row_airline) if airport_origin in airport_replacement: airport_origin = airport_replacement.get( airport_origin) if airport_destination in airport_replacement: airport_destination = airport_replacement.get( airport_destination) year_month = '%04d-%02d' % (int(row['ANO']), int( row['MÊS'])) if ((previous_data['origin'] == airport_origin) & (previous_data['destination'] == airport_destination) & (previous_data['year_month'] == year_month) & (previous_data['airline'] == row_airline)).any(): new_row = False total_pax += int( previous_data['passengers'][ (previous_data['origin'] == airport_origin) & (previous_data['destination'] == airport_destination) & (previous_data['year_month'] == year_month) & (previous_data['airline'] == row_airline)] ) # Add to Excel file's total_pax the "passenger" integer you get from filtering previous_data on other columns else: new_row = True dic = dict(provider=provider, data_type='airport', airline=row_airline, origin=airport_origin, destination=airport_destination, year_month=year_month, total_pax=total_pax, raw_rec=row, both_ways=False, from_line=row_nb, from_filename=csv_f, url=full_url) new_data = pd.Series({ 'origin': airport_origin, 'destination': airport_destination, 'year_month': year_month, 'airline': row_airline, 'passengers': total_pax }).to_frame() if new_row: previous_data = previous_data.append(new_data.T, ignore_index=True) else: previous_data['passengers'][ (previous_data['origin'] == airport_origin) & (previous_data['destination'] == airport_destination) & (previous_data['airline'] == row_airline) & 
(previous_data['year_month'] == year_month )] = total_pax # Modify previous_data's pax now = utcnow() query = dict( (k, dic[k]) for k in ('origin', 'destination', 'year_month', 'provider', 'data_type', 'airline')) bulk.find(query).upsert().update_one({ '$set': dic, '$setOnInsert': dict(inserted=now) }) if row_nb % 1000 == 0: print(row_nb / len(dict_reader) * 100, "%") log.info('stored: %r', bulk.nresult)
def get_data(xlsx_files): """ Populate the database with data extract in xlsx files :return: """ global provider airport_codes = get_airports_codes() airport_replacement = { "BUE": "EZE", "RIO": "GIG", "LMC": "LMC", "LMA": "MCJ", "VGP": "VGZ", "PTL": "PTX", "MIL": "MXP", "LON": "LHR", "SAO": "CGH", "BSL": "BSL", "TRP": "TCD", "RLB": "LIR", "NYC": "JFK", "GTK": "FRS", "AWH": "USH", "STO": "ARN", "WAS": "IAD", "BHZ": "PLU" } def log_bulk(self): log.info(' store external_segment: %r', self.nresult) for xlsx_f in xlsx_files: # loop through each file print('******************** processing Excel file:', xlsx_f) xls = pd.read_excel(tmp_dir + "/" + xlsx_f) header = np.where( xls.loc[:, :] == "Pasajeros")[0] + 1 # Look for column names xls = pd.read_excel(tmp_dir + "/" + xlsx_f, header=header) # Re-load file with headers xls = format_columns(xls) row_nb = 0 previous_data = pd.DataFrame( columns=[ 'origin', 'destination', 'year_month', 'airline', 'passengers' ] ) # Create a dataframe to save data line after line, so we can check later on with External_Segment_Tmp.unordered_bulk( 1000, execute_callback=log_bulk) as bulk: for row in range( 0, len(xls) ): # loop through each row (origin, destination) in file row_nb += 1 full_row = xls.iloc[row] if np.isnan( full_row['Passengers'] ) or full_row['Passengers'] == "" or int( full_row['Passengers']) == 0: # skip rows with no pax continue total_pax = int(full_row['Passengers']) row_airline = get_airline_by_icao(full_row['Airline'], full_row['Airline_Name']) if row_airline is None: continue airport_origin = full_row['Origen'] airport_destination = full_row['Destino'] if airport_origin in airport_replacement: # correct the wrong codes airport_origin = airport_replacement.get(airport_origin) if airport_destination in airport_replacement: # correct the wrong codes airport_destination = airport_replacement.get( airport_destination) if airport_destination not in airport_codes: unknown_airports.add(airport_destination + ":" + str(full_row['Airport_Destination']) + ":" + str(full_row['Pais Destino'])) continue if airport_origin not in airport_codes: unknown_airports.add(airport_origin + ":" + str(full_row['Airport_Origin']) + ":" + str(full_row['Pais Origen'])) continue year_month = full_row['Year_Month'] if ((previous_data['origin'] == airport_origin) & (previous_data['destination'] == airport_destination) & (previous_data['year_month'] == year_month) & (previous_data['airline'] == row_airline)).any(): new_row = False # Add to Excel file's total_pax the "passenger" integer you get from filtering previous_data on other columns total_pax += int(previous_data['passengers'][ (previous_data['origin'] == airport_origin) & (previous_data['destination'] == airport_destination) & (previous_data['year_month'] == year_month) & (previous_data['airline'] == row_airline)]) else: new_row = True dic = dict(provider=provider, data_type='airport', airline=row_airline, origin=airport_origin, destination=airport_destination, year_month=year_month, total_pax=total_pax, raw_rec=full_row.to_json(), both_ways=False, from_line=row_nb, from_filename=xlsx_f, url=full_url) new_data = pd.Series({ 'origin': airport_origin, 'destination': airport_destination, 'year_month': year_month, 'airline': row_airline, 'passengers': total_pax }).to_frame() if new_row: previous_data = previous_data.append(new_data.T, ignore_index=True) else: # Update the previous_data data frame with the new passengers count previous_data['passengers'][ (previous_data['origin'] == airport_origin) & 
(previous_data['destination'] == airport_destination) & (previous_data['airline'] == row_airline) & (previous_data['year_month'] == year_month)] = total_pax now = utcnow() query = dict((k, dic[k]) for k in ('origin', 'destination', 'year_month', 'provider', 'data_type', 'airline')) bulk.find(query).upsert().update_one({ '$set': dic, '$setOnInsert': dict(inserted=now) }) if row_nb % 1000 == 0: print(row_nb / len(xls) * 100, "%") log.info('stored: %r', bulk.nresult)
def get_data(xlsx_files, year_months): """ Populate the database with data extract in xlsx files. 4 different tabs, for distinction of national/international and scheduled/charter flights. Routes in rows, months in columns. :param xlsx_files: dict of file names :return: """ global provider now = utcnow() months = { "January": "01", "February": "02", "March": "03", "April": "04", "May": "05", "June": "06", "July": "07", "August": "08", "September": "09", "October": "10", "November": "11", "December": "12" } def log_bulk(self): log.info(' store external_segment: %r', self.nresult) for xlsx_f in xlsx_files: # loop through each file print('******************** processing Excel file:', xlsx_f) xl = pd.ExcelFile(tmp_dir + "/" + xlsx_f) # Create a data frame to save data line after line, so we can check later on and add values to each other previous_data = pd.DataFrame( columns=['origin', 'destination', 'year_month', 'passengers']) for tab in xl.sheet_names: # loop in all sheets of the excel file print('Starting', tab, 'tab in the Excel file') xls = xl.parse(tab) year = int(filter(str.isdigit, xlsx_f)) # Use the renamed file for the year header = np.where(xls.loc[:, :] == "PAR DE CIUDADES / CITY PAIR" )[0] + 3 # Look for line with column names xls = xl.parse(tab, header=header) # Re-load file with headers xls = format_file(xls) xls['tab'] = tab with External_Segment_Tmp.unordered_bulk( 1000, execute_callback=log_bulk) as bulk: for indx, row in xls.iterrows( ): # loop through each row (origin, destination) in file # Skip empty rows (no text in Origin column, or year Total = 0) if isinstance(row['Origin'], float) or row['Total'] == 0: continue # Stop at the end of the table (indicated by "T O T A L") if "".join(row['Origin'].split(" ")).upper() == "TOTAL": break origin = unidecode(row['Origin']).upper() destination = unidecode(row['Destination']).upper() airport_origin = find_airports_by_name(origin, tab) airport_destination = find_airports_by_name( destination, tab) if airport_origin is None: update_unknown_airports(origin, row['Total']) continue if airport_destination is None: update_unknown_airports(destination, row['Total']) continue for colname, colvalue in row.iteritems( ): # loop through rows # Only look at month columns if colname not in months.keys(): continue # skip cells with no pax if np.isnan(colvalue) or colvalue == "" or int( colvalue) == 0: continue year_month = str(year) + "-" + months.get(colname) total_pax = int(colvalue) # Only treat the requested year_months if year_month not in year_months: continue if year_month not in previous_data[ 'year_month'].values: if External_Segment_Tmp.find_one({ 'year_month': year_month, 'provider': provider }): log.warning( "This year_month (%s) already exists for provider %s", year_month, provider) # For international flights, only keep the airports for which capacity exists on that year_month if 'INT' in tab: airport_origin, airport_destination = get_capa( year_month, airport_origin, airport_destination) if airport_destination is None or airport_origin is None: no_capa.append({ 'year_month': year_month, 'origin': origin, 'destination': destination }) continue if ((previous_data['origin'] == airport_origin) & (previous_data['destination'] == airport_destination) & (previous_data['year_month'] == year_month)).any(): new_row = False # Add to Excel file's total_pax the "passenger" integer you get from filtering previous_data on other columns total_pax += int(previous_data['passengers'][ (previous_data['origin'] == airport_origin) & 
(previous_data['destination'] == airport_destination) & (previous_data['year_month'] == year_month)]) else: new_row = True dic = dict( provider=provider, data_type='airport', airline=['*'], airline_ref_code=['*'], origin=[', '.join(airport_origin)], destination=[', '.join(airport_destination)], year_month=[year_month], total_pax=total_pax, raw_rec=dict(row), both_ways=False, from_line=indx, from_filename=xlsx_f, url=base_url + end_url) new_data = pd.Series({ 'origin': airport_origin, 'destination': airport_destination, 'year_month': year_month, 'passengers': total_pax }).to_frame() if new_row: previous_data = previous_data.append( new_data.T, ignore_index=True) else: # Update the previous_data data frame with the new passengers count previous_data['passengers'][ (previous_data['origin'] == airport_origin) & (previous_data['destination'] == airport_destination) & (previous_data['year_month'] == year_month)] = total_pax query = dict( (k, dic[k]) for k in ('origin', 'destination', 'year_month', 'provider', 'data_type', 'airline')) bulk.find(query).upsert().update_one({ '$set': dic, '$setOnInsert': dict(inserted=now) }) log.info('stored: %r', bulk.nresult)
logging_format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.DEBUG, format=logging_format)
handler = BackupFileHandler('load_Chili.log', backupCount=20)
formatter = logging.Formatter(logging_format)
handler.setFormatter(formatter)

main_log = logging.getLogger()  # the root handler
log = logging.getLogger('load_Chili')
# log.setLevel(logging.INFO)
log.setLevel(logging.DEBUG)
log.addHandler(handler)

log.info('Load files from Chili - %s - %r', __version__, p)

now = utcnow()
year_months = p.year_months[0].split(', ')
year = list(set([int(ym[0:4]) for ym in year_months]))
file_pattern = p.all_files or 'Trafico-de-Par-de-ciudades-por-Operador'
get_files(year, file_pattern)
Model.init_db(def_w=True)

if file_pattern is not True:
    for type_flight in (
            'Internacional',
            'Nacional',
    ):
        analyse_and_store('%s-%s' % (file_pattern, type_flight))
def get_data(xlsx_files): """ Populate the database with data extract in xlsx files. One file per year_month, only one tab per file. Back/Forth routes in rows, one column per way. :param xlsx_files: dict of file names :return: """ global provider now = utcnow() months = { "January": "01", "February": "02", "March": "03", "April": "04", "May": "05", "June": "06", "July": "07", "August": "08", "September": "09", "October": "10", "November": "11", "December": "12", "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04", "Jun": "06", "Jul": "07", "Aug": "08", "Sep": "09", "Sept": "09", "Oct": "10", "Nov": "11", "Dec": "12" } quarters = { "01": "Q1", "02": "Q1", "03": "Q1", "04": "Q2", "05": "Q2", "06": "Q2", "07": "Q3", "08": "Q3", "09": "Q3", "10": "Q4", "11": "Q4", "12": "Q4" } def log_bulk(self): log.info(' store external_segment: %r', self.nresult) for xlsx_f in xlsx_files: # loop through each file if "domestic" in xlsx_f: perimeter = "domestic" else: perimeter = "international" provider_label = provider.get(perimeter) print('******************** processing Excel file:', xlsx_f) xl = pd.ExcelFile(tmp_dir + "/" + xlsx_f) xls = xl.parse() # Year_month based on the renamed file. List months of the quarter for the case of international files year = int(filter(str.isdigit, xlsx_f)[-4:]) if perimeter == "domestic": month = '%02d' % int(xlsx_f.split('_')[2].split('-')[0]) year_month = [str(year) + "-" + month] else: quarter = xlsx_f.split('_')[2].split('-')[0] year_month = [ str(year) + '-' + k for k, v in quarters.items() if v == quarter ] # Look for line with column names if perimeter == "domestic": header = np.where( xls.apply(lambda x: x.astype(str).str.upper().str.replace( " ", "")).loc[:, :] == "CITY1")[0] + 1 else: header = np.where( xls.apply(lambda x: x.astype(str).str.upper().str.replace( " ", "")).loc[:, :] == "CITY1")[0][0] + 1 xls = xl.parse(header=header) # Re-load file with headers xls = format_file(xls, perimeter) all_rows = len(xls.index) with External_Segment_Tmp.unordered_bulk( 1000, execute_callback=log_bulk) as bulk: for row in range( 0, len(xls) ): # loop through each row (origin, destination) in file full_row = xls.iloc[row] # Stop at the end of the table (indicated by "TOTAL") if pd.isnull( full_row['CITY 1']) or full_row['CITY 1'] == "CITY 1": continue if isinstance(full_row['ID'], str) and "".join( full_row['ID'].split(" ")).upper() == "TOTAL": break # Skip empty rows (no text in Origin column, or year Total = 0) if isinstance(full_row['PAX TO 2'], float) and full_row['PAX FROM 2'] == 0: continue airport1 = find_airports_by_name( unidecode(full_row['CITY 1']).upper(), perimeter) airport2 = find_airports_by_name( unidecode(full_row['CITY 2']).upper(), 'domestic') if airport1 is None: update_unknown_airports(full_row['CITY 1'], full_row['PAX TO 2'], full_row['PAX FROM 2']) continue if airport2 is None: update_unknown_airports(full_row['CITY 2'], full_row['PAX TO 2'], full_row['PAX FROM 2']) continue # First save data from city 1 to city 2 dic_to = dict(provider=provider_label, data_type='airport', airline=['*'], airline_ref_code=['*'], origin=sorted(airport1), destination=sorted(airport2), year_month=year_month, total_pax=int(full_row['PAX TO 2']), overlap=[], raw_rec=dict(full_row), both_ways=False, from_line=row, from_filename=xlsx_f, url=full_url) query = dict((k, dic_to[k]) for k in ('origin', 'destination', 'year_month', 'provider', 'data_type', 'airline')) bulk.find(query).upsert().update_one({ '$set': dic_to, '$setOnInsert': dict(inserted=now) }) # Then save data from 
city 2 to city 1 dic_from = dict(provider=provider_label, data_type='airport', airline=['*'], airline_ref_code=['*'], origin=sorted(airport2), destination=sorted(airport1), year_month=year_month, total_pax=int(full_row['PAX FROM 2']), overlap=[], raw_rec=dict(full_row), both_ways=False, from_line=row, from_filename=xlsx_f, url=full_url) query = dict((k, dic_from[k]) for k in ('origin', 'destination', 'year_month', 'provider', 'data_type', 'airline')) bulk.find(query).upsert().update_one({ '$set': dic_from, '$setOnInsert': dict(inserted=now) }) if row % 100 == 0: print('{0:.3g}'.format(float(row) / float(all_rows) * 100) + '%') log.info('stored: %r', bulk.nresult)
def update_routes(csv_file, year_months):
    """ Save new records in External_Segment collection """
    now = utcnow()

    def log_bulk(self):
        log.info(' store external_segment: %r', self.nresult)

    log.info('Updating db with contents of %s...', csv_file)
    xls = pd.read_csv(tmp_dir + '/' + csv_file, sep=',', skiprows=[0, 1, 2])
    new_columns = xls.columns.values
    new_columns[0] = 'irish_airport'
    new_columns[1] = 'way'
    new_columns[2] = 'other_airport'
    for i, col in enumerate(new_columns[3:len(new_columns)], 3):
        new_columns[i] = col.replace('M', '-')
    xls.columns = new_columns
    xls = xls.replace(' ', np.nan)
    available_year_months = new_columns[3:len(new_columns)].tolist()

    with External_Segment_Tmp.unordered_bulk(1000, execute_callback=log_bulk) as bulk:
        for row_index, row in xls.iterrows():
            if pd.notnull(row['irish_airport']):
                irish_airport = row['irish_airport']
            if pd.notnull(row['way']):
                way = row['way']
            if pd.isnull(row['other_airport']):
                continue
            else:
                other_airport = row['other_airport']
            if sum(row[available_year_months]) == 0:
                continue
            for ym in available_year_months:
                # Skip the year_months that are not requested
                if ym not in year_months:
                    continue
                pax = row[ym]
                if way == 1:
                    airport_origin = irish_airport
                    airport_destination = other_airport
                else:
                    airport_origin = other_airport
                    airport_destination = irish_airport
                if not check_airport(airport_origin, pax):
                    continue
                if not check_airport(airport_destination, pax):
                    continue
                dic = dict(provider=provider,
                           data_type='airport',
                           airline=['*'],
                           airline_ref_code=['*'],
                           total_pax=pax,
                           origin=[airport_origin],
                           destination=[airport_destination],
                           year_month=[ym],
                           overlap=[],
                           raw_rec=dict(row),
                           both_ways=False,
                           from_line=row_index,
                           from_filename=csv_file,
                           url=url)
                query = dict((k, dic[k]) for k in ('origin', 'destination', 'year_month',
                                                   'provider', 'data_type', 'airline'))
                bulk.find(query).upsert().update_one({'$set': dic,
                                                      '$setOnInsert': dict(inserted=now)})
            if row_index % 1000 == 0:
                print('{0:.3g}'.format(row_index / len(xls.index) * 100) + '%')

    log.info('stored: %r', bulk.nresult)