def process_file(file_path, parser, handle_package, show_progress=False):
    """
    Process a single file with specified plugin.
    """
    if not os.path.exists(file_path):
        raise Exception("Could not find file '{0}'".format(file_path))

    _directory, filename = os.path.split(file_path)

    with open(file_path) as data_file:
        stop_event = threading.Event()

        condition = compose(not_, stop_event.is_set)

        if show_progress:
            start_progress_reporter(data_file, condition)

        try:
            for package in parser.packages(data_file, filename):
                handle_package(package)
        except DataError as exc:
            raise ParseError("{0!s} at position {1:d}".format(
                exc, data_file.tell()))
        except Exception:
            stack_trace = traceback.format_exc()
            position = data_file.tell()
            message = "{0} at position {1:d}".format(stack_trace, position)
            raise Exception(message)
        finally:
            stop_event.set()
def load_csv(profile, csv_file):
    """
    Return tuple (column_names, data_rows).

    column_names - a list with selected column names
    data_rows - an iterator over row tuples (dn, timestamp, values)
    """
    csv_reader = csv.reader(csv_file, dialect=profile.dialect(csv_file))

    header = next(csv_reader)

    fields = profile.field_selector(header)

    values = ValuesExtractor(fields)

    check_header(header, fields)

    header_checks = [
        check
        for check in [
            profile.timestamp.header_check(),
            profile.identifier.header_check()
        ]
        if check is not None
    ]

    for check in header_checks:
        check(header)

    record_checks = [
        check
        for check in [
            profile.timestamp.record_check(),
            profile.identifier.record_check()
        ]
        if check is not None
    ]

    include_record = partial(record_passes_checks, record_checks)

    include_row = create_row_check(header)

    records = filter(
        include_record,
        (
            dict(zip(header, row))
            for line_nr, row in enumerate(csv_reader)
            if include_row(line_nr, row)
        )
    )

    extract_raw_data_row = compose(
        tuple,
        raw_data_row_extractor(
            profile.identifier.from_record,
            profile.timestamp.from_record,
            values.from_record
        )
    )

    return fields, map(extract_raw_data_row, records)
def execute(self):
    datasource_name = self.description["datasource"]

    try:
        datasource = get_datasource(self.minerva_context.writer_conn,
                                    datasource_name)
    except NoSuchDataSourceError:
        raise HarvestError(
            "no datasource with name '{}'".format(datasource_name))

    parser_config = self.description.get("parser_config", {})
    uri = self.description["uri"]
    update_existence = parser_config.get("update_existence", None)
    datatype = self.description["datatype"]

    try:
        plugin = self.plugins[datatype]
    except KeyError:
        raise HarvestError(
            "could not load parser plugin '{}'".format(datatype))

    storagetype = plugin.storagetype()

    try:
        storage_provider = self.minerva_context.storage_providers[storagetype]
    except KeyError:
        raise HarvestError(
            "could not load '{}' storage provider plugin".format(storagetype))

    dispatch_raw_datapackage = partial(storage_provider.store_raw, datasource)

    if update_existence:
        dispatch_raw_datapackage = partial(
            dispatch_raw_and_mark_existing, dispatch_raw_datapackage,
            update_existence, self.existence.mark_existing)

    dispatch_raw = compose(dispatch_raw_datapackage,
                           storage_provider.RawDataPackage)

    parser = plugin.create_parser(dispatch_raw, parser_config)

    encoding = self.description.get("encoding", "utf-8")

    datastream = open_uri(uri, encoding)

    logging.debug("opened uri '{}'".format(uri))

    try:
        parser.parse(datastream, os.path.basename(uri))
    except Exception:
        stacktrace = traceback.format_exc()

        execute_action(uri, self.description.get("on_failure", DEFAULT_ACTION))

        raise JobError(stacktrace)
    else:
        execute_action(uri, self.description.get("on_success", DEFAULT_ACTION))

    if update_existence:
        self.existence.flush(datetime.now())
def load_csv(profile, csv_file):
    """
    Return tuple (column_names, data_rows).

    column_names - a list with selected column names
    data_rows - an iterator over row tuples (dn, timestamp, values)
    """
    csv_reader = create_csv_reader(profile, csv_file)

    header = next(csv_reader)

    fields = profile.field_selector(header)

    values = ValuesExtractor(fields)

    check_header(header, fields)

    header_checks = [
        check
        for check in [
            profile.timestamp.header_check(),
            profile.identifier.header_check()
        ]
        if check is not None
    ]

    for check in header_checks:
        check(header)

    record_checks = [
        check
        for check in [
            profile.timestamp.record_check(),
            profile.identifier.record_check()
        ]
        if check is not None
    ]

    include_record = partial(record_passes_checks, record_checks)

    include_row = create_row_check(header)

    records = filter(
        include_record,
        (
            dict(zip(header, [item.decode('utf-8') for item in row]))
            for line_nr, row in enumerate(csv_reader)
            if profile.ignore_field_mismatches or include_row(line_nr, row)
        )
    )

    extract_raw_data_row = compose(
        tuple,
        raw_data_row_extractor(
            profile.identifier.from_record,
            profile.timestamp.from_record,
            values.from_record
        )
    )

    return fields, map(extract_raw_data_row, records)
def _connect(self):
    handler_map = {
        psycopg2.OperationalError: lambda exc: logging.error(
            "could not connect to database ({}), waiting".format(exc))
    }

    retry_condition = compose(not_, self.stop_event.is_set)

    return retry_while(self.connect_fn, handler_map, retry_condition)
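# Note: retry_while is not shown in this fragment. Based on its use above it
# is assumed to call self.connect_fn repeatedly while retry_condition() stays
# true, looking up raised exceptions in handler_map, so that an
# OperationalError is logged and the connection attempt retried rather than
# propagated.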
def __init__(self, template, regex):
    self.template = template
    self.regex = regex
    self.fields = re.findall("{([^}]+)}", template)

    # composed identifier (e.g. '{fld1}-{fld2}', '{fld1}:{fld2}')
    get_identifier = expand_kwargs(template.format)
    extract_ident = partial(extract_identifier, regex)
    self.record_to_dn = compose(extract_ident, get_identifier)
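# Hypothetical usage of the pipeline above. expand_kwargs and
# extract_identifier are not defined in this fragment; they are assumed to
# behave as their names suggest: expand_kwargs(f)(record) calls f(**record),
# and extract_identifier(regex, text) pulls the part of the formatted string
# matched by regex. With template '{fld1}-{fld2}', a record such as
# {'fld1': 'Cell', 'fld2': '42'} would first be formatted to 'Cell-42' and
# then reduced to the identifier matched by regex.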
def execute(self, cursor, state):
    partition = self.partition(state)
    datapackage = self.datapackage(state)

    try:
        try:
            store_batch_insert(cursor, partition.table(), datapackage,
                               state["modified"])
        except Exception as exc:
            logging.debug("exception: {}".format(type(exc).__name__))
            raise
    except NoSuchTable:
        data_types = compose(DataPackage.deduce_data_types, self.datapackage)
        trend_names = compose(attrgetter("trend_names"), self.datapackage)
        fix = CreatePartition(self.partition, trend_names, data_types)

        return insert_before(fix)
    except NoSuchColumnError:
        data_types = compose(DataPackage.deduce_data_types, self.datapackage)
        trend_names = compose(attrgetter("trend_names"), self.datapackage)
        fix = CheckColumnsExist(self.partition, trend_names, data_types)

        return insert_before(fix)
    except UniqueViolation:
        fix = Update(self.partition, self.datapackage)

        return replace(fix)
    except DataTypeMismatch:
        data_types = compose(DataPackage.deduce_data_types, self.datapackage)
        trend_names = compose(attrgetter("trend_names"), self.datapackage)
        fix = CheckColumnTypes(self.partition, trend_names, data_types)

        return insert_before(fix)
def execute(self, cursor, state):
    partition = self.partition(state)
    datapackage = self.datapackage(state)

    try:
        store_copy_from(cursor, partition.table(), datapackage,
                        state["modified"])
    except NoCopyInProgress:
        return no_op
    except NoSuchTable:
        data_types = compose(DataPackage.deduce_data_types, self.datapackage)
        trend_names = compose(attrgetter("trend_names"), self.datapackage)
        fix = CreatePartition(self.partition, trend_names, data_types)

        return insert_before(fix)
    except NoSuchColumnError:
        data_types = compose(DataPackage.deduce_data_types, self.datapackage)
        trend_names = compose(attrgetter("trend_names"), self.datapackage)
        fix = CheckColumnsExist(self.partition, trend_names, data_types)

        return insert_before(fix)
    except UniqueViolation:
        fix = Update(self.partition, self.datapackage)

        return replace(fix)
    except DataTypeMismatch:
        data_types = compose(DataPackage.deduce_data_types, self.datapackage)
        trend_names = compose(attrgetter("trend_names"), self.datapackage)
        fix = CheckColumnTypes(self.partition, trend_names, data_types)

        return insert_before(fix)
def test_compose():
    composed = compose(add_one, times_two, add_one, add_one)

    assert_equal(composed(1), 7)
    }

    @classmethod
    def from_dict(cls, d):
        """Return DataPackage constructed from the dictionary."""
        return cls(
            attribute_names=d["attribute_names"],
            rows=d["rows"]
        )


snd = itemgetter(1)

types_from_values = partial(map, datatype.deduce_from_value)

row_to_types = compose(types_from_values, itemgetter(2))


def create_copy_from_line(data_types, row):
    """Return line compatible with COPY FROM command."""
    entity_id, timestamp, attributes = row

    value_mappers = map(value_mapper_by_type.get, data_types)

    values = chain(
        (str(entity_id), str(timestamp)),
        zipapply(value_mappers, attributes)
    )

    return "\t".join(values) + "\n"
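# Illustrative example only (exact value formatting depends on
# value_mapper_by_type, which is not shown here): assuming mappers that
# simply stringify their values, a row like
# (33, "2013-01-01 12:00:00+00", ("a", 1)) would produce the tab-separated
# line "33\t2013-01-01 12:00:00+00\ta\t1\n", ready to feed to COPY FROM.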
from minerva.node import MinervaContext
from minerva_transform.types import Transformation

from minerva_db import reset_db, with_connection, \
    get_dummy_datasource, get_dummy_entitytype, TIMEZONE, add_function_set, \
    add_function_mapping, render_result

from util import render_datapackage

from minerva.storage.trend.store import CopyFrom
from minerva.storage.trend.types_v4 import DataPackage, TrendStore3
from minerva.storage.trend.granularity import create_granularity

tzinfo = pytz.timezone(TIMEZONE)
local_timestamp = compose(tzinfo.localize, datetime)

src_timestamp_1 = local_timestamp(2012, 12, 11, 13, 15, 0)
src_timestamp_2 = local_timestamp(2012, 12, 11, 13, 30, 0)
src_timestamp_3 = local_timestamp(2012, 12, 11, 13, 45, 0)
src_timestamp_4 = local_timestamp(2012, 12, 11, 14, 0, 0)

modified_a = local_timestamp(2012, 12, 11, 14, 3, 27)
modified_b = local_timestamp(2012, 12, 11, 14, 7, 14)

dest_timestamp = local_timestamp(2012, 12, 11, 14, 0, 0)

granularity = create_granularity("900")

trend_names = ("counter_a", "counter_b")

source_1_1 = DataPackage(granularity, src_timestamp_1, trend_names,
                         [(1000, (4, 0))])
def record_check(self):
    return compose(operator.not_, any_field_empty(self.fields))
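# The check returned above is the negation of any_field_empty(self.fields):
# applied to a record (a field -> value mapping), it is assumed to yield True
# only when none of the configured identifier fields are empty, which is how
# record_passes_checks applies it to each record in load_csv.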
with closing(conn.cursor()) as cursor:
    column_names = transformation.function_set.get_dest_columns(cursor)

    rows = [
        (row[0], ([transformation.function_set.id],) + row[2:])
        for row in transformed_rows if row[0]
    ]

    plugin = get_plugin("trend")(conn, api_version=4)

    datapackage = plugin.DataPackage(
        transformation.function_set.dest_trendstore.granularity,
        transformation.dest_timestamp, column_names, rows)

    return plugin.store_txn(
        transformation.function_set.dest_trendstore, datapackage)


row_has_entity_id = compose(truth, head)


def function_set_from_row(cursor, row):
    id, name, description, mapping_signature, source_datasource_ids, \
        source_entitytype_id, source_granularity_str, dest_datasource_id, \
        dest_entitytype_id, dest_granularity_str, filter_sub_query, group_by, \
        relation_type_id, enabled = row

    get_datasource = partial(get_datasource_by_id, cursor)
    get_entitytype = partial(get_entitytype_by_id, cursor)

    source_granularity = create_granularity(str(source_granularity_str))
    dest_granularity = create_granularity(str(dest_granularity_str))

    source_datasources = map(get_datasource, source_datasource_ids)
def test_compose_pair():
    composed = compose(times_two, add_one)

    assert_equal(composed(2), 6)
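# A minimal sketch of a right-to-left compose, consistent with the two tests
# above: compose(f, g, h)(x) == f(g(h(x))). This is an illustration only, not
# the library's actual implementation, hence the distinct name.
def compose_sketch(*functions):
    def composed(*args, **kwargs):
        fns = list(functions)
        # The right-most function receives the original arguments ...
        result = fns.pop()(*args, **kwargs)
        # ... and the remaining functions are applied from right to left.
        for fn in reversed(fns):
            result = fn(result)
        return result
    return composed


# compose_sketch(add_one, times_two, add_one, add_one)(1) == 7
# compose_sketch(times_two, add_one)(2) == 6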