def __init__(self, name='MongoDataToCollection'):
        """Set up the MongoDataToCollection link with its default configuration.

        Only ``name`` is a constructor argument. The other settings listed below are
        instance attributes that are initialised here to their default values.

        :param str name: name of link
        :param str read_key: key of data to read from data store. Default is None.
        :param list store_collections: names of the mongo collections in which to store the data
        :param bool clearFirst: if True, the mongo store_collections are cleared before storage
        :param int minimal_input_size: storage is only performed if the data meets this minimal
            length (gt 0). Default is -1.
        :param list force_move_keys: keys to booleans in the datastore, OR-ed together, that
            overrule minimal_input_size
        :param bool clear_input: if True, input data is deleted from the datastore after storage in mongo
        :param bool fork: initialised to False (NOTE(review): purpose not documented here)
        :param bool wait_after_fork: initialised to False (NOTE(review): purpose not documented here)
        """
        Link.__init__(self, name)

        # what to read and where to store it
        self.read_key = None
        self.store_collections = []

        # storage behaviour
        self.clearFirst = False
        self.minimal_input_size = -1
        self.force_move_keys = []
        self.clear_input = False

        # forking behaviour
        self.fork = False
        self.wait_after_fork = False
    def __init__(self, name='MongoDfToCollection'):
        """Set up the MongoDfToCollection link with its default configuration.

        Only ``name`` is a constructor argument. The other settings listed below are
        instance attributes that are initialised here to their default values.

        :param str name: name of link
        :param str read_key: key of data to read from data store
        :param dict nestedFields: nested structure of columns to be used in mongo. If a string, it is
            the datastore key of the nested structure.
        :param list store_collections: names of the mongo collections in which to store the data
        :param bool clearFirst: if True, the mongo store_collections are cleared before storage
        :param dict columnsToAdd: columns to add to the pandas.DataFrame before storage
            (key = column name, value = column value)
        :param checkFieldsAPI: check compatibility with the data service. Field names reserved by the
            data service should not be used as column names in the dataframes. If they are used anyway,
            the columns must have the type defined by the data service; otherwise the data in this
            collection is silently not processed by the service.
        """
        # TODO - GOSSIE: use a smart structure with different levels to avoid collisions with reserved field names
        # (e.g. doc['record'] = record, doc['meta'] = reserved_fields_for_api)
        Link.__init__(self, name)

        # what to read and how to structure it
        self.read_key = None
        self.nestedFields = None

        # where and how to store it
        self.store_collections = []
        self.clearFirst = False
        self.columnsToAdd = None
        self.checkFieldsAPI = False

        # NOTE(review): these two attributes are not described in the docstring
        self.copy = True
        self.columns = []
예제 #3
0
    def __init__(self, **kwargs):
        """Initialize the JdbcReader link.

        :param str name: name of link. Default is 'JdbcReader'.
        :param str store_key: key under which to store the spark dataframe in the data store
        :param str schema: schema (not further described)
        :param str table: table (not further described)
        :param list queries: list of queries to perform
        :param list sel_cols: list of columns to select
        :param list col_ranges: list of column ranges to apply
        :param bool group_sel: group selection. Default is False.
        :param int max_recs: default is -1. NOTE(review): previously documented as a bool holding the
            "maximum number of retries"; the name suggests a maximum number of records - confirm.
        """
        # the link name is taken out of kwargs before the options are processed
        link_name = kwargs.pop('name', 'JdbcReader')
        Link.__init__(self, link_name)

        # each option is registered by its own call, in this order, with the given default
        option_defaults = (
            ('store_key', ''),
            ('schema', ''),
            ('table', ''),
            ('queries', []),
            ('sel_cols', []),
            ('group_sel', False),
            ('col_ranges', []),
            ('max_recs', -1),
        )
        for option, default in option_defaults:
            self._process_kwargs(kwargs, **{option: default})

        # check residual kwargs
        self.check_extra_kwargs(kwargs)
예제 #4
0
    def __init__(self, **kwargs):
        """Initialize the EventLooper link.

        :param str name: name of link. Default is 'EventLooper'.
        :param str filename: name of the file in which the strings are located (txt or similar).
            Default is None. (optional)
        :param str store_key: key to collect in datastore. If set, lines are collected. (optional)
        :param list line_processor_set: list of functions to apply to input lines. (optional)
        :param bool sort: if true, sort lines before storage. (optional)
        :param bool unique: if true, keep only unique lines before storage. (optional)
        :param list skip_line_beginning_with: list of strings; a line is skipped if it starts with any
            of them. Default is ['#']. (optional)
        """
        # the link name is taken out of kwargs before the options are processed
        link_name = kwargs.pop('name', 'EventLooper')
        Link.__init__(self, link_name)

        # Options with their default values. Each one is popped from kwargs and
        # added as an attribute of the link.
        defaults = {
            'filename': None,
            'store_key': None,
            'line_processor_set': [],
            'sort': False,
            'unique': False,
            'skip_line_beginning_with': ['#'],
        }
        self._process_kwargs(kwargs, **defaults)

        # check residual kwargs
        self.check_extra_kwargs(kwargs)

        # input file handle and line stream; both start out unset
        self._f = None
        self._linestream = None

        # flag for collecting lines for storage; off at construction
        self._collect = False
예제 #5
0
    def __init__(self, **kwargs):
        """Initialize the link that skips to the next Chain if any input dataset is empty.

        :param str name: name of link. Default is 'SkipChainIfCollectionEmpty'.
        :param list collectionSet: datastore keys holding the datasets to be checked. If any of these
            is empty, the chain is skipped.
        :param filterDict: filter to be applied on a mongo dataset (optional)
        :param bool skip_chain_when_key_not_in_ds: skip the chain as well if the dataframe is not present
            in the datastore. When True and the type is 'pandas.DataFrame', a SkipChain signal is sent
            if the key is not in the DataStore. Default is False.
        :param bool checkAtInitialize: perform the dataset-is-empty check at initialize. Default is True.
        :param bool checkAtExecute: perform the dataset-is-empty check at execute. Default is False.
        """
        link_name = kwargs.pop('name', 'SkipChainIfCollectionEmpty')
        Link.__init__(self, link_name)

        # options with their default values
        defaults = {
            'collectionSet': [],
            'filterDict': {},
            'skip_chain_when_key_not_in_ds': False,
            'checkAtInitialize': True,
            'checkAtExecute': False,
        }
        self._process_kwargs(kwargs, **defaults)

        # check residual kwargs
        self.check_extra_kwargs(kwargs)
예제 #6
0
    def __init__(self, name='MongoCollectionToDF'):
        """Set up the MongoCollectionToDF link with its default configuration.

        Only ``name`` is a constructor argument. The other settings listed below are
        instance attributes that are initialised here to their default values.

        :param str name: name of link
        :param str store_key: key of data to store in data store
        :param str collection: name of the mongo collection
        :param dict/str filter: filter applied on the mongo query (optional). If a string, it is the
            datastore key of the filter.
        :param dict/list columns: columns to retrieve from mongo. If a string, it is the datastore key
            of the columns.
        :param dict columnsToAdd: columns to add to the pandas.DataFrame
            (key = column name, value = column value)
        :param bool store_if_empty: if True and the data retrieved from mongo is empty, an empty
            pandas.DataFrame is stored. Default is False.
        :param bool flatten_json: if True, the flatten_json method is applied on the data retrieved
            from mongo. Default is True.
        :param bool flatten_json_addprefix: if True, a prefix is added when flattening. Default is False.
        :param bool storeNestedStructure: if True, the nesting structure is stored in the datastore as a
            dict with key <self.collection>_nestedstructure. Default is True.
        """
        Link.__init__(self, name)

        # source collection and destination key
        self.collection = None
        self.store_key = None

        # query settings
        self.filter = None
        self.columns = None
        self.columnsToAdd = None

        # post-processing switches
        self.store_if_empty = False
        self.flatten_json = True
        self.flatten_json_addprefix = False
        self.storeNestedStructure = True
예제 #7
0
    def __init__(self, *args, **kwargs):
        """Initialize the ApplyFunc link.

        :param str name: name of link. Default is 'ApplyFunc'.
        :param str read_key: key of input data to read from data store
        :param default: the default value of the key in case it is not found
        :param assert_type: if set, check the object for the given type or tuple of types.
            If the check fails, raise TypeError.
        :param bool assert_len: if true, check that the object has length greater than 0.
            If the check fails, raise TypeError or AssertionError. Default is False.
        :param bool assert_in: assert that the key is known. Default is True.
        :param func: function to execute
        :param args: all positional args are passed on to the function as args
        :param kwargs: all other keyword arguments are passed on to the function as kwargs
        :param str store_key: key of output data to store in data store
        """
        # the link name is taken out of kwargs before the options are processed
        link_name = kwargs.pop('name', 'ApplyFunc')
        Link.__init__(self, link_name)

        # Options of the link itself, with their default values. These are popped
        # from kwargs and added as attributes of the link.
        defaults = {
            'read_key': '',
            'default': None,
            'assert_type': None,
            'assert_len': False,
            'assert_in': True,
            'func': None,
            'store_key': '',
        }
        self._process_kwargs(kwargs, **defaults)

        # Whatever is left is kept for the function. Deep copies are stored
        # rather than the caller's own objects.
        self.args = copy.deepcopy(args)
        self.kwargs = copy.deepcopy(kwargs)
예제 #8
0
    def __init__(self, name='MongoOverview'):
        """Set up the MongoOverview link.

        This link has no settings besides its name.

        :param str name: name given to the link
        """
        Link.__init__(self, name)
예제 #9
0
    def __init__(self, **kwargs):
        """Initialize the IPythonEmbed link.

        :param str name: name of link. Default is 'IPythonEmbed'.
        """
        # the link name is the only option; take it out of kwargs
        link_name = kwargs.pop('name', 'IPythonEmbed')
        Link.__init__(self, link_name)

        # check residual kwargs; exit if any present
        self.check_extra_kwargs(kwargs)
    def __init__(self, name='MongoCheckCollection'):
        """Set up the MongoCheckCollection link with its default configuration.

        Only ``name`` is a constructor argument; ``collectionSet`` is an instance
        attribute that is initialised here to an empty list.

        :param str name: name of link
        :param list collectionSet: the collections to be checked
        """
        Link.__init__(self, name)

        # nothing to check until collections are configured
        self.collectionSet = []
예제 #11
0
    def __init__(self, name='MongoDeleteManyFromDF'):
        """Set up the MongoDeleteManyFromDF link with its default configuration.

        Only ``name`` is a constructor argument; ``read_key`` is an instance
        attribute that is initialised here to None.

        :param str name: name of link
        :param str read_key: key of data to read from data store
        """
        Link.__init__(self, name)

        # no input key configured yet
        self.read_key = None
예제 #12
0
    def __init__(self, **kwargs):
        """Initialize the Break link.

        :param str name: name of link. Default is 'Break'.
        :param bool send_break: if true, send StatusCode.BreakChain (skip execution of the rest of the
            chain). Default is false, in which case StatusCode.Failure is sent (exit program).
        """
        # the link name is taken out of kwargs before the options are processed
        link_name = kwargs.pop('name', 'Break')
        Link.__init__(self, link_name)

        # register the single option with its default value
        self._process_kwargs(kwargs, send_break=False)

        # check residual kwargs; exit if any present
        self.check_extra_kwargs(kwargs)
예제 #13
0
    def __init__(self, **kwargs):
        """Initialize the AssertInDs link and store its configuration.

        :param str name: name of link. Default is 'AssertInDs'.
        :param list keySet: list of keys to check
        """
        link_name = kwargs.pop('name', 'AssertInDs')
        Link.__init__(self, link_name)

        # keySet is popped from kwargs and added as an attribute of the link;
        # the value given here is its default
        self._process_kwargs(kwargs, keySet=[])

        # check residual kwargs
        self.check_extra_kwargs(kwargs)
예제 #14
0
    def __init__(self, **kwargs):
        """Initialize the ForkExample link.

        :param str name: name of link. Default is 'ForkExample'.
        :param str store_key: key of object to store in data store. Default is 'forkdatacollector'.
        """
        # the link name is taken out of kwargs before the options are processed
        link_name = kwargs.pop('name', 'ForkExample')
        Link.__init__(self, link_name)

        # store_key is popped from kwargs and added as an attribute of the link
        self._process_kwargs(kwargs, store_key='forkdatacollector')

        # check residual kwargs; exit if any present
        self.check_extra_kwargs(kwargs)
예제 #15
0
    def __init__(self, **kwargs):
        """Initialize the PrintDs link.

        :param str name: name of link. Default is 'PrintDs'.
        :param list keys: keys of items to print explicitly. Default is an empty list.
        """
        # the link name is taken out of kwargs before the options are processed
        link_name = kwargs.pop('name', 'PrintDs')
        Link.__init__(self, link_name)

        # keys is popped from kwargs and added as an attribute of the link;
        # the value given here is its default
        self._process_kwargs(kwargs, keys=[])

        # check residual kwargs; exit if any present
        self.check_extra_kwargs(kwargs)
예제 #16
0
    def __init__(self, **kwargs):
        """Initialize the DsApply link.

        :param str name: name of link. Default is 'DsApply'.
        :param list apply: list of functions to execute at execute(), to which the datastore is passed
        """
        # the link name is taken out of kwargs before the options are processed
        link_name = kwargs.pop('name', 'DsApply')
        Link.__init__(self, link_name)

        # apply is popped from kwargs and added as an attribute of the link;
        # the value given here is its default
        self._process_kwargs(kwargs, apply=[])

        # check residual kwargs; exit if any present
        self.check_extra_kwargs(kwargs)
예제 #17
0
    def __init__(self, name='MongoEmptyTheCollection'):
        """Set up the MongoEmptyTheCollection link with its default configuration.

        Only ``name`` is a constructor argument. The other settings listed below are
        instance attributes that are initialised here to their default values.

        :param str name: name of link
        :param list collectionSet: names of the mongo collections to remove data from
        :param dict filterDict: pymongo filter for the data to be removed. If it is an empty dict,
            all data in the collection(s) is removed.
        """
        Link.__init__(self, name)

        # no collections selected; the empty filter is the default
        self.collectionSet = []
        self.filterDict = {}
예제 #18
0
    def __init__(self, **kwargs):
        """Set up the configuration of the LinePrinter link.

        :param str name: name of link. Default is 'LinePrinter'.
        :param str read_key: key of input data to read from data store. Default is None.
        """
        # the link name is taken out of kwargs before the options are processed
        link_name = kwargs.pop('name', 'LinePrinter')
        Link.__init__(self, link_name)

        # read_key is popped from kwargs and added as an attribute of the link;
        # the value given here is its default
        self._process_kwargs(kwargs, read_key=None)

        # check residual kwargs; exit if any present
        self.check_extra_kwargs(kwargs)
예제 #19
0
    def __init__(self, **kwargs):
        """Initialize the MongoRetrieveLastAdded link.

        :param str name: name of link. Default is 'MongoRetrieveLastAdded'.
        :param str collection: default is ''. NOTE(review): this option was not described before
            (a ``read_key`` was documented instead, which is not registered here); presumably the
            name of the mongo collection to read from - confirm.
        :param str store_key: key of output data to store in data store. Default is ''.
        """
        # the link name is taken out of kwargs before the options are processed
        link_name = kwargs.pop('name', 'MongoRetrieveLastAdded')
        Link.__init__(self, link_name)

        # options with their default values; each is popped from kwargs and
        # added as an attribute of the link
        defaults = {'collection': '', 'store_key': ''}
        self._process_kwargs(kwargs, **defaults)

        # check residual kwargs; exit if any present
        self.check_extra_kwargs(kwargs)
예제 #20
0
    def __init__(self, **kwargs):
        """Store the configuration of the HelloWorld link.

        :param str name: name assigned to the link. Default is 'HelloWorld'.
        :param str hello: name to print in Hello World! Default is 'World'.
        :param int repeat: repeat the print statement N times. Default is 1.
        """
        link_name = kwargs.pop('name', 'HelloWorld')
        Link.__init__(self, link_name)

        # options with their default values
        defaults = {'hello': 'World', 'repeat': 1}
        self._process_kwargs(kwargs, **defaults)

        # check residual kwargs.
        # (turn this line off if you wish to keep these to pass on.)
        self.check_extra_kwargs(kwargs)
예제 #21
0
    def __init__(self, **kwargs):
        """Initialize the link that signals the processManager to repeat the current chain.

        Sends a RepeatChain deenums.StatusCode signal.

        :param str name: name of link. Default is 'RepeatChain'.
        :param list listen_to: repeat this chain if the given key is present in ConfigObject and set to
            true. E.g. this key is set by the readtods link when looping over files.
        :param int maxcount: repeat this chain until the max count has been reached.
            Default is -1 (off).
        """
        link_name = kwargs.pop('name', 'RepeatChain')
        Link.__init__(self, link_name)

        # options with their default values
        defaults = {'maxcount': -1, 'listen_to': []}
        self._process_kwargs(kwargs, **defaults)

        # check residual kwargs
        self.check_extra_kwargs(kwargs)

        # internal counter, starting at zero
        self._counter = 0
예제 #22
0
    def __init__(self, **kwargs):
        """Initialize an instance of the datastore importer link.

        :param str name: name of link. Default is 'ImportDataStore'.
        :param str path: path of the datastore pickle file to import
        :param bool update: if true, update the existing datastore instead of replacing it.
            Default is false.
        :param bool import_at_initialize: if false, perform the datastore import at execute.
            Default is true (import at initialize).
        """
        # the link name is taken out of kwargs before the options are processed
        link_name = kwargs.pop('name', 'ImportDataStore')
        Link.__init__(self, link_name)

        # options with their default values; each is popped from kwargs and
        # added as an attribute of the link
        defaults = {
            'path': '',
            'update': False,
            'import_at_initialize': True,
        }
        self._process_kwargs(kwargs, **defaults)

        # check residual kwargs; exit if any present
        self.check_extra_kwargs(kwargs)
예제 #23
0
    def __init__(self, **kwargs):
        """Initialize the MongoDocToCollection link.

        :param str name: name of link. Default is 'MongoDocToCollection'.
        :param str read_key: key of input data to read from data store
        :param list store_collections: mongo collections to store the doc in
        :param bool clear_first: if true, clear the collections before storage. Default is false.
        """
        # the link name is taken out of kwargs before the options are processed
        link_name = kwargs.pop('name', 'MongoDocToCollection')
        Link.__init__(self, link_name)

        # options with their default values; each is popped from kwargs and
        # added as an attribute of the link
        defaults = {
            'read_key': '',
            'store_collections': [],
            'clear_first': False,
        }
        self._process_kwargs(kwargs, **defaults)

        # check residual kwargs; exit if any present
        self.check_extra_kwargs(kwargs)
예제 #24
0
    def __init__(self, **kwargs):
        """Initialize the SkipChainIfEmpty link.

        :param str name: name of link. Default is 'SkipChainIfEmpty'.
        :param list collection_set: datastore keys holding the datasets to be checked. If any of these
            is empty, the chain is skipped.
        :param bool skip_missing: skip the chain if the dataframe is not present in the datastore.
            Default is True.
        :param bool skip_zero_len: skip the chain if the object is found in the datastore but has zero
            length. Default is True.
        :param bool check_at_initialize: perform the dataset-is-empty check at initialize.
            Default is True.
        :param bool check_at_execute: default is False. NOTE(review): not described before; presumably
            performs the dataset-is-empty check at execute - confirm.
        """
        link_name = kwargs.pop('name', 'SkipChainIfEmpty')
        Link.__init__(self, link_name)

        # options with their default values
        defaults = {
            'collection_set': [],
            'skip_missing': True,
            'skip_zero_len': True,
            'check_at_initialize': True,
            'check_at_execute': False,
        }
        self._process_kwargs(kwargs, **defaults)

        # check residual kwargs
        self.check_extra_kwargs(kwargs)
예제 #25
0
    def __init__(self, name='MongoDFToIDFilter'):
        """Set up the MongoDFToIDFilter link with its default configuration.

        Only ``name`` is a constructor argument. The other settings listed below are
        instance attributes that are initialised here to their default values.

        :param str name: name of link
        :param str read_key: key of data to read from data store
        :param str column: a column name of the pandas.DataFrame
        :param str mongoid: mongo id. Default is '_id'.
        :param str store_key: key of data to store in data store (datastore key of the pymongo filter)
        :param dict mergewithfilter: pymongo filter to be added
        """
        Link.__init__(self, name)

        # input: the dataframe key and the column to use
        self.read_key = None
        self.column = ''

        # output: the mongo id field, destination key and the filter to merge with
        self.mongoid = '_id'
        self.store_key = ''
        self.mergewithfilter = {}
    def __init__(self, **kwargs):
        """Initialize the ForkDataCollector link.

        :param str name: name of link. Default is 'ForkDataCollector'.
        :param list keys: functions to apply, given as a list of dicts with the entries:

          - 'key_ds' (string): input key in datastore
          - 'key_fs' (string, optional): output key in forkstore
          - 'func': function to apply, optional
          - 'append': if key_ds points to a list, append each item to the list in forkstore.
            Default is True.
          - 'args' (tuple, optional): args for 'func'
          - 'kwargs' (dict, optional): kwargs for 'func'
        """
        # the link name is taken out of kwargs before the options are processed
        link_name = kwargs.pop('name', 'ForkDataCollector')
        Link.__init__(self, link_name)

        # keys is popped from kwargs and added as an attribute of the link;
        # the value given here is its default
        self._process_kwargs(kwargs, keys=[])

        # check residual kwargs; exit if any present
        self.check_extra_kwargs(kwargs)
예제 #27
0
    def __init__(self, **kwargs):
        """Initialize the DsToDs link.

        :param str name: name of link. Default is 'DsToDs'.
        :param str read_key: key of data to read from data store
        :param str store_key: key of data to store in data store
        :param bool move: move the read_key item to store_key. Default is true.
        :param bool copy: if True, the read_key key-value pair will not be deleted. Default is false.
        :param bool remove: if True, the item corresponding to read_key will be deleted.
            Default is false.
        """
        link_name = kwargs.pop('name', 'DsToDs')
        Link.__init__(self, link_name)

        # options with their default values
        defaults = {
            'read_key': '',
            'store_key': '',
            'move': True,
            'copy': False,
            'remove': False,
        }
        self._process_kwargs(kwargs, **defaults)

        # check residual kwargs
        self.check_extra_kwargs(kwargs)
예제 #28
0
    def __init__(self, name='mongoCollectionMover'):
        """Set up the MongoMoveCollection link with its default configuration.

        Only ``name`` is a constructor argument. The other settings listed below are
        instance attributes that are initialised here to their default values.

        :param str name: name of link. Default is 'mongoCollectionMover'.
        :param str source_collection: mongo collection name of the source collection
        :param list target_collections: mongo collection name(s) of the target collection(s)
        :param dict columnsToAdd: columns to add to the pandas.DataFrame before storage
            (key = column name, value = column value)
        :param dict filter: pymongo filter for the query on the source collection
        :param bool copy: if True, data in the source collection will not be removed.
            Default is False.
        """
        Link.__init__(self, name)

        # where the data comes from and where it goes
        self.source_collection = None
        self.target_collections = []

        # what is added or filtered along the way
        self.columnsToAdd = None
        self.filter = None

        # whether the source data is kept
        self.copy = False
예제 #29
0
    def __init__(self, **kwargs):
        """Initialize the link that stores one external object in the DataStore dict at run time.

        :param str name: name of link. Default is 'ToDsDict'.
        :param str store_key: key of object to store in data store
        :param obj: object to store
        :param bool force: overwrite if already present in datastore. Default is false. (optional)
        :param bool at_initialize: store at initialize of link. Default is false.
        :param bool at_execute: store at execute of link. Default is true.
        :param bool copydict: if true and obj is a dict, copy all key-value pairs into the datastore.
            Default is false.
        """
        link_name = kwargs.pop('name', 'ToDsDict')
        Link.__init__(self, link_name)

        # options with their default values
        defaults = {
            'store_key': None,
            'obj': None,
            'at_initialize': False,
            'at_execute': True,
            'force': False,
            'copydict': False,
        }
        self._process_kwargs(kwargs, **defaults)

        # check residual kwargs
        self.check_extra_kwargs(kwargs)
예제 #30
0
    def __init__(self, **kwargs):
        """Initialize the MongoCursor2Df link.

        :param str name: name of link. Default is 'MongoCursor2Df'.
        :param str collection: name of collection to find in mongo db
        :param str query: query to pass to the mongo find command
        :param int skip: number of records to skip in query. Default is None.
        :param int limit: limit on the number of records in query. Default is None.
        :param str use_cols: columns to request from the mongo collection in query
        :param int chuck_size: chunk size with which to loop over the mongo cursor. Default is 10000.
            NOTE(review): the option really is spelled ``chuck_size``, not ``chunk_size``.
        :param str store_key: key of output data to store in data store
        :param bool skip_empty: skip chain if the (last) dataframe is empty. Default is true.
        :param int n_chunks_in_fork: number of chunks per fork. Default is 1.
        """
        # the link name is taken out of kwargs before the options are processed
        link_name = kwargs.pop('name', 'MongoCursor2Df')
        Link.__init__(self, link_name)

        # Options with their default values. All of them are popped from kwargs
        # and added as attributes of the link.
        defaults = {
            'collection': None,
            'query': None,
            'skip': None,
            'limit': None,
            'use_cols': None,
            'chuck_size': 10000,
            'store_key': None,
            'skip_empty': True,
            'n_chunks_in_fork': 1,
        }
        self._process_kwargs(kwargs, **defaults)

        # check residual kwargs; exit if any present
        self.check_extra_kwargs(kwargs)

        # internal state: no reader yet, data-length bookkeeping starts at zero
        self._reader = None
        self._latest_data_length = 0
        self._sum_data_length = 0