def get_doc_attachments(
        conn: ConnType,
        attachment: int,
        prefix: _PathLike = "",
        itemIDs: _Optional[_Sequence[str]] = None,
        **kwargs: int):
    """Returns the paths to the attachments to all documents.

    Parameters
    ----------
    conn : sqlalchemy.engine.Connection
        The connection object to the database.
    attachment : int
        The ID of the item type *attachment*.
    prefix : str, pathlib.Path, or path-like
        The path prefix to prepend.
    itemIDs : list-like of str or None
        The itemID of interest. If None, consider all items. The values are
        joined into the SQL text as-is, so they must come from a trusted
        source.
    **kwargs : int
        Not used. Just to conform the signature with other similar functions.

    Returns
    -------
    pandas.DataFrame
        A frame with the single column "attachment path", indexed by
        "itemID" (the parent item of the attachment). A cell holds one path
        when the item has exactly one attachment and a list of paths
        otherwise. Every path is ``prefix / <attachment key> / <stored path
        with "storage:" removed>``. Rows whose stored path is NULL are
        dropped. When nothing matches, an empty frame with the same column
        is returned.
    """
    # pylint: disable=unused-argument

    query = """
        SELECT itemAttachments.parentItemID, items.key, itemAttachments.path
        FROM items, itemAttachments
        WHERE
            items.itemTypeID = {attachment} AND
            itemAttachments.itemID = items.itemID
    """.format(attachment=attachment)

    if itemIDs is not None:
        # Narrow the query by prepending an extra condition to the WHERE.
        query = _sub(
            "WHERE",
            "WHERE itemAttachments.parentitemID in ({}) AND".format(
                ", ".join(itemIDs)),
            query)

    results: pandas.DataFrame = pandas.read_sql_query(query, conn)
    results = results.rename(columns={"parentItemID": "itemID"})
    # `axis` must be given by keyword: pandas >= 2.0 made the arguments of
    # dropna keyword-only.
    results = results.set_index("itemID").dropna(axis=0, subset=["path"])

    if results.empty:
        # DataFrame.apply(axis=1) on an empty frame yields a DataFrame, not a
        # Series, which would break the pipeline below.
        return pandas.DataFrame(
            columns=["attachment path"], index=results.index, dtype=object)

    prefix = _Path(prefix)
    results["key"] = results["key"].map(prefix.joinpath)
    results["path"] = results["path"].str.replace(
        "storage:", "", regex=False)

    # One full path per attachment row.
    paths: pandas.Series = results.apply(
        lambda row: row["key"].joinpath(row["path"]), axis=1)

    # Collect all attachments of the same parent item into one list ...
    collected: pandas.Series = paths.groupby(level=0).aggregate(
        lambda group: group.values.tolist())
    # ... and unwrap the list when there is only a single attachment.
    collected = collected.apply(
        lambda found: found[0] if len(found) == 1 else found)

    return collected.to_frame("attachment path")
def _create_map_job(function, parameter_set, name=None, environment=None,
                    combiner_function=None, _job_type="PIPELINE"):
    """Build (but do not run) a job that applies `function` to each entry
    of `parameter_set`.

    Parameters
    ----------
    function : function
        The function each task executes. Checked with
        `_raise_error_if_not_function`.
    parameter_set : iterable of dict
        One dictionary of inputs per task. Generators are expanded into a
        list before use. For void functions, use empty dictionaries.
    name : str, optional
        Job name. Defaults to the function name (with '<' and '>' removed)
        followed by a timestamp. If the session already has a job with this
        name, a random suffix is appended until the name is unused.
    environment : optional
        Passed through `_job._validate_job_create_args` and on to the job.
    combiner_function : function, optional
        When given, a final task built from this function is added as an
        extra stage that takes every map task as input.
    _job_type : str
        Forwarded unchanged to the `Job` constructor.

    Returns
    -------
    job
        The constructed `Job` object.

    Raises
    ------
    RuntimeError
        If `parameter_set` is empty, including an iterable that yields
        nothing.
    TypeError
        If any entry of `parameter_set` is not a dictionary.
    """
    _raise_error_if_not_function(function)

    # Name the job
    now = _datetime.now().strftime("%b-%d-%Y-%H-%M-%S")
    function_name = _sub("[<>]", "", function.__name__)
    name = "%s-%s" % (function_name, now) if not name else name

    # Validate args
    function, name, environment = _job._validate_job_create_args(
        function, name, environment)

    _session = _gl.deploy._default_session
    while _session.exists(name, __job.Job._typename):
        rand = str(_uuid.uuid4())[:5]
        old_name = name
        name = "%s-%s" % (name, rand)
        __LOGGER__.info("A job with name '%s' already exists. "
                        "Renaming the job to '%s'." % (old_name, name))

    if not parameter_set:
        raise RuntimeError("An empty parameter_set was given. Nothing to do.")

    # If parameter set is a generator/SFrame, make sure it gets expanded out.
    parameter_set_copy = []
    for i in parameter_set:
        if not isinstance(i, dict):
            raise TypeError(
                "'parameter_set' has to be an iterable of dictionary."
                " For void functions, use an empty dictionary as inputs."
            )
        parameter_set_copy.append(i)

    # A generator object is always truthy, so the check above cannot see an
    # empty one; re-check once the entries have actually been collected.
    if not parameter_set_copy:
        raise RuntimeError("An empty parameter_set was given. Nothing to do.")

    # Create the task.
    task_prototype = _task.Task(function, function_name)
    for_each_iterations = _generate_mapjob_tasks(
        task_prototype, parameter_set_copy)

    # List of outputs for the final step.
    if not combiner_function:
        list_of_tasks = for_each_iterations[0]
    else:
        combiner = _task.Task(combiner_function)

        # The input to this task is all other tasks
        task_name_to_task = {}
        for stage in for_each_iterations:
            for t in stage:
                task_name_to_task[t.name] = t
        combiner.set_inputs_from_task(task_name_to_task)

        for_each_iterations.append([combiner])
        list_of_tasks = combiner

    # Create the job
    job = __job.Job(
        name,
        stages=for_each_iterations,
        environment=environment,
        final_stage=list_of_tasks,
        _job_type=_job_type
    )
    return job
def _extract(token, sentence):
    """Return the text that follows `token` in `sentence`.

    The keywords are expected in the fixed order take, pick, apply,
    name_it. The text is searched (non-greedily) between `token` and the
    next keyword in that order; for 'name_it' it runs to the end of the
    string. Every keyword occurrence is then removed from the matched span
    and the remainder is stripped of surrounding whitespace.

    Raises KeyError for an unknown `token` and AttributeError when the
    sentence does not contain the expected keyword pair.
    """
    keyword_order = ('take', 'pick', 'apply', 'name_it', '$')
    position = {'take': 0, 'pick': 1, 'apply': 2, 'name_it': 3}[token]

    pattern = keyword_order[position] + '(.+?)' + keyword_order[position + 1]
    match = _search(pattern, sentence)

    without_keywords = _sub('take|pick|apply|name_it', '', match.group())
    return without_keywords.strip()
def get_doc_authors(
        conn: ConnType,
        attachment: int,
        note: int,
        author: int,
        itemIDs: _Optional[_Sequence[str]] = None,
        **kwargs: int):
    """Returns the last names of the authors of all documents.

    Parameters
    ----------
    conn : sqlalchemy.engine.Connection
        The connection object to the database.
    attachment : int
        The ID of the item type *attachment*.
    note : int
        The ID of the item type *note*.
    author : int
        The ID of the creator type *author*.
    itemIDs : list-like of str or None
        The itemID of interest. If None, consider all items.
    **kwargs : int
        Not used. Just to conform the signature with other similar functions.

    Returns
    -------
    pandas.DataFrame
        A frame with the single column "author", indexed by "itemID". Each
        cell is the list of last names of that item's authors, ordered by
        their `orderIndex`. Items of type *attachment* or *note* are
        excluded.
    """
    # pylint: disable=unused-argument

    query = """
        SELECT items.itemID, itemCreators.orderIndex, creators.lastName
        FROM items, itemCreators, creators
        WHERE
            items.itemTypeID <> {attachment} AND
            items.itemTypeID <> {note} AND
            itemCreators.itemID = items.itemID AND
            itemCreators.creatorTypeID = {author} AND
            creators.creatorID = itemCreators.creatorID
    """.format(attachment=attachment, note=note, author=author)

    if itemIDs is not None:
        # Narrow the query by prepending an extra condition to the WHERE.
        id_condition = "WHERE items.itemID in ({}) AND".format(
            ", ".join(itemIDs))
        query = _sub("WHERE", id_condition, query)

    # Sort first so the per-item lists come out in author order.
    authors: pandas.DataFrame = (
        pandas.read_sql_query(query, conn)
        .sort_values(["itemID", "orderIndex"])
        .set_index("itemID")
        .drop(columns="orderIndex")
    )

    per_item: pandas.DataFrame = authors.groupby(level=0).aggregate(
        lambda names: names.values.tolist())

    return per_item.rename(columns={"lastName": "author"})
def _cast_dumbstring(string: str, data_type):
    """Casts, if possible, a raw string returned by the OpenDSS text
    interface to the type specified in data_type.

    Supported values of `data_type`:

    * ``str`` - the string is returned unchanged.
    * ``int`` / ``float`` - the string is converted with the type itself;
      the placeholder ``'----'`` is returned as NaN.
    * ``numpy.ndarray`` - the string is evaluated as a Python literal
      expression and wrapped in an array; if that fails, the string is
      normalised and handed to `matricize_str`.
    * ``list`` - brackets, parentheses and double quotes are stripped, the
      rest is split on commas, and the items are returned as ints if all of
      them parse as ints, else as floats if all parse as floats, else as
      the raw strings.

    Raises
    ------
    TypeError
        If `data_type` is none of the above.
    """
    if data_type == str:
        return string

    if data_type in (int, float):
        if string == '----':
            # this happens, f.e., if you define a line by rmatrix, xmatrix
            # and then ask for r0, x0...
            # `nan` (lowercase) is used because the `NaN` alias was removed
            # in NumPy 2.0.
            return _np.nan
        return data_type(string)
    elif data_type == _np.ndarray:
        try:
            # NOTE(review): eval() executes whatever the string contains;
            # this is only acceptable while the text really comes from the
            # DSS interface - confirm, or switch to a literal-only parser.
            return _np.asarray(eval(string))
        except Exception:
            # NOTE(review): as written, the first two replace() calls
            # substitute a single space with a single space, and the
            # '[ ' / ' ]' replacements appear twice; presumably these were
            # meant to handle runs of spaces - confirm.
            return matricize_str(
                string.replace(' ', ' ').replace(' ', ' ').replace(
                    ' | ', ';').replace('| ', ';').replace(' |', ';').replace(
                    '|', ';').replace('[ ', '').replace('[ ', '').replace(
                    '[', '').replace(' ]', '').replace(' ]', '').replace(
                    ']', '').replace(', ', ',').replace(' ', ','))
    elif data_type == list:
        # Raw strings keep the patterns identical while avoiding invalid
        # escape sequences in ordinary string literals.
        dp_str = _sub(r'[\,|\ ]*(\]|"|\))', '', string)
        dp_str = _sub(r'(\[|"|\()\ *', '', dp_str)
        items = dp_str.split(',')
        try:
            return [int(x) for x in items]
        except ValueError:
            try:
                return [float(x) for x in items]
            except ValueError:
                return items
    else:
        raise TypeError(
            'Could not cast the DSS property string "{1}": type {0} unknown'.
            format(str(data_type), string))
def convert_length(self, string):
    """
    Does a math like `10 meters to kilometers`

    The input is lower-cased and stripped of spaces, split into an integer
    and a "<unit>to<unit>" part, and the two units are resolved against
    `LengthFormats`. The result is returned as the string
    "<value> <target unit name in lower case>".

    Any failure along the way (unparsable input, identical source and
    target unit, number outside (-999999, 999999999), unknown unit, ...)
    is reported as `error_message("Unsupported or invalid calculation.")`.
    """
    try:
        string = string.lower().replace(" ", "")
        # What remains after removing the digits is the "<unit>to<unit>" part.
        formats = _sub(r"\d+", "", string)
        number = int(string.replace(formats, "").strip(",."))
        first_format, second_format = formats.split("to")

        # Explicit checks rather than `assert`: assert statements are
        # removed when Python runs with -O, which would silently disable
        # this validation.
        if first_format == second_format:
            raise ValueError("source and target units are identical")
        if not -999999 < number < 999999999:
            raise ValueError("number is outside the supported range")

        # NOTE(review): this presumably relies on each LengthFormats member
        # holding (aliases, per-target expression suffixes) - confirm
        # against the enum definition.
        for unit_name in LengthFormats._ALL.value:
            unit_value = getattr(LengthFormats, unit_name).value
            if first_format in unit_value[0]:
                first_format = unit_value[1]
            elif second_format in unit_value[0]:
                second_format = unit_name

        target_index = LengthFormats._ALL.value.index(second_format)
        # NOTE(review): eval() runs the assembled expression; `number` is an
        # int and the suffix comes from LengthFormats, not from the user.
        expression = f"{number}{first_format[target_index]}"
        return f"{eval(expression)} {second_format.lower()}"
    except Exception as exc:
        # `Exception` instead of a bare except, so KeyboardInterrupt and
        # SystemExit are not turned into a user-facing error.
        raise error_message("Unsupported or invalid calculation.") from exc
def func(
        conn: ConnType,
        itemIDs: _Optional[_Sequence[str]] = None,
        **mapping) -> pandas.Series:
    """Returns a list of all items' {0}s.

    Note
    ----
    1. Items with "attachment" type are ignored.
    2. Only return items with non-NaN values.

    Parameters
    ----------
    conn : sqlalchemy.engine.Connection
        The connection object to the database.
    itemIDs : list-like of str or None
        The itemID of interest. If None, consider all items.
    **mapping : keyword-values
        The mapping from required keys to values used in query strings.

    Returns
    -------
    pandas.DataFrame
        All items' {0}s, where the indices are the `itemID`s, and it only has
        one column.
    """
    # Start from the query template of the enclosing scope and, if asked,
    # prepend an itemID condition to its WHERE.
    template = query
    if itemIDs is not None:
        id_condition = "WHERE items.itemID in ({}) AND".format(
            ", ".join(itemIDs))
        template = _sub("WHERE", id_condition, query)

    frame: pandas.DataFrame = pandas.read_sql_query(
        template.format(**mapping), conn)
    frame = frame.set_index("itemID")
    frame = frame.rename({org_tag: new_tag}, axis=1)

    # Hand the frame to the post-processing hook of the enclosing scope.
    return after(frame)
def create(function, name=None, environment=None, **kwargs):
    """
    Execute arbitrary functions in a remote environment.

    The job is specified as a function. All functions that are called from
    within the function are automatically captured. By default, this method
    will kick off asynchronous work, and return a Job object to
    monitor/manage that work.

    Parameters
    ----------
    function : function
        Function to be executed in this Job, with arguments to pass to this
        function specified by `kwargs`.

    name : str, optional
        Name for this execution (names the returned Job). If set to None,
        then the name of the job is set to the name of the function with a
        time-stamp. Valid characters in job name include: digits,
        characters, '-' and '_'.

    environment : :class:`~graphlab.deploy.hadoop_cluster.HadoopCluster` | :class:`~graphlab.deploy.ec2_cluster.Ec2Cluster` | :class:`~graphlab.deploy.LocalAsync`, optional
        Optional environment for execution. If set to None, then a
        `LocalAsync` by the name `async` is created and used. This will
        execute the code in the background on your local machine.

    kwargs:
        Function kwargs that are passed to the function for execution.

    Returns
    -------
    job : :py:class:`~graphlab.deploy.Job`
        Used for monitoring and managing the execution of the Job.

    See Also
    --------
    graphlab.deploy.map_job.create, graphlab.deploy.Job

    Examples
    --------
    Let us start out with a simple example to execute a function that can
    add two numbers.

    .. sourcecode:: python

        # Define a function
        def add(x, y):
            return x + y

        # Create a job.
        job = graphlab.deploy.job.create(add, x=1, y=1)

        # Get results from the execution when ready. This call waits for the
        # job to complete before retrieving the results.
        >>> print job.get_results()
        2

    Exceptions within the function calls can be captured as follows:

    .. sourcecode:: python

        def add(x, y):
            if x and y:
                return x + y
            else:
                raise ValueError('x or y cannot be None')

        # Job execution captures the exception raised by the function.
        job = graphlab.deploy.job.create(add, x=1, y=None)

        # Get results from the execution when ready. This call waits for the
        # job to complete before retrieving the results.
        >>> print job.get_results()
        None

        # Get the exceptions raised from this execution by calling
        # job.get_metrics()
        >>> print job.get_metrics()
        +-----------+--------+------------+----------+-----------------------+
        | task_name | status | start_time | run_time |   exception_message   |
        +-----------+--------+------------+----------+-----------------------+
        |    add    | Failed | 1427928898 |   None   | x or y cannot be None |
        +-----------+--------+------------+----------+-----------------------+
        +-------------------------------+
        |      exception_traceback      |
        +-------------------------------+
        | Traceback (most recent cal... |
        +-------------------------------+
        [1 rows x 6 columns]

    If a function requires a package to be installed, the function can be
    annotated with a decorator.

    .. sourcecode:: python

        @graphlab.deploy.required_packages(['names == 0.3.0'])
        def my_function(number = 10):
            import names
            people = [names.get_full_name() for i in range(number)]
            sf = graphlab.SFrame({'names':people})
            return sf

        job = graphlab.deploy.job.create(my_function)

        >>> print job.get_results()

        Columns:
                names    str

        Data:
        +-------------------+
        |       names       |
        +-------------------+
        |   Annette Logan   |
        |   Nancy Anthony   |
        | Tiffany Zupancic  |
        |   Andre Coppin    |
        |    Robert Coe     |
        |    Donald Dean    |
        |   Lynne Bunton    |
        |   John Sartwell   |
        |  Peter Nicholas   |
        | Chester Rodriguez |
        +-------------------+
        [10 rows x 1 columns]

    Complex functions that require SFrames, GraphLab models etc. can be
    deployed with ease. All additional state required by the function are
    automatically captured.

    .. sourcecode:: python

        GLOBAL_CONSTANT = 10

        def foo(x):
            return x + 1

        def bar(x):
            return x + 2

        def my_function(x, y):
            foo_x = foo(x)
            bar_y = bar(y)
            return foo_x + bar_y + GLOBAL_CONSTANT

        # Automatically captures all state needed by the deployed function.
        job = graphlab.deploy.job.create(my_function, x = 1, y = 1)

        >>> print job.get_results()
        15

    You can execute the same job remotely by passing a different
    environment.

    .. sourcecode:: python

        # Define a function
        def add(x, y):
            return x + y

        # Define an EC2 environment
        ec2 = graphlab.deploy.Ec2Config()

        # Create an EC2 cluster object
        c = graphlab.deploy.ec2_cluster.create('my_cluster',
                                               's3://bucket/path', ec2)

        # Create a job.
        job = graphlab.deploy.job.create(add, environment=c, x=1, y=1)

        >>> print job.get_results()
        2

    Notes
    -----
    - When an exception is raised within the deployed function,
      :func:`~graphlab.deploy.Job.get_results` returns None.

    - For asynchronous jobs, :func:`~graphlab.deploy.Job.get_results` is a
      blocking call which will wait for the job execution to complete
      before returning the results.
    """
    _session = _gl.deploy._default_session

    _raise_error_if_not_function(function)
    _get_metric_tracker().track('jobs.job')

    # Default job name: function name without '<' / '>' plus a timestamp.
    timestamp = _datetime.now().strftime('%b-%d-%Y-%H-%M-%S')
    function_name = _sub('[<>]', '', function.__name__)
    if not name:
        name = '%s-%s' % (function_name, timestamp)

    # Validate args
    function, name, environment = _validate_job_create_args(
        function, name, environment)

    # Keep appending a short random suffix until the name is unused.
    while _session.exists(name, _job.Job._typename):
        taken_name = name
        name = "%s-%s" % (taken_name, str(_uuid.uuid4())[:5])
        __LOGGER__.info("A job with name '%s' already exists. "
                        "Renaming the job to '%s'." % (taken_name, name))

    # A single task in a single stage, which is also the final stage.
    task = _task.Task(function, function_name)
    task.set_inputs(kwargs)
    job = _job.Job(name, stages=[[task]], environment=environment,
                   final_stage=task)

    # Hand the job to the execution environment.
    __LOGGER__.info(
        "Validation complete. Job: '%s' ready for execution." % name)
    job = _env._get_execution_env(environment).run_job(job)

    if isinstance(environment, _environment.Local):
        __LOGGER__.info("Job: '%s' finished." % name)
    else:
        __LOGGER__.info("Job: '%s' scheduled." % name)

    # Save the job and return to user
    _session.register(job)
    _session.save(job)
    return job
def _name_from_function(func):
    """Return the function's `__name__` with every underscore replaced by a space."""
    identifier = func.__name__
    return identifier.replace("_", " ")
return idx  # tail of a definition that begins above this chunk


def remove_falsey_values(iterable) -> list:
    """Return a list with the truthy items of `iterable`, in their original order."""
    return list(filter(bool, iterable))


def image_url_parser(img: str) -> dict:
    """Split an image URL at its first "._V1" marker.

    Returns a dict with two entries:

    * "full": the part of `img` before the first "._V1" (the whole string
      when the marker is absent);
    * "template_height": that same prefix followed by the literal text
      "._V1_SY{{height}}.jpg" (double braces included).
    """
    # example of a clean URL -> https://m.media-amazon.com/images/M/MV5BMjIxMjgxNTk0MF5BMl5BanBnXkFtZTgwNjIyOTg2MDE@
    full_hd_url = img.split("._V1")[0]
    # full_hd_url = "".join(map(lambda x: x + "@", _full_hd_url))
    # NOTE(review): the doubled braces look like an escaped placeholder for a
    # later str.format call - confirm against the consumer of this value.
    _height = full_hd_url + "._V1_SY{{height}}.jpg"
    return {"full": full_hd_url, "template_height": _height}


# Remove every character that is not a word character, as well as every
# underscore, then strip and lower-case the result.
sanitize_str = lambda movie: _sub(r"([^\w]|_)", "", movie).strip().lower()


def get_page(url: str) -> Soup:
    """Fetch `url` with `BASIC_HEADERS` and return the parsed response body.

    Prints a debug line before the request and raises (via
    `raise_for_status`) when the response status signals an error.
    """
    print("[debug] Requesting:", url)
    page = requests.get(url, headers=BASIC_HEADERS)
    page.raise_for_status()
    return Soup(page.text, lib)


def next_table(el):
    """Return the result of `el.find_next("table")`, or None when `el` is falsy."""
    return el.find_next("table") if el else None


def resp_template(r_type, dct: dict) -> dict:
    """Wrap `dct` as ``{"data": {r_type: dct}}``."""
    return {"data": {r_type: dct}}
def _unescape_part(part):
    """Undo the two substitutions on `part`, delimiter first.

    Every `_vESC + _vDEXOR` sequence becomes `_vDELIM`, and afterwards every
    `_vESC + _vEEXOR` sequence becomes `_vESC`. Both sequences are used as
    regular-expression patterns.
    """
    # Step 1: bring back the delimiters.
    delims_restored = _sub(_vESC + _vDEXOR, _vDELIM, part)
    # Step 2: bring back the escape markers themselves.
    return _sub(_vESC + _vEEXOR, _vESC, delims_restored)
def _name_from_function(func: ExampleFunction) -> str:
    """Turn the function's `__name__` into a label by replacing underscores with spaces."""
    raw_name = func.__name__
    return raw_name.replace("_", " ")
def _unescape_part(part):
    """Return `part` with its escape sequences replaced.

    First each `_vESC + _vDEXOR` sequence is turned into `_vDELIM`; then, on
    that result, each `_vESC + _vEEXOR` sequence is turned into `_vESC`.
    The sequences are applied as regular-expression patterns.
    """
    delimiter_pattern = _vESC + _vDEXOR
    with_delimiters = _sub(delimiter_pattern, _vDELIM, part)

    escape_pattern = _vESC + _vEEXOR
    return _sub(escape_pattern, _vESC, with_delimiters)
def _escape_part(part):
    """Escape `part`, encoding it to bytes first unless it is exactly `bytes`.

    Every match of `_vESC` becomes `_vESC + _vEEXOR`, and afterwards every
    match of `_vDELIM` becomes `_vESC + _vDEXOR`. Escape markers are handled
    first so the markers introduced for delimiters are not escaped again.
    """
    if type(part) is bytes:
        raw = part
    else:
        raw = part.encode()

    # Step 1: escape the escape marker itself.
    escapes_done = _sub(_vESC, _vESC + _vEEXOR, raw)
    # Step 2: escape the delimiter.
    return _sub(_vDELIM, _vESC + _vDEXOR, escapes_done)
def _name_from_function(func: Callable) -> str:
    """Return `func.__name__` with underscores swapped for spaces."""
    function_name = func.__name__
    return function_name.replace("_", " ")
def _escape_part(part):
    """Return the escaped form of `part` as bytes.

    Input that is not exactly of type `bytes` is encoded with `.encode()`
    first. Matches of `_vESC` are replaced by `_vESC + _vEEXOR`; on that
    result, matches of `_vDELIM` are replaced by `_vESC + _vDEXOR`.
    """
    payload = part if type(part) is bytes else part.encode()

    # The escape marker goes first; otherwise the markers added for
    # delimiters would themselves get escaped.
    escaped_marker = _vESC + _vEEXOR
    payload = _sub(_vESC, escaped_marker, payload)

    escaped_delimiter = _vESC + _vDEXOR
    return _sub(_vDELIM, escaped_delimiter, payload)
def postprocess(self, seg):
    '''Strip the BPE markers from seg.tgt in place.

    Removes every "@@ " as well as a trailing "@@" (optionally followed by
    one space) and stores the result back on `seg.tgt`. Returns None.
    '''
    merged = _sub("(@@ )|(@@ ?$)", '', seg.tgt)
    seg.tgt = merged
def remove(self, n):
    """Return a new `lzstr` built from `self` with every match of `str(n)` deleted.

    `str(n)` is used as a regular-expression pattern, not as literal text.
    """
    remaining = _sub(str(n), "", self)
    return lzstr(remaining)
def create(function, name=None, environment=None, **kwargs):
    """
    Execute arbitrary functions in a remote environment.

    The job is specified as a function. All functions that are called from
    within the function are automatically captured. By default, this method
    will kick off asynchronous work, and return a Job object to
    monitor/manage that work.

    Parameters
    ----------
    function : function
        Function to be executed in this Job, with arguments to pass to this
        function specified by `kwargs`.

    name : str, optional
        Name for this execution (names the returned Job). If set to None,
        then the name of the job is set to the name of the function with a
        time-stamp. Valid characters in job name include: digits,
        characters, '-' and '_'.

    environment : :class:`~graphlab.deploy.hadoop_cluster.HadoopCluster` | :class:`~graphlab.deploy.ec2_cluster.Ec2Cluster` | :class:`~graphlab.deploy.LocalAsync`, optional
        Optional environment for execution. If set to None, then a
        `LocalAsync` by the name `async` is created and used. This will
        execute the code in the background on your local machine.

    kwargs:
        Function kwargs that are passed to the function for execution.

    Returns
    -------
    job : :py:class:`~graphlab.deploy.Job`
        Used for monitoring and managing the execution of the Job.

    See Also
    --------
    graphlab.deploy.map_job.create, graphlab.deploy.Job

    Examples
    --------
    Let us start out with a simple example to execute a function that can
    add two numbers.

    .. sourcecode:: python

        # Define a function
        def add(x, y):
            return x + y

        # Create a job.
        job = graphlab.deploy.job.create(add, x=1, y=1)

        # Get results from the execution when ready. This call waits for the
        # job to complete before retrieving the results.
        >>> print job.get_results()
        2

    Exceptions within the function calls can be captured as follows:

    .. sourcecode:: python

        def add(x, y):
            if x and y:
                return x + y
            else:
                raise ValueError('x or y cannot be None')

        # Job execution captures the exception raised by the function.
        job = graphlab.deploy.job.create(add, x=1, y=None)

        # Get results from the execution when ready. This call waits for the
        # job to complete before retrieving the results.
        >>> print job.get_results()
        None

        # Get the exceptions raised from this execution by calling
        # job.get_metrics()
        >>> print job.get_metrics()
        +-----------+--------+------------+----------+-----------------------+
        | task_name | status | start_time | run_time |   exception_message   |
        +-----------+--------+------------+----------+-----------------------+
        |    add    | Failed | 1427928898 |   None   | x or y cannot be None |
        +-----------+--------+------------+----------+-----------------------+
        +-------------------------------+
        |      exception_traceback      |
        +-------------------------------+
        | Traceback (most recent cal... |
        +-------------------------------+
        [1 rows x 6 columns]

    If a function requires a package to be installed, the function can be
    annotated with a decorator.

    .. sourcecode:: python

        @graphlab.deploy.required_packages(['names == 0.3.0'])
        def my_function(number = 10):
            import names
            people = [names.get_full_name() for i in range(number)]
            sf = graphlab.SFrame({'names':people})
            return sf

        job = graphlab.deploy.job.create(my_function)

        >>> print job.get_results()

        Columns:
                names    str

        Data:
        +-------------------+
        |       names       |
        +-------------------+
        |   Annette Logan   |
        |   Nancy Anthony   |
        | Tiffany Zupancic  |
        |   Andre Coppin    |
        |    Robert Coe     |
        |    Donald Dean    |
        |   Lynne Bunton    |
        |   John Sartwell   |
        |  Peter Nicholas   |
        | Chester Rodriguez |
        +-------------------+
        [10 rows x 1 columns]

    Complex functions that require SFrames, GraphLab models etc. can be
    deployed with ease. All additional state required by the function are
    automatically captured.

    .. sourcecode:: python

        GLOBAL_CONSTANT = 10

        def foo(x):
            return x + 1

        def bar(x):
            return x + 2

        def my_function(x, y):
            foo_x = foo(x)
            bar_y = bar(y)
            return foo_x + bar_y + GLOBAL_CONSTANT

        # Automatically captures all state needed by the deployed function.
        job = graphlab.deploy.job.create(my_function, x = 1, y = 1)

        >>> print job.get_results()
        15

    You can execute the same job remotely by passing a different
    environment.

    .. sourcecode:: python

        # Define a function
        def add(x, y):
            return x + y

        # Define an EC2 environment
        ec2 = graphlab.deploy.Ec2Config()

        # Create an EC2 cluster object
        c = graphlab.deploy.ec2_cluster.create('my_cluster',
                                               's3://bucket/path', ec2)

        # Create a job.
        job = graphlab.deploy.job.create(add, environment=c, x=1, y=1)

        >>> print job.get_results()
        2

    Notes
    -----
    - When an exception is raised within the deployed function,
      :func:`~graphlab.deploy.Job.get_results` returns None.

    - For asynchronous jobs, :func:`~graphlab.deploy.Job.get_results` is a
      blocking call which will wait for the job execution to complete
      before returning the results.
    """
    _session = _gl.deploy._default_session

    _raise_error_if_not_function(function)
    _get_metric_tracker().track('jobs.job')

    # Name the job: fall back to "<function name>-<timestamp>".
    created_at = _datetime.now().strftime('%b-%d-%Y-%H-%M-%S')
    function_name = _sub('[<>]', '', function.__name__)
    name = name or '%s-%s' % (function_name, created_at)

    # Validate args
    function, name, environment = _validate_job_create_args(
        function, name, environment)

    # Resolve name clashes within the session with a random 5-char suffix.
    while _session.exists(name, _job.Job._typename):
        suffix = str(_uuid.uuid4())[:5]
        clashing = name
        name = "%s-%s" % (clashing, suffix)
        __LOGGER__.info("A job with name '%s' already exists. "
                        "Renaming the job to '%s'." % (clashing, name))

    # Setup the task & job
    task = _task.Task(function, function_name)
    task.set_inputs(kwargs)
    job = _job.Job(name, stages=[[task]], environment=environment,
                   final_stage=task)

    # Setup the env.
    __LOGGER__.info(
        "Validation complete. Job: '%s' ready for execution." % name)
    exec_env = _env._get_execution_env(environment)
    job = exec_env.run_job(job)

    # Jobs in a Local environment are reported as finished, all others as
    # scheduled.
    ran_locally = isinstance(environment, _environment.Local)
    if ran_locally:
        __LOGGER__.info("Job: '%s' finished." % name)
    else:
        __LOGGER__.info("Job: '%s' scheduled." % name)

    # Save the job and return to user
    _session.register(job)
    _session.save(job)
    return job
def obtain_suitable_comment(text, reg):
    """Return `text` with every match of the regular expression `reg`
    replaced by a single space.
    """
    cleaned = _sub(reg, ' ', text)
    return cleaned
def postprocess_str(self, str):
    """Return the given string with its BPE markers removed.

    Removes every "@@ " as well as a trailing "@@" (optionally followed by
    one space).
    """
    merged = _sub("(@@ )|(@@ ?$)", '', str)
    return merged