def filter_vertices(self, frame, predicate, keep_matching_vertices=True):
    """Filter the vertices of *frame* with a Python *predicate*.

    When *keep_matching_vertices* is True, vertices satisfying the predicate
    are kept; when False, they are dropped (the complement is kept).
    """
    from trustedanalytics.rest.spark import ifilter, ifilterfalse
    # Choose the iterator that keeps or discards matching rows.
    row_iterator = ifilter if keep_matching_vertices else ifilterfalse
    execute_update_frame_command(
        "frame:vertex/filter",
        {"frame": frame.uri, "udf": get_udf_arg(frame, predicate, row_iterator)},
        frame)
def filter_vertices(self, frame, predicate, keep_matching_vertices=True):
    """Keep (or discard) the vertices of *frame* that satisfy *predicate*.

    :param frame: the vertex frame to update in place
    :param predicate: Python row predicate shipped to the server as a UDF
    :param keep_matching_vertices: if True keep matches, else drop them
    """
    from trustedanalytics.rest.spark import ifilter
    from trustedanalytics.rest.spark import ifilterfalse
    if keep_matching_vertices:
        udf = get_udf_arg(frame, predicate, ifilter)
    else:
        # Inverted filter: retain only the vertices that do NOT match.
        udf = get_udf_arg(frame, predicate, ifilterfalse)
    arguments = {'frame': frame.uri, 'udf': udf}
    execute_update_frame_command("frame:vertex/filter", arguments, frame)
def filter(self, frame, predicate):
    """Keep only the rows of *frame* for which *predicate* returns true."""
    from trustedanalytics.rest.spark import ifilter
    udf_arg = get_udf_arg(frame, predicate, ifilter)
    execute_update_frame_command(
        "frame:/filter",
        {'frame': frame.uri, 'udf': udf_arg},
        frame)
def drop(self, frame, predicate):
    """Remove the rows of *frame* for which *predicate* returns true."""
    from trustedanalytics.rest.spark import ifilterfalse
    # use the REST API filter, with a ifilterfalse iterator so that the
    # rows matching the predicate are the ones discarded
    udf_arg = get_udf_arg(frame, predicate, ifilterfalse)
    execute_update_frame_command(
        "frame:/filter",
        {'frame': frame.uri, 'udf': udf_arg},
        frame)
def add_columns(self, frame, expression, schema, columns_accessed=None):
    """Add new columns to *frame* by evaluating *expression* per row.

    :param frame: frame to update in place
    :param expression: Python function producing the new column value(s)
    :param schema: a (name, type) pair for a single column, or a list of
        such pairs for several columns
    :param columns_accessed: optional column name or list of names that the
        expression reads; used to ship a reduced schema with the UDF.
        None (the default) means "unknown — use the full frame schema".
    :raises ValueError: if *schema* is empty or not iterable
    """
    if not schema or not hasattr(schema, "__iter__"):
        raise ValueError("add_columns requires a non-empty schema of (name, type)")

    # A bare (name, type) pair means exactly one new column; normalize it
    # to a one-element list so the rest of the code handles a single shape.
    single_column = isinstance(schema[0], basestring)
    if single_column:
        schema = [schema]
    schema = self._format_schema(schema)
    names, data_types = zip(*schema)

    # Build the reduced frame schema covering only the accessed columns,
    # preserving the order (and any repeats) of columns_accessed.
    optimized_frame_schema = []
    if columns_accessed:
        if isinstance(columns_accessed, basestring):
            columns_accessed = [columns_accessed]
        optimized_frame_schema = [entry
                                  for name in columns_accessed
                                  for entry in frame.schema
                                  if name == entry[0]]
    # By default columns_accessed is an empty list and the optimized schema
    # is None, which tells the UDF machinery to fall back to frame.schema.
    if columns_accessed is None:
        columns_accessed = []
        optimized_frame_schema = None

    if single_column:
        add_columns_function = get_add_one_column_function(expression, data_types[0])
    else:
        add_columns_function = get_add_many_columns_function(expression, data_types)

    from itertools import imap
    arguments = {'frame': frame.uri,
                 'column_names': names,
                 'column_types': [get_rest_str_from_data_type(t) for t in data_types],
                 'udf': get_udf_arg(frame, add_columns_function, imap, optimized_frame_schema),
                 'columns_accessed': columns_accessed}
    execute_update_frame_command('add_columns', arguments, frame)
def get_row_count(self, frame, where):
    """Return the number of rows in *frame*.

    :param frame: the frame to count
    :param where: optional row predicate; when falsy the server-tracked
        total row count is returned without running a UDF
    :return: the row count as reported by the server
    """
    if not where:
        return self._get_frame_info(frame).row_count
    # slightly faster generator to only return a list of one item, since we're just counting rows
    # TODO - there's got to be a better way to do this with the RDDs, trick is with Python.
    def icountwhere(predicate, iterable):
        return ("[1]" for item in iterable if predicate(item))
    arguments = {'frame': frame.uri, 'udf': get_udf_arg(frame, where, icountwhere)}
    # Extract the scalar payload from the REST result envelope, consistent
    # with the sibling definition of get_row_count in this file; the
    # original returned the raw envelope, which callers expecting an int
    # could not use directly.
    return executor.execute("frame/count_where", self, arguments)['value']
def get_row_count(self, frame, where):
    """Count the rows of *frame*, optionally restricted by the *where* predicate."""
    if not where:
        # No predicate: the server already tracks the total row count.
        return self._get_frame_info(frame).row_count

    # Yield a one-item marker per matching row so only the count travels back.
    # TODO - there's got to be a better way to do this with the RDDs, trick is with Python.
    def count_matches(predicate, iterable):
        for item in iterable:
            if predicate(item):
                yield "[1]"

    udf = get_udf_arg(frame, where, count_matches)
    result = executor.execute("frame/count_where", self,
                              {'frame': frame.uri, 'udf': udf})
    return result['value']
def filter(self, frame, predicate):
    """Update *frame* in place, keeping only rows that satisfy *predicate*."""
    from trustedanalytics.rest.spark import ifilter
    arguments = dict(frame=frame.uri,
                     udf=get_udf_arg(frame, predicate, ifilter))
    execute_update_frame_command("frame:/filter", arguments, frame)
def drop(self, frame, predicate):
    """Update *frame* in place, removing rows that satisfy *predicate*."""
    from trustedanalytics.rest.spark import ifilterfalse
    # use the REST API filter, with a ifilterfalse iterator so matching
    # rows are the ones removed rather than kept
    arguments = dict(frame=frame.uri,
                     udf=get_udf_arg(frame, predicate, ifilterfalse))
    execute_update_frame_command("frame:/filter", arguments, frame)