def graph_definition(edge_string):
    """Parse a graph definition string

    >>> graph_definition('Origin,3,9_I_5.6~Origin_II~I_III')[0]
    [{'source': 0, 'target': 1, 'value': 5.6}, {'source': 0, 'target': 2, 'value': 4}, {'source': 1, 'target': 3, 'value': 4}]
    """
    edge_definitions = listify(edge_string, delim=graph_definition.delim)
    edge_list, node_list, node_names = [], [], []
    for i, edge_definition in enumerate(edge_definitions):
        edge = {}
        node_string_pair = listify(edge_definition, delim=graph_definition.node_pair_delim)
        for j, field in enumerate(graph_definition.schema):
            try:
                obj = field['type'](node_string_pair[j])
                # if the field holds a node (source or target) then its name must be retrieved
                if isinstance(obj, Mapping) and 'name' in obj:
                    # is this node name already in our list (and its other properties already defined)?
                    if obj['name'] in node_names:
                        node_index = node_names.index(obj['name'])
                        # update the existing node dict with any new properties of this node
                        node_list[node_index].update(obj)
                        obj = node_index
                    else:
                        # reprocess/recast the object (now a node dict) to add default values to the node
                        obj = field['type'](obj)
                        node_list += [obj]
                        node_names += [obj['name']]
                        # since this is a new node, its index is one less than the length of the node list
                        obj = len(node_list) - 1
                edge[field['key']] = obj
            except Exception:
                edge[field['key']] = field['default']
        edge_list += [edge]
    return edge_list, node_list
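# graph_definition() relies on attributes attached to the function object after its
# definition (graph_definition.delim, .node_pair_delim, and .schema), which are not
# shown in this snippet. A minimal sketch of plausible values, inferred from the
# doctest ('~' separates edges, '_' separates fields within an edge, and the edge
# value appears to default to 4); node_name() is defined further below:
graph_definition.delim = '~'
graph_definition.node_pair_delim = '_'
graph_definition.schema = (
    {'key': 'source', 'type': node_name, 'default': 0},
    {'key': 'target', 'type': node_name, 'default': 0},
    {'key': 'value', 'type': float, 'default': 4},
)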
def lagged_series(series, lags=1, pads=None):
    """Delay each time series in a set of time series by the lags (number of samples) indicated.

    Pad any gaps in the resulting series with the value of pads, or clip them if pads is None.

    TODO:
      - Allow fractional sample lags (interpolation)
      - Allow time value lags instead of sample counts
      - Incorporate into the nlp.db.Columns class

    >>> lagged_series([[-1, 0, 1, 2, 3], [2, 7, 1, 8, 2], [8, 1, 8, 2, 8]], lags=3)
    [[-1, 0, 1, 2, 3], [1, 8, 2, 2, 7], [8, 2, 8, 8, 1]]
    >>> lagged_series([[-1, 0, 1, 2, 3], [2, 7, 1, 8, 2], [8, 1, 8, 2, 8]], lags=[2, 1], pads=0)
    [[-1, 0, 1, 2, 3], [0, 0, 2, 7, 1], [0, 8, 1, 8, 2]]
    >>> lagged_series([[-1, 0, 1, 2, 3], [2, 7, 1, 8, 2], [8, 1, 8, 2, 8]], lags=[-1, 3], pads=[-9, -5])
    [[-1, 0, 1, 2, 3], [7, 1, 8, 2, -9], [-5, -5, -5, 8, 1]]
    """
    N = len(series) - 1
    pads = [None] * N if pads is None else util.listify(pads, N)
    pads = [None] + pads
    lags = [None] * N if lags is None else util.listify(lags, N)
    lags = [None] + lags

    # the first series is never lagged; only the remaining series are delayed/padded
    ans = [series[0]]
    for i in range(1, min(len(lags) + 1, len(pads) + 1, N + 1)):
        ans += [lagged_seq(series[i], lags[i], pads[i])]
    return ans
def make_time_series(x, t=pd.Timestamp(datetime.datetime(1970, 1, 1)), freq=None):
    """Convert a 2-D array of time/value pairs (or a pair of time/value vectors) into a pd.Series time series

    >>> make_time_series(range(3), freq='15min')  # doctest: +NORMALIZE_WHITESPACE, +ELLIPSIS
    1970-01-01 00:00:00   NaN
    1970-01-01 00:15:00   NaN
    1970-01-01 00:30:00   NaN
    dtype: float64
    """
    if isinstance(x, pd.DataFrame):
        x = pd.Series(x[x.columns[0]])
    elif not isinstance(x, pd.Series) and (not isinstance(t, (pd.Series, pd.Index, list, tuple)) or not len(t)):
        # warnings.warn("Coercing a non-Series")
        if len(x) == 2:
            t, x = listify(x[0]), listify(x[1])
        elif len(x) >= 2:
            try:
                t, x = zip(*x)
            except (ValueError, IndexError, TypeError):
                pass
        x = pd.Series(x)
    else:
        if isinstance(t, (datetime.datetime, pd.Timestamp)):
            t = pd.Timestamp(t)
        else:
            x = pd.Series(listify(x), index=listify(t))
    if not isinstance(x, pd.Series):
        raise TypeError(
            "`pug.invest.util.make_time_series(x, t)` expects x to be a type that"
            " can be coerced to a Series object, but its type is: {0}".format(type(x)))
    # By this point x must be a Series; the only question is whether its index needs to become a DatetimeIndex
    if x.index[0] != 0 and isinstance(x.index[0], (datetime.date, datetime.datetime, pd.Timestamp,
                                                   basestring, float, np.int64, int)):
        t = x.index
    elif isinstance(t, (datetime.date, datetime.datetime, pd.Timestamp, basestring, float, np.int64, int)):
        if not freq:
            freq = '15min'
            warnings.warn('Assumed time series freq to be {0} though no freq argument was provided!'.format(freq),
                          RuntimeWarning)
        t = pd.date_range(t, periods=len(x), freq=freq)
    x = pd.Series(x, index=t)
    if isinstance(x, pd.Series):
        x.index = pd.DatetimeIndex(x.index.values)
    return x
def __init__(self, start_urls=None, *args, **kwargs):
    self.start_urls = [r'http://www.google.com/googlebooks/uspto-patents-applications-biblio.html']
    if start_urls:
        self.start_urls = listify(start_urls)
    super(PatentBiblioSpider, self).__init__(*args, **kwargs)
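# Hedged usage sketch: this constructor belongs to a Scrapy spider class
# (PatentBiblioSpider). Because listify() normalizes its argument, a single URL
# string and a list of URLs both work; the URL below is illustrative only:
#
#   spider = PatentBiblioSpider(start_urls='http://www.example.com/patent-biblio.html')
#   assert spider.start_urls == ['http://www.example.com/patent-biblio.html']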
def node_name(name, use_defaults=False):
    """
    >>> sorted(node_name('Origin,2.7, 3 ').items())
    [('charge', 2.7), ('group', 3), ('name', 'Origin')]
    >>> node_name('Origin,2.7, 3 ')['name']
    'Origin'
    """
    # if the name is not a string, but a dict defining a node, then just set the defaults and return it
    if isinstance(name, Mapping):
        ans = dict(name)
        for j, field in enumerate(node_name.schema):
            if field['key'] not in ans:
                ans[field['key']] = field['default']
        return ans
    seq = listify(name, delim=',')
    ans = {}
    for j, field in enumerate(node_name.schema):
        if 'default' in field:
            try:
                ans[field['key']] = field['type'](seq[j])
            except Exception:
                if use_defaults:
                    ans[field['key']] = field['default']
        else:
            try:
                ans[field['key']] = ans.get(field['key'], field['type'](seq[j]))
            except Exception:
                pass
    return ans
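# node_name() likewise depends on a node_name.schema attribute assigned after the
# function definition (not shown above). A minimal sketch consistent with the
# doctest (name, charge, and group fields parsed from a comma-delimited string);
# the real module's keys, types, and defaults may differ:
node_name.schema = (
    {'key': 'name', 'type': str},
    {'key': 'charge', 'type': float, 'default': 1.0},
    {'key': 'group', 'type': int, 'default': 0},
)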
def dropna(x):
    """Delete all NaNs and infinities in a sequence of real values

    Returns:
        list: Array of all values in x that are between -inf and +inf, exclusive
    """
    # NaN comparisons are always False, so this filter drops NaNs as well as +/-inf
    return [x_i for x_i in listify(x) if float('-inf') < x_i < float('inf')]
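# Every snippet in this collection leans on a listify() helper imported from the
# surrounding package rather than defined here. A rough, assumption-laden sketch of
# its behavior (coerce a scalar, sequence, or delimited string into a list, optionally
# padded or truncated to a requested length); the real implementation may differ:
def listify_sketch(value, length=None, delim=None):
    """Coerce value into a list, splitting strings on delim and padding to length."""
    if value is None:
        result = []
    elif isinstance(value, str) and delim is not None:
        result = value.split(delim)
    elif isinstance(value, (list, tuple, set)):
        result = list(value)
    else:
        result = [value]
    if length is not None:
        # repeat the last element (or None for an empty list) up to the requested length
        result = (result + [result[-1] if result else None] * length)[:length]
    return result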
def args_tptnfpfn(*args, **kwargs):
    """Convert kwargs for tp, tn, fp, fn to ordered tuple of args

    If a single tuple/list is passed as the first arg, it is assumed to be the desired tuple of args

    >>> args_tptnfpfn(1, 2, 3, 4)
    (1, 2, 3, 4)
    >>> args_tptnfpfn((1, 2, 3, 4))
    (1, 2, 3, 4)
    >>> args_tptnfpfn([1, 2, 3, 4])
    (1, 2, 3, 4)
    >>> args_tptnfpfn(3, 4, tp=1, tn=2)
    (1, 2, 3, 4)
    >>> args_tptnfpfn(tp=1, tn=2)
    (1, 2, 0, 0)
    >>> args_tptnfpfn(tp=1, tn=2, fp=3, fn=4)
    (1, 2, 3, 4)
    >>> args_tptnfpfn(1)
    (1, 0, 0, 0)
    """
    if len(args) == 4:
        tp, tn, fp, fn = args
    elif len(kwargs) == 0:
        if len(args) == 1:
            args = listify(args[0])
        tp, tn, fp, fn = list(args) + [0] * (4 - len(args))
    else:
        args = list(args)
        tp = kwargs['tp'] if 'tp' in kwargs else (args.pop(0) if len(args) else 0)
        tn = kwargs['tn'] if 'tn' in kwargs else (args.pop(0) if len(args) else 0)
        fp = kwargs['fp'] if 'fp' in kwargs else (args.pop(0) if len(args) else 0)
        fn = kwargs['fn'] if 'fn' in kwargs else (args.pop(0) if len(args) else 0)
    return tp, tn, fp, fn
def append_app_urls(local, app_names):
    app_names = listify(app_names)  # or local.get('local.settings.INSTALLED_APPS') ;)
    urlpatterns = local.get('urlpatterns', patterns(''))
    for app_name in app_names:
        urlpatterns += patterns('', url(r'^', include('%s.urls' % app_name)))
    local['urlpatterns'] = urlpatterns
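# Hedged usage sketch for append_app_urls(): it is intended to be called from a Django
# urls.py with locals(), so the module-level urlpatterns gets created or extended in
# place. The app names below are hypothetical:
#
#   # urls.py
#   append_app_urls(locals(), ['crawler', 'miner'])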
def augment_model_meta(model, db_alias, model_meta, column_name_filters=None, count=0, verbosity=0):
    """Fields are keyed by their db_column name rather than field name (like model_meta)"""
    if settings.DEBUG and verbosity > 2:
        print 'Augmenting model meta data for %r...' % model
    column_name_filters = util.listify(column_name_filters)
    queryset = djdb.get_queryset(model)
    if db_alias:
        queryset = queryset.using(db_alias)
    for field_name in model._meta.get_all_field_names():
        field = None
        try:
            field = model._meta.get_field(field_name)
            db_column = field.db_column
        # Django creates reverse ForeignKey relationship fields that may not have a database column in this table.
        # This happens if you make existing fields/columns in other tables a ForeignKey referencing this table.
        except FieldDoesNotExist:
            db_column = None
        if not field:
            if verbosity:
                print "WARNING: Skipped 'phantom' field named '%s'. This is likely because of a ForeignKey relationship elsewhere back to this model (%r). No field found in the model '%s' for database '%s'." % (field_name, model, model.__name__, db_alias)
            continue
        if not db_column:
            if field.name in model_meta:
                db_column = field.name
            elif field.name.lower() in model_meta:
                db_column = field.name.lower()
            elif field.name.upper() in model_meta:
                db_column = field.name.upper()
        if not db_column:
            if verbosity:
                print "WARNING: Skipped field named '%s'. No column found in the database.table '%s.%s'." % (field.name, db_alias, model.__name__)
            continue
        if column_name_filters:
            if not any(((callable(cnf) and cnf(db_column)) or (db_column == cnf)) for cnf in column_name_filters):
                if verbosity:
                    print "WARNING: Skipped field named '%s' for table '%s.%s' because it didn't match any filters: %r." % (field.name, db_alias, model.__name__, column_name_filters)
                continue
        if (field.name == 'id' and isinstance(field, models.fields.AutoField)
                and field.primary_key and (not model_meta[db_column]['primary_key'])):
            print "WARNING: Skipped field named '%s' for table '%s.%s' because it is an AutoField and no primary_key is defined for this table." % (field.name, db_alias, model.__name__)
            continue
        model_meta[db_column] = augment_field_meta(field, queryset, model_meta[db_column], count=count, verbosity=verbosity)
        if verbosity > 1:
            print '%s (%s of type %s) has %s / %s (%3.1f%%) distinct values between %s and %s, excluding %s nulls.' % (
                field.name, db_column, model_meta[db_column]['type'], model_meta[db_column]['num_distinct'],
                count, 100. * (model_meta[db_column]['num_distinct'] or 0) / (count or 1),
                repr(model_meta[db_column]['min']), repr(model_meta[db_column]['max']), model_meta[db_column]['num_null'])
    return model_meta
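# Hedged usage sketch (the model and metadata dict below are hypothetical): model_meta
# is expected to already be keyed by db_column, with 'primary_key', 'type', 'min',
# 'max', 'num_distinct', and 'num_null' entries that this function fills in further.
#
#   meta = augment_model_meta(SalesModel, 'default', meta,
#                             count=SalesModel.objects.count(), verbosity=1)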
def append_urls(local, app_names=None):
    app_names = listify(app_names or basename(dirname(local.get('__file__', None))))
    urlpatterns = local.get('urlpatterns', patterns(''))
    for app_name in app_names:
        print 'Composing REST API URLs (Django urlconf entries) for app named %r' % app_name
        views_name = app_name + '.views'
        app_module = __import__(views_name)
        app = get_app(app_name)
        for Model in get_models(app):
            model_name = Model.__name__
            View = app_module.views.__dict__[model_name + 'List']
            urlpatterns += patterns('', url(r'^' + app_name + r'/' + model_name, View.as_view()))
    local['urlpatterns'] = urlpatterns
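# Hedged usage sketch for append_urls(): called from a project urls.py, it registers a
# <app>/<Model> list endpoint for every model in each app, assuming each app's views
# module defines a <Model>List view (as create_model_viewsets() below produces). The
# app name is hypothetical:
#
#   # urls.py
#   append_urls(locals(), app_names=['crawler'])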
def create_model_viewsets(local, app_names=None):
    app_names = listify(app_names or os.path.basename(os.path.dirname(local.get('__file__', None))))
    for app_name in app_names:
        app = get_app(app_name)
        for Model in get_models(app):

            class KitchenSinkFilter(more_django_filters.FilterSet):
                class Meta:
                    model = Model
                    # fields = tuple(f.name for f in model._meta.fields)
            # KitchenSinkFilter.__doc__ = "Filter (query) for records the database.table %s.%s\n%s" % (app_name, Model.__name__, Model.__doc__)

            class KitchenSinkSerializer(serializers.ModelSerializer):
                class Meta:
                    model = KitchenSinkFilter.Meta.model

            class KitchenSinkList(generics.ListAPIView):
                __doc__ = ("Filtered list of database records (table rows) for the database.table "
                           "<strong>%s.%s</strong>\n<br>\n%s" % (app_name, Model.__name__, Model.__doc__))
                model = KitchenSinkFilter.Meta.model
                serializer_class = KitchenSinkSerializer
                # filter_fields = ('acctno', 'whse', 'status', 'partno', 'date_time', 'reference', 'return_days')
                filter_class = KitchenSinkFilter

                class Meta:
                    model = Model
                    fields = tuple(f.name for f in model._meta.fields)

            KitchenSinkList.__name__ = Model.__name__ + 'List'
            local[KitchenSinkList.__name__] = KitchenSinkList

            class KitchenSinkViewSet(viewsets.ModelViewSet):
                serializer_class = KitchenSinkSerializer
                model = KitchenSinkFilter.Meta.model
                filter_fields = tuple(f.name for f in model._meta.fields)
                order_by = tuple(f.name for f in model._meta.fields)

            KitchenSinkViewSet.__name__ = Model.__name__ + 'ViewSet'
            local[KitchenSinkViewSet.__name__] = KitchenSinkViewSet
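# Hedged usage sketch for create_model_viewsets(): called from an app's views.py with
# locals(), it injects a <Model>List API view and a <Model>ViewSet for every model in
# the app, which append_urls() above can then wire into urlpatterns. The app name is
# hypothetical:
#
#   # myapp/views.py
#   create_model_viewsets(locals(), app_names=['myapp'])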
def animate_panel(panel, keys=None, columns=None, interval=1000, blit=False, titles='', path='animate_panel',
                  xlabel='Time', ylabel='Value', ext='gif', replot=False, linewidth=3, close=False, fontsize=24,
                  background_color='white', alpha=1, figsize=(12, 8), xlabel_rotation=-25,
                  plot_kwargs=(('rotation', 30),), verbosity=1, **video_kwargs):
    """Animate a pandas.Panel by flipping through plots of the data in each dataframe

    Arguments:
      panel (pandas.Panel): Pandas Panel of DataFrames to animate (each DataFrame is an animation video frame)
      keys (list of str): ordered list of panel keys (pages) to animate
      columns (list of str): ordered list of data series names to include in plot for each video frame
      interval (int): number of milliseconds between video frames
      titles (str or list of str): titles to place in plot on each data frame.
        default = `keys` so that titles change with each frame
      path (str): path and base file name to save *.mp4 animation video ('' to not save)
      kwargs (dict): pass-through kwargs for `animation.FuncAnimation(...).save(path, **kwargs)`
        (Not used if `not path`)

    TODO:
      - Work with other 3-D data formats:
        - dict (sorted by key) or OrderedDict
        - list of 2-D arrays/lists
        - 3-D arrays/lists
        - generators of 2-D arrays/lists
        - generators of generators of lists/arrays?
      - Write json and html5 files for d3 SVG line plots with transitions!

    >>> x = np.arange(0, 2*np.pi, 0.05)
    >>> panel = pd.Panel(dict((i, pd.DataFrame({
    ...     'T=10': np.sin(x + i/10.),
    ...     'T=7':  np.sin(x + i/7.),
    ...     'beat': np.sin(x + i/10.) + np.sin(x + i/7.),
    ...     }, index=x)
    ...     ) for i in range(50)))
    >>> animate_panel(panel, interval=200, path='animate_panel_test')  # doctest: +ELLIPSIS, +NORMALIZE_WHITESPACE
    Drawing frames for a ".gif" animation...
    Saving video to animate_panel_test.gif...
              T=10       T=7      beat
    0.00  0.000000  0.000000  0.000000
    0.05  0.049979  0.049979  0.099958
    ...
    [126 rows x 3 columns]
    """
    plot_kwargs = dict(plot_kwargs or {})
    ext_kwargs = {
        'mp4': {'writer': 'ffmpeg', 'codec': 'mpeg4', 'dpi': 100, 'bitrate': 2000},
        'gif': {'writer': 'imagemagick', 'dpi': 100, 'bitrate': 2000},
        'imagemagic.gif': {'writer': 'imagemagick_gif', 'dpi': 100, 'bitrate': 2000},
    }
    ext = str(ext).lower().strip() or 'gif'
    default_kwargs = ext_kwargs.get(ext, {})

    keys = keys or list(panel.keys())
    if titles:
        titles = listify(titles)
        if len(titles) == 1:
            titles *= len(keys)
    else:
        titles = keys
    titles = dict((k, title) for k, title in zip(keys, titles))
    columns = columns or list(panel[keys[0]].columns)

    fig, ax = plt.subplots(figsize=figsize)
    fig.patch.set_facecolor(background_color)
    fig.patch.set_alpha(alpha)

    i = 0
    df = panel[keys[i]]
    x = df.index.values
    y = df[columns].values
    lines = ax.plot(x, y)
    ax.grid('on')
    ax.patch.set_facecolor(background_color)
    ax.patch.set_alpha(alpha)
    ax.title.set_text(titles[keys[0]])
    ax.title.set_fontsize(fontsize)
    ax.title.set_fontweight('bold')
    ax.xaxis.label.set_text(xlabel)
    plt.setp(ax.get_xticklabels(), rotation=xlabel_rotation)
    ax.yaxis.label.set_text(ylabel)
    ax.legend(columns)

    def animate(k):
        df = panel[k]
        x = df.index.values
        y = df[columns].values.T
        if replot:
            # plt.cla()
            # fig, ax = plt.subplots(figsize=figsize)
            fig = ax.figure
            fig.patch.set_facecolor(background_color)
            fig.patch.set_alpha(alpha)
            lines = ax.plot(x, y.T, linewidth=linewidth)
            ax.grid('on')
            ax.patch.set_facecolor(background_color)
            ax.patch.set_alpha(alpha)
            ax.title.set_text(titles[k])
            ax.title.set_fontsize(fontsize)
            ax.title.set_fontweight('bold')
            ax.xaxis.label.set_text(xlabel)
            plt.setp(ax.get_xticklabels(), rotation=xlabel_rotation)
            ax.yaxis.label.set_text(ylabel)
            ax.legend(columns)
        else:
            lines = ax.lines
            fig = ax.figure
            for i in range(len(lines)):
                lines[i].set_xdata(x)  # all lines have to share the same x-data
                lines[i].set_ydata(y[i])  # update the data, don't replot a new line
                lines[i].set_linewidth(linewidth)
                lines[i].figure.set_facecolor(background_color)
                lines[i].figure.set_alpha(alpha)
                lines[i].axes.patch.set_facecolor(background_color)
                lines[i].axes.patch.set_alpha(alpha)
            ax.patch.set_facecolor(background_color)
            ax.figure.patch.set_alpha(alpha)
            ax.title.set_text(titles[k])
            ax.title.set_fontsize(fontsize)
            ax.title.set_fontweight('bold')
        if blit:
            return lines

    # FIXME: doesn't work with ext=mp4
    # init_func to mask out pixels to be redrawn/cleared which speeds redrawing of plot
    def mask_lines():
        if verbosity > 0:
            print('Initializing mask_lines. . .')
        df = panel[0]
        x = df.index.values
        y = df[columns].values.T
        for i in range(len(lines)):
            # FIXME: why are x-values used to set the y-data coordinates of the mask?
            lines[i].set_xdata(np.ma.array(x, mask=True))
            lines[i].set_ydata(np.ma.array(y[i], mask=True))
        return lines

    if verbosity > 0:
        print('Drawing frames for a ".{0}" animation{1}. . .'.format(ext, ' with blitting' if blit else ''))
    animate(keys[0])
    ani = animation.FuncAnimation(fig, animate, keys, interval=interval, blit=blit)  # , init_func=mask_lines, blit=True)

    kwargs = dict(default_kwargs)
    for k, v in six.iteritems(default_kwargs):
        kwargs[k] = video_kwargs.get(k, v)
    # if 'bitrate' in kwargs:
    #     kwargs['bitrate'] = min(kwargs['bitrate'], int(8e5 / interval))
    #     # a low information rate (long interval) might make it impossible to achieve a higher bitrate

    if path and isinstance(path, basestring):
        path += '.{0}'.format(ext)
        if verbosity > 0:
            print('Saving video to {0}. . .'.format(path))
        ani.save(path, **kwargs)
    if close:
        plt.close(fig)
    return df
def index_model_field_batches(model_or_queryset, key_fields=['model_number', 'serial_number'], value_fields=['pk'],
                              key_formatter=lambda x: str.lstrip(str.strip(str(x or '')), '0'),
                              value_formatter=lambda x: str.strip(str(x)),
                              batch_len=10000, limit=100000000, verbosity=1):
    '''Like index_model_field except uses 50x less memory and 10x more processing cycles

    Returns 2 dicts where both the keys and values are tuples:

        target_index = {(<key_fields[0]>, <key_fields[1]>, ...): (<value_fields[0]>,)}
            for all distinct model-serial pairs in the Sales queryset
        target_dupes = {(<key_fields[0]>, <key_fields[1]>, ...): [(<value_fields[1]>,), (<value_fields[2]>,), ...]}
            with all the duplicates except the first pk already listed above
    '''
    qs = djdb.get_queryset(model_or_queryset)
    N = qs.count()
    if verbosity:
        print 'Indexing %d rows (database records) to aid in finding record %r values using the field %r.' % (N, value_fields, key_fields)
    index, dupes, rownum = {}, {}, 0
    pbar, rownum = None, 0
    if verbosity and N > min(1000000, max(0, 100000**(1./verbosity))):
        widgets = [pb.Counter(), '/%d rows: ' % N, pb.Percentage(), ' ', pb.RotatingMarker(), ' ', pb.Bar(), ' ', pb.ETA()]
        pbar = pb.ProgressBar(widgets=widgets, maxval=N).start()
    # to determine the type of the field value and decide whether to strip() or normalize in any way
    # obj0 = qs.filter(**{field + '__isnull': False}).all()[0]
    value_fields = util.listify(value_fields)
    key_fields = util.listify(key_fields)
    for batch in djdb.generate_queryset_batches(qs, batch_len=batch_len, verbosity=verbosity):
        for obj in batch:
            # normalize the key
            keys = []
            for kf in key_fields:
                k = getattr(obj, kf)
                keys += [key_formatter(k or '')]
            values = []
            keys = tuple(keys)
            for vf in value_fields:
                v = getattr(obj, vf)
                values += [value_formatter(v or '')]
            values = tuple(values)
            if keys in index:
                dupes[keys] = dupes.get(keys, []) + [values]
            else:
                index[keys] = values
            if pbar:
                pbar.update(rownum)
            rownum += 1
            if rownum >= limit:
                break
    if pbar:
        pbar.finish()
    if verbosity:
        print 'Found %d duplicate %s values among the %d records or %g%%' % (len(dupes), key_fields, len(index), len(dupes)*100./(len(index) or 1.))
    return index, dupes
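# Hedged usage sketch (the Sale model is hypothetical): build an index mapping
# (model_number, serial_number) key tuples to the first matching pk, plus a dict of
# any duplicate keys encountered.
#
#   index, dupes = index_model_field_batches(
#       Sale.objects.using('default'),
#       key_fields=['model_number', 'serial_number'],
#       value_fields=['pk'],
#       verbosity=1)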
def simulate(t=1000, poly=(0.,), sinusoids=None, sigma=0, rw=0, irw=0, rrw=0):
    """Simulate a random signal with seasonal (sinusoids), linear and quadratic trend, RW, IRW, and RRW

    Arguments:
      t (int or list of float): number of samples or time vector, default = 1000
      poly (list of float): polynomial coefficients (in decreasing "order") passed to `numpy.polyval`
        i.e. poly[0]*x**(N-1) + ... + poly[N-1]
      sinusoids (list of list): [[period], [amplitude, period], or [ampl., period, phase]]

    >>> len(simulate(poly=(0,), rrw=1))
    1000
    >>> simulate(t=range(3), poly=(1, 2))  # doctest: +NORMALIZE_WHITESPACE
    0    2
    1    3
    2    4
    dtype: float64
    >>> all(simulate(t=50, sinusoids=((1, 2, 3),)) == simulate(t=range(50), sinusoids=((1, 2, 3),)))
    True
    >>> any(simulate(t=100))
    False
    >>> abs(simulate(sinusoids=42.42).values[1] + simulate(sinusoids=42.42).values[-1]) < 1e-10
    True
    >>> simulate(t=17, sinusoids=[42, 16]).min()
    -42.0
    >>> all((simulate(t=range(10), sinusoids=(1, 9, 4.5)) + simulate(t=10, sinusoids=(1, 9))).abs() < 1e-10)
    True
    """
    if t and isinstance(t, int):
        t = np.arange(t, dtype=np.float64)
    else:
        t = np.array(t, dtype=np.float64)
    N = len(t)
    poly = poly or (0.,)
    poly = listify(poly)
    y = np.polyval(poly, t)
    sinusoids = listify(sinusoids or [])
    if any(isinstance(ATP, (int, float)) for ATP in sinusoids):
        sinusoids = [sinusoids]
    for ATP in sinusoids:
        # default period is 1 more than the length of the simulated series (no values of the cycle are repeated)
        T = (t[-1] - t[0]) * N / (N - 1.)
        # default amplitude is 1 and phase is 0
        A, P = 1., 0
        try:
            A, T, P = ATP
        except (TypeError, ValueError):
            try:
                A, T = ATP
            except (TypeError, ValueError):
                A = ATP[0]
        y += A * np.sin(2 * np.pi * (t - P) / T)
    if sigma:
        y += np.random.normal(0.0, float(sigma), N)
    if rw:
        y += np.random.normal(0.0, float(rw), N).cumsum()
    if irw:
        y += np.random.normal(0.0, float(irw), N).cumsum().cumsum()
    if rrw:
        y += np.random.normal(0.0, float(rrw), N).cumsum().cumsum().cumsum()
    return pd.Series(y, index=t)