def plot_gs_editor_fraction(basedir):
    """Write the Global South editor-fraction datasource and graph.

    Reads datafiles/global_south.csv from ``basedir``, derives the Global
    South share of editors for each activity cohort, and writes the Limn
    datasource and graph files back into ``basedir``.
    """
    csv_path = os.path.join(basedir, 'datafiles', 'global_south.csv')
    df = pd.read_csv(csv_path, index_col='date', parse_dates=['date'])
    # one fraction column per activity cohort; denominator cast to float so
    # the division is not integer division
    for cohort in ('100+', '5+', 'all'):
        denominator = (df['Global South (%s)' % cohort]
                       + df['Global North (%s)' % cohort]
                       + df['Unknown (%s)' % cohort]).apply(float)
        df['Global South Fraction (%s)' % cohort] = (
            df['Global South (%s)' % cohort] / denominator)
    frac_cols = ['Global South Fraction (100+)',
                 'Global South Fraction (5+)',
                 'Global South Fraction (all)']
    ds_frac = limnpy.DataSource(limn_id='global_south_editor_fractions',
                                limn_name='Global South Editor Fractions',
                                limn_group=LIMN_GROUP,
                                data=df[frac_cols])
    ds_frac.write(basedir)
    g = ds_frac.get_graph(metric_ids=['Global South Fraction (5+)'],
                          title='Global South Active Editor Fraction',
                          graph_id='global_south_editor_fractions')
    g.write(basedir)
def write_project_top_k_mysql(proj, cursor, basedir, k=10):
    """Write a Limn datasource/graph for the top-k countries of a project.

    Keyword arguments:
    proj -- string. Project database name (e.g. 'en' for enwiki).
    cursor -- database cursor. Used to obtain the data.
    basedir -- string. Path to the data repository to write into.
    k -- int. Number of top countries to include (default 10).
    """
    logger.debug('entering')
    limn_id = proj + '_top%d' % k
    limn_name = '%s Editors by Country (top %d)' % (proj.upper(), k)
    # find the k countries with the most edits over full 30-day windows
    if sql.paramstyle == 'qmark':
        top_k_query = """SELECT country
            FROM erosen_geocode_active_editors_country
            WHERE project=? AND cohort='all'
            AND end = start+INTERVAL 30 day
            GROUP BY country
            ORDER BY SUM(count) DESC, end DESC, country
            LIMIT ?"""
    elif sql.paramstyle == 'format':
        top_k_query = """SELECT country
            FROM erosen_geocode_active_editors_country
            WHERE project=%s AND cohort='all'
            AND end = start+INTERVAL 30 day
            GROUP BY country
            ORDER BY SUM(count) DESC, end DESC, country
            LIMIT %s"""
    # BUG FIX: log query and args separately -- interpolating with `%` raised
    # TypeError for the qmark style (no %s placeholders in that string)
    logger.debug('top k query: %s, args: %s', top_k_query, (proj, k))
    cursor.execute(top_k_query, (proj, k))
    # mysqldb first converts all args to str
    top_k = map(itemgetter('country'), cursor.fetchall())
    logger.debug('proj: %s, top_k countries: %s', proj, top_k)
    if not top_k:
        logger.warning('no country edits found for proj: %s', proj)
        return
    # fetch the full rows for just those countries.  BUG FIX: the qmark
    # branch previously produced `country IN  ? ,  ? ` without parentheses,
    # which is invalid SQL -- both branches now parenthesize the IN list.
    if sql.paramstyle == 'qmark':
        country_fmt = '(%s)' % ', '.join([' ? '] * len(top_k))
        query = """ SELECT * FROM erosen_geocode_active_editors_country
            WHERE project=?
            AND end = start + INTERVAL 30 day
            AND country IN """
    elif sql.paramstyle == 'format':
        country_fmt = '(%s)' % ', '.join([' %s '] * len(top_k))
        logger.debug('country_fmt: %s', country_fmt)
        query = """ SELECT * FROM erosen_geocode_active_editors_country
            WHERE project=%s
            AND end = start + INTERVAL 30 day
            AND country IN """
    query = query + country_fmt
    args = [proj]
    args.extend(top_k)
    if sql.paramstyle == 'format':
        # only the format style can be interpolated for display; doing this
        # for qmark raised TypeError ("not all arguments converted")
        logger.debug('top_k edit count query: %s', query % tuple(args))
    cursor.execute(query, args)
    proj_rows = cursor.fetchall()
    logger.debug('retrieved %d rows', len(proj_rows))
    limn_rows = make_limn_rows(proj_rows, 'country')
    source = limnpy.DataSource(limn_id, limn_name, limn_rows,
                               limn_group=LIMN_GROUP)
    source.write(basedir=basedir)
    source.write_graph(basedir=basedir)
def write_overall_mysql(projects, cursor, basedir):
    """Write the overall editors-by-language datasource plus a monthly copy.

    The full daily datasource gets both a datasource and a graph; the
    monthly variant (first-of-month rows only) gets just a datasource.
    """
    logger.info('writing overall datasource')
    limn_id = 'overall_by_lang'
    limn_name = 'Overall Editors by Language'
    cursor.execute(""" SELECT * FROM erosen_geocode_active_editors_world""")
    overall_rows = cursor.fetchall()
    limn_rows = make_limn_rows(overall_rows, 'project')
    # keep only first-of-month rows for the monthly variant
    monthly_limn_rows = [row for row in limn_rows if row['date'].day == 1]
    source = limnpy.DataSource(limn_id, limn_name, limn_rows,
                               limn_group=LIMN_GROUP)
    source.write(basedir=basedir)
    source.write_graph(basedir=basedir)
    monthly_source = limnpy.DataSource(limn_id + '_monthly',
                                       limn_name + ' Monthly',
                                       monthly_limn_rows,
                                       limn_group=LIMN_GROUP)
    monthly_source.write(basedir=basedir)
def write_project_summed_mysql(proj, cursor, basedir):
    """Write out per project sums of editors

    Keyword arguments:
    proj -- string. The name of the project in the database (e.g.: 'en'
    for enwiki).
    cursor -- database connection. Used to obtain the data.
    basedir -- string. Path to the data repository to store the computed
    data in.
    """
    # TODO: dumb copy/paste of write_project_mysql to get daily per project
    # active editor counts.  Please apply fixes to this function also to
    # write_project_mysql.
    logger.debug('writing summed project datasource for: %s', proj)
    limn_id = proj + 'wiki_editor_counts'
    limn_name = proj + 'wiki editors (Tentative)'
    if sql.paramstyle == 'qmark':
        query = """SELECT cohort, end,
            CONCAT(project, 'wiki') AS wikified_project, SUM(count)
            FROM erosen_geocode_active_editors_country
            WHERE project = ?
            AND end = start + INTERVAL 30 day
            GROUP BY cohort, end, project"""
        logger.debug('making query: %s', query)
    elif sql.paramstyle == 'format':
        query = """SELECT cohort, end,
            CONCAT(project, 'wiki') AS wikified_project, SUM(count)
            FROM erosen_geocode_active_editors_country
            WHERE project = %s
            AND end = start + INTERVAL 30 day
            GROUP BY cohort, end, project"""
    cursor.execute(query, [proj])
    proj_rows = cursor.fetchall()
    logger.debug('len(proj_rows): %d', len(proj_rows))
    if not proj_rows and sql.paramstyle == 'format':
        logger.debug('No results for query: %s', query % proj)
        return
    limn_rows = make_limn_rows(proj_rows, 'wikified_project', 'SUM(count)')
    source = limnpy.DataSource(limn_id, limn_name, limn_rows,
                               limn_group=LIMN_GROUP)
    source.write(basedir=basedir)
    # BUG FIX: the metric id was built from the undefined name `project`,
    # which raised NameError at runtime; the parameter is `proj`.
    graph = source.get_graph(metric_ids=['%swiki (5+)' % proj])
    graph.graph['desc'] = """This graph currently mis-reports by counting each editor once for each country associated to the IP addresses used by the editor. """
    drop_callout_widget(graph)
    graph.write(basedir)
def plot_active_editor_totals(basedir_source, basedir_destination):
    """Write out files for 'Active Editors Total' graph

    Keyword arguments:
    basedir_source -- string. Path to the data repository to read
    global_south.csv from
    basedir_destination -- string. Path to the data repository to store
    the comupted data in

    Computes the total number of active (5+) editors from the
    global_south.csv file alone and writes the datafile, datasource and
    graph files needed to show the total in Limn into the corresponding
    subdirectories of basedir_destination.
    """
    csv_path = os.path.join(basedir_source, 'datafiles', 'global_south.csv')
    df = pd.read_csv(csv_path, index_col='date', parse_dates=['date'])
    # total = all three 5+ cohorts, cast to float for Limn
    cohort_sum = (df['Global South (5+)']
                  + df['Global North (5+)']
                  + df['Unknown (5+)'])
    df['Active Editors Total'] = cohort_sum.apply(float)
    ds_total = limnpy.DataSource(limn_id='active_editors_total',
                                 limn_name='Active Editors Total',
                                 limn_group=LIMN_GROUP,
                                 data=df[['Active Editors Total']])
    ds_total.write(basedir_destination)
    g = ds_total.get_graph(metric_ids=['Active Editors Total'],
                           title='Active Editors Total (Tentative)',
                           graph_id='active_editors_total')
    g.graph['desc'] = """This graph currently over-reports by counting each active editor once for each distinct pair of project and country associated to the IP addresses used by the editor. Also, this graph currently only considers the following projects (no wikidata, no wiktionaries, no wikiquotes, no wikibooks): """ + ("wiki, ".join(sorted(get_projects()))) + """wiki """
    drop_callout_widget(g)
    g.write(basedir_destination)
def write_project_country_language(project, cursor, basedir):
    """Write one Limn datasource/graph per country for ``project``.

    Keyword arguments:
    project -- string. Project database name (e.g. 'en' for enwiki).
    cursor -- database cursor. Used to obtain the data.
    basedir -- string. Path to the data repository to write into.
    """
    # CONSISTENCY FIX: every other write_* helper branches on
    # sql.paramstyle; this one hard-coded 'format' placeholders and would
    # break under a qmark driver.  The query is also loop-invariant, so it
    # is built once up front.
    if sql.paramstyle == 'qmark':
        query = """SELECT country, end, cohort, count
            FROM erosen_geocode_active_editors_country
            WHERE project=? AND country=?"""
    elif sql.paramstyle == 'format':
        query = """SELECT country, end, cohort, count
            FROM erosen_geocode_active_editors_country
            WHERE project=%s AND country=%s"""
    for country in get_countries(project, cursor):
        limn_id = '%s_%s' % (project,
                             country.replace(' ', '_').replace('/', '-').lower())
        limn_name = '%s Editors in %s' % (project.upper(), country.title())
        cursor.execute(query, (project, country))
        country_rows = cursor.fetchall()
        limn_rows = make_limn_rows(country_rows, 'country')
        source = limnpy.DataSource(limn_id, limn_name, limn_rows,
                                   limn_group=LIMN_GROUP)
        source.write(basedir=basedir)
        source.write_graph(basedir=basedir)
def write_group_mysql(group_key, country_data, cursor, basedir):
    """Aggregate per-country editor counts by ``group_key`` and write to Limn.

    Keyword arguments:
    group_key -- string. Metadata field used to group countries.
    country_data -- iterable of country metadata dicts; rows lacking
    ``group_key`` are skipped.
    cursor -- database cursor. Used to obtain the data.
    basedir -- string. Path to the data repository to write into.
    """
    logger.debug('writing group with group_key: %s', group_key)
    # keep only rows that actually carry the grouping field, then group
    # them (groupby needs its input pre-sorted on the same key)
    country_data = [row for row in country_data if group_key in row]
    country_data.sort(key=itemgetter(group_key))
    # COMPATIBILITY FIX: the original used `lambda (key, rows):` tuple
    # unpacking, which is a SyntaxError on Python 3; a dict comprehension
    # is equivalent and works on both.
    groups = {key: [row[META_DATA_COUNTRY_FIELD] for row in rows]
              for key, rows in itertools.groupby(country_data,
                                                 key=itemgetter(group_key))}
    all_rows = []
    for group_val, countries in groups.items():
        logger.debug('processing group_val: %s', group_val)
        # only the placeholder token differs between paramstyles, so the
        # (previously duplicated) query text is shared
        if sql.paramstyle == 'qmark':
            countries_fmt = ', '.join([' ? '] * len(countries))
        elif sql.paramstyle == 'format':
            countries_fmt = ', '.join([' %s '] * len(countries))
        group_query = """SELECT end, cohort, SUM(count)
            FROM erosen_geocode_active_editors_country
            WHERE country IN (%s)
            AND end = start + INTERVAL 30 day
            GROUP BY end, cohort""" % countries_fmt
        cursor.execute(group_query, tuple(countries))
        # tag each aggregated row with the group value it belongs to
        group_rows = [dict(row) for row in cursor.fetchall()]
        for row in group_rows:
            row[group_key] = group_val
        all_rows.extend(group_rows)
    limn_rows = make_limn_rows(all_rows, group_key, count_key='SUM(count)')
    limn_id = group_key.replace(' ', '_').lower()
    limn_name = group_key.title()
    logger.debug('limn_rows: %s', limn_rows)
    source = limnpy.DataSource(limn_id, limn_name, limn_rows,
                               limn_group=LIMN_GROUP)
    source.write(basedir=basedir)
    source.write_graph(basedir=basedir)
def write_project_mysql(proj, cursor, basedir, country_graphs=False):
    """Write the by-country active editor datasource/graph for one project.

    When ``country_graphs`` is true, additionally writes one graph per
    country column of the datasource.
    """
    # TODO: This function's structure got copy/pasted to
    # write_project_summed_mysql. Please apply fixes to this function also to
    # write_project_summed_mysql.
    logger.debug('writing project datasource for: %s', proj)
    limn_id = proj + '_all'
    limn_name = '%s Editors by Country' % proj.upper()
    if sql.paramstyle == 'qmark':
        query = """ SELECT * FROM erosen_geocode_active_editors_country
            WHERE project=?
            AND end = start + INTERVAL 30 day"""
        logger.debug('making query: %s', query)
    elif sql.paramstyle == 'format':
        query = """ SELECT * FROM erosen_geocode_active_editors_country
            WHERE project=%s
            AND end = start + INTERVAL 30 day"""
    cursor.execute(query, [proj])
    rows = cursor.fetchall()
    logger.debug('len(proj_rows): %d', len(rows))
    if not rows and sql.paramstyle == 'format':
        logger.debug('GOT NUTHIN!: %s', query % proj)
        return
    limn_rows = make_limn_rows(rows, 'country')
    source = limnpy.DataSource(limn_id, limn_name, limn_rows,
                               limn_group=LIMN_GROUP)
    source.write(basedir=basedir)
    source.write_graph(basedir=basedir)
    if not country_graphs:
        return
    # construct single column graphs, one per country
    for country in source.data.columns[1:]:
        title = '%s Editors in %s' % (proj.upper(), country)
        slug = re.sub('\W+', ' ', country).strip().replace(' ', '_').lower()
        source.write_graph(metric_ids=[country],
                           basedir=basedir,
                           title=title,
                           graph_id='%s_%s' % (proj, slug))
def main():
    """
    Here is a semi-complex use case which pivots the data (transformed into a
    doctest compatible version):

    >>> import StringIO, os, csv, shutil
    >>> from subprocess import call
    >>>
    >>> os.mkdir('doctest_tmp')
    >>> f = open('doctest_tmp/limnify1.tsv', 'w')
    >>> data = [['Hour', 'Continent', 'Count']]
    >>> data.append(['2013-01-01_00','Asia', '535984'])
    >>> data.append(['2013-01-01_00','Africa', '20536'])
    >>> f = open('doctest_tmp/limnify1.tsv', 'w')
    >>> csv.writer(f,dialect='excel-tab').writerows(data)
    >>> f.close()
    >>>
    >>> call('''limnify --delim='\\t' --pivot --datecol=0 --datefmt=%Y-%m-%d_%H doctest_tmp/limnify1.tsv''', shell=True)
    0
    >>> shutil.rmtree('doctest_tmp')
    """
    # column selectors may be given as names or as numeric indices
    int_or_str = lambda s: int(s) if isinstance(s, str) and s.isdigit() else s
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('data',
                        help='name of file to be limnified',
                        default='STDIN',
                        nargs='?')
    parser.add_argument('--delim',
                        default='\t',
                        help='delim to use for input file')
    parser.add_argument('--header',
                        default=None,
                        nargs='+',
                        help='this is a space separated list of names to use as the header row. '
                             'If your data doesn\'t already have a header row you will need to '
                             'pass in a list of names to use, otherwise it will assume the first '
                             'row is a header and then produce confusing data sources. '
                             'Remember, these names will be displayed in the graph editing '
                             'interface')
    parser.add_argument('--datecol',
                        type=int_or_str,
                        default=0,
                        help='the date column name or index--required if it is different from `date`')
    parser.add_argument('--datefmt',
                        help='format to use with datetime.strptime, default uses dateutil.parser.parse')
    parser.add_argument('--pivot',
                        default=False,
                        action='store_true',
                        help='whether to try and pivot the data (only supports sum aggregation for now)')
    parser.add_argument('--metriccol',
                        type=int_or_str,
                        default=1,
                        help='the column name or index to use for creating the column (metric) names when pivoting')
    parser.add_argument('--valcol',
                        type=int_or_str,
                        default=2,
                        help='the column in which to find the actual data to be plotted when pivoting')
    parser.add_argument('--basedir',
                        default='.',
                        help='directory in which to place the output datasources, datafiles and graphs directories')
    # BUG FIX: the original passed type=' '.join, but argparse applies
    # `type` to EACH word separately, so 'My Name' became ['M y', 'N a m e'].
    # The words are joined once after parsing instead (see below).
    parser.add_argument('--name',
                        nargs='+',
                        help='name of datasource which will be displayed in the UI')
    parser.add_argument('--id',
                        help='the slug / id used to uniquely identify the datasource within a limn installation')
    # nargs='?'/const=True lets the flag be used bare while keeping the old
    # `--write_graph VALUE` form working; NOTE any non-empty VALUE (even
    # the string 'False') is truthy, as before.
    parser.add_argument('--write_graph',
                        nargs='?',
                        const=True,
                        default=False,
                        help='whether to write a graph file containing all columns from the datasource')
    args = parser.parse_args()
    # pprint.pprint(vars(args))

    if args.datefmt:
        date_parser = lambda s: datetime.datetime.strptime(s, args.datefmt)
    else:
        date_parser = dateutil.parser.parse

    if args.data == 'STDIN':
        data = sys.stdin
    else:
        data = args.data

    if not args.pivot:
        if args.header:
            df = pd.read_table(data, sep=args.delim,
                               parse_dates=[args.datecol],
                               date_parser=date_parser,
                               names=args.header)
        else:
            df = pd.read_table(data, sep=args.delim,
                               parse_dates=[args.datecol],
                               date_parser=date_parser)
        # resolve a numeric date column index to its name
        if isinstance(args.datecol, int):
            args.datecol = df.columns[args.datecol]
    else:
        if args.header:
            df_long = pd.read_table(data, sep=args.delim,
                                    parse_dates=[args.datecol],
                                    date_parser=date_parser,
                                    names=args.header)
        else:
            df_long = pd.read_table(data, sep=args.delim,
                                    parse_dates=[args.datecol],
                                    date_parser=date_parser)
        # resolve all numeric column indices to names before pivoting
        if isinstance(args.datecol, int):
            args.datecol = df_long.columns[args.datecol]
        if isinstance(args.metriccol, int):
            args.metriccol = df_long.columns[args.metriccol]
        if isinstance(args.valcol, int):
            args.valcol = df_long.columns[args.valcol]
        # NOTE(review): rows=/cols= is the pre-0.14 pandas pivot_table API
        # (newer pandas expects index=/columns=); left as-is to match the
        # pandas version this code was written against -- confirm before
        # upgrading pandas.
        df = pd.pivot_table(df_long,
                            rows=[args.datecol],
                            cols=[args.metriccol],
                            values=args.valcol,
                            aggfunc=np.sum)

    sys.stderr.write('output data format (formatted by pandas.DataFrame version):\n%s\n' % df)

    if args.name is None:
        args.name = os.path.splitext(os.path.split(args.data)[1])[0]
    else:
        # join the space-separated words into a single display name
        args.name = ' '.join(args.name)
    if args.id is None:
        args.id = os.path.splitext(os.path.split(args.data)[1])[0]

    ds = limnpy.DataSource(args.id, args.name, df,
                           date_key=args.datecol, date_fmt=args.datefmt)
    ds.write(args.basedir)
    if args.write_graph:
        graph = ds.get_graph()
        graph.write(args.basedir)