def plot_gs_editor_fraction(basedir):
    """Write datasource and graph files for the Global South editor fraction.

    Reads datafiles/global_south.csv under *basedir*, computes the fraction
    of Global South editors (relative to Global South + Global North +
    Unknown) for the 100+, 5+ and all-editor cohorts, and writes the
    resulting Limn datasource plus a graph of the 5+ cohort back into
    *basedir*.
    """
    csv_path = os.path.join(basedir, 'datafiles', 'global_south.csv')
    df = pd.read_csv(csv_path, index_col='date', parse_dates=['date'])

    # One fraction column per cohort; denominator cast to float so the
    # division yields a ratio rather than integer division.
    frac_cols = []
    for cohort in ('100+', '5+', 'all'):
        denominator = (df['Global South (%s)' % cohort] +
                       df['Global North (%s)' % cohort] +
                       df['Unknown (%s)' % cohort]).apply(float)
        frac_col = 'Global South Fraction (%s)' % cohort
        df[frac_col] = df['Global South (%s)' % cohort] / denominator
        frac_cols.append(frac_col)
    df_frac = df[frac_cols]

    ds_frac = limnpy.DataSource(limn_id='global_south_editor_fractions',
                                limn_name='Global South Editor Fractions',
                                limn_group=LIMN_GROUP,
                                data=df_frac)
    ds_frac.write(basedir)
    g = ds_frac.get_graph(metric_ids=['Global South Fraction (5+)'],
                          title='Global South Active Editor Fraction',
                          graph_id='global_south_editor_fractions')
    g.write(basedir)
def write_project_top_k_mysql(proj, cursor, basedir, k=10):
    """Write a datasource and graph for a project's top-k editing countries.

    Keyword arguments:
    proj -- string. The name of the project in the database
        (e.g.: 'en' for enwiki).
    cursor -- database cursor. Used to obtain the data.
    basedir -- string. Path to the data repository to store the computed
        data in.
    k -- int. Number of top countries to include (default 10).
    """
    logger.debug('entering')
    limn_id = proj + '_top%d' % k
    limn_name = '%s Editors by Country (top %d)' % (proj.upper(), k)

    # First query: find the k countries with the most editors for this
    # project, considering only 30-day windows of the 'all' cohort.
    if sql.paramstyle == 'qmark':
        top_k_query = """SELECT country
                    FROM erosen_geocode_active_editors_country
                    WHERE project=? AND cohort='all' AND end = start+INTERVAL 30 day
                    GROUP BY country
                    ORDER BY SUM(count) DESC, end DESC, country
                    LIMIT ?"""
    elif sql.paramstyle == 'format':
        top_k_query = """SELECT country
                    FROM erosen_geocode_active_editors_country
                    WHERE project=%s AND cohort='all' AND end = start+INTERVAL 30 day
                    GROUP BY country
                    ORDER BY SUM(count) DESC, end DESC, country
                    LIMIT %s"""
        logger.debug('top k query: %s', top_k_query % (proj, k))
    cursor.execute(top_k_query,
                   (proj, k))  # mysqldb first converts all args to str
    top_k = map(itemgetter('country'), cursor.fetchall())
    logger.debug('proj: %s, top_k countries: %s', proj, top_k)
    if not top_k:
        # message fixed: previously read "not country edits found"
        logger.warning('no country edits found for proj: %s', proj)
        return

    # Second query: fetch all rows for those top-k countries.
    if sql.paramstyle == 'qmark':
        # BUG FIX: the IN list must be parenthesized; the old code produced
        # invalid SQL like "IN  ? ,  ?  AND ..." under qmark drivers.
        country_fmt = '(%s)' % ', '.join([' ? '] * len(top_k))
        query = """ SELECT * FROM erosen_geocode_active_editors_country WHERE project=? AND country IN %s AND end = start + INTERVAL 30 day"""
        query = query % country_fmt
    elif sql.paramstyle == 'format':
        country_fmt = '(%s)' % ', '.join([' %s '] * len(top_k))
        logger.debug('country_fmt: %s', country_fmt)
        query = """ SELECT * FROM erosen_geocode_active_editors_country WHERE project=%s  AND end = start + INTERVAL 30 day AND country IN """
        query = query + country_fmt
        args = [proj]
        args.extend(top_k)
        print_query = query % tuple(args)
        logger.debug('top_k edit count query: %s', print_query)
    cursor.execute(query, [
        proj,
    ] + top_k)
    proj_rows = cursor.fetchall()

    logger.debug('retrieved %d rows', len(proj_rows))
    limn_rows = make_limn_rows(proj_rows, 'country')
    source = limnpy.DataSource(limn_id,
                               limn_name,
                               limn_rows,
                               limn_group=LIMN_GROUP)
    source.write(basedir=basedir)
    source.write_graph(basedir=basedir)
def write_overall_mysql(projects, cursor, basedir):
    """Write the overall per-language editor datasource plus a monthly variant.

    Keyword arguments:
    projects -- unused here; kept for interface parity with callers.
    cursor -- database cursor. Used to obtain the data.
    basedir -- string. Path to the data repository to store the computed
        data in.
    """
    logger.info('writing overall datasource')
    limn_id = 'overall_by_lang'
    limn_name = 'Overall Editors by Language'

    cursor.execute(""" SELECT * FROM erosen_geocode_active_editors_world""")
    overall_rows = cursor.fetchall()

    limn_rows = make_limn_rows(overall_rows, 'project')
    # The monthly variant keeps only rows dated the first of a month.
    monthly_limn_rows = [row for row in limn_rows if row['date'].day == 1]

    source = limnpy.DataSource(limn_id,
                               limn_name,
                               limn_rows,
                               limn_group=LIMN_GROUP)
    source.write(basedir=basedir)
    source.write_graph(basedir=basedir)

    monthly_source = limnpy.DataSource(limn_id + '_monthly',
                                       limn_name + ' Monthly',
                                       monthly_limn_rows,
                                       limn_group=LIMN_GROUP)
    monthly_source.write(basedir=basedir)
def write_project_summed_mysql(proj, cursor, basedir):
    """Write out per project sums of editors

    Keyword arguments:
    proj -- string. The name of the project in the database
        (e.g.: 'en' for enwiki).
    cursor -- database connection. Used to obtain the data.
    basedir -- string. Path to the data repository to store the computed
        data in.
    """
    # TODO: dumb copy/paste of write_project_mysql to get daily per project
    # active editor counts. Please apply fixes to this function also to
    # write_project_mysql.
    logger.debug('writing summed project datasource for: %s', proj)
    limn_id = proj + 'wiki_editor_counts'
    limn_name = proj + 'wiki editors (Tentative)'

    if sql.paramstyle == 'qmark':
        query = """SELECT cohort, end, CONCAT(project, 'wiki') AS wikified_project, SUM(count)
                    FROM erosen_geocode_active_editors_country
                    WHERE project = ? AND end = start + INTERVAL 30 day
                    GROUP BY cohort, end, project"""
        logger.debug('making query: %s', query)
    elif sql.paramstyle == 'format':
        query = """SELECT cohort, end, CONCAT(project, 'wiki') AS wikified_project, SUM(count)
                    FROM erosen_geocode_active_editors_country
                    WHERE project = %s AND end = start + INTERVAL 30 day
                    GROUP BY cohort, end, project"""
    cursor.execute(query, [proj])
    proj_rows = cursor.fetchall()

    logger.debug('len(proj_rows): %d', len(proj_rows))
    if not proj_rows and sql.paramstyle == 'format':
        logger.debug('No results for query: %s', query % proj)
        return
    limn_rows = make_limn_rows(proj_rows, 'wikified_project', 'SUM(count)')
    source = limnpy.DataSource(limn_id,
                               limn_name,
                               limn_rows,
                               limn_group=LIMN_GROUP)
    source.write(basedir=basedir)
    # BUG FIX: this referenced the undefined name `project` (the parameter
    # is `proj`), raising NameError before the graph could be written.
    graph = source.get_graph(metric_ids=['%swiki (5+)' % proj])
    graph.graph['desc'] = """This graph currently mis-reports by counting each
editor once for each country associated to the IP addresses used by
the editor.
"""
    drop_callout_widget(graph)
    graph.write(basedir)
def plot_active_editor_totals(basedir_source, basedir_destination):
    """Write out files for 'Active Editors Total' graph

    Keyword arguments:
    basedir_source -- string. Path to the data repository to read
        global_south.csv from
    basedir_destination -- string. Path to the data repository to store
        the comupted data in

    Sums the Global South, Global North and Unknown 5+ cohorts from
    global_south.csv into a single 'Active Editors Total' series and
    writes the datafile, datasource, and graph files needed to show it
    in Limn, all under basedir_destination.
    """
    csv_path = os.path.join(basedir_source, 'datafiles', 'global_south.csv')
    frame = pd.read_csv(csv_path, index_col='date', parse_dates=['date'])
    total = (frame['Global South (5+)'] +
             frame['Global North (5+)'] +
             frame['Unknown (5+)']).apply(float)
    frame['Active Editors Total'] = total
    totals_frame = frame[['Active Editors Total']]

    ds_total = limnpy.DataSource(limn_id='active_editors_total',
                                 limn_name='Active Editors Total',
                                 limn_group=LIMN_GROUP,
                                 data=totals_frame)
    ds_total.write(basedir_destination)
    g = ds_total.get_graph(metric_ids=['Active Editors Total'],
                           title='Active Editors Total (Tentative)',
                           graph_id='active_editors_total')
    desc_head = """This graph currently over-reports by counting each
active editor once for each distinct pair of project and country
associated to the IP addresses used by the editor.

Also, this graph currently only considers the following projects
(no wikidata, no wiktionaries, no wikiquotes, no wikibooks):

    """
    desc_tail = """wiki

"""
    project_list = "wiki, ".join(sorted(get_projects()))
    g.graph['desc'] = desc_head + project_list + desc_tail
    drop_callout_widget(g)
    g.write(basedir_destination)
def write_project_country_language(project, cursor, basedir):
    """Write one datasource and graph per country for the given project.

    Keyword arguments:
    project -- string. The name of the project in the database
        (e.g.: 'en' for enwiki).
    cursor -- database cursor. Used to obtain the data.
    basedir -- string. Path to the data repository to store the computed
        data in.
    """
    for country in get_countries(project, cursor):
        limn_id = '%s_%s' % (project, country.replace(' ', '_').replace(
            '/', '-').lower())
        limn_name = '%s Editors in %s' % (project.upper(), country.title())
        # CONSISTENCY FIX: branch on sql.paramstyle like the sibling
        # write_*_mysql functions; the old version hard-coded the
        # 'format' placeholder style.
        if sql.paramstyle == 'qmark':
            query = """SELECT country, end, cohort, count FROM erosen_geocode_active_editors_country
                   WHERE project=? AND country=?"""
        elif sql.paramstyle == 'format':
            query = """SELECT country, end, cohort, count FROM erosen_geocode_active_editors_country
                   WHERE project=%s AND country=%s"""
        cursor.execute(query, (project, country))
        country_rows = cursor.fetchall()

        limn_rows = make_limn_rows(country_rows, 'country')
        source = limnpy.DataSource(limn_id,
                                   limn_name,
                                   limn_rows,
                                   limn_group=LIMN_GROUP)
        source.write(basedir=basedir)
        source.write_graph(basedir=basedir)
def write_group_mysql(group_key, country_data, cursor, basedir):
    """Aggregate editor counts by country group and write datasource/graph.

    Keyword arguments:
    group_key -- string. Metadata field used to group countries
        (e.g. a region name).
    country_data -- iterable of per-country metadata dicts; rows lacking
        group_key are ignored.
    cursor -- database cursor. Used to obtain the data.
    basedir -- string. Path to the data repository to store the computed
        data in.
    """
    logger.debug('writing group with group_key: %s', group_key)
    # Keep only rows carrying the grouping field; itertools.groupby needs
    # its input sorted by the same key.
    country_data = [row for row in country_data if group_key in row]
    country_data = sorted(country_data, key=itemgetter(group_key))
    grouped = itertools.groupby(country_data, key=itemgetter(group_key))
    # NOTE: replaced the Python-2-only tuple-unpacking lambda with a dict
    # comprehension that works on Python 2.7 and 3 alike.
    groups = {key: [row[META_DATA_COUNTRY_FIELD] for row in rows]
              for key, rows in grouped}

    # Both DB-API paramstyles share the same SQL text; only the placeholder
    # token differs, so the duplicated query strings were collapsed.
    if sql.paramstyle == 'qmark':
        placeholder = ' ? '
    elif sql.paramstyle == 'format':
        placeholder = ' %s '
    group_query = """SELECT end, cohort, SUM(count)
                 FROM erosen_geocode_active_editors_country
                 WHERE country IN (%s)
                 AND end = start + INTERVAL 30 day
                 GROUP BY end, cohort"""

    all_rows = []
    for group_val, countries in groups.items():
        logger.debug('processing group_val: %s', group_val)
        group_query_fmt = group_query % ', '.join([placeholder] * len(countries))
        cursor.execute(group_query_fmt, tuple(countries))
        group_rows = [dict(row) for row in cursor.fetchall()]
        # Tag each aggregated row with the group it belongs to.
        for row in group_rows:
            row.update({group_key: group_val})
        all_rows.extend(group_rows)

    limn_rows = make_limn_rows(all_rows, group_key, count_key='SUM(count)')
    limn_id = group_key.replace(' ', '_').lower()
    limn_name = group_key.title()
    logger.debug('limn_rows: %s', limn_rows)
    source = limnpy.DataSource(limn_id,
                               limn_name,
                               limn_rows,
                               limn_group=LIMN_GROUP)
    source.write(basedir=basedir)
    source.write_graph(basedir=basedir)
def write_project_mysql(proj, cursor, basedir, country_graphs=False):
    """Write the per-country editor datasource and graph for one project.

    Keyword arguments:
    proj -- string. The name of the project in the database
        (e.g.: 'en' for enwiki).
    cursor -- database cursor. Used to obtain the data.
    basedir -- string. Path to the data repository to store the computed
        data in.
    country_graphs -- bool. Also write one single-column graph per country
        (default False).
    """
    # TODO: This function's structure got copy/pasted to
    # write_project_summed_mysql. Please apply fixes to this function also to
    # write_project_summed_mysql.
    logger.debug('writing project datasource for: %s', proj)
    limn_id = proj + '_all'
    limn_name = '%s Editors by Country' % proj.upper()

    if sql.paramstyle == 'qmark':
        query = """ SELECT * FROM erosen_geocode_active_editors_country WHERE project=? AND end = start + INTERVAL 30 day"""
        logger.debug('making query: %s', query)
    elif sql.paramstyle == 'format':
        query = """ SELECT * FROM erosen_geocode_active_editors_country WHERE project=%s AND end = start + INTERVAL 30 day"""
    cursor.execute(query, [proj])
    proj_rows = cursor.fetchall()

    logger.debug('len(proj_rows): %d', len(proj_rows))
    if not proj_rows and sql.paramstyle == 'format':
        logger.debug('GOT NUTHIN!: %s', query % proj)
        return
    limn_rows = make_limn_rows(proj_rows, 'country')
    source = limnpy.DataSource(limn_id,
                               limn_name,
                               limn_rows,
                               limn_group=LIMN_GROUP)
    source.write(basedir=basedir)
    source.write_graph(basedir=basedir)

    # construct single column graphs
    if country_graphs:
        for country in source.data.columns[1:]:
            title = '%s Editors in %s' % (proj.upper(), country)
            # raw string fixes the invalid '\W' escape warning on modern Python
            graph_id = '%s_%s' % (proj, re.sub(
                r'\W+', ' ', country).strip().replace(' ', '_').lower())
            source.write_graph(metric_ids=[country],
                               basedir=basedir,
                               title=title,
                               graph_id=graph_id)
# Example #9 (snippet-site header, score: 0)
def main():
    """
    Here is a semi-complex use case which pivots the data (transformed into a doctest compatible version):

        >>> import StringIO, os, csv, shutil
        >>> from subprocess import call
        >>>
        >>> os.mkdir('doctest_tmp')
        >>> f = open('doctest_tmp/limnify1.tsv', 'w')
        >>> data =     [['Hour',         'Continent', 'Count']]
        >>> data.append(['2013-01-01_00','Asia',      '535984'])
        >>> data.append(['2013-01-01_00','Africa',    '20536'])
        >>> f = open('doctest_tmp/limnify1.tsv', 'w')
        >>> csv.writer(f,dialect='excel-tab').writerows(data)
        >>> f.close()
        >>>
        >>> call('''limnify --delim='\\t' --pivot --datecol=0 --datefmt=%Y-%m-%d_%H doctest_tmp/limnify1.tsv''', shell=True)
        0
        >>> shutil.rmtree('doctest_tmp')
    """

    # Coerce numeric column references to int so they can index df.columns.
    int_or_str = lambda s: int(s) if isinstance(s, str) and s.isdigit() else s

    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('data', help='name of file to be limnified', default='STDIN', nargs='?')
    parser.add_argument('--delim', default='\t', help='delim to use for input file')
    parser.add_argument('--header', default=None, nargs='+', help='this is a space separated list of names to use as the header row'
                'If your data doesn\'t already have a header row you will need to pass in a list of names to use, otherwise it'
                'will assume the first row is a header and then produce confusing data sources.  Remember, these names will be'
                'displayed in the graph editing interface')
    parser.add_argument('--datecol', type=int_or_str, default=0, help='the date column name or index--required if it is different from `date`')
    parser.add_argument('--datefmt',
                help='format to use with datetime.strptime, default uses dateutil.parser.parse')
    parser.add_argument('--pivot', default=False, action='store_true',
                help='whether to try and pivot the data (only supports sum aggregation for now)')
    parser.add_argument('--metriccol', type=int_or_str, default=1,
                help='the column name or index to use for creating the column (metric) names when pivoting')
    parser.add_argument('--valcol', type=int_or_str, default=2,
                help='the column in which to find the actual data to be plotted when pivoting')
    parser.add_argument('--basedir', default='.', help='directory in which to place the output datasources, datafiles and graphs directories')
    # BUG FIX: `type=' '.join` was applied by argparse to each token
    # individually, joining the *characters* of every word with spaces
    # ("Name" -> "N a m e").  The tokens are joined once after parsing below.
    parser.add_argument('--name', nargs='+', help='name of datasource which will be displayed in the UI')
    parser.add_argument('--id', help='the slug / id used to uniquely identify the datasource within a limn installation')
    # BUG FIX: this is a boolean flag; without action='store_true' argparse
    # required a value on the command line and stored an arbitrary string.
    parser.add_argument('--write_graph', default=False, action='store_true',
                help='whether to write a graph file containing all columns from the datasource')

    args = parser.parse_args()

    if args.datefmt:
        date_parser = lambda s: datetime.datetime.strptime(s, args.datefmt)
    else:
        date_parser = dateutil.parser.parse

    if args.data == 'STDIN':
        data = sys.stdin
    else:
        data = args.data

    if not args.pivot:
        if args.header:
            df = pd.read_table(data, sep=args.delim, parse_dates=[args.datecol], date_parser=date_parser, names=args.header)
        else:
            df = pd.read_table(data, sep=args.delim, parse_dates=[args.datecol], date_parser=date_parser)
        if isinstance(args.datecol, int):
            args.datecol = df.columns[args.datecol]
    else:
        if args.header:
            df_long = pd.read_table(data, sep=args.delim, parse_dates=[args.datecol], date_parser=date_parser, names=args.header)
        else:
            df_long = pd.read_table(data, sep=args.delim, parse_dates=[args.datecol], date_parser=date_parser)

        # Resolve any remaining integer column references to column names.
        if isinstance(args.datecol, int):
            args.datecol = df_long.columns[args.datecol]
        if isinstance(args.metriccol, int):
            args.metriccol = df_long.columns[args.metriccol]
        if isinstance(args.valcol, int):
            args.valcol = df_long.columns[args.valcol]

        # NOTE(review): rows=/cols= is the pre-0.14 pandas pivot_table API
        # (now index=/columns=); kept for compatibility with the pandas
        # version this script was written against.
        df = pd.pivot_table(df_long, rows=[args.datecol], cols=[args.metriccol], values=args.valcol, aggfunc=np.sum)

    sys.stderr.write('output data format (formatted by pandas.DataFrame version):\n%s\n' % df)

    if args.name is None:
        args.name = os.path.splitext(os.path.split(args.data)[1])[0]
    else:
        args.name = ' '.join(args.name)
    if args.id is None:
        args.id = os.path.splitext(os.path.split(args.data)[1])[0]
    ds = limnpy.DataSource(args.id, args.name, df, date_key=args.datecol, date_fmt=args.datefmt)
    ds.write(args.basedir)

    if args.write_graph:
        graph = ds.get_graph()
        graph.write(args.basedir)