def get_network_ts(df, network, year=2016):
    '''
    Daily MV use frequency series for a single network.

    Computes the per-network daily frequency table over ``date_range(year)``
    from a copy of ``df``, selects the ``network`` column, and drops the
    entries that are NaN.
    '''
    frame = df.copy()

    # XXX daily_frequency by network works somewhat differently from the
    # un-faceted call, and the difference is not well described
    # anywhere...WATCH OUT!
    by_network = daily_frequency(frame, date_range(year), by=['network'])

    network_series = by_network[network]
    return network_series.dropna()
示例#2
0
def test_daily_frequency():
    '''
    Check daily_frequency against hand-computed values, both aggregated
    over everything and faceted by network.
    '''
    corpus_name = _setup_mongo()
    days = pd.date_range('2016-9-1', '2016-9-4', freq='D')
    corpus = IatvCorpus.objects(name=corpus_name)[0]

    # Fixture vocabulary handed to the test-input generator.
    program_names = [
        'Tracy Morgans news hour', 'Dingbat Alley', 'iCry Sad News Time',
        'Digging Turnips with Ethan Land', 'Good morning, middle america!'
    ]
    networks = ['MSNBCW', 'CNNW', 'FOXNEWSW']
    facet_words = ['kill', 'murder', 'punch', 'attack']
    subjects_objects = ['trump', 'clinton', 'obama', 'media']

    input_df = _gen_test_input(
        program_names, networks, facet_words, subjects_objects
    )

    # --- Aggregate frequency -------------------------------------------
    # Expected values were obtained by dividing total metaphor counts by
    # total shows per day.
    expected_all = pd.DataFrame(
        index=days,
        data={'freq': [.75, 1.5, 2.0/3.0, 2.0/3.0]}
    )
    observed_all = daily_frequency(input_df, days, corpus)

    pd.testing.assert_frame_equal(observed_all, expected_all)

    # --- Frequency faceted by network ----------------------------------
    observed_by_network = daily_frequency(
        input_df, days, corpus, by=['network']
    )
    observed_by_network = observed_by_network[['MSNBCW', 'CNNW', 'FOXNEWSW']]

    # One row per day, one column per network.
    expected_rows = [
        (0, 2, 1),
        (2.5, np.nan, .5),
        (0, np.nan, 1),
        (np.nan, 0, 1)
    ]
    expected_by_network = pd.DataFrame(
        index=days,
        data=expected_rows,
        dtype=np.float64,
        columns=pd.Index(['MSNBCW', 'CNNW', 'FOXNEWSW'], name='network')
    )

    pd.testing.assert_frame_equal(observed_by_network, expected_by_network)
def get_obj_ts(df, obj, year=2016):
    '''
    MV use frequency timeseries for a single object of metaphorical
    violence, e.g., Hillary Clinton.

    Works on a copy of ``df``: whitespace around the ``objects`` values is
    stripped, rows whose object equals ``obj`` are kept, and the daily
    frequency faceted by ``objects`` is computed over ``date_range(year)``.
    Missing values in the resulting ``obj`` column are replaced with 0.0.
    '''
    frame = df.copy()

    # Some rows carry stray whitespace, e.g., 'Donald Trump '.
    frame.objects = frame.objects.str.strip()
    matching_rows = frame[frame.objects == obj]

    by_object = daily_frequency(
        matching_rows,
        date_range(year),
        by=['objects']
    )

    # Any NaN left at this point comes from dividing by zero counts.
    return by_object[obj].fillna(0.0)
def data_for_model(year=2016, save_dir=None):
    '''
    Create a dataframe with all series needed to make regressions of
    faceted MV frequencies.

    Parameters
    ----------
    year : int
        Election year to build the data for; must be 2016 or 2012. The
        project data are read from ``Data/viomet-sep-nov-<year>.csv``.
    save_dir : optional
        Currently unused by this function; kept so existing callers that
        pass it continue to work.

    Returns
    -------
    pandas.DataFrame
        One column per series: ``days_from_debate``, one tweet series per
        candidate, ``metvi_all``, subject/object MV frequency for each
        candidate, and MV frequency for each of the three networks.

    Raises
    ------
    ValueError
        If ``year`` is not one of the supported years. Previously an
        unsupported year fell through both branches and failed with an
        UnboundLocalError only after the data had been loaded.
    '''
    if year not in (2016, 2012):
        raise ValueError(
            'data_for_model supports only 2016 and 2012, got {!r}'.format(
                year)
        )

    # Create metaphorical violence frequency series across all networks.
    csv = os.path.join('Data', 'viomet-sep-nov-{}.csv'.format(year))
    project_df = get_project_data_frame(csv)
    # Keep only the rows flagged for inclusion.
    project_df = project_df[project_df.include]
    freq_df = daily_frequency(project_df, date_range(year))
    metvi_ts = pd.Series(index=freq_df.index,
                         data=freq_df['freq'],
                         dtype=float)

    days_from_debate = _days_from_debate(year, freq_df.index)

    # Create timeseries of tweets and faceted MV frequencies.
    if year == 2016:
        ts_data = dict(
            # Number of days before or after debate.
            days_from_debate=days_from_debate,

            # Twitter timeseries.
            trump=get_tweets_ts('trump'),
            clinton=get_tweets_ts('clinton'),

            # All metaphorical violence freq timeseries.
            metvi_all=metvi_ts,

            # Trump as subject or object metvi freq timeseries.
            metvi_trump_subj=get_subj_ts(project_df, 'Donald Trump'),
            metvi_trump_obj=get_obj_ts(project_df, 'Donald Trump'),

            # Clinton as subject or object metvi freq timeseries.
            metvi_clinton_subj=get_subj_ts(project_df, 'Hillary Clinton'),
            metvi_clinton_obj=get_obj_ts(project_df, 'Hillary Clinton'),

            # Metvi freq on networks timeseries.
            metvi_msnbc=get_network_ts(project_df, 'MSNBCW'),
            metvi_cnn=get_network_ts(project_df, 'CNNW'),
            metvi_foxnews=get_network_ts(project_df, 'FOXNEWSW'))
    else:
        # year == 2012 (guaranteed by the validation above).
        ts_data = dict(
            # Number of days before or after debate.
            days_from_debate=days_from_debate,

            # Twitter timeseries.
            romney=get_tweets_ts('romney', year=2012),
            obama=get_tweets_ts('obama', year=2012),

            # All metaphorical violence freq timeseries.
            metvi_all=metvi_ts,

            # Romney as subject or object metvi freq timeseries.
            metvi_romney_subj=get_subj_ts(project_df, 'Mitt Romney',
                                          year=2012),
            metvi_romney_obj=get_obj_ts(project_df, 'Mitt Romney', year=2012),

            # Obama as subject or object metvi freq timeseries.
            metvi_obama_subj=get_subj_ts(project_df, 'Barack Obama',
                                         year=2012),
            metvi_obama_obj=get_obj_ts(project_df, 'Barack Obama', year=2012),

            # Metvi freq on networks timeseries.
            metvi_msnbc=get_network_ts(project_df, 'MSNBCW', year=2012),
            metvi_cnn=get_network_ts(project_df, 'CNNW', year=2012),
            metvi_foxnews=get_network_ts(project_df, 'FOXNEWSW', year=2012))

    return pd.DataFrame(ts_data)
示例#5
0
def by_network_frequency_figure(project_df,
                                date_range=pd.date_range('2016-09-01',
                                                         '2016-11-30',
                                                         freq='D'),
                                freq=True,
                                partition_infos=None,
                                font_scale=1.15,
                                save_path=None):
    '''
    Plot daily metaphorical violence usage for each network, optionally
    overlaid with the step-function fits described by ``partition_infos``.

    Parameters
    ----------
    project_df : DataFrame
        Project data passed through to ``daily_frequency`` (or to
        ``daily_metaphor_counts`` when ``freq`` is False).
    date_range : pandas.DatetimeIndex
        Days to plot. Its first and last entries are the end points of the
        fitted step lines, and the year of its first entry selects which
        set of debate markers (2016 or 2012) is drawn.
    freq : bool
        If True plot frequencies; if False plot raw counts. Counts are
        only plotted when ``partition_infos`` is None; with fits supplied
        and ``freq`` False nothing is drawn.
    partition_infos : mapping or None
        Indexed by network code ('MSNBCW', 'CNNW', 'FOXNEWSW'). Each value
        must expose ``partition_date_1``, ``partition_date_2``,
        ``f_ground`` and ``f_excited``. None means no fits are shown.
    font_scale : float
        Currently unused.
    save_path : str or None
        If given, the fitted figure is saved there and closed. It is only
        honoured in the branch that draws fits.

    Returns
    -------
    None
    '''
    # Fits are not being shown for this condition: scatter the raw data.
    if partition_infos is None:
        if freq:
            network_freq = daily_frequency(project_df,
                                           date_range,
                                           by=['network'])
            network_freq.plot(style='o')
        else:
            full_df = daily_metaphor_counts(
                project_df, ['network'],
                date_range)[['MSNBCW', 'CNNW', 'FOXNEWSW']]
            full_df.plot(style='o')
        return

    # TODO Include more arguments so that fits don't have to be generated
    # just to plot. Generate fits outside and pass fits in.
    if not freq:
        # Fits are only drawn on top of frequencies, never counts.
        return

    # Legend handles are built by hand below, which needs Line2D.
    import matplotlib.lines as mlines

    # Networks in desired order, left to right, with parallel style lists.
    networks = ['MSNBCW', 'CNNW', 'FOXNEWSW']
    network_formatted = ['MSNBC', 'CNN', 'Fox News']
    line_styles = [':', '--', '-']
    markers = ['s', 'o', '^']

    network_freq = daily_frequency(project_df,
                                   date_range,
                                   by=['network'])

    ax = network_freq[networks].plot(style=markers,
                                     mew=1,
                                     mfc='white',
                                     ms=6,
                                     alpha=0.9,
                                     legend=False,
                                     figsize=DEFAULT_FIGSIZE,
                                     mec='lightgrey')

    # Offset that places the ground-level end points just outside the
    # excited interval, so the jump between levels is drawn nearly
    # vertical.
    edge_offset = timedelta(seconds=60)
    first_day = date_range[0]
    last_day = date_range[-1]

    for network, line_style in zip(networks, line_styles):
        pinfo = partition_infos[network]

        excited_start = pinfo.partition_date_1
        excited_end = pinfo.partition_date_2
        fg = pinfo.f_ground
        fe = pinfo.f_excited

        # ground -> excited -> ground step function over the date range.
        step_dates = pd.DatetimeIndex([
            first_day, excited_start - edge_offset,
            excited_start, excited_end,
            excited_end + edge_offset, last_day,
        ])
        step_levels = [fg, fg, fe, fe, fg, fg]

        pd.Series(index=step_dates, data=step_levels).plot(
            lw=3,
            ax=ax,
            color='k',
            ls=line_style,
        )

    # NOTE(review): '%-d' (day of month without zero padding) is a
    # platform-specific strftime extension -- confirm it is supported on
    # every platform this runs on.
    ax.xaxis.set_minor_formatter(pltdates.DateFormatter('%-d'))
    ax.xaxis.set_minor_locator(pltdates.DayLocator(bymonthday=(1, 15)))

    yheight = 0.1
    zo = 10
    textargs = dict(size=13,
                    ha='right',
                    bbox=dict(alpha=0.6, color='white'))

    # Per election year: (date of the vertical debate marker, x position
    # of its text label) for each of the three debates.
    debate_markers = {
        2016: ((datetime(2016, 9, 26), '2016-9-25'),
               (datetime(2016, 10, 9), '2016-10-8'),
               (datetime(2016, 10, 19), '2016-10-18')),
        2012: ((datetime(2012, 10, 3), '2012-10-2'),
               (datetime(2012, 10, 16), '2012-10-15'),
               (datetime(2012, 10, 22), '2012-10-21')),
    }
    debate_labels = ("Debate #1", "#2", "#3")

    plot_year = first_day.year
    year_markers = debate_markers.get(plot_year)
    if year_markers is not None:
        # Short vertical ticks at the bottom of the axes for each debate.
        for debate_day, _ in year_markers:
            ax.axvline(debate_day,
                       ymax=yheight,
                       color='k',
                       zorder=zo)
        for (_, label_x), label in zip(year_markers, debate_labels):
            ax.text(label_x, 0.2, label, **textargs)
        ax.set_xlim(datetime(plot_year, 8, 31), datetime(plot_year, 11, 29))

    ax.grid(False)

    ax.set_xlabel('Date')
    ax.set_ylabel('Frequency of usage')
    ax.set_title('Metaphorical violence usage data and dynamic model')

    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)

    plt.tight_layout()

    ax.set_ylim(-.25, 7.25)

    # Manually create the legend with line and marker styles.
    net_lines = [
        mlines.Line2D([], [],
                      mfc='white',
                      color='black',
                      marker=marker,
                      markersize=7.5,
                      ls=line_style,
                      label=label,
                      mec='gray')
        for label, marker, line_style in zip(network_formatted,
                                             markers,
                                             line_styles)
    ]
    # 3.8 handlelength to only have whole dashes.
    plt.legend(handles=net_lines, handlelength=3.8)

    if save_path is not None:
        fig = ax.get_figure()
        fig.savefig(save_path)
        # Close the figure that was just saved, not whichever is current.
        plt.close(fig)