def test_empty_breaks():
    """Each formatter must map an empty list of breaks to an empty list."""
    no_breaks = []
    # Factories are called with their default arguments, in the same order
    # as the individual checks they replace.
    formatter_factories = (
        custom_format,
        comma_format,
        currency_format,
        percent_format,
        scientific_format,
        date_format,
        mpl_format,
        log_format,
        timedelta_format,
    )
    for make_formatter in formatter_factories:
        assert make_formatter()(no_breaks) == []
def test_timedelta_format():
    """Check timedelta_format labels for default, explicit and TeX units."""
    # Default units with week-sized steps: expected labels are in weeks.
    week_steps = [timedelta(days=7 * n) for n in range(5)]
    assert timedelta_format()(week_steps) == [
        '0', '1 week', '2 weeks', '3 weeks', '4 weeks'
    ]

    # Default units with pandas Timedeltas ten minutes apart.
    ten_minute_steps = [pd.Timedelta(seconds=600 * n) for n in range(5)]
    assert timedelta_format()(ten_minute_steps) == [
        '0', '10 minutes', '20 minutes', '30 minutes', '40 minutes'
    ]

    # specific units
    assert timedelta_format(units='h')(ten_minute_steps) == [
        '0', '0.1667 hours', '0.3333 hours', '0.5000 hours', '0.6667 hours'
    ]

    # usetex
    microsecond_steps = [timedelta(microseconds=7 * n) for n in range(5)]
    tex_formatter = timedelta_format(units='us', usetex=True)
    assert tex_formatter(microsecond_steps) == [
        '0', '7$\\mu s$', '14$\\mu s$', '21$\\mu s$', '28$\\mu s$'
    ]
def test_timedelta_format():
    """Run timedelta_format over a table of (options, values, labels) cases."""
    ten_minute_steps = [pd.Timedelta(seconds=600*k) for k in range(5)]
    cases = [
        # default units, week-sized steps
        (
            {},
            [timedelta(days=7*k) for k in range(5)],
            ['0', '1 week', '2 weeks', '3 weeks', '4 weeks'],
        ),
        # default units, ten-minute steps given as pandas Timedeltas
        (
            {},
            ten_minute_steps,
            ['0', '10 minutes', '20 minutes', '30 minutes', '40 minutes'],
        ),
        # specific units
        (
            {'units': 'h'},
            ten_minute_steps,
            ['0', '0.1667 hours', '0.3333 hours', '0.5000 hours',
             '0.6667 hours'],
        ),
        # usetex
        (
            {'units': 'us', 'usetex': True},
            [timedelta(microseconds=7*k) for k in range(5)],
            ['0', '7$\\mu s$', '14$\\mu s$', '21$\\mu s$', '28$\\mu s$'],
        ),
    ]
    # Cases are checked in table order, so the first mismatch stops the test.
    for options, values, expected in cases:
        assert timedelta_format(**options)(values) == expected
]) y_line = x_line * results_2.slope + results_2.intercept g = (p9.ggplot( published_date_distances, p9.aes(x="factor(version_count)", y="time_to_published"), ) + p9.geom_boxplot(fill="#a6cee3") + p9.geom_line( mapping=p9.aes(x="version_count", y="time_to_published"), stat="smooth", method="lm", linetype="dashed", se=False, alpha=1, size=0.7, inherit_aes=False, ) + p9.scale_y_timedelta(labels=timedelta_format("d")) + p9.annotate( "text", x=9, y=timedelta(days=1470), label=f"Y={results_2.slope:.2f}*X+{results_2.intercept:.2f}", ) + p9.labs(x="# of Preprint Versions", y="Time Elapsed Until Preprint is Published") + p9.theme_seaborn( context="paper", style="ticks", font="Arial", font_scale=1.3)) # g.save("output/version_count_vs_publication_time.svg", dpi=500) # g.save("output/version_count_vs_publication_time.png", dpi=500) print(g) plt.figure(figsize=(8, 5)) g = sns.boxenplot( x="version_count", y="days_to_published",
median_ci_l, median_ci_u = median_ci.values.flatten() median_ci_l, median_ci_u # In[9]: overall_preprint_survival = kmf.survival_function_.reset_index().assign( label="all_papers") overall_preprint_survival.head() # In[10]: g = (p9.ggplot( overall_preprint_survival.assign( timeline=lambda x: pd.to_timedelta(x.timeline, "D")), p9.aes(x="timeline", y="KM_estimate", color="label"), ) + p9.scale_x_timedelta(labels=timedelta_format("d")) + p9.geom_line() + p9.ylim(0, 1)) print(g) # # Calculate Category Survival Function # This section measures how long it takes for certain categories to get preprints published. # In[11]: entire_preprint_df = pd.DataFrame( [], columns=["timeline", "KM_estimate", "category"]) half_life = [] for cat, grouped_df in preprints_w_published_dates.groupby("category"): temp_df = preprints_w_published_dates.query(f"category=='{cat}'") kmf.fit(
median_ci_l, median_ci_u = median_ci.values.flatten() median_ci_l, median_ci_u overall_preprint_survival = kmf.survival_function_.reset_index().assign( label="all_papers" ) overall_preprint_survival.head() g = ( p9.ggplot( overall_preprint_survival.assign( timeline=lambda x: pd.to_timedelta(x.timeline, "D") ), p9.aes(x="timeline", y="KM_estimate", color="label"), ) + p9.scale_x_timedelta(labels=timedelta_format("d")) + p9.geom_line() + p9.ylim(0, 1) ) print(g) # # Calculate Category Survival Function # This section measures how long it takes for certain categories to get preprints published. entire_preprint_df = pd.DataFrame([], columns=["timeline", "KM_estimate", "category"]) half_life = [] for cat, grouped_df in preprints_w_published_dates.groupby("category"): temp_df = preprints_w_published_dates.query(f"category=='{cat}'") kmf.fit( temp_df["time_to_published"].dt.total_seconds() / 60 / 60 / 24,