def main():
    """Compare per-article sentence-count gaps before and after filtering.

    Loads the article URL table, drops 2020, and inspects the distribution
    of |zh - en| length differences for the "full" and "filt" variants.
    Relies on module-level globals `urls_dir` and `in_dir` and on the
    project helpers `read_article_urls`, `get_article_length`, `p1`-`p3`.
    """
    # Read article and urls, dropping everything published in 2020.
    urls = read_article_urls(urls_dir)
    urls = urls[urls["year"] != 2020]
    # 1973 articles remain after the year filter.

    full_len = get_article_length(in_dir, urls, "full")
    filt_len = get_article_length(in_dir, urls, "filt")

    # Exploratory summaries; return values are intentionally discarded.
    # Unfiltered articles:
    full_len["abs_diff"].describe()
    # count 1973  mean 11.83  std 19.54  min 0  25% 1  50% 1  75% 19  max 130

    # Filtered articles — the gap nearly vanishes:
    filt_len["abs_diff"].describe()
    # count 1973  mean 0.27  std 0.64  min 0  25% 0  50% 0  75% 0  max 5

    # Produce the three figures.
    p1()
    p2()
    p3()
def main():
    """Write line-filtered zh/en article pairs.

    For every non-2020 article in the URL table, read both language sides,
    stitch and filter them (project helpers; `filter` here is the project's
    3-argument function, not the builtin), then drop any line that appears
    verbatim on BOTH sides and write the remainder to `.filt.` files.
    Relies on module-level globals `url_dir` and `article_dir`.
    """
    meta = read_article_urls(url_dir)
    meta = meta[meta["year"] != 2020]  # Remove year 2020

    for _, row in meta.iterrows():
        year, month, article_id = row["year"], row["month"], row["id"]
        # Article type is the id with digits/percent signs stripped.
        article_type = re.sub("[0-9%]+", "", article_id)

        zh_fn = f"{article_dir}/{year}/{month:02}/{article_id}.full.zh"
        en_fn = f"{article_dir}/{year}/{month:02}/{article_id}.nobox.en"

        print(f"path: {zh_fn}")
        zh_article = filter(stitch(read_article(zh_fn), "zh"), article_type, "zh")

        print(f"path: {en_fn}")
        en_article = filter(stitch(read_article(en_fn), "en"), article_type, "en")

        # Lines present in both languages are boilerplate; drop them.
        intersect = set(zh_article).intersection(set(en_article))

        zh_out_fn = zh_fn.replace(".full.", ".filt.")
        with open(zh_out_fn, "w") as f:
            f.writelines(line + "\n" for line in zh_article if line not in intersect)

        en_out_fn = en_fn.replace(".nobox.", ".filt.")
        with open(en_out_fn, "w") as f:
            f.writelines(line + "\n" for line in en_article if line not in intersect)
# NOTE(review): exploratory analysis script — statements appear in notebook/REPL
# order. `eserix` is plotted here but only assigned further down, so a straight
# top-to-bottom run raises NameError unless `eserix` already exists in the
# session; confirm the intended execution order.

# Scatter of per-article Chinese vs. English sentence counts with a y=x
# reference diagonal.
fig, ax = plt.subplots(1,1)
ax.clear()
plt.scatter(x="zh_len", y="en_len", data=eserix)
ax.spines['right'].set_visible(False)
ax.spines['top'].set_visible(False)
ax.set_xlabel("Chinese Sentence Length")
ax.set_ylabel("English Sentence Length")
# Reuse the x-limits for y so the dashed diagonal is exactly y = x.
xlim = ylim = ax.get_xlim()
plt.plot(xlim, ylim, color="red", linestyle="dashed")
fig.set_size_inches(3,3)
fig.tight_layout()
fig.savefig(f"{out_dir}/eserix.pdf")

# Read article and urls:
article_urls = read_article_urls(urls_dir)
article_urls = article_urls[article_urls["year"] != 2020]  # Remove year 2020

# |zh - en| sentence-count gap per article under the punkt splitter.
punkt = get_article_length(punkt_dir, article_urls, "filt")
punkt["abs_diff"].describe()
# count 1973.000000
# mean 2.638621
# std 3.738087
# min 0.000000
# 25% 1.000000
# 50% 1.000000
# 75% 3.000000
# max 38.000000

# Same statistic under the eserix splitter (summary printed interactively).
eserix = get_article_length(eserix_dir, article_urls, "filt")
eserix["abs_diff"].describe()
# NOTE(review): tail of an article-type-code -> display-name mapping; the
# dict's opening (its name and earlier entries) lies outside this view.
"jw.na": "Journal Watch",
"clde": "Clinical Decisions",
"cps": "Clinical Prob Solving",
"p": "Perspective",
"e": "Editorial",
"cibr": "Clinical Implications\nof Basic Research",
"icm": "Images in Clinical Med",
"ms": "Medicine and Society",
"c": "Correspondence",
"sa": "Special Article",
"x": "Correction",
"hpr": "Health Policy Report" }

# Read article and urls:
articles = read_article_urls(in_dir)
articles = articles[articles["year"] != 2020]  # Remove year 2020

# Plot article count by year: group the table by year and count rows.
year_count = articles.groupby("year").\
    agg(count=pd.NamedAgg("year", "count")).\
    reset_index()

# Two stacked axes; only the first (yearly bar chart) is configured here —
# `ax2` is presumably filled in later, outside this view.
fig, (ax1, ax2) = plt.subplots(2, 1)
ax1.clear()
ax1.bar("year", "count", data=year_count)
ax1.spines['right'].set_visible(False)
ax1.spines['top'].set_visible(False)
# Force one tick (and label) per year rather than matplotlib's default.
ax1.set_xticks(ticks=year_count["year"])
ax1.set_xticklabels(labels=year_count["year"])
ax1.set_ylabel("# Articles")