Пример #1
0
def main():
	"""Compare sentence-count differences for raw vs. filtered articles."""
	# Load the article/url table and drop the incomplete year 2020.
	urls = read_article_urls(urls_dir)
	urls = urls[urls["year"] != 2020]
	# 1973 articles remain after the filter.

	# Length statistics for the raw ("full") and filtered ("filt") variants,
	# computed in that order.
	by_variant = {
		variant: get_article_length(in_dir, urls, variant)
		for variant in ("full", "filt")
	}

	# Raw articles: count 1973, mean abs_diff ~11.83, std ~19.54,
	# quartiles 1/1/19, max 130.
	by_variant["full"]["abs_diff"].describe()
	# Filtered articles: count 1973, mean abs_diff ~0.27, std ~0.64,
	# quartiles 0/0/0, max 5.
	by_variant["filt"]["abs_diff"].describe()

	# Render the three figures.
	for render in (p1, p2, p3):
		render()
Пример #2
0
def main():
    """Filter parallel zh/en articles: drop sentences shared verbatim by both sides.

    Reads article metadata, then for each article loads the Chinese (.full.zh)
    and English (.nobox.en) files, stitches and filters them with the project
    helpers, removes any line that appears in BOTH languages (boilerplate),
    and writes the survivors to .filt.zh / .filt.en files.
    """
    meta = read_article_urls(url_dir)
    meta = meta[meta["year"] != 2020]  # Remove year 2020

    # Strip digits / percent-escapes from an article id, leaving its type
    # prefix (hoisted out of the loop so the regex compiles once).
    id_pattern = re.compile("[0-9%]+")

    def _write_kept(path, lines, drop):
        # Write every line not in `drop`, one per line, to `path`.
        with open(path, "w") as f:
            for line in lines:
                if line not in drop:
                    f.write(line + "\n")

    # Index from iterrows() is unused — bind it to `_`.
    for _, row in meta.iterrows():
        year = row["year"]
        month = row["month"]
        article_id = row["id"]
        article_type = id_pattern.sub("", article_id)

        zh_fn = f"{article_dir}/{year}/{month:02}/{article_id}.full.zh"
        en_fn = f"{article_dir}/{year}/{month:02}/{article_id}.nobox.en"

        print(f"path: {zh_fn}")
        zh_article = read_article(zh_fn)
        zh_article = stitch(zh_article, "zh")
        # NOTE(review): `filter` here is a project helper (3 args) that
        # shadows the builtin — confirm its module of origin.
        zh_article = filter(zh_article, article_type, "zh")

        print(f"path: {en_fn}")
        en_article = read_article(en_fn)
        en_article = stitch(en_article, "en")
        en_article = filter(en_article, article_type, "en")

        # Lines present in both languages are treated as boilerplate.
        # set.intersection accepts any iterable; no need to build a second set.
        intersect = set(zh_article).intersection(en_article)

        _write_kept(zh_fn.replace(".full.", ".filt."), zh_article, intersect)
        _write_kept(en_fn.replace(".nobox.", ".filt."), en_article, intersect)
Пример #3
0
	# Scatter of per-sentence lengths: Chinese (x) vs. English (y).
	# `eserix` is presumably a DataFrame with "zh_len"/"en_len" columns —
	# TODO confirm against the caller that builds it.
	fig, ax = plt.subplots(1,1)
	ax.clear()
	plt.scatter(x="zh_len", y="en_len", data=eserix)
	# Hide the top/right spines for a cleaner plot frame.
	ax.spines['right'].set_visible(False)
	ax.spines['top'].set_visible(False)
	ax.set_xlabel("Chinese Sentence Length")
	ax.set_ylabel("English Sentence Length")
	# Dashed red y=x reference line: points above it have a longer English
	# rendering than the Chinese source. ylim is deliberately tied to xlim.
	xlim = ylim = ax.get_xlim()
	plt.plot(xlim, ylim, color="red", linestyle="dashed")
	# 3x3 inch figure, written as a PDF for inclusion in a paper.
	fig.set_size_inches(3,3)
	fig.tight_layout()
	fig.savefig(f"{out_dir}/eserix.pdf")


# Read article and urls:
article_urls = read_article_urls(urls_dir)
article_urls = article_urls[article_urls["year"] != 2020] # Remove year 2020

# Sentence-count differences for the punkt-segmented, filtered articles.
# The captured describe() output below shows a mean abs diff of ~2.6
# sentences with a max of 38.
punkt = get_article_length(punkt_dir, article_urls, "filt")
punkt["abs_diff"].describe()
# count    1973.000000
# mean        2.638621
# std         3.738087
# min         0.000000
# 25%         1.000000
# 50%         1.000000
# 75%         3.000000
# max        38.000000

# Same statistic for the eserix-segmented articles; `eserix` is also used
# by the scatter-plot code elsewhere in this file.
eserix = get_article_length(eserix_dir, article_urls, "filt")
eserix["abs_diff"].describe()
Пример #4
0
    "jw.na": "Journal Watch",
    "clde": "Clinical Decisions",
    "cps": "Clinical Prob Solving",
    "p": "Perspective",
    "e": "Editorial",
    "cibr": "Clinical Implications\nof Basic Research",
    "icm": "Images in Clinical Med",
    "ms": "Medicine and Society",
    "c": "Correspondence",
    "sa": "Special Article",
    "x": "Correction",
    "hpr": "Health Policy Report"
}

# Read article and urls:
articles = read_article_urls(in_dir)
articles = articles[articles["year"] != 2020]  # Remove year 2020

# Plot article count by year:
# Aggregate to one row per year with its article count.
# (Chain joined implicitly inside parentheses instead of the original
# backslash continuations — PEP 8 prefers implicit line joining.)
year_count = (
    articles.groupby("year")
    .agg(count=pd.NamedAgg("year", "count"))
    .reset_index()
)

# Two stacked axes; ax2 is populated further below in the file.
fig, (ax1, ax2) = plt.subplots(2, 1)
ax1.clear()
ax1.bar("year", "count", data=year_count)
# Hide the top/right spines for a cleaner frame.
ax1.spines['right'].set_visible(False)
ax1.spines['top'].set_visible(False)
# One tick (and label) per year present in the data.
ax1.set_xticks(ticks=year_count["year"])
ax1.set_xticklabels(labels=year_count["year"])
ax1.set_ylabel("# Articles")