def get_X_y():
    """Load the ``jama/`` files and turn them into a vectorized matrix.

    ``process_file`` is called with the two hard-coded ``jama/`` CSV paths
    and must return three values; only the first two are used here.

    Returns:
        A 3-tuple ``(X, y, vectorizer)`` where ``X`` is the result of
        ``vectorizer.fit_transform`` on the first value from
        ``process_file``, ``y`` is the second value wrapped in
        ``np.array``, and ``vectorizer`` is the ``CountVectorizer``
        instance that ``fit_transform`` was called on.
    """
    article_info_path = "jama/jama_article_info.csv"
    matched_articles_path = "jama/jama_pmids.txt_matched_articles_filtered.csv"
    # The third returned value is not needed by this function.
    raw_X, raw_y, _ = process_file(article_info_path, matched_articles_path)

    # Tokens are either one of the literal "95% ..." phrases or a run of at
    # least two characters from letters, digits, "_", "*", "-" (with "/"
    # additionally allowed after the first character).
    vectorizer = CountVectorizer(
        stop_words="english",
        min_df=2,
        token_pattern=r"(?u)95% confidence interval|95% CI|95% ci|[a-zA-Z0-9_*\-][a-zA-Z0-9_/*\-]+",
        binary=False,
        max_features=50000,
    )
    vectorized = vectorizer.fit_transform(raw_X)
    return vectorized, np.array(raw_y), vectorizer
def get_X_y():
    """Return ``(X, y, vectorizer)`` built from the hard-coded ``jama/`` files.

    The first value returned by ``process_file`` is passed through
    ``CountVectorizer.fit_transform``; the second is converted with
    ``np.array``; the third is discarded. The vectorizer itself is returned
    as the last element so callers can reuse it.
    """
    raw_X, raw_y, _ignored = process_file(
        "jama/jama_article_info.csv",
        "jama/jama_pmids.txt_matched_articles_filtered.csv",
    )

    # Keyword options gathered in one place; the token pattern accepts the
    # literal "95% confidence interval" / "95% CI" / "95% ci" phrases as
    # single tokens in addition to ordinary 2+ character word-like runs.
    vectorizer_options = {
        "stop_words": "english",
        "min_df": 2,
        "token_pattern": r"(?u)95% confidence interval|95% CI|95% ci|[a-zA-Z0-9_*\-][a-zA-Z0-9_/*\-]+",
        "binary": False,
        "max_features": 50000,
    }
    vectorizer = CountVectorizer(**vectorizer_options)

    features = vectorizer.fit_transform(raw_X)
    targets = np.array(raw_y)
    return features, targets, vectorizer
def get_X_y():
    """Vectorize the hard-coded ``reuters/`` files with 1- and 2-grams.

    Calls ``process_file`` on the two ``reuters/`` CSV paths, feeds the
    first returned value to a ``CountVectorizer`` configured with
    ``ngram_range=(1, 2)`` and ``min_df=1``, and wraps the second returned
    value in ``np.array``. The third returned value is unused.

    Returns:
        ``(X, y, vectorizer)``: the ``fit_transform`` output, the
        ``np.array`` of the second ``process_file`` value, and the
        vectorizer instance.
    """
    # Matches the literal "95% ..." phrases first, otherwise a run of two or
    # more characters drawn from letters, digits, "_", "*", "-" ("/" is
    # permitted from the second character on).
    token_regex = r"(?u)95% confidence interval|95% CI|95% ci|[a-zA-Z0-9_*\-][a-zA-Z0-9_/*\-]+"

    raw_X, raw_y, _ = process_file(
        "reuters/all_reuters_article_info.csv",
        "reuters/all_reuters_matched_articles_filtered.csv",
    )
    vectorizer = CountVectorizer(
        ngram_range=(1, 2),
        stop_words="english",
        min_df=1,
        token_pattern=token_regex,
        binary=False,
        max_features=50000,
    )
    term_matrix = vectorizer.fit_transform(raw_X)
    return term_matrix, np.array(raw_y), vectorizer
def get_X_y():
    """Produce the vectorized inputs, target array and vectorizer.

    Input comes from ``process_file`` applied to the two hard-coded
    ``reuters/`` CSV paths. ``process_file`` must yield exactly three
    values; the last one is ignored here.

    Returns:
        A tuple of (output of ``CountVectorizer.fit_transform`` on the first
        value, ``np.array`` of the second value, the ``CountVectorizer``
        instance used).
    """
    source_files = (
        "reuters/all_reuters_article_info.csv",
        "reuters/all_reuters_matched_articles_filtered.csv",
    )
    raw_X, raw_y, _unused = process_file(*source_files)

    # Unigrams and bigrams, capped at 50000 features; the custom token
    # pattern keeps the "95% confidence interval" / "95% CI" / "95% ci"
    # phrases together as single tokens.
    vectorizer = CountVectorizer(
        ngram_range=(1, 2),
        stop_words="english",
        min_df=1,
        token_pattern=r"(?u)95% confidence interval|95% CI|95% ci|[a-zA-Z0-9_*\-][a-zA-Z0-9_/*\-]+",
        binary=False,
        max_features=50000,
    )

    transformed = vectorizer.fit_transform(raw_X)
    result = (transformed, np.array(raw_y), vectorizer)
    return result