import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer


def preprocessing_features(model):

    df = pd.read_json("data.json")
    df = prepare.prep_readme_data(df)
    df = prepare.prepare_data(df)

    tfidf = TfidfVectorizer()
    X = tfidf.fit_transform(df.readme_contents.apply(clean).apply(' '.join))

    # Pair each vocabulary term with the fitted model's importance score
    # and plot the five most important words.
    pd.Series(dict(
        zip(tfidf.get_feature_names_out(),
            model.feature_importances_))).sort_values().tail(5).plot.barh(
                title="Most important words used for modeling",
                figsize=(10, 8))
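A minimal self-contained sketch of the same vocabulary-to-importance pairing, using a toy corpus and a hypothetical RandomForestClassifier in place of the repo's fitted model (any estimator exposing feature_importances_ trained on the same TF-IDF matrix would do):

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["python pandas dataframe", "javascript react component", "python sklearn model"]
labels = ["Python", "JavaScript", "Python"]

tfidf = TfidfVectorizer()
X = tfidf.fit_transform(docs)
rf = RandomForestClassifier(random_state=123).fit(X, labels)

# Same idiom as preprocessing_features: pair vocabulary terms with importances.
(pd.Series(dict(zip(tfidf.get_feature_names_out(), rf.feature_importances_)))
 .sort_values()
 .tail(5)
 .plot.barh(title="Most important words"))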
Example #2
def calculate_idf_score(df):

    df = pd.read_json("data.json")
    df = prepare.prep_readme_data(df)
    df = prepare.prepare_data(df)

    languages = df.is_top_language.unique()
    idf_scores = pd.DataFrame()
    for language in languages:
        # Score only the words that appear in this language's readmes.
        words = clean(' '.join(df[df.is_top_language == language].clean_lemmatized))
        idf_df = return_words_with_idf(words)
        idf_df["language"] = language

        idf_scores = pd.concat([idf_scores, idf_df])
    return idf_scores
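A hedged usage sketch, assuming data.json, the prepare module, and the clean / return_words_with_idf helpers (see Example #5) are importable:

import pandas as pd

df = pd.read_json("data.json")
idf_scores = calculate_idf_score(df)
# One stacked frame: for each language, its rarest (highest-idf) words.
print(idf_scores.sort_values(["language", "idf"], ascending=[True, False]))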
Example #3
def complex_cluster(df, number_of_clusters):
    df["combined_data"] = df.genre + " " + df.director + " " + df.clean_lemmatized
    df = prepare.prep_readme_data(df, "combined_data")
    cv = CountVectorizer(ngram_range=(1, 2))
    X = cv.fit_transform(df.clean_lemmatized)

    # Create an instance of KMeans to find number_of_clusters clusters
    kmeans = KMeans(n_clusters=number_of_clusters, random_state=123)
    # Use fit_predict to cluster the dataset
    predictions = kmeans.fit_predict(X)

    df["cluster_description"] = predictions
    df["cluster_description"] = "cluster_" + df.cluster_description.astype(str)

    return df
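The clustering step in isolation, as a self-contained sketch on a toy corpus (n_init is set explicitly here, an assumption to avoid version-dependent sklearn defaults):

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer

docs = ["quiet family story", "loud joke sketch",
        "slow family drama", "joke standup comedy"]
X = CountVectorizer(ngram_range=(1, 2)).fit_transform(docs)

kmeans = KMeans(n_clusters=2, random_state=123, n_init=10)
print(kmeans.fit_predict(X))  # e.g. [0 1 0 1]: one cluster label per document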
Example #4

def preprocessing(df, data_representation, target_variable,
                  ngram_range=(2, 2)):

    df = prepare.prep_readme_data(df)
    df = prepare.prepare_data(df)

    if data_representation == "bag_of_words":
        X, y = run_bag_of_words(df, target_variable)
    elif data_representation == "tf_idf":
        X, y = run_tf_idf(df, target_variable)
    elif data_representation == "bag_of_ngrams":
        X, y = run_bag_of_ngrams(df, target_variable, ngram_range)

    X_train, X_validate, X_test, y_train, y_validate, y_test = split_data(X, y)

    return X_train, X_validate, X_test, y_train, y_validate, y_test
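A hedged usage sketch of the full split, assuming data.json and the run_* / split_data helpers exist; the target column is_top_language comes from the examples above, and the classifier choice is purely illustrative:

import pandas as pd
from sklearn.tree import DecisionTreeClassifier

df = pd.read_json("data.json")
X_train, X_validate, X_test, y_train, y_validate, y_test = preprocessing(
    df, data_representation="tf_idf", target_variable="is_top_language")

clf = DecisionTreeClassifier(max_depth=5, random_state=123).fit(X_train, y_train)
print(clf.score(X_validate, y_validate))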
Example #5
def return_words_with_idf(words):

    df = pd.read_json("data.json")
    df = prepare.prep_readme_data(df)
    df = prepare.prepare_data(df)
    
    def idf(word):
        # Smoothed document-frequency ratio: n_docs / (1 + docs containing the word).
        return df.shape[0] / (1 + df.clean_lemmatized.str.contains(word, regex=False).sum())

    # put the unique words into a data frame
    idf_df = (pd.DataFrame(dict(word=words))
              # calculate the idf for each word
              .assign(idf=lambda df: df.word.apply(idf))
              # sort the data for presentation purposes
              .set_index('word')
              .sort_values(by='idf', ascending=False)
              # keep the five rarest (highest-idf) words
              .head(5))
    
    return idf_df
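A quick self-contained check of the smoothed ratio on a toy frame: with four documents and a word appearing in one of them, idf = 4 / (1 + 1) = 2.0:

import pandas as pd

toy = pd.DataFrame({"clean_lemmatized": [
    "pandas dataframe", "react component", "go module", "rust crate"]})

def idf(word):
    return toy.shape[0] / (1 + toy.clean_lemmatized.str.contains(word, regex=False).sum())

print(idf("pandas"))  # 4 / (1 + 1) = 2.0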