def get_corpus(filename):
    """Load raw data from a file and return vectorizer and feature_matrix.
    Parameters
    ----------
    filename: The path to a json file containing the university database.

    Returns
    -------
    corpus: A numpy array containing abstracts and titles.
    """
    df_cleaned = database_cleaner(filename)

    # For NLP, retain only faculty_name, research_areas, paper_titles, abstracts
    df_filtered = df_cleaned[[
        'faculty_name', 'research_areas', 'paper_titles', 'abstracts'
    ]]

    # Drop faculty with no papers; copy to avoid SettingWithCopyWarning below
    missing = df_filtered['paper_titles'] == ''
    df_nlp = df_filtered[~missing].copy()

    # Combine paper titles, abstracts, and research areas into one document
    # per professor; join with spaces so tokens don't fuse at field boundaries
    df_nlp['research_areas'] = df_nlp['research_areas'].apply(
        lambda x: " ".join(x))
    corpus = (df_nlp['paper_titles'] + ' ' + df_nlp['abstracts'] + ' ' +
              df_nlp['research_areas']).values

    return corpus
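
A minimal usage sketch (not from the source page): the docstring's mention of
a vectorizer suggests the corpus feeds a bag-of-words model, for example
scikit-learn's TfidfVectorizer. The file path mirrors the one used later on
this page; the vectorizer and its settings are illustrative assumptions.

# Illustrative usage; TfidfVectorizer and its settings are assumptions
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = get_corpus('../data/json/majors_database.json')
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
feature_matrix = vectorizer.fit_transform(corpus)  # sparse (n_faculty, n_terms)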
Example #2
def get_data(filename):
    """Load raw data from a file and return vectorizer and feature_matrix.
    Parameters
    ----------
    filename: The path to a json file containing the university database.
    Returns
    -------
    data: A numpy array containing abstracts.
    """
    df_cleaned = database_cleaner(filename)

    # For NLP, retain only faculty_name, research_areas, paper_titles, abstracts
    df_filtered = df_cleaned[[
        'faculty_name', 'research_areas', 'paper_titles', 'abstracts'
    ]]

    # Report and drop faculty with no papers
    missing = df_filtered['paper_titles'] == ''
    num_missing = missing.sum()
    print(f'{num_missing} faculty members have missing papers in {filename}')
    print('Running the NLP pipeline on faculty with non-missing papers...')

    df_nlp = df_filtered[~missing].copy()  # copy to avoid SettingWithCopyWarning

    # Combine paper titles, abstracts, and research areas into one document
    # per professor; join with spaces so tokens don't fuse at field boundaries
    df_nlp['research_areas'] = df_nlp['research_areas'].apply(
        lambda x: " ".join(x))
    data = (df_nlp['paper_titles'] + ' ' + df_nlp['abstracts'] + ' ' +
            df_nlp['research_areas']).values

    return data

Example #3

This snippet is a driver script whose opening lines are truncated on the
source page; the missing setup is reconstructed below as comments, by analogy
with the LDAMallet block that follows it.

import pandas as pd
import pyLDAvis

# NOTE: truncated in the source. The missing lines presumably looked
# something like this (arguments are a guess, not from the source):
#
#   data = get_data('../data/json/majors_database.json')
#   lda_model = MyGenSimModel(num_topics=11, algorithm='LDA', tf_idf=True,
#                             bigrams=True, trigrams=False, lemmatization=True)
#   lda_model.transform(data)
lda_model.fit()
print(lda_model.coherence_score())

# Fit LDAMallet to the training data (LDAMallet doesn't work with TF-IDF)
ldamallet_model = MyGenSimModel(num_topics=11,
                                algorithm='LDAMallet',
                                tf_idf=False,
                                bigrams=True,
                                trigrams=False,
                                lemmatization=True)
ldamallet_model.transform(data)
ldamallet_model.fit()
print(ldamallet_model.coherence_score())

# Append the predicted research areas (based on the top-10 topic features)
# to pge_database and write one JSON file per model
pge_df = database_cleaner('../data/json/majors_database.json')
doc_topics_df_LDA = lda_model.format_document_topics()
doc_topics_df_LDAMallet = ldamallet_model.format_document_topics()
pge_df_updated_LDA = pd.concat([pge_df, doc_topics_df_LDA], axis=1)
pge_df_updated_LDAMallet = pd.concat([pge_df, doc_topics_df_LDAMallet],
                                     axis=1)
pge_df_updated_LDA.to_json(
    path_or_buf='../data/json/final_gensim_database_LDA.json')
pge_df_updated_LDAMallet.to_json(
    path_or_buf='../data/json/final_gensim_database_LDAMallet.json')

# Save HTML for the pyLDAvis visualizations of the LDA and LDAMallet models
vis_LDA = lda_model.visualize_lda_model()
vis_LDAMallet = ldamallet_model.visualize_lda_mallet()
pyLDAvis.save_html(data=vis_LDA, fileobj="templates/LDA.html")
pyLDAvis.save_html(data=vis_LDAMallet, fileobj="templates/LDAMallet.html")
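
The script above assumes a MyGenSimModel wrapper class that is not shown on
this page. Below is a minimal sketch of what that interface might look like,
inferred only from the call sites above: the method names come from the
script, but every body here is an assumption, and the TF-IDF, n-gram,
lemmatization, LDAMallet, and pyLDAvis pieces of the real class are omitted.

# Hypothetical skeleton of MyGenSimModel; method names are taken from the
# call sites above, implementations are assumptions using plain gensim LDA.
import pandas as pd
from gensim.corpora import Dictionary
from gensim.models import CoherenceModel, LdaModel
from gensim.utils import simple_preprocess


class MyGenSimModel:
    def __init__(self, num_topics=11, algorithm='LDA', tf_idf=False,
                 bigrams=True, trigrams=False, lemmatization=True):
        # Preprocessing flags are accepted but ignored in this sketch
        self.num_topics = num_topics
        self.algorithm = algorithm

    def transform(self, data):
        # Tokenize the raw documents and build a bag-of-words corpus
        self.texts = [simple_preprocess(doc) for doc in data]
        self.dictionary = Dictionary(self.texts)
        self.bow = [self.dictionary.doc2bow(text) for text in self.texts]

    def fit(self):
        # Train a vanilla gensim LDA model on the bag-of-words corpus
        self.model = LdaModel(corpus=self.bow, id2word=self.dictionary,
                              num_topics=self.num_topics, random_state=42)

    def coherence_score(self):
        # c_v coherence computed over the tokenized training texts
        return CoherenceModel(model=self.model, texts=self.texts,
                              dictionary=self.dictionary,
                              coherence='c_v').get_coherence()

    def format_document_topics(self):
        # One row per document: dominant topic id and its top-10 keywords
        rows = []
        for bow in self.bow:
            topic_id, _ = max(self.model.get_document_topics(bow),
                              key=lambda pair: pair[1])
            keywords = [w for w, _ in self.model.show_topic(topic_id, topn=10)]
            rows.append({'dominant_topic': topic_id,
                         'predicted_research_areas': keywords})
        return pd.DataFrame(rows)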