Example #1
import os
import shutil
import subprocess

import pandas as pd

# get_resource_path is assumed to be provided by the surrounding package.

def run_lda(abstracts, n_topics=50, n_words=31, n_iters=1000, alpha=None,
            beta=0.001):
    """ Perform topic modeling using Latent Dirichlet Allocation with the
    Java toolbox MALLET.

    Args:
        abstracts:  A pandas DataFrame with two columns ('pmid' and 'abstract')
                    containing article abstracts.
        n_topics:   Number of topics to generate. Default=50.
        n_words:    Number of top words to return for each topic. Default=31,
                    based on Poldrack et al. (2012).
        n_iters:    Number of iterations to run in training topic model.
                    Default=1000.
        alpha:      The Dirichlet prior on the per-document topic distributions.
                    Default: 50 / n_topics, based on Poldrack et al. (2012).
        beta:       The Dirichlet prior on the per-topic word distribution.
                    Default: 0.001, based on Poldrack et al. (2012).

    Returns:
        weights_df: A pandas DataFrame derived from the MALLET
                    output-doc-topics output file. Contains the weight assigned
                    to each article for each topic, which can be used to select
                    articles for topic-based meta-analyses (accepted threshold
                    from Poldrack article is 0.001). [n_topics]+1 columns:
                    'pmid' is the first column and the following columns are
                    the topic names. The topic names match those in keys_df
                    (e.g., topic_000).
        keys_df:    A pandas DataFrame derived from the MALLET
                    output-topic-keys output file. Contains the top [n_words]
                    words for each topic, which can act as a summary of the
                    topic's content. Two columns: 'topic' and 'terms'. The
                    topic names match those in weights_df (e.g.,
                    topic_000).
    """
    if abstracts.index.name != 'pmid':
        abstracts.index = abstracts['pmid']

    resdir = os.path.abspath(get_resource_path())
    tempdir = os.path.join(resdir, 'topic_models')
    absdir = os.path.join(tempdir, 'abstracts')
    if not os.path.isdir(tempdir):
        os.mkdir(tempdir)

    if alpha is None:
        alpha = 50. / n_topics

    # Check for presence of abstract files and convert if necessary
    if not os.path.isdir(absdir):
        print('Abstracts folder not found. Creating abstract files...')
        os.mkdir(absdir)
        for pmid in abstracts.index.values:
            abstract = abstracts.loc[pmid]['abstract']
            with open(os.path.join(absdir, str(pmid) + '.txt'), 'w') as fo:
                fo.write(abstract)

    # Run MALLET topic modeling
    print('Generating topics...')
    mallet_bin = os.path.join(os.path.dirname(os.path.dirname(__file__)),
                              'resources/mallet/bin/mallet')
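    # import-dir converts the directory of .txt abstracts into MALLET's
    # binary format; --keep-sequence preserves word order (required by
    # train-topics) and --remove-stopwords drops MALLET's default English
    # stoplist.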
    import_str = ('{mallet} import-dir '
                  '--input {absdir} '
                  '--output {outdir}/topic-input.mallet '
                  '--keep-sequence '
                  '--remove-stopwords').format(mallet=mallet_bin,
                                               absdir=absdir,
                                               outdir=tempdir)

    train_str = ('{mallet} train-topics '
                 '--input {out}/topic-input.mallet '
                 '--num-topics {n_topics} '
                 '--num-top-words {n_words} '
                 '--output-topic-keys {out}/topic_keys.txt '
                 '--output-doc-topics {out}/doc_topics.txt '
                 '--num-iterations {n_iters} '
                 '--output-model {out}/saved_model.mallet '
                 '--random-seed 1 '
                 '--alpha {alpha} '
                 '--beta {beta}').format(mallet=mallet_bin, out=tempdir,
                                         n_topics=n_topics, n_words=n_words,
                                         n_iters=n_iters,
                                         alpha=alpha, beta=beta)

    subprocess.call(import_str, shell=True)
    subprocess.call(train_str, shell=True)

    # Read in and convert doc_topics and topic_keys.
    def clean_str(string):
        return os.path.basename(os.path.splitext(string)[0])

    def get_sort(lst):
        return [i[0] for i in sorted(enumerate(lst), key=lambda x: x[1])]
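    # For example, get_sort([2, 0, 1]) returns [1, 2, 0]: the positions
    # that would sort the list ascending (an argsort).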

    topic_names = ['topic_{0:03d}'.format(i) for i in range(n_topics)]

    # doc_topics: Topic weights for each paper.
    # The conversion here is pretty ugly at the moment.
    # First row should be dropped. First column is row number and can be used
    # as the index.
    # Second column is 'file: /full/path/to/pmid.txt' <-- Parse to get pmid.
    # After that, odd columns are topic numbers and even columns are the
    # weights for the topics in the preceding column. These columns are sorted
    # on an individual pmid basis by the weights.
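    # A hypothetical row might look like (tab-separated):
    #   0    file:/path/to/12345678.txt    7    0.4103    0    0.2087    ...
    # i.e., (topic, weight) pairs in descending weight order.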
    n_cols = (2 * n_topics) + 1
    dt_df = pd.read_csv(os.path.join(tempdir, 'doc_topics.txt'),
                        delimiter='\t', skiprows=1, header=None, index_col=0)
    dt_df = dt_df[dt_df.columns[:n_cols]]

    # Get pmids from filenames
    dt_df[1] = dt_df[1].apply(clean_str)

    # Put weights (even cols) and topics (odd cols) into separate dfs.
    weights_df = dt_df[dt_df.columns[2::2]]
    weights_df.index = dt_df[1]
    weights_df.columns = range(n_topics)

    topics_df = dt_df[dt_df.columns[1::2]]
    topics_df.index = dt_df[1]
    topics_df.columns = range(n_topics)

    # Sort columns in weights_df separately for each row using topics_df.
    sorters_df = topics_df.apply(get_sort, axis=1, result_type='expand')
    weights = weights_df.values
    sorters = sorters_df.values
    for i in range(sorters.shape[0]):  # there has to be a better way to do this.
        weights[i, :] = weights[i, sorters[i, :]]

    # Define topic names (e.g., topic_000)
    index = dt_df[1]
    weights_df = pd.DataFrame(columns=topic_names, data=weights, index=index)
    weights_df.index.name = 'pmid'

    # topic_keys: Top [n_words] words for each topic.
    keys_df = pd.read_csv(os.path.join(tempdir, 'topic_keys.txt'),
                          delimiter='\t', header=None, index_col=0)

    # Column 1 holds the topic's alpha value; column 2 holds the
    # space-separated top terms. Keep only the terms.
    keys_df = keys_df[[2]]
    keys_df.rename(columns={2: 'terms'}, inplace=True)
    keys_df.index = topic_names
    keys_df.index.name = 'topic'

    # Remove all temporary files (abstract files, model, and outputs).
    shutil.rmtree(tempdir)

    # Return article topic weights and topic keys.
    return weights_df, keys_df
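
# A minimal usage sketch (hypothetical data; assumes the MALLET binary is
# available under the package's resources directory, as run_lda expects):
df = pd.DataFrame({'pmid': [10001, 10002],
                   'abstract': ['first toy abstract about memory',
                                'second toy abstract about vision']})
weights_df, keys_df = run_lda(df, n_topics=2, n_words=5, n_iters=50)
# Articles loading on topic_000 above the 0.001 threshold from the docstring.
selected = weights_df[weights_df['topic_000'] > 0.001].index.values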

Example #2
    def setUp(self):
        """ Create a new Dataset and add features. """
        maskfile = os.path.join(get_resource_path(),
                                'MNI152_T1_2mm_brain.nii.gz')
        self.masker = Masker(maskfile)