예제 #1
0
파일: run.py 프로젝트: gpfreitas/topik
def run_pipeline(data_source, source_type="auto", year_field=None, start_year=None, stop_year=None,
                 content_field=None, tokenizer='simple', vectorizer='bag_of_words', ntopics=10,
                 dir_path='./topic_model', model='lda', termite_plot=False, output_file=False,
                 lda_vis=True, seed=42, **kwargs):
    """Run your data through the whole topik pipeline: read, tokenize, vectorize, model, visualize.

    Parameters
    ----------
    data_source : str
        Input data (e.g. file or folder or solr/elasticsearch instance).
    source_type : {'auto', 'json_stream', 'folder_files', 'json_large', 'solr', 'elastic'}
        The format of your data input, forwarded to ``read_input``. Default is 'auto'.
    year_field : str
        The field name (if any) that contains the year associated with each document
        (for filtering). Accepted but not used by this function body.
    start_year : int
        For beginning of range filter on year_field values. Accepted but not used by
        this function body.
    stop_year : int
        For end of range filter on year_field values. Accepted but not used by this
        function body.
    content_field : str
        The primary text field to parse. Also used as the key to pull the text out of
        every item yielded by ``read_input``.
    tokenizer : {'simple', 'collocations', 'entities', 'mixed'}
        The type of tokenizer to use (key into ``tokenizers.registered_tokenizers``).
        Default is 'simple'.
    vectorizer : {'bag_of_words', 'tfidf'}
        The type of vectorizer to use (key into ``vectorizers.registered_vectorizers``).
        Default is 'bag_of_words'.
    ntopics : int
        Number of topics to find in your data. Default is 10.
    dir_path : str
        Directory path for topic modeling results. It is created (including missing
        parent directories) if it does not exist. Default is `./topic_model`.
    model : str
        Statistical modeling algorithm to use (key into ``models.registered_models``).
        Default is 'lda'.
    termite_plot : bool
        Generate termite plot of your model if True. Default is False.
    output_file : bool
        Accepted but not used by this function body. Default is False.
    lda_vis : bool
        Generate an interactive data visualization of your topics. Default is True.
    seed : int
        Set NumPy's random number generator to seed, to be able to reproduce results.
        Default is 42.
    **kwargs : additional keyword arguments, passed through unchanged to the reader,
        the tokenizer, the vectorizer and the model.
    """
    np.random.seed(seed)

    raw_data = read_input(data_source, content_field=content_field,
                          source_type=source_type, **kwargs)
    # Reduce every item to an (id, text) pair; the id is the builtin hash of the text.
    # NOTE(review): str hashes differ between interpreter runs on Python 3 unless
    # PYTHONHASHSEED is fixed, so these ids are only stable within one process.
    raw_data = ((hash(item[content_field]), item[content_field]) for item in raw_data)
    tokenized_data = tokenizers.registered_tokenizers[tokenizer](raw_data, **kwargs)
    vectorized_data = vectorizers.registered_vectorizers[vectorizer](tokenized_data, **kwargs)
    # Keep the fitted model under its own name instead of rebinding the ``model``
    # parameter, which holds the registry key string.
    fitted_model = models.registered_models[model](vectorized_data, ntopics=ntopics, **kwargs)

    # makedirs (not mkdir) so that a nested dir_path with missing parents also works.
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)

    if termite_plot:
        # NOTE(review): the plot is written relative to the current working directory,
        # not into dir_path - confirm whether that is intended.
        termite_html(fitted_model, filename="termite.html", plot_title="Termite plot", topn=15)

    if lda_vis:
        visualizers.visualize(fitted_model, "lda_vis")
예제 #2
0
def run_pipeline(data_source,
                 source_type="auto",
                 year_field=None,
                 start_year=None,
                 stop_year=None,
                 content_field=None,
                 tokenizer='simple',
                 vectorizer='bag_of_words',
                 ntopics=10,
                 dir_path='./topic_model',
                 model='lda',
                 termite_plot=False,
                 output_file=False,
                 lda_vis=True,
                 seed=42,
                 **kwargs):
    """Run your data through the whole topik pipeline: read, tokenize, vectorize, model, visualize.

    Parameters
    ----------
    data_source : str
        Input data (e.g. file or folder or solr/elasticsearch instance).
    source_type : {'auto', 'json_stream', 'folder_files', 'json_large', 'solr', 'elastic'}
        The format of your data input, forwarded to ``read_input``. Default is 'auto'.
    year_field : str
        The field name (if any) that contains the year associated with each document
        (for filtering). Accepted but not used by this function body.
    start_year : int
        For beginning of range filter on year_field values. Accepted but not used by
        this function body.
    stop_year : int
        For end of range filter on year_field values. Accepted but not used by this
        function body.
    content_field : str
        The primary text field to parse. Also used as the key to pull the text out of
        every item yielded by ``read_input``.
    tokenizer : {'simple', 'collocations', 'entities', 'mixed'}
        The type of tokenizer to use (key into ``tokenizers.registered_tokenizers``).
        Default is 'simple'.
    vectorizer : {'bag_of_words', 'tfidf'}
        The type of vectorizer to use (key into ``vectorizers.registered_vectorizers``).
        Default is 'bag_of_words'.
    ntopics : int
        Number of topics to find in your data. Default is 10.
    dir_path : str
        Directory path for topic modeling results. It is created (including missing
        parent directories) if it does not exist. Default is `./topic_model`.
    model : str
        Statistical modeling algorithm to use (key into ``models.registered_models``).
        Default is 'lda'.
    termite_plot : bool
        Generate termite plot of your model if True. Default is False.
    output_file : bool
        Accepted but not used by this function body. Default is False.
    lda_vis : bool
        Generate an interactive data visualization of your topics. Default is True.
    seed : int
        Set NumPy's random number generator to seed, to be able to reproduce results.
        Default is 42.
    **kwargs : additional keyword arguments, passed through unchanged to the reader,
        the tokenizer, the vectorizer and the model.
    """
    np.random.seed(seed)

    raw_data = read_input(data_source,
                          content_field=content_field,
                          source_type=source_type,
                          **kwargs)
    # Reduce every item to an (id, text) pair; the id is the builtin hash of the text.
    # NOTE(review): str hashes differ between interpreter runs on Python 3 unless
    # PYTHONHASHSEED is fixed, so these ids are only stable within one process.
    raw_data = ((hash(item[content_field]), item[content_field])
                for item in raw_data)
    tokenized_data = tokenizers.registered_tokenizers[tokenizer](raw_data,
                                                                 **kwargs)
    vectorized_data = vectorizers.registered_vectorizers[vectorizer](
        tokenized_data, **kwargs)
    # Keep the fitted model under its own name instead of rebinding the ``model``
    # parameter, which holds the registry key string.
    fitted_model = models.registered_models[model](vectorized_data,
                                                   ntopics=ntopics,
                                                   **kwargs)

    # makedirs (not mkdir) so that a nested dir_path with missing parents also works.
    if not os.path.exists(dir_path):
        os.makedirs(dir_path)

    if termite_plot:
        # NOTE(review): the plot is written relative to the current working directory,
        # not into dir_path - confirm whether that is intended.
        termite_html(fitted_model,
                     filename="termite.html",
                     plot_title="Termite plot",
                     topn=15)

    if lda_vis:
        visualizers.visualize(fitted_model, "lda_vis")
예제 #3
0
def test_termite():
    """Check that termite_html writes the requested HTML file, then delete it."""
    # Build the output path once and reuse it for the call, the check and the cleanup.
    plot_path = os.path.join(module_path, 'termite_plot.html')

    termite_html(
        test_model_output,
        filename=plot_path,
        plot_title="My lda results",
        topn=TOP_WORDS,
    )

    plot_was_written = os.path.isfile(plot_path)
    nt.assert_true(plot_was_written)

    # Remove the generated artifact so the test leaves the module directory clean.
    os.remove(plot_path)