Python CorpusSent.load примеры использования

Язык программирования: Python

Пространство имен/Пакет: vsm.extensions.ldasentences

Класс/Тип: CorpusSent

Метод/Функция: load

Примеров на hotexamples.com: 9

Python CorpusSent.load - 9 примеров найдено. Это лучшие примеры Python кода для vsm.extensions.ldasentences.CorpusSent.load, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

load(5)

Основные методы

load (5)

Пример #1

Показать файл

Файл: train.py Проект: inpho/topic-explorer

def main(args):
    if args.cluster:
        cluster(args.cluster, args.config_file)
        return

    config = topicexplorer.config.read(args.config_file)
    corpus_filename = config.get("main", "corpus_file")
    model_path = config.get("main", "path")

    if config.getboolean("main", "sentences"):
        from vsm.extensions.ldasentences import CorpusSent as Corpus
    else:
        from vsm.corpus import Corpus

    if args.k is None:
        try:
            if config.get("main", "topics"):
                default = ' '.join(map(str, eval(config.get("main", "topics"))))
                if args.quiet:
                    args.k = [int(n) for n in default.split()]
            else:
                raise NoOptionError('main', 'topics')
        except NoOptionError:
            default = ' '.join(map(str, range(20, 100, 20)))

        while args.k is None:
            ks = input("Number of Topics [Default '{0}']: ".format(default))
            try:
                if ks:
                    args.k = [int(n) for n in ks.split()]
                elif not ks.strip():
                    args.k = [int(n) for n in default.split()]

                if args.k:
                    print("\nTIP: number of topics can be specified with argument '-k N N N ...':")
                    print("         topicexplorer train %s -k %s\n" %\
                        (args.config_file, ' '.join(map(str, args.k))))
            except ValueError:
                print("Enter valid integers, separated by spaces!")

    if args.processes < 0:
        import multiprocessing
        args.processes = multiprocessing.cpu_count() + args.processes

    print("Loading corpus... ")
    corpus = Corpus.load(corpus_filename)

    try:
        model_pattern = config.get("main", "model_pattern")
    except NoOptionError:
        model_pattern = None

    if (model_pattern is not None and not args.rebuild and (args.quiet or args.cont or
            bool_prompt("""Existing topic models found. You can continue training or start a new model. 
Do you want to continue training your existing models? """, default=True))):

        from vsm.model.lda import LDA
        m = LDA.load(model_pattern.format(args.k[0]),
                     multiprocessing=args.processes > 1,
                     n_proc=args.processes)

        if args.iter is None and not args.quiet:    # pragma: no cover
            args.iter = int_prompt("Total number of training iterations:",
                                   default=int(m.iteration * 1.5), min=m.iteration)

            print("\nTIP: number of training iterations can be specified with argument '--iter N':")
            print("         topicexplorer train --iter %d %s\n" % (args.iter, args.config_file))
        elif args.iter is None and args.quiet:      # pragma: no cover
            args.iter = int(m.iteration * 1.5)

        del m

        # if the set changes, build some new models and continue some old ones

        config_topics = eval(config.get("main", "topics"))
        if args.k != config_topics:
            new_models = set(args.k) - set(config_topics)
            continuing_models = set(args.k) & set(config_topics)

            build_models(corpus, corpus_filename, model_path,
                         config.get("main", "context_type"),
                         new_models, n_iterations=args.iter,
                         n_proc=args.processes, seed=args.seed,
                         dry_run=args.dry_run)

            model_pattern = continue_training(model_pattern, continuing_models,
                                              args.iter, n_proc=args.processes,
                                              dry_run=args.dry_run)

        else:
            model_pattern = continue_training(model_pattern, args.k, args.iter,
                                              n_proc=args.processes, 
                                              dry_run=args.dry_run)
    else:
        # build a new model
        if args.iter is None and not args.quiet:    # pragma: no cover
            args.iter = int_prompt("Number of training iterations:", default=200)

            print("\nTIP: number of training iterations can be specified with argument '--iter N':")
            print("         topicexplorer train --iter %d %s\n" % (args.iter, args.config_file))
        elif args.iter is None and args.quiet:      # pragma: no cover
            args.iter = 200

        # TODO: if only one context_type, make it just the one context type.
        ctxs = corpus.context_types
        if len(ctxs) == 1:
            args.context_type = ctxs[0]
        else:
            ctxs = sorted(ctxs, key=lambda ctx: len(corpus.view_contexts(ctx)))
            if args.context_type not in ctxs:
                while args.context_type not in ctxs:
                    contexts = ctxs[:]
                    contexts[0] = contexts[0].upper()
                    contexts = '/'.join(contexts)
                    args.context_type = input("Select a context type [%s] : " % contexts)
                    if args.context_type.strip() == '':
                        args.context_type = ctxs[0]
                    if args.context_type == ctxs[0].upper():
                        args.context_type = ctxs[0]
    
                print("\nTIP: context type can be specified with argument '--context-type TYPE':")
                print("         topicexplorer train --context-type %s %s\n" % (args.context_type, args.config_file))


        print("\nTIP: This configuration can be automated as:")
        print("         topicexplorer train %s --iter %d --context-type %s -k %s\n" %\
            (args.config_file, args.iter, args.context_type, 
                ' '.join(map(str, args.k))))
        model_pattern = build_models(corpus, corpus_filename, model_path,
                                     args.context_type, args.k,
                                     n_iterations=args.iter,
                                     n_proc=args.processes, seed=args.seed,
                                     dry_run=args.dry_run)
    config.set("main", "model_pattern", model_pattern)
    if args.context_type:
        # test for presence, since continuing doesn't require context_type
        config.set("main", "context_type", args.context_type)
    args.k.sort()
    config.set("main", "topics", str(args.k))

    if not args.dry_run:
        if config.has_option("main", "cluster"):
            cluster_path = config.get("main", "cluster", fallback=None)
            config.remove_option("main", "cluster")
            try:
                if cluster_path:
                    os.remove(cluster_path)
            except (OSError, IOError):
                # fail silently on IOError
                pass


        with open(args.config_file, "w") as configfh:
            config.write(configfh)

Пример #2

Показать файл

def main(args):
    config = ConfigParser({"htrc": False, "sentences": "False"})
    config.read(args.config_file)

    if config.getboolean("main", "sentences"):
        from vsm.extensions.ldasentences import CorpusSent as Corpus
    else:
        from vsm.corpus import Corpus

    if args.lang is None:
        args.lang = []

    args.corpus_path = config.get("main", "corpus_file")
    c = Corpus.load(args.corpus_path)

    # check for htrc metadata
    if args.htrc or config.get("main", "htrc"):
        htrc_langs = get_htrc_langs(args)
        if htrc_langs:
            args.lang.extend(new_langs)

    # auto-guess a language
    """
    new_langs = [lang for lang in detect_langs(c) if lang in langs and lang not in args.lang]
    if new_langs:
        args.lang.extend(new_langs)
    """

    # add default locale if no other languages are specified
    # do not add if in quiet mode -- make everything explicit
    if not args.lang and not args.quiet:
        import locale
        locale = locale.getdefaultlocale()[0].split('_')[0].lower()
        if locale in langs.keys():
            args.lang.append(locale)

    # check for any new candidates
    args.lang = [lang for lang in args.lang if stop_language(c, langs[lang])]
    if args.lang and not args.quiet:
        args.lang = lang_prompt(args.lang)

    stoplist = set()
    # Apply stop words
    print(" ")
    for lang in args.lang:
        print("Applying", langs[lang], "stopwords")
        candidates = stop_language(c, langs[lang])
        if len(candidates):
            stoplist.update(candidates)

    # Apply custom stopwords file
    if args.stopword_file:
        with open(args.stopword_file, encoding='utf8') as swf:
            #candidates = [unidecode(word.strip()) for word in swf]
            candidates = [word.strip() for word in swf]

            if len(candidates):
                print("Applying custom stopword file to remove {} word{}.".
                      format(len(candidates),
                             's' if len(candidates) > 1 else ''))
                stoplist.update(candidates)

    if args.min_word_len:
        candidates = get_small_words(c, args.min_word_len)
        if len(candidates):
            print("Filtering {} small word{} with less than {} characters.".
                  format(len(candidates), 's' if len(candidates) > 1 else '',
                         args.min_word_len))
            stoplist.update(candidates)

    if not args.special_chars:
        candidates = get_special_chars(c)
        if len(candidates):
            print("Filtering {} word{} with special characters.".format(
                len(candidates), 's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)

    if args.high_filter is None and not args.quiet:
        args.high_filter, candidates = get_high_filter(args, c, words=stoplist)
        if len(candidates):
            print("Filtering {} high frequency word{}.".format(
                len(candidates), 's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)
    elif args.high_filter > 0:
        candidates = get_candidate_words(c, args.high_filter, sort=False)
        if len(candidates):
            print("Filtering {} high frequency word{}.".format(
                len(candidates), 's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)

    if args.low_filter is None and not args.quiet:
        args.low_filter, candidates = get_low_filter(args, c, words=stoplist)
        if len(candidates):
            print("Filtering {} low frequency word{}.".format(
                len(candidates), 's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)
    elif args.low_filter > 0:
        candidates = get_candidate_words(c, -1 * args.low_filter, sort=False)
        if len(candidates):
            print("Filtering {} low frequency words.".format(len(candidates)))
            stoplist.update(candidates)

    if not stoplist:
        print("No stopwords applied.\n\n")

        sys.exit(0)
    else:
        print("\n\nApplying {} stopword{}".format(
            len(stoplist), 's' if len(stoplist) > 1 else ''))
        c.in_place_stoplist(stoplist)
        print("\n")

    def name_corpus(dirname, languages, lowfreq=None, highfreq=None):
        items, counts = get_items_counts(c.corpus)

        corpus_name = [dirname]
        if args.lang:
            corpus_name.append('nltk')
            corpus_name.append(''.join(args.lang))
        if lowfreq > 0:
            corpus_name.append('freq%s' % lowfreq)
        else:
            corpus_name.append('freq%s' % min(counts))

        if highfreq > 0:
            corpus_name.append('N%s' % highfreq)
        else:
            corpus_name.append('freq%s' % max(counts))

        corpus_name = '-'.join(corpus_name)
        corpus_name += '.npz'
        return corpus_name

    dirname = os.path.basename(args.corpus_path).split('-nltk-')[0].replace(
        '.npz', '')
    corpus_name = name_corpus(dirname, ['en'], args.low_filter,
                              args.high_filter)

    model_path = os.path.dirname(args.corpus_path)
    args.corpus_path = os.path.join(model_path, corpus_name)
    c.save(args.corpus_path)

    config.set("main", "corpus_file", args.corpus_path)
    config.remove_option("main", "model_pattern")
    with open(args.config_file, 'w') as configfh:
        config.write(configfh)

Пример #3

Показать файл

Файл: corpus.tmpl.py Проект: richierocks/topic-explorer

import os.path
from collections import defaultdict

# load in the configuration file
from ConfigParser import ConfigParser as ConfigParser
config_file = r"$config_file" 
config = ConfigParser({
        'topic_range': None,
        'topics': None,
        'sentences' : 'false'})
config.read(config_file)

# load the corpus
if config.getboolean('main','sentences'):
    from vsm.extensions.ldasentences import CorpusSent
    c = CorpusSent.load(config.get('main', 'corpus_file'))
else:
    c = Corpus.load(config.get('main', 'corpus_file'))
context_type = config.get('main', 'context_type')
ctx_metadata = c.view_metadata(context_type)
all_ids = ctx_metadata[doc_label_name(context_type)]

# create topic model patterns
pattern = config.get('main', 'model_pattern')
if config.get('main', 'topic_range'):
    topic_range = map(int, config.get('main', 'topic_range').split(','))
    topic_range = range(*topic_range)
if config.get('main', 'topics'):
    topic_range = eval(config.get('main', 'topics'))

# load the topic models

Пример #4

Показать файл

def main(args):
    config = topicexplorer.config.read(args.config_file)

    if config.getboolean("main", "sentences"):
        from vsm.extensions.ldasentences import CorpusSent as Corpus
    else:
        from vsm.corpus import Corpus

    if args.lang is None:
        args.lang = []

    args.corpus_path = config.get("main", "corpus_file")
    c = Corpus.load(args.corpus_path)

    if c.original_length != len(c.corpus):
        print("Corpus has already been prepared. Proceed to training or")
        print("re-init the corpus to apply a different set of stopwords.")
        print("\nTIP: Train the LDA models with:")
        print("         topicexplorer train", args.config_file)
        sys.exit(1)

    # auto-guess a language
    """
    new_langs = [lang for lang in detect_langs(c) if lang in langs and lang not in args.lang]
    if new_langs:
        args.lang.extend(new_langs)
    """

    # add default locale if no other languages are specified
    # do not add if in quiet mode -- make everything explicit
    if not args.lang and not args.quiet:
        import locale
        locale = locale.getdefaultlocale()[0].split('_')[0].lower()
        if locale in langs.keys():
            args.lang.append(locale)

    # check for any new candidates
    args.lang = [lang for lang in args.lang if stop_language(c, langs[lang])]
    if args.lang and not args.quiet:
        args.lang = lang_prompt(args.lang)

    stoplist = set()
    # Apply stop words
    print(" ")
    for lang in args.lang:
        print("Applying", langs[lang], "stopwords")
        candidates = stop_language(c, langs[lang])
        if len(candidates):
            stoplist.update(candidates)

    # Apply custom stopwords file
    if args.stopword_file:
        with open(args.stopword_file, encoding='utf8') as swf:
            #candidates = [unidecode(word.strip()) for word in swf]
            candidates = [word.strip() for word in swf]

            if len(candidates):
                print("Applying custom stopword file to remove {} word{}.".
                      format(len(candidates),
                             's' if len(candidates) > 1 else ''))
                stoplist.update(candidates)

    if args.min_word_len:
        candidates = get_small_words(c, args.min_word_len)
        if len(candidates):
            print("Filtering {} small word{} with less than {} characters.".
                  format(len(candidates), 's' if len(candidates) > 1 else '',
                         args.min_word_len))
            stoplist.update(candidates)

    # cache item counts
    items, counts = get_corpus_counts(c)
    if args.high_filter is None and args.high_percent is None and not args.quiet:
        args.high_filter, candidates = get_high_filter(c,
                                                       words=stoplist,
                                                       items=items,
                                                       counts=counts)
        if len(candidates):
            print("Filtering {} high frequency word{}.".format(
                len(candidates), 's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)
    elif args.high_filter is None and args.high_percent is None and args.quiet:
        pass
    elif args.high_filter:
        candidates = get_candidate_words(c,
                                         args.high_filter,
                                         sort=False,
                                         items=items,
                                         counts=counts)
        if len(candidates):
            print("Filtering {} high frequency word{}.".format(
                len(candidates), 's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)
    elif args.high_percent:
        args.high_filter = get_closest_bin(c,
                                           1 - (args.high_percent / 100.),
                                           counts=counts)
        print(args.high_filter)
        candidates = get_candidate_words(c,
                                         args.high_filter,
                                         sort=False,
                                         items=items,
                                         counts=counts)
        if len(candidates):
            print("Filtering {} high frequency word{}.".format(
                len(candidates), 's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)

    if args.low_filter is None and args.low_percent is None and not args.quiet:
        args.low_filter, candidates = get_low_filter(c,
                                                     words=stoplist,
                                                     items=items,
                                                     counts=counts)
        if len(candidates):
            print("Filtering {} low frequency word{}.".format(
                len(candidates), 's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)
    elif args.low_filter is None and args.low_percent is None and args.quiet:
        pass
    elif args.low_filter:
        candidates = get_candidate_words(c,
                                         -1 * args.low_filter,
                                         sort=False,
                                         items=items,
                                         counts=counts)
        if len(candidates):
            print("Filtering {} low frequency words.".format(len(candidates)))
            stoplist.update(candidates)

    elif args.low_percent:
        args.low_filter = get_closest_bin(c,
                                          1 - (args.low_percent / 100.),
                                          reverse=True,
                                          counts=counts)
        print(args.low_filter)
        candidates = get_candidate_words(c,
                                         -1 * args.low_filter,
                                         sort=False,
                                         items=items,
                                         counts=counts)
        if len(candidates):
            print("Filtering {} low frequency word{}.".format(
                len(candidates), 's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)

    if not stoplist:
        print("No stopwords applied.\n\n")

        sys.exit(0)
    else:
        print("\n\nApplying {} stopword{}".format(
            len(stoplist), 's' if len(stoplist) > 1 else ''))
        c.in_place_stoplist(stoplist)
        print("\n")

    def name_corpus(dirname, languages, lowfreq=None, highfreq=None):
        corpus_name = [dirname]

        if args.lang:
            corpus_name.append('nltk')
            corpus_name.append(''.join(args.lang))

        if lowfreq is not None and lowfreq > 0:
            corpus_name.append('freq%s' % lowfreq)
        if highfreq is not None and highfreq > 0:
            corpus_name.append('N%s' % highfreq)

        corpus_name = '-'.join(corpus_name)
        corpus_name += '.npz'
        return corpus_name

    dirname = os.path.basename(args.corpus_path).split('-nltk-')[0].replace(
        '.npz', '')
    corpus_name = name_corpus(dirname, ['en'], args.low_filter,
                              args.high_filter)

    model_path = os.path.dirname(args.corpus_path)
    args.corpus_path = os.path.join(model_path, corpus_name)
    c.save(args.corpus_path)

    config.set("main", "corpus_file", args.corpus_path)
    config.remove_option("main", "model_pattern")
    with open(args.config_file, 'w') as configfh:
        config.write(configfh)

Пример #5

Показать файл

Файл: train.py Проект: swalter62/topic-explorer

def main(args):
    if args.cluster:
        cluster(args.cluster, args.config_file)
        return

    config = topicexplorer.config.read(args.config_file)
    corpus_filename = config.get("main", "corpus_file")
    model_path = config.get("main", "path")

    if config.getboolean("main", "sentences"):
        from vsm.extensions.ldasentences import CorpusSent as Corpus
    else:
        from vsm.corpus import Corpus

    if args.k is None:
        try:
            if config.get("main", "topics"):
                default = ' '.join(map(str, eval(config.get("main",
                                                            "topics"))))
                if args.quiet:
                    args.k = [int(n) for n in default.split()]
            else:
                raise NoOptionError('main', 'topics')
        except NoOptionError:
            default = ' '.join(map(str, range(20, 100, 20)))

        while args.k is None:
            ks = input("Number of Topics [Default '{0}']: ".format(default))
            try:
                if ks:
                    args.k = [int(n) for n in ks.split()]
                elif not ks.strip():
                    args.k = [int(n) for n in default.split()]

                if args.k:
                    print(
                        "\nTIP: number of topics can be specified with argument '-k N N N ...':"
                    )
                    print("         topicexplorer train %s -k %s\n" %\
                        (args.config_file, ' '.join(map(str, args.k))))
            except ValueError:
                print("Enter valid integers, separated by spaces!")

    if args.processes < 0:
        import multiprocessing
        args.processes = multiprocessing.cpu_count() + args.processes

    print("Loading corpus... ")
    corpus = Corpus.load(corpus_filename)

    try:
        model_pattern = config.get("main", "model_pattern")
    except NoOptionError:
        model_pattern = None

    if (model_pattern is not None and not args.rebuild and
        (args.quiet or args.cont or bool_prompt(
            """Existing topic models found. You can continue training or start a new model. 
Do you want to continue training your existing models? """,
            default=True))):

        from vsm.model.lda import LDA
        m = LDA.load(model_pattern.format(args.k[0]),
                     multiprocessing=args.processes > 1,
                     n_proc=args.processes)

        if args.iter is None and not args.quiet:  # pragma: no cover
            args.iter = int_prompt("Total number of training iterations:",
                                   default=int(m.iteration * 1.5),
                                   min=m.iteration)

            print(
                "\nTIP: number of training iterations can be specified with argument '--iter N':"
            )
            print("         topicexplorer train --iter %d %s\n" %
                  (args.iter, args.config_file))
        elif args.iter is None and args.quiet:  # pragma: no cover
            args.iter = int(m.iteration * 1.5)

        del m

        # if the set changes, build some new models and continue some old ones

        config_topics = eval(config.get("main", "topics"))
        if args.k != config_topics:
            new_models = set(args.k) - set(config_topics)
            continuing_models = set(args.k) & set(config_topics)

            build_models(corpus,
                         corpus_filename,
                         model_path,
                         config.get("main", "context_type"),
                         new_models,
                         n_iterations=args.iter,
                         n_proc=args.processes,
                         seed=args.seed,
                         dry_run=args.dry_run)

            model_pattern = continue_training(model_pattern,
                                              continuing_models,
                                              args.iter,
                                              n_proc=args.processes,
                                              dry_run=args.dry_run)

        else:
            model_pattern = continue_training(model_pattern,
                                              args.k,
                                              args.iter,
                                              n_proc=args.processes,
                                              dry_run=args.dry_run)
    else:
        # build a new model
        if args.iter is None and not args.quiet:  # pragma: no cover
            args.iter = int_prompt("Number of training iterations:",
                                   default=200)

            print(
                "\nTIP: number of training iterations can be specified with argument '--iter N':"
            )
            print("         topicexplorer train --iter %d %s\n" %
                  (args.iter, args.config_file))
        elif args.iter is None and args.quiet:  # pragma: no cover
            args.iter = 200

        # TODO: if only one context_type, make it just the one context type.
        ctxs = corpus.context_types
        if len(ctxs) == 1:
            args.context_type = ctxs[0]
        else:
            ctxs = sorted(ctxs, key=lambda ctx: len(corpus.view_contexts(ctx)))
            if args.context_type not in ctxs:
                while args.context_type not in ctxs:
                    contexts = ctxs[:]
                    contexts[0] = contexts[0].upper()
                    contexts = '/'.join(contexts)
                    args.context_type = input("Select a context type [%s] : " %
                                              contexts)
                    if args.context_type.strip() == '':
                        args.context_type = ctxs[0]
                    if args.context_type == ctxs[0].upper():
                        args.context_type = ctxs[0]

                print(
                    "\nTIP: context type can be specified with argument '--context-type TYPE':"
                )
                print("         topicexplorer train --context-type %s %s\n" %
                      (args.context_type, args.config_file))

        print("\nTIP: This configuration can be automated as:")
        print("         topicexplorer train %s --iter %d --context-type %s -k %s\n" %\
            (args.config_file, args.iter, args.context_type,
                ' '.join(map(str, args.k))))
        model_pattern = build_models(corpus,
                                     corpus_filename,
                                     model_path,
                                     args.context_type,
                                     args.k,
                                     n_iterations=args.iter,
                                     n_proc=args.processes,
                                     seed=args.seed,
                                     dry_run=args.dry_run)
    config.set("main", "model_pattern", model_pattern)
    if args.context_type:
        # test for presence, since continuing doesn't require context_type
        config.set("main", "context_type", args.context_type)
    args.k.sort()
    config.set("main", "topics", str(args.k))

    if not args.dry_run:
        if config.has_option("main", "cluster"):
            cluster_path = config.get("main", "cluster", fallback=None)
            config.remove_option("main", "cluster")
            try:
                if cluster_path:
                    os.remove(cluster_path)
            except (OSError, IOError):
                # fail silently on IOError
                pass

        with open(args.config_file, "w") as configfh:
            config.write(configfh)

Пример #6

Показать файл

Файл: train.py Проект: gitter-badger/topic-explorer

def main(args):

    config = ConfigParser({"sentences": "False"})
    config.read(args.config_file)
    corpus_filename = config.get("main", "corpus_file")
    model_path = config.get("main", "path")

    if config.getboolean("main", "sentences"):
        from vsm.extensions.ldasentences import CorpusSent as Corpus
    else:
        from vsm.corpus import Corpus

    if args.k is None:
        try:
            if config.get("main", "topics"):
                default = ' '.join(map(str, eval(config.get("main", "topics"))))
            else:
                raise NoOptionError
        except NoOptionError:
            default = ' '.join(map(str, range(20,100,20)))

        while args.k is None:
            ks = raw_input("Number of Topics [Default '{0}']: ".format(default))
            try:
                if ks:
                    args.k = [int(n) for n in ks.split()]
                elif not ks.strip():
                    args.k = [int(n) for n in default.split()]

                if args.k:
                    print "\nTIP: number of topics can be specified with argument '-k N N N ...':"
                    print "         vsm train %s -k %s\n" %\
                             (args.config_file, ' '.join(map(str, args.k)))
            except ValueError:
                print "Enter valid integers, separated by spaces!"
        
    if args.processes < 0:
        import multiprocessing
        args.processes = multiprocessing.cpu_count() + args.processes

    print "Loading corpus... "
    corpus = Corpus.load(corpus_filename)

    try:
        model_pattern = config.get("main", "model_pattern")
    except NoOptionError:
        model_pattern = None

    if model_pattern is not None and\
        bool_prompt("Existing models found. Continue training?", default=True):

        from vsm.model.lda import LDA
        m = LDA.load(model_pattern.format(args.k[0]),
                     multiprocessing=args.processes > 1,
                     n_proc=args.processes)

        if args.iter is None:
            args.iter = int_prompt("Total number of training iterations:",
                                   default=int(m.iteration*1.5), min=m.iteration)
    
            print "\nTIP: number of training iterations can be specified with argument '--iter N':"
            print "         vsm train --iter %d %s\n" % (args.iter, args.config_file)

        del m

        # if the set changes, build some new models and continue some old ones

        config_topics = eval(config.get("main","topics"))
        if args.k != config_topics :
            new_models = set(args.k) - set(config_topics)
            continuing_models = set(args.k) & set(config_topics)
        
            build_models(corpus, corpus_filename, model_path, 
                                         config.get("main", "context_type"),
                                         new_models, n_iterations=args.iter,
                                         n_proc=args.processes, seed=args.seed)

            model_pattern = continue_training(model_pattern, continuing_models,
                                              args.iter, n_proc=args.processes)

        else:
            model_pattern = continue_training(model_pattern, args.k, args.iter,
                                              n_proc=args.processes)

    else:
        # build a new model
        if args.iter is None:
            args.iter = int_prompt("Number of training iterations:", default=200)
    
            print "\nTIP: number of training iterations can be specified with argument '--iter N':"
            print "         vsm train --iter %d %s\n" % (args.iter, args.config_file)
    
        ctxs = corpus.context_types
        ctxs = sorted(ctxs, key=lambda ctx: len(corpus.view_contexts(ctx)))
        if args.context_type not in ctxs:
            while args.context_type not in ctxs:
                contexts = ctxs[:]
                contexts[0] = contexts[0].upper()
                contexts = '/'.join(contexts)
                args.context_type = raw_input("Select a context type [%s] : " % contexts)
                if args.context_type.strip() == '':
                    args.context_type = ctxs[0]
                if args.context_type == ctxs[0].upper():
                    args.context_type = ctxs[0]
    
            print "\nTIP: context type can be specified with argument '--context-type TYPE':"
            print "         vsm train --context-type %s %s\n" % (args.context_type, args.config_file)
    
    
        print "\nTIP: This configuration can be automated as:"
        print "         vsm train %s --iter %d --context-type %s -k %s\n" %\
            (args.config_file, args.iter, args.context_type, 
                ' '.join(map(str, args.k)))
        model_pattern = build_models(corpus, corpus_filename, model_path, 
                                     args.context_type, args.k,
                                     n_iterations=args.iter,
                                     n_proc=args.processes, seed=args.seed,
                                     dry_run=args.dry_run)
    config.set("main", "model_pattern", model_pattern)
    if args.context_type:
        # test for presence, since continuing doesn't require context_type
        config.set("main", "context_type", args.context_type)
    args.k.sort()
    config.set("main", "topics", str(args.k))
    
    if not args.dry_run:
        with open(args.config_file, "wb") as configfh:
            config.write(configfh)

Пример #7

Показать файл

def main(args):

    config = ConfigParser({"sentences": "False"})
    config.read(args.config_file)
    corpus_filename = config.get("main", "corpus_file")
    model_path = config.get("main", "path")

    if config.getboolean("main", "sentences"):
        from vsm.extensions.ldasentences import CorpusSent as Corpus
    else:
        from vsm.corpus import Corpus

    if args.k is None:
        try:
            if config.get("main", "topics"):
                default = ' '.join(map(str, eval(config.get("main",
                                                            "topics"))))
            else:
                raise NoOptionError
        except NoOptionError:
            default = ' '.join(map(str, range(20, 100, 20)))

        while args.k is None:
            ks = raw_input(
                "Number of Topics [Default '{0}']: ".format(default))
            try:
                if ks:
                    args.k = [int(n) for n in ks.split()]
                elif not ks.strip():
                    args.k = [int(n) for n in default.split()]

                if args.k:
                    print "\nTIP: number of topics can be specified with argument '-k N N N ...':"
                    print "         vsm train %s -k %s\n" %\
                             (args.config_file, ' '.join(map(str, args.k)))
            except ValueError:
                print "Enter valid integers, separated by spaces!"

    if args.processes < 0:
        import multiprocessing
        args.processes = multiprocessing.cpu_count() + args.processes

    print "Loading corpus... "
    corpus = Corpus.load(corpus_filename)

    try:
        model_pattern = config.get("main", "model_pattern")
    except NoOptionError:
        model_pattern = None

    if model_pattern is not None and\
        bool_prompt("Existing models found. Continue training?", default=True):

        from vsm.model.lda import LDA
        m = LDA.load(model_pattern.format(args.k[0]),
                     multiprocessing=args.processes > 1,
                     n_proc=args.processes)

        if args.iter is None:
            args.iter = int_prompt("Total number of training iterations:",
                                   default=int(m.iteration * 1.5),
                                   min=m.iteration)

            print "\nTIP: number of training iterations can be specified with argument '--iter N':"
            print "         vsm train --iter %d %s\n" % (args.iter,
                                                         args.config_file)

        del m

        # if the set changes, build some new models and continue some old ones

        config_topics = eval(config.get("main", "topics"))
        if args.k != config_topics:
            new_models = set(args.k) - set(config_topics)
            continuing_models = set(args.k) & set(config_topics)

            build_models(corpus,
                         corpus_filename,
                         model_path,
                         config.get("main", "context_type"),
                         new_models,
                         n_iterations=args.iter,
                         n_proc=args.processes,
                         seed=args.seed)

            model_pattern = continue_training(model_pattern,
                                              continuing_models,
                                              args.iter,
                                              n_proc=args.processes)

        else:
            model_pattern = continue_training(model_pattern,
                                              args.k,
                                              args.iter,
                                              n_proc=args.processes)

    else:
        # build a new model
        if args.iter is None:
            args.iter = int_prompt("Number of training iterations:",
                                   default=200)

            print "\nTIP: number of training iterations can be specified with argument '--iter N':"
            print "         vsm train --iter %d %s\n" % (args.iter,
                                                         args.config_file)

        ctxs = corpus.context_types
        ctxs = sorted(ctxs, key=lambda ctx: len(corpus.view_contexts(ctx)))
        if args.context_type not in ctxs:
            while args.context_type not in ctxs:
                contexts = ctxs[:]
                contexts[0] = contexts[0].upper()
                contexts = '/'.join(contexts)
                args.context_type = raw_input("Select a context type [%s] : " %
                                              contexts)
                if args.context_type.strip() == '':
                    args.context_type = ctxs[0]
                if args.context_type == ctxs[0].upper():
                    args.context_type = ctxs[0]

            print "\nTIP: context type can be specified with argument '--context-type TYPE':"
            print "         vsm train --context-type %s %s\n" % (
                args.context_type, args.config_file)

        print "\nTIP: This configuration can be automated as:"
        print "         vsm train %s --iter %d --context-type %s -k %s\n" %\
            (args.config_file, args.iter, args.context_type,
                ' '.join(map(str, args.k)))
        model_pattern = build_models(corpus,
                                     corpus_filename,
                                     model_path,
                                     args.context_type,
                                     args.k,
                                     n_iterations=args.iter,
                                     n_proc=args.processes,
                                     seed=args.seed,
                                     dry_run=args.dry_run)
    config.set("main", "model_pattern", model_pattern)
    if args.context_type:
        # test for presence, since continuing doesn't require context_type
        config.set("main", "context_type", args.context_type)
    args.k.sort()
    config.set("main", "topics", str(args.k))

    if not args.dry_run:
        with open(args.config_file, "wb") as configfh:
            config.write(configfh)

Пример #8

Показать файл

Файл: prep.py Проект: inpho/topic-explorer

def main(args):
    config = topicexplorer.config.read(args.config_file)

    if config.getboolean("main", "sentences"):
        from vsm.extensions.ldasentences import CorpusSent as Corpus
    else:
        from vsm.corpus import Corpus

    if args.lang is None:
        args.lang = []

    args.corpus_path = config.get("main", "corpus_file")
    c = Corpus.load(args.corpus_path)

    if c.original_length != len(c.corpus):
        print("Corpus has already been prepared. Proceed to training or")
        print("re-init the corpus to apply a different set of stopwords.")
        print("\nTIP: Train the LDA models with:")
        print("         topicexplorer train", args.config_file)
        sys.exit(1)

    # auto-guess a language
    """
    new_langs = [lang for lang in detect_langs(c) if lang in langs and lang not in args.lang]
    if new_langs:
        args.lang.extend(new_langs)
    """

    # add default locale if no other languages are specified
    # do not add if in quiet mode -- make everything explicit
    if not args.lang and not args.quiet:
        import locale
        locale = locale.getdefaultlocale()[0].split('_')[0].lower()
        if locale in langs.keys():
            args.lang.append(locale)

    # check for any new candidates
    args.lang = [lang for lang in args.lang if stop_language(c, langs[lang])]
    if args.lang and not args.quiet:
        args.lang = lang_prompt(args.lang)

    stoplist = set()
    # Apply stop words
    print(" ")
    for lang in args.lang:
        print("Applying", langs[lang], "stopwords")
        candidates = stop_language(c, langs[lang])
        if len(candidates):
            stoplist.update(candidates)

    # Apply custom stopwords file
    if args.stopword_file:
        with open(args.stopword_file, encoding='utf8') as swf:
            #candidates = [unidecode(word.strip()) for word in swf]
            candidates = [word.strip() for word in swf]

            if len(candidates):
                print("Applying custom stopword file to remove {} word{}.".format(
                    len(candidates), 's' if len(candidates) > 1 else ''))
                stoplist.update(candidates)
    
    if args.min_word_len:
        candidates = get_small_words(c, args.min_word_len)
        if len(candidates):
            print("Filtering {} small word{} with less than {} characters.".format(
                len(candidates), 's' if len(candidates) > 1 else '', args.min_word_len))
            stoplist.update(candidates)


    # cache item counts
    items, counts = get_corpus_counts(c)
    if args.high_filter is None and args.high_percent is None and not args.quiet:
        args.high_filter, candidates = get_high_filter(c, words=stoplist, items=items, counts=counts)
        if len(candidates):
            print("Filtering {} high frequency word{}.".format(len(candidates),
                                                               's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)
    elif args.high_filter is None and args.high_percent is None and args.quiet:
        pass
    elif args.high_filter:
        candidates = get_candidate_words(c, args.high_filter, sort=False, items=items, counts=counts)
        if len(candidates):
            print("Filtering {} high frequency word{}.".format(len(candidates),
                                                               's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)
    elif args.high_percent:
        args.high_filter = get_closest_bin(c, 1 - (args.high_percent / 100.), counts=counts)
        print(args.high_filter)
        candidates = get_candidate_words(c, args.high_filter, sort=False, items=items, counts=counts)
        if len(candidates):
            print("Filtering {} high frequency word{}.".format(len(candidates),
                                                               's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)

    if args.low_filter is None and args.low_percent is None and not args.quiet:
        args.low_filter, candidates = get_low_filter(c, words=stoplist, items=items, counts=counts)
        if len(candidates):
            print("Filtering {} low frequency word{}.".format(len(candidates),
                                                              's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)
    elif args.low_filter is None and args.low_percent is None and args.quiet:
        pass
    elif args.low_filter:
        candidates = get_candidate_words(c, -1 * args.low_filter, sort=False, items=items, counts=counts)
        if len(candidates):
            print("Filtering {} low frequency words.".format(len(candidates)))
            stoplist.update(candidates)

    elif args.low_percent:
        args.low_filter = get_closest_bin(c, 1 - (args.low_percent / 100.), reverse=True, counts=counts)
        print(args.low_filter)
        candidates = get_candidate_words(c, -1 * args.low_filter, sort=False, items=items, counts=counts)
        if len(candidates):
            print("Filtering {} low frequency word{}.".format(len(candidates),
                                                               's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)



    if not stoplist:
        print("No stopwords applied.\n\n")

        sys.exit(0)
    else:
        print("\n\nApplying {} stopword{}".format(len(stoplist),
                                                  's' if len(stoplist) > 1 else ''))
        c.in_place_stoplist(stoplist)
        print("\n")

    def name_corpus(dirname, languages, lowfreq=None, highfreq=None):
        corpus_name = [dirname]

        if args.lang:
            corpus_name.append('nltk')
            corpus_name.append(''.join(args.lang))

        if lowfreq is not None and lowfreq > 0:
            corpus_name.append('freq%s' % lowfreq)
        if highfreq is not None and highfreq > 0:
            corpus_name.append('N%s' % highfreq)

        corpus_name = '-'.join(corpus_name)
        corpus_name += '.npz'
        return corpus_name

    dirname = os.path.basename(args.corpus_path).split('-nltk-')[0].replace('.npz', '')
    corpus_name = name_corpus(dirname, ['en'], args.low_filter, args.high_filter)

    model_path = os.path.dirname(args.corpus_path)
    args.corpus_path = os.path.join(model_path, corpus_name)
    c.save(args.corpus_path)

    config.set("main", "corpus_file", args.corpus_path)
    config.remove_option("main", "model_pattern")
    with open(args.config_file, 'w') as configfh:
        config.write(configfh)

Пример #9

Показать файл

Файл: prep.py Проект: LinkedModernismProject/web_code

def main(args):
    config = ConfigParser({"htrc": False,
                           "sentences": "False"})
    config.read(args.config_file)

    if config.getboolean("main", "sentences"):
        from vsm.extensions.ldasentences import CorpusSent as Corpus
    else:
        from vsm.corpus import Corpus

    if args.lang is None:
        args.lang = []

    args.corpus_path = config.get("main", "corpus_file")
    c = Corpus.load(args.corpus_path)

    # check for htrc metadata
    if args.htrc or config.get("main", "htrc"):
        htrc_langs = get_htrc_langs(args)
        if htrc_langs:
            args.lang.extend(new_langs)

    # auto-guess a language
    """
    new_langs = [lang for lang in detect_langs(c) if lang in langs and lang not in args.lang]
    if new_langs:
        args.lang.extend(new_langs)
    """

    # add default locale if no other languages are specified
    # do not add if in quiet mode -- make everything explicit
    if not args.lang and not args.quiet:
        import locale
        locale = locale.getdefaultlocale()[0].split('_')[0].lower()
        if locale in langs.keys():
            args.lang.append(locale)

    # check for any new candidates
    args.lang = [lang for lang in args.lang if stop_language(c, langs[lang])]
    if args.lang and not args.quiet:
        args.lang = lang_prompt(args.lang)

    stoplist = set()
    # Apply stop words
    print(" ")
    for lang in args.lang:
        print("Applying", langs[lang], "stopwords")
        candidates = stop_language(c, langs[lang])
        if len(candidates):
            stoplist.update(candidates)

    # Apply custom stopwords file
    if args.stopword_file:
        with open(args.stopword_file, encoding='utf8') as swf:
            #candidates = [unidecode(word.strip()) for word in swf]
            candidates = [word.strip() for word in swf]

            if len(candidates):
                print("Applying custom stopword file to remove {} word{}.".format(
                    len(candidates), 's' if len(candidates) > 1 else ''))
                stoplist.update(candidates)

    if args.min_word_len:
        candidates = get_small_words(c, args.min_word_len)
        if len(candidates):
            print("Filtering {} small word{} with less than {} characters.".format(
                len(candidates), 's' if len(candidates) > 1 else '', args.min_word_len))
            stoplist.update(candidates)

    if not args.special_chars:
        candidates = get_special_chars(c)
        if len(candidates):
            print("Filtering {} word{} with special characters.".format(
                len(candidates), 's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)

    if args.high_filter is None and not args.quiet:
        args.high_filter, candidates = get_high_filter(args, c, words=stoplist)
        if len(candidates):
            print("Filtering {} high frequency word{}.".format(len(candidates),
                                                               's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)
    elif args.high_filter > 0:
        candidates = get_candidate_words(c, args.high_filter, sort=False)
        if len(candidates):
            print("Filtering {} high frequency word{}.".format(len(candidates),
                                                               's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)

    if args.low_filter is None and not args.quiet:
        args.low_filter, candidates = get_low_filter(args, c, words=stoplist)
        if len(candidates):
            print("Filtering {} low frequency word{}.".format(len(candidates),
                                                              's' if len(candidates) > 1 else ''))
            stoplist.update(candidates)
    elif args.low_filter > 0:
        candidates = get_candidate_words(c, -1 * args.low_filter, sort=False)
        if len(candidates):
            print("Filtering {} low frequency words.".format(len(candidates)))
            stoplist.update(candidates)

    if not stoplist:
        print("No stopwords applied.\n\n")

        sys.exit(0)
    else:
        print("\n\nApplying {} stopword{}".format(len(stoplist),
                                                  's' if len(stoplist) > 1 else ''))
        c.in_place_stoplist(stoplist)
        print("\n")

    def name_corpus(dirname, languages, lowfreq=None, highfreq=None):
        items, counts = get_items_counts(c.corpus)

        corpus_name = [dirname]
        if args.lang:
            corpus_name.append('nltk')
            corpus_name.append(''.join(args.lang))
        if lowfreq > 0:
            corpus_name.append('freq%s' % lowfreq)
        else:
            corpus_name.append('freq%s' % min(counts))

        if highfreq > 0:
            corpus_name.append('N%s' % highfreq)
        else:
            corpus_name.append('freq%s' % max(counts))

        corpus_name = '-'.join(corpus_name)
        corpus_name += '.npz'
        return corpus_name

    dirname = os.path.basename(args.corpus_path).split('-nltk-')[0].replace('.npz', '')
    corpus_name = name_corpus(dirname, ['en'], args.low_filter, args.high_filter)

    model_path = os.path.dirname(args.corpus_path)
    args.corpus_path = os.path.join(model_path, corpus_name)
    c.save(args.corpus_path)

    config.set("main", "corpus_file", args.corpus_path)
    config.remove_option("main", "model_pattern")
    with open(args.config_file, 'w') as configfh:
        config.write(configfh)