Example #1
def get_docs(docs=None, id_as_key=False, query=None):
    ctx_md = lda_v.corpus.view_metadata(context_type)
    
    if docs:
        # filter to metadata for selected docs
        ids = [lda_v.corpus.meta_int(context_type, {doc_label_name(context_type): doc}) for doc in docs]
        ctx_md = ctx_md[ids]
    else:
        # get metadata for all documents
        docs = lda_v.corpus.view_metadata(context_type)[doc_label_name(context_type)]
    
    js = dict() if id_as_key else list()

    for doc, md in zip(docs, ctx_md):
        if query is None or query.lower() in label(doc).lower():
            struct = {
                'id': doc,
                'label': label(doc),
                'metadata': dict(zip(md.dtype.names, [str(m) for m in md]))}
            if id_as_key:
                js[doc] = struct
            else:
                js.append(struct)

    return js
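A minimal usage sketch for get_docs, assuming the module-level globals used above (lda_v, context_type, label, doc_label_name) are already initialised; the query string is hypothetical:

all_docs = get_docs()                    # list of {'id', 'label', 'metadata'} dicts for every document
by_id = get_docs(id_as_key=True)         # same records, keyed by document id
matches = get_docs(query='darwin')       # only documents whose label contains 'darwin'

for entry in matches:
    print(entry['id'], entry['label'])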
Example #2
def get_docs(docs=None, id_as_key=False, query=None):
    ctx_md = lda_v.corpus.view_metadata(context_type)

    if docs:
        # filter to metadata for selected docs
        ids = [
            lda_v.corpus.meta_int(context_type,
                                  {doc_label_name(context_type): doc})
            for doc in docs
        ]
        ctx_md = ctx_md[ids]
    else:
        # get metadata for all documents
        docs = lda_v.corpus.view_metadata(context_type)[doc_label_name(
            context_type)]

    js = dict() if id_as_key else list()

    for doc, md in zip(docs, ctx_md):
        if query is None or query.lower() in label(doc).lower():
            struct = {
                'id': doc,
                'label': label(doc),
                'metadata': dict(zip(md.dtype.names, [str(m) for m in md]))
            }
            if id_as_key:
                js[doc] = struct
            else:
                js.append(struct)

    return js
Example #3
def add_htrc_metadata(config, corpus=None, corpus_filename=None):
    import htrc.metadata

    config.set("main", "label_module", "topicexplorer.extensions.htrc")
    config.set("www", "doc_title_format", '<a href="{1}">{0}</a>')
    config.set("www", "doc_url_format", 'http://hdl.handle.net/2027/{0}')
    config.set("www", "icons", "htrcbook,link")
    config.set("main", "htrc", True)
    
    if corpus_filename:
        corpus = Corpus.load(corpus_filename)
        config.set("main", "context_type", corpus.context_types[0])
    
    if corpus:
        ctx_type = config.get("main", "context_type")
        label_name = doc_label_name(ctx_type)
        ids = corpus.view_metadata(ctx_type)[label_name]
        
        htrc_metapath = os.path.abspath(config.get("main", "corpus_file"))
        htrc_metapath = os.path.join(
            os.path.dirname(htrc_metapath),
            os.path.basename(htrc_metapath) + '.metadata.json')

        print("Downloading metadata to ", htrc_metapath)
        htrc.metadata.get_metadata(ids, output_file=htrc_metapath)
        
        config.set("www", "htrc_metadata", htrc_metapath)

    return config
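For orientation, a small sketch of the path derivation above: the HTRC metadata is downloaded next to the corpus file named in main.corpus_file, with a .metadata.json suffix, and that path is then stored as www.htrc_metadata (the corpus_file value here is hypothetical):

import os.path

corpus_file = '/srv/corpora/project.npz'            # hypothetical main.corpus_file value
htrc_metapath = os.path.abspath(corpus_file)
htrc_metapath = os.path.join(os.path.dirname(htrc_metapath),
                             os.path.basename(htrc_metapath) + '.metadata.json')
print(htrc_metapath)                                # /srv/corpora/project.npz.metadata.json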
Example #4
def docs(docs=None, q=None):
    response.content_type = 'application/json; charset=UTF8'
    response.set_header('Expires', _cache_date())
    
    try:
        if request.query.q:
            q = unquote(request.query.q)
    except:
        pass

    try: 
        if request.query.id:
            docs = [unquote(request.query.id)]
    except:
        pass
    
    try: 
        response.set_header('Expires', 0)
        response.set_header('Pragma', 'no-cache')
        response.set_header('Cache-Control', 'no-cache, no-store, must-revalidate')
        if request.query.random:
            docs = [np.random.choice(lda_v.corpus.view_metadata(context_type)[doc_label_name(context_type)])]
    except:
        pass

    js = get_docs(docs, query=q)

    return json.dumps(js)
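The handler above reads bottle-style request and response globals; a hedged sketch of how such a view might be mounted (the route path and app object are assumptions, not taken from the source):

from bottle import Bottle

app = Bottle()
app.route('/docs', 'GET', docs)   # e.g. GET /docs?q=darwin, /docs?id=doc1.txt or /docs?random=1
# app.run(host='localhost', port=8080)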
Example #5
def docs(docs=None, q=None):
    response.content_type = 'application/json; charset=UTF8'
    response.set_header('Expires', _cache_date())

    try:
        if request.query.q:
            q = unquote(request.query.q)
    except:
        pass

    try:
        if request.query.id:
            docs = [unquote(request.query.id)]
    except:
        pass

    try:
        response.set_header('Expires', 0)
        response.set_header('Pragma', 'no-cache')
        response.set_header('Cache-Control',
                            'no-cache, no-store, must-revalidate')
        if request.query.random:
            docs = [
                np.random.choice(
                    lda_v.corpus.view_metadata(context_type)[doc_label_name(
                        context_type)])
            ]
    except:
        pass

    js = get_docs(docs, query=q)

    return json.dumps(js)
Example #6
def add_htrc_metadata(config, corpus=None):
    import htrc.metadata

    config.set("main", "label_module", "topicexplorer.extensions.htrc")
    config.set("www", "doc_title_format", '<a href="{1}">{0}</a>')
    config.set("www", "doc_url_format", 'http://hdl.handle.net/2027/{0}')
    config.set("www", "icons", "htrcbook,link")
    config.set("main", "htrc", True)

    if corpus:
        ctx_type = config.get("main", "context_type")
        label_name = doc_label_name(ctx_type)
        ids = corpus.view_metadata(ctx_type)[label_name]

        htrc_metapath = os.path.abspath(config.get("main", "corpus_file"))
        htrc_metapath = os.path.join(
            os.path.dirname(htrc_metapath),
            os.path.basename(htrc_metapath) + '.metadata.json')

        print("Downloading metadata to ", htrc_metapath)
        htrc.metadata.get_metadata(ids, output_file=htrc_metapath)

        config.set("www", "htrc_metadata", htrc_metapath)

    return config
Example #7
def build_corpus(corpus_path, model_path, nltk_stop=False, stop_freq=0,
                 context_type='document', ignore=['.json', '.log', '.err', '.pickle', '.npz'],
                 decode=True, sentences=False, simple=True, tokenizer='default'):

    from vsm.corpus import Corpus

    # ensure that nltk packages are downloaded
    ensure_nltk_data_downloaded()

    # import appropriate tokenizer
    if tokenizer == 'default':
        from vsm.extensions.corpusbuilders.util import word_tokenize
        tokenizer = word_tokenize
    elif tokenizer == 'simple':
        from topicexplorer.tokenizer import simple_tokenizer
        tokenizer = simple_tokenizer
    elif tokenizer == 'zh':
        from topicexplorer.lib.chinese import modern_chinese_tokenizer
        tokenizer = modern_chinese_tokenizer
    elif tokenizer == 'ltc' or tokenizer == 'och':
        from topicexplorer.lib.chinese import ancient_chinese_tokenizer
        tokenizer = ancient_chinese_tokenizer
    elif tokenizer == 'inpho':
        from topicexplorer.extensions.inpho import inpho_tokenizer
        tokenizer = inpho_tokenizer
    elif tokenizer == 'brain':
        from hyperbrain.parse import brain_tokenizer
        tokenizer = brain_tokenizer
    else:
        raise NotImplementedError(
            "Tokenizer '{}' is not included in topicexplorer".format(tokenizer))

    # pre-process PDF files
    contains_pdfs = corpus_path[-4:] == '.pdf' or contains_pattern(corpus_path, '*.pdf')
    if contains_pdfs:
        corpus_path = process_pdfs(corpus_path)

    print("Building corpus from", corpus_path, end=' ')
    corpusbuilder = get_corpusbuilder_fn(corpus_path, sentences, ignore=ignore)
    print("with {} function".format(corpusbuilder.__name__))
    c = corpusbuilder(corpus_path, nltk_stop=nltk_stop,
                      stop_freq=stop_freq, ignore=ignore, decode=decode,
                      simple=simple, tokenizer=tokenizer)

    if contains_pdfs:
        from vsm.viewer.wrappers import doc_label_name
        import re
        label_name = doc_label_name(c.context_types[0])
        new_labels = [re.sub('txt$', 'pdf', label) for label in c.context_data[0][label_name]]
        c.context_data[0][label_name] = new_labels

    filename = get_corpus_filename(
        corpus_path, model_path, nltk_stop, stop_freq, context_type)
    c.save(filename)
    return filename
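A hedged usage sketch for build_corpus; the directory and model paths are hypothetical, and the keyword values are just illustrative choices:

corpus_file = build_corpus('data/texts/', 'models/',
                           nltk_stop=True, stop_freq=5,
                           context_type='document')
print("Corpus saved to", corpus_file)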
Example #8
def extract_labels(corpus, ctx_type, filename):
    """
    Creates a new csv where each row is a label in the corpus.
    """
    label_name = doc_label_name(ctx_type)
    labels = corpus.view_metadata(ctx_type)[label_name]

    with open(filename, 'w') as outfile:
        outfile.write(label_name + '\n')
        for label in labels:
            outfile.write(label + '\n')
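A hedged usage sketch for extract_labels (file names are hypothetical): the resulting file has the label field name as its header row, followed by one label per line, which makes it a convenient starting point for a metadata spreadsheet:

from vsm.corpus import Corpus

corpus = Corpus.load('project.npz')
extract_labels(corpus, corpus.context_types[0], 'labels.csv')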
Example #9
def extract_labels(corpus, ctx_type, filename):
    """
    Creates a new csv where each row is a label in the corpus.
    """
    label_name = doc_label_name(ctx_type)
    labels = corpus.view_metadata(ctx_type)[label_name]

    with open(filename, 'w') as outfile:
        outfile.write(label_name + '\n')
        for label in labels:
            outfile.write(label + '\n')
Example #10
def parse_metadata_from_csvfile(filename, context_type):
    """
    Takes a csvfile where the first column in each row is the label.
    Returns a dictionary of dictionaries where each key is the label,
    and each value is a dictionary of field values.
    """
    label_name = doc_label_name(context_type)

    with open(filename, encoding='utf8') as csvfile:
        reader = UnicodeDictReader(csvfile)
        metadata = SortedDict()
        for row in reader:
            metadata[row[label_name]] = row

    return metadata
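A hedged sketch of the expected input and output for parse_metadata_from_csvfile, assuming doc_label_name('document') resolves to 'document_label' and a hypothetical metadata.csv whose first column holds that label:

# metadata.csv (hypothetical):
#   document_label,author,year
#   doc1.txt,Darwin,1859
#   doc2.txt,Paley,1802

metadata = parse_metadata_from_csvfile('metadata.csv', 'document')
# metadata['doc1.txt'] -> {'document_label': 'doc1.txt', 'author': 'Darwin', 'year': '1859'}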
Example #11
def parse_metadata_from_csvfile(filename, context_type):
    """
    Takes a csvfile where the first column in each row is the label.
    Returns a dictionary of dictionaries where each key is the label,
    and each value is a dictionary of field values.
    """
    label_name = doc_label_name(context_type)

    with open(filename, encoding='utf8') as csvfile:
        reader = UnicodeDictReader(csvfile,
                                   delimiter='\t',
                                   quoting=csv.QUOTE_NONE)
        metadata = SortedDict()
        for row in reader:
            metadata[row[label_name]] = row

    return metadata
Example #12
def docs(request):
    """!
    Función para servir los documentos como un json

    @author Jorge Redondo (jredondo at cenditel.gob.ve)
    @copyright GNU/GPLv2
    @param request <b>{object}</b> Objeto que mantiene la peticion
    @return Retorna el objeto json
    """
    global lda_v
    try:
        docs = lda_v.corpus.view_metadata(context_type)[doc_label_name(context_type)]
        js = list()
        for doc in docs:
            js.append({
                'id': doc,
                'label': label(doc)
            })
        return HttpResponse(json.dumps(js))
    except:
        return dump_exception()
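The view above returns a Django HttpResponse, so it would be wired into a URLconf roughly as follows (the URL pattern and module layout are assumptions):

# urls.py (hypothetical layout)
from django.urls import path

from .views import docs

urlpatterns = [
    path('docs/', docs, name='docs'),
]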
Example #13
def add_metadata(corpus, ctx_type, new_metadata, force=False, rename=False):
    import sys

    import vsm.corpus

    # get existing metadata
    i = corpus.context_types.index(ctx_type)
    md = corpus.context_data[i]
    fields = md.dtype.fields.keys()

    # sort new_metadata according to existing md order
    # Note: this may raise a KeyError, in which case there is no metadata
    # for each entry.
    label_name = doc_label_name(ctx_type)
    labels = md[label_name]
    if rename:
        new_data = new_metadata.values()
    else:
        try:
            if force:
                new_data = [new_metadata.get(id, {}) for id in labels]
            else:
                new_data = [new_metadata[id] for id in labels]

            if not new_data:
                print("No metadata labels match existing labels.")
                print("If you changed labels, run with the `--rename` flag.")
                sys.exit(0)
            elif not force and len(new_data) != len(labels):
                raise KeyError
        except KeyError:
            print("New metadata does not span all documents in corpus.")
            print("If you changed labels, run with the `--rename` flag.")
            print("If you wish to force insertion of blank metadata,")
            print("run with the `--force` flag.")
            sys.exit(1)

    # look for new fields
    new_fields = set()
    for vals in new_metadata.values():
        new_fields.update(vals.keys())

    # differentiate new and updated fields
    updated_fields = new_fields.intersection(fields)
    if not rename:
        updated_fields.remove(label_name)
    new_fields = new_fields.difference(fields)
    if None in new_fields:
        new_fields.remove(None)

    # process new fields
    for field in new_fields:
        if force:
            data = [d.get(field, '') for d in new_data]
        else:
            # new_data is a sorted list of metadata dictionaries
            data = [d[field] for d in new_data]
        corpus = vsm.corpus.add_metadata(corpus, ctx_type, field, data)

    # process existing fields
    for field in updated_fields:
        if force:
            data = [d.get(field, '') for d in new_data]
        else:
            data = [d[field] for d in new_data]
        corpus.context_data[i][field] = data

    return corpus
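A hedged end-to-end sketch tying the helpers above together (file names are hypothetical): load a corpus, parse a metadata spreadsheet keyed by label, merge it in with add_metadata and save the corpus back out:

from vsm.corpus import Corpus

corpus = Corpus.load('project.npz')
ctx_type = corpus.context_types[0]
new_md = parse_metadata_from_csvfile('metadata.csv', ctx_type)
corpus = add_metadata(corpus, ctx_type, new_md)   # csv must cover every label, or pass force=True
corpus.save('project.npz')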
Example #14
def load_corpus(self):
    self.c = Corpus.load(self.config.get('main', 'corpus_file'))
    self.context_type = self.config.get('main', 'context_type')
    self.ctx_metadata = self.c.view_metadata(self.context_type)
    self.all_ids = self.ctx_metadata[doc_label_name(self.context_type)]
Example #15
config_file = r"$config_file"
config = ConfigParser({
        'topic_range': None,
        'topics': None,
        'sentences': 'false'})
config.read(config_file)

# load the corpus
if config.getboolean('main','sentences'):
    from vsm.extensions.ldasentences import CorpusSent
    c = CorpusSent.load(config.get('main', 'corpus_file'))
else:
    c = Corpus.load(config.get('main', 'corpus_file'))
context_type = config.get('main', 'context_type')
ctx_metadata = c.view_metadata(context_type)
all_ids = ctx_metadata[doc_label_name(context_type)]

# create topic model patterns
pattern = config.get('main', 'model_pattern')
if config.get('main', 'topic_range'):
    topic_range = map(int, config.get('main', 'topic_range').split(','))
    topic_range = range(*topic_range)
if config.get('main', 'topics'):
    topic_range = eval(config.get('main', 'topics'))

# load the topic models
class keydefaultdict(defaultdict):
    """ Solution from: http://stackoverflow.com/a/2912455 """
    def __missing__(self, key):
        if self.default_factory is None:
            raise KeyError(key)
        else:
            # standard keydefaultdict recipe: build the default value from the key
            ret = self[key] = self.default_factory(key)
            return ret
Example #16
def load_corpus(self):
    self.c = Corpus.load(self.config.get('main', 'corpus_file'))
    self.context_type = self.config.get('main', 'context_type')
    self.ctx_metadata = self.c.view_metadata(self.context_type)
    self.all_ids = self.ctx_metadata[doc_label_name(self.context_type)]
Example #17
def build_corpus(corpus_path,
                 model_path,
                 nltk_stop=False,
                 stop_freq=0,
                 context_type='document',
                 ignore=['.json', '.log', '.err', '.pickle', '.npz'],
                 decode=True,
                 sentences=False,
                 simple=True,
                 tokenizer='default'):

    from vsm.corpus import Corpus
    from vsm.extensions.corpusbuilders import coll_corpus, dir_corpus, toy_corpus
    if sentences:
        print "Importing sentence constructors"
        from vsm.extensions.ldasentences import dir_corpus, toy_corpus

    # import appropriate tokenizer
    if tokenizer == 'default':
        from vsm.extensions.corpusbuilders.util import word_tokenize
        tokenizer = word_tokenize
    elif tokenizer == 'zh':
        from topicexplorer.lib.chinese import modern_chinese_tokenizer
        tokenizer = modern_chinese_tokenizer
    elif tokenizer == 'ltc' or tokenizer == 'och':
        from topicexplorer.lib.chinese import ancient_chinese_tokenizer
        tokenizer = ancient_chinese_tokenizer
    else:
        raise NotImplementedError(
            "Tokenizer '{}' is not included in topicexplorer".format(
                tokenizer))

    # pre-process PDF files
    contains_pdfs = corpus_path[-4:] == '.pdf' or contains_pattern(
        corpus_path, '*.pdf')
    if contains_pdfs:
        corpus_path = process_pdfs(corpus_path)

    print "Building corpus from", corpus_path,
    corpusbuilder = get_corpusbuilder_fn(corpus_path, sentences, ignore=ignore)
    print "with {} function".format(corpusbuilder.__name__)

    c = corpusbuilder(corpus_path,
                      nltk_stop=nltk_stop,
                      stop_freq=stop_freq,
                      ignore=ignore,
                      decode=decode,
                      simple=simple,
                      tokenizer=tokenizer)
    '''
    if os.path.isfile(corpus_path):
        print "Constructing toy corpus, each line is a document"
        if sentences:
            c = toy_corpus(corpus_path, is_filename=True, nltk_stop=nltk_stop, 
                           stop_freq=stop_freq, autolabel=True, decode=decode)
        else:
            c = toy_corpus(corpus_path, is_filename=True, nltk_stop=nltk_stop, 
                           stop_freq=stop_freq, autolabel=True, decode=decode,
                           simple=simple, tokenizer=tokenizer)
    elif os.path.isdir(corpus_path):
        contents = listdir_nohidden(corpus_path)
        contents = [os.path.join(corpus_path,obj) for obj in contents 
            if not any([obj.endswith(suffix) for suffix in ignore])]
        count_dirs = len(filter(os.path.isdir, contents))
        count_files = len(filter(os.path.isfile, contents))

        print "Detected %d folders and %d files in %s" %\
            (count_dirs, count_files, corpus_path)

        if count_files > 0 and count_dirs == 0:
            print "Constructing directory corpus, each file is a document"
            if sentences:
                c = dir_corpus(corpus_path, nltk_stop=nltk_stop,
                               stop_freq=stop_freq, chunk_name=context_type,
                               ignore=ignore, decode=decode)
            else:
                c = dir_corpus(corpus_path, nltk_stop=nltk_stop,
                               stop_freq=stop_freq, chunk_name=context_type,
                               ignore=ignore, decode=decode, simple=simple, 
                               tokenizer=tokenizer)
        elif count_dirs > 0 and count_files == 0 and not sentences:
            print "Constructing collection corpus, each folder is a document"
            context_type='book'
            c = coll_corpus(corpus_path, nltk_stop=nltk_stop,
                            stop_freq=stop_freq, ignore=ignore, decode=decode,
                            simple=simple, tokenizer=tokenizer)
        elif count_dirs > 0 and count_files == 0 and sentences:
            raise NotImplementedError("""Collection corpuses are too large for
            sentence parsing. Reduce your corpus to a single folder or
            file.""")
        else:
            raise IOError("Invalid Path: empty directory")
    else:
        raise IOError("Invalid path")
    '''

    if contains_pdfs:
        from vsm.viewer.wrappers import doc_label_name
        import re
        label_name = doc_label_name(c.context_types[0])
        new_labels = [
            re.sub('txt$', 'pdf', label)
            for label in c.context_data[0][label_name]
        ]
        c.context_data[0][label_name] = new_labels

    filename = get_corpus_filename(corpus_path, model_path, nltk_stop,
                                   stop_freq, context_type)
    c.save(filename)
    return filename
Example #18
from vsm.viewer.wrappers import doc_label_name

import os.path

# load in the configuration file
from ConfigParser import ConfigParser as ConfigParser

config_file = r"$config_file"
config = ConfigParser({'topic_range': None, 'topics': None})
config.read(config_file)

# load the corpus
c = Corpus.load(config.get('main', 'corpus_file'))
context_type = config.get('main', 'context_type')
ctx_metadata = c.view_metadata(context_type)
all_ids = ctx_metadata[doc_label_name(context_type)]

# create topic model patterns
pattern = config.get('main', 'model_pattern')
if config.get('main', 'topic_range'):
    topic_range = map(int, config.get('main', 'topic_range').split(','))
    topic_range = range(*topic_range)
if config.get('main', 'topics'):
    topic_range = eval(config.get('main', 'topics'))

# load the topic models
lda_m = dict()
lda_v = dict()
print topic_range
print pattern
for k in topic_range:
Example #19
def build_corpus(corpus_path,
                 model_path,
                 nltk_stop=False,
                 stop_freq=0,
                 context_type='document',
                 ignore=['.json', '.log', '.err', '.pickle', '.npz'],
                 decode=True,
                 sentences=False,
                 simple=True,
                 tokenizer='default'):

    from vsm.corpus import Corpus

    # ensure that nltk packages are downloaded
    ensure_nltk_data_downloaded()

    # import appropriate tokenizer
    if tokenizer == 'default':
        from vsm.extensions.corpusbuilders.util import word_tokenize
        tokenizer = word_tokenize
    elif tokenizer == 'simple':
        from topicexplorer.tokenizer import simple_tokenizer
        tokenizer = simple_tokenizer
    elif tokenizer == 'zh':
        from topicexplorer.lib.chinese import modern_chinese_tokenizer
        tokenizer = modern_chinese_tokenizer
    elif tokenizer == 'ltc' or tokenizer == 'och':
        from topicexplorer.lib.chinese import ancient_chinese_tokenizer
        tokenizer = ancient_chinese_tokenizer
    elif tokenizer == 'inpho':
        from topicexplorer.extensions.inpho import inpho_tokenizer
        tokenizer = inpho_tokenizer
    elif tokenizer == 'brain':
        from hyperbrain.parse import brain_tokenizer
        tokenizer = brain_tokenizer
    else:
        raise NotImplementedError(
            "Tokenizer '{}' is not included in topicexplorer".format(
                tokenizer))

    # pre-process PDF files
    contains_pdfs = corpus_path[-4:] == '.pdf' or contains_pattern(
        corpus_path, '*.pdf')
    if contains_pdfs:
        corpus_path = process_pdfs(corpus_path)

    print("Building corpus from", corpus_path, end=' ')
    corpusbuilder = get_corpusbuilder_fn(corpus_path, sentences, ignore=ignore)
    print("with {} function".format(corpusbuilder.__name__))
    c = corpusbuilder(corpus_path,
                      nltk_stop=nltk_stop,
                      stop_freq=stop_freq,
                      ignore=ignore,
                      decode=decode,
                      simple=simple,
                      tokenizer=tokenizer)

    if contains_pdfs:
        from vsm.viewer.wrappers import doc_label_name
        import re
        label_name = doc_label_name(c.context_types[0])
        new_labels = [
            re.sub('txt$', 'pdf', label)
            for label in c.context_data[0][label_name]
        ]
        c.context_data[0][label_name] = new_labels

    filename = get_corpus_filename(corpus_path, model_path, nltk_stop,
                                   stop_freq, context_type)
    c.save(filename)
    return filename
Example #20
def id_fn(md):
    context_md = lda_v.corpus.view_metadata(context_type)
    ctx_label = doc_label_name(context_type)
    return context_md[ctx_label]
Example #21
def add_metadata(corpus, ctx_type, new_metadata, force=False, rename=False):
    import sys

    import vsm.corpus
    
    # get existing metadata
    i = corpus.context_types.index(ctx_type)
    md = corpus.context_data[i]
    fields = md.dtype.fields.keys()

    # sort new_metadata according to existing md order
    # Note: this may raise a KeyError, in which case there is no metadata
    # for each entry.
    label_name = doc_label_name(ctx_type)
    labels = md[label_name]
    if rename:
        new_data = new_metadata.values()
    else:
        try:
            if force:
                new_data = [new_metadata.get(id, {}) for id in labels]
            else:
                new_data = [new_metadata[id] for id in labels]

            if not new_data:
                print("No metadata labels match existing labels.")
                print("If you changed labels, run with the `--rename` flag.")
                sys.exit(0)
            elif not force and len(new_data) != len(labels):
                raise KeyError
        except KeyError:
            print("New metadata does not span all documents in corpus.")
            print("If you changed labels, run with the `--rename` flag.")
            print("If you wish to force insertion of blank metadata,")
            print("run with the `--force` flag.")
            sys.exit(1)

    # look for new fields
    new_fields = set()
    for vals in new_metadata.values():
        new_fields.update(vals.keys())

    # differentiate new and updated fields
    updated_fields = new_fields.intersection(fields)
    if not rename:
        updated_fields.remove(label_name)
    new_fields = new_fields.difference(fields)
    if None in new_fields:
        new_fields.remove(None)

    # process new fields
    for field in new_fields:
        if force:
            data = [d.get(field, '') for d in new_data]
        else:
            # new_data is a sorted list of metadata dictionaries
            data = [d[field] for d in new_data]
        corpus = vsm.corpus.add_metadata(corpus, ctx_type, field, data)

    # process existing fields
    for field in updated_fields:
        if force:
            data = [d.get(field, '') for d in new_data]
        else:
            data = [d[field] for d in new_data]
        corpus.context_data[i][field] = data

    return corpus
Example #22
def id_fn(md):
    context_md = lda_v.corpus.view_metadata(context_type)
    ctx_label = doc_label_name(context_type)
    return context_md[ctx_label]
Example #23
def build_corpus(corpus_path, model_path, nltk_stop=False, stop_freq=0,
    context_type='document', ignore=['.json','.log','.err','.pickle','.npz'],
    decode=True, sentences=False, simple=True, tokenizer='default'):
   
    from vsm.corpus import Corpus
    from vsm.extensions.corpusbuilders import coll_corpus, dir_corpus, toy_corpus
    if sentences:
        print "Importing sentence constructors"
        from vsm.extensions.ldasentences import dir_corpus, toy_corpus


    # import appropriate tokenizer
    if tokenizer == 'default':
        from vsm.extensions.corpusbuilders.util import word_tokenize
        tokenizer = word_tokenize
    elif tokenizer == 'zh':
        from topicexplorer.lib.chinese import modern_chinese_tokenizer
        tokenizer = modern_chinese_tokenizer
    elif tokenizer == 'ltc' or tokenizer == 'och':
        from topicexplorer.lib.chinese import ancient_chinese_tokenizer
        tokenizer = ancient_chinese_tokenizer
    else:
        raise NotImplementedError("Tokenizer '{}' is not included in topicexplorer".format(tokenizer))


    # pre-process PDF files
    contains_pdfs = corpus_path[-4:] == '.pdf' or contains_pattern(corpus_path, '*.pdf')
    if contains_pdfs:
        corpus_path = process_pdfs(corpus_path)

    print "Building corpus from", corpus_path,
    corpusbuilder = get_corpusbuilder_fn(corpus_path, sentences, ignore=ignore)
    print "with {} function".format(corpusbuilder.__name__)

    c = corpusbuilder(corpus_path, nltk_stop=nltk_stop,
                      stop_freq=stop_freq, ignore=ignore, decode=decode,
                      simple=simple, tokenizer=tokenizer)

    '''
    if os.path.isfile(corpus_path):
        print "Constructing toy corpus, each line is a document"
        if sentences:
            c = toy_corpus(corpus_path, is_filename=True, nltk_stop=nltk_stop, 
                           stop_freq=stop_freq, autolabel=True, decode=decode)
        else:
            c = toy_corpus(corpus_path, is_filename=True, nltk_stop=nltk_stop, 
                           stop_freq=stop_freq, autolabel=True, decode=decode,
                           simple=simple, tokenizer=tokenizer)
    elif os.path.isdir(corpus_path):
        contents = listdir_nohidden(corpus_path)
        contents = [os.path.join(corpus_path,obj) for obj in contents 
            if not any([obj.endswith(suffix) for suffix in ignore])]
        count_dirs = len(filter(os.path.isdir, contents))
        count_files = len(filter(os.path.isfile, contents))

        print "Detected %d folders and %d files in %s" %\
            (count_dirs, count_files, corpus_path)

        if count_files > 0 and count_dirs == 0:
            print "Constructing directory corpus, each file is a document"
            if sentences:
                c = dir_corpus(corpus_path, nltk_stop=nltk_stop,
                               stop_freq=stop_freq, chunk_name=context_type,
                               ignore=ignore, decode=decode)
            else:
                c = dir_corpus(corpus_path, nltk_stop=nltk_stop,
                               stop_freq=stop_freq, chunk_name=context_type,
                               ignore=ignore, decode=decode, simple=simple, 
                               tokenizer=tokenizer)
        elif count_dirs > 0 and count_files == 0 and not sentences:
            print "Constructing collection corpus, each folder is a document"
            context_type='book'
            c = coll_corpus(corpus_path, nltk_stop=nltk_stop,
                            stop_freq=stop_freq, ignore=ignore, decode=decode,
                            simple=simple, tokenizer=tokenizer)
        elif count_dirs > 0 and count_files == 0 and sentences:
            raise NotImplementedError("""Collection corpuses are too large for
            sentence parsing. Reduce your corpus to a single folder or
            file.""")
        else:
            raise IOError("Invalid Path: empty directory")
    else:
        raise IOError("Invalid path")
    '''

    if contains_pdfs:
        from vsm.viewer.wrappers import doc_label_name
        import re
        label_name = doc_label_name(c.context_types[0])
        new_labels = [re.sub('txt$', 'pdf', label) for label in c.context_data[0][label_name]]
        c.context_data[0][label_name] = new_labels

    filename = get_corpus_filename(
        corpus_path, model_path, nltk_stop, stop_freq, context_type)
    c.save(filename)
    return filename