Example #1
File: build.py  Project: kareem180/corpkit
def parse_corpus(proj_path = False, 
                corpuspath = False, 
                filelist = False, 
                corenlppath = False, 
                operations = False,
                only_tokenise = False, 
                root = False, 
                stdout = False, 
                nltk_data_path = False, 
                memory_mb = 2000,
                copula_head = True,
                **kwargs):
    """
    Create a CoreNLP-parsed and/or NLTK tokenised corpus
    """
    import corpkit
    import subprocess
    from subprocess import PIPE, STDOUT, Popen
    import os
    import sys
    import chardet
    from time import localtime, strftime
    import time
    
    if not only_tokenise:
        if not check_jdk():
            print 'Need latest Java.'
            return

    curdir = os.getcwd()

    if nltk_data_path:
        if only_tokenise:
            import nltk
            if nltk_data_path not in nltk.data.path:
                nltk.data.path.append(nltk_data_path)
            from nltk import word_tokenize as tokenise

    # add nltk to path
    #td = {}
    #from corpkit.other import add_nltk_data_to_nltk_path
    #if 'note' in kwargs.keys():
    #    td['note'] = kwargs['note']
    #add_nltk_data_to_nltk_path(**td)

    if proj_path is False:
        proj_path = os.path.dirname(os.path.abspath(corpuspath.rstrip('/')))

    basecp = os.path.basename(corpuspath)

    if only_tokenise:
        new_corpus_path = os.path.join(proj_path, 'data', '%s-tokenised' % basecp)
    else:
        new_corpus_path = os.path.join(proj_path, 'data', '%s-parsed' % basecp)

    if os.path.join('data', 'data') in new_corpus_path:
        new_corpus_path = new_corpus_path.replace(os.path.join('data', 'data'), 'data')

    if not os.path.isdir(new_corpus_path):
        os.makedirs(new_corpus_path)
    else:
        fs = os.listdir(new_corpus_path)
        if not only_tokenise:
            if any([f.endswith('.xml') for f in fs]):
                print 'Folder containing xml already exists: "%s-parsed"' % basecp
                return False
        else:
            # tokenised output ends in '.p' (see the pickle step below), not '.txt'
            if any([f.endswith('.p') for f in fs]):
                print 'Folder containing tokens already exists: "%s-tokenised"' % basecp  
                return False          
    #javaloc = os.path.join(proj_path, 'corenlp', 'stanford-corenlp-3.6.0.jar:stanford-corenlp-3.6.0-models.jar:xom.jar:joda-time.jar:jollyday.jar:ejml-0.23.jar')
    cwd = os.getcwd()
    if corenlppath is False:
        home = os.path.expanduser("~")
        corenlppath = os.path.join(home, 'corenlp')
        find_install = [d for d in os.listdir(corenlppath) \
                   if os.path.isdir(os.path.join(corenlppath, d)) \
                   and os.path.isfile(os.path.join(corenlppath, d, 'jollyday.jar'))]
        if len(find_install) > 0:
            corenlppath = os.path.join(corenlppath, find_install[0])
        else:
            print 'No parser found. Try using the keyword arg "corenlp = <path>".'
            return

    # if not gui, don't mess with stdout
    if stdout is False:
        stdout = sys.stdout

    if not only_tokenise:
        os.chdir(corenlppath)
        if root:
            root.update_idletasks()
            reload(sys)
        import os
        import time
        if memory_mb is False:
            memory_mb = 2024
        if operations is False:
            operations = 'tokenize,ssplit,pos,lemma,ner,parse,dcoref'
        if type(operations) == list:
            operations = ','.join(operations)
        num_files_to_parse = len([l for l in open(filelist, 'r').read().splitlines() if l])
        # get corenlp version number
        import re
        reg = re.compile(r'stanford-corenlp-([0-9]\.[0-9]\.[0-9])-javadoc\.jar')
        fver = next(re.search(reg, s).group(1) for s in os.listdir('.') if re.search(reg, s))
        arglist = ['java', '-cp', 
                     'stanford-corenlp-%s.jar:stanford-corenlp-%s-models.jar:xom.jar:joda-time.jar:jollyday.jar:ejml-0.23.jar' % (fver, fver), 
                     '-Xmx%sm' % str(memory_mb), 
                     'edu.stanford.nlp.pipeline.StanfordCoreNLP', 
                     '-annotators', 
                     operations, 
                     '-filelist', filelist,
                     '-noClobber',
                     '-outputDirectory', new_corpus_path]
        if copula_head:
            arglist.append('--parse.flags')
            arglist.append(' -makeCopulaHead')
        try:
            proc = subprocess.Popen(arglist, stdout=sys.stdout)
        # maybe a problem with stdout. sacrifice it if need be
        except AttributeError:
            proc = subprocess.Popen(arglist)            
        #p = TextProgressBar(num_files_to_parse)
        while proc.poll() is None:
            sys.stdout = stdout
            thetime = strftime("%H:%M:%S", localtime())
            num_parsed = len([f for f in os.listdir(new_corpus_path) if f.endswith('.xml')])  
            if num_parsed == 0:
                if root:
                    print '%s: Initialising parser ... ' % (thetime)
            if num_parsed > 0 and (num_parsed + 1) <= num_files_to_parse:
                if root:
                    print '%s: Parsing file %d/%d ... ' % (thetime, num_parsed + 1, num_files_to_parse)
                if 'note' in kwargs.keys():
                    kwargs['note'].progvar.set((num_parsed) * 100.0 / num_files_to_parse)
                #p.animate(num_parsed - 1, str(num_parsed) + '/' + str(num_files_to_parse))
            time.sleep(1)
            if root:
                root.update()
    else:
        from nltk import word_tokenize as tokenise
        # tokenise each file
        import pickle
        fs = open(filelist).read().splitlines()
        dirs = sorted(list(set([os.path.basename(os.path.dirname(f)) for f in fs])))
        if len(dirs) == 0:
            one_big_corpus = True
        else:
            one_big_corpus = False
        if any(os.path.isdir(os.path.join(new_corpus_path, d)) for d in dirs):
            thetime = strftime("%H:%M:%S", localtime())
            print '%s: Directory already exists. Delete it if need be.' % thetime
            return False
        for d in dirs:
            os.makedirs(os.path.join(new_corpus_path, d))
        nfiles = len(fs)
        thetime = strftime("%H:%M:%S", localtime())
        print '%s: Tokenising ... ' % (thetime)
        for index, f in enumerate(fs):
            data = open(f).read()
            enc = chardet.detect(data)
            enc_text = unicode(data, enc['encoding'], errors = 'ignore')
            tokens = tokenise(enc_text)
            thedir = os.path.basename(os.path.dirname(f))
            newname = os.path.basename(f).replace('.txt', '-tokenised.p')
            if one_big_corpus:
                pth = os.path.join(new_corpus_path, newname)
            else:
                pth = os.path.join(new_corpus_path, thedir, newname)
            with open(pth, "wb") as fo:
                pickle.dump(tokens, fo)
            if 'note' in kwargs.keys():
                kwargs['note'].progvar.set((index + 1) * 100.0 / nfiles)
            if root:
                root.update()

    #p.animate(num_files_to_parse)
    if 'note' in kwargs.keys():
        kwargs['note'].progvar.set(100)
    sys.stdout = stdout
    thetime = strftime("%H:%M:%S", localtime())
    print '%s: Parsing finished. Moving parsed files into place ...' % thetime
    os.chdir(curdir)
    return new_corpus_path
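
For orientation, a minimal call sketch. It assumes parse_corpus is importable from corpkit's build module, that the corpus directory exists, and that the filelist names one input path per line; every path below is hypothetical.

from corpkit.build import parse_corpus  # assumed import location

# Tokenise-only run: pickles one token list per input file under
# <proj_path>/data/mycorpus-tokenised/ and returns that path (or False).
new_path = parse_corpus(corpuspath='/home/me/proj/data/mycorpus',
                        filelist='/home/me/proj/data/mycorpus-filelist.txt',
                        only_tokenise=True)
print(new_path)
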
Example #2
File: build.py  Project: javelir/corpkit
def parse_corpus(proj_path=False, 
                 corpuspath=False, 
                 filelist=False, 
                 corenlppath=False, 
                 operations=False,
                 only_tokenise=False, 
                 root=False, 
                 stdout=False, 
                 nltk_data_path=False, 
                 memory_mb=2000,
                 copula_head=True,
                 multiprocessing=False,
                 **kwargs
                ):
    """
    Create a CoreNLP-parsed and/or NLTK tokenised corpus
    """
    import corpkit
    import subprocess
    from subprocess import PIPE, STDOUT, Popen
    from corpkit.process import get_corenlp_path
    import os
    import sys
    import re
    import chardet
    from time import localtime, strftime
    import time

    fileparse = kwargs.get('fileparse', False)

    url = 'http://nlp.stanford.edu/software/stanford-corenlp-full-2015-12-09.zip'
    
    if not only_tokenise:
        if not check_jdk():
            print('Need latest Java.')
            return

    curdir = os.getcwd()
    note = kwargs.get('note', False)

    if nltk_data_path:
        if only_tokenise:
            import nltk
            if nltk_data_path not in nltk.data.path:
                nltk.data.path.append(nltk_data_path)
            from nltk import word_tokenize as tokenise

    if proj_path is False:
        proj_path = os.path.dirname(os.path.abspath(corpuspath.rstrip('/')))

    basecp = os.path.basename(corpuspath)

    if fileparse:
        new_corpus_path = os.path.dirname(corpuspath)
    else:
        if only_tokenise:
            new_corpus_path = os.path.join(proj_path, 'data', '%s-tokenised' % basecp)
        else:
            new_corpus_path = os.path.join(proj_path, 'data', '%s-parsed' % basecp)

    # todo:
    # this is not stable
    if os.path.join('data', 'data') in new_corpus_path:
        new_corpus_path = new_corpus_path.replace(os.path.join('data', 'data'), 'data')

    # this caused errors when multiprocessing
    # it used to be isdir, but supposedly there was a file there
    # i don't see how it's possible ...
    
    if not os.path.exists(new_corpus_path):
        os.makedirs(new_corpus_path)
    else:
        if not os.path.isfile(new_corpus_path):
            fs = os.listdir(new_corpus_path)
            if not multiprocessing:
                if not only_tokenise:
                    if any([f.endswith('.xml') for f in fs]):
                        print('Folder containing xml already exists: "%s-parsed"' % basecp)
                        return False
                else:
                    if any([f.endswith('.p') for f in fs]):
                        print('Folder containing tokens already exists: "%s-tokenised"' % basecp)  
                        return False          

    corenlppath = get_corenlp_path(corenlppath)

    if not corenlppath:
        cnlp_dir = os.path.join(os.path.expanduser("~"), 'corenlp')
        corenlppath, fpath = download_large_file(cnlp_dir, url,
                                                 root=root,
                                                 note=note,
                                                 actually_download=True,
                                                 custom_corenlp_dir=corenlppath)
        if corenlppath is None and fpath is None:
            import shutil
            shutil.rmtree(new_corpus_path)
            shutil.rmtree(new_corpus_path.replace('-parsed', ''))
            os.remove(new_corpus_path.replace('-parsed', '-filelist.txt'))
            raise ValueError('CoreNLP needed to parse texts.')
        extract_cnlp(fpath)
        import glob
        globpath = os.path.join(corenlppath, 'stanford-corenlp*')
        corenlppath = [i for i in glob.glob(globpath) if os.path.isdir(i)]
        if corenlppath:
            corenlppath = corenlppath[-1]
        else:
            raise ValueError('CoreNLP installation failed for some reason. Try manual download.')

    # if not gui, don't mess with stdout
    if stdout is False:
        stdout = sys.stdout

    if not only_tokenise:
        os.chdir(corenlppath)
        if root:
            root.update_idletasks()
            reload(sys)
        if memory_mb is False:
            memory_mb = 2024
        if operations is False:
            operations = 'tokenize,ssplit,pos,lemma,parse,ner,dcoref'
        if isinstance(operations, list):
            operations = ','.join([i.lower() for i in operations])

        with open(filelist, 'r') as fo:
            dat = fo.read()
        num_files_to_parse = len([l for l in dat.splitlines() if l])

        # get corenlp version number
        reg = re.compile(r'stanford-corenlp-([0-9]\.[0-9]\.[0-9])-javadoc\.jar')
        fver = next(re.search(reg, s).group(1) for s in os.listdir('.') if re.search(reg, s))
        if fver == '3.6.0':
            extra_jar = 'slf4j-api.jar:slf4j-simple.jar:'
        else:
            extra_jar = ''
        arglist = ['java', '-cp', 
                   'stanford-corenlp-%s.jar:stanford-corenlp-%s-models.jar:xom.jar:joda-time.jar:%sjollyday.jar:ejml-0.23.jar' % (fver, fver, extra_jar), 
                   '-Xmx%sm' % str(memory_mb),
                   'edu.stanford.nlp.pipeline.StanfordCoreNLP', 
                   '-annotators',
                   operations, 
                   '-filelist', filelist,
                   '-noClobber',
                   '-outputExtension', '.xml',
                   '-outputDirectory', new_corpus_path]
        if copula_head:
            arglist.append('--parse.flags')
            arglist.append(' -makeCopulaHead')
        try:
            proc = subprocess.Popen(arglist, stdout=sys.stdout)
        # maybe a problem with stdout. sacrifice it if need be
        except Exception:
            proc = subprocess.Popen(arglist)            
        #p = TextProgressBar(num_files_to_parse)
        while proc.poll() is None:
            sys.stdout = stdout
            thetime = strftime("%H:%M:%S", localtime())
            if not fileparse:
                num_parsed = len([f for f in os.listdir(new_corpus_path) if f.endswith('.xml')])  
                if num_parsed == 0:
                    if root:
                        print('%s: Initialising parser ... ' % (thetime))
                if num_parsed > 0 and (num_parsed + 1) <= num_files_to_parse:
                    if root:
                        print('%s: Parsing file %d/%d ... ' % \
                             (thetime, num_parsed + 1, num_files_to_parse))
                    if kwargs.get('note'):
                        kwargs['note'].progvar.set((num_parsed) * 100.0 / num_files_to_parse)
                    #p.animate(num_parsed - 1, str(num_parsed) + '/' + str(num_files_to_parse))
            # sleep at the loop level so a fileparse run doesn't busy-wait
            time.sleep(1)
            if root:
                root.update()
    else:
        from nltk import word_tokenize as tokenise
        # tokenise each file
        import cPickle as pickle
        fs = open(filelist).read().splitlines()
        dirs = sorted(list(set([os.path.basename(os.path.dirname(f)) for f in fs])))
        one_big_corpus = len(dirs) == 0
        if any(os.path.isdir(os.path.join(new_corpus_path, d)) for d in dirs):
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: Directory already exists. Delete it if need be.' % thetime)
            return False
        for d in dirs:
            os.makedirs(os.path.join(new_corpus_path, d))
        nfiles = len(fs)
        thetime = strftime("%H:%M:%S", localtime())
        print('%s: Tokenising ... ' % (thetime))
        for index, f in enumerate(fs):
            with open(f, 'r') as fo:
                data = fo.read()
            enc = chardet.detect(data)
            enc_text = data.decode(enc['encoding'], errors='ignore')
            tokens = tokenise(enc_text)
            thedir = os.path.basename(os.path.dirname(f))
            newname = os.path.basename(f).replace('.txt', '-tokenised.p')
            if one_big_corpus:
                pth = os.path.join(new_corpus_path, newname)
            else:
                pth = os.path.join(new_corpus_path, thedir, newname)
            with open(pth, "wb") as fo:
                pickle.dump(tokens, fo)
            if kwargs.get('note'):
                kwargs['note'].progvar.set((index + 1) * 100.0 / nfiles)
            if root:
                root.update()

    #p.animate(num_files_to_parse)
    if kwargs.get('note'):
        kwargs['note'].progvar.set(100)
    sys.stdout = stdout
    thetime = strftime("%H:%M:%S", localtime())
    print('%s: Parsing finished. Moving parsed files into place ...' % thetime)
    os.chdir(curdir)
    return new_corpus_path
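
The tokenise branch pickles a plain list of token strings per input file, named <original>-tokenised.p. A hedged sketch of reading one back (path hypothetical):

import pickle

with open('/home/me/proj/data/mycorpus-tokenised/chapter1-tokenised.p', 'rb') as f:
    tokens = pickle.load(f)
print(tokens[:10])
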
Example #3
import os, json
from nltk import word_tokenize as tokenise

## os.getcwd() returns the current working directory
## os.path.dirname() strips the last path component, giving the parent directory
corpus_filename = os.path.join(os.path.dirname(os.getcwd()), 'corpus.json')

## load the corpus: a list of records, each with 'title' and 'text' fields
with open(corpus_filename) as f:
    corpus = json.load(f)
index = {}

## number of training examples in corpus
m = len(corpus)

for i in range(m):
    tokens = tokenise(corpus[i]['title']) + tokenise(corpus[i]['text'])
    for token in tokens:
        token_lower = token.lower()
        if token_lower not in index:
            index[token_lower] = [i]
        else:
            index[token_lower].append(i)

with open('index.json', 'w') as op:
    op.write(json.dumps(index))
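
For illustration, a hedged sketch of querying the inverted index written above; it maps each lowercased token to the list of corpus positions whose title or text contained it:

import json

with open('index.json') as f:
    index = json.load(f)

def lookup(term):
    """Return the indices of corpus entries containing `term` (case-insensitive)."""
    return index.get(term.lower(), [])

print(lookup('corpus'))
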
Example #4
File: build.py  Project: hakumiogin/corpkit
def parse_corpus(proj_path, corpuspath, filelist, corenlppath = False, operations = False,
                 only_tokenise = False, root = False, stdout = False, **kwargs):
    import corpkit
    import subprocess
    from subprocess import PIPE, STDOUT, Popen
    import os
    import sys
    import chardet
    from time import localtime, strftime
    import time
    
    if not only_tokenise:
        if not check_jdk():
            print 'Need latest Java.'
            return

    # add nltk to path
    td = {}
    from corpkit.other import add_nltk_data_to_nltk_path
    if 'note' in kwargs.keys():
        td['note'] = kwargs['note']
    add_nltk_data_to_nltk_path(**td)

    basecp = os.path.basename(corpuspath)
    if only_tokenise:
        new_corpus_path = os.path.join(proj_path, 'data', '%s-tokenised' % basecp)
    else:
        new_corpus_path = os.path.join(proj_path, 'data', '%s-parsed' % basecp)
    if not os.path.isdir(new_corpus_path):
        os.makedirs(new_corpus_path)
    else:
        fs = os.listdir(new_corpus_path)
        if not only_tokenise:
            if any([f.endswith('.xml') for f in fs]):
                print 'Folder containing xml already exists: "%s-parsed"' % basecp
                return False
        else:
            # tokenised output ends in '.p' (see the pickle step below), not '.txt'
            if any([f.endswith('.p') for f in fs]):
                print 'Folder containing tokens already exists: "%s-tokenised"' % basecp  
                return False          
    #javaloc = os.path.join(proj_path, 'corenlp', 'stanford-corenlp-3.5.2.jar:stanford-corenlp-3.5.2-models.jar:xom.jar:joda-time.jar:jollyday.jar:ejml-0.23.jar')
    cwd = os.getcwd()
    if corenlppath is False:
        home = os.path.expanduser("~")
        corenlppath = os.path.join(home, 'corenlp')
        find_install = [d for d in os.listdir(corenlppath) \
                   if os.path.isdir(os.path.join(corenlppath, d)) \
                   and os.path.isfile(os.path.join(corenlppath, d, 'jollyday.jar'))]
        if len(find_install) > 0:
            corenlppath = os.path.join(corenlppath, find_install[0])
        else:
            print 'No parser found.'
            return

    if not only_tokenise:
        os.chdir(corenlppath)
        # root is False outside the GUI, so guard before touching it
        if root:
            root.update_idletasks()
            reload(sys)
        import os
        import time
        if operations is False:
            operations = 'tokenize,ssplit,pos,lemma,ner,parse,dcoref'
        num_files_to_parse = len([l for l in open(filelist, 'r').read().splitlines() if l])
        proc = subprocess.Popen(['java', '-cp', 
                     'stanford-corenlp-3.5.2.jar:stanford-corenlp-3.5.2-models.jar:xom.jar:joda-time.jar:jollyday.jar:ejml-0.23.jar', 
                     '-Xmx2g', 
                     'edu.stanford.nlp.pipeline.StanfordCoreNLP', 
                     '-annotators', 
                     operations, 
                     '-filelist', filelist,
                     '-noClobber',
                     '-outputDirectory', new_corpus_path, 
                     '--parse.flags', ' -makeCopulaHead'], stdout=sys.stdout)
        #p = TextProgressBar(num_files_to_parse)
        while proc.poll() is None:
            sys.stdout = stdout
            thetime = strftime("%H:%M:%S", localtime())
            num_parsed = len([f for f in os.listdir(new_corpus_path) if f.endswith('.xml')])  
            if num_parsed == 0:
                print '%s: Initialising parser ... ' % (thetime)
            if num_parsed > 0 and num_parsed <= num_files_to_parse:
                print '%s: Parsing file %d/%d ... ' % (thetime, num_parsed + 1, num_files_to_parse)
                if 'note' in kwargs.keys():
                    kwargs['note'].progvar.set((num_parsed) * 100.0 / num_files_to_parse)
                #p.animate(num_parsed - 1, str(num_parsed) + '/' + str(num_files_to_parse))
            # sleep unconditionally so a GUI-less run doesn't busy-wait
            time.sleep(1)
            if root:
                root.update()
    else:

        # tokenise each file
        from nltk import word_tokenize as tokenise
        import pickle
        fs = open(filelist).read().splitlines()
        dirs = sorted(list(set([os.path.basename(os.path.dirname(f)) for f in fs])))
        if len(dirs) == 0:
            one_big_corpus = True
        else:
            one_big_corpus = False
        if any(os.path.isdir(os.path.join(new_corpus_path, d)) for d in dirs):
            thetime = strftime("%H:%M:%S", localtime())
            print '%s: Directory already exists. Delete it if need be.' % thetime
            return
        for d in dirs:
            os.makedirs(os.path.join(new_corpus_path, d))
        nfiles = len(fs)
        thetime = strftime("%H:%M:%S", localtime())
        print '%s: Tokenising ... ' % (thetime)
        for index, f in enumerate(fs):
            data = open(f).read()
            enc = chardet.detect(data)
            enc_text = unicode(data, enc['encoding'], errors = 'ignore')
            tokens = tokenise(enc_text)
            thedir = os.path.basename(os.path.dirname(f))
            newname = os.path.basename(f).replace('.txt', '-tokenised.p')
            if one_big_corpus:
                pth = os.path.join(new_corpus_path, newname)
            else:
                pth = os.path.join(new_corpus_path, thedir, newname)
            with open(pth, "wb") as fo:
                pickle.dump(tokens, fo)
            if 'note' in kwargs.keys():
                kwargs['note'].progvar.set((index + 1) * 100.0 / nfiles)
            if root:
                root.update()

    #p.animate(num_files_to_parse)
    if 'note' in kwargs.keys():
        kwargs['note'].progvar.set(100)
    sys.stdout = stdout
    print 'Parsing finished. Moving parsed files into place ...'
    os.chdir(proj_path)
    return new_corpus_path
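
Every version above expects filelist to name a plain-text file with one input path per line (blank lines are skipped). A hedged helper for producing one; the function name and paths are hypothetical:

import os

def write_filelist(corpuspath, outpath):
    """List every .txt file under corpuspath, one absolute path per line."""
    with open(outpath, 'w') as fo:
        for root_dir, _, files in os.walk(corpuspath):
            for name in files:
                if name.endswith('.txt'):
                    fo.write(os.path.join(os.path.abspath(root_dir), name) + '\n')

write_filelist('/home/me/proj/data/mycorpus',
               '/home/me/proj/data/mycorpus-filelist.txt')
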
Example #5
def parse_corpus(proj_path=False,
                 corpuspath=False,
                 filelist=False,
                 corenlppath=False,
                 operations=False,
                 only_tokenise=False,
                 root=False,
                 stdout=False,
                 nltk_data_path=False,
                 memory_mb=2000,
                 copula_head=True,
                 **kwargs):
    """
    Create a CoreNLP-parsed and/or NLTK tokenised corpus
    """
    import corpkit
    import subprocess
    from subprocess import PIPE, STDOUT, Popen
    import os
    import sys
    import chardet
    from time import localtime, strftime
    import time

    if not only_tokenise:
        if not check_jdk():
            print('Need latest Java.')
            return

    curdir = os.getcwd()

    if nltk_data_path:
        if only_tokenise:
            import nltk
            if nltk_data_path not in nltk.data.path:
                nltk.data.path.append(nltk_data_path)
            from nltk import word_tokenize as tokenise

    # add nltk to path
    #td = {}
    #from other import add_nltk_data_to_nltk_path
    #if 'note' in kwargs.keys():
    #    td['note'] = kwargs['note']
    #add_nltk_data_to_nltk_path(**td)

    if proj_path is False:
        proj_path = os.path.dirname(os.path.abspath(corpuspath.rstrip('/')))

    basecp = os.path.basename(corpuspath)

    if only_tokenise:
        new_corpus_path = os.path.join(proj_path, 'data',
                                       '%s-tokenised' % basecp)
    else:
        new_corpus_path = os.path.join(proj_path, 'data', '%s-parsed' % basecp)

    if os.path.join('data', 'data') in new_corpus_path:
        new_corpus_path = new_corpus_path.replace(os.path.join('data', 'data'),
                                                  'data')

    if not os.path.isdir(new_corpus_path):
        os.makedirs(new_corpus_path)
    else:
        fs = os.listdir(new_corpus_path)
        if not only_tokenise:
            if any([f.endswith('.xml') for f in fs]):
                print('Folder containing xml already exists: "%s-parsed"' %
                      basecp)
                return False
        else:
            # tokenised output ends in '.p' (see the pickle step below), not '.txt'
            if any([f.endswith('.p') for f in fs]):
                print(
                    'Folder containing tokens already exists: "%s-tokenised"' %
                    basecp)
                return False
    #javaloc = os.path.join(proj_path, 'corenlp', 'stanford-corenlp-3.6.0.jar:stanford-corenlp-3.6.0-models.jar:xom.jar:joda-time.jar:jollyday.jar:ejml-0.23.jar')
    cwd = os.getcwd()
    if corenlppath is False:
        home = os.path.expanduser("~")
        corenlppath = os.path.join(home, 'corenlp')
        find_install = [d for d in os.listdir(corenlppath) \
                   if os.path.isdir(os.path.join(corenlppath, d)) \
                   and os.path.isfile(os.path.join(corenlppath, d, 'jollyday.jar'))]
        if len(find_install) > 0:
            corenlppath = os.path.join(corenlppath, find_install[0])
        else:
            print(
                'No parser found. Try using the keyword arg "corenlp = <path>", or moving your corenlp folder to ~/corenlp/stanford-corenlp-full ...'
            )
            return

    # if not gui, don't mess with stdout
    if stdout is False:
        stdout = sys.stdout

    if not only_tokenise:
        os.chdir(corenlppath)
        if root:
            root.update_idletasks()
            # Python 3 moved reload into importlib
            from importlib import reload
            reload(sys)
        import os
        import time
        if memory_mb is False:
            memory_mb = 2024
        if operations is False:
            operations = 'tokenize,ssplit,pos,lemma,ner,parse,dcoref'
        if type(operations) == list:
            operations = ','.join(operations)
        num_files_to_parse = len(
            [l for l in open(filelist, 'r').read().splitlines() if l])
        # get corenlp version number
        import re
        reg = re.compile(r'stanford-corenlp-([0-9]\.[0-9]\.[0-9])-javadoc\.jar')
        fver = next(
            re.search(reg, s).group(1) for s in os.listdir('.')
            if re.search(reg, s))
        if fver == '3.6.0':
            extra_jar = 'slf4j-api.jar:slf4j-simple.jar:'
        else:
            extra_jar = ''
        arglist = [
            'java', '-cp',
            'stanford-corenlp-%s.jar:stanford-corenlp-%s-models.jar:xom.jar:joda-time.jar:%sjollyday.jar:ejml-0.23.jar'
            % (fver, fver, extra_jar),
            '-Xmx%sm' % str(memory_mb),
            'edu.stanford.nlp.pipeline.StanfordCoreNLP', '-annotators',
            operations, '-filelist', filelist, '-noClobber',
            '-outputExtension', '.xml', '-outputDirectory', new_corpus_path
        ]
        if copula_head:
            arglist.append('--parse.flags')
            arglist.append(' -makeCopulaHead')
        try:
            proc = subprocess.Popen(arglist, stdout=sys.stdout)
        # maybe a problem with stdout. sacrifice it if need be
        except Exception:
            proc = subprocess.Popen(arglist)
        #p = TextProgressBar(num_files_to_parse)
        while proc.poll() is None:
            sys.stdout = stdout
            thetime = strftime("%H:%M:%S", localtime())
            num_parsed = len(
                [f for f in os.listdir(new_corpus_path) if f.endswith('.xml')])
            if num_parsed == 0:
                if root:
                    print('%s: Initialising parser ... ' % (thetime))
            if num_parsed > 0 and (num_parsed + 1) <= num_files_to_parse:
                if root:
                    print('%s: Parsing file %d/%d ... ' %
                          (thetime, num_parsed + 1, num_files_to_parse))
                if 'note' in list(kwargs.keys()):
                    kwargs['note'].progvar.set(
                        (num_parsed) * 100.0 / num_files_to_parse)
                #p.animate(num_parsed - 1, str(num_parsed) + '/' + str(num_files_to_parse))
            time.sleep(1)
            if root:
                root.update()
    else:
        from nltk import word_tokenize as tokenise
        # tokenise each file
        import pickle
        fs = open(filelist).read().splitlines()
        dirs = sorted(
            list(set([os.path.basename(os.path.dirname(f)) for f in fs])))
        if len(dirs) == 0:
            one_big_corpus = True
        else:
            one_big_corpus = False
        if any(os.path.isdir(os.path.join(new_corpus_path, d)) for d in dirs):
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: Directory already exists. Delete it if need be.' %
                  thetime)
            return False
        for d in dirs:
            os.makedirs(os.path.join(new_corpus_path, d))
        nfiles = len(fs)
        thetime = strftime("%H:%M:%S", localtime())
        print('%s: Tokenising ... ' % (thetime))
        for index, f in enumerate(fs):
            # read raw bytes so chardet can sniff the encoding (Python 3)
            data = open(f, 'rb').read()
            enc = chardet.detect(data)
            enc_text = str(data, enc['encoding'], errors='ignore')
            tokens = tokenise(enc_text)
            thedir = os.path.basename(os.path.dirname(f))
            newname = os.path.basename(f).replace('.txt', '-tokenised.p')
            if one_big_corpus:
                pth = os.path.join(new_corpus_path, newname)
            else:
                pth = os.path.join(new_corpus_path, thedir, newname)
            with open(pth, "wb") as fo:
                pickle.dump(tokens, fo)
            if 'note' in list(kwargs.keys()):
                kwargs['note'].progvar.set((index + 1) * 100.0 / nfiles)
            if root:
                root.update()

    #p.animate(num_files_to_parse)
    if 'note' in list(kwargs.keys()):
        kwargs['note'].progvar.set(100)
    sys.stdout = stdout
    thetime = strftime("%H:%M:%S", localtime())
    print('%s: Parsing finished. Moving parsed files into place ...' % thetime)
    os.chdir(curdir)
    return new_corpus_path
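
The parse branch leaves one CoreNLP XML file per input in the -parsed directory. A hedged sketch of pulling the word tokens back out with the standard library; the path is hypothetical and the <token>/<word> layout is assumed from CoreNLP's XML output:

import xml.etree.ElementTree as ET

tree = ET.parse('/home/me/proj/data/mycorpus-parsed/chapter1.txt.xml')
words = [tok.findtext('word') for tok in tree.iter('token')]
print(words[:10])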