Exemplo n.º 1
0
def test_interro4():
    """Check interrogation of the stripped, tokenised test corpus.

    Builds a ``Corpus`` from ``data/test-stripped-tokenised``, interrogates
    it with the query ``{'n': 'any'}`` and asserts that the results,
    converted with ``to_dict()``, equal the expected nested mapping.
    """
    # A parenthesised single-argument print behaves the same on Python 2
    # and 3; the bare print statement is a SyntaxError on Python 3.
    print('Testing interrogation 4')
    corp = Corpus('data/test-stripped-tokenised')
    data = corp.interrogate({'n': 'any'})
    # NOTE(review): the inner keys ('first', 'second') are presumably the
    # subcorpus names of the test data -- confirm against the fixture.
    expected = {
        u'and interrogating': {'first': 0, 'second': 2},
        u'concordancing and': {'first': 0, 'second': 2},
    }
    assert_equals(data.results.to_dict(), expected)
Exemplo n.º 2
0
def test_parse():
    """Parse the unparsed test corpus and check the resulting file names.

    Removes any existing ``data/test-parsed`` directory, calls ``parse()``
    on ``Corpus(unparsed_path)`` and asserts that the names of the parsed
    files are ``intro.txt.xml`` and ``body.txt.xml``, in that order.
    """
    import shutil
    # Parenthesised form runs on both Python 2 and Python 3.
    print('Testing parser')
    unparsed = Corpus(unparsed_path)
    try:
        shutil.rmtree('data/test-parsed')
    except OSError:
        # Best-effort cleanup: the directory may simply not exist yet.
        # Catching only OSError keeps KeyboardInterrupt/SystemExit (which a
        # bare ``except`` would swallow) propagating normally.
        pass
    parsed = unparsed.parse()
    assert_equals([i.name for i in parsed.files],
                  ['intro.txt.xml', 'body.txt.xml'])
Exemplo n.º 3
0
def test_parse_speakseg(skipassert=False):
    """Parse the unparsed test corpus with speaker segmentation enabled.

    Removes any existing directory at ``parsed_path``, calls
    ``parse(speaker_segmentation=True)`` on ``Corpus(unparsed_path)`` and,
    unless *skipassert* is true, asserts that the names of the parsed files
    are ``intro.txt.xml`` and ``body.txt.xml``, in that order.

    :param skipassert: when true, run the parse but skip the file-name
        assertion.
    """
    # Parenthesised form runs on both Python 2 and Python 3.
    print('Testing parser with speaker segmentation')
    unparsed = Corpus(unparsed_path)
    import shutil
    try:
        shutil.rmtree(parsed_path)
    except OSError:
        # Best-effort cleanup: the directory may simply not exist yet.
        # Catching only OSError keeps KeyboardInterrupt/SystemExit (which a
        # bare ``except`` would swallow) propagating normally.
        pass
    parsed = unparsed.parse(speaker_segmentation=True)
    if not skipassert:
        assert_equals([i.name for i in parsed.files],
                      ['intro.txt.xml', 'body.txt.xml'])
Exemplo n.º 4
0
def move_and_parse(indir='xml-form'):
    """
    Use corpkit/CoreNLP to parse the corpus

    Creates a new corpkit project named ``rsc-proj``, copies *indir* to
    ``rsc-proj/data``, changes the process working directory to
    ``rsc-proj`` and calls ``parse`` on ``Corpus('rsc-form')``.

    :param indir: directory tree to copy into the project's ``data``
        location (defaults to ``'xml-form'``).

    Side effect: the current working directory is left at ``rsc-proj``.
    """
    import shutil
    import os
    from corpkit import Corpus, new_project

    # make a new project and move into it
    new_project('rsc-proj')
    # Honour the ``indir`` argument; previously the source directory was
    # hard-coded to 'xml-form' and the parameter was silently ignored.
    # NOTE(review): shutil.copytree raises if the destination already
    # exists -- confirm that new_project() does not create 'rsc-proj/data'.
    shutil.copytree(indir, 'rsc-proj/data')
    os.chdir('rsc-proj')
    # NOTE(review): the corpus name 'rsc-form' differs from the default
    # input directory name 'xml-form' -- confirm this is the intended name.
    corpus = Corpus('rsc-form')
    corpus.parse(metadata=True,
                 speaker_segmentation=False,
                 multiprocess=15)
Exemplo n.º 5
0
def test_interro5():
    """Check a regex word interrogation of the stripped test corpus.

    Builds a ``Corpus`` from ``data/test-stripped``, interrogates it with
    the query ``{'w': r'\\bl[a-z]+?\\s'}`` and asserts that the grand total
    of the results (``results.sum().sum()``) is 4.
    """
    # A parenthesised single-argument print behaves the same on Python 2
    # and 3; the bare print statement is a SyntaxError on Python 3.
    print('Testing interrogation 5')
    corp = Corpus('data/test-stripped')
    data = corp.interrogate({'w': r'\bl[a-z]+?\s'})
    grand_total = data.results.sum().sum()
    assert_equals(grand_total, 4)
Exemplo n.º 6
0
import corpkit
from corpkit import Corpus
# Wrap the raw (unparsed) data directory in a Corpus object. The path is a
# machine-specific absolute Windows location.
unparsed = Corpus('C:\\Users\\jbjb\\Documents\\DATA\\weird corpus\\corpkit\\explit\\data')
# Run the parser over the corpus; the return value is not kept.
unparsed.parse()