def test_interro4():
    """Check the ``{'n': 'any'}`` interrogation of the tokenised test corpus.

    The interrogation's ``results`` table, converted with ``to_dict()``, must
    equal the expected mapping of entries to their 'first'/'second' counts.
    """
    print('Testing interrogation 4')
    expected = {
        u'and interrogating': {'first': 0, 'second': 2},
        u'concordancing and': {'first': 0, 'second': 2},
    }
    tokenised = Corpus('data/test-stripped-tokenised')
    outcome = tokenised.interrogate({'n': 'any'})
    assert_equals(outcome.results.to_dict(), expected)
def test_parse():
    """Parse the unparsed test corpus and check the names of the parsed files.

    Any existing ``data/test-parsed`` directory is deleted before parsing
    (presumably so that output from an earlier run cannot affect the result).
    The parsed corpus must expose exactly the files ``intro.txt.xml`` and
    ``body.txt.xml``, in that order.
    """
    import shutil

    print('Testing parser')
    unparsed = Corpus(unparsed_path)
    # Best-effort cleanup: the directory may simply not exist yet. Using
    # ignore_errors instead of a bare ``except: pass`` keeps removal failures
    # silent without also swallowing KeyboardInterrupt/SystemExit.
    shutil.rmtree('data/test-parsed', ignore_errors=True)
    parsed = unparsed.parse()
    assert_equals([i.name for i in parsed.files],
                  ['intro.txt.xml', 'body.txt.xml'])
def test_parse_speakseg(skipassert = False):
    """Parse the unparsed test corpus with speaker segmentation enabled.

    Any existing directory at ``parsed_path`` is deleted before parsing.

    :param skipassert: when true, the check on the parsed file names is
        skipped and the function only runs the parser.
    """
    import shutil

    print('Testing parser with speaker segmentation')
    unparsed = Corpus(unparsed_path)
    # Best-effort cleanup: the directory may simply not exist yet. Using
    # ignore_errors instead of a bare ``except: pass`` keeps removal failures
    # silent without hiding unrelated errors (e.g. an undefined parsed_path)
    # or swallowing KeyboardInterrupt/SystemExit.
    shutil.rmtree(parsed_path, ignore_errors=True)
    parsed = unparsed.parse(speaker_segmentation = True)
    if not skipassert:
        assert_equals([i.name for i in parsed.files],
                      ['intro.txt.xml', 'body.txt.xml'])
def move_and_parse(indir='xml-form'):
    """
    Use corpkit/CoreNLP to parse the corpus

    Creates the corpkit project ``rsc-proj``, copies ``indir`` to
    ``rsc-proj/data``, changes the working directory to ``rsc-proj`` and
    parses the corpus named ``rsc-form`` with metadata enabled, speaker
    segmentation disabled and ``multiprocess=15``.

    :param indir: directory tree to copy into the new project. It was
        previously ignored in favour of a hard-coded ``'xml-form'``; the
        default keeps that behaviour.
    :returns: whatever ``Corpus.parse`` returns (previously discarded).

    Side effect: the process working directory is left at ``rsc-proj``.
    """
    import shutil
    import os
    from corpkit import Corpus, new_project

    # make a new project and move into it
    new_project('rsc-proj')
    shutil.copytree(indir, 'rsc-proj/data')
    os.chdir('rsc-proj')
    # NOTE(review): the corpus is opened as 'rsc-form' although the input was
    # copied to 'rsc-proj/data' -- confirm the copy destination and this name
    # agree with how corpkit resolves corpus names.
    corpus = Corpus('rsc-form')
    return corpus.parse(metadata=True,
                        speaker_segmentation=False,
                        multiprocess=15)
def test_interro5():
    """Check the ``{'w': <regex>}`` interrogation of the stripped test corpus.

    The interrogation's ``results`` table is summed twice (``.sum().sum()``)
    and the resulting grand total must equal 4.
    """
    print('Testing interrogation 5')
    query = {'w': r'\bl[a-z]+?\s'}
    stripped = Corpus('data/test-stripped')
    outcome = stripped.interrogate(query)
    grand_total = outcome.results.sum().sum()
    assert_equals(grand_total, 4)
import corpkit
from corpkit import Corpus

# Open the corpus stored at a hard-coded Windows path (note the space in
# "weird corpus") and run the parser over it.
unparsed = Corpus(
    'C:\\Users\\jbjb\\Documents\\DATA\\weird corpus\\corpkit\\explit\\data'
)
unparsed.parse()