def make_no_id_corpus(pth, newpth):
    """make version of pth without ids"""
    import re
    import shutil
    from corpkit.build import get_filepaths
    # define regex broadly enough to accept timestamps, locations if need be
    idregex = re.compile(r'(^.*?):\s+(.*$)')
    # copy the corpus, overwriting any existing copy at newpth
    try:
        shutil.copytree(pth, newpth)
    except OSError:
        shutil.rmtree(newpth)
        shutil.copytree(pth, newpth)
    files = get_filepaths(newpth)
    names = []
    for f in files:
        good_data = []
        with open(f) as fo:
            data = fo.read().splitlines()
        for datum in data:
            matched = re.search(idregex, datum)
            if matched:
                names.append(matched.group(1))
                good_data.append(matched.group(2))
        with open(f, "w") as fo:
            fo.write('\n'.join(good_data))
    if len(names) == 0:
        from time import localtime, strftime
        thetime = strftime("%H:%M:%S", localtime())
        print('%s: No speaker names found. Turn off speaker segmentation.' % thetime)
        shutil.rmtree(newpth)

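# A runnable sketch (illustrative input only) of what idregex above captures:
# group(1) is the speaker name that gets collected, group(2) is the utterance
# that is kept in the stripped copy.
if __name__ == '__main__':
    import re
    m = re.search(r'(^.*?):\s+(.*$)', 'JOHN:   Hello there.')
    print(m.group(1))  # JOHN
    print(m.group(2))  # Hello there.
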
def rename_all_files(dirs_to_do):
    """get rid of the inserted dirname in filenames after parsing"""
    import os
    from corpkit.build import get_filepaths
    for d in dirs_to_do:
        ext = 'txt.xml' if d.endswith('-parsed') else 'txt'
        fs = get_filepaths(d, ext)
        for f in fs:
            fname = os.path.basename(f)
            justdir = os.path.dirname(f)
            subcorpus = os.path.basename(justdir)
            newname = fname.replace('-%s.%s' % (subcorpus, ext), '.%s' % ext)
            os.rename(f, os.path.join(justdir, newname))

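# A sketch of the rename logic above with hypothetical names: a parsed file
# 'speech-chapter1.txt.xml' inside subcorpus 'chapter1' loses the inserted
# dirname and becomes 'speech.txt.xml'.
if __name__ == '__main__':
    fname, subcorpus, ext = 'speech-chapter1.txt.xml', 'chapter1', 'txt.xml'
    print(fname.replace('-%s.%s' % (subcorpus, ext), '.%s' % ext))  # speech.txt.xml
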
def get_list_of_speaker_names(corpuspath):
    """return a sorted list of speaker names in a pre-processed corpus"""
    import re
    from corpkit.build import get_filepaths
    files = get_filepaths(corpuspath)
    names = set()
    idregex = re.compile(r'(^.*?):\s+(.*$)')
    for f in files:
        with open(f) as fo:
            data = fo.read().splitlines()
        for line in data:
            m = re.search(idregex, line)
            if m:
                names.add(m.group(1))
    return sorted(names)

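# Usage sketch ('data/interviews' is a hypothetical corpus path): collect the
# sorted, deduplicated speaker names from files whose lines look like
# 'NAME:  utterance'.
if __name__ == '__main__':
    print(get_list_of_speaker_names('data/interviews'))
    # e.g. ['INTERVIEWER', 'SMITH', 'UNIDENTIFIED']
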
def add_ids_to_xml(corpuspath, root=False, note=False):
    """add ids to the xml in corpuspath

    needs the raw files to be in the same dir as corpuspath, without
    '-parsed' in the dir name. also needs the id files to be in the dir,
    with '-parsed' changed to '-cleaned'"""
    import re
    from bs4 import BeautifulSoup
    from corpkit.build import get_filepaths
    from time import strftime, localtime
    files = get_filepaths(corpuspath, ext='xml')
    if note:
        note.progvar.set(0)
    thetime = strftime("%H:%M:%S", localtime())
    print('%s: Processing speaker IDs ...' % thetime)
    if root:
        root.update()
    for i, f in enumerate(files):
        if note:
            note.progvar.set(i * 100.0 / len(files))
        thetime = strftime("%H:%M:%S", localtime())
        print('%s: Processing speaker IDs (%d/%d)' % (thetime, i, len(files)))
        if root:
            root.update()
        with open(f) as xmlf:
            data = xmlf.read()
        # open the unparsed version of the file, read into memory
        stripped_txtfile = f.replace('.xml', '').replace('-parsed', '')
        with open(stripped_txtfile) as old_txt:
            stripped_txtdata = old_txt.read()
        # open the unparsed version with speaker ids
        id_txtfile = f.replace('.xml', '').replace('-stripped-parsed', '')
        with open(id_txtfile) as idttxt:
            id_txtdata = idttxt.read()
        # todo: do this with lxml
        soup = BeautifulSoup(data, "lxml")
        for s in soup.find_all('sentence'):
            # don't get corefs
            if s.parent.name == 'sentences':
                tokens = s.find_all('token')
                start = int(tokens[0].find_all('characteroffsetbegin', limit=1)[0].text)
                end = int(tokens[-1].find_all('characteroffsetend', limit=1)[0].text)
                # extract this sentence from the unparsed version
                sent = stripped_txtdata[start:end]
                # find out line number: sever at start of match
                cut_old_text = stripped_txtdata[:start]
                line_index = cut_old_text.count('\n')
                # look up the same line in the text with speaker ids
                with_id = id_txtdata.splitlines()[line_index]
                split_line = with_id.split(': ', 1)
                if len(split_line) > 1:
                    speakerid = split_line[0]
                else:
                    speakerid = 'UNIDENTIFIED'
                new_tag = soup.new_tag("speakername")
                s.append(new_tag)
                new_tag.string = speakerid
        html = str(soup.root)
        # write the changes back to the parsed file
        with open(f, "w") as fopen:
            fopen.write(html)
    if note:
        note.progvar.set(100)

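# A toy sketch of the offset-to-line lookup used above: counting newlines
# before a sentence's first character offset gives its line index in the
# stripped text, and the same index in the id'd text yields the speaker.
if __name__ == '__main__':
    stripped = 'Hello there.\nHow are you?'
    with_ids = 'JOHN:  Hello there.\nMARY:  How are you?'
    start = stripped.index('How')
    line_index = stripped[:start].count('\n')  # 1
    print(with_ids.splitlines()[line_index].split(': ', 1)[0])  # MARY
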
def turn_to_plaintext(corpus_dir='xml'):
    """
    turn xml corpus into corpkit input

    takes about 5 minutes, most of which is lang detection.
    also determines language so that we can remove baddies later
    """
    import os
    from corpkit.build import get_filepaths  # just an os.walk/glob type function
    from lxml import etree as ET
    from langdetect import detect_langs
    # make a new directory if need be
    outdir = '%s-form' % corpus_dir
    try:
        os.makedirs(outdir)
    except OSError:
        pass
    fs = get_filepaths(corpus_dir, 'xml')
    # parse metadata, put in corpkit format,
    # write to same filepath in outdir
    for index, f in enumerate(fs, start=1):
        # progress info
        print("%.2f: %s" % ((index * 100.0 / len(fs)), f))
        # make new filename
        fpath, fname = os.path.split(f)
        outpath = fpath.replace(corpus_dir, outdir, 1)
        outname = fname.replace('.xml', '.txt')
        try:
            os.makedirs(outpath)
        except OSError:
            pass
        # parse the xml and build a corpkit-style metadata string
        # from the root element's attributes
        root = ET.parse(f).getroot()
        metastring = ' <metadata '
        for k, v in sorted(root.items()):
            k = k.strip('"').strip("'").strip()
            v = v.strip('"').strip("'").strip()
            metastring += "%s='%s' " % (k, v)
        pages = ''
        # detect language
        langs = detect_langs('\n'.join([i.text for i in root if i.text]))
        lang, score = format_langs_output(langs)
        # pages don't end at sentence boundaries, so join them all together
        for page in root:
            if page.text:
                pages += page.text.replace('\n', ' ')
        metend = "lang='%s' engprob='%s'>\n" % (lang, score)
        pages += metastring + metend
        # write to file
        with open(os.path.join(outpath, outname), 'w') as fo:
            fo.write(pages)
    return outdir

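# A sketch of the line format this writes (hypothetical attributes): page
# text first, then a corpkit-style <metadata ...> tag carrying the root
# element's attributes plus the detected language.
if __name__ == '__main__':
    metastring = " <metadata title='Example' year='1990' "
    metend = "lang='%s' engprob='%s'>\n" % ('en', 0.99)
    print('Some page text.' + metastring + metend)
    # Some page text. <metadata title='Example' year='1990' lang='en' engprob='0.99'>
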
def plaintext_to_conll(inpath, postag=False, lemmatise=False,
                       lang='en', metadata=False, outpath=False,
                       nltk_data_path=False, speaker_segmentation=False):
    """
    Take a plaintext corpus and sent/word tokenise.

    :param inpath: The corpus to read in
    :param postag: do POS tagging?
    :param lemmatise: do lemmatisation?
    :param lang: choose language for pos/lemmatiser (not implemented yet)
    :param metadata: add metadata to conll (not implemented yet)
    :param outpath: custom name for the resulting corpus
    :param speaker_segmentation: does the corpus have speaker names?
    """
    import nltk
    import pandas as pd
    from corpkit.process import saferead
    from corpkit.build import get_filepaths

    fps = get_filepaths(inpath, 'txt')

    # IN THE SECTIONS BELOW, WE COULD ADD MULTILINGUAL
    # ANNOTATORS, PROVIDED THEY BEHAVE AS THE NLTK ONES DO

    # SENT TOKENISERS
    from nltk.tokenize.punkt import PunktSentenceTokenizer
    stoker = PunktSentenceTokenizer()
    s_tokers = {'en': stoker}
    sent_tokenizer = s_tokers.get(lang, stoker)

    # WORD TOKENISERS
    tokenisers = {'en': nltk.word_tokenize}
    tokeniser = tokenisers.get(lang, nltk.word_tokenize)

    # LEMMATISERS
    if lemmatise:
        from nltk.stem.wordnet import WordNetLemmatizer
        lmtzr = WordNetLemmatizer()
        lemmatisers = {'en': lmtzr}
        lemmatiser = lemmatisers.get(lang, lmtzr)

    # POS TAGGERS (lemmatisation needs pos tags too)
    if postag or lemmatise:
        # nltk.download('averaged_perceptron_tagger')
        postaggers = {'en': nltk.pos_tag}
        tagger = postaggers.get(lang, nltk.pos_tag)

    # iterate over files, make a df of each, convert this
    # to conll and send to a new filename
    for f in fps:
        for_df = []
        data, enc = saferead(f)
        plain, enc = saferead(f.replace('-stripped', ''))
        #orig_data = data
        #data, offsets = process_meta(data, speaker_segmentation, metadata)
        #nest = []
        sents = sent_tokenizer.tokenize(data)
        soffs = sent_tokenizer.span_tokenize(data)
        toks = [tokeniser(sent) for sent in sents]
        ser = nested_list_to_pandas(toks)
        for_df.append(ser)
        if postag or lemmatise:
            postags = pos_tag_series(ser, tagger)
        if lemmatise:
            lemma = lemmatise_series(ser, postags, lemmatiser)
            for_df.append(lemma)
            for_df.append(postags)
        elif postag:
            for_df.append(postags)
        df = pd.concat(for_df, axis=1)
        fo = new_fname(f, inpath)
        write_df_to_conll(df, fo, metadata=metadata, plain=plain,
                          stripped=data,
                          speaker_segmentation=speaker_segmentation,
                          offsets=soffs)
        nsent = len(set(df.index.get_level_values(0)))
        print('%s created (%d sentences)' % (fo, nsent))

    if '-stripped' in inpath:
        return inpath.replace('-stripped', '-tokenised')
    return inpath + '-tokenised'

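# Usage sketch ('data/corpus-stripped' is a hypothetical path): tokenise a
# stripped plaintext corpus with POS tagging and lemmatisation. Assumes the
# nltk 'punkt', 'wordnet' and 'averaged_perceptron_tagger' data are installed.
if __name__ == '__main__':
    newpath = plaintext_to_conll('data/corpus-stripped',
                                 postag=True,
                                 lemmatise=True,
                                 speaker_segmentation=True)
    print(newpath)  # data/corpus-tokenised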