def run_tagger(corpus, outdir, tagger_jar):
    """
    Run the ark-tweet-tagger on all the (unannotated) documents in the
    corpus and save the results in the specified directory
    """
    for k in corpus:
        doc = corpus[k]

        k_txt = copy.copy(k)
        k_txt.stage = 'turns'
        k_txt.annotator = None

        root = stac.id_to_path(k_txt)
        txt_file = os.path.join(outdir, 'tmp', root + '.txt')
        txt_dir = os.path.split(txt_file)[0]
        if not os.path.exists(txt_dir):
            os.makedirs(txt_dir)
        with codecs.open(txt_file, 'w', 'utf-8') as f:
            print(extract_turns(doc), file=f)

        tagged_file = tagger_file_name(k, outdir)
        tagged_dir = os.path.split(tagged_file)[0]
        if not os.path.exists(tagged_dir):
            os.makedirs(tagged_dir)
        # from the runTagger script
        cmd = tagger_cmd(tagger_jar, txt_file)
        with open(tagged_file, 'wb') as tf:
            subprocess.call(cmd, stdout=tf)
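# Hedged usage sketch (not part of the original module): run_tagger expects
# `corpus` to be a mapping from educe.corpus.FileId to documents; how that
# mapping is loaded is left open here.  The jar path below is a placeholder
# for a locally downloaded copy of the CMU ARK tweet tagger.
def _tag_corpus_sketch(corpus, outdir):
    """Tag every document in `corpus` and report the .conll locations."""
    tagger_jar = os.path.join('lib', 'ark-tweet-nlp-0.3.2.jar')  # placeholder
    run_tagger(corpus, outdir, tagger_jar)
    for k in corpus:
        # tagger_file_name (below) mirrors the layout run_tagger writes to
        print(k, '->', tagger_file_name(k, outdir))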
def parsed_file_name(k, dir_name):
    """
    Given an educe.corpus.FileId and directory, return the file path
    within that directory that corresponds to the corenlp output
    """
    k2 = copy.copy(k)
    k2.stage = 'parsed'
    k2.annotator = 'stanford-corenlp'
    return os.path.join(dir_name, stac.id_to_path(k2) + '.xml')
def tagger_file_name(k, dir):
    """
    Given an educe.corpus.FileId and directory, return the file path
    within that directory that corresponds to the tagger output
    """
    k2 = copy.copy(k)
    k2.stage = 'pos-tagged'
    k2.annotator = 'ark-tweet-nlp'
    return os.path.join(dir, stac.id_to_path(k2) + '.conll')
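# Hedged illustration (not part of the original module) of the naming
# convention shared by parsed_file_name and tagger_file_name: copy the
# FileId, overwrite its stage and annotator, then defer to stac.id_to_path.
# The FileId constructor arguments below are assumptions for the example.
def _show_output_paths(outdir):
    """Print where corenlp and tagger output would live for one FileId."""
    from educe.corpus import FileId
    k = FileId(doc='pilot01', subdoc='01',
               stage='unannotated', annotator=None)
    print(parsed_file_name(k, outdir))   # corenlp output (.xml)
    print(tagger_file_name(k, outdir))   # tagger output (.conll)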
def mk_output_path(cls, odir, k, extension=''):
    """
    Generate a path within a parent directory, given a fileid
    """
    relpath = id_to_path(k)
    ofile_dirname = os.path.join(odir, os.path.dirname(relpath))
    ofile_basename = os.path.basename(relpath)
    return os.path.join(ofile_dirname, ofile_basename) + extension
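# Hedged note (not part of the original module): mk_output_path is
# essentially os.path.join(odir, id_to_path(k)) + extension; the sketch
# below pairs it with the mkdir-if-missing pattern used elsewhere in this
# module.  The `cls` argument is unused in the body, so None is passed.
def _ensure_output_path(odir, k, extension=''):
    """Build the output path for `k` and make sure its directory exists."""
    path = mk_output_path(None, odir, k, extension)
    out_dir = os.path.dirname(path)
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    return path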
def tagger_file_name(doc_key, root):
    """Get the file path to the output of the POS tagger for a document.

    The returned file path is a .conll file within the given directory.

    Parameters
    ----------
    doc_key : educe.corpus.FileId
        FileId of the document

    root : string
        Path to the folder containing annotations for this corpus,
        including the output of the POS tagger.

    Returns
    -------
    res : string
        Path to the .conll file output by the POS tagger.
    """
    doc_key2 = copy.copy(doc_key)
    doc_key2.stage = 'pos-tagged'
    doc_key2.annotator = 'ark-tweet-nlp'
    return os.path.join(root, stac.id_to_path(doc_key2) + '.conll')
def run_pipeline(corpus, outdir, corenlp_dir, split=False):
    """
    Run the standard corenlp pipeline on all the (unannotated) documents
    in the corpus and save the results in the specified directory.

    If `split=True`, we output one file per turn, an experimental mode
    to account for switching between multiple speakers.  We don't have
    all the infrastructure to read these back in (it should just be a
    matter of some filename manipulation though) and hope to flesh this
    out later.  We also intend to tweak the notion of splitting by
    aggregating consecutive turns with the same speaker, which may
    somewhat mitigate the loss of coreference information.
    """
    if split:
        # for each document, how many digits do we need to represent the
        # turns in that document; for essentially cosmetic purposes
        # (padding)
        digits = {}
        for d in frozenset([k.doc for k in corpus]):
            turns = []
            for k in corpus:
                if k.doc == d:
                    turns.extend([x for x in corpus[k].units
                                  if stac.is_turn(x)])
            turn_ids = [stac.turn_id(t)[0] for t in turns]
            digits[d] = max(2, int(math.ceil(math.log10(max(turn_ids)))))

    # dump the turn text
    # TODO: aggregate consecutive turns by same speaker
    txt_files = []
    for k in corpus:
        doc = corpus[k]

        k_txt = copy.copy(k)
        k_txt.stage = 'turns'
        k_txt.annotator = None

        if split:
            nb_digits = digits[k.doc]
            for tid, ttext in turn_id_text(doc):
                root = stac.id_to_path(k_txt) + '_' + tid.zfill(nb_digits)
                txt_file = os.path.join(outdir, 'tmp', root + '.txt')
                txt_dir = os.path.split(txt_file)[0]
                if not os.path.exists(txt_dir):
                    os.makedirs(txt_dir)
                with codecs.open(txt_file, 'w', 'utf-8') as f:
                    print(ttext, file=f)
                txt_files.append(txt_file)
        else:
            root = stac.id_to_path(k_txt)
            txt_file = os.path.join(outdir, 'tmp', root + '.txt')
            txt_dir = os.path.split(txt_file)[0]
            if not os.path.exists(txt_dir):
                os.makedirs(txt_dir)
            with codecs.open(txt_file, 'w', 'utf-8') as f:
                for _, ttext in turn_id_text(doc):
                    print(ttext, file=f)
            txt_files.append(txt_file)

    # run CoreNLP
    corenlp_wrapper = CoreNlpWrapper(corenlp_dir)
    corenlp_props = [] if split else ['ssplit.eolonly=true']
    corenlp_outdir = corenlp_wrapper.process(txt_files, outdir,
                                             properties=corenlp_props)

    # corenlp dumps all the output into one flat directory;
    # move them to the standard STAC layout paths
    for sfile in os.listdir(corenlp_outdir):
        if os.path.splitext(sfile)[1] != '.xml':
            continue
        from_path = os.path.join(corenlp_outdir, sfile)
        # targeted (STAC) filename
        k, tid = from_corenlp_output_filename(sfile)
        to_path = parsed_file_name(k, outdir)
        to_dir = os.path.dirname(to_path)
        if not os.path.exists(to_dir):
            os.makedirs(to_dir)
        os.rename(from_path, to_path)
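# Hedged usage sketch (not part of the original module): `corenlp_dir`
# should point at an unpacked Stanford CoreNLP distribution (the directory
# containing its jars); the exact path below is a placeholder.
def _parse_corpus_sketch(corpus, outdir):
    """Run CoreNLP over `corpus` and return the per-document output paths."""
    corenlp_dir = os.path.join('lib', 'stanford-corenlp-full')  # placeholder
    run_pipeline(corpus, outdir, corenlp_dir, split=False)
    # after the move step at the end of run_pipeline, each parse lives here
    return [parsed_file_name(k, outdir) for k in corpus]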
def run_pipeline(corpus, outdir, corenlp_dir, split=False):
    """
    Run the standard corenlp pipeline on all the (unannotated) documents
    in the corpus and save the results in the specified directory.

    If `split=True`, we output one file per turn, an experimental mode
    to account for switching between multiple speakers.  We don't have
    all the infrastructure to read these back in (it should just be a
    matter of some filename manipulation though) and hope to flesh this
    out later.  We also intend to tweak the notion of splitting by
    aggregating consecutive turns with the same speaker, which may
    somewhat mitigate the loss of coreference information.
    """
    # for each document, how many digits do we need to represent the turns
    # in that document; for essentially cosmetic purposes (padding)
    digits = {}
    for d in frozenset([k.doc for k in corpus]):
        turns = []
        for k in corpus:
            if k.doc == d:
                turns.extend(filter(stac.is_turn, corpus[k].units))
        turn_ids = [int(t.features['Identifier']) for t in turns]
        digits[d] = max(2, int(math.ceil(math.log10(max(turn_ids)))))

    # dump the turn text
    # TODO: aggregate consecutive turns by same speaker
    txt_files = []
    for k in corpus:
        doc = corpus[k]
        turns = sorted(filter(stac.is_turn, doc.units),
                       key=lambda t: t.span)

        k_txt = copy.copy(k)
        k_txt.stage = 'turns'
        k_txt.annotator = None

        if split:
            for turn in turns:
                ttext = stac.split_turn_text(doc.text_for(turn))[1]
                tid = turn.features['Identifier']
                root = stac.id_to_path(k_txt) + '_' + tid.zfill(digits[k.doc])
                txt_file = os.path.join(outdir, 'tmp', root + '.txt')
                txt_dir = os.path.split(txt_file)[0]
                if not os.path.exists(txt_dir):
                    os.makedirs(txt_dir)
                with codecs.open(txt_file, 'w', 'utf-8') as f:
                    print(ttext, file=f)
                txt_files.append(txt_file)
        else:
            root = stac.id_to_path(k_txt)
            txt_file = os.path.join(outdir, 'tmp', root + '.txt')
            txt_dir = os.path.split(txt_file)[0]
            if not os.path.exists(txt_dir):
                os.makedirs(txt_dir)
            with codecs.open(txt_file, 'w', 'utf-8') as f:
                for turn in turns:
                    ttext = stac.split_turn_text(doc.text_for(turn))[1]
                    print(ttext, file=f)
            txt_files.append(txt_file)

    # manifest tells corenlp what files to read as input
    manifest_dir = os.path.join(outdir, 'tmp')
    manifest_file = os.path.join(manifest_dir, 'manifest')
    with codecs.open(manifest_file, 'w', 'utf-8') as f:
        print('\n'.join(txt_files), file=f)

    # java properties to control behaviour of corenlp
    properties = [] if split else ['ssplit.eolonly=true']
    props_file = os.path.join(manifest_dir, 'corenlp.properties')
    with codecs.open(props_file, 'w', 'utf-8') as f:
        print('\n'.join(properties), file=f)

    # run corenlp (will take a while for it to load its various models)
    jars = [x for x in os.listdir(corenlp_dir)
            if os.path.splitext(x)[1] == '.jar']
    cp_sep = ':' if os.name != 'nt' else ';'

    corenlp_outdir = os.path.join(outdir, 'corenlp')
    if not os.path.exists(corenlp_outdir):
        os.makedirs(corenlp_outdir)

    cmd = ['java',
           '-cp', cp_sep.join(jars),
           '-Xmx3g',
           'edu.stanford.nlp.pipeline.StanfordCoreNLP',
           '-filelist', manifest_file,
           '-props', props_file,
           '-outputDirectory', corenlp_outdir]
    subprocess.call(cmd, cwd=corenlp_dir)

    # corenlp dumps all the output into one flat directory;
    # move them to the standard STAC layout paths
    for sfile in os.listdir(corenlp_outdir):
        if os.path.splitext(sfile)[1] != '.xml':
            continue
        k, tid = from_corenlp_output_filename(sfile)
        from_path = os.path.join(corenlp_outdir, sfile)
        to_path = parsed_file_name(k, outdir)
        to_dir = os.path.dirname(to_path)
        if not os.path.exists(to_dir):
            os.makedirs(to_dir)
        os.rename(from_path, to_path)
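# For reference, the java invocation assembled above corresponds roughly to
# running the following from inside `corenlp_dir` (a hedged illustration
# with placeholder paths, using a classpath wildcard instead of listing
# every jar explicitly):
#
#   java -cp "*" -Xmx3g edu.stanford.nlp.pipeline.StanfordCoreNLP \
#        -filelist <outdir>/tmp/manifest \
#        -props <outdir>/tmp/corenlp.properties \
#        -outputDirectory <outdir>/corenlp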