def main():
  steps = []

  # Put additional steps in here. Arguments, stdin/stdout, etc. get set below

  # unpack_lrlp.sh
  steps.append(
      Step('unpack_lrlp.sh',
           call=check_output,
           help="untars lrlp into position for further processing"))

  # gather_ephemera.py
  steps.append(
      Step('gather_ephemera.py', help="relocates assorted bits from lrlp"))

  # extract_lexicon.py
  steps.append(
      Step('extract_lexicon.py',
           help="get flat form of bilingual lexicon",
           abortOnFail=False))

  # clean_lexicon
  steps.append(
      Step('clean.sh',
           name="clean_lexicon",
           help="wildeclean/nfkc lexicon file",
           abortOnFail=False))

  # normalize_lexicon.py
  steps.append(
      Step('normalize_lexicon_tg.py',
           name="normalize_lexicon.py",
           help="heuristically convert lexicon into something more machine readable",
           abortOnFail=False))

  # relocate lexicon
  steps.append(
      Step('cp',
           progpath='/bin',
           name="relocate_lexicon",
           help="move the lexicon stuff into ephemera",
           abortOnFail=False))

  # get_tweet_by_id.rb
  steps.append(
      Step('get_tweet_by_id.rb',
           help="download tweets. must have twitter gem installed "
           "and full internet",
           abortOnFail=False))

  steps.append(
      Step('ldc_tok.py', help="run ldc tokenizer on tweets", abortOnFail=False))

  # extract_psm_annotation.py
  steps.append(
      Step('extract_psm_annotation.py',
           help="get annotations from psm files into psm.ann",
           abortOnFail=False))

  # extract_entity_annotation.py
  steps.append(
      Step('extract_entity_annotation.py',
           help="get entity and other annotations into entity.ann",
           abortOnFail=False))

  # extract_parallel.py
  steps.append(Step('extract_parallel.py', help="get flat form parallel data"))

  steps.append(
      Step('filter_parallel.py',
           help="filter parallel data to remove likely mismatches"))

  # extract_mono.py
  steps.append(Step('extract_mono.py', help="get flat form mono data"))

  # extract_comparable.py
  steps.append(
      Step('extract_comparable.py', help="get flat form comparable data"))

  stepsbyname = {}
  for step in steps:
    stepsbyname[step.name] = step

  parser = argparse.ArgumentParser(
      description="Process a LRLP into flat format",
      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument("--tarball", "-t", nargs='+', required=True,
                      help='path to gzipped tars for processing (all tars '
                      'considered to be part of the same package). Ex: lrlp.tar.gz')
  parser.add_argument("--language", "-l", required=True,
                      help='three letter code of language. example "uzb"')
  parser.add_argument("--lexversion", "-L", default='1.5',
                      help='version of lexicon to extract (may need to create a new one)')
  parser.add_argument("--key", "-k", default=None,
                      help='decryption key for encrypted il')
  parser.add_argument("--set", "-S", default=None,
                      help='decryption set for encrypted il')
  addonoffarg(parser, "mono", help="extract mono data", default=True)
  parser.add_argument("--previous", default=None,
                      help='path to previous extraction (equivalent to one level down from root)')
  parser.add_argument("--root", "-r", default='/home/nlg-02/LORELEI/ELISA/data',
                      help='path to where the extraction will take place')
  parser.add_argument("--evalil", "-E", action='store_true', default=False,
                      help='this is an eval il. makes expdir set0 aware')
  parser.add_argument("--expdir", "-e",
                      help='path to where the extraction is (equivalent to '
                      'root/lang/expanded/lrlp). If starting at step 0 this is ignored')
  parser.add_argument("--start", "-s", type=int, default=0,
                      help='step to start at')
  parser.add_argument("--stop", "-p", type=int, default=len(steps) - 1,
                      help='step to stop at (inclusive)')
  parser.add_argument("--liststeps", "-x", nargs=0, action=make_action(steps),
                      help='print step list and exit')
  parser.add_argument("--ruby", default="ruby",
                      help='path to ruby (2.1 or higher)')
  addonoffarg(parser, "swap",
              help="swap source/target in found data (e.g. il3)", default=False)

  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))
    sys.exit(2)

  if args.expdir is not None and args.start <= 0:
    sys.stderr.write("Warning: expdir is set but will be ignored and "
                     "determined dynamically\n")
  if args.expdir is None and args.start > 0:
    sys.stderr.write("Error: must explicitly set expdir if not starting "
                     "at step 0\n")
    sys.exit(1)

  rootdir = args.root
  language = args.language
  start = args.start
  stop = args.stop + 1

  if (args.key is None) ^ (args.set is None):
    sys.stderr.write("key (-k) and set (-S) must both be set or unset\n")
    sys.exit(1)

  # Patchups for step 0
  argstring = "-k %s -s %s" % (args.key, args.set) if args.key is not None else ""
  argstring += " -l %s -r %s %s" % (language, rootdir, ' '.join(args.tarball))
  sys.stderr.write("args for unpack lrlp are {}\n".format(argstring))
  stepsbyname["unpack_lrlp.sh"].argstring = argstring

  if start == 0:
    expdir = steps[0].run().strip().decode("utf-8")
    if args.evalil:
      expdir = os.path.join(expdir, 'set0')
    start += 1
  else:
    expdir = args.expdir

  monodir = os.path.join(expdir, 'data', 'monolingual_text')
  # what are the mono files? (needed for later)
  if args.mono and args.previous is None:
    monoindirs = dirfind(monodir, "ltf.zip")
  else:
    monoindirs = []

  # Patchups for the rest
  if stop > 0:
    # TWEET
    tweetintab = os.path.join(expdir, 'docs', 'twitter_info.tab')
    tweetdir = os.path.join(rootdir, language, 'tweet', 'rsd')
    if not os.path.exists(tweetintab):
      stepsbyname["get_tweet_by_id.rb"].disable()
      stepsbyname["ldc_tok.py"].disable()
    else:
      tweetprogpaths = []
      # for toolroot in (expdir, scriptdir):  # bad ldc tools for eval
      for toolroot in (scriptdir, ):
        tweetprogpaths = dirfind(os.path.join(toolroot, 'tools'),
                                 'get_tweet_by_id.rb')
        if len(tweetprogpaths) > 0:
          break
      if len(tweetprogpaths) == 0:
        sys.stderr.write("Can't find get_tweet_by_id.rb\n")
        sys.exit(1)
      else:
        tweetprogpath = os.path.dirname(tweetprogpaths[0])
      mkdir_p(tweetdir)
      tweeterr = os.path.join(rootdir, language, 'extract_tweet.err')
      stepsbyname["get_tweet_by_id.rb"].stderr = tweeterr
      # just copy from previous or skip if no mono
      if not args.mono:
        if args.previous is None:
          stepsbyname["get_tweet_by_id.rb"].disable()
        else:
          # WARNING: old versions of data won't have this structure
          oldtweetdir = os.path.join(args.previous, 'tweet', 'rsd')
          stepsbyname["get_tweet_by_id.rb"].progpath = "/bin"
          stepsbyname["get_tweet_by_id.rb"].prog = "cp"
          stepsbyname["get_tweet_by_id.rb"].argstring = "-r {} {}".format(
              oldtweetdir, tweetdir)
      else:
        stepsbyname["get_tweet_by_id.rb"].progpath = tweetprogpath
        stepsbyname["get_tweet_by_id.rb"].argstring = tweetdir + " -l " + language
        stepsbyname["get_tweet_by_id.rb"].scriptbin = args.ruby
        if os.path.exists(tweetintab):
          stepsbyname["get_tweet_by_id.rb"].stdin = tweetintab
        else:
          stepsbyname["get_tweet_by_id.rb"].disable()

      # TOKENIZE AND RELOCATE TWEETS
      # find rb location, params file
      tokexecpaths = []
      thetoolroot = None
      for toolroot in (expdir, scriptdir):
        tokexecpaths = dirfind(os.path.join(toolroot, 'tools'), 'token_parse.rb')
        if len(tokexecpaths) > 0:
          thetoolroot = toolroot
          break
      if len(tokexecpaths) == 0:
        sys.stderr.write("Can't find token_parse.rb\n")
        sys.exit(1)
      tokexec = tokexecpaths[0]
      tokparamopts = dirfind(os.path.join(thetoolroot, 'tools'), 'yaml')
      tokparam = "--param {}".format(tokparamopts[0]) if len(tokparamopts) > 0 else ""
      lrlpdir = os.path.join(expdir, 'data', 'translation',
                             'from_{}'.format(language), language, 'ltf')
      # ugly: the base of the file monodir/mononame.zip; need to add it to
      # monoindirs and just pass that base so it gets constructed
      mononame = "tweets.ltf"
      monoindirs.append(os.path.join(monodir, mononame + ".zip"))
      stepsbyname["ldc_tok.py"].argstring = (
          "--mononame {mononame} -m {monodir} --ruby {ruby} --dldir {tweetdir} "
          "--lrlpdir {lrlpdir} --exec {tokexec} {tokparam} --outfile {outfile}".format(
              monodir=monodir,
              mononame=mononame,
              ruby=args.ruby,
              tweetdir=tweetdir,
              lrlpdir=lrlpdir,
              tokexec=tokexec,
              tokparam=tokparam,
              outfile=os.path.join(rootdir, language, 'ldc_tok.stats')))
      stepsbyname["ldc_tok.py"].stderr = os.path.join(rootdir, language,
                                                      'ldc_tok.err')

    # EPHEMERA
    ephemdir = os.path.join(rootdir, language, 'ephemera')
    ephemarg = "-s {} -t {}".format(expdir, ephemdir)
    if args.previous is not None:
      ephemarg += " -o {}".format(os.path.join(args.previous, 'ephemera'))
    stepsbyname['gather_ephemera.py'].argstring = ephemarg
    ephemerr = os.path.join(rootdir, language, 'gather_ephemera.err')
    stepsbyname['gather_ephemera.py'].stderr = ephemerr

    # # LTF2RSD
    # l2rindir = os.path.join(expdir, 'data', 'translation', 'from_'+language,
    #                         'eng')  # Only converts from_SRC_tweet subdir
    # stepsbyname["ltf2rsd.perl"].argstring = l2rindir
    # # l2rprogpath = os.path.join(expdir, 'tools', 'ltf2txt')
    # # stepsbyname["ltf2rsd.perl"].progpath = l2rprogpath
    # l2rerr = os.path.join(rootdir, language, 'ltf2rsd.err')
    # stepsbyname["ltf2rsd.perl"].stderr = l2rerr

    # LEXICON
    # # IL CHANGE
    if args.evalil:
      lexiconinfile = os.path.join(expdir, 'docs', 'categoryI_dictionary', '*.xml')
      if args.lexversion == "il6":
        lexiconinfile = os.path.join(expdir, 'docs', 'categoryI_dictionary', '*.zip')
      elif args.lexversion == "il5":
        lexiconinfile = os.path.join(expdir, 'docs', 'categoryI_dictionary', '*.txt')
    else:
      lexiconinfile = os.path.join(expdir, 'data', 'lexicon', '*.xml')
    lexiconoutdir = os.path.join(rootdir, language, 'lexicon')
    lexiconrawoutfile = os.path.join(lexiconoutdir, 'lexicon.raw')
    lexiconoutfile = os.path.join(lexiconoutdir, 'lexicon')
    lexiconnormoutfile = os.path.join(lexiconoutdir, 'lexicon.norm')
    lexiconerr = os.path.join(rootdir, language, 'extract_lexicon.err')
    lexiconcleanerr = os.path.join(rootdir, language, 'clean_lexicon.err')
    lexiconnormerr = os.path.join(rootdir, language, 'normalize_lexicon.err')
    # lexicon v1.5 for y2
    stepsbyname["extract_lexicon.py"].argstring = " -v {} -i {} -o {}".format(
        args.lexversion, lexiconinfile, lexiconrawoutfile)
    stepsbyname["extract_lexicon.py"].stderr = lexiconerr
    stepsbyname["clean_lexicon"].argstring = "{} {}".format(
        lexiconrawoutfile, lexiconoutfile)
    stepsbyname["clean_lexicon"].stderr = lexiconcleanerr
    stepsbyname["normalize_lexicon.py"].argstring = "-i %s -o %s" % \
        (lexiconoutfile, lexiconnormoutfile)
    stepsbyname["normalize_lexicon.py"].stderr = lexiconnormerr
    stepsbyname["relocate_lexicon"].argstring = "-r %s %s" % (lexiconoutdir, ephemdir)

    # PSM
    # just copy from previous or skip if no mono
    psmerr = os.path.join(rootdir, language, 'extract_psm_annotation.err')
    stepsbyname["extract_psm_annotation.py"].stderr = psmerr
    psmoutpath = os.path.join(rootdir, language, 'psm.ann')
    if not args.mono:
      if args.previous is None:
        stepsbyname["extract_psm_annotation.py"].disable()
      else:
        oldpsm = os.path.join(args.previous, 'psm.ann')
        stepsbyname["extract_psm_annotation.py"].progpath = "/bin"
        stepsbyname["extract_psm_annotation.py"].prog = "cp"
        stepsbyname["extract_psm_annotation.py"].argstring = "{} {}".format(
            oldpsm, psmoutpath)
    else:
      psmindir = os.path.join(monodir, 'zipped', '*.psm.zip')
      stepsbyname["extract_psm_annotation.py"].argstring = "-i %s -o %s" % \
          (psmindir, psmoutpath)

    # ENTITY
    entityoutpath = os.path.join(rootdir, language, 'entity.ann')
    entityerr = os.path.join(rootdir, language, 'extract_entity_annotation.err')
    stepsbyname["extract_entity_annotation.py"].argstring = "-r %s -o %s -et %s" \
        % (expdir, entityoutpath, tweetdir)
    stepsbyname["extract_entity_annotation.py"].stderr = entityerr

    # PARALLEL
    paralleloutdir = os.path.join(rootdir, language, 'parallel', 'extracted')
    parallelerr = os.path.join(rootdir, language, 'extract_parallel.err')
    stepsbyname["extract_parallel.py"].argstring = "--no-cdec -r %s -o %s -s %s" % \
        (expdir, paralleloutdir, language)
    stepsbyname["extract_parallel.py"].stderr = parallelerr
    if args.swap:
      stepsbyname["extract_parallel.py"].argstring += " --swap"

    filteroutdir = os.path.join(rootdir, language, 'parallel', 'filtered')
    rejectoutdir = os.path.join(rootdir, language, 'parallel', 'rejected')
    filtererr = os.path.join(rootdir, language, 'filter_parallel.err')
    stepsbyname["filter_parallel.py"].argstring = "-s 2 -l %s -i %s -f %s -r %s" % \
        (language, paralleloutdir, filteroutdir, rejectoutdir)
    stepsbyname["filter_parallel.py"].stderr = filtererr

    # MONO
    # just copy from previous or skip if no mono
    monoerr = os.path.join(rootdir, language, 'extract_mono.err')
    stepsbyname["extract_mono.py"].stderr = monoerr
    if not args.mono:
      if args.previous is None:
        stepsbyname["extract_mono.py"].disable()
      else:
        oldmonodir = os.path.join(args.previous, 'mono')
        monooutdir = os.path.join(rootdir, language, 'mono')
        stepsbyname["extract_mono.py"].progpath = "/bin"
        stepsbyname["extract_mono.py"].prog = "cp"
        stepsbyname["extract_mono.py"].argstring = "-r {} {}".format(
            oldmonodir, monooutdir)
    else:
      monooutdir = os.path.join(rootdir, language, 'mono', 'extracted')
      stepsbyname["extract_mono.py"].argstring = "--no-cdec -i %s -o %s" % \
          (' '.join(monoindirs), monooutdir)

    # COMPARABLE
    if os.path.exists(os.path.join(expdir, 'data', 'translation', 'comparable')):
      compoutdir = os.path.join(rootdir, language, 'comparable', 'extracted')
      comperr = os.path.join(rootdir, language, 'extract_comparable.err')
      stepsbyname["extract_comparable.py"].argstring = "-r %s -o %s -s %s" % \
          (expdir, compoutdir, language)
      stepsbyname["extract_comparable.py"].stderr = comperr
    else:
      stepsbyname["extract_comparable.py"].disable()

  for step in steps[start:stop]:
    step.run()

  print("Done.\nExpdir is %s" % expdir)
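
# ---------------------------------------------------------------------------
# Editorial note (not part of the original source): the mains in this file
# lean on a small pipeline API -- Step, dirfind, mkdir_p, plus addonoffarg,
# make_action, and the module-level scriptdir -- defined elsewhere in the
# package. The sketch below only illustrates the interface those calls imply
# (prog/name, argstring, stdin/stderr, progpath, scriptbin, disable(), run());
# it is an assumption, not the actual implementation.
# ---------------------------------------------------------------------------
import errno
import os
from subprocess import check_call, check_output

scriptdir = os.path.dirname(os.path.abspath(__file__))


def mkdir_p(path):
  """mkdir -p: create path, ignoring 'already exists' errors."""
  try:
    os.makedirs(path)
  except OSError as exc:
    if exc.errno != errno.EEXIST:
      raise


def dirfind(root, pattern):
  """Return paths under root whose filenames contain `pattern`."""
  hits = []
  for dirpath, _, filenames in os.walk(root):
    hits.extend(os.path.join(dirpath, f) for f in filenames if pattern in f)
  return hits


class Step(object):
  """One pipeline stage wrapping an external program (interface sketch only)."""

  def __init__(self, prog, progpath=None, name=None, call=check_call,
               help="", abortOnFail=True):
    self.prog = prog
    self.name = name or prog
    self.progpath = progpath or scriptdir
    self.call = call
    self.help = help
    self.abortOnFail = abortOnFail
    self.argstring = ""
    self.stdin = None        # path whose contents are fed to the program
    self.stderr = None       # path that captures the program's stderr
    self.scriptbin = None    # e.g. a ruby interpreter for .rb steps
    self.disabled = False

  def disable(self):
    self.disabled = True

  def run(self):
    if self.disabled:
      return b""
    cmd = os.path.join(self.progpath, self.prog)
    if self.scriptbin:
      cmd = "%s %s" % (self.scriptbin, cmd)
    cmd = "%s %s" % (cmd, self.argstring)
    # run through a shell: some argstrings above contain pipes and redirection
    stdin = open(self.stdin) if self.stdin else None
    stderr = open(self.stderr, 'w') if self.stderr else None
    try:
      return self.call(cmd, shell=True, stdin=stdin, stderr=stderr)
    except Exception:
      if self.abortOnFail:
        raise
      return b""

# A hypothetical end-to-end invocation of the processor above (the script
# filename and paths are placeholders; the flags come from its argparse
# definition):
#
#   python process_lrlp.py -t /path/to/lrlp.tar.gz -l uzb \
#       -r /home/nlg-02/LORELEI/ELISA/data --ruby ruby
#
# Use -x to print the numbered step list, and -s/-p with -e to rerun a
# sub-range of steps against an existing expansion.
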
def main():
  steps = []

  # decrypt_sets.py
  steps.append(Step('decrypt_sets.py', help="decode encrypted sets"))

  # get_tweet_by_id.rb
  steps.append(
      Step('get_tweet_by_id.rb',
           help="download tweets. must have twitter gem installed "
           "and full internet",
           abortOnFail=False))

  steps.append(
      Step('ldc_tok.py', help="run ldc tokenizer on tweets", abortOnFail=False))

  # extract_mono.py
  steps.append(Step('extract_mono.py', help="get flat form mono data"))

  steps.append(Step('make_mono_release.py', help="package mono flat data"))

  stepsbyname = {}
  for step in steps:
    stepsbyname[step.prog] = step

  parser = argparse.ArgumentParser(
      description="Build an eval IL monoset from LDC to elisa form",
      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument("--setdir", "-d", default='.',
                      help='name of set directory (i.e. set1, setE, etc.)')
  parser.add_argument("--language", "-l", default='uzb',
                      help='three letter code of IL language')
  parser.add_argument("--key", "-k", default=None,
                      help='decryption key for encrypted il')
  parser.add_argument("--notweets", "-n", action='store_true', default=None,
                      help='do not include tweets (for eval IL setE only)')
  parser.add_argument("--engset", "-E", action='store_true', default=None,
                      help='assume engset and ilset (for eval IL setE only)')
  parser.add_argument("--expdir", "-e",
                      help='path to where the extraction is. If starting at '
                      'step 0 this is ignored')
  parser.add_argument("--root", "-r", default='/home/nlg-02/LORELEI/ELISA/data',
                      help='path to where the extraction will take place')
  parser.add_argument("--outfile", "-o", help='name of the output file')
  parser.add_argument("--start", "-s", type=int, default=0,
                      help='step to start at')
  parser.add_argument("--stop", "-p", type=int, default=len(steps) - 1,
                      help='step to stop at (inclusive)')
  parser.add_argument("--liststeps", "-x", nargs=0, action=make_action(steps),
                      help='print step list and exit')
  parser.add_argument("--ruby", default="/opt/local/bin/ruby2.2",
                      help='path to ruby (2.1 or higher)')

  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))

  rootdir = args.root
  language = args.language
  setdir = args.setdir
  outdir = os.path.join(rootdir, language, setdir)
  outfile = os.path.join(outdir, args.outfile)
  start = args.start
  stop = args.stop + 1

  if args.engset:
    emstep = steps.pop(-1)
    stepsbyname.pop(emstep.name)
    mmrstep = steps.pop(-1)
    stepsbyname.pop(mmrstep.name)
    for flavor in (language, "eng"):
      newem = Step('extract_mono.py',
                   name='extract_mono_%s' % flavor,
                   help="get flat form mono data in %s" % flavor)
      steps.append(newem)
      stepsbyname[newem.name] = newem
      newmmr = Step('make_mono_release.py',
                    name='make_mono_release_%s' % flavor,
                    help="package mono flat data in %s" % flavor)
      steps.append(newmmr)
      stepsbyname[newmmr.name] = newmmr
    stop += 2

  if args.expdir is None:
    expdir = os.path.join(rootdir, language, 'expanded', 'lrlp')
  else:
    expdir = args.expdir
  mkdir_p(outdir)

  if args.key is None:
    stepsbyname["decrypt_sets.py"].disable()
  else:
    stepsbyname["decrypt_sets.py"].stderr = os.path.join(outdir, 'decrypt_sets.err')
    stepsbyname["decrypt_sets.py"].argstring = "-r %s -k %s -s %s" % (
        expdir, args.key, setdir)
    stepsbyname["decrypt_sets.py"].run()
    start += 1

  # from 2018 on, setE has il and eng variants
  monoindirs = []

  # TWEET
  # hack for tweets; early set of monodir
  monodir = os.path.join(expdir, setdir, 'data', 'monolingual_text')
  tweetintab = os.path.join(expdir, setdir, 'docs', 'twitter_info.tab')
  notweetsinmono = True
  if args.notweets or not os.path.exists(tweetintab):
    print("disabling twitter stuff; tweets in regular mono ok")
    notweetsinmono = False
    stepsbyname["get_tweet_by_id.rb"].disable()
    stepsbyname["ldc_tok.py"].disable()
  else:
    print("not disabling twitter stuff; look at {}; avoiding tweets in regular mono"
          .format(tweetintab))
    stepsbyname["get_tweet_by_id.rb"].stdin = tweetintab
    tweetprogpaths = []
    # for toolroot in (os.path.join(expdir, 'set0'), scriptdir):  # bad ldc tools for eval
    for toolroot in (scriptdir, ):
      tweetprogpaths = dirfind(os.path.join(toolroot, 'tools'),
                               'get_tweet_by_id.rb')
      if len(tweetprogpaths) > 0:
        break
    if len(tweetprogpaths) == 0:
      sys.stderr.write("Can't find get_tweet_by_id.rb\n")
      sys.exit(1)
    else:
      tweetprogpath = os.path.dirname(tweetprogpaths[0])
    tweetdir = os.path.join(outdir, 'tweet', 'rsd')
    stepsbyname["get_tweet_by_id.rb"].progpath = tweetprogpath
    mkdir_p(tweetdir)
    stepsbyname["get_tweet_by_id.rb"].argstring = tweetdir + " -l " + language
    tweeterr = os.path.join(outdir, 'extract_tweet.err')
    stepsbyname["get_tweet_by_id.rb"].stderr = tweeterr
    stepsbyname["get_tweet_by_id.rb"].scriptbin = args.ruby

    # TOKENIZE AND RELOCATE TWEETS
    # find rb location, params file
    tokexecpaths = []
    thetoolroot = None
    for toolroot in (expdir, scriptdir):
      tokexecpaths = dirfind(os.path.join(toolroot, 'tools'), 'token_parse.rb')
      if len(tokexecpaths) > 0:
        thetoolroot = toolroot
        break
    if len(tokexecpaths) == 0:
      sys.stderr.write("Can't find token_parse.rb\n")
      sys.exit(1)
    tokexec = tokexecpaths[0]
    tokparamopts = dirfind(os.path.join(thetoolroot, 'tools'), 'yaml')
    tokparam = "--param {}".format(tokparamopts[0]) if len(tokparamopts) > 0 else ""
    # ugly: the base of the file monodir/mononame.zip; need to add it to
    # monoindirs and just pass that base so it gets constructed
    mononame = "tweets.ltf"
    monoindirs.append(os.path.join(monodir, mononame + ".zip"))
    stepsbyname["ldc_tok.py"].argstring = (
        "--mononame {mononame} -m {monodir} --ruby {ruby} --dldir {tweetdir} "
        "--exec {tokexec} {tokparam} --outfile {outfile}".format(
            mononame=mononame,
            monodir=monodir,
            ruby=args.ruby,
            tweetdir=tweetdir,
            tokexec=tokexec,
            tokparam=tokparam,
            outfile=os.path.join(rootdir, language, 'ldc_tok.stats')))
    stepsbyname["ldc_tok.py"].stderr = os.path.join(rootdir, language, 'ldc_tok.err')

  # # TODO: log tweets!

  # MONO
  if args.engset:
    for flavor in (args.language, "eng"):
      localmonoindirs = copy.deepcopy(monoindirs)
      monodir = os.path.join(expdir, setdir, 'data', 'monolingual_text', flavor)
      localmonoindirs.extend(dirfind(monodir, "%s_%s.ltf.zip" % (setdir, flavor)))
      print(localmonoindirs)
      # JM: TODO: ugly copy. refactor!!!
      monooutdir = os.path.join(outdir, 'mono', 'extracted_%s' % flavor)
      monoerr = os.path.join(outdir, 'extract_mono_%s.err' % flavor)
      stepsbyname["extract_mono_%s" % flavor].argstring = "--no-cdec --nogarbage -i %s -o %s" % \
          (' '.join(localmonoindirs), monooutdir)
      if notweetsinmono:
        stepsbyname["extract_mono_%s" % flavor].argstring += " --removesn"
      stepsbyname["extract_mono_%s" % flavor].stderr = monoerr
      # since we package and extract all at once, use the ltf structure to
      # declare the manifest names
      manfiles = [x for x in map(
          lambda y: '.'.join(os.path.basename(y).split('.')[:-2]),
          localmonoindirs)]
      # tweet 2 mono set here so that mono and tweet dirs are already established
      # if stepsbyname["get_tweet_by_id.rb"].disabled:
      #   stepsbyname["extract_mono_tweet.py"].disable()
      # else:
      #   stepsbyname["extract_mono_tweet.py"].argstring = "--nogarbage -i "+tweetdir+" -o "+monooutdir
      #   stepsbyname["extract_mono_tweet.py"].stderr = os.path.join(outdir, 'extract_mono_tweet.err')
      #   manfiles.append("tweets")
      ofcomponents = outfile.split('.')
      localoutfile = '.'.join(ofcomponents[:-1]) + (".%s." % flavor) + ofcomponents[-1]
      print(localoutfile)

      # PACKAGE
      monoxml = localoutfile
      monostatsfile = localoutfile + ".stats"
      manarg = ' '.join(manfiles)
      monoerr = os.path.join(outdir, 'make_mono_release_%s.err' % flavor)
      stepsbyname["make_mono_release_%s" % flavor].argstring = "--no-ext -r %s -l %s -c %s -s %s | gzip > %s" % \
          (monooutdir, flavor, manarg, monostatsfile, monoxml)
      stepsbyname["make_mono_release_%s" % flavor].stderr = monoerr
  else:
    monodir = os.path.join(expdir, setdir, 'data', 'monolingual_text')
    monoindirs.extend(dirfind(monodir, "%s.ltf.zip" % setdir))
    monooutdir = os.path.join(outdir, 'mono', 'extracted')
    monoerr = os.path.join(outdir, 'extract_mono.err')
    stepsbyname["extract_mono.py"].argstring = "--no-cdec --nogarbage -i %s -o %s" % \
        (' '.join(monoindirs), monooutdir)
    if notweetsinmono:
      stepsbyname["extract_mono.py"].argstring += " --removesn"
    stepsbyname["extract_mono.py"].stderr = monoerr
    # since we package and extract all at once, use the ltf structure to
    # declare the manifest names
    manfiles = [x for x in map(
        lambda y: '.'.join(os.path.basename(y).split('.')[:-2]), monoindirs)]
    # tweet 2 mono set here so that mono and tweet dirs are already established
    # if stepsbyname["get_tweet_by_id.rb"].disabled:
    #   stepsbyname["extract_mono_tweet.py"].disable()
    # else:
    #   stepsbyname["extract_mono_tweet.py"].argstring = "--nogarbage -i "+tweetdir+" -o "+monooutdir
    #   stepsbyname["extract_mono_tweet.py"].stderr = os.path.join(outdir, 'extract_mono_tweet.err')
    #   manfiles.append("tweets")

    # PACKAGE
    monoxml = outfile
    monostatsfile = outfile + ".stats"
    manarg = ' '.join(manfiles)
    monoerr = os.path.join(outdir, 'make_mono_release.err')
    stepsbyname["make_mono_release.py"].argstring = "--no-ext -r %s -l %s -c %s -s %s | gzip > %s" % \
        (monooutdir, language, manarg, monostatsfile, monoxml)
    stepsbyname["make_mono_release.py"].stderr = monoerr

  for step in steps[start:stop]:
    step.run()

  print("Done.\nLast file is %s" % outfile)
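
# Editorial note: both mono branches above derive the manifest names passed to
# make_mono_release.py by stripping the trailing ".ltf.zip" from each input
# zip's basename. A small standalone illustration (the paths are hypothetical):
def _manifest_names_demo():
  import os
  monoindirs = ['/exp/setE/data/monolingual_text/eng/setE_eng.ltf.zip',
                '/exp/setE/data/monolingual_text/tweets.ltf.zip']
  manfiles = ['.'.join(os.path.basename(y).split('.')[:-2]) for y in monoindirs]
  assert manfiles == ['setE_eng', 'tweets']
  return manfiles
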
def main():
  steps = []

  # decrypt_sets.py
  steps.append(Step('decrypt_sets.py', help="decode encrypted sets"))

  # extract_mono.py
  steps.append(Step('extract_mono.py', help="get flat form mono data"))

  # get_tweet_by_id.rb
  steps.append(
      Step('get_tweet_by_id.rb',
           help="download tweets. must have twitter gem installed "
           "and full internet",
           abortOnFail=False))

  # extract_mono_tweet.py
  steps.append(
      Step('extract_mono_tweet.py',
           help="make twitter data look like regular mono data"))

  steps.append(Step('make_mono_release.py', help="package mono flat data"))

  stepsbyname = {}
  for step in steps:
    stepsbyname[step.prog] = step

  parser = argparse.ArgumentParser(
      description="Build an eval IL monoset from LDC to elisa form",
      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument("--setdir", "-d", default='.',
                      help='name of set directory (i.e. set1, setE, etc.)')
  parser.add_argument("--language", "-l", default='uzb',
                      help='three letter code of IL language')
  parser.add_argument("--key", "-k", default=None,
                      help='decryption key for encrypted il')
  parser.add_argument("--notweets", "-n", action='store_true', default=None,
                      help='do not include tweets (for eval IL setE only)')
  parser.add_argument("--expdir", "-e",
                      help='path to where the extraction is. If starting at '
                      'step 0 this is ignored')
  parser.add_argument("--root", "-r", default='/home/nlg-02/LORELEI/ELISA/data',
                      help='path to where the extraction will take place')
  parser.add_argument("--outfile", "-o", help='name of the output file')
  parser.add_argument("--start", "-s", type=int, default=0,
                      help='step to start at')
  parser.add_argument("--stop", "-p", type=int, default=len(steps) - 1,
                      help='step to stop at (inclusive)')
  parser.add_argument("--liststeps", "-x", nargs=0, action=make_action(steps),
                      help='print step list and exit')
  parser.add_argument("--ruby", default="/opt/local/bin/ruby2.2",
                      help='path to ruby (2.1 or higher)')

  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))

  rootdir = args.root
  language = args.language
  setdir = args.setdir
  outdir = os.path.join(rootdir, language, setdir)
  outfile = os.path.join(outdir, args.outfile)
  start = args.start
  stop = args.stop + 1

  if args.expdir is None:
    expdir = os.path.join(rootdir, language, 'expanded', 'lrlp')
  else:
    expdir = args.expdir
  mkdir_p(outdir)

  if args.key is None:
    stepsbyname["decrypt_sets.py"].disable()
  else:
    stepsbyname["decrypt_sets.py"].stderr = os.path.join(outdir, 'decrypt_sets.err')
    stepsbyname["decrypt_sets.py"].argstring = "-r %s -k %s -s %s" % (
        expdir, args.key, setdir)
    stepsbyname["decrypt_sets.py"].run()
    start += 1

  # TWEET
  if args.notweets:
    stepsbyname["get_tweet_by_id.rb"].disable()
    stepsbyname["extract_mono_tweet.py"].disable()
  else:
    tweetprogpath = os.path.join(expdir, 'set0', 'tools', 'twitter-processing', 'bin')
    stepsbyname["get_tweet_by_id.rb"].progpath = tweetprogpath
    tweetdir = os.path.join(outdir, 'tweet')
    stepsbyname["get_tweet_by_id.rb"].argstring = tweetdir + " -l " + language
    tweetintab = os.path.join(expdir, setdir, 'docs', 'twitter_info.tab')
    if os.path.exists(tweetintab):
      stepsbyname["get_tweet_by_id.rb"].stdin = tweetintab
    else:
      stepsbyname["get_tweet_by_id.rb"].disable()
    tweeterr = os.path.join(outdir, 'extract_tweet.err')
    stepsbyname["get_tweet_by_id.rb"].stderr = tweeterr
    stepsbyname["get_tweet_by_id.rb"].scriptbin = args.ruby

  # # TODO: log tweets!

  # MONO
  monoindirs = dirfind(os.path.join(expdir, setdir, 'data', 'monolingual_text'),
                       "%s.ltf.zip" % setdir)
  monooutdir = os.path.join(outdir, 'mono', 'extracted')
  monoerr = os.path.join(outdir, 'extract_mono.err')
  stepsbyname["extract_mono.py"].argstring = "--nogarbage -i %s -o %s" % \
      (' '.join(monoindirs), monooutdir)
  stepsbyname["extract_mono.py"].stderr = monoerr
  # since we package and extract all at once, use the ltf structure to
  # declare the manifest names
  manfiles = [x for x in map(
      lambda y: '.'.join(os.path.basename(y).split('.')[:-2]), monoindirs)]

  # tweet 2 mono set here so that mono and tweet dirs are already established
  if stepsbyname["get_tweet_by_id.rb"].disabled:
    stepsbyname["extract_mono_tweet.py"].disable()
  else:
    stepsbyname["extract_mono_tweet.py"].argstring = "--nogarbage -i " + tweetdir + " -o " + monooutdir
    stepsbyname["extract_mono_tweet.py"].stderr = os.path.join(outdir, 'extract_mono_tweet.err')
    manfiles.append("tweets")

  # PACKAGE
  monoxml = outfile
  monostatsfile = outfile + ".stats"
  manarg = ' '.join(manfiles)
  monoerr = os.path.join(outdir, 'make_mono_release.err')
  stepsbyname["make_mono_release.py"].argstring = "-r %s -l %s -c %s -s %s | gzip > %s" % \
      (monooutdir, language, manarg, monostatsfile, monoxml)
  stepsbyname["make_mono_release.py"].stderr = monoerr

  for step in steps[start:stop]:
    step.run()

  print("Done.\nFile is %s" % outfile)
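
# ---------------------------------------------------------------------------
# Editorial note: a hypothetical invocation of the eval monoset builder above
# (the script filename and values are placeholders; the flags come from its
# argparse definition):
#
#   python make_eval_monoset.py -r /home/nlg-02/LORELEI/ELISA/data \
#       -l uzb -d setE -o elisa.uzb.setE.xml.gz -k DECRYPTKEY \
#       --ruby /opt/local/bin/ruby2.2
#
# -x prints the numbered step list; -s/-p select a sub-range of steps, and -n
# skips the tweet steps entirely.
# ---------------------------------------------------------------------------
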
def main():
  steps = []

  # Put additional steps in here. Arguments, stdin/stdout, etc. get set below

  # unpack_lrlp.sh
  steps.append(
      Step('unpack_lrlp.sh',
           call=check_output,
           help="untars lrlp into position for further processing"))

  # gather_ephemera.py
  steps.append(
      Step('gather_ephemera.py', help="relocates assorted bits from lrlp"))

  # extract_lexicon.py
  steps.append(
      Step('extract_lexicon.py',
           help="get flat form of bilingual lexicon",
           abortOnFail=False))

  # normalize_lexicon.py
  steps.append(
      Step('normalize_lexicon.py',
           help="heuristically convert lexicon into something more machine readable",
           abortOnFail=False))

  # relocate lexicon
  steps.append(
      Step('cp',
           progpath='/bin',
           help="move the lexicon stuff into ephemera",
           abortOnFail=False))

  # get_tweet_by_id.rb
  steps.append(
      Step('get_tweet_by_id.rb',
           help="download tweets. must have twitter gem installed "
           "and full internet",
           abortOnFail=False))

  # Use .ltf instead of .rsd for tweet translations
  # # ltf2rsd.perl
  # steps.append(Step('ltf2rsd.perl',
  #                   help="get flat form of tweet translations",
  #                   abortOnFail=False))

  # extract_psm_annotation.py
  steps.append(
      Step('extract_psm_annotation.py',
           help="get annotations from psm files into psm.ann",
           abortOnFail=False))

  # extract_entity_annotation.py
  steps.append(
      Step('extract_entity_annotation.py',
           help="get entity and other annotations into entity.ann",
           abortOnFail=False))

  # extract_parallel.py
  steps.append(Step('extract_parallel.py', help="get flat form parallel data"))

  steps.append(
      Step('filter_parallel.py',
           help="filter parallel data to remove likely mismatches"))

  # extract_mono.py
  steps.append(Step('extract_mono.py', help="get flat form mono data"))

  # extract_comparable.py
  steps.append(
      Step('extract_comparable.py', help="get flat form comparable data"))

  stepsbyname = {}
  for step in steps:
    stepsbyname[step.prog] = step

  parser = argparse.ArgumentParser(
      description="Process a LRLP into flat format",
      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument("--tarball", "-t", nargs='+', default=['lrlp.tar.gz'],
                      help='path to gzipped tars for processing (all tars '
                      'considered to be part of the same package)')
  parser.add_argument("--language", "-l", default='uzb',
                      help='three letter code of language')
  parser.add_argument("--key", "-k", default=None,
                      help='decryption key for encrypted il')
  parser.add_argument("--set", "-S", default=None,
                      help='decryption set for encrypted il')
  parser.add_argument("--root", "-r", default='/home/nlg-02/LORELEI/ELISA/data',
                      help='path to where the extraction will take place')
  parser.add_argument("--evalil", "-E", action='store_true', default=False,
                      help='this is an eval il. makes expdir set0 aware')
  parser.add_argument("--expdir", "-e",
                      help='path to where the extraction is. If starting at '
                      'step 0 this is ignored')
  parser.add_argument("--start", "-s", type=int, default=0,
                      help='step to start at')
  parser.add_argument("--stop", "-p", type=int, default=len(steps) - 1,
                      help='step to stop at (inclusive)')
  parser.add_argument("--liststeps", "-x", nargs=0, action=make_action(steps),
                      help='print step list and exit')
  parser.add_argument("--ruby", default="/opt/local/bin/ruby2.2",
                      help='path to ruby (2.1 or higher)')

  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))

  if args.expdir is not None and args.start <= 0:
    sys.stderr.write("Warning: expdir is set but will be ignored and "
                     "determined dynamically\n")
  if args.expdir is None and args.start > 0:
    sys.stderr.write("Error: must explicitly set expdir if not starting "
                     "at step 0\n")
    sys.exit(1)

  rootdir = args.root
  language = args.language
  start = args.start
  stop = args.stop + 1

  if (args.key is None) ^ (args.set is None):
    sys.stderr.write("key (-k) and set (-S) must both be set or unset\n")
    sys.exit(1)

  # Patchups for step 0
  argstring = "-k %s -s %s" % (args.key, args.set) if args.key is not None else ""
  argstring += " -l %s -r %s %s" % (language, rootdir, ' '.join(args.tarball))
  stepsbyname["unpack_lrlp.sh"].argstring = argstring

  if start == 0:
    expdir = steps[0].run().strip().decode("utf-8")
    if args.evalil:
      expdir = os.path.join(expdir, 'set0')
    start += 1
  else:
    expdir = args.expdir

  # Patchups for the rest
  if stop > 0:
    # TWEET
    # LDC changed its mind again
    # tweetprogpath = os.path.join(expdir, 'tools', 'twitter-processing')
    tweetprogpath = os.path.join(expdir, 'tools', 'twitter-processing', 'bin')
    stepsbyname["get_tweet_by_id.rb"].progpath = tweetprogpath
    tweetdir = os.path.join(rootdir, language, 'tweet')
    stepsbyname["get_tweet_by_id.rb"].argstring = tweetdir + " -l " + language
    tweetintab = os.path.join(expdir, 'docs', 'twitter_info.tab')
    if os.path.exists(tweetintab):
      stepsbyname["get_tweet_by_id.rb"].stdin = tweetintab
    else:
      stepsbyname["get_tweet_by_id.rb"].disable()
    tweeterr = os.path.join(rootdir, language, 'extract_tweet.err')
    stepsbyname["get_tweet_by_id.rb"].stderr = tweeterr
    stepsbyname["get_tweet_by_id.rb"].scriptbin = args.ruby

    # EPHEMERA
    ephemdir = os.path.join(rootdir, language, 'ephemera')
    stepsbyname['gather_ephemera.py'].argstring = "-s %s -t %s" % \
        (expdir, ephemdir)
    ephemerr = os.path.join(rootdir, language, 'gather_ephemera.err')
    stepsbyname['gather_ephemera.py'].stderr = ephemerr

    # # LTF2RSD
    # l2rindir = os.path.join(expdir, 'data', 'translation', 'from_'+language,
    #                         'eng')  # Only converts from_SRC_tweet subdir
    # stepsbyname["ltf2rsd.perl"].argstring = l2rindir
    # # l2rprogpath = os.path.join(expdir, 'tools', 'ltf2txt')
    # # stepsbyname["ltf2rsd.perl"].progpath = l2rprogpath
    # l2rerr = os.path.join(rootdir, language, 'ltf2rsd.err')
    # stepsbyname["ltf2rsd.perl"].stderr = l2rerr

    # LEXICON
    # # IL CHANGE
    # lexiconinfile = os.path.join(expdir, 'docs', 'categoryI_dictionary', '*.xml')
    lexiconinfile = os.path.join(expdir, 'data', 'lexicon', '*.xml')
    lexiconoutdir = os.path.join(rootdir, language, 'lexicon')
    lexiconoutfile = os.path.join(lexiconoutdir, 'lexicon')
    lexiconnormoutfile = os.path.join(lexiconoutdir, 'lexicon.norm')
    lexiconerr = os.path.join(rootdir, language, 'extract_lexicon.err')
    lexiconnormerr = os.path.join(rootdir, language, 'normalize_lexicon.err')
    # lexicon v1.5 for y2
    stepsbyname["extract_lexicon.py"].argstring = " -v 1.5 -i %s -o %s" % \
        (lexiconinfile, lexiconoutfile)
    stepsbyname["extract_lexicon.py"].stderr = lexiconerr
    stepsbyname["normalize_lexicon.py"].argstring = "-i %s -o %s" % \
        (lexiconoutfile, lexiconnormoutfile)
    stepsbyname["normalize_lexicon.py"].stderr = lexiconnormerr
    stepsbyname["cp"].argstring = "-r %s %s" % (lexiconoutdir, ephemdir)

    # PSM
    psmindir = os.path.join(expdir, 'data', 'monolingual_text', 'zipped', '*.psm.zip')
    psmoutpath = os.path.join(rootdir, language, 'psm.ann')
    psmerr = os.path.join(rootdir, language, 'extract_psm_annotation.err')
    stepsbyname["extract_psm_annotation.py"].argstring = "-i %s -o %s" % \
        (psmindir, psmoutpath)
    stepsbyname["extract_psm_annotation.py"].stderr = psmerr

    # ENTITY
    entityoutpath = os.path.join(rootdir, language, 'entity.ann')
    entityerr = os.path.join(rootdir, language, 'extract_entity_annotation.err')
    stepsbyname["extract_entity_annotation.py"].argstring = "-r %s -o %s -et %s" \
        % (expdir, entityoutpath, tweetdir)
    stepsbyname["extract_entity_annotation.py"].stderr = entityerr

    # PARALLEL
    paralleloutdir = os.path.join(rootdir, language, 'parallel', 'extracted')
    parallelerr = os.path.join(rootdir, language, 'extract_parallel.err')
    stepsbyname["extract_parallel.py"].argstring = "-r %s -o %s -s %s -et %s" % \
        (expdir, paralleloutdir, language, tweetdir)
    stepsbyname["extract_parallel.py"].stderr = parallelerr

    filteroutdir = os.path.join(rootdir, language, 'parallel', 'filtered')
    rejectoutdir = os.path.join(rootdir, language, 'parallel', 'rejected')
    filtererr = os.path.join(rootdir, language, 'filter_parallel.err')
    stepsbyname["filter_parallel.py"].argstring = "-s 2 -l %s -i %s -f %s -r %s" % \
        (language, paralleloutdir, filteroutdir, rejectoutdir)
    stepsbyname["filter_parallel.py"].stderr = filtererr

    # MONO
    monoindirs = dirfind(os.path.join(expdir, 'data', 'monolingual_text'), "ltf.zip")
    monooutdir = os.path.join(rootdir, language, 'mono', 'extracted')
    monoerr = os.path.join(rootdir, language, 'extract_mono.err')
    stepsbyname["extract_mono.py"].argstring = "-i %s -o %s" % \
        (' '.join(monoindirs), monooutdir)
    stepsbyname["extract_mono.py"].stderr = monoerr

    # COMPARABLE
    if os.path.exists(os.path.join(expdir, 'data', 'translation', 'comparable')):
      compoutdir = os.path.join(rootdir, language, 'comparable', 'extracted')
      comperr = os.path.join(rootdir, language, 'extract_comparable.err')
      stepsbyname["extract_comparable.py"].argstring = "-r %s -o %s -s %s" % \
          (expdir, compoutdir, language)
      stepsbyname["extract_comparable.py"].stderr = comperr
    else:
      stepsbyname["extract_comparable.py"].disable()

  for step in steps[start:stop]:
    step.run()

  print("Done.\nExpdir is %s" % expdir)