示例#1
0
def main():
  """Copy selected parts of an expanded LRLP (and optionally an old
  ephemera directory) into a single catch-all target directory."""
  parser = argparse.ArgumentParser(description="relocate parts of a lrlp into a centralized location to make it easier to gather later",
                                   formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument("--source", "-s",
                      help='path to the expanded lrlp')
  parser.add_argument("--old", "-o", default=None,
                      help='path to old ephemera directory')
  parser.add_argument("--target", "-t",
                      help='path to the desired catch-all directory')

  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))

  # always start from a clean target directory
  if os.path.exists(args.target):
    shutil.rmtree(args.target)
  mkdir_p(args.target)
  # copy every manifest entry that exists in the source tree
  for srcstub, dststub in manifest.items():
    srcpath = os.path.join(args.source, srcstub)
    if not os.path.exists(srcpath):
      continue
    dstpath = os.path.join(args.target, dststub)
    sys.stderr.write("Copying {} to {}\n".format(srcpath, dstpath))
    copything(srcpath, dstpath)
  if args.old is not None:
    # traverse top level of old and move everything not in transferexcluded to target/old
    oldroot = os.path.join(args.target, "old")
    mkdir_p(oldroot)
    for entry in os.listdir(args.old):
      if entry in transferexcluded:
        continue
      fullsource = os.path.join(args.old, entry)
      fulltarget = os.path.join(oldroot, entry)
      sys.stderr.write("Transferring {} to {}\n".format(fullsource, fulltarget))
      copything(fullsource, fulltarget)
示例#2
0
def relocate_ltf(dldir, lrlpdir, logfile):
    ''' relocate files and replace them

    Expects a sibling 'ltf' directory next to dldir holding the new
    files.  Existing *.ltf.xml files for social-network docs (is_sn) are
    moved to a sibling 'ltf.retired' backup directory, then every new
    *.ltf.xml is copied into lrlpdir.  logfile is currently unused.
    '''
    # source of the new files
    parent = os.path.dirname(dldir)
    repldir = os.path.join(parent, 'ltf')
    if not os.path.exists(repldir):
        sys.stderr.write(
            "Directories not set up properly; couldn't find {}\n".format(
                repldir))
        sys.exit(1)
    bkpdir = os.path.join(parent, 'ltf.retired')
    mkdir_p(bkpdir)
    mkdir_p(lrlpdir)
    # retire the old social-network ltf files
    # (loop variable renamed from 'file', which shadowed the builtin)
    for fname in os.listdir(lrlpdir):
        if not is_sn(fname) or not fname.endswith(".ltf.xml"):
            continue
        oldfile = os.path.join(lrlpdir, fname)
        bkpfile = os.path.join(bkpdir, fname)
        shutil.move(oldfile, bkpfile)
    for fname in os.listdir(repldir):
        if not fname.endswith(".ltf.xml"):
            continue
        dstfile = os.path.join(lrlpdir, fname)
        replfile = os.path.join(repldir, fname)
        # introduce the replacement file in the new location
        shutil.copyfile(replfile, dstfile)
示例#3
0
def main():
  """Copy manifest-listed parts of an expanded LRLP into a fresh
  catch-all target directory."""
  parser = argparse.ArgumentParser(description="relocate parts of a lrlp into a centralized location to make it easier to gather later",
                                   formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument("--source", "-s",
                      help='path to the expanded lrlp')
  parser.add_argument("--target", "-t",
                      help='path to the desired catch-all directory')

  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))

  # always start from a clean target directory
  if os.path.exists(args.target):
    shutil.rmtree(args.target)
  mkdir_p(args.target)
  for indirstub, outdirstub in manifest.items():
    indir = os.path.join(args.source, indirstub)
    if os.path.exists(indir):
      outdir = os.path.join(args.target, outdirstub)
      if os.path.isdir(indir):
        shutil.copytree(indir, outdir)
      elif os.path.isfile(indir):
        shutil.copy(indir, outdir)
      else:
        # terminate the log line with a newline so messages don't run together
        sys.stderr.write("%s not directory or file; skipping\n" % indir)
示例#4
0
def zip_and_copy(workdir, dldir, outfile, logfile):
    ''' zip up directory tree; requires a relocation for files to all line up right '''
    # the replacement ltf files live in a sibling 'ltf' directory
    ltfsrc = os.path.join(os.path.dirname(dldir), 'ltf')
    if not os.path.exists(ltfsrc):
        sys.stderr.write(
            "Directories not set up properly; couldn't find input {}\n".format(
                ltfsrc))
        sys.exit(1)
    # trying to get an ltf directory underneath so that zip file has ltf prefix before everything
    stagedir = os.path.join(workdir, 'foo')
    shutil.copytree(ltfsrc, os.path.join(stagedir, 'ltf'))
    mkdir_p(os.path.dirname(outfile))
    shutil.make_archive(outfile, 'zip', stagedir)
示例#5
0
def tokrsd(dldir, ruby, exec, param, workdir):
    ''' create ltfs from rsds '''
    rsddir = dldir
    ltfdir = os.path.join(os.path.dirname(rsddir), 'ltf')
    if not os.path.exists(rsddir):
        sys.stderr.write(
            "Directories not set up properly; couldn't find {}\n".format(
                rsddir))
        sys.exit(1)
    mkdir_p(ltfdir)
    # write the list of rsd files for the tokenizer to consume
    listfile = os.path.join(workdir, 'list')
    listfh = prepfile(listfile, 'w')
    listfh.writelines("{}\n".format(rsdfile)
                      for rsdfile in iglob(os.path.join(rsddir, '*.rsd.txt')))
    listfh.close()
    # optional tokenizer parameter flag
    paramtxt = "" if param is None else "-t {}".format(param)
    cmd = "{} {} {} {}".format(ruby, exec, paramtxt, listfile)
    return check_call(shlex.split(cmd))
示例#6
0
def main():
  """Distribute lines of a data file into category-specific directories.

  --catfile maps docids to categories; each line of --infile is paired
  (in order) with a docid from --idfile and written to
  <prefix>/<cat>/<postfix>/<basename of infile>.  Docs with no category
  fall through to the --remainder category's file.
  """
  parser = argparse.ArgumentParser(description="Given category per doc, idfile, data file, put data in category-specific dir",
                                   formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument("--catfile", "-c", nargs='?', type=argparse.FileType('r'), help="doc cat file (docid cat)")
  parser.add_argument("--idfile", "-d", nargs='?', type=argparse.FileType('r'), help="id file (docid per line)")
  parser.add_argument("--infile", "-i", nargs='?', type=argparse.FileType('r'), default=sys.stdin, help="input file")
  parser.add_argument("--prefix", "-p", default=".", help="directory prefix for categories")
  parser.add_argument("--postfix", "-P", default=".", help="directory postfix after categories")
  parser.add_argument("--remainder", "-r", default="train", help="remainder category. Should match previous remainder category")

  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))

  catfile = args.catfile
  infile =  args.infile
  idfile =  args.idfile

  basefile = os.path.basename(args.infile.name)
  cats = {}  # docid -> output file handle
  fhs = {}   # output path -> file handle (opened lazily, one per category dir)
  for line in catfile:
    doc, cat = line.strip().split('\t')
    prefix = os.path.join(args.prefix, cat, args.postfix)
    innercatfile = os.path.join(prefix, basefile)
    if innercatfile not in fhs:
      mkdir_p(prefix)
      fhs[innercatfile]=open(innercatfile, 'w')
    cats[doc]=fhs[innercatfile]
  # the remainder category always gets an output file, even if empty
  remcatpref = os.path.join(args.prefix, args.remainder, args.postfix)
  remaindercatfile = os.path.join(remcatpref, basefile)
  if remaindercatfile not in fhs:
    mkdir_p(remcatpref)
    fhs[remaindercatfile]=open(remaindercatfile, 'w')

  for doc, data in zip(idfile, infile):
    doc = doc.strip()
    fh = cats[doc] if doc in cats else fhs[remaindercatfile]
    fh.write(data)
  # close all output handles so buffered data is flushed to disk
  # (previously these were left open, relying on interpreter shutdown)
  for fh in fhs.values():
    fh.close()
示例#7
0
def main():
  """Extract parallel data from an expanded LRLP to flat files and manifests.

  Builds the output subdirectory tree (original, garbage, tokenized,
  morph, pos, plus lowercased tokenizer variants), appends a provenance
  record to a 'source' log, then extracts each standard corpus section,
  "found" data, and (optionally) extracted tweets via printout().
  """
  parser = argparse.ArgumentParser(description="extract parallel data from " \
                                   "expanded lrlp to flat files and manifests.")
  parser.add_argument("--rootdir", "-r", default=".",
                      help="root lrlp dir")
  parser.add_argument("--datadirs", nargs='+', default=['data', 'translation'],
                      help="elements in path from root to ltf files")
  parser.add_argument("--outdir", "-o", default="./parallel/extracted",
                      help="where to write extracted files")
  parser.add_argument("--src", "-s", default='uzb',
                      help="source language 3 letter code")
  parser.add_argument("--trg", "-t", default='eng',
                      help="target language 3 letter code")
  parser.add_argument("--origsubdir", default="original",
                      help="subdirectory for untokenized files")
  parser.add_argument("--garbagesubdir", default="garbage",
                      help="subdirectory for garbage files (under orig)")
  parser.add_argument("--nogarbage", action='store_true', default=False,
                      help="turn off garbage filtering")
  parser.add_argument("--toksubdir", default="tokenized",
                      help="subdirectory for tokenized files")
  parser.add_argument("--cdectoksubdir", default="cdec-tokenized",
                      help="subdirectory for cdec-tokenized files")
  parser.add_argument("--agiletoksubdir", default="agile-tokenized",
                      help="subdirectory for agile-tokenized files (target side only)")
  parser.add_argument("--morphtoksubdir", default="morph-tokenized",
                      help="subdirectory for tokenized files based on " \
                      "morphological segmentation")
  parser.add_argument("--morphsubdir", default="morph",
                      help="subdirectory for morphological files")
  parser.add_argument("--possubdir", default="pos",
                      help="subdirectory for pos tag files")
  parser.add_argument("--extwtdir", "-et", default=None,
                      help="directory of extracted tweet rsd files")
  parser.add_argument("--agiletokpath", default=os.path.join(scriptdir, 'agiletok.sh'),
                      help="path to agile tokenizer binary")
  parser.add_argument("--cdectokpath", default=os.path.join(scriptdir, 'cdectok.sh'),
                      help="path to cdec tokenizer binary")

  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))

  # local aliases for output subdirectory names; the ".lc" variants hold
  # lowercased versions of the corresponding tokenizations
  origoutdir=args.origsubdir
  tokoutdir=args.toksubdir
  morphtokoutdir=args.morphtoksubdir
  cdectokoutdir=args.cdectoksubdir
  agiletokoutdir=args.agiletoksubdir
  cdectoklcoutdir=args.cdectoksubdir+".lc"
  agiletoklcoutdir=args.agiletoksubdir+".lc"
  morphoutdir=args.morphsubdir
  posoutdir=args.possubdir
  agiletokpath = args.agiletokpath
  cdectokpath = args.cdectokpath
  # every subdirectory to create under outdir
  dirs = [origoutdir,
          tokoutdir,
          morphtokoutdir,
          cdectokoutdir,
          agiletokoutdir,
          cdectoklcoutdir,
          agiletoklcoutdir,
          morphoutdir,
          posoutdir]
  if args.nogarbage:
    garbageoutdir = None
  else:
    # garbage files are collected under the original (untokenized) dir
    garbageoutdir=os.path.join(origoutdir, args.garbagesubdir)
    dirs.append(garbageoutdir)

  for dir in dirs:
    fulldir = os.path.join(args.outdir, dir)
    lputil.mkdir_p(fulldir)
  # append (not overwrite) a provenance record: what was extracted, when,
  # and with what command line
  source_fh = open(os.path.join(args.outdir, "source"), 'a')
  source_fh.write("Extracted parallel data from %s to %s on %s\nusing %s;" \
                  " command issued from %s\n" % (args.rootdir, args.outdir,
                                                 datetime.datetime.now(),
                                                 ' '.join(sys.argv),
                                                 os.getcwd()))
  datadirs=[args.rootdir,]+args.datadirs

  '''
  from_eng/ -- manual translations from English into LRLP (elicitation,
  phrasebook, core REFLEX news text, additional news text)

  from_xxx/ -- manual translations from LRLP into English in multiple
  genres
  '''

  # name of corpus and location in lrlp (for cases that don't do anything special)
  corpustuples = [("fromsource.generic", os.path.join(*(datadirs+["from_%s" % args.src,]))),
                  ("fromtarget.news", os.path.join(*(datadirs+["from_%s" % args.trg, "news"]))),
                  ("fromtarget.phrasebook", os.path.join(*(datadirs+["from_%s" % args.trg, "phrasebook"]))),
                  ("fromtarget.elicitation", os.path.join(*(datadirs+["from_%s" % args.trg, "elicitation"])))]
  for corpustuple in corpustuples:
    printout(corpustuple[0], corpustuple[1],
             args.src, args.trg, args.outdir, origoutdir, garbageoutdir,
             tokoutdir, morphtokoutdir, cdectokoutdir, cdectoklcoutdir,
             agiletokoutdir, agiletoklcoutdir, morphoutdir, posoutdir,
             agiletokpath, cdectokpath)

  # Found data: searched from the whole root, with custom tuple/sentence
  # extraction callbacks
  printout("found.generic",
           args.rootdir, args.src, args.trg, args.outdir, origoutdir, garbageoutdir,
           tokoutdir, morphtokoutdir, cdectokoutdir, cdectoklcoutdir,
           agiletokoutdir, agiletoklcoutdir, morphoutdir, posoutdir,
           agiletokpath, cdectokpath,
           stp=lputil.all_found_tuples, el=lputil.get_aligned_sentences)

  # Tweet data: only processed when an extracted-tweet directory was given
  # and actually exists
  if args.extwtdir is not None and os.path.exists(args.extwtdir):
    # relocate the extracted tweet rsd files into the lrlp tree first
    move_extracted_tweet(os.path.join(*datadirs), args.src, args.extwtdir)
    printout("fromsource.tweet",
             os.path.join(*(datadirs+["from_%s" % args.src,])),
             args.src, args.trg, args.outdir, origoutdir, garbageoutdir,
             tokoutdir, morphtokoutdir, cdectokoutdir, cdectoklcoutdir,
             agiletokoutdir, agiletoklcoutdir, morphoutdir, posoutdir,
             agiletokpath, cdectokpath,
             tweet=True)
示例#8
0
def main(args):
    """Split extracted parallel data into category-based selections.

    Word-counts the English side of each corpus prefix, apportions the
    requested category sizes by proportion (with a per-category minimum),
    runs run_selection per prefix, then invokes categorize.py per
    language to distribute manifests into the split directories.
    Prefixes in docprefixes keep whole documents together; nodocprefixes
    are selected per segment.
    """
    indir = args.indir
    origsizes = args.sizes
    termfile = args.termfile

    # TODO: find these?
    # doc = keep full docs together  (can detect this by counting number of unique docs)
    docprefixes = [
        "fromsource.generic", "fromsource.tweet", "fromtarget.news",
        "found.generic"
    ]
    nodocprefixes = ["fromtarget.elicitation", "fromtarget.phrasebook"]

    if args.allperseg:
        # treat every prefix per-segment: no document grouping at all
        nodocprefixes.extend(docprefixes)
        docprefixes = []

    extractpath = os.path.join(indir, args.extractpath)
    # http://stackoverflow.com/questions/973473/getting-a-list-of-all-subdirectories-in-the-current-directory
    filetypes = [subdir for subdir in next(os.walk(extractpath))[1]]

    origpath = os.path.join(extractpath, 'original')
    outpath = os.path.join(indir, args.outdir)
    mkdir_p(outpath)
    sf_ann_doc_ids_file = None
    if 'setE' in args.categories:
        #  annotated SF docs goes to setE
        sf_ann_doc_ids_file = os.path.join(outpath, "sf.ann.doc.ids")
        sf_ann_dir = os.path.join(
            indir, '../expanded/lrlp/data/annotation/situation_frame/')
        scan_sf_ann_doc_ids(sf_ann_dir, sf_ann_doc_ids_file)

    # number of words in each file
    fullsizes = {}  # prefix -> word count of its English flat file
    adjsizes = {}   # prefix -> per-category sizes, scaled by proportion
    sizesum = 0.0
    # drop prefixes whose English manifest is missing or empty
    for preflist in [docprefixes, nodocprefixes]:
        for prefix in list(preflist):
            # don't deal with it more if there's nothing in the manifest
            manfile = os.path.join(extractpath, "%s.eng.manifest" % prefix)
            if (not os.path.exists(manfile)) or os.path.getsize(manfile) == 0:
                print("removing " + prefix)
                preflist.remove(prefix)
    # NOTE(review): paths below are interpolated into shell=True commands;
    # assumes trusted, whitespace-free paths
    for prefix in docprefixes + nodocprefixes:
        engfile = os.path.join(origpath, "%s.original.eng.flat" % prefix)
        prefsize = int(
            check_output("wc -w %s" % engfile,
                         shell=True).decode('utf8').strip().split(' ')[0])
        fullsizes[prefix] = prefsize
        sizesum += prefsize
    # adjust size split by proportion, with minimum
    for prefix in docprefixes + nodocprefixes:
        mult = fullsizes[prefix] / sizesum
        adjsizes[prefix] = [
            max(args.minimum, int(mult * x)) for x in origsizes
        ]
        print(prefix, adjsizes[prefix])
    # doc-based processing
    catlist = ' '.join(args.categories)
    for prefix in docprefixes:
        # real doc ids come from column 2 of the manifest
        idfile = os.path.join(outpath, "%s.ids" % prefix)
        manfile = os.path.join(extractpath, "%s.eng.manifest" % prefix)
        try:
            check_output("cut -f2 %s > %s" % (manfile, idfile),
                         stderr=STDOUT,
                         shell=True)
        except CalledProcessError as exc:
            print("Status : FAIL", exc.returncode, exc.output)
        engfile = os.path.join(origpath, "%s.original.eng.flat" % prefix)
        sizelist = ' '.join(map(str, adjsizes[prefix]))
        catfile = run_selection(prefix,
                                idfile,
                                engfile,
                                termfile,
                                catlist,
                                args.remainder,
                                sizelist,
                                filetypes,
                                args.language,
                                extractpath,
                                outpath,
                                args.devlstfile,
                                setElstfile=sf_ann_doc_ids_file)
        # distribute both the source-language and English manifests
        for i in (args.language, 'eng'):
            manifest = os.path.join(extractpath,
                                    "%s.%s.manifest" % (prefix, i))
            cmd = "%s/categorize.py -i %s -d %s -c %s -p %s" % (
                script_dir, manifest, idfile, catfile, outpath)
            print("Running " + cmd)
            check_output(cmd, stderr=STDOUT, shell=True)

    # nodoc-based processing
    for prefix in nodocprefixes:
        # no real doc ids; use line numbers (via seq) as fake ids
        idfile = os.path.join(outpath, "%s.fakeids" % prefix)
        try:
            mansize = int(
                check_output(
                    "wc -l %s" %
                    os.path.join(extractpath, "%s.eng.manifest" % prefix),
                    shell=True).decode('utf8').strip().split(' ')[0])
            check_output("seq %d > %s" % (mansize, idfile),
                         stderr=STDOUT,
                         shell=True)
        except CalledProcessError as exc:
            print("Status : FAIL", exc.returncode, exc.output)
        engfile = os.path.join(origpath, "%s.original.eng.flat" % prefix)
        sizelist = ' '.join(map(str, adjsizes[prefix]))
        catfile = run_selection(prefix, idfile, engfile, termfile, catlist,
                                args.remainder, sizelist, filetypes,
                                args.language, extractpath, outpath)
        for i in (args.language, 'eng'):
            manifest = os.path.join(extractpath,
                                    "%s.%s.manifest" % (prefix, i))
            cmd = "%s/categorize.py -i %s -d %s -c %s -p %s" % (
                script_dir, manifest, idfile, catfile, outpath)
            print("Running " + cmd)
            check_output(cmd, stderr=STDOUT, shell=True)

    # warning if entries not found in given dev list
    if args.devlstfile:
        devlst = set(open(args.devlstfile).read().split())
        all_docids = list()
        for prefix in docprefixes:
            all_docids += open(os.path.join(outpath, "%s.ids" %
                                            prefix)).read().split('\n')
        for i in devlst - set(all_docids):
            print("***Warning: docid not found: %s" % i)
示例#9
0
def main():
    """Re-apply previously generated category splits to extracted data.

    For each corpus prefix, copies its .cats file from --previous (or
    touches an empty one), regenerates the id file from the manifest,
    reruns the selection, and invokes categorize.py per language to
    distribute manifests into the split directories.
    """
    parser = argparse.ArgumentParser(
        description=
        "Make dataset selections for experimentation given previously generated categorization files",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--indir", "-i", help="location of parallel data")
    parser.add_argument("--language",
                        "-l",
                        help="source language three digit code")
    parser.add_argument(
        "--extractpath",
        "-e",
        default="filtered",
        help="location of extracted data (might want to use 'filtered')")
    parser.add_argument(
        "--remainder",
        "-r",
        default="train",
        help="remainder category. Should match previous remainder category")
    parser.add_argument("--previous",
                        "-p",
                        help="location of previous cat files")

    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))


#  reader = codecs.getreader('utf8')
#  writer = codecs.getwriter('utf8')
#  outfile = writer(args.outfile)

    indir = args.indir
    # TODO: find these?
    # doc = keep full docs together  (can detect this by counting number of unique docs)
    # TODO: re-add found.generic to docprefixes
    docprefixes = [
        "fromsource.generic", "fromsource.tweet", "fromtarget.news",
        "found.generic"
    ]
    nodocprefixes = ["fromtarget.elicitation", "fromtarget.phrasebook"]

    extractpath = os.path.join(indir, args.extractpath)

    #http://stackoverflow.com/questions/973473/getting-a-list-of-all-subdirectories-in-the-current-directory
    filetypes = [subdir for subdir in next(os.walk(extractpath))[1]]

    origpath = os.path.join(extractpath, 'original')
    outpath = os.path.join(indir, 'splits')
    mkdir_p(outpath)

    # drop prefixes whose English manifest is missing or empty
    for preflist in [docprefixes, nodocprefixes]:
        for prefix in list(preflist):
            # don't deal with it more if there's nothing in the manifest
            manfile = os.path.join(extractpath, "%s.eng.manifest" % prefix)
            if (not os.path.exists(manfile)) or os.path.getsize(manfile) == 0:
                print("removing " + prefix)
                preflist.remove(prefix)
    # doc-based processing
    # NOTE(review): paths below are interpolated into shell=True commands;
    # assumes trusted, whitespace-free paths
    for prefix in docprefixes:
        # real doc ids come from column 2 of the manifest
        idfile = os.path.join(outpath, "%s.ids" % prefix)
        manfile = os.path.join(extractpath, "%s.eng.manifest" % prefix)
        try:
            check_output("cut -f2 %s > %s" % (manfile, idfile),
                         stderr=STDOUT,
                         shell=True)
        except CalledProcessError as exc:
            print("Status : FAIL", exc.returncode, exc.output)
        # reuse the previous run's category assignments when available
        catfile = os.path.join(args.previous, "%s.cats" % prefix)
        newcatfile = os.path.join(outpath, os.path.basename(catfile))
        if os.path.exists(catfile):
            copy(catfile, newcatfile)
        else:
            touch(newcatfile)
        runselection(prefix, idfile, newcatfile, args.remainder, filetypes,
                     args.language, extractpath, outpath)
        # distribute both the source-language and English manifests
        for i in (args.language, 'eng'):
            manifest = os.path.join(extractpath,
                                    "%s.%s.manifest" % (prefix, i))
            cmd = "%s/categorize.py -i %s -d %s -c %s -p %s -r %s" % (
                scriptdir, manifest, idfile, newcatfile, outpath,
                args.remainder)
            print("Running " + cmd)
            check_output(cmd, stderr=STDOUT, shell=True)

    # nodoc-based processing

    for prefix in nodocprefixes:
        # no real doc ids; use line numbers (via seq) as fake ids
        idfile = os.path.join(outpath, "%s.fakeids" % prefix)
        try:
            mansize = int(
                check_output(
                    "wc -l %s" %
                    os.path.join(extractpath, "%s.eng.manifest" % prefix),
                    shell=True).decode('utf-8').strip().split(' ')[0])
            check_output("seq %d > %s" % (mansize, idfile),
                         stderr=STDOUT,
                         shell=True)
        except CalledProcessError as exc:
            print("Status : FAIL", exc.returncode, exc.output)
        catfile = os.path.join(args.previous, "%s.cats" % prefix)
        newcatfile = os.path.join(outpath, os.path.basename(catfile))
        if os.path.exists(catfile):
            copy(catfile, newcatfile)
        else:
            touch(newcatfile)
        runselection(prefix, idfile, newcatfile, args.remainder, filetypes,
                     args.language, extractpath, outpath)
        for i in (args.language, 'eng'):
            manifest = os.path.join(extractpath,
                                    "%s.%s.manifest" % (prefix, i))
            cmd = "%s/categorize.py -i %s -d %s -c %s -p %s -r %s" % (
                scriptdir, manifest, idfile, newcatfile, outpath,
                args.remainder)
            print("Running " + cmd)
            check_output(cmd, stderr=STDOUT, shell=True)
示例#10
0
def main():
  """Make dataset selections (e.g. dev/test/train) for experimentation.

  Word-counts each corpus prefix's English flat file, apportions the
  requested category sizes by proportion (respecting --minimum), runs
  runselection per prefix, then invokes categorize.py per language to
  distribute manifests into split directories.
  """
  parser = argparse.ArgumentParser(description="Make dataset selections for experimentation",
                                   formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument("--indir", "-i", help="location of parallel data")
  parser.add_argument("--language", "-l", help="source language three digit code")
  parser.add_argument("--extractpath", "-e", default="extracted", help="location of extracted data (might want to use 'filtered')")
  # type=int: without it a user-supplied value arrives as str and
  # max(args.minimum, ...) below raises TypeError on python 3
  parser.add_argument("--minimum", "-m", default=100, type=int, help="minimum number of words per subselection")
  parser.add_argument("--sizes", "-s", nargs='+', type=int, help="list of sizes desired in each category")
  parser.add_argument("--categories", "-c", nargs='+', help="list of categories. Must match sizes")
  parser.add_argument("--termfile", "-t", help="file of desired terms, one per line")
  parser.add_argument("--remainder", "-r", default="train", help="remainder category. Should be a new category")
  parser.add_argument("--devlstfile", "-d", default=None, help="file of desired documents for dev (subject to length constraints, must be a set called 'dev')")



  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))

#  reader = codecs.getreader('utf8')
#  writer = codecs.getwriter('utf8')
#  outfile = writer(args.outfile)

  indir = args.indir
  origsizes = args.sizes
  termfile = args.termfile

  # TODO: find these?
  # doc = keep full docs together  (can detect this by counting number of unique docs)
  docprefixes = ["fromsource.generic", "fromsource.tweet", "fromtarget.news", "found.generic"]
  nodocprefixes = ["fromtarget.elicitation", "fromtarget.phrasebook"]

  # TODO: find these
  filetypes = ["morph", "morph-tokenized", "original", "pos", "tokenized", "mttok", "mttoklc", "agile-tokenized", "cdec-tokenized", "agile-tokenized.lc", "cdec-tokenized.lc"]

  extractpath = os.path.join(indir, args.extractpath)
  origpath = os.path.join(extractpath, 'original')
  outpath = os.path.join(indir, 'splits')
  mkdir_p(outpath)

  # number of words in each file
  fullsizes = {}  # prefix -> word count of its English flat file
  adjsizes = {}   # prefix -> per-category sizes, scaled by proportion
  sizesum = 0.0
  # drop prefixes whose English manifest is missing or empty
  for preflist in [docprefixes, nodocprefixes]:
    for prefix in list(preflist):
      # don't deal with it more if there's nothing in the manifest
      manfile = os.path.join(extractpath, "%s.eng.manifest" % prefix)
      if (not os.path.exists(manfile)) or os.path.getsize(manfile) == 0:
        print("removing "+prefix)
        preflist.remove(prefix)
  # NOTE(review): paths below are interpolated into shell=True commands;
  # assumes trusted, whitespace-free paths
  for prefix in docprefixes+nodocprefixes:
    engfile=os.path.join(origpath, "%s.original.eng.flat" % prefix)
    prefsize = int(check_output("wc -w %s" % engfile, shell=True).decode('utf8').strip().split(' ')[0])
    fullsizes[prefix] = prefsize
    sizesum +=prefsize
  # adjust size split by proportion, with minimum
  for prefix in docprefixes+nodocprefixes:
    mult = fullsizes[prefix]/sizesum
    adjsizes[prefix] = [max(args.minimum, int(mult*x)) for x in origsizes]
    print(prefix,adjsizes[prefix])
  # doc-based processing
  catlist = ' '.join(args.categories)
  for prefix in docprefixes:
    # real doc ids come from column 2 of the manifest
    idfile = os.path.join(outpath, "%s.ids" % prefix)
    manfile = os.path.join(extractpath, "%s.eng.manifest" % prefix)
    try:
      check_output("cut -f2 %s > %s" % (manfile, idfile), stderr=STDOUT, shell=True)
    except CalledProcessError as exc:
      print("Status : FAIL", exc.returncode, exc.output)
    engfile=os.path.join(origpath, "%s.original.eng.flat" % prefix)
    sizelist = ' '.join(map(str, adjsizes[prefix]))
    catfile = runselection(prefix, idfile, engfile, termfile, catlist, args.remainder, sizelist, filetypes, args.language, extractpath, outpath, args.devlstfile)
    # distribute both the source-language and English manifests
    for i in (args.language, 'eng'):
      manifest = os.path.join(extractpath, "%s.%s.manifest" % (prefix, i))
      cmd = "%s/categorize.py -i %s -d %s -c %s -p %s" % (scriptdir, manifest, idfile, catfile, outpath)
      print("Running "+cmd)
      check_output(cmd, stderr=STDOUT, shell=True)

  # nodoc-based processing
  for prefix in nodocprefixes:
    # no real doc ids; use line numbers (via seq) as fake ids
    idfile = os.path.join(outpath, "%s.fakeids" % prefix)
    try:
      mansize = int(check_output("wc -l %s" % os.path.join(extractpath, "%s.eng.manifest" % prefix), shell=True).decode('utf8').strip().split(' ')[0])
      check_output("seq %d > %s" % (mansize, idfile), stderr=STDOUT, shell=True)
    except CalledProcessError as exc:
      print("Status : FAIL", exc.returncode, exc.output)
    engfile=os.path.join(origpath, "%s.original.eng.flat" % prefix)
    sizelist = ' '.join(map(str, adjsizes[prefix]))
    catfile = runselection(prefix, idfile, engfile, termfile, catlist, args.remainder, sizelist, filetypes, args.language, extractpath, outpath)
    for i in (args.language, 'eng'):
      manifest = os.path.join(extractpath, "%s.%s.manifest" % (prefix, i))
      cmd = "%s/categorize.py -i %s -d %s -c %s -p %s" % (scriptdir, manifest, idfile, catfile, outpath)
      print("Running "+cmd)
      check_output(cmd, stderr=STDOUT, shell=True)

  # warning if entries not found in given dev list
  if args.devlstfile:
    devlst = set(open(args.devlstfile).read().split())
    all_docids = list()
    for prefix in docprefixes:
      all_docids += open(os.path.join(outpath, "%s.ids" % prefix)).read().split('\n')
    for i in devlst - set(all_docids):
      print ("***Warning: docid not found: %s" % i)
示例#11
0
def main():
  """Build an eval IL monolingual release from LDC form to ELISA form.

  Wires up a fixed pipeline of Step objects (decrypt, mono extraction,
  tweet download, tweet-to-mono conversion, packaging), configures each
  step's arguments/streams from the command line, then runs the steps in
  the requested start..stop range.
  """
  steps = []

  # extract_mono.py
  steps.append(Step('decrypt_sets.py',
                    help="decode encrypted sets"))

  # extract_mono.py
  steps.append(Step('extract_mono.py',
                    help="get flat form mono data"))

  # get_tweet_by_id.rb
  steps.append(Step('get_tweet_by_id.rb',
                    help="download tweets. must have twitter gem installed " \
                    "and full internet",
                    abortOnFail=False))
  # extract_mono_tweet.py
  steps.append(Step('extract_mono_tweet.py',
                    help="make twitter data look like regular mono data"))

  steps.append(Step('make_mono_release.py',
                    help="package mono flat data"))

  # index steps by program name for per-step configuration below
  stepsbyname = {}
  for step in steps:
    stepsbyname[step.prog] = step

  parser = argparse.ArgumentParser(description="Build an eval IL monoset from LDC to elisa form",
                                   formatter_class= \
                                   argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument("--setdir", "-d", default='.',
                      help='name of set directory (i.e. set1, setE, etc.)')
  parser.add_argument("--language", "-l", default='uzb',
                      help='three letter code of IL language')
  parser.add_argument("--key", "-k", default=None,
                      help='decryption key for encrypted il')
  parser.add_argument("--notweets", "-n", action='store_true', default=None,
                      help='do not include tweets (for eval IL setE only)')
  parser.add_argument("--expdir", "-e",
                      help='path to where the extraction is. If starting at ' \
                      'step 0 this is ignored')
  parser.add_argument("--root", "-r", default='/home/nlg-02/LORELEI/ELISA/data',
                      help='path to where the extraction will take place')
  parser.add_argument("--outfile", "-o", help='name of the output file')
  parser.add_argument("--start", "-s", type=int, default=0,
                      help='step to start at')
  parser.add_argument("--stop", "-p", type=int, default=len(steps)-1,
                      help='step to stop at (inclusive)')
  parser.add_argument("--liststeps", "-x", nargs=0, action=make_action(steps),
                      help='print step list and exit')
  parser.add_argument("--ruby", default="/opt/local/bin/ruby2.2", help='path to ruby (2.1 or higher)')

  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))

  rootdir = args.root
  language = args.language

  setdir = args.setdir
  outdir = os.path.join(rootdir, language, setdir)
  outfile = os.path.join(outdir, args.outfile)
  start = args.start
  stop = args.stop + 1  # slice end is exclusive; --stop is inclusive

  if args.expdir is None:
    expdir = os.path.join(rootdir, language, 'expanded', 'lrlp')
  else:
    expdir = args.expdir

  mkdir_p(outdir)

  # DECRYPT: runs immediately (outside the main loop) when a key is given
  if args.key is None:
    stepsbyname["decrypt_sets.py"].disable()
  else:
    stepsbyname["decrypt_sets.py"].stderr=os.path.join(outdir, 'decrypt_sets.err')
    stepsbyname["decrypt_sets.py"].argstring="-r %s -k %s -s %s" % (expdir, args.key, setdir)
    stepsbyname["decrypt_sets.py"].run()
    start+=1  # already ran; skip it in the main loop below
  # TWEET
  if args.notweets:
    stepsbyname["get_tweet_by_id.rb"].disable()
    stepsbyname["extract_mono_tweet.py"].disable()
  else:
    tweetprogpath = os.path.join(expdir, 'set0', 'tools', 'twitter-processing', 'bin')

    stepsbyname["get_tweet_by_id.rb"].progpath = tweetprogpath
    tweetdir = os.path.join(outdir, 'tweet')
    stepsbyname["get_tweet_by_id.rb"].argstring = tweetdir+" -l "+language
    # tweet ids are fed on stdin from the set's twitter_info.tab, if present
    tweetintab = os.path.join(expdir, setdir, 'docs', 'twitter_info.tab')
    if os.path.exists(tweetintab):
      stepsbyname["get_tweet_by_id.rb"].stdin = tweetintab
    else:
      stepsbyname["get_tweet_by_id.rb"].disable()
    tweeterr = os.path.join(outdir, 'extract_tweet.err')
    stepsbyname["get_tweet_by_id.rb"].stderr = tweeterr
    stepsbyname["get_tweet_by_id.rb"].scriptbin = args.ruby

  # # TODO: log tweets!

  # MONO
  monoindirs = dirfind(os.path.join(expdir, setdir, 'data', 'monolingual_text'), "%s.ltf.zip" % setdir)
  monooutdir = os.path.join(outdir, 'mono', 'extracted')
  monoerr = os.path.join(outdir, 'extract_mono.err')
  stepsbyname["extract_mono.py"].argstring = "--nogarbage -i %s -o %s" % \
    (' '.join(monoindirs), monooutdir)
  stepsbyname["extract_mono.py"].stderr = monoerr


  # since we package and extract all at once, use the ltf structure to declare the manifest names
  manfiles = [x for x in map(lambda y: '.'.join(os.path.basename(y).split('.')[:-2]), monoindirs)]


  # tweet 2 mono set here so that mono and tweet dirs are already established
  if stepsbyname["get_tweet_by_id.rb"].disabled:
    stepsbyname["extract_mono_tweet.py"].disable()
  else:
    stepsbyname["extract_mono_tweet.py"].argstring = "--nogarbage -i "+tweetdir+" -o "+monooutdir
    stepsbyname["extract_mono_tweet.py"].stderr = os.path.join(outdir, 'extract_mono_tweet.err')
    manfiles.append("tweets")

  # PACKAGE
  monoxml = outfile
  monostatsfile = outfile+".stats"
  manarg = ' '.join(manfiles)
  monoerr = os.path.join(outdir, 'make_mono_release.err')
  stepsbyname["make_mono_release.py"].argstring = "-r %s -l %s -c %s -s %s | gzip > %s" % \
                                                  (monooutdir, language, manarg, monostatsfile, monoxml)
  stepsbyname["make_mono_release.py"].stderr = monoerr

  # run the configured pipeline over the requested (inclusive) step range
  for step in steps[start:stop]:
    step.run()

  print("Done.\nFile is %s" % outfile)
示例#12
0
def main():
    """Filter an extracted parallel data directory by length statistics.

    For each genre, pairs *.eng.manifest with *.<lang>.manifest and the
    matching flat files, computes per-line eng/<lang> token ratios, absolute
    token-count deltas, and a blackball() verdict, then splits every manifest
    and flat file between --filterdir and --remaindir via filterlines().
    A line must fall outside mean +/- --stds standard deviations on BOTH the
    ratio and the delta criteria to be rejected on length grounds.
    """
    parser = argparse.ArgumentParser(
        description="filter extracted parallel data directory",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--indir",
                        "-i",
                        default="./extracted",
                        help="input directory")
    # fix: help text was a copy-paste of --indir's ("input directory")
    parser.add_argument("--lang", "-l",
                        help="3 letter code of the non-English language")
    parser.add_argument(
        "--stds",
        "-s",
        type=int,
        default=1,
        help="number of standard deviations from mean to filter out")
    parser.add_argument("--filterdir",
                        "-f",
                        default="./filtered",
                        help="output filter directory")
    parser.add_argument(
        "--genre",
        "-g",
        default="original",
        help=
        "genre to use when filtering (could try tokenized but not available for twitter)"
    )
    parser.add_argument("--remaindir",
                        "-r",
                        default="./remainder",
                        help="output remainder directory")

    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    # crawl indir for expected original files. cat them together, save ratios, get mean and stdev
    # for each file, including manifest, zip with ratios, determine whether it belongs in filter or remaindir

    indir = args.indir
    filterdir = args.filterdir
    remaindir = args.remaindir
    mkdir_p(filterdir)
    mkdir_p(remaindir)

    # assumption: there are a number of *.eng.manifest files, each paired with *.<lang>.manifest, and for each i, there is original/i.eng.flat and original/i.<lang>.flat
    engmanifests = glob.glob(os.path.join(indir, "*.eng.manifest"))
    fmanifests = []
    ratios = dd(list)      # genre -> per-line eng/<lang> token ratios
    deltas = dd(list)      # genre -> per-line |#eng tokens - #<lang> tokens|
    blackballs = dd(list)  # genre -> per-line blackball() verdicts
    genres = set()
    for eman in engmanifests:
        ebase = os.path.basename(eman)
        # manifest names look like <genre>.<lang>.manifest; strip last two fields
        genre = '.'.join(ebase.split('.')[:-2])
        genres.add(genre)
        fman = os.path.join(os.path.dirname(eman),
                            "%s.%s.manifest" % (genre, args.lang))
        fmanifests.append(fman)
        eorig = os.path.join(args.indir, args.genre,
                             "%s.%s.eng.flat" % (genre, args.genre))
        forig = os.path.join(args.indir, args.genre,
                             "%s.%s.%s.flat" % (genre, args.genre, args.lang))
        # test existence
        for f in [eman, fman, eorig, forig]:
            if not os.path.exists(f):
                sys.stderr.write("ERROR: %s does not exist\n" % f)
                sys.exit(1)
        # slurp files, calculate ratios, store ratios
        # (eorig/forig are rebound from path strings to open file handles)
        eorig = prepfile(open(eorig, 'r'), 'r')
        forig = prepfile(open(forig, 'r'), 'r')
        # izip: py2-style name; presumably aliased to zip at module top -- confirm
        for ln, (eline, fline) in enumerate(izip(eorig, forig)):
            ewords = eline.strip().split()
            fwords = fline.strip().split()
            blackballs[genre].append(blackball(eline, fline))
            deltas[genre].append(abs(len(ewords) - len(fwords)))
            try:
                ratios[genre].append((len(ewords) + 0.0) / (len(fwords) + 0.0))
            except ZeroDivisionError:
                # empty foreign side: record a sentinel ratio of 0 so lists stay aligned
                sys.stderr.write(
                    "0-length foreign sentence at line {} of {}\n".format(
                        ln + 1, forig.name))
                ratios[genre].append(0.)

    # global statistics pooled across all genres
    allratios = np.concatenate(list(map(np.array, ratios.values())), 0)
    alldeltas = np.concatenate(list(map(np.array, deltas.values())), 0)
    allblackballs = np.concatenate(list(map(np.array, blackballs.values())), 0)
    bbrejectsize = Counter(allblackballs)[True]
    ratiomean = np.mean(allratios)
    ratiostd = np.std(allratios)
    lowratio = ratiomean - (args.stds * ratiostd)
    highratio = ratiomean + (args.stds * ratiostd)
    rejectratiosize = len(
        list(filter(lambda x: x < lowratio or x > highratio, allratios)))

    deltamean = np.mean(alldeltas)
    deltastd = np.std(alldeltas)
    lowdelta = deltamean - (args.stds * deltastd)
    highdelta = deltamean + (args.stds * deltastd)
    rejectdeltasize = len(
        list(filter(lambda x: x < lowdelta or x > highdelta, alldeltas)))

    sys.stderr.write(
        "Could be rejecting %d of %d lines (%f %%) with ratio below %f or above %f\n"
        % (rejectratiosize, len(allratios),
           100.0 * rejectratiosize / len(allratios), lowratio, highratio))
    sys.stderr.write(
        "Could be rejecting %d of %d lines (%f %%) with delta below %f or above %f\n"
        % (rejectdeltasize, len(alldeltas),
           100.0 * rejectdeltasize / len(alldeltas), lowdelta, highdelta))

    # a line is length-rejected only if it fails BOTH the ratio and the delta test
    reject_ratio_delta_size = len(
        list(
            filter(
                lambda x: (x[0] < lowratio or x[0] > highratio) and
                (x[1] < lowdelta or x[1] > highdelta),
                zip(allratios, alldeltas))))
    sys.stderr.write(
        "Actually rejecting %d of %d lines (%f %%) meeting both delta and ratio criteria\n"
        % (reject_ratio_delta_size, len(alldeltas),
           100.0 * reject_ratio_delta_size / len(alldeltas)))

    sys.stderr.write(
        "Also rejecting %d of %d lines (%f %%) for blackball criteria\n" %
        (bbrejectsize, len(allblackballs),
         100.0 * bbrejectsize / len(allblackballs)))
    # iterate through manifests and all files and filter per ratio and delta
    for manset in (engmanifests, fmanifests):
        for man in manset:
            sys.stderr.write("filtering %s\n" % man)
            base = os.path.basename(man)
            genre = '.'.join(base.split('.')[:-2])
            sys.stderr.write("genre %s\n" % genre)
            rats = ratios[genre]
            delts = deltas[genre]
            bbs = blackballs[genre]
            reject_ratio_delta_size = len(
                list(
                    filter(
                        lambda x: (x[0] < lowratio or x[0] > highratio) and
                        (x[1] < lowdelta or x[1] > highdelta),
                        zip(rats, delts))))
            #rejectratiosize = len(list(filter(lambda x: x<lowratio or x > highratio, rats)))
            sys.stderr.write("rejecting %d of %d\n" %
                             (reject_ratio_delta_size, len(rats)))
            infile = prepfile(open(man, 'r'), 'r')
            filterfile = prepfile(open(os.path.join(filterdir, base), 'w'),
                                  'w')
            remainfile = prepfile(open(os.path.join(remaindir, base), 'w'),
                                  'w')
            filterlines(infile, bbs, (rats, delts), (lowratio, lowdelta),
                        (highratio, highdelta), filterfile, remainfile)

    # for directories in extracted
    #http://stackoverflow.com/questions/973473/getting-a-list-of-all-subdirectories-in-the-current-directory
    for subdir in next(os.walk(indir))[1]:
        # make parallel directories
        # for genres in genre set
        # for languages
        # filter lines
        insubdir = os.path.join(indir, subdir)
        filtersubdir = os.path.join(filterdir, subdir)
        mkdir_p(filtersubdir)
        remainsubdir = os.path.join(remaindir, subdir)
        mkdir_p(remainsubdir)
        for genre in genres:
            for lang in (args.lang, 'eng'):
                base = "%s.%s.%s.flat" % (genre, subdir, lang)
                infilename = os.path.join(insubdir, base)
                if os.path.exists(infilename):
                    infile = prepfile(open(infilename, 'r'), 'r')
                    filterfile = prepfile(
                        open(os.path.join(filtersubdir, base), 'w'), 'w')
                    remainfile = prepfile(
                        open(os.path.join(remainsubdir, base), 'w'), 'w')
                    filterlines(infile, blackballs[genre],
                                (ratios[genre], deltas[genre]),
                                (lowratio, lowdelta), (highratio, highdelta),
                                filterfile, remainfile)
                else:
                    sys.stderr.write("%s does not exist\n" % infilename)

    # count files in each of the directories; should be the same
    # ('d', not 'dir': avoid shadowing the builtin)
    for d in (indir, filterdir, remaindir):
        sys.stderr.write("%d files in %s\n" % (countfiles(d), d))
示例#13
0
def main():
    """Extract parallel data from an expanded LRLP into flat files + manifests.

    Builds the output directory tree (original / tokenized / morph / pos
    variants, optional garbage subdir), appends a provenance record to an
    'source' file in --outdir, then calls printout() once per corpus:
    from-source generic, from-target news/phrasebook/elicitation, found
    data, and tweets.
    """
    parser = argparse.ArgumentParser(description="extract parallel data from " \
                                                 "expanded lrlp to flat files and manifests.")
    parser.add_argument("--rootdir", "-r", default=".",
                        help="root lrlp dir")
    parser.add_argument("--datadirs", nargs='+', default=['data', 'translation'],
                        help="elements in path from root to ltf files")
    parser.add_argument("--outdir", "-o", default="./parallel/extracted",
                        help="where to write extracted files")
    parser.add_argument("--src", "-s", default='uzb',
                        help="source language 3 letter code")
    parser.add_argument("--trg", "-t", default='eng',
                        help="target language 3 letter code")
    parser.add_argument("--origsubdir", default="raw.original",
                        help="subdirectory for untokenized files")
    parser.add_argument("--cleanorigsubdir", default="original",
                        help="subdirectory for cleaned raw original")
    parser.add_argument("--garbagesubdir", default="garbage",
                        help="subdirectory for garbage files (under orig)")
    parser.add_argument("--nogarbage", action='store_true', default=False,
                        help="turn off garbage filtering")
    parser.add_argument("--toksubdir", default="raw.tokenized",
                        help="subdirectory for ldc-tokenized but raw files")
    parser.add_argument("--cleantoksubdir", default="tokenized",
                        help="subdirectory for cleaned ldc-tokenized files")
    parser.add_argument("--cdectoksubdir", default="cdec-tokenized",
                        help="subdirectory for cdec-tokenized files")
    parser.add_argument("--agiletoksubdir", default="agile-tokenized",
                        help="subdirectory for agile-tokenized files (target side only)")
    parser.add_argument("--morphtoksubdir", default="morph-tokenized",
                        help="subdirectory for tokenized files based on " \
                             "morphological segmentation")
    parser.add_argument("--morphsubdir", default="morph",
                        help="subdirectory for morphological files")
    parser.add_argument("--possubdir", default="pos",
                        help="subdirectory for pos tag files")
    # parser.add_argument("--extwtdir", "-et", default=None,
    #                     help="directory of extracted tweet rsd files")
    parser.add_argument("--agiletokpath", default=os.path.join(scriptdir, 'agiletok.sh'),
                        help="path to agile tokenizer binary")
    parser.add_argument("--cdectokpath", default=os.path.join(scriptdir, 'cdectok.sh'),
                        help="path to cdec tokenizer binary")
    parser.add_argument("--cleanpath", default=os.path.join(scriptdir, 'clean.sh'),
                        help="path to cleaning script")
    addonoffarg(parser, 'cdec', help="do cdec tokenization", default=True)
    addonoffarg(parser, 'swap', help="swap source/translation in found file (il3=true, cmn=false)", default=False)

    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    # unpack subdir names into locals for readability below
    origoutdir = args.origsubdir
    cleanorigoutdir = args.cleanorigsubdir
    tokoutdir = args.toksubdir
    cleantokoutdir = args.cleantoksubdir
    morphtokoutdir = args.morphtoksubdir
    cdectokoutdir = args.cdectoksubdir
    agiletokoutdir = args.agiletoksubdir
    cdectoklcoutdir = args.cdectoksubdir + ".lc"
    agiletoklcoutdir = args.agiletoksubdir + ".lc"
    morphoutdir = args.morphsubdir
    posoutdir = args.possubdir
    agiletokpath = args.agiletokpath
    cdectokpath = args.cdectokpath
    cleanpath = args.cleanpath
    dirs = [origoutdir,
            cleanorigoutdir,
            tokoutdir,
            cleantokoutdir,
            morphtokoutdir,
            cdectokoutdir,
            agiletokoutdir,
            cdectoklcoutdir,
            agiletoklcoutdir,
            morphoutdir,
            posoutdir]
    if args.nogarbage:
        garbageoutdir = None
    else:
        garbageoutdir = os.path.join(origoutdir, args.garbagesubdir)
        dirs.append(garbageoutdir)

    # create the full output tree ('subdir', not 'dir': avoid shadowing the builtin)
    for subdir in dirs:
        fulldir = os.path.join(args.outdir, subdir)
        lputil.mkdir_p(fulldir)
    # record provenance; context manager ensures the handle is closed (was leaked)
    with open(os.path.join(args.outdir, "source"), 'a') as source_fh:
        source_fh.write("Extracted parallel data from %s to %s on %s\nusing %s;" \
                        " command issued from %s\n" % (args.rootdir, args.outdir,
                                                       datetime.datetime.now(),
                                                       ' '.join(sys.argv),
                                                       os.getcwd()))
    datadirs = [args.rootdir, ] + args.datadirs

    '''
    from_eng/ -- manual translations from English into LRLP (elicitation,
    phrasebook, core REFLEX news text, additional news text)
  
    from_xxx/ -- manual translations from LRLP into English in multiple
    genres
    '''

    # name of corpus and location in lrlp (for cases that don't do anything special)
    corpustuples = [("fromsource.generic", os.path.join(*(datadirs + ["from_%s" % args.src, ]))),
                    ("fromtarget.news", os.path.join(*(datadirs + ["from_%s" % args.trg, "news"]))),
                    ("fromtarget.phrasebook", os.path.join(*(datadirs + ["from_%s" % args.trg, "phrasebook"]))),
                    ("fromtarget.elicitation", os.path.join(*(datadirs + ["from_%s" % args.trg, "elicitation"])))
                    ]
    # shared positional arguments for every printout() call below
    commonargs = [args.src, args.trg, args.outdir, origoutdir, cleanorigoutdir, garbageoutdir,
                  tokoutdir, cleantokoutdir, morphtokoutdir, cdectokoutdir, cdectoklcoutdir,
                  agiletokoutdir, agiletoklcoutdir, morphoutdir, posoutdir,
                  agiletokpath, cdectokpath, cleanpath, args.cdec]
    for corpustuple in corpustuples:
        printout(corpustuple[0], corpustuple[1], *commonargs)

    # Found data
    printout("found.generic", args.rootdir, *commonargs,
             stp=lputil.all_found_tuples, el=lputil.get_aligned_sentences, swap=args.swap)

    # # Tweet data
    printout("fromsource.tweet",
             os.path.join(*(datadirs + ["from_%s" % args.src, ])), *commonargs,
             tweet=True)
示例#14
0
def main():
    """Build an eval IL monoset from LDC form into elisa form.

    Assembles a pipeline of Step objects (decrypt, tweet download, LDC
    tokenization, mono extraction, packaging), wires each step's argstring
    and stderr path from the command line, then runs steps[start:stop].
    With --engset the extract/package steps are replaced by per-flavor
    (<language> and eng) copies.
    """
    steps = []

    # extract_mono.py
    steps.append(Step('decrypt_sets.py', help="decode encrypted sets"))

    # get_tweet_by_id.rb
    steps.append(Step('get_tweet_by_id.rb',
                      help="download tweets. must have twitter gem installed " \
                      "and full internet",
                      abortOnFail=False))

    steps.append(
        Step('ldc_tok.py',
             help="run ldc tokenizer on tweets ",
             abortOnFail=False))

    # extract_mono.py
    steps.append(Step('extract_mono.py', help="get flat form mono data"))

    steps.append(Step('make_mono_release.py', help="package mono flat data"))

    # index steps by program name for wiring below
    stepsbyname = {}
    for step in steps:
        stepsbyname[step.prog] = step

    parser = argparse.ArgumentParser(description="Build an eval IL monoset from LDC to elisa form",
                                     formatter_class= \
                                     argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--setdir",
                        "-d",
                        default='.',
                        help='name of set directory (i.e. set1, setE, etc.)')
    parser.add_argument("--language",
                        "-l",
                        default='uzb',
                        help='three letter code of IL language')
    parser.add_argument("--key",
                        "-k",
                        default=None,
                        help='decryption key for encrypted il')
    parser.add_argument("--notweets",
                        "-n",
                        action='store_true',
                        default=None,
                        help='do not include tweets (for eval IL setE only)')
    parser.add_argument("--engset",
                        "-E",
                        action='store_true',
                        default=None,
                        help='assume engset and ilset (for eval IL setE only)')
    parser.add_argument("--expdir", "-e",
                        help='path to where the extraction is. If starting at ' \
                        'step 0 this is ignored')
    parser.add_argument("--root",
                        "-r",
                        default='/home/nlg-02/LORELEI/ELISA/data',
                        help='path to where the extraction will take place')
    parser.add_argument("--outfile", "-o", help='name of the output file')
    parser.add_argument("--start",
                        "-s",
                        type=int,
                        default=0,
                        help='step to start at')
    parser.add_argument("--stop",
                        "-p",
                        type=int,
                        default=len(steps) - 1,
                        help='step to stop at (inclusive)')
    parser.add_argument("--liststeps",
                        "-x",
                        nargs=0,
                        action=make_action(steps),
                        help='print step list and exit')
    parser.add_argument("--ruby",
                        default="/opt/local/bin/ruby2.2",
                        help='path to ruby (2.1 or higher)')

    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    rootdir = args.root
    language = args.language

    setdir = args.setdir
    outdir = os.path.join(rootdir, language, setdir)
    outfile = os.path.join(outdir, args.outfile)
    start = args.start
    stop = args.stop + 1  # make --stop inclusive

    if args.engset:
        # replace the single extract/package steps with per-flavor copies
        # NOTE(review): steps were registered under step.prog above; popping by
        # .name assumes Step.name defaults to prog -- confirm in Step
        emstep = steps.pop(-1)
        stepsbyname.pop(emstep.name)
        mmrstep = steps.pop(-1)
        stepsbyname.pop(mmrstep.name)
        for flavor in (language, "eng"):
            newem = Step('extract_mono.py',
                         name='extract_mono_%s' % flavor,
                         help="get flat form mono data in %s" % flavor)
            steps.append(newem)
            stepsbyname[newem.name] = newem
            newmmr = Step('make_mono_release.py',
                          name='make_mono_release_%s' % flavor,
                          help="package mono flat data in %s" % flavor)
            steps.append(newmmr)
            stepsbyname[newmmr.name] = newmmr
        # two steps became four; extend the run window accordingly
        stop += 2

    if args.expdir is None:
        expdir = os.path.join(rootdir, language, 'expanded', 'lrlp')
    else:
        expdir = args.expdir

    mkdir_p(outdir)

    if args.key is None:
        stepsbyname["decrypt_sets.py"].disable()
    else:
        # decryption must happen before anything else, so run it eagerly here
        stepsbyname["decrypt_sets.py"].stderr = os.path.join(
            outdir, 'decrypt_sets.err')
        stepsbyname["decrypt_sets.py"].argstring = "-r %s -k %s -s %s" % (
            expdir, args.key, setdir)
        stepsbyname["decrypt_sets.py"].run()
        start += 1
    # from 2018 on, setE has il and eng variants
    monoindirs = []

    # TWEET
    # hack for tweets; early set of monodir
    monodir = os.path.join(expdir, setdir, 'data', 'monolingual_text')
    tweetintab = os.path.join(expdir, setdir, 'docs', 'twitter_info.tab')
    # True = tweets handled separately, so strip them from the regular mono pass
    notweetsinmono = True
    if args.notweets or not os.path.exists(tweetintab):
        print("disabling twitter stuff; tweets in regular mono ok")
        notweetsinmono = False
        stepsbyname["get_tweet_by_id.rb"].disable()
        stepsbyname["ldc_tok.py"].disable()
    else:
        print(
            "not disabling twitter stuff; look at {}; avoiding tweets in regular mono"
            .format(tweetintab))
        stepsbyname["get_tweet_by_id.rb"].stdin = tweetintab
        tweetprogpaths = []
        #    for toolroot in (os.path.join(expdir, 'set0'), scriptdir): # bad ldc tools for eval
        for toolroot in (scriptdir, ):
            tweetprogpaths = dirfind(os.path.join(toolroot, 'tools'),
                                     'get_tweet_by_id.rb')
            if len(tweetprogpaths) > 0:
                break
        if len(tweetprogpaths) == 0:
            sys.stderr.write("Can't find get_tweet_by_id.rb\n")
            sys.exit(1)
        else:
            tweetprogpath = os.path.dirname(tweetprogpaths[0])
        tweetdir = os.path.join(outdir, 'tweet', 'rsd')

        stepsbyname["get_tweet_by_id.rb"].progpath = tweetprogpath
        mkdir_p(tweetdir)
        stepsbyname[
            "get_tweet_by_id.rb"].argstring = tweetdir + " -l " + language

        tweeterr = os.path.join(outdir, 'extract_tweet.err')
        stepsbyname["get_tweet_by_id.rb"].stderr = tweeterr
        stepsbyname["get_tweet_by_id.rb"].scriptbin = args.ruby

        # TOKENIZE AND RELOCATE TWEETS
        # find rb location, params file
        # fix: was 'toxexecpaths' (typo), which left this fallback init dead
        tokexecpaths = []
        thetoolroot = None
        for toolroot in (expdir, scriptdir):
            tokexecpaths = dirfind(os.path.join(toolroot, 'tools'),
                                   'token_parse.rb')
            if len(tokexecpaths) > 0:
                thetoolroot = toolroot
                break
        if len(tokexecpaths) == 0:
            sys.stderr.write("Can't find token_parse.rb\n")
            sys.exit(1)
        tokexec = tokexecpaths[0]
        tokparamopts = dirfind(os.path.join(thetoolroot, 'tools'), 'yaml')
        tokparam = "--param {}".format(
            tokparamopts[0]) if len(tokparamopts) > 0 else ""
        # ugly: the base of the file monodir/mononame.zip; need to add it to monoindirs and just pass that base so it gets constructed
        mononame = "tweets.ltf"
        monoindirs.append(os.path.join(monodir, mononame + ".zip"))
        stepsbyname[
            "ldc_tok.py"].argstring = "--mononame {mononame} -m {monodir} --ruby {ruby} --dldir {tweetdir} --exec {tokexec} {tokparam} --outfile {outfile}".format(
                mononame=mononame,
                monodir=monodir,
                ruby=args.ruby,
                tweetdir=tweetdir,
                tokexec=tokexec,
                tokparam=tokparam,
                outfile=os.path.join(rootdir, language, 'ldc_tok.stats'))
        stepsbyname["ldc_tok.py"].stderr = os.path.join(
            rootdir, language, 'ldc_tok.err')

    # # TODO: log tweets!

    # MONO

    if args.engset:
        for flavor in (args.language, "eng"):
            localmonoindirs = copy.deepcopy(monoindirs)
            monodir = os.path.join(expdir, setdir, 'data', 'monolingual_text',
                                   flavor)
            localmonoindirs.extend(
                dirfind(monodir, "%s_%s.ltf.zip" % (setdir, flavor)))
            print(localmonoindirs)
            # JM: TODO: ugly copy. refactor!!!

            monooutdir = os.path.join(outdir, 'mono', 'extracted_%s' % flavor)
            monoerr = os.path.join(outdir, 'extract_mono_%s.err' % flavor)
            stepsbyname["extract_mono_%s" % flavor].argstring = "--no-cdec --nogarbage -i %s -o %s" % \
              (' '.join(localmonoindirs), monooutdir)
            if notweetsinmono:
                stepsbyname["extract_mono_%s" %
                            flavor].argstring += " --removesn"
            stepsbyname["extract_mono_%s" % flavor].stderr = monoerr

            # since we package and extract all at once, use the ltf structure to declare the manifest names
            manfiles = [
                x for x in map(
                    lambda y: '.'.join(os.path.basename(y).split('.')[:-2]),
                    localmonoindirs)
            ]

            # tweet 2 mono set here so that mono and tweet dirs are already established
            # if stepsbyname["get_tweet_by_id.rb"].disabled:
            #   stepsbyname["extract_mono_tweet.py"].disable()
            # else:
            #   stepsbyname["extract_mono_tweet.py"].argstring = "--nogarbage -i "+tweetdir+" -o "+monooutdir
            #   stepsbyname["extract_mono_tweet.py"].stderr = os.path.join(outdir, 'extract_mono_tweet.err')
            #   manfiles.append("tweets")

            # splice the flavor into the outfile name before its last extension
            ofcomponents = outfile.split('.')
            localoutfile = '.'.join(
                ofcomponents[:-1]) + (".%s." % flavor) + ofcomponents[-1]
            print(localoutfile)

            # PACKAGE
            monoxml = localoutfile
            monostatsfile = localoutfile + ".stats"
            manarg = ' '.join(manfiles)
            monoerr = os.path.join(outdir, 'make_mono_release_%s.err' % flavor)
            stepsbyname["make_mono_release_%s" % flavor].argstring = "--no-ext -r %s -l %s -c %s -s %s | gzip > %s" % \
                                                            (monooutdir, flavor, manarg, monostatsfile, monoxml)
            stepsbyname["make_mono_release_%s" % flavor].stderr = monoerr

    else:
        monodir = os.path.join(expdir, setdir, 'data', 'monolingual_text')
        monoindirs.extend(dirfind(monodir, "%s.ltf.zip" % setdir))

        monooutdir = os.path.join(outdir, 'mono', 'extracted')
        monoerr = os.path.join(outdir, 'extract_mono.err')
        stepsbyname["extract_mono.py"].argstring = "--no-cdec --nogarbage -i %s -o %s" % \
          (' '.join(monoindirs), monooutdir)
        if notweetsinmono:
            stepsbyname["extract_mono.py"].argstring += " --removesn"
        stepsbyname["extract_mono.py"].stderr = monoerr

        # since we package and extract all at once, use the ltf structure to declare the manifest names
        manfiles = [
            x for x in map(
                lambda y: '.'.join(os.path.basename(y).split('.')[:-2]),
                monoindirs)
        ]

        # tweet 2 mono set here so that mono and tweet dirs are already established
        # if stepsbyname["get_tweet_by_id.rb"].disabled:
        #   stepsbyname["extract_mono_tweet.py"].disable()
        # else:
        #   stepsbyname["extract_mono_tweet.py"].argstring = "--nogarbage -i "+tweetdir+" -o "+monooutdir
        #   stepsbyname["extract_mono_tweet.py"].stderr = os.path.join(outdir, 'extract_mono_tweet.err')
        #   manfiles.append("tweets")

        # PACKAGE
        monoxml = outfile
        monostatsfile = outfile + ".stats"
        manarg = ' '.join(manfiles)
        monoerr = os.path.join(outdir, 'make_mono_release.err')
        stepsbyname["make_mono_release.py"].argstring = "--no-ext -r %s -l %s -c %s -s %s | gzip > %s" % \
                                                        (monooutdir, language, manarg, monostatsfile, monoxml)
        stepsbyname["make_mono_release.py"].stderr = monoerr

    # run the configured window of the pipeline
    for step in steps[start:stop]:
        step.run()

    print("Done.\nLast file is %s" % outfile)
示例#15
0
def main():
    """Deterministic subselect (Nov 2016 Uyghur eval style).

    Splits extracted parallel data into categories (--categories/--sizes,
    remainder going to --remainder), keeping whole documents together for
    doc-based prefixes and working per-line for nodoc prefixes.  Assignment
    runs from the end of the data unless --fromFront.  Delegates the actual
    selection to runselection() and manifest categorization to the external
    categorize.py script.
    """
    parser = argparse.ArgumentParser(
        description="Deterministic subselect designed for nov 2016 uyghur evaluation: per-doc, from end",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("--indir", "-i", help="location of parallel data")
    parser.add_argument("--language", "-l", help="source language three digit code")
    parser.add_argument(
        "--extractpath", "-e", default="extracted", help="location of extracted data (might want to use 'filtered')"
    )
    parser.add_argument("--sizes", "-s", nargs="+", type=int, help="list of sizes desired in each category")
    parser.add_argument("--categories", "-c", nargs="+", help="list of categories. Must match sizes")
    parser.add_argument("--remainder", "-r", default="train", help="remainder category. Should be a new category")
    parser.add_argument(
        "--devlstfile",
        "-d",
        default=None,
        help="file of desired documents for dev (subject to length constraints, must be a set called 'dev')",
    )
    addonoffarg(parser, "fromFront", default=False, help="do doc assignment from the beginning (instead of the end)")

    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    #  reader = codecs.getreader('utf8')
    #  writer = codecs.getwriter('utf8')
    #  outfile = writer(args.outfile)

    indir = args.indir
    origsizes = args.sizes

    # TODO: find these?
    # doc = keep full docs together  (can detect this by counting number of unique docs)
    # TODO: re-add found.generic to docprefixes
    docprefixes = ["fromsource.generic", "fromsource.tweet", "fromtarget.news"]
    # IL3: moving found.generic!!
    nodocprefixes = ["fromtarget.elicitation", "fromtarget.phrasebook", "found.generic"]

    # TODO: find these
    # file-type subdirectories expected under the extraction path
    filetypes = [
        "morph",
        "morph-tokenized",
        "original",
        "pos",
        "tokenized",
        "mttok",
        "mttoklc",
        "agile-tokenized",
        "cdec-tokenized",
        "agile-tokenized.lc",
        "cdec-tokenized.lc",
    ]

    extractpath = os.path.join(indir, args.extractpath)
    origpath = os.path.join(extractpath, "original")
    outpath = os.path.join(indir, "splits")
    mkdir_p(outpath)

    # drop prefixes whose eng manifest is missing or empty before doing any work
    for preflist in [docprefixes, nodocprefixes]:
        # iterate over a copy (list(...)) so we can remove from preflist safely
        for prefix in list(preflist):
            # don't deal with it more if there's nothing in the manifest
            manfile = os.path.join(extractpath, "%s.eng.manifest" % prefix)
            if (not os.path.exists(manfile)) or os.path.getsize(manfile) == 0:
                print("removing " + prefix)
                preflist.remove(prefix)
    # doc-based processing

    for prefix in docprefixes:
        idfile = os.path.join(outpath, "%s.ids" % prefix)
        manfile = os.path.join(extractpath, "%s.eng.manifest" % prefix)
        # extract doc ids (manifest column 2) into idfile
        # NOTE(review): shell=True with interpolated paths -- fine for trusted
        # local use, but breaks/injects if paths contain spaces or metacharacters
        try:
            check_output("cut -f2 %s > %s" % (manfile, idfile), stderr=STDOUT, shell=True)
        except CalledProcessError as exc:
            # best-effort: report and continue; downstream steps may still fail
            print("Status : FAIL", exc.returncode, exc.output)
        # catfile presumably maps ids to categories -- confirm in runselection()
        catfile = runselection(
            prefix,
            idfile,
            args.categories,
            args.remainder,
            origsizes,
            filetypes,
            args.language,
            extractpath,
            outpath,
            args.devlstfile,
            fromFront=args.fromFront,
        )
        # categorize both language sides' manifests with the same id/category files
        for i in (args.language, "eng"):
            manifest = os.path.join(extractpath, "%s.%s.manifest" % (prefix, i))
            cmd = "%s/categorize.py -i %s -d %s -c %s -p %s -r %s" % (
                scriptdir,
                manifest,
                idfile,
                catfile,
                outpath,
                args.remainder,
            )
            print("Running " + cmd)
            check_output(cmd, stderr=STDOUT, shell=True)

    # nodoc-based processing
    for prefix in nodocprefixes:
        # no real doc ids here: synthesize line numbers (1..N) as fake ids
        idfile = os.path.join(outpath, "%s.fakeids" % prefix)
        try:
            mansize = int(
                check_output("wc -l %s" % os.path.join(extractpath, "%s.eng.manifest" % prefix), shell=True)
                .decode("utf8")
                .strip()
                .split(" ")[0]
            )
            check_output("seq %d > %s" % (mansize, idfile), stderr=STDOUT, shell=True)
        except CalledProcessError as exc:
            # best-effort: report and continue; downstream steps may still fail
            print("Status : FAIL", exc.returncode, exc.output)
        # note: no devlstfile here -- dev doc selection only applies to doc-based prefixes
        catfile = runselection(
            prefix,
            idfile,
            args.categories,
            args.remainder,
            origsizes,
            filetypes,
            args.language,
            extractpath,
            outpath,
            fromFront=args.fromFront,
        )
        for i in (args.language, "eng"):
            manifest = os.path.join(extractpath, "%s.%s.manifest" % (prefix, i))
            cmd = "%s/categorize.py -i %s -d %s -c %s -p %s -r %s" % (
                scriptdir,
                manifest,
                idfile,
                catfile,
                outpath,
                args.remainder,
            )
            print("Running " + cmd)
            check_output(cmd, stderr=STDOUT, shell=True)

    # warning if entries not found in given dev list
    if args.devlstfile:
        devlst = set(open(args.devlstfile).read().split())
        all_docids = list()
        for prefix in docprefixes:
            all_docids += open(os.path.join(outpath, "%s.ids" % prefix)).read().split("\n")
        for i in devlst - set(all_docids):
            print("***Warning: docid not found: %s" % i)
示例#16
0
def main():
    """Drive the LRLP-to-flat extraction pipeline.

    Builds an ordered list of Step objects (one per helper script),
    parses command-line options, patches each step's argument string,
    stdin, and stderr destination based on what the expanded LRLP
    package actually contains, then runs the steps from --start
    through --stop (inclusive).
    """
    steps = []
    # Put additional steps in here. Arguments, stdin/stdout, etc. get set below

    # unpack_lrlp.sh
    steps.append(
        Step('unpack_lrlp.sh',
             call=check_output,
             help="untars lrlp into position for further processing"))

    # gather_ephemera.py
    steps.append(
        Step('gather_ephemera.py', help="relocates assorted bits from lrlp"))

    # extract_lexicon.py
    steps.append(
        Step('extract_lexicon.py',
             help="get flat form of bilingual lexicon",
             abortOnFail=False))

    # clean_lexicon
    steps.append(
        Step('clean.sh',
             name="clean_lexicon",
             help="wildeclean/nfkc lexicon file",
             abortOnFail=False))

    # normalize_lexicon.py
    steps.append(
        Step(
            'normalize_lexicon_tg.py',
            name="normalize_lexicon.py",
            help=
            "heuristically convert lexicon into something more machine readable",
            abortOnFail=False))

    # relocate lexicon
    steps.append(
        Step('cp',
             progpath='/bin',
             name="relocate_lexicon",
             help="move the lexicon stuff into ephemera",
             abortOnFail=False))

    # get_tweet_by_id.rb
    steps.append(Step('get_tweet_by_id.rb',
                      help="download tweets. must have twitter gem installed " \
                      "and full internet",
                      abortOnFail=False))

    steps.append(
        Step('ldc_tok.py',
             help="run ldc tokenizer on tweets ",
             abortOnFail=False))

    # extract_psm_annotation.py
    steps.append(
        Step('extract_psm_annotation.py',
             help="get annotations from psm files into psm.ann",
             abortOnFail=False))

    # extract_entity_annotation.py
    steps.append(
        Step('extract_entity_annotation.py',
             help="get entity and other annotations into entity.ann",
             abortOnFail=False))

    # extract_parallel.py
    steps.append(
        Step('extract_parallel.py', help="get flat form parallel data"))

    steps.append(
        Step('filter_parallel.py',
             help="filter parallel data to remove likely mismatches"))

    # extract_mono.py
    steps.append(Step('extract_mono.py', help="get flat form mono data"))

    # extract_comparable.py
    steps.append(
        Step('extract_comparable.py', help="get flat form comparable data"))

    # index steps by name so the patch-up code below can address them
    stepsbyname = {}
    for step in steps:
        stepsbyname[step.name] = step

    parser = argparse.ArgumentParser(description="Process a LRLP into flat format",
                                     formatter_class= \
                                     argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "--tarball",
        "-t",
        nargs='+',
        required=True,
        help=
        'path to gzipped tars for processing (all tars considered to be part of the same package). Ex: lrlp.tar.gz'
    )
    parser.add_argument("--language",
                        "-l",
                        required=True,
                        help='three letter code of language. example "uzb"')
    parser.add_argument(
        "--lexversion",
        "-L",
        default='1.5',
        help='version of lexicon to extract (may need to create a new one)')

    parser.add_argument("--key",
                        "-k",
                        default=None,
                        help='decryption key for encrypted il')
    parser.add_argument("--set",
                        "-S",
                        default=None,
                        help='decryption set for encrypted il')
    addonoffarg(parser, "mono", help="extract mono data", default=True)
    parser.add_argument(
        "--previous",
        default=None,
        help=
        'path to previous extraction (equivalent to one level down from root)')

    parser.add_argument("--root",
                        "-r",
                        default='/home/nlg-02/LORELEI/ELISA/data',
                        help='path to where the extraction will take place')
    parser.add_argument("--evalil",
                        "-E",
                        action='store_true',
                        default=False,
                        help='this is an eval il. makes expdir set0 aware')
    parser.add_argument("--expdir", "-e",
                        help='path to where the extraction is (equivalent to root/lang/expanded/lrlp). If starting at ' \
                        'step 0 this is ignored')
    parser.add_argument("--start",
                        "-s",
                        type=int,
                        default=0,
                        help='step to start at')
    parser.add_argument("--stop",
                        "-p",
                        type=int,
                        default=len(steps) - 1,
                        help='step to stop at (inclusive)')
    # nargs=0 + make_action: the option consumes no arguments and the custom
    # action prints the step list and exits
    parser.add_argument("--liststeps",
                        "-x",
                        nargs=0,
                        action=make_action(steps),
                        help='print step list and exit')
    parser.add_argument("--ruby",
                        default="ruby",
                        help='path to ruby (2.1 or higher)')
    addonoffarg(parser,
                "swap",
                help="swap source/target in found data (e.g. il3)",
                default=False)
    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))
        sys.exit(2)

    if args.expdir is not None and args.start <= 0:
        sys.stderr.write \
          ("Warning: expdir is set but will be ignored and determined dynamically")
    if args.expdir is None and args.start > 0:
        sys.stderr.write \
          ("Error: must explicitly set expdir if not starting at step 0")
        sys.exit(1)

    rootdir = args.root
    language = args.language
    start = args.start
    # --stop is inclusive; +1 so the steps[start:stop] slice below includes it
    stop = args.stop + 1
    # xor: decryption key and set must be supplied together or not at all
    if (args.key is None) ^ (args.set is None):
        sys.stderr.write("key (-k) and set (-S) must both be set or unset\n")
        sys.exit(1)
    # Patchups for step 0
    argstring = "-k %s -s %s" % (args.key,
                                 args.set) if args.key is not None else ""
    argstring += " -l %s -r %s %s" % (language, rootdir, ' '.join(
        args.tarball))
    sys.stderr.write("args for unpack lrlp are {}\n".format(argstring))
    stepsbyname["unpack_lrlp.sh"].argstring = argstring

    if start == 0:
        # step 0 prints the expansion directory on stdout; capture it
        expdir = steps[0].run().strip().decode("utf-8")
        if args.evalil:
            expdir = os.path.join(expdir, 'set0')
        start += 1
    else:
        expdir = args.expdir
    monodir = os.path.join(expdir, 'data', 'monolingual_text')
    # what are the mono files? (needed for later)
    if args.mono and args.previous is None:
        monoindirs = dirfind(monodir, "ltf.zip")
    else:
        monoindirs = []
    # Patchups for the rest
    if stop > 0:
        # TWEET
        tweetintab = os.path.join(expdir, 'docs', 'twitter_info.tab')
        tweetdir = os.path.join(rootdir, language, 'tweet', 'rsd')
        if not os.path.exists(tweetintab):
            # no tweet manifest in this package: skip both tweet steps
            stepsbyname["get_tweet_by_id.rb"].disable()
            stepsbyname["ldc_tok.py"].disable()
        else:
            tweetprogpaths = []
            #for toolroot in (expdir, scriptdir): # bad ldc tools for eval
            for toolroot in (scriptdir, ):
                tweetprogpaths = dirfind(os.path.join(toolroot, 'tools'),
                                         'get_tweet_by_id.rb')
                if len(tweetprogpaths) > 0:
                    break
            if len(tweetprogpaths) == 0:
                sys.stderr.write("Can't find get_tweet_by_id.rb\n")
                sys.exit(1)
            else:
                tweetprogpath = os.path.dirname(tweetprogpaths[0])
            mkdir_p(tweetdir)
            tweeterr = os.path.join(rootdir, language, 'extract_tweet.err')
            stepsbyname["get_tweet_by_id.rb"].stderr = tweeterr

            # just copy from previous or skip if no mono
            if not args.mono:
                if args.previous is None:
                    stepsbyname["get_tweet_by_id.rb"].disable()
                else:
                    # repurpose the step as a plain 'cp -r' from the previous
                    # extraction instead of re-downloading tweets
                    oldtweetdir = os.path.join(
                        args.previous, 'tweet', 'rsd'
                    )  #WARNING: old versions of data won't have this structure
                    stepsbyname["get_tweet_by_id.rb"].progpath = "/bin"
                    stepsbyname["get_tweet_by_id.rb"].prog = "cp"
                    stepsbyname[
                        "get_tweet_by_id.rb"].argstring = "-r {} {}".format(
                            oldtweetdir, tweetdir)
            else:
                stepsbyname["get_tweet_by_id.rb"].progpath = tweetprogpath
                stepsbyname[
                    "get_tweet_by_id.rb"].argstring = tweetdir + " -l " + language
                stepsbyname["get_tweet_by_id.rb"].scriptbin = args.ruby
                if os.path.exists(tweetintab):
                    stepsbyname["get_tweet_by_id.rb"].stdin = tweetintab
                else:
                    stepsbyname["get_tweet_by_id.rb"].disable()

            # TOKENIZE AND RELOCATE TWEETS
            # find rb location, params file
            # NOTE(review): 'toxexecpaths' looks like a typo for
            # 'tokexecpaths'; harmless because the loop below always assigns
            # tokexecpaths (the tuple is non-empty), but should be cleaned up
            toxexecpaths = []
            thetoolroot = None
            for toolroot in (expdir, scriptdir):
                tokexecpaths = dirfind(os.path.join(toolroot, 'tools'),
                                       'token_parse.rb')
                if len(tokexecpaths) > 0:
                    thetoolroot = toolroot
                    break
            if len(tokexecpaths) == 0:
                sys.stderr.write("Can't find token_parse.rb\n")
                sys.exit(1)
            tokexec = tokexecpaths[0]
            tokparamopts = dirfind(os.path.join(thetoolroot, 'tools'), 'yaml')
            tokparam = "--param {}".format(
                tokparamopts[0]) if len(tokparamopts) > 0 else ""
            lrlpdir = os.path.join(expdir, 'data', 'translation',
                                   'from_{}'.format(language), language, 'ltf')
            # ugly: the base of the file monodir/mononame.zip; need to add it to monoindirs and just pass that base so it gets constructed
            mononame = "tweets.ltf"
            monoindirs.append(os.path.join(monodir, mononame + ".zip"))
            stepsbyname[
                "ldc_tok.py"].argstring = "--mononame {mononame} -m {monodir} --ruby {ruby} --dldir {tweetdir} --lrlpdir {lrlpdir} --exec {tokexec} {tokparam} --outfile {outfile}".format(
                    monodir=monodir,
                    mononame=mononame,
                    ruby=args.ruby,
                    tweetdir=tweetdir,
                    lrlpdir=lrlpdir,
                    tokexec=tokexec,
                    tokparam=tokparam,
                    outfile=os.path.join(rootdir, language, 'ldc_tok.stats'))
            stepsbyname["ldc_tok.py"].stderr = os.path.join(
                rootdir, language, 'ldc_tok.err')

        # EPHEMERA
        ephemdir = os.path.join(rootdir, language, 'ephemera')
        ephemarg = "-s {} -t {}".format(expdir, ephemdir)
        if args.previous is not None:
            ephemarg += " -o {}".format(os.path.join(args.previous,
                                                     'ephemera'))
        stepsbyname['gather_ephemera.py'].argstring = ephemarg
        ephemerr = os.path.join(rootdir, language, 'gather_ephemera.err')
        stepsbyname['gather_ephemera.py'].stderr = ephemerr

        # # LTF2RSD
        # l2rindir = os.path.join(expdir, 'data', 'translation', 'from_'+language,
        #                         'eng') # Only converts from_SRC_tweet subdir
        # stepsbyname["ltf2rsd.perl"].argstring = l2rindir
        # # l2rprogpath = os.path.join(expdir, 'tools', 'ltf2txt')
        # # stepsbyname["ltf2rsd.perl"].progpath = l2rprogpath
        # l2rerr = os.path.join(rootdir, language, 'ltf2rsd.err')
        # stepsbyname["ltf2rsd.perl"].stderr = l2rerr

        # LEXICON
        #
        # IL CHANGE
        # eval packages keep the dictionary under docs/ with a
        # version-dependent extension; regular packages under data/lexicon
        if args.evalil:
            lexiconinfile = os.path.join(expdir, 'docs',
                                         'categoryI_dictionary', '*.xml')
            if args.lexversion == "il6":
                lexiconinfile = os.path.join(expdir, 'docs',
                                             'categoryI_dictionary', '*.zip')
            elif args.lexversion == "il5":
                lexiconinfile = os.path.join(expdir, 'docs',
                                             'categoryI_dictionary', '*.txt')
        else:
            lexiconinfile = os.path.join(expdir, 'data', 'lexicon', '*.xml')
        lexiconoutdir = os.path.join(rootdir, language, 'lexicon')
        lexiconrawoutfile = os.path.join(lexiconoutdir, 'lexicon.raw')
        lexiconoutfile = os.path.join(lexiconoutdir, 'lexicon')
        lexiconnormoutfile = os.path.join(lexiconoutdir, 'lexicon.norm')

        lexiconerr = os.path.join(rootdir, language, 'extract_lexicon.err')
        lexiconcleanerr = os.path.join(rootdir, language, 'clean_lexicon.err')
        lexiconnormerr = os.path.join(rootdir, language,
                                      'normalize_lexicon.err')
        # lexicon v1.5 for y2
        stepsbyname[
            "extract_lexicon.py"].argstring = " -v {} -i {} -o {}".format(
                args.lexversion, lexiconinfile, lexiconrawoutfile)
        stepsbyname["extract_lexicon.py"].stderr = lexiconerr

        stepsbyname["clean_lexicon"].argstring = "{} {}".format(
            lexiconrawoutfile, lexiconoutfile)
        stepsbyname["clean_lexicon"].stderr = lexiconcleanerr

        stepsbyname["normalize_lexicon.py"].argstring = "-i %s -o %s" % \
                                                      (lexiconoutfile, lexiconnormoutfile)
        stepsbyname["normalize_lexicon.py"].stderr = lexiconnormerr

        stepsbyname["relocate_lexicon"].argstring = "-r %s %s" % (
            lexiconoutdir, ephemdir)

        # PSM
        # just copy from previous or skip if no mono
        psmerr = os.path.join(rootdir, language, 'extract_psm_annotation.err')
        stepsbyname["extract_psm_annotation.py"].stderr = psmerr
        psmoutpath = os.path.join(rootdir, language, 'psm.ann')
        if not args.mono:
            if args.previous is None:
                stepsbyname["extract_psm_annotation.py"].disable()
            else:
                oldpsm = os.path.join(args.previous, 'psm.ann')
                stepsbyname["extract_psm_annotation.py"].progpath = "/bin"
                stepsbyname["extract_psm_annotation.py"].prog = "cp"
                stepsbyname[
                    "extract_psm_annotation.py"].argstring = "{} {}".format(
                        oldpsm, psmoutpath)
        else:
            psmindir = os.path.join(monodir, 'zipped', '*.psm.zip')
            stepsbyname["extract_psm_annotation.py"].argstring = "-i %s -o %s" % \
                                                                 (psmindir, psmoutpath)

        # ENTITY
        entityoutpath = os.path.join(rootdir, language, 'entity.ann')
        entityerr = os.path.join(rootdir, language,
                                 'extract_entity_annotation.err')
        stepsbyname["extract_entity_annotation.py"].argstring="-r %s -o %s -et %s" \
          % (expdir, entityoutpath, tweetdir)
        stepsbyname["extract_entity_annotation.py"].stderr = entityerr

        # PARALLEL
        paralleloutdir = os.path.join(rootdir, language, 'parallel',
                                      'extracted')
        parallelerr = os.path.join(rootdir, language, 'extract_parallel.err')
        stepsbyname["extract_parallel.py"].argstring="--no-cdec -r %s -o %s -s %s" % \
          (expdir, paralleloutdir, language)
        stepsbyname["extract_parallel.py"].stderr = parallelerr
        if args.swap:
            stepsbyname["extract_parallel.py"].argstring += " --swap"

        filteroutdir = os.path.join(rootdir, language, 'parallel', 'filtered')
        rejectoutdir = os.path.join(rootdir, language, 'parallel', 'rejected')
        filtererr = os.path.join(rootdir, language, 'filter_parallel.err')
        stepsbyname["filter_parallel.py"].argstring="-s 2 -l %s -i %s -f %s -r %s" % \
          (language, paralleloutdir, filteroutdir, rejectoutdir)
        stepsbyname["filter_parallel.py"].stderr = filtererr

        # MONO
        # just copy from previous or skip if no mono
        monoerr = os.path.join(rootdir, language, 'extract_mono.err')
        stepsbyname["extract_mono.py"].stderr = monoerr
        if not args.mono:
            if args.previous is None:
                stepsbyname["extract_mono.py"].disable()
            else:
                oldmonodir = os.path.join(args.previous, 'mono')
                monooutdir = os.path.join(rootdir, language, 'mono')
                stepsbyname["extract_mono.py"].progpath = "/bin"
                stepsbyname["extract_mono.py"].prog = "cp"
                stepsbyname["extract_mono.py"].argstring = "-r {} {}".format(
                    oldmonodir, monooutdir)
        else:
            monooutdir = os.path.join(rootdir, language, 'mono', 'extracted')
            stepsbyname["extract_mono.py"].argstring = "--no-cdec -i %s -o %s" % \
                                                       (' '.join(monoindirs), monooutdir)

        # COMPARABLE
        if os.path.exists(
                os.path.join(expdir, 'data', 'translation', 'comparable')):
            compoutdir = os.path.join(rootdir, language, 'comparable',
                                      'extracted')
            comperr = os.path.join(rootdir, language, 'extract_comparable.err')
            stepsbyname["extract_comparable.py"].argstring = "-r %s -o %s -s %s" % \
                                                             (expdir, compoutdir, language)
            stepsbyname["extract_comparable.py"].stderr = comperr
        else:
            stepsbyname["extract_comparable.py"].disable()

        # run the configured (non-disabled) steps in order
        for step in steps[start:stop]:
            step.run()

    print("Done.\nExpdir is %s" % expdir)
示例#17
0
def main():
  """Filter an extracted parallel-data directory by length statistics.

  Gathers per-line eng/<lang> token-count ratios and absolute length
  deltas across all genre manifests, computes mean +/- args.stds
  standard deviations for each measure, then routes every manifest and
  flat file between the filter and remainder directories via
  filterlines() (which applies the low/high bounds; see that helper for
  the exact accept/reject rule).
  """
  parser = argparse.ArgumentParser(description="filter extracted parallel data directory",
                                   formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument("--indir", "-i", default="./extracted", help="input directory")
  # fixed help text: previously said "input directory" (copy-paste error)
  parser.add_argument("--lang", "-l", help="source language three letter code")
  parser.add_argument("--stds", "-s", type=int, default=1, help="number of standard deviations from mean to filter out")
  parser.add_argument("--filterdir", "-f", default="./filtered", help="output filter directory")
  parser.add_argument("--genre", "-g", default="original", help="genre to use when filtering (could try tokenized but not available for twitter)")
  parser.add_argument("--remaindir", "-r", default="./remainder", help="output remainder directory")

  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))

  # crawl indir for expected original files. cat them together, save ratios, get mean and stdev
  # for each file, including manifest, zip with ratios, determine whether it belongs in filter or remaindir

  # TODO: add deltas too!

  indir = args.indir
  filterdir = args.filterdir
  remaindir = args.remaindir
  mkdir_p(filterdir)
  mkdir_p(remaindir)

  # assumption: there are a number of *.eng.manifest files, each paired with *.<lang>.manifest, and for each i, there is original/i.eng.flat and original/i.<lang>.flat
  engmanifests = glob.glob(os.path.join(indir, "*.eng.manifest"))
  fmanifests = []
  ratios = dd(list)   # genre -> list of eng/foreign length ratios
  deltas = dd(list)   # genre -> list of |eng - foreign| length deltas
  genres = set()
  for eman in engmanifests:
    ebase = os.path.basename(eman)
    genre = '.'.join(ebase.split('.')[:-2])
    genres.add(genre)
    fman = os.path.join(os.path.dirname(eman), "%s.%s.manifest" % (genre, args.lang))
    fmanifests.append(fman)
    eorigname = os.path.join(args.indir, args.genre, "%s.%s.eng.flat" % (genre, args.genre))
    forigname = os.path.join(args.indir, args.genre, "%s.%s.%s.flat" % (genre, args.genre, args.lang))
    # all four files must exist before statistics can be computed
    for f in [eman, fman, eorigname, forigname]:
      if not os.path.exists(f):
        sys.stderr.write("ERROR: %s does not exist\n" % f)
        sys.exit(1)
    # slurp files, calculate ratios/deltas, store per genre
    efile = prepfile(open(eorigname, 'r'), 'r')
    ffile = prepfile(open(forigname, 'r'), 'r')
    for ln, (eline, fline) in enumerate(izip(efile, ffile)):
      ewords = eline.strip().split()
      fwords = fline.strip().split()
      # NOTE(review): an empty foreign line raises ZeroDivisionError here
      ratios[genre].append((len(ewords)+0.0)/(len(fwords)+0.0))
      deltas[genre].append(abs(len(ewords)-len(fwords)))
  allratios = np.concatenate(list(map(np.array, ratios.values())), 0)
  alldeltas = np.concatenate(list(map(np.array, deltas.values())), 0)
  ratiomean = np.mean(allratios)
  ratiostd = np.std(allratios)
  lowratio = ratiomean-(args.stds*ratiostd)
  highratio = ratiomean+(args.stds*ratiostd)
  rejectratiosize = len(list(filter(lambda x: x<lowratio or x > highratio, allratios)))

  deltamean = np.mean(alldeltas)
  deltastd = np.std(alldeltas)
  lowdelta = deltamean-(args.stds*deltastd)
  highdelta = deltamean+(args.stds*deltastd)
  rejectdeltasize = len(list(filter(lambda x: x<lowdelta or x > highdelta, alldeltas)))

  sys.stderr.write("Rejecting %d of %d lines (%f %%) with ratio below %f or above %f\n" % (rejectratiosize, len(allratios), 100.0*rejectratiosize/len(allratios), lowratio, highratio))
  sys.stderr.write("Rejecting %d of %d lines (%f %%) with delta below %f or above %f\n" % (rejectdeltasize, len(alldeltas), 100.0*rejectdeltasize/len(alldeltas), lowdelta, highdelta))

  # a line is only rejected when it is an outlier on BOTH measures
  reject_ratio_delta_size = len(list(filter(lambda x: (x[0]<lowratio or x[0]>highratio) and (x[1]<lowdelta or x[1]>highdelta), zip(allratios, alldeltas))))
  sys.stderr.write("Rejecting %d of %d lines (%f %%) meeting both delta and ratio criteria\n" % (reject_ratio_delta_size, len(alldeltas), 100.0*reject_ratio_delta_size/len(alldeltas)))

  # iterate through manifests and all files and filter per ratio and delta
  for manset in (engmanifests, fmanifests):
    for man in manset:
      sys.stderr.write("filtering %s\n" % man)
      base = os.path.basename(man)
      genre = '.'.join(base.split('.')[:-2])
      sys.stderr.write("genre %s\n" % genre)
      rats = ratios[genre]
      delts = deltas[genre]
      reject_ratio_delta_size = len(list(filter(lambda x: (x[0]<lowratio or x[0]>highratio) and (x[1]<lowdelta or x[1]>highdelta), zip(rats, delts))))
      #rejectratiosize = len(list(filter(lambda x: x<lowratio or x > highratio, rats)))
      sys.stderr.write("rejecting %d of %d\n" % (reject_ratio_delta_size, len(rats)))
      infile = prepfile(open(man, 'r'), 'r')
      filterfile = prepfile(open(os.path.join(filterdir, base), 'w'), 'w')
      remainfile = prepfile(open(os.path.join(remaindir, base), 'w'), 'w')
      filterlines(infile, (rats, delts), (lowratio,lowdelta), (highratio,highdelta), filterfile, remainfile)

  # for directories in extracted
  #http://stackoverflow.com/questions/973473/getting-a-list-of-all-subdirectories-in-the-current-directory
  for subdir in next(os.walk(indir))[1]:
    # make parallel directories
    # for genres in genre set
    # for languages
    # filter lines
    insubdir = os.path.join(indir, subdir)
    filtersubdir = os.path.join(filterdir, subdir)
    mkdir_p(filtersubdir)
    remainsubdir = os.path.join(remaindir, subdir)
    mkdir_p(remainsubdir)
    for genre in genres:
      for lang in (args.lang, 'eng'):
        base = "%s.%s.%s.flat" % (genre, subdir, lang)
        infilename = os.path.join(insubdir, base)
        if os.path.exists(infilename):
          infile = prepfile(open(infilename, 'r'), 'r')
          filterfile = prepfile(open(os.path.join(filtersubdir, base), 'w'), 'w')
          remainfile = prepfile(open(os.path.join(remainsubdir, base), 'w'), 'w')
          filterlines(infile, (ratios[genre], deltas[genre]), (lowratio,lowdelta), (highratio,highdelta), filterfile, remainfile)
        else:
          sys.stderr.write("%s does not exist\n" % infilename)

  # count files in each of the directories; should be the same
  # (renamed loop variable: 'dir' shadowed the builtin)
  for d in (indir, filterdir, remaindir):
    sys.stderr.write("%d files in %s\n" % (countfiles(d), d))
示例#18
0
def main():
    """Rewrite laf.xml annotations from token ids to character offsets.

    For each x.laf.xml under --indir, locate token start_char/end_char
    data in a sibling x.ltf.xml and/or an x.ltf.xml found under
    --corpusdirs, map each annotation's start_token/end_token to
    offsets, set them on the annotation's EXTENT, and write the
    modified laf to --outdir. Documents whose local and corpus token
    streams disagree are reported and skipped.
    """
    parser = argparse.ArgumentParser(
        description=
        "Given a reflex lrlp laf with token ids and a ltf with token-to-start_char/end_char, create an laf with start_char/end_char. Operate per directory",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "--indir",
        "-i",
        help=
        "input directory. Presumed to contain x.laf.xml. Might contain x.ltf.xml for all x"
    )
    parser.add_argument("--corpusdirs",
                        "-c",
                        nargs='+',
                        help="directory tree or trees to find x.ltf.xml")
    parser.add_argument(
        "--outdir",
        "-o",
        help=
        "output directory. may not exist. will contain modified x.laf.xml for all x"
    )

    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    reader = codecs.getreader('utf8')
    writer = codecs.getwriter('utf8')

    stderr = writer(sys.stderr)
    indir = args.indir
    outdir = args.outdir
    mkdir_p(outdir)

    localcount = 0
    remotecount = 0
    bothcount = 0
    for inlaf in [x for x in os.listdir(indir) if x.endswith(".laf.xml")]:
        base = inlaf.replace(".laf.xml", "")
        outlaf = os.path.join(outdir, inlaf)
        inlaf = os.path.join(indir, inlaf)
        inltf = os.path.join(indir, base + ".ltf.xml")
        corpusltf = find(
            base + ".ltf.xml",
            args.corpusdirs) if args.corpusdirs is not None else None
        # cases:
        # 1. local ltf exists with char offsets. no remote ltf. we use what local ltf gives us
        # 2. local and remote ltf exist, with the same number of tokens in the same order. we map local id to remote id and use those offsets
        # 3. remote ltf exists. local does not. We use remote ltf as in case 1
        # 4. local and remote exist with different numbers of tokens or nothing exists or something else. complain and skip this document

        try:
            # case 2: build id map
            idmap = {}
            useidmap = False
            if os.path.exists(
                    inltf) and corpusltf is not None and os.path.exists(
                        corpusltf):
                localroot = ET.parse(inltf)
                corpusroot = ET.parse(corpusltf)
                localtoks = localroot.findall(".//TOKEN")
                corpustoks = corpusroot.findall(".//TOKEN")
                if len(localtoks) != len(corpustoks):
                    stderr.write("Token count mismatch; skipping " + inlaf +
                                 "\n")
                    continue
                ok = True
                for localtok, corpustok in zip(localtoks, corpustoks):
                    if localtok.text != corpustok.text:
                        # fixed message: this branch detects a token *text*
                        # mismatch, not a count mismatch
                        stderr.write("Token text mismatch; skipping " +
                                     inlaf + "\n")
                        ok = False
                        break
                    idmap[localtok.get("id")] = corpustok.get("id")
                if not ok:
                    continue
                useidmap = True

            # case 1: swap inltf and corpusltf (otherwise below handles case 2, 3)
            if os.path.exists(inltf) and (corpusltf is None
                                          or not os.path.exists(corpusltf)):
                inltf, corpusltf = corpusltf, inltf
                # only the local ltf existed: count as local-only
                # (counters were previously swapped vs. the summary message)
                localcount += 1
            elif useidmap:
                bothcount += 1
            else:
                remotecount += 1

            # Final token id-to-offset
            starts = {}
            ends = {}

            root = ET.parse(corpusltf)
            for node in root.findall(".//TOKEN"):
                # renamed from 'id' to avoid shadowing the builtin
                tokid = node.get("id")
                starts[tokid] = node.get("start_char")
                ends[tokid] = node.get("end_char")
            #root.clear()
            # re-map
            root = ET.parse(inlaf)
            for node in root.findall(".//ANNOTATION"):
                stok = node.get("start_token")
                etok = node.get("end_token")
                stok = idmap[stok] if useidmap else stok
                etok = idmap[etok] if useidmap else etok
                start = starts[stok]
                end = ends[etok]
                ext = node.find(".//EXTENT")
                ext.set('start_char', start)
                ext.set('end_char', end)
            xmlstr = ET.tostring(root, pretty_print=True, encoding='unicode')
            writer(open(outlaf, 'w')).write(xmlstr + "\n")
        except Exception as e:
            # was a bare 'except:', which also swallowed KeyboardInterrupt and
            # SystemExit; report the exception class (as before) and move on
            stderr.write("Problem with %s: %s\n" % (inltf, type(e)))
            continue
    stderr.write("%d using local only, %d using remote only, %d using both\n" %
                 (localcount, remotecount, bothcount))
示例#19
0
def main():
    """Route lines of a data file into per-category directories.

    Reads a doc->category map (--catfile) and a per-line docid file
    (--idfile) that parallels the data file (--infile). Each data line
    is appended to <prefix>/<category>/<postfix>/<basename of infile>;
    docids with no category fall back to the --remainder category,
    optionally after a normalized 'backup' docid match.
    """
    parser = argparse.ArgumentParser(
        description=
        "Given category per doc, idfile, data file, put data in category-specific dir",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--catfile",
                        "-c",
                        nargs='?',
                        type=argparse.FileType('r'),
                        help="doc cat file (docid cat)")
    parser.add_argument("--idfile",
                        "-d",
                        nargs='?',
                        type=argparse.FileType('r'),
                        help="id file (docid per line)")
    parser.add_argument("--infile",
                        "-i",
                        nargs='?',
                        type=argparse.FileType('r'),
                        default=sys.stdin,
                        help="input file")
    parser.add_argument("--prefix",
                        "-p",
                        default=".",
                        help="directory prefix for categories")
    parser.add_argument("--postfix",
                        "-P",
                        default=".",
                        help="directory postfix after categories")
    parser.add_argument(
        "--remainder",
        "-r",
        default="train",
        help="remainder category. Should match previous remainder category")
    addonoffarg(
        parser,
        'backup',
        help=
        "backup matches to universal docid, following strict ldc format (in May 2017)",
        default=True)

    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    catfile = args.catfile
    infile = args.infile
    idfile = args.idfile

    # the unsplit resource
    # e.g. base of tmp_170502/yor/parallel/filtered/agile-tokenized/fromsource.generic.agile-tokenized.eng.flat
    basefile = os.path.basename(args.infile.name)
    cats = {}        # docid -> open output file handle
    backupcats = {}  # normalized (backup) docid -> open output file handle
    fhs = {}         # output path -> open file handle, shared across docs
    backupcount = 0
    for line in catfile:
        # for each document, what category it belongs in
        # e.g. YOR_WL_001975_20150409_G0021WWLJ.eng	test
        doc, cat = line.strip().split('\t')
        # the prefix of the files that will be created
        # e.g. tmp_170502/yor/parallel/split / test / agile-tokenized
        prefix = os.path.join(args.prefix, cat, args.postfix)
        # the file that will be created
        # e.g. tmp_170502/yor/parallel/split/test/agile-tokenized/fromsource.generic.agile-tokenized.eng.flat
        innercatfile = os.path.join(prefix, basefile)
        if innercatfile not in fhs:
            mkdir_p(prefix)
            fhs[innercatfile] = open(innercatfile, 'w')
        # doc -> file to write to
        # print("{} -> {}".format(doc,innercatfile))
        cats[doc] = fhs[innercatfile]
        if args.backup:
            backupcats[backup(doc)] = fhs[innercatfile]

    # catchall remainder file
    remcatpref = os.path.join(args.prefix, args.remainder, args.postfix)
    remaindercatfile = os.path.join(remcatpref, basefile)
    if remaindercatfile not in fhs:
        mkdir_p(remcatpref)
        fhs[remaindercatfile] = open(remaindercatfile, 'w')
    # pairs of docids and lines
    # e.g. YOR_DF_001261_20031127_G0022DCKG.eng LESBIANISM IN NIGERIA IS A BIG SURPRISE
    for doc, data in zip(idfile, infile):
        doc = doc.strip()
        if doc in cats:
            fh = cats[doc]
            # print("{}: writing to {}".format(doc, fh.name))
        elif backup(doc) in backupcats:
            fh = backupcats[backup(doc)]
            backupcount += 1
        else:
            fh = fhs[remaindercatfile]
        fh.write(data)
    if args.backup and backupcount > 0:
        sys.stderr.write(
            "{} lines written via backup retrieval\n".format(backupcount))
    # close every output handle so buffered lines are flushed deterministically
    # (previously the handles were leaked and only flushed at interpreter exit)
    for fh in fhs.values():
        fh.close()
示例#20
0
def main():
  """Re-create dataset splits from LRLP parallel data, reusing category
  assignments produced by a previous run (--previous) so the new splits are
  consistent with the earlier selection.

  Side effects: writes id/cat files under <indir>/splits and shells out
  (check_output, shell=True) to cut/wc/seq and categorize.py.
  """
  parser = argparse.ArgumentParser(description="Make dataset selections for experimentation given previously generated categorization files",
                                   formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument("--indir", "-i", help="location of parallel data")
  parser.add_argument("--language", "-l", help="source language three digit code")
  parser.add_argument("--extractpath", "-e", default="extracted", help="location of extracted data (might want to use 'filtered')")
  parser.add_argument("--remainder", "-r", default="train", help="remainder category. Should match previous remainder category")
  parser.add_argument("--previous", "-p", help="location of previous cat files")



  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))

#  reader = codecs.getreader('utf8')
#  writer = codecs.getwriter('utf8')
#  outfile = writer(args.outfile)

  indir = args.indir
  # TODO: find these?
  # doc = keep full docs together  (can detect this by counting number of unique docs)
  # TODO: re-add found.generic to docprefixes
  # Prefixes whose lines must stay grouped by document vs. prefixes with no
  # document structure (elicitation/phrasebook are line-oriented).
  docprefixes = ["fromsource.generic", "fromsource.tweet", "fromtarget.news", "found.generic"]
  nodocprefixes = ["fromtarget.elicitation", "fromtarget.phrasebook"]

  # TODO: find these
  filetypes = ["morph", "morph-tokenized", "original", "pos", "tokenized", "mttok", "mttoklc", "agile-tokenized", "cdec-tokenized", "agile-tokenized.lc", "cdec-tokenized.lc"]

  extractpath = os.path.join(indir, args.extractpath)
  origpath = os.path.join(extractpath, 'original')  # NOTE(review): unused in this function
  outpath = os.path.join(indir, 'splits')
  mkdir_p(outpath)

  # Drop any prefix whose English manifest is missing or empty. Iterating a
  # copy (list(preflist)) makes the in-place remove() safe.
  for preflist in [docprefixes, nodocprefixes]:
    for prefix in list(preflist):
      # don't deal with it more if there's nothing in the manifest
      manfile = os.path.join(extractpath, "%s.eng.manifest" % prefix)
      if (not os.path.exists(manfile)) or os.path.getsize(manfile) == 0:
        print("removing "+prefix)
        preflist.remove(prefix)
  # doc-based processing
  for prefix in docprefixes:
    # Document ids are column 2 of the manifest.
    idfile = os.path.join(outpath, "%s.ids" % prefix)
    manfile = os.path.join(extractpath, "%s.eng.manifest" % prefix)
    try:
      check_output("cut -f2 %s > %s" % (manfile, idfile), stderr=STDOUT, shell=True)
    except CalledProcessError as exc:
      print("Status : FAIL", exc.returncode, exc.output)
    # Reuse the previous run's category file when present so the selection is
    # reproducible; otherwise start from an empty one.
    catfile = os.path.join(args.previous, "%s.cats" % prefix)
    newcatfile = os.path.join(outpath, os.path.basename(catfile))
    if os.path.exists(catfile):
      copy(catfile, newcatfile)
    else:
      touch(newcatfile)
    runselection(prefix, idfile, newcatfile, args.remainder, filetypes, args.language, extractpath, outpath)
    # Categorize both the source-language and English sides.
    for i in (args.language, 'eng'):
      manifest = os.path.join(extractpath, "%s.%s.manifest" % (prefix, i))
      cmd = "%s/categorize.py -i %s -d %s -c %s -p %s -r %s" % (scriptdir, manifest, idfile, newcatfile, outpath, args.remainder)
      print("Running "+cmd)
      check_output(cmd, stderr=STDOUT, shell=True)

  # nodoc-based processing

  for prefix in nodocprefixes:
    # No document structure: fabricate 1..N line-number ids via wc -l + seq.
    idfile = os.path.join(outpath, "%s.fakeids" % prefix)
    try:
      mansize = int(check_output("wc -l %s" % os.path.join(extractpath, "%s.eng.manifest" % prefix), shell=True).decode('utf-8').strip().split(' ')[0])
      check_output("seq %d > %s" % (mansize, idfile), stderr=STDOUT, shell=True)
    except CalledProcessError as exc:
      print("Status : FAIL", exc.returncode, exc.output)
    catfile = os.path.join(args.previous, "%s.cats" % prefix)
    newcatfile = os.path.join(outpath, os.path.basename(catfile))
    if os.path.exists(catfile):
      copy(catfile, newcatfile)
    else:
      touch(newcatfile)
    runselection(prefix, idfile, newcatfile, args.remainder, filetypes, args.language, extractpath, outpath)
    for i in (args.language, 'eng'):
      manifest = os.path.join(extractpath, "%s.%s.manifest" % (prefix, i))
      cmd = "%s/categorize.py -i %s -d %s -c %s -p %s -r %s" % (scriptdir, manifest, idfile, newcatfile, outpath, args.remainder)
      print("Running "+cmd)
      check_output(cmd, stderr=STDOUT, shell=True)
示例#21
0
def main():
    """Deterministically subselect LRLP parallel data into sized categories
    (per-document, assigned from the end of the id list unless --fromFront),
    designed for the Nov 2016 Uyghur evaluation.

    Side effects: writes id/cat files under <indir>/splits and shells out
    (check_output, shell=True) to cut/wc/seq and categorize.py.
    """
    parser = argparse.ArgumentParser(
        description=
        "Deterministic subselect designed for nov 2016 uyghur evaluation: per-doc, from end",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--indir", "-i", help="location of parallel data")
    parser.add_argument("--language",
                        "-l",
                        help="source language three digit code")
    parser.add_argument(
        "--extractpath",
        "-e",
        default="extracted",
        help="location of extracted data (might want to use 'filtered')")
    parser.add_argument("--sizes",
                        "-s",
                        nargs='+',
                        type=int,
                        help="list of sizes desired in each category")
    parser.add_argument("--categories",
                        "-c",
                        nargs='+',
                        help="list of categories. Must match sizes")
    parser.add_argument("--remainder",
                        "-r",
                        default="train",
                        help="remainder category. Should be a new category")
    parser.add_argument(
        "--devlstfile",
        "-d",
        default=None,
        help=
        "file of desired documents for dev (subject to length constraints, must be a set called 'dev')"
    )
    addonoffarg(
        parser,
        'fromFront',
        default=False,
        help="do doc assignment from the beginning (instead of the end)")

    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))


#  reader = codecs.getreader('utf8')
#  writer = codecs.getwriter('utf8')
#  outfile = writer(args.outfile)

    indir = args.indir
    origsizes = args.sizes

    # TODO: find these?
    # doc = keep full docs together  (can detect this by counting number of unique docs)
    # TODO: re-add found.generic to docprefixes
    # Prefixes whose lines must stay grouped by document vs. prefixes with no
    # document structure (here found.generic is treated as line-oriented).
    docprefixes = ["fromsource.generic", "fromsource.tweet", "fromtarget.news"]
    # IL3: moving found.generic!!
    nodocprefixes = [
        "fromtarget.elicitation", "fromtarget.phrasebook", "found.generic"
    ]

    # TODO: find these
    filetypes = [
        "morph", "morph-tokenized", "original", "pos", "tokenized", "mttok",
        "mttoklc", "agile-tokenized", "cdec-tokenized", "agile-tokenized.lc",
        "cdec-tokenized.lc"
    ]

    extractpath = os.path.join(indir, args.extractpath)
    origpath = os.path.join(extractpath, 'original')  # NOTE(review): unused in this function
    outpath = os.path.join(indir, 'splits')
    mkdir_p(outpath)

    # Drop any prefix whose English manifest is missing or empty. Iterating a
    # copy (list(preflist)) makes the in-place remove() safe.
    for preflist in [docprefixes, nodocprefixes]:
        for prefix in list(preflist):
            # don't deal with it more if there's nothing in the manifest
            manfile = os.path.join(extractpath, "%s.eng.manifest" % prefix)
            if (not os.path.exists(manfile)) or os.path.getsize(manfile) == 0:
                print("removing " + prefix)
                preflist.remove(prefix)
    # doc-based processing

    for prefix in docprefixes:
        # Document ids are column 2 of the manifest.
        idfile = os.path.join(outpath, "%s.ids" % prefix)
        manfile = os.path.join(extractpath, "%s.eng.manifest" % prefix)
        try:
            check_output("cut -f2 %s > %s" % (manfile, idfile),
                         stderr=STDOUT,
                         shell=True)
        except CalledProcessError as exc:
            print("Status : FAIL", exc.returncode, exc.output)
        # Unlike the previous-run variant, runselection here produces the cat
        # file itself (given explicit categories/sizes and the optional dev list).
        catfile = runselection(prefix,
                               idfile,
                               args.categories,
                               args.remainder,
                               origsizes,
                               filetypes,
                               args.language,
                               extractpath,
                               outpath,
                               args.devlstfile,
                               fromFront=args.fromFront)
        # Categorize both the source-language and English sides.
        for i in (args.language, 'eng'):
            manifest = os.path.join(extractpath,
                                    "%s.%s.manifest" % (prefix, i))
            cmd = "%s/categorize.py -i %s -d %s -c %s -p %s -r %s" % (
                scriptdir, manifest, idfile, catfile, outpath, args.remainder)
            print("Running " + cmd)
            check_output(cmd, stderr=STDOUT, shell=True)

    # nodoc-based processing
    for prefix in nodocprefixes:
        # No document structure: fabricate 1..N line-number ids via wc -l + seq.
        idfile = os.path.join(outpath, "%s.fakeids" % prefix)
        try:
            mansize = int(
                check_output(
                    "wc -l %s" %
                    os.path.join(extractpath, "%s.eng.manifest" % prefix),
                    shell=True).decode('utf8').strip().split(' ')[0])
            check_output("seq %d > %s" % (mansize, idfile),
                         stderr=STDOUT,
                         shell=True)
        except CalledProcessError as exc:
            print("Status : FAIL", exc.returncode, exc.output)
        # devlstfile is intentionally omitted here: dev doc requests only make
        # sense for doc-structured prefixes.
        catfile = runselection(prefix,
                               idfile,
                               args.categories,
                               args.remainder,
                               origsizes,
                               filetypes,
                               args.language,
                               extractpath,
                               outpath,
                               fromFront=args.fromFront)
        for i in (args.language, 'eng'):
            manifest = os.path.join(extractpath,
                                    "%s.%s.manifest" % (prefix, i))
            cmd = "%s/categorize.py -i %s -d %s -c %s -p %s -r %s" % (
                scriptdir, manifest, idfile, catfile, outpath, args.remainder)
            print("Running " + cmd)
            check_output(cmd, stderr=STDOUT, shell=True)

    # warning if entries not found in given dev list
    if args.devlstfile:
        devlst = set(open(args.devlstfile).read().split())
        all_docids = list()
        for prefix in docprefixes:
            all_docids += open(os.path.join(outpath, "%s.ids" %
                                            prefix)).read().split('\n')
        for i in devlst - set(all_docids):
            print("***Warning: docid not found: %s" % i)
示例#22
0
def main():
  """Copy character offsets (start_char/end_char) from ltf token data onto
  laf annotations, writing updated x.laf.xml files into --outdir.

  For each x.laf.xml in --indir, offsets come from one of:
    1. the local x.ltf.xml only,
    2. local + corpus ltf (token texts aligned; local ids remapped), or
    3. the corpus x.ltf.xml only.
  Anything else (token mismatch, missing files, parse errors) is reported
  to stderr and the document is skipped.
  """
  parser = argparse.ArgumentParser(description="Given a reflex lrlp laf with token ids and a ltf with token-to-start_char/end_char, create an laf with start_char/end_char. Operate per directory",
                                   formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument("--indir", "-i", help="input directory. Presumed to contain x.laf.xml. Might contain x.ltf.xml for all x")
  parser.add_argument("--corpusdirs", "-c", nargs='+', help="directory tree or trees to find x.ltf.xml")
  parser.add_argument("--outdir", "-o", help="output directory. may not exist. will contain modified x.laf.xml for all x")


  try:
    args = parser.parse_args()
  except IOError as msg:
    parser.error(str(msg))

  # BUGFIX: the old codecs.getwriter('utf8') wrappers around py3 text streams
  # made every write() raise TypeError (bytes into a text stream), which the
  # bare except below silently swallowed. Write text directly instead.
  stderr = sys.stderr
  indir = args.indir
  outdir = args.outdir
  mkdir_p(outdir)

  localcount = 0   # offsets taken from the local (indir) ltf only
  remotecount = 0  # offsets taken from the corpus ltf only
  bothcount = 0    # local token ids remapped to corpus ids (both ltfs used)
  for inlaf in [x for x in os.listdir(indir) if x.endswith(".laf.xml")]:
    base = inlaf.replace(".laf.xml", "")
    outlaf = os.path.join(outdir, inlaf)
    inlaf = os.path.join(indir, inlaf)
    inltf = os.path.join(indir, base+".ltf.xml")
    corpusltf = find(base+".ltf.xml", args.corpusdirs) if args.corpusdirs is not None else None
    # cases:
    # 1. local ltf exists with char offsets. no remote ltf. we use what local ltf gives us
    # 2. local and remote ltf exist, with the same number of tokens in the same order. we map local id to remote id and use those offsets
    # 3. remote ltf exists. local does not. We use remote ltf as in case 1
    # 4. local and remote exist with different numbers of tokens or nothing exists or something else. complain and skip this document

    try:
      # case 2: build a local-token-id -> corpus-token-id map
      idmap = {}
      useidmap = False
      if os.path.exists(inltf) and corpusltf is not None and os.path.exists(corpusltf):
        localroot = ET.parse(inltf)
        corpusroot = ET.parse(corpusltf)
        localtoks = localroot.findall(".//TOKEN")
        corpustoks = corpusroot.findall(".//TOKEN")
        if len(localtoks) != len(corpustoks):
          stderr.write("Token count mismatch; skipping "+inlaf+"\n")
          continue
        ok = True
        for localtok, corpustok in zip(localtoks, corpustoks):
          if localtok.text != corpustok.text:
            # BUGFIX: this branch is a token *text* mismatch; the old message
            # duplicated the count-mismatch wording.
            stderr.write("Token text mismatch; skipping "+inlaf+"\n")
            ok = False
            break
          idmap[localtok.get("id")] = corpustok.get("id")
        if not ok:
          continue
        useidmap = True

      # case 1: swap inltf and corpusltf (otherwise below handles case 2, 3)
      if os.path.exists(inltf) and ( corpusltf is None or not os.path.exists(corpusltf)):
        inltf, corpusltf = corpusltf, inltf
        # BUGFIX: was remotecount += 1 — this branch means only the LOCAL ltf
        # supplied offsets, so the summary line reported the counts inverted.
        localcount += 1
      elif useidmap:
        bothcount += 1
      else:
        # case 3: only the corpus (remote) ltf exists. BUGFIX: was localcount.
        remotecount += 1

      # Final token id-to-offset maps
      starts = {}
      ends = {}

      root = ET.parse(corpusltf)
      for node in root.findall(".//TOKEN"):
        tokid = node.get("id")  # renamed: 'id' shadowed the builtin
        starts[tokid] = node.get("start_char")
        ends[tokid] = node.get("end_char")
      # re-map each annotation's token span to char offsets
      root = ET.parse(inlaf)
      for node in root.findall(".//ANNOTATION"):
        stok = node.get("start_token")
        etok = node.get("end_token")
        stok = idmap[stok] if useidmap else stok
        etok = idmap[etok] if useidmap else etok
        start = starts[stok]
        end = ends[etok]
        ext = node.find(".//EXTENT")
        ext.set('start_char', start)
        ext.set('end_char', end)
      # NOTE(review): pretty_print implies ET is lxml.etree — confirm import
      xmlstr = ET.tostring(root, pretty_print=True, encoding='unicode')
      # BUGFIX: the old writer(open(...)) never closed the file handle
      with open(outlaf, 'w', encoding='utf-8') as outfh:
        outfh.write(xmlstr+"\n")
    except Exception:
      # BUGFIX: bare 'except:' also trapped SystemExit/KeyboardInterrupt.
      # Preserve the original report format (exception *type*).
      e = sys.exc_info()[0]
      stderr.write("Problem with %s: %s\n" % (inltf, e))
      continue
  stderr.write("%d using local only, %d using remote only, %d using both\n" % (localcount, remotecount, bothcount))