import logging
import optparse
import os
import sys

import extract

LOG = logging.getLogger(__name__)


# Test-corpus extraction: reuses the vocabularies and the m/n context
# sizes written out by the training extraction further down.
def main():
    logging.basicConfig(
        format='%(asctime)s %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
    parser = optparse.OptionParser("%prog [options]")
    parser.add_option("-e", "--target-language", type="string",
                      dest="target_language")
    parser.add_option("-f", "--source-language", type="string",
                      dest="source_language")
    parser.add_option("-c", "--corpus", type="string", dest="corpus_stem")
    parser.add_option("-t", "--tagged-corpus", type="string",
                      dest="tagged_stem")
    parser.add_option("-a", "--align", type="string", dest="align_file")
    parser.add_option("-w", "--working-dir", type="string",
                      dest="working_dir")
    parser.set_defaults(
        target_language="en",
        source_language="de",
        corpus_stem="test",
        align_file="test.align",
        working_dir="working",
    )
    options, args = parser.parse_args(sys.argv)

    if not os.path.exists(options.working_dir):
        LOG.error("Working directory '%s' not found" % options.working_dir)
        sys.exit(1)

    # Recover the source (m) and target (n) context sizes recorded by the
    # training extraction.
    m, n = None, None
    for line in open(options.working_dir + "/info"):
        name, value = line[:-1].split()
        if name == "m":
            m = int(value)
        if name == "n":
            n = int(value)
    if m is None or n is None:
        LOG.error("info file is incomplete")
        sys.exit(1)

    # Source IDs continue where the target IDs stop, so the target
    # vocabulary must be read first.
    tvocab, offset = read_vocab(options.working_dir + "/vocab.target")
    svocab, offset = read_vocab(options.working_dir + "/vocab.source",
                                offset + 1)

    file_stem = os.path.basename(options.corpus_stem)
    ngram_file = options.working_dir + "/" + file_stem + ".ngrams"
    ofh = open(ngram_file, "w")
    extract.get_ngrams(options.corpus_stem, options.align_file,
                       options.tagged_stem, svocab, tvocab,
                       options.source_language, options.target_language,
                       m, n, ofh)
    # Flush the n-grams before reading them back.
    ofh.close()

    # Numberize the file.
    numberized_file = options.working_dir + "/" + file_stem + ".numberized"
    ngrams_file_handle = open(ngram_file, 'r')
    numberized_file_handle = open(numberized_file, 'w')
    for line in ngrams_file_handle:
        numberized_file_handle.write(
            extract.numberize(line, m, n, svocab, tvocab))
    numberized_file_handle.close()
    ngrams_file_handle.close()
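
# read_vocab() is not shown in this section. A minimal sketch of what its
# call sites imply: assign consecutive IDs to the words of a vocabulary
# file (assumed to hold one token per line, in ID order) starting from a
# base offset, and return the mapping together with the highest ID
# assigned, so the source vocabulary can continue where the target one
# stopped.
def read_vocab(filename, offset=0):
    vocab = {}
    vfh = open(filename)
    for i, line in enumerate(vfh):
        vocab[line.strip()] = offset + i
    vfh.close()
    return vocab, offset + len(vocab) - 1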
# Training-corpus extraction: builds and prunes the vocabularies, then
# extracts and numberizes the n-grams.
def main():
    logging.basicConfig(
        format='%(asctime)s %(levelname)s: %(message)s',
        datefmt='%Y-%m-%d %H:%M:%S', level=logging.DEBUG)
    parser = optparse.OptionParser("%prog [options]")
    parser.add_option("-e", "--target-language", type="string",
                      dest="target_language")
    parser.add_option("-f", "--source-language", type="string",
                      dest="source_language")
    parser.add_option("-c", "--corpus", type="string", dest="corpus_stem")
    parser.add_option("-t", "--tagged-corpus", type="string",
                      dest="tagged_stem")
    parser.add_option("-a", "--align", type="string", dest="align_file")
    parser.add_option("-w", "--working-dir", type="string",
                      dest="working_dir")
    parser.add_option("-n", "--target-context", type="int", dest="n")
    parser.add_option("-m", "--source-context", type="int", dest="m")
    parser.add_option("-s", "--prune-source-vocab", type="int",
                      dest="sprune")
    parser.add_option("-p", "--prune-target-vocab", type="int",
                      dest="tprune")
    parser.set_defaults(
        target_language="en",
        source_language="de",
        corpus_stem="train.10k",
        align_file="train.10k.align",
        n=5,
        m=4,
        working_dir="working",
        sprune=16000,
        tprune=16000,
    )
    options, args = parser.parse_args(sys.argv)

    if not os.path.exists(options.working_dir):
        os.makedirs(options.working_dir)
    else:
        LOG.warn("Directory %s already exists, re-using" %
                 options.working_dir)

    # Record m and n, or check them against an earlier run, so that a test
    # extraction cannot silently use different context sizes.
    info_file = options.working_dir + "/info"
    if os.path.exists(info_file):
        for line in open(info_file):
            name, value = line[:-1].split()
            n_mismatch = (name == "n" and int(value) != options.n)
            m_mismatch = (name == "m" and int(value) != options.m)
            if n_mismatch or m_mismatch:
                LOG.error(
                    "info file exists, but parameters do not match. "
                    "Delete working directory and rerun.")
                sys.exit(1)
    else:
        ifh = open(info_file, "w")
        print >>ifh, "m", options.m
        print >>ifh, "n", options.n
        ifh.close()

    scorpus = options.corpus_stem + "." + options.source_language
    tcorpus = options.corpus_stem + "." + options.target_language

    # Extract vocabulary, and prune, if required.
    svocab = get_pruned_vocab(scorpus, options.sprune)
    tvocab = get_pruned_vocab(tcorpus, options.tprune)

    file_stem = os.path.basename(options.corpus_stem)
    ngram_file = options.working_dir + "/" + file_stem + ".ngrams"
    ofh = open(ngram_file, "w")
    tags = extract.get_ngrams(
        options.corpus_stem, options.align_file, options.tagged_stem,
        svocab, tvocab, options.source_language, options.target_language,
        options.m, options.n, ofh)
    # Flush the n-grams before reading them back.
    ofh.close()

    # Save vocabularies. <unk> is always the first vocabulary element, so
    # remove the special tokens and re-insert them at position 0. We need
    # the <null> token in the chart decoder in order to correctly estimate
    # the probabilities of incomplete subphrases that are not sentence
    # initial.
    del svocab["<null>"]
    del tvocab["<null>"]
    del svocab["<unk>"]
    del tvocab["<unk>"]
    svocab_list = [item[0] for item in svocab.most_common()]
    tvocab_list = [item[0] for item in tvocab.most_common()]
    tvocab_list.insert(0, "<null>")
    tvocab_list.insert(0, "<unk>")
    svocab_list.insert(0, "<unk>")

    # Append the tags returned by the n-gram extraction to both
    # vocabularies.
    tag_list = [item[0] for item in tags.most_common()]
    svocab_list = svocab_list + tag_list
    tvocab_list = tvocab_list + tag_list
    save_vocab(options.working_dir, "vocab.source", svocab_list)
    save_vocab(options.working_dir, "vocab.target", tvocab_list)

    # Create vocab dictionaries that map word to ID; source IDs continue
    # where the target IDs stop.
    tvocab_idmap = {}
    for i in range(len(tvocab_list)):
        tvocab_idmap[tvocab_list[i]] = i
    svocab_idmap = {}
    for i in range(len(svocab_list)):
        svocab_idmap[svocab_list[i]] = i + len(tvocab_idmap)

    # Numberize the file.
    numberized_file = options.working_dir + "/" + file_stem + ".numberized"
    ngrams_file_handle = open(ngram_file, 'r')
    numberized_file_handle = open(numberized_file, 'w')
    for line in ngrams_file_handle:
        numberized_file_handle.write(
            extract.numberize(line, options.m, options.n,
                              svocab_idmap, tvocab_idmap))
    numberized_file_handle.close()
    ngrams_file_handle.close()
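
# Example invocation of the training extraction (the script name and
# corpus files here are hypothetical):
#
#   python extract_training.py -f de -e en -c train.10k -a train.10k.align \
#       -w working -m 4 -n 5 -s 16000 -p 16000
#
# This writes working/train.10k.ngrams, working/train.10k.numberized and
# the vocab.source/vocab.target files consumed by the test extraction
# above.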
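
# Neither get_pruned_vocab() nor save_vocab() is shown in this section.
# Minimal sketches under two assumptions: the corpora are whitespace
# tokenised, and the vocabularies are collections.Counter objects (the
# .most_common() calls above rely on that interface, and Counter's
# __delitem__ keeps the del statements above safe even if a special token
# was pruned away).
from collections import Counter


def get_pruned_vocab(corpus, prune):
    counts = Counter()
    for line in open(corpus):
        counts.update(line.split())
    # Make sure the special tokens exist; the callers remove them and
    # re-insert them at fixed positions.
    counts["<unk>"] += 1
    counts["<null>"] += 1
    if prune:
        return Counter(dict(counts.most_common(prune)))
    return counts


def save_vocab(directory, filename, vocab_list):
    # One token per line, in ID order, matching what read_vocab() expects.
    vfh = open(directory + "/" + filename, "w")
    for word in vocab_list:
        print >>vfh, word
    vfh.close()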