def fromstring(cls, input, features=None, logic_parser=None, fstruct_reader=None, encoding=None):
    """
    Return a feature-structure-based ``FeatureGrammar``.

    :param input: a grammar, either in the form of a string or else
        as a list of strings.
    :param features: a tuple of features (default: SLASH, TYPE)
    :param logic_parser: a parser for lambda-expressions,
        by default ``LogicParser()``
    :param fstruct_reader: a feature structure parser
        (only if ``features`` and ``logic_parser`` are None)
    """
    if features is None:
        features = (TYPE, SLASH)

    if fstruct_reader is None:
        fstruct_reader = FeatStructReader(features, FeatStructNonterminal,
                                          FeatListNonterminal,
                                          logic_parser=logic_parser)
    elif logic_parser is not None:
        raise Exception("'logic_parser' and 'fstruct_reader' must "
                        "not both be set")

    start, productions = read_grammar(input, fstruct_reader.read_partial,
                                      encoding=encoding)

    # Add the whole lexicon
    # for wordtuple, featlist in lexicon.lexicon.items():
    #     for lexent in featlist:
    #         lexlhs = lexent
    #         newprod = Production(lexlhs, ['_'.join(wordtuple)])
    #         productions.append(newprod)

    return FGGrammar(start, productions)
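# Hedged usage sketch (not part of the original source): the constructor above mirrors
# nltk.grammar.FeatureGrammar.fromstring, except that it returns an FGGrammar. Assuming
# that behaviour, a grammar can be built from a plain string in NLTK's feature-grammar
# notation; the toy grammar and the name demo_fcfg below are invented for illustration.
from nltk.grammar import FeatureGrammar

demo_fcfg = FeatureGrammar.fromstring("""
    % start S
    S -> NP[NUM=?n] VP[NUM=?n]
    NP[NUM=?n] -> N[NUM=?n]
    VP[NUM=?n] -> V[NUM=?n]
    N[NUM=sg] -> 'dog'
    V[NUM=sg] -> 'barks'
""")
# The start symbol and the parsed productions are available on the grammar object.
print(demo_fcfg.start(), len(demo_fcfg.productions()))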
def fromstring(transitions, *finals):
    """
    Constructs a BUTA from a string representation of the transitions.

    :type transitions: str
    :param transitions: The transitions of the tree automaton, in string
        representation
    :type finals: unicode
    :param finals: The accept states of the tree automaton
    :rtype: BUTA
    :return: The BUTA described by the parameters
    """
    _, rules = gr.read_grammar(transitions, gr.standard_nonterm_parser)
    return BUTA(rules, set(gr.Nonterminal(nt) for nt in finals))
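# Hedged usage sketch (not part of the original source): because the transitions are parsed
# with nltk.grammar.read_grammar and the standard nonterminal parser, they are written in
# ordinary CFG notation, with each left-hand side naming a state and quoted symbols acting
# as leaf labels. The state names q0/q1 and the transitions below are invented for
# illustration, and it is assumed this function is exposed as BUTA.fromstring; "q1" is
# passed as the single accept state.
#
#     buta = BUTA.fromstring("""
#         q0 -> 'a'
#         q1 -> q0 q0
#     """, "q1")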
def main():
    options = parse_args()

    # parameter set 1
    #assert(options.corpus_name!=None);
    assert (options.input_directory != None)
    assert (options.output_directory != None)

    input_directory = options.input_directory
    input_directory = input_directory.rstrip("/")
    corpus_name = os.path.basename(input_directory)

    output_directory = options.output_directory
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)
    output_directory = os.path.join(output_directory, corpus_name)
    if not os.path.exists(output_directory):
        os.mkdir(output_directory)

    assert (options.grammar_file != None)
    grammar_file = options.grammar_file
    assert (os.path.exists(grammar_file))

    # Documents
    train_docs = []
    input_stream = open(os.path.join(input_directory, 'train.dat'), 'r')
    for line in input_stream:
        train_docs.append(line.strip())
    input_stream.close()
    print("successfully loaded all training documents...")

    # parameter set 2
    if options.number_of_documents > 0:
        number_of_documents = options.number_of_documents
    else:
        number_of_documents = len(train_docs)

    if options.batch_size > 0:
        batch_size = options.batch_size
    else:
        batch_size = number_of_documents
    #assert(number_of_documents % batch_size==0);

    training_iterations = number_of_documents // batch_size
    if options.training_iterations > 0:
        training_iterations = options.training_iterations
    #training_iterations=int(math.ceil(1.0*number_of_documents/batch_size));

    #multiprocesses = options.multiprocesses;
    assert (options.number_of_processes >= 0)
    number_of_processes = options.number_of_processes

    # parameter set 3
    assert (options.grammaton_prune_interval > 0)
    grammaton_prune_interval = options.grammaton_prune_interval

    snapshot_interval = grammaton_prune_interval
    if options.snapshot_interval > 0:
        snapshot_interval = options.snapshot_interval

    assert (options.tau >= 0)
    tau = options.tau

    #assert(options.kappa>=0.5 and options.kappa<=1);
    assert (options.kappa >= 0 and options.kappa <= 1)
    kappa = options.kappa
    if batch_size <= 0:
        print("warning: running in batch mode...")
        kappa = 0

    # read in adaptor grammars
    desired_truncation_level = {}
    alpha_pi = {}
    beta_pi = {}

    grammar_rules = []
    adapted_non_terminals = set()
    #for line in codecs.open(grammar_file, 'r', encoding='utf-8'):
    for line in open(grammar_file, 'r'):
        line = line.strip()
        if line.startswith("%"):
            continue
        if line.startswith("@"):
            tokens = line.split()
            assert (len(tokens) == 5)
            adapted_non_terminal = nltk.Nonterminal(tokens[1])
            adapted_non_terminals.add(adapted_non_terminal)
            desired_truncation_level[adapted_non_terminal] = int(tokens[2])
            alpha_pi[adapted_non_terminal] = float(tokens[3])
            beta_pi[adapted_non_terminal] = float(tokens[4])
            continue
        grammar_rules.append(line)
    grammar_rules = "\n".join(grammar_rules)

    # Warning: if you are using nltk 2.x, please use parse_grammar()
    #from nltk.grammar import parse_grammar, standard_nonterm_parser
    #start, productions = parse_grammar(grammar_rules, standard_nonterm_parser, probabilistic=False)
    from nltk.grammar import read_grammar, standard_nonterm_parser
    start, productions = read_grammar(grammar_rules, standard_nonterm_parser, probabilistic=False)
    print("start, productions: ", start, productions)

    # create output directory
    now = datetime.datetime.now()
    suffix = now.strftime("%y%b%d-%H%M%S") + ""
    #desired_truncation_level_string = "".join(["%s%d" % (symbol, desired_truncation_level[symbol]) for symbol in desired_truncation_level]);
    #alpha_pi_string = "".join(["%s%d" % (symbol, alpha_pi[symbol]) for symbol in alpha_pi]);
    #beta_pi_string = "".join(["%s%d" % (symbol, beta_pi[symbol]) for symbol in beta_pi]);
    #output_directory += "-" + str(now.microsecond) + "/";
    suffix += "-D%d-P%d-S%d-B%d-O%d-t%d-k%g-G%s/" % (
        number_of_documents,
        #number_of_topics,
        grammaton_prune_interval,
        snapshot_interval,
        batch_size,
        training_iterations,
        tau,
        kappa,
        #alpha_theta,
        #alpha_pi_string,
        #beta_pi_string,
        #desired_truncation_level_string,
        os.path.basename(grammar_file))

    output_directory = os.path.join(output_directory, suffix)
    os.mkdir(os.path.abspath(output_directory))

    # store all the options to a file
    options_output_file = open(output_directory + "option.txt", 'w')
    # parameter set 1
    options_output_file.write("input_directory=" + input_directory + "\n")
    options_output_file.write("corpus_name=" + corpus_name + "\n")
    options_output_file.write("grammar_file=" + str(grammar_file) + "\n")
    # parameter set 2
    options_output_file.write("number_of_processes=" + str(number_of_processes) + "\n")
    #options_output_file.write("multiprocesses=" + str(multiprocesses) + "\n");
    options_output_file.write("number_of_documents=" + str(number_of_documents) + "\n")
    options_output_file.write("batch_size=" + str(batch_size) + "\n")
    options_output_file.write("training_iterations=" + str(training_iterations) + "\n")
    # parameter set 3
    options_output_file.write("grammaton_prune_interval=" + str(grammaton_prune_interval) + "\n")
    options_output_file.write("snapshot_interval=" + str(snapshot_interval) + "\n")
    options_output_file.write("tau=" + str(tau) + "\n")
    options_output_file.write("kappa=" + str(kappa) + "\n")
    # parameter set 4
    #options_output_file.write("alpha_theta=" + str(alpha_theta) + "\n");
    options_output_file.write("alpha_pi=%s\n" % alpha_pi)
    options_output_file.write("beta_pi=%s\n" % beta_pi)
    options_output_file.write("desired_truncation_level=%s\n" % desired_truncation_level)
    # parameter set 5
    #options_output_file.write("heldout_data=" + str(heldout_data) + "\n");
    options_output_file.close()

    print("========== ========== ========== ========== ==========")
    # parameter set 1
    print("output_directory=" + output_directory)
    print("input_directory=" + input_directory)
    print("corpus_name=" + corpus_name)
    print("grammar_file=" + str(grammar_file))
    # parameter set 2
    print("number_of_documents=" + str(number_of_documents))
    print("batch_size=" + str(batch_size))
    print("training_iterations=" + str(training_iterations))
    print("number_of_processes=" + str(number_of_processes))
    #print("multiprocesses=" + str(multiprocesses))
    # parameter set 3
    print("grammaton_prune_interval=" + str(grammaton_prune_interval))
    print("snapshot_interval=" + str(snapshot_interval))
    print("tau=" + str(tau))
    print("kappa=" + str(kappa))
    # parameter set 4
    #print("alpha_theta=" + str(alpha_theta))
    print("alpha_pi=%s" % alpha_pi)
    print("beta_pi=%s" % beta_pi)
    print("desired_truncation_level=%s" % desired_truncation_level)
    # parameter set 5
    #print("heldout_data=" + str(heldout_data))
    print("========== ========== ========== ========== ==========")

    import hybrid
    print("passing productions: ", productions)
    adagram_inferencer = hybrid.Hybrid(start, productions, adapted_non_terminals)

    adagram_inferencer._initialize(number_of_documents, batch_size, tau, kappa,
                                   alpha_pi, beta_pi, None,
                                   desired_truncation_level,
                                   grammaton_prune_interval)

    '''
    clock_iteration = time.time()
    clock_e_step, clock_m_step = adagram_inferencer.seed(train_docs)
    clock_iteration = time.time() - clock_iteration
    print('E-step, M-step and Seed take %g, %g and %g seconds respectively...'
          % (clock_e_step, clock_m_step, clock_iteration))
    '''

    #adagram_inferencer.export_adaptor_grammar(os.path.join(output_directory, "infag-0"))
    #adagram_inferencer.export_aggregated_adaptor_grammar(os.path.join(output_directory, "ag-0"))

    random.shuffle(train_docs)

    training_clock = time.time()
    snapshot_clock = time.time()
    for iteration in range(int(training_iterations)):
        start_index = batch_size * iteration
        end_index = batch_size * (iteration + 1)
        # floor division: check whether this mini-batch wraps past the end of the corpus
        if start_index // number_of_documents < end_index // number_of_documents:
            #train_doc_set = train_docs[(batch_size * iteration) % (number_of_documents):] + train_docs[:(batch_size * (iteration + 1)) % (number_of_documents)];
            train_doc_set = train_docs[(batch_size * iteration) % number_of_documents:]
            random.shuffle(train_docs)
            train_doc_set += train_docs[:(batch_size * (iteration + 1)) % number_of_documents]
        else:
            train_doc_set = train_docs[(batch_size * iteration) % number_of_documents:
                                       (batch_size * (iteration + 1)) % number_of_documents]

        clock_iteration = time.time()
        #print("processing document:", train_doc_set)
        clock_e_step, clock_m_step = adagram_inferencer.learning(train_doc_set, number_of_processes)

        if (iteration + 1) % snapshot_interval == 0:
            #pickle_file = open(os.path.join(output_directory, "model-%d" % (adagram_inferencer._counter + 1)), 'wb');
            #pickle.dump(adagram_inferencer, pickle_file);
            #pickle_file.close();
            adagram_inferencer.export_adaptor_grammar(
                os.path.join(output_directory, "infag-" + str(iteration + 1)))
            #adagram_inferencer.export_aggregated_adaptor_grammar(os.path.join(output_directory, "ag-" + str(iteration + 1)))

        if (iteration + 1) % 1000 == 0:
            snapshot_clock = time.time() - snapshot_clock
            print('Processing 1000 mini-batches takes %g seconds...' % snapshot_clock)
            snapshot_clock = time.time()

        clock_iteration = time.time() - clock_iteration
        print('E-step, M-step and iteration %d take %g, %g and %g seconds respectively...'
              % (adagram_inferencer._counter, clock_e_step, clock_m_step, clock_iteration))

    adagram_inferencer.export_adaptor_grammar(
        os.path.join(output_directory, "infag-" + str(adagram_inferencer._counter + 1)))
    #adagram_inferencer.export_aggregated_adaptor_grammar(os.path.join(output_directory, "ag-" + str(iteration + 1)))

    pickle_file = open(os.path.join(output_directory, "model-%d" % (iteration + 1)), 'wb')
    pickle.dump(adagram_inferencer, pickle_file)
    pickle_file.close()

    training_clock = time.time() - training_clock
    print('Training finished in %g seconds...' % training_clock)
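# Hedged illustration (not part of the original source) of the grammar file that main()
# expects: '%' starts a comment line; an '@' line declares an adapted nonterminal together
# with its desired truncation level, alpha_pi and beta_pi (exactly five whitespace-separated
# tokens); every remaining line is an NLTK CFG production passed to read_grammar. The
# nonterminal names and numeric values below are invented for illustration.
#
#     % word-segmentation style adaptor grammar
#     @ Word 1000 0.01 100.0
#     Sentence -> Word Sentence
#     Sentence -> Word
#     Word -> Chars
#     Chars -> Char Chars
#     Chars -> Char
#     Char -> 'a'
#     Char -> 'b'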
def main():
    options = parse_args();

    # parameter set 1
    #assert(options.corpus_name!=None);
    assert(options.input_directory!=None);
    assert(options.output_directory!=None);

    input_directory = options.input_directory;
    input_directory = input_directory.rstrip("/");
    corpus_name = os.path.basename(input_directory);

    output_directory = options.output_directory;
    if not os.path.exists(output_directory):
        os.mkdir(output_directory);
    output_directory = os.path.join(output_directory, corpus_name);
    if not os.path.exists(output_directory):
        os.mkdir(output_directory);

    assert(options.grammar_file!=None);
    grammar_file = options.grammar_file;
    assert(os.path.exists(grammar_file));

    # Documents
    train_docs = [];
    input_stream = open(os.path.join(input_directory, 'train.dat'), 'r');
    for line in input_stream:
        train_docs.append(line.strip());
    input_stream.close();
    print "successfully load all training documents..."

    # parameter set 2
    if options.number_of_documents>0:
        number_of_documents = options.number_of_documents;
    else:
        number_of_documents = len(train_docs)

    if options.batch_size>0:
        batch_size = options.batch_size;
    else:
        batch_size = number_of_documents
    #assert(number_of_documents % batch_size==0);

    training_iterations = number_of_documents/batch_size;
    if options.training_iterations>0:
        training_iterations = options.training_iterations;
    #training_iterations=int(math.ceil(1.0*number_of_documents/batch_size));

    #multiprocesses = options.multiprocesses;
    assert(options.number_of_processes>=0);
    number_of_processes = options.number_of_processes;

    # parameter set 3
    assert(options.grammaton_prune_interval>0);
    grammaton_prune_interval = options.grammaton_prune_interval;

    snapshot_interval = grammaton_prune_interval;
    if options.snapshot_interval>0:
        snapshot_interval = options.snapshot_interval;

    assert(options.tau>=0);
    tau = options.tau;

    #assert(options.kappa>=0.5 and options.kappa<=1);
    assert(options.kappa>=0 and options.kappa<=1);
    kappa = options.kappa;
    if batch_size<=0:
        print "warning: running in batch mode..."
        kappa = 0;

    # read in adaptor grammars
    desired_truncation_level = {};
    alpha_pi = {};
    beta_pi = {};

    grammar_rules = [];
    adapted_non_terminals = set();
    #for line in codecs.open(grammar_file, 'r', encoding='utf-8'):
    for line in open(grammar_file, 'r'):
        line = line.strip();
        if line.startswith("%"):
            continue;
        if line.startswith("@"):
            tokens = line.split();
            assert(len(tokens)==5);
            adapted_non_terminal = nltk.Nonterminal(tokens[1]);
            adapted_non_terminals.add(adapted_non_terminal);
            desired_truncation_level[adapted_non_terminal] = int(tokens[2]);
            alpha_pi[adapted_non_terminal] = float(tokens[3]);
            beta_pi[adapted_non_terminal] = float(tokens[4]);
            continue;
        grammar_rules.append(line);
    grammar_rules = "\n".join(grammar_rules)

    # Warning: if you are using nltk 2.x, please use parse_grammar()
    #from nltk.grammar import parse_grammar, standard_nonterm_parser
    #start, productions = parse_grammar(grammar_rules, standard_nonterm_parser, probabilistic=False)
    from nltk.grammar import read_grammar, standard_nonterm_parser
    start, productions = read_grammar(grammar_rules, standard_nonterm_parser, probabilistic=False)

    # create output directory
    now = datetime.datetime.now();
    suffix = now.strftime("%y%b%d-%H%M%S")+"";
    #desired_truncation_level_string = "".join(["%s%d" % (symbol, desired_truncation_level[symbol]) for symbol in desired_truncation_level]);
    #alpha_pi_string = "".join(["%s%d" % (symbol, alpha_pi[symbol]) for symbol in alpha_pi]);
    #beta_pi_string = "".join(["%s%d" % (symbol, beta_pi[symbol]) for symbol in beta_pi]);
    #output_directory += "-" + str(now.microsecond) + "/";
    suffix += "-D%d-P%d-S%d-B%d-O%d-t%d-k%g-G%s/" % (number_of_documents,
                                                     #number_of_topics,
                                                     grammaton_prune_interval,
                                                     snapshot_interval,
                                                     batch_size,
                                                     training_iterations,
                                                     tau,
                                                     kappa,
                                                     #alpha_theta,
                                                     #alpha_pi_string,
                                                     #beta_pi_string,
                                                     #desired_truncation_level_string,
                                                     os.path.basename(grammar_file)
                                                     );

    output_directory = os.path.join(output_directory, suffix);
    os.mkdir(os.path.abspath(output_directory));

    # store all the options to a file
    options_output_file = open(output_directory + "option.txt", 'w');
    # parameter set 1
    options_output_file.write("input_directory=" + input_directory + "\n");
    options_output_file.write("corpus_name=" + corpus_name + "\n");
    options_output_file.write("grammar_file=" + str(grammar_file) + "\n");
    # parameter set 2
    options_output_file.write("number_of_processes=" + str(number_of_processes) + "\n");
    #options_output_file.write("multiprocesses=" + str(multiprocesses) + "\n");
    options_output_file.write("number_of_documents=" + str(number_of_documents) + "\n");
    options_output_file.write("batch_size=" + str(batch_size) + "\n");
    options_output_file.write("training_iterations=" + str(training_iterations) + "\n");
    # parameter set 3
    options_output_file.write("grammaton_prune_interval=" + str(grammaton_prune_interval) + "\n");
    options_output_file.write("snapshot_interval=" + str(snapshot_interval) + "\n");
    options_output_file.write("tau=" + str(tau) + "\n");
    options_output_file.write("kappa=" + str(kappa) + "\n");
    # parameter set 4
    #options_output_file.write("alpha_theta=" + str(alpha_theta) + "\n");
    options_output_file.write("alpha_pi=%s\n" % alpha_pi);
    options_output_file.write("beta_pi=%s\n" % beta_pi);
    options_output_file.write("desired_truncation_level=%s\n" % desired_truncation_level);
    # parameter set 5
    #options_output_file.write("heldout_data=" + str(heldout_data) + "\n");
    options_output_file.close()

    print "========== ========== ========== ========== =========="
    # parameter set 1
    print "output_directory=" + output_directory
    print "input_directory=" + input_directory
    print "corpus_name=" + corpus_name
    print "grammar_file=" + str(grammar_file)
    # parameter set 2
    print "number_of_documents=" + str(number_of_documents)
    print "batch_size=" + str(batch_size)
    print "training_iterations=" + str(training_iterations)
    print "number_of_processes=" + str(number_of_processes)
    #print "multiprocesses=" + str(multiprocesses)
    # parameter set 3
    print "grammaton_prune_interval=" + str(grammaton_prune_interval)
    print "snapshot_interval=" + str(snapshot_interval);
    print "tau=" + str(tau)
    print "kappa=" + str(kappa)
    # parameter set 4
    #print "alpha_theta=" + str(alpha_theta)
    print "alpha_pi=%s" % alpha_pi
    print "beta_pi=%s" % beta_pi
    print "desired_truncation_level=%s" % desired_truncation_level
    # parameter set 5
    #print "heldout_data=" + str(heldout_data)
    print "========== ========== ========== ========== =========="

    import hybrid;
    adagram_inferencer = hybrid.Hybrid(start,
                                       productions,
                                       adapted_non_terminals
                                       );

    adagram_inferencer._initialize(number_of_documents,
                                   batch_size,
                                   tau,
                                   kappa,
                                   alpha_pi,
                                   beta_pi,
                                   None,
                                   desired_truncation_level,
                                   grammaton_prune_interval
                                   );

    '''
    clock_iteration = time.time();
    clock_e_step, clock_m_step = adagram_inferencer.seed(train_docs);
    clock_iteration = time.time()-clock_iteration;
    print 'E-step, M-step and Seed take %g, %g and %g seconds respectively...' % (clock_e_step, clock_m_step, clock_iteration);
    '''

    #adagram_inferencer.export_adaptor_grammar(os.path.join(output_directory, "infag-0"))
    #adagram_inferencer.export_aggregated_adaptor_grammar(os.path.join(output_directory, "ag-0"))

    random.shuffle(train_docs);

    training_clock = time.time();
    snapshot_clock = time.time();
    for iteration in xrange(training_iterations):
        start_index = batch_size * iteration;
        end_index = batch_size * (iteration + 1);
        if start_index / number_of_documents < end_index / number_of_documents:
            #train_doc_set = train_docs[(batch_size * iteration) % (number_of_documents) :] + train_docs[: (batch_size * (iteration+1)) % (number_of_documents)];
            train_doc_set = train_docs[(batch_size * iteration) % (number_of_documents) :];
            random.shuffle(train_docs);
            train_doc_set += train_docs[: (batch_size * (iteration+1)) % (number_of_documents)];
        else:
            train_doc_set = train_docs[(batch_size * iteration) % (number_of_documents) : (batch_size * (iteration+1)) % number_of_documents];

        clock_iteration = time.time();
        #print "processing document:", train_doc_set
        clock_e_step, clock_m_step = adagram_inferencer.learning(train_doc_set, number_of_processes);

        if (iteration+1)%snapshot_interval==0:
            #cpickle_file = open(os.path.join(output_directory, "model-%d" % (adagram_inferencer._counter+1)), 'wb');
            #cPickle.dump(adagram_inferencer, cpickle_file);
            #cpickle_file.close();
            adagram_inferencer.export_adaptor_grammar(os.path.join(output_directory, "infag-" + str((iteration+1))))
            #adagram_inferencer.export_aggregated_adaptor_grammar(os.path.join(output_directory, "ag-" + str((iteration+1))))

        if (iteration+1) % 1000==0:
            snapshot_clock = time.time() - snapshot_clock;
            print 'Processing 1000 mini-batches take %g seconds...' % (snapshot_clock);
            snapshot_clock = time.time()

        clock_iteration = time.time()-clock_iteration;
        print 'E-step, M-step and iteration %d take %g, %g and %g seconds respectively...' % (
            adagram_inferencer._counter, clock_e_step, clock_m_step, clock_iteration);

    adagram_inferencer.export_adaptor_grammar(os.path.join(output_directory, "infag-" + str(adagram_inferencer._counter+1)))
    #adagram_inferencer.export_aggregated_adaptor_grammar(os.path.join(output_directory, "ag-" + str((iteration+1))))

    cpickle_file = open(os.path.join(output_directory, "model-%d" % (iteration+1)), 'wb');
    cPickle.dump(adagram_inferencer, cpickle_file);
    cpickle_file.close();

    training_clock = time.time()-training_clock;
    print 'Training finished in %g seconds...' % (training_clock);