def expand_features(self, category, attributes, features):
    """Yield sparse feature strings of the form 'F<id>=<val>' for one rule.

    Three feature families are generated:
      1. unary indicators, one per morphological attribute;
      2. pairwise indicators for each unordered attribute pair;
      3. real-valued translation features crossed with each attribute.

    Feature names are mapped to integer ids via self.convert.
    """
    for morph in config.get_attributes(category, attributes):
        # Unary target-attribute indicator.
        fid = self.convert(morph)
        yield 'F{}=1'.format(fid)
        # Pairwise attribute indicators; the `<=` guard emits each
        # unordered pair exactly once and skips self-pairs.
        for morph2 in config.get_attributes(category, attributes):
            if morph2 <= morph:
                continue
            fid = self.convert(u'{}+{}'.format(morph, morph2))
            yield 'F{}=1'.format(fid)
        # Translation features crossed with this attribute.
        # .items() replaces the Python-2-only .iteritems(): same
        # semantics on Python 2, and also works on Python 3.
        for fname, fval in features.items():
            fid = self.convert(u'{}_{}'.format(morph, fname))
            yield 'F{}={}'.format(fid, fval)
def score_all(self, inflections, features):
    """Score every candidate inflection for a single source instance.

    Parameters:
        inflections: list of (tag, inflection) pairs to score.
        features: dict mapping feature name -> feature value.

    Returns:
        list of (log_probability, tag, inflection) triples, aligned with
        the input order of `inflections`.
    """
    X = self.feature_dict.transform([features])
    # One binary attribute-indicator label dict per candidate tag.
    # (The original enumerate() index was unused — plain iteration suffices.)
    Y_all = []
    for tag, _ in inflections:
        label = {attr: 1 for attr in config.get_attributes(self.category, tag)}
        Y_all.append(label)
    Y_all = self.label_dict.transform(Y_all)
    scores = self.model.predict_log_proba(X, Y_all)
    return [(score, tag, inflection)
            for score, (tag, inflection) in zip(scores, inflections)]
def score_all(self, inflections, features):
    """Return (log-probability, tag, inflection) triples for all candidates.

    Vectorizes one feature row against the label matrix built from the
    candidate tags and asks the model for per-candidate log-probabilities.
    """
    X = self.feature_dict.transform([features])
    # Build an attribute-indicator label for each candidate tag.
    labels = [
        {attr: 1 for attr in config.get_attributes(self.category, tag)}
        for tag, _ in inflections
    ]
    Y = self.label_dict.transform(labels)
    log_probs = self.model.predict_log_proba(X, Y)
    return [
        (lp, tag, infl)
        for lp, (tag, infl) in zip(log_probs, inflections)
    ]
def score(self, tag, features):
    """Linear score for `tag` given `features`.

    Computes the dot product between the feature values and the learned
    weights, where each weight key is '<attribute>_<feature_name>'.
    Unseen weights default to 0.
    """
    score = 0
    for attr in config.get_attributes(self.category, tag):
        # .items() replaces Python-2-only .iteritems() — identical
        # behavior on Python 2, portable to Python 3.
        for fname, fval in features.items():
            score += fval * self.weights.get(attr + '_' + fname, 0)
    return score
def main():
    """Create cdec CRF sentence grammars and a train.sgm file.

    Reads aligned sentences from stdin, writes one grammar file per
    training instance under <output>/grammars, and a weights.ini listing
    all feature ids seen.  Python 2 script (cPickle, iteritems).
    """
    logging.basicConfig(level=logging.INFO, format='%(message)s')
    parser = argparse.ArgumentParser(description='Create cdec CRF grammars and training data')
    parser.add_argument('category', help='Russian word category to (R/V/A/N/M)')
    parser.add_argument('rev_map', help='reverse inflection map')
    parser.add_argument('output', help='training output path')
    args = parser.parse_args()
    category = args.category
    logging.info('Loading reverse inflection map')
    # rev_map: presumably {(lemma, category): [(attributes, inflection), ...]}
    # — inferred from the membership tests below; confirm against pickler.
    with open(args.rev_map) as f:
        rev_map = cPickle.load(f)
    # Create training data paths
    if not os.path.exists(args.output):
        os.mkdir(args.output)
    grammar_path = os.path.join(args.output, 'grammars')
    if not os.path.exists(grammar_path):
        os.mkdir(grammar_path)
    sgm = io.open(os.path.join(args.output, 'train.sgm'), 'w', encoding='utf8')
    fvoc = Vocabulary()
    n_sentences = 0
    logging.info('Generating the grammars')
    for source, target, alignment in read_sentences(sys.stdin):
        n_sentences += 1
        # Periodic memory check: stop generating rather than thrash/crash.
        if n_sentences % 1000 == 0:
            if too_much_mem():
                logging.info('Running out of memory')
                break
        for word, features in extract_instances(category, source, target, alignment):
            inflection, lemma, tag = word
            # NOTE(review): `category` is rebound from the CLI value to the
            # tag's first character here — intentional per the tag format,
            # but it shadows the outer loop's argument on later iterations.
            category = tag[0]
            ref_attributes = tag[1:]
            possible_inflections = rev_map.get((lemma, category), [])
            # Skip instances whose reference inflection the map cannot produce.
            if (ref_attributes, inflection) not in possible_inflections:
                logging.debug('Skip: %s (%s)', inflection, ref_attributes)
                continue
            # Write sentence grammar (one uuid-named file per instance).
            grammar_name = os.path.join(grammar_path, uuid.uuid1().hex)
            with io.open(grammar_name, 'w', encoding='utf8') as grammar:
                for attributes, _ in possible_inflections:
                    rule = fvoc.make_rule(lemma, category, attributes, features)
                    grammar.write(rule)
            # Write src / ref
            src = lemma+'_'+category
            ref = ' '.join(config.get_attributes(category, ref_attributes))
            sgm.write(u'<seg grammar="{}"> {} ||| {} {} </seg>\n'.format(
                os.path.abspath(grammar_name), src, category, ref))
    logging.info('Processed %d sentences', n_sentences)
    logging.info('Saving weights')
    # Emit one zero-initialized weight line per feature id, with the
    # feature's readable name as a comment above it.
    ff_path = os.path.join(args.output, 'weights.ini')
    with io.open(ff_path, 'w', encoding='utf8') as f:
        for fname, fid in fvoc.iteritems():
            f.write(u'# {}\n'.format(fname))
            f.write(u'F{} 0\n'.format(fid))
    sgm.close()
def make_rule(self, lemma, category, attributes, features):
    """Build one synchronous-grammar rule line for cdec.

    Format: [S] ||| <lemma>_<category> ||| <category> <attributes> ||| <features>
    """
    source_side = lemma + '_' + category
    target_side = ' '.join(config.get_attributes(category, attributes))
    feature_str = ' '.join(self.expand_features(category, attributes, features))
    return u'[S] ||| {} ||| {} {} ||| {}\n'.format(
        source_side, category, target_side, feature_str)
def main():
    """Train the structured inflection model with SGD.

    Reads aligned sentences from stdin, builds the training matrices
    (X, Y_all, Y_star, Y_lim), and saves a pickled model after every
    iteration.  Python 2 script (cPickle, 'w'-mode pickle files).
    """
    logging.basicConfig(level=logging.INFO, format='%(message)s')
    parser = argparse.ArgumentParser(description='Trained stuctured model')
    parser.add_argument('category', help='target word category')
    parser.add_argument('rev_map', help='reverse inflection map')
    parser.add_argument('model', help='output directory for models')
    parser.add_argument('-i', '--n_iter', type=int, help='number of SGD iterations')
    parser.add_argument('-r', '--rate', type=float, help='SGD udpate rate')
    args = parser.parse_args()
    category = args.category
    logging.info('Loading reverse inflection map')
    with open(args.rev_map) as f:
        rev_map = cPickle.load(f)
    logging.info('Generating the training data')
    # X: feature dicts, one per training instance.
    # Y_all: label dicts for every candidate inflection (shared across
    #        instances with the same (lemma, category)).
    # Y_star: index of the reference inflection within its candidate set.
    # Y_lim: (start, end) range into Y_all for each instance.
    X = []
    Y_all = []
    Y_star = []
    Y_lim = []
    n = 0
    inflection_lims = {}  # inflection set cache (ranges for y in Y_all)
    for source, target, alignment in read_sentences(sys.stdin):
        for word, features in extract_instances(category, source, target, alignment):
            ref_inflection, lemma, tag = word
            # NOTE(review): rebinds `category` to the tag's first char,
            # shadowing the CLI value — matches the tag format used elsewhere.
            category = tag[0]
            ref_attributes = tag[1:]
            possible_inflections = rev_map.get((lemma, category), [])
            # Skip if |inflections| = 1 [p(infl | lemma) = 1]
            if len(possible_inflections) == 1:
                continue
            # Skip instances whose reference is not reachable from the map.
            if (ref_attributes, ref_inflection) not in possible_inflections:
                continue
            X.append(features)
            # Y_all / Y_lim
            lims = inflection_lims.get((lemma, category), None)
            if lims is None:
                # new set of inflections: append its labels once and
                # remember the (start, end) range for reuse.
                for i, (attributes, _) in enumerate(possible_inflections):
                    label = {attr: 1 for attr in config.get_attributes(category, attributes)}
                    Y_all.append(label)  # attributes map
                lims = (n, n+len(possible_inflections))
                inflection_lims[lemma, category] = lims
                n += len(possible_inflections)
            Y_lim.append(lims)
            # Y_star
            for i, (attributes, _) in enumerate(possible_inflections):
                if attributes == ref_attributes:
                    Y_star.append(i)
    # free some memory
    del rev_map
    if not os.path.exists(args.model):
        os.mkdir(args.model)
    def save_model(it, model):
        # Checkpoint the model after iteration `it` (1-based file names).
        with open(os.path.join(args.model, 'model.{}.pickle'.format(it+1)), 'w') as f:
            cPickle.dump(model, f, protocol=-1)
    model = StructuredModel(args.category)
    model.train(X, Y_all, Y_star, Y_lim, n_iter=args.n_iter,
                alpha_sgd=args.rate, every_iter=save_model)
def main():
    """Train the structured inflection model via SGD (duplicate of the
    training driver above, with expanded formatting).

    Collects training instances from stdin into X / Y_all / Y_star / Y_lim
    and checkpoints the model every iteration.  Python 2 script.
    """
    logging.basicConfig(level=logging.INFO, format='%(message)s')
    parser = argparse.ArgumentParser(description='Trained stuctured model')
    parser.add_argument('category', help='target word category')
    parser.add_argument('rev_map', help='reverse inflection map')
    parser.add_argument('model', help='output directory for models')
    parser.add_argument('-i', '--n_iter', type=int, help='number of SGD iterations')
    parser.add_argument('-r', '--rate', type=float, help='SGD udpate rate')
    args = parser.parse_args()
    category = args.category
    logging.info('Loading reverse inflection map')
    with open(args.rev_map) as f:
        rev_map = cPickle.load(f)
    logging.info('Generating the training data')
    X = []          # feature dicts, one per instance
    Y_all = []      # candidate labels, shared per (lemma, category)
    Y_star = []     # index of the reference candidate per instance
    Y_lim = []      # (start, end) range into Y_all per instance
    n = 0
    inflection_lims = {}  # inflection set cache (ranges for y in Y_all)
    for source, target, alignment in read_sentences(sys.stdin):
        for word, features in extract_instances(category, source, target, alignment):
            ref_inflection, lemma, tag = word
            # NOTE(review): `category` is rebound to tag[0] here, shadowing
            # the CLI argument — consistent with the tag format.
            category = tag[0]
            ref_attributes = tag[1:]
            possible_inflections = rev_map.get((lemma, category), [])
            # Skip if |inflections| = 1 [p(infl | lemma) = 1]
            if len(possible_inflections) == 1:
                continue
            if (ref_attributes, ref_inflection) not in possible_inflections:
                continue
            X.append(features)
            # Y_all / Y_lim
            lims = inflection_lims.get((lemma, category), None)
            if lims is None:
                # new set of inflections — emit labels once, cache the range
                for i, (attributes, _) in enumerate(possible_inflections):
                    label = {
                        attr: 1
                        for attr in config.get_attributes(
                            category, attributes)
                    }
                    Y_all.append(label)  # attributes map
                lims = (n, n + len(possible_inflections))
                inflection_lims[lemma, category] = lims
                n += len(possible_inflections)
            Y_lim.append(lims)
            # Y_star
            for i, (attributes, _) in enumerate(possible_inflections):
                if attributes == ref_attributes:
                    Y_star.append(i)
    # free some memory
    del rev_map
    if not os.path.exists(args.model):
        os.mkdir(args.model)
    def save_model(it, model):
        # Per-iteration checkpoint; file names are 1-based.
        with open(os.path.join(args.model,
                               'model.{}.pickle'.format(it + 1)), 'w') as f:
            cPickle.dump(model, f, protocol=-1)
    model = StructuredModel(args.category)
    model.train(X, Y_all, Y_star, Y_lim, n_iter=args.n_iter,
                alpha_sgd=args.rate, every_iter=save_model)
def score(self, tag, features):
    """Dot product of `features` with the weights selected by `tag`.

    For every attribute of `tag` and every (name, value) feature pair,
    accumulates value * weight['<attr>_<name>'] (missing weights are 0).
    """
    total = 0
    for attr in config.get_attributes(self.category, tag):
        # Portable dict iteration: .items() behaves like .iteritems()
        # on Python 2 and also runs on Python 3.
        for fname, fval in features.items():
            total += fval * self.weights.get(attr + '_' + fname, 0)
    return total