Example #1
File: online.py Project: PhdDone/cdec
 def add_bitext(self, alignment_f, text_f, target_text_f=None):
     # Allow one or two args for bitext
     if target_text_f:
         t = itertools.izip(
             (line.strip() for line in gzip_or_text(text_f)),
             (line.strip() for line in gzip_or_text(target_text_f)))
     else:
         t = (line.strip().split(' ||| ') for line in gzip_or_text(text_f))
     a = (line.strip() for line in gzip_or_text(alignment_f))
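     # One alignment line is consumed per sentence pair via a.next()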
     for (source, target) in t:
         links = sorted(
             tuple(int(link) for link in link_str.split('-'))
             for link_str in a.next().split())
         self.update(source.split(), target.split(), links)
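
Note the Python 2 idioms here: itertools.izip and the generator method a.next() (zip and next(a) in Python 3). A usage sketch, assuming model is an instance of the class that defines add_bitext; the file names are placeholders:

# Hypothetical usage; alignment lines look like "0-0 1-2 2-1",
# one line per sentence pair, each link being source-target indices.
# Two-file form: separate source and target text plus alignments.
model.add_bitext('train.align.gz', 'train.src.gz', 'train.tgt.gz')
# One-file form: each line is "source ||| target".
model.add_bitext('train.align.gz', 'train.src-tgt.gz')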
Example #2
File: online.py Project: PhdDone/cdec
 def read(self, in_f):
     with gzip_or_text(in_f) as inp:
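         # Section 1: source (f) word counts, one "word count" pair
         # per line, terminated by a blank line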
         while True:
             line = inp.readline().strip()
             if not line:
                 break
             (w, c) = line.split()
             self.f[w] = int(c)
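         # Section 2: target (e) word counts, same format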
         while True:
             line = inp.readline().strip()
             if not line:
                 break
             (w, c) = line.split()
             self.e[w] = int(c)
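         # Section 3: co-occurrence counts for (f, e) pairs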
         while True:
             line = inp.readline().strip()
             if not line:
                 break
             (f, e, c) = line.split()
             self.fe[(f, e)] = float(c)
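
read expects three blank-line-terminated sections: source word counts, target word counts, then (f, e) pair counts. A sketch of the matching writer, assuming the same self.f, self.e, and self.fe tables (the method name write and the plain-text output are assumptions):

def write(self, out_f):
    # Hypothetical inverse of read above; section order and the
    # blank-line separators must match what read expects.
    with open(out_f, 'w') as out:
        for w, c in self.f.items():        # source word counts (int)
            out.write('{0} {1}\n'.format(w, c))
        out.write('\n')
        for w, c in self.e.items():        # target word counts (int)
            out.write('{0} {1}\n'.format(w, c))
        out.write('\n')
        for (f, e), c in self.fe.items():  # pair counts (float)
            out.write('{0} {1} {2}\n'.format(f, e, c))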
Example #3
File: online.py Project: PhdDone/cdec
def read_vocab(in_f):
    return set(line.strip() for line in gzip_or_text(in_f))
Example #4
File: online.py Project: PhdDone/cdec
def learn_vocab(text_f):
    vocab = set()
    for line in gzip_or_text(text_f):
        for word in line.strip().split():
            vocab.add(word)
    return vocab
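
The two vocabulary helpers are inverses of a sort: learn_vocab builds a word set from whitespace-tokenized text, and read_vocab reloads a set saved one word per line. A round-trip sketch (file names are placeholders):

# Hypothetical round trip: learn a vocabulary from training text,
# dump it one word per line, then reload it with read_vocab.
vocab = learn_vocab('train.txt.gz')
with open('vocab.txt', 'w') as out:
    for word in sorted(vocab):
        out.write(word + '\n')
assert read_vocab('vocab.txt') == vocab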
Example #5
    def __init__(self, config, online=False, vocab=None, features=None):
        if isinstance(config, basestring):
            if not os.path.exists(config):
                raise IOError('cannot read configuration from {0}'.format(config))
            config = cdec.configobj.ConfigObj(config, unrepr=True)
        alignment = cdec.sa.Alignment(from_binary=config['a_file'])
        self.factory = cdec.sa.HieroCachingRuleFactory(
                # compiled alignment object (REQUIRED)
                alignment,
                # name of generic nonterminal used by Hiero
                category="[X]",
                # maximum number of contiguous chunks of terminal symbols in RHS of a rule
                max_chunks=config['max_nt']+1,
                # maximum span of a grammar rule in TEST DATA
                max_initial_size=MAX_INITIAL_SIZE,
                # maximum number of symbols (both T and NT) allowed in a rule
                max_length=config['max_len'],
                # maximum number of nonterminals allowed in a rule (set >2 at your own risk)
                max_nonterminals=config['max_nt'],
                # maximum number of contiguous chunks of terminal symbols
                # in target-side RHS of a rule.
                max_target_chunks=config['max_nt']+1,
                # maximum number of target side symbols (both T and NT) allowed in a rule.
                max_target_length=MAX_INITIAL_SIZE,
                # minimum span of a nonterminal in the RHS of a rule in TEST DATA
                min_gap_size=1,
                # filename of file containing precomputed collocations
                precompute_file=config['precompute_file'],
                # maximum frequency rank of patterns used to compute triples (< 20)
                precompute_secondary_rank=config['rank2'],
                # maximum frequency rank of patterns used to compute collocations (< 300)
                precompute_rank=config['rank1'],
                # require extracted rules to have at least one aligned word
                require_aligned_terminal=True,
                # require each contiguous chunk of extracted rules
                # to have at least one aligned word
                require_aligned_chunks=False,
                # maximum span of a grammar rule extracted from TRAINING DATA
                train_max_initial_size=config['max_size'],
                # minimum span of an RHS nonterminal in a rule extracted from TRAINING DATA
                train_min_gap_size=config['min_gap'],
                # False if phrases should be loose (better but slower), True otherwise
                tight_phrases=config.get('tight_phrases', True),
                )

        # lexical weighting tables
        tt = cdec.sa.BiLex(from_binary=config['lex_file'])

        # TODO: clean this up
        extended_features = []
        if online:
            extended_features.append(IsSupportedOnline)
        if vocab:
            vcb_set = set(line.strip() for line in gzip_or_text(vocab))
            extended_features.append(CountExceptLM(vcb_set))
            extended_features.append(CountExceptLex(tt))
            
        # TODO: use @cdec.sa.features decorator for standard features too
        # + add a mask to disable features
        for f in cdec.sa._SA_FEATURES:
            extended_features.append(f)
            
        scorer = cdec.sa.Scorer(EgivenFCoherent, SampleCountF, CountEF, 
            MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE,
            *extended_features)

        fsarray = cdec.sa.SuffixArray(from_binary=config['f_sa_file'])
        edarray = cdec.sa.DataArray(from_binary=config['e_file'])

        # lower=faster, higher=better; improvements level off above 200-300 range,
        # -1 = don't sample, use all data (VERY SLOW!)
        sampler = cdec.sa.Sampler(300, fsarray)

        self.factory.configure(fsarray, edarray, sampler, scorer)
        # Initialize feature definitions with configuration
        for fn in cdec.sa._SA_CONFIGURE:
            fn(config)
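
The constructor accepts either a path to a configobj file (the basestring check is Python 2) or a dict-like object, and reads a fixed set of keys from it. A minimal sketch of such a config with placeholder paths and illustrative values; the enclosing class name GrammarExtractor is an assumption here:

# Hypothetical configuration; every key below is read by __init__ above,
# and the paths stand in for compiled cdec.sa binaries.
config = {
    'a_file': 'sa/a.bin',                # compiled alignment
    'lex_file': 'sa/lex.bin',            # bilexical weighting table
    'f_sa_file': 'sa/f.sa.bin',          # source-side suffix array
    'e_file': 'sa/e.bin',                # target-side data array
    'precompute_file': 'sa/precompute.bin',
    'max_len': 5,                        # max symbols per rule
    'max_nt': 2,                         # max nonterminals per rule
    'max_size': 15,                      # max training-data rule span
    'min_gap': 1,                        # min training-data nonterminal span
    'rank1': 100,                        # collocation pattern rank (< 300)
    'rank2': 10,                         # triple pattern rank (< 20)
    'tight_phrases': True,
}
extractor = GrammarExtractor(config)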