예제 #1
0
 def input(self, fwords, meta):
     """Collect per-position translation candidates from ``eword`` markup.

     After the call, ``self.ewords`` holds one dict per element of
     ``fwords``.  The dict at position ``i`` maps each candidate symbol
     (built with ``sym.fromstring``) to its cost; it stays empty when no
     markup entry carrying an ``eword`` attribute starts at ``i``.

     Args:
         fwords: iterable of source words; one empty dict is created per
             element.
         meta: iterable of ``(tag, attrs, i, j)`` tuples.  ``attrs`` is
             converted with ``sgml.attrs_to_dict`` before use.

     Recognised attributes (all ``|``-separated lists):
         eword: candidate words; each is stripped of surrounding whitespace.
         cost:  explicit costs, parsed with ``float``.
         prob:  probabilities, converted to costs as ``-log10(p)``; only
             consulted when ``cost`` is absent.
     When neither ``cost`` nor ``prob`` is given, every candidate gets the
     uniform cost ``-log10(1 / number_of_candidates)``.

     Note: candidates and costs are paired with ``zip``, so if their counts
     differ the surplus entries of the longer list are silently dropped.
     """
     self.ewords = [{} for _ in fwords]
     for (tag, attrs, i, j) in meta:
         attrs = sgml.attrs_to_dict(attrs)
         # Use the ``in`` operator (as the cost/prob checks below already
         # do) rather than dict.has_key, which no longer exists in Python 3.
         if 'eword' not in attrs:
             continue
         if j - i != 1:
             # Only warn: the candidates are still attached to position i.
             log.write("warning: eword attribute given for multi-word French expression")
         ewords = [sym.fromstring(e.strip()) for e in attrs['eword'].split('|')]
         if 'cost' in attrs:
             costs = [float(x) for x in attrs['cost'].split('|')]
         elif 'prob' in attrs:
             costs = [-math.log10(float(x)) for x in attrs['prob'].split('|')]
         else:
             # str.split always yields at least one item, so len(ewords) >= 1.
             uniform = -math.log10(1.0 / len(ewords))
             costs = [uniform] * len(ewords)
         self.ewords[i] = dict(zip(ewords, costs))
예제 #2
0
    def input(self, input):
        """Build span-indexed rules from ``english`` markup on the input.

        Resets ``self.rules`` to a ``defaultdict(list)`` keyed by ``(i, j)``.
        For every entry of ``input.fmeta`` whose attributes contain
        ``english``, one rule is appended per (English phrase, label) pair;
        each list item is a 1-tuple ``(rule,)``.

        Args:
            input: object providing ``fmeta`` (iterable of
                ``(tag, attrs, i, j)`` tuples) and ``fwords`` (sliced as
                ``fwords[i:j]`` to form the source side of each rule).

        Recognised attributes (``|``-separated unless noted):
            english:  target phrases; each is split on whitespace into words.
            cost:     one cost per phrase, parsed with ``float``.
            prob:     one probability per phrase, converted to ``-log10(p)``;
                      only consulted when ``cost`` is absent.
            features: one feature name per phrase.
            feature:  a single feature name used for every phrase (not
                      split); only consulted when ``features`` is absent.
            label:    left-hand-side labels; every phrase is paired with
                      every label.
        Defaults: uniform cost ``-log10(1 / number_of_phrases)``, feature
        name ``'sgml'``, and the upper-cased tag as the only label.

        Raises:
            ValueError: if the number of costs/probabilities or of feature
                names differs from the number of English phrases.
        """
        self.rules = collections.defaultdict(list)
        for tag, attrs, i, j in input.fmeta:
            attrs = sgml.attrs_to_dict(attrs)
            # ``in`` replaces dict.has_key, which was removed in Python 3.
            if 'english' not in attrs:
                continue
            ephrases = attrs['english'].split('|')

            if 'cost' in attrs:
                costs = [float(x) for x in attrs['cost'].split('|')]
            elif 'prob' in attrs:
                costs = [-math.log10(float(x)) for x in attrs['prob'].split('|')]
            else:
                # uniform; str.split always yields at least one phrase
                uniform = -math.log10(1.0 / len(ephrases))
                costs = [uniform] * len(ephrases)
            if len(costs) != len(ephrases):
                sys.stderr.write("wrong number of probabilities/costs")
                # Carry the diagnostic on the exception too, so callers that
                # catch ValueError can see what went wrong.
                raise ValueError("wrong number of probabilities/costs")

            if 'features' in attrs:
                features = attrs['features'].split('|')
                if len(features) != len(ephrases):
                    sys.stderr.write("wrong number of feature names")
                    raise ValueError("wrong number of feature names")
            elif 'feature' in attrs:
                features = [attrs['feature']] * len(ephrases)
            else:
                features = ['sgml'] * len(ephrases)

            if 'label' in attrs:
                labels = attrs['label'].split('|')
            else:
                labels = [tag.upper()]

            # bug: if new nonterminals are introduced at this point,
            # they will not participate in the topological sort

            for (ephrase, cost, feature) in zip(ephrases, costs, features):
                # A distinct name keeps the outer loop's ``tag`` intact.
                for label in labels:
                    r = rule.Rule(sym.fromtag(label),
                                  rule.Phrase(input.fwords[i:j]),
                                  rule.Phrase([sym.fromstring(e) for e in ephrase.split()]),
                                  scores=svector.Vector('%s' % feature, cost))
                    self.rules[i, j].append((r,))
예제 #3
0
 def input(self, fwords, meta):
     """Populate ``self.ewords`` with candidate translations per position.

     ``self.ewords`` becomes a list with one dict for each element of
     ``fwords``.  For every ``(tag, attrs, i, j)`` entry of ``meta`` whose
     attributes (after ``sgml.attrs_to_dict``) include ``eword``, the dict
     at index ``i`` is replaced by a mapping from candidate symbol to cost.

     Args:
         fwords: iterable of source words; determines how many dicts are
             created.
         meta: iterable of ``(tag, attrs, i, j)`` markup tuples.

     The ``eword`` value is split on ``|`` and each piece is stripped and
     passed to ``sym.fromstring``.  Costs are taken from the ``cost``
     attribute if present, otherwise from ``prob`` as ``-log10(p)``,
     otherwise they are uniform: ``-log10(1 / number_of_candidates)``.

     Note: ``zip`` pairs candidates with costs, so a count mismatch
     silently drops the extra items of the longer list.
     """
     self.ewords = [{} for _ in fwords]
     for (tag, attrs, i, j) in meta:
         attrs = sgml.attrs_to_dict(attrs)
         # Membership via ``in`` matches the cost/prob checks below and,
         # unlike dict.has_key, also works on Python 3.
         if 'eword' in attrs:
             if j - i != 1:
                 # Warning only; the entry is still recorded at index i.
                 log.write(
                     "warning: eword attribute given for multi-word French expression"
                 )
             ewords = [
                 sym.fromstring(piece.strip())
                 for piece in attrs['eword'].split('|')
             ]
             if 'cost' in attrs:
                 costs = [float(x) for x in attrs['cost'].split('|')]
             elif 'prob' in attrs:
                 costs = [
                     -math.log10(float(x)) for x in attrs['prob'].split('|')
                 ]
             else:
                 # split() never returns an empty list, so no division by 0.
                 uniform_cost = -math.log10(1.0 / len(ewords))
                 costs = [uniform_cost for _ in ewords]
             self.ewords[i] = dict(zip(ewords, costs))
예제 #4
0
파일: extractor.py 프로젝트: awildfox/cdec
        if log.level >= 1:
            log.write("Reading configuration from %s\n" % opts.config)
        execfile(opts.config)

    if len(args) >= 1 and args[0] != "-":
        input_file = file(args[0], "r")
    else:
        input_file = sys.stdin

    if len(args) >= 2 and args[1] != "-":
        output_file = file(args[1], "w")
    else:
        output_file = sys.stdout

    gc.collect()
    if log.level >= 1:
        log.write("all structures loaded, memory %s, time %s\n" % (monitor.memory(), monitor.cpu()))
        log.write("models: %s\n" % (" ".join(str(x.name) for x in models)))

    sents = sgml.read_raw(input_file)
    for sent in sents:
        mark = sent.getmark()
        if mark is not None:
            (tag, attrs) = mark
            if tag == "seg":
                sent.unmark()
                dattrs = sgml.attrs_to_dict(attrs)
                sent.meta = attrs
        extract_grammar(sent)