Пример #1
0
def parser(line):
    """Seed a chart with one-word spans for ``line`` and return its tree.

    For every position ``i`` the grammar file ``pcfg`` is searched for lines
    of the form ``X -> <word> # <prob>``.  Each hit adds the state
    ``(X, i, i + 1)`` with its log10 probability and a back-pointer holding
    the rule text.  If no line matches the word, every line mentioning
    ``<unk>`` is used instead.  The seeded chart is then handed to
    ``parse`` and the result for ``('TOP', 0, len(line))`` is rendered by
    ``trees``.

    Args:
        line: indexable sequence of tokens (one chart position per item).

    Returns:
        Whatever ``trees(root, bps)`` returns, or ``''`` if it raises.
    """
    states = []
    bps = {}
    best = {}
    unk_pattern = re.compile(r'.*<unk>.*')
    rules = None  # grammar lines, read once on first use instead of per word

    for i, word in enumerate(line):
        if rules is None:
            with open('pcfg') as grammar_file:
                rules = grammar_file.readlines()
        # re.escape neutralises *every* regex metacharacter in the token
        # (the old code only handled '.' and '?', so '(', '*', '+', '$' ...
        # either mis-matched or raised re.error).
        word_pattern = re.compile(r'.*->\s' + re.escape(word) + r'\s#\s.*')
        if not _seed_preterminals(rules, word_pattern, i, states, bps, best):
            # Unknown word: fall back to the grammar's <unk> rules.
            _seed_preterminals(rules, unk_pattern, i, states, bps, best)

    states, bps, best = parse(states, bps, best, line)
    root = ('TOP', 0, len(line))
    try:
        return trees(root, bps)
    except Exception:
        # Best effort: a sentence without a complete parse yields ''.
        # Narrowed from a bare except so Ctrl-C / SystemExit still propagate.
        return ''


def _seed_preterminals(rules, pattern, i, states, bps, best):
    """Add a ``(X, i, i + 1)`` entry for each grammar line matching ``pattern``.

    Returns True when at least one line matched.
    """
    found = False
    for rule in rules:
        if not pattern.match(rule):
            continue
        lhs = re.match(r'(.*)\s->.*', rule).group(1)
        logprob = bigfloat.log10(bigfloat.bigfloat(
            float(re.match(r'.*\s#\s(.*)\b', rule).group(1))))
        state = (lhs, i, i + 1)
        states.append(state)
        best[state] = logprob
        # One-element back-pointer: just the rule text before the '#'.
        bps[state] = (re.match(r'(.*)\s#.*', rule).group(1),)
        found = True
    return found
def main():
    """Estimate rule probabilities from a treebank and write the grammar.

    Each line of ``--infile`` is parsed with ``Tree.fromstring`` and its
    productions are counted.  For every distinct production the relative
    frequency ``count(production) / count(production.lhs())`` is written as
    ``<production> # <prob>`` to ``--outfile``, and the same line with the
    log10 of the probability is written to the file ``pcfg_log`` in the
    current directory.

    A temporary work directory is created; it is removed at exit unless
    ``--debug`` is set, in which case its path is printed instead.
    """
    parser = argparse.ArgumentParser(
        description=
        "ignore input; make a demo grammar that is compliant in form",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    addonoffarg(parser, 'debug', help="debug mode", default=False)
    parser.add_argument("--infile",
                        "-i",
                        nargs='?',
                        type=argparse.FileType('r'),
                        default=sys.stdin,
                        help="input file (ignored)")
    parser.add_argument("--outfile",
                        "-o",
                        nargs='?',
                        type=argparse.FileType('w'),
                        default=sys.stdout,
                        help="output file (grammar)")

    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    workdir = tempfile.mkdtemp(prefix=os.path.basename(__file__),
                               dir=os.getenv('TMPDIR', '/tmp'))

    def cleanwork():
        shutil.rmtree(workdir, ignore_errors=True)

    if args.debug:
        print(workdir)
    else:
        atexit.register(cleanwork)

    infile = prepfile(args.infile, 'r')
    outfile = prepfile(args.outfile, 'w')

    # Count productions and their left-hand sides directly instead of first
    # collecting every occurrence in intermediate lists.
    lhs_counts = Counter()
    rule_counts = Counter()
    for tree in infile:
        for rule in Tree.fromstring(tree).productions():
            lhs_counts[rule.lhs()] += 1
            rule_counts[rule] += 1

    # The context manager guarantees the log is flushed and closed even if
    # writing fails part-way through.
    with open('pcfg_log', 'w') as fh:
        for production, count in rule_counts.items():
            prob = bigfloat.bigfloat(count) / bigfloat.bigfloat(
                lhs_counts[production.lhs()])
            outfile.write('{0} # {1} \n'.format(production, prob))
            fh.write('{0} # {1} \n'.format(production, bigfloat.log10(prob)))
def q1_parse_input_trees(tree_unk_file):
    f = open(tree_unk_file, 'r')

    fileData = f.read()
    data = fileData.split('\n')

    inputTrees = []
    for line in data:
        if line == '': continue
        inputTrees.append(Tree.from_str(line))

    rules_dict = {}
    for tree in inputTrees:
        if tree == '': continue

        nodes = tree.bottomup()
        children = None

        for node in nodes:
            children = node.children
            if children == []: continue

            rules_dict.setdefault(str(node), {})
            # if leaf node(a terminal), add a string else tuple
            right_rule = None
            if len(children[0].children) == 0:
                right_rule = str(
                    children[0])  #<<<<<<<<<---- CONVERT LEAF NODES TO LOWER??
            else:
                right_rule = tuple(map(lambda x: str(x), node.children))

            rules_dict[str(node)].setdefault(right_rule, {
                'count': 0,
                'probability': 0
            })

            rules_dict[str(node)][right_rule]['count'] += 1

    q1_answer = [[None, [None]], 0]
    for left_rule, right_rule in rules_dict.iteritems():

        denominator = 0
        for r_rule, count_prob_dict in right_rule.iteritems():
            denominator += count_prob_dict['count']

            if count_prob_dict['count'] > q1_answer[1]:
                q1_answer[1] = count_prob_dict['count']
                q1_answer[0][0] = left_rule
                q1_answer[0][1] = r_rule

        for r_rule, count_prob_dict in right_rule.iteritems():
            count_prob_dict['probability'] = log10(
                bigfloat(float(count_prob_dict['count']) / denominator))

    print 'QUESTION 1 - Most Frequent Rule: ', q1_answer[0][
        0], '->' + q1_answer[0][1], '    Occcourence =', q1_answer[1]

    return rules_dict
Пример #4
0
def FileRead():
    with open('./train.trees.pre.unk','r') as f:
        for line in f:
            tr1 = Tree.from_str(line)
            q = tr1.bottomup()
            for l in q:
                if l.children == []:
                    continue
                grammar.setdefault(l.label, {})
                children = map(lambda x:str(x), l.children)
                grammar[l.label].setdefault(tuple(children),0 )
                grammar[l.label] [tuple(children)]+=1
    
    #Smoothing by adding additional rules
    i='<unk>'
    for k,v in grammar.iteritems():
        if i not in str(v):
            grammar[k][('<unk>',)]=1
            
    count =0
    for k,v in grammar.iteritems():
        count+=len(v)
    
    print "QUESTION 1 - \n Number of rules in grammar = ", count
    
    answer=[ [None,[None]],0]
    for k,v in grammar.iteritems():
        
        denominator=0
        for k1, v1 in v.iteritems():
            denominator+=v1
            
            if v1 > answer[1]:
                answer[1] = v1
                answer[0][0] = k
                answer[0][1] = k1
        
    
    print "Most Frequent Rule: \n ",str(answer[0][0]),"->"+ str(answer[0][1]),"Count =", str(answer[1])
    
    for k,v in grammar.iteritems():
        s1=0
        for k1,v1 in v.iteritems():
            s1 = s1 + v1 
        for k1,v1 in v.iteritems():
            p = float(v1)/float(s1)
            v[k1]= log10(bigfloat(p))
Пример #5
0
def parse(states, bps, best, line, length=1):
    """Extend a chart bottom-up with binary rules read from ``pcfg``.

    For every span length from ``length`` up to ``len(line)``, each pair of
    adjacent states ``s1 = (B, i, k)`` and ``s2 = (C, k, j)`` with
    ``j - i`` equal to that length is matched against grammar lines of the
    form ``A -> B C # <prob>``.  A hit proposes ``(A, i, j)`` with score
    ``best[s1] + best[s2] + log10(prob)``; the state is added if new, or
    replaced if the score is strictly better.

    Args:
        states: list of ``(label, start, end)`` tuples; extended in place.
        bps: dict state -> ``(rule text, split point)``; updated in place.
        best: dict state -> log10 score; updated in place.
        line: the sentence; only its length is used.
        length: span length to start from.

    Returns:
        The same ``(states, bps, best)`` objects.
    """
    rules = None  # grammar lines, read once and only when first needed
    known = set(states)  # mirrors ``states`` for O(1) membership tests

    # Iterative form of the former one-recursion-per-span-length scheme.
    for span in range(length, len(line) + 1):
        # States created in this pass span exactly ``span`` positions, so
        # (given spans of at least one word) they can never pair up inside
        # the same pass; iterating over a snapshot is therefore equivalent.
        current = list(states)
        for s1 in current:
            for s2 in current:
                if s1[2] != s2[1] or s2[2] - s1[1] != span:
                    continue
                if rules is None:
                    with open('pcfg') as grammar_file:
                        rules = grammar_file.readlines()
                # re.escape protects every metacharacter in the labels; the
                # old code escaped only '*', so e.g. 'PRP$' never matched
                # and '.' matched any character.
                pair_pattern = re.compile(
                    r'.*->\s' + re.escape(s1[0]) + ' ' + re.escape(s2[0])
                    + r'\s#\s.*')
                for rule in rules:
                    if not pair_pattern.match(rule):
                        continue
                    logprob = best[s1] + best[s2] + bigfloat.log10(
                        bigfloat.bigfloat(float(
                            re.match(r'.*\s#\s(.*)\b', rule).group(1))))
                    new_state = (re.match(r'(.*)\s->.*', rule).group(1),
                                 s1[1], s2[2])
                    if new_state not in known:
                        known.add(new_state)
                        states.append(new_state)
                    elif not logprob > best[new_state]:
                        continue
                    best[new_state] = logprob
                    # Back-pointer: rule text plus the split point.
                    bps[new_state] = (
                        re.match(r'(.*)\s#.*', rule).group(1), s1[2])
    return states, bps, best
def q1_parse_input_trees(tree_unk_file):
    f = open(tree_unk_file, 'r')

    fileData = f.read()
    data = fileData.split('\n')

    inputTrees = []
    for line in data:
        if line == '': continue
        inputTrees.append(Tree.from_str(line))

    rules_dict = {}
    for tree in inputTrees:
        if tree == '': continue

        nodes = tree.bottomup()
        children = None

        for node in nodes:
            children = node.children
            if children == []: continue

            rules_dict.setdefault(str(node), {})
            # if leaf node(a terminal), add a string else tuple
            right_rule = None
            if len(children[0].children) == 0:
                right_rule = str(
                    children[0])  #<<<<<<<<<---- CONVERT LEAF NODES TO LOWER??
            else:
                right_rule = tuple(map(lambda x: str(x), node.children))

            rules_dict[str(node)].setdefault(right_rule, {
                'count': 0,
                'probability': 0
            })

            rules_dict[str(node)][right_rule]['count'] += 1

    #SMOOTHEN <unk>
    for k, v in rules_dict.iteritems():
        if '<unk>' not in v:
            rules_dict[k].setdefault('<unk>', {'count': 0, 'probability': 0})
            rules_dict[k]['<unk>']['count'] += 1

    q1_answer = [[None, [None]], 0]
    for left_rule, right_rule in rules_dict.iteritems():

        denominator = 0
        for r_rule, count_prob_dict in right_rule.iteritems():
            denominator += count_prob_dict['count']

            if count_prob_dict['count'] > q1_answer[1]:
                q1_answer[1] = count_prob_dict['count']
                q1_answer[0][0] = left_rule
                q1_answer[0][1] = r_rule

        for r_rule, count_prob_dict in right_rule.iteritems():
            count_prob_dict['probability'] = log10(
                bigfloat(float(count_prob_dict['count']) / denominator))

    print 'QUESTION 1 - Most Frequent Rule: ', q1_answer[0][
        0], '->', q1_answer[0][1], '    Occcourence =', q1_answer[1]

    #===========================================================================
    # import csv
    # with open('Rules.csv','wb') as f:
    #     cw=csv.writer(f,delimiter=',',quoting=csv.QUOTE_ALL)
    #     for k,v in rules_dict.iteritems():
    #         pk=True
    #         for k2,v2 in v.iteritems():
    #             if pk:
    #                 cw.writerow([k,k2,v2])
    #                 pk = False
    #             else: cw.writerow(['',k2,v2])
    #===========================================================================

    #print 'CSV PRINTED'
    return rules_dict
Пример #7
0
def norm(ts):
    """Return a Series of ``0.0 - log10(v)`` for every value ``v`` of ``ts``.

    The logarithm is taken on ``bigfloat.BigFloat(v)`` and converted back to
    a plain float; the result keeps the index of ``ts``.
    """
    negated_logs = []
    for value in ts.values:
        log_value = bigfloat.log10(bigfloat.BigFloat(value))
        negated_logs.append(0.0 - float(log_value))
    return pd.Series(negated_logs, index=ts.index)
Пример #8
0
    print "Most Frequent Rule is:-", items, "and the count is=", grammar_dict[
        items]

# Keys of grammar_dict (the rule strings); getDenominator() and the
# probability loop below both iterate over this collection.
keys = grammar_dict.keys()


def getDenominator(each, keys):
    """Count the entries of ``keys`` that share the first token of ``each``.

    The first token is everything before the first space.  Both ``each`` and
    the entries of ``keys`` are strings.
    """
    head = each.split(' ', 1)[0]
    return sum(1 for key in keys if key.split(' ', 1)[0] == head)


prob_dict = {}
for each in keys:
    num = grammar_dict[each]
    den = getDenominator(each, keys)
    #print 'num=',num,'den=',den
    #print round((num/den),2)
    prob_dict[each] = log10(bigfloat(float(num / den)))
print '################################################'
#print prob_dict
#print 'prob dict length:',len(prob_dict.keys())