def parser(line):
    """CKY initialization: seed the chart with one preterminal state per
    token of ``line`` using the rules in the 'pcfg' file, then hand off to
    ``parse`` to build larger spans.

    line -- sequence of tokens (terminals as they appear in the grammar).
    Returns the tree string produced by ``trees`` for the TOP state that
    spans the whole sentence, or '' when no complete parse exists.
    """
    states = []
    bps = {}
    best = {}
    # Read the grammar once up front; the original re-opened and re-scanned
    # the 'pcfg' file for every single token.
    with open('pcfg') as fh:
        grammar_rules = fh.readlines()
    for i in xrange(len(line)):
        found = False
        # re.escape protects every regex metacharacter in the token; the
        # original hand-escaped only '.' and '?'.
        pattern = '.*->\s' + re.escape(line[i]) + r'\s#\s.*'
        for rule in grammar_rules:
            if re.match(pattern, rule):
                X = re.match(r'(.*)\s->.*', rule).group(1)
                logprob = bigfloat.log10(
                    bigfloat.bigfloat(float(re.match(r'.*\s#\s(.*)\b', rule).group(1))))
                states.append((X, i, i + 1))
                best[(X, i, i + 1)] = logprob
                bps[(X, i, i + 1)] = (re.match(r'(.*)\s#.*', rule).group(1),)
                found = True
        if not found:
            # Unknown word: fall back to every <unk> rule in the grammar.
            unk_pattern = re.compile(r'.*<unk>.*')
            for rule in grammar_rules:
                if unk_pattern.match(rule):
                    X = re.match(r'(.*)\s->.*', rule).group(1)
                    logprob = bigfloat.log10(
                        bigfloat.bigfloat(float(re.match(r'.*\s#\s(.*)\b', rule).group(1))))
                    states.append((X, i, i + 1))
                    best[(X, i, i + 1)] = logprob
                    bps[(X, i, i + 1)] = (re.match(r'(.*)\s#.*', rule).group(1),)
                    found = True
    states, bps, best = parse(states, bps, best, line)
    root = ('TOP', 0, len(line))
    try:
        return trees(root, bps)
    except Exception:  # no spanning TOP parse -- bps lacks the root entry
        return ''
def main():
    """Read a treebank from --infile, estimate a PCFG by relative
    frequency (count(rule) / count(lhs)), and write 'rule # prob' lines to
    --outfile plus log10 probabilities to the side file 'pcfg_log'."""
    parser = argparse.ArgumentParser(
        description="ignore input; make a demo grammar that is compliant in form",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    addonoffarg(parser, 'debug', help="debug mode", default=False)
    parser.add_argument("--infile", "-i", nargs='?', type=argparse.FileType('r'),
                        default=sys.stdin, help="input file (ignored)")
    parser.add_argument("--outfile", "-o", nargs='?', type=argparse.FileType('w'),
                        default=sys.stdout, help="output file (grammar)")
    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))
    workdir = tempfile.mkdtemp(prefix=os.path.basename(__file__),
                               dir=os.getenv('TMPDIR', '/tmp'))
    fh = open('pcfg_log', 'w')

    def cleanwork():
        shutil.rmtree(workdir, ignore_errors=True)

    if args.debug:
        print(workdir)
    else:
        # Defer cleanup so the temp dir survives for debugging runs only.
        atexit.register(cleanwork)
    treebank_rules = []
    rule_lhs = []
    infile = prepfile(args.infile, 'r')
    outfile = prepfile(args.outfile, 'w')
    for tree in infile:
        t = Tree.fromstring(tree)
        for rule in t.productions():
            rule_lhs.append(rule.lhs())
            treebank_rules.append(rule)
    freq_dict = Counter(rule_lhs)       # LHS occurrence counts (denominator)
    treebank_dict = Counter(treebank_rules)  # full-rule counts (numerator)
    # iteritems() gives count directly; the original iterated keys and then
    # looked each count up again with .get().
    for production, count in treebank_dict.iteritems():
        prob = bigfloat.bigfloat(count) / bigfloat.bigfloat(
            freq_dict.get(production.lhs()))
        outfile.write('{0} # {1} \n'.format(production, prob))
        fh.write('{0} # {1} \n'.format(production, bigfloat.log10(prob)))
    fh.close()
def q1_parse_input_trees(tree_unk_file): f = open(tree_unk_file, 'r') fileData = f.read() data = fileData.split('\n') inputTrees = [] for line in data: if line == '': continue inputTrees.append(Tree.from_str(line)) rules_dict = {} for tree in inputTrees: if tree == '': continue nodes = tree.bottomup() children = None for node in nodes: children = node.children if children == []: continue rules_dict.setdefault(str(node), {}) # if leaf node(a terminal), add a string else tuple right_rule = None if len(children[0].children) == 0: right_rule = str( children[0]) #<<<<<<<<<---- CONVERT LEAF NODES TO LOWER?? else: right_rule = tuple(map(lambda x: str(x), node.children)) rules_dict[str(node)].setdefault(right_rule, { 'count': 0, 'probability': 0 }) rules_dict[str(node)][right_rule]['count'] += 1 q1_answer = [[None, [None]], 0] for left_rule, right_rule in rules_dict.iteritems(): denominator = 0 for r_rule, count_prob_dict in right_rule.iteritems(): denominator += count_prob_dict['count'] if count_prob_dict['count'] > q1_answer[1]: q1_answer[1] = count_prob_dict['count'] q1_answer[0][0] = left_rule q1_answer[0][1] = r_rule for r_rule, count_prob_dict in right_rule.iteritems(): count_prob_dict['probability'] = log10( bigfloat(float(count_prob_dict['count']) / denominator)) print 'QUESTION 1 - Most Frequent Rule: ', q1_answer[0][ 0], '->' + q1_answer[0][1], ' Occcourence =', q1_answer[1] return rules_dict
def FileRead(): with open('./train.trees.pre.unk','r') as f: for line in f: tr1 = Tree.from_str(line) q = tr1.bottomup() for l in q: if l.children == []: continue grammar.setdefault(l.label, {}) children = map(lambda x:str(x), l.children) grammar[l.label].setdefault(tuple(children),0 ) grammar[l.label] [tuple(children)]+=1 #Smoothing by adding additional rules i='<unk>' for k,v in grammar.iteritems(): if i not in str(v): grammar[k][('<unk>',)]=1 count =0 for k,v in grammar.iteritems(): count+=len(v) print "QUESTION 1 - \n Number of rules in grammar = ", count answer=[ [None,[None]],0] for k,v in grammar.iteritems(): denominator=0 for k1, v1 in v.iteritems(): denominator+=v1 if v1 > answer[1]: answer[1] = v1 answer[0][0] = k answer[0][1] = k1 print "Most Frequent Rule: \n ",str(answer[0][0]),"->"+ str(answer[0][1]),"Count =", str(answer[1]) for k,v in grammar.iteritems(): s1=0 for k1,v1 in v.iteritems(): s1 = s1 + v1 for k1,v1 in v.iteritems(): p = float(v1)/float(s1) v[k1]= log10(bigfloat(p))
def parse(states, bps, best, line, length=1):
    """CKY combination step: join adjacent chart states (s1, s2) whose
    combined span is ``length`` using binary rules from the 'pcfg' file,
    recording Viterbi-best log probabilities and backpointers, then recurse
    with ``length + 1`` until spans up to len(line) have been tried.

    Returns the updated (states, bps, best) triple.
    """
    if length == len(line) + 1:
        return states, bps, best
    # Read the grammar once per level; the original re-opened the file for
    # every (s1, s2) state pair.
    with open('pcfg') as fh:
        grammar_rules = fh.readlines()
    # NOTE(review): appending to ``states`` while iterating it also visits
    # the newly created states; they cannot satisfy the span filter at this
    # ``length`` (their span already equals ``length``), so this costs time
    # but not correctness.
    for s1 in states:
        for s2 in states:
            if s1[2] == s2[1] and s2[2] - s1[1] == length:
                # Compile once per pair (not per rule); re.escape protects
                # every metacharacter in the labels (e.g. 'PRP$'), not just
                # '*' as the original substitutions did.
                pattern = re.compile(
                    '.*->\s' + re.escape(s1[0]) + ' ' + re.escape(s2[0]) + r'\s#\s.*')
                for rule in grammar_rules:
                    if pattern.match(rule):
                        logprob = best[s1] + best[s2] + bigfloat.log10(
                            bigfloat.bigfloat(float(re.match(r'.*\s#\s(.*)\b', rule).group(1))))
                        new_state = (re.match(r'(.*)\s->.*', rule).group(1), s1[1], s2[2])
                        if new_state not in states:
                            states.append(new_state)
                            best[new_state] = logprob
                            bps[new_state] = (re.match(r'(.*)\s#.*', rule).group(1), s1[2])
                        elif logprob > best[new_state]:
                            # Keep only the highest-probability derivation.
                            best[new_state] = logprob
                            bps[new_state] = (re.match(r'(.*)\s#.*', rule).group(1), s1[2])
    length += 1
    return parse(states, bps, best, line, length)
def q1_parse_input_trees(tree_unk_file): f = open(tree_unk_file, 'r') fileData = f.read() data = fileData.split('\n') inputTrees = [] for line in data: if line == '': continue inputTrees.append(Tree.from_str(line)) rules_dict = {} for tree in inputTrees: if tree == '': continue nodes = tree.bottomup() children = None for node in nodes: children = node.children if children == []: continue rules_dict.setdefault(str(node), {}) # if leaf node(a terminal), add a string else tuple right_rule = None if len(children[0].children) == 0: right_rule = str( children[0]) #<<<<<<<<<---- CONVERT LEAF NODES TO LOWER?? else: right_rule = tuple(map(lambda x: str(x), node.children)) rules_dict[str(node)].setdefault(right_rule, { 'count': 0, 'probability': 0 }) rules_dict[str(node)][right_rule]['count'] += 1 #SMOOTHEN <unk> for k, v in rules_dict.iteritems(): if '<unk>' not in v: rules_dict[k].setdefault('<unk>', {'count': 0, 'probability': 0}) rules_dict[k]['<unk>']['count'] += 1 q1_answer = [[None, [None]], 0] for left_rule, right_rule in rules_dict.iteritems(): denominator = 0 for r_rule, count_prob_dict in right_rule.iteritems(): denominator += count_prob_dict['count'] if count_prob_dict['count'] > q1_answer[1]: q1_answer[1] = count_prob_dict['count'] q1_answer[0][0] = left_rule q1_answer[0][1] = r_rule for r_rule, count_prob_dict in right_rule.iteritems(): count_prob_dict['probability'] = log10( bigfloat(float(count_prob_dict['count']) / denominator)) print 'QUESTION 1 - Most Frequent Rule: ', q1_answer[0][ 0], '->', q1_answer[0][1], ' Occcourence =', q1_answer[1] #=========================================================================== # import csv # with open('Rules.csv','wb') as f: # cw=csv.writer(f,delimiter=',',quoting=csv.QUOTE_ALL) # for k,v in rules_dict.iteritems(): # pk=True # for k2,v2 in v.iteritems(): # if pk: # cw.writerow([k,k2,v2]) # pk = False # else: cw.writerow(['',k2,v2]) #=========================================================================== #print 
'CSV PRINTED' return rules_dict
def norm(ts):
    """Return a new pandas Series holding the negated log10 of each value
    in ``ts``, preserving the original index."""
    negated = []
    for value in ts.values:
        negated.append(0.0 - float(bigfloat.log10(bigfloat.BigFloat(value))))
    return pd.Series(negated, index=ts.index)
print "Most Frequent Rule is:-", items, "and the count is=", grammar_dict[ items] keys = grammar_dict.keys() def getDenominator(each, keys): count = 0 seach = each.split(' ', 1) if (seach[0] == 'RB'): pass # print '&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&' # print seach[0] for key in keys: if seach[0] == key.split(' ', 1)[0]: count = count + 1 return count prob_dict = {} for each in keys: num = grammar_dict[each] den = getDenominator(each, keys) #print 'num=',num,'den=',den #print round((num/den),2) prob_dict[each] = log10(bigfloat(float(num / den))) print '################################################' #print prob_dict #print 'prob dict length:',len(prob_dict.keys())