def compute(p, op, e, target, problem, story, order, score=None, cons=None):
    """Score applying ``op`` to the pair ``(p, e)`` and build the combined set.

    For ``'='`` the feature vector is ``[order, score, cons]`` followed by
    ``makesets.vector(p, e, problem, story, target)`` and is scored with the
    module-level ``glob`` model.  Every other operator uses the
    ``makesets.vector`` features alone with the ``multi`` model.

    Returns:
        ``(val, c, op_val)`` where ``op_val`` is the first row of the third
        value returned by ``svm_predict``, ``val`` is the entry of that row
        selected by ``op`` and ``c`` is ``makesets.combine(p[1], e[1], op)``.

    Raises:
        ValueError: if ``op`` is not one of ``+ - * / =``.  Previously this
            surfaced as an unbound ``val`` only after the model had run.
    """
    # Position of each operator's score inside the prediction row.
    # NOTE(review): '-b 1' presumably requests libsvm probability estimates,
    # so these would be per-class probabilities -- confirm.
    score_index = {'+': 0, '-': 1, '*': 2, '/': 3, '=': 0}
    if op not in score_index:
        raise ValueError("unsupported operator: %r" % (op,))

    if op == '=':
        vec = [order, score, cons]
        vec.extend(makesets.vector(p, e, problem, story, target))
        model = glob
    else:
        vec = makesets.vector(p, e, problem, story, target)
        model = multi
    # The label passed in ([-1]) is a placeholder; only the values are used.
    op_label, op_acc, op_val = svmutils.svm_predict(
        [-1], [vec], model, '-q -b 1'
    )
    op_val = op_val[0]
    val = op_val[score_index[op]]
    c = makesets.combine(p[1], e[1], op)
    return (val, c, op_val)
def dotrain():
    """Turn equations read from the file named by ``sys.argv[1]`` into
    training triples and pickle the accumulator to 'data/training.pickle'.

    NOTE(review): this variant looks unfinished and cannot run as written:
      * ``f`` is the file object returned by ``open``; it has no ``split``
        method (``f.read().split("___")`` was presumably intended).
      * ``p,a,t,n,`` is a bare tuple expression of names that are not
        assigned anywhere in this function.
      * ``answers``, ``objs``, ``problem`` and ``bigtexamples`` are read but
        never assigned here; they would have to be module globals.
    The nesting below was reconstructed from a flattened source -- confirm
    against the original file before relying on it.
    """
    fn = sys.argv[1]
    with open(fn) as f:
        f = f.split("___")
        for c in f:
            p,a,t,n,
            for j,eq in enumerate(answers):
                # (operator, left, right) triples collected for this equation
                trips = []
                print(j,eq)
                # token lists for the left and right side of '='
                l,r = [x.strip().split(' ') for x in eq.split('=')]
                # 'simplex' is the one-token side, 'compound' the other one
                compound = l if len(r)==1 else r
                simplex = l if len(l)==1 else r
                target = simplex[0]
                target = (target,objs[target])
                #find innermost parens?
                # Fold one binary sub-expression per pass until a single
                # token is left.
                while len(compound)>1:
                    if "(" in compound:
                        # rpidx: index of the LAST "(" token;
                        # lpidx: index of the first ")" at or after it
                        rpidx = (len(compound) - 1) - compound[::-1].index('(')
                        lpidx = rpidx+compound[rpidx:].index(")")
                        subeq = compound[rpidx+1:lpidx]
                        substr = "("+''.join(subeq)+")"
                        compound = compound[:rpidx]+[substr]+compound[lpidx+1:]
                    else:
                        # no "(" token: fold the leading three tokens
                        subeq = compound[0:3]
                        substr = "("+''.join(subeq)+")"
                        compound = [substr]+compound[3:]
                    if True:
                        p,op,e = subeq
                        #print(p,op,e)
                        p = objs[p]
                        e = objs[e]
                        op = op.strip()
                        trips.append((op,p,e))
                        pute = (0,makesets.combine(p[1],e[1],op))
                        #print("OPERATION SELECTED: ",op)
                        #p.details()
                        #e.details()
                        #print(substr,pute[1].num)
                        # register the folded expression under its own text
                        objs[substr]=pute
                    # NOTE(review): pute is a 2-tuple, so this is never true
                    if pute == -1: exit()
                # final '=' triple, keeping the original left/right order
                if simplex == l:
                    trips.append(("=",objs[simplex[0]],objs[compound[0]]))
                else:
                    trips.append(("=",objs[compound[0]],objs[simplex[0]]))
                t = training(trips,problem,target)
                # t is indexed by operator; each entry holds two sequences
                for op in t:
                    bigtexamples[op][0].extend(t[op][0])
                    bigtexamples[op][1].extend(t[op][1])
                    print(op,len(bigtexamples[op][0]))
    # NOTE(review): the file opened here is never explicitly closed
    pickle.dump(bigtexamples,open('data/training.pickle','wb'))
def compute(p, op, e, target, problem, story, order):
    """Score applying ``op`` to the pair ``(p, e)`` and build the combined set.

    The features come from ``makesets.vector(p, e, problem, story, target)``
    and are scored with the module-level ``multi`` model.  ``order`` is
    accepted but not used by this variant.

    Returns:
        ``(val, c, op_val)`` where ``op_val`` is the first row of the third
        value returned by ``svm_predict``, ``val`` is the entry of that row
        selected by ``op`` and ``c`` is ``makesets.combine(p[1], e[1], op)``.

    Raises:
        ValueError: if ``op`` is not one of ``+ - * / =``.  Previously this
            surfaced as an unbound ``val`` only after the model had run.
    """
    # Position of each operator's score inside the prediction row; note that
    # '=' shares index 1 with '-' in this variant.
    score_index = {'+': 0, '-': 1, '*': 2, '/': 3, '=': 1}
    if op not in score_index:
        raise ValueError("unsupported operator: %r" % (op,))

    vec = makesets.vector(p, e, problem, story, target)
    # The label passed in ([-1]) is a placeholder; only the values are used.
    op_label, op_acc, op_val = svmutil.svm_predict(
        [-1], [vec], multi, '-q -b 1'
    )
    op_val = op_val[0]
    val = op_val[score_index[op]]
    c = makesets.combine(p[1], e[1], op)
    return (val, c, op_val)
def compute(p, op, e, target, problem):
    """Score applying ``op`` to the sets ``p`` and ``e`` and combine them.

    Both sets are wrapped as ``(0, set)`` before being handed to
    ``makesets.vector``; the resulting features are scored with the
    module-level ``multi`` model.

    Returns:
        ``(val, c)`` where ``val`` is the entry for ``op`` in the first row
        of the third value returned by ``svm_predict`` and ``c`` is
        ``makesets.combine(p, e, op)``.

    Raises:
        ValueError: if ``op`` is not one of ``+ - * /``.  Previously this
            surfaced as an unbound ``val`` only after the model had run.
    """
    # Position of each operator's score inside the prediction row.
    score_index = {'+': 0, '-': 1, '*': 2, '/': 3}
    if op not in score_index:
        raise ValueError("unsupported operator: %r" % (op,))

    vec = makesets.vector((0, p), (0, e), problem, target)
    # The label passed in ([-1]) is a placeholder; only the values are used.
    op_label, op_acc, op_val = svm_predict([-1], [vec], multi, '-q -b 1')
    val = op_val[0][score_index[op]]
    c = makesets.combine(p, e, op)
    return (val, c)
def compute(p, op, e, target, problem):
    """Score applying ``op`` to the sets ``p`` and ``e`` and combine them.

    Both sets are wrapped as ``(0, set)`` before being handed to
    ``makesets.vector``; the resulting features are scored with the
    module-level ``model``.

    Returns:
        ``(val, c)`` where ``val`` is the entry for ``op`` in the first row
        of the third value returned by ``svm_predict`` and ``c`` is
        ``makesets.combine(p, e, op)``.

    Raises:
        ValueError: if ``op`` is not one of ``+ - * /``.  Previously this
            surfaced as an unbound ``val`` only after the model had run.
    """
    # Position of each operator's score inside the prediction row.
    score_index = {'+': 0, '-': 1, '*': 2, '/': 3}
    if op not in score_index:
        raise ValueError("unsupported operator: %r" % (op,))

    vec = makesets.vector((0, p), (0, e), problem, target)
    # The label passed in ([-1]) is a placeholder; only the values are used.
    op_label, op_acc, op_val = svm_predict([-1], [vec], model, '-q -b 1')
    val = op_val[0][score_index[op]]
    c = makesets.combine(p, e, op)
    return (val, c)
def compute(p, op, e, target, problem):
    """Score applying ``op`` to the sets ``p`` and ``e`` and combine them.

    Both sets are wrapped as ``(0, set)`` before being handed to
    ``makesets.vector``; the resulting features are scored with the
    module-level ``multi`` model.

    Returns:
        ``(val, c)`` where ``val`` is the entry for ``op`` in the first row
        of the third value returned by ``svm_predict`` and ``c`` is
        ``makesets.combine(p, e, op)``.

    Raises:
        ValueError: if ``op`` is not one of ``+ - * /``.  Previously this
            surfaced as an unbound ``val`` only after the model had run.
    """
    # Position of each operator's score inside the prediction row.
    score_index = {'+': 0, '-': 1, '*': 2, '/': 3}
    if op not in score_index:
        raise ValueError("unsupported operator: %r" % (op,))

    vec = makesets.vector((0, p), (0, e), problem, target)
    # The label passed in ([-1]) is a placeholder; only the values are used.
    op_label, op_acc, op_val = svm_predict([-1], [vec], multi, '-q -b 1')
    val = op_val[0][score_index[op]]
    c = makesets.combine(p, e, op)
    return (val, c)
def dotrain():
    """Build per-operator training examples and pickle them.

    Problems and answers are read line by line from ``sys.argv[1]`` and
    ``sys.argv[2]`` when arguments are given, otherwise from
    'emnlp_noIrrelev_p.txt' and 'emnlp_noIrrelev_a.txt'.  For each problem
    the text is parsed, candidate equations are obtained from ``Solver``,
    filtered, and folded into ``(operator, left, right)`` triples that are
    passed to ``training``.  Problems that are skipped are logged to
    'nogoodtrainproblems'; the accumulated examples are written to
    'data/dev_training.pickle'.

    All files are now closed deterministically, and the emptiness check on
    the solver result happens before the result is first iterated.
    """
    if len(sys.argv) > 1:
        problem_path, answer_path = sys.argv[1], sys.argv[2]
    else:
        problem_path = "emnlp_noIrrelev_p.txt"
        answer_path = "emnlp_noIrrelev_a.txt"
    with open(problem_path) as fh:
        wps = fh.readlines()
    with open(answer_path) as fh:
        answs = fh.readlines()

    bigtexamples = {x: ([], []) for x in ["+", "*", '/', '-', '=']}
    # Applied in insertion order; the surrounding spaces keep matches to
    # whole words.
    replacements = {
        ' two ': ' 2 ', " three ": ' 3 ', ' four ': ' 4 ', ' five ': ' 5 ',
        ' six ': ' 6 ', ' seven ': ' 7 ', ' eight ': ' 8 ', ' nine ': ' 9 ',
        ' ten ': ' 10 ', ' eleven ': ' 11 ', ' week ': ' 7 days ',
        ' dozen ': ' 12 of ', ' dozens ': ' 12 ', ' twice ': ' 2 ',
    }
    syntax = ['+', '-', '/', '=', ')', '(', '*']

    def operands(equation):
        # Equation tokens that are neither operators nor parentheses.
        return [tok for tok in equation.split(" ") if tok not in syntax]

    with open('nogoodtrainproblems', 'w') as problematic:
        for k in range(len(wps)):
            print(k)
            problem = wps[k].lower()
            for word, digits in replacements.items():
                problem = problem.replace(word, digits)

            story = nlp.parse(problem)
            numbs = makesets.makesets(story['sentences'])
            numlist = [(cleannum(v.num), v) for _, v in numbs]
            numlist = [x for x in numlist if x[0] != '']
            names = {str(x[0]) for x in numlist}
            if 'x' not in names and 'x*' not in names:
                problematic.write('no x :' + problem)
                continue
            objs = {key: (0, v) for key, v in numlist}

            print('start solving')
            print(numlist)
            if len(numlist) < 2:
                problematic.write("not enough numbers : " + problem)
                continue
            ST = Solver([x[0] for x in numlist if x[0] != 'x'])
            answers = ST.solveEquations(float(answs[k]))
            print('done solving')
            if not answers:
                continue

            # Keep equations whose '=' is the second or second-to-last token.
            simpleranswers = [
                x for x in answers
                if x.split(" ")[1] == '=' or x.split(" ")[-2] == "="
            ]
            if simpleranswers:
                answers = simpleranswers
            else:
                print(answers)
                problematic.write("not simple : " + problem)
                continue

            # Prefer equations where 'x' sits at the same operand position
            # as it does among the numbers extracted from the text.
            answervals = operands(answers[0])
            numvals = [x[0] for x in numlist if x[0] in answervals]
            xidx = numvals.index("x")
            xrightanswers = [
                eq for eq in answers if operands(eq).index('x') == xidx
            ]
            if xrightanswers:
                answers = xrightanswers

            for j, eq in enumerate(answers):
                trips = []
                print(j, eq)
                l, r = [x.strip().split(' ') for x in eq.split('=')]
                # 'simplex' is the one-token side, 'compound' the other one.
                compound = l if len(r) == 1 else r
                simplex = l if len(l) == 1 else r
                target = simplex[0]
                target = (target, objs[target])

                # Fold one binary sub-expression per pass until a single
                # token is left.
                while len(compound) > 1:
                    if "(" in compound:
                        # Innermost group: last "(" and the first ")" after it.
                        open_idx = (len(compound) - 1) - compound[::-1].index('(')
                        close_idx = open_idx + compound[open_idx:].index(")")
                        subeq = compound[open_idx + 1:close_idx]
                        substr = "(" + ''.join(subeq) + ")"
                        compound = (compound[:open_idx] + [substr]
                                    + compound[close_idx + 1:])
                    else:
                        subeq = compound[0:3]
                        substr = "(" + ''.join(subeq) + ")"
                        compound = [substr] + compound[3:]
                    left_tok, op, right_tok = subeq
                    p = objs[left_tok]
                    e = objs[right_tok]
                    op = op.strip()
                    trips.append((op, p, e))
                    # Register the folded expression under its own text so
                    # later passes can look it up like any other operand.
                    objs[substr] = (0, makesets.combine(p[1], e[1], op))

                if simplex == l:
                    trips.append(("=", objs[simplex[0]], objs[compound[0]]))
                else:
                    trips.append(("=", objs[compound[0]], objs[simplex[0]]))

                t = training(trips, problem, target)
                for op in t:
                    bigtexamples[op][0].extend(t[op][0])
                    bigtexamples[op][1].extend(t[op][1])
                    print(op, len(bigtexamples[op][0]))

    with open('data/dev_training.pickle', 'wb') as fh:
        pickle.dump(bigtexamples, fh)
def make_eq(q, a, equations):
    """Collect per-operator training examples and pickle them.

    ``q`` is indexed as a sequence of problem strings and ``equations[k]``
    is the identifier handed to ``read_parse``, ``get_k_eqs`` and
    ``read_sets`` for problem ``k``.  ``a`` is accepted but not used.  The
    result is written to ``"data/" + sys.argv[1][-1] + ".local.training"``.

    The pickle file is now closed deterministically.
    """
    bigtexamples = {x: ([], []) for x in ["+", "*", "/", "-", "="]}
    wps = q
    syntax = ["(", ")", "+", "-", "/", "*", "="]

    for k in range(len(wps)):
        # Light tokenisation: split trailing punctuation off each word.
        words = wps[k].strip().split(" ")
        for i, word in enumerate(words):
            if word and word[-1] in [",", ".", "?"]:
                words[i] = word[:-1] + " " + word[-1]
        problem = " " + " ".join(words) + " "
        print(k)
        print(problem)

        story = read_parse(int(equations[k]))
        eqs = get_k_eqs(equations[k])
        # Keep the second field of every entry whose first field equals 1.
        answers = [x[1] for x in eqs if x[0] == 1]
        if answers == []:
            continue
        answers = list(set(answers))
        print(answers)

        sets = read_sets(equations[k])
        if not any(s[1].num == "x" for s in sets):
            print("NO X WHY")
            continue

        numlist = [(cleannum(v.num), v) for _, v in sets]
        numlist = [x for x in numlist if x[0] != ""]
        objs = {key: (0, v) for key, v in numlist}
        print(objs.items())

        # Every operand of the first equation must be a known quantity.
        consts = [x for x in answers[0].split(" ") if x not in syntax]
        present = [x for x in consts if x in objs]
        if present != consts:
            print(present, consts)
            print("missing thing")
            continue

        # Keep only equations whose operands appear in text order.
        text_order = [x[0] for x in numlist]
        oanswers = [
            eq for eq in answers
            if [x for x in eq.split(" ") if x not in syntax] == text_order
        ]
        if oanswers == []:
            continue
        answers = oanswers
        print(answers)

        simpleanswers = [x for x in answers if x.split(" ")[-2] == "="]
        if simpleanswers:
            answers = simpleanswers
        else:
            answers = [answers[randint(0, len(answers) - 1)]]
        print(answers)

        for j, eq in enumerate(answers):
            trips = []
            print(j, eq)
            l, r = [x.strip().split(" ") for x in eq.split("=")]
            target = ("x", objs["x"])

            # Fold both sides, one binary sub-expression per pass.
            for compound in [l, r]:
                while len(compound) > 1:
                    if "(" in compound:
                        # Innermost group: last "(" and the first ")" after it.
                        open_idx = (len(compound) - 1) - compound[::-1].index("(")
                        close_idx = open_idx + compound[open_idx:].index(")")
                        subeq = compound[open_idx + 1:close_idx]
                        substr = "(" + "".join(subeq) + ")"
                        compound = (compound[:open_idx] + [substr]
                                    + compound[close_idx + 1:])
                    else:
                        subeq = compound[0:3]
                        substr = "(" + "".join(subeq) + ")"
                        compound = [substr] + compound[3:]
                    left_tok, op, right_tok = subeq
                    p = objs[left_tok]
                    e = objs[right_tok]
                    op = op.strip()
                    trips.append((op, p, e))
                    # Register the folded expression under its own text.
                    objs[substr] = (0, makesets.combine(p[1], e[1], op))

            t = training(trips, problem, story, target)
            for op in t:
                bigtexamples[op][0].extend(t[op][0])
                bigtexamples[op][1].extend(t[op][1])

    with open("data/" + sys.argv[1][-1] + ".local.training", "wb") as fh:
        pickle.dump(bigtexamples, fh)
def make_eq(q, a, eqs, VERBOSE, TRAIN):
    """Turn the given equations into per-operator training examples.

    ``q`` is indexed as a sequence of problem strings and ``eqs[k]`` is the
    equation for problem ``k`` (entries whose stripped text is "None" are
    skipped).  ``a`` is accepted but not used.  With ``VERBOSE`` the problem
    index is read interactively.  With ``TRAIN`` the accumulated examples
    are pickled to ``'data/' + OUT + ".local.training"``.

    Changes from the previous version: the unused ``out`` file is no longer
    opened (``q + ".out.txt"`` was applied to the problem sequence and the
    handle was never written to), and the remaining files are closed
    deterministically.
    """
    bigtexamples = {x: ([], []) for x in ["+", "*", '/', '-', '=']}
    wps = q
    replacements = {
        ' two ': ' 2 ', " three ": ' 3 ', ' four ': ' 4 ', ' five ': ' 5 ',
        ' six ': ' 6 ', ' seven ': ' 7 ', ' eight ': ' 8 ', ' nine ': ' 9 ',
        ' ten ': ' 10 ', ' eleven ': ' 11 ', ' twice ': ' 2 ',
    }

    with open('somethingWrongProblems', 'w') as problematic:
        for k in range(len(wps)):
            print(eqs[k])
            if eqs[k].strip() == "None":
                continue
            answers = [eqs[k]]
            if VERBOSE:
                for i, text in enumerate(wps):
                    print(i, text)
                # The equation above was already taken for the loop's k;
                # only the problem text follows the interactive choice.
                k = int(input())
            print(k)

            # Light tokenisation: split trailing punctuation off each word.
            words = wps[k].lower().strip().split(" ")
            for i, word in enumerate(words):
                if word and word[-1] in [',', '.', '?']:
                    words[i] = word[:-1] + " " + word[-1]
            problem = " " + ' '.join(words) + " "
            print(problem)
            for word, digits in replacements.items():
                problem = problem.replace(word, digits)

            story = nlp.parse(problem)
            sets = makesets.makesets(story['sentences'])
            print(sets)

            # Drop sets that share an idx, preferring one whose num
            # contains a digit.
            i = 0
            while i < len(sets):
                dups = [
                    y for y in sets
                    if y[1].idx is not None and y[1].idx == sets[i][1].idx
                ]
                if len(dups) > 1:
                    good = [
                        y for y in dups
                        if any(ch.isdigit() for ch in y[1].num)
                    ]
                    if good:
                        for extra in [x for x in dups if x != good[0]]:
                            sets.remove(extra)
                    else:
                        # No numeric candidate: keep the first one.
                        for extra in dups[1:]:
                            sets.remove(extra)
                i += 1

            if not any(s[1].num == 'x' for s in sets):
                problematic.write('no x :' + problem)
                continue
            # TODO look for 2 xes

            numlist = [(cleannum(v.num), v) for _, v in sets]
            numlist = [x for x in numlist if x[0] != '']
            if VERBOSE:
                for _, v in numlist:
                    v.details()
                input()
            objs = {key: (0, v) for key, v in numlist}

            for j, eq in enumerate(answers):
                trips = []
                print(j, eq)
                l, r = [x.strip().split(' ') for x in eq.split('=')]
                target = ('x', objs['x'])

                # Fold both sides, one binary sub-expression per pass.
                for compound in [l, r]:
                    while len(compound) > 1:
                        if "(" in compound:
                            # Innermost group: last "(" and the first ")"
                            # after it.
                            open_idx = (len(compound) - 1) - compound[::-1].index('(')
                            close_idx = open_idx + compound[open_idx:].index(")")
                            subeq = compound[open_idx + 1:close_idx]
                            substr = "(" + ''.join(subeq) + ")"
                            compound = (compound[:open_idx] + [substr]
                                        + compound[close_idx + 1:])
                        else:
                            subeq = compound[0:3]
                            substr = "(" + ''.join(subeq) + ")"
                            compound = [substr] + compound[3:]
                        left_tok, op, right_tok = subeq
                        p = objs[left_tok]
                        e = objs[right_tok]
                        op = op.strip()
                        trips.append((op, p, e))
                        # Register the folded expression under its own text.
                        objs[substr] = (0, makesets.combine(p[1], e[1], op))

                t = training(trips, problem, story, target)
                for op in t:
                    bigtexamples[op][0].extend(t[op][0])
                    bigtexamples[op][1].extend(t[op][1])
                    print(op, len(bigtexamples[op][0]))

    if TRAIN:
        with open('data/' + OUT + ".local.training", 'wb') as fh:
            pickle.dump(bigtexamples, fh)
def make_eq(q, a, VERBOSE, TRAIN):
    """Derive equations for each problem and optionally build training data.

    ``q`` and ``a`` are paths to line-aligned problem and answer files.  For
    every problem the text is normalised and parsed, ``Solver`` proposes
    equations that reach the numeric answer, and the candidates are
    filtered.  Skipped problems are logged to 'somethingWrongProblems'.

    * Neither ``VERBOSE`` nor ``TRAIN``: the surviving equations are written
      to ``q + ".out.txt"``.
    * ``VERBOSE``: the problem index is read interactively and the function
      pauses on ``input()``.
    * ``TRAIN``: one surviving equation is picked at random, folded into
      ``(operator, left, right)`` triples for ``training``, and the
      accumulated examples are pickled to ``'data/' + OUT + ".training"``.

    All files are now closed deterministically and the fallback around the
    four-token filter only swallows ``IndexError``.

    NOTE(review): block nesting was reconstructed from a flattened source;
    the three 'dozen' checks are treated as siblings and the first debug
    ``print(answers)`` as following the removal loop -- confirm.
    """
    bigtexamples = {x: ([], []) for x in ["+", "*", '/', '-', '=']}
    with open(q) as fh:
        wps = fh.readlines()
    with open(a) as fh:
        answs = fh.readlines()
    replacements = {
        ' two ': ' 2 ', " three ": ' 3 ', ' four ': ' 4 ', ' five ': ' 5 ',
        ' six ': ' 6 ', ' seven ': ' 7 ', ' eight ': ' 8 ', ' nine ': ' 9 ',
        ' ten ': ' 10 ', ' eleven ': ' 11 ', ' twice ': ' 2 ',
    }
    syntax = ["/", "-", '+', '*', '=', '(', ')']

    # The report file only exists in plain (non-train, non-verbose) mode.
    out = open(q + ".out.txt", 'w') if not TRAIN and not VERBOSE else None
    try:
        with open('somethingWrongProblems', 'w') as problematic:
            for k in range(len(wps)):
                if VERBOSE:
                    for i, text in enumerate(wps):
                        print(i, text)
                    k = int(input())
                print(k)

                # Light tokenisation: split trailing punctuation off words.
                words = wps[k].lower().strip().split(" ")
                for i, word in enumerate(words):
                    if word and word[-1] in [',', '.', '?']:
                        words[i] = word[:-1] + " " + word[-1]
                problem = " " + ' '.join(words) + " "
                print(problem)
                for word, digits in replacements.items():
                    problem = problem.replace(word, digits)

                story = nlp.parse(problem)
                sets = makesets.makesets(story['sentences'])
                print(sets)

                # Drop sets that share an idx, preferring one whose num
                # contains a digit.
                i = 0
                while i < len(sets):
                    dups = [
                        y for y in sets
                        if y[1].idx is not None and y[1].idx == sets[i][1].idx
                    ]
                    if len(dups) > 1:
                        good = [
                            y for y in dups
                            if any(ch.isdigit() for ch in y[1].num)
                        ]
                        if good:
                            for extra in [x for x in dups if x != good[0]]:
                                sets.remove(extra)
                        else:
                            # No numeric candidate: keep the first one.
                            for extra in dups[1:]:
                                sets.remove(extra)
                    i += 1

                x_positions = [
                    i for i, s in enumerate(sets) if s[1].num == 'x'
                ]
                if not x_positions:
                    problematic.write('no x :' + problem)
                    continue
                # TODO look for 2 xes
                xidx = x_positions[0]

                # A 'dozen' entity next to (or at) the unknown switches on
                # the "= x op y" filter further down.
                twoToRight = False
                if xidx > 0:
                    print(len(sets), xidx)
                    if sets[xidx - 1][1].entity == 'dozen':
                        twoToRight = True
                if len(sets) - xidx > 1:
                    if sets[xidx + 1][1].entity == 'dozen':
                        twoToRight = True
                if len(sets) - xidx < 3:
                    if sets[xidx][1].entity == 'dozen':
                        twoToRight = True

                numlist = [(cleannum(v.num), v) for _, v in sets]
                numlist = [x for x in numlist if x[0] != '']
                if VERBOSE:
                    for _, v in numlist:
                        v.details()
                    input()
                objs = {key: (0, v) for key, v in numlist}

                print('start solving')
                print(numlist)
                if len(numlist) < 2:
                    problematic.write("not enough numbers : " + problem)
                    continue
                values = [x[0] for x in numlist if x[0] != 'x']
                print(values)
                ST = Solver(values)
                answers = ST.solveEquations(float(answs[k]))
                print(answs[k])
                if not answers:
                    problematic.write("No answers : " + problem + "\n")
                    problematic.write(str([x[0] for x in numlist]) + '\n')
                    problematic.write(answs[k] + '\n')
                    continue
                print('done solving')

                # If the target has 2 entities, try "... = x op y" shapes
                # first.
                simpleranswers = None
                if twoToRight:
                    try:
                        simpleranswers = [
                            x for x in answers
                            if x.split(" ")[-4] == "="
                            and (x.split(" ")[-3] == 'x'
                                 or x.split(' ')[-1] == 'x')
                        ]
                    except IndexError:
                        # An equation shorter than four tokens: fall back
                        # to the generic filter below.
                        pass
                if not simpleranswers:
                    simpleranswers = [
                        x for x in answers
                        if x.split(" ")[1] == '=' or x.split(" ")[-2] == "="
                    ]
                if simpleranswers:
                    print(answers)
                    answers = simpleranswers[:]
                else:
                    problematic.write("not simple : " + problem + "\n")
                    continue

                # Drop equations where 'x' is not at the operand position
                # it has among the numbers extracted from the text.
                values = [x[0] for x in numlist]
                xidx = values.index('x')
                print(simpleranswers)
                print(xidx)
                for cand in simpleranswers:
                    aspl = [
                        tok for tok in cand.split(" ") if tok not in syntax
                    ]
                    print(cand)
                    print(aspl)
                    print(values)
                    aidx = aspl.index('x')
                    print(aidx)
                    if aidx != xidx:
                        print("removing ", cand)
                        answers.remove(cand)
                print(answers)
                if answers == []:
                    # Nothing matched: keep the unfiltered candidates.
                    answers = simpleranswers
                print(answers)

                if out is not None:
                    out.write(problem + '\n')
                    out.write(answs[k] + "\n")
                    out.write(str([x[0] for x in numlist]))
                    out.write("\n")
                    for x in answers:
                        out.write(x + "\n")
                    out.write("___\n")
                if VERBOSE:
                    input()
                if not TRAIN:
                    continue

                # Prefer equations ending in "= <token>", then pick one at
                # random.
                ends_with_eq = [
                    x for x in answers if x.split(" ")[-2] == "="
                ]
                if ends_with_eq:
                    answers = ends_with_eq
                answers = [answers[randint(0, len(answers) - 1)]]

                for j, eq in enumerate(answers):
                    trips = []
                    print(j, eq)
                    l, r = [x.strip().split(' ') for x in eq.split('=')]
                    # 'simplex' is the one-token side, 'compound' the other.
                    compound = r if len(l) == 1 else l
                    simplex = l if len(l) == 1 else r
                    target = simplex[0]
                    target = (target, objs[target])

                    # Fold one binary sub-expression per pass.
                    while len(compound) > 1:
                        if "(" in compound:
                            # Innermost group: last "(" and the first ")"
                            # after it.
                            open_idx = (len(compound) - 1) - compound[::-1].index('(')
                            close_idx = open_idx + compound[open_idx:].index(")")
                            subeq = compound[open_idx + 1:close_idx]
                            substr = "(" + ''.join(subeq) + ")"
                            compound = (compound[:open_idx] + [substr]
                                        + compound[close_idx + 1:])
                        else:
                            subeq = compound[0:3]
                            substr = "(" + ''.join(subeq) + ")"
                            compound = [substr] + compound[3:]
                        left_tok, op, right_tok = subeq
                        p = objs[left_tok]
                        e = objs[right_tok]
                        op = op.strip()
                        trips.append((op, p, e))
                        # Register the folded expression under its own text.
                        objs[substr] = (0, makesets.combine(p[1], e[1], op))

                    if simplex == l:
                        trips.append(
                            ("=", objs[simplex[0]], objs[compound[0]]))
                    else:
                        trips.append(
                            ("=", objs[compound[0]], objs[simplex[0]]))

                    t = training(trips, problem, target)
                    for op in t:
                        bigtexamples[op][0].extend(t[op][0])
                        bigtexamples[op][1].extend(t[op][1])
                        print(op, len(bigtexamples[op][0]))
    finally:
        if out is not None:
            out.close()

    if TRAIN:
        with open('data/' + OUT + ".training", 'wb') as fh:
            pickle.dump(bigtexamples, fh)
def make_eq(q, a, equations):
    """Collect per-operator training examples and pickle them.

    ``q`` is indexed as a sequence of problem texts and ``equations[k]`` is
    the identifier handed to ``utils.read_parse`` and ``utils.get_k_eqs``
    for problem ``k``.  ``a`` is accepted but not used.  The result is
    written to ``'data/' + sys.argv[1][-1] + ".local.training"``.

    The process is terminated when the first equation of a problem mentions
    an operand that was not extracted from the text.
    """
    bigtexamples = {x: ([], []) for x in ["+", "*", '/', '-', '=']}
    wps = q
    syntax = ['(', ')', '+', '-', '/', '*', '=']

    for k in range(len(wps)):
        problem = utils.preprocess_problem(wps[k])
        print(k)
        print(problem)

        story = utils.read_parse(int(equations[k]))
        eqs = utils.get_k_eqs(equations[k])
        # Keep the second field of every entry whose first field equals 1.
        answers = [x[1] for x in eqs if x[0] == 1]
        if answers == []:
            continue
        answers = list(set(answers))
        print(story["sentences"][0]["text"])
        print(answers)

        sets = makesets.makesets(story['sentences'])
        if not any(s[1].num == 'x' for s in sets):
            print("NO X WHY")
            continue

        numlist = [(utils.cleannum(v.num), v) for _, v in sets]
        numlist = [x for x in numlist if x[0] != '']
        objs = {key: (0, v) for key, v in numlist}
        print(objs.items())

        # Every operand of the first equation must be a known quantity.
        consts = [x for x in answers[0].split(" ") if x not in syntax]
        present = [x for x in consts if x in objs]
        if present != consts:
            print(present, consts)
            print("missing thing")
            # sys.exit() instead of the site-provided exit() helper, which
            # is not guaranteed to exist in every interpreter setup.
            sys.exit()

        for j, eq in enumerate(answers):
            trips = []
            print(j, eq)
            l, r = [x.strip().split(' ') for x in eq.split('=')]
            target = ('x', objs['x'])

            # Fold both sides, one binary sub-expression per pass.
            for compound in [l, r]:
                while len(compound) > 1:
                    if "(" in compound:
                        # Innermost group: last "(" and the first ")" after it.
                        open_idx = (len(compound) - 1) - compound[::-1].index('(')
                        close_idx = open_idx + compound[open_idx:].index(")")
                        subeq = compound[open_idx + 1:close_idx]
                        substr = "(" + ''.join(subeq) + ")"
                        compound = (compound[:open_idx] + [substr]
                                    + compound[close_idx + 1:])
                    else:
                        subeq = compound[0:3]
                        substr = "(" + ''.join(subeq) + ")"
                        compound = [substr] + compound[3:]
                    left_tok, op, right_tok = subeq
                    p = objs[left_tok]
                    e = objs[right_tok]
                    op = op.strip()
                    trips.append((op, p, e))
                    # Register the folded expression under its own text.
                    objs[substr] = (0, makesets.combine(p[1], e[1], op))

            t = training(trips, problem, story, target)
            for op in t:
                bigtexamples[op][0].extend(t[op][0])
                bigtexamples[op][1].extend(t[op][1])

    with open('data/' + sys.argv[1][-1] + ".local.training", 'wb') as f:
        pickle.dump(bigtexamples, f)
def make_eq(q):
    """Build per-operator training examples from gold equations.

    ``parse(q)`` yields two parallel sequences: problems (the first element
    of each non-empty entry is the text) and, per problem, the equations to
    learn from.  The accumulated examples are pickled to
    'data/gold_training.pickle'.

    NOTE(review): ``problematic`` is not assigned in this function; it is
    presumably a module-level file object -- confirm.

    The pickle file is now closed deterministically.
    """
    bigtexamples = {x: ([], []) for x in ["+", "*", '/', '-', '=']}
    wps, eqs = parse(q)

    for k in range(len(wps)):
        if len(wps[k]) == 0:
            continue
        problem = wps[k][0].lower()
        story = nlp.parse(problem)
        sets = makesets.makesets(story['sentences'])
        print(sets)

        # Drop sets that share an idx, preferring one whose num contains a
        # digit.
        i = 0
        while i < len(sets):
            dups = [
                y for y in sets
                if y[1].idx is not None and y[1].idx == sets[i][1].idx
            ]
            if len(dups) > 1:
                good = [
                    y for y in dups if any(ch.isdigit() for ch in y[1].num)
                ]
                if good:
                    for extra in [x for x in dups if x != good[0]]:
                        sets.remove(extra)
                else:
                    # No numeric candidate: keep the first one.
                    for extra in dups[1:]:
                        sets.remove(extra)
            i += 1

        if not any(s[1].num == 'x' for s in sets):
            problematic.write('no x :' + problem)
            continue
        # TODO look for 2 xes

        numlist = [(cleannum(v.num), v) for _, v in sets]
        numlist = [x for x in numlist if x[0] != '']
        objs = {key: (0, v) for key, v in numlist}

        answers = eqs[k]
        for j, eq in enumerate(answers):
            trips = []
            print(j, eq)
            l, r = [x.strip().split(' ') for x in eq.split('=')]
            # 'simplex' is the one-token side, 'compound' the other one.
            compound = r if len(l) == 1 else l
            simplex = l if len(l) == 1 else r
            target = simplex[0]
            target = (target, objs[target])

            # Fold one binary sub-expression per pass until a single token
            # is left.
            while len(compound) > 1:
                if "(" in compound:
                    # Innermost group: last "(" and the first ")" after it.
                    open_idx = (len(compound) - 1) - compound[::-1].index('(')
                    close_idx = open_idx + compound[open_idx:].index(")")
                    subeq = compound[open_idx + 1:close_idx]
                    substr = "(" + ''.join(subeq) + ")"
                    compound = (compound[:open_idx] + [substr]
                                + compound[close_idx + 1:])
                else:
                    subeq = compound[0:3]
                    substr = "(" + ''.join(subeq) + ")"
                    compound = [substr] + compound[3:]
                left_tok, op, right_tok = subeq
                p = objs[left_tok]
                e = objs[right_tok]
                op = op.strip()
                trips.append((op, p, e))
                # Register the folded expression under its own text.
                objs[substr] = (0, makesets.combine(p[1], e[1], op))

            if simplex == l:
                trips.append(("=", objs[simplex[0]], objs[compound[0]]))
            else:
                trips.append(("=", objs[compound[0]], objs[simplex[0]]))

            t = training(trips, problem, target)
            for op in t:
                bigtexamples[op][0].extend(t[op][0])
                bigtexamples[op][1].extend(t[op][1])
                print(op, len(bigtexamples[op][0]))

    with open('data/gold_training.pickle', 'wb') as fh:
        pickle.dump(bigtexamples, fh)
def dotrain():
    """Build per-operator training examples and pickle them.

    Problems and answers are read line by line from ``sys.argv[1]`` and
    ``sys.argv[2]`` when arguments are given, otherwise from
    'emnlp_noIrrelev_p.txt' and 'emnlp_noIrrelev_a.txt'.  For each problem
    the text is parsed, candidate equations are obtained from ``Solver``,
    filtered, and folded into ``(operator, left, right)`` triples that are
    passed to ``training``.  Problems that are skipped are logged to
    'nogoodtrainproblems'; the accumulated examples are written to
    'data/dev_training.pickle'.

    All files are now closed deterministically, and the emptiness check on
    the solver result happens before the result is first iterated.
    """
    if len(sys.argv) > 1:
        problem_path, answer_path = sys.argv[1], sys.argv[2]
    else:
        problem_path = "emnlp_noIrrelev_p.txt"
        answer_path = "emnlp_noIrrelev_a.txt"
    with open(problem_path) as fh:
        wps = fh.readlines()
    with open(answer_path) as fh:
        answs = fh.readlines()

    bigtexamples = {x: ([], []) for x in ["+", "*", '/', '-', '=']}
    # Applied in insertion order; the surrounding spaces keep matches to
    # whole words.
    replacements = {
        ' two ': ' 2 ',
        " three ": ' 3 ',
        ' four ': ' 4 ',
        ' five ': ' 5 ',
        ' six ': ' 6 ',
        ' seven ': ' 7 ',
        ' eight ': ' 8 ',
        ' nine ': ' 9 ',
        ' ten ': ' 10 ',
        ' eleven ': ' 11 ',
        ' week ': ' 7 days ',
        ' dozen ': ' 12 of ',
        ' dozens ': ' 12 ',
        ' twice ': ' 2 ',
    }
    syntax = ['+', '-', '/', '=', ')', '(', '*']

    def operands(equation):
        # Equation tokens that are neither operators nor parentheses.
        return [tok for tok in equation.split(" ") if tok not in syntax]

    with open('nogoodtrainproblems', 'w') as problematic:
        for k in range(len(wps)):
            print(k)
            problem = wps[k].lower()
            for word, digits in replacements.items():
                problem = problem.replace(word, digits)

            story = nlp.parse(problem)
            numbs = makesets.makesets(story['sentences'])
            numlist = [(cleannum(v.num), v) for _, v in numbs]
            numlist = [x for x in numlist if x[0] != '']
            names = {str(x[0]) for x in numlist}
            if 'x' not in names and 'x*' not in names:
                problematic.write('no x :' + problem)
                continue
            objs = {key: (0, v) for key, v in numlist}

            print('start solving')
            print(numlist)
            if len(numlist) < 2:
                problematic.write("not enough numbers : " + problem)
                continue
            ST = Solver([x[0] for x in numlist if x[0] != 'x'])
            answers = ST.solveEquations(float(answs[k]))
            print('done solving')
            if not answers:
                continue

            # Keep equations whose '=' is the second or second-to-last token.
            simpleranswers = [
                x for x in answers
                if x.split(" ")[1] == '=' or x.split(" ")[-2] == "="
            ]
            if simpleranswers:
                answers = simpleranswers
            else:
                print(answers)
                problematic.write("not simple : " + problem)
                continue

            # Prefer equations where 'x' sits at the same operand position
            # as it does among the numbers extracted from the text.
            answervals = operands(answers[0])
            numvals = [x[0] for x in numlist if x[0] in answervals]
            xidx = numvals.index("x")
            xrightanswers = [
                eq for eq in answers if operands(eq).index('x') == xidx
            ]
            if xrightanswers:
                answers = xrightanswers

            for j, eq in enumerate(answers):
                trips = []
                print(j, eq)
                l, r = [x.strip().split(' ') for x in eq.split('=')]
                # 'simplex' is the one-token side, 'compound' the other one.
                compound = l if len(r) == 1 else r
                simplex = l if len(l) == 1 else r
                target = simplex[0]
                target = (target, objs[target])

                # Fold one binary sub-expression per pass until a single
                # token is left.
                while len(compound) > 1:
                    if "(" in compound:
                        # Innermost group: last "(" and the first ")" after it.
                        open_idx = (len(compound) - 1) - compound[::-1].index('(')
                        close_idx = open_idx + compound[open_idx:].index(")")
                        subeq = compound[open_idx + 1:close_idx]
                        substr = "(" + ''.join(subeq) + ")"
                        compound = (compound[:open_idx] + [substr]
                                    + compound[close_idx + 1:])
                    else:
                        subeq = compound[0:3]
                        substr = "(" + ''.join(subeq) + ")"
                        compound = [substr] + compound[3:]
                    left_tok, op, right_tok = subeq
                    p = objs[left_tok]
                    e = objs[right_tok]
                    op = op.strip()
                    trips.append((op, p, e))
                    # Register the folded expression under its own text so
                    # later passes can look it up like any other operand.
                    objs[substr] = (0, makesets.combine(p[1], e[1], op))

                if simplex == l:
                    trips.append(("=", objs[simplex[0]], objs[compound[0]]))
                else:
                    trips.append(("=", objs[compound[0]], objs[simplex[0]]))

                t = training(trips, problem, target)
                for op in t:
                    bigtexamples[op][0].extend(t[op][0])
                    bigtexamples[op][1].extend(t[op][1])
                    print(op, len(bigtexamples[op][0]))

    with open('data/dev_training.pickle', 'wb') as fh:
        pickle.dump(bigtexamples, fh)
def make_eq(q, a, equations):
    """Write one feature row per candidate equation to a '.global.data' file.

    ``q`` is indexed as a sequence of problem strings and
    ``get_k_eqs(equations[k])`` yields ``(j, equation)`` pairs for problem
    ``k``.  ``a`` is accepted but not used.  Both sides of each equation
    are folded into single combined objects which, together with the
    problem context, are passed to ``training``; every returned row is
    written to ``"data/" + sys.argv[1][-1] + ".global.data"`` as its first
    element followed by 1-based ``index:value`` pairs.

    The output file is now closed (and therefore flushed) before returning.
    """
    tdata = []
    wps = q
    operators = ['(', ')', '+', '-', '/', '*']

    for k in range(len(wps)):
        answers = get_k_eqs(equations[k])
        if answers == []:
            continue
        answers = list(set(answers))

        # Light tokenisation: split trailing punctuation off each word.
        words = wps[k].strip().split(" ")
        for i, word in enumerate(words):
            if word and word[-1] in [',', '.', '?']:
                words[i] = word[:-1] + " " + word[-1]
        problem = " " + ' '.join(words) + " "
        print(problem)

        story = nlp.parse(problem)
        sets = makesets.makesets(story['sentences'])
        if not any(s[1].num == 'x' for s in sets):
            print("NO X WHY")
            continue
        # TODO look for 2 xes

        numlist = [(cleannum(v.num), v) for _, v in sets]
        numlist = [x for x in numlist if x[0] != '']
        objs = {key: (0, v) for key, v in numlist}
        print(objs.items())

        # Every operand of the first equation must be a known quantity.
        consts = [
            x for x in answers[0][1].split(" ")
            if x not in operators + ['=']
        ]
        present = [x for x in consts if x in objs]
        if consts != present:
            print(present, consts)
            print("missing thing")
            continue
        # 1 when the operands appear in the same order as in the text.
        order = int(consts == [x[0] for x in numlist])

        for j, eq in answers:
            print(j, eq)
            l, r = [x.strip().split(' ') for x in eq.split('=')]

            # The two operands adjacent to '=' in the FIRST equation: the
            # last one on its left side and the first one on its right.
            flat = " ".join(
                x for x in answers[0][1].split(" ") if x not in operators
            )
            halves = flat.split(" = ")
            sp = (objs[halves[0].split(" ")[-1]][1],
                  objs[halves[1].split(" ")[0]][1])

            target = ('x', objs['x'])

            # Fold each side down to a single token and keep its object.
            sides = []
            for compound in [l, r]:
                while len(compound) > 1:
                    if "(" in compound:
                        # Innermost group: last "(" and the first ")" after it.
                        open_idx = (len(compound) - 1) - compound[::-1].index('(')
                        close_idx = open_idx + compound[open_idx:].index(")")
                        subeq = compound[open_idx + 1:close_idx]
                        substr = "(" + ''.join(subeq) + ")"
                        compound = (compound[:open_idx] + [substr]
                                    + compound[close_idx + 1:])
                    else:
                        subeq = compound[0:3]
                        substr = "(" + ''.join(subeq) + ")"
                        compound = [substr] + compound[3:]
                    left_tok, op, right_tok = subeq
                    p = objs[left_tok]
                    e = objs[right_tok]
                    op = op.strip()
                    # Register the folded expression under its own text.
                    objs[substr] = (0, makesets.combine(p[1], e[1], op))
                sides.append(objs[compound[0]])

            tdata.append(
                training(sides[0], sides[1], problem, story, target, j,
                         order, sp))

    with open("data/" + sys.argv[1][-1] + ".global.data", 'w') as f:
        for v in tdata:
            f.write(str(v[0]) + " ")
            for pos, value in enumerate(v[1:]):
                f.write(str(pos + 1) + ":" + str(value) + " ")
            f.write("\n")
def make_eq(q, a, equations):
    """Collect per-operator training examples and pickle them.

    ``q`` is indexed as a sequence of problem texts and ``equations[k]`` is
    the identifier handed to ``utils.read_parse`` and ``utils.get_k_eqs``
    for problem ``k``.  ``a`` is accepted but not used.  The result is
    written to ``'data/' + sys.argv[1][-1] + ".local.training"``.

    The process is terminated when the first equation of a problem mentions
    an operand that was not extracted from the text.
    """
    bigtexamples = {x: ([], []) for x in ["+", "*", '/', '-', '=']}
    wps = q
    syntax = ['(', ')', '+', '-', '/', '*', '=']

    for k in range(len(wps)):
        problem = utils.preprocess_problem(wps[k])
        print(k)
        print(problem)

        story = utils.read_parse(int(equations[k]))
        eqs = utils.get_k_eqs(equations[k])
        # Keep the second field of every entry whose first field equals 1.
        answers = [x[1] for x in eqs if x[0] == 1]
        if answers == []:
            continue
        answers = list(set(answers))
        print(story["sentences"][0]["text"])
        print(answers)

        sets = makesets.makesets(story['sentences'])
        if not any(s[1].num == 'x' for s in sets):
            print("NO X WHY")
            continue

        numlist = [(utils.cleannum(v.num), v) for _, v in sets]
        numlist = [x for x in numlist if x[0] != '']
        objs = {key: (0, v) for key, v in numlist}
        print(objs.items())

        # Every operand of the first equation must be a known quantity.
        consts = [x for x in answers[0].split(" ") if x not in syntax]
        present = [x for x in consts if x in objs]
        if present != consts:
            print(present, consts)
            print("missing thing")
            # sys.exit() instead of the site-provided exit() helper, which
            # is not guaranteed to exist in every interpreter setup.
            sys.exit()

        for j, eq in enumerate(answers):
            trips = []
            print(j, eq)
            l, r = [x.strip().split(' ') for x in eq.split('=')]
            target = ('x', objs['x'])

            # Fold both sides, one binary sub-expression per pass.
            for compound in [l, r]:
                while len(compound) > 1:
                    if "(" in compound:
                        # Innermost group: last "(" and the first ")" after it.
                        open_idx = (len(compound) - 1) - compound[::-1].index('(')
                        close_idx = open_idx + compound[open_idx:].index(")")
                        subeq = compound[open_idx + 1:close_idx]
                        substr = "(" + ''.join(subeq) + ")"
                        compound = (compound[:open_idx] + [substr]
                                    + compound[close_idx + 1:])
                    else:
                        subeq = compound[0:3]
                        substr = "(" + ''.join(subeq) + ")"
                        compound = [substr] + compound[3:]
                    left_tok, op, right_tok = subeq
                    p = objs[left_tok]
                    e = objs[right_tok]
                    op = op.strip()
                    trips.append((op, p, e))
                    # Register the folded expression under its own text.
                    objs[substr] = (0, makesets.combine(p[1], e[1], op))

            t = training(trips, problem, story, target)
            for op in t:
                bigtexamples[op][0].extend(t[op][0])
                bigtexamples[op][1].extend(t[op][1])

    with open('data/' + sys.argv[1][-1] + ".local.training", 'wb') as f:
        pickle.dump(bigtexamples, f)
def make_eq(q, a, equations):
    """Collect per-operator training examples for every problem and pickle them.

    Only equations whose last two tokens are ``= x`` are used.  Each problem
    is lightly tokenized, parsed with ``nlp.parse``, and every equation side
    is reduced to ``(op, left, right)`` triples (innermost parenthesised
    group first).  The triples are handed to ``training`` and its per-operator
    output is appended to the accumulated examples.

    Args:
        q: Sequence of raw word-problem strings.
        a: Unused.  Kept so existing callers do not break.
        equations: Sequence parallel to ``q``; ``equations[k]`` is passed to
            ``get_k_eqs``.

    Side effects:
        Prints progress information and writes the accumulated examples to
        ``data/ixl.local.training``.
    """
    operator_tokens = ['(', ')', '+', '-', '/', '*', '=']
    # operator -> pair of parallel lists, extended from training()'s output.
    # presumably (feature vectors, labels) -- confirm against training().
    bigtexamples = {x: ([], []) for x in ["+", "*", '/', '-', '=']}
    wps = q

    for k, raw_problem in enumerate(wps):
        eqs = get_k_eqs(equations[k])
        # Keep only the equation strings whose first field is 1.
        answers = [x[1] for x in eqs if x[0] == 1]
        # Keep equations of the form "... = x".  Comparing a slice means an
        # equation with fewer than two tokens is dropped instead of raising
        # IndexError.
        answers = [x for x in answers if x.split()[-2:] == ['=', 'x']]
        if not answers:
            continue
        # De-duplicate while keeping first-seen order so that answers[0] and
        # the order of the generated examples do not depend on string hashing.
        answers = list(dict.fromkeys(answers))

        # First preprocessing, tokenize slightly: detach a trailing comma,
        # period or question mark from its word.
        tokens = raw_problem.strip().split(" ")
        for i, tok in enumerate(tokens):
            if tok.endswith((',', '.', '?')):
                tokens[i] = tok[:-1] + " " + tok[-1]
        problem = " " + ' '.join(tokens) + " "
        print(k)
        print(problem)

        # make story
        story = nlp.parse(problem)
        sets = makesets.makesets(story['sentences'])
        if not any(entity.num == 'x' for _, entity in sets):
            print("NO X WHY")
            continue

        numlist = [(cleannum(entity.num), entity) for _, entity in sets]
        numlist = [pair for pair in numlist if pair[0] != '']
        # operand token -> (0, entity); reduced sub-expressions are added below.
        objs = {num: (0, entity) for num, entity in numlist}
        print(objs.items())

        # Every operand of the first equation must have been found in the story.
        consts = [tok for tok in answers[0].split(" ")
                  if tok not in operator_tokens]
        present = [tok for tok in consts if tok in objs]
        if present != consts:
            print(present, consts)
            print("missing thing")
            continue

        for j, eq in enumerate(answers):
            trips = []
            print(j, eq)
            lhs, rhs = [side.strip().split(' ') for side in eq.split('=')]

            target = 'x'
            target = (target, objs[target])

            for compound in (lhs, rhs):
                # Collapse one binary operation per pass until a single token
                # is left on this side of the equation.
                while len(compound) > 1:
                    if "(" in compound:
                        # The last "(" opens an innermost group; the first ")"
                        # after it closes that group.
                        open_idx = (len(compound) - 1) - compound[::-1].index('(')
                        close_idx = open_idx + compound[open_idx:].index(")")
                        subeq = compound[open_idx + 1:close_idx]
                        substr = "(" + ''.join(subeq) + ")"
                        compound = (compound[:open_idx] + [substr]
                                    + compound[close_idx + 1:])
                    else:
                        # No parentheses left: take the leading "a op b".
                        subeq = compound[0:3]
                        substr = "(" + ''.join(subeq) + ")"
                        compound = [substr] + compound[3:]

                    p, op, e = subeq
                    p = objs[p]
                    e = objs[e]
                    op = op.strip()
                    trips.append((op, p, e))
                    # Register the combined entity under its collapsed token
                    # so enclosing expressions can refer to it.
                    objs[substr] = (0, makesets.combine(p[1], e[1], op))

            t = training(trips, problem, story, target, sets)
            for op in t:
                bigtexamples[op][0].extend(t[op][0])
                bigtexamples[op][1].extend(t[op][1])

    # Context manager so the pickle file is flushed and closed deterministically.
    with open('data/ixl.local.training', 'wb') as f:
        pickle.dump(bigtexamples, f)
def make_eq(q, a, VERBOSE, TRAIN):
    """Find equations that reproduce each problem's answer; log or train on them.

    Every line of ``q`` is lower-cased, lightly tokenized, has a handful of
    number words replaced by digits and is parsed with ``nlp.parse``.  The
    numeric entities returned by ``makesets.makesets`` are given to
    ``Solver``, which is asked for equations evaluating to the float on the
    matching line of ``a``.  Candidate equations are narrowed to those with
    ``=`` as the second or second-to-last token and, when possible, to those
    whose ``x`` sits at the same operand position as in the story.

    Args:
        q: Path of a text file with one word problem per line.
        a: Path of a text file with one numeric answer per line, parallel
            to ``q``.
        VERBOSE: Interactive mode.  All problems are listed, the index of the
            problem to process is read from stdin on every iteration, and the
            function waits for Enter at two points.
        TRAIN: When true, one surviving equation per problem is chosen at
            random, reduced to ``(op, left, right)`` triples, passed to
            ``training``, and the accumulated examples are pickled to
            ``'data/' + OUT + '.training'``.  When false, processing of a
            problem stops after the candidate equations have been logged.

    Side effects:
        Prints debugging output.  Always (re)creates ``somethingWrongProblems``
        with one record per skipped problem.  When neither ``TRAIN`` nor
        ``VERBOSE`` is set, writes the problem, answer, operand list and
        candidate equations to ``q + '.out.txt'``.
    """
    # operator -> pair of parallel lists, extended from training()'s output.
    # presumably (feature vectors, labels) -- confirm against training().
    bigtexamples = {x: ([], []) for x in ["+", "*", '/', '-', '=']}
    with open(q) as handle:
        wps = handle.readlines()
    with open(a) as handle:
        answs = handle.readlines()

    replacements = {' two ': ' 2 ', " three ": ' 3 ', ' four ': ' 4 ',
                    ' five ': ' 5 ', ' six ': ' 6 ', ' seven ': ' 7 ',
                    ' eight ': ' 8 ', ' nine ': ' 9 ', ' ten ': ' 10 ',
                    ' eleven ': ' 11 ', ' twice ': ' 2 '}
    operator_tokens = ["/", "-", '+', '*', '=', '(', ')']

    out = None
    problematic = None
    try:
        if not TRAIN and not VERBOSE:
            out = open(q + ".out.txt", 'w')
        problematic = open('somethingWrongProblems', 'w')

        for k in range(len(wps)):
            if VERBOSE:
                # Let the user choose which problem to look at.
                for i, line in enumerate(wps):
                    print(i, line)
                k = int(input())
            print(k)
            problem = wps[k].lower()

            # First preprocessing, tokenize slightly: detach a trailing
            # comma, period or question mark from its word.
            tokens = problem.strip().split(" ")
            for i, tok in enumerate(tokens):
                if tok.endswith((',', '.', '?')):
                    tokens[i] = tok[:-1] + " " + tok[-1]
            problem = " " + ' '.join(tokens) + " "
            print(problem)

            for word, digit in replacements.items():
                problem = problem.replace(word, digit)

            story = nlp.parse(problem)
            sets = makesets.makesets(story['sentences'])
            print(sets)

            # Drop entries that share an idx with another entry, preferring to
            # keep one whose num contains a digit.  ``sets`` shrinks while it
            # is being walked, hence the explicit index loop.
            i = 0
            while i < len(sets):
                dups = [y for y in sets if y[1].idx is not None]
                dups = [y for y in dups if y[1].idx == sets[i][1].idx]
                if len(dups) > 1:
                    good = [y for y in dups
                            if any(ch.isdigit() for ch in y[1].num)]
                    if good:
                        others = [x for x in dups if x != good[0]]
                        for x in others:
                            sets.remove(x)
                    else:
                        # just pick 1
                        for x in dups[1:]:
                            sets.remove(x)
                i += 1

            # TODO look for 2 xes
            if not any(x[1].num == 'x' for x in sets):
                problematic.write('no x :' + problem + "\n")
                continue

            numlist = [(cleannum(entity.num), entity) for _, entity in sets]
            numlist = [pair for pair in numlist if pair[0] != '']
            if VERBOSE:
                for _, entity in numlist:
                    entity.details()
                # NOTE(review): pause placed after all details are shown --
                # confirm this matches the intended interactive flow.
                input()

            # operand token -> (0, entity); reduced sub-expressions are added
            # while equations are processed below.
            objs = {num: (0, entity) for num, entity in numlist}

            print('start solving')
            print(numlist)
            if len(numlist) < 2:
                problematic.write("not enough numbers : " + problem + "\n")
                continue

            values = [pair[0] for pair in numlist if pair[0] != 'x']
            print(values)
            ST = Solver(values)
            answers = ST.solveEquations(float(answs[k]))
            if not answers:
                problematic.write("No answers : " + problem + "\n")
                problematic.write(str([pair[0] for pair in numlist]) + '\n')
                problematic.write(answs[k] + '\n')
                continue
            print('done solving')

            # filter out where = in middle if simpler eq exists
            simpleranswers = [x for x in answers
                              if x.split(" ")[1] == '=' or x.split(" ")[-2] == "="]
            if simpleranswers:
                answers = simpleranswers[:]
            else:
                problematic.write("not simple : " + problem + "\n")
                continue

            # Prefer equations in which x occupies the same operand position
            # as it does among the story's numbers.
            values = [pair[0] for pair in numlist]
            xidx = values.index('x')
            print(xidx)
            for candidate in simpleranswers:
                aspl = [tok for tok in candidate.split(" ")
                        if tok not in operator_tokens]
                print(candidate)
                print(aspl)
                print(values)
                aidx = aspl.index('x')
                print(aidx)
                if aidx != xidx:
                    print("removing ", candidate)
                    answers.remove(candidate)
            print(answers)
            if not answers:
                # Nothing matched the position of x: fall back to all of them.
                answers = simpleranswers
            print(answers)

            # ``out`` is only open when neither VERBOSE nor TRAIN is set.
            if out is not None:
                out.write(problem + '\n')
                out.write(answs[k] + "\n")
                out.write(str([pair[0] for pair in numlist]))
                out.write("\n")
                for x in answers:
                    out.write(x + "\n")
                out.write("___\n")

            if VERBOSE:
                input()
            if not TRAIN:
                continue

            # Train on a single randomly chosen equation per problem.
            c = randint(0, len(answers) - 1)
            answers = [answers[c]]

            for j, eq in enumerate(answers):
                trips = []
                print(j, eq)
                lhs, rhs = [side.strip().split(' ') for side in eq.split('=')]
                # The single-token side names the target; the other side is
                # the expression to reduce.
                compound = rhs if len(lhs) == 1 else lhs
                simplex = lhs if len(lhs) == 1 else rhs
                target = simplex[0]
                target = (target, objs[target])

                # Collapse one binary operation per pass until a single token
                # is left.
                while len(compound) > 1:
                    if "(" in compound:
                        # The last "(" opens an innermost group; the first ")"
                        # after it closes that group.
                        open_idx = (len(compound) - 1) - compound[::-1].index('(')
                        close_idx = open_idx + compound[open_idx:].index(")")
                        subeq = compound[open_idx + 1:close_idx]
                        substr = "(" + ''.join(subeq) + ")"
                        compound = (compound[:open_idx] + [substr]
                                    + compound[close_idx + 1:])
                    else:
                        # No parentheses left: take the leading "a op b".
                        subeq = compound[0:3]
                        substr = "(" + ''.join(subeq) + ")"
                        compound = [substr] + compound[3:]

                    p, op, e = subeq
                    p = objs[p]
                    e = objs[e]
                    op = op.strip()
                    trips.append((op, p, e))
                    # Register the combined entity under its collapsed token
                    # so enclosing expressions can refer to it.
                    objs[substr] = (0, makesets.combine(p[1], e[1], op))

                # Final "=" triple keeps the operands in the order they appear
                # in the equation.
                if simplex == lhs:
                    trips.append(("=", objs[simplex[0]], objs[compound[0]]))
                else:
                    trips.append(("=", objs[compound[0]], objs[simplex[0]]))

                t = training(trips, problem, target)
                for op in t:
                    bigtexamples[op][0].extend(t[op][0])
                    bigtexamples[op][1].extend(t[op][1])
                    print(op, len(bigtexamples[op][0]))
    finally:
        # Close the log files even when a problem raises part-way through.
        for handle in (out, problematic):
            if handle is not None:
                handle.close()

    if TRAIN:
        with open('data/' + OUT + ".training", 'wb') as handle:
            pickle.dump(bigtexamples, handle)