def clean_review_clauses():
    """Reset ``review_clauses`` to NULL for reviews whose clauses score badly.

    Loads every row of ``review`` that has non-null ``review_clauses``,
    strips the ``###`` markers from the clauses and compares the result with
    ``review_text`` through ``jaccard_distance``.  Rows whose third returned
    value is below 0.6 are treated as bad: the score and id are printed and
    the row's ``review_clauses`` column is set to NULL afterwards.
    """
    con = mydb.getCon(mydb.get_CONN())
    select_sql = 'select id,review_text,review_clauses from review where review_clauses is not null'
    update_sql = 'update review set review_clauses = NULL where id = __idx__'

    rejected_ids = []
    for row in mydb.executeQueryResult(con, select_sql, False):
        row_id = row[0]
        stripped_clauses = row[2].replace('###', '')
        _, _, score = jaccard_distance(row[1], stripped_clauses)
        if not score < 0.6:
            continue
        # Below the threshold: we think this would be a bad one.
        # Single formatted string so the output is "score id" on Python 2 and 3.
        print("%s %s" % (score, row_id))
        rejected_ids.append(row_id)

    # Updates are issued only after the scan has finished.
    for row_id in rejected_ids:
        mydb.executeQuery(con, update_sql.replace('__idx__', str(row_id)), False)
def _group_coref_tokens(tokens):
    """Glue each ``<-- ... -->`` annotation back into a single token.

    A token containing ``<--`` opens a span that runs through the first
    following token ending with ``-->``; the tokens of the span are joined
    with single spaces.  An unterminated span is cut at the last token
    instead of indexing past the end of the list.

    Args:
        tokens: whitespace-split tokens of the annotated text.

    Returns:
        ``(grouped, opened)``: the regrouped token list and the number of
        spans that were opened.
    """
    grouped = []
    opened = 0
    last = len(tokens) - 1
    i = 0
    while i <= last:
        if "<--" in tokens[i]:
            opened += 1
            j = i
            while j < last and not tokens[j].endswith("-->"):
                j += 1
            grouped.append(' '.join(tokens[i:j + 1]))
            i = j
        else:
            grouped.append(tokens[i])
        i += 1
    return grouped, opened


def _context_agrees(coref_tokens, coref_pos, clause_tokens, clause_pos, step):
    """Compare up to three neighbouring words on one side of two positions.

    Args:
        coref_tokens: regrouped tokens of the annotated text.
        coref_pos: index into ``coref_tokens``.
        clause_tokens: tokens of the clause text.
        clause_pos: index into ``clause_tokens``.
        step: ``-1`` to inspect the preceding words, ``+1`` for the following.

    Returns:
        True when, after normalising back-ticks (and ``\\/`` on the coref
        side), one concatenated context is a prefix or suffix of the other.
        Two empty contexts (position at a boundary) count as agreeing.
    """
    coref_ctx = ''
    clause_ctx = ''
    for offset in range(1, 4):
        c = coref_pos + step * offset
        k = clause_pos + step * offset
        # Stop as soon as either side runs out of neighbours.
        if not (0 <= c < len(coref_tokens) and 0 <= k < len(clause_tokens)):
            break
        coref_word = getCorefWord(c, coref_tokens)[1]
        clause_word = getClauseWord(k, clause_tokens)
        if step < 0:
            # Walking backwards: prepend so the words stay in reading order.
            coref_ctx = coref_word + coref_ctx
            clause_ctx = clause_word + clause_ctx
        else:
            coref_ctx += coref_word
            clause_ctx += clause_word
    clause_ctx = clause_ctx.replace("`", "'")
    coref_ctx = coref_ctx.replace("`", "'").replace("\\/", "/")
    return (coref_ctx.endswith(clause_ctx) or clause_ctx.endswith(coref_ctx)
            or coref_ctx.startswith(clause_ctx) or clause_ctx.startswith(coref_ctx))


def _extract_annotation(token):
    """Return the ``<--...-->`` part of ``token``, or None if a marker is missing."""
    start = token.find("<--")
    end = token.find("-->")
    if start < 0 or end < 0:
        return None
    return token[start:end + 3]


def merge(file, output='data.txt'):
    """Copy ``<--...-->`` annotations from annotated review text onto clauses.

    Each non-blank line of ``file`` is parsed as JSON and must provide
    ``id`` and ``review_text``.  The stored ``review_clauses`` for that id
    are fetched from the ``review`` table; entries with no row or a NULL
    value are skipped.  For every annotated token flagged by
    ``getCorefWord``, clause tokens from the same index onwards are searched
    for the same word (via ``getClauseWord``); where the surrounding words
    agree on both sides, the annotation text is appended to that clause
    token.  The annotated clause tokens are passed through ``genJSON`` and
    stored under the integer id.

    A summary line ``<applied> <opened> <ratio>`` is printed, and the
    collected mapping is written as JSON.

    Args:
        file: path of the line-delimited JSON input.
        output: path of the JSON file to write; defaults to ``data.txt``,
            the location that used to be hard-coded.
    """
    con = mydb.getCon(CONN_STRING)
    merged = {}
    total_repl = 0
    total_repl_done = 0

    with open(file) as json_lines:
        for entry in json_lines:
            if not entry.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            data = json.loads(entry)
            # NOTE(review): the id is spliced into the SQL text unescaped;
            # switch to a parameterised query if mydb supports one.
            query = "select id, review_clauses from review where id = '" + data['id'] + "'"
            result_set = mydb.executeQueryResult(con, query, False)
            if not result_set or result_set[0][1] is None:
                continue

            clause_str = result_set[0][1]
            if isinstance(clause_str, bytes):
                # Only byte strings need decoding; text is used as-is.
                clause_str = clause_str.decode('utf-8')
            clause_str = re.sub(r"(\.+)", ".", clause_str)

            coref_str = data['review_text'].replace("-RRB-", ")").replace("-LRB-", "(")
            coref_str = re.sub(r"(\.)+", ".", coref_str)
            coref_tokens, opened = _group_coref_tokens(coref_str.split())
            total_repl += opened

            clause_tokens = clause_str.split()
            annotated = list(clause_tokens)

            for coref_pos in range(len(coref_tokens)):
                coref_word = getCorefWord(coref_pos, coref_tokens)
                if not coref_word[0]:
                    continue
                # The clause search starts at the same index as the coref
                # token (both cursors advanced in lock-step originally).
                for clause_pos in range(coref_pos, len(clause_tokens)):
                    if getClauseWord(clause_pos, clause_tokens) != coref_word[1]:
                        continue
                    if not (_context_agrees(coref_tokens, coref_pos,
                                            clause_tokens, clause_pos, -1)
                            and _context_agrees(coref_tokens, coref_pos,
                                                clause_tokens, clause_pos, 1)):
                        continue
                    repl = _extract_annotation(coref_tokens[coref_pos])
                    if repl is None:
                        # Malformed (e.g. unterminated) annotation: nothing to copy.
                        continue
                    total_repl_done += 1
                    annotated[clause_pos] += repl

            merged[int(data['id'])] = genJSON(annotated)

    # Guard the ratio so an input without annotations still reaches the dump.
    ratio = float(total_repl_done) / total_repl if total_repl else 0.0
    print("%s %s %s" % (total_repl_done, total_repl, ratio))
    with open(output, 'w') as outfile:
        json.dump(merged, outfile)
def _collect_clause_mentions(clauses, repl):
    """Index replacement values by the clauses in which they occur.

    Args:
        clauses: sequence of items whose element ``[1]`` is the clause id.
        repl: mapping keyed by ``str(clause_id)``; each value is a dict whose
            keys are integer-like strings (presumably positions - confirm
            against the producer of the ``coref`` table) and whose values
            are the replacement values.

    Returns:
        ``{value: {clause_id: (smallest_key, occurrences)}}``.
    """
    mentions = {}
    for clause in clauses:
        clause_id = clause[1]
        lookup = str(clause_id)
        if lookup not in repl:
            continue
        rep = repl[lookup]
        for key in rep:
            position = int(key)
            per_clause = mentions.setdefault(rep[key], {})
            if clause_id in per_clause:
                smallest, count = per_clause[clause_id]
                per_clause[clause_id] = (min(smallest, position), count + 1)
            else:
                per_clause[clause_id] = (position, 1)
    return mentions


def rcCoref(input, output):
    """Find replacement values shared by both sides of each sentence pair.

    Each non-blank line of ``input`` is parsed as JSON and must provide
    ``id`` and ``sen_pairs``.  The JSON stored in the ``replace`` column of
    the ``coref`` table for that id is loaded; entries with no row or a NULL
    value are skipped.  For every pair in ``sen_pairs`` (element 0 and
    element 1 are the two clause lists), the values occurring on both sides
    are collected together with the clauses mentioning them.  Reviews with
    at least one shared value are handed to ``outputRes`` keyed by integer
    id.

    Args:
        input: path of the line-delimited JSON input.
        output: passed through unchanged to ``outputRes``.
    """
    con = mydb.getCon(CONN_STRING)
    by_review = {}

    with open(input) as json_lines:
        for entry in json_lines:
            if not entry.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            data = json.loads(entry)
            # int() runs before the query is built, so a non-numeric id
            # fails here rather than reaching the SQL text.
            iden = int(data['id'])
            query = "select replace from coref where id = '" + str(data['id']) + "'"
            result_set = mydb.executeQueryResult(con, query, False)
            if not result_set or result_set[0][0] is None:
                continue
            repl = json.loads(result_set[0][0])

            shared = {}
            for pair in data['sen_pairs']:
                rea_map = _collect_clause_mentions(pair[0], repl)
                cons_map = _collect_clause_mentions(pair[1], repl)
                for key in rea_map:
                    if key not in cons_map:
                        continue
                    bucket = shared.setdefault(key, {})
                    bucket.update(rea_map[key])
                    # Second-side entries win when a clause id is on both sides.
                    bucket.update(cons_map[key])

            if shared:
                by_review[iden] = shared

    outputRes(by_review, output)