Exemplo n.º 1
0
def clean_review_clauses(threshold=0.6):
    """Reset ``review_clauses`` to NULL for reviews whose clauses look wrong.

    Every row of ``review`` with a non-null ``review_clauses`` is read, the
    ``###`` separators are stripped from the clauses, and the result is
    compared with ``review_text`` through ``jaccard_distance``.  Rows whose
    score falls below ``threshold`` are printed (``score id``) and then have
    their ``review_clauses`` column set to NULL.

    Args:
        threshold: scores strictly below this value mark a row as bad.
            Defaults to 0.6, the cut-off that used to be hard-coded.

    NOTE(review): ``jaccard_distance`` is defined outside this block; only
    the third element of its result is used here, and a *low* value is
    treated as a poor match -- confirm that this is the intended reading of
    the score.
    """
    con = mydb.getCon(mydb.get_CONN())
    select_sql = 'select id,review_text,review_clauses from review where review_clauses is not null'

    bad_ids = []
    for record in mydb.executeQueryResult(con, select_sql, False):
        review_id = record[0]
        review_text = record[1]
        clauses = record[2].replace('###', '')
        _, _, score = jaccard_distance(review_text, clauses)
        if score < threshold:  # we think this would be a bad one
            # Single-string print: same output as the Python 2 statement
            # `print score, review_id`, and also valid on Python 3.
            print('%s %s' % (score, review_id))
            bad_ids.append(review_id)

    # Updates are issued only after the whole result set has been consumed.
    update_sql = 'update review set review_clauses = NULL where id = __idx__'
    for review_id in bad_ids:
        mydb.executeQuery(con, update_sql.replace('__idx__', str(review_id)), False)
Exemplo n.º 2
0
def _group_coref_tokens(tokens):
    """Collapse every ``<-- ... -->`` annotation into a single token.

    A token containing ``<--`` opens an annotation; it and the tokens that
    follow are joined with single spaces up to and including the first token
    that ends with ``-->``.  All other tokens are copied unchanged.

    Returns:
        ``(grouped_tokens, annotation_count)``.

    Raises:
        IndexError: if an annotation is opened but never closed.
    """
    grouped = []
    annotations = 0
    pos = 0
    while pos < len(tokens):
        if "<--" in tokens[pos]:
            annotations += 1
            end = pos
            while not tokens[end].endswith("-->"):
                end += 1
            grouped.append(' '.join(tokens[pos:end + 1]))
            pos = end
        else:
            grouped.append(tokens[pos])
        pos += 1
    return grouped, annotations


def _context_agrees(loc_coref, coref_tokens, loc_clause, clause_tokens, direction):
    """Tell whether the words on one side of a candidate match line up.

    Up to three neighbouring words are concatenated (without separators) on
    the coref side and on the clause side, walking backwards when
    ``direction`` is -1 and forwards when it is +1, and stopping as soon as
    either sequence runs out.  The two strings agree when one is a prefix or
    a suffix of the other; two empty contexts therefore agree.
    """
    coref_ctx = ''
    clause_ctx = ''
    for offset in (1, 2, 3):
        coref_idx = loc_coref + direction * offset
        clause_idx = loc_clause + direction * offset
        if not (0 <= coref_idx < len(coref_tokens)
                and 0 <= clause_idx < len(clause_tokens)):
            break
        coref_word = getCorefWord(coref_idx, coref_tokens)[1]
        clause_word = getClauseWord(clause_idx, clause_tokens)
        if direction < 0:
            # Walking backwards: prepend so the words stay in reading order.
            coref_ctx = coref_word + coref_ctx
            clause_ctx = clause_word + clause_ctx
        else:
            coref_ctx += coref_word
            clause_ctx += clause_word

    # Normalise the quoting/escaping differences between the two sources.
    clause_ctx = clause_ctx.replace("`", "'")
    coref_ctx = coref_ctx.replace("`", "'").replace("\\/", "/")

    return (coref_ctx.startswith(clause_ctx) or coref_ctx.endswith(clause_ctx)
            or clause_ctx.startswith(coref_ctx) or clause_ctx.endswith(coref_ctx))


def merge(file):
    """Copy coreference annotations from a JSON-lines file onto stored clauses.

    Each non-blank line of ``file`` is a JSON object with an ``id`` and a
    ``review_text`` carrying ``<-- ... -->`` annotations.  The matching
    ``review_clauses`` value is fetched from the ``review`` table; wherever a
    clause word equals an annotated coref word and the (up to three) words
    before and after it agree on both sides, the annotation is appended to
    that clause word.  The per-review result of ``genJSON`` is collected in a
    dict keyed by the integer id and dumped to ``data.txt``.  A summary line
    ``<applied> <found> <applied/found>`` is printed before the dump.

    Reviews with no row, or with a NULL ``review_clauses``, are skipped.

    Args:
        file: path of the JSON-lines input.

    Raises:
        IndexError: if an annotation in ``review_text`` is never closed.
    """
    con = mydb.getCon(CONN_STRING)
    merged = {}
    total_repl = 0
    total_repl_done = 0

    # The context manager guarantees the input handle is released.
    with open(file) as json_lines:
        for entry in json_lines:
            if not entry.strip():
                continue  # tolerate blank / trailing lines
            data = json.loads(entry)
            coref_str = data['review_text']
            # NOTE(review): the id is interpolated into the SQL text because
            # the parameter API of `mydb` is not visible from here; switch to
            # a bound parameter if `mydb` supports one.  %-formatting (rather
            # than `+`) also accepts ids that were serialised as numbers.
            query = "select id, review_clauses from review where id = '%s'" % (data['id'],)
            result_set = mydb.executeQueryResult(con, query, False)

            if not result_set or result_set[0][1] is None:
                continue
            clause_str = result_set[0][1]
            if isinstance(clause_str, bytes):
                # Only byte strings need decoding; text passes through as is.
                clause_str = clause_str.decode('utf-8')
            clause_str = re.sub(r"(\.+)", ".", clause_str)
            coref_str = coref_str.replace("-RRB-", ")").replace("-LRB-", "(")
            coref_str = re.sub(r"(\.)+", ".", coref_str)

            coref_tokens, annotations = _group_coref_tokens(coref_str.split())
            total_repl += annotations

            clause_tokens = clause_str.split()
            merged_tokens = list(clause_tokens)

            for loc_coref, token in enumerate(coref_tokens):
                coref_word = getCorefWord(loc_coref, coref_tokens)
                if not coref_word[0]:
                    continue
                word_coref = coref_word[1]
                # The clause scan starts at the same index as the coref token
                # and is not stopped at the first hit, so one annotation may
                # be applied to several clause words.
                for loc_clause in range(loc_coref, len(clause_tokens)):
                    if getClauseWord(loc_clause, clause_tokens) == word_coref:
                        prev_ok = _context_agrees(
                            loc_coref, coref_tokens, loc_clause, clause_tokens, -1)
                        next_ok = _context_agrees(
                            loc_coref, coref_tokens, loc_clause, clause_tokens, 1)
                        if prev_ok and next_ok:
                            repl = token[token.index("<--"):token.index("-->") + 3]
                            total_repl_done += 1
                            merged_tokens[loc_clause] += repl

            merged[int(data['id'])] = genJSON(merged_tokens)

    # Guard the ratio: with no annotations at all the division used to raise
    # ZeroDivisionError before the output file was written.
    ratio = (total_repl_done + 0.0) / total_repl if total_repl else 0.0
    print('%s %s %s' % (total_repl_done, total_repl, ratio))
    with open('data.txt', 'w') as outfile:
        json.dump(merged, outfile)
Exemplo n.º 3
0
def _collect_replacements(clauses, repl):
    """Index the replacement entries of ``clauses`` by replacement value.

    For every clause whose id (``clause[1]``, looked up as ``str``) is present
    in ``repl``, each ``key -> value`` pair of its entry is recorded under
    ``value``.

    Returns:
        ``{value: {clause_id: (lowest int(key), number of pairs)}}``.
    """
    found = {}
    for clause in clauses:
        clause_id = clause[1]
        lookup = str(clause_id)
        if lookup not in repl:
            continue
        rep = repl[lookup]
        for key in rep:
            per_clause = found.setdefault(rep[key], {})
            if clause_id in per_clause:
                lowest, count = per_clause[clause_id]
                per_clause[clause_id] = (min(lowest, int(key)), count + 1)
            else:
                per_clause[clause_id] = (int(key), 1)
    return found


def rcCoref(input, output):
    """Find replacement values shared by both sides of each sentence pair.

    Each non-blank line of ``input`` is a JSON object with an ``id`` and a
    ``sen_pairs`` list whose items are ``[reasons, consequences]``.  The
    ``replace`` column of the ``coref`` table (a JSON document) is loaded for
    that id; ids without a row are skipped.  For every pair, replacement
    values that occur on both sides are kept together with the per-clause
    ``(lowest key, count)`` tuples from both sides.  Non-empty results are
    collected per integer id and handed to ``outputRes(result, output)``.

    Args:
        input: path of the JSON-lines input.
        output: passed through unchanged to ``outputRes``.
    """
    con = mydb.getCon(CONN_STRING)
    bMap = {}

    # The context manager guarantees the input handle is released.
    with open(input) as json_lines:
        for entry in json_lines:
            if not entry.strip():
                continue  # tolerate blank / trailing lines
            data = json.loads(entry)
            iden = int(data['id'])
            # NOTE(review): the id is interpolated into the SQL text because
            # the parameter API of `mydb` is not visible from here.
            query = "select replace from coref  where id = '" + str(data['id']) + "'"
            result_set = mydb.executeQueryResult(con, query, False)
            if not result_set:
                continue
            repl = json.loads(result_set[0][0])

            sMap = {}
            for reacon in data['sen_pairs']:
                reasons = reacon[0]
                consequences = reacon[1]
                rea_map = _collect_replacements(reasons, repl)
                cons_map = _collect_replacements(consequences, repl)

                for value in rea_map:
                    if value in cons_map:
                        # Consequence entries are applied last, so they win
                        # when a clause id occurs on both sides.
                        shared = sMap.setdefault(value, {})
                        shared.update(rea_map[value])
                        shared.update(cons_map[value])

            if sMap:
                bMap[iden] = sMap

    outputRes(bMap, output)