Exemplo n.º 1
0
def calculate(type_name,lang):
    result = {}
    indexs={"EN":0,"ES":1,"RU":2,"FA":3}
    type_table_names=["type_en","type_es","type_ru"]
    property_table_names=["property_en","property_es","property_ru"]
    CONN_STRING = mydb.get_CONN()
    con = mydb.getCon(CONN_STRING)
    rows=[]
    property_table_name=''
    type_table_name = ''
    if lang !='FA':
        index = indexs[lang]
        type_table_name = type_table_names[index]
        property_table_name = property_table_names[index]
        type_name = '<'+type_name+'>'
        query = "select entity from __type_table__ where type = '__type_name__'"
        query = query.replace('__type_table__',type_table_name)
        query = query.replace('__type_name__',type_name)
        rows = mydb.executeQueryResult(con,query,True)
        result['__total__']=len(rows)
    else:
        property_table_name = 'property_fa'
        
        query = "select distinct entity from property_fa where property = '<http://fa.dbpedia.org/property/type>' and value ='__type_name__' "
        query = query.replace('__type_name__',type_name)
        rows = mydb.executeQueryResult(con,query,True)
        result['__total__']=len(rows)

    temp_result={}
    print len(rows)
    i=0
    for row in rows:
        entity = row[0]
        entity = entity.replace("'","''")
        query = "select property from __property_table__ where entity = '__entity_name__';"
        query = query.replace('__property_table__',property_table_name)
        query = query.replace('__entity_name__',entity)
        ps = mydb.executeQueryResult(con,query,False)
        for p in ps:
            if not p[0] in temp_result:
                temp_result[p[0]]=1
        for p in temp_result:
            if not p in result:
                result[p]=0
            result[p]+=1
        temp_result={}
        i+=1
        if i % 50 ==0:
            print i
    #print result
    return result
Exemplo n.º 2
0
def main():
    """Dump pairs, tuples and coref-merged pairs from the `rc` table into
    tab-separated evaluation files (one line per reason/consequence)."""
    offset = 0
    # change your output dir
    dir = 'evaluate'
    pfile = open(dir + '/evaluate.pairs.' + str(offset) + '.txt', 'w')
    tfile = open(dir + '/evaluate.tuples.' + str(offset) + '.txt', 'w')
    cfile = open(dir + '/evaluate.corefs.' + str(offset) + '.txt', 'w')

    CONN_STRING = mydb.get_CONN()
    con = mydb.getCon(CONN_STRING)

    # change the amount of results, currently is 200
    query = 'select * from rc order by id limit 200 offset 0'
    rows = mydb.executeQueryResult(con, query, False)

    for row in rows:
        id = int(row[0])
        pairs = json.loads(row[1])
        tuples = json.loads(row[2])
        coref = None
        if row[3]:
            coref = json.loads(row[3])

        # write the pairs
        for pair in pairs:
            reasons = pair[0]
            consequences = pair[1]
            for reason in reasons:
                pfile.write(str(id) + '\t' + reason[0] + '\n')
            for consequence in consequences:
                pfile.write(str(id) + '\t' + consequence[0] + '\n')

        # write the tuples
        for t in tuples:
            reasons = t[0]
            consequences = t[1]
            for reason in reasons:
                tfile.write(str(id) + '\t' + tuple2str(reason[0]) + '\n')
            for consequence in consequences:
                tfile.write(str(id) + '\t' + tuple2str(consequence[0]) + '\n')

        # write the coref-merged pairs
        # BUG FIX: the write loops were indented OUTSIDE `for pair in
        # corefPairs`, so only the last pair's reasons/consequences were
        # written; they are now nested inside the loop.
        if row[3]:
            corefPairs = rcreplace.mergeCoref(pairs, coref)
            for pair in corefPairs:
                reasons = pair[0]
                consequences = pair[1]
                for reason in reasons:
                    cfile.write(str(id) + '\t' + reason + '\n')
                for consequence in consequences:
                    cfile.write(str(id) + '\t' + consequence + '\n')

    pfile.close()
    tfile.close()
    cfile.close()
def loadAll(lang):
    global loaded
    if loaded is True:
        return

    # prepare db
    CONN_STRING = "host='localhost' dbname='conceptnet' user='******' password='******'"
    con = mydb.getCon(CONN_STRING)
    table_name = 'cn_' + lang

    print 'load from database...'
    query = 'select * from ' + table_name
    records = mydb.executeQueryResult(con, query, False)

    for r in records:
        rel = r[0]
        start = None
        end = None
        pos = ''
        ll = mysplit(r[1].decode('utf8'), '/')

        if len(ll) >= 3:
            start = ll[2]
        ll = mysplit(r[2].decode('utf8'), '/')

        if len(ll) >= 3:
            end = ll[2]
        if len(ll) >= 4:
            pos = ll[3]

        rel = rel + pos

        # add start to end's neighbour
        if end not in data:
            data[end] = {}
        if rel not in data[end]:
            data[end][rel] = []
        if start not in data[end][rel]:
            data[end][rel].append(start)

        # add end to start's neighbour
        if start not in data:
            data[start] = {}
        if rel not in data[start]:
            data[start][rel] = []
        if end not in data[start][rel]:
            data[start][rel].append(end)
    print 'loading done!'
    loaded = True
def loadAll(lang):
    global loaded
    if loaded is True:
        return

    # prepare db
    CONN_STRING = "host='localhost' dbname='conceptnet' user='******' password='******'"
    con = mydb.getCon(CONN_STRING)
    table_name = 'cn_' + lang

    print 'load from database...'
    query = 'select * from ' + table_name
    records = mydb.executeQueryResult(con, query, False)

    for r in records:
        rel = r[0]
        start = None
        end = None
        pos = ''
        ll = mysplit(r[1].decode('utf8'), '/')

        if len(ll) >= 3:
            start = ll[2]
        ll = mysplit(r[2].decode('utf8'), '/')

        if len(ll) >= 3:
            end = ll[2]
        if len(ll) >= 4:
            pos = ll[3]

        rel = rel + pos

        # add start to end's neighbour
        if end not in data:
            data[end] = {}
        if rel not in data[end]:
            data[end][rel] = []
        if start not in data[end][rel]:
            data[end][rel].append(start)

        # add end to start's neighbour
        if start not in data:
            data[start] = {}
        if rel not in data[start]:
            data[start][rel] = []
        if end not in data[start][rel]:
            data[start][rel].append(end)
    print 'loading done!'
    loaded = True
Exemplo n.º 5
0
def main():
    """Turn each stored review's clauses into sentence pairs, dumped as JSON.

    Reads review.review_clauses (clauses separated by '###'), runs
    processReview over them, and writes one JSON object per review
    ({'id': ..., 'sen_pairs': ...}) as a line of
    result/raw/result.sentence.json.txt under settings.PROJECT_DIR.
    """
    file_path = os.path.join(settings.PROJECT_DIR,
                             'result/raw/result.sentence.json.txt')
    # db
    CONN_STRING = mydb.get_CONN()
    con = mydb.getCon(CONN_STRING)
    query = ('select id , review_clauses from review '
             'where review_clauses is not null order by id')
    rows = mydb.executeQueryResult(con, query, True)

    # `with` guarantees the output file is closed even if a review fails
    # (the original relied on an explicit close at the end).
    with open(file_path, 'w') as jsfile:
        for row in rows:
            id = row[0]
            review = row[1]
            if not review:
                continue
            clauses = review.decode('utf-8').split('###')
            tpairs = processReview(clauses)
            if len(tpairs) == 0:
                continue
            jsfile.write(json.dumps({'id': id, 'sen_pairs': tpairs}) + '\n')
def loadAll(lang):
    global loaded
    if loaded:
        return
    CONN_STRING = "host='localhost' dbname='conceptnet' user='******' password='******'"
    con = mydb.getCon(CONN_STRING)
    table_name = 'translation_' + lang
    print table_name
    print 'loading translation from db...'
    sa.loadAll()
    query = 'select * from ' + table_name
    records = mydb.executeQueryResult(con, query, False)
    for r in records:
        ll = mysplit(r[1].decode('utf8'), '/')
        word = ll[-1]
        ll = mysplit(r[2].decode('utf8'), '/')
        if len(ll) < 3:
            continue
        en_word = ll[2]

        if word not in data:
            data[word] = en_word
    print 'loading done!'
    loaded = True
def loadAll(lang):
    global loaded
    if loaded:
        return
    CONN_STRING = "host='localhost' dbname='conceptnet' user='******' password='******'"
    con = mydb.getCon(CONN_STRING)
    table_name = 'translation_'+lang
    print table_name
    print 'loading translation from db...'
    sa.loadAll()
    query = 'select * from '+table_name
    records = mydb.executeQueryResult(con,query,False)
    for r in records:
        ll = mysplit(r[1].decode('utf8'),'/')
        word = ll[-1]
        ll = mysplit(r[2].decode('utf8'),'/')
        if len(ll)<3:
            continue
        en_word = ll[2]
        
        if not word in data:
            data[word] = en_word
    print 'loading done!'
    loaded = True
Exemplo n.º 8
0
#!/usr/bin/python
# -*- coding: utf-8 -*-

import mydb

CONN_STRING = mydb.get_CONN()
con = mydb.getCon(CONN_STRING)

query = "select distinct value from property_fa where property = '<http://fa.dbpedia.org/property/type>'"

rows = mydb.executeQueryResult(con, query, True)

file = open('types_fa.txt', 'w')

print len(rows)
for row in rows:
    file.write(row[0] + '\n')

file.close()
Exemplo n.º 9
0
#!/usr/bin/python
# -*- coding: utf-8 -*-

import mydb

CONN_STRING = mydb.get_CONN()
con = mydb.getCon(CONN_STRING)

query = "select distinct value from property_fa where property = '<http://fa.dbpedia.org/property/type>'"

rows = mydb.executeQueryResult(con, query, True)

file = open("types_fa.txt", "w")

print len(rows)
for row in rows:
    file.write(row[0] + "\n")

file.close()
Exemplo n.º 10
0
def processSingle(start, end, fileName,dirName,p):
    fileName = os.path.abspath(fileName)
    #totalOneTime = 1000#increment each time
    totalOneTime = end
    con = mydb.getCon(CONN_STRING)
    #nltk.download()
    sent_tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
    #text = nltk.corpus.abc.raw('science.txt')
    iter = start
    per = 1
    while iter * per < totalOneTime:
        print iter
        query = "select id, review_text from review order by id LIMIT "+ str(per)+" OFFSET " + str(iter * per)
        resultSet = mydb.executeQueryResult(con, query, False)
        sentLens = list()
        #fileName = 'sents.txt'
        file = open(fileName, 'w')
       
        entry = resultSet[0]
        try:
            sents = sent_tokenizer.tokenize(entry[1])
        except UnicodeDecodeError, e:
            iter += 1
            continue
            
        #sentLens.append([entry[0], len(sents)])
#       fileName = 'sents' + str(iter * 10) + '-' + str(iter* 10 + 10) + '.txt'
        for sent in sents:
            if p.match(sent):
                print sent
                sent = 'Special'
            elif len(sent.split()) > 70:
                print sent
                sent = 'longsentence'
            file.write('<s> ' + sent + ' </s> \n')
        file.close()
        os.system('perl '+dirName+'/spade.pl ' + fileName)
        outputFileName = fileName + '.chp.edu.fmt';

        with open(outputFileName) as f:
            content = f.readlines()
        loc = 0
        #print len(content)
        clauses = list()   
        while loc < len(content):
            subLen = int(content[loc])
            loc += 1
            j = 0

            while j < subLen:
                j += 1   
                if len(content[loc].split()) > 2:
                    clauses.append(content[loc].split(' ', 1)[1].rstrip('\n').replace("'","''"))
                loc += 1
                #print subLen, j, loc
        if len(clauses) < 1:
            iter += 1
            continue
        strClauses = clauses[0]
        for clause in clauses[1:]:
            strClauses += '###' + clause
        query="UPDATE review SET (review_clauses) = ('" + strClauses + "') WHERE id = '"+str(entry[0])+"'"
        
        mydb.executeQuery(con, query, False)
        sentLens = list()
        iter += 1
Exemplo n.º 11
0
def processBatch(p):
    """Clause-split a batch of reviews with SPADE and store the clauses.

    Fetches reviews one at a time (per=1) from offset 237 up to 300,
    sentence-tokenizes each, writes the sentences to sents.txt, runs
    spade.pl over that file, parses sents.txt.chp.edu.fmt back into
    clauses, joins them with '###' and writes the result into
    review.review_clauses for the corresponding review id.

    p: compiled regex; matching sentences are replaced by 'Special',
    sentences over 70 tokens by 'longsentence', before SPADE runs.
    """
    totalOneTime = 300 #increment each time
    con = mydb.getCon(CONN_STRING)
    #nltk.download()
    sent_tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
    #text = nltk.corpus.abc.raw('science.txt')
    iter = 237  # resume offset, bumped manually between runs; NOTE(review): shadows the builtin `iter`
    per = 1  # reviews fetched per query
    while iter * per < totalOneTime:
        query = "select id, review_text from review order by id LIMIT "+ str(per)+" OFFSET " + str(iter * per)
        resultSet = mydb.executeQueryResult(con, query, False)
        # sentLens collects [review_id, sentence_count] pairs so the SPADE
        # output can be mapped back to its review below.
        sentLens = list()
        fileName = 'sents.txt'
        file = open(fileName, 'w')

        for entry in resultSet:
            sents = sent_tokenizer.tokenize(entry[1])
            sentLens.append([entry[0], len(sents)])
#            fileName = 'sents' + str(iter * 10) + '-' + str(iter* 10 + 10) + '.txt'
            for sent in sents:
                # Sanitize sentences SPADE cannot handle.
                if p.match(sent):
                    print sent
                    sent = 'Special'
                elif len(sent.split()) > 70:
                    print sent
                    sent = 'longsentence'
                file.write('<s> ' + sent + ' </s> \n')
        file.close()
        print sentLens
        os.system('perl spade.pl ' + fileName)
        outputFileName = 'sents.txt.chp.edu.fmt';
        #outputFile = open(outputFileName, 'r')
        with open(outputFileName) as f:
            content = f.readlines()
        # The output file holds, per sentence, a clause-count line followed
        # by that many clause lines; `loc` walks it sequentially.
        loc = 0
        queries = list()
        print len(content)
        for lens in sentLens:
            i = 0
            clauses = list()
            while i < lens[1]:
                i += 1
                #print lens[0], content[loc]
                subLen = int(content[loc])
                loc += 1
                j = 0
                print subLen
                while j < subLen:
                    print j
                    j += 1   
                    print content[loc],
                    # Keep clauses of 3+ tokens; strip the leading token,
                    # the newline, and double single quotes for SQL.
                    if len(content[loc].split()) > 2:
                        clauses.append(content[loc].split(' ', 1)[1].rstrip('\n').replace("'","''"))
                    loc += 1
                    print subLen, j, loc
                #print clauses
            # Join this review's clauses with '###' and persist them.
            strClauses = clauses[0]
            for clause in clauses[1:]:
                strClauses += '###' + clause
            query="UPDATE review SET (review_clauses) = ('" + strClauses + "') WHERE id = '"+str(lens[0])+"'"
            #print query
            mydb.executeQuery(con, query, False)
            #queries.append(query)
        #print queries
        #mydb.executeManyQuery(con, queries, False)
        sentLens = list()
        iter += 1
    #sents = sent_tokenizer.tokenize(text)
    #pprint(sents[1:2])
    con.close()