Example #1
File: preprocess.py  Project: shixing/RCPE
import json
import psycopg2
import mydb

CONN_STRING = mydb.get_CONN()

def insertReviews(fileName):
    con = mydb.getCon(CONN_STRING)
    # The input file holds one JSON review object per line.
    jsonFile = open(fileName, 'r')
    total = 0
    fail = 0
    succ = 0  # doubles as the running primary key for inserted rows
    for entry in jsonFile:
        total += 1
        data = json.loads(entry)
        succ += 1
        # Build the INSERT by hand; single quotes are doubled to escape them.
        query = "insert into review(id, business_id, user_id, review_text, review_date) values('"
        query += str(succ) + "', '"
        query += data['business_id'] + "', '"
        query += data['user_id'] + "', '"
        query += data['text'].replace("'", "''") + "', '"
        query += data['date'].replace("'", "''") + "')"

        try:
            mydb.executeQuery(con, query, False)
        except psycopg2.DatabaseError as e:
            # Undo the id increment for the row that failed to insert.
            fail += 1
            succ -= 1
            print('Error %s' % e)
    jsonFile.close()
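Hand-assembled SQL like the above is brittle: the quote doubling covers only single quotes, and the driver's own escaping is bypassed entirely. A minimal sketch of the same insert using psycopg2 parameter binding instead; it assumes direct access to a psycopg2 connection, which is a guess about what mydb.getCon returns:

import json

def insert_reviews_safe(file_name, con):
    # con: assumed to be a psycopg2 connection (what mydb.getCon presumably wraps)
    cur = con.cursor()
    with open(file_name, 'r') as f:
        for row_id, line in enumerate(f, start=1):
            data = json.loads(line)
            # The driver quotes and escapes every %s parameter itself.
            cur.execute(
                "INSERT INTO review (id, business_id, user_id, review_text, review_date) "
                "VALUES (%s, %s, %s, %s, %s)",
                (row_id, data['business_id'], data['user_id'],
                 data['text'], data['date']))
    con.commit()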
Example #2
import mydb

# The two files are aligned line by line: a property name in one,
# its Spanish translation in the other.
typeFile = open('es_property.txt', 'r')
transFile = open('es_property_translate.txt', 'r')

CONN_STRING = mydb.get_CONN()
con = mydb.getCon(CONN_STRING)

while True:
    prop = typeFile.readline()
    if not prop:
        break
    trans = transFile.readline()
    prop = prop.strip()
    trans = trans.strip()
    # Fill the placeholders in the UPDATE template.
    query = "update statistic_es set native_property = '__trans__' where property = '__type__' "
    query = query.replace('__trans__', trans)
    query = query.replace('__type__', prop)
    mydb.executeQuery(con, query, True)

typeFile.close()
transFile.close()
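The lockstep readline loop reads more directly as a zip over the two files, and the placeholder templating can again be left to the driver. As in the previous sketch, the %s binding assumes the connection underneath mydb is psycopg2:

with open('es_property.txt') as typeF, open('es_property_translate.txt') as transF:
    cur = con.cursor()  # assumed psycopg2 connection from mydb.getCon
    for prop, trans in zip(typeF, transF):
        # Driver-side binding replaces the '__trans__'/'__type__' templating.
        cur.execute(
            "UPDATE statistic_es SET native_property = %s WHERE property = %s",
            (trans.strip(), prop.strip()))
    con.commit()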
Example #3
import xml.etree.ElementTree as xml
import mydb

# originPath and lang are defined earlier in the script.
originTree = xml.parse(originPath)

# Create the per-language statistics table, replacing any old one.
CONN_STRING = mydb.get_CONN()
con = mydb.getCon(CONN_STRING)
tableNames = {
    'EN': 'statistic_en',
    'RU': 'statistic_ru',
    'ES': 'statistic_es',
    'FA': 'statistic_fa'
}
tableName = tableNames[lang]
query = 'DROP TABLE IF EXISTS ' + tableName + ';'
mydb.executeQuery(con, query, True)
query = 'CREATE TABLE __tableName__(type varchar, property varchar, native_type varchar, native_property varchar, hit int, total int);'
query = query.replace('__tableName__', tableName)
mydb.executeQuery(con, query, True)

records = []
# %s placeholders: the rows collected in records are bound by the driver.
insertQuery = 'insert into __table_name__(type,property,native_type,native_property,hit,total) VALUES(%s,%s,%s,%s,%s,%s);'
insertQuery = insertQuery.replace('__table_name__', tableName)
if lang != "FA":
    filepath = 't.xml'
    tree = xml.parse(filepath)
    xmlRoot = tree.getroot()
    for c in xmlRoot.findall('.//Type'):
        # Each <Type> element carries a DBpedia ontology URI,
        # e.g. 'http://dbpedia.org/ontology/University'.
        typeName = c.attrib['name']
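The snippet cuts off inside the <Type> loop, but the %s placeholders in insertQuery and the empty records list suggest the missing part accumulates one tuple per type and batch-inserts them at the end. A guess at that continuation, with hypothetical field variables standing in for whatever the loop actually extracts:

        # propName, nativeType, nativeProp, hit, total are hypothetical
        # stand-ins for the values the truncated loop body computes.
        records.append((typeName, propName, nativeType, nativeProp, hit, total))

# After the loop, hand the whole batch to the driver at once
# (again assuming the psycopg2 connection behind mydb.getCon):
cur = con.cursor()
cur.executemany(insertQuery, records)
con.commit()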
Example #4
File: preprocess.py  Project: shixing/RCPE
import os
import nltk
import mydb

CONN_STRING = mydb.get_CONN()

def processSingle(start, end, fileName, dirName, p):
    # p is a compiled regex that marks sentences to be masked out.
    fileName = os.path.abspath(fileName)
    totalOneTime = end
    con = mydb.getCon(CONN_STRING)
    # Punkt sentence splitter; requires the NLTK 'punkt' data package.
    sent_tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
    idx = start
    per = 1  # one review per database round trip
    while idx * per < totalOneTime:
        print(idx)
        query = ("select id, review_text from review order by id LIMIT "
                 + str(per) + " OFFSET " + str(idx * per))
        resultSet = mydb.executeQueryResult(con, query, False)
        outFile = open(fileName, 'w')

        entry = resultSet[0]
        try:
            sents = sent_tokenizer.tokenize(entry[1])
        except UnicodeDecodeError:
            idx += 1
            continue

        # Mask unusable sentences with sentinel tokens, then write one
        # '<s> ... </s>' line per sentence for the segmenter.
        for sent in sents:
            if p.match(sent):
                print(sent)
                sent = 'Special'
            elif len(sent.split()) > 70:
                print(sent)
                sent = 'longsentence'
            outFile.write('<s> ' + sent + ' </s> \n')
        outFile.close()

        # Run the SPADE discourse segmenter over the sentence file.
        os.system('perl ' + dirName + '/spade.pl ' + fileName)
        outputFileName = fileName + '.chp.edu.fmt'

        with open(outputFileName) as f:
            content = f.readlines()

        # SPADE output framing: a clause-count line per sentence,
        # followed by that many clause lines.
        loc = 0
        clauses = list()
        while loc < len(content):
            subLen = int(content[loc])
            loc += 1
            j = 0
            while j < subLen:
                j += 1
                if len(content[loc].split()) > 2:
                    clauses.append(content[loc].split(' ', 1)[1].rstrip('\n').replace("'", "''"))
                loc += 1
        if len(clauses) < 1:
            idx += 1
            continue

        # Store the clauses back on the review, '###'-delimited.
        strClauses = '###'.join(clauses)
        query = ("UPDATE review SET (review_clauses) = ('" + strClauses
                 + "') WHERE id = '" + str(entry[0]) + "'")
        mydb.executeQuery(con, query, False)
        idx += 1
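The .chp.edu.fmt reader above assumes a simple framing: each sentence contributes one count line, followed by that many clause lines, each prefixed by a leading token. A tiny self-contained check of that parsing logic; the sample lines are invented for illustration, not real SPADE output:

content = [
    '2\n',
    '4 The food was great\n',
    '5 but the service was slow\n',
    '1\n',
    '4 We will come back\n',
]
loc, clauses = 0, []
while loc < len(content):
    subLen = int(content[loc])  # clause lines that follow this count line
    loc += 1
    for _ in range(subLen):
        if len(content[loc].split()) > 2:
            # Drop the leading token, keep the clause text.
            clauses.append(content[loc].split(' ', 1)[1].rstrip('\n'))
        loc += 1
print(clauses)
# ['The food was great', 'but the service was slow', 'We will come back']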
Example #5
File: preprocess.py  Project: shixing/RCPE
def processBatch(p):
    totalOneTime = 300  # upper bound on the review offset to process
    con = mydb.getCon(CONN_STRING)
    sent_tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
    idx = 237  # resume point left over from an earlier run
    per = 1
    while idx * per < totalOneTime:
        query = ("select id, review_text from review order by id LIMIT "
                 + str(per) + " OFFSET " + str(idx * per))
        resultSet = mydb.executeQueryResult(con, query, False)
        sentLens = list()
        fileName = 'sents.txt'
        outFile = open(fileName, 'w')

        for entry in resultSet:
            sents = sent_tokenizer.tokenize(entry[1])
            # Remember (review id, sentence count) so SPADE's flat output
            # can be mapped back to the right review afterwards.
            sentLens.append([entry[0], len(sents)])
            for sent in sents:
                if p.match(sent):
                    print(sent)
                    sent = 'Special'
                elif len(sent.split()) > 70:
                    print(sent)
                    sent = 'longsentence'
                outFile.write('<s> ' + sent + ' </s> \n')
        outFile.close()
        print(sentLens)

        # Segment the batch file with SPADE.
        os.system('perl spade.pl ' + fileName)
        outputFileName = 'sents.txt.chp.edu.fmt'
        with open(outputFileName) as f:
            content = f.readlines()

        loc = 0
        print(len(content))
        for lens in sentLens:
            # Consume exactly lens[1] sentences' worth of segmenter output.
            i = 0
            clauses = list()
            while i < lens[1]:
                i += 1
                subLen = int(content[loc])
                loc += 1
                j = 0
                while j < subLen:
                    j += 1
                    if len(content[loc].split()) > 2:
                        clauses.append(content[loc].split(' ', 1)[1].rstrip('\n').replace("'", "''"))
                    loc += 1
            strClauses = '###'.join(clauses)
            query = ("UPDATE review SET (review_clauses) = ('" + strClauses
                     + "') WHERE id = '" + str(lens[0]) + "'")
            mydb.executeQuery(con, query, False)
        idx += 1
    con.close()
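One last note on the UPDATE: the only reason each clause has its single quotes doubled is that the query is spliced together as a string. With driver-side binding the escaping step disappears entirely; a sketch assuming, as before, that con is the psycopg2 connection behind mydb.getCon:

cur = con.cursor()
cur.execute(
    "UPDATE review SET review_clauses = %s WHERE id = %s",
    ('###'.join(clauses), str(lens[0])))  # id is stored as text, so bind a string
con.commit()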