def insertReviews(fileName):
    """Load a JSON-lines dump of reviews and insert each record into the
    review table.

    Each line of fileName must be a JSON object with 'business_id',
    'user_id', 'text' and 'date' keys.  Row ids are assigned from the
    running success counter.  Prints an error for every row whose insert
    fails; returns None.
    """
    con = mydb.getCon(CONN_STRING)
    total = 0
    fail = 0
    succ = 0
    # One JSON document per line; stream rather than loading the whole file.
    with open(fileName, 'r') as json_file:
        for entry in json_file:
            total += 1
            data = json.loads(entry)
            succ += 1
            # NOTE(review): values are spliced into the SQL text by hand.
            # Single quotes are doubled in *every* field (the original only
            # escaped text/date, so a quote in business_id/user_id broke the
            # statement).  A parameterized query via mydb would be safer
            # still, if its API supports one.
            query = "insert into review(id, business_id, user_id, review_text, review_date) values('"
            query += str(succ) + "', '"
            query += data['business_id'].replace("'", "''") + "', '"
            query += data['user_id'].replace("'", "''") + "', '"
            query += data['text'].replace("'", "''") + "', '"
            query += data['date'].replace("'", "''") + "')"
            try:
                mydb.executeQuery(con, query, False)
            except psycopg2.DatabaseError as e:
                # Roll the id counter back so ids stay dense.
                fail += 1
                succ -= 1
                print('Error %s' % e)
import mydb
import sys

CONN_STRING = mydb.get_CONN()
con = mydb.getCon(CONN_STRING)

# es_property.txt and es_property_translate.txt are parallel line files:
# line i of the first holds a property name, line i of the second its
# translation.  Record each translation in statistic_es.
with open('es_property.txt', 'r') as propFile:
    with open('es_property_translate.txt', 'r') as transFile:
        for prop in propFile:
            trans = transFile.readline()
            prop = prop.strip()
            trans = trans.strip()
            # Double embedded quotes so a value containing ' cannot break
            # (or inject into) the statement; the original template
            # substitution did no escaping.
            query = ("update statistic_es set native_property = '" +
                     trans.replace("'", "''") +
                     "' where property = '" +
                     prop.replace("'", "''") + "' ")
            mydb.executeQuery(con, query, True)
originTree = xml.parse(originPath) # findLabel(originTree, 'http://dbpedia.org/ontology/soccerLeagueRelegated', 'en',1) # create table CONN_STRING = mydb.get_CONN() con = mydb.getCon(CONN_STRING) tableNames = { 'EN': 'statistic_en', 'RU': 'statistic_ru', 'ES': 'statistic_es', 'FA': 'statistic_fa' } tableName = tableNames[lang] query = 'DROP TABLE IF EXISTS ' + tableName + ';' mydb.executeQuery(con, query, True) query = 'CREATE TABLE __tableName__(type varchar, property varchar, native_type varchar, native_property varchar, hit int, total int);' query = query.replace('__tableName__', tableName) mydb.executeQuery(con, query, True) records = [] insertQuery = 'insert into __table_name__(type,property,native_type,native_property,hit,total) VALUES(%s,%s,%s,%s,%s,%s);' insertQuery = insertQuery.replace('__table_name__', tableName) if lang != "FA": # print insertQuery filepath = 't.xml' tree = xml.parse(filepath) xmlRoot = tree.getroot() for c in xmlRoot.findall('.//Type'): typeName = c.attrib['name'] # typeName = 'http://dbpedia.org/ontology/University'
def processSingle(start, end, fileName, dirName, p):
    """Segment reviews at offsets [start, end) into clauses with SPADE and
    store the result back in the review table.

    For each review row (fetched one at a time, ordered by id) the text is
    split into sentences, written to fileName in '<s> ... </s>' form, run
    through dirName/spade.pl, and the resulting clauses are joined with
    '###' and written to review.review_clauses.

    start, end -- row offsets delimiting the slice of reviews to process
    fileName   -- scratch file handed to spade.pl
    dirName    -- directory containing spade.pl
    p          -- compiled regex; matching sentences are replaced by the
                  'Special' placeholder before segmentation
    """
    fileName = os.path.abspath(fileName)
    limit = end
    con = mydb.getCon(CONN_STRING)
    sent_tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
    offset = start  # renamed from `iter` (shadowed the builtin)
    per = 1  # rows fetched per round trip
    while offset * per < limit:
        print(offset)
        query = ("select id, review_text from review order by id LIMIT " +
                 str(per) + " OFFSET " + str(offset * per))
        resultSet = mydb.executeQueryResult(con, query, False)
        entry = resultSet[0]
        # Tokenize BEFORE opening the scratch file: the original opened it
        # first, so a UnicodeDecodeError leaked the (truncated) handle.
        try:
            sents = sent_tokenizer.tokenize(entry[1])
        except UnicodeDecodeError:
            offset += 1
            continue
        with open(fileName, 'w') as out:
            for sent in sents:
                # Replace pathological sentences with fixed placeholders so
                # they cannot derail the SPADE parser.
                if p.match(sent):
                    print(sent)
                    sent = 'Special'
                elif len(sent.split()) > 70:
                    print(sent)
                    sent = 'longsentence'
                out.write('<s> ' + sent + ' </s> \n')
        os.system('perl ' + dirName + '/spade.pl ' + fileName)
        outputFileName = fileName + '.chp.edu.fmt'
        with open(outputFileName) as f:
            content = f.readlines()
        # SPADE output format: a count line, then that many clause lines.
        clauses = []
        loc = 0
        while loc < len(content):
            subLen = int(content[loc])
            loc += 1
            for _ in range(subLen):
                # Keep only lines with a real clause after the index token;
                # strip the index, drop the newline, and double quotes for
                # the hand-built SQL below.
                if len(content[loc].split()) > 2:
                    clauses.append(content[loc].split(' ', 1)[1]
                                   .rstrip('\n').replace("'", "''"))
                loc += 1
        # Nothing usable came back for this review -- skip it.
        if len(clauses) < 1:
            offset += 1
            continue
        strClauses = '###'.join(clauses)
        query = ("UPDATE review SET (review_clauses) = ('" + strClauses +
                 "') WHERE id = '" + str(entry[0]) + "'")
        mydb.executeQuery(con, query, False)
        offset += 1
def processBatch(p, start=237, total=300):
    """Batch variant of processSingle: segment reviews into clauses with
    SPADE and store them in review.review_clauses.

    Fetches `per` reviews per query, writes all their sentences to one
    scratch file, runs spade.pl once, then walks the output in lockstep
    with the recorded per-review sentence counts to regroup clauses.

    p     -- compiled regex; matching sentences become the 'Special'
             placeholder before segmentation
    start -- first batch index (default keeps the original hard-coded
             resume point of 237)
    total -- process while batch_index * per < total (default 300)
    """
    con = mydb.getCon(CONN_STRING)
    sent_tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')
    batch = start  # renamed from `iter` (shadowed the builtin)
    per = 1  # rows fetched per round trip
    while batch * per < total:
        query = ("select id, review_text from review order by id LIMIT " +
                 str(per) + " OFFSET " + str(batch * per))
        resultSet = mydb.executeQueryResult(con, query, False)
        sentLens = []  # [review id, sentence count] in write order
        fileName = 'sents.txt'
        with open(fileName, 'w') as out:
            for entry in resultSet:
                sents = sent_tokenizer.tokenize(entry[1])
                sentLens.append([entry[0], len(sents)])
                for sent in sents:
                    # Neutralize pathological sentences before segmentation.
                    if p.match(sent):
                        print(sent)
                        sent = 'Special'
                    elif len(sent.split()) > 70:
                        print(sent)
                        sent = 'longsentence'
                    out.write('<s> ' + sent + ' </s> \n')
        print(sentLens)
        os.system('perl spade.pl ' + fileName)
        outputFileName = 'sents.txt.chp.edu.fmt'
        with open(outputFileName) as f:
            content = f.readlines()
        # SPADE output: per sentence, a count line then that many clause
        # lines.  Walk it in lockstep with sentLens to regroup clauses by
        # review.
        loc = 0
        for revId, numSents in sentLens:
            clauses = []
            for _ in range(numSents):
                subLen = int(content[loc])
                loc += 1
                for _ in range(subLen):
                    # Keep real clauses only; strip the index token and
                    # double quotes for the hand-built SQL below.
                    if len(content[loc].split()) > 2:
                        clauses.append(content[loc].split(' ', 1)[1]
                                       .rstrip('\n').replace("'", "''"))
                    loc += 1
            # Guard against reviews that yielded no usable clauses -- the
            # original crashed with IndexError on clauses[0] here
            # (processSingle already had this guard).
            if not clauses:
                continue
            strClauses = '###'.join(clauses)
            query = ("UPDATE review SET (review_clauses) = ('" + strClauses +
                     "') WHERE id = '" + str(revId) + "'")
            mydb.executeQuery(con, query, False)
        batch += 1
    con.close()