def buildIndex():
    """Create the title indexes on each per-language wiki table.

    For every language code in en/es/ru/fa two indexes are created on the
    ``wiki_<lang>`` table: ``<lang>IndexTitle`` on ``title`` and
    ``<lang>IndexLowerTitle`` on ``lower(title)``. Each statement is printed
    before it is executed, and committed on its own.

    On a ``psycopg2.DatabaseError`` the error is printed and the process
    exits with status -1. The connection is closed on every path.
    """
    CONN_STRING = global_setting.get_CONN()
    con = None
    try:
        con = psycopg2.connect(CONN_STRING)
        con.set_client_encoding('UTF8')
        cur = con.cursor()
        # Build index on title
        for lang in ('en', 'es', 'ru', 'fa'):
            table = 'wiki_' + lang
            statements = (
                'CREATE INDEX ' + lang + 'IndexTitle ON ' + table + '(title)',
                'CREATE INDEX ' + lang + 'IndexLowerTitle ON ' + table
                + '(lower(title))',
            )
            for statement in statements:
                print(statement)
                cur.execute(statement)
                # Commit per statement so indexes built before a failure
                # are kept.
                con.commit()
    except psycopg2.DatabaseError as e:
        print('Error %s' % e)
        sys.exit(-1)
    finally:
        # Runs on success and while SystemExit propagates; the original
        # never released the connection.
        if con is not None:
            con.close()
def create_table(table_name):
    """Open a database connection and create ``table_name`` if it is missing.

    The table is looked up in ``information_schema.tables``; when no row is
    found it is created with the columns ``id char(12) PRIMARY KEY``,
    ``lang char(2)``, ``title``, ``wiki_url``, ``abstract`` and
    ``parse_result`` (all ``text``).

    Args:
        table_name: Name of the table. It is spliced into the CREATE TABLE
            statement as an identifier (identifiers cannot be bound as
            query parameters), so it must not come from untrusted input.

    Returns:
        The psycopg2 connection, still open (it is not closed here).

    On a ``psycopg2.DatabaseError`` the error is printed, the connection is
    closed if it was opened, and the process exits with status -1.
    """
    # The connection string is supplied by global_setting.get_CONN().
    CONN_STRING = global_setting.get_CONN()
    con = None
    try:
        con = psycopg2.connect(CONN_STRING)
        con.set_client_encoding('UTF8')
        cur = con.cursor()
        # Check whether the table exists. The name is bound as a parameter
        # rather than concatenated into the SQL string literal.
        cur.execute(
            "select * from information_schema.tables where table_name=%s",
            (table_name,))
        if not cur.fetchall():
            # Table doesn't exist yet.
            query = ('create table ' + table_name
                     + '(id char(12) PRIMARY KEY,lang char(2),title text,'
                     'wiki_url text, abstract text, parse_result text)')
            cur.execute(query)
            con.commit()
        return con
    except psycopg2.DatabaseError as e:
        print('Error %s' % e)
        if con is not None:
            con.close()
        sys.exit(-1)
def buildIndex():
    """Create the title indexes on each per-language wiki table.

    For every language code in en/es/ru/fa two indexes are created on the
    ``wiki_<lang>`` table: ``<lang>IndexTitle`` on ``title`` and
    ``<lang>IndexLowerTitle`` on ``lower(title)``. Each statement is printed
    before it is executed, and committed on its own.

    On a ``psycopg2.DatabaseError`` the error is printed and the process
    exits with status -1. The connection is closed on every path.
    """
    CONN_STRING = global_setting.get_CONN()
    con = None
    try:
        con = psycopg2.connect(CONN_STRING)
        con.set_client_encoding('UTF8')
        cur = con.cursor()
        # Build index on title
        for lang in ('en', 'es', 'ru', 'fa'):
            table = 'wiki_' + lang
            statements = (
                'CREATE INDEX ' + lang + 'IndexTitle ON ' + table + '(title)',
                'CREATE INDEX ' + lang + 'IndexLowerTitle ON ' + table
                + '(lower(title))',
            )
            for statement in statements:
                print(statement)
                cur.execute(statement)
                # Commit per statement so indexes built before a failure
                # are kept.
                con.commit()
    except psycopg2.DatabaseError as e:
        print('Error %s' % e)
        sys.exit(-1)
    finally:
        # Runs on success and while SystemExit propagates; the original
        # never released the connection.
        if con is not None:
            con.close()
def create_table(table_name):
    """Open a database connection and create ``table_name`` if it is missing.

    The table is looked up in ``information_schema.tables``; when no row is
    found it is created with the columns ``id char(12) PRIMARY KEY``,
    ``lang char(2)``, ``title``, ``wiki_url``, ``abstract`` and
    ``parse_result`` (all ``text``).

    Args:
        table_name: Name of the table. It is spliced into the CREATE TABLE
            statement as an identifier (identifiers cannot be bound as
            query parameters), so it must not come from untrusted input.

    Returns:
        The psycopg2 connection, still open (it is not closed here).

    On a ``psycopg2.DatabaseError`` the error is printed, the connection is
    closed if it was opened, and the process exits with status -1.
    """
    # The connection string is supplied by global_setting.get_CONN().
    CONN_STRING = global_setting.get_CONN()
    con = None
    try:
        con = psycopg2.connect(CONN_STRING)
        con.set_client_encoding('UTF8')
        cur = con.cursor()
        # Check whether the table exists. The name is bound as a parameter
        # rather than concatenated into the SQL string literal.
        cur.execute(
            "select * from information_schema.tables where table_name=%s",
            (table_name,))
        if not cur.fetchall():
            # Table doesn't exist yet.
            query = ('create table ' + table_name
                     + '(id char(12) PRIMARY KEY,lang char(2),title text,'
                     'wiki_url text, abstract text, parse_result text)')
            cur.execute(query)
            con.commit()
        return con
    except psycopg2.DatabaseError as e:
        print('Error %s' % e)
        if con is not None:
            con.close()
        sys.exit(-1)
def main():
    """Command-line lookup of wiki abstracts by title.

    Options: ``-i`` input string (required), ``-l`` language (required, one
    of EN|ES|RU|FA), ``-s`` substring match, ``-c`` case-sensitive match,
    ``-p`` preferred meaning, ``-d`` debug output, ``--stdout`` print only
    the first abstract.

    With ``-p`` the search settings are first handed to
    ``get_paragraph_prefer.prefer_search``; when that returns 0 this
    function returns 0 without querying. Otherwise spaces in the input are
    replaced by underscores and the ``wiki_<lang>`` table is queried on
    ``title``, using ``ilike`` by default, ``like`` for a case-sensitive
    substring match and ``=`` for a case-sensitive exact match.

    On a ``psycopg2.DatabaseError`` the error is printed and the process
    exits with status 1. The connection is closed on every path.
    """
    from optparse import OptionParser

    # Database settings
    CONN_STRING = global_setting.get_CONN()

    # Options
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-i", "--input", dest="inword",
                      help="input string (example: \"Barack Obama\")")
    parser.add_option("-l", "--lang", dest="lang",
                      help="language (one of EN|RU|ES|FA)")
    parser.add_option(
        "-s", "--substring", dest="substring", action="store_true",
        help="match input string as substring (default is exact match)",
        default=False)
    parser.add_option(
        "-c", "--casesensitive", dest="case_sensitive", action="store_true",
        help="match input string as case-sensitive "
             "(default is case-insensitive)",
        default=False)
    parser.add_option(
        "-p", "--preferredmeaning", dest="preferred_meaning",
        action="store_true",
        help="return preferred meaning of category (default is NOT preferred)",
        default=False)
    parser.add_option("-d", "--debug", dest="debug", action="store_true",
                      help="output debug info, default is false",
                      default=False)
    parser.add_option("--stdout", dest='stdout', action='store_true',
                      help='direct write the output to stdout', default=False)
    (options, args) = parser.parse_args()
    if not options.inword:
        parser.error(
            "Must supply input string. (Example: -i \"Barack Obama\")")
    if not options.lang:
        parser.error(
            "Must supply language. "
            "(Example: -l EN; allowed languages: EN|ES|RU|FA)")

    inword = options.inword
    substring = options.substring
    case_sensitive = options.case_sensitive
    preferred_meaning = options.preferred_meaning
    debug = options.debug
    stdout = options.stdout

    # Prepare the per-language suffixes and table name.
    langIndex = {"EN": 0, "ES": 1, "RU": 2, "FA": 3}
    qlangs = ['@eng', '@spa', '@rus', '@fas']
    wlangs = ['@en', '@es', '@ru', '@fa']
    tables = ['wiki_en', 'wiki_es', 'wiki_ru', 'wiki_fa']
    # Accept the code in any case and reject unknown codes with a usage
    # error instead of an uncaught KeyError.
    lang = options.lang.upper()
    if lang not in langIndex:
        parser.error(
            "Unsupported language %r (allowed languages: EN|ES|RU|FA)"
            % options.lang)
    lindex = langIndex[lang]
    qlang = qlangs[lindex]
    wlang = wlangs[lindex]
    table_name = tables[lindex]

    if preferred_meaning:
        myset = setting()
        myset.inword = inword
        myset.substring = substring
        myset.case_sensitive = case_sensitive
        myset.preferred_meaning = preferred_meaning
        myset.qlang = qlang
        myset.wlang = wlang
        myset.table_name = table_name
        myset.lang = lang
        myset.CONN_STRING = CONN_STRING
        myset.debug = debug
        goon = get_paragraph_prefer.prefer_search(myset)
        if goon == 0:
            return 0

    # Prepare search word
    inword = inword.replace(" ", "_")
    if substring:
        inword = '%' + inword + '%'

    # Build query. table_name comes from the fixed list above, so it is safe
    # to splice in; the search word is bound as a parameter so that quotes
    # in a title cannot break or alter the statement.
    if not case_sensitive:
        operator = 'ilike'
    elif substring:
        operator = 'like'
    else:
        # Case-sensitive exact match
        operator = '='
    query = ('select title,abstract from ' + table_name
             + ' where title ' + operator + ' %s')
    if debug:
        print("Query: %s [title parameter: %r]" % (query, inword))

    con = None
    try:
        con = psycopg2.connect(CONN_STRING)
        con.set_client_encoding('UTF8')
        cur = con.cursor(cursor_factory=psycopg2.extras.DictCursor)
        cur.execute(query, (inword,))
        # Get result
        rows = cur.fetchall()
        if stdout:
            if rows:
                print(rows[0]['abstract'])
            else:
                print('no result!')
        else:
            for i, row in enumerate(rows, 1):
                print('#' + str(i) + " TITLE: " + row['title'])
                print('#' + str(i) + " ABSTRACT: " + row['abstract'])
    except psycopg2.DatabaseError as e:
        print('Error %s' % e)
        sys.exit(1)
    finally:
        if con is not None:
            con.close()
def main():
    """Rebuild the ``multilink`` table from an interlanguage-links .nt file.

    Reads ``interlanguage_links_same_as_en.nt`` from the directory given
    with ``-d/--dir``. The table is dropped and recreated, then every line
    whose third space-separated field starts with the es/fa/ru DBpedia
    resource prefix becomes an ``(engTitle, lang, otherTitle)`` record,
    with titles obtained from ``extractTitle``. Records are handed to
    ``insert_records`` in batches, and an index on ``engTitle`` is created
    at the end.

    On a ``psycopg2.DatabaseError`` the error is printed and the process
    exits with status -1. The input file and the connection are closed on
    every path.
    """
    from optparse import OptionParser

    # Options
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-d", "--dir", dest="dirPath",
                      help="the *.nt files dir path")
    (options, args) = parser.parse_args()
    if not options.dirPath:
        parser.error("Please input the dir path")
    dirPath = options.dirPath
    filePath = dirPath + '/' + 'interlanguage_links_same_as_en.nt'
    CONN_STRING = global_setting.get_CONN()

    # Resource prefix of the third field -> language code stored in the table.
    prefixes = (
        ('<http://es.dbpedia.org/resource', 'ES'),
        ('<http://fa.dbpedia.org/resource', 'FA'),
        ('<http://ru.dbpedia.org/resource', 'RU'),
    )

    con = None
    try:
        con = psycopg2.connect(CONN_STRING)
        con.set_client_encoding('UTF8')
        cur = con.cursor()

        # Create table
        query = 'DROP TABLE IF EXISTS multilink;'
        cur.execute(query)
        con.commit()
        print(query)
        query = 'CREATE TABLE multiLink(engTitle varchar,lang varchar, otherTitle varchar);'
        cur.execute(query)
        con.commit()
        print(query)

        # Process file
        records = []
        i = 0
        with open(filePath, 'r') as nt_file:
            # The first line is discarded (presumably a header comment;
            # confirm against the dump format).
            nt_file.readline()
            for line in nt_file:
                # NOTE(review): str.decode exists only on Python 2 byte
                # strings; this needs revisiting if the file is ported.
                line = line.decode('raw_unicode_escape')
                ll = line.split(' ')
                if len(ll) < 3:
                    # Blank or malformed line: no third field to inspect.
                    continue
                lang = ''
                for prefix, code in prefixes:
                    if ll[2].startswith(prefix):
                        lang = code
                        break
                if lang == '':
                    continue
                i += 1
                engTitle = extractTitle(ll[0])
                otherTitle = extractTitle(ll[2])
                records.append((engTitle, lang, otherTitle))
                if len(records) > 100:
                    insert_records(con, records)
                    records = []
                if i % 10000 == 0:
                    # The original passed i as a second print argument, so
                    # the %d placeholder was never filled in.
                    print('%d records inserted!' % i)
        # Flush the last partial batch.
        if records:
            insert_records(con, records)
        print(i)
        con.commit()

        # Build index
        query = 'CREATE INDEX multilink_engTitle on multilink(engTitle);'
        cur.execute(query)
        con.commit()
    except psycopg2.DatabaseError as e:
        print('Error %s' % e)
        sys.exit(-1)
    finally:
        if con is not None:
            con.close()
def main():
    """Command-line lookup of wiki abstracts by title.

    Options: ``-i`` input string (required), ``-l`` language (required, one
    of EN|ES|RU|FA), ``-s`` substring match, ``-c`` case-sensitive match,
    ``-p`` preferred meaning, ``-d`` debug output, ``--stdout`` print only
    the first abstract.

    With ``-p`` the search settings are first handed to
    ``get_paragraph_prefer.prefer_search``; when that returns 0 this
    function returns 0 without querying. Otherwise spaces in the input are
    replaced by underscores and the ``wiki_<lang>`` table is queried on
    ``title``, using ``ilike`` by default, ``like`` for a case-sensitive
    substring match and ``=`` for a case-sensitive exact match.

    On a ``psycopg2.DatabaseError`` the error is printed and the process
    exits with status 1. The connection is closed on every path.
    """
    from optparse import OptionParser

    # Database settings
    CONN_STRING = global_setting.get_CONN()

    # Options
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-i", "--input", dest="inword",
                      help="input string (example: \"Barack Obama\")")
    parser.add_option("-l", "--lang", dest="lang",
                      help="language (one of EN|RU|ES|FA)")
    parser.add_option(
        "-s", "--substring", dest="substring", action="store_true",
        help="match input string as substring (default is exact match)",
        default=False)
    parser.add_option(
        "-c", "--casesensitive", dest="case_sensitive", action="store_true",
        help="match input string as case-sensitive "
             "(default is case-insensitive)",
        default=False)
    parser.add_option(
        "-p", "--preferredmeaning", dest="preferred_meaning",
        action="store_true",
        help="return preferred meaning of category (default is NOT preferred)",
        default=False)
    parser.add_option("-d", "--debug", dest="debug", action="store_true",
                      help="output debug info, default is false",
                      default=False)
    parser.add_option("--stdout", dest='stdout', action='store_true',
                      help='direct write the output to stdout', default=False)
    (options, args) = parser.parse_args()
    if not options.inword:
        parser.error(
            "Must supply input string. (Example: -i \"Barack Obama\")")
    if not options.lang:
        parser.error(
            "Must supply language. "
            "(Example: -l EN; allowed languages: EN|ES|RU|FA)")

    inword = options.inword
    substring = options.substring
    case_sensitive = options.case_sensitive
    preferred_meaning = options.preferred_meaning
    debug = options.debug
    stdout = options.stdout

    # Prepare the per-language suffixes and table name.
    langIndex = {"EN": 0, "ES": 1, "RU": 2, "FA": 3}
    qlangs = ['@eng', '@spa', '@rus', '@fas']
    wlangs = ['@en', '@es', '@ru', '@fa']
    tables = ['wiki_en', 'wiki_es', 'wiki_ru', 'wiki_fa']
    # Accept the code in any case and reject unknown codes with a usage
    # error instead of an uncaught KeyError.
    lang = options.lang.upper()
    if lang not in langIndex:
        parser.error(
            "Unsupported language %r (allowed languages: EN|ES|RU|FA)"
            % options.lang)
    lindex = langIndex[lang]
    qlang = qlangs[lindex]
    wlang = wlangs[lindex]
    table_name = tables[lindex]

    if preferred_meaning:
        myset = setting()
        myset.inword = inword
        myset.substring = substring
        myset.case_sensitive = case_sensitive
        myset.preferred_meaning = preferred_meaning
        myset.qlang = qlang
        myset.wlang = wlang
        myset.table_name = table_name
        myset.lang = lang
        myset.CONN_STRING = CONN_STRING
        myset.debug = debug
        goon = get_paragraph_prefer.prefer_search(myset)
        if goon == 0:
            return 0

    # Prepare search word
    inword = inword.replace(" ", "_")
    if substring:
        inword = '%' + inword + '%'

    # Build query. table_name comes from the fixed list above, so it is safe
    # to splice in; the search word is bound as a parameter so that quotes
    # in a title cannot break or alter the statement.
    if not case_sensitive:
        operator = 'ilike'
    elif substring:
        operator = 'like'
    else:
        # Case-sensitive exact match
        operator = '='
    query = ('select title,abstract from ' + table_name
             + ' where title ' + operator + ' %s')
    if debug:
        print("Query: %s [title parameter: %r]" % (query, inword))

    con = None
    try:
        con = psycopg2.connect(CONN_STRING)
        con.set_client_encoding('UTF8')
        cur = con.cursor(cursor_factory=psycopg2.extras.DictCursor)
        cur.execute(query, (inword,))
        # Get result
        rows = cur.fetchall()
        if stdout:
            if rows:
                print(rows[0]['abstract'])
            else:
                print('no result!')
        else:
            for i, row in enumerate(rows, 1):
                print('#' + str(i) + " TITLE: " + row['title'])
                print('#' + str(i) + " ABSTRACT: " + row['abstract'])
    except psycopg2.DatabaseError as e:
        print('Error %s' % e)
        sys.exit(1)
    finally:
        if con is not None:
            con.close()
def main():
    """Rebuild the ``multilink`` table from an interlanguage-links .nt file.

    Reads ``interlanguage_links_same_as_en.nt`` from the directory given
    with ``-d/--dir``. The table is dropped and recreated, then every line
    whose third space-separated field starts with the es/fa/ru DBpedia
    resource prefix becomes an ``(engTitle, lang, otherTitle)`` record,
    with titles obtained from ``extractTitle``. Records are handed to
    ``insert_records`` in batches, and an index on ``engTitle`` is created
    at the end.

    On a ``psycopg2.DatabaseError`` the error is printed and the process
    exits with status -1. The input file and the connection are closed on
    every path.
    """
    from optparse import OptionParser

    # Options
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage)
    parser.add_option("-d", "--dir", dest="dirPath",
                      help="the *.nt files dir path")
    (options, args) = parser.parse_args()
    if not options.dirPath:
        parser.error("Please input the dir path")
    dirPath = options.dirPath
    filePath = dirPath + '/' + 'interlanguage_links_same_as_en.nt'
    CONN_STRING = global_setting.get_CONN()

    # Resource prefix of the third field -> language code stored in the table.
    prefixes = (
        ('<http://es.dbpedia.org/resource', 'ES'),
        ('<http://fa.dbpedia.org/resource', 'FA'),
        ('<http://ru.dbpedia.org/resource', 'RU'),
    )

    con = None
    try:
        con = psycopg2.connect(CONN_STRING)
        con.set_client_encoding('UTF8')
        cur = con.cursor()

        # Create table
        query = 'DROP TABLE IF EXISTS multilink;'
        cur.execute(query)
        con.commit()
        print(query)
        query = 'CREATE TABLE multiLink(engTitle varchar,lang varchar, otherTitle varchar);'
        cur.execute(query)
        con.commit()
        print(query)

        # Process file
        records = []
        i = 0
        with open(filePath, 'r') as nt_file:
            # The first line is discarded (presumably a header comment;
            # confirm against the dump format).
            nt_file.readline()
            for line in nt_file:
                # NOTE(review): str.decode exists only on Python 2 byte
                # strings; this needs revisiting if the file is ported.
                line = line.decode('raw_unicode_escape')
                ll = line.split(' ')
                if len(ll) < 3:
                    # Blank or malformed line: no third field to inspect.
                    continue
                lang = ''
                for prefix, code in prefixes:
                    if ll[2].startswith(prefix):
                        lang = code
                        break
                if lang == '':
                    continue
                i += 1
                engTitle = extractTitle(ll[0])
                otherTitle = extractTitle(ll[2])
                records.append((engTitle, lang, otherTitle))
                if len(records) > 100:
                    insert_records(con, records)
                    records = []
                if i % 10000 == 0:
                    # The original passed i as a second print argument, so
                    # the %d placeholder was never filled in.
                    print('%d records inserted!' % i)
        # Flush the last partial batch.
        if records:
            insert_records(con, records)
        print(i)
        con.commit()

        # Build index
        query = 'CREATE INDEX multilink_engTitle on multilink(engTitle);'
        cur.execute(query)
        con.commit()
    except psycopg2.DatabaseError as e:
        print('Error %s' % e)
        sys.exit(-1)
    finally:
        if con is not None:
            con.close()