def main():
    db, cur = cleanDb.openConnection()
    dest_dir = sys.argv[1]
    # each command-line arg is lang_altcode:iso_code:language_name:tld1,tld2,...
    languages = {lang: (liso, lname, set(tlds.split(',')))
                 for lang, liso, lname, tlds in [x.split(':') for x in sys.argv[2:]]}
    tbl = open(dest_dir + '/table.txt', 'w')
    for lang in languages:
        liso, lname, tlds = languages[lang]
        print 'begin: ', lang, tlds  # tlds = set of tlds, lang = lang_altcode
        Qparams = {
            'tlds': tuple(tlds),
            'excluded': tuple(excluded_languages - {lang} - known_fails.get(lang, set())),
            'limit': 1000}
        print "Q params", Qparams
        cur.execute(Q, Qparams)
        print cur.rowcount, "rows"
        s = cur.fetchone()[0]
        s = re.sub(r"(\s|[0-9])+", " ", s, flags=re.MULTILINE)
        print 'strlen =', len(s)
        #print s
        #raw_input()
        tgms = defaultdict(int)
        for tgm in trigrams(s):
            tgms[tgm] += 1
        #print tgms
        tbl.write('%s\t%s\t%s\n' % (lang, liso, lname))
        codecs.open((dest_dir + '/%s-3grams.txt') % lang, 'w', encoding='utf-8').write(
            '\n'.join("%d %s" % (tgms[tgm], tgm) for tgm in tgms))
    print 'end'
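
# `trigrams()` is referenced above but not defined in this excerpt. A minimal
# sketch of what it might look like (assumption: character-level trigrams over
# the whitespace-normalized string, matching the "%s-3grams.txt" output):
def trigrams(s):
    """Yield all overlapping character trigrams of `s` (hypothetical helper)."""
    for i in xrange(len(s) - 2):
        yield s[i:i + 3]
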
def enrycher_worker(in_queue, out_queue, url=None):
    """
    Worker thread. Takes an article dict from in_queue, adds the enrycher xml,
    puts the enryched article in out_queue.
    If `url` is given, queries Enrycher at that URL; otherwise the URL is
    constructed based on the language of each article in in_queue.
    """
    conn, cur = openConnection('rych info writer')
    while True:
        try:
            article = in_queue.get()
            lang = article.get('lang', '').split('-')[0]
            # auto-detect URL
            if not url:
                if lang in ('en', 'eng', 'enz'):
                    if 0 and article.get('google_story_id'):
                        url = 'http://aidemo.ijs.si:8080/EnrycherWeb-render/run-render'  # all + stanford parses + sentiment
                    else:
                        url = 'http://aidemo.ijs.si:8080/EnrycherWeb-render/run-demo'
                elif lang in ('sl', 'slv'):
                    url = 'http://aidemo.ijs.si:8080/EnrycherWeb-render/sl-run'
                else:
                    raise ValueError('Unsupported language: %r' % lang)
            #print '[%s] pre-enrych %s' % (threading.currentThread().name, article['id'])
            #print article['id'], lang, `article.get('google_story_id')`, url
            article['rych'] = enrych(article['cleartext'], url)
            #print '[%s] pre-db %s' % (threading.currentThread().name, article['id'])
            DB_write_rych_info(cur, article)
            #print '[%s] pre-out-enqueue %s' % (threading.currentThread().name, article['id'])
            out_queue.put(article)
        except Exception as exc:
            # pass through the unenryched article
            out_queue.put(article)
            # report error
            print '!! error while processing article %s (lang %s) at %r' % (
                article.get('id'), article.get('lang'), url)
            txt = article.get('cleartext', '').replace('\n', ' ')
            print 'Some stats about the input data: %d bytes, %d sentences, max sentence length %d bytes. File saved to /tmp/bad_enrycher_input' % (
                len(txt), len(txt.split('. ')), max(map(len, txt.split('. ')) + [-1]))
            print exc, exc.args
            try:
                with open('/tmp/bad_enrycher_input', 'w') as badf:
                    badf.write(txt.encode('utf8'))
            except:
                print '(file not saved, IOError)'
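
# A minimal sketch of how the worker above could be wired up (assumption: the
# real pipeline may differ; N_WORKERS and the feeding/draining code are
# illustrative only):
if __name__ == '__main__':
    from Queue import Queue
    import threading
    in_queue, out_queue = Queue(maxsize=100), Queue()
    N_WORKERS = 2  # hypothetical; one thread per concurrent Enrycher request
    for _ in range(N_WORKERS):
        t = threading.Thread(target=enrycher_worker, args=(in_queue, out_queue))
        t.daemon = True  # workers loop forever; don't block interpreter exit
        t.start()
    # feed article dicts ('id', 'lang', 'cleartext') into in_queue,
    # read enryched articles back from out_queue
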
def main():
    global article, conn
    from cleanDb import openConnection
    conn, cur = openConnection('cleartext feed')
    # disable transactions, so we get NOTIFYs in realtime
    conn.set_isolation_level(psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
    cur_listen = conn.cursor()
    cur_listen.execute("LISTEN have_cleartext")
    zmqctx = zmq.Context()
    sock = zmqctx.socket(zmq.PUB)
    sock.setsockopt(zmq.HWM, 100)
    sock.bind('tcp://*:13371')
    try:
        while True:
            if select.select([conn], [], [], 5) == ([], [], []):
                print '(nothing to do)'
            else:
                conn.poll()
                notifies = conn.notifies
                while notifies:
                    notify = notifies.pop()
                    article_id = int(notify.payload)
                    try:
                        article = DB_get_full_article(cur, article_id)
                        if article is None:
                            print "skipping %s (not found)" % article_id
                            continue
                        elif not article['cleartext']:
                            print "skipping %s (no cleartext)" % article_id
                            continue
                        sock.send_pyobj(article)
                        print "ok %s" % article_id + ('(old)' if article['found_date'].year < 2012 else '')
                    except:
                        print "!!! error while processing %s" % article_id
                        traceback.print_exc()
    except:
        traceback.print_exc()
        return
    finally:
        sock.close()
        zmqctx.term()
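
# A minimal sketch of a consumer for the PUB socket above (assumption: the
# subscriber runs on the same host; SUBSCRIBE is set to the empty prefix so
# every published article is received):
def consume_cleartext_feed():
    import zmq
    ctx = zmq.Context()
    sock = ctx.socket(zmq.SUB)
    sock.setsockopt(zmq.SUBSCRIBE, '')   # no filtering, receive all articles
    sock.connect('tcp://localhost:13371')
    while True:
        article = sock.recv_pyobj()      # dict as sent by sock.send_pyobj(article)
        print 'received article %s (%d chars of cleartext)' % (
            article['id'], len(article.get('cleartext') or ''))
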
""" Get a sample of articles from the news DB, show language distribution according to - existing 'lang' column in the DB - google's CLD (executed on the fly for each article) """ import os, sys sys.path.extend((".", "..")) from cleanDb import openConnection import cld conn, cur = openConnection() cur = conn.cursor("x") cur.execute( "SELECT m.id, p.content, m.lang_altcode FROM processed_article p JOIN feed_article_meta m ON (p.feed_articleid = m.id) WHERE p.mode='cleartext' ORDER BY m.id DESC LIMIT 100000" ) cnt = {} cnt2 = {} while True: row = cur.fetchone() if not row: break aid, txt, lang = row lang = str(lang[:2]) lang2 = cld.detect(txt.encode("utf8", "ignore"))[1] cnt[lang] = cnt.get(lang, 0) + 1 cnt2[lang2] = cnt2.get(lang2, 0) + 1 print "done", sum(cnt.itervalues())
""" Get a sample of articles from the news DB, show language distribution according to - existing 'lang' column in the DB - google's CLD (executed on the fly for each article) """ import os, sys sys.path.extend(('.', '..')) from cleanDb import openConnection import cld conn, cur = openConnection() cur = conn.cursor('x') cur.execute( "SELECT m.id, p.content, m.lang_altcode FROM processed_article p JOIN feed_article_meta m ON (p.feed_articleid = m.id) WHERE p.mode='cleartext' ORDER BY m.id DESC LIMIT 100000" ) cnt = {} cnt2 = {} while True: row = cur.fetchone() if not row: break aid, txt, lang = row lang = str(lang[:2]) lang2 = cld.detect(txt.encode('utf8', 'ignore'))[1] cnt[lang] = cnt.get(lang, 0) + 1 cnt2[lang2] = cnt2.get(lang2, 0) + 1 print 'done', sum(cnt.itervalues()) print 'done'
from Queue import Queue
import sqlite3
import cld
from cleanDb import openConnection
import zmq2zmq_enrych; reload(zmq2zmq_enrych)
from zmq2zmq_enrych import is_enrychable, enrycher_worker
import db2zmq_cleartext2; reload(db2zmq_cleartext2)
from db2zmq_cleartext2 import DB_get_full_article
import serialize2; reload(serialize2)
from iso_map import iso2to3

DB_OUT = '/tmp/deleteme.sqlite'   # path to the sqlite DB
MAX_ENRYCHER_REQUESTS = 2         # max number of simultaneous requests

conn_in, cur_in = openConnection('history_export')
cur_in_ids = conn_in.cursor('foo')  # named cursor for incremental fetching
conn_out = sqlite3.connect(DB_OUT)
conn_out.isolation_level = None
cur_out = conn_out.cursor()

# set up the output DB
cur_out.execute("CREATE TABLE IF NOT EXISTS news (article_id integer primary key, story_id text, xml text);")
cur_out.execute("CREATE INDEX IF NOT EXISTS article_id_idx ON news(article_id);")

# fetch IDs to export
#cur_in_ids.execute("SELECT feed_articleid, story_id FROM feed_article_googles g JOIN feed_article fa ON (fa.id=g.feed_articleid) WHERE fa.id>45909130 AND fa.found BETWEEN '2012-04-01' AND '2012-05-01' ORDER BY id")  # 537466 articles; not taking lang into account
#cur_in_ids.execute("SELECT feed_articleid, story_id FROM feed_article_googles")
#cur_in_ids.execute("SELECT feed_articleid, story_id FROM feed_article_googles g JOIN feed_article fa ON (fa.id=g.feed_articleid) LIMIT 10 OFFSET 10")
#cur_in_ids.execute("SELECT 20932789 AS feed_articleid, 'FAKE_STORY' as story_id")
cur_in_ids.execute("SELECT feed_articleid, story_id FROM feed_article_googles g JOIN feed_article fa ON (fa.id=g.feed_articleid) ORDER BY fa.id DESC LIMIT 1000 OFFSET 1000")
id_rows = cur_in_ids.fetchall()
conn_in.commit()
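
# A minimal sketch of an export loop that could follow the setup above
# (assumption: articles are serialized with a hypothetical `serialize2.to_xml`;
# the real export script may enrych and serialize differently):
for article_id, story_id in id_rows:
    article = DB_get_full_article(cur_in, article_id)
    if article is None or not article.get('cleartext'):
        continue
    xml = serialize2.to_xml(article)  # hypothetical serializer call
    cur_out.execute("INSERT OR REPLACE INTO news (article_id, story_id, xml) VALUES (?, ?, ?)",
                    (article_id, story_id, xml))
conn_out.commit()
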