def main():
    OKDOC = 0
    conn = get_connection()
    curr = conn.cursor()
    while True:
        # fetch a batch of documents that still have instances without a sentence id
        curr.execute("""
            SELECT id, text, sentences FROM documents
            WHERE language IN ('de', 'en')
            -- AND id = 18811  -- XXX
            -- AND _stanford = True
            AND sentences IS NOT NULL
            AND EXISTS (
                SELECT 1 FROM instances
                WHERE item_id = documents.id AND sid IS NULL
            )
            -- ORDER BY random()
            LIMIT 50""")
        count = 0
        for doc in curr:
            id, text, sents = doc
            try:
                errmsg = analyze(id, text, sents)
                if errmsg:
                    print "ERROR", errmsg
                else:
                    OKDOC += 1
            except IndexError, e:
                print "Error for id = ", id
                print "IndexError", str(e)
            count += 1
        if count == 0:
            print "FINISH"
            break
def main(logger):
    conn = get_connection()
    if len(sys.argv) < 2:
        print "USAGE ./master.py JOB_NUMBER [JOB_NUMBER]+\nSee job_list in source code!"
        return 1

    jobs_list = []
    numbers = []
    for number in sys.argv[1:]:
        job_list_number = int(number)
        numbers.append(job_list_number)
        job_lists = {
            # Just for test
            0: [(testRaise, (), {}),
                (testRaise, (), {}),  # TEST
                # (testReturn, (), {})
                ],
            1: [(cron, ('crontab',), {})],  # NORMAL
            # Obsolete
            2: [(stanford, (), {'cursor': conn.cursor(), 'sys_path': '../db'})],
            # Obsolete
            3: [(analyzed, ('../stahovak/analyzed.py',), {})],
            # Obsolete
            4: [(url_stahovak, (), {'cursor': conn.cursor(), 'sys_path': '../stahovak'})],
        }
        jobs_list += job_lists[job_list_number]

    #handler = logging.handlers.TimedRotatingFileHandler(
    #    LOG_DIR + "+".join(sys.argv[1:]) + "-" + LOG_FILENAME, when='D', interval=1, backupCount=3)
    #handler.setFormatter(logging.Formatter(fmt='%(asctime)s\t%(levelname)s\t%(name)s\t%(message)s'))
    #logger.addHandler(handler)
    logger.info('Starting with jobs {%s}' % ', '.join(map(str, numbers)))

    # spawn one worker per job; fall back to threads when multiprocessing is unavailable
    jobs = []
    for i, todo in enumerate(jobs_list):
        (target, args, kwargs) = todo
        kwargs['__process'] = i
        if multiprocessing:
            p = multiprocessing.Process(target=target, args=args, kwargs=kwargs)
        else:
            p = threading.Thread(target=target, args=args, kwargs=kwargs)
        jobs.append((p, todo))
        p.start()

    for job, etc in jobs:
        job.join()
    return 0
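# Hedged note, not part of the original job_lists: each job entry is a
# (target, args, kwargs) tuple; main() spawns one Process (or Thread) per tuple and
# injects the job index as kwargs['__process'].  A new entry would follow the same
# shape, e.g. (the key 5 and my_worker are purely hypothetical):
#
#     5: [(my_worker, ('some-arg',), {'cursor': conn.cursor()})],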
        else:
            occ[id] = 1

    pairs = []
    for id, counts in occ.iteritems():
        pairs.append((id, counts))

    # sort and make string
    restokens = []
    for id, occ_token in sorted(pairs, key=lambda a: a[0]):
        restokens.append("%s:%d" % (id, occ_token))
    termvector = " ".join(restokens)
    return termvector


if __name__ == "__main__":
    LIMIT = 20
    conn = get_connection(UNICODE=True)
    # conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT)
    print "Analyzing whole database"
    sys.stdout.flush()
    while True:
        curr = conn.cursor()
        curr.execute(
            """SELECT id, language, text FROM documents
               WHERE termvector IS NULL
               AND language IN ('en', 'de')
               AND pubdate >= '01-05-2011' AND pubdate < '01-07-2011'
               LIMIT %s""", (LIMIT,))
        # curr.execute("""SELECT id, text FROM documents WHERE id=20875243""")
        count = 0
        for document in curr:
            id, lang, text = document
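# Hedged illustration (the token ids below are hypothetical, not from the database):
# the termvector built above is a space-separated string of sorted "token_id:count" pairs.
def _termvector_example():
    occ = {17: 2, 3: 5}  # token id -> occurrence count
    restokens = ["%s:%d" % (i, c) for i, c in sorted(occ.iteritems())]
    return " ".join(restokens)  # -> "3:5 17:2"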
    data = set(data)
    # tuples to lists
    data = map(list, data)
    return data


if __name__ == '__main__':
    data = {}
    print "DATA FETCHING"
    for name, v in config.iteritems():
        url = v[0]
        data[name] = load(url)
        print "\t%s : %d" % (name, len(data[name]))
    print
    print "DB INTERACT"
    conn = get_connection()

    logfile = open(LOG_FILE, 'a')
    logfile.write("=" * 80)
    logfile.write("\nSTART (" + str(timer.timestamp()) + ')\n')
    outfile = open(OUT_FOLDER + str(timer.timestamp()) + '.csv', 'wb')
    csv_writer = UnicodeWriter(outfile)

    results = {}
    for name, values in data.iteritems():
        cols = config[name][1]
        cur = conn.cursor()
        cur.execute('select %s from %s' % (', '.join(cols), name))
        in_db = cur.fetchall()
        cur_insert = conn.cursor()
        print "\ttable : %s" % name
def main():
    # logging init
    logger = logging.getLogger("db_stahovak")
    logger.setLevel(logging.WARNING)
    # start info
    logger.info("START")
    # classifier
    tcl = TwitterClassifier()
    # get twitter ids - only twitter documents should be classified
    conn = get_connection()
    cursor = conn.cursor()
    cursor.execute("select id from sources_twitter")
    twitter_ids = [id[0] for id in cursor]

    while True:
        # feeds init
        # XXX - performance problems - sources should be before while...
        sources = MSources()
        sources.get_multi(where="_stahovak = true")
        feeds = [Sources(**data) for data in sources.value()]
        assert feeds

        items_count = 0
        for source in feeds:
            logger.info("SOURCE\tSECTION:%s\tLINK:%s"
                        % (source.get_section(), source.get_link()))
            modified = str2tuple(source.get_modified())
            data = downloader.download(source.get_link(), source.get_etag(), modified)

            # update etag/modified
            if data['etag'] or data['modified']:
                diff = False
                if source.get_etag() != data['etag']:
                    diff = True
                    source.set_etag(data['etag'])
                if modified != data['modified']:
                    diff = True
                    source.set_modified(tuple2str(data['modified']))
                if diff:
                    source.update()

            classified_as_irelevant = 0
            # work with items
            for item in data['items']:
                items_count += 1
                # prepare new database insert
                Item = Documents()
                Item.set_timestamp(timer.timestamp())
                Item.set_source_id(source.get_id())
                Item.set_language(source.get_language())
                Item.set_title(control_chars.remove(item['title']))
                Item.set_text(control_chars.remove(item['text']))
                try:
                    Item.set_termvector(
                        get_termvector(Item.get_text(), Item.get_language(), conn))
                except psycopg2.ProgrammingError, e:
                    print str(e)
                    continue

                Item.set__relevance(None)
                # we classify only twitter's documents
                if source.get_id() in twitter_ids:
                    score = tcl.classify(Item.get_text(), Item.get_language())
                    was_classified = (score != -1)
                    if was_classified and score < MIN_SCORE:
                        # skip
                        classified_as_irelevant += 1
                        continue
                    if was_classified:
                        Item.set__relevance(int(score * 100))

                Item.set_link(control_chars.remove(item['link']))
                Item.set_guid(source.get_section() + ":" + control_chars.remove(item['guid']))
                if item['pubDate']:
                    pubDate = time.strftime("%Y-%m-%d", item['pubDate'])
                    if pubDate:
                        Item.set_pubDate(pubDate)
                    pubTime = time.strftime("%H:%M:%S%z", item['pubDate'])
                    if pubTime:
                        Item.set_pubTime(pubTime)
                if not Item.get_pubDate():
                    # don't want items without pubdate
                    continue

                ## following links
                if source.get__follow():
                    url = item['link']
                    logger.debug("Following LINK:%s", url)
                    page = downloader.download_url(url)
                    Item.set_text(control_chars.remove(page.get('text', '')))
                    Item.set_html_description(control_chars.remove(page.get('description', "")))
                    Item.set_html_keywords(control_chars.remove(page.get('keywords', "")))

                # insert it
                if Item.get_text():
                    inserted, id = Item.insert()
                    if inserted:
                        logger.debug("Document successfully inserted into db with id=%s" % Item.get_id())
                        yield str(id)  # output
                    else:
                        logger.debug("Document already in db with id=%s" % id)
                else:
                    logger.info("Item has no text!")

            # outputting
            logger.info("Created OUTPUT\tITEMS:%d\tIRELEVANT:%d",
                        data['items_count'], classified_as_irelevant)

        if not items_count:
            print "going to sleep"
            timer.sleep_second(SLEEP_TIME)
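# Hedged usage sketch (the driver loop is assumed, it is not part of the snippet above):
# main() is a generator that yields the id of each newly inserted document, so a caller
# would simply drain it:
#
#     for new_id in main():
#         print "inserted document", new_id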
def main():
    conn = get_connection(UNICODE=True)
    curr = conn.cursor()
    tokenizer = TreebankWordTokenizer()
    while True:
        curr.execute("""SELECT id, text, language FROM documents
            WHERE --guid='tw:122144569302323201'
            EXISTS (
                SELECT 1 FROM instances
                WHERE item_id=documents.id AND begintoken IS NULL)
            LIMIT 1""")
        data = curr.fetchone()
        if data is None:
            print "sleep"
            timer.sleep_minute(30)
            continue
        id, text, lang = data
        print "id", id

        curr.execute("""SELECT * FROM instances
                        WHERE item_id = %s AND begintoken IS NULL""", (id,))
        # throw away `confidence`
        instances = [list(x)[:-1] for x in curr]
        if not len(instances):
            continue
        # reset sid/begintoken/endtoken before recomputing them
        instance_ = []
        for ins in instances:
            ins[-1] = None
            ins[-2] = None
            ins[-3] = None
            instance_.append(ins)
        instances = instance_
        #print instances

        sent_tok = PunktSentenceTokenizer()
        for sid, sentidx in enumerate(sent_tok.span_tokenize(text)):
            #print '++++'
            sentence = text[sentidx[0]:sentidx[1]]
            #print sentence
            #print '----'
            for pos, indexes in enumerate(WhitespaceTokenizer().span_tokenize(sentence)):
                # TODO: the indexes are only relative to the sentence,
                # but instances store them from the start of the document!
                indexes = list(indexes)
                indexes[0] = sentidx[0] + indexes[0]
                indexes[1] = sentidx[0] + indexes[1]
                word = text[indexes[0]:indexes[1]]
                #print pos, word, indexes
                for i, instance in enumerate(instances):
                    id, entity_id, item_id, exact, offset, length, sid_, begin, end = instance
                    #print i, instance
                    if sid_ is None:
                        if begin is None:
                            if offset >= indexes[0] and offset <= indexes[1]:
                                instances[i][-2] = begin = pos
                                instances[i][-3] = sid_ = sid
                    if sid_ == sid:
                        if end is None and begin is not None:
                            off = offset + length
                            if off <= indexes[1] and off >= indexes[0]:
                                instances[i][-1] = pos
                            if off == indexes[0]:
                                instances[i][-1] = pos - 1

        for instance in instances:
            print instance
            id, entity_id, item_id, exact, offset, length, sid, begin, end = instance
            #print exact, ">>", sid, begin, end
            if end is None:
                if not " " in exact:
                    end = begin
                else:
                    end = -1
            curr.execute("""UPDATE instances SET sid=%s, begintoken=%s, endtoken=%s
                            WHERE id=%s""", (sid, begin, end, id))
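# Hedged illustration of the offset arithmetic above (the sample text is made up):
# PunktSentenceTokenizer.span_tokenize() yields (start, end) character spans per
# sentence, while WhitespaceTokenizer.span_tokenize() yields spans relative to the
# string it receives, which is why the code adds sentidx[0] back to get absolute offsets.
from nltk.tokenize import PunktSentenceTokenizer, WhitespaceTokenizer

def _span_example():
    text = "Fever is common. Aspirin may help."  # hypothetical document text
    for sid, (s, e) in enumerate(PunktSentenceTokenizer().span_tokenize(text)):
        sentence = text[s:e]
        for pos, (ws, we) in enumerate(WhitespaceTokenizer().span_tokenize(sentence)):
            # absolute span = sentence start + sentence-relative span
            print sid, pos, (s + ws, s + we), text[s + ws:s + we]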
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)


# data output
fileDataDiseases = UnicodeWriter(open(FOLDER + 'diseases_list.dat', 'w'),
                                 dialect=csv.excel)
fileDataSymptoms = UnicodeWriter(open(FOLDER + 'symptoms_list.dat', 'w'),
                                 dialect=csv.excel)

conn = get_connection(UNICODE=True)
cur = conn.cursor()
today = '%s' % date.today()

## count documents by languages
#cur.execute("""select language, count(*) from documents group by language""")
#for lang, count in cur:
#    f = open(FOLDER + 'languages-%s.dat' % lang, 'a')
#    f.write("%s\t%d\n" % (today, count))
#    f.flush()
#    f.close()
#cur.execute("""select count(*) from documents where language not in ('en', 'de')""")
#(count,) = cur.fetchone()
#f = open(FOLDER + 'languages-others.dat', 'a')
#f.write("%s\t%s\n" % (today, count))
#f.flush()
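# Hedged usage sketch (the row values are purely illustrative): UnicodeWriter follows
# the Python 2 csv-module recipe for unicode output, so the writers created above are
# used like a plain csv.writer:
#
#     fileDataDiseases.writerow([u'some', u'unicode', u'fields'])
#     fileDataDiseases.writerows([[u'more', u'rows'], [u'go', u'here']])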