def deal_items(get_items, row):
    # Counters for inserted and skipped records
    add_count, skip_count = 0, 0
    # Publish time of the most recent article already stored
    last_item_time = str(row['item_time'])
    # Latest publish time after this update completes
    new_item_time = last_item_time
    insert_ret = 0
    for item in get_items:
        # Use the publish time to decide whether the item should be stored
        if item['pub_time'] <= last_item_time:
            skip_count += 1
            utl.log("\tskip for " + str(item['pub_time']) + ' <= ' + str(last_item_time))
            continue
        else:
            add_count += 1
            utl.log("\tadd for " + str(item['pub_time']) + ' > ' + str(last_item_time))
        # Track the newest publish time seen so far
        if item['pub_time'] > new_item_time:
            utl.log("\tupdate pub_time for " + item['pub_time'] + " is > " + new_item_time)
            new_item_time = item['pub_time']
        # The tags field is repurposed to carry the name from origins
        item['tags'] = row['name']
        # Correct the 8-hour timezone offset
        old_time = item['pub_time']
        # Handle the case where only a date (no time) is present
        if 7 < len(old_time) < 11:
            item_timestamp = time.mktime(time.strptime(old_time, '%Y-%m-%d'))
        else:
            item_timestamp = time.mktime(time.strptime(old_time, '%Y-%m-%d %X'))
        adjust_time = item_timestamp + 8 * 3600
        item['pub_time'] = time.strftime('%Y-%m-%d %X', time.localtime(adjust_time))
        # Store the record
        insert_ret = insert_item(item)
        utl.log("insert a record, result in " + str(insert_ret))
    return {
        'item_time': new_item_time,
        'add_count': add_count,
        'skip_count': skip_count,
        'insert_ret': insert_ret,
    }
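# A minimal usage sketch for deal_items (an assumption, not part of the
# original code): parse() below returns the article list, row is a DictCursor
# row from the origins table, and the spider fetches feeds with urllib2. The
# update_origin name and the origins.item_time write-back are illustrative.
def update_origin(rs, conn, row):
    import urllib2  # assumed fetch method; the real spider may differ
    items = parse(urllib2.urlopen(row['url']).read()) or []
    ret = deal_items(items, row)
    # Persist the new high-water mark so the next run can skip old items
    rs.execute("UPDATE origins SET item_time = %s WHERE id = %s",
               (ret['item_time'], row['id']))
    conn.commit()
    utl.log("added %d, skipped %d" % (ret['add_count'], ret['skip_count']))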
def insert_item(record):
    # Check required fields
    neces = ['title', 'url']
    for field in neces:
        if field not in record:
            utl.log("ERR there is no field: " + field, 1)
            return 0
    # Length check (detail is not in neces, so guard against it missing)
    if len(record.get('detail', '')) > 250:
        record['detail'] = record['detail'][:250]
    conn = MySQLdb.connect(host=gl.DBHOST, user=gl.DBUSER, passwd=gl.DBPASS,
                           db='laravel_db', charset='utf8')
    rs = conn.cursor(cursorclass=MySQLdb.cursors.DictCursor)
    #hicktodo prevent duplicate inserts by checking title, source and time
    insert_ret = 0
    try:
        # A single row is inserted, so use execute rather than executemany
        insert_ret = rs.execute(
            """INSERT INTO chips (title, url, summary, author, author_url, detail,
                   pub_time, tags, source, created_at, updated_at)
               VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
            (record['title'], record['url'], record['summary'], record['author'],
             record['author_url'], record['detail'], record['pub_time'],
             record['tags'], record['source'],
             time.strftime('%Y-%m-%d %H:%M'), time.strftime('%Y-%m-%d %H:%M')))
    except Exception as e:
        utl.log("ERR insert a record: " + str(e), 1)
    finally:
        rs.close()
        conn.commit()
        conn.close()
    return insert_ret
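# A possible sketch for the #hicktodo above (an assumption, not the original
# author's method): skip the INSERT when a record with the same title, source
# and pub_time already exists. item_exists is an illustrative name; a UNIQUE
# key on (title, source, pub_time) plus INSERT IGNORE would be the cheaper
# alternative.
def item_exists(rs, record):
    rs.execute("SELECT id FROM chips WHERE title = %s AND source = %s AND pub_time = %s",
               (record['title'], record['source'], record['pub_time']))
    return rs.fetchone() is not None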
def backup_local():
    ''' rsync '''
    log(u'Starting local backup: mirroring the mycis system to drive %s ...' % sd)
    try:
        subprocess.check_call([rsync, '-rltz', '--delete', '--progress',
                               '--modify-window=1',
                               '/cygdrive/d/mycis/', '/cygdrive/%s/mycis/' % sd])
        log(u'Local backup finished!')
    except Exception:
        log(u'Cannot mirror the mycis system to drive %s; local backup failed ...' % sd)
def backup_remote_inc_ver():
    td = tempfile.mkdtemp(dir=tmp)
    log('Starting to do incremental backup!')
    log('Generating version string ...')
    ver = cat(repo, 'version')
    if not os.path.isfile(ver):
        log('Initialize the version string: starting from 0.')
        open(ver, 'w').write('0')
    ed = int(open(ver, 'r').read())
    ed_old = str(ed).zfill(8)
    ed_new = str(ed + 1).zfill(8)
    log('Get database of previous version ...')
    old = cat(repo, db)
    if not os.path.isfile(old):
        log('This is the first time doing delta. Copy to repo.')
        shutil.copyfile(db, old)
    old_coded = cat(repo, 'mycis_coded.db')
    new_ = cat(base, db)
    new = cat(td, db)
    # Work on a copy to generate the delta, otherwise there will be errors
    # if the live database changes underneath us.
    log('Use a copy to generate delta, otherwise there will be errors ...')
    shutil.copyfile(new_, new)
    delta_fn = 'delta-%s-%s' % (ed_old, ed_new)
    delta = cat(td, delta_fn)
    log('Computing md5 ...')
    md5_old = hashlib.md5(open(old, 'rb').read()).hexdigest()
    md5_new = hashlib.md5(open(new, 'rb').read()).hexdigest()
    if md5_old == md5_new:
        log('No new delta is needed: Stop.')
        cls(td)
        return
    log('Now generating delta, please wait ...')
    subprocess.call([xdelta, '-e', '-f', '-s', old, new, delta])
    # Example delta msg:
    #   From: cwtu_001, To: admin
    #   Subject: delta-00000001-00000002
    #   Text:
    #     00000001=(md5 of mycis.db ed.00000001)
    #     00000002=(md5 of mycis.db ed.00000002)
    #   Attachment: delta-00000001-00000002
    log('Authoring the delta msg ...')
    msg = message(cid, (admin,), delta_fn,
                  '%s=%s\n%s=%s' % (ed_old, md5_old, ed_new, md5_new), [delta])
    log('Uploading delta, please wait ...')
    try:
        ch = channel()
        ch.append(cid, '', msg_time(), str(msg))
        # Apply the delta to the old copy so the repo catches up: old <- new.
        subprocess.call([xdelta, '-d', '-f', '-s', old, delta, old_coded])
        md5_ = hashlib.md5(open(old_coded, 'rb').read()).hexdigest()
        if md5_ == md5_new:
            shutil.move(old_coded, old)
        else:
            # Should be very rare ...
            log('md5 mismatch after applying delta; discarding it.')
            os.remove(old_coded)
            return
        ed += 1
        open(ver, 'w').write(str(ed))
        log('Upload delta successfully!')
    except Exception:
        log('Upload delta / Manage File unsuccessfully. Please check all settings & report to cytu!')
    finally:
        try:
            ch.close()
        except Exception:
            pass
        try:
            ch.logout()
        except Exception:
            pass
        cls(td)
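# A hedged restore-side sketch (not in the original code): given a full base
# copy of mycis.db and the downloaded delta-XXXXXXXX-XXXXXXXX attachments in
# delta_dir, replay them in version order with xdelta -d (the inverse of the
# -e call above) to rebuild the latest edition. restore_from_deltas and
# delta_dir are illustrative names; how the attachments get downloaded is
# assumed to happen elsewhere.
def restore_from_deltas(base_db, delta_dir, out_db):
    import glob
    cur = base_db
    for delta in sorted(glob.glob(os.path.join(delta_dir, 'delta-*-*'))):
        nxt = cur + '.next'
        subprocess.check_call([xdelta, '-d', '-f', '-s', cur, delta, nxt])
        if cur != base_db:
            os.remove(cur)  # drop intermediate editions
        cur = nxt
    shutil.copyfile(cur, out_db)
    if cur != base_db:
        os.remove(cur)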
def backup_remote_all(dbc=''):
    td = tempfile.mkdtemp(dir=tmp)
    ud_s = cat(td, '%s.7z' % db)
    new_ = dbc if dbc else cat(base, db)
    new = cat(td, db)
    try:
        log(u'Starting full database upload. Building the compressed archive ...')
        shutil.copyfile(new_, new)
        subprocess.check_call([z, 'a', '-t7z', ud_s, new])
    except Exception:
        log(u'Cannot build the compressed archive; upload failed ...')
        cls(td)
        return False
    md5 = hashlib.md5(open(new, 'rb').read()).hexdigest()
    # Example backup msg:
    #   From: cwtu_001
    #   To: admin
    #   Subject: backup 2009-10-11 06:00:00
    #   Text: (md5 of db)
    #   Attachment: mycis.db.7z
    msg = message(cid, (admin,), 'backup ' + time_stamp(), md5, [ud_s])
    try:
        log(u'The full upload takes a few minutes; please keep this window open and wait!')
        ch = channel()
        ch.append(cid, '', msg_time(), str(msg))
        # Delete all previous deltas, if any.
        try:
            ch.select('[Google Mail]/All Mail')
            r, [ids] = ch.search(None, '(FROM "%s" TEXT "delta")' % cid)
            if ids.split():
                ch.copy(','.join(ids.split()), '[Google Mail]/Trash')
        except Exception:
            log(u'(please clean up old backup records manually)')
        log(u'Full database upload succeeded!')
        ans = True
    except Exception:
        log(u'Full database upload failed ...')
        ans = False
    finally:
        try:
            ch.close()
        except Exception:
            pass
        try:
            ch.logout()
        except Exception:
            pass
        cls(td)
    return ans
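# A hedged restore sketch for the full backup above (not part of the original
# code): extract a downloaded mycis.db.7z into out_dir with the same 7z
# binary and verify it against the md5 carried in the message body.
# restore_full is an illustrative name.
def restore_full(archive, md5_expected, out_dir):
    subprocess.check_call([z, 'x', '-y', '-o%s' % out_dir, archive])
    restored = cat(out_dir, db)
    md5_got = hashlib.md5(open(restored, 'rb').read()).hexdigest()
    return md5_got == md5_expected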
def backup_remote_inc():
    log(u'Starting remote backup ...')
    td = tempfile.mkdtemp(dir=tmp)
    old = cat(repo, db)
    if not os.path.isfile(old):
        log(u'This is the first time doing backup. Copy to repo.')
        shutil.copyfile(db, old)
    new_ = cat(base, db)
    new = cat(td, db)
    # Use a copy to generate the delta, otherwise there will be errors.
    shutil.copyfile(new_, new)
    delta_fn = 'delta'
    delta = cat(td, delta_fn)
    md5_old = hashlib.md5(open(old, 'rb').read()).hexdigest()
    md5_new = hashlib.md5(open(new, 'rb').read()).hexdigest()
    if md5_old == md5_new:
        log(u'Old and new database versions are identical; no remote backup needed!')
        cls(td)
        return
    log(u'Building the database delta, please wait ...')
    subprocess.check_call([xdelta, '-e', '-f', '-s', old, new, delta])
    # Example delta msg:
    #   From: cwtu_001, To: admin
    #   Subject: delta 2009-10-11 06:00:00
    #   Text:
    #     (md5 of old db)
    #     (md5 of new db)
    #   Attachment: delta
    tm = time_stamp()
    msg = message(cid, (admin,), delta_fn + ' ' + tm,
                  '%s\n%s' % (md5_old, md5_new), [delta])
    log(u'Uploading the database delta, please wait ...')
    try:
        ch = channel()
        ch.append(cid, '', msg_time(), str(msg))
        log(u'Remote backup succeeded!')
    except Exception:
        log(u'Remote backup failed ...')
    finally:
        # Close and log out separately so a failed close cannot skip logout
        try:
            ch.close()
        except Exception:
            pass
        try:
            ch.logout()
        except Exception:
            pass
        cls(td)
def parse(txt):
    r = feedparser.parse(txt)
    article_list = []
    if len(r.entries) > 0:
        for item in r.entries:
            # Some fields may be missing; log such errors instead of crashing.
            url = item.link
            try:
                # Jobbole URLs are absurdly long; for jobbole.com links, strip
                # everything after the question mark.
                find_bole = url.find("jobbole.com")
                if 0 < find_bole < 20:
                    url = url.split("?")[0]
                # Some feeds (e.g. the http://www.lupaworld.com rss20) have no
                # author_detail at all.
                if hasattr(item, 'author_detail'):
                    author = item.author_detail.name
                    author_url = hasattr(item.author_detail, 'href') and item.author_detail.href or url
                else:
                    author = ''
                    author_url = item.link
                # Some items have no published_parsed; `print item` shows the
                # full attribute list (dir() does not, oddly).
                if hasattr(item, "published_parsed"):
                    pub_time = time.strftime('%Y-%m-%d %H:%M:%S', item.published_parsed)
                # Format supported by ATOM feeds
                elif hasattr(item, "updated_parsed"):
                    pub_time = time.strftime('%Y-%m-%d %H:%M:%S', item.updated_parsed)
                else:
                    #hicktodo save an MD5 of the URL (e.g. /tmp/url_MD5) to tell
                    # whether an item was fetched before: skip if it exists,
                    # otherwise add it with the current time.
                    # item.updated arrives as e.g. 2011-12-20T12:38:22+00:00 and
                    # is not converted yet, so skip such items for now.
                    utl.log("no parsable publish time for " + url, 1)
                    continue
                article = {
                    'title': item.title,
                    'url': url,
                    'summary': item.summary[0:252],
                    'author': author,
                    'author_url': author_url,
                    'pub_time': pub_time,
                    'tags': '',
                    'source': 1,
                    'detail': hasattr(item, 'content') and html2text.html2text(item.content[0].value) or item.summary,
                }
                article_list.append(article)
            except Exception as e:
                utl.log("!!!!!!!!ERR: %s , for url: %s" % (e, url), 1)
    return article_list
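# A hedged sketch for the unconverted item.updated format noted above
# (2011-12-20T12:38:22+00:00): split off the trailing UTC offset, parse the
# rest with strptime, and re-apply the offset to get a UTC timestamp.
# parse_iso8601 is an illustrative name, not part of the original code.
def parse_iso8601(s):
    import calendar
    s = s.strip()
    offset = 0
    if s.endswith('Z'):
        s = s[:-1]
    elif len(s) > 6 and s[-6] in '+-':
        sign = 1 if s[-6] == '+' else -1
        offset = sign * (int(s[-5:-3]) * 3600 + int(s[-2:]) * 60)
        s = s[:-6]
    ts = calendar.timegm(time.strptime(s, '%Y-%m-%dT%H:%M:%S')) - offset
    return time.strftime('%Y-%m-%d %H:%M:%S', time.gmtime(ts))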
del_sql = "delete from chips where deleted > 0"
rs.execute(del_sql)
conn.commit()
#hicktodo limit this query to a time range
sql = "SELECT * FROM origins WHERE " + sql_where
rs.execute(sql)
all_rows = rs.fetchall()
if len(all_rows) < 1:
    utl.log("no source need to be spidered", 1)
for row in all_rows:
    # Source id
    res_id = row['id']
    # The module name only selects the logic; one module may serve several
    # ids (e.g. the rss module).
    mod_name = row['name']
    # If the module does not exist, report an error and continue with the rest.
    if not os.path.isfile(mod_name + '.py'):
        utl.log("!!!!!ERR: no module defined for " + mod_name, 1)
        continue
    mod = importlib.import_module(mod_name)
    utl.log("======== start " + mod_name + str(res_id) + " " + utl.get_host(row['url']), 1)
    #hicktodo fields still to be added; reserved columns are used for now