def process_email(server, msg_id):
    """Fetch the RFC822 data for ``msg_id`` from ``server`` and store its articles.

    For every (message id, data) pair in the fetch response:

    * ``parse_message`` turns the pair into ``(email_info, plain)``; the
      message is skipped when ``email_info`` is falsy.
    * ``parse_plain`` splits the plain text into a topic title and a list of
      article entries.
    * The plain text is written, UTF-8 encoded, to a backup file under
      ``common.DB_DIR + '/email_backup/'``.
    * The topic title is passed to ``db.ensure_topic_exists``; each entry is
      passed to ``db.ensure_article_exists`` and then linked to the topic
      with its brief via ``db.insert_or_update_t_a_rel``.
    """
    fetched = server.fetch(msg_id, ['RFC822'])

    for msgid, payload in fetched.iteritems():
        email_info, plain = parse_message(msgid, payload)
        if not email_info:
            continue

        topic_title, entries = parse_plain(plain)

        # Keep a raw copy of the mail text before touching the database.
        backup_path = common.DB_DIR + '/email_backup/%s_%s.txt' % (msgid, topic_title)
        with open(backup_path, 'w') as backup:
            backup.write(plain.encode('utf8'))

        log.info('parsing topic: %s' % topic_title)
        # From here on the topic is referred to by whatever
        # db.ensure_topic_exists hands back.
        topic_ref = db.ensure_topic_exists(topic_title)

        for entry in entries:
            article = {
                'url': entry['url'],
                'title': entry['title'],
                'source': entry['source'],
                'url_date': email_info[RECEIVE_TIME],
            }
            article_id = db.ensure_article_exists(article)
            db.insert_or_update_t_a_rel(topic_ref, article_id, entry['brief'])
def main():
    """Poll every known feed, store its new articles and save the feed state.

    The state returned by ``_load_state`` is first extended with any entry of
    ``default_feeds`` whose ``feed_url`` is not a key yet.  Then, for each
    (url, meta) pair, the module named by ``meta['handler']`` is imported,
    its ``Handler`` is asked for the articles after ``meta['last']``, every
    article is tagged with ``meta['source']`` and passed to
    ``db.ensure_article_exists(..., overwrite=True)``, and ``meta['last']`` is
    advanced to the value the handler returned.
    """
    feed_state = _load_state()

    # Seed the state with default feeds that are not tracked yet.
    for default in default_feeds:
        if default['feed_url'] in feed_state:
            continue
        feed_state[default['feed_url']] = default

    for url, meta in feed_state.items():
        print('processing %s' % url)

        handler_module = import_module(meta['handler'])
        handler = handler_module.Handler()
        new_articles, new_last = handler.get_articles(url, meta['last'])

        uncached = 0
        for article in new_articles:
            article['source'] = meta['source']
            db.ensure_article_exists(article, overwrite=True)
            # Checked only after the db call, exactly as before.
            if not article['cached']:
                uncached += 1

        print(' get %s articles, bad_count: %s' % (len(new_articles), uncached))
        print(new_last)
        meta['last'] = new_last

    # NOTE(review): the original layout was flattened onto one line; the state
    # is assumed to be saved once, after all feeds were processed -- confirm.
    _save_state(feed_state)
import time def fix_0A(a): if '0A' in a['url']: print a['url'] a['url'] = a['url'].replace('0A', '0') return True return False def fetch_text(a): if a['cached']: return False time.sleep(0.2) txt = lp.fetch_text(urllib.unquote(a['url'])) if txt: print a['url_date'], a['title'], a['url'] a['cached'] = txt return True return False source = u'自由時報' articles = db.list_articles_by_source(source, addition_cols=['cached']) fixed = 0 for a in articles: if any((fix_0A(a), fetch_text(a))): #print 'fixed', a db.ensure_article_exists(a, overwrite=True) fixed += 1 print 'total: %s, fixed: %s' % (len(articles), fixed)