Example #1
def process_email(server, msg_id):
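    # Fetch one message from the mail server, back up its plain-text body on
    # disk, and store the parsed topic and its articles in the database.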
    # Request only the full raw RFC822 message; FLAGS and BODY parts are not needed here.
    datalist = ['RFC822']
    response = server.fetch(msg_id, datalist)

    for msgid, data in response.iteritems():
        email_info, plain = parse_message(msgid, data)
        if not email_info:
            continue
        topic_title, result = parse_plain(plain)
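        # Back up the raw body to a per-message text file before importing it.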
        with open(common.DB_DIR + '/email_backup/%s_%s.txt' % (msgid, topic_title), 'w') as f:
            f.write(plain.encode('utf8'))
        log.info('parsing topic: %s' % topic_title)
        topic_title = db.ensure_topic_exists(topic_title)
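        # Upsert each article and link it to the topic with its brief text.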
        for data in result:
            article = dict(url=data['url'],
                           title=data['title'],
                           source=data['source'],
                           url_date=email_info[RECEIVE_TIME])
            article_id = db.ensure_article_exists(article)
            brief = data['brief']
            db.insert_or_update_t_a_rel(topic_title, article_id, brief)
Example #2
from importlib import import_module  # assumed source of the import_module used below

def main():
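    # Merge the default feeds into the saved crawl state, then fetch new
    # articles for every feed through its configured handler module.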
    state = _load_state()
    for feed in default_feeds:
        if feed['feed_url'] not in state:
            state[feed['feed_url']] = feed

    for url, meta in state.items():
        print 'processing %s' % url
        handler = import_module(meta['handler']).Handler()
        new_articles, new_last = handler.get_articles(url, meta['last'])
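        # Count articles whose page text could not be fetched and cached.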
        bad_count = 0
        for a in new_articles:
            a['source'] = meta['source']
            aid = db.ensure_article_exists(a, overwrite=True)
            if not a['cached']:
                bad_count += 1
            #print a
        print '  get %s articles, bad_count: %s' % (len(new_articles), bad_count)
        print new_last
        meta['last'] = new_last

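    # Persist the updated per-feed 'last' markers so the next run resumes from here.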
    _save_state(state)
Example #3
import time
import urllib

def fix_0A(a):
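    # Normalize URLs containing a spurious '0A' by replacing it with '0'.
    # Returns True when the URL was changed.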
    if '0A' in a['url']:
        print a['url']
        a['url'] = a['url'].replace('0A', '0')
        return True
    return False

def fetch_text(a):
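    # Fetch and cache the article's page text if it has not been cached yet.
    # Returns True when new text was stored on the article.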
    if a['cached']:
        return False
    time.sleep(0.2)
    txt = lp.fetch_text(urllib.unquote(a['url']))
    if txt:
        print a['url_date'], a['title'], a['url']
        a['cached'] = txt
        return True
    return False


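# Re-check every stored article from this source: fix bad URLs and backfill
# missing cached text, then persist any articles that changed.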
source = u'自由時報'
articles = db.list_articles_by_source(source, addition_cols=['cached'])
fixed = 0
for a in articles:
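    # The tuple makes both fixes run; save the article if either changed it.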
    if any((fix_0A(a), fetch_text(a))):
        #print 'fixed', a
        db.ensure_article_exists(a, overwrite=True)
        fixed += 1
print 'total: %s, fixed: %s' % (len(articles), fixed)