def storage(number, title, timeout, format_text):
    """Build the news record for the ``com.hupu.voice`` source and print it.

    Args:
        number: Stored as ``source_id``.
        title: Stored as ``title``.
        timeout: Stored as ``release_time`` and passed to ``seconds`` to
            derive ``create_time``.
        format_text: Stored as ``format_text``.

    Returns:
        None. The assembled record is only printed.
    """
    # ``seconds`` is defined outside this block; presumably it converts the
    # release-time string into a numeric timestamp -- TODO confirm.
    created_at = seconds(timeout)
    record = {
        "source_id": number,
        "source_url": "com.hupu.voice",
        "newsType": "news",
        "title": title,
        "release_time": timeout,
        "create_time": created_at,
        "format_text": format_text,
        "source": "虎扑",
    }
    print(record)
def storage(number, title, timeout, format_text):
    """Build the news record for the ``com.legaldaily.www`` source and print it.

    Args:
        number: Stored as ``source_id``.
        title: Stored as ``title``.
        timeout: Stored as ``release_time`` and passed to
            ``seconds(..., exist=True)`` to derive ``create_time``.
        format_text: Stored as ``format_text``.

    Returns:
        None. The assembled record is only printed.
    """
    # ``seconds`` is defined outside this block; the meaning of its
    # ``exist`` flag is not visible here -- TODO confirm against the helper.
    created_at = seconds(timeout, exist=True)
    record = {
        "source_id": number,
        "source_url": "com.legaldaily.www",
        "newsType": "news",
        "title": title,
        "release_time": timeout,
        "create_time": created_at,
        "format_text": format_text,
        "source": "法制网",
    }
    print(record)
def storage(number, data, format_text):
    """Build the news record for the ``com.qq.news`` source and print it.

    Args:
        number: Stored as ``source_id``.
        data: Mapping that must provide ``"title"`` and ``"pubtime"``.
        format_text: Stored as ``format_text``.

    Returns:
        None. The assembled record is only printed.

    Raises:
        KeyError: If ``data`` lacks ``"title"`` or ``"pubtime"``.
    """
    # Fields are read in the same order the record lists them.
    headline = data["title"]
    published = data["pubtime"]
    # ``seconds`` is defined outside this block; presumably it converts the
    # publication time into a numeric timestamp -- TODO confirm.
    created_at = seconds(published)
    record = {
        "source_id": number,
        "source_url": "com.qq.news",
        "newsType": "news",
        "title": headline,
        "release_time": published,
        "create_time": created_at,
        "format_text": format_text,
        "source": "腾讯",
    }
    print(record)
def storage(number, title, timeout, format_text):
    """Build the news record for the ``com.chinanews.channel`` source and print it.

    Args:
        number: Stored as ``source_id``.
        title: Stored as ``title``.
        timeout: Stored as ``release_time`` and passed to ``seconds`` to
            derive ``create_time``.
        format_text: Stored as ``format_text``.

    Returns:
        None. The assembled record is only printed.
    """
    # ``seconds`` is defined outside this block; presumably it converts the
    # release-time string into a numeric timestamp -- TODO confirm.
    created_at = seconds(timeout)
    record = {
        "source_id": number,
        "source_url": "com.chinanews.channel",
        "newsType": "news",
        "title": title,
        "release_time": timeout,
        "create_time": created_at,
        "format_text": format_text,
        "source": "中国新闻网",
    }
    print(record)
def storage(message, timeout, format_text):
    """Build the news record for the ``com.dianyingjie.www`` source and print it.

    Args:
        message: Mapping that must provide ``"contentid"`` (convertible with
            ``int``) and ``"title"``.
        timeout: Stored as ``release_time`` and passed to
            ``seconds(..., exist=True)`` to derive ``create_time``.
        format_text: Stored as ``format_text``.

    Returns:
        None. The assembled record is only printed.

    Raises:
        KeyError: If ``message`` lacks ``"contentid"`` or ``"title"``.
        ValueError: If the content id cannot be converted by ``int``.
    """
    # Fields are read in the same order the record lists them.
    article_id = int(message["contentid"])
    headline = message["title"]
    # ``seconds`` is defined outside this block; the meaning of its
    # ``exist`` flag is not visible here -- TODO confirm against the helper.
    created_at = seconds(timeout, exist=True)
    record = {
        "source_id": article_id,
        "source_url": "com.dianyingjie.www",
        "newsType": "news",
        "title": headline,
        "release_time": timeout,
        "create_time": created_at,
        "format_text": format_text,
        "source": "电影网",
    }
    print(record)
def storage(number, message, format_text):
    """Build the news record for the ``com.chinanews.channel`` source and print it.

    Args:
        number: Stored as ``source_id``.
        message: Mapping that must provide ``"title"`` and ``"pubtime"``.
        format_text: Stored as ``format_text``.

    Returns:
        None. The assembled record is only printed.

    Raises:
        KeyError: If ``message`` lacks ``"title"`` or ``"pubtime"``.
    """
    # Fields are read in the same order the record lists them.
    headline = message["title"]
    published = message["pubtime"]
    # ``seconds`` is defined outside this block; the meaning of its
    # ``exist`` flag is not visible here -- TODO confirm against the helper.
    created_at = seconds(published, exist=True)
    record = {
        "source_id": number,
        "source_url": "com.chinanews.channel",
        "newsType": "news",
        "title": headline,
        "release_time": published,
        "create_time": created_at,
        "format_text": format_text,
        "source": "中国新闻网",
    }
    print(record)
def storage(number, title, timeout, format_text):
    """Build the news record for the ``cn.taiwan.www`` source and print it.

    Args:
        number: Stored as ``source_id``.
        title: Stored as ``title``.
        timeout: Stored as ``release_time`` and passed to ``seconds`` to
            derive ``create_time``.
        format_text: Stored as ``format_text``.

    Returns:
        None. The assembled record is only printed.
    """
    # ``seconds`` is defined outside this block; presumably it converts the
    # release-time string into a numeric timestamp -- TODO confirm.
    created_at = seconds(timeout)
    record = {
        "source_id": number,
        "source_url": "cn.taiwan.www",
        "newsType": "news",
        "title": title,
        "release_time": timeout,
        "create_time": created_at,
        "format_text": format_text,
        "source": "中国台湾网",
    }
    print(record)
def storage(message, format_text):
    """Build the news record for the ``cn.haiwainet.opa`` source and print it.

    Args:
        message: Mapping that must provide ``"guid"`` (convertible with
            ``int``), ``"title"`` and ``"pubtime"``.
        format_text: Stored as ``format_text``.

    Returns:
        None. The assembled record is only printed.

    Raises:
        KeyError: If ``message`` lacks one of the required keys.
        ValueError: If the guid cannot be converted by ``int``.
    """
    # Fields are read in the same order the record lists them.
    article_id = int(message["guid"])
    headline = message["title"]
    published = message["pubtime"]
    # ``seconds`` is defined outside this block; presumably it converts the
    # publication time into a numeric timestamp -- TODO confirm.
    created_at = seconds(published)
    record = {
        "source_id": article_id,
        "source_url": "cn.haiwainet.opa",
        "newsType": "news",
        "title": headline,
        "release_time": published,
        "create_time": created_at,
        "format_text": format_text,
        "source": "海外网",
    }
    print(record)
def storage(message, format_text):
    """Build the news record for the ``com.tvoao.www`` source and print it.

    Args:
        message: Mapping that must provide ``"articleid"`` (convertible with
            ``int``), ``"title"`` and ``"showtime"``.
        format_text: Stored as ``format_text``.

    Returns:
        None. The assembled record is only printed.

    Raises:
        KeyError: If ``message`` lacks one of the required keys.
        ValueError: If the article id cannot be converted by ``int``.
    """
    # Fields are read in the same order the record lists them.
    article_id = int(message["articleid"])
    headline = message["title"]
    shown_at = message["showtime"]
    # ``seconds`` is defined outside this block; presumably it converts the
    # show time into a numeric timestamp -- TODO confirm.
    created_at = seconds(shown_at)
    record = {
        "source_id": article_id,
        "source_url": "com.tvoao.www",
        "newsType": "news",
        "title": headline,
        "release_time": shown_at,
        "create_time": created_at,
        "format_text": format_text,
        "source": "中广互联",
    }
    print(record)
def storage(message, format_text):
    """Build the news record for the ``com.cctv.news`` source and print it.

    Args:
        message: Mapping that must provide ``"id"``, ``"title"`` and
            ``"dateTime"``. The id is stored as-is, without ``int``
            conversion.
        format_text: Stored as ``format_text``.

    Returns:
        None. The assembled record is only printed.

    Raises:
        KeyError: If ``message`` lacks one of the required keys.
    """
    # Fields are read in the same order the record lists them.
    article_id = message["id"]
    headline = message["title"]
    published = message["dateTime"]
    # ``seconds`` is defined outside this block; the meaning of its
    # ``exist`` flag is not visible here -- TODO confirm against the helper.
    created_at = seconds(published, exist=True)
    record = {
        "source_id": article_id,
        "source_url": "com.cctv.news",
        "newsType": "news",
        "title": headline,
        "release_time": published,
        "create_time": created_at,
        "format_text": format_text,
        "source": "央视网",
    }
    print(record)
def storage(number, title, author, timeout, format_text):
    """Build the news record for the ``com.techweb.www`` source and print it.

    Args:
        number: Stored as ``source_id``.
        title: Stored as ``title``.
        author: Stored as ``author``.
        timeout: Stored as ``release_time`` and passed to ``seconds`` to
            derive ``create_time``.
        format_text: Stored as ``format_text``.

    Returns:
        None. The assembled record is only printed.
    """
    # ``seconds`` is defined outside this block; presumably it converts the
    # release-time string into a numeric timestamp -- TODO confirm.
    created_at = seconds(timeout)
    record = {
        "source_id": number,
        "source_url": "com.techweb.www",
        "newsType": "news",
        "title": title,
        "author": author,
        "release_time": timeout,
        "create_time": created_at,
        "format_text": format_text,
        "source": "TechWeb",
    }
    print(record)
def storage(number, title, author, timeout, format_text):
    """Build the news record for the ``com.xinhuanet.www`` source and print it.

    Args:
        number: Stored as ``source_id``.
        title: Stored as ``title``.
        author: Stored as ``author``.
        timeout: Stored as ``release_time`` and passed to
            ``seconds(..., exist=True)`` to derive ``create_time``.
        format_text: Stored as ``format_text``.

    Returns:
        None. The assembled record is only printed.
    """
    # ``seconds`` is defined outside this block; the meaning of its
    # ``exist`` flag is not visible here -- TODO confirm against the helper.
    created_at = seconds(timeout, exist=True)
    record = {
        "source_id": number,
        "source_url": "com.xinhuanet.www",
        "newsType": "news",
        "title": title,
        "author": author,
        "release_time": timeout,
        "create_time": created_at,
        "format_text": format_text,
        "source": "新华网",
    }
    print(record)
def storage(number, title, author, timeout, format_text):
    """Build the news record for the ``com.enorth.news`` source and print it.

    Args:
        number: Stored as ``source_id``.
        title: Stored as ``title``.
        author: Stored as ``author``.
        timeout: Stored as ``release_time`` and passed to ``seconds`` to
            derive ``create_time``.
        format_text: Stored as ``format_text``.

    Returns:
        None. The assembled record is only printed.
    """
    # ``seconds`` is defined outside this block; presumably it converts the
    # release-time string into a numeric timestamp -- TODO confirm.
    created_at = seconds(timeout)
    record = {
        "source_id": number,
        "source_url": "com.enorth.news",
        "newsType": "news",
        "title": title,
        # Key spelled "author" so the record has the same schema as the
        # other author-carrying records (it was previously misspelled).
        "author": author,
        "release_time": timeout,
        "create_time": created_at,
        "format_text": format_text,
        "source": "北方网",
    }
    print(record)
def storage(message, format_text):
    """Build the news record for the ``com.xinhuanet.www`` source and print it.

    Args:
        message: Mapping that must provide ``"DocID"`` (convertible with
            ``int``), ``"Title"``, ``"Author"`` and ``"PubTime"``.
        format_text: Stored as ``format_text``.

    Returns:
        None. The assembled record is only printed.

    Raises:
        KeyError: If ``message`` lacks one of the required keys.
        ValueError: If the document id cannot be converted by ``int``.
    """
    # Fields are read in the same order the record lists them.
    article_id = int(message["DocID"])
    headline = message["Title"]
    byline = message["Author"]
    published = message["PubTime"]
    # ``seconds`` is defined outside this block; presumably it converts the
    # publication time into a numeric timestamp -- TODO confirm.
    created_at = seconds(published)
    record = {
        "source_id": article_id,
        "source_url": "com.xinhuanet.www",
        "newsType": "news",
        "title": headline,
        "author": byline,
        "release_time": published,
        "create_time": created_at,
        "format_text": format_text,
        "source": "新华网",
    }
    print(record)
def storage(number, title, timeout, format_text):
    """Build the news record for the ``com.workercn.www`` source and print it.

    Args:
        number: Stored as ``source_id``.
        title: Stored as ``title``.
        timeout: Stored as ``release_time`` and passed to ``seconds`` to
            derive ``create_time``.
        format_text: Stored as ``format_text``.

    Returns:
        None. The assembled record is only printed.
    """
    # ``seconds`` is defined outside this block; presumably it converts the
    # release-time string into a numeric timestamp -- TODO confirm.
    created_at = seconds(timeout)
    record = {
        "source_id": number,
        "source_url": "com.workercn.www",
        "newsType": "news",
        "title": title,
        "release_time": timeout,
        "create_time": created_at,
        "format_text": format_text,
        "source": "中工网",
    }
    print(record)
    # Disabled code carried over unchanged in intent:
    # storageDatabase(record)
    # timestmap = time.strptime(timeout + ":00", "%Y/%m/%d %H:%M:%S")
    # timeout = time.strftime("%Y-%m-%d %H:%M:%S", timestmap)
def download(html, number):
    """Extract one article from ``html`` and forward its fields to ``storage``.

    The page is processed only when it contains ``<video``, has a
    ``newscontent`` container, and that container carries the source marker
    matched below. Title, publication time, editor and body are then pulled
    out with regular expressions, and ``<div ...>`` / ``</div>`` tags inside
    the body are rewritten to ``<p>`` / ``</p>``.

    Args:
        html: Page markup as a string.
        number: Forwarded unchanged as the first argument to ``storage``.

    Returns:
        None. When any required piece (container, source marker, title,
        time, editor, body) is missing, the function returns early without
        calling ``storage`` instead of raising ``IndexError``.
    """
    # NOTE(review): pages WITHOUT a "<video" tag are skipped, exactly as
    # before -- confirm this condition is intended and not inverted.
    if "<video" not in html:
        return

    container = re.search(r'<div class="newscontent"[\s\S]*?<div id="one"', html)
    if container is None:
        return
    data = container.group(0)

    # Kept as the full findall() list because it is handed to storage().
    source = re.findall(r"来源:澎湃新闻", data)
    if not source:
        return

    title_match = re.search(r'(<h1 class="news_title">)([\s\S]*?)(</h1>)', data)
    time_match = re.search(r"\d+-\d+-\d+ \d+:\d+", data)
    author_match = re.search(r"(>责任编辑:)([\s\S]*?)(<)", data)
    if title_match is None or time_match is None or author_match is None:
        return

    # Prefer the body that ends at an <audio tag; otherwise fall back to the
    # body that ends before the "go_to_topic" block.
    body_match = re.search(
        r'(<div class="news_txt"[\s\S]*?>)([\s\S]*?)(<audio)', data)
    if body_match is None:
        body_match = re.search(
            r'(<div class="news_txt"[\s\S]*?>)([\s\S]*?)'
            r'(</div>[\s\S]*?<div class="go_to_topic">)',
            data,
        )
    if body_match is None:
        return

    title = title_match.group(2)
    timeout = time_match.group(0)
    author = author_match.group(2)

    # Turn nested <div> wrappers in the body into paragraph tags.
    format_text = re.sub(r"<div[\s\S]*?>", "<p>", body_match.group(2))
    format_text = format_text.replace("</div>", "</p>")

    # ``seconds`` and ``storage`` are defined outside this block.
    create_time = seconds(timeout, exist=True)
    storage(number, title, author, timeout, create_time, format_text, source)