                links_descs.append(html.unescape(a.get_text().strip()))
    res_dict['news_related_url'] = links
    res_dict['news_related_url_desc'] = links_descs
    content = '\n'.join(temp_content).strip()
    if content:
        res_dict['news'] = html.unescape(content)
    if not res_dict or 'news' not in res_dict:
        content_parser.logger.error(
            'Ettoday url: {} did not process properly'.format(url))
        content_parser.errors['process_empty_content_(rss_id)'].append(
            [rss_id, url])
        return
    return res_dict


content_parser = ContentParser('ETtoday')
# Query the data with source name
unprocessed_data = content_parser.content_query()
content_parser.content_processor(unprocessed_data, ettoday_content_processor)
if content_parser.errors:
    content_parser.sent_error_email()
content_parser.encoding_cursor.close()
content_parser.mydb.close()
content_parser.logger.info(
    "Processed Ettoday {} examples in {} seconds".format(
        len(unprocessed_data), time.time() - start))
        content = article_body_tag.text.strip()
        if content:
            res_dict['news'] = html.unescape(content)
    elif article_body_tag_2:
        content = article_body_tag_2.text.strip()
        if content:
            res_dict['news'] = html.unescape(content)
    if not res_dict or 'news' not in res_dict:
        content_parser.logger.error(
            'PTS url: {} did not process properly'.format(url))
        content_parser.errors['process_empty_content_(rss_id)'].append(
            [rss_id, url])
        return
    return res_dict


content_parser = ContentParser('公視新聞網')
# Query the data with source name
unprocessed_data = content_parser.content_query()
content_parser.content_processor(unprocessed_data, pts_content_processor)
if content_parser.errors:
    content_parser.sent_error_email()
content_parser.encoding_cursor.close()
content_parser.mydb.close()
content_parser.logger.info("Processed PTS {} examples in {} seconds".format(
    len(unprocessed_data), time.time() - start))
        prefix = ''
        content = prefix + '\n'.join(temp_content)  # .replace('。 ', '。\n')
        res_dict['news'] = html.unescape(content)
        return res_dict
    else:
        # The original appended to `content_parser.errors` here, but no
        # `content_parser` exists in this script; use `content_parser_1`.
        content_parser_1.logger.error(
            'Yahoo url: {} did not process properly'.format(url))
        content_parser_1.errors['process_empty_content_(rss_id)'].append(
            [rss_id, url])
        return


start = time.time()
content_parser_1 = ContentParser('Yahoo Source 1')
unprocessed_data_1 = content_parser_1.content_query()
content_parser_1.content_processor(unprocessed_data_1, yahoo_content_processor)
if content_parser_1.errors:
    content_parser_1.sent_error_email()
content_parser_1.encoding_cursor.close()
content_parser_1.mydb.close()
content_parser_1.logger.info(
    "Processed Yahoo Source 1 {} examples in {} seconds".format(
        len(unprocessed_data_1), time.time() - start))

start = time.time()
content_parser_2 = ContentParser('Yahoo奇摩新聞')
unprocessed_data_2 = content_parser_2.content_query()
content_parser_2.content_processor(unprocessed_data_2, yahoo_content_processor)
if content_parser_2.errors:
    content_parser_2.sent_error_email()
            if a.get_text().strip() and 'www' in a['href']:
                links.append(a['href'])
                links_descs.append(a.get_text().strip())
    res_dict['news_related_url'] = links
    res_dict['news_related_url_desc'] = links_descs
    content = '\n'.join(temp_content).strip()
    if content:
        res_dict['news'] = html.unescape(content)
    if not res_dict or 'news' not in res_dict:
        content_parser.logger.error(
            'Epoch url: {} did not process properly'.format(url))
        content_parser.errors['process_error_(rss_id)'].append([rss_id, url])
        return
    return res_dict


content_parser = ContentParser('大紀元')
# Query the data with source name
unprocessed_data = content_parser.content_query()
content_parser.content_processor(unprocessed_data, epoch_content_processor)
if content_parser.errors:
    content_parser.sent_error_email()
content_parser.encoding_cursor.close()
content_parser.mydb.close()
content_parser.logger.info("Processed Epoch {} examples in {} seconds".format(
    len(unprocessed_data), time.time() - start))
    for a in a_tags:
        if len(a):
            if a['href'] == '#':
                continue
            if a.get_text().strip() and 'www' in a['href']:
                links.append(a['href'])
                links_descs.append(html.unescape(a.get_text().strip()))
    res_dict['news_related_url'] = links
    res_dict['news_related_url_desc'] = links_descs
    content = '\n'.join(temp_content).strip()
    if content:
        res_dict['news'] = html.unescape(content)
    if not res_dict or 'news' not in res_dict:
        content_parser.logger.error(
            'NewsTalk url: {} did not process properly'.format(url))
        content_parser.errors['process_empty_content_(rss_id)'].append(
            [rss_id, url])
        return
    return res_dict


content_parser = ContentParser('新頭殼要聞')
# Query the data with source name
unprocessed_data = content_parser.content_query()
content_parser.content_processor(unprocessed_data, newstalk_content_processor)
if content_parser.errors:
    content_parser.sent_error_email()
content_parser.encoding_cursor.close()
content_parser.mydb.close()
content_parser.logger.info(
    "Processed NewsTalk {} examples in {} seconds".format(
        len(unprocessed_data), time.time() - start))
        date_res = d2.strftime(db_date_format)
        res_dict['published_date'] = date_res
    except Exception as e2:
        print(e2)
        content_parser.logger.info(
            'PChome date error {}, URL: {}'.format(e2, url))
    # Original had attrs={'calss': ...}, which never matches any element.
    article_body_tag = soup.find('div', attrs={'class': 'article_text'})
    if article_body_tag:
        content = article_body_tag.text.strip()
        a_tags = article_body_tag.find_all('a')
        if content:
            # Collapse runs of newlines, then drop everything from the
            # trailing "相關新聞" (related news) block onward.
            content = re.sub('(\n)+', '\n', html.unescape(content))
            content = re.sub(r'(相關新聞[\s\S]+)', '', content)
            res_dict['news'] = html.unescape(content)
    if not res_dict or 'news' not in res_dict:
        content_parser.logger.error(
            'PChome url: {} did not process properly'.format(url))
        content_parser.errors['process_empty_content_(rss_id)'].append(
            [rss_id, url])
        return
    return res_dict


content_parser = ContentParser('PCHOME')
# Query the data with source name
unprocessed_data = content_parser.content_query()
content_parser.content_processor(unprocessed_data, pchome_content_processor)
if content_parser.errors:
    content_parser.sent_error_email()
content_parser.encoding_cursor.close()
content_parser.mydb.close()
content_parser.logger.info(
    "Processed PChome {} examples in {} seconds".format(
        len(unprocessed_data), time.time() - start))
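
# ---------------------------------------------------------------------------
# Minimal sketch of the ContentParser interface the scripts above rely on.
# The real class is defined elsewhere in the repo; every name below
# (content_query, content_processor, sent_error_email, errors, logger,
# encoding_cursor, mydb) is inferred from usage only, and the bodies are
# illustrative stubs, not the production implementation.
# ---------------------------------------------------------------------------
import logging
from collections import defaultdict


class ContentParserSketch:
    def __init__(self, source_name):
        self.source_name = source_name
        self.logger = logging.getLogger(source_name)
        # Error category -> list of [rss_id, url] pairs, as appended above.
        self.errors = defaultdict(list)
        # In the real class these hold a MySQL connection and cursor;
        # stubbed out here so the sketch stays self-contained.
        self.mydb = None
        self.encoding_cursor = None

    def content_query(self):
        # Fetch rows not yet processed for self.source_name; stubbed empty.
        return []

    def content_processor(self, rows, processor_fn):
        # Apply processor_fn to each queried row. The exact call signature
        # is an assumption; from usage, each processor sees a url and an
        # rss_id and returns a dict with keys such as 'news',
        # 'published_date', 'news_related_url', 'news_related_url_desc',
        # or None when extraction fails.
        for row in rows:
            processor_fn(row)

    def sent_error_email(self):
        # Notify maintainers about accumulated errors; stubbed as a log line.
        self.logger.warning('errors: %s', dict(self.errors))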