import logging
from os import path

logger = logging.getLogger(__name__)


def main(debug, url, selector, output_type, output_file):
    logging.basicConfig(level=logging.DEBUG if debug else logging.WARNING)
    logger.debug('Debug mode is on')
    logger.debug(f'Using {url} as URL')
    logger.debug(f'Using {selector} as selector')
    page = get_page(url)
    soup = parse_page(page)
    elements = soup.select(selector)
    logger.debug(f'Using {output_type} as serializer')
    serializer = serializers.get(output_type)
    if serializer is None:
        # Fail loudly on an unknown serializer instead of calling a string.
        raise ValueError(f'No such serializer: {output_type}')
    logger.debug(f'Serializing {len(elements)} elements')
    output = serializer(elements)
    if output_file == '-':
        logger.debug('Printing output')
        print(output)
        return
    with open(path.abspath(output_file), 'w') as _file:
        logger.debug(f'Writing output to {output_file}')
        _file.write(output)
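# A minimal sketch of the `serializers` registry that main() looks up by name.
# The JSON and plain-text entries below are assumptions for illustration, not
# part of the original program.
import json

serializers = {
    'json': lambda elements: json.dumps([el.get_text() for el in elements]),
    'text': lambda elements: '\n'.join(el.get_text() for el in elements),
}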
error_pages = []  # URLs that failed to fetch or parse


async def fetch_and_parse(url, data):
    try:
        page = await fetch(url)
        if page:
            data.append(parse_page(page))
    except Exception as err:
        error_pages.append(url)
        print('Error: {}\nURL: {}\n'.format(err, url))
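# A minimal concurrent driver for fetch_and_parse, assuming `fetch` is an
# async HTTP helper (e.g. aiohttp-based); `crawl` and `urls` are hypothetical.
import asyncio


async def crawl(urls):
    data = []
    await asyncio.gather(*(fetch_and_parse(url, data) for url in urls))
    return data

# results = asyncio.run(crawl(['https://example.com/']))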
def handle_news(msg):
    bot.send_message(msg.chat.id, msg.chat.id)  # echo the chat id back
    news_items = parse_page()
    for text in news_items:
        try:
            bot.send_message(msg.chat.id, text, parse_mode='HTML')
        except Exception:
            # Translation: "No fresh news has appeared"
            bot.send_message(msg.chat.id, 'Свежих новостей не появилось')
            break
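# handle_news is presumably registered as a pyTelegramBotAPI handler; a
# minimal wiring sketch, where the '/news' command name is an assumption:
@bot.message_handler(commands=['news'])
def news_command(msg):
    handle_news(msg)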
def list_builder(article, total_words):
    # Build a fixed-length word-count vector for `article`, using the
    # module-level joint_dict / ordering_dict vocabulary mappings.
    word_dict = parser.parse_page(article, {})
    word_array = [0] * total_words
    for key in word_dict:
        if key in joint_dict:
            word_array[ordering_dict[key]] = word_dict[key]
    # Optional normalization to relative frequencies:
    # num_words = sum(word_array)
    # word_array = [item / float(num_words) for item in word_array]
    return word_array
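# A hypothetical construction of the joint_dict / ordering_dict globals that
# list_builder assumes: ordering_dict fixes each word's slot in the vector.
joint_dict = {'data': 12, 'python': 7, 'spark': 3}  # example word counts
ordering_dict = {word: i for i, word in enumerate(sorted(joint_dict))}

# vector = list_builder('some_article.txt', len(ordering_dict))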
import unittest


class ParsePageTestCase(unittest.TestCase):

    def test_parse_page_result_instance(self):
        self.assertIsInstance(
            parse_page('https://news.ycombinator.com/news?p=', 1, 5), tuple)

    def test_parse_page_result_is_equal(self):
        self.assertEqual(
            parse_page('https://news.ycombinator.com/news?p=', 1, 5),
            parse_page('https://news.ycombinator.com/news?p=', 1, 5))

    def test_parse_page_result_pages_not_equal(self):
        self.assertNotEqual(
            parse_page('https://news.ycombinator.com/news?p=', 1, 5),
            parse_page('https://news.ycombinator.com/news?p=', 2, 5))

    def test_get_users_response_not_empty(self):
        self.assertTrue(
            len(parse_page('https://news.ycombinator.com/news?p=', 1, 5)) > 0)

    def test_get_users_response_not_none(self):
        self.assertIsNotNone(
            parse_page('https://news.ycombinator.com/news?p=', 1, 5))
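# Standard entry point so the suite above can be run directly.
if __name__ == '__main__':
    unittest.main()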
def scheduler(url=None):
    # With an explicit url, refresh just that page; otherwise walk the parsed list.
    pairs = ([('http://' + request.full_path[10:], datetime.now())]
             if url else parse_list())
    for url, updated in pairs:
        topic = db.session.query(Topic).filter(Topic.url == url).first()
        if topic and topic.updated == updated:
            continue  # already up to date
        title, published, body = parse_page(url)
        if topic:
            topic.title = title
            topic.body = body
            topic.updated = updated
        else:
            topic = Topic(url, title, published, updated, body)
            db.session.add(topic)
        db.session.commit()
    return 'ok'
import datetime
import json

from pyspark import SparkContext

import load
import parser
import util

### test code ###
if __name__ == '__main__':
    sc = SparkContext(appName="Preprocess")
    print(datetime.datetime.today())
    print('start load')
    page_rdd = sc.textFile(load.DATA_PATH).map(util.encode)
    print('start make map')
    print('start parse')
    parsed_page_rdd = page_rdd.map(parser.parse_page)
    parsed_page_rdd.cache()
    parsed_page_rdd.map(json.dumps) \
        .saveAsTextFile(load.PARSED_PAGE_PATH)
    barrels_rdd = parsed_page_rdd.keys()
    barrels_rdd.cache()
    print('start store document list')
    # Python 3 lambdas cannot unpack tuples, so index into the
    # ((url, (title, content)), doc_id) pairs explicitly.
    barrels_rdd.map(lambda x: (x['url'], (x['title'], x['content']))) \
        .distinct() \
        .zipWithIndex() \
        .map(lambda p: {"doc_id": p[1], "title": p[0][1][0],
                        "url": p[0][0], "content": p[0][1][1]}) \
        .map(json.dumps) \
        .saveAsTextFile(load.DOCUMENTS_PATH)
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import telebot

from parser import parse_page
from config import token, chat_ids

bot = telebot.TeleBot(token)

news_items = parse_page()
for text in news_items:
    try:
        for chat_id in chat_ids:
            bot.send_message(chat_id, text, parse_mode='HTML')
    except Exception:
        # Translation: "No fresh news has appeared"
        # bot.send_message(chat_id, 'Свежих новостей не появилось')
        break
def fetch_and_parse(target):
    page = fetch(target)
    return parse_page(page)
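# A minimal sketch of the helpers fetch_and_parse relies on, assuming a
# requests/BeautifulSoup stack; the original definitions are not shown here.
import requests
from bs4 import BeautifulSoup


def fetch(target):
    # Return the raw HTML, or None on a non-200 response.
    response = requests.get(target, timeout=10)
    return response.text if response.ok else None


def parse_page(page):
    return BeautifulSoup(page, 'html.parser')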
from converter import convert_file
from parser import parse_page

names = []
for year in range(2008, 2020):
    for month in [3, 6, 9, 11]:
        for grade in [1, 2, 3]:
            for subject in [1]:  # math only
                names.append('{}-{:02d}-{}-{}'.format(year, month, grade, subject))
print(names)

for name in names:
    try:
        pages = convert_file(name)
        for page in pages:
            parse_page(page)
    except RuntimeError:
        print('[!] {}.pdf not found'.format(name))
import parser

from bs4 import BeautifulSoup


def test_parse_page():
    parsed = parser.parse_page('<html><head><title>wow</title></head></html>')
    assert isinstance(parsed, BeautifulSoup)
    assert parsed.title.string == 'wow'