Пример #1
0
def main(debug, url, selector, output_type, output_file):
    """Fetch *url*, select elements matching a CSS *selector*, serialize
    them with the serializer registered under *output_type*, and write the
    result to *output_file* ('-' means print to stdout).
    """
    logging.basicConfig(level=logging.DEBUG if debug else logging.WARNING)

    logger.debug('Debug mode is on')
    logger.debug(f'Using {url} as URL')
    logger.debug(f'Using {selector} as selector')

    page = get_page(url)
    soup = parse_page(page)
    elements = soup.select(selector)

    logger.debug(f'Using {output_type} as serializer')

    # BUG FIX: the old code used the *string* 'No such serializer' as the
    # .get() default, which was then called like a function and crashed
    # with a cryptic "'str' object is not callable". Fail fast instead.
    serializer = serializers.get(output_type)
    if serializer is None:
        raise ValueError(f'No such serializer: {output_type}')

    logger.debug(f'Serializing {len(elements)} elements')
    output = serializer(elements)

    if output_file == '-':
        logger.debug('Printing output')
        print(output)
        return

    with open(path.abspath(output_file), 'w') as _file:
        logger.debug(f'Writing output to {output_file}')
        _file.write(output)
Пример #2
0
async def fetch_and_parse(url, data):
    """Download *url* and append the parsed page to *data*.

    On any failure the URL is recorded in the module-level ``error_pages``
    list and a diagnostic is printed; the coroutine never raises.
    """
    try:
        page = await fetch(url)
        if not page:
            return
        data.append(parse_page(page))
    except Exception as err:
        error_pages.append(url)
        print('Error: {}\nURL: {}\n'.format(err, url))
Пример #3
0
def handle_news(msg):
    """Echo the chat id back, then push freshly parsed news items to it."""
    bot.send_message(msg.chat.id, msg.chat.id)
    # Renamed from ``list`` — the old name shadowed the builtin.
    texts = parse_page()
    for text in texts:
        try:
            bot.send_message(msg.chat.id, text, parse_mode='HTML')
        except Exception:
            # Best-effort: report once and stop instead of retrying.
            bot.send_message(msg.chat.id, 'Свежих новостей не появилось')
            break
def list_builder(article, total_words):
    """Build a fixed-length word-count vector for *article*.

    Word counts come from ``parser.parse_page``; a word contributes only
    when it appears in the module-level ``joint_dict``, and its slot index
    is taken from the module-level ``ordering_dict``.
    """
    word_dict = parser.parse_page(article, {})
    # [0] * n replaces the old [0 for i in range(n)]; dead commented-out
    # normalization code removed.
    word_array = [0] * total_words
    for key in word_dict:
        if key in joint_dict:
            # assumes every joint_dict key is also in ordering_dict — TODO confirm
            word_array[ordering_dict[key]] = word_dict[key]
    return word_array
Пример #5
0
    def test_parse_page_result_instanse(self):
        # parse_page must return a tuple for a valid page range.
        result = parse_page('https://news.ycombinator.com/news?p=', 1, 5)
        self.assertIsInstance(result, tuple)

	def test_parse_page_result_is_equal(self):
			self.assertEqual(parse_page('https://news.ycombinator.com/news?p=', 1, 5) , parse_page('https://news.ycombinator.com/news?p=', 1, 5) )

	def test_parse_page_result_pages_not_equal(self):
		self.assertNotEqual(parse_page('https://news.ycombinator.com/news?p=', 1, 5) , parse_page('https://news.ycombinator.com/news?p=', 2, 5) )

	def test_get_users_response_not_empty(self):
			self.assertTrue(len(parse_page('https://news.ycombinator.com/news?p=', 1, 5)) > 0 ) 

	def test_get_users_response_not_none(self):
			self.assertIsNotNone(parse_page('https://news.ycombinator.com/news?p=', 1, 5) ) 		
Пример #6
0
def scheduler(url=None):
    # Refresh stored Topic rows: either a single URL (reconstructed from
    # the current request path) or every (url, updated) pair reported by
    # parse_list().
    #
    # NOTE(review): request.full_path[10:] assumes a fixed 10-character
    # route prefix — confirm against the route registration.
    for url, updated in [('http://' + request.full_path[10:], datetime.now())] if url else parse_list():
        topic = db.session.query(Topic).filter(Topic.url == url).first()
        # Skip topics whose stored timestamp already matches (nothing new).
        if topic and topic.updated == updated:
            continue
        title, published, body = parse_page(url)
        if topic:
            # Existing row: update the mutable fields in place.
            topic.title = title
            topic.body = body
            topic.updated = updated
        else:
            # New row.
            topic = Topic(url, title, published, updated, body)
        db.session.add(topic)
    db.session.commit()
    return 'ok'
Пример #7
0
def scheduler(url=None):
    """Create or refresh Topic rows from one URL or the full list; commit
    the session and return 'ok'."""
    if url:
        sources = [('http://' + request.full_path[10:], datetime.now())]
    else:
        sources = parse_list()

    for page_url, updated in sources:
        topic = db.session.query(Topic).filter(Topic.url == page_url).first()
        if topic and topic.updated == updated:
            continue  # stored timestamp already matches — nothing to do
        title, published, body = parse_page(page_url)
        if not topic:
            topic = Topic(page_url, title, published, updated, body)
        else:
            topic.title = title
            topic.body = body
            topic.updated = updated
        db.session.add(topic)

    db.session.commit()
    return 'ok'
Пример #8
0
import parser
import util

### test code ###
if __name__ == '__main__':
    # Python 2 / PySpark batch job: load raw pages, parse them, and store
    # both the parsed pages and a distinct, index-numbered document list.
    sc = SparkContext(appName="Preprocess")

    print datetime.datetime.today()

    print 'start load'
    # util.encode presumably normalizes each raw line's encoding — TODO confirm
    page_rdd = sc.textFile(load.DATA_PATH).map(util.encode)

    print 'start make map'

    print 'start parse'
    parsed_page_rdd = page_rdd.map(lambda page: parser.parse_page(page))
    # Cached because this RDD is consumed twice: saved to text below and
    # re-used for the keys() pass.
    parsed_page_rdd.cache()
    parsed_page_rdd.map(lambda x: json.dumps(x)) \
                   .saveAsTextFile(load.PARSED_PAGE_PATH)

    barrels_rdd = parsed_page_rdd.keys()
    barrels_rdd.cache()

    print 'start store document list'
    # NOTE(review): tuple-unpacking lambda parameters are Python 2 only —
    # this script will not run under Python 3 as written.
    documents = barrels_rdd.map(lambda x: (x['url'], (x['title'], x['content']))) \
                           .distinct() \
                           .zipWithIndex() \
                           .map(lambda ((u, (t, c)), d): {"doc_id": d, "title": t, "url": u, "content": c}) \
                           .map(lambda x: json.dumps(x)) \
                           .saveAsTextFile(load.DOCUMENTS_PATH)
Пример #9
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import telebot
from parser import parse_page
from config import token, chat_ids

bot = telebot.TeleBot(token)

# Push every freshly parsed news item to all configured chats.
# Renamed locals: ``list`` and ``id`` shadowed builtins; a commented-out
# send_message line referencing an undefined ``chat_id`` was removed.
texts = parse_page()
for text in texts:
    try:
        for chat_id in chat_ids:
            bot.send_message(chat_id, text, parse_mode='HTML')
    except Exception:
        # Best-effort: stop broadcasting on the first delivery failure.
        break

Пример #10
0
def fetch_and_parse(target):
    """Download *target* and return its parsed page."""
    return parse_page(fetch(target))
Пример #11
0
from converter import convert_file
from parser import parse_page

# Build exam identifiers 'YYYY-MM-grade-subject' for every exam session
# from 2008 through 2019.
names = []
for year in range(2008, 2020):
    for month in [3, 6, 9, 11]:
        for grade in [1, 2, 3]:
            for subject in [1]:  # math only
                names.append('{}-{:02d}-{}-{}'.format(year, month, grade,
                                                      subject))
print(names)

# Convert each PDF to pages and parse them; missing PDFs are expected for
# some sessions, so report and continue.
for name in names:
    try:
        pages = convert_file(name)
        for page in pages:
            parse_page(page)
    except RuntimeError:
        # FIX: removed a redundant `pass` that followed this print.
        print('[!] {}.pdf not found'.format(name))
Пример #12
0
def test_parse_page():
    """parse_page must yield a BeautifulSoup object exposing the title."""
    markup = '<html><head><title>wow</title></head></html>'
    soup = parser.parse_page(markup)
    assert str(soup.__class__) == "<class 'bs4.BeautifulSoup'>"
    assert soup.title.string == 'wow'
Пример #13
0
	def test_get_users_response_not_none(self):
			self.assertIsNotNone(parse_page('https://news.ycombinator.com/news?p=', 1, 5) ) 		
Пример #14
0
	def test_get_users_response_not_empty(self):
			self.assertTrue(len(parse_page('https://news.ycombinator.com/news?p=', 1, 5)) > 0 ) 
Пример #15
0
	def test_parse_page_result_pages_not_equal(self):
		self.assertNotEqual(parse_page('https://news.ycombinator.com/news?p=', 1, 5) , parse_page('https://news.ycombinator.com/news?p=', 2, 5) )