def test_parse_without_metalines(self):
    self.link = 'https://www.ptt.cc/bbs/NBA/M.1432438578.A.4B0.html'
    self.article_id = 'M.1432438578.A.4B0'
    self.board = 'NBA'
    jsondata = json.loads(crawler.parse(self.link, self.article_id, self.board))
    self.assertEqual(jsondata['article_id'], self.article_id)
    self.assertEqual(jsondata['board'], self.board)
def test_parse_with_push_without_contents(self):
    self.link = 'https://www.ptt.cc/bbs/Gossiping/M.1433091897.A.1C5.html'
    self.article_id = 'M.1433091897.A.1C5'
    self.board = 'Gossiping'
    jsondata = json.loads(crawler.parse(self.link, self.article_id, self.board))
    self.assertEqual(jsondata['article_id'], self.article_id)
    self.assertEqual(jsondata['board'], self.board)
def home(request):
    if request.method == 'GET':
        return render(request, 'demo/demo.html')
    # request.is_ajax() was deprecated in Django 3.1 and removed in 4.0;
    # on newer versions check the X-Requested-With header directly.
    elif request.method == 'POST' and request.is_ajax():
        # Default to '' so a missing POST field fails the validity check
        # below instead of being stringified to 'None' by escape().
        bname = escape(request.POST.get('board_name', ''))
        aid = escape(request.POST.get('article_id', ''))
        link = PTT_URL + '/bbs/' + bname + '/' + aid + '.html'
        if not (bname and aid):
            return HttpResponse(
                json.dumps({'data': {'error': 'invalid url'}, 'link': link}),
                content_type='application/json')
        if aid.lower() in ('latest', 'index'):
            resp = requests.get(
                url=PTT_URL + '/bbs/' + bname + '/index.html',
                cookies={'over18': '1'},
                verify=False)
            if resp.status_code == 200:
                # Name the parser explicitly; a bare BeautifulSoup(resp.text)
                # warns and may pick a different parser per system.
                soup = BeautifulSoup(resp.text, 'html.parser')
                divs = soup.find_all('div', 'r-ent')
                aid = divs[-1].select('div.title > a')[0]['href'].split('/')[3].replace('.html', '')
                link = PTT_URL + '/bbs/' + bname + '/' + aid + '.html'
        data = json.loads(crawler.parse(link, aid, bname))
        return HttpResponse(
            json.dumps({'data': data, 'link': link}),
            content_type='application/json')
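# A minimal usage sketch for the home() view above: wiring it into a Django
# URLconf so the GET page and the AJAX POST hit the same endpoint. The module
# path and URL name here are assumptions, not taken from the original project.
from django.urls import path

from . import views

urlpatterns = [
    path('', views.home, name='home'),
]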
def parse(self, response):
    items = AutoNewsItem()
    items['collect_log_objects'] = []
    items['parsed_news_objects'] = []
    # json.loads is the safe way to read the JSON body; eval() on response
    # text executes arbitrary code from the remote server.
    data = json.loads(response.text)['list']
    if not data:
        raise CloseSpider('close it')
    for news in data:
        url = re.sub(r'\/money.*', f'{news["HyperLink"]}?chdtv', response.url)
        html = requests.get(url).text
        crawler_time = datetime.now()
        date, title, article, keywords = crawler.parse(url)
        items['collect_log_objects'].append(
            CollectLog(poster='scrapy', url=url, html=html,
                       collect_time=crawler_time))
        items['parsed_news_objects'].append(
            ParsedNews(url=url, title=title, article=article,
                       keywords=keywords, date=date))
    yield items
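# For reference, a hypothetical sketch of the AutoNewsItem the spider above
# fills in; the two list fields are inferred from the keys used in parse(),
# and the real item definition may differ.
import scrapy

class AutoNewsItem(scrapy.Item):
    collect_log_objects = scrapy.Field()   # accumulates CollectLog instances
    parsed_news_objects = scrapy.Field()   # accumulates ParsedNews instances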
def test_parse(self):
    self.link = 'https://www.ptt.cc/bbs/PublicServan/M.1409529482.A.9D3.html'
    self.article_id = 'M.1409529482.A.9D3'
    self.board = 'PublicServan'
    jsondata = json.loads(crawler.parse(self.link, self.article_id, self.board))
    self.assertEqual(jsondata['article_id'], self.article_id)
    self.assertEqual(jsondata['board'], self.board)
    # 'message_conut' is the key as spelled in the crawler's JSON output.
    self.assertEqual(jsondata['message_conut']['count'], 55)
def test_parse_with_structured_push_contents(self):
    self.link = 'https://www.ptt.cc/bbs/Gossiping/M.1119222660.A.94E.html'
    self.article_id = 'M.1119222660.A.94E'
    self.board = 'Gossiping'
    jsondata = json.loads(crawler.parse(self.link, self.article_id, self.board))
    self.assertEqual(jsondata['article_id'], self.article_id)
    self.assertEqual(jsondata['board'], self.board)
    is_caught = False
    for msg in jsondata['messages']:
        if u'http://tinyurl.com/4arw47s' in msg['push_content']:
            is_caught = True
    self.assertTrue(is_caught)
def check_page():
    page = crawl(configuration['targetURL'])  # .decode("utf8")
    page_hash = md5(page)
    c = load()
    if c['hash'] != page_hash:
        print("HASH CHANGED! (" + page_hash + ")")
        # Run a background thread to archive the page in the web archive
        start_new_thread(crawl, ("https://web.archive.org/save/" + configuration['targetURL'], False))
        # Check if the file is online and we haven't sent the mail already (if so, send it)
        match = parse(page.decode('utf8'))
        if match is not None and not c['mailSent']:
            print("FILE IS ONLINE! Sending mails ... (they were not sent already)")
            docx = crawl(match)
            for person_details in configuration['details']:
                variables = {
                    "name": person_details['name'],
                    "year": person_details['targetYear'],
                    "quarter": person_details['quarter'],
                    "mail": person_details['mail'],
                    "streetAndCity": person_details['streetAndCity'],
                    "phone": person_details['phone'],
                    "matrikelnr": person_details['matrikelnr'],
                }
                res = parser.update_document_contents(docx, person_details)
                res_filename = ("Antrag Wohnheimzimmer " + variables['quarter']
                                + " " + variables['year'] + ".docx")
                mail.send(configuration['mail'], variables, res, res_filename)
            c['mailSent'] = True
        # Send a notification mail regardless of the above that there is a change
        notification_conf = {
            "body": "Something changed! Go and visit " + configuration['targetURL'],
            "subject": "IMPORTANT | The watched website has changed! Go check it immediately!",
            "recipient": configuration['mail']['notificationRecipient'],
            "server": configuration['mail']['server'],
        }
        if c['mailSent']:
            notification_conf['body'] += ("\n\n Oh and btw I already sent your reservation request ;)"
                                          "\n\n Have a good one!\n - AccommodationBot")
        mail.send(notification_conf)
        c['hash'] = page_hash
    else:
        print("Boring old same page...")
    save(c)
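# check_page() relies on load() and save() to persist the last-seen hash and
# the mailSent flag between runs. A minimal sketch under assumptions: JSON
# state kept in a local file, with the file name and default keys invented here.
import json
import os

STATE_FILE = 'state.json'

def load():
    if os.path.exists(STATE_FILE):
        with open(STATE_FILE) as f:
            return json.load(f)
    return {'hash': '', 'mailSent': False}

def save(state):
    with open(STATE_FILE, 'w') as f:
        json.dump(state, f)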
async def collect(news: News):
    collect_log_objects = []
    parsed_news_objects = []
    try:
        for url in news.urls:
            # Note: requests.get blocks the event loop; an async client such
            # as httpx would avoid stalling other requests.
            html = requests.get(url).text
            date, title, article, keywords = crawler.parse(url)
            collect_time = datetime.now()
            collect_log_objects.append(
                CollectLog(poster=news.poster, url=url, html=html,
                           collect_time=collect_time))
            parsed_news_objects.append(
                ParsedNews(url=url, title=title, article=article,
                           keywords=keywords, date=date))
        collect_news_to_db(collect_log_objects, parsed_news_objects)
        print(f'{news.urls} collected')
    except Exception as e:
        print(e)
    return {"message": "collected"}
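# Usage sketch for the collect() handler above. The route path '/collect',
# the app wiring, and the News payload fields (poster, urls) are assumptions
# inferred from the handler body, not taken from the original project.
from fastapi import FastAPI
from fastapi.testclient import TestClient

app = FastAPI()
app.add_api_route('/collect', collect, methods=['POST'])

client = TestClient(app)
response = client.post('/collect', json={
    'poster': 'manual',
    'urls': ['https://example.com/news/1'],
})
assert response.json() == {'message': 'collected'}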
def test():
    assert_equals(('Family Guy', '11', '6'), parse('Family.Guy.S11E06.rest.avi'))
    assert_equals(('Family Guy', '11', '60'), parse('Family.Guy.S11E60.rest.avi'))
    assert_equals(('Family Guy', '1', '60'), parse('Family.Guy.S01E60.rest.avi'))
    assert_equals(('TheSeaEar', '1', '60'), parse('TheSeaEar.S01E60.rest.avi'))
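# A minimal parse() that satisfies the assertions above, inferred from the
# test cases alone; the real implementation may differ. Dots in the show name
# become spaces, and leading zeros are dropped from season and episode.
import re

def parse(filename):
    m = re.match(r'(.+)\.S(\d+)E(\d+)\.', filename)
    show = m.group(1).replace('.', ' ')
    season = str(int(m.group(2)))    # '01' -> '1'
    episode = str(int(m.group(3)))   # '06' -> '6'
    return show, season, episode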