def _secondPass(self, clients, loop):
    """Re-run the website check for each client flagged on the first pass
    and keep only those whose second result matches the first."""
    error_clients = {}
    for client, value in clients.items():
        print(client)
        client_dict = {client: None}
        name, number, button, url = clients[client]['Connect']
        website = Website(name, number, button, url)
        loop.run_until_complete(website.addWebsite())
        check = website.runCheck()
        first_check = clients[client]['Check']
        second_check = self._compare(check, name)
        # only confirm the error if both passes report the same result
        if first_check == second_check:
            client_dict[client] = value
            error_clients.update(client_dict)
    return error_clients
def dispatch_website(id, url, keywords):
    """
    Dispatcher to start crawling of a website
    """
    try:
        Database.set_website_status(id=id, status='queued')
        # create and set website and page object for a job
        website = Website(id=id, url=url, keywords=keywords)
        website.preInit()
        page = Page(website.url, website.url)
        # set website watch variables in redis db
        rDB.set(website.id + ':pages_queued', 1)
        rDB.set(website.id + ':pages_crawled', 0)
        # Enqueue job in redis-queue
        job = qL.enqueue(crawl_page, website, page)
        log.debug('Website Added in Queue :: {0}'.format(url))
    except Exception as e:
        log.exception('Error occurred in dispatch website')
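# dispatch_website above relies on module-level handles for the database,
# the Redis connection (rDB) and the redis-queue queues (qL/qH). A minimal
# sketch of how that wiring might look, assuming the redis and rq packages;
# the host, port and queue names here are illustrative assumptions, not the
# project's actual configuration:
import logging

import redis
from rq import Queue

log = logging.getLogger(__name__)
rDB = redis.StrictRedis(host='localhost', port=6379, db=0)
qH = Queue('high', connection=rDB)  # queue used by crawl_page for page jobs
qL = Queue('low', connection=rDB)   # queue used here to start a crawl

# Hypothetical call to start crawling one site:
# dispatch_website(id='42', url='http://example.com', keywords=['contact'])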
def crawl_page(website, page):
    """
    Crawl a single page at a time, check whether the job of crawling the
    website is done, and take the required steps
    """
    try:
        # set website status to started if it's the first page of the website
        print('Pages Crawled:: {0}'.format(rDB.get(website.id + ':pages_crawled')))
        print('Pages Queued:: {0}'.format(rDB.get(website.id + ':pages_queued')))
        if rDB.get(website.id + ':pages_queued') == '1':
            Database.set_website_status(id=website.id, status='started')

        log.debug('Crawling :: {0}'.format(page.url))
        # get page content
        log.info('Getting Page Content :: {0}'.format(page.url))
        page.get_content()
        # get keywords matched
        keys = page.get_keywords_matched(website.aho)
        log.info('Matched Keywords :: {0}'.format(keys))
        # get external links
        # page.get_external_links()
        log.info('Found External Links :: {0}'.format(len(page.external_links)))
        # get internal links
        page.get_internal_links(website)
        log.info('Found Internal Links :: {0}'.format(len(page.internal_links)))
        # get status code of all links
        log.info('Getting Status of all Links')
        page.get_status_codes_of_links(website)

        log.info('Enqueueing New Jobs')
        # enqueue the un-broken internal links
        for p in page.crawl_pages:
            log.info('Enqueued :: {0}'.format(p.url))
            rDB.incr(website.id + ':pages_queued')
            qH.enqueue(crawl_page, website, p)

        log.info('Adding Result to website')
        # add rotto (broken) links to the result
        if page.rotto_links:
            log.info('Broken Links Found :: {0}'.format(page.rotto_links))
            rDB.rpush(website.id + ':result', Website.result_to_json(page))
        log.debug('Crawled :: {0}'.format(page.url))

        # increment website crawled page counter
        rDB.incr(website.id + ':pages_crawled')
        log.info('Pages Queued:: {0}'.format(rDB.get(website.id + ':pages_queued')))
        log.info('Pages Crawled:: {0}'.format(rDB.get(website.id + ':pages_crawled')))

        # check whether the website has been crawled completely
        if rDB.get(website.id + ':pages_queued') == rDB.get(website.id + ':pages_crawled'):
            log.info('Website {0} crawled Completely'.format(website.url))
            # save results to database
            log.info('Saving results to database')
            qH.enqueue(save_result_to_database, website)
            # send the email to user
            log.info('Sending email to user')
            send_mail_to_user(website)
    except Exception as e:
        log.exception('Error in crawling :: {0}'.format(page.url))
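# crawl_page above decides that a website is finished by comparing two Redis
# counters: ':pages_queued' is incremented whenever a new page job is
# enqueued and ':pages_crawled' whenever a job completes, so the site is done
# once the two values meet. A small self-contained sketch of that
# bookkeeping, assuming a local Redis instance (the site id and page counts
# are made up for illustration):
import redis

def counter_demo(conn, site_id='demo'):
    conn.set(site_id + ':pages_queued', 1)   # the root page is queued
    conn.set(site_id + ':pages_crawled', 0)
    for _ in range(2):                       # root page links to two pages
        conn.incr(site_id + ':pages_queued')
    for _ in range(3):                       # all three pages get crawled
        conn.incr(site_id + ':pages_crawled')
    # the same comparison crawl_page uses to trigger the save/notify steps
    return conn.get(site_id + ':pages_queued') == conn.get(site_id + ':pages_crawled')

# counter_demo(redis.StrictRedis()) -> True once every queued page is crawled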
def main(self):
    """Check every client row, collect error and unreachable clients,
    generate the report data and compose the notification email."""
    row = self._row
    client_links = {}
    error_clients = {}
    unreachable_clients = {'404s': [], 'Missing_Info': [], 'Selenium': []}
    keys = []
    frames = []
    loop = asyncio.get_event_loop()
    try:
        for row in self._clients:
            name = str(row[self._columns[0]])
            number = str(row[self._columns[1]])
            button = str(row[self._columns[2]])
            url = str(row[self._columns[3]])
            # process only rows with complete, well-formed contact info
            if (name and number and button and url
                    and number[:3].isdigit()
                    and re.search('eventplicity', button) is not None):
                client = Website(name, number, button, url)
                connected = loop.run_until_complete(client.addWebsite())
                if connected != 0:
                    check = client.runCheck()
                    if check == {name: {'Index': {'Phone_Number': 0,
                                                  'Eventplicity_Link': 0}}}:
                        if len(client._extensions) > 1:
                            compare = self._compare(check, name)
                            if compare is not None:
                                error_clients[name] = {
                                    'Website': client,
                                    'Check': compare,
                                    'Connect': [name, number, button, url]}
                            for key, value in check.items():
                                keys.append(key)
                                frame = pd.DataFrame.from_dict(value,
                                                               orient='index')
                                frames.append(frame)
                        else:
                            unreachable_clients['Selenium'].append(name)
                    else:
                        compare = self._compare(check, name)
                        if compare is not None:
                            error_clients[name] = {
                                'Website': client,
                                'Check': compare,
                                'Connect': [name, number, button, url]}
                        for key, value in check.items():
                            keys.append(key)
                            frame = pd.DataFrame.from_dict(value,
                                                           orient='index')
                            frames.append(frame)
                else:
                    unreachable_clients['404s'].append(name)
                links = {name: client._links}
                client_links.update(links)
            else:
                unreachable_clients['Missing_Info'].append(name)
        # re-check flagged clients to filter out transient failures
        error_clients = self._secondPass(error_clients, loop)
    finally:
        loop.close()
    self._generateData(keys, frames, client_links, unreachable_clients)
    subject, message = self._composeEmail(error_clients, unreachable_clients)
    return subject, message
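# main() above returns the (subject, message) pair built by _composeEmail;
# the caller is responsible for actually sending it. A minimal sketch of
# such a caller using the standard library, where send_report, the addresses
# and the SMTP host are hypothetical stand-ins rather than part of the
# original code:
import smtplib
from email.mime.text import MIMEText

def send_report(subject, message, sender='reports@example.com',
                recipient='ops@example.com', smtp_host='localhost'):
    msg = MIMEText(message)
    msg['Subject'] = subject
    msg['From'] = sender
    msg['To'] = recipient
    with smtplib.SMTP(smtp_host) as server:
        server.send_message(msg)

# e.g.: subject, message = checker.main(); send_report(subject, message)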