def GetInstalls(self, soupObj, appDetails):
    """Extract the download/install count from an app page into appDetails.

    Args:
        soupObj: BeautifulSoup document for the app's store page.
        appDetails: detail object whose ``installs`` attribute receives the
            extracted text.

    Errors are logged, never raised, so one bad page does not kill a crawl.
    """
    try:
        pull = soupObj.find('div',
                            attrs={'class': 'content',
                                   'itemprop': 'numDownloads'})
        # BUGFIX: find() returns None when the element is absent, and
        # None.text raises AttributeError — not the TypeError the original
        # handler caught. Catch both so the targeted handler actually fires.
        appDetails.installs = pull.text
    except (TypeError, AttributeError) as typeError:
        logger.log("Type Error at GetInstalls: ", typeError)
    except Exception as e:
        logger.log("Exception at GetInstalls: ", e)
class AppSpiderClass:
    def SpiderA(self):
        """Scrape the Aptoide 'latest apps' listing pages for app links.

        Fetches each paginated listing URL once and delegates link
        extraction to FetchAptoideLinks, which appends into appList.

        Returns:
            list: the accumulated app links (empty if every request failed).
        """
        appList = []
        # BUGFIX: the original wrapped this in `while True:` with no break,
        # so the method never terminated; one pass over the fixed URL list
        # is the evident intent.
        urls = [
            'https://en.aptoide.com/apps/latest/more',
            'https://en.aptoide.com/apps/latest/more?offset=45',
            'https://en.aptoide.com/apps/latest/more?offset=84',
            'https://en.aptoide.com/apps/latest/more?offset=126',
            'https://en.aptoide.com/apps/latest/more?offset=161',
            'https://en.aptoide.com/apps/latest/more?offset=200',
            'https://en.aptoide.com/apps/latest/more?offset=235',
            'https://en.aptoide.com/apps/latest/more?offset=276',
            'https://en.aptoide.com/apps/latest/more?offset=324',
            'https://en.aptoide.com/apps/latest/more?offset=367',
            'https://en.aptoide.com/apps/latest/more?offset=411',
            'https://en.aptoide.com/apps/latest/more?offset=456',
            'https://en.aptoide.com/apps/latest/more?offset=500',
            'https://en.aptoide.com/apps/latest/more?offset=535',
            'https://en.aptoide.com/apps/latest/more?offset=573',
            'https://en.aptoide.com/apps/latest/more?offset=614',
            'https://en.aptoide.com/apps/latest/more?offset=660',
            'https://en.aptoide.com/apps/latest/more?offset=692',
            'https://en.aptoide.com/apps/latest/more?offset=734',
            'https://en.aptoide.com/apps/latest/more?offset=772',
            'https://en.aptoide.com/apps/latest/more?offset=805',
            'https://en.aptoide.com/apps/latest/more?offset=839',
            'https://en.aptoide.com/apps/latest/more?offset=872',
            'https://en.aptoide.com/apps/latest/more?offset=906',
            'https://en.aptoide.com/apps/latest/more?offset=940',
        ]
        # BUGFIX: the `try:` was commented out while its except clauses
        # remained, which is a SyntaxError; restored here.
        try:
            for pagelink in urls:
                htmlPages = requests.get(pagelink)
                text = htmlPages.text
                soup = BeautifulSoup(text, "html.parser")
                self.FetchAptoideLinks(soup, appList)
        except requests.ConnectionError as connError:
            logger.log("Connection Error while connecting to Play store: ",
                       urls, " Error: ", connError)
        except requests.HTTPError as httpError:
            logger.log("Invalid HTTP response to Play store: ",
                       urls, " Error: ", httpError)
        # BUGFIX: `except requests.Timeout()` instantiated the class, which
        # raises TypeError at match time; except clauses take the class.
        except requests.Timeout as requestTimeoutError:
            logger.log("Time-out to connect to Play store: ",
                       urls, " Error: ", requestTimeoutError)
        return appList
def _reconnect(self):
    """
    Establish the Redis connection, retrying forever with backoff.

    Loops until ``create_connection`` succeeds; on each OSError the retry
    interval is increased and the coroutine sleeps before trying again.
    On success the interval is reset and the coroutine returns.
    """
    event_loop = self._loop or asyncio.get_event_loop()
    while True:
        try:
            logger.log(logging.INFO, 'Connecting to redis')
            yield from event_loop.create_connection(
                lambda: self.protocol, self.host, self.port)
            # Connected: start the next failure from a short interval.
            self._reset_retry_interval()
            return
        except OSError:
            # Connection refused / unreachable: back off, then retry.
            self._increase_retry_interval()
            delay = self._get_retry_interval()
            logger.log(
                logging.INFO,
                'Connecting to redis failed. Retrying in %i seconds' % delay)
            yield from asyncio.sleep(delay)
def FetchedUrls(self):
    """Return the list of previously-fetched URLs stored in the CSV at
    module-level ``filePath``.

    Returns:
        list: values of the 'Urls' column, or an empty list when the file
        does not exist or reading fails (errors are logged).
    """
    data = []
    try:
        if os.path.isfile(filePath):
            # Transpose + tolist()[0] flattens the single column to a list.
            data = pd.read_csv(filePath,
                               usecols=['Urls']).T.values.tolist()[0]
    # BUGFIX: the original's five identical handlers implicitly returned
    # None on error while the success paths returned a list; callers now
    # always get a list.
    except (ValueError, TypeError, KeyError, IndexError, IOError) as e:
        logger.log(e)
    return data
def CollectAppDetails(self, appLicationsList):
    """Append one CSV row per scraped app to the file at module-level
    ``filePath``.

    Args:
        appLicationsList: iterable of app-detail objects exposing .title,
            .price, .rating, .totalReviews, .genere, .author, .installs,
            .adult and .url attributes.

    Returns:
        bool: True when the write succeeded, False when an error was logged.
    """
    df = pd.DataFrame({
        "Title": [app.title for app in appLicationsList],
        "Price": [app.price for app in appLicationsList],
        "Rating": [app.rating for app in appLicationsList],
        "TotalReviews": [app.totalReviews for app in appLicationsList],
        "Genere": [app.genere for app in appLicationsList],
        "Author": [app.author for app in appLicationsList],
        "Installs": [app.installs for app in appLicationsList],
        "Adult": [app.adult for app in appLicationsList],
        "Urls": [app.url for app in appLicationsList],
    })
    try:
        # Emit the header only when the file is being created.
        df.to_csv(filePath, mode='a',
                  header=not os.path.isfile(filePath), index=False)
    except Exception as e:
        # BUGFIX: the original's handlers all did `errorCount = +1` — a typo
        # for `+= 1` — and then `finally: return`, which would also swallow
        # unhandled exceptions. Every handler logged and led to False, so a
        # single handler with an explicit return preserves the contract.
        logger.log(e)
        return False
    return True
def Spider(self, limit):
    """Fetch the Play-store top-selling-games page ``limit - 1`` times and
    collect app links via FetchAppLinks.

    Args:
        limit: one past the number of fetch iterations; ``limit <= 1``
            performs no requests.

    Returns:
        list: links accumulated by FetchAppLinks (empty if nothing fetched).
    """
    page = 1
    webAppList = []
    # Loop-invariant URL hoisted out of the loop.
    url = r"https://play.google.com/store/apps/category/GAME/collection/topselling_new_free?hl=en"
    while page < limit:
        try:
            source_Code = requests.get(url, timeout=10)
            raw_text = source_Code.text
            soup = BeautifulSoup(raw_text, "html.parser")
            self.FetchAppLinks(soup, webAppList)
        except requests.ConnectionError as connError:
            logger.log("Connection Error while connecting to Play store: ",
                       url, " Error: ", connError)
        except requests.HTTPError as httpError:
            logger.log("Invalid HTTP response to Play store: ",
                       url, " Error: ", httpError)
        # BUGFIX: `except requests.Timeout()` / `TooManyRedirects()`
        # instantiated the classes, raising TypeError at exception-match
        # time; except clauses must name the class itself.
        except requests.Timeout as requestTimeoutError:
            logger.log("Time-out to connect to Play store: ",
                       url, " Error: ", requestTimeoutError)
        except requests.TooManyRedirects as redirectsError:
            logger.log("Too many redirects for connection to Play store: ",
                       url, " Error: ", redirectsError)
        except Exception as e:
            # Catch-all must come last; the original's extra
            # `except requests.exceptions.Timeout` after it was unreachable
            # dead code and has been removed.
            logger.log("Execpetion occured at Func Spider: ", e)
        page += 1
    return webAppList
def GooglePlayStoreCrawler(self, appLink):
    """Fetch one Play-store app page and scrape its details concurrently.

    Each Get* handler runs in its own thread against the shared soup and
    fills one field of the returned AppDetails.

    Args:
        appLink: URL of the app's Play-store page.

    Returns:
        AppDetails: populated detail object (fields a handler failed on
        are left as that handler left them).
    """
    appDetails = AppDetails()
    appDetails.url = appLink
    appDetails.appStore = "Google Play Store"

    response = requests.get(appLink)
    soupObj = BeautifulSoup(response.text, "html.parser")

    # One worker thread per field extractor, all sharing soupObj/appDetails.
    handlers = (
        self.GetTitle,
        self.GetGenere,
        self.GetPrice,
        self.GetDeveloper,
        self.GetContent,
        self.GetInstalls,
        self.GetRatings,
        self.GetTotalReviewers,
    )
    workers = [threading.Thread(target=handler, args=(soupObj, appDetails))
               for handler in handlers]

    try:
        for worker in workers:
            worker.start()
        # Block until every extractor has finished writing its field.
        for worker in workers:
            worker.join()
    except Exception as e:
        logger.log("Exception caught in threading in Crawler Class: ", e)
    finally:
        return appDetails
def _run_once(self):
    """Run one full iteration of the event loop.

    This calls all currently ready callbacks, polls for I/O,
    schedules the resulting callbacks, and finally schedules
    'call_later' callbacks.

    Basically a copy of the original one, but running ready callbacks
    applying a round-robin strategy between the different partitions.
    Once a queue, if it had at least one callback, runs out of callbacks
    the IO loop is requested again for its IO and time handles.
    """
    # Each partition object carries a `handles` ready-queue (deque — see
    # popleft below) and a `scheduled` timer heap (managed via heapq).
    # ---- Phase 1: compact cancelled timers in the active partitions. ----
    sched_count = sum(
        [len(self._partitions[p].scheduled) for p in self._p_to_process])
    if (sched_count > _MIN_SCHEDULED_TIMER_HANDLES and
            self._timer_cancelled_count / sched_count >
            _MIN_CANCELLED_TIMER_HANDLES_FRACTION):  # noqa
        for partition in self._p_to_process:
            # Remove delayed calls that were cancelled if their number
            # is too high: rebuild each heap keeping only live handles.
            new_scheduled = []
            for handle in self._partitions[partition].scheduled:
                if handle._cancelled:
                    handle._scheduled = False
                else:
                    new_scheduled.append(handle)
            heapq.heapify(new_scheduled)
            self._partitions[partition].scheduled = new_scheduled
        self._timer_cancelled_count = 0
    else:
        for partition in self._p_to_process:
            # Remove delayed calls that were cancelled from head of queue.
            while self._partitions[partition].scheduled and\
                    self._partitions[partition].scheduled[0]._cancelled:
                self._timer_cancelled_count -= 1
                handle = heapq.heappop(
                    self._partitions[partition].scheduled)
                handle._scheduled = False

    # ---- Phase 2: choose the poll timeout. ----
    # None = block until I/O; 0 = non-blocking poll (work is ready or we
    # are stopping); otherwise wait until the earliest scheduled timer.
    # NOTE(review): any_handles scans ALL partitions while any_scheduled
    # scans only self._p_to_process — confirm that asymmetry is intended.
    timeout = None
    any_handles = any(
        [bool(self._partitions[p].handles) for p in self._partitions])
    any_scheduled = any(
        [bool(self._partitions[p].scheduled) for p in self._p_to_process])
    if any_handles or self._stopping:
        timeout = 0
    elif any_scheduled:
        # Compute the desired timeout.
        when = min([
            self._partitions[p].scheduled[0]._when
            for p in self._p_to_process
        ]  # noqa
        )
        timeout = max(0, when - self.time())

    # ---- Phase 3: poll the selector (timing the poll in debug mode). ----
    if self._debug and timeout != 0:
        t0 = self.time()
        event_list = self._selector.select(timeout)
        dt = self.time() - t0
        if dt >= 1.0:
            level = logging.INFO
        else:
            level = logging.DEBUG
        nevent = len(event_list)
        if timeout is None:
            logger.log(level, 'poll took %.3f ms: %s events',
                       dt * 1e3, nevent)
        elif nevent:
            logger.log(level,
                       'poll %.3f ms took %.3f ms: %s events',
                       timeout * 1e3, dt * 1e3, nevent)
        elif dt >= 1.0:
            logger.log(level,
                       'poll %.3f ms took %.3f ms: timeout',
                       timeout * 1e3, dt * 1e3)
    else:
        event_list = self._selector.select(timeout)
    # Turn I/O events into ready callbacks.
    self._process_events(event_list)

    # ---- Phase 4: move expired timers onto their ready-queues. ----
    # Handle 'later' callbacks that are ready.
    end_time = self.time() + self._clock_resolution
    for partition in self._p_to_process:
        while self._partitions[partition].scheduled:
            handle = self._partitions[partition].scheduled[0]
            if handle._when >= end_time:
                break
            handle = heapq.heappop(self._partitions[partition].scheduled)
            handle._scheduled = False
            self._partitions[partition].handles.append(handle)

    # ---- Phase 5: round-robin execution of ready handles. ----
    # Visit each non-empty partition once per pass, popping one handle at
    # a time, until some partition that ran at least one handle drains
    # (p_to_process becomes non-empty) or `ntodo` handles have run.
    # NOTE(review): max() over an empty self._partitions would raise
    # ValueError — presumably at least one partition always exists.
    partitions = [
        p for p in self._partitions if self._partitions[p].handles
    ]
    ntodo = max(
        [len(self._partitions[p].handles) for p in self._partitions])
    cnt = 0
    p_to_process = set()
    # Tracks how many handles each partition executed this iteration.
    handles_executed_per_partition = {p: 0 for p in self._partitions}
    while not p_to_process and cnt < ntodo:
        for partition in partitions:
            try:
                handle = self._partitions[partition].handles.popleft()
            except IndexError:
                # Queue drained; it becomes a candidate for the next
                # iteration only if it executed something this time.
                if handles_executed_per_partition[partition] > 0:
                    p_to_process.add(partition)
                continue
            else:
                handles_executed_per_partition[partition] += 1
            if handle._cancelled:
                continue
            if self._debug:
                try:
                    self._current_handle = handle
                    t0 = self.time()
                    handle._run()
                    dt = self.time() - t0
                    # Flag callbacks that block the loop for too long.
                    if dt >= self.slow_callback_duration:
                        logger.warning('Executing %s took %.3f seconds',
                                       _format_handle(handle), dt)
                finally:
                    self._current_handle = None
            else:
                handle._run()
            cnt += 1
    # ---- Phase 6: pick the partitions to service next iteration. ----
    if p_to_process:
        self._p_to_process = p_to_process
    else:
        # keep with the same ones, we didnt run the queues.
        # FIXME : it can create starvation
        pass
    handle = None  # Needed to break cycles when an exception occurs.
soup = BeautifulSoup(text,"html.parser") self.FetchAptoideLinks(soup,appList) except requests.ConnectionError as connError: logger.log("Connection Error while connecting to Play store: ", urls," Error: ", connError) except requests.HTTPError as httpError: logger.log("Invalid HTTP response to Play store: ", urls, " Error: ", httpError) except requests.Timeout() as requestTimeoutError: logger.log("Time-out to connect to Play store: ", urls, " Error: ", requestTimeoutError) except requests.TooManyRedirects() as redirectsError: logger.log("Too many redirects for connection to Play store: ", urls, " Error: ", redirectsError) except Exception as e: logger.log("Excpetion occured at Func Spider: ", e) except requests.exceptions.Timeout as timeoutException: logger.log("Timeout Exception", timeoutException) return appList def FetchAptoideLinks(self,soup,appList): for link in soup.findAll("span",{"class":"bundle-item__info__span--big"}): hreff = link.find('a').get('href')