Example #1
    def GetInstalls(self, soupObj, appDetails):
        try:
            # find() returns None when the element is missing, so pull.text
            # below raises AttributeError (not TypeError) on a miss.
            pull = soupObj.find('div',
                                attrs={
                                    'class': 'content',
                                    'itemprop': 'numDownloads'
                                })
            appDetails.installs = pull.text

        except AttributeError as attrError:
            logger.log("Attribute Error at GetInstalls: ", attrError)

        except Exception as e:
            logger.log("Exception at GetInstalls: ", e)
Example #2
class AppSpiderClass:

    def SpiderA(self):

        appList = []

        urls = [
            'https://en.aptoide.com/apps/latest/more',
            'https://en.aptoide.com/apps/latest/more?offset=45',
            'https://en.aptoide.com/apps/latest/more?offset=84',
            'https://en.aptoide.com/apps/latest/more?offset=126',
            'https://en.aptoide.com/apps/latest/more?offset=161',
            'https://en.aptoide.com/apps/latest/more?offset=200',
            'https://en.aptoide.com/apps/latest/more?offset=235',
            'https://en.aptoide.com/apps/latest/more?offset=276',
            'https://en.aptoide.com/apps/latest/more?offset=324',
            'https://en.aptoide.com/apps/latest/more?offset=367',
            'https://en.aptoide.com/apps/latest/more?offset=411',
            'https://en.aptoide.com/apps/latest/more?offset=456',
            'https://en.aptoide.com/apps/latest/more?offset=500',
            'https://en.aptoide.com/apps/latest/more?offset=535',
            'https://en.aptoide.com/apps/latest/more?offset=573',
            'https://en.aptoide.com/apps/latest/more?offset=614',
            'https://en.aptoide.com/apps/latest/more?offset=660',
            'https://en.aptoide.com/apps/latest/more?offset=692',
            'https://en.aptoide.com/apps/latest/more?offset=734',
            'https://en.aptoide.com/apps/latest/more?offset=772',
            'https://en.aptoide.com/apps/latest/more?offset=805',
            'https://en.aptoide.com/apps/latest/more?offset=839',
            'https://en.aptoide.com/apps/latest/more?offset=872',
            'https://en.aptoide.com/apps/latest/more?offset=906',
            'https://en.aptoide.com/apps/latest/more?offset=940'
        ]

        # Crawl each listing page and collect the app links it contains.
        try:
            for pagelink in urls:
                htmlPages = requests.get(pagelink)
                text = htmlPages.text
                soup = BeautifulSoup(text, "html.parser")

                self.FetchAptoideLinks(soup, appList)

        except requests.ConnectionError as connError:
            logger.log("Connection Error while connecting to Aptoide: ",
                       pagelink, " Error: ", connError)

        except requests.HTTPError as httpError:
            logger.log("Invalid HTTP response from Aptoide: ", pagelink,
                       " Error: ", httpError)

        except requests.Timeout as requestTimeoutError:
            logger.log("Time-out connecting to Aptoide: ", pagelink,
                       " Error: ", requestTimeoutError)

        return appList
Example #3
    def _reconnect(self):
        """
        Set up Redis connection.
        """
        loop = self._loop or asyncio.get_event_loop()
        while True:
            try:
                logger.log(logging.INFO, 'Connecting to redis')
                yield from loop.create_connection(lambda: self.protocol, self.host, self.port)
                self._reset_retry_interval()
                return
            except OSError:
                # Connection failed: back off, then try again.
                self._increase_retry_interval()
                interval = self._get_retry_interval()
                logger.log(
                    logging.INFO, 'Connecting to redis failed. Retrying in %i seconds' % interval)
                yield from asyncio.sleep(interval)
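
The three retry-interval helpers called above are not part of the snippet; the sketch below is one plausible exponential-backoff implementation, offered as an assumption rather than the library's actual code.

    # Assumed implementations (same class as _reconnect); only the names
    # are taken from the snippet above.
    def _reset_retry_interval(self):
        self._retry_interval = 0.5  # start small again after a successful connect

    def _increase_retry_interval(self):
        # Double the delay on each failure, capped at 32 seconds.
        self._retry_interval = min(getattr(self, '_retry_interval', 0.5) * 2, 32)

    def _get_retry_interval(self):
        return self._retry_interval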
Example #4
    def FetchedUrls(self):
        try:
            data = []
            if os.path.isfile(filePath):
                # Transpose so the single 'Urls' column becomes one flat list.
                data = pd.read_csv(filePath,
                                   usecols=['Urls']).T.values.tolist()[0]
            return data

        except (ValueError, TypeError, KeyError, IndexError, IOError) as e:
            logger.log(e)
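
A likely use of this method is skipping links that were already persisted; a minimal sketch, where `newLinks` is an assumed list of freshly crawled candidate URLs:

        # Hypothetical dedup step; `or []` guards against the implicit None
        # returned when a logged exception fires inside FetchedUrls.
        seen = set(self.FetchedUrls() or [])
        toCrawl = [link for link in newLinks if link not in seen]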
Example #5
    def CollectAppDetails(self, appLicationsList):
        appTitle = []
        appPrice = []
        appRating = []
        appTotalReviews = []
        appGenere = []
        appAuthor = []
        appInstalls = []
        appAdultContent = []
        appUrls = []

        for app in appLicationsList:
            appTitle.append(app.title)
            appPrice.append(app.price)
            appRating.append(app.rating)
            appTotalReviews.append(app.totalReviews)
            appGenere.append(app.genere)
            appAuthor.append(app.author)
            appInstalls.append(app.installs)
            appAdultContent.append(app.adult)
            appUrls.append(app.url)

        df_appDetails = {
            "Title": appTitle,
            "Price": appPrice,
            "Rating": appRating,
            "TotalReviews": appTotalReviews,
            "Genere": appGenere,
            "Author": appAuthor,
            "Installs": appInstalls,
            "Adult": appAdultContent,
            "Urls": appUrls
        }

        df = pd.DataFrame(df_appDetails)

        errorCount = 0
        try:
            # Write the header only when the file is first created.
            if not os.path.isfile(filePath):
                df.to_csv(filePath, mode='a', header=True, index=False)
            else:
                df.to_csv(filePath, mode='a', header=False, index=False)

        except (ValueError, TypeError, KeyError, IndexError, IOError) as e:
            errorCount += 1
            logger.log(e)

        except Exception as e:
            errorCount += 1
            logger.log(e)

        finally:
            return errorCount == 0
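
The write-header-once branching above is the usual pattern for appending to a CSV, and it collapses to a single call, since `header` only needs to be True on the first write:

        # Equivalent to the if/else above: emit the header only when the file
        # does not exist yet (the check is evaluated before to_csv creates it).
        df.to_csv(filePath, mode='a', header=not os.path.isfile(filePath), index=False)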
Example #6
    def Spider(self, limit):
        page = 1

        webAppList = []

        # Note: `url` does not change with `page`, so every iteration
        # re-fetches the same collection page.
        while page < limit:
            url = r"https://play.google.com/store/apps/category/GAME/collection/topselling_new_free?hl=en"
            try:

                source_Code = requests.get(url, timeout=10)

                raw_text = source_Code.text

                soup = BeautifulSoup(raw_text, "html.parser")

                self.FetchAppLinks(soup, webAppList)

            except requests.ConnectionError as connError:
                logger.log("Connection Error while connecting to Play store: ",
                           url, " Error: ", connError)

            except requests.HTTPError as httpError:
                logger.log("Invalid HTTP response from Play store: ", url,
                           " Error: ", httpError)

            except requests.Timeout as requestTimeoutError:
                logger.log("Time-out connecting to Play store: ", url,
                           " Error: ", requestTimeoutError)

            except requests.TooManyRedirects as redirectsError:
                logger.log("Too many redirects for connection to Play store: ",
                           url, " Error: ", redirectsError)

            #except HTMLParseError as htmlParsingError:
            #   logger.log("HTMLParse Error: ", htmlParsingError)

            except Exception as e:
                logger.log("Exception occurred at Func Spider: ", e)

            page += 1

        return webAppList
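
A minimal driver for this method might look as follows; `PlayStoreSpider` is an assumed name for the enclosing class, which the listing does not show.

# Hypothetical usage sketch.
spider = PlayStoreSpider()
webAppList = spider.Spider(limit=5)   # performs at most 4 fetch iterations
print(len(webAppList), "links collected")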
Example #7
    def GooglePlayStoreCrawler(self, appLink):
        #for appLink in appList:
        appDetails = AppDetails()

        appDetails.url = appLink

        appDetails.appStore = "Google Play Store"

        #print(appLink)
        sourceCode = requests.get(appLink)
        rawText = sourceCode.text
        soupObj = BeautifulSoup(rawText, "html.parser")

        #Thread for title
        tTitle = threading.Thread(target=self.GetTitle,
                                  args=(soupObj, appDetails))

        #Thread for ratings
        tReviews = threading.Thread(target=self.GetRatings,
                                    args=(soupObj, appDetails))

        #Thread for price
        tPrice = threading.Thread(target=self.GetPrice,
                                  args=(soupObj, appDetails))

        #Thread for total Reviewers
        tTotalReviewers = threading.Thread(target=self.GetTotalReviewers,
                                           args=(soupObj, appDetails))

        #Thread for Genere
        tGenere = threading.Thread(target=self.GetGenere,
                                   args=(soupObj, appDetails))

        #Thread for author
        tAuthor = threading.Thread(target=self.GetDeveloper,
                                   args=(soupObj, appDetails))

        #Installs
        tInstalls = threading.Thread(target=self.GetInstalls,
                                     args=(soupObj, appDetails))

        #Content
        tContent = threading.Thread(target=self.GetContent,
                                    args=(soupObj, appDetails))

        threadList = [
            tTitle, tGenere, tPrice, tAuthor, tContent, tInstalls, tReviews,
            tTotalReviewers
        ]

        try:
            # Start all parser threads.
            for threadApp in threadList:
                threadApp.start()

            # Wait for every thread to finish its task.
            for threadApp in threadList:
                threadApp.join()

        except Exception as e:
            logger.log("Exception caught in threading in Crawler Class: ", e)

        # Returning outside a finally block avoids silently swallowing
        # exceptions; behaviour is unchanged because the except catches them.
        return appDetails
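
Since the eight threads differ only in their target function, the same fan-out can be expressed with a thread pool; the sketch below is an equivalent formulation, not the author's original code.

        # Equivalent fan-out with a pool; the parser methods are the same
        # ones targeted by the threads above.
        from concurrent.futures import ThreadPoolExecutor

        parsers = [self.GetTitle, self.GetRatings, self.GetPrice,
                   self.GetTotalReviewers, self.GetGenere, self.GetDeveloper,
                   self.GetInstalls, self.GetContent]
        with ThreadPoolExecutor(max_workers=len(parsers)) as pool:
            for parser in parsers:
                pool.submit(parser, soupObj, appDetails)
        # Leaving the with-block implicitly joins all submitted tasks.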
Example #8
    def _run_once(self):
        """Run one full iteration of the event loop.

        This calls all currently ready callbacks, polls for I/O,
        schedules the resulting callbacks, and finally schedules
        'call_later' callbacks.

        Basically a copy of the original one, but ready callbacks are run
        with a round-robin strategy between the different partitions. Once
        a queue that had at least one callback runs out of callbacks, the
        IO loop is asked again for its IO and timer handles.
        """
        sched_count = sum(
            [len(self._partitions[p].scheduled) for p in self._p_to_process])

        if (sched_count > _MIN_SCHEDULED_TIMER_HANDLES
                and self._timer_cancelled_count / sched_count >
                _MIN_CANCELLED_TIMER_HANDLES_FRACTION):  # noqa

            for partition in self._p_to_process:
                # Remove delayed calls that were cancelled if their number
                # is too high
                new_scheduled = []
                for handle in self._partitions[partition].scheduled:
                    if handle._cancelled:
                        handle._scheduled = False
                    else:
                        new_scheduled.append(handle)

                heapq.heapify(new_scheduled)
                self._partitions[partition].scheduled = new_scheduled
                self._timer_cancelled_count = 0
        else:
            for partition in self._p_to_process:
                # Remove delayed calls that were cancelled from head of queue.
                while self._partitions[partition].scheduled and\
                        self._partitions[partition].scheduled[0]._cancelled:
                    self._timer_cancelled_count -= 1
                    handle = heapq.heappop(
                        self._partitions[partition].scheduled)
                    handle._scheduled = False

        timeout = None
        any_handles = any(
            [bool(self._partitions[p].handles) for p in self._partitions])
        any_scheduled = any(
            [bool(self._partitions[p].scheduled) for p in self._p_to_process])
        if any_handles or self._stopping:
            timeout = 0
        elif any_scheduled:
            # Compute the desired timeout.
            when = min(self._partitions[p].scheduled[0]._when
                       for p in self._p_to_process)
            timeout = max(0, when - self.time())

        if self._debug and timeout != 0:
            t0 = self.time()
            event_list = self._selector.select(timeout)
            dt = self.time() - t0
            if dt >= 1.0:
                level = logging.INFO
            else:
                level = logging.DEBUG
            nevent = len(event_list)
            if timeout is None:
                logger.log(level, 'poll took %.3f ms: %s events', dt * 1e3,
                           nevent)
            elif nevent:
                logger.log(level, 'poll %.3f ms took %.3f ms: %s events',
                           timeout * 1e3, dt * 1e3, nevent)
            elif dt >= 1.0:
                logger.log(level, 'poll %.3f ms took %.3f ms: timeout',
                           timeout * 1e3, dt * 1e3)
        else:
            event_list = self._selector.select(timeout)
        self._process_events(event_list)

        # Handle 'later' callbacks that are ready.
        end_time = self.time() + self._clock_resolution
        for partition in self._p_to_process:
            while self._partitions[partition].scheduled:
                handle = self._partitions[partition].scheduled[0]
                if handle._when >= end_time:
                    break
                handle = heapq.heappop(self._partitions[partition].scheduled)
                handle._scheduled = False
                self._partitions[partition].handles.append(handle)

        partitions = [
            p for p in self._partitions if self._partitions[p].handles
        ]
        ntodo = max(
            [len(self._partitions[p].handles) for p in self._partitions])
        cnt = 0
        p_to_process = set()
        handles_executed_per_partition = {p: 0 for p in self._partitions}
        while not p_to_process and cnt < ntodo:
            for partition in partitions:
                try:
                    handle = self._partitions[partition].handles.popleft()
                except IndexError:
                    if handles_executed_per_partition[partition] > 0:
                        p_to_process.add(partition)
                    continue
                else:
                    handles_executed_per_partition[partition] += 1

                if handle._cancelled:
                    continue

                if self._debug:
                    try:
                        self._current_handle = handle
                        t0 = self.time()
                        handle._run()
                        dt = self.time() - t0
                        if dt >= self.slow_callback_duration:
                            logger.warning('Executing %s took %.3f seconds',
                                           _format_handle(handle), dt)
                    finally:
                        self._current_handle = None
                else:
                    handle._run()

            cnt += 1

        if p_to_process:
            self._p_to_process = p_to_process
        else:
            # Keep the same partitions; we didn't run the queues.
            # FIXME: this can create starvation.
            pass

        handle = None  # Needed to break cycles when an exception occurs.
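
The loop above only makes sense if each partition couples a queue of ready handles with a heap of timer handles; the sketch below spells out that inferred shape, which is not part of the listing.

import collections

class _Partition:
    # Assumed shape, inferred from the attribute accesses above;
    # the real class is not shown in the snippet.
    def __init__(self):
        self.handles = collections.deque()  # ready callbacks, drained via popleft()
        self.scheduled = []                 # heapq of timer handles ordered by _when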
Example #9
        try:
            for pagelink in urls:
                htmlPages = requests.get(pagelink)
                text = htmlPages.text
                soup = BeautifulSoup(text, "html.parser")

                self.FetchAptoideLinks(soup, appList)

        except requests.ConnectionError as connError:
            logger.log("Connection Error while connecting to Aptoide: ",
                       pagelink, " Error: ", connError)

        except requests.HTTPError as httpError:
            logger.log("Invalid HTTP response from Aptoide: ", pagelink,
                       " Error: ", httpError)

        except requests.Timeout as requestTimeoutError:
            logger.log("Time-out connecting to Aptoide: ", pagelink,
                       " Error: ", requestTimeoutError)

        except requests.TooManyRedirects as redirectsError:
            logger.log("Too many redirects for connection to Aptoide: ",
                       pagelink, " Error: ", redirectsError)

        except Exception as e:
            logger.log("Exception occurred at Func SpiderA: ", e)

        return appList

    def FetchAptoideLinks(self, soup, appList):
        # Collect the link of every app entry on the page.
        for link in soup.findAll("span", {"class": "bundle-item__info__span--big"}):
            hreff = link.find('a').get('href')
            appList.append(hreff)