示例#1
0
 def parse_detail(self, response):
     """Parse the HTML in ``response.text`` and yield one ``ApkspiderItem``.

     The platform id is the first run of digits in ``response.url``.  Nothing
     is yielded when that id is already in ``self.apkbf``; otherwise the id
     is added to it.  The display name doubles as the package name.
     """
     soup = bs4.BeautifulSoup(response.text, "html.parser")
     platform = self.name
     title = soup.find_all("h1", class_='app-name')[0]
     commonname = title.find('span').get_text()
     detailinfo = soup.find_all("div", class_='detail')[0]
     size_text = detailinfo.find_all("span", class_='size')[0].get_text()
     version_text = detailinfo.find_all(
         "span", class_='version')[0].get_text()
     # Plain raw strings: the ``ur`` prefix is a syntax error on Python 3 and
     # these ASCII-only patterns match the same way without it.
     size = re.search(r'[0-9\.]+.*', size_text).group()
     version = re.search(r'[0-9\.]+', version_text).group()
     packagename = commonname
     platformid = re.search(r'[0-9]+', response.url).group()
     urllink = soup.find_all("a", class_='apk')[0]['href']
     # The third <a target="_self"> element supplies the category text.
     category = soup.find_all("a", attrs={'target': '_self'})[2].get_text()
     if platformid in self.apkbf:
         return
     self.apkbf.add(platformid)
     item = ItemLoader(item=ApkspiderItem(), response=response)
     item.add_value('commonname', commonname)
     item.add_value('apkplaform', platform)
     item.add_value('apkid_specifiedbyplaform', platformid)
     item.add_value('category', category)
     item.add_value('packagename', packagename)
     item.add_value('size', size)
     item.add_value('version', version)
     item.add_value('urllink', urllink)
     item.add_value('file_urls', urllink)
     item.add_value('checkpoint', self.checkpoint)
     yield item.load_item()
示例#2
0
 def parse_detail(self, response):
     """Parse the HTML in ``response.text`` and yield one ``ApkspiderItem``.

     The category is taken from ``response.meta['category']``.  The app id
     is the first run of digits in ``response.url`` and the package name is
     the path segment that follows ``appdetail/``.  Nothing is yielded when
     the id is already in ``self.apkbf``; otherwise the id is added to it.
     """
     soup = bs4.BeautifulSoup(response.text, 'html.parser')
     appinfo = soup.select('.app-info')[0]
     commonname = appinfo.select('.title')[0].get_text()
     category = response.meta['category']
     platform = self.name
     # The '.dec' text is split on '|': the first part is stored as the
     # size, the second as the version.
     sv = appinfo.select('.dec')[0].get_text().split('|')
     size = sv[0]
     version = sv[1]
     print(response.url)
     # Plain raw strings: the ``ur`` prefix is a syntax error on Python 3.
     apkid = re.search(r'[0-9]+', response.url).group()
     print(apkid)
     # Drop the leading 'appdetail/' (10 characters) and the trailing '/'.
     packagename = re.search(r'appdetail/.*?/', response.url).group()[10:-1]
     urllink = soup.select('.download')[0]['href']
     if apkid in self.apkbf:
         return
     self.apkbf.add(apkid)
     item = ItemLoader(item=ApkspiderItem(), response=response)
     item.add_value('apkid_specifiedbyplaform', apkid)
     item.add_value('commonname', commonname)
     item.add_value('apkplaform', platform)
     item.add_value('category', category)
     item.add_value('packagename', packagename)
     item.add_value('size', size)
     item.add_value('version', version)
     item.add_value('urllink', urllink)
     item.add_value('file_urls', urllink)
     item.add_value('checkpoint', self.checkpoint)
     yield item.load_item()
示例#3
0
 def parse_detail(self, response):
     """Parse the HTML in ``response.text`` and yield one ``ApkspiderItem``.

     The version is the first dotted number found in the page title, the
     category comes from ``response.meta['category']`` and the app id is the
     first run of digits in ``response.url``.  Size, developer and update
     time are read from the 1st, 2nd and 6th ``<li>`` of ``.msg-list``.
     """

     def after_colon(text):
         # Keep what follows the first ':' (the whole text if there is none).
         return text[text.find(u':') + 1:].strip()

     soup = bs4.BeautifulSoup(response.text, 'html.parser')
     commonname = soup.select('dt.clearfix')[0].get_text().strip()
     # Plain raw strings: the ``ur`` prefix is a syntax error on Python 3.
     version = re.search(r'[0-9.]+', commonname).group()
     category = response.meta['category']
     msglist = soup.select('.msg-list')[0].select('li')
     size = after_colon(msglist[0].get_text())
     developer = after_colon(msglist[1].get_text())
     updatetime = after_colon(msglist[5].get_text())
     apkid = re.search(r'[0-9]+', response.url).group()
     urllink = self.httpprotocol + soup.select('.dl-btn')[0]['tempurl']
     print(urllink)
     # The dot is escaped so that only a literal ".apk" matches; unescaped,
     # '//apk' at the start of a host name would already satisfy the pattern.
     packagename = re.search(r'/[^/]*\.apk', urllink).group()
     item = ItemLoader(item=ApkspiderItem(), response=response)
     item.add_value('commonname', commonname)
     item.add_value('apkplaform', self.name)
     item.add_value('apkid_specifiedbyplaform', apkid)
     item.add_value('category', category)
     item.add_value('developer', developer)
     item.add_value('packagename', packagename)
     item.add_value('updatetime', updatetime)
     item.add_value('size', size)
     item.add_value('version', version)
     # NOTE(review): the page URL, not the download link computed above, is
     # stored as the link and file URL -- confirm this is intentional.
     item.add_value('urllink', response.url)
     item.add_value('file_urls', response.url)
     yield item.load_item()
示例#4
0
    def parse_json(self, response):
        """Yield items for one JSON listing page, then request the next page.

        ``categoryId`` and ``pageContext`` are read from ``response.url`` and
        the body is decoded as JSON.  Nothing is yielded when the payload has
        no ``count`` key, a non-positive ``count`` or no ``obj`` key.  Entries
        whose ``apkUrl`` is in ``self.categorybf`` or whose ``appId`` is in
        ``self.apkbf`` are skipped; the others are recorded in both.  After
        the items, a request for ``pageContext + self.step`` is yielded with
        this method as its callback.
        """
        # Capture groups replace the "match, then re-match the digits" pair;
        # plain raw strings are used because ``ur''`` is Python-2-only syntax.
        cateid = re.search(r'categoryId=(-?[0-9]+)', response.url).group(1)
        pageid = re.search(r'pageContext=(-?[0-9]+)', response.url).group(1)
        json_response = json.loads(response.body_as_unicode())
        # ``in`` instead of ``dict.has_key``, which Python 3 removed.
        if 'count' not in json_response:
            return
        count = int(json_response['count'])
        print(response.url)
        print(count)
        if count <= 0:
            return
        if 'obj' not in json_response:
            return
        apkplaform = 'qq'
        for obj in json_response['obj']:
            if obj['apkUrl'] in self.categorybf:
                continue
            if obj['appId'] in self.apkbf:
                continue
            self.apkbf.add(obj['appId'])
            self.categorybf.add(obj['apkUrl'])
            print(obj)
            item = ItemLoader(item=ApkspiderItem(), response=response)
            item.add_value("commonname", obj['appName'])
            item.add_value('apkplaform', apkplaform)
            item.add_value('apkid_specifiedbyplaform', str(obj['appId']))
            item.add_value('category', obj['categoryName'])
            item.add_value('developer', obj['authorName'])
            item.add_value('packagename', obj['pkgName'])
            item.add_value('updatetime', obj['apkPublishTime'])
            item.add_value('version', obj['versionName'])
            item.add_value('urllink', obj['apkUrl'])
            item.add_value('file_urls', obj['apkUrl'])
            item.add_value('checkpoint', self.checkpoint)
            yield item.load_item()

        url = self.base_cate_url % (
            int(response.meta['orgname']),
            int(cateid),
            int(pageid) + self.step,
        )
        yield Request(
            url,
            headers={"User-Agent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:63.0) Gecko/20100101 Firefox/63.0"},
            meta={'orgname': response.meta['orgname']},
            callback=self.parse_json
        )
示例#5
0
 def parse_detail(self, response):
     """Parse the HTML in ``response.text`` and yield one ``ApkspiderItem``.

     Nothing is yielded when the ``normal-dl-btn`` anchor has no ``href`` or
     when the package name (the last path segment of ``response.url``) is
     already in ``self.apkbf``; otherwise the package name is added to it.
     The package name is also used as the platform-specific id.
     """
     soup = bs4.BeautifulSoup(response.text, "html.parser")
     commonname = soup.select('.app-name')[0].get_text()
     info = soup.select('.infos-list')[0]
     size = info.find('dd').get_text()
     platform = self.name
     download_anchor = soup.find_all('a', class_='normal-dl-btn')[0]
     if not download_anchor.has_attr('href'):
         return
     urllink = download_anchor['href']
     dd_nodes = info.select('dd')
     version = dd_nodes[2].get_text()
     print(version)
     # When the third <dd> contains no digits or dots, the fourth <dd> is
     # used instead.  Plain raw strings replace the Python-2-only ``ur``.
     if re.search(r'[0-9\.]+', version) is None:
         version = dd_nodes[3].get_text()
     dev_nodes = info.select('.dev-sites')
     developer = dev_nodes[0].get_text() if dev_nodes else ""
     permission = [
         perm.get_text() for perm in info.find_all('span', class_='perms')
     ]
     category = info.find_all('a')[0].get_text()
     updatetime = soup.find('span', class_='update-time').get_text()
     updatetime = re.search(r'[0-9/]+', updatetime).group()
     packagename = response.url[response.url.rfind('/') + 1:]
     if packagename in self.apkbf:
         return
     self.apkbf.add(packagename)
     item = ItemLoader(item=ApkspiderItem(), response=response)
     item.add_value('apkid_specifiedbyplaform', packagename)
     item.add_value('commonname', commonname)
     item.add_value('apkplaform', platform)
     item.add_value('category', category)
     item.add_value('developer', developer)
     item.add_value('packagename', packagename)
     item.add_value('updatetime', updatetime)
     item.add_value('size', size)
     item.add_value('version', version)
     item.add_value('permission', permission)
     item.add_value('urllink', urllink)
     item.add_value('file_urls', urllink)
     item.add_value('checkpoint', self.checkpoint)
     yield item.load_item()
示例#6
0
 def parse_detail(self, response):
     """Parse the HTML in ``response.text`` and yield one ``ApkspiderItem``.

     Nothing is yielded when the page does not have exactly four
     ``li.ul-li-detail`` rows, when the download button has no ``onclick``
     attribute, or when the app id is already in ``self.apkbf``; otherwise
     the id is added to it.  The category comes from
     ``response.meta['category']``.
     """
     soup = bs4.BeautifulSoup(response.text, "html.parser")
     info = soup.select('div.app-info.flt')[0]
     commonname = info.select('.title')[0].get_text()
     category = response.meta['category']
     platform = self.name
     detailinfos = info.select('li.ul-li-detail')
     if len(detailinfos) != 4:
         return
     # The four rows are read, in order, as size, update time, developer
     # and version.
     size, updatetime, developer, version = [
         row.select('span')[0].get_text() for row in detailinfos
     ]
     # Only list entries whose text starts with the bullet are kept.
     permissionlist = [
         entry.get_text()
         for entry in soup.select('.hidepermission')[0].select('li')
         if entry.get_text().startswith(u'·')
     ]
     download_btn = soup.select('a.mkapp-btn.mab-download')[0]
     if not download_btn.has_attr("onclick"):
         return
     # The onclick text is split on single quotes: piece 1 is used as the
     # app id and piece 11 as the download URL.
     pieces = download_btn['onclick'].split('\'')
     apkid = pieces[1]
     # Keep everything before '?sign'.  split() leaves the URL intact when
     # the marker is absent, whereas slicing at find()'s -1 would silently
     # drop the last character.
     urllink = pieces[11].split('?sign', 1)[0]
     print(urllink)
     packagename = urllink[urllink.rfind('/') + 1:]
     print(packagename)
     if apkid in self.apkbf:
         return
     self.apkbf.add(apkid)
     item = ItemLoader(item=ApkspiderItem(), response=response)
     item.add_value('commonname', commonname)
     item.add_value('apkplaform', platform)
     item.add_value('apkid_specifiedbyplaform', apkid)
     item.add_value('category', category)
     item.add_value('developer', developer)
     item.add_value('packagename', packagename)
     item.add_value('updatetime', updatetime)
     item.add_value('size', size)
     item.add_value('version', version)
     item.add_value('permission', permissionlist)
     item.add_value('urllink', urllink)
     item.add_value('file_urls', urllink)
     item.add_value('checkpoint', self.checkpoint)
     yield item.load_item()
示例#7
0
 def parse_detail(self, response):
     """Parse the HTML in ``response.text`` and yield one ``ApkspiderItem``.

     The download URL is whatever follows ``url=`` in the download button's
     ``href``; the package name is the ``.apk`` file name in that URL without
     its extension; the app id is the number after ``soft_id/`` in
     ``response.url``.  Nothing is yielded when the id is already in
     ``self.apkbf``; otherwise the id is added to it.
     """
     soup = bs4.BeautifulSoup(response.text, 'html.parser')
     print(response.url)
     commonname = soup.select('#app-name')[0].get_text()
     size = soup.select('.s-3')[1].get_text()
     href = soup.select('.js-downLog.dbtn')[0]['href']
     # Plain raw strings: the ``ur`` prefix is a syntax error on Python 3.
     # [4:] strips the leading 'url='.
     urllink = re.search(r'url=.*', href).group()[4:]
     # '/<name>.apk' -> '<name>'
     packagename = re.search(r'/[^/]*\.apk', urllink).group()[1:-4]
     apkid = re.search(r'soft_id/([0-9]+)', response.url).group(1)
     metainfo = soup.select('.base-info')[0].select('td')
     # Each cell's text is kept from just after its first ':' onwards.
     developer = metainfo[0].get_text()
     developer = developer[developer.find(u':') + 1:]
     version = metainfo[2].get_text()
     version = version[version.find(u':') + 1:]
     updatetime = metainfo[1].get_text()
     updatetime = updatetime[updatetime.find(u':') + 1:]
     permission_lines = soup.select('#authority-panel')[0].select(
         'p')[0].get_text().split('\n')
     category = response.meta['category']
     # Only lines that start with '-' once stripped are kept.
     permissionlist = [
         line.strip() for line in permission_lines
         if line.strip().startswith(u'-')
     ]
     if apkid in self.apkbf:
         return
     self.apkbf.add(apkid)
     item = ItemLoader(item=ApkspiderItem(), response=response)
     item.add_value('commonname', commonname)
     item.add_value('apkplaform', self.name)
     item.add_value('apkid_specifiedbyplaform', apkid)
     item.add_value('category', category)
     item.add_value('developer', developer)
     item.add_value('packagename', packagename)
     item.add_value('updatetime', updatetime)
     item.add_value('size', size)
     item.add_value('version', version)
     item.add_value('permission', permissionlist)
     item.add_value('urllink', urllink)
     item.add_value('file_urls', urllink)
     item.add_value('checkpoint', self.checkpoint)
     yield item.load_item()
示例#8
0
 def parse_detail(self, response):
     """Parse the HTML in ``response.text`` and yield one ``ApkspiderItem``.

     Nothing is yielded when the ``.download_app`` element has no
     ``onclick`` attribute, when that attribute is exactly
     ``'return false;'``, or when the package name (the last path segment of
     ``response.url``) is already in ``self.apkbf``; otherwise the package
     name is added to it.  The package name is also used as the
     platform-specific id.
     """
     soup = bs4.BeautifulSoup(response.text, 'html.parser')
     print(response.url)
     download_node = soup.select('.download_app')[0]
     if (not download_node.has_attr('onclick')
             or download_node['onclick'] == 'return false;'):
         return
     # The URL is the text between the first and last single quote of the
     # handler.  Plain raw string: ``ur''`` is Python-2-only syntax.
     urllink = re.search(r'\'.*\'', download_node['onclick']).group()[1:-1]
     commonname = soup.select('.app-name')[0].get_text()
     detaillist = soup.select('.art-content')
     # Each entry's text is kept from just after its first ':' onwards.
     size = detaillist[2].get_text()
     size = size[size.find(u':') + 1:]
     version = detaillist[3].get_text()
     version = version[version.find(u':') + 1:]
     category = detaillist[6].get_text()
     category = category[category.find(u':') + 1:]
     packagename = response.url[response.url.rfind('/') + 1:]
     permissionlist = [
         perm.get_text()
         for perm in soup.select('.permissions-list')[0].find_all('li')
     ]
     if packagename in self.apkbf:
         return
     self.apkbf.add(packagename)
     item = ItemLoader(item=ApkspiderItem(), response=response)
     item.add_value('apkid_specifiedbyplaform', packagename)
     item.add_value('commonname', commonname)
     item.add_value('apkplaform', self.name)
     item.add_value('category', category)
     item.add_value('packagename', packagename)
     item.add_value('size', size)
     item.add_value('version', version)
     item.add_value('permission', permissionlist)
     item.add_value('urllink', urllink)
     item.add_value('file_urls', urllink)
     item.add_value('checkpoint', self.checkpoint)
     yield item.load_item()
示例#9
0
 def parse_detail(self, response):
     """Parse the HTML in ``response.text`` and yield one ``ApkspiderItem``.

     The app id is the first run of digits in the download anchor's
     ``onclick`` attribute.  The download URL is obtained from
     ``Proxy(apkid, self.downloadgate % int(apkid)).get_downloadaddress()``
     and the package name is the ``.html`` file name in ``response.url``
     without its extension.  Nothing is yielded when the id is already in
     ``self.apkbf``; otherwise the id is added to it.
     """
     print(response.url)
     soup = bs4.BeautifulSoup(response.text, 'html.parser')
     appdetail = soup.select('.app_detail')[0]
     commonname = appdetail.select('.detail_line')[0].select(
         'h3')[0].get_text()
     version = appdetail.select('.app_detail_version')[0].get_text()
     rows = appdetail.select('#detail_line_ul')[0].select('li')
     # Each row's text is kept from just after its first ':' onwards.
     category = rows[0].get_text()
     category = category[category.find(u':') + 1:]
     updatetime = rows[2].get_text()
     updatetime = updatetime[updatetime.find(u':') + 1:]
     size = rows[3].get_text()
     size = size[size.find(u':') + 1:]
     developer = rows[6].get_text()
     developer = developer[developer.find(u':') + 1:]
     # Plain raw strings: the ``ur`` prefix is a syntax error on Python 3.
     onclick = soup.select('.detail_down')[0].select('a')[0]['onclick']
     apkid = re.search(r'[0-9]+', onclick).group()
     # Check for duplicates before asking Proxy for the download address so
     # already-seen ids do not trigger that lookup.
     if apkid in self.apkbf:
         return
     dlg = self.downloadgate % int(apkid)
     urllink = Proxy(apkid, dlg).get_downloadaddress()
     # '/<name>.html' -> '<name>'
     packagename = re.search(r'/[^/]*\.html', response.url).group()[1:-5]
     # Recorded only once every lookup above has succeeded.
     self.apkbf.add(apkid)
     item = ItemLoader(item=ApkspiderItem(), response=response)
     item.add_value('commonname', commonname)
     item.add_value('apkplaform', self.name)
     item.add_value('apkid_specifiedbyplaform', apkid)
     item.add_value('category', category)
     item.add_value('developer', developer)
     item.add_value('packagename', packagename)
     item.add_value('updatetime', updatetime)
     item.add_value('size', size)
     item.add_value('version', version)
     item.add_value('urllink', urllink)
     item.add_value('file_urls', urllink)
     item.add_value('checkpoint', self.checkpoint)
     yield item.load_item()
示例#10
0
 def parse_detail(self, response):
     """Parse the HTML in ``response.text`` and yield one ``ApkspiderItem``.

     The version is the first dotted number in the ``<h1>`` title and the
     app id is the first run of digits in ``response.url``.  The button's
     ``href`` is passed through ``Proxy(0, url).get_downloadaddress()`` up
     to ``self.TRY_NUM`` times until it contains ``.apk``; the package name
     is that ``.apk`` file name without its extension.  Nothing is yielded
     when the id is already in ``self.apkbf``; otherwise the id is added to
     it.
     """
     soup = bs4.BeautifulSoup(response.text, 'html.parser')
     infosoup = soup.select('.info_box')[0]
     commonname = infosoup.select('h1')[0].get_text()
     # Plain raw strings: the ``ur`` prefix is a syntax error on Python 3.
     version = re.search(r'[0-9\.]+', commonname).group()
     metainfolist = infosoup.select('em')
     category = metainfolist[0].get_text()
     updatetime = metainfolist[1].get_text()
     size = metainfolist[3].get_text()
     developer = metainfolist[4].get_text()
     urllink = soup.select('.btn_android')[0]['href']
     apkid = re.search(r'[0-9]+', response.url).group()
     # Check for duplicates before any Proxy lookup so already-seen ids do
     # not trigger one.
     if apkid in self.apkbf:
         return
     for _ in range(self.TRY_NUM):
         if '.apk' in urllink:
             break
         urllink = Proxy(0, urllink).get_downloadaddress()
     # The dot is escaped so only a literal ".apk" matches; unescaped,
     # '//apk' at the start of a host name would already satisfy the
     # pattern.  [1:-4] turns '/<name>.apk' into '<name>'.
     packagename = re.search(r'/[^/]*\.apk', urllink).group()[1:-4]
     # Recorded only once every lookup above has succeeded.
     self.apkbf.add(apkid)
     item = ItemLoader(item=ApkspiderItem(), response=response)
     item.add_value('commonname', commonname)
     item.add_value('apkplaform', self.name)
     item.add_value('apkid_specifiedbyplaform', apkid)
     item.add_value('category', category)
     item.add_value('developer', developer)
     item.add_value('packagename', packagename)
     item.add_value('updatetime', updatetime)
     item.add_value('size', size)
     item.add_value('version', version)
     item.add_value('urllink', urllink)
     item.add_value('file_urls', urllink)
     item.add_value('checkpoint', self.checkpoint)
     yield item.load_item()
示例#11
0
 def parse_detail(self, response):
     """Parse the HTML in ``response.text`` and yield one ``ApkspiderItem``.

     The category comes from ``response.meta['category']`` and the app id is
     the number after ``detail_`` in ``response.url``.  The display name
     doubles as the package name.  Nothing is yielded when the id is already
     in ``self.apkbf``; otherwise the id is added to it.
     """
     soup = bs4.BeautifulSoup(response.text, 'html.parser')
     appinfo = soup.select('.app-info')[0]
     commonname = appinfo.select('.app-title')[0].get_text()
     pls = soup.select('.permission-list')
     # The permission list is optional; without it the list stays empty.
     permissionlist = []
     if pls:
         permissionlist = [
             perm.get_text()
             for perm in pls[0].select('.clearfix')[0].find_all('li')
         ]
     category = response.meta['category']
     detail_info = soup.select('.app-detail-info')[0].select('strong')
     size = detail_info[1].get_text()
     updatetime = detail_info[0].get_text()
     version = detail_info[2].get_text()
     urllink = soup.select('.btn-install')[0]['appdownurl']
     platform = self.name
     # Plain raw string: the ``ur`` prefix is a syntax error on Python 3.
     apkid = re.search(r'detail_([0-9]+)', response.url).group(1)
     packagename = commonname
     if apkid in self.apkbf:
         return
     # Record the id; without this the membership test above can never
     # succeed for ids first seen by this callback.
     self.apkbf.add(apkid)
     print("apkid%s" % apkid)
     item = ItemLoader(item=ApkspiderItem(), response=response)
     item.add_value('commonname', commonname)
     item.add_value('apkid_specifiedbyplaform', apkid)
     item.add_value('apkplaform', platform)
     item.add_value('category', category)
     item.add_value('packagename', packagename)
     item.add_value('updatetime', updatetime)
     item.add_value('size', size)
     item.add_value('version', version)
     item.add_value('permission', permissionlist)
     item.add_value('urllink', urllink)
     item.add_value('file_urls', urllink)
     item.add_value('checkpoint', self.checkpoint)
     yield item.load_item()
示例#12
0
 def parse_download(self, response):
     """Build one ``ApkspiderItem`` from a JSON download response.

     Nothing is yielded unless the payload's ``code`` equals 200.  The
     download URL is read from ``value.downloadUrl``; every other field
     comes from ``response.meta``.  ``meta['packagename']`` is the key
     checked against, and then added to, ``self.apkbf``.
     """
     payload = json.loads(response.body_as_unicode())
     if payload['code'] != 200:
         return
     download_url = payload['value']['downloadUrl']
     meta = response.meta
     package_id = meta['packagename']
     if package_id in self.apkbf:
         return
     self.apkbf.add(package_id)
     loader = ItemLoader(item=ApkspiderItem(), response=response)
     loader.add_value('apkid_specifiedbyplaform', package_id)
     loader.add_value('commonname', meta['commonname'])
     loader.add_value('apkplaform', meta['platform'])
     # These fields are copied from the request meta under the same name.
     for field in ('category', 'developer', 'packagename', 'updatetime',
                   'size', 'version'):
         loader.add_value(field, meta[field])
     loader.add_value('urllink', download_url)
     loader.add_value('file_urls', download_url)
     loader.add_value('checkpoint', self.checkpoint)
     yield loader.load_item()
示例#13
0
 def parse_download(self, response):
     """Build one ``ApkspiderItem`` from a JSON download response.

     Nothing is yielded unless the payload's ``errno`` equals 0, or when
     ``response.meta['appid']`` is already in ``self.apkbf``; otherwise the
     id is added to it.  The download URL is ``data.file_url`` passed
     through ``Proxy(0, url).get_downloadaddress()``; every other field
     comes from ``response.meta``.
     """
     json_response = json.loads(response.body_as_unicode())
     if json_response['errno'] != 0:
         return
     meta = response.meta
     appid = meta['appid']
     # Check for duplicates before asking Proxy for the download address so
     # already-seen ids do not trigger that lookup.
     if appid in self.apkbf:
         return
     downloadurl = json_response['data']['file_url']
     downloadurl = Proxy(0, downloadurl).get_downloadaddress()
     # Recorded only once the lookup above has succeeded.
     self.apkbf.add(appid)
     item = ItemLoader(item=ApkspiderItem(), response=response)
     item.add_value('commonname', meta['commonname'])
     item.add_value('apkplaform', self.name)
     item.add_value('apkid_specifiedbyplaform', appid)
     item.add_value('category', meta['category'])
     item.add_value('developer', meta['developer'])
     item.add_value('packagename', meta['packagename'])
     item.add_value('updatetime', meta['updatetime'])
     item.add_value('size', meta['size'])
     item.add_value('version', meta['version'])
     item.add_value('urllink', downloadurl)
     item.add_value('file_urls', downloadurl)
     item.add_value('checkpoint', self.checkpoint)
     yield item.load_item()
示例#14
0
 def parse_detail(self, response):
     """Parse the HTML in ``response.text`` and yield one ``ApkspiderItem``.

     Nothing is yielded unless the page has exactly one ``a.download`` link,
     one ``div .details.preventDefault`` block and one ``div .intro-titles``
     block, and the details block holds exactly one ``ul.cf`` and one
     ``ul.second-ul``.  Nothing is yielded either when the extracted app id
     is already in ``self.apkbf``.

     Every text value is encoded to UTF-8 before it is compared or sliced,
     so the slice offsets below count bytes.
     NOTE(review): the comparisons against plain string literals can only
     match encoded values where ``str`` is a byte string (Python 2) --
     confirm the target interpreter.
     """
     soup = bs4.BeautifulSoup(response.text, "html.parser")
     downloadlinks = soup.find_all("a", class_='download')
     hideinfos = soup.select("div .details.preventDefault")
     overviews = soup.select("div .intro-titles")
     # Bail out unless each of the three sections appears exactly once.
     if not (len(overviews) == 1 and len(hideinfos) == 1
             and len(downloadlinks) == 1):
         return
     downloadlink = downloadlinks[0]
     overview = overviews[0]
     # Display name: text of the first <h3> in the overview.
     commonname = overview.select("h3")[0].text.encode(encoding='UTF-8',
                                                       errors='strict')
     # Platform label is hard-coded for this callback.
     apkplaform = "xiaomi"
     # Filled in from the label/value list further down.
     apkid_specifiedbyplaform = ""
     # Category: the slice runs from 3 bytes past the first ":" up to the
     # first "|".
     # NOTE(review): the +3 presumably skips separator bytes after the
     # colon -- confirm against a live page.
     category = overview.select("p.special-font.action")[0].text.encode(
         encoding='UTF-8', errors='strict')
     category = category[category.find(":") + 3:category.find("|")]
     # Developer: text of the first <p> in the overview.
     developer = overview.select("p")[0].text.encode(encoding='UTF-8',
                                                     errors='strict')
     # Filled in from the label/value list further down.
     packagename = ""
     # Filled in from the label/value list further down.
     size = ""
     # Filled in from the label/value list further down.
     version = ""
     # Filled in from the permission list further down.
     permissionlist = list()
     # Download link resolved against self.base_url.
     urlink = urlparse.urljoin(self.base_url, downloadlink['href'])
     # Filled in from the label/value list further down.
     updatetime = ''
     # The general-info list must be present exactly once.
     hideinfogenes = hideinfos[0].select('ul.cf')
     if not len(hideinfogenes) == 1:
         return
     hideinfogene = hideinfogenes[0]
     generalinfos = hideinfogene.select('li')
     # The <li> elements are consumed front to back: pop one as a label and,
     # if the label is recognised, pop the next one as its value.  An
     # unrecognised label is discarded on its own.  The loop stops once
     # fewer than two elements remain.
     while len(generalinfos) > 1:
         infodes = generalinfos.pop(0).text.encode(encoding='UTF-8',
                                                   errors='strict')
         if infodes.strip() == 'appId:':
             apkid_specifiedbyplaform = generalinfos.pop(0).text.encode(
                 encoding='UTF-8', errors='strict')
         elif infodes.strip() == '更新时间:':
             updatetime = generalinfos.pop(0).text.encode(encoding='UTF-8',
                                                          errors='strict')
         elif infodes.strip() == '包名:':
             packagename = generalinfos.pop(0).text.encode(encoding='UTF-8',
                                                           errors='strict')
         elif infodes.strip() == '版本号:':
             version = generalinfos.pop(0).text.encode(encoding='UTF-8',
                                                       errors='strict')
         elif infodes.strip() == '软件大小:':
             size = generalinfos.pop(0).text.encode(encoding='UTF-8',
                                                    errors='strict')
     # The permission list must be present exactly once.
     permissioninfos = hideinfos[0].select('ul.second-ul')
     if not len(permissioninfos) == 1:
         return
     permissions = permissioninfos[0].select('li')
     while len(permissions) > 0:
         permission = permissions.pop(0).text.encode(encoding='UTF-8',
                                                     errors='strict')
         # Drop the first 3 bytes, then surrounding whitespace.
         # NOTE(review): presumably a multi-byte bullet character -- confirm.
         permission = permission[3:].strip()
         permissionlist.append(permission)
     # NOTE(review): the id is checked against self.apkbf but never added to
     # it in this method -- confirm it is recorded elsewhere.
     if apkid_specifiedbyplaform in self.apkbf:
         return
     item = ItemLoader(item=ApkspiderItem(), response=response)
     item.add_value('commonname', commonname)
     item.add_value('apkplaform', apkplaform)
     item.add_value('apkid_specifiedbyplaform', apkid_specifiedbyplaform)
     item.add_value('category', category)
     item.add_value('developer', developer)
     item.add_value('packagename', packagename)
     item.add_value('updatetime', updatetime)
     item.add_value('size', size)
     item.add_value('version', version)
     item.add_value('permission', permissionlist)
     item.add_value('urllink', urlink)
     item.add_value('file_urls', urlink)
     item.add_value('checkpoint', self.checkpoint)
     yield item.load_item()