def _get_rating(self, rating): try: return int(rating.lstrip(u'rank_now')) except Exception as e: log_error(e) return 0
def _get_rating(self, rating): try: if rating: rating = float(-int(rating.split()[1][:-3])) / 30 return rating except Exception as e: log_error(e) return 0
def item_completed(self, results, item, info):
    """Store the path of the first successfully downloaded icon on ``item``.

    ``results`` is iterated as ``(ok, result)`` pairs. Every failed pair seen
    before the first success is logged; the first successful pair's
    ``result['path']`` is written to ``item['icon_path']`` and iteration
    stops. ``info`` is not used. Returns ``item``.
    """
    for succeeded, payload in results:
        if succeeded:
            item['icon_path'] = payload['path']
            break
        log_error("fail to download icon for %s" % item['source_link'])
    return item
def parse_item(self, response):
    """Parse a leaf page response into an item.

    Steps, in order:
      1. Read the source domain from the request meta and normalise the
         request URL with ``self.sourcelinkprocessor_class`` when set.
      2. Run ``self._process_response``; on failure report a FAIL status,
         remove the app from the market store and stop.
      3. For spiders that are neither ``update.*`` nor ``itunes.apple.com``,
         call ``self.parse(response)``.
      4. Build an item loader, attach ``source``/``source_link`` and load
         the item.

    Returns the loaded item when ``self.is_item_valid`` accepts it,
    otherwise ``None``.
    """
    meta = response.request.meta
    source = meta['domain']
    log_info('parse_item_1===========')
    request_url = response.request.url
    url = request_url
    if self.sourcelinkprocessor_class:
        processor = self.sourcelinkprocessor_class()
        url = processor.process(url)

    if not self._process_response(response, source, LinkType.LEAF):
        # 'redirect_urls' may be missing from meta (Scrapy's redirect
        # middleware only records it after a redirect); fall back to the
        # URL that was actually requested instead of raising KeyError.
        redirect_urls = meta.get('redirect_urls') or [request_url]
        # The original passed the builtin ``type`` here; use the link type
        # that was just processed.
        service.report_status([
            LinkStatus(redirect_urls[0], source, Status.FAIL, LinkType.LEAF)
        ])
        market.remove_app(url, source)
        log_info('parse_item_2===========')
        return

    if not self.name.startswith('update.') and self.name != 'itunes.apple.com':
        # NOTE(review): the return value is discarded; if ``parse`` is a
        # generator this call has no effect - confirm intent.
        self.parse(response)

    if source.endswith('hiapk.com'):
        # Rewrite the malformed '</br>' tags before building the selector.
        body = response.body.replace('</br>', '<p>')
        response = response.replace(body=body)

    if not self.itemloader_class:
        log_info('parse_item_3===========')
        return

    try:
        selector = HtmlXPathSelector(response)
        try:
            loader = self.itemloader_class(selector, response=response)
        except Exception:
            # Fall back to constructing the loader without ``response``.
            loader = self.itemloader_class(selector)
        loader.add_value('source', source)
        loader.add_value('source_link', url)
    except Exception as e:
        log_info('parse_item_4===========\n%s' % e)
        log_error(e)
        failure = [LinkStatus(url, source, Status.FAIL, LinkType.UNKNOWN)]
        if self.name.startswith('update.'):
            service.report_update_status(failure)
        else:
            service.report_status(failure)
        # The loader may not exist at this point; do not try to load from it.
        return

    log_info('parse_item_5===========')
    try:
        item = loader.load_item()
        if self.is_item_valid(item):
            return item
        market.remove_app(url, source)
    except Exception as e:
        log_error(e)
def item_completed(self, results, item, info):
    """Collect the paths of all successfully downloaded images onto ``item``.

    ``results`` is iterated as ``(ok, result)`` pairs. Failed pairs are
    logged; each successful pair contributes ``result['path']``. The paths
    are joined with single spaces and stored in ``item['images_path']``
    (an empty string when nothing succeeded). ``info`` is not used.
    Returns ``item``.
    """
    downloaded = []
    for succeeded, payload in results:
        if succeeded:
            downloaded.append(payload['path'])
            continue
        log_error("fail to download image for %s" % item['source_link'])
    item['images_path'] = ' '.join(downloaded)
    return item
def process_item(self, item, spider):
    """Run ``item`` through the adapter registered for its ``source``.

    The adapter is looked up with ``ItemAdapterFactory.get_itemadapter``
    using ``item.get('source')``. When one is found its ``adapt`` result is
    returned; otherwise ``item`` is returned unchanged. ``spider`` is not
    used.

    Raises:
        DropItem: when the lookup or the adaptation raises; the error is
            printed and logged first.
    """
    try:
        adapter = ItemAdapterFactory.get_itemadapter(item.get('source'))
        adapted = adapter.adapt(item) if adapter else item
        return adapted
    except Exception as err:
        print("------------------------%s" % err)
        log_error(err)
        raise DropItem()
def _get_download_link(self, download_link): try: if download_link: download_link_match = self._download_link_pattern.search(download_link) if download_link_match: download_link = '%s/dl_app.php?s=%s' % (self._base_url, download_link_match.group(1)) return download_link except Exception as e: log_error(e) return download_link
def _thrift_call(func):
    """Open a Thrift connection, call ``func(client)`` and close it again.

    A buffered binary-protocol ``Links.Client`` is built against the host
    and port in ``SERVICE_CONFIG``.

    Returns:
        Whatever ``func(client)`` returns, or ``None`` when a
        ``Thrift.TException`` is raised (the exception is logged).
    """
    # Bound before the try block so that ``finally`` never hits an unbound
    # name if constructing the socket itself fails.
    transport = None
    try:
        transport = TSocket.TSocket(SERVICE_CONFIG['host'],
                                    SERVICE_CONFIG['port'])
        transport = TTransport.TBufferedTransport(transport)
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = Links.Client(protocol)
        transport.open()
        return func(client)
    except Thrift.TException as tx:
        log_error(tx)
    finally:
        if transport is not None:
            transport.close()
def save_download_link(item):
    """Insert ``item['url']`` into ``apk_links`` (duplicates are ignored).

    Does nothing for an empty/None ``item``. Errors are logged and not
    re-raised. Returns ``None``.
    """
    if not item:
        return
    # Bound before the try block so that ``finally`` cannot raise
    # UnboundLocalError when ``_conn.cursor()`` itself fails.
    cursor = None
    try:
        cursor = _conn.cursor()
        sql = 'insert ignore into apk_links(link, updated_at) values (%s, now())'
        cursor.execute(sql, (item['url'], ))
        _conn.commit()
    except Exception as e:
        log_error(e)
    finally:
        if cursor is not None:
            cursor.close()
def report_link(source, catetory, link, description=''):
    """Insert one row into the link-monitor table.

    Args:
        source: value for the ``source`` column.
        catetory: value for the ``category`` column (parameter spelling is
            kept for existing callers).
        link: value for the ``link`` column.
        description: value for the ``description`` column.

    ``create_time`` is filled from ``get_epoch_datetime()``.
    ``MySQLdb.Error`` is logged and not re-raised. Returns ``None``.
    """
    # Bound before the try block so that ``finally`` is safe when
    # ``_conn.cursor()`` fails.
    cursor = None
    try:
        cursor = _conn.cursor()
        # Only the table name (module configuration) is interpolated; the
        # values are passed as query parameters so the driver escapes them.
        insert_sql = (
            "INSERT INTO %s (source, category, link, description, create_time) "
            "VALUES (%%s, %%s, %%s, %%s, %%s)" % _link_monitor_table
        )
        cursor.execute(
            insert_sql,
            (source, catetory, link, description, get_epoch_datetime()))
        _conn.commit()
    except MySQLdb.Error as e:
        log_error(e)
    finally:
        if cursor is not None:
            cursor.close()
def _get_download_link(self, download_link): try: if download_link: download_link_match = self._download_link_pattern.search( download_link) if download_link_match: download_link = '%s/dl_app.php?s=%s' % ( self._base_url, download_link_match.group(1)) return download_link except Exception as e: log_error(e) return download_link
def parse_item(self, response):
    """Parse a leaf page response into an item.

    Steps, in order:
      1. Read the source domain from the request meta and normalise the
         request URL with ``self.sourcelinkprocessor_class`` when set.
      2. Run ``self._process_response``; on failure report a FAIL status,
         remove the app from the market store and stop.
      3. For spiders that are neither ``update.*`` nor ``itunes.apple.com``,
         call ``self.parse(response)``.
      4. Build an item loader, attach ``source``/``source_link`` and load
         the item.

    Returns the loaded item when ``self.is_item_valid`` accepts it,
    otherwise ``None``.
    """
    meta = response.request.meta
    source = meta['domain']
    log_info('parse_item_1===========')
    request_url = response.request.url
    url = request_url
    if self.sourcelinkprocessor_class:
        processor = self.sourcelinkprocessor_class()
        url = processor.process(url)

    if not self._process_response(response, source, LinkType.LEAF):
        # 'redirect_urls' may be missing from meta (Scrapy's redirect
        # middleware only records it after a redirect); fall back to the
        # URL that was actually requested instead of raising KeyError.
        redirect_urls = meta.get('redirect_urls') or [request_url]
        # The original passed the builtin ``type`` here; use the link type
        # that was just processed.
        service.report_status([
            LinkStatus(redirect_urls[0], source, Status.FAIL, LinkType.LEAF)
        ])
        market.remove_app(url, source)
        log_info('parse_item_2===========')
        return

    if not self.name.startswith('update.') and self.name != 'itunes.apple.com':
        # NOTE(review): the return value is discarded; if ``parse`` is a
        # generator this call has no effect - confirm intent.
        self.parse(response)

    if source.endswith('hiapk.com'):
        # Rewrite the malformed '</br>' tags before building the selector.
        body = response.body.replace('</br>', '<p>')
        response = response.replace(body=body)

    if not self.itemloader_class:
        log_info('parse_item_3===========')
        return

    try:
        selector = HtmlXPathSelector(response)
        try:
            loader = self.itemloader_class(selector, response=response)
        except Exception:
            # Fall back to constructing the loader without ``response``.
            loader = self.itemloader_class(selector)
        loader.add_value('source', source)
        loader.add_value('source_link', url)
    except Exception as e:
        log_info('parse_item_4===========\n%s' % e)
        log_error(e)
        failure = [LinkStatus(url, source, Status.FAIL, LinkType.UNKNOWN)]
        if self.name.startswith('update.'):
            service.report_update_status(failure)
        else:
            service.report_status(failure)
        # The loader may not exist at this point; do not try to load from it.
        return

    log_info('parse_item_5===========')
    try:
        item = loader.load_item()
        if self.is_item_valid(item):
            return item
        market.remove_app(url, source)
    except Exception as e:
        log_error(e)
def _get_downloads(self, downloads): try: downloads = downloads.replace(u'\u4e0b\u8f7d', '').replace( u'\u5c0f\u4e8e', '').replace(u'\u5927\u4e8e', '').strip() if u'\u4e07\u6b21' in downloads: downloads = int(downloads.replace(u'\u4e07\u6b21', '')) * 10000 elif u'\u5343\u6b21' in downloads: downloads = int(downloads.replace(u'\u5343\u6b21', '')) * 1000 elif u'\u6b21' in downloads: downloads = int(downloads.replace(u'\u6b21', '')) except Exception as e: log_error(e) return downloads
def _get_downloads(self, download_link): try: if download_link: app_id = download_link.split(u'/')[-1] # have to pass some data to make it a POST request data = urllib.urlencode({'foo': 'bar'}) response = urllib2.urlopen(self._download_post_url + app_id, data, timeout=15) result = response.read() return int(result) except Exception as e: log_error(e) return 0
def get_category(app_id):
    """Look up the category name stored for ``app_id`` in ``itunes_game_cate``.

    Returns:
        The ``cate_name`` of the first matching row; the default
        u'\\u52a8\\u4f5c\\u6e38\\u620f' ("action game") when there is no
        row; ``None`` when a ``MySQLdb.Error`` occurs (it is logged).
    """
    # Bound before the try block so that ``finally`` is safe when
    # ``_conn.cursor()`` fails.
    cursor = None
    try:
        cursor = _conn.cursor()
        sql = "SELECT cate_name FROM itunes_game_cate where app_id = %s"
        # Query arguments are passed as a sequence, as DB-API drivers expect.
        cursor.execute(sql, (app_id, ))
        result = cursor.fetchone()
        if not result:
            return u'\u52a8\u4f5c\u6e38\u620f'
        return result[0]
    except MySQLdb.Error as e:
        log_error(e)
        return None
    finally:
        if cursor is not None:
            cursor.close()
def _get_rating(self, rating): try: if rating: rating = float(rating) rating = int(round(rating * 10)) if rating < 0: return 0 elif rating > 50: return 50 else: return rating except Exception as e: log_error(e) return 0
def process_item(self, item, spider):
    """Validate ``item`` and persist it through ``market``.

    When the spider defines ``is_item_valid`` it is called as
    ``spider.is_item_valid(item, 1)``. Valid ``CrawledItem`` instances are
    saved with ``market.save_app`` and ``DownloadLinkItem`` instances with
    ``market.save_download_link``; other item types are passed through
    untouched.

    Returns:
        ``item``.

    Raises:
        DropItem: when the item is invalid (the "invalid item" reason is
            preserved) or when saving fails (the reason carries the
            underlying error).
    """
    log_info("Doing=============")
    try:
        if hasattr(spider, 'is_item_valid') and not spider.is_item_valid(item, 1):
            raise DropItem("invalid item: %s" % item)
        if isinstance(item, CrawledItem):
            market.save_app(item, spider.name)
        elif isinstance(item, DownloadLinkItem):
            market.save_download_link(item)
        return item
    except DropItem as dropped:
        # Re-raise as is so the drop reason is not replaced by an empty
        # DropItem() in the generic handler below.
        log_error(dropped)
        raise
    except Exception as e:
        print("------------------------%s" % e)
        log_error(e)
        traceback.print_exc()
        raise DropItem("failed to save item: %s" % e)
def process_item(self, item, spider):
    """Validate ``item`` and persist it through ``market``.

    When the spider defines ``is_item_valid`` it is called as
    ``spider.is_item_valid(item, 1)``. Valid ``CrawledItem`` instances are
    saved with ``market.save_app`` and ``DownloadLinkItem`` instances with
    ``market.save_download_link``; other item types are passed through
    untouched.

    Returns:
        ``item``.

    Raises:
        DropItem: when the item is invalid (the "invalid item" reason is
            preserved) or when saving fails (the reason carries the
            underlying error).
    """
    log_info("Doing=============")
    try:
        if hasattr(spider, 'is_item_valid') and not spider.is_item_valid(item, 1):
            raise DropItem("invalid item: %s" % item)
        if isinstance(item, CrawledItem):
            market.save_app(item, spider.name)
        elif isinstance(item, DownloadLinkItem):
            market.save_download_link(item)
        return item
    except DropItem as dropped:
        # Re-raise as is so the drop reason is not replaced by an empty
        # DropItem() in the generic handler below.
        log_error(dropped)
        raise
    except Exception as e:
        print("------------------------%s" % e)
        log_error(e)
        traceback.print_exc()
        raise DropItem("failed to save item: %s" % e)
def save_app(item, name):
    """Persist a crawled app item.

    ``item['last_crawl']`` is stamped with ``get_epoch_datetime()``. Items
    from ``play.google.com`` go through ``save_final_app``; everything else
    is upserted into the table mapped from ``name`` in ``_table_dic``
    (default ``'app'``). The transaction is then committed.

    Does nothing for an empty/None ``item``. Errors are logged and not
    re-raised. Returns ``None``.
    """
    if not item:
        return
    # Bound before the try block so that ``finally`` cannot raise
    # UnboundLocalError when ``_conn.cursor()`` itself fails.
    cursor = None
    try:
        cursor = _conn.cursor()
        # record last_crawl time
        item['last_crawl'] = get_epoch_datetime()
        if name == 'play.google.com':
            save_final_app(cursor, item)
        else:
            _upsert_item(cursor, item, _table_dic.get(name, 'app'))
        _conn.commit()
    except Exception as e:
        log_error(e)
    finally:
        if cursor is not None:
            cursor.close()
def process_item(self, item, spider):
    """Queue the item's icon and image URLs via ``market.push_image_url``.

    Two dicts are pushed, in this order: one with ``item['icon_link']`` and
    source ``'icon'``, then one with ``item['images']`` and source
    ``'image'``; both carry ``item['source_link']``. ``spider`` is not used.

    Returns:
        ``item``.

    Raises:
        DropItem: when a field is missing or pushing fails (the error is
            logged first).
    """
    try:
        market.push_image_url({
            'url': item['icon_link'],
            'source_link': item['source_link'],
            'source': 'icon',
        })
        market.push_image_url({
            'url': item['images'],
            'source_link': item['source_link'],
            'source': 'image',
        })
        return item
    except Exception as err:
        log_error(err)
        raise DropItem()
def _get_downloads(self, downloads): try: downloads = downloads.replace( u'\u4e0b\u8f7d', '').replace( u'\u5c0f\u4e8e', '').replace( u'\u5927\u4e8e', '').strip( ) if u'\u4e07\u6b21' in downloads: downloads = int(downloads.replace(u'\u4e07\u6b21', '')) * 10000 elif u'\u5343\u6b21' in downloads: downloads = int(downloads.replace(u'\u5343\u6b21', '')) * 1000 elif u'\u6b21' in downloads: downloads = int(downloads.replace(u'\u6b21', '')) except Exception as e: log_error(e) return downloads
def get_apple_id(app_id):
    """Return the Apple account associated with ``app_id``.

    The ``app_itunes`` table is consulted first. When it has no row for
    ``app_id``, the ``apple_account`` row with the smallest ``app_num`` is
    chosen, its ``app_num`` is incremented and the change is committed.

    Returns:
        The first column of the selected row (``apple_id`` or
        ``username``), or ``None`` when no account exists or a
        ``MySQLdb.Error`` occurs (both cases are logged).
    """
    # Bound before the try block so that ``finally`` is safe when
    # ``_conn.cursor()`` fails.
    cursor = None
    try:
        cursor = _conn.cursor()
        sql = "SELECT apple_id FROM app_itunes where app_id = %s"
        cursor.execute(sql, (app_id, ))
        result = cursor.fetchone()
        if not result:
            # Reuse the same cursor; opening a second one left the first
            # unclosed.
            sql = "SELECT username FROM apple_account ORDER BY app_num ASC limit 1"
            cursor.execute(sql)
            result = cursor.fetchone()
            if not result:
                # Empty account table: nothing to hand out.
                log_error("no apple account available for app %s" % app_id)
                return None
            sql = "UPDATE apple_account set app_num = app_num + 1 WHERE username=%s"
            cursor.execute(sql, (result[0], ))
            _conn.commit()
        return result[0]
    except MySQLdb.Error as e:
        log_error(e)
        return None
    finally:
        if cursor is not None:
            cursor.close()
def _get_category(self, category): try: category = category.replace(u'\u7c7b\u522b:', '').strip() except Exception as e: log_error(e) return category