def download_file(self, doc):
    """
    Fetch the file described by ``doc`` and write it below the configured
    download path.

    :param doc: document handed to ``download_relative_path``,
        ``request_file``, ``download_file_tag`` and ``download_log``
    :return: True when the file was written, tagged and logged; False when
        the request produced no content or saving raised
    :rtype: bool
    """
    relative_path = self.download_relative_path(doc)
    base_dir = Config().get_path()
    payload = self.request_file(doc)

    # nothing came back from the request, so there is nothing to store
    if payload is None:
        log.print_err(u"file download failed : %s" % relative_path)
        return False

    try:
        target = os.path.join(base_dir, relative_path)

        # the destination directory has to exist before the file is opened
        parent_dir = os.path.dirname(target)
        if not os.path.exists(parent_dir):
            os.makedirs(parent_dir)

        with open(target, "wb") as out_file:
            out_file.write(payload)

        self.download_file_tag(target, doc)
        log.print_info(u"file download complete: %s" % relative_path)
        self.download_log(doc)
    except Exception as err:
        log.print_err("file save failed : %s, err: %s" % (relative_path, err))
        return False
    return True
def search(self):
    """
    Run a search for the type selected on the command line and print the
    result with the printer registered for that type.

    The search type is the last key of ``search_types`` whose command-line
    argument is set; ``'mp3'`` is used when none is set.
    """
    # every search type whose argument was supplied, in search_types order
    chosen = [
        name for name, _ in search_types.items()
        if getattr(self.app.pargs, name, None)
    ]
    search_key = chosen[-1] if chosen else 'mp3'

    if len(chosen) > 1:
        log.print_err("it could search by only one type")

    # input must be decoded in python2
    search_value = py2_decoding(getattr(self.app.pargs, search_key))

    res = api.search(search_value,
                     stype=search_key,
                     offset=self.app.pargs.offset or 0,
                     limit=self.app.pargs.limit or 50)
    if not res:
        log.print_info("nothing found!")
        return

    if search_key not in PRINT_ATTR_FUNC_MAP:
        return

    # entry layout: index 0 is the key inside the response, index 1 the printer
    entry = PRINT_ATTR_FUNC_MAP[search_key]
    printer = entry[1]  # type: function
    rows = res.get(entry[0], [])  # type: list
    printer(rows)
def scls_pls(self):
    """
    Crawl the top playlists of the category given by the ``cls`` argument.

    Returns early when ``cls`` is missing or is neither u"全部" nor listed in
    ``api.ALL_CLASSES``. Every playlist whose detail can be fetched is passed
    to ``playlist_mo.parse_model``.
    """
    from NXSpider.bin.models import playlist_mo

    if self.param_check(['cls'], sys._getframe().f_code.co_name) is False:
        return

    download_type = self.parse_download()
    category = py2_decoding(self.app.pargs.cls)

    known_category = (category == u"全部"
                      or py2_encoding(category) in api.ALL_CLASSES)
    if not known_category:
        log.print_err(
            "class name is wrong, pls check by run : nxspider sw-pl-classes"
        )
        return

    top_playlists = api.get_top_playlists(
        category=category,
        offset=self.app.pargs.offset or 0,
        limit=self.app.pargs.limit or 50)  # type: list

    for entry in top_playlists:
        detail = api.get_playlist_detail(entry['id'])
        if not detail:
            # no detail came back for this playlist; move on to the next one
            continue
        log.print_info(u"<{}> author:{}".format(
            detail['name'],
            detail['creator']['nickname'],
        ))
        playlist_mo.parse_model(detail,
                                download_type=download_type,
                                file_check=Config().get_file_check())

    log.print_info("spider complete!~")
def login_smv(self):
    """
    Crawl the MVs returned by ``api.my_mvs`` for a logged-in session.

    Requires the ``lu`` argument; the password is taken from ``lp`` or
    prompted for with ``getpass`` when ``lp`` is empty.
    """
    from NXSpider.bin.models import no_rec_mv_mo
    # the current function name is passed so param_check can look up its help
    if self.param_check(['lu'], sys._getframe().f_code.co_name) is False:
        return
    plaintext_pwd = self.app.pargs.lp or None
    if plaintext_pwd is None:
        import getpass
        # NOTE(review): the text between the password prompt and `exit()` was
        # replaced by `******` in this copy of the source, so the statement
        # below is not valid Python. Whatever defined `session` (used further
        # down) was presumably inside the lost span -- restore it from the
        # upstream file instead of guessing.
        plaintext_pwd = getpass.getpass("Please input your password:"******"none"))) exit()
    mvs = api.my_mvs(session)
    # fetch the detail of every listed MV, then drop the empty results
    mvs = [api.get_mv_detail(d['id']) for d in mvs]
    mvs = [d for d in mvs if d]
    for mv in mvs:
        # a fresh shortcuts stack is passed only when shortcuts are enabled
        no_rec_mv_mo.parse_model(
            mv, download_type=['mv'],
            file_check=Config().get_file_check(),
            shortcuts_stack=[] if Config().get_shortcut() else None)
    log.print_info("spider complete!~")
    pass
def config_check(self):
    """
    Show the current configuration and validate it with
    ``Config.config_test``.

    A success message is printed only when ``config_test`` returns a true
    value; ``config_test`` reports individual problems on its own.
    """
    self.config_show()
    try:
        if Config().config_test():
            log.print_info('config check complete, all is well done!')
    except (Exception, SystemExit):
        # SystemExit is caught on purpose: config_test imports the mongo
        # model module, which calls exit() when the server is unreachable.
        # KeyboardInterrupt is no longer swallowed, so Ctrl-C still aborts.
        log.print_err('config check failed, pls re config')
def config_spider(self):
    """
    Apply spider settings given on the command line and persist them.

    Handled arguments:

    * ``path_download`` -- comma separated download directories; each
      ``default_path_key`` placeholder is replaced by ``default_download_dir``,
      directories that cannot be created are dropped, and the default
      directory is used when none is left
    * ``mv_resolution`` -- stored only when it is one of ``mv_resolutions``;
      any other number is reported and skipped
    * ``media_tag`` / ``media_tag_163`` -- ``'true'`` (any case) or ``'1'``
      enable the flag, anything else disables it

    The config file is saved only when at least one setting changed, and the
    resulting configuration is always shown at the end.

    :raises Exception: re-raises whatever a malformed argument caused (e.g.
        ``ValueError`` for a non-numeric resolution) after logging it
    """
    config = Config()
    config_dict = config.config  # type: dict
    is_config = False
    try:
        pargs = self.app.pargs

        if pargs.path_download is not None:
            paths = [
                default_download_dir if p == default_path_key else p
                for p in pargs.path_download.split(',')
            ]

            final_paths = []
            for p in paths:
                try:
                    if not os.path.isdir(p):
                        os.mkdir(p)
                    final_paths.append(p)
                except Exception:
                    # best effort: an unusable path is dropped, not fatal
                    log.print_warn("path may be wrong and be deleted: {}".format(p))

            if not final_paths:
                final_paths.append(default_download_dir)

            log.print_info('path will be set as: ' + ','.join(final_paths))
            config_dict['download_path'] = final_paths
            is_config = True

        if pargs.mv_resolution is not None:
            r = int(pargs.mv_resolution)
            if r in mv_resolutions:
                config_dict['mv_def_resolution'] = r
                is_config = True
            else:
                # really skip the value: it used to be stored despite the warning
                log.print_warn("-mvr resolution config skip, value must be 240,480,720,1080")

        if pargs.media_tag is not None:
            config_dict['media_tag'] = pargs.media_tag.lower() in ('true', '1')
            is_config = True

        if pargs.media_tag_163 is not None:
            config_dict['media_tag_163'] = pargs.media_tag_163.lower() in ('true', '1')
            is_config = True
    except Exception:
        log.print_err("input error, pls check")
        raise

    if is_config:
        config.save_config_file()
        log.print_info("config success")
    self.config_show()
def api_request(url, data=None, method="get", json=True, session=None, headers=headers, encrypt=True, https=False):
    """
    Send one API request and return the decoded response.

    :param url: path appended to ``base_url`` (or ``base_https_url``)
    :param data: request payload; it is not modified, a copy is made when
        the CSRF token has to be added
    :param method: ``"get"`` is only used when there is no payload after the
        optional encryption step; in every other case the request is a post
    :param json: return the parsed JSON body when true, the raw text otherwise
    :param session: session to send the request through; the ``requests``
        module itself is used when omitted
    :type session: requests.Session
    :param headers: headers sent with the request
    :param encrypt: pass the payload through ``encrypted_request`` first
    :param https: build the url from ``base_https_url`` instead of ``base_url``
    :return: parsed JSON or response text; ``{}`` when the request fails or
        the body is not valid JSON
    """
    url = (base_https_url if https else base_url) + url
    request_obj = session or requests

    # forward the session's CSRF cookie inside the payload
    if isinstance(request_obj, requests.Session):
        for cookie in request_obj.cookies:
            if cookie.name == '__csrf':
                # Work on a copy: a missing payload (None) used to raise
                # TypeError here, and the caller's dict used to be modified.
                data = dict(data) if data else {}
                data['csrf_token'] = cookie.value
                break

    if encrypt:
        data = encrypted_request(data)

    method = 'get' if not data and method == 'get' else 'post'
    request_method = getattr(request_obj, method, None) or request_obj.get

    try:
        req = request_method(url, data=data, headers=headers, timeout=10)
        req.encoding = "UTF-8"
        return req.json() if json else req.text
    except ValueError:
        log.print_err("api do not return a valuable json")
        return {}
    except requests.exceptions.RequestException:
        log.print_warn("request error: %s" % url)
        return {}
def login_spls(self):
    """
    Crawl the playlists of the logged-in user.

    Requires the ``lu`` and ``lp`` arguments. The playlists returned by
    ``api.user_playlist`` are printed and each one whose detail can be
    fetched is handed to ``playlist_mo.parse_model``.
    """
    # the current function name is passed so param_check can look up its help
    if self.param_check(['lu', 'lp'], sys._getframe().f_code.co_name) is False:
        return
    from NXSpider.bin.models import playlist_mo
    plaintext_pwd = self.app.pargs.lp or None
    if plaintext_pwd is None:
        import getpass
        # NOTE(review): the text between the password prompt and `exit()` was
        # replaced by `******` in this copy of the source, so the statement
        # below is not valid Python. Whatever defined `res` (used further
        # down) was presumably inside the lost span -- restore it from the
        # upstream file instead of guessing.
        plaintext_pwd = getpass.getpass("Please input your password:"******"none"))) exit()
    user_id = res['account']['id']
    download_type = self.parse_download()
    playlists = api.user_playlist(user_id, offset=self.app.pargs.offset or 0, limit=self.app.pargs.limit or 1000)
    log.print_info("playlists bellow will be crawled")
    print_playlist(playlists)
    for pl_obj in playlists:
        playlist_detail = api.get_playlist_detail(pl_obj['id'])
        # playlists whose detail request returned nothing are skipped
        if playlist_detail:
            log.print_info(u"<{}> author:{}".format(
                playlist_detail['name'],
                playlist_detail['creator']['nickname'],
            ))
            # a fresh shortcuts stack is passed only when shortcuts are enabled
            playlist_mo.parse_model(
                playlist_detail, download_type=download_type,
                file_check=Config().get_file_check(),
                shortcuts_stack=[] if Config().get_shortcut() else None)
    log.print_info("spider complete!~")
    pass
def get_one_model_by_key(model, model_id):
    """
    Look up the document with the given id, creating an unsaved one when
    the query finds nothing.

    :type model: DynamicDocument
    :param model: document class to query and, if needed, instantiate
    :param model_id: id to look up
    :return: ``(doc, is_new)``; ``(None, True)`` when the lookup or the
        construction raised
    :rtype: (DynamicDocument, boolean)
    """
    try:
        found = model.objects(id=model_id).first()
        is_new = found is None
        doc = model(id=model_id) if is_new else found
    except Exception as err:
        log.print_err('load a doc err: %s' % err)
        return None, True
    return doc, is_new
def param_check(self, params, func_name):
    """
    Verify that every required command-line parameter was supplied.

    On the first parameter whose value is ``None`` an error is logged and,
    when the command named ``func_name`` carries ``__cement_meta__``, its
    ``'help'`` entry is printed.

    :param params: names of the required attributes of ``self.app.pargs``
    :param func_name: name of the command method whose help should be shown
    :return: True when nothing is missing, False otherwise
    """
    help_text = None
    command = getattr(self, func_name, None)
    if command and getattr(command, '__cement_meta__', None):
        help_text = command.__cement_meta__['help']

    # first required parameter that was not given, if any
    missing = next(
        (name for name in params
         if getattr(self.app.pargs, name, None) is None),
        None)
    if missing is None:
        return True

    log.print_err("param {} miss, see help:".format(missing))
    if help_text:
        print(help_text)
    return False
def config_mongo(self):
    """
    Apply the mongodb related command-line arguments to the configuration.

    ``mhost``, ``mport``, ``muser``, ``mpassword`` and ``mdbname`` are
    stored under the ``'mongo'`` section; giving a host also switches
    ``no_mongo`` off. ``nomongo`` sets ``no_mongo`` directly (``'true'`` in
    any case or ``'1'`` mean True). The file is saved only when something
    changed, and the configuration is shown afterwards in every case.
    """
    config = Config()
    settings = config.config  # type: dict
    changed = False

    # (argument name, key inside settings['mongo'], optional converter)
    mongo_fields = (
        ('mhost', 'host', None),
        ('mport', 'port', int),
        ('muser', 'username', None),
        ('mpassword', 'password', None),
        ('mdbname', 'name', None),
    )

    try:
        for arg_name, mongo_name, convert in mongo_fields:
            raw = getattr(self.app.pargs, arg_name)
            if raw is None:
                continue
            settings['mongo'][mongo_name] = convert(raw) if convert else raw
            if arg_name == 'mhost':
                # an explicit host implies that mongo is to be used
                settings['no_mongo'] = False
            changed = True

        flag = self.app.pargs.nomongo
        if flag is not None:
            settings['no_mongo'] = flag.lower() == 'true' or flag == '1'
            changed = True
    except:
        log.print_err("input error, pls check")
        raise

    if changed:
        config.save_config_file()
        log.print_info("config success")
    self.config_show()
def crawl_playlist_by_page(page, dtype="全部", download_type=['mp3', 'mv'], save=True, file_check=True):
    """
    Crawl one page (35 entries) of the hot playlist listing and parse every
    playlist found on it.

    :param page: zero-based page index; the request offset is ``page * 35``
    :param dtype: playlist category placed into the ``cat`` query parameter
    :param download_type: download types forwarded to ``parse_model``
    :param save: forwarded to ``parse_model``
    :param file_check: forwarded to ``parse_model``
    :return: titles of the playlists processed on this page
    :rtype: list
    :raises Exception: any error raised while fetching or parsing the page
        is logged and re-raised
    """
    play_url = "http://music.163.com/discover/playlist/?order=hot&cat={}&limit=35&offset={}"
    play_url = play_url.format(dtype, page * 35)

    # The default list is shared between calls; hand a copy downstream so
    # nothing there can alter the default for later calls.
    if isinstance(download_type, list):
        download_type = list(download_type)

    titles = []
    try:
        acmsk = {'class': 'msk'}
        dcu = {'class': 'u-cover u-cover-1'}
        ucm = {'class': 'm-cvrlst f-cb'}

        data = tools.curl(play_url, headers, type=RETURE_HTML)
        lst = data.find('ul', ucm)
        for play in lst.find_all('div', dcu):
            # one lookup of the cover link serves both the title and the id
            cover_link = play.find('a', acmsk)
            title = cover_link['title']
            link = cover_link['href'].replace("/playlist?id=", "")

            playlist_detail = get_playlist_detail(link)

            # the log line is informational only; failures in it are ignored
            with tools.ignored(Exception):
                log.print_info("%s author:%s" % (
                    "<" + playlist_detail['name'] + ">",
                    tools.encode(playlist_detail['creator']['nickname']),
                ))

            playlist_mo = Playlist()
            playlist_mo.parse_model(playlist_detail,
                                    save=save,
                                    download_type=download_type,
                                    file_check=file_check)
            # the returned list used to stay empty because nothing was added
            titles.append(title)
        return titles
    except Exception as e:
        log.print_err("crawl html error:{} type:{} page:{}".format(
            e, dtype, page))
        raise
from NXSpider.common import log
from NXSpider.common.config import Config
from NXSpider.model.export import *

# Mongo connection settings as returned by the project configuration.
mongodb_conf = Config().get_mongo()

# Import-time connectivity probe: a throw-away client with 3 second timeouts
# runs one query, so an unreachable server is noticed as soon as this module
# is imported. On a server selection timeout the error is logged and the
# process exits.
# NOTE(review): `pymongo` and `ServerSelectionTimeoutError` are not imported
# by name here -- presumably they come from the star import above; verify.
try:
    client = pymongo.MongoClient(host=mongodb_conf['host'], port=mongodb_conf['port'], connectTimeoutMS=3000, serverSelectionTimeoutMS=3000)
    test_connect = client.database.test.count()
    del client
except ServerSelectionTimeoutError as e:
    log.print_err("mongodb server config error")
    exit()

# NOTE(review): these look like document field names for the download url
# and the downloaded flag -- confirm against the code that uses them.
model_download_url = 'download_url'
model_is_download = 'downloaded'


# NOTE(review): only the signature and docstring of field_value are visible
# in this excerpt.
def field_value(field, value):
    """
    Converts a supplied value to the type required by the field.
    If the field requires a EmbeddedDocument the EmbeddedDocument
    is created and updated using the supplied data.
    :param field:
    :param value:
    :return:
    """
def create_params_by_dict(obj):
    """
    Serialize ``obj`` to JSON and turn the text into request params with
    ``create_params_text``.

    :param obj: JSON-serializable object
    :return: result of ``create_params_text``, or None when serializing or
        building the params raised (the error is logged)
    """
    try:
        serialized = json.dumps(obj)
        params = create_params_text(serialized)
    except Exception as err:
        log.print_err('create params error: %s' % err)
        params = None
    return params
def parse_model(self, crawl_dict, download_type=None, file_check=False, save=True, debug=False, shortcuts_stack=None):
    """
    Load (or create) the document for ``crawl_dict['id']``, fill it from
    the crawled data, attempt the download and optionally save it.

    Processing order:

    1. keys listed in ``__model_rfilter__`` are dropped
    2. keys found in ``__attrs_replace_fucs__`` are converted with the
       registered function (element-wise for lists) and stored under the
       name given by ``__attrs_replace_map__``
    3. keys found in ``__parse_recursion__`` are replaced by the result of
       the nested model's own ``parse_model`` (element-wise for lists)
    4. the document is updated, ``pre_save`` and ``try_download`` run,
       shortcuts are handled and the document is saved

    :param crawl_dict: crawled data, must have an ``id`` entry
    :type crawl_dict: dict
    :param download_type: forwarded to ``try_download`` and nested models
    :param file_check: forwarded to ``try_download`` and nested models
    :param save: save the document when it has a callable ``save``
    :type save: bool
    :param debug: call ``debug_print`` with the crawled data at the end
    :param shortcuts_stack: stack of shortcut paths shared with nested
        models; ``None`` disables shortcut handling
    :type shortcuts_stack: list[str]
    :return: the document, or None when the id is missing or the document
        could not be loaded
    :rtype: DynamicDocument
    """
    if debug:
        # self.debug_save_json(crawl_dict)
        pass

    # without an id there is nothing to load or create
    if 'id' not in crawl_dict:
        log.print_err(u"can not load id by json obj %s" % json.dumps(crawl_dict))
        return None
    doc_id = crawl_dict['id']

    # load the stored document or create a new one for this id
    doc, is_new_doc = get_one_model_by_key(self.__model_name__, doc_id)
    if doc is None:
        log.print_err(u"can not load a doc by obj %s_%d" % (self.__file_type__, doc_id))
        return None

    # Push this document's shortcut name before nested models are parsed.
    # The name is taken from the document as loaded, i.e. before the update
    # further down.
    if shortcuts_stack is not None and isinstance(shortcuts_stack, list) \
            and self.shortcut_relative_name(doc):
        shortcuts_stack.append(self.shortcut_relative_name(doc))

    # if is_new_doc:
    # copy the crawled attributes, filtering and renaming on the way
    obj = dict()
    for k, v in crawl_dict.items():
        # filtered attributes are not copied at all
        if k in self.__model_rfilter__:
            continue
        obj[k] = v

        # attributes without a registered replacement stay as copied
        if k not in self.__attrs_replace_fucs__:
            continue

        # convert the value, element by element for lists
        if isinstance(v, list):
            v = [self.__attrs_replace_fucs__[k](self, x) for x in v]
        else:
            v = self.__attrs_replace_fucs__[k](self, v)

        # store the converted value under its replacement name
        del obj[k]
        obj[self.__attrs_replace_map__[k]] = v

    # replace nested attributes by the documents their own model parses
    for k, v in self.__parse_recursion__.items():
        if k not in obj:
            continue
        if isinstance(obj[k], list):
            obj[k] = [
                v.parse_model(x, save=save, download_type=download_type,
                              file_check=file_check, debug=debug,
                              shortcuts_stack=shortcuts_stack)
                for x in obj[k]
            ]
        elif isinstance(obj[k], dict):
            obj[k] = v.parse_model(obj[k], save=save, download_type=download_type,
                                   file_check=file_check, debug=debug,
                                   shortcuts_stack=shortcuts_stack)

    # update json to doc, this must be after recursion
    update_dynamic_doc(doc, obj)

    # hook for adjusting the document before it is saved
    self.pre_save(doc, crawl_dict)

    # try download
    self.try_download(doc, download_type, file_check)

    # Shortcut handling runs only when the stack is non-empty. The name is
    # checked again here, this time on the updated document.
    # NOTE(review): the push above and this pop use separately evaluated
    # conditions -- confirm they always agree.
    if shortcuts_stack:
        if self.shortcut_relative_name(doc):
            shortcuts_stack.pop()
        self.create_shortcut(doc, shortcuts_stack)

    # save document
    if save and callable(getattr(doc, 'save', None)):
        doc.save()

    if debug:
        self.debug_print(crawl_dict)
    return doc
def config_test(self):
    """
    Validate the loaded configuration and repair what can be repaired.

    Checks, in order: the mongo section (unless ``no_mongo`` is set),
    presence of the required keys, the type of every typed key, that every
    download path exists or can be created, and that the mv resolution is
    one of ``mv_resolutions``. Keys that are missing or of the wrong type
    are reset to their value from ``default_config``; unusable download
    paths are dropped. When anything was repaired the config file is saved.

    :return: True when no problem was found; False when something was
        wrong (even if repaired) or the check itself raised
    :rtype: bool
    """
    result = True
    try:
        # check mongodb config; a missing 'no_mongo' is repaired by the
        # type check below instead of aborting the whole test here
        if self.config.get('no_mongo') is False:
            log.print_info('check mongodb config')
            mongo = self.config['mongo']
            for k in ['name', 'host', 'port']:
                if k not in mongo:
                    log.print_err(
                        "mongo config error, key mongo.{} is not set yet".
                        format(k))
                    result = False

            # try import model, which will connect to server and exit if server config wrong
            import NXSpider.model.mongo_model

        for k in [
                'download_path', 'mv_def_resolution', 'media_tag',
                'media_tag_163'
        ]:
            if k not in self.config:
                log.print_err(
                    "config error, key {} is not set yet".format(k))
                result = False

        # check type
        type_check = {
            'download_path': list,
            'mv_def_resolution': int,
            'media_tag': bool,
            'media_tag_163': bool,
            'download_file_check': bool,
            'no_mongo': bool,
        }
        need_save = False
        for k, v in type_check.items():
            # A missing key is treated like a wrong type and restored from
            # the defaults; indexing it used to raise KeyError and abort.
            if k in self.config and isinstance(self.config[k], v):
                continue
            log.print_err("config error, {} is not a require type, "
                          "and is reset to default value: {}".format(
                              k, self.default_config[k]))
            self.config[k] = self.default_config[k]
            need_save = True
            result = False

        # download path check
        final_paths = []
        for p in self.config['download_path']:
            try:
                if not os.path.isdir(p):
                    os.mkdir(p)
                final_paths.append(p)
            except Exception:
                # best effort: an unusable path is dropped, not fatal
                log.print_warn(
                    "download path may be wrong and be deleted: {}".format(
                        p))
                need_save = True
                result = False

        # mv resolution check
        if self.config['mv_def_resolution'] not in mv_resolutions:
            log.print_warn(
                "mv_def_resolution will be reset to default: {}".format(
                    self.default_config['mv_def_resolution']))
            self.config['mv_def_resolution'] = self.default_config[
                'mv_def_resolution']
            need_save = True
            result = False

        if need_save:
            self.config['download_path'] = final_paths
            self.save_config_file()
        return result
    except Exception as e:
        log.print_err(e)
        return False
def search(self):
    """
    Run a search for the type selected on the command line and print the
    hits as an ascii table.

    The search type is the last key of ``search_types`` whose command-line
    argument is set; ``'mp3'`` is used when none is set. When the api
    returns nothing the function reports it and stops; previously it went
    on to inspect the empty result.
    """
    search_key = 'mp3'
    key_num = 0
    for k, _ in search_types.items():
        if getattr(self.app.pargs, k, None):
            search_key = k
            key_num += 1

    if key_num > 1:
        log.print_err("it could search by only one type")

    # input must be decoded in python2
    search_value = py2_decoding(getattr(self.app.pargs, search_key))

    res = api.search(search_value,
                     stype=search_key,
                     offset=self.app.pargs.offset or 0,
                     limit=self.app.pargs.limit or 50)
    if not res:
        log.print_info("nothing found!")
        return

    # search type -> (key in the response, table header, row builder)
    layouts = {
        'mp3': (
            'songs',
            ["ID", "Name", "Album", "AlbumID", "Artist", "ArtistID"],
            lambda item: [
                str(item['id']),
                item['name'],
                item['album']['name'],
                item['album']['id'],
                ','.join([ar['name'] for ar in item['artists']]),
                ','.join([str(ar['id']) for ar in item['artists']]),
            ],
        ),
        'playlist': (
            'playlists',
            ["ID", "Name", "User", "PlayCount", "FavoriteCount"],
            lambda item: [
                str(item['id']),
                item['name'],
                item['creator']['nickname'],
                str(item['playCount']),
                str(item['bookCount']),
            ],
        ),
        'user': (
            'userprofiles',
            ["ID", "Name", "Signature"],
            lambda item: [
                str(item['userId']),
                item['nickname'],
                item['signature'],
            ],
        ),
        'artist': (
            'artists',
            ["ID", "Name", "AlbumNum", "MVNum"],
            lambda item: [
                str(item['id']),
                item['name'],
                str(item['albumSize']),
                str(item['mvSize']),
            ],
        ),
        'album': (
            'albums',
            ["ID", "Album", "Artist", "ArtistID"],
            lambda item: [
                str(item['id']),
                item['name'],
                ','.join([ar['name'] for ar in item['artists']]),
                ','.join([str(ar['id']) for ar in item['artists']]),
            ],
        ),
        'mv': (
            'mvs',
            ["ID", "Name", "Artist", "ArtistID", "Duration", "PlayCount"],
            lambda item: [
                str(item['id']),
                item['name'],
                item['artistName'],
                item['artistId'],
                # duration is divided by 1000 and shown as minutes:seconds
                '%02d:%02d' % divmod(int(item['duration'] / 1000), 60),
                item['playCount'],
            ],
        ),
    }

    layout = layouts.get(search_key)
    if layout is None or layout[0] not in res:
        log.print_err('nothing found')
        return

    result_key, header, to_row = layout
    table = AsciiTable([header])
    table.table_data.extend([to_row(item) for item in res[result_key]])
    print(table.table)