def parse_website(self, sa_db, sc_obj, url_item, total_urls):
    """Scrape one chart page and insert or update a database row per entry.

    The page at ``url_item["item"]["url"]`` is loaded without cache. Every
    ``ueberDiv`` block found in it is parsed into a rank, previous rank,
    image URL, artist link and name(s), and then stored as a ``Song``,
    ``UnsignedBand`` or ``SignedBand`` depending on
    ``url_item["item"]["type"]``.

    Args:
        sa_db: Object exposing ``session`` with ``query``, ``add``,
            ``commit`` and ``rollback``.
        sc_obj: Scraper object; only ``load(url, use_cache=False)`` is used.
        url_item: Dict holding the work item under ``"item"`` (keys ``url``,
            ``type``, ``genre_id``). Its ``"status"`` key is set to
            ``"none"`` when the proxy check fails and to ``"complete"``
            otherwise.
        total_urls: Collection of pending URLs; only its length is printed.

    Returns:
        None. Results are reported through ``url_item["status"]`` and the
        database.
    """
    url_obj = url_item["item"]
    url = url_obj["url"]
    # Single-expression print() emits the same text as the Python 2 print
    # statement and also parses on Python 3.
    print("{} {}".format(
        "*************************LEFT URL = ", len(total_urls)))
    print(url)

    item = {
        "type": url_obj["type"],
        "genre_id": url_obj["genre_id"],
        "rank_date": self.begin_time.strftime("%Y-%m-%d %H-%M-%S"),
        "source_site": self.class_name,
        "url": url,
    }

    html = sc_obj.load(url, use_cache=False)
    error_code = self.check_proxy_status(html)
    if error_code != config.ERROR_NONE:
        proxy_info = html.response.request.get("proxy")
        # getattr() keeps the error report working even when the request
        # carries no proxy entry (proxy_info is None).
        error_str = "proxy error in {}, {}, {}:{}".format(
            url, error_code,
            getattr(proxy_info, "host", None),
            getattr(proxy_info, "port", None))
        print(error_str)
        global_sc_obj.save(["error", error_str], "error.csv")
        url_item["status"] = "none"
        return

    # The row type is fixed for the whole page, so pick the model once.
    is_song = item["type"] == config.ROW_DATA_TYPE_SONG
    if is_song:
        model = Song
    elif item["type"] == config.ROW_DATA_TYPE_UNSIGNED_BAND:
        model = UnsignedBand
    else:
        model = SignedBand

    for div_item in html.q("//div/div/div[contains(@id, 'ueberDiv')]"):
        try:
            chart_pos_div = div_item.q(".//div[@class='chartsPos']")
            last_pos_text = div_item.q(
                ".//div[@class='chartsLast']//text()").join(" ")

            # An unparsable rank falls back to 0. ``except Exception``
            # (rather than a bare except) lets Ctrl-C / SystemExit through.
            try:
                item["ranking"] = int(chart_pos_div[0].x("text()").strip())
            except Exception:
                item["ranking"] = 0
            try:
                item["last_ranking"] = int(
                    last_pos_text.replace("from", "").replace("#", ""))
            except Exception:
                item["last_ranking"] = 0

            img_div = div_item.q(".//div[@class='songPic']/a/img")
            item["img_url"] = img_div[0].x("@src").strip()

            artist_div = div_item.q(
                ".//div[contains(@class, 'bandBox')]//a[@class='chartsArtist']"
            )
            song_div = div_item.q(
                ".//div[contains(@class, 'bandBox')]//a[@class='chartsSong']"
            )
            item["artist_link"] = artist_div[0].x("@href").strip()

            # Songs carry both a title and an artist; band rows use the
            # artist text as their name and leave song_artist_name empty.
            item["name"] = ""
            item["song_artist_name"] = ""
            if is_song:
                item["name"] = song_div[0].x("text()").strip()
                item["song_artist_name"] = artist_div[0].x("text()").strip()
            else:
                item["name"] = artist_div[0].x("text()").strip()

            # Existing rows are matched on name + genre + source site;
            # songs are additionally matched on the artist name.
            lookup = {
                "name": item["name"],
                "genre_id": item["genre_id"],
                "source_site": item["source_site"],
            }
            if is_song:
                lookup["song_artist_name"] = item["song_artist_name"]

            db_obj = sa_db.session.query(model).filter_by(**lookup).first()
            is_new = db_obj is None
            if is_new:
                db_obj = model(
                    ranking=item["ranking"],
                    last_ranking=item["last_ranking"],
                    image_link=item["img_url"],
                    name=item["name"],
                    song_artist_name=item["song_artist_name"],
                    artist_page_link=item["artist_link"],
                    genre_id=item["genre_id"],
                    rank_date=item["rank_date"],
                    source_site=item["source_site"])

            try:
                if is_new:
                    sa_db.session.add(db_obj)
                else:
                    db_obj.ranking = item["ranking"]
                    db_obj.last_ranking = item["last_ranking"]
                    db_obj.image_link = item["img_url"]
                    db_obj.artist_page_link = item["artist_link"]
                    db_obj.song_artist_name = item["song_artist_name"]
                    db_obj.rank_date = item["rank_date"]
                sa_db.session.commit()
            except Exception as e:
                self.show_exception_detail(e)
                # Without a rollback the session stays in a failed
                # transaction and every later use of it would raise.
                sa_db.session.rollback()
                # A database failure stops processing of this page.
                break
        except Exception as e:
            # A row that cannot be parsed is reported and skipped.
            self.show_exception_detail(e)

    url_item["status"] = "complete"
def parse_website(self, sa_db, sc_obj, url_item, total_urls):
    """Load a JSON track list and insert or update one ``Song`` per entry.

    The JSON document at ``url_item["item"]["url"]`` is loaded without
    cache. Each element of its ``"collection"`` list is stored with a rank
    equal to its 1-based position in the list (``last_ranking`` gets the
    same value).

    Args:
        sa_db: Object exposing ``session`` with ``query``, ``add``,
            ``commit`` and ``rollback``.
        sc_obj: Scraper object; only ``load_json(url, use_cache=False)`` is
            used.
        url_item: Dict holding the work item under ``"item"`` (keys ``url``,
            ``type``, ``genre_id``). Its ``"status"`` key is set to
            ``"complete"`` before returning.
        total_urls: Collection of pending URLs; only its length is printed.

    Returns:
        None. Results are reported through ``url_item["status"]`` and the
        database.
    """
    url_obj = url_item["item"]
    url = url_obj["url"]
    # Single-expression print() emits the same text as the Python 2 print
    # statement and also parses on Python 3.
    print("{} {}".format(
        "*************************LEFT URL = ", len(total_urls)))
    print(url)

    item = {
        "type": url_obj["type"],
        "genre_id": url_obj["genre_id"],
        "rank_date": self.begin_time.strftime("%Y-%m-%d %H-%M-%S"),
        "source_site": self.class_name,
        "url": url,
    }

    # NOTE: no proxy-status check is applied to this response; a missing
    # payload is the only load failure detected here.
    json_obj = sc_obj.load_json(url, use_cache=False)
    if json_obj is None:
        print("{} {}".format("Data does not exist ->", url))
        url_item["status"] = "complete"
        return

    collections = json_obj["collection"]
    print("{} {}".format("Len=", len(collections)))

    def to_text(raw):
        # Decode as UTF-8; if that raises, keep only the characters whose
        # code point is below 128.
        try:
            return raw.decode("utf8")
        except Exception:
            return "".join(char for char in raw if ord(char) < 128)

    is_song = item["type"] == config.ROW_DATA_TYPE_SONG

    for rank, collection in enumerate(collections, 1):
        try:
            track = collection["track"]
            item["ranking"] = rank
            item["last_ranking"] = rank

            # Fall back to the uploader's avatar when the track has no
            # artwork of its own.
            item["img_url"] = track["artwork_url"]
            if item["img_url"] is None:
                item["img_url"] = track["user"]["avatar_url"]
            item["artist_link"] = track["user"]["permalink_url"]

            item["name"] = ""
            item["song_artist_name"] = ""
            if is_song:
                item["name"] = to_text(track["title"])
                item["song_artist_name"] = to_text(track["user"]["username"])

            # NOTE(review): only song items are mapped to a model. For any
            # other type db_obj stays None, session.add() raises, and the
            # handler below reports it and stops the loop.
            db_obj = None
            is_new = True
            if is_song:
                # Existing rows are matched on name + artist + source site;
                # genre_id is not part of the lookup.
                db_obj = sa_db.session.query(Song).filter_by(
                    name=item["name"],
                    song_artist_name=item["song_artist_name"],
                    source_site=item["source_site"]).first()
                if db_obj is None:
                    db_obj = Song(
                        ranking=item["ranking"],
                        last_ranking=item["last_ranking"],
                        image_link=item["img_url"],
                        name=item["name"],
                        song_artist_name=item["song_artist_name"],
                        artist_page_link=item["artist_link"],
                        genre_id=item["genre_id"],
                        rank_date=item["rank_date"],
                        source_site=item["source_site"])
                else:
                    is_new = False

            try:
                if is_new:
                    sa_db.session.add(db_obj)
                else:
                    db_obj.ranking = item["ranking"]
                    db_obj.last_ranking = item["last_ranking"]
                    db_obj.image_link = item["img_url"]
                    db_obj.artist_page_link = item["artist_link"]
                    db_obj.song_artist_name = item["song_artist_name"]
                    db_obj.rank_date = item["rank_date"]
                sa_db.session.commit()
            except Exception as e:
                self.show_exception_detail(e)
                # Without a rollback the session stays in a failed
                # transaction and every later use of it would raise.
                sa_db.session.rollback()
                # A database failure stops processing of this list.
                break
        except Exception as e:
            # An entry that cannot be parsed is reported and skipped.
            self.show_exception_detail(e)

    url_item["status"] = "complete"