Exemplo n.º 1
0
    def parse_website(self, sa_db, sc_obj, url_item, total_urls):
        url_obj = url_item["item"]
        item = {}

        url = url_obj["url"]

        print "*************************LEFT URL = ", len(total_urls)
        print url
        item["type"] = url_obj["type"]

        item["genre_id"] = url_obj["genre_id"]
        item["rank_date"] = self.begin_time.strftime("%Y-%m-%d %H-%M-%S")
        item["source_site"] = self.class_name
        item["url"] = url

        html = sc_obj.load(url, use_cache=False)
        error_code = self.check_proxy_status(html)
        if error_code != config.ERROR_NONE:
            proxy_info = html.response.request.get("proxy")
            error_str = "proxy error in {}, {}, {}:{}".format(
                url, error_code, proxy_info.host, proxy_info.port)
            print error_str

            global_sc_obj.save(["error", error_str], "error.csv")
            url_item["status"] = "none"
            return

        div_objs = html.q("//div/div/div[contains(@id, 'ueberDiv')]")

        for div_item in div_objs:
            try:
                chart_pos_div = div_item.q(".//div[@class='chartsPos']")
                last_pos_div = div_item.q(
                    ".//div[@class='chartsLast']//text()").join(" ")

                try:
                    item["ranking"] = int(chart_pos_div[0].x("text()").strip())
                except:
                    item["ranking"] = 0

                try:
                    item["last_ranking"] = int(
                        last_pos_div.replace("from", "").replace("#", ""))
                except:
                    item["last_ranking"] = 0

                img_div = div_item.q(".//div[@class='songPic']/a/img")
                item["img_url"] = img_div[0].x("@src").strip()

                artist_div = div_item.q(
                    ".//div[contains(@class, 'bandBox')]//a[@class='chartsArtist']"
                )
                song_div = div_item.q(
                    ".//div[contains(@class, 'bandBox')]//a[@class='chartsSong']"
                )

                item["artist_link"] = artist_div[0].x("@href").strip()

                item["name"] = ""
                item["song_artist_name"] = ""
                if item["type"] == config.ROW_DATA_TYPE_SONG:
                    item["name"] = song_div[0].x("text()").strip()
                    item["song_artist_name"] = artist_div[0].x(
                        "text()").strip()
                else:
                    item["name"] = artist_div[0].x("text()").strip()

                # global_sc_obj.save([
                #     "Rank",                 item["ranking"],
                #     "image link",           item["img_url"],
                #     "type",                 item["type"],
                #     "name",                 item["name"],
                #     "song artist name",     item["song_artist_name"],
                #     "artist page link",     item["artist_link"],
                #     "genre id",             item["genre_id"],
                #     "rank date",            item["rank_date"],
                #     "last rank",            item["last_ranking"],
                #     "source site",          item["source_site"],
                #     "url",                  item["url"]
                #     ], "result.csv")

                db_obj = None
                exist = False
                if item["type"] == config.ROW_DATA_TYPE_SONG:
                    db_obj = sa_db.session.query(Song).filter_by(
                        name=item["name"],
                        genre_id=item["genre_id"],
                        song_artist_name=item["song_artist_name"],
                        source_site=item["source_site"]).first()

                    if db_obj == None:
                        db_obj = Song(
                            ranking=item["ranking"],
                            last_ranking=item["last_ranking"],
                            image_link=item["img_url"],
                            name=item["name"],
                            song_artist_name=item["song_artist_name"],
                            artist_page_link=item["artist_link"],
                            genre_id=item["genre_id"],
                            rank_date=item["rank_date"],
                            source_site=item["source_site"])
                    else:  # Upate Part
                        exist = True

                # Signed and Unsigned Case
                else:
                    if item["type"] == config.ROW_DATA_TYPE_UNSIGNED_BAND:  #unsigned part create
                        db_obj = sa_db.session.query(UnsignedBand).filter_by(
                            name=item["name"],
                            genre_id=item["genre_id"],
                            source_site=item["source_site"]).first()

                        if db_obj == None:
                            db_obj = UnsignedBand(
                                ranking=item["ranking"],
                                last_ranking=item["last_ranking"],
                                image_link=item["img_url"],
                                name=item["name"],
                                song_artist_name=item["song_artist_name"],
                                artist_page_link=item["artist_link"],
                                genre_id=item["genre_id"],
                                rank_date=item["rank_date"],
                                source_site=item["source_site"])
                        else:
                            exist = True

                    else:  #signed part create
                        db_obj = sa_db.session.query(SignedBand).filter_by(
                            name=item["name"],
                            genre_id=item["genre_id"],
                            source_site=item["source_site"]).first()

                        if db_obj == None:
                            db_obj = SignedBand(
                                ranking=item["ranking"],
                                last_ranking=item["last_ranking"],
                                image_link=item["img_url"],
                                name=item["name"],
                                song_artist_name=item["song_artist_name"],
                                artist_page_link=item["artist_link"],
                                genre_id=item["genre_id"],
                                rank_date=item["rank_date"],
                                source_site=item["source_site"])

                        else:  # Upate Part
                            exist = True
                try:
                    if exist == False:
                        sa_db.session.add(db_obj)

                    else:
                        db_obj.ranking = item["ranking"]
                        db_obj.last_ranking = item["last_ranking"]
                        db_obj.image_link = item["img_url"]
                        db_obj.artist_page_link = item["artist_link"]
                        db_obj.song_artist_name = item["song_artist_name"]
                        db_obj.rank_date = item["rank_date"]

                    sa_db.session.commit()

                except Exception as e:
                    self.show_exception_detail(e)
                    break

            except Exception as e:
                self.show_exception_detail(e)

        url_item["status"] = "complete"
Exemplo n.º 2
0
    def parse_website(self, sa_db, sc_obj, url_item, total_urls):
        url_obj = url_item["item"]
        item = {}

        url = url_obj["url"]

        print "*************************LEFT URL = ", len(total_urls)
        print url
        item["type"] = url_obj["type"]

        item["genre_id"] = url_obj["genre_id"]
        item["rank_date"] = self.begin_time.strftime("%Y-%m-%d %H-%M-%S")
        item["source_site"] = self.class_name
        item["url"] = url

        json_obj = sc_obj.load_json(url, use_cache=False)
        # error_code = self.check_proxy_status(html)

        # print "Error Code = ", error_code
        # if error_code != config.ERROR_NONE:
        #     proxy_info = html.response.request.get("proxy")
        #     error_str = "proxy error in {}, {}, {}:{}".format(url, error_code, proxy_info.host, proxy_info.port)
        #     print error_str

        #     global_sc_obj.save(["error", error_str], "error.csv")
        #     url_item["status"] = "none"
        #     return

        if json_obj == None:
            print "Data does not exist ->", url
            url_item["status"] = "complete"
            return

        collections = json_obj["collection"]
        print "Len=", len(collections)

        for i, collection in enumerate(collections):
            try:
                item["ranking"] = i + 1
                item["last_ranking"] = i + 1

                item["img_url"] = collection["track"]["artwork_url"]

                if item["img_url"] == None:
                    item["img_url"] = collection["track"]["user"]["avatar_url"]

                item["artist_link"] = collection["track"]["user"][
                    "permalink_url"]

                item["name"] = ""
                item["song_artist_name"] = ""

                if item["type"] == config.ROW_DATA_TYPE_SONG:
                    title_str = collection["track"]["title"]
                    artist_name_str = collection["track"]["user"]["username"]

                    try:
                        item["name"] = title_str.decode("utf8")
                    except Exception as e:
                        item["name"] = ''.join([
                            char if ord(char) < 128 else ''
                            for char in title_str
                        ])

                    try:
                        item["song_artist_name"] = artist_name_str.decode(
                            "utf8")
                    except Exception as e:
                        item["song_artist_name"] = ''.join([
                            char if ord(char) < 128 else ''
                            for char in artist_name_str
                        ])

                db_obj = None
                exist = False
                if item["type"] == config.ROW_DATA_TYPE_SONG:
                    db_obj = sa_db.session.query(Song).filter_by(
                        name=item["name"],
                        song_artist_name=item["song_artist_name"],
                        source_site=item["source_site"]).first()

                    if db_obj == None:
                        db_obj = Song(
                            ranking=item["ranking"],
                            last_ranking=item["last_ranking"],
                            image_link=item["img_url"],
                            name=item["name"],
                            song_artist_name=item["song_artist_name"],
                            artist_page_link=item["artist_link"],
                            genre_id=item["genre_id"],
                            rank_date=item["rank_date"],
                            source_site=item["source_site"])
                    else:  # Upate Part
                        exist = True

                # print json.dumps(item, indent=2)
                try:
                    if exist == False:
                        sa_db.session.add(db_obj)

                    else:
                        db_obj.ranking = item["ranking"]
                        db_obj.last_ranking = item["last_ranking"]
                        db_obj.image_link = item["img_url"]
                        db_obj.artist_page_link = item["artist_link"]
                        db_obj.song_artist_name = item["song_artist_name"]
                        db_obj.rank_date = item["rank_date"]

                    sa_db.session.commit()

                except Exception as e:
                    self.show_exception_detail(e)
                    break

            except Exception as e:
                self.show_exception_detail(e)

        url_item["status"] = "complete"