def search(self):
    """
    Request the video search page and store the usable result items in __results
    The page is requested until a response holds at least __MIN_ACCEPTED_RESULTS
    result items, but no more than __MAX_TRIALS times. Items of an accepted
    response are stored as {__KEY_TITLE: title, __KEY_URL: view url}; an item is
    skipped when String.count_match_words reports a match between __IGNORE_WORDS
    and its title
    @return int: the number of entries held in __results after the search
    """
    def attr_value(pattern, element):
        # the matched attribute looks like name="value": keep what follows the
        # first '=' only, so that a '=' inside the value does not truncate it
        raw = re.findall(pattern, element)[0].split('=', 1)[1]
        # drop the surrounding quote characters
        raw = raw[1:len(raw) - 1]
        # str.replace("\s+", ' ') only replaces the literal text "\s+";
        # re.sub is required to really collapse runs of whitespace
        return re.sub(r"\s+", ' ', raw.strip())

    countTrial = 0
    accepted = False
    while not accepted and countTrial < self.__MAX_TRIALS:
        countTrial += 1
        response = urllib2.urlopen(self.VIDEO_SEARCH_BASE_URL + self.__query)
        try:
            html = response.read()
        finally:
            # release the connection even if reading fails
            response.close()

        elements = re.findall(self.__REGEX_SEARCH_RESULT_ITEM, html)
        if len(elements) < self.__MIN_ACCEPTED_RESULTS:
            # too few items in this response, try again
            continue

        accepted = True
        for e in elements:
            title = attr_value(self.__REGEX_TITLE_ATTR, e)
            if String.count_match_words(self.__IGNORE_WORDS, title) > 0:
                continue
            vidId = attr_value(self.__REGEX_ID_ATTR, e)
            self.__results.append({
                self.__KEY_TITLE: title,
                self.__KEY_URL: self.VIDEO_VIEW_BASE_URL + vidId
            })
    return len(self.__results)
def scan(self):
    """
    Scan the stations page and retrieve all stations to __stations
    @return bool: True if at least one station was found and stored by this scan
                  False if something wrong, e.g. incorrect profileUsername, or no station found
    """
    username = str(self.profileUsername)
    preURL = self.STATIONS_VIEW_BASE_URL.replace("[username]", username)
    url = self.STATIONS_REQUEST_BASE_URL.replace("[username]", username)

    try:
        # the view page is requested first and its content is not used
        # NOTE(review): presumably this request prepares the session for the
        # second one - confirm against get_request
        self.get_request(preURL)
        html = self.get_request(url)
    except Exception:
        # there must be something wrong with the url, i.e. incorrect profile username
        # (Exception instead of a bare except, so that KeyboardInterrupt and
        # SystemExit are not swallowed)
        return False

    found = False
    elements = DOM.get_elements("div", {"class": "infobox-body"}, html)
    for e in elements:
        stationNodes = DOM.get_elements("a", {"href": "/station/[0-9]+"}, e.nodeValue)
        if len(stationNodes) == 0:
            continue
        stationNode = stationNodes[0]
        stationName = String.decode_html_entities(stationNode.nodeValue)
        # href looks like /station/<id>, so the id is the third '/'-separated part
        stationId = stationNode.get_attr("href").split('/')[2].strip()
        self.__stations.append({"name": stationName, "id": stationId})
        found = True

    # report failure when the page held no station, as the contract above states
    return found
def next_list(self):
    """
    Each time this func is invoked, a next pagination of tracks is loaded
    Then self.get_cur_tracks should be called to retrieve the current list of tracks
    @return bool: True if this new pagination still contains tracks,
                  False if no more tracks, i.e. last pagination
                  None if something wrong with the station, e.g. incorrect stationId
    """
    # start right behind the items counted for the previous pagination
    self.__curStartIdx += self.__prevItems
    self.__prevItems = 0
    self.__curThumbUpTracks = []

    url = self.STATION_TRACKS_BASE_URL
    url = url.replace("[stationId]", self.__stationId)
    url = url.replace("[startIdx]", str(self.__curStartIdx))
    try:
        response = urllib2.urlopen(url)
    except Exception:
        # there must be something wrong with the url, i.e. incorrect url
        # (Exception instead of a bare except, so that KeyboardInterrupt and
        # SystemExit are not swallowed)
        return None
    try:
        html = response.read()
    finally:
        # release the connection even if reading fails
        response.close()

    elements = DOM.get_elements("li", {"data-date": "[0-9]+", "data-artist": "[^>]+"}, html)
    for e in elements:
        trackNodes = DOM.get_elements("h3", {}, e.nodeValue)
        if len(trackNodes) == 0:
            continue
        songNodes = DOM.get_elements("a", {}, trackNodes[0].nodeValue)
        if len(songNodes) < 2:
            continue

        # first link is used as the song, second link as the artist
        song = String.decode_html_entities(songNodes[0].nodeValue)
        song = String.symbols_to_words(song)
        song = self.__remove_redundant_words(song)
        artist = String.decode_html_entities(songNodes[1].nodeValue)
        artist = String.symbols_to_words(artist)

        record = song + ' ' + artist
        if record not in self.__thumbUpTracks:
            self.__thumbUpTracks.append(record)
            self.__curThumbUpTracks.append(record)
        # NOTE(review): counted for every parsed track, duplicate or not, because
        # the counter is what advances the start index of the next pagination -
        # confirm this matches the intended nesting
        self.__prevItems += 1

    if self.__prevItems == 0:
        return False
    return True
def __get_video_streams_info(self):
    """
    Download the info document of the video and extract its stream entries
    The document is read as '&'-separated parameters. The parameter starting
    with GET_VIDEO_TITLE_PARAM fills __videoTitle when it is still empty
    ('+' and '/' are replaced by spaces); the parameter starting with
    GET_VIDEO_STREAM_PARAM provides the streams
    @return list: one url-decoded entry per "type"-separated part of the stream
                  parameter, each re-prefixed with "type" before decoding;
                  empty list if no stream parameter was found
    """
    # prepare to get the streams info of the video
    infoUrl = self.GET_VIDEO_BASE_URL + self.__videoId
    resp = urllib2.urlopen(infoUrl)
    try:
        content = resp.read()
    finally:
        # release the connection even if reading fails
        resp.close()

    # also, prepare to get the video title
    streams = ""
    for item in content.split('&'):
        if item.startswith(self.GET_VIDEO_TITLE_PARAM):
            if self.__videoTitle == '':
                title = String.url_decode(item.split('=', 1)[1])
                self.__videoTitle = title.replace('+', ' ').replace('/', ' ')
        elif item.startswith(self.GET_VIDEO_STREAM_PARAM):
            streams = item.strip()
            if self.__videoTitle != '':
                # both the streams and the title are known, nothing left to look for
                break

    # the text in front of the first "type" is not a stream entry, hence [1:];
    # a list comprehension is used because the result of map() cannot be
    # sliced on Python 3
    return [String.url_decode("type" + stream) for stream in streams.split("type")[1:]]
def __get_video_streams_info(self):
    """
    Fetch the info document of the video and return its stream entries
    The document is handled as a list of '&'-separated parameters:
    - the one starting with GET_VIDEO_TITLE_PARAM sets __videoTitle if it is
      still empty ('+' and '/' are turned into spaces)
    - the one starting with GET_VIDEO_STREAM_PARAM holds the streams
    @return list: the "type"-separated parts of the stream parameter, each
                  re-prefixed with "type" and url-decoded;
                  empty list if no stream parameter was found
    """
    # prepare to get the streams info of the video
    infoUrl = self.GET_VIDEO_BASE_URL + self.__videoId
    resp = urllib2.urlopen(infoUrl)
    try:
        content = resp.read()
    finally:
        # the response is closed whether or not reading succeeded
        resp.close()

    # also, prepare to get the video title
    streams = ""
    for item in content.split('&'):
        if item.startswith(self.GET_VIDEO_TITLE_PARAM):
            if self.__videoTitle == '':
                decoded = String.url_decode(item.split('=', 1)[1])
                self.__videoTitle = decoded.replace('+', ' ').replace('/', ' ')
        elif item.startswith(self.GET_VIDEO_STREAM_PARAM):
            streams = item.strip()
            if self.__videoTitle != '':
                # title already known and streams just found: stop scanning
                break

    # whatever precedes the first "type" is dropped by [1:]; slicing the split
    # result (instead of the result of map()) also works on Python 3
    parts = streams.split("type")[1:]
    return [String.url_decode("type" + part) for part in parts]
def __get_next_video_url(self):
    """
    Look for the next acceptable video behind the current result item
    __results is walked starting at __currentItemIdx + 1. The score of an entry
    is String.count_match_words(keyword, title) divided by the number of
    space-separated words of the keyword; the first entry scoring above
    __MIN_WORDS_ACCEPTED is taken and __currentItemIdx is moved to it
    @return str: the url of the taken entry, "" if no remaining entry qualifies
    """
    wordTotal = float(len(self.__keyword.split(' ')))
    idx = self.__currentItemIdx + 1
    endIdx = len(self.__results)
    while idx < endIdx:
        entry = self.__results[idx]
        entryTitle = entry[self.__KEY_TITLE]
        score = String.count_match_words(self.__keyword, entryTitle) / wordTotal
        if score > self.__MIN_WORDS_ACCEPTED:
            url = entry[self.__KEY_URL]
            self.__currentItemIdx = idx
            return url
        idx += 1
    # nothing behind the current item was good enough
    return ""
def __get_next_video_url(self):
    """
    Advance to the next result whose title matches the keyword well enough
    Only entries of __results behind __currentItemIdx are considered. An entry
    is accepted when String.count_match_words(keyword, title), divided by the
    number of space-separated keyword words, is greater than
    __MIN_WORDS_ACCEPTED; __currentItemIdx then becomes the index of that entry
    @return str: the url of the accepted entry, "" if there is none
    """
    keywordCount = float(len(self.__keyword.split(' ')))
    for pos in range(self.__currentItemIdx + 1, len(self.__results)):
        item = self.__results[pos]
        itemTitle = item[self.__KEY_TITLE]
        ratio = String.count_match_words(self.__keyword, itemTitle) / keywordCount
        if not ratio > self.__MIN_WORDS_ACCEPTED:
            # not enough matching words, keep looking
            continue
        found = item[self.__KEY_URL]
        self.__currentItemIdx = pos
        return found
    return ""
def __get_first_match_video_url(self):
    """
    Pick the first matching video of the search results
    Every title is scored as String.count_match_words(keyword, title) divided
    by the number of space-separated words of the keyword. __results is walked
    from its start: an entry beating the best score so far becomes the pick
    (and __currentItemIdx is set to its index); the walk ends at the first
    entry that does not beat the best score once that score is above 0
    @return str: the url of the pick, "" if no title scored above 0
    """
    bestScore = 0.0
    bestUrl = ""
    wordTotal = float(len(self.__keyword.split(' ')))
    for idx, entry in enumerate(self.__results):
        entryTitle = entry[self.__KEY_TITLE]
        score = String.count_match_words(self.__keyword, entryTitle) / wordTotal
        if score > bestScore:
            bestScore = score
            bestUrl = entry[self.__KEY_URL]
            self.__currentItemIdx = idx
            continue
        if bestScore > 0.:
            # a match exists already and this entry is no improvement
            break
    return bestUrl
def __get_first_match_video_url(self):
    """
    Select the url of the first result that matches the keyword
    The score of a result is String.count_match_words(keyword, title) divided
    by the number of space-separated keyword words. Starting at the first
    result, each result with a score above the best one seen so far replaces
    the selection and __currentItemIdx is set to its index; as soon as a result
    fails to improve on a best score above 0 the search stops
    @return str: the url of the selection, "" if no result scored above 0
    """
    keywordCount = float(len(self.__keyword.split(' ')))
    topRatio = .0
    chosenUrl = ""
    pos = 0
    total = len(self.__results)
    while pos < total:
        item = self.__results[pos]
        itemTitle = item[self.__KEY_TITLE]
        ratio = String.count_match_words(self.__keyword, itemTitle) / keywordCount
        if ratio > topRatio:
            topRatio = ratio
            chosenUrl = item[self.__KEY_URL]
            self.__currentItemIdx = pos
        elif topRatio > 0.:
            # something matched before and this result is not better
            return chosenUrl
        pos += 1
    return chosenUrl
def search(self):
    """
    Run the video search and collect the usable result items in __results
    The search page is fetched at most __MAX_TRIALS times, stopping at the
    first response that holds at least __MIN_ACCEPTED_RESULTS result items.
    Each item of that response is stored as
    {__KEY_TITLE: title, __KEY_URL: view url}, unless
    String.count_match_words reports a match between __IGNORE_WORDS and the title
    @return int: the number of entries held in __results after the search
    """
    def read_attr(pattern, element):
        # the match has the form name="value"; split on the first '=' only so a
        # '=' inside the value is kept
        value = re.findall(pattern, element)[0].split('=', 1)[1]
        # strip the enclosing quote characters
        value = value[1:len(value) - 1]
        # collapse whitespace runs; str.replace("\s+", ' ') would only look for
        # the literal characters "\s+" and therefore never did anything
        return re.sub(r"\s+", ' ', value.strip())

    countTrial = 0
    accepted = False
    while not accepted and countTrial < self.__MAX_TRIALS:
        countTrial += 1
        response = urllib2.urlopen(self.VIDEO_SEARCH_BASE_URL + self.__query)
        try:
            html = response.read()
        finally:
            # the response is closed whether or not reading succeeded
            response.close()

        elements = re.findall(self.__REGEX_SEARCH_RESULT_ITEM, html)
        if len(elements) >= self.__MIN_ACCEPTED_RESULTS:
            accepted = True
            for e in elements:
                title = read_attr(self.__REGEX_TITLE_ATTR, e)
                if String.count_match_words(self.__IGNORE_WORDS, title) > 0:
                    continue
                vidId = read_attr(self.__REGEX_ID_ATTR, e)
                self.__results.append({
                    self.__KEY_TITLE: title,
                    self.__KEY_URL: self.VIDEO_VIEW_BASE_URL + vidId
                })
    return len(self.__results)