def _load_full_data(self, listing):
    """Fetch the listing's detail page and store the fields parsed from it.

    Nothing happens when ``listing.is_fully_loaded()`` is true, or when the
    fetched page carries the "not currently listed" notice.  Otherwise the
    posting timestamp, broker, brokerage and a blurb (description followed by
    the amenities text) are set on ``listing`` and it is saved to the
    database.

    A section that is missing from the page is treated as empty text instead
    of aborting the whole load.  NOTE(review): this assumes
    ``html_helper.find_in_between`` yields ``None`` for a missing section, as
    the existing broker check implies -- confirm against the helper.
    """
    if listing.is_fully_loaded():
        return

    s = open_page(self._br, listing.url)
    if 'This unit is not currently listed on StreetEasy' in s:
        return

    (days, s) = html_helper.advance_and_find(
        s, '<h6>Days On Market</h6>', 'p>', ' day')
    post_timestamp = self._get_post_timestamp(days)

    (brokerage, broker) = ('', '')
    (broker_stuff, s) = html_helper.find_in_between(
        s, 'Listed at', "<div class='closer'></div>")
    if broker_stuff is not None:
        broker_stuff = html_helper.strip_tags(broker_stuff.replace('\n', ' '))
        # partition() never raises: with no ' by ' the whole text becomes the
        # brokerage, and with several only the first one splits.  The old
        # two-name unpacking of split(' by ') raised ValueError in both cases.
        (brokerage, _sep, broker) = broker_stuff.partition(' by ')

    (descr, s) = html_helper.find_in_between(
        s, '<h2>Description</h2>', '</section>')
    if descr is None:
        descr = ''
    else:
        descr = html_helper.strip_tags(descr)

    (amenities, s) = html_helper.find_in_between(
        s, '<h2>Amenities</h2>', '</section>')
    if amenities is None:
        amenities = ''
    else:
        # Turn list items and sub-headings into punctuation before the tags
        # themselves are stripped.
        amenities = amenities.replace('</li>', ', ')
        amenities = amenities.replace('</h6>', ': ')
        amenities = amenities.replace('<h6>', '\n')
        amenities = html_helper.strip_tags(amenities)
        # Remove the inline ad script text that survives tag stripping.
        amenities = amenities.replace(
            "googletag.cmd.push(function() {googletag.display('ad_amenity');});",
            '')
        amenities = amenities.replace(' \n', ' ').replace(', \n', '. ')

    listing.set_blurb(descr + '\n\n' + amenities)
    listing.set_posting_timestamp(post_timestamp)
    listing.set_broker(broker)
    listing.set_brokerage(brokerage)
    listing.save_to_db()
def _load_more_listing_data(self, listing):
    """Fetch the listing's detail page and fill in the remaining fields.

    Returns ``listing`` unchanged when it is already fully loaded.  Otherwise
    square footage, address, brokerage, broker and blurb are parsed from the
    page in that order, each one set on ``listing`` only when it is found,
    and the listing is saved to the database and returned.

    A marker that is missing from the page now skips just that field.
    Previously ``s[s.find(MARKER):]`` with a ``-1`` result kept only the last
    character of the page, which lost every later field as well.
    """
    if listing.is_fully_loaded():
        return listing

    def column_after(page, marker):
        """Return (column text, rest of page) for the column after marker.

        When the marker is absent the result is ``(None, page)`` with the
        page untouched, so later fields can still be located.
        """
        pos = page.find(marker)
        if pos < 0:
            return (None, page)
        return html_helper.find_in_between(page[pos:], COL_START, COL_END)

    s = open_page(self._br, listing.url)

    (sqft, s) = column_after(s, SQFT_MARKER)
    if sqft:
        listing.set_sqft(int(sqft))

    (address, s) = column_after(s, BLDG_MARKER)
    if address is not None:
        self._set_formatted_address(address, listing)

    (brokerage, s) = column_after(s, BRKG_MARKER)
    if brokerage:
        listing.set_brokerage(html_helper.strip_tags(brokerage))

    (broker, s) = column_after(s, BRKR_MARKER)
    if broker:
        listing.set_broker(html_helper.strip_tags(broker))

    pos = s.find(COMMENTS_MARKER)
    if pos >= 0:
        (blurb, s) = html_helper.find_in_between(
            s[pos:], ':', '<div class="cleanbreakdiv">')
        if blurb is not None:
            listing.set_blurb(html_helper.strip_tags(blurb))

    listing.save_to_db()
    return listing
def _load_details(self, listing):
    """Populate ``listing`` from its detail page and save it.

    Returns immediately when the listing is already fully loaded.  Otherwise
    the page is fetched and, in order: the fee flag and blurb are set from
    the features and description sections, the location is set from the
    embedded latitude/longitude, and broker and brokerage are set (both stay
    empty strings when the page has no 'Brokerage:' text).
    """
    if listing.is_fully_loaded():
        return

    page = open_page(self._br, listing.url)

    broker = ''
    brokerage = ''
    if 'Brokerage:' in page:
        broker, page = html_helper.advance_and_find(
            page, 'Save to Favorites', '<span class="bold">', '</span>')
        brokerage, page = html_helper.advance_and_find(
            page, 'Brokerage: ', '<span class="bold"', '</span>')
        if brokerage:
            # Drop the first character -- presumably the '>' that closes the
            # span tag, since the start marker above stops just before it.
            brokerage = brokerage[1:]

    features, page = html_helper.find_in_between(
        page, 'Features & Amenities', '<div style="width: 640px')
    # Put every table cell on its own line before the tags are removed.
    blurb = html_helper.strip_tags(features.replace('<td', '\n<td'))
    listing.set_has_fee('No Fee\n' not in blurb)

    description, page = html_helper.find_in_between(
        page, ' Description', '<div id="panels"')
    if description is not None:
        blurb = blurb + '\n\n' + html_helper.strip_tags(description)
    listing.set_blurb(blurb)

    # The address embedded in the page is read only to move past it; the
    # address actually stored is derived from the coordinates below.
    _page_address, page = html_helper.find_in_between(
        page, "var report_listing_address = '", "'")
    longitude, page = html_helper.find_in_between(page, "longitude = '", "'")
    latitude, page = html_helper.find_in_between(page, "latitude = '", "'")
    resolved_address = get_address(latitude, longitude)
    listing.set_location(latitude, longitude, resolved_address)

    listing.set_broker(broker)
    listing.set_brokerage(brokerage)
    listing.save_to_db()
def prepare_text(text):
    """Return ``text`` reduced to one line of space-separated stemmed words.

    The input is coerced to unicode, stripped of HTML tags, has its line
    breaks folded into spaces, and is passed through ``punctuationTable``
    with ``translate`` (presumably to remove or replace punctuation -- see
    the table's definition).  Each non-empty space-separated token is then
    stemmed with ``stemmer.stemWord`` and the stems are joined with single
    spaces.
    """
    text = html_helper.strip_tags(unicode(text))
    text = ' '.join(text.splitlines())
    text = text.translate(punctuationTable)
    # split(' ') yields empty tokens for runs of spaces; the filter drops them.
    words = [stemmer.stemWord(word) for word in text.split(' ') if word]
    # str.join replaces the deprecated string.join(words, ' ') function,
    # which was removed in Python 3; the result is identical.
    return ' '.join(words)
def _get_menu_page(self, s):
    """Return the stripped text of every menu item found in the HTML ``s``.

    Items are pulled one at a time with ``html_helper.advance_and_find``
    using the 'class="media-story"' marker and the '<h3>'/'</h3>' delimiters.
    Collection stops at the first call that yields ``None`` for the item.
    """
    titles = []
    heading, rest = html_helper.advance_and_find(
        s, 'class="media-story"', '<h3>', '</h3>')
    while heading is not None:
        titles.append(html_helper.strip_tags(heading).strip())
        # Continue searching from where the previous match ended.
        heading, rest = html_helper.advance_and_find(
            rest, 'class="media-story"', '<h3>', '</h3>')
    return titles
def _load_more_data(self, listing):
    """Load the blurb for ``listing`` from its page and save the listing.

    Does nothing when the listing is already fully loaded.  Otherwise the
    text between ``SECTION_MARKER`` and ``SECTION_END`` on the listing's page
    is stripped of tags, stored as the blurb, and the listing is saved.
    """
    if not listing.is_fully_loaded():
        page = open_page(self._browser, listing.url)
        raw_section, _rest = html_helper.find_in_between(
            page, SECTION_MARKER, SECTION_END)
        listing.set_blurb(html_helper.strip_tags(raw_section))
        listing.save_to_db()
def _find_listing(self, s):
    """Parse the next listing out of the search-results HTML ``s``.

    Returns a ``(listing, rest)`` pair, where ``rest`` is the HTML left after
    the parsed listing.  ``listing`` is ``None`` when no URL or title could
    be found.  Otherwise it is an ``Apartment`` built from the title, price
    and URL, with its posting timestamp set to the epoch seconds (as a
    string) of the time derived from the listing's recency text.
    """
    # Imported locally because the file's import block is not visible here.
    import time

    (url, s) = html_helper.advance_and_find(
        s, TITLE_PLACE_MARKER, 'href="', '"')
    (title, s) = html_helper.find_in_between(s, '>', '<')
    if url is None or title is None:
        return (None, s)
    title = html_helper.strip_tags(title)

    (price, s) = html_helper.advance_and_find(s, 'color-fg-green', '$', '<')
    # Prices may carry thousands separators and decimals, e.g. "1,250.00".
    price = int(float(price.strip().replace(',', '')))

    # Skip ahead past the next table cell; its content is not used.
    (_, s) = html_helper.advance_and_find(s, '<td', '', '<div')
    (recency, s) = html_helper.advance_and_find(
        s, '"bold font-size-100"', '>', '</div')
    recency = html_helper.strip_tags(recency).lower()
    dt = self._understand_recency(recency, url)

    listing = Apartment(SOURCE, title, price, url)
    # strftime('%s') is an undocumented, platform-specific directive that is
    # missing on some platforms.  time.mktime() on the local-time tuple gives
    # the same epoch seconds portably.
    # NOTE(review): assumes dt is a datetime/date (has timetuple()) -- confirm
    # against _understand_recency.
    epoch_seconds = int(time.mktime(dt.timetuple()))
    listing.set_posting_timestamp(str(epoch_seconds))
    return (listing, s)
def FeaturedStreamsMenu(sender, page=None):
    """Build the "Featured Streams" list container.

    Fetches up to ``PAGE_LIMIT`` featured entries from
    ``TWITCH_FEATURED_STREAMS`` (cached for ``CACHE_INTERVAL``) and appends
    one ``WebVideoItem`` per entry, pointing at the live player for that
    entry's channel.  ``sender`` and ``page`` are not used in the body.
    """
    container = MediaContainer(viewGroup="List", title2="Featured Streams")
    featured = JSON.ObjectFromURL(
        "%s?limit=%s" % (TWITCH_FEATURED_STREAMS, PAGE_LIMIT),
        cacheTime=CACHE_INTERVAL)

    for entry in featured['featured']:
        info = entry['stream']
        viewers_line = "%s\n%s Viewers" % (info['game'], info['viewers'])
        description = strip_tags(entry['text'])
        channel = info['channel']
        player_url = "%s&channel=%s" % (TWITCH_LIVE_PLAYER, channel['name'])
        item = WebVideoItem(
            player_url,
            title=channel['display_name'],
            subtitle=viewers_line,
            summary=description,
            thumb=info['preview'])
        container.Append(item)

    return container
def FeaturedStreamsMenu(sender, page=None):
    """Build the "Featured" object container.

    Fetches up to ``PAGE_LIMIT`` featured entries from
    ``TWITCH_FEATURED_STREAMS`` (cached for ``CACHE_INTERVAL``) and adds one
    ``VideoClipObject`` per entry, using the channel's own URL and the large
    preview image.  ``sender`` and ``page`` are not used in the body.
    """
    container = ObjectContainer(title2='Featured')
    featured = JSON.ObjectFromURL(
        "%s?limit=%s" % (TWITCH_FEATURED_STREAMS, PAGE_LIMIT),
        cacheTime=CACHE_INTERVAL)

    for entry in featured['featured']:
        info = entry['stream']
        viewers_line = "%s\n%s Viewers" % (info['game'], info['viewers'])
        description = strip_tags(entry['text'])
        channel = info['channel']
        clip = VideoClipObject(
            url=channel['url'],
            title=channel['display_name'],
            summary=description,
            source_title=viewers_line,
            thumb=info['preview']['large'])
        container.add(clip)

    return container
def _set_formatted_address(self, address, listing):
    """Clean up a raw HTML address and store it, geocoded if possible.

    ``address`` is the raw address markup.  Its pieces are separated by
    ``BREAK``, presumably an HTML line-break marker (see its definition).
    When geocoding succeeds the listing gets the coordinates and the
    formatted address.  Otherwise only the address is stored and an error
    line is printed.
    """
    # Strip off first line if it has a building in it (>2 lines)
    num_breaks = address.count(BREAK)
    if num_breaks > 1:
        address = address[address.index(BREAK) + len(BREAK):]

    # Strip out HTML tags, add NY if it's not there, fix extra spacing
    address = html_helper.strip_tags(address).replace('( map )', ', ')
    if num_breaks == 0:
        address += ', NY'
    # str.replace returns a new string.  The result used to be discarded, so
    # line breaks were never turned into comma separators.
    address = address.replace('\n', ', ')
    address = html_helper.fix_spaces(address)

    location = get_latlong(address)
    if location is not None:
        listing.set_location(location[0], location[1], address)
    else:
        listing.set_address(address)
        # A single formatted argument prints the same text under both the
        # Python 2 print statement and the Python 3 print function.
        print('error getting %s %s' % (address, listing.url))
def FollowedMenu(sender, page=None):
    """Build the "Followed" container of streams from followed channels.

    Looks up the channels followed by ``Prefs['username']`` (cached for
    ``CACHE_INTERVAL``), then queries ``TWITCH_LIST_STREAMS`` restricted to
    those channel names and adds one ``VideoClipObject`` per returned stream.
    ``sender`` and ``page`` are not used in the body.

    When the user follows no channels the empty container is returned
    without the second request.  An empty ``channel`` filter cannot select
    any followed stream, and presumably would not restrict the query at all
    -- verify against the API.
    """
    container = ObjectContainer(title2="Followed")

    followed = JSON.ObjectFromURL(
        TWITCH_FOLLOWED_STREAMS % Prefs['username'],
        cacheTime=CACHE_INTERVAL)
    channel_names = [follow['channel']['name']
                     for follow in followed['follows']]
    if not channel_names:
        return container

    query = urllib.urlencode({'channel': ','.join(channel_names)})
    streams = JSON.ObjectFromURL(TWITCH_LIST_STREAMS + "?%s" % query)

    for stream in streams['streams']:
        channel = stream['channel']
        subtitle = "%s\n%s Viewers" % (stream['game'], stream['viewers'])
        summary = strip_tags(channel['status'])
        container.add(VideoClipObject(
            url=channel['url'],
            title=channel['display_name'],
            summary=summary,
            source_title=subtitle,
            thumb=stream['preview']['large']))

    return container