Пример #1
0
    def parse(listitem):
        """Extract one business record from a search-result row.

        Pulls title/origin href, phone, website and short address out of
        the row's markup, then stores the record in the shared `data`
        dict (keyed by the listing's href) under LOCK.
        """
        tags = []
        title = ident = web = short_address = phone = lat = lng = None

        anchor = css(listitem, 'h1 a')
        if anchor:
            title = anchor[0].contents[0]
            ident = anchor[0]['href']

        tel = css(listitem, '.tel-fax .record-detail')
        if tel:
            phone = tel[0].contents[1].strip()

        link = css(listitem, '.web a[href^=http]')
        if link:
            web = link[0]['href']

        pcode = css(listitem, '.p-code .record-detail')
        if pcode:
            short_address = str(pcode[0].contents[1]).strip()

        record = {
            'title': title,
            'lat': lat,
            'lng': lng,
            'url': web,
            'phone': phone,
            'short_address': short_address,
            'tags': tags,
            'origin': ident,
        }

        # Progress dot + shared-dict write are serialized on LOCK.
        with LOCK:
            sys.stdout.write('.')
            data[ident] = record
Пример #2
0
def do_work(*args):
    # Do something with args
    url = args[0]

    with LOCK:
        print url

    html = ''.join(urllib2.urlopen(url, timeout=TIMEOUT).readlines())
    html = html.replace('<!- Google Analytics -->', '')
    html = re.sub('<script.*?>[\s\S]*?</.*?script>', '', html)
    soup = BeautifulSoup(html)

    item = {}

    def parse(listitem):
        title = ident = web = short_address = phone = lat = lng = None
        tags = []

        t = css(listitem, 'h1 a')
        if t:
            title = t[0].contents[0]
            ident = t[0]['href']

        t = css(listitem, '.tel-fax .record-detail')
        if t:
            phone = t[0].contents[1].strip()

        t = css(listitem, '.web a[href^=http]')
        if t:
            web = t[0]['href']

        t = css(listitem, '.p-code .record-detail')
        if t:
            short_address = str(t[0].contents[1]).strip()

        item = {
            'title': title,
            'lat': lat,
            'lng': lng,
            'url': web,
            'phone': phone,
            'short_address': short_address,
            'tags': tags,
            'origin': ident
        }

        with LOCK:
            sys.stdout.write('.')
            data[ident] = item

    for listitem in css(soup, '.search-row-grey-wrapper'):
        parse(listitem)

    for listitem in css(soup, '.search-row-white-wrapper'):
        parse(listitem)
Пример #3
0
 def parse(self, response):
     """Parse a scraped search page into a list of items.

     Strips the (broken) Google Analytics marker and all script tags
     before souping, then runs soup_parse over every grey and white
     result-row wrapper.
     """
     html = response.body
     html = html.replace('<!- Google Analytics -->', '')
     # Raw string: \s/\S inside a plain literal are invalid escape sequences.
     html = re.sub(r'<script.*?>[\s\S]*?</.*?script>', '', html)
     soup = BeautifulSoup(html)
     items = []
     # Grey rows first, then white, preserving the original ordering.
     for selector in ('.search-row-grey-wrapper', '.search-row-white-wrapper'):
         for listitem in css(soup, selector):
             items.append(self.soup_parse(listitem))
     return items
Пример #4
0
def do_work(*args):
    location = args[0]
    # print location
    lat, lng, address = location['lat'], location['lng'], location['short_address']

    url = URL_TEMPLATE % (lat, lng)

    xml = ''.join(urllib2.urlopen(url, timeout=TIMEOUT).readlines())
    soup = BeautifulSoup(xml)

    woeid = None
    woeid = css(soup, 'woeid')[0].contents[0]
    placetype = css(soup, 'type')[0].contents[0]

    item = {"lat_lon": [lat, lng], "latitude": lat, "longitude": lng, "_types": ["Location"], "name": address, "woeid": woeid, "placetype": placetype, "_cls": "Location"}

    with lock:
        print '.'
        data.append(item)
Пример #5
0
    def soup_parse(self, listitem):
        """Build a GCDItem from one search-result row.

        Extracts title, website URL, short address and covered area.  A
        field whose selector finds nothing is left as None.
        """
        # TODO: could generically find each <span class="bold"> heading
        # and take its next sibling as the value (see extract_span_heading).
        title = web = short_address = area = None

        t = css(listitem, 'h1 a')
        if t:
            title = t[0].contents[0]

        t = css(listitem, '.web a[href^=http]')
        if t:
            web = t[0]['href']

        t = css(listitem, '.p-code .record-detail')
        if t:
            short_address = str(t[0].contents[1]).strip()

        # "Area Covered:" lives under a bold span heading inside .p-code.
        t = css(listitem, '.p-code p')
        if t:
            area = self.extract_span_heading(t[0], 'Area Covered:')

        return GCDItem(
            title=title,
            url=web,
            short_address=short_address,
            area=area,
        )
Пример #6
0
 def extract_span_heading(self, node, heading):
     """Return the text that follows a matching bold heading span.

     GCD pages mark data up as
     <p><span class="bold">heading:</span>some text</p>
     (the container may be a <p> or a <div>).  Returns None when the
     span is missing, the heading text differs, or nothing follows it.
     """
     spans = css(node, 'span.bold')
     if not spans:
         return None
     if str(spans[0].contents[0].strip()) != heading:
         return None
     if len(node.contents) > 1:
         return str(node.contents[1]).strip()
     return None
Пример #7
0
def do_work(*args):
    url = args[0]
    html = ''.join(urllib2.urlopen(url, timeout=5).readlines())
    soup = BeautifulSoup(html)

    for listitem in css(soup, 'li.listitem'):
        title = url = short_address = phone = lat = lng = None
        activities = []
        t = css(listitem, 'h3 a')
        if t:
            title = t[0].contents[0]
            url = t[0]['href']
        sa = css(listitem, 'li.shortaddress')
        if sa:
            short_address = sa[0].contents[0]
        pn = css(listitem, 'li.phonenumber')
        if pn:
            phone = pn[0].contents[0]
        for im in css(listitem, 'div.activityicons img'):
            activities.append(im['title'])

        img = css(listitem, 'div.listmap img[alt^=Map]')
        if img:
            ll = parse_qs(urlparse(img[0]['src']).query)
            lat, lng = ll['lat'][0], ll['lng'][0]

        item = {
            'title': title,
            'lat': lat,
            'lng': lng,
            'url': url,
            'phone': phone,
            'short_address': short_address,
            'tags': activities
        }

        data_lock.acquire()
        data[item['url']] = item
        print '.'
        data_lock.release()