Пример #1
0
def run_task(job_string, runner):
    # Rebuild a Job from `job_string`, run it on behalf of `runner`, and then
    # announce the run on the 'all_tasks' channel.
    # NOTE(review): the handler belonging to this outer `try` is not visible in
    # this excerpt -- confirm how failures are handled further down.
    try:
        task = Job.rehydrate(job_string)

        try:
            # Record on the runner which job it is about to execute.
            runner.current_job = task.current_job
            task.run()
        except Exception, e:
            # print traceback.format_exc()
            # task.finish(traceback.format_exc())
            # The exception is re-raised unchanged; `e` itself is not used.
            raise  #####

        # Only reached when task.run() did not raise: publish the runner
        # number, an ISO-formatted timestamp from utc_now(), the task key and
        # the job's code word.
        hxdispatcher.send('all_tasks', {'runner': runner.number,
                                        'time': utc_now().isoformat(),
                                        'message': task.key,
                                        'code_word': task.current_job.code_word})

        # NOTE(review): `thread` is not defined in the visible code --
        # presumably a module-level object tracking the running task; confirm.
        thread.running_task = None
Пример #2
0
def scrape_funda(username, password):
    """Log in to funda.nl, walk the ``BEWRD_URL`` result pages and store each house.

    For every listing found on the result pages the address is geocoded
    through ``GEOCODE_URL``, a ``House`` row is created and the same details
    are pushed to the row's channel via ``hxdispatcher``.

    Args:
        username: Account name submitted in the funda.nl login form.
        password: Password submitted in the funda.nl login form.

    Raises:
        IndexError: If the login page carries no ``__RequestVerificationToken``
            or a listing title does not have the expected line layout.
    """
    session = requests.Session()
    session.headers.update({'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36'})
    login_page = session.get('https://www.funda.nl/mijn/login/')

    # The login form embeds a verification token that has to be posted back.
    request_validation_re = re.compile(r'<input name="__RequestVerificationToken" type="hidden" value="(.*?)" />')
    tokens = request_validation_re.findall(login_page.text)

    payload = {
        '__RequestVerificationToken': tokens[0],
        'Username': username,
        'Password': password,
        'RememberMe': 'false'
    }
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
    }
    session.post('https://www.funda.nl/mijn/login/',
                 data=urllib.urlencode(payload),
                 cookies=login_page.cookies,
                 headers=headers)

    # Discover how many result pages exist from the pagination links on page 1.
    resp = session.get(BEWRD_URL + 'p1')
    soup = BeautifulSoup(resp.text, "html5lib")
    pages = [int(anchor["data-pagination-page"])
             for anchor in soup.find_all("a", attrs={"data-pagination-page": True})]
    # Without pagination links there is at most one page; still visit page 1
    # instead of failing on max() of an empty list.
    last_page = max(pages) if pages else 1
    links = [BEWRD_URL + 'p' + str(i) for i in range(1, last_page + 1)]

    for page in links:
        html = session.get(page)
        soup = BeautifulSoup(html.text, "html5lib")
        results = soup.find('ul', class_='search-results')
        if results is None:
            # Page without a result list: nothing to store.
            continue

        for house in results.find_all('li', class_="search-result "):
            raw_address = house.find('h3', class_='search-result-title').text

            # The title text spans several lines: index 1 holds street + number,
            # index 3 holds "<postcode> <city>".
            raw_address_list = [s.strip() for s in raw_address.splitlines()]
            street_nr = raw_address_list[1]
            postalcode_city = raw_address_list[3]
            postcode = postalcode_city[:7]
            city = postalcode_city[8:]
            address = street_nr + ' ' + postalcode_city
            lnk = "http://www.funda.nl" + house.find('a', href=True)['href']
            object_handle = house.find('a', class_='remove-object-handle')['href']
            dellink = 'http://www.funda.nl' + object_handle
            funda_id = int(object_handle.split('tinyId=')[1])
            image_src = house.find('img')['src']

            # Numeric fields fall back to 0 when the text is not a number
            # (ValueError) or the <span> is absent altogether (AttributeError).
            try:
                price = int(unidecode.unidecode(house.find('span', class_='search-result-price').text).replace("EUR ", "").replace(".", "").replace("kk",""))
            except (ValueError, AttributeError):
                price = 0
            try:
                woonopp = int(house.find('span', title="Woonoppervlakte").text.split(' ')[0])
            except (ValueError, AttributeError):
                woonopp = 0
            try:
                percopp = int(house.find('span', title="Perceeloppervlakte").text.split(' ')[0].replace(".", ""))
            except (ValueError, AttributeError):
                percopp = 0

            # Price per unit of living area; 0 when the area is unknown.
            sqprice = price / woonopp if woonopp != 0 else 0

            #pcode = street_nr + " " + postalcode_city
            pcode = street_nr + " " + postcode
            response = requests.get(GEOCODE_URL + pcode)

            # Reset per house so a geocoder reply without any Point can never
            # reuse the coordinates of the previously processed house.
            rdxy = None
            try:
                # see http://gis.stackexchange.com/questions/58271/using-python-to-parse-an-xml-containing-gml-tags
                root = ET.fromstring(response.content)
                for point in root.findall('.//{http://www.opengis.net/gml}Point'):
                    rdxy = point.findtext("{http://www.opengis.net/gml}pos").split()
                if rdxy is None:
                    raise ValueError('geocoder returned no gml:Point for %r' % pcode)
                pnt = GEOSGeometry('POINT({0} {1})'.format(rdxy[0], rdxy[1]), srid=28992)
                # see http://gis.stackexchange.com/questions/94640/geodjango-transform-not-working
                pnt.transform(4326)
            except Exception:
                # Best effort: any geocoding failure stores the house at (0, 0)
                # rather than aborting the scrape. Unlike a bare ``except`` this
                # lets KeyboardInterrupt / SystemExit propagate.
                rdxy = [0, 0]
                pnt = GEOSGeometry('POINT({0} {1})'.format(0, 0), srid=4326)

            cm = House.objects.create(
                fuid=funda_id,
                image=image_src,
                address=address,
                strnumr=street_nr,
                postcod=postcode,
                plaprov=city,
                woonopp=woonopp,
                percopp=percopp,
                vrprijs=price,
                sqprijs=sqprice,
                link=lnk,
                dellink=dellink,
                rdx=rdxy[0],
                rdy=rdxy[1],
                lat=pnt.y,
                lon=pnt.x,
                sender='backend',
                channel='homepage',
                content='yo',
                geom=pnt
            )

            # Broadcast the stored house on its channel.
            hxdispatcher.send(cm.channel, {
                'fuid': funda_id,
                'image': image_src,
                'address': address,
                'strnumr': street_nr,
                'postcod': postcode,
                'plaprov': city,
                'woonopp': woonopp,
                'percopp': percopp,
                'vrprijs': price,
                'sqprijs': sqprice,
                'link': lnk,
                'dellink': dellink,
                'lat': pnt.y,
                'lon': pnt.x
            })
Пример #3
0
 def my_long_thing():
     """Emit five "Another noodle" messages, roughly one per second.

     Each round prints a line to the console, sleeps for one second and then
     sends the message through ``hxdispatcher``.
     """
     remaining = 5
     while remaining:
         print("another noodle on the python console")
         time.sleep(1)
         hxdispatcher.send(b'noodly_messages', "Another noodle")
         remaining -= 1
Пример #4
0
 def dataReceived(self, data):
     """Forward every received chunk of ``data`` to the noodly_messages channel."""
     channel = 'noodly_messages'
     hxdispatcher.send(channel, data)
Пример #5
0
 def dataReceived(self, data):
     """Relay incoming ``data`` unchanged via ``hxdispatcher``."""
     # The payload is passed along exactly as it was received.
     target_channel = 'noodly_messages'
     hxdispatcher.send(target_channel, data)
Пример #6
0
 def my_long_thing():
     """Send "Another noodle" five times, sleeping one second before each send."""
     channel, message = 'noodly_messages', "Another noodle"
     for _ in range(5):
         time.sleep(1)
         hxdispatcher.send(channel, message)
Пример #7
0
def scrape_funda(username, password):
    """Log in to funda.nl, walk the ``BEWRD_URL`` result pages and store each house.

    Pages are parsed with the ``lxml`` BeautifulSoup backend. For every
    listing the address is geocoded through ``GEOCODE_URL``, a ``House`` row
    is created and the same details are pushed to the row's channel via
    ``hxdispatcher``.

    Args:
        username: Account name submitted in the funda.nl login form.
        password: Password submitted in the funda.nl login form.

    Raises:
        IndexError: If the login page carries no ``__RequestVerificationToken``
            or a listing title does not have the expected line layout.
    """
    session = requests.Session()
    session.headers.update({'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36'})
    login_page = session.get('https://www.funda.nl/mijn/login/')

    # The login form embeds a verification token that has to be posted back.
    request_validation_re = re.compile(r'<input name="__RequestVerificationToken" type="hidden" value="(.*?)" />')
    tokens = request_validation_re.findall(login_page.text)

    payload = {
        '__RequestVerificationToken': tokens[0],
        'Username': username,
        'Password': password,
        'RememberMe': 'false'
    }
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
    }
    session.post('https://www.funda.nl/mijn/login/',
                 data=urllib.urlencode(payload),
                 cookies=login_page.cookies,
                 headers=headers)

    # Discover how many result pages exist from the pagination links on page 1.
    resp = session.get(BEWRD_URL + 'p1')
    soup = BeautifulSoup(resp.text, "lxml")
    pages = [int(anchor["data-pagination-page"])
             for anchor in soup.find_all("a", attrs={"data-pagination-page": True})]
    # Without pagination links there is at most one page; still visit page 1
    # instead of failing on max() of an empty list.
    last_page = max(pages) if pages else 1
    links = [BEWRD_URL + 'p' + str(i) for i in range(1, last_page + 1)]

    for page in links:
        html = session.get(page)
        soup = BeautifulSoup(html.text, "lxml")
        results = soup.find('ul', class_='search-results')
        if results is None:
            # Page without a result list: nothing to store.
            continue

        for house in results.find_all('li', class_="search-result "):
            raw_address = house.find('h3', class_='search-result-title').text

            # The title text spans several lines: index 1 holds street + number,
            # index 3 holds "<postcode> <city>".
            raw_address_list = [s.strip() for s in raw_address.splitlines()]
            street_nr = raw_address_list[1]
            postalcode_city = raw_address_list[3]
            postcode = postalcode_city[:7]
            city = postalcode_city[8:]
            address = street_nr + ' ' + postalcode_city
            lnk = "http://www.funda.nl" + house.find('a', href=True)['href']
            object_handle = house.find('a', class_='remove-object-handle')['href']
            dellink = 'http://www.funda.nl' + object_handle
            funda_id = int(object_handle.split('tinyId=')[1])
            image_src = house.find('img')['src']

            # Numeric fields fall back to 0 when the text is not a number
            # (ValueError) or the <span> is absent altogether (AttributeError).
            try:
                price = int(unidecode.unidecode(house.find('span', class_='search-result-price').text).replace("EUR ", "").replace(".", "").replace("kk",""))
            except (ValueError, AttributeError):
                price = 0
            try:
                woonopp = int(house.find('span', title="Woonoppervlakte").text.split(' ')[0])
            except (ValueError, AttributeError):
                woonopp = 0
            try:
                percopp = int(house.find('span', title="Perceeloppervlakte").text.split(' ')[0].replace(".", ""))
            except (ValueError, AttributeError):
                percopp = 0

            # Price per unit of living area; 0 when the area is unknown.
            sqprice = price / woonopp if woonopp != 0 else 0

            #pcode = street_nr + " " + postalcode_city
            pcode = street_nr + " " + postcode
            response = requests.get(GEOCODE_URL + pcode)

            # Reset per house so a geocoder reply without any Point can never
            # reuse the coordinates of the previously processed house.
            rdxy = None
            try:
                # see http://gis.stackexchange.com/questions/58271/using-python-to-parse-an-xml-containing-gml-tags
                root = ET.fromstring(response.content)
                for point in root.findall('.//{http://www.opengis.net/gml}Point'):
                    rdxy = point.findtext("{http://www.opengis.net/gml}pos").split()
                if rdxy is None:
                    raise ValueError('geocoder returned no gml:Point for %r' % pcode)
                pnt = GEOSGeometry('POINT({0} {1})'.format(rdxy[0], rdxy[1]), srid=28992)
                # see http://gis.stackexchange.com/questions/94640/geodjango-transform-not-working
                pnt.transform(4326)
            except Exception:
                # Best effort: any geocoding failure stores the house at (0, 0)
                # rather than aborting the scrape. Unlike a bare ``except`` this
                # lets KeyboardInterrupt / SystemExit propagate.
                rdxy = [0, 0]
                pnt = GEOSGeometry('POINT({0} {1})'.format(0, 0), srid=4326)

            cm = House.objects.create(
                fuid=funda_id,
                image=image_src,
                address=address,
                strnumr=street_nr,
                postcod=postcode,
                plaprov=city,
                woonopp=woonopp,
                percopp=percopp,
                vrprijs=price,
                sqprijs=sqprice,
                link=lnk,
                dellink=dellink,
                rdx=rdxy[0],
                rdy=rdxy[1],
                lat=pnt.y,
                lon=pnt.x,
                sender='backend',
                channel='homepage',
                content='yo',
                geom=pnt
            )

            # Broadcast the stored house on its channel.
            hxdispatcher.send(cm.channel, {
                'fuid': funda_id,
                'image': image_src,
                'address': address,
                'strnumr': street_nr,
                'postcod': postcode,
                'plaprov': city,
                'woonopp': woonopp,
                'percopp': percopp,
                'vrprijs': price,
                'sqprijs': sqprice,
                'link': lnk,
                'dellink': dellink,
                'lat': pnt.y,
                'lon': pnt.x
            })
Пример #8
0
 def my_long_thing():
     """Publish "Another noodle" on noodly_messages five times, pausing 1s before each."""
     sends_left = 5
     while sends_left > 0:
         time.sleep(1)
         hxdispatcher.send('noodly_messages', "Another noodle")
         sends_left -= 1