Exemplo n.º 1
0
    def test_create_ads(self):
        """Create one ad, attach tags from both ends of the relation, check counts.

        Three tags are created up front. A single ``Ad`` is validated with
        ``full_clean()`` and saved, then linked to ``tag1`` through
        ``ad1.tags`` and to ``tag2`` through ``tag2.ads``. ``tag3`` is never
        linked and must stay at zero ads.
        """
        tag1 = Tag.objects.create(name="HTML 5", slug="html5")
        tag2 = Tag.objects.create(name="jQuery", slug="jquery")
        tag3 = Tag.objects.create(name="Python", slug="python")

        # ``assertEqual`` is used instead of the deprecated ``assertEquals``
        # alias, which is no longer available on Python 3.12+.
        self.assertEqual(0, Ad.objects.count())

        ad1 = Ad()
        ad1.company_name = "ACME International"
        ad1.company_url = "http://acme.com/"
        ad1.title = "Ninja Top Developer"
        ad1.description = """
        We are growing, we need more developers.
        Apply if you are cool and sexy.
        """
        ad1.apply_email = "*****@*****.**"

        # Run model validation before hitting the database.
        ad1.full_clean()
        ad1.save()

        self.assertEqual(1, Ad.objects.count())

        # Link from the ad side of the relation.
        ad1.tags.add(tag1)

        self.assertEqual(1, tag1.ads.count())

        # Link from the tag side of the relation.
        tag2.ads.add(ad1)

        self.assertEqual(2, ad1.tags.count())

        self.assertEqual(0, tag3.ads.count())
Exemplo n.º 2
0
	def setUp(self):
		"""Persist one ``Newspaper`` and one ``Ad`` as fixtures for the tests."""
		Newspaper(name='Test Newspaper 1').save()

		fixture_ad = Ad()
		# Assign the fields one by one on the unsaved instance, then save once.
		for attr, value in (('name', 'Name 1'), ('content', 'Content 1')):
			setattr(fixture_ad, attr, value)
		fixture_ad.save()
Exemplo n.º 3
0
    def setUp(self):
        """Persist one ``Newspaper`` and one ``Ad`` as fixtures for the tests."""
        Newspaper(name='Test Newspaper 1').save()

        fixture_ad = Ad()
        # Assign the fields one by one on the unsaved instance, then save once.
        for attr, value in (('name', 'Name 1'), ('content', 'Content 1')):
            setattr(fixture_ad, attr, value)
        fixture_ad.save()
Exemplo n.º 4
0
 def post(self, request, pk):
     """Save a new ``Ad`` row built from the requesting user and the ad ``pk``.

     The ``Ad`` with id ``pk`` is looked up first (``get_object_or_404``).
     An ``IntegrityError`` raised by the save is ignored; the original
     author attributes it to a duplicate key. Always returns an empty
     ``HttpResponse``.
     """
     print("Add PK", pk)
     target = get_object_or_404(Ad, id=pk)
     record = Ad(user=request.user, ad=target)
     try:
         record.save()
     except IntegrityError:
         # In case of duplicate key: the row already exists, nothing to do.
         pass
     return HttpResponse()
Exemplo n.º 5
0
	def setUp(self):
		"""Persist two ``Ad`` fixtures ('Test Ad 1' and 'Test Ad 2')."""
		fixtures = (
			('Test Ad 1', 'Content 1'),
			('Test Ad 2', 'Content 2'),
		)
		for ad_name, ad_content in fixtures:
			record = Ad()
			record.name = ad_name
			record.content = ad_content
			record.save()
Exemplo n.º 6
0
    def setUp(self):
        """Persist two ``Ad`` fixtures ('Test Ad 1' and 'Test Ad 2')."""
        fixtures = (
            ('Test Ad 1', 'Content 1'),
            ('Test Ad 2', 'Content 2'),
        )
        for ad_name, ad_content in fixtures:
            record = Ad()
            record.name = ad_name
            record.content = ad_content
            record.save()
Exemplo n.º 7
0
	def setUp(self):
		"""Persist two newspapers and one ad that is linked to both of them."""
		papers = []
		for paper_name in ('Test Newspaper 1', 'Test Newspaper 2'):
			paper = Newspaper(name=paper_name)
			paper.save()
			papers.append(paper)

		fixture_ad = Ad()
		fixture_ad.name = 'Name 1'
		fixture_ad.content = 'Content 1'
		# The ad is saved before ``newspapers.add`` is called on it.
		fixture_ad.save()
		fixture_ad.newspapers.add(*papers)
		fixture_ad.save()
Exemplo n.º 8
0
    def setUp(self):
        """Persist two newspapers and one ad that is linked to both of them."""
        papers = []
        for paper_name in ('Test Newspaper 1', 'Test Newspaper 2'):
            paper = Newspaper(name=paper_name)
            paper.save()
            papers.append(paper)

        fixture_ad = Ad()
        fixture_ad.name = 'Name 1'
        fixture_ad.content = 'Content 1'
        # The ad is saved before ``newspapers.add`` is called on it.
        fixture_ad.save()
        fixture_ad.newspapers.add(*papers)
        fixture_ad.save()
Exemplo n.º 9
0
	def setUp(self):
		"""Create three newspapers and three ads with fixed ids, then start Firefox.

		The ads get explicit ids 1, 2 and 3. Only the second ad is linked to
		newspapers (the first two). The browser is created last and
		configured with an implicit wait of 3.
		"""
		newspaper1 = Newspaper()
		newspaper1.name = 'Test Newspaper 1'
		newspaper1.save()

		newspaper2 = Newspaper()
		newspaper2.name = 'Test Newspaper 2'
		newspaper2.save()

		# Saved but not linked to any ad.
		newspaper3 = Newspaper()
		newspaper3.name = 'Test Newspaper 3'
		newspaper3.save()

		ad1 = Ad()
		ad1.name = 'Test Ad 1'
		ad1.id = 1
		ad1.content = 'Content 1'
		ad1.save()

		ad2 = Ad()
		ad2.name = 'Test Ad 2'
		ad2.id = 2
		ad2.content = 'Content 2'
		ad2.save()
		ad2.newspapers.add(newspaper1, newspaper2)
		ad2.save()

		ad3 = Ad()
		# Set the id on ad3 itself; assigning it to ad2 would change the
		# already-saved ad2 in memory and leave ad3 with a generated id.
		ad3.id = 3
		ad3.name = 'Test Ad 3'
		ad3.content = 'Content 3'
		ad3.save()

		self.browser = webdriver.Firefox()
		self.browser.implicitly_wait(3)
Exemplo n.º 10
0
    def setUp(self):
        """Create three newspapers and three ads with fixed ids, then start Firefox.

        The ads get explicit ids 1, 2 and 3. Only the second ad is linked to
        newspapers (the first two). The browser is created last and
        configured with an implicit wait of 3.
        """
        newspaper1 = Newspaper()
        newspaper1.name = 'Test Newspaper 1'
        newspaper1.save()

        newspaper2 = Newspaper()
        newspaper2.name = 'Test Newspaper 2'
        newspaper2.save()

        # Saved but not linked to any ad.
        newspaper3 = Newspaper()
        newspaper3.name = 'Test Newspaper 3'
        newspaper3.save()

        ad1 = Ad()
        ad1.name = 'Test Ad 1'
        ad1.id = 1
        ad1.content = 'Content 1'
        ad1.save()

        ad2 = Ad()
        ad2.name = 'Test Ad 2'
        ad2.id = 2
        ad2.content = 'Content 2'
        ad2.save()
        ad2.newspapers.add(newspaper1, newspaper2)
        ad2.save()

        ad3 = Ad()
        # Set the id on ad3 itself; assigning it to ad2 would change the
        # already-saved ad2 in memory and leave ad3 with a generated id.
        ad3.id = 3
        ad3.name = 'Test Ad 3'
        ad3.content = 'Content 3'
        ad3.save()

        self.browser = webdriver.Firefox()
        self.browser.implicitly_wait(3)
Exemplo n.º 11
0
    def setUp(self):
        """Create three newspapers and two ads with overlapping newspaper links.

        The first ad is linked to newspapers 1 and 2, the second ad to
        newspapers 2 and 3, so newspaper 2 is shared by both ads.
        """
        newspaper1 = Newspaper(name='Test Newspaper 1')
        newspaper1.save()

        newspaper2 = Newspaper(name='Test Newspaper 2')
        newspaper2.save()

        newspaper3 = Newspaper(name='Test Newspaper 3')
        newspaper3.save()

        first_ad = Ad()
        first_ad.name = 'Name 1'
        first_ad.content = 'Content 1'
        first_ad.save()
        first_ad.newspapers.add(newspaper1, newspaper2)
        first_ad.save()

        second_ad = Ad()
        second_ad.name = 'Name 2'
        second_ad.content = 'Content 2'
        second_ad.save()
        second_ad.newspapers.add(newspaper2, newspaper3)
        # Save the ad that was just modified (mirrors the first ad's sequence),
        # not first_ad again.
        second_ad.save()
Exemplo n.º 12
0
	def setUp(self):
		"""Create three newspapers and two ads with overlapping newspaper links.

		The first ad is linked to newspapers 1 and 2, the second ad to
		newspapers 2 and 3, so newspaper 2 is shared by both ads.
		"""
		newspaper1 = Newspaper(name='Test Newspaper 1')
		newspaper1.save()

		newspaper2 = Newspaper(name='Test Newspaper 2')
		newspaper2.save()

		newspaper3 = Newspaper(name='Test Newspaper 3')
		newspaper3.save()

		first_ad = Ad()
		first_ad.name = 'Name 1'
		first_ad.content = 'Content 1'
		first_ad.save()
		first_ad.newspapers.add(newspaper1, newspaper2)
		first_ad.save()

		second_ad = Ad()
		second_ad.name = 'Name 2'
		second_ad.content = 'Content 2'
		second_ad.save()
		second_ad.newspapers.add(newspaper2, newspaper3)
		# Save the ad that was just modified (mirrors the first ad's sequence),
		# not first_ad again.
		second_ad.save()
Exemplo n.º 13
0
	def setUp(self):
		"""Start a Firefox driver, then persist three ``Ad`` fixtures."""
		driver = webdriver.Firefox()
		self.browser = driver
		driver.implicitly_wait(3)

		# Ads are numbered 1..3; name and content share the same number.
		for number in (1, 2, 3):
			record = Ad()
			record.name = 'Test Ad %d' % number
			record.content = 'Content %d' % number
			record.save()
Exemplo n.º 14
0
    def setUp(self):
        """Start a Firefox driver, then persist three ``Ad`` fixtures."""
        driver = webdriver.Firefox()
        self.browser = driver
        driver.implicitly_wait(3)

        # Ads are numbered 1..3; name and content share the same number.
        for number in (1, 2, 3):
            record = Ad()
            record.name = 'Test Ad %d' % number
            record.content = 'Content %d' % number
            record.save()
Exemplo n.º 15
0
def extract_data(ad_urls, uuid):
    """Scrape every ad page in ``ad_urls`` and store each one as an ``Ad``.

    For each URL the page is fetched through the module-level ``opener`` and
    parsed. The title, the recognised rows of the ``attributeTable`` and the
    map coordinates are collected into keyword arguments for ``Ad``, which
    is then saved.

    Any exception raised while processing a URL is printed and that URL is
    skipped. After every URL, whether it succeeded or not, the ``found``
    counter of the cache entry stored under ``uuid`` is incremented.

    Args:
        ad_urls: iterable of ad page URLs.
        uuid: cache key of the progress entry, a dict with ``total`` and
            ``found`` keys.

    Returns:
        The list of ``Ad`` objects that were saved.
    """
    result = []
    # Label shown in the ad's attribute table -> internal field name.
    attribute_dict = {
        'Date Listed': 'date',
        'Price': 'price',
        'Address': 'address',
        'Bathrooms (#)': 'bathrooms',
        'Furnished': 'furnished',
        'Pet Friendly': 'pet_friendly',
    }

    for ad_url in ad_urls:
        try:
            items = dict()  # keyword arguments used to create the db object
            items['url'] = ad_url
            ad_soup = bs(opener.open(ad_url).read())
            map_link = ''

            # title
            title = ad_soup.find('h1', id='preview-local-title').getText()
            title = title.replace('google_ad_section_start', '')
            title = title.replace('google_ad_section_end', '')
            items['title'] = title

            # table data: a label cell is followed by its value cell
            for tr in ad_soup.find('table', id='attributeTable').findAll('tr'):
                for td in tr.findAll('td'):
                    key = td.getText()
                    if key not in attribute_dict:
                        continue
                    field = attribute_dict[key]
                    value = td.findNext('td').getText()

                    if field == 'date':
                        items['pub_date'] = datetime.datetime.strptime(value, "%d-%b-%y")
                    elif field == 'bathrooms':
                        # Raw string with an escaped dot so only a decimal
                        # point is accepted between the digits.
                        items['bathrooms'] = float(
                            re.match(r'(\d+\.?\d?) bathroom.*', value).group(1))
                    elif field == 'price':
                        # ``in`` also catches a match at index 0, which the
                        # previous ``find(...) > 0`` check missed.
                        if 'contact' in value:
                            items['rent'] = 0
                        else:
                            # Drop the leading currency symbol and thousands separators.
                            items['rent'] = int(float(value[1:].replace(',', '')))
                    elif field == 'address':
                        items['address'] = value.replace('View map', '')
                    else:
                        # 'furnished' / 'pet_friendly': anything but 'No' counts as True.
                        items[field] = value != 'No'

            # map coordinates: taken from the 'center' query parameter of the
            # last <noscript> image found on the map page
            map_url = 'http://montreal.kijiji.ca' + ad_soup.find(
                'a', attrs={'class': 'viewmap-link'}).get('href')
            map_soup = bs(opener.open(map_url).read())
            for noscript in map_soup.findAll('noscript'):
                if noscript.find('img'):
                    map_link = noscript.find('img').get('src')
            coords = urllib2.urlparse.parse_qs(urllib2.urlparse.urlparse(map_link).query)
            lat_lng = coords['center'][0].split(',')
            items['lat'] = float(lat_lng[0])
            items['lng'] = float(lat_lng[1])

            print(items)
            ad = Ad(**items)
            ad.save()
            result.append(ad)

        except Exception as e:
            # Best effort: report the failure and skip to the next URL.
            print("[FAILED] %s" % (ad_url,))
            print(e)

        # Progress bookkeeping. A missing or expired cache entry must not
        # abort the remaining URLs.
        cache_obj = cache.get(uuid)
        if cache_obj is not None:
            new_obj = {'total': cache_obj['total'], 'found': cache_obj['found'] + 1}
            cache.set(uuid, new_obj)

    return result
Exemplo n.º 16
0
def import_ads(file_id):
    """Create ``Ad`` rows from the parsed lines of an import file.

    The import settings are read from the ``PersistentDict`` stored at
    ``imported_file.settings.path``:

    * ``params['lines']`` holds the data rows.
    * ``params['matching']`` maps each ad field to a column index, a list of
      column indexes, or a ``'Constant'`` marker.
    * ``params['constants']`` optionally holds the constant values.

    An ad is only created when every field in ``MatchFieldsForm.required``
    was filled. For each created ad ``find_similar`` is queued, and
    ``import_attachments`` is queued too when the ad's URL is on one of
    ``Crawler.DOMAINS``. Progress is reported on the current celery task
    after every line.

    Args:
        file_id: primary key of the ``ImportFile`` to process.

    Returns:
        The list of ids of the created ads (also stored in ``params['ids']``).
    """
    imported_file = ImportFile.objects.get(id=file_id)
    clear_prev_ads(imported_file)
    with PersistentDict(imported_file.settings.path, 'w', format='json') as params:
        ids = []
        count = len(params['lines'])
        for current, line in enumerate(params['lines'], 1):
            ad = {}
            for field in params['matching']:
                source = params['matching'][field]
                if source != 'Constant' and 'Constant' not in source:
                    # One column index or several; several columns are
                    # joined with ", ".
                    poss = source if isinstance(source, list) else [source]
                    parts = []
                    for pos in poss:
                        pos = int(pos)
                        # ``<=``: an index equal to len(line) is already out
                        # of range and would raise IndexError below.
                        if len(line) <= pos or not line[pos].strip():
                            continue
                        parts.append(line[pos])
                    value = ", ".join(parts)

                    if not value:
                        continue
                    if field in CompareFieldsForm.compared_fields:
                        # The column holds the pk of a comparison object that
                        # maps the imported value to a local one.
                        curr_comp_field = CompareFieldsForm.compared_fields[field]
                        compared_obj = curr_comp_field['model'].objects.get(pk=value)
                        if field == 'category':
                            if compared_obj.sub_category is not None:
                                ad['category'] = compared_obj.sub_category.category
                                ad['sub_category'] = compared_obj.sub_category
                            else:
                                ad['category'] = compared_obj.category
                        elif field == 'district':
                            if compared_obj.district:
                                ad['district'] = compared_obj.district
                                ad['town'] = compared_obj.district.town
                            else:
                                ad['town'] = compared_obj.town
                        else:
                            ad[field] = compared_obj.value
                    elif field == 'pub_date':
                        pub_date = dateutil.parser.parse(value)
                        # A date without a time part gets the current time.
                        if str(pub_date.time()) == "00:00:00":
                            pub_date = datetime.combine(pub_date.date(), datetime.now().time())
                        ad[field] = ad['order_date'] = pub_date
                    else:
                        ad[field] = Ad.filter_field(value, field)
                elif 'constants' in params and field in params['constants']:  # constant
                    value = Ad.filter_field(str(params['constants'][field]), field)
                    if field in CompareFieldsForm.compared_fields:
                        if field == 'category':
                            # "<category pk>" or "<category pk>-<sub category pk>"
                            cat_values = value.split('-')
                            if len(cat_values) > 1:
                                sub_cat = SubCategory.objects.get(pk=cat_values[1])
                                ad['category'] = sub_cat.category
                                ad['sub_category'] = sub_cat
                            else:
                                cat = Category.objects.get(pk=cat_values[0])
                                ad['category'] = cat
                        elif 'foreign' in CompareFieldsForm.compared_fields[field]:
                            ad[field] = CompareFieldsForm.compared_fields[field]['foreign'].objects.get(pk=value)
                        else:
                            for key, val in CompareFieldsForm.compared_fields[field]['choices']:
                                if str(key) == value:
                                    ad[field] = val
                    else:
                        ad[field] = value

            # Lines missing any required field are skipped.
            empty = set(MatchFieldsForm.required) - set(ad.keys())
            if not empty:
                ad_obj = Ad(**ad)
                ad_obj.imported = imported_file
                ad_obj.save()
                find_similar.delay(ad_obj.id)
                if ad_obj.url:
                    parsed_uri = urlparse(ad_obj.url)
                    if parsed_uri.netloc in Crawler.DOMAINS:
                        import_attachments.delay(ad_obj.id)
                ids.append(ad_obj.id)
            celery.current_task.update_state(state='PROGRESS', meta={'current': current, 'total': count})
        params['ids'] = ids
        return ids
Exemplo n.º 17
0
    def get_ads(cls):
        """Import ads dated yesterday..today from the estate-in-kharkov.com database.

        Every table in the hard-coded list is queried page by page (200 rows
        per request) through ``cls.get``. Items whose ``kod`` already exists
        as ``Ad.gkey`` are skipped. Each remaining item is converted to
        keyword arguments for ``Ad``, saved, and passed to ``find_similar``.
        ``import_attachments`` is queued as well when the ad URL is on one
        of ``Crawler.DOMAINS``.

        A response without ``items`` is retried with a growing delay while
        the shared ``errors`` budget (5) lasts; after that the assertion
        fails. Any exception is added to the log and re-raised. The
        per-table statistics are always sent through ``send_log``.
        """
        errors = 5
        # Remote field name -> Ad field name.
        match = {'cena': 'price', 'datap': 'pub_date', 'dom_etag': 'floor_max', 'etag': 'floor',
                 'kol_komn': 'rooms_count', 'pl1': 'area_living', 'plk': 'area_kitchen', 'plosh': 'area',
                 'pl_land': 'area_land', 'textob': 'desc'}

        now = datetime.now().strftime("%Y-%m-%d")
        yesterday = (date.today() - timedelta(1)).strftime("%Y-%m-%d")

        data = {
            're_base_name': 'kvart',
            're_base_section': 'green',
            'rem': 0,
            'subq_start': 0,
            'subq_lines': 200,
            'order_by[data]': 'desc',
            'order_by[datap]': 'desc',
            'data_start': yesterday,
            'data_end': now,
        }
        statistic = {}
        log = ""
        try:
            for table in ['arenda', 'kvart', 'komn', 'domm', 'negil', 'client']:  # all tables
                data['re_base_name'] = table
                print('Load from ' + table)
                statistic[table] = {}
                for private_type in ['green']:  # ['green', 'red']:  # all bases
                    statistic[table][private_type] = 0
                    print('table type ' + private_type)
                    data['re_base_section'] = private_type
                    current = 0
                    total = 200
                    while current < total:   # all pages
                        print('select 200 starts from ' + str(current) + ', total is ' + str(total))
                        data['subq_start'] = current
                        str_data = urllib.urlencode(data)
                        content = cls.get('http://estate-in-kharkov.com/ps/re_base/ajax/real-estate-database.php', str_data)
                        current += 200
                        if 're_base_query_count' in content:
                            total = int(content['re_base_query_count'])

                        if 'items' not in content and errors:
                            # Retry the same page after a delay that grows
                            # with every error used up.
                            print("hasn't content , errors left %d" % errors)
                            errors -= 1
                            print(content)
                            time.sleep((6-errors)*10)
                            current -= 200
                            continue
                        assert 'items' in content, 'unexpected content: ' + str(content)

                        if content['items'] is False:
                            break
                        for item in content['items'].values():
                            aid = item['kod']
                            try:
                                existed = Ad.objects.get(gkey=aid)
                                print(str(existed) + ' already imported!')
                                continue
                            except Ad.DoesNotExist:
                                pass

                            # Copy only the non-empty, known fields.
                            ad_item = {'gkey': aid}
                            for field in item:
                                if item[field] and field in match:
                                    ad_item[match[field]] = item[field]

                            ad = cls.parse_categories(ad_item, item, table)
                            # continue
                            if 'sub_category' in ad:
                                print(ad['sub_category'])
                            if 'rooms_count' in ad:
                                print('rooms: ' + str(ad['rooms_count']))

                            if item['textob']:
                                ad['title'] = truncatesmart(item['textob'], 45)
                                if not ad['title']:
                                    ad['title'] = item['textob'][:45]

                            ad['offering'] = table != 'client'
                            ad['private'] = private_type == 'green'

                            # Join only the phones that are present, so a
                            # missing tel1 no longer yields a leading ", ".
                            phones = []
                            for i in range(1, 5):
                                name = 'tel'+str(i)
                                if name in item and item[name].strip():
                                    phones.append(item[name].strip())
                            ad['phone'] = ', '.join(phones)

                            if item['metro']:
                                try:
                                    metro = Metro.objects.get(pk=item['metro'])
                                    # 'desc' is absent when 'textob' was empty.
                                    if ad.get('desc'):
                                        ad['desc'] += ' ' + metro.name
                                    else:
                                        ad['desc'] = metro.name
                                except Metro.DoesNotExist:
                                    pass

                            if item['ulica']:
                                try:
                                    ad['address'] = Street.objects.get(pk=item['ulica'])
                                except Street.DoesNotExist:
                                    pass

                            if item['raj']:
                                try:
                                    ad['district'] = District.objects.get(pk=item['raj'])
                                except District.DoesNotExist:
                                    pass

                            if item['nasp']:
                                # Remote town id '293' is stored locally as pk 1.
                                item['nasp'] = 1 if item['nasp'] == '293' else item['nasp']
                                try:
                                    ad['town'] = Town.objects.get(pk=item['nasp'])
                                except Town.DoesNotExist:
                                    # Ads in an unknown town are not imported.
                                    continue

                            # 'pub_date' is absent when 'datap' was empty.
                            if ad.get('pub_date'):
                                pub_date = dateutil.parser.parse(ad['pub_date'])
                                # A date without a time part gets the current time.
                                if str(pub_date.time()) == "00:00:00":
                                    pub_date = datetime.combine(pub_date.date(), datetime.now().time())

                                pub_date_utc = pub_date.replace(tzinfo=timezone.get_current_timezone())
                                ad['pub_date'] = ad['order_date'] = pub_date_utc

                            # NOTE(review): str.find() returns -1 (truthy) when
                            # the marker is absent and 0 (falsy) when it is at
                            # the start, so this is not a containment test.
                            # Kept as-is because the intended behaviour is not
                            # clear from here. Confirm before changing.
                            if item['fotosite'] and item['fotosite'].find('&have_images'):
                                ad['url'] = item['fotosite'].replace('&have_images', '')

                            ad_obj = Ad(**ad)
                            ad_obj.save()
                            statistic[table][private_type] += 1

                            find_similar.delay(ad_obj.id)
                            if ad_obj.url:
                                parsed_uri = urlparse(ad_obj.url)
                                if parsed_uri.netloc in Crawler.DOMAINS:
                                    import_attachments.delay(ad_obj.id)

                            print(ad_obj)
                            time.sleep(1)
        except:
            # Bare except is intentional: everything is logged, then re-raised.
            log += "Error: " + traceback.format_exc() + "\n"
            raise
        finally:
            # One "<table>: <green> / <red>" line per table that was started.
            for name, stats in statistic.items():
                log += name + ': '
                if 'green' in stats:
                    log += str(stats['green'])
                log += " / "
                if 'red' in stats:
                    log += str(stats['red'])
                log += "\n"
            send_log(log)