Example #1
def data_web_update():

    from webb import webb
    from aplikace.models import Product
    #    from locale import atof
    import re
    from html_table_parser import HTMLTableParser
    # Assumption: db is the app's Flask-SQLAlchemy handle; the original snippet
    # uses db.session without showing where db comes from.
    from aplikace import db
    id = 0
    for polozka in Product.notKL():
        url = "http://www.vskprofi.cz/vyhledavani?type=sku&search=" + polozka.Obj + "&sku=OK"
        page = webb.download_page(url)
        p = HTMLTableParser()
        p.feed(page.decode('utf-8'))
        #print(p.tables)
        # p.tables is a list of parsed tables; each table is a list of rows and
        # each row a list of cell strings, so ar[0:1][0][1] (i.e. ar[0][1]) is
        # the second row of the first table on the search-result page.
        ar = p.tables
        try:
            data = Product.find_by_Obj(polozka.Obj)
            # Cells 6-9 of that row are checked for links to technical data,
            # product lists and photos.
            for i in range(6, 10):

                if re.search('technical-data', ar[0:1][0][1][i]):
                    data.TL = ar[0:1][0][1][i]
                    #print ar[0:1][0][1][i]
                if re.search('product-lists', ar[0:1][0][1][i]):
                    data.KL = ar[0:1][0][1][i]
                    #print ar[0:1][0][1][i]
                if re.search('pics', ar[0:1][0][1][i]):
                    data.Foto = ar[0:1][0][1][i]
                    #print ar[0:1][0][1][i]
            # Cell 3 holds the stock level ("sklad"); cell 11 an optional note ("Poznamka").
            data.sklad = ar[0:1][0][1][3]
            if ar[0:1][0][1][11]:
                data.Poznamka = ar[0:1][0][1][11]
            #print data.Obj
            data.update(commit=False)
            id = id + 1
            # Commit in batches of 100 rows instead of once per product.
            if id % 100 == 0:
                print("aktualizuji data")  # "updating the data"
                db.session.commit()

            # Leftover debugging helpers:
            # for i in ar[0:1][0][1]:
            #     print(i)
            # print(float(re.split(" ", ar[0:1][0][1][4])[0].replace(".", "").replace(",", ".")))

        except Exception:
            # "Chyba" = "error": log the failing item and flush whatever has
            # been updated so far.
            print("Chyba " + str(id) + " " + polozka.Obj)
            db.session.commit()
            #data_web_update.delay()

    db.session.commit()
    return True
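
The indexing pattern ar[0:1][0][1][i] is easier to read once you see the shape of what HTMLTableParser produces. Below is a minimal sketch, assuming the same html_table_parser package as in the example; the sample HTML and values are made up for illustration.

# .tables is a list of tables; each table is a list of rows; each row is a
# list of cell strings -- hence the nested indexing in the example above.
from html_table_parser import HTMLTableParser

sample_html = """
<table>
  <tr><td>col0</td><td>col1</td></tr>
  <tr><td>ABC-123</td><td>42 pcs</td></tr>
</table>
"""

p = HTMLTableParser()
p.feed(sample_html)

print(p.tables)           # roughly [[['col0', 'col1'], ['ABC-123', '42 pcs']]]
print(p.tables[0][1][1])  # '42 pcs' -- second row of the first table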
Example #2
def update_page_active_time(request, format=None):
			if request.method == 'POST':
				data = request.data
				# Port 9306 is the default SphinxQL listener, so this raw cursor most
				# likely writes to a Sphinx/Manticore real-time index alongside the ORM.
				db = MySQLdb.connect(host="127.0.0.1", port=9306, passwd="", db="")
				cur = db.cursor()
				new_item = False
				query = ""
				if data['user_id'] is None or data['user_id'] == '':
					resp = "Invalid Details"
					print (resp)
					return Response(resp , status=status.HTTP_202_ACCEPTED)
				if data['page_title'] == '' or data['page_title'] == 'new tab' or data['page_id'] == '' or data['page_id'].startswith('chrome://'):
					resp = "Invalid Page"
					print (resp)
					return Response(resp , status=status.HTTP_202_ACCEPTED)

				if data['icon_url'] is None or data['icon_url'] == '':
					data['icon_url'] ="http://52.26.203.91:80/icon.png"
				else:
					data['icon_url'] = "http://www.google.com/s2/favicons?domain_url="+data['icon_url']

				# Reduce page_id to just the host name (scheme and path stripped) for base_url.
				baseUrl = data['page_id']
				if baseUrl.startswith("https://"):
					baseUrl = baseUrl.replace("https://", "", 1)
					position = baseUrl.find("/")
					if position != -1:
						baseUrl = baseUrl[0:position]
				elif baseUrl.startswith("http://"):
					baseUrl = baseUrl.replace("http://", "", 1)
					position = baseUrl.find("/")
					if position != -1:
						baseUrl = baseUrl[0:position]
				data['base_url'] = baseUrl

				isBlackListed = BlackListedPages.objects.filter(user_id=data['user_id'], base_url=data['base_url']).exists()

				if isBlackListed:
					print("Page with url: " + data['page_id'] + " is blacklisted for user: " + data['user_id'])
					response = "Domain is blacklisted --- Timer not Updated"
					return Response(response, status=status.HTTP_202_ACCEPTED)
				else:
					print("Page " + data['page_id'] + " is not on the blacklist for user " + data['user_id'] + "!")



				page_content = webb.download_page(data['page_id'])

				soup = BeautifulSoup(page_content, "html5lib")
				# Re-parse just the <body> contents, then drop MathJax helper nodes.
				soup = BeautifulSoup(soup.html.body.encode_contents())
				for tag in soup.find_all(attrs={'id': re.compile(r'^MathJax_')}):
					tag.decompose()
				html_exception = 0
				for tag in soup():
					# Strip unwanted attributes from every tag, counting any failures.
					for attribute in invalidAttrs:
						try:
							del tag[attribute]
						except Exception:
							html_exception += 1

					if tag.name in invalidTags:
						tag.decompose()
					if tag.name in invalidTagsToReplace:
						tag.replaceWithChildren()

				print("html parsing exceptions :"+ str(html_exception))
				page_content = str(soup.prettify().encode('utf-8'))
				# Replace everything except letters, digits and dots with spaces.
				page_content = re.sub(r'[^a-zA-Z0-9.]', ' ', page_content)
				data['page_content'] = page_content

				pageItem = PageActiveTime.objects.filter(user_id = data['user_id'], page_id = data['page_id'], is_active=1, is_deleted=0)[:1]
				# print (pageItem)
				if len(pageItem) == 0:
					new_item = True
					serializer = PageActiveTimeSerializer(data = data)
					# query = "INSERT INTO tart (page_id, user_id, page_title, cumulative_time, icon_url, base_url, is_active) VALUES (\'"+data['page_id']+"\',\'"+data['user_id']+"\',\'"+data['page_title']+"\',\'"+str(data['cumulative_time'])+"\',\'"+data['icon_url']+"\',\'"+data['base_url']+"\',\'1\')"
					
				else:
					print ("Already exists")
					data['cumulative_time'] = pageItem[0].cumulative_time  + int(data['cumulative_time'])
					id = pageItem[0].id
					print(id)
					serializer = PageActiveTimeSerializer(pageItem[0],data = data)
					# query = "REPLACE INTO tart (id,  cumulative_time) VALUES (\'"+str(id)+"\',\'"+str(data['cumulative_time'])+"\')"
					query = "UPDATE tart SET cumulative_time = "+str(data['cumulative_time'])+" WHERE id ="+str(id)+" "
					cur.execute(query);
				

				if serializer.is_valid():
					serializer.save()
					if new_item:
						pageItem = PageActiveTime.objects.filter(user_id = data['user_id'], page_id = data['page_id'], is_active=1, is_deleted=0)[:1]
						id = pageItem[0].id
						query = "INSERT INTO tart (id, page_id, user_id, page_title, cumulative_time, icon_url, base_url,is_active, page_content) VALUES (\'"+str(id)+"\',\'"+data['page_id']+"\',\'"+data['user_id']+"\',\'"+data['page_title']+"\',\'"+str(data['cumulative_time'])+"\',\'"+data['icon_url']+"\',\'"+data['base_url']+"\',\'1\',\'"+page_content+"\')"
						print("here")
						cur.execute(query)

					return Response(serializer.data["page_id"], status=status.HTTP_201_CREATED)
				print ("invalid serializer")
				return Response(serializer.errors, status=status.HTTP_400_BAD_REQUEST)
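
The raw SQL in this view is assembled by string concatenation, which breaks as soon as page_content contains a quote and is open to SQL injection. A safer pattern, sketched below under the assumption of the same MySQLdb connection and tart table, is to pass the values separately and let the driver escape them; update_cumulative_time is a hypothetical helper, not part of the original code.

# A minimal sketch: %s placeholders are filled in and escaped by MySQLdb itself.
import MySQLdb

db = MySQLdb.connect(host="127.0.0.1", port=9306, passwd="", db="")
cur = db.cursor()

def update_cumulative_time(cur, row_id, cumulative_time):
    # Equivalent of the UPDATE built by hand above, with bound parameters.
    cur.execute(
        "UPDATE tart SET cumulative_time = %s WHERE id = %s",
        (cumulative_time, row_id),
    )

update_cumulative_time(cur, 17, 360)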
Example #3
# pip install webb

from webb import webb

http_address = "http://mail.ru"

webb.get_ip(http_address)            # resolve the host to an IP address
webb.get_whois_data(http_address)    # fetch WHOIS registration data for the domain
webb.ping(http_address)              # ping the host
webb.traceroute(http_address)        # trace the network route to the host
webb.clean_page(webb.download_page(http_address))  # download the page and strip the HTML down to text

# webb.web_crawl(http_address)
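
The example above just chains webb calls; if you want the same fetch-and-strip result without the webb dependency, a standard-library sketch looks like the one below (TextExtractor is an illustrative class, not part of webb's API).

# Fetch a page with urllib and keep only its visible text using html.parser.
from html.parser import HTMLParser
from urllib.request import urlopen


class TextExtractor(HTMLParser):
    """Collects the text of a page, skipping <script> and <style> blocks."""

    def __init__(self):
        super().__init__()
        self.chunks = []
        self._skip = 0  # nesting depth inside <script>/<style>

    def handle_starttag(self, tag, attrs):
        if tag in ("script", "style"):
            self._skip += 1

    def handle_endtag(self, tag):
        if tag in ("script", "style") and self._skip:
            self._skip -= 1

    def handle_data(self, data):
        if not self._skip and data.strip():
            self.chunks.append(data.strip())


html = urlopen("http://mail.ru").read().decode("utf-8", errors="replace")
parser = TextExtractor()
parser.feed(html)
print("\n".join(parser.chunks))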