# helpers.py
import os
import random
from datetime import datetime
from urlparse import urlparse

import eventlet
# patch requests and time with eventlet's green (cooperative) versions so
# network calls don't block the event loop
requests = eventlet.import_patched('requests.__init__')
time = eventlet.import_patched('time')

import redis
from BeautifulSoup import BeautifulSoup
from requests.exceptions import RequestException

import settings

num_requests = 0

# shared redis connection (named so it doesn't shadow the redis module)
redis_client = redis.StrictRedis(host=settings.redis_host, port=settings.redis_port, db=settings.redis_db)

def make_request(url, return_soup=True, identifier=0):
    # global request building and response handling
    # url = format_url(url)
    if "picassoRedirect" in url:
        return None  # skip the redirect URLs

    global num_requests
    if num_requests >= settings.max_requests:
        raise Exception("Reached the max number of requests: {}".format(settings.max_requests))

    # proxies = get_proxy()
    try:
        r = requests.get(url, headers=settings.headers)
    except RequestException:
        log("WARNING: Request for {} failed, trying again.".format(url))
        # retry with the same arguments; note this recursion is unbounded
        return make_request(url, return_soup=return_soup, identifier=identifier)

    num_requests += 1
    if r.status_code != 200:
        os.system('say "Got non-200 Response"')  # audible alert (macOS only)
        log("WARNING: Got a {} status code for URL: {}".format(r.status_code, url))
        log("404 identifier: {}".format(identifier))
        return None

    if return_soup:
        return BeautifulSoup(r.text), r.text
    return r
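
# Example usage (a sketch, not part of the original module; the product URL
# below is a made-up placeholder). By default callers get a (soup, raw_html)
# tuple, or the bare response object with return_soup=False.
def _example_fetch_title():
    result = make_request("https://www.amazon.com/dp/B000000000")
    if result is None:
        return None
    soup, html = result
    title = soup.find("title")
    return title.string if title else None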

def format_url(url):
    # make sure URLs aren't relative, and strip unnecessary query args
    u = urlparse(url)

    scheme = u.scheme or "https"
    host = u.netloc or "www.amazon.com"
    path = u.path

    if not u.query:
        query = ""
    else:
        query = "?"
        for piece in u.query.split("&"):
            # split on the first "=" only, so values containing "=" survive
            k, v = piece.split("=", 1)
            if k in settings.allowed_params:
                query += "{k}={v}&".format(**locals())
        query = query[:-1]  # drop the trailing "&" (or the bare "?")

    return "{scheme}://{host}{path}{query}".format(**locals())
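
# A quick illustration of format_url (the input URL and the "node" param are
# made up; the output assumes "node" is listed in settings.allowed_params):
#
#   format_url("/gp/bestsellers?node=123&ref=nav_foo")
#   -> "https://www.amazon.com/gp/bestsellers?node=123"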

def log(msg):
    # global logging function
    if settings.log_stdout:
        try:
            print "{}: {}".format(datetime.now(), msg)
        except UnicodeEncodeError:
            pass  # squash logging errors in case of non-ascii text

def get_proxy():
    # choose a proxy server to use for this request, if we need one
    if not settings.proxies:
        return None

    proxy_ip = random.choice(settings.proxies)
    proxy_url = "socks5://{user}:{passwd}@{ip}:{port}/".format(
        user=settings.proxy_user,
        passwd=settings.proxy_pass,
        ip=proxy_ip,
        port=settings.proxy_port,
    )
    return {
        "http": proxy_url,
        "https": proxy_url,
    }
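
# get_proxy() is currently unused: the call in make_request is commented out.
# Wiring it in would look roughly like the sketch below. Note that requests
# needs the SOCKS extra (pip install requests[socks]) to honor socks5:// URLs.
#
#   proxies = get_proxy()
#   r = requests.get(url, headers=settings.headers, proxies=proxies)
#
# requests treats proxies=None as "no proxy", so the same call works either way.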

def enq_redis(stack_name, entry):
    return redis_client.sadd(stack_name, entry)


def deq_redis(stack_name):
    return redis_client.spop(stack_name)


def enqueue_url(u):
    url = format_url(u)
    return redis_client.sadd("listing_url_queue", url)


def dequeue_url():
    return redis_client.spop("listing_url_queue")


def smem(stack_name):
    # whether the given redis set still has any members
    return redis_client.scard(stack_name) > 0
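
# Together, enqueue_url/dequeue_url and smem form a simple redis-backed work
# queue (backed by a set, so duplicate URLs are deduplicated for free). A
# worker loop is roughly the sketch below; "handle_listing" is a hypothetical
# per-URL callback, not part of this module.
def _example_worker():
    while smem("listing_url_queue"):
        url = dequeue_url()
        if url is None:
            continue  # another worker drained the set first
        log("would process {}".format(url))
        # handle_listing(url)  # hypothetical per-URL handler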

def page_save(html):
    # dump the raw HTML to disk; note this overwrites the same file each call
    with open("thing.html", 'w') as f:
        f.write(html.encode('utf-8'))
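
# A sketch for keeping one file per page instead of overwriting "thing.html"
# (the timestamp-based naming scheme here is made up for illustration):
def _example_save_unique(html):
    fname = "page_{}.html".format(int(time.time()))
    with open(fname, "w") as f:
        f.write(html.encode("utf-8"))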

if __name__ == '__main__':
    # test proxy server IP masking
    r = make_request('https://api.ipify.org?format=json', return_soup=False)
    print r.text