def scrape(forums, path):
    # Create the directories in the path string if they do not exist
    if not os.path.exists(path):
        os.makedirs(path)
    for forumID in forums:
        r = session.get(baseurl + 'mod/forum/view.php?id=' + forumID)
        if r.status_code == 200:
            soup = BeautifulSoup(r.text, 'lxml')
            posts = soup.find_all('td', class_='topic starter')
            # Loop through the posts on a forum page, visit each post and
            # save the thread as a .html file
            print "scraping posts from forumID " + forumID
            for post in posts:
                r = session.get(post.a.get('href'))
                if r.status_code == 200:
                    soup = BeautifulSoup(r.text, 'lxml')
                    output_file = path + str(soup.find("input", attrs={"name": "d"}).get('value')) + ".html"
                    print "page saved at " + output_file
                    with open(output_file, 'w') as f:
                        f.write(str(soup))
        else:
            print 'ERROR: ' + str(r.status_code) + ' ' + r.reason
            sys.exit()
def makeRequest(host, user, password, verbose):
    """Make the requests against the given server.

    Takes host, user, password and verbose. If verbose is True, each step is
    reported as it happens. The function also checks the --tor and --agente
    options: when --tor is True the request is routed through TOR via the
    session() proxies, and when --agente is True the default Python user
    agent is replaced with HotJava through the request headers."""
    try:
        header = {}  # Empty dict keeps the original (Python) user agent
        if opts.agente:
            header['User-agent'] = 'HotJava/1.1.2 FCS'  # Entry that overrides the user agent
        if opts.tor:
            # HTTP and HTTPS proxies to route the request through TOR
            session.proxies = {
                'http': 'socks5h://localhost:9050',
                'https': 'socks5h://localhost:9050',
            }
        response = session.get(host, auth=(user, password), headers=header)
        if verbose:
            # Report the IP the request originates from
            ip = session.get('http://httpbin.org/ip', headers=header)
            ip = str(ip.text).replace('"origin":', '').replace('{', '').replace('}', '').replace('\n', '')
            print 'The request is made from:%s' % (ip)
            # Report the user agent the request is made with
            agente = session.get('https://httpbin.org/user-agent', headers=header)
            agente = str(agente.text).replace('"user-agent":', '').replace('{', '').replace('}', '').replace('\n', '')
            print 'The user agent is:%s' % (agente)
            print 'Trying to log in with user \'%s\' password \'%s\'' % (user, password)
            print 'The response obtained is: %s, therefore:' % (response)
        if response.status_code == 200:
            print 'CREDENTIALS FOUND!: %s\t%s' % (user, password)
            if opts.reporte is not None:
                reportResults('User: \'%s\'\t\tPassword: \'%s\'\n' % (user, password), opts.reporte)
        else:
            print 'DID NOT WORK :c'
    except ConnectionError:
        printError('Connection error, the server may be down.', True)
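# A minimal, self-contained sketch of the TOR routing described in the
# docstring above. It assumes a local TOR daemon listening on port 9050; the
# socks5h scheme makes DNS resolution happen inside TOR as well (requests
# needs the PySocks extra, i.e. requests[socks], for socks5h proxies):
import requests

def tor_session():
    s = requests.Session()
    s.proxies = {
        'http': 'socks5h://localhost:9050',
        'https': 'socks5h://localhost:9050',
    }
    return s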
def scrape(resource, **args):
    session = login()
    if resource == 'uid':
        # We effectively only need the first user, so don't scrape all pages
        search = session.get(
            'http://www.erepublik.com/en/main/search/%s/' % args['query'].replace(' ', '_')
        )
        doc = fromstring(search.text)
        uid = doc.xpath('//div[@class="nameholder"]/a/@href')[0].split('/')[-1].strip()
        return uid
    elif resource == 'citizen.profile':
        profile = session.get(
            'http://www.erepublik.com/en/citizen/profile/%s' % args['citizenId']
        )
        doc = fromstring(profile.text)
        citizen_state = doc.xpath('//div[@class="citizen_state"]/div[@class="is"]/span/img/@src')
        is_dead = citizen_state and 'dead_citizen' in citizen_state[0]
        profile = {
            'general': {
                'avatar': doc.xpath('//img[@class="citizen_avatar"]/@style')[0].split('(')[1].split(')')[0],
                'level': doc.xpath('//*[@class="citizen_level"]')[0].text,
                'experience_points': doc.xpath('//*[@class="citizen_experience"]/div/p')[0].text.split(' / ')[0].replace(',', ''),
                'name': doc.xpath('//*[@class="citizen_profile_header auth"]/h2')[0].text_content().strip(),
                'is_alive': str(int(not is_dead)),
                'birthDay': doc.xpath('//div[@class="citizen_second"]/p')[1].text.strip(),
                'nationalRank': doc.xpath('//div[@class="citizen_second"]/small/strong')[0].text,
            },
            'location': {
                'citizenship_country_initials': doc.xpath('//div[contains(@class, "citizen_info")]/a/@href')[2].split('/')[-1],
                'residence_country_name': doc.xpath('//div[contains(@class, "citizen_info")]/a/@title')[0],
                'residence_region_name': doc.xpath('//div[contains(@class, "citizen_info")]/a/@title')[1],
            },
            'party': {
                'name': doc.xpath('//div[@class="citizen_activity"]/div/div/span/a')[0].text.strip(),
            },
            'militaryUnit': {
                'id': doc.xpath('//div[@class="citizen_activity"]/div/div/a/@href')[0].split('/')[-1],
                'name': doc.xpath('//div[@class="citizen_activity"]/div/div/a/span')[0].text.strip(),
            },
            'militaryAttributes': {
                'strength': doc.xpath('//div[@class="citizen_military"]/h4')[0].text.replace(',', '').strip(),
                'rank_points': doc.xpath('//div[@class="stat"]/small/strong')[1].text.split(' / ')[0].replace(',', ''),
            },
        }
        return profile
def login(session):
    try:
        login = {
            'j_username': AGILE_CREDENTIALS['username'],
            'j_password': AGILE_CREDENTIALS['password']
        }
        # requests computes Content-Length from the body itself
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Host': AGILE_HOST_NAME,
            'Referer': AGILE_PROD_URL,
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:56.0) Gecko/20100101 Firefox/57.0'
        }
        timeout = 5  # seconds
        session.post(AGILE_LOGIN_URL, data=login, headers=headers, timeout=timeout)
        # Open the Agile window
        session.get(AGILE_PROD_URL)
    except requests.exceptions.Timeout:
        return "no agile"
    except Exception:
        return "no agile"
def downloadResource(session, res, path):
    try:
        src = res.a['href']
    except TypeError:
        return
    r = session.get(src)
    if r.status_code == 200:
        headers = r.headers.keys()
        if 'content-disposition' in headers:
            # Got a direct file link
            name = r.headers['content-disposition'].decode('utf-8').split(';')[1].split('=')[1].strip('"')
        else:
            # Got a preview page
            soup = BeautifulSoup(r.text, 'html.parser')
            if ('content-type' in headers) and ('content-script-type' in headers) and ('content-style-type' in headers):
                # It's most obviously a website, which displays a download link
                src = soup.find(class_='region-content').a['href']
            else:
                # It's obviously an ugly frameset site
                src = soup.find_all('frame')[1]['src']
            name = os.path.basename(src)
        name = urllib.url2pathname(name.encode('utf-8'))
        saveFile(session, src, path, name)
    else:
        print 'ERROR: ' + str(r.status_code) + ' ' + r.reason
        sys.exit()
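# The Content-Disposition filename extraction above splits the raw header by
# hand; the stdlib parser handles quoting and parameter order as well. A
# sketch, not a drop-in for the function above:
import cgi

def filename_from_disposition(header_value):
    # e.g. 'attachment; filename="report.pdf"' -> 'report.pdf'
    _, params = cgi.parse_header(header_value)
    return params.get('filename')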
def downloadCourse(session, c, sem):
    global files
    global sections
    files = itertools.count()
    sections = itertools.count()
    name = c['key'].replace('/', '-') + u'/'
    path = root + sem.replace('/', '-') + u'/' + name
    path = urllib.url2pathname(path.encode('utf-8')).replace(':', '-').replace('"', '')
    if not os.path.exists(path):
        os.makedirs(path)
    print ' +--' + colors.BOLD + name + colors.ENDC
    r = session.get(c['url'])
    if r.status_code == 200:
        soup = BeautifulSoup(r.text, 'html.parser')
        # Dump the full course page before downloading the sections
        if not os.path.exists(path + '.dump'):
            os.makedirs(path + '.dump')
        dst = path + '.dump/' + c['key'].replace('/', '-').encode('utf-8') + '-' + c['type'] + '-' + str(datetime.date.today()) + '-full.html'
        dst = dst.replace(':', '-').replace('"', '')
        with open(dst, 'wb') as f:
            f.write(soup.encode('utf-8'))
        for s in soup.find_all(class_='section main clearfix'):
            downloadSection(session, s, path)
        #print 'Saved ' + str(files.next()) + ' Files in ' + str(sections.next()) + ' Sections'
    else:
        print 'ERROR: ' + str(r.status_code) + ' ' + r.reason
        sys.exit()
def get_captcha():
    # The captcha URL is named after the current timestamp
    captcha_url = 'https://www.zhihu.com/captcha.gif?r=%d&type=login&lang=cn' % (
        int(time.time() * 1000))
    response = session.get(captcha_url, headers=headers)
    # Save the captcha to the current directory
    with open('captcha.gif', 'wb') as f:
        f.write(response.content)
    # Automatically open the captcha image that was just fetched
    from PIL import Image
    try:
        img = Image.open('captcha.gif')
        img.show()
        img.close()
    except Exception:
        pass
    captcha = {
        'img_size': [200, 44],
        'input_points': [],
    }
    points = [[22.796875, 22], [42.796875, 22], [63.796875, 21],
              [84.796875, 20], [107.796875, 20], [129.796875, 22],
              [150.796875, 22]]
    print('The coordinates of the 7 characters are:', points, '\n')
    seq = input('Enter the positions of the upside-down characters\n>')
    for i in seq:
        captcha['input_points'].append(points[int(i) - 1])
    return json.dumps(captcha)
def is_website_up(website, session):
    try:
        response = session.get(website, timeout=5)
        return response.status_code == 200
    except Timeout:
        print(bad + website + " is down")
        return False
def connectieopen(self, link):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        'Accept-Encoding': 'utf-8'
    }
    self.total_req += 1
    # Sleep one or two seconds between requests to avoid hammering the server
    slapen = randint(1, 2)
    time.sleep(slapen)
    print(self.total_req)
    try:
        session = requests.Session()
        req = session.get(link, headers=headers)
        content = req.content
        bsObj = BeautifulSoup(content.decode('utf-8', 'ignore'), "html.parser")
    except HTTPError as fout:
        print(str(fout) + link)
    except URLError as fout:
        print("cant find url")
    except ConnectionError as e:
        print(e)
        print(link)
    else:
        return bsObj
def exchange():
    URL = 'https://pro-api.coinmarketcap.com/v1/tools/price-conversion'
    API_KEY = 'a8cf5d78-8dac-4325-838b-2ebe27f2f644'
    parameters = {
        'amount': '1',
        'symbol': 'EUR',
        'convert': 'BTC'
    }
    headers = {
        'Accepts': 'application/json',
        'X-CMC_PRO_API_KEY': API_KEY,  # the variable, not the literal string 'API_KEY'
    }
    #response = requests.get(URL.format(request.values.get('from_quantity'), request.values.get('from_currency'), request.values.get('to_currency'), API_KEY))
    session = requests.Session()
    session.headers.update(headers)
    try:
        response = session.get(URL, params=parameters)
        data = json.loads(response.text)
        print(data)
    except (ConnectionError, Timeout, TooManyRedirects) as e:
        print(e)
def startTweet():
    increment()
    session = requests.session()
    result = session.get('https://twitter.com/login')
    tree = html.fromstring(result.text)
    authenticity_token = list(set(tree.xpath("//input[@name='authenticity_token']/@value")))[0]
    print('auth token is: ' + authenticity_token)
    payload = {
        'authenticity_token': authenticity_token,  # a single token, not a set literal
        'redirect_after_login': '',
        'remember_me': '1',
        'scribe_log': '',
        'session[password]': AutomatedPosting.pass_entry.get(),
        'session[username_or_email]': AutomatedPosting.user_entry.get(),
        'ui_metrics': ''
    }
    params = {
        'authenticity_token': authenticity_token,
        'batch_mode': 'off',
        'is_permalink_page': 'false',
        'place_id': '',
        'status': AutomatedPosting.msg_entry.get() + " " + str(COUNT),
        'tagged_users': ''
    }
    result = session.post('https://twitter.com/sessions', data=payload)
    cookies = result.cookies
    result = session.post('https://twitter.com/i/tweet/create',
                          cookies=cookies, params=params,
                          headers=dict(referer='https://www.twitter.com/'))
    print(result.text)
def login_sso():
    print("starting sso log in ...")
    login_page = session.get(login_sso_url)
    print(login_page.text)
    login_page.encoding = 'utf8'
    root = etree.HTML(login_page.content)
    if "You have successfully logged in" in login_page.text or "退出登录" in login_page.text:
        print("sso already logged in")
        return True
    form = root.xpath('//div[@class="clearfix login_btncont"]')[0]
    lt = root.xpath('//input[@name="lt"]/@value')[0]
    execution = form.xpath('//input[@name="execution"]/@value')[0]
    _eventId = form.xpath('//input[@name="_eventId"]/@value')[0]
    login_result = session.post(login_sso_url, data={
        'submit': '登录',
        'username': username,
        'password': password,
        'code': '',
        'lt': lt,
        'execution': execution,
        '_eventId': _eventId
    })
    result = 'You have successfully logged in' in login_result.text
    print("sso log in done, result={0}".format(result))
    return result
def get_auth_token():
    global auth_token
    print("getting auth token...")
    redirect_1 = session.get(token_url, allow_redirects=False).headers['Location']
    print("redirecting to: {0}".format(redirect_1))
    redirect_2 = session.get(url=redirect_1, headers=default_headers,
                             allow_redirects=False).headers['Location']
    print("redirecting to: {0}".format(redirect_2))
    redirect_3 = session.get(url=redirect_2, headers=default_headers,
                             allow_redirects=False).headers['Location']
    print("redirecting to: {0}".format(redirect_3))
    auth_token = redirect_3.split('=')[1]
    print("token: {0}".format(auth_token))
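# The three hand-rolled hops above can be generalized by following Location
# headers until none is left; a sketch assuming the same kind of session and
# default headers as the surrounding script:
def follow_redirects(session, url, headers=None):
    # Walk the redirect chain manually and return the final URL
    while True:
        resp = session.get(url, headers=headers, allow_redirects=False)
        if 'Location' not in resp.headers:
            return url
        url = resp.headers['Location']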
def getPlayerStatus(self, session, id):
    response = session.get(
        "http://live.nicovideo.jp/api/getplayerstatus?v=" + id)
    #print (response.text)
    status = ElementTree.fromstring(response.content)
    if status.attrib["status"] != "ok":
        print "getplayerstatus failed"
    else:
        return status
def get_xsrf():
    response = session.get('https://www.zhihu.com', headers=headers)
    _xsrf_regex = r'[\s\S]*<input type="hidden" name="_xsrf" value="(.*?)"/>'
    res_xsrf = re.match(_xsrf_regex, response.text)
    if res_xsrf:
        _xsrf = res_xsrf.group(1)
    else:
        _xsrf = ''
    return _xsrf
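# The same _xsrf extraction can be done with an HTML parser instead of a
# regex; a sketch assuming BeautifulSoup is available and the token sits in a
# hidden <input name="_xsrf"> as the regex above expects:
from bs4 import BeautifulSoup

def get_xsrf_bs(html):
    tag = BeautifulSoup(html, 'html.parser').find('input', attrs={'name': '_xsrf'})
    return tag['value'] if tag else ''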
def _downloadTorrent_TYT(self, torrentURL, session, torrentDownloadDir):
    logging.debug(' Downloading torrent from TYT url:"%s".' % torrentURL)
    torrentFile = session.get(torrentURL)
    torrentId = re.sub(r'http://tenyardtracker\.com/download\.php\?torrent=', '', torrentURL)
    with open(torrentDownloadDir + '/tenyardtracker_%s.torrent' % torrentId, 'wb') as f:
        for chunk in torrentFile.iter_content(1024):
            f.write(chunk)
def drupal_version(website, session):
    response = session.get(website)
    try:
        version = response.headers["X-Generator"].replace(
            " (https://www.drupal.org)", "")
        print(good + "Drupal Version: " + version)
        return version
    except KeyError:
        print(bad + website + " does not appear to be running Drupal")
        return None
def is_website_up(website, session):
    try:
        response = session.get(website, timeout=5)
        return response.status_code == 200
    except Timeout:
        return False
    except RequestException as error:
        print(bad + website + " RequestException: ")
        print(bad + str(error))
        return False
def drupal_version(website, session):
    response = session.get(website)
    try:
        version = response.headers["X-Generator"].replace(
            " (https://www.drupal.org)", "")
        print(good + "Drupal Version: " + version)
        return version
    except KeyError:
        print(bad + "Drupal Version: Not Detected")
        return None
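# A small usage sketch tying is_website_up and drupal_version together (the
# target URL and the good/bad message prefixes are illustrative assumptions
# standing in for the surrounding script's globals):
import requests

good = '[+] '  # assumed message prefixes
bad = '[-] '

session = requests.Session()
target = 'https://example.org'
if is_website_up(target, session):
    drupal_version(target, session)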
def CanvasLogin(session):
    res = session.get(MainUrl)
    #print 'Return status:', res.status_code
    assert res.status_code == 200  # code = OK
    #print res.text
    PayloadLogin['authenticity_token'] = getAuthToken(res.text, 0)
    updateUserInfoForPayloadLogin()
    session.post(LoginPostUrl, data=PayloadLogin)
def gdown(self, url, path, enable=False):
    baseurl = 'https://docs.google.com/uc?export=download'
    fileid = url.split('id=')[1]
    params = {'id': fileid}
    session = requests.session()
    response = session.get(baseurl, params=params, stream=True)
    tokens = self.get_confirm_token(response)
    filename = self.get_gdrive_name(response)
    filesize = self.get_content_len(response)
    if tokens:
        params.update(dict(confirm=tokens))
    path = os.path.join(path, filename)
    if not os.path.exists(path) or enable:
        response = session.get(baseurl, params=params, stream=True)
        self.download(response, path)
    # if os.path.exists(path):
    #     print('success')
    return filesize
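# gdown above calls helpers that are not shown here. get_confirm_token is
# commonly implemented by scanning the first response's cookies for Google
# Drive's download-warning token; a sketch under that assumption (written as
# a plain function rather than the method the class actually uses):
def get_confirm_token(response):
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            return value
    return None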
def test_is_login():
    try:
        session.cookies.load()
    except Exception:
        print('failed to load cookies')
    response = session.get('https://www.zhihu.com/inbox', headers=headers,
                           allow_redirects=False)
    with open('sixin.html', 'wb') as f:
        f.write(response.text.encode('utf-8'))
    print('ok')
def check_if_file_created(target, file_name, session):
    url = target + file_name
    print(good + 'Checking... ' + url)
    file_response = session.get(url)
    if file_response.status_code != 404:
        print(good + 'File successfully created:')
        print(' ' * 4 + file_response.text.strip())
        cleanup_file(target, file_name, session)
        return True
    else:
        print(bad + target + ' File creation unsuccessful')
        return False
def get_tweets_json(self, q, since='', until='', f='tweets', username='',
                    lang='', cursor='', session=None):
    url = prepare_data(q, since, until, f, username, lang, cursor)
    self.headers['Referer'] = url
    return session.get(url, headers=self.headers).json()
def getXLSofRounds(session, roundsIds, baseApiUrl, cookies, resultsFolder):
    for roundsId in roundsIds:
        apiUrl = baseApiUrl + '/' + roundsId['type'] + '/rounds/' + str(
            roundsId['id']) + '/report/results/excel'
        print(apiUrl)
        resp = session.get(apiUrl, cookies=cookies, stream=True)
        resp.raw.decode_content = True
        if resp.status_code == 200:
            with open(resultsFolder + '/' + roundsId['name'] + '.xlsx',
                      'wb') as output:
                shutil.copyfileobj(resp.raw, output)
def getCoursesForSem(session, s):
    r = session.get(conf['baseurl'] + 'index.php?role=0&cat=1&csem=1&sem=' + s)
    if r.status_code == 200:
        soup = BeautifulSoup(r.text, 'html.parser')
        courses = list()
        for o in soup.find_all('h3'):
            c = getInfo(o.contents[0])
            courses.append(c)
        return courses
    else:
        print 'ERROR: ' + str(r.status_code) + ' ' + r.reason
        sys.exit()
def getCoursesForSem(session, s):
    r = session.get(baseurl + 'index.php?role=0&cat=1&csem=0&sem=' + s)
    if r.status_code == 200:
        soup = BeautifulSoup(r.text, 'html.parser')
        courses = list()
        for o in soup.find_all('h3'):
            c = getInfo(o.contents[0])
            courses.append(c)
        return courses
    else:
        print 'ERROR: ' + str(r.status_code) + ' ' + r.reason
        sys.exit()
def get_login_cookies(session, baseUrl):
    """ Get PHPSESSID cookie to use the API """
    print("start login")
    loginPage = session.get(baseUrl + '/login')
    soup = BeautifulSoup(loginPage.text, "html.parser")
    csrf = soup.find("input", type="hidden")
    print(csrf)
    # Get user and password from config.ini file
    credentials = config('loginCredentials')
    payload = {
        '_username': credentials['user'],
        '_password': credentials['password'],
        '_csrf_token': csrf['value'],
    }
    # Fake browser header
    headers = {
        'user-agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.98 Safari/537.36"
    }
    loginCheck = session.post(baseUrl + '/login_check', data=payload,
                              headers=headers, cookies=loginPage.cookies)
    # Response ok or not?
    if loginCheck.status_code == 200:
        securePage = session.get(
            'https://amsterdam.apptimizeplatform.nl/api/inspectionround/area/rounds/planning',
            headers=headers)
        # Check a page that is only visible after login
        if securePage.status_code == 200:
            print("login succeeded!")
            return loginCheck.cookies
    if loginCheck.status_code == 401 or loginCheck.status_code == 403:
        print('login failed!')
def saveFile(session, src, path, name):
    global files
    files.next()
    dst = path + name
    try:
        # If the file can already be opened, skip it
        with open(dst):
            print u'| | +--{:<50s}'.format(name) + u'[' + colors.OKBLUE + 'skipped' + colors.ENDC + u']'
    except IOError:
        with open(dst, 'wb') as handle:
            print u'| | +--{:<50s}'.format(name) + u'[' + colors.OKGREEN + 'downloading' + colors.ENDC + u']'
            r = session.get(src, stream=True)
            for block in r.iter_content(1024):
                if not block:
                    break
                handle.write(block)
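# The stream=True / iter_content pattern used above is the standard way to
# fetch a file without holding it all in memory; a compact, self-contained
# sketch of just that pattern:
import requests

def stream_download(url, dst, chunk_size=1024):
    r = requests.get(url, stream=True)
    with open(dst, 'wb') as handle:
        for block in r.iter_content(chunk_size):
            if block:  # skip keep-alive chunks
                handle.write(block)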
def detour():
    randlist = [
        'aboleth', 'acolyte', 'adult-black-dragon', 'adult-blue-dragon',
        'adult-brass-dragon', 'adult-bronze-dragon', 'adult-copper-dragon',
        'adult-gold-dragon', 'adult-green-dragon', 'adult-red-dragon',
        'adult-silver-dragon', 'adult-white-dragon', 'air-elemental',
        'allosaurus', 'ancient-black-dragon', 'ancient-blue-dragon',
        'ancient-brass-dragon', 'ancient-bronze-dragon',
        'ancient-copper-dragon', 'ancient-gold-dragon',
        'ancient-green-dragon', 'ancient-red-dragon',
        'ancient-silver-dragon', 'ancient-white-dragon', 'androsphinx'
    ]
    # Pick a random monster page to visit, then wait a while
    randstring = random.choice(randlist)
    url = request_url + randstring
    with requests.Session() as session:
        raw = session.get(url, headers=headers)
    sleep(random.randint(20, 50))
def saveFile(session, src, path, name):
    global files
    files.next()
    dst = path + name.decode('utf-8')
    try:
        # If the file can already be opened, skip it
        with open(dst):
            print '[' + colors.OKBLUE + 'skip' + colors.ENDC + '] | | +--%s' % name
    except IOError:
        with open(dst, 'wb') as handle:
            print '[' + colors.OKGREEN + 'save' + colors.ENDC + '] | | +--%s' % name
            r = session.get(src, stream=True)
            for block in r.iter_content(1024):
                if not block:
                    break
                handle.write(block)
def downloadResource(session, res, path):
    src = res.a['href']
    r = session.get(src)
    if r.status_code == 200:
        headers = r.headers.keys()
        if 'content-disposition' in headers:
            name = r.headers['content-disposition'].decode('utf-8').split(';')[1].split('=')[1].strip('"')
        else:
            soup = BeautifulSoup(r.text, 'html.parser')
            if ('content-type' in headers) and ('content-script-type' in headers) and ('content-style-type' in headers):
                src = soup.find(class_='region-content').a['href']
            else:
                src = soup.find_all('frame')[1]['src']
            name = os.path.basename(src).replace('%20', ' ').replace('%28', '(').replace('%29', ')')
        saveFile(session, src, path, name)
    else:
        print 'ERROR: ' + str(r.status_code) + ' ' + r.reason
        sys.exit()
def get_area_from_net(self):
    session = requests.session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36'
    })
    response = session.get(baseUrl)
    soup = BeautifulSoup(response.text, "html5lib")
    # The <dl> tag that holds the URL for each area
    dl = soup.find_all("dl", attrs={"id": "rentid_D04_01"})
    # Collect all <a> tags inside the <dl> tag
    my_as = dl[0].find_all("a")
    for my_a in my_as:
        # Skip the "no limit" entry and the surrounding-area entries
        if my_a.text == "不限" or "周边" in my_a.text:
            continue
        # print(my_a["href"])
        # print(my_a.text)
        self.areaList.append(my_a.text)
def getListofRounds(session, baseApiUrl, cookies):
    roundsIds = []
    apiUrls = ['area', 'object']
    for apiUrl in apiUrls:
        rounds = session.get(baseApiUrl + '/' + apiUrl + '/rounds/planning',
                             cookies=cookies)
        roundsJson = rounds.json()
        roundsItems = [{
            'id': r['id'],
            'name': r['name'].replace('/', '-'),
            'type': apiUrl
        } for r in roundsJson]
        roundsIds.extend(roundsItems)
    print(roundsIds)
    return roundsIds
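# A usage sketch chaining the round helpers above: log in, list the rounds,
# then pull each round's Excel report. baseApiUrl is derived from the secure
# page checked in get_login_cookies; the results folder is an assumption:
import requests

session = requests.Session()
baseUrl = 'https://amsterdam.apptimizeplatform.nl'
baseApiUrl = baseUrl + '/api/inspectionround'
cookies = get_login_cookies(session, baseUrl)
if cookies:
    roundsIds = getListofRounds(session, baseApiUrl, cookies)
    getXLSofRounds(session, roundsIds, baseApiUrl, cookies, 'results')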
def get_orders_detail(account):
    payload = f"""SrcAccount={account['SrcAccount']}&PlaceId={account['SrcAccount_placeId']}&AccountKey={account['AccountKey']}"""
    url = f"""{url_trade}/GetOrdersDetail?{payload}"""
    result = []
    try:
        r = session.get(url, headers=headers)
    except Exception:
        return result
    soup = BeautifulSoup(r.content, 'html.parser')
    data_table = soup.find_all(class_="data-table")
    if len(data_table) > 0:
        result = table_to_list(data_table[0])
    return result
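# table_to_list above is not shown; a plausible sketch that flattens a
# BeautifulSoup table into a list of row lists (an assumption, not the
# original helper):
def table_to_list(table):
    rows = []
    for tr in table.find_all('tr'):
        cells = [td.get_text(strip=True) for td in tr.find_all(['td', 'th'])]
        if cells:
            rows.append(cells)
    return rows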
def downloadCourse(session, c, sem):
    global files
    global sections
    files = itertools.count()
    sections = itertools.count()
    name = c['key'].replace('/', '-') + '/'
    path = root + sem.replace('/', '-') + '/' + name
    #TODO: secure pathnames
    if not os.path.exists(path):
        os.makedirs(path)
    print '+--' + name
    r = session.get(c['url'])
    if r.status_code == 200:
        soup = BeautifulSoup(r.text, 'html.parser')
        for s in soup.find_all(class_='section main clearfix'):
            downloadSection(session, s, path)
        #print 'Saved ' + str(files.next()) + ' Files in ' + str(sections.next()) + ' Sections'
    else:
        print 'ERROR: ' + str(r.status_code) + ' ' + r.reason
        sys.exit()
def saveLink(session, url, path, name):
    global files
    files.next()
    fname = name.encode('utf-8').replace('/', '') + '.html'
    dst = path.encode('utf-8') + fname
    try:
        # If the link file can already be opened, skip it
        with open(dst):
            print '[' + colors.OKBLUE + 'skip' + colors.ENDC + '] | | +--%s' % name
    except IOError:
        with open(dst, 'wb') as handle:
            print '[' + colors.OKGREEN + 'save' + colors.ENDC + '] | | +--%s' % name
            r = session.get(url)
            soup = BeautifulSoup(r.text, 'html.parser')
            link = soup.find(class_='region-content').a['href']
            try:
                handle.write((u'<a href="' + link + u'">' + name + u'</a>').encode('utf-8'))
            except UnicodeEncodeError:
                os.remove(dst)
                print '[' + colors.FAIL + 'fail' + colors.ENDC + '] | | +--%s' % name
def scrape(link_set, path):
    # Create the directories in the path string if they do not exist
    if not os.path.exists(path):
        os.makedirs(path)
    for link in link_set:
        r = session.get(link)
        if r.status_code == 200:
            soup = BeautifulSoup(r.text, 'lxml')
            # Extract the user id; useful for naming the html and png files
            # we will scrape
            user_id = re.search(r"(?<=user\/view\.php\?id=)\d*", link).group()
            # Save the profile page as a .html file
            #output_file = path + str(user_id) + ".html"
            #with open(output_file, 'w') as f:
            #    f.write(str(soup))
            #print "page saved at " + output_file
            image_link = soup.find_all('img', class_='userpicture')[1].get('src')
            # Save the profile image as a .png file
            output_file = path + str(user_id) + ".png"
            response = requests.get(image_link, stream=True)
            with open(output_file, 'wb') as out_file:
                shutil.copyfileobj(response.raw, out_file)
            del response
            print "image saved at " + output_file
        else:
            print 'ERROR: ' + str(r.status_code) + ' ' + r.reason
            sys.exit()