def getHttpStatusCode2(url, result):
    '''Get the HTTP status code for a URL, following redirects.'''
    Logger.debug(u'---- getHttpStatusCode2 ---- BEGIN ----')
    Logger.debug(u'---- params1:url:%s ----' % url)
    status = ''
    try:
        r = http.urlopen(method='GET', url=url, timeout=10,
                         retries=False, redirect=False)
        status = r.status
        result.append([url, status])
        Logger.info("%s : %s" % (url, status))
        host = urllib3.get_host(url)
        if status in (301, 302):
            redirect = r.get_redirect_location()
            if urllib3.get_host(redirect)[1] is None:
                # Relative redirect target: rebuild an absolute URL from the
                # (scheme, host, port) tuple of the original URL
                redirect = host[0] + '://' + host[1] + '/' + redirect
            Logger.debug(u'Redirect URL: %s' % redirect)
            if redirect == url:
                # The URL redirects to itself; stop recursing
                pass
            else:
                return getHttpStatusCode2(redirect, result)
    except urllib3.exceptions.MaxRetryError:
        Logger.debug(u'---- return ----')
        result.append([url, 'invalid link'])
        return 'invalid link'
    except urllib3.exceptions.ConnectTimeoutError:
        # Connection timed out
        Logger.debug(u'---- return ----')
        result.append([url, 'connection timed out'])
        return 'connection timed out'
    except urllib3.exceptions.SSLError:
        Logger.debug(u'---- return ----')
        result.append([url, 'SSLError'])
        return 'SSLError'
    except (ConnectionResetError, urllib3.exceptions.ProtocolError):
        raise
    else:
        Logger.debug(u'---- return1:%s ----' % status)
        Logger.debug(u'---- getHttpStatusCode2 ---- END ----')
        return status
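# A minimal driver for the checker above -- a sketch, assuming `http` is a
# module-level urllib3.PoolManager and `Logger` is a configured logger
# (neither is shown in the snippet itself):
import logging
import urllib3

http = urllib3.PoolManager()
Logger = logging.getLogger('status-checker')

results = []
final_status = getHttpStatusCode2('http://example.com/', results)
# `results` accumulates [url, status] pairs for every hop in the redirect chain
for checked_url, status in results:
    print(checked_url, status)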
def __init__(
    self,
    billomat_id,
    billomat_api_key,
    billomat_app_id=None,
    billomat_app_secret=None,
):
    # Base URL
    self.url = "https://{billomat_id}.billomat.net/".format(
        billomat_id=billomat_id
    )

    # Headers
    self.headers = {
        "X-BillomatApiKey": billomat_api_key,
        "Content-Type": "application/xml",
    }
    if billomat_app_id:
        self.headers["X-AppId"] = billomat_app_id
    if billomat_app_secret:
        self.headers["X-AppSecret"] = billomat_app_secret

    # Bind Urlfetch for Google App Engine
    self.urlfetch = urlfetch

    # Initialize Urllib3-ConnectionPool
    if urllib3:
        scheme, host, port = urllib3.get_host(self.url)
        self.conn = urllib3.HTTPSConnectionPool(host=host, port=port)
    else:
        self.conn = None
def fetch_all_album_in_page() -> list:
    url = "http://www.gqxzt.com/gaoqingr/qingzxies/xinyanxiaogongzhu/1.html"
    resp = requests.get(url)
    resp.encoding = "gb2312"
    bs = BeautifulSoup(resp.text, "html.parser")
    album_list = bs.select("#list")[0].find_all("li")
    # get_host() returns (scheme, host, port); parse once and reuse
    scheme, hostname, _port = urllib3.get_host(url)
    host = scheme + "://" + hostname
    print(f"host: {host}")
    album_info_list = []
    for album in album_list:
        title = album.a.get("title")
        link = host + album.a.get("href")
        album_info_list.append((title, link))
    print(album_info_list)
    return album_info_list
def school(self, response):
    current = response.meta['current']
    for schoolInfo in response.xpath(
            '//div[contains(@class, "mar_t_30 overhidden")]'):
        self.schoolId += 1
        # Collect the school's details
        schoolItem = SchoolItem()
        schoolItem['id'] = self.schoolId
        schoolItem['type'] = 'school'
        schoolItem['name'] = schoolInfo.xpath(
            './following-sibling::div[@class="p_bor"]/a/text()'
        ).extract_first()
        schoolItem['address'] = schoolInfo.xpath(
            './descendant::tr[1]/td[1]/text()').extract_first()
        schoolItem['url'] = schoolInfo.xpath(
            './descendant::tr[1]/td[2]/a/@href').extract_first()
        if schoolInfo.xpath('./descendant::tr[2]/td[1]/text()'
                            ).extract_first().split(':')[1]:
            schoolItem['schoolType'] = schoolInfo.xpath(
                './descendant::tr[2]/td[1]/text()').extract_first().split(
                    ':')[1].strip()
        schoolItem['phone'] = schoolInfo.xpath(
            './descendant::tr[2]/td[2]/text()').extract_first().split(
                ':')[1].strip()
        schoolItem['top'] = schoolInfo.xpath(
            './descendant::tr[3]/td[1]/text()').extract_first().split(
                ':')[1].strip()
        yield schoolItem

        # The school's images
        imageItem = ImageItem()
        imageItem['id'] = self.schoolId
        imageItem['type'] = 'image'
        imageItem['image_urls'] = schoolInfo.xpath(
            './descendant::img/@src').extract()
        imageItem['image_paths'] = ''
        yield imageItem

        middleItem = CityToSchoolItem()
        middleItem['type'] = 'middle'
        middleItem['provinceId'] = current['ppid']
        middleItem['cityId'] = current['pid']
        middleItem['townId'] = current['id']
        middleItem['schoolId'] = self.schoolId
        yield middleItem

    # Crawl pagination
    host = urllib3.get_host(response.url)
    nextPage = response.xpath("//div[@id='pagenav']/ul/li[3]/a/@href")
    if nextPage:
        # get_host() returns (scheme, host, port); rebuild an absolute URL
        nextUrl = host[0] + '://' + host[1] + nextPage.extract_first()
        request = scrapy.Request(nextUrl, callback=self.school)
        request.meta['current'] = current
        yield request
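# Scrapy can build the absolute pagination URL itself, which avoids the manual
# scheme/host concatenation above -- a sketch of an alternative tail for the
# method, not part of the original spider:
nextPage = response.xpath("//div[@id='pagenav']/ul/li[3]/a/@href").extract_first()
if nextPage:
    # response.urljoin() resolves relative links against response.url
    request = scrapy.Request(response.urljoin(nextPage), callback=self.school)
    request.meta['current'] = current
    yield request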
def _redirect(self, request, secure):
    protocol = secure and "https" or "http"
    newurl = "%s://%s%s" % (protocol, get_host(request),
                            request.get_full_path())
    if settings.DEBUG and request.method == 'POST':
        raise RuntimeError(
            """Django can't perform a SSL redirect while maintaining POST data.
            Please structure your views so that redirects only occur during GETs.""")
    return HttpResponsePermanentRedirect(newurl)
def ViewResult(Keyword, curIP, nType, bSave, _AccountKey=None):
    conn = MySQLdb.connect(**ConfigFile)
    # conn.text_factory=str
    ResData = BingSearch(Keyword, _AccountKey)
    JsonData = {}
    try:
        JsonData = json.loads(ResData)
    except:
        pass
    url = []
    for key, value in JsonData.items():
        for key1, value1 in value.items():
            for lv in value1:
                try:
                    if nType == 0:
                        uu = lv['Url']
                        i = uu.find('/', 8)
                        url.append(uu[0:i])
                    elif nType == 1:
                        print lv['Url']
                    elif nType == 2:
                        print '%s -> %s' % (lv['Url'], lv['Title'])
                    if bSave:
                        host = urllib3.get_host(lv['Url'])[1]
                        newip = getIp(host)
                        if newip == curIP:
                            d_query_set = Data.objects.filter(ip=curIP).filter(uri=host)
                            if d_query_set.exists():
                                print("update 2 ip-host : %s->%s" % (newip, host))
                                # Same IP: if the domain changed, update the record in place
                                d_query = d_query_set[0]
                                d_query.uri = host
                                d_query.title = lv['Title']
                                d_query.descript = lv['Description']
                                d_query.state = '6'  # Domain updated
                                d_query.save()
                            else:
                                print "save to database %s" % host
                                # Parameterized query instead of string interpolation,
                                # so titles/descriptions cannot inject SQL
                                szSQL = ("INSERT INTO Data(IP,URI,Title,Descript) "
                                         "VALUES (%s,%s,%s,%s);")
                                cur = conn.cursor()
                                cur.execute(szSQL, (curIP, host, lv['Title'],
                                                    lv['Description']))
                                conn.commit()
                        else:
                            print("fake curip")
                except Exception, e:
                    print e
                finally:
                    pass
def ViewResult(data):
    JsonData = {}
    for r in data:
        ResData = r.get("ResData", "")
        curIP = r.get("curIP", "")
        updated = r.get("updated", "")
        data_query = Data.objects.filter(ip=curIP)
        if not updated:
            # The cached snapshot has not changed, so there is nothing to update
            print("nothing changed compared with cached page")
            logger.info("nothing changed compared with cached page")
            continue
        try:
            JsonData = json.loads(ResData)
        except:
            pass
        print("[%s]" % curIP)
        logger.info("[%s]" % curIP)
        url = []
        for key, value in JsonData.items():
            for key1, value1 in value.items():
                for lv in value1:
                    try:
                        print(lv['Url'])
                        logger.info(lv['Url'])
                        host = urllib3.get_host(lv['Url'])[1]
                        d_query_set = data_query.filter(uri=host)
                        if d_query_set.exists():
                            print("update 2 ip-host : %s->%s" % (curIP, host))
                            # Same IP: if the domain changed, update the record in place
                            d_query = d_query_set[0]
                            d_query.uri = host
                            d_query.title = lv['Title']
                            d_query.descript = lv['Description']
                            d_query.state = '6'  # Domain updated
                            d_query.save()
                        else:
                            print "save to database %s" % host
                            now = datetime.datetime.now()
                            Data.objects.create(ip=curIP, uri=host,
                                                title=lv['Title'],
                                                descript=lv['Description'],
                                                time=now)
                    except Exception, e:
                        print e
                    finally:
                        pass
def url_query_convert_to_path(url, file_type):
    """Convert a URL into a filesystem path.

    Example: http://lenta.ru/news/2013/03/dtp/index.html
             => [CUR_DIR]/lenta.ru/news/2013/03/dtp/index.txt
    """
    host = get_host(url)[1]
    query = urlparse(url).path
    query = query.strip('/')
    query = '/'.join(query.split('/')[0:-1])
    file_name = list(filter(None, urlparse(url).path.split('/')))[-1]
    file_name = os.path.splitext(file_name)[0]
    return "{host}/{query}".format(host=host, query=query), \
        "{file_name}.{file_type}".format(file_name=file_name,
                                         file_type=file_type)
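# A quick sanity check of the conversion above, traced by hand from the code
# (the expected output is an inference, not taken from the original source):
dir_path, file_name = url_query_convert_to_path(
    'http://lenta.ru/news/2013/03/dtp/index.html', 'txt')
print(dir_path)   # lenta.ru/news/2013/03/dtp
print(file_name)  # index.txt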
def process_view(self, request, view_func, view_args, view_kwargs):
    # Only perform the redirect when not in debug mode
    if not settings.DEBUG:
        protocol = 'https://' if request.is_secure() else 'http://'
        host = get_host(request)
        print('host:', host)
        new_url = ''
        try:
            if host in settings.CANON_URLS_TO_REWRITE:
                new_url = (protocol + settings.CANON_URL_HOST +
                           request.get_full_path())
        except AttributeError:
            if host != settings.CANON_URL_HOST:
                new_url = (protocol + settings.CANON_URL_HOST +
                           request.get_full_path())
        if new_url:
            return HttpResponsePermanentRedirect(new_url)
def __init__(
    self,
    billomat_id,
    billomat_api_key,
    billomat_app_id=None,
    billomat_app_secret=None,
    timeout_seconds=600  # 10 minutes
):
    self.timeout_seconds = timeout_seconds

    # Base URL
    self.url = "https://{billomat_id}.billomat.net/".format(
        billomat_id=billomat_id)

    # Headers
    self.headers = {
        "X-BillomatApiKey": billomat_api_key,
        "Content-Type": "application/xml",
    }
    if billomat_app_id:
        self.headers["X-AppId"] = billomat_app_id
    if billomat_app_secret:
        self.headers["X-AppSecret"] = billomat_app_secret

    # Bind Urlfetch for Google App Engine
    self.urlfetch = urlfetch

    # Initialize Urllib3-ConnectionPool with certificate verification
    if urllib3:
        scheme, host, port = urllib3.get_host(self.url)
        import certifi
        self.conn = urllib3.HTTPSConnectionPool(
            host=host,
            port=port,
            timeout=self.timeout_seconds,
            cert_reqs='CERT_REQUIRED',
            ca_certs=certifi.where())
    else:
        self.conn = None
def parse_uri(url, path):
    logging.getLogger("urllib3").setLevel(logging.INFO)
    host = urllib3.get_host(url)
    logger.info(host)
    # Pick the pool class matching the URL scheme
    if host[0] == 'http':
        pool = urllib3.HTTPConnectionPool(host=host[1], port=host[2], maxsize=10)
    elif host[0] == 'https':
        pool = urllib3.HTTPSConnectionPool(host=host[1], port=host[2], maxsize=10)
    else:
        raise ValueError("Unsupported scheme: %s" % host[0])
    # req = urllib3.HTTPResponse()
    req = pool.request("GET", url, timeout=2.5)
    if req.status == 200:
        logging.info("Make directory %s" % path)
        try:
            os.mkdir(os.path.join(os.path.curdir, path))
        except Exception as e:
            logger.warning(str(e))
        parse_body(req.data, url, path)
    else:
        logger.error("Error while loading page: %s" % str(req.status))
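# On urllib3 1.x the scheme dispatch above can also be delegated to urllib3
# itself: connection_from_url() inspects the URL and returns an
# HTTPConnectionPool or HTTPSConnectionPool accordingly (a sketch added here
# for comparison; it is not part of the original function):
import urllib3

pool = urllib3.connection_from_url('https://example.com/', maxsize=10)
resp = pool.request('GET', 'https://example.com/')
print(resp.status)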
def get_and_test(self, url: str = 'https://baidu.com', num: int = 3, timeout=1):
    # Only test whether the domain itself is reachable
    scheme, hostname = urllib3.get_host(url)[:2]
    url = scheme + '://' + hostname
    proxy_list, proxy_type = [], scheme
    while num > 0:
        proxy_ip = self.get(proxy_type)
        if not proxy_ip:
            print('proxy pool is still warming up', proxy_ip)
            time.sleep(3)
            continue  # retry instead of testing an empty proxy
        try:
            self.log('got proxy ip:', proxy_ip, proxy_type)
            self.log(f'testing the proxy against {url}')
            get(url, proxies=f'{proxy_type}://{proxy_ip}',
                timeout=timeout, verify=False)
            num -= 1
            proxy_list.append(f'{proxy_type}://{proxy_ip}')
        except Exception as e:
            self.log(f'proxy {proxy_ip} failed: {e}')
            self.delete(proxy_ip)
    self.log('finished collecting proxies:', proxy_list)
    return proxy_list
'.net.bz', '.cc', '.com.co', '.net.co', '.nom.co', '.de', '.es', '.com.es',
'.nom.es', '.org.es', '.eu', '.fm', '.fr', '.gs', '.in', '.co.in', '.firm.in',
'.gen.in', '.ind.in', '.net.in', '.org.in', '.it', '.jobs', '.jp', '.ms',
'.com.mx', '.nl', '.nu', '.co.nz', '.net.nz', '.org.nz', '.se', '.tc', '.tk',
'.tw', '.com.tw', '.idv.tw', '.org.tw', '.hk', '.co.uk', '.me.uk', '.org.uk',
'.vg'
]

# Match two-part suffixes before single-part ones
# (the original discarded the sorted() result, which was a no-op)
shuffix_lst = sorted(shuffix_lst, key=len, reverse=True)

# Timestamp used to name the output folder
time_stamp = str(time.time())[:10]
os.mkdir('domains_{}'.format(time_stamp))

for line in d:
    name, url = line.split()
    # Parse the hostname out of the URL
    url = urllib3.get_host(url)[1]
    # Reduce to the registrable root domain
    for shuffix in shuffix_lst:
        if shuffix in url:
            url = url.split(shuffix)[0].split('.')[-1] + shuffix
            break
    print(url)
    try:
        res = pythonwhois.get_whois(url)
    except:
        res = '{}: WHOIS lookup failed'.format(url)
    with open('domains_{}/{}-{}.txt'.format(time_stamp, name, url), 'w') as f:
        json.dump(res, f, ensure_ascii=False, cls=DateEncoder, indent=4)
def __init__(self, html_text, url, settings=None):
    self._url = url
    self.config = Config(get_host(url)[1], settings)
    self.soup = BeautifulSoup(html_text, features='html.parser')
import urllib3

# get_host() parses a URL into a (scheme, host, port) tuple
host = urllib3.get_host("https://baidu.com")
print(host)  # ('https', 'baidu.com', None)
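# The richer parse_url() helper from urllib3.util covers the same ground as a
# structured object, and also exposes path and query (a sketch added here for
# comparison; it is not part of the original snippet):
from urllib3.util import parse_url

u = parse_url("https://baidu.com:8080/s?wd=python")
print(u.scheme, u.host, u.port)  # https baidu.com 8080
print(u.path, u.query)           # /s wd=python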
        time.sleep(0.1)

    @classmethod
    def tearDownClass(cls):
        import urllib  # Yup, that's right.
        try:
            urllib.urlopen(cls.scheme + '://' + cls.host + ':' +
                           str(cls.port) + '/shutdown')
        except IOError:
            pass
        cls.server_thread.join()


class HTTPSDummyServerTestCase(HTTPDummyServerTestCase):
    scheme = 'https'
    host = 'localhost'
    port = 18082


if __name__ == '__main__':
    log.setLevel(logging.DEBUG)
    log.addHandler(logging.StreamHandler(sys.stderr))

    from urllib3 import get_host

    url = "http://localhost:8081"
    if len(sys.argv) > 1:
        url = sys.argv[1]

    scheme, host, port = get_host(url)
    make_server(scheme=scheme, host=host, port=port)
    # If the cached snapshot has not changed, there is nothing to update
    print("nothing changed compared with cached page")
    return
try:
    JsonData = json.loads(ResData)
except Exception, e:
    print e
print "[%s]" % curIP
url = []
for key, value in JsonData.items():
    for key1, value1 in value.items():
        for lv in value1:
            try:
                print lv['Url']
                host = urllib3.get_host(lv['Url'])[1]
                # newip = getIp(host)
                # if newip == curIP:
                d_query_set = data_query.filter(uri=host)
                if d_query_set.exists():
                    print("update 2 ip-host : %s->%s" % (curIP, host))
                    # Same IP: if the domain changed, update the record in place
                    d_query = d_query_set[0]
                    d_query.uri = host
                    d_query.title = lv['Title']
                    d_query.descript = lv['Description']
                    d_query.state = '6'  # Domain updated
                    d_query.save()
                else:
                    print "save to database %s" % host
                    Data.objects.create(ip=curIP, uri=host,
                                        title=lv['Title'],
                                        descript=lv['Description'])
            except Exception, e:
                print e
#coding:utf-8
from tkinter import *
from tkinter.scrolledtext import ScrolledText
import threading
import time
import queue
import urllib3
import requests

li = list()
URL = "http://localhost:8099/MeetingMag/web/MeetMag/menu.htm"

# get_host() parses the URL into (scheme, host, port)
host = urllib3.get_host(URL)
li.append("host:" + str(host))

# Issue the request once and reuse the response instead of re-fetching per field
resp = requests.get(URL)
li.append("status:" + str(resp.status_code))
li.append("charset:" + str(resp.encoding))
li.append("cookies:" + str(resp.cookies))
li.append("headers:" + str(resp.headers))
li.append("url:" + str(resp.url))
li.append("codes:" + str(requests.codes))
li.append("history:" + str(resp.history))
import urllib3
from bs4 import BeautifulSoup

quote_page = 'https://www.bloomberg.com/quote/SPX:IND'
# get_host() only parses the URL; to actually fetch the page, use a PoolManager
http = urllib3.PoolManager()
page = http.request('GET', quote_page).data
soup = BeautifulSoup(page, 'html.parser')
name_box = soup.find('h1', attrs={'class': 'companyName__99a4824b'})
name = name_box.text.strip()  # strip() removes leading and trailing whitespace
print(name)
import urllib3

url = 'http://www.acme.com/products/3322'
# get_host() only parses the URL into (scheme, host, port);
# it does not open a connection or read a response body
response = urllib3.get_host(url)
print("The Response is :", response)  # ('http', 'www.acme.com', None)
def make_server(**kw):
    try:
        return eventlet_server(**kw)
    except ImportError:
        return simple_server(**kw)


def make_server_thread(target, **kw):
    import threading
    t = threading.Thread(target=target, kwargs=kw)
    t.start()
    return t


if __name__ == '__main__':
    log.setLevel(logging.DEBUG)
    log.addHandler(logging.StreamHandler(sys.stderr))

    from urllib3 import get_host

    url = "http://localhost:8081"
    if len(sys.argv) > 1:
        url = sys.argv[1]

    print "Starting server at: %s" % url

    scheme, host, port = get_host(url)
    make_server(scheme=scheme, host=host, port=port)
#! /usr/bin/env python
# -*- coding: utf-8 -*-
# File Name: webpage
# Description :
# Author : SanYapeng
# date: 2019/2/2
# Change Activity: 2019/2/2:

import urllib3

url = "http://www.baidu.com"
# Despite the variable name, get_host() returns a (scheme, host, port)
# tuple, not the page content
webpage = urllib3.get_host(url)
print(webpage)  # ('http', 'www.baidu.com', None)
def download_mplce_url(urldest_tuple):
    import requests, re, urllib, urllib2, urllib3, OpenSSL, subprocess
    from os import path
    import urllib3.contrib.pyopenssl
    urllib3.contrib.pyopenssl.inject_into_urllib3()
    urllib3.disable_warnings()
    countimage = 0
    countstyle = 0
    image_url, destpath = urldest_tuple
    destdir = path.dirname(destpath)
    colorstyle = destpath.split('/')[-1][:9]
    alt_number = destpath.split('_')[-1][0]
    try:
        image_url = 'https://www.drop'.join(image_url.split('https://wwwop'))
        image_url = urllib.unquote_plus(image_url)
    except:
        pass

    ########################################################
    ################## REGEX Filters Defined ###############
    ## Image URL Cleanup and Replace Extraneous/Bad Chars ##
    ########################################################
    ####### Dropbox Fix for View vs DL value ###############
    regex_dbx = re.compile(r'^https://www.dropbox.com/.+?\.[jpngJPNG]{3}$')
    regex_dbxprev = re.compile(r'^https://www.dropbox.com/.+?preview.*\.[jpngJPNG]{3}$')
    # Normalize every dl=N query value to dl=1 (direct download); this single
    # regex replaces the original chain of nine str.replace() calls
    image_url = re.sub(r'dl=[02-9]', 'dl=1', image_url)
    regex_dl = re.compile(r'^.+\?dl=1.*?$')
    if regex_dbx.findall(image_url):
        if regex_dbxprev.findall(image_url):
            print 'REGEX DBXPRE'
            # import http_tools.auth.Dropbox.dropboxapi_service as dropboxapi_service
            final_path = image_url
            # dropboxapi_service.download_auth_file(image_url=image_url, destpath=destpath)
            if final_path:
                print final_path, 'Final DBX Path'
                image_url = final_path
        else:
            # str.replace returns a new string, so the result must be reassigned
            image_url = image_url.replace('.JPG', '.jpg').replace('.PNG', '.png')
            print 'REGEX DBX dl=1'
            if not regex_dl.findall(image_url):
                image_url = image_url + '&dl=1'

    regex_validurl = re.compile(r'^http[s]?://.+?$', re.U)
    regex_ftpurl = re.compile(r'^ftp[s]?://.+?$', re.U)
    regex_drive2 = re.compile(r'^(https://d(.+?)\.google\.com/).*\?id\=(?P<fileId>.+?)\&?.*?$', re.U)
    regex_drive3 = re.compile(r'^(https://d(.+?)\.google\.com/file/d/)(?P<fileId>.+?)/(edit|view)\?usp\=.*?$', re.U)
    # regex_dropbox = re.compile(r'^https?://www.dropbox.com/.+?\.[jpngJPNG]{3}$')
    #### BOX API AUTH #### removed
    # regex_boxapi = re.compile(r'^(https?)?(?:\://)?(?P<VENDER_ROOT>.*)?(.*?)\.box\.com/(s/)?(?P<SHARED_LINK_ID>.+)?/?(\.?[jpngJPNG]{3,4})?(.*?)?\??(.*?)?$', re.U)
    #### DRIVE API AUTH ####
    # if regex_drive2.findall(image_url):

    print ' 404 - 1 - Trying Urllib3 ', image_url
    hostname = urllib3.get_host(image_url)[1]

    #### TEMPORARY: RESTRICT URL from Merchantry Placeholder ####
    old_blue_sweater_placeholder_url = 'https://pim-image-4b2a111d-0df4-447e-bc31-d288a9-s3bucket-1in4kl2m21ire.s3.amazonaws.com/7274a8bcbb3fa06076da6e1c2950e617b24baf4f5d467def6c88f276a0e15f12.jpg'
    placeholder_url = 'https://pim-image-4b2a111d-0df4-447e-bc31-d288a9-s3bucket-1in4kl2m21ire.s3.amazonaws.com/5d167c72a8100a911f06940395589769857cbd395e452bf21416503d1c43861c.jpg'
    if image_url == placeholder_url or image_url == old_blue_sweater_placeholder_url:
        print 'RESTRICTING DOWNLOAD of Merchantry Placeholder image.--> {}'.format(image_url)
        return
    #### END #### Adjust below elif to initial if once removed above ####
    elif regex_drive3.findall(image_url):
        image_url = drive_match_fileid(image_url)
        print image_url, ' DRIVE3 --ID--> '
        import http_tools.auth.Google.google_drive_auth_downloader as google_drive_auth_downloader
        try:
            final_path = google_drive_auth_downloader.download_google_drive_file(
                image_url=image_url, destpath=destpath)
            if final_path:
                return final_path
            else:
                print 'Final DRIVE Failure ', destpath, '\n', image_url
        except IndexError:
            print 'Final DRIVE Exception ', destpath, '\n', image_url
        # Tmp - Cannot use auth from johnb or http_tools above
        except:
            pass
    elif regex_drive2.findall(image_url):
        print image_url, ' DRIVE'
        # import jbmodules
        # from jbmodules import http_tools.auth.Google.google_drive_auth_downloader
        try:
            final_path = google_drive_auth_downloader.download_google_drive_file(
                image_url=image_url, destpath=destpath)
            if final_path:
                return final_path
            else:
                print 'Final DRIVE Failure ', destpath, '\n', image_url
        except IndexError:
            print 'Final DRIVE Exception ', destpath, '\n', image_url
        # Tmp - Cannot use auth from johnb or http_tools above
        except:
            pass
    elif regex_ftpurl.findall(image_url):
        print image_url, ' FTP--FTP\n...probably Jaipur...'
        from http_tools.ftp_functions import pycurl_ftp_download
        import pycurl
        try:
            res = pycurl_ftp_download(imageurl=image_url, destpath=destpath)
            print 'FTEPPEE --> {}\n{}\t\n'.format(res, image_url, destpath)
            return destpath
        except pycurl.error, error:
            print 'Pycurl error in FTP Download --> ', error
        # Tmp - Cannot use auth from johnb or http_tools above
        except:
import os
import argparse
import time
import urllib3

from logger import Logger
from pathlib import Path
from http_helper import HttpHelper
from html_parser import DictHtmlParser

BASE_URL = 'https://www.spanishdict.com'
HOST = urllib3.get_host(BASE_URL)[1]
SEARCH_URL = '{0}/translate/'.format(BASE_URL)
CONJUGATE_URL = '{0}/conjugate'.format(BASE_URL)

PROJECT_FILEPATH = os.getcwd()
DOWNLOAD_FILEPATH = '{0}/downloads'.format(PROJECT_FILEPATH)
JSON_DOWNLOAD_FILEPATH = '{0}/json'.format(DOWNLOAD_FILEPATH)
HTML_TRANSLATION_DOWNLOAD_FILEPATH = '{0}/translate'.format(DOWNLOAD_FILEPATH)
HTML_CONJUGATION_DOWNLOAD_FILEPATH = '{0}/conjugate'.format(DOWNLOAD_FILEPATH)

LOG_FILEPATH = "{0}/logs".format(PROJECT_FILEPATH)
JSON_HISTORY_LOG = '{0}/.json_history'.format(LOG_FILEPATH)
HTML_HISTORY_LOG = '{0}/.history'.format(LOG_FILEPATH)
ERROR_LOG = '{0}/.error_log'.format(LOG_FILEPATH)


def create_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', '--scrape', help='Scrape words')
    parser.add_argument(
        '-p', '--parse',
        help='Parse dictionary entries from downloaded html pages')
    parser.add_argument('-d', '--define', help='Define word')
    parser.add_argument(
        '-i', '--input',
        help='Input file containing list of words to define/conjugate.')
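# One plausible use of the HOST constant above -- a hypothetical sketch
# (HttpHelper's real implementation is not shown in this file):
import urllib3

# Pin a single HTTPS connection pool for the site and request paths off it
pool = urllib3.HTTPSConnectionPool(HOST, port=443, maxsize=4)
resp = pool.request('GET', '/translate/hola')
print(resp.status)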