Example #1
def getHttpStatusCode2(url, result):
    '''
    Get the HTTP status code for a URL.
    '''
    Logger.debug(u'---- getHttpStatusCode2 ---- BEGIN ----')
    Logger.debug(u'---- params1:url:%s ----' % url)
    status = ''
    try:
        r = http.urlopen(method='GET',
                         url=url,
                         timeout=10,
                         retries=False,
                         redirect=False)
        status = r.status
        result.append([url, status])
        Logger.info("%s : %s" % (url, status))
        host = urllib3.get_host(url)
        if (status in [301, 302]):
            redirect = r.get_redirect_location()
            if (urllib3.get_host(redirect)[1] is None):
                redirect = host[0] + '://' + host[1] + '/' + redirect
            Logger.debug(u'redirect url: %s' % redirect)
            if (redirect == url):
                # The URL redirects to itself; stop following.
                pass
            else:
                return getHttpStatusCode2(redirect, result)
    except urllib3.exceptions.MaxRetryError as e:
        Logger.debug(u'---- return ----')
        result.append([url, 'invalid link'])
        return 'invalid link'
    except urllib3.exceptions.ConnectTimeoutError as e:
        # Connection timed out
        Logger.debug(u'---- return ----')
        result.append([url, 'connection timeout'])
        return 'connection timeout'
    except urllib3.exceptions.SSLError as e:
        # SSL error
        Logger.debug(u'---- return ----')
        result.append([url, 'SSLError'])
        return 'SSLError'
    except ConnectionResetError as e:
        raise
    except urllib3.exceptions.ProtocolError as e:
        raise
    except:
        raise
    else:
        Logger.debug(u'---- return1:%s ----' % status)
        Logger.debug(u'---- getHttpStatusCode2 ---- END ----')
        return status
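The function above relies on a module-level http pool and Logger that the snippet does not show; a minimal sketch of that setup (names assumed from the snippet) and an example call:

import logging
import urllib3

# Assumed module-level objects used by getHttpStatusCode2 above.
logging.basicConfig(level=logging.DEBUG)
Logger = logging.getLogger(__name__)
http = urllib3.PoolManager()

# Collect (url, status) pairs into `result`, following redirects manually.
result = []
getHttpStatusCode2('http://example.com/', result)
print(result)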
Example #2
    def __init__(
        self,
        billomat_id,
        billomat_api_key,
        billomat_app_id = None,
        billomat_app_secret = None,
    ):

        # Base URL
        self.url = "https://{billomat_id}.billomat.net/".format(
            billomat_id = billomat_id
        )

        # Headers
        self.headers = {
            "X-BillomatApiKey": billomat_api_key,
            "Content-Type": "application/xml",
        }
        if billomat_app_id:
            self.headers["X-AppId"] = billomat_app_id
        if billomat_app_secret:
            self.headers["X-AppSecret"] = billomat_app_secret

        # Bind Urlfetch for Google App Engine
        self.urlfetch = urlfetch

        # Initialize Urllib3-ConnectionPool
        if urllib3:
            scheme, host, port = urllib3.get_host(self.url)
            self.conn = urllib3.HTTPSConnectionPool(
                host = host, port = port
            )
        else:
            self.conn = None
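A hedged usage sketch of this constructor; the enclosing class name is not shown in the snippet, so BillomatClient below is hypothetical:

# Hypothetical class name; the snippet only shows __init__.
client = BillomatClient(
    billomat_id="mycompany",
    billomat_api_key="SECRET_KEY",
)
if client.conn is not None:
    # HTTPSConnectionPool inherits request() from urllib3's RequestMethods.
    response = client.conn.request("GET", "/api/clients", headers=client.headers)
    print(response.status)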
Example #3
def fetch_all_album_in_page() -> list:
    url = "http://www.gqxzt.com/gaoqingr/qingzxies/xinyanxiaogongzhu/1.html"
    resp = requests.get(url)
    resp.encoding = "gb2312"
    bs = BeautifulSoup(resp.text, "html.parser")

    album_list = bs.select("#list")[0].find_all("li")

    scheme, hostname, _ = urllib3.get_host(url)
    host = scheme + "://" + hostname
    print(f"host: {host}")

    album_info_list = []
    for album in album_list:
        title = album.a.get("title")
        link = host + album.a.get("href")
        album_info_list.append((title, link))

    print(album_info_list)
    return album_info_list
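For reference, urllib3.get_host only splits a URL; a minimal sketch of its return value for this page's URL:

import urllib3

# get_host() yields a (scheme, host, port) tuple; port is None when the URL
# does not specify one.
print(urllib3.get_host("http://www.gqxzt.com/gaoqingr/qingzxies/xinyanxiaogongzhu/1.html"))
# -> ('http', 'www.gqxzt.com', None)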
Example #4
    def school(self, response):
        current = response.meta['current']
        for schoolInfo in response.xpath(
                '//div[contains(@class, "mar_t_30 overhidden")]'):
            self.schoolId += 1

            # Extract the school's details
            schoolItem = SchoolItem()
            schoolItem['id'] = self.schoolId
            schoolItem['type'] = 'school'
            schoolItem['name'] = schoolInfo.xpath(
                './following-sibling::div[@class="p_bor"]/a/text()'
            ).extract_first()
            schoolItem['address'] = schoolInfo.xpath(
                './descendant::tr[1]/td[1]/text()').extract_first()
            schoolItem['url'] = schoolInfo.xpath(
                './descendant::tr[1]/td[2]/a/@href').extract_first()
            if schoolInfo.xpath('./descendant::tr[2]/td[1]/text()'
                                ).extract_first().split(':')[1]:
                schoolItem['schoolType'] = schoolInfo.xpath(
                    './descendant::tr[2]/td[1]/text()').extract_first().split(
                        ':')[1].strip()
            schoolItem['phone'] = schoolInfo.xpath(
                './descendant::tr[2]/td[2]/text()').extract_first().split(
                    ':')[1].strip()
            schoolItem['top'] = schoolInfo.xpath(
                './descendant::tr[3]/td[1]/text()').extract_first().split(
                    ':')[1].strip()
            yield schoolItem

            # The school's images
            imageItem = ImageItem()
            imageItem['id'] = self.schoolId
            imageItem['type'] = 'image'
            imageItem['image_urls'] = schoolInfo.xpath(
                './descendant::img/@src').extract()
            imageItem['image_paths'] = ''
            yield imageItem

            middleItem = CityToSchoolItem()
            middleItem['type'] = 'middle'
            middleItem['provinceId'] = current['ppid']
            middleItem['cityId'] = current['pid']
            middleItem['townId'] = current['id']
            middleItem['schoolId'] = self.schoolId
            yield middleItem

        # Crawl the next page of results.
        host = urllib3.get_host(response.url)
        nextPage = response.xpath("//div[@id='pagenav']/ul/li[3]/a/@href").extract_first()
        if nextPage:
            nextUrl = host[0] + '://' + host[1] + nextPage
            request = scrapy.Request(nextUrl, callback=self.school)
            request.meta['current'] = current
            yield request
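The spider assumes Scrapy Item classes that the snippet does not show; a minimal sketch reconstructed from the fields it assigns (the definitions are assumptions):

import scrapy

# Field names taken from the assignments in school() above.
class SchoolItem(scrapy.Item):
    id = scrapy.Field()
    type = scrapy.Field()
    name = scrapy.Field()
    address = scrapy.Field()
    url = scrapy.Field()
    schoolType = scrapy.Field()
    phone = scrapy.Field()
    top = scrapy.Field()

class ImageItem(scrapy.Item):
    id = scrapy.Field()
    type = scrapy.Field()
    image_urls = scrapy.Field()
    image_paths = scrapy.Field()

class CityToSchoolItem(scrapy.Item):
    type = scrapy.Field()
    provinceId = scrapy.Field()
    cityId = scrapy.Field()
    townId = scrapy.Field()
    schoolId = scrapy.Field()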
Example #5
    def _redirect(self, request, secure):
        protocol = "https" if secure else "http"

        newurl = "%s://%s%s" % (protocol, get_host(request),
                                request.get_full_path())

        if settings.DEBUG and request.method == 'POST':
            raise RuntimeError(
                "Django can't perform a SSL redirect while maintaining POST data. "
                "Please structure your views so that redirects only occur during GETs."
            )
        return HttpResponsePermanentRedirect(newurl)
Example #6
def ViewResult(Keyword, curIP, nType, bSave, _AccountKey=None):
    conn = MySQLdb.connect(**ConfigFile)
    #conn.text_factory=str
    ResData = BingSearch(Keyword, _AccountKey)
    JsonData = {}
    try:
        JsonData = json.loads(ResData)
    except:
        pass
    url = []
    for key, value in JsonData.items():
        for key1, value1 in value.items():
            for lv in value1:
                try:
                    if nType == 0:
                        uu = lv['Url']
                        i = uu.find('/', 8)
                        url.append(uu[0:i])
                    elif nType == 1:
                        print lv['Url']
                    elif nType == 2:
                        print '%s -> %s' % (lv['Url'], lv['Title'])
                    if bSave:
                        host = urllib3.get_host(lv['Url'])[1]
                        newip = getIp(host)
                        if newip == curIP:
                            d_query_set = Data.objects.filter(ip=curIP).filter(
                                uri=host)
                            if d_query_set.exists():
                                print("update 2 ip-host : %s->%s" %
                                      (newip, host))
                                #同个ip 如果更新域名则同步更新内容
                                d_query = d_query_set[0]
                                d_query.uri = host
                                d_query.title = lv['Title']
                                d_query.descript = lv['Description']
                                d_query.state = '6'  #域名更新
                                d_query.save()
                            else:
                                print "save to database %s" % host
                                szSQL = "Insert into Data(IP,URI,Title,Descript) values ('%s','%s','%s','%s');" % (
                                    curIP, host, lv['Title'],
                                    lv['Description'])
                                cur = conn.cursor()
                                cur.execute(szSQL)
                                conn.commit()
                        else:
                            print("fake curip")
                except Exception, e:
                    print e
                finally:
                    pass
Example #7
def ViewResult(data):
    JsonData = {}

    for r in data:
        ResData = r.get("ResData", "")
        curIP = r.get("curIP", "")
        updated = r.get("updated", "")
        data_query = Data.objects.filter(ip=curIP)

        if not updated:
            # Skip when nothing changed compared with the cached snapshot.
            print("nothing changed compared with cached page")
            logger.info("nothing changed compared with cached page")
            continue
        try:
            JsonData = json.loads(ResData)
        except:
            pass
        print("[%s]" % curIP)
        logger.info("[%s]" % curIP)
        url = []
        for key, value in JsonData.items():
            for key1, value1 in value.items():
                for lv in value1:
                    try:
                        print(lv['Url'])
                        logger.info(lv['Url'])
                        host = urllib3.get_host(lv['Url'])[1]
                        d_query_set = data_query.filter(uri=host)
                        if d_query_set.exists():
                            print("update 2 ip-host : %s->%s" % (curIP, host))
                            #同个ip 如果更新域名则同步更新内容
                            d_query = d_query_set[0]
                            d_query.uri = host
                            d_query.title = lv['Title']
                            d_query.descript = lv['Description']
                            d_query.state = '6'  #域名更新
                            d_query.save()
                        else:
                            print "save to database %s" % host
                            now = datetime.datetime.now()
                            Data.objects.create(ip=curIP,
                                                uri=host,
                                                title=lv['Title'],
                                                descript=lv['Description'],
                                                time=now)
                    except Exception, e:
                        print e

                    finally:
                        pass
Example #8
def url_query_convert_to_path(url, file_type):
    """Convert a URL into a filesystem path.
    Example: http://lenta.ru/news/2013/03/dtp/index.html =>
            [CUR_DIR]/lenta.ru/news/2013/03/dtp/index.txt
    """
    host = get_host(url)[1]
    query = urlparse(url).path
    query = query.strip('/')
    query = '/'.join(query.split('/')[0:-1])
    file_name = list(filter(None, urlparse(url).path.split('/')))[-1]

    file_name = os.path.splitext(file_name)[0]
    return "{host}/{query}".format(host=host, query=query), \
           "{file_name}.{file_type}".format(
               file_name=file_name, file_type=file_type)
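A quick usage sketch based on the example in the docstring:

directory, file_name = url_query_convert_to_path(
    "http://lenta.ru/news/2013/03/dtp/index.html", "txt")
print(directory)   # lenta.ru/news/2013/03/dtp
print(file_name)   # index.txt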
Example #11
    def process_view(self, request, view_func, view_args, view_kwargs):
        if not settings.DEBUG:
            # Only perform the redirect when not in debug mode.
            protocol = 'https://' if request.is_secure() else 'http://'
            host = get_host(request)
            print('host:', host)
            new_url = ''
            try:
                if host in settings.CANON_URLS_TO_REWRITE:
                    new_url = protocol + settings.CANON_URL_HOST + request.get_full_path()
            except AttributeError:
                if host != settings.CANON_URL_HOST:
                    new_url = protocol + settings.CANON_URL_HOST + request.get_full_path()

            if new_url:
                return HttpResponsePermanentRedirect(new_url)
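A minimal sketch of the settings this middleware expects; the host values here are hypothetical:

# settings.py (hypothetical values)
CANON_URL_HOST = 'www.example.com'
CANON_URLS_TO_REWRITE = ['example.com', 'old.example.com']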
Example #12
    def __init__(
            self,
            billomat_id,
            billomat_api_key,
            billomat_app_id=None,
            billomat_app_secret=None,
            timeout_seconds=600  # 10 Minutes
    ):

        self.timeout_seconds = timeout_seconds

        # Base URL
        self.url = "https://{billomat_id}.billomat.net/".format(
            billomat_id=billomat_id)

        # Headers
        self.headers = {
            "X-BillomatApiKey": billomat_api_key,
            "Content-Type": "application/xml",
        }
        if billomat_app_id:
            self.headers["X-AppId"] = billomat_app_id
        if billomat_app_secret:
            self.headers["X-AppSecret"] = billomat_app_secret

        # Bind Urlfetch for Google App Engine
        self.urlfetch = urlfetch

        # Initialize Urllib3-ConnectionPool
        if urllib3:
            scheme, host, port = urllib3.get_host(self.url)
            import certifi

            self.conn = urllib3.HTTPSConnectionPool(
                host=host,
                port=port,
                timeout=self.timeout_seconds,
                cert_reqs='CERT_REQUIRED',
                ca_certs=certifi.where())

        else:
            self.conn = None
Example #13
def parse_uri(url, path):
    logging.getLogger("urllib3").setLevel(logging.INFO)
    host = urllib3.get_host(url)
    logger.info(host)
    if host[0] == 'http':
        pool = urllib3.HTTPConnectionPool(host=host[1], port=host[2], maxsize=10)
    elif host[0] == 'https':
        pool = urllib3.HTTPSConnectionPool(host=host[1], port=host[2], maxsize=10)
    else:
        raise ValueError("Unsupported scheme: %s" % host[0])

    req = pool.request("GET", url, timeout=2.5)
    if req.status == 200:
        logger.info("Make directory %s" % path)
        try:
            os.mkdir(os.path.join(os.path.curdir, path))
        except Exception as e:
            logger.warning(str(e))
        parse_body(req.data, url, path)
    else:
        logger.error("Error while loading page: %s" % str(req.status))
Example #14
    def get_and_test(self, url: str = 'https://baidu.com', num: int = 3, timeout=1):
        # Only test whether the domain itself is reachable.
        scheme, hostname, _ = urllib3.get_host(url)
        url = scheme + '://' + hostname
        proxy_list, proxy_type = [], scheme
        while num > 0:
            proxy_ip = self.get(proxy_type)
            if not proxy_ip:
                print('proxy pool is still warming up', proxy_ip)
                time.sleep(3)
                continue
            try:
                self.log('got proxy ip:', proxy_ip, proxy_type)
                self.log(f'testing the proxy against {url}')
                get(url, proxies=f'{proxy_type}://{proxy_ip}', timeout=timeout, verify=False)
                num -= 1
                proxy_list.append(f'{proxy_type}://{proxy_ip}')
            except Exception as e:
                self.log(f'proxy {proxy_ip} failed: {e}')
                self.delete(proxy_ip)
        self.log('finished collecting proxies:', proxy_list)
        return proxy_list
Example #15
        '.net.bz', '.cc', '.com.co', '.net.co', '.nom.co', '.de', '.es',
        '.com.es', '.nom.es', '.org.es', '.eu', '.fm', '.fr', '.gs', '.in',
        '.co.in', '.firm.in', '.gen.in', '.ind.in', '.net.in', '.org.in',
        '.it', '.jobs', '.jp', '.ms', '.com.mx', '.nl', '.nu', '.co.nz',
        '.net.nz', '.org.nz', '.se', '.tc', '.tk', '.tw', '.com.tw', '.idv.tw',
        '.org.tw', '.hk', '.co.uk', '.me.uk', '.org.uk', '.vg'
    ]
    # Sort longest-first so two-part suffixes (e.g. '.com.tw') are matched
    # before their one-part counterparts (e.g. '.tw').
    shuffix_lst = sorted(shuffix_lst, key=len, reverse=True)
    # Timestamp used to name the output folder.
    time_stamp = str(time.time())[:10]
    os.mkdir('domains_{}'.format(time_stamp))
    for line in d:
        name, url = line.split()
        # Parse the hostname out of the URL.
        url = urllib3.get_host(url)[1]
        # Reduce the hostname to the registered (base) domain.
        for shuffix in shuffix_lst:
            if shuffix in url:
                url = url.split(shuffix)[0].split('.')[-1] + shuffix
                break
        print(url)

        try:
            res = pythonwhois.get_whois(url)
        except:
            res = '{}: whois lookup failed'.format(url)
        with open('domains_{}/{}-{}.txt'.format(time_stamp, name, url),
                  'w') as f:
            json.dump(res, f, ensure_ascii=False, cls=DateEncoder, indent=4)
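A small sketch of why the longest-first ordering matters when matching suffixes as substrings:

# With '.tw' tried before '.com.tw', 'shop.example.com.tw' would be cut at
# '.tw' and yield the wrong base domain; longest-first avoids that.
suffixes = sorted(['.tw', '.com.tw', '.org.tw'], key=len, reverse=True)
hostname = 'shop.example.com.tw'
for s in suffixes:
    if s in hostname:
        print(hostname.split(s)[0].split('.')[-1] + s)  # example.com.tw
        break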
Example #16
    def __init__(self, html_text, url, settings=None):
        self._url = url
        self.config = Config(get_host(url)[1], settings)
        self.soup = BeautifulSoup(html_text, features='html.parser')
Example #17
import urllib3

# get_host() returns a (scheme, host, port) tuple, not a response body.
host = urllib3.get_host("https://baidu.com")
print(host)  # -> ('https', 'baidu.com', None)
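For comparison, a minimal sketch showing that an explicit port is preserved in the tuple:

print(urllib3.get_host("https://baidu.com:8443/index.html"))
# -> ('https', 'baidu.com', 8443)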
Example #18
        time.sleep(0.1)

    @classmethod
    def tearDownClass(cls):
        import urllib # Yup, that's right.
        try:
            urllib.urlopen(cls.scheme + '://' + cls.host + ':' + str(cls.port) + '/shutdown')
        except IOError:
            pass
        cls.server_thread.join()


class HTTPSDummyServerTestCase(HTTPDummyServerTestCase):
    scheme = 'https'
    host = 'localhost'
    port = 18082


if __name__ == '__main__':
    log.setLevel(logging.DEBUG)
    log.addHandler(logging.StreamHandler(sys.stderr))

    from urllib3 import get_host

    url = "http://localhost:8081"
    if len(sys.argv) > 1:
        url = sys.argv[1]

    scheme, host, port = get_host(url)
    make_server(scheme=scheme, host=host, port=port)
Example #19
        # Skip when nothing changed compared with the cached snapshot.
        print("nothing changed compared with cached page")
        return
    try:
        JsonData = json.loads(ResData)
    except Exception, e:
        print e
        pass
    print "[%s]" % curIP
    url = []
    for key, value in JsonData.items():
        for key1, value1 in value.items():
            for lv in value1:
                try:
                    print lv['Url']
                    host = urllib3.get_host(lv['Url'])[1]
                    # newip = getIp(host)
                    # if newip == curIP:
                    d_query_set = data_query.filter(uri=host)
                    if d_query_set.exists():
                        print("update 2 ip-host : %s->%s" % (curIP, host))
                        # Same IP: when the domain changes, update the stored record to match.
                        d_query = d_query_set[0]
                        d_query.uri = host
                        d_query.title = lv['Title']
                        d_query.descript = lv['Description']
                        d_query.state = '6'  # domain updated
                        d_query.save()
                    else:
                        print "save to database %s" % host
                        Data.objects.create(ip=curIP,
Example #20
#coding:utf-8
from tkinter import *
from tkinter.scrolledtext import ScrolledText
import threading
import time
import queue
import urllib3
import requests

li = list()
URL = "http://localhost:8099/MeetingMag/web/MeetMag/menu.htm"
host = urllib3.get_host(URL)
li.append("host:" + str(host))
# Fetch the page once and reuse the response rather than re-requesting
# the same URL for every attribute.
resp = requests.get(URL)
li.append("status:" + str(resp.status_code))
li.append("charset:" + str(resp.encoding))
li.append("cookies:" + str(resp.cookies))
li.append("headers:" + str(resp.headers))
li.append("url:" + str(resp.url))
li.append("codes:" + str(requests.codes))
li.append("history:" + str(resp.history))
Example #21
import urllib3
from bs4 import BeautifulSoup

quote_page = 'https://www.bloomberg.com/quote/SPX:IND'

# urllib3.get_host() only splits the URL into (scheme, host, port); fetching
# the page itself needs an actual request.
http = urllib3.PoolManager()
page = http.request('GET', quote_page).data

soup = BeautifulSoup(page, 'html.parser')
name_box = soup.find('h1', attrs={'class': 'companyName__99a4824b'})
name = name_box.text.strip()  # strip() removes leading and trailing whitespace
print(name)
Example #23
import urllib3

url = 'http://www.acme.com/products/3322'
# get_host() returns a (scheme, host, port) tuple, not an HTTP response.
host = urllib3.get_host(url)
print("The host tuple is:", host)
Example #24

def make_server(**kw):
    try:
        return eventlet_server(**kw)
    except ImportError:
        return simple_server(**kw)


def make_server_thread(target, **kw):
    import threading
    t = threading.Thread(target=target, kwargs=kw)
    t.start()
    return t


if __name__ == '__main__':
    log.setLevel(logging.DEBUG)
    log.addHandler(logging.StreamHandler(sys.stderr))

    from urllib3 import get_host

    url = "http://localhost:8081"
    if len(sys.argv) > 1:
        url = sys.argv[1]

    print "Starting server at: %s" % url

    scheme, host, port = get_host(url)
    make_server(scheme=scheme, host=host, port=port)
Example #25
#! /usr/bin/env python
# -*- coding: utf-8 -*-

#    File Name:       webpage
#    Description :
#    Author :          SanYapeng
#    date:            2019/2/2
#    Change Activity:  2019/2/2:

import urllib3

url = "http://www.baidu.com"
# get_host() parses the URL into (scheme, host, port); it does not fetch the page.
webpage = urllib3.get_host(url)
print(webpage)  # -> ('http', 'www.baidu.com', None)
Example #26
def download_mplce_url(urldest_tuple):
    import requests, re, urllib, urllib2, urllib3, OpenSSL, subprocess
    from os import path

    import urllib3.contrib.pyopenssl
    urllib3.contrib.pyopenssl.inject_into_urllib3()
    urllib3.disable_warnings()

    countimage = 0
    countstyle = 0
    image_url, destpath = urldest_tuple
    destdir = path.dirname(destpath)
    colorstyle = destpath.split('/')[-1][:9]
    alt_number = destpath.split('_')[-1][0]
    try:
        image_url = 'https://www.drop'.join(image_url.split('https://wwwop'))
        image_url = urllib.unquote_plus(image_url)
    except:
        pass
    ########################################################
    ##################  REGEX Filters Defined ##############
    ## Image URL Cleanup and Replace Extraneous/Bad Chars ##
    ########################################################
    ####### Dropbox Fix for View vs DL value ###############
    regex_dbx = re.compile(r'^https://www.dropbox.com/.+?\.[jpngJPNG]{3}$')
    regex_dbxprev = re.compile(r'^https://www.dropbox.com/.+?preview.*\.[jpngJPNG]{3}$')
    # Normalize any dl=<n> query value to dl=1 (direct download).
    image_url = re.sub(r'dl=[02-9]', 'dl=1', image_url)
    regex_dl = re.compile(r'^.+\?dl=1.*?$')
    if regex_dbx.findall(image_url):
        if regex_dbxprev.findall(image_url):
            print 'REGEX DBXPRE'
            # import http_tools.auth.Dropbox.dropboxapi_service as dropboxapi_service
            final_path = image_url  # dropboxapi_service.download_auth_file(image_url=image_url, destpath=destpath)

            if final_path:
                print final_path, 'Final DBX Path'
                image_url = final_path
            else:
                pass
        else:
            # str.replace returns a new string; assign the result back.
            image_url = image_url.replace('.JPG', '.jpg')
            image_url = image_url.replace('.PNG', '.png')
            print 'REGEX DBX dl=1'
        if not regex_dl.findall(image_url):
            image_url = image_url + '&dl=1'

    regex_validurl = re.compile(r'^http[s]?://.+?$', re.U)
    regex_ftpurl = re.compile(r'^ftp[s]?://.+?$', re.U)
    regex_drive2 = re.compile(r'^(https://d(.+?)\.google\.com/).*\?id\=(?P<fileId>.+?)\&?.*?$', re.U)
    regex_drive3 = re.compile(r'^(https://d(.+?)\.google\.com/file/d/)(?P<fileId>.+?)/(edit|view)\?usp\=.*?$', re.U)

    # regex_dropbox = re.compile(r'^https?://www.dropbox.com/.+?\.[jpngJPNG]{3}$')
    ######################
    #### BOX API AUTH #### removed
    # regex_boxapi  = re.compile(r'^(https?)?(?:\://)?(?P<VENDER_ROOT>.*)?(.*?)\.box\.com/(s/)?(?P<SHARED_LINK_ID>.+)?/?(\.?[jpngJPNG]{3,4})?(.*?)?\??(.*?)?$', re.U)
    ########################
    #### DRIVE API AUTH ####
    # if regex_drive2.findall(image_url):
    ########################################################
    ########################################################
    ########################################################
    import urllib3
    print ' 404 - 1 - Trying Urllib3 ', image_url
    hostname = urllib3.get_host(image_url)[1]
    ####################################################################################################
    ####################################################################################################
    ####
    #################
    ########### TEMPORARY RESTRICT URL from Merchantry Placeholder ###########
    old_blue_sweater_placeholder_url = 'https://pim-image-4b2a111d-0df4-447e-bc31-d288a9-s3bucket-1in4kl2m21ire.s3.amazonaws.com/7274a8bcbb3fa06076da6e1c2950e617b24baf4f5d467def6c88f276a0e15f12.jpg'
    placeholder_url = 'https://pim-image-4b2a111d-0df4-447e-bc31-d288a9-s3bucket-1in4kl2m21ire.s3.amazonaws.com/5d167c72a8100a911f06940395589769857cbd395e452bf21416503d1c43861c.jpg'
    if image_url == placeholder_url or image_url == old_blue_sweater_placeholder_url:
        print 'RESTRICTING DOWNLOAD of Merchantry Placeholder image.--> {}'.format(image_url)
        return
    ######################### END ###### Adjust below elif to initial if once removed above #############
    #################
    ####
    ####################################################################################################
    ####################################################################################################
    elif regex_drive3.findall(image_url):
        image_url = drive_match_fileid(image_url)
        print image_url, ' DRIVE3 --ID--> '
        import http_tools.auth.Google.google_drive_auth_downloader as google_drive_auth_downloader
        try:
            final_path = google_drive_auth_downloader.download_google_drive_file(image_url=image_url, destpath=destpath)
            if final_path:
                return final_path
            else:
                print 'Final DRIVE Failure ', destpath, '\n', image_url
        except IndexError:
            print 'Final DRIVE Exception ', destpath, '\n', image_url
        ## Tmp - Cannot use auth from johnb or http_tools above
        except:
            pass

    elif regex_drive2.findall(image_url):
        print image_url, ' DRIVE'
        # import jbmodules
        # from jbmodules
        import http_tools.auth.Google.google_drive_auth_downloader as google_drive_auth_downloader
        try:
            final_path = google_drive_auth_downloader.download_google_drive_file(image_url=image_url, destpath=destpath)
            if final_path:
                return final_path
            else:
                print 'Final DRIVE Failure ', destpath, '\n', image_url
        except IndexError:
            print 'Final DRIVE Exception ', destpath, '\n', image_url
            # return
            ## Tmp - Cannot use auth from johnb or http_tools above
        except:
            pass

    elif regex_ftpurl.findall(image_url):
        print image_url, ' FTP--FTP\n...probably Jaipur...'
        from http_tools.ftp_functions import pycurl_ftp_download
        import pycurl
        # from jbmodules
        try:
            res = pycurl_ftp_download(imageurl=image_url, destpath=destpath)
            print 'FTEPPEE --> {}\n{}\t{}\n'.format(res, image_url, destpath)
            return destpath
        except pycurl.error, error:
            print 'Pycurl error in FTP Download --> ', error
        ## Tmp - Cannot use auth from johnb or http_tools above
        except:
Example #27
import os
import argparse
import time
import urllib3
from logger import Logger
from pathlib import Path
from http_helper import HttpHelper
from html_parser import DictHtmlParser

BASE_URL = 'https://www.spanishdict.com'
HOST = urllib3.get_host(BASE_URL)[1]
SEARCH_URL = '{0}/translate/'.format(BASE_URL)
CONJUGATE_URL = '{0}/conjugate'.format(BASE_URL)
PROJECT_FILEPATH = os.getcwd()
DOWNLOAD_FILEPATH = '{0}/downloads'.format(PROJECT_FILEPATH)
JSON_DOWNLOAD_FILEPATH = '{0}/json'.format(DOWNLOAD_FILEPATH)
HTML_TRANSLATION_DOWNLOAD_FILEPATH = '{0}/translate'.format(DOWNLOAD_FILEPATH)
HTML_CONJUGATION_DOWNLOAD_FILEPATH = '{0}/conjugate'.format(DOWNLOAD_FILEPATH)
LOG_FILEPATH = "{0}/logs".format(PROJECT_FILEPATH)
JSON_HISTORY_LOG = '{0}/.json_history'.format(LOG_FILEPATH)
HTML_HISTORY_LOG = '{0}/.history'.format(LOG_FILEPATH)
ERROR_LOG = '{0}/.error_log'.format(LOG_FILEPATH)


def create_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', '--scrape', help='Scrape words')
    parser.add_argument(
        '-p', '--parse', help='Parse dictionary entries from downloaded html pages')
    parser.add_argument('-d', '--define', help='Define word')
    parser.add_argument('-i', '--input', help='Input file containing list of words to define/conjugate.')