Example #1
def getLink(url):
    proxyList = proxy.getProxy()

    for proxyItem in proxyList:
        print(f'use proxy {proxyItem["ip"]}:{proxyItem["port"]}')
        response = connect('本書介紹', url, proxyItem)  # '本書介紹' means "about this book"
        if response is None:
            print('connection is invalid')
            continue
        print('connection is valid')
        d = pyquery.PyQuery(response.text)
        posts = d('div.type02_m057:contains("內容簡介")')  # "內容簡介" means "synopsis"

        print(posts)

        break
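
Both this example and Example #2 call a connect() helper that the listing does not include. A minimal sketch of what it might look like, assuming it simply wraps requests.get with the chosen proxy and returns None on failure (the first argument appears to be used only as a label):

import requests

def connect(tag, url, proxyItem):
    # Hypothetical stand-in for the missing helper: fetch `url` through the given
    # proxy and return the response, or None when the request fails.
    proxy_address = f'http://{proxyItem["ip"]}:{proxyItem["port"]}'
    try:
        response = requests.get(url,
                                proxies={'http': proxy_address, 'https': proxy_address},
                                timeout=10)
        response.raise_for_status()
        return response
    except requests.RequestException:
        print(f'{tag}: request failed')
        return None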
Example #2
def getList(url):
    proxyList = proxy.getProxy()
    rows = []

    for proxyItem in proxyList:
        print(f'use proxy {proxyItem["ip"]}:{proxyItem["port"]}')
        response = connect('即時榜', url, proxyItem)  # '即時榜' means "real-time bestseller list"
        if response is None:
            print('connection is invalid')
            continue
        print('connection is valid')

        # open the output file in binary mode
        with open('book.b.html', 'wb') as f:
            # write the raw response body
            f.write(response.content)

        d = pyquery.PyQuery(response.text)
        posts = d('ul.clearfix li.item')

        for idx, post in enumerate(posts.items()):
            if idx == 0:
                order = post('strong.no').text()
                title = post('h4').text()
                author = post('ul.msg').text()[3:]
                price = post('li.price_a').text()[4:]
                link = post('h4 a').attr('href')
                print('排名 : ', order)
                print('書名 : ', title)
                # print('作者 : ', author)
                # print(price)
                print('連結 : ', link)
                print('----------------------')
                # getLink(link)
                rows.append({'order': order, 'title': title, 'link': link})
        text = json.dumps(rows, sort_keys=True, indent=4)
        with codecs.open('books.json', 'w', 'utf-8') as f:
            f.write(text)
            break
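
Judging by how proxyItem is indexed in Examples #1 and #2, proxy.getProxy() here presumably returns a list of dicts shaped roughly like the following (the addresses are placeholders):

proxyList = [
    {'ip': '203.0.113.10', 'port': '8080'},   # placeholder values
    {'ip': '203.0.113.11', 'port': '3128'},
]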
Example #3
def setup():
    logger.info("Entered Setup")
    headers = {
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
        "Accept-Encoding": "gzip, deflate",
        "Accept":
        "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "DNT": "1",
        "Connection": "close",
        "Upgrade-Insecure-Requests": "1",
    }
    logger.debug(f"Created headers: {headers}")
    PROXY = getProxy()
    logger.debug(f"Created proxy: {PROXY}")
    proxies = {
        "httpProxy": PROXY,
        "ftpProxy": PROXY,
        "sslProxy": PROXY,
        "proxyType": "MANUAL",
    }
    logger.info(f"Setup is done: {[headers, proxies]}")
    return [headers, proxies]
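
A short usage sketch (not part of the source) of how setup()'s return value might be consumed: the proxies dict uses Selenium-style capability keys, so for a plain requests call it is first rebuilt into the mapping requests expects; the URL and timeout below are placeholders.

import requests

headers, selenium_proxy = setup()

# Rebuild a requests-style proxy mapping from the Selenium-style capability keys.
requests_proxies = {
    "http": selenium_proxy["httpProxy"],
    "https": selenium_proxy["sslProxy"],
}

response = requests.get("https://example.com",   # placeholder URL
                        headers=headers,
                        proxies=requests_proxies,
                        timeout=10)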
Example #4
from bs4 import BeautifulSoup as bs
from urllib import request as req
import proxy

PROXY_URL = proxy.getProxy()

main_array = []          # ALL ITEM ARRAYS ARE IN HERE AS A TUPLE
cathegory_array = []     # ITEM CATEGORIES ARE IN HERE
stats_array = []         # ITEM STATS (SEEDERS AND LEECHERS) WILL BE HERE
item_link_array = []     # ITEM LINKS ARE HERE
item_name_array = []     # ITEM NAMES ARE HERE
item_date_array = []     # ITEM UPLOADED DATES ARE HERE
item_size_array = []     # ITEM SIZES ARE HERE

def search_item(search_request):
 
    # str.replace returns a new string, so the result has to be assigned back
    search_term = search_request.replace(' ', '_')
    site_link = ''

    while True:

        for i in PROXY_URL:
            link = '{}/s/?q={}&category=0&page=0&orderby=99'.format(i, search_term)
            site_link = i

            try:
                source = req.urlopen(link).read()
                soup = bs(source, 'html.parser')
                main_item = soup.find_all('tr')
                result_count = soup.find('h2')
                # assumed completion (the source snippet is cut off here)
                return main_item, result_count
            except Exception:
                # assumed: this mirror failed, so try the next proxy URL
                continue
Example #5
def SaveScrape(baseurl, PageSaveFolder, ScrapeFile, Scrapewait, useProxy, **kwargs):
    _time=time.time()
    XMLsaveFile="XML_scrape_" + (datetime.datetime.now()).strftime('%Y-%m-%d')
    xmlFile=PageSaveFolder + XMLsaveFile 
    with open(xmlFile +'.xml', "w") as saveXML:
        print("blank xml created")

    # time.sleep(random.randint(1,10))
    # ua = UserAgent()
    #headers = {'User-Agent':str(ua.random)}
    headers={ 'User-Agent': proxy.getHeader(random.randint(0,249))  } 
    if useProxy != '':
        print("using previous proxy:", useProxy)
        r_proxy=useProxy
    elif useProxy == '':
        r_proxy,prox_status=proxy.getProxy(ps_user="******", ps_pass="******", ps_host="172.22.114.65",ps_port="5432", ps_db="scrape_db", update=True)
        if prox_status==False: 
            print('error getting proxy, quitting')
            sys.exit() 

    _pass = False
    _loopcount = 0
    while not _pass:
        try:
            response = requests.get(baseurl + ScrapeFile, headers=headers, timeout=Scrapewait,
                                    proxies={'http': 'http://' + r_proxy, 'https': 'https://' + r_proxy})
            _pass = True
        except requests.exceptions.RequestException:
            _waittime = random.randint(1, 9)
            print("count:", _loopcount, "- timeout, wait secs before retry:", _waittime)
            time.sleep(_waittime)
            _loopcount += 1
        if _loopcount >= 20:
            print("getting new proxy after 20 tries, link:", baseurl + ScrapeFile)
            r_proxy, prox_status = proxy.getProxy(ps_user="******", ps_pass="******", ps_host="172.22.114.65", ps_port="5432", ps_db="scrape_db", update=True)
            _loopcount = 0
    gz_save_name = ScrapeFile[:-7] + '_' + (datetime.datetime.now()).strftime('%Y-%m-%d') + '.gz'

    # save the raw response to a gz file
    with open(PageSaveFolder + gz_save_name, 'wb') as gz_out:
        gz_out.write(response.content)
    time.sleep(5)
    # feast upon that rich gooey xml
    _xml_save = ScrapeFile[:-7] + '_' + (datetime.datetime.now()).strftime('%Y-%m-%d') + '.xml'
    _pass = False
    _loopcount = 0
    while not _pass:
        try:
            with gzip.open(PageSaveFolder + gz_save_name, 'rb') as f_in:
                time.sleep(5)
                with open(PageSaveFolder + _xml_save, 'wb') as f_out:
                    time.sleep(5)
                    shutil.copyfileobj(f_in, f_out)
            tree = etree.parse(PageSaveFolder + _xml_save)
            with open(PageSaveFolder + _xml_save, "wb") as saveXML:
                saveXML.write(etree.tostring(tree, pretty_print=True))
            _pass = True
        except:
            _waittime = random.randint(1, 9)
            print("count:", _loopcount, "- error extracting file, wait secs before retry:", _waittime)
            time.sleep(_waittime)
            _loopcount += 1
        if _loopcount >= 20:
            print("20 tries, aborting")
            sys.exit()

    body=tree.xpath('//ns:url',namespaces={'ns':"http://www.sitemaps.org/schemas/sitemap/0.9"})
    _count=1
    #now we parse and read, using lists instead of df since its A BUNCH faster
    list_lastmod=[]
    list_url=[]
    list_state=[]
    list_proptype=[]
    list_suburb=[]
    list_propid=[]
    for element in body:
        # if _count % 10000 == 0: 
        #     print("interval:", str(_count-1)," -total runtime:", time.time()-_time)
        list_lastmod.append(element[1].text)
        list_url.append(element[0].text)
        _splitval=''
        if '-nsw-' in element[0].text: _splitval='-nsw-'
        # elif '+nsw+' in element[0].text: _splitval='+nsw+' 
        elif '-qld-' in element[0].text: _splitval='-qld-'
        # elif '+qld+' in element[0].text:  _splitval='+qld+'  
        elif '-tas-' in element[0].text: _splitval='-tas-'
        # elif '+tas+' in element[0].text: _splitval='+tas+'
        elif '-act-' in element[0].text: _splitval='-act-'
        # elif '+act+' in element[0].text: _splitval='+act+'
        elif '-sa-' in element[0].text: _splitval='-sa-'
        # elif '+sa+' in element[0].text: _splitval='+sa+'
        elif '-nt-' in element[0].text: _splitval='-nt-'
        # elif '+nt+' in element[0].text: _splitval='+nt+'
        elif '-wa-' in element[0].text: _splitval='-wa-'
        # elif '+wa+' in element[0].text: _splitval='+wa+'
        elif '-vic-' in element[0].text: _splitval='-vic-'
        # elif '+vic+' in element[0].text: _splitval='+vic+'

        if _splitval !='':
            list_state.append(_splitval.replace('-','').replace('+',''))
            list_proptype.append( (element[0].text).split(_splitval)[0].replace('https://www.realestate.com.au/property-','').replace('+', ' ') )
            list_suburb.append( (element[0].text).split(_splitval)[1].replace('https://www.realestate.com.au/property-','').replace((element[0].text).split('-')[-1],'').replace('-',' ').replace('+', ' ').strip() )
        else: 
            list_state.append('')
            list_proptype.append('')
            list_suburb.append('')
        list_propid.append( (element[0].text).split('-')[-1] )
        # _count+=1 

    XML_gz_Dataset = pd.DataFrame(
        np.column_stack([list_lastmod, list_url, list_proptype, list_state, list_suburb, list_propid]), 
        columns=['lastmod', 'url', 'proptype', 'state', 'suburb', 'prop_id'])

    XML_gz_Dataset.to_csv(PageSaveFolder + '/parsed_csv/' + _xml_save[:-3] + '_results' + '.csv')
    print("file saved to: " + PageSaveFolder + '/parsed_csv/' + _xml_save[:-3] + '_results' + '.csv')
    XML_gz_Dataset['lastmod']=pd.to_datetime(XML_gz_Dataset['lastmod'])
    print("total xml time:", time.time() - _time)

    XML_gz_Dataset['parent_gz']=XMLsaveFile
    XML_gz_Dataset['scrape_dt']=(datetime.datetime.now()).strftime('%Y-%m-%d %H:%M:%S')
    XML_gz_Dataset['lastmod']=pd.to_datetime(XML_gz_Dataset['lastmod'])
    XML_gz_Dataset['external_ip']=r_proxy

    #now we add to db table 
    #parent file link
    connection = psycopg2.connect(user="******",password="******",host="172.22.114.65",port="5432",database="scrape_db")
    cursor = connection.cursor()
    # with connection.cursor() as cursor:
    cursor.execute("""
        select max(s_fileid)
        FROM sc_land.sc_source_file
        WHERE s_filename = %(s_filename)s
        and date(lastmod) = %(lastmod)s;
        """,
            {
                's_filename': XML_gz_Dataset['parent_gz'].drop_duplicates()[0]
                ,'lastmod' : XML_gz_Dataset['lastmod'].dt.date.drop_duplicates()[0]
            }
        )
    result = cursor.fetchone()
    print("parent file link is:",ScrapeFile,"is:", result[0])
    XML_gz_Dataset['s_fileid']=result[0]

    #remove redundant link
    XML_gz_Dataset=XML_gz_Dataset.drop(columns=['parent_gz'])
    # #time to insert  
    engine = create_engine('postgresql://*****:*****@172.22.114.65:5432/scrape_db')
    XML_gz_Dataset.to_sql(
        name='sc_property_links'
        ,schema='sc_land'
        ,con=engine
        ,method=db_import.psql_insert_copy
        ,if_exists='append'
        ,index=False
        )
    os.remove(PageSaveFolder + _xml_save)
    print("total runtime", time.time() - _time)
    print('----------------------------------------------------------------')
    return r_proxy
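
The to_sql call above passes method=db_import.psql_insert_copy, which the listing does not define. It presumably follows the COPY-based "insertion method" recipe from the pandas documentation; a sketch of that recipe, under that assumption:

import csv
from io import StringIO

def psql_insert_copy(table, conn, keys, data_iter):
    # Fast bulk insert for pandas.DataFrame.to_sql: stream the rows through
    # PostgreSQL COPY instead of issuing per-row INSERTs.
    dbapi_conn = conn.connection
    with dbapi_conn.cursor() as cur:
        s_buf = StringIO()
        csv.writer(s_buf).writerows(data_iter)
        s_buf.seek(0)
        columns = ', '.join('"{}"'.format(k) for k in keys)
        table_name = '{}.{}'.format(table.schema, table.name) if table.schema else table.name
        cur.copy_expert(sql='COPY {} ({}) FROM STDIN WITH CSV'.format(table_name, columns),
                        file=s_buf)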
def spider_with_class(cls, position, ip=0, date="ofo_temp"):
    mypos = []

    for item in position:
        if item['class'] == cls:
            mypos.append(item)
            mypos.append({"lng": item['lng'] + 0.005, "lat": item['lat']})
            mypos.append({
                "lng": item['lng'] + 0.005,
                "lat": item['lat'] + 0.005
            })
            mypos.append({"lng": item['lng'], "lat": item['lat'] + 0.005})
    # print("线程%d  共%d个原始点"%(cls,len(mypos)))
    #
    global history
    # record = []

    ip = proxy.getProxy()

    i = len(mypos)
    for pos in mypos:
        i = i - 1
        count = 0
        local = []
        localString = []
        info = 0
        try:
            info = spider.spider_single(pos['lng'], pos['lat'], ip)
        except:
            if not proxy.testProxy(ip):
                proxy.deleteProxy(ip)
            ip = proxy.getProxy()
            try:
                info = spider.spider_single(pos['lng'], pos['lat'], ip)
            except:
                proxy.updateProxy()
                ip = proxy.getProxy()
                try:
                    info = spider.spider_single(pos['lng'], pos['lat'], ip)
                except:
                    # "thread %d exiting due to network problems, time: %s"
                    print("线程%d因网络问题退出 时间:%s" %
                          (cls, time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
                    raise Exception

        lock.acquire()
        for bike in info['body']['bicycles']:
            posStr = str(bike['longitude']) + str(bike['latitude'])
            if not (posStr in history):
                result.append(bike)
                local.append(bike)
                count += 1
                if not (posStr in localString):
                    localString.append(posStr)
        history.extend(localString)
        global total_point
        total_point -= 1
        if total_point % 4000 == 0:
            print("%s 本次剩余%d个" % (time.strftime(
                "%Y-%m-%d %H:%M:%S", time.localtime()), total_point))
        lock.release()
        if not save_in_db(local, date):
            while not save_in_db(local, date):
                print("线程%d 数据库连接失败" % (cls))
            # save_in_db(local)
        # print("线程%d 剩余%d次 获取%d个 保存%d个"%(cls,i,info['body']['total'],count))
    # lock.acquire()
    # result.extend(record)
    # lock.release()
    return
# f = open("thread_test100.txt","w")
# f.write(json.dumps(result))
# f.close()

# while 1:
#     if time.strftime("%H:%M",time.localtime()) == "00:00":
#         break

while 1:
    date = time.strftime("%Y-%m-%d", time.localtime())

    start_time = time.time()
    print("开始新的查询 当前时间:%s" %
          (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    start_time_string = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
    proxy.getProxy()

    thread = []
    history = []
    result = []
    total_point = len(position) * 4
    for i in range(100):
        thread.append(
            threading.Thread(target=spider_with_class,
                             args=(i, position, 0, date)))
    for i in range(100):
        thread[i].setDaemon(True)
        thread[i].start()
    for i in range(100):
        thread[i].join()
Example #8
            # merge-conflict resolution (assumed): build ydl_opts with or without
            # a proxy, then run the download
            if not status_manager.Manager.getStatus():
                self.__proxy = proxy.getProxy()
                ydl_opts = {
                    'outtmpl': f'{fullpath}/{filename}/{filename}',
                    'format': 'best',
                    'proxy': self.__proxy['proxy']
                }
                logging.error("DOWNLOADING:USING_PROXY")
            else:
                ydl_opts = {
                    'outtmpl': f'{fullpath}/{filename}/{filename}',
                    'format': 'best'
                }
                logging.error("DOWNLOADING:WITHOUT_PROXY")

            with suppress_stdout():
                with youtube_dl.YoutubeDL(ydl_opts) as ydl:
                    ydl.download([url])