Example #1
0
def scrape(departure_city, departure_airport_code, arrival_city, arrival_state,
           arrival_airport_code, month, day, year):
    """Scrape flight results for one route/date and dump them as JSON.

    Delegates the actual scraping to ``parse`` and writes the result to
    ``data/<month>-<day>-<year>-<departure_city>-<arrival_city>-flight-results.json``.
    Spaces in ``arrival_city`` are collapsed to underscores so multi-word
    city names produce a single path-safe file name.
    """
    scraped_data = parse(departure_city, departure_airport_code, arrival_city,
                         arrival_state, arrival_airport_code, month, day, year)

    # Make the arrival city path-safe ("New York" -> "New_York").
    arrival_city = '_'.join(arrival_city.split())

    out_path = 'data/%s-%s-%s-%s-%s-flight-results.json' % (
        month, day, year, departure_city, arrival_city)
    with open(out_path, 'w') as fp:
        json.dump(scraped_data, fp, indent=4)
import expedia

# Demo run: flights from DEL to BOM on the given date via the expedia scraper.
result = expedia.parse('DEL', 'BOM', '02/12/2018')

print(result)
    def scrape(self):
        """Scrape hotel listings for every date in the selected range.

        Reads the search city and check-in/check-out dates (format
        ``%d/%m/%Y``) from the UI widgets ``self.location``,
        ``self.check_in`` and ``self.check_out``, then crawls each site in
        ``otas`` for each date, rotating through freshly fetched HTTP
        proxies, and writes one CSV per (site, date) under
        ``datasetHotelNames/``.
        """
        searchKey = self.location.get()  # City to search for
        startDate = self.check_in.get()  # Format %d/%m/%Y
        endDate = self.check_out.get()  # Format %d/%m/%Y
        startDate = startDate.split('/')
        endDate = endDate.split('/')

        ua = UserAgent()  # Random user agent per request to avoid blocking
        otas = ['https://www.goibibo.com/hotels/']

        def daterange(date1, date2):
            # Yield every date from date1 through date2, inclusive.
            for n in range(int((date2 - date1).days) + 1):
                yield date1 + timedelta(n)

        startDate = date(int(startDate[2]), int(startDate[1]),
                         int(startDate[0]))
        endDate = date(int(endDate[2]), int(endDate[1]), int(endDate[0]))
        dateRange = [dt.strftime("%d/%m/%Y")
                     for dt in daterange(startDate, endDate)]

        def proxyGenerator():
            # Fetch a fresh list of HTTPS-capable proxies ("ip:port" strings).
            # SECURITY NOTE(review): credentials are hard-coded in this URL;
            # they belong in configuration/environment, not source control.
            proxies_req = Request(
                'http://list.didsoft.com/[email protected]&pass=r6gt4j&pid=http3000&showcountry=no&https=yes'
            )
            proxies_req.add_header('User-Agent', ua.random)
            proxies_doc = urlopen(proxies_req).read().decode('utf8')
            return proxies_doc.split('\n')

        proxies = proxyGenerator()

        def random_proxy():
            # Random index into the current proxy pool. Raises ValueError
            # (from randint) when the pool is empty.
            return random.randint(0, len(proxies) - 1)

        df = []
        for oneDate in dateRange:
            inputs = [searchKey, oneDate, '01/05/2020']
            for url in otas:
                proxy_index = random_proxy()
                proxy = proxies[proxy_index]
                # Retry with new proxies until one site parse succeeds.
                while True:
                    driver = 1
                    # Verify the proxy works by fetching our apparent IP.
                    req = Request('http://icanhazip.com')
                    req.set_proxy(proxy, 'http')
                    try:
                        my_ip = urlopen(req).read().decode('utf8')
                        print('#' + str(1) + ': ' + my_ip)
                        # Dispatch on the site's position in `otas`.
                        # NOTE(review): oneDate contains '/' characters, so
                        # these CSV paths have extra separators — confirm the
                        # intended file layout.
                        temp = otas.index(url)
                        if temp == 1:
                            df = expedia.parse(url, proxy, driver, inputs)
                            if len(df) > 1:
                                df.to_csv('datasetHotelNames/expedia' +
                                          oneDate + '.csv')
                                df = []
                                break
                        if temp == 2:
                            df = Hotelsdotcom.parse(url, proxy, driver, inputs)
                            if len(df) > 1:
                                df.to_csv('datasetHotelNames/Hotelsdotcom' +
                                          oneDate + '.csv')
                                df = []
                                break
                        if temp == 0:
                            df = Goibibo.parse(url, proxy, driver, inputs)
                            if len(df) > 1:
                                df.to_csv('datasetHotelNames/goibibo' +
                                          oneDate + '.csv')
                                df = []
                                break
                    except Exception:
                        # Proxy failed: drop it and pick another. If the pool
                        # ran dry, refresh it and re-pick the index — the old
                        # code left a stale proxy_index after regenerating,
                        # which could index the new pool out of range.
                        del proxies[proxy_index]
                        try:
                            proxy_index = random_proxy()
                        except ValueError:
                            proxies = proxyGenerator()
                            proxy_index = random_proxy()
                        proxy = proxies[proxy_index]
                sleep(random.choice([1, 2, 3, 4]))
'''
Crawling through the wepages in otas
'''
# NOTE(review): module-level variant of the crawl loop in scrape() above.
# Here each proxy entry is indexed as a dict (proxy['ip'], proxy['port']),
# unlike the plain "ip:port" string form used elsewhere in this file —
# confirm which shape the proxy list actually holds before running this.
# `otas`, `proxies`, `random_proxy`, `inputs` and the parser modules
# (expedia, Goibibo, Hotelsdotcom) must be defined earlier in the file.
for url in otas:
    proxy_index = random_proxy()  # random index into the proxy pool
    proxy = proxies[proxy_index]
    driver = random.choice([1, 2])  # randomly pick one of two driver setups
    # Retry with new proxies until the parse for this site succeeds.
    while True: 
        # Verify the proxy works by fetching our apparent IP through it.
        req = Request('http://icanhazip.com')
        req.set_proxy(proxy['ip'] + ':' + proxy['port'], 'http')
        try:
            my_ip = urlopen(req).read().decode('utf8')
            print('#' + str(1) + ': ' + my_ip)
            # Dispatch to the site-specific parser by position in `otas`.
            temp = otas.index(url)
            if temp == 0:
                df = expedia.parse(url, proxy, driver, inputs)
                df.to_csv('expedia.csv')
                break
            if temp == 1:
                df = Goibibo.parse(url, proxy, driver, inputs)
                df.to_csv('Goibibo.csv')
                break
            if temp == 2:
                df = Hotelsdotcom.parse(url, proxy, driver, inputs)
                df.to_csv('Hotelsdotcom.csv')
                break
        except: # If error, delete this proxy and find another one
            #del proxies[proxy_index]
            #print('Proxy ' + proxy['ip'] + ':' + proxy['port'] + ' deleted.')
            # NOTE(review): despite the comment above, the failed proxy is
            # NOT removed from the pool here — we only retry with another
            # randomly chosen one, so a bad proxy can be picked again.
            proxy_index = random_proxy()
            proxy = proxies[proxy_index]
Example #5
0
# Load the task file listing (departure, arrival) airport pairs, scrape each
# route, and collect the results in `dataset`.
df = pd.read_csv(task_file, header=0)

# Normalise the codes to lower case before pairing them up.
# (Redundant list() wrappers removed — the Series iterate directly.)
depts = [c.lower() for c in df['dept']]
arrvs = [c.lower() for c in df['arrv']]
dept_arrv_tuple_list = list(zip(depts, arrvs))

dataset = []

counter = 0
total = len(dept_arrv_tuple_list)
for dept, arrv in dept_arrv_tuple_list:
    # Progress is reported before the scrape, so the first line reads 0%.
    print('Complete:', "{0:.0f}%".format(counter / total * 100))
    counter += 1
    print("Checking from {} to {}".format(dept, arrv))

    # NOTE(review): `date` must be bound earlier in the file — TODO confirm.
    data_scraped = parse(dept, arrv, date, display_url=True)
    print(len(data_scraped), 'flights found in this route.')
    dataset.append({
        'dept': dept,
        'arrv': arrv,
        'date': date,
        'flights': data_scraped
    })
    #a list
m = []
for data in dataset:
    dept = data['dept']
    arrv = data['arrv']
    date = data['date']
    flights = data['flights']
    for f in flights: