def scrape(departure_city, departure_airport_code, arrival_city, arrival_state,
           arrival_airport_code, month, day, year):
    """Scrape flight results for one route/date and dump them to a JSON file.

    Delegates the actual scraping to parse() and writes the result to
    data/<month>-<day>-<year>-<departure>-<arrival>-flight-results.json.

    Args:
        departure_city / departure_airport_code: origin city name and IATA code.
        arrival_city / arrival_state / arrival_airport_code: destination info.
        month, day, year: travel date components used in the output filename.
    """
    scraped_data = parse(departure_city, departure_airport_code, arrival_city,
                         arrival_state, arrival_airport_code, month, day, year)
    # Replace whitespace with underscores so the output filename has no spaces.
    # Fix: the original normalised only arrival_city, so a multi-word
    # departure city produced a filename containing spaces.
    departure_city = '_'.join(departure_city.split())
    arrival_city = '_'.join(arrival_city.split())
    out_path = 'data/%s-%s-%s-%s-%s-flight-results.json' % (
        month, day, year, departure_city, arrival_city)
    with open(out_path, 'w') as fp:
        json.dump(scraped_data, fp, indent=4)
import expedia

# Quick smoke test: pull flight results for DEL -> BOM on 02/12/2018
# and echo whatever the scraper returns.
m = expedia.parse('DEL', 'BOM', '02/12/2018')
print(m)
def scrape(self):
    """Crawl the configured OTA sites for each date in the user-selected
    check-in/check-out range, rotating through fresh HTTP proxies, and dump
    any scraped hotel data to per-date CSV files.

    Reads from the GUI: self.location (city), self.check_in / self.check_out
    (dates formatted %d/%m/%Y).  Network-heavy; no return value.
    """
    searchKey = self.location.get()  # Change this to your city
    startDate = self.check_in.get()  # Format %d/%m/%Y
    endDate = self.check_out.get()  # Format %d/%m/%Y
    # Split "dd/mm/yyyy" into its components for the date() calls below.
    startDate = startDate.split('/')
    endDate = endDate.split('/')
    ua = UserAgent()  # From here we generate a random user agent
    #otas = ['https://www.expedia.co.in','https://in.hotels.com','https://www.goibibo.com/hotels/']
    otas = ['https://www.goibibo.com/hotels/']
    ''' Gives a range of dates '''
    dateRange = []

    # Yield every date from date1 to date2 inclusive.
    def daterange(date1, date2):
        for n in range(int((date2 - date1).days) + 1):
            yield date1 + timedelta(n)

    # Components arrive as [dd, mm, yyyy]; date() wants (yyyy, mm, dd).
    startDate = date(int(startDate[2]), int(startDate[1]), int(startDate[0]))
    endDate = date(int(endDate[2]), int(endDate[1]), int(endDate[0]))
    for dt in daterange(startDate, endDate):
        dateRange.append(dt.strftime("%d/%m/%Y"))

    ''' Retrieve latest proxies '''
    # Fetch a fresh newline-separated proxy list from the provider.
    # NOTE(review): credentials are hard-coded in this URL — should be
    # moved to configuration.
    def proxyGenerator():
        proxies_req = Request(
            'http://list.didsoft.com/[email protected]&pass=r6gt4j&pid=http3000&showcountry=no&https=yes'
        )
        proxies_req.add_header('User-Agent', ua.random)
        proxies_doc = urlopen(proxies_req).read().decode('utf8')
        proxies = proxies_doc.split('\n')
        return proxies

    # Commented-out (string literal) remnant of an older proxy-validation pass,
    # kept verbatim:
    ''' errorProxies = [] for proxy in proxies: req = Request('http://icanhazip.com') req.set_proxy(proxy['ip'] + ':' + proxy['port'], 'http') try: my_ip = urlopen(req).read().decode('utf8') print('#' + str(1) + ': ' + my_ip) except: # If error, delete this proxy and find another one errorProxies.append(proxy) print('Proxy ' + proxy['ip'] + ':' + proxy['port'] + ' deleted.') proxies = [x for x in proxies if x not in errorProxies] '''
    proxies = proxyGenerator()

    # Pick a random index into the current proxy list.
    def random_proxy():
        return random.randint(0, len(proxies) - 1)

    ''' Crawling through the wepages in otas '''
    df = []
    for oneDate in dateRange:
        # [city, check-in date, hard-coded check-out date]
        inputs = [searchKey, oneDate, '01/05/2020']
        for url in otas:
            proxy_index = random_proxy()
            proxy = proxies[proxy_index]
            # Retry with new proxies until one scrape succeeds (break).
            while True:
                #driver = random.choice([1, 2])
                driver = 1
                # Probe the proxy by fetching our apparent IP through it.
                req = Request('http://icanhazip.com')
                req.set_proxy(proxy, 'http')
                try:
                    my_ip = urlopen(req).read().decode('utf8')
                    print('#' + str(1) + ': ' + my_ip)
                    # Dispatch on the URL's position in otas.
                    # NOTE(review): with the current single-element otas list
                    # only temp == 0 (Goibibo) is reachable; the expedia and
                    # Hotelsdotcom branches expect the longer commented-out
                    # list above — confirm the intended index mapping.
                    temp = otas.index(url)
                    if temp == 1:
                        df = expedia.parse(url, proxy, driver, inputs)
                        if len(df) > 1:
                            # NOTE(review): oneDate contains '/' characters,
                            # which will be treated as path separators in this
                            # filename — likely a latent bug; confirm.
                            df.to_csv('datasetHotelNames/expedia' + oneDate + '.csv')
                        df = []
                        break
                    if temp == 2:
                        df = Hotelsdotcom.parse(url, proxy, driver, inputs)
                        if len(df) > 1:
                            df.to_csv('datasetHotelNames/Hotelsdotcom' + oneDate + '.csv')
                        df = []
                        break
                    if temp == 0:
                        df = Goibibo.parse(url, proxy, driver, inputs)
                        if len(df) > 1:
                            df.to_csv('datasetHotelNames/goibibo' + oneDate + '.csv')
                        df = []
                        break
                except:
                    # If error, delete this proxy and find another one
                    #del proxies[proxy_index]
                    #print('Proxy ' + proxy['ip'] + ':' + proxy['port'] + ' deleted.')
                    del proxies[proxy_index]
                    try:
                        proxy_index = random_proxy()
                    except:
                        # Proxy pool exhausted — fetch a fresh list.
                        proxies = proxyGenerator()
                    # NOTE(review): statement grouping here is ambiguous in the
                    # collapsed source; rebinding proxy after the inner
                    # try/except is the reading that keeps the retry loop
                    # consistent — confirm against the original file.
                    proxy = proxies[proxy_index]
                    sleep(random.choice([1, 2, 3, 4]))
''' Crawling through the wepages in otas '''
# Fragment depending on outer-scope names (otas, proxies, random_proxy,
# inputs, and the expedia/Goibibo/Hotelsdotcom modules): crawl each OTA once,
# retrying through random proxies until a scrape succeeds.
# NOTE(review): unlike the other crawl loop in this file, here each proxy is
# a dict with 'ip'/'port' keys — the two loops assume different proxy-list
# formats; confirm which one proxyGenerator() actually returns.
for url in otas:
    proxy_index = random_proxy()
    proxy = proxies[proxy_index]
    driver = random.choice([1, 2])
    # Retry with new proxies until one scrape succeeds (break).
    while True:
        # Probe the proxy by fetching our apparent IP through it.
        req = Request('http://icanhazip.com')
        req.set_proxy(proxy['ip'] + ':' + proxy['port'], 'http')
        try:
            my_ip = urlopen(req).read().decode('utf8')
            print('#' + str(1) + ': ' + my_ip)
            # Dispatch on the URL's position in otas; each branch writes
            # its results to a site-specific CSV, overwriting prior runs.
            temp = otas.index(url)
            if temp == 0:
                df = expedia.parse(url, proxy, driver, inputs)
                df.to_csv('expedia.csv')
                break
            if temp == 1:
                df = Goibibo.parse(url, proxy, driver, inputs)
                df.to_csv('Goibibo.csv')
                break
            if temp == 2:
                df = Hotelsdotcom.parse(url, proxy, driver, inputs)
                df.to_csv('Hotelsdotcom.csv')
                break
        except:
            # If error, delete this proxy and find another one
            #del proxies[proxy_index]
            #print('Proxy ' + proxy['ip'] + ':' + proxy['port'] + ' deleted.')
            # NOTE(review): the delete is commented out, so a dead proxy stays
            # in the pool and may be re-picked — confirm this is intentional.
            proxy_index = random_proxy()
            proxy = proxies[proxy_index]
# Batch driver: read a CSV of (dept, arrv) airport-code pairs from task_file
# and scrape flight results for each route on a fixed date.
# Depends on outer-scope names: task_file, date, parse.
df = pd.read_csv(task_file, header=0)
# Lower-case the airport codes from the two CSV columns.
depts = [c.lower() for c in list(df['dept'])]
arrvs = [c.lower() for c in list(df['arrv'])]
dept_arrv_tuple_list = list(zip(depts, arrvs))
dataset = []
counter = 0
total = len(dept_arrv_tuple_list)
for dept, arrv in dept_arrv_tuple_list:
    # Progress indicator (integer percent of routes processed so far).
    print('Complete:', "{0:.0f}%".format(counter / total * 100))
    counter += 1
    print("Checking from {} to {}".format(dept, arrv))
    # NOTE(review): `date` here is read from outer scope on the first
    # iteration but rebound by the loop below — order-sensitive; confirm.
    data_scraped = parse(dept, arrv, date, display_url=True)
    print(len(data_scraped), 'flights found in this route.')
    dataset.append({
        'dept': dept,
        'arrv': arrv,
        'date': date,
        'flights': data_scraped
    })
#a list
m = []
# Flatten the scraped dataset back into per-flight records.
for data in dataset:
    dept = data['dept']
    arrv = data['arrv']
    date = data['date']
    flights = data['flights']
    # NOTE(review): the body of this inner loop continues beyond the visible
    # chunk — the code below is truncated, left as-is.
    for f in flights: