def connect_to_tor():
    """Set up the Tor proxy and the socket, and perform DNS resolution
    to translate the domain name into an IPv4 address."""
    socks.set_default_proxy(socks.SOCKS5, "127.0.0.1", 9050)
    socket.socket = socks.socksocket
    socket.getaddrinfo = getaddrinfo
    log.debug('Setting up socket and connecting to TOR')
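# The getaddrinfo assigned above is defined elsewhere in the module. A minimal
# sketch of the usual monkey-patch, assuming its purpose is to avoid local DNS
# lookups (DNS leaks) by letting the Tor SOCKS5 proxy resolve hostnames:
def getaddrinfo(host, port, *args, **kwargs):
    # Return a fake resolution entry; the hostname is passed through untouched
    # so the proxy performs the actual DNS resolution.
    return [(socket.AF_INET, socket.SOCK_STREAM, 6, "", (host, port))]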
def query_flights(
    origin,
    destination,
    day,
    max_attempts=20,
    seconds_sleep=1,
    country="ES",
    currency="EUR",
    locale="en-US",
):
    """
    Query flights, iterating until there is a result

    Args:
        origin:         code for origin airport
        destination:    code for destination airport
        day:            day for the flights [date]
        max_attempts:   number of retries
        seconds_sleep:  seconds to sleep before returning a result
        country:        code for country (default: ES)
        currency:       code for currency (default: EUR)
        locale:         code for output info (default: en-US)
    """

    url = f"{BASE_URL}{country}/{currency}/{locale}/{origin}/{destination}/{day:%Y-%m-%d}"

    for attempt_num in range(max_attempts):
        log.debug(
            f"Querying {origin}-{destination} for date '{day}' (attempt {attempt_num})"
        )

        response = requests.get(url, headers=HEADERS)

        if response.status_code == 200:
            sleep(seconds_sleep)
            return response

        # If 'Too Many Requests', back off before retrying
        elif response.status_code == 429:
            log.warning(f"API limit reached at attempt {attempt_num + 1}")
            sleep(2 * attempt_num + 1)

        # Raise unknown cases
        else:
            response.raise_for_status()

    log.error(f"Maximum number of attempts reached ({max_attempts})")
    raise TimeoutError("TimeOut")
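# Hypothetical usage, assuming BASE_URL and HEADERS are module-level constants
# for the flights API and that `day` is a datetime.date:
from datetime import date

response = query_flights("BCN", "MAD", date(2024, 6, 1))
flights = response.json()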
def get_airports():
    """ Retrieve airports """

    header = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest",
    }

    url = "https://datahub.io/core/airport-codes/r/airport-codes.csv"

    response = requests.get(url, headers=header)
    response.raise_for_status()
    log.debug("Airport data retrieved")

    df = pd.read_csv(StringIO(response.text))
    log.debug("Airport data parsed")

    return df
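# Hypothetical usage. Assuming the datahub airport-codes CSV keeps its usual
# schema (columns such as 'type', 'iata_code', 'name', 'municipality'),
# larger airports can be filtered like this:
airports = get_airports()
large = airports[(airports["type"] == "large_airport") & airports["iata_code"].notna()]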
def start_bee():
    """Start all the tasks related to the bee; once called, the bee starts working."""
    log.debug("Bee has been started")

    while True:
        urls = get_urls()

        # Update the urls immediately so different instances don't crawl the same url
        for url in urls:
            update_url(url)

        if len(urls) == 0:
            print("No URLs to be crawled, waiting for 60 seconds.")
            log.info('No URLs to be crawled, waiting for 60 seconds.')
            sleep(60)
            commit()
            continue

        connect_to_tor()

        for url in urls:
            try:
                content = get_content_from_url(url.url)
                print("{} is now being crawled".format(url.url))

                content_hashed = hash_content(content)
                content_cleaned = clean_html(content)

                check_blocked = check_blocked_keywords(content_cleaned)
                if check_blocked is None:
                    keywords = filter_keywords(content_cleaned)
                    save_content(url.id, content_cleaned, content, content_hashed, keywords)
                else:
                    print(
                        "URL: " + url.url
                        + " has been blocked because it contains the blocked keyword: "
                        + check_blocked
                    )
            except (ValueError, NameError, TypeError) as error:
                log.error(str(error))
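# hash_content is defined elsewhere. A minimal sketch, assuming the hash is
# used to detect duplicate pages (the SHA-256 choice is an assumption):
import hashlib

def hash_content(content):
    # Hash the raw page text so identical content can be de-duplicated later
    return hashlib.sha256(content.encode("utf-8")).hexdigest()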
async def main(loop):
    log.debug('scout has been started')

    while True:
        urls = get_urls_from_database()

        # Update urls immediately to avoid different instances crawling the same urls
        for url in urls:
            update_url(url)

        if len(urls) == 0:
            print("No URLs to be crawled, waiting for 60 seconds.")
            log.info('No URLs to be crawled, waiting for 60 seconds.')
            # Use asyncio.sleep so the wait doesn't block the event loop
            await asyncio.sleep(60)
            commit()
            continue

        results = await tor.get_content_from_urls(loop, urls)
        urls_from_content = get_urls_from_results(urls, results)

        for u in urls_from_content:
            if u is not None:
                save_url(u)

        print('Found', len(urls_from_content), 'urls')
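# A hypothetical entry point for the scout, assuming the module is run directly:
if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main(loop))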
def main():
    """ Get all flights of each pair and export them as a csv for each pair """

    path_raw = config["PATHS"]["DATA"] + f"flights/{date.today():%Y_%m_%d}/"

    # Create folder
    os.makedirs(path_raw, exist_ok=True)

    airports_pairs = get_pairs()
    total_pairs = len(airports_pairs)

    for i, (origin, dest) in enumerate(airports_pairs):
        log.info(
            f"Querying flights from '{origin}' to '{dest}' ({i + 1}/{total_pairs})"
        )

        df = query_pair(origin, dest)

        if df is not None:
            uri = f"{path_raw}{origin}_{dest}.csv"
            df.to_csv(uri, index=False)
            log.debug(f"Exporting '{uri}'")
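# get_pairs is defined elsewhere. A minimal sketch, assuming it returns
# (origin, destination) airport-code tuples; the codes below are placeholders:
def get_pairs():
    origins = ["BCN", "MAD"]
    destinations = ["LHR", "CDG"]
    return [(o, d) for o in origins for d in destinations]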
def fix_encodings(dfi):
    """
    Fix some encoding problems with latin1.

    For example: AerÃ²drom de Pals --> Aeròdrom de Pals
    """
    df = dfi.copy()

    def _fix_latin(x):
        """ Decode latin1 and encode as utf8 """
        if pd.isna(x):
            return x
        return x.encode("latin1").decode("utf8", "ignore")

    for col in ["name", "municipality"]:
        # It needs to be applied twice, likely because some strings were double-encoded
        df[col] = df[col].apply(_fix_latin).apply(_fix_latin)

    log.debug("String encodings fixed")
    return df
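# A quick demonstration of the round-trip _fix_latin performs. The input is a
# hypothetical mojibake string: UTF-8 bytes that were mis-read as latin1.
s = "AerÃ²drom de Pals"
print(s.encode("latin1").decode("utf8", "ignore"))  # -> "Aeròdrom de Pals"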