Example #1
File: tor.py Project: cradess/HIVE
def connect_to_tor():
    """Setup the Tor proxy, the socket and perform the dns resolution to translates the domain name
    into a IPV4 address"""
    socks.set_default_proxy(socks.SOCKS5, "127.0.0.1", 9050)
    socket.socket = socks.socksocket
    socket.getaddrinfo = getaddrinfo
    log.debug('Setting up socket and connecting to Tor')
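This snippet monkey-patches socket.socket and socket.getaddrinfo so that all traffic, including DNS lookups, is routed through the Tor SOCKS5 proxy on port 9050. The getaddrinfo it installs is defined elsewhere in tor.py; a minimal sketch of how such a replacement is commonly written (an assumption, not the project's actual code):

import socket

def getaddrinfo(host, port, *args, **kwargs):
    # Return the hostname unresolved so the lookup happens at the Tor
    # exit node instead of leaking a local DNS query
    return [(socket.AF_INET, socket.SOCK_STREAM, socket.IPPROTO_TCP, "", (host, port))]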
Example #2
def query_flights(
    origin,
    destination,
    day,
    max_attempts=20,
    seconds_sleep=1,
    country="ES",
    currency="EUR",
    locale="en-US",
):
    """
        Query flights, retrying until there is a result

        Args:
            origin:         code for origin airport
            destination:    code for destination airport
            day:            day for the flights [date]
            max_attempts:   number of retries
            seconds_sleep:  seconds to sleep before returning a result
            country:        code for country (default: ES)
            currency:       code for currency (default: EUR)
            locale:         code for output info (default: en-US)
    """

    url = f"{BASE_URL}{country}/{currency}/{locale}/{origin}/{destination}/{day:%Y-%m-%d}"

    for attempt_num in range(max_attempts):

        log.debug(
            f"Querying {origin}-{destination} for date '{day}' (attempt {attempt_num})"
        )

        response = requests.get(url, headers=HEADERS)

        if response.status_code == 200:
            sleep(seconds_sleep)
            return response

        # On HTTP 429 ('Too Many Requests'), back off before retrying
        elif response.status_code == 429:
            log.warning(f"API limit reached at attempt {attempt_num + 1}")
            sleep(2 * attempt_num + 1)

        # Raise unknown cases
        else:
            response.raise_for_status()

    log.error(f"Number max of attempts reached ({max_attempts})")
    raise TimeoutError("TimeOut")
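query_flights relies on module-level names not shown here (BASE_URL, HEADERS, log, requests, sleep). A hypothetical call, assuming those are defined and the endpoint returns JSON:

from datetime import date

# "BCN" and "LHR" are illustrative airport codes
response = query_flights("BCN", "LHR", date(2020, 1, 15))
data = response.json()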
Example #3
def get_airports():
    """ Retreive airports """

    header = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest",
    }

    url = "https://datahub.io/core/airport-codes/r/airport-codes.csv"
    response = requests.get(url, headers=header)
    response.raise_for_status()

    log.debug("Airport data retrived")

    df = pd.read_csv(StringIO(response.text))
    log.debug("Airport data parsed")

    return df
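The function assumes requests, pandas, StringIO and log are available at module level, roughly as below; the last two lines show a hypothetical usage with the columns the other examples rely on:

import logging
from io import StringIO

import pandas as pd
import requests

log = logging.getLogger(__name__)

df = get_airports()
print(df[["name", "municipality"]].head())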
Example #4
File: bee.py Project: cradess/HIVE
def start_bee():
    """The start_bee function starts all the tasks related to starting the bee.
    When this function is called the bee will start working."""
    log.debug("Bee has been started")
    while True:
        urls = get_urls()

        # Update the URLs immediately so different instances don't crawl the same URL
        for url in urls:
            update_url(url)

        if len(urls) == 0:
            print("No URLs to be crawled, waiting for 60 seconds.")
            log.info('No URLs to be crawled, waiting for 60 seconds.')
            sleep(60)
            commit()
            continue

        connect_to_tor()

        for url in urls:
            try:
                content = get_content_from_url(url.url)
                print("{} is now being beeed".format(url.url))
                content_hashed = hash_content(content)

                content_cleaned = clean_html(content)

                check_blocked = check_blocked_keywords(content_cleaned)

                if check_blocked is None:
                    keywords = filter_keywords(content_cleaned)
                    save_content(url.id, content_cleaned, content,
                                 content_hashed, keywords)
                else:
                    print(f"URL {url.url} has been blocked because it "
                          f"contains the blocked keyword '{check_blocked}'")

            except (ValueError, NameError, TypeError) as error:
                log.error(str(error))
Example #5
async def main(loop):
    log.debug('scout has been started')
    while True:
        urls = get_urls_from_database()
        # update urls immediately to avoid different instances crawling the same urls
        for url in urls:
            update_url(url)

        if len(urls) == 0:
            print("No URLs to be crawled, waiting for 60 seconds.")
            log.info('No URLs to be crawled, waiting for 60 seconds.')
            await asyncio.sleep(60)  # non-blocking sleep so the event loop isn't stalled
            commit()
            continue

        results = await tor.get_content_from_urls(loop, urls)
        urls_from_content = get_urls_from_results(urls, results)

        for u in urls_from_content:
            if u is not None:
                save_url(u)
        print(f"Found {len(urls_from_content)} URLs")
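main is a coroutine, so it needs an event loop to drive it (and a module-level import of asyncio for the sleep above). A hypothetical entry point:

import asyncio

# Run the scout loop on the default event loop
loop = asyncio.get_event_loop()
loop.run_until_complete(main(loop))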
Example #6
def main():
    """
        Get all flights for each airport pair and export each as a CSV
    """

    path_raw = config["PATHS"]["DATA"] + f"flights/{date.today():%Y_%m_%d}/"

    # Create folder
    os.makedirs(path_raw, exist_ok=True)

    airports_pairs = get_pairs()
    total_pairs = len(airports_pairs)

    for i, (origin, dest) in enumerate(airports_pairs):

        log.info(
            f"Querying flights from '{origin}' to '{dest}' ({i + 1}/{total_pairs})"
        )
        df = query_pair(origin, dest)

        if df is not None:
            uri = f"{path_raw}{origin}_{dest}.csv"
            df.to_csv(uri, index=False)
            log.debug(f"Exporting '{uri}'")
Example #7
def fix_encodings(dfi):
    """
        Fix some encoding problems with latin1.
        
        For example:
             Aeròdrom de Pals --> Aeròdrom de Pals
    """

    df = dfi.copy()

    def _fix_latin(x):
        """ Decode latin1 and encode as utf8 """

        if pd.isna(x):
            return x
        return x.encode("latin1").decode("utf8", "ignore")

    for col in ["name", "municipality"]:
        # The data appears to be double-encoded, so the fix is applied twice
        df[col] = df[col].apply(_fix_latin).apply(_fix_latin)

    log.debug("String encodings fixed")

    return df
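A hypothetical round trip, assuming pandas and the module-level log are in place: the name below contains 'ò' mis-decoded twice (the four characters "\xc3\x83\xc2\xb2"), which the two passes of _fix_latin recover.

import pandas as pd

df_in = pd.DataFrame({
    "name": ["Aer\xc3\x83\xc2\xb2drom de Pals"],  # 'ò' double-mojibaked
    "municipality": ["Pals"],
})
print(fix_encodings(df_in).loc[0, "name"])  # Aeròdrom de Pals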