def deal_with_feature(features: dict, x_request_id: str, residence_id: int) -> dict:
    """
    Send every feature of a residence to be inserted in the database.

    Each key/value pair of ``features`` is forwarded, one call per pair, to
    ``create_residence_features``.

    Parameters:
        features: dict mapping a feature key to its value
        x_request_id: str, forwarded to the insert call and the error handler
        residence_id: int, id of the residence the features belong to

    Returns:
        Nothing is returned explicitly by this function; the ``dict``
        annotation is kept only so the signature stays unchanged.
    """
    try:
        for feature_key, feature_value in features.items():
            create_residence_features(
                x_request_id=x_request_id,
                residence_id=residence_id,
                residence_feature_key=feature_key,
                residence_feature_value=feature_value,
            )
    except (
        AttributeError,
        IndexError,
        NotImplementedError,
        SyntaxError,
    ) as exception:
        error_handler(
            x_request_id=x_request_id,
            exception=exception,
            _msg="Exception occurred in deal with feature.",
        )
def get_furniture_flag(x_request_id: str, driver) -> bool:
    """
    Read the furniture flag of the residence from the page.

    The text of the located element is lower-cased and checked for the
    substring "sem" (Portuguese for "without").

    Parameters:
        x_request_id: unique id used for logging
        driver: google chrome instance

    Returns:
        True when "sem" is absent from the element text, False when it is
        present. If the lookup raises AttributeError or
        NoSuchElementException the exception is passed to error_handler and
        nothing is returned explicitly.
    """
    send_log(x_request_id=x_request_id, message="Searching for a furniture flag...")
    sleep(number=2)
    try:
        furniture_element = driver.find_element_by_xpath(
            "/html/body/div[1]/div/main/section/div/div[1]/div/div[3]/div/div[7]/div/div/span"
        )
        if not furniture_element:
            return None
        send_log(
            x_request_id=x_request_id,
            message="Found information about furniture in the residence...",
        )
        furniture_text = furniture_element.text.lower()
        return "sem" not in furniture_text
    except (AttributeError, NoSuchElementException) as exception:
        error_handler(x_request_id=x_request_id, exception=exception)
def get_link_of_resident_block(x_request_id, div_number_row: int, div_number_column: int, driver) -> classmethod:
    """
    Get the link element of one of the blocks in the QuintoAndar homepage.

    Parameters:
        x_request_id: unique id used for logging
        div_number_row: number of the block row in the page
        div_number_column: number of the block column in the page
        driver: google chrome instance

    Returns:
        The element located by the XPath, or None when the lookup result is
        falsy. If the lookup raises AttributeError or NoSuchElementException
        the exception is passed to error_handler and nothing is returned
        explicitly.
    """
    send_log(
        x_request_id=x_request_id,
        message=(
            "Getting link of a respective residence base on row "
            f"{div_number_row} and column {div_number_column}..."
        ),
    )
    try:
        block_xpath = (
            "/html/body/div[1]/main/section[2]/div[2]/div"
            f"/div[1]/div[{div_number_row}]/div[{div_number_column}]/div/a"
        )
        anchor = driver.find_element_by_xpath(block_xpath)
        if anchor:
            return anchor
        return None
    except (AttributeError, NoSuchElementException) as exception:
        error_handler(
            x_request_id=x_request_id,
            _msg="Exception occurred get_link_of_resident_block",
            exception=exception,
        )
def pet_flag(x_request_id: str, driver) -> bool:
    """
    Tell whether the residence accepts pets, based on the page text.

    Parameters:
        x_request_id: unique id used for logging
        driver: google chrome instance

    Returns:
        False when the element text contains "Não" or "Nao", True otherwise.
        If the lookup raises AttributeError or NoSuchElementException the
        exception is passed to error_handler and nothing is returned
        explicitly.
    """
    send_log(x_request_id=x_request_id, message="Searching for pet flag...")
    sleep(number=2)
    try:
        pet_element = driver.find_element_by_xpath(
            "/html/body/div[1]/div/main/section/div/div[1]/div/div[3]/div/div[6]/div/div/span"
        )
        if not pet_element:
            return None
        send_log(
            x_request_id=x_request_id,
            message="Found information about pet flag...",
        )
        pet_text = pet_element.text
        send_log(
            x_request_id=x_request_id,
            message=f"Pet flag informatio is {pet_text}",
        )
        # Pets are allowed only when neither spelling of the negation shows up.
        return "Não" not in pet_text and "Nao" not in pet_text
    except (AttributeError, NoSuchElementException) as exception:
        error_handler(x_request_id=x_request_id, exception=exception)
def scroll_quinto_andar_page(x_request_id: str, div_number_row: int, driver) -> None:
    """
    Scroll the QuintoAndar page by moving to a block further down the list.

    The target is the row container at index ``div_number_row + 2``.

    Parameters:
        x_request_id: unique id passed to the error handler
        div_number_row: number of the block row currently being processed
        driver: google chrome instance

    Returns:
        None
    """
    try:
        target_row = div_number_row + 2
        row_element = driver.find_element_by_xpath(
            f"/html/body/div[1]/main/section[2]/div[2]/div/div[1]/div[{target_row}]"
        )
        sleep(number=3)
        if row_element:
            ActionChains(driver).move_to_element(row_element).perform()
    except (ElementClickInterceptedException, AttributeError) as exception:
        error_handler(x_request_id=x_request_id, exception=exception)
def get_metro_flag(x_request_id: str, driver) -> bool:
    """
    Identify whether there is a metro (subway) close to the residence.

    Parameters:
        x_request_id: unique id used for logging
        driver: google chrome instance

    Returns:
        False when the element text contains "Não", True otherwise. If the
        lookup raises AttributeError or NoSuchElementException the exception
        is passed to error_handler and nothing is returned explicitly.
    """
    send_log(x_request_id=x_request_id, message="Searching for subway flag...")
    sleep(number=2)
    try:
        metro_flag_data = driver.find_element_by_xpath(
            "/html/body/div[1]/div/main/section/div/div[1]/div/div[3]/div/div[8]/div/div/span"
        )
        if metro_flag_data:
            send_log(
                x_request_id=x_request_id,
                message="Found information about subway...",
            )
            metro_flag_text = metro_flag_data.text
            # A membership test is used instead of bool(str.find(...)):
            # find() yields -1 (truthy) when the word is absent and 0 (falsy)
            # only when it sits at index 0, so a "Não" anywhere else in the
            # text used to be reported as True.
            return "Não" not in metro_flag_text
    except (AttributeError, NoSuchElementException) as exception:
        error_handler(x_request_id=x_request_id, exception=exception)
def receive_messages(queue: any, max_number: int, wait_time: int):
    """
    Receive a batch of messages in a single request from an SQS queue.

    Parameters:
        queue: the queue from which to receive messages
        max_number: the maximum number of messages to receive; forwarded as
            ``MaxNumberOfMessages``
        wait_time: the maximum time to wait (in seconds) before returning;
            forwarded as ``WaitTimeSeconds``

    Returns:
        Whatever ``queue.receive_messages`` returns. When a ClientError is
        raised it is passed to error_handler and nothing is returned
        explicitly.
    """
    request_options = {
        "MessageAttributeNames": ["All"],
        "MaxNumberOfMessages": max_number,
        "WaitTimeSeconds": wait_time,
    }
    try:
        return queue.receive_messages(**request_options)
    except ClientError as exception:
        error_handler(exception=exception)
def get_type_residence(x_request_id: str, driver) -> str:
    """
    Return the type of the residence shown on the page.

    Parameters:
        x_request_id: unique id used for logging
        driver: google chrome instance

    Returns:
        "house" when the lower-cased heading text contains "casa",
        "apartment" otherwise. If the lookup raises AttributeError or
        NoSuchElementException the exception is passed to error_handler and
        nothing is returned explicitly.
    """
    send_log(
        x_request_id=x_request_id,
        message="Searching for the type of residence...",
    )
    sleep(number=2)
    try:
        heading_element = driver.find_element_by_xpath(
            "/html/body/div[1]/div/main/section/div/div[1]/div/div[2]/div[1]/h1"
        )
        if not heading_element:
            return None
        send_log(
            x_request_id=x_request_id,
            message="Found the type of residence...",
        )
        heading_text = heading_element.text.lower()
        return "house" if "casa" in heading_text else "apartment"
    except (AttributeError, NoSuchElementException) as exception:
        error_handler(x_request_id=x_request_id, exception=exception)
def dealing_with_empty_queue(queue: any) -> None:
    """
    Send the default scraper event when the SQS queue has no messages.

    Parameters:
        queue: AWS SQS Queue that receives the default event

    Returns:
        None
    """
    try:
        default_consumer = dict(consumer_name="default", type_scraper="quinto-andar")
        default_event = {
            "x-request-id": "",
            "events": ["quintoAndarScraper"],
            "data": default_consumer,
        }
        serialized_event = json.dumps(default_event)
        send_message(
            x_request_id="",
            queue=queue,
            message_body=serialized_event,
            message_attributes={},
        )
    except (AttributeError, ClientError) as exception:
        error_handler(
            _msg="Exception occurred in dealing_with_empty_queue",
            exception=exception,
        )
def scraper_flow(x_request_id: str, driver: any):
    """
    Run the flow of the QuintoAndar scraper.

    Records the start time, logs it, runs ``recursive_scraper_logic`` with
    the initial row/column and limit read from ``quinto_andar``, waits, and
    logs that the flow finished.

    Parameters:
        x_request_id: unique id used for logging
        driver: google chrome instance

    Returns:
        Nothing is returned explicitly.
    """
    try:
        started_at = time.time()
        send_log(
            x_request_id=x_request_id,
            message=f"Initiating the flow of scraper. Time: {started_at}",
        )
        # Settings are read only after the start log, right before the call.
        scraper_arguments = {
            "x_request_id": x_request_id,
            "div_number_row": quinto_andar["div_number_row_initiator"],
            "div_number_column": quinto_andar["div_number_column_initiator"],
            "limit_scraper": quinto_andar["limit_scraper"],
            "timeout_start": started_at,
            "driver": driver,
        }
        recursive_scraper_logic(**scraper_arguments)
        sleep(10)
        send_log(x_request_id=x_request_id, message="Finished the flow of scraper.")
    except (WebDriverException, ElementNotInteractableException) as exception:
        error_handler(
            x_request_id=x_request_id,
            _msg="Exception occurred on scraper_flow",
            exception=exception,
        )
def number_of_rooms(x_request_id: str, driver) -> int:
    """
    Return the number of rooms of the residence.

    Parameters:
        x_request_id: unique id used for logging
        driver: google chrome instance

    Returns:
        The first run of digits in the element text as an int, or 0 when
        ``verification_string_has_digit`` reports no digit. If the lookup
        raises AttributeError or NoSuchElementException the exception is
        passed to error_handler and nothing is returned explicitly.
    """
    send_log(x_request_id=x_request_id, message="Searching for number of rooms...")
    sleep(number=2)
    try:
        rooms_element = driver.find_element_by_xpath(
            "/html/body/div[1]/div/main/section/div/div[1]/div/div[3]/div/div[2]/div/div"
        )
        if not rooms_element:
            return None
        send_log(
            x_request_id=x_request_id,
            message="Found information about number of rooms...",
        )
        rooms_text = rooms_element.text
        # Only parse when the helper confirms there is a digit; otherwise 0.
        has_digit = verification_string_has_digit(
            x_request_id=x_request_id, text=rooms_text
        )
        if not has_digit:
            return 0
        return int(re.findall(r"\d+", rooms_text)[0])
    except (AttributeError, NoSuchElementException) as exception:
        error_handler(x_request_id=x_request_id, exception=exception)
def get_residence_id(x_request_id: str, driver: any) -> int:
    """
    Return the id of the residence read from the page breadcrumb link.

    Parameters:
        x_request_id: unique id used for logging
        driver: google chrome instance

    Returns:
        The first run of digits in the element text as an int, or 0 when
        ``verification_string_has_digit`` reports no digit. If the lookup
        raises AttributeError or NoSuchElementException the exception is
        passed to error_handler and nothing is returned explicitly.
    """
    send_log(x_request_id=x_request_id, message="Searching for the residence id...")
    sleep(number=2)
    try:
        id_element = driver.find_element_by_xpath(
            "/html/body/div[1]/div/main/section/div/div[1]/nav/ol/li[5]/a"
        )
        if not id_element:
            return None
        send_log(
            x_request_id=x_request_id,
            message="Found id of residence...",
        )
        id_text = id_element.text
        # Only parse when the helper confirms there is a digit; otherwise 0.
        has_digit = verification_string_has_digit(
            x_request_id=x_request_id, text=id_text
        )
        if not has_digit:
            return 0
        return int(re.findall(r"\d+", id_text)[0])
    except (AttributeError, NoSuchElementException) as exception:
        error_handler(x_request_id=x_request_id, exception=exception)
def residence_size(x_request_id: str, driver) -> int:
    """
    Return the size of the residence.

    Parameters:
        x_request_id: unique id used for logging
        driver: google chrome instance

    Returns:
        int: the first run of digits in the element text, or 0 when
        ``verification_string_has_digit`` reports no digit. If the lookup
        raises AttributeError or NoSuchElementException the exception is
        passed to error_handler and nothing is returned explicitly.
    """
    send_log(
        x_request_id=x_request_id,
        message="Searching for the number of bedrooms...",
    )
    sleep(number=2)
    try:
        size_element = driver.find_element_by_xpath(
            "/html/body/div[1]/div/main/section/div/div[1]/div/div[3]/div/div[1]/div/div/span"
        )
        if not size_element:
            return None
        send_log(
            x_request_id=x_request_id,
            message="Found information about bedrooms...",
        )
        size_text = size_element.text
        # Only parse when the helper confirms there is a digit; otherwise 0.
        has_digit = verification_string_has_digit(
            x_request_id=x_request_id, text=size_text
        )
        if not has_digit:
            return 0
        return int(re.findall(r"\d+", size_text)[0])
    except (AttributeError, NoSuchElementException) as exception:
        error_handler(x_request_id=x_request_id, exception=exception)
def resident_localization_data(x_request_id: str, driver) -> list:
    """
    Get the address information of a specific residence.

    Parameters:
        x_request_id: unique id used for logging
        driver: google chrome instance

    Returns:
        The element text split on "," as a list of strings, or None when the
        text is empty. If the lookup raises AttributeError or
        NoSuchElementException the exception is passed to error_handler and
        nothing is returned explicitly.
    """
    send_log(
        x_request_id=x_request_id,
        message="Searching for address of residence...",
    )
    sleep(number=7)
    try:
        address_element = driver.find_element_by_xpath(
            "/html/body/div[1]/div/main/section/div/div[1]/div/div[2]/div[2]/p"
        )
        if not address_element:
            return None
        send_log(
            x_request_id=x_request_id,
            message="Found information about address...",
        )
        address_text = address_element.text
        if not address_text:
            return None
        return address_text.split(",")
    except (AttributeError, NoSuchElementException) as exception:
        error_handler(x_request_id=x_request_id, exception=exception)
def get_rent_values(x_request_id: str, driver) -> dict:
    """
    Get all the values that compose the rent of the residence.

    The element text has "Incluso" replaced by "0" and is then scanned for
    numbers; the dict is filled only when exactly six numbers are found.

    Parameters:
        x_request_id: unique id used for logging
        driver: google chrome instance

    Returns:
        dict with the keys rent_without_taxes, condominium_tax, house_tax,
        fire_insurance, service_tax and total_rent_value. Filled values are
        the matched number strings; unfilled values are the ``int`` type
        object used as a placeholder. If the lookup raises AttributeError or
        NoSuchElementException the exception is passed to error_handler and
        nothing is returned explicitly.
    """
    send_log(
        x_request_id=x_request_id,
        message="Searching for a values of the rent...",
    )
    try:
        sleep(number=2)
        values_element = driver.find_element_by_xpath(
            "/html/body/div[1]/div/main/section/div/div[2]/section/div/ul"
        )
        if values_element:
            send_log(x_request_id=x_request_id, message="Found the values...")
            field_names = (
                "rent_without_taxes",
                "condominium_tax",
                "house_tax",
                "fire_insurance",
                "service_tax",
                "total_rent_value",
            )
            # NOTE(review): placeholders are the `int` type itself, not a
            # number - callers converting them will fail; confirm intent.
            rent_values_dict = dict.fromkeys(field_names, int)
            raw_text = values_element.text
            if raw_text:
                amounts = re.findall(
                    r"(?<![.,])\d+[,.]{0,1}\d*",
                    raw_text.replace("Incluso", "0"),
                )
                # Values are assigned positionally, only for a full set of 6.
                if len(amounts) == 6:
                    rent_values_dict.update(zip(field_names, amounts))
            return rent_values_dict
    except (AttributeError, NoSuchElementException) as exception:
        error_handler(x_request_id=x_request_id, exception=exception)
def create(x_request_id: str, data: dict, table_name: str) -> dict:
    """
    Send scraped data to the backoffice API so it is stored.

    Parameters:
        x_request_id: str, unique id used for logging
        data: dict with the values to store
        table_name: str, sent as ``tableName`` in the request body

    Returns:
        The value under the "data" key of the decoded API response. When
        AssertionError, AttributeError, IndexError or KeyError is raised the
        result of error_handler is returned instead.
    """
    body = {"data": data, "tableName": table_name}
    # NOTE(review): the endpoint below is hard-coded; reading it from the
    # "wmh_backoffice_endpoint" config key was left disabled in this function.
    token = config.get("wmh_backoffice_token", None)
    try:
        response = api_integration(
            x_request_id=x_request_id,
            url="https://wmhbackoffice-prod.onrender.com/v1/wmh/update-data",
            token=token,
            body=body,
        )
        # api_integration hands back the object returned by requests.post.
        # Calling dict() on that object iterates raw body chunks and raises,
        # so decode the JSON body when the object offers .json(); the dict()
        # conversion is kept for mapping-like results.
        if callable(getattr(response, "json", None)):
            payload = response.json()
        else:
            payload = dict(response)
        return payload["data"]
    except (AssertionError, AttributeError, IndexError, KeyError) as exception:
        return error_handler(
            x_request_id=x_request_id,
            _msg="Exception occurred in create service.",
            exception=exception,
        )
def scraper_initiator(x_request_id: str, properties: str, driver: any) -> None:
    """
    Start the scraper that matches the requested type.

    Only the "quinto-andar" type is handled; any other value does nothing.

    Parameters:
        x_request_id: unique id forwarded to the scraper and error handler
        properties: type of scraper that is going to be initiated
        driver: google chrome instance

    Returns:
        None
    """
    if properties != "quinto-andar":
        return None
    try:
        homepage(x_request_id=x_request_id, driver=driver)
    except AttributeError as exception:
        error_handler(x_request_id=x_request_id, exception=exception)
def send_message(
    x_request_id: str,
    queue,
    message_body,
    message_attributes=None,
    thread_number: int = 0,
) -> None:
    """
    Send a message to an Amazon SQS queue and log what was sent.

    Parameters:
        x_request_id: unique id used for logging
        queue: the queue to receive the message
        message_body: the message to send; it is also parsed with
            ``json.loads`` afterwards for the log entry
        message_attributes: attributes sent with the message; a falsy value
            is replaced by an empty dict
        thread_number: int used to build the ``MessageGroupId``

    Returns:
        None
    """
    message_attributes = message_attributes or {}
    try:
        deduplication_id = f"wmh_scraper_{random_number(10000)}"
        group_id = f"wmh_scraper_{thread_number}"
        queue.send_message(
            MessageBody=message_body,
            MessageAttributes=message_attributes,
            MessageDeduplicationId=deduplication_id,
            MessageGroupId=group_id,
        )
        # The parameter is rebound on purpose: the error message below
        # formats whichever form (raw or parsed) is current at that point.
        message_body = json.loads(message_body)
        send_log(
            message=f"Sending the follow msg to SQS QUEUE {message_body}",
            x_request_id=x_request_id,
        )
    except (ClientError, TypeError) as exception:
        error_handler(
            x_request_id=x_request_id,
            _msg=f"Send message failed: {message_body}",
            exception=exception,
        )
def delete_message(x_request_id: str, message) -> None:
    """
    Delete a message from its queue and log the success.

    Parameters:
        x_request_id: unique id used for logging
        message: the message to delete; its ``delete`` method is called

    Returns:
        None
    """
    success_text = "Message have been deleted with success."
    try:
        message.delete()
        send_log(x_request_id=x_request_id, message=success_text)
    except (ClientError, AttributeError) as exception:
        error_handler(x_request_id=x_request_id, exception=exception)
def creation_residence_data(
    x_request_id: str, residence_data: QuintoAndarSchema
) -> None:
    """
    Create all the records that describe one scraped residence.

    The address is created first, then the residence (using the address
    id), then the rent values and finally each feature flag.

    Parameters:
        x_request_id: str, unique id used for logging
        residence_data: QuintoAndarSchema with the scraped values

    Returns:
        None on success. When SyntaxError, AttributeError or AssertionError
        is raised the result of error_handler is returned.

    Notes:
        deal_with_feature receives the feature flags and creates one entry
        per feature for the residence.
    """
    try:
        send_log(
            message=f"Going to create the follow data {residence_data}",
            x_request_id=x_request_id,
        )
        residence_address_id = create_residence_address(
            x_request_id=x_request_id, residence_data=residence_data
        )
        residence_id = create_residence(
            x_request_id=x_request_id,
            residence_address_id=residence_address_id,
            residence_data=residence_data,
        )
        create_residence_values(
            x_request_id=x_request_id,
            residence_id=residence_id,
            residence_data=residence_data,
        )
        features = {
            "petFlag": residence_data.pet_flag,
            "metroFlag": residence_data.metro_flag,
            "furnitureFlag": residence_data.furniture_flag,
        }
        deal_with_feature(
            features=features,
            x_request_id=x_request_id,
            residence_id=residence_id,
        )
    except (SyntaxError, AttributeError, AssertionError) as exception:
        # The request id is forwarded so the failure can be correlated with
        # the rest of the request's log entries.
        return error_handler(
            x_request_id=x_request_id,
            exception=exception,
            _msg="Exception occurred in create_residence_flow",
        )
def consumer_message_handler(x_request_id: str, message: any, driver: any) -> None:
    """
    Validate an SQS message and hand its contents to the executor.

    Parameters:
        x_request_id: unique id used for logging
        message: sqs message content to validate
        driver: google chrome instance

    Returns:
        None
    """
    try:
        validated = validate_message_data(x_request_id=x_request_id, message=message)
        payload = validated.get("data")
        consumer_name = payload.get("consumer_name")
        scraper_type = payload.get("type_scraper")
        executor(
            x_request_id=x_request_id,
            consumer=consumer_name,
            properties=scraper_type,
            driver=driver,
        )
    except AttributeError as exception:
        error_handler(x_request_id=x_request_id, exception=exception)
def main(driver: any, queue: any) -> None:
    """
    Consume messages from the SQS queue in an endless loop.

    When no message is available, the loop waits 30 minutes and then sends
    the default event. Otherwise each message is processed and deleted.

    Parameters:
        driver: google chrome instance handed to the message handler
        queue: SQS queue to poll

    Returns:
        None
    """
    try:
        while True:
            messages = receive_messages(queue=queue, max_number=1, wait_time=0)
            # Truthiness is used instead of len(): receive_messages returns
            # nothing after a handled ClientError, and len(None) would raise
            # a TypeError that is not caught here and would end the loop.
            if not messages:
                send_log(
                    x_request_id="",
                    message="QUEUE with 0 messages, going to send default event in 30 minutes...",
                )
                sleep(number=1800)
                dealing_with_empty_queue(queue=queue)
                continue
            for message in messages:
                x_request_id = request_handler(message=message.body)
                send_log(
                    x_request_id=x_request_id,
                    message="Receive message going to start scraper flow...",
                )
                consumer_message_handler(
                    message=message.body,
                    x_request_id=x_request_id,
                    driver=driver,
                )
                delete_message(x_request_id=x_request_id, message=message)
    except AttributeError as exception:
        error_handler(exception=exception)
def create_residence_values(
    x_request_id: str, residence_id: int, residence_data: QuintoAndarSchema
) -> None:
    """
    Create the rent values record of a residence.

    Every amount is converted with ``float`` before being sent to ``create``.

    Parameters:
        x_request_id: str, unique id used for logging
        residence_id: int, id of the residence the values belong to
        residence_data: QuintoAndarSchema with the scraped amounts

    Returns:
        None on success. When TimeoutError, SyntaxError, IndexError or
        AttributeError is raised the result of error_handler is returned.
    """
    try:
        table_name = TableNameSchema()
        # (payload key, attribute on residence_data), in payload order.
        amount_fields = (
            ("price", "rent_price_without_tax"),
            ("condominiumTax", "condominium_tax"),
            ("houseTax", "house_tax"),
            ("fireInsurence", "fire_insurance"),
            ("serviceTax", "service_tax"),
            ("totalRentPrice", "total_rent_price"),
        )
        data = {"ResidenceId": residence_id}
        for payload_key, attribute_name in amount_fields:
            data[payload_key] = float(getattr(residence_data, attribute_name))
        residence_values = create(
            x_request_id=x_request_id,
            data=data,
            table_name=table_name.residence_values,
        )
        send_log(
            x_request_id=x_request_id,
            message=f"Inserted in database the follow residence values {residence_values}...",
        )
    except (
        TimeoutError,
        SyntaxError,
        IndexError,
        AttributeError,
    ) as exception:
        return error_handler(
            x_request_id=x_request_id,
            exception=exception,
            _msg="Exception occurred in create_residence_value",
        )
def api_integration(x_request_id: str, url: str, token: str, body: dict) -> json:
    """
    Send a POST request with a JSON body to the given url.

    Parameters:
        x_request_id: str, unique id used for logging
        url: str, address that receives the request
        token: str, sent as the ``Authorization`` header
        body: dict, serialized with ``json.dumps`` as the request data

    Returns:
        The object returned by ``requests.post``. When a requests Timeout or
        ReadTimeout is raised the result of error_handler is returned.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": token,
    }
    try:
        send_log(
            x_request_id=x_request_id,
            message=f"Sending request to follow path: {url} with follow data: {body}",
        )
        serialized_body = json.dumps(body)
        response = requests.post(
            url=url,
            data=serialized_body,
            headers=headers,
            timeout=25,
        )
        send_log(
            x_request_id=x_request_id,
            message=f"Request finish with status: {response.status_code}",
        )
        return response
    except (
        requests.exceptions.Timeout,
        requests.exceptions.ReadTimeout,
    ) as exception:
        return error_handler(
            x_request_id=x_request_id,
            _msg="Exception occurred in api service.",
            exception=exception,
        )