def post(self): ip_payload = request.get_json(force=True) logger.info('Post method is called to add the IP') if ip_payload['secret_key'] == 'icanprotect': self.ip_service.remove(ip_payload['ip'])
def alert_diff_previous(self, product_id, tenant_id, diff_attributes): ''' This method is to check if the user was already alerted regarding the diff_Attributes Update diff attributes is created based on the previous alert- Adds only the attributes that doesnt exist in the previous alert meta data ''' session = Session() logger.info('Finding updated diff attributes') updated_diff_attributes = [] sub_query_id = session.query(func.max(Alert.id)).filter( Alert.product_id == product_id).filter(Alert.tenant_id == tenant_id) latest_alert = session.query(Alert).filter( Alert.product_id == product_id).filter(Alert.tenant_id == tenant_id).filter(Alert.id == sub_query_id).one_or_none() logger.debug('latest alert is {0}'.format(latest_alert)) if latest_alert is not None: for field in diff_attributes: if field not in json.loads(latest_alert.meta_data): updated_diff_attributes.append(field) logger.debug('updated diff attributes are computed {0}'.format( updated_diff_attributes)) return updated_diff_attributes logger.info('updated diff attribues based on alert are {0}'.format( updated_diff_attributes)) return updated_diff_attributes
def diff_images(self, input_images, output_images): ''' This method is to compute if the images have changed. Currently checks the hires and thumb nail images ''' image_diff = [] image_compare = SpyderDataProcess.image_compare logger.info('computing the diff of the images') if len(input_images) < len(output_images): image_range = len(input_images)-1 else: image_range = len(output_images)-1 for position in range(0, image_range): input_image_dict = input_images[position] if position <= len(output_images)-1: output_image_dict = output_images[position] for key in input_image_dict.keys(): if key in output_image_dict.keys() and key in image_compare: if not input_image_dict[key] == output_image_dict[key]: alternate_key = self.aletrnate_names_create(key) if alternate_key not in image_diff: image_diff.append(alternate_key) return image_diff
def diff_compute(self, product, output): ''' Handler method to compute diff for attributes, images, and buxbox sellers ''' logger.info('About to compute diffs for the product') diffAttributes = [] # calculating diff for all the attributes diffAttributes = self.diff_compute_attributes( output, json.loads(product.ideal_state)) # calcualting diff for the images diffImages = self.diff_images(output['images'], json.loads( product.ideal_state)['images']) if diffImages: logger.info('updating images with diffatrributes') diffAttributes.extend(diffImages) # Check if buy box seller info with the info on tenant settings if output['buyboxSellerName']: buy_box_check = self.buy_box_infocheck( product.tenant_id, output['buyboxSellerName']) if not buy_box_check: alternate_name = self.aletrnate_names_create( 'buyboxSellerName') diffAttributes.append(alternate_name) return diffAttributes
def process(self, output, product): ''' This method to initiate the data processing for the crwaled data ''' logger.info("processing started for the crawled data") self.state_update(output, product)
def send_sms(self, product_id, mobile_num_list, asin, title, diffAttributes, message): ''' This method is to send sms info to the kafka que ''' kafka_topic = 'send_sms' logger.info('sending message') alert_info = {'type': 'sms', 'product_id': product_id, 'mobile_num': mobile_num_list, 'asin': asin, 'title': title, 'message': message, 'diffAttributes': diffAttributes} self.kafka_que.producer_call(kafka_topic, alert_info)
def send_email(self, product_id, email_id_list, asin, title, diffAttributes, message): ''' This method to send alert email info to the kafka que ''' kafka_topic = 'send_email' logger.info('sending email') alert_info = {'type': 'e-mail', 'product_id': product_id, 'email_id': email_id_list, 'asin': asin, 'title': title, 'message': message, 'diffAttributes': diffAttributes} self.kafka_producer_que(kafka_topic, alert_info)
def url_construct(self, platform, asin): if platform == 'Amazon': # url = 'https://www.amazon.com/dp/{}'.format(asin.strip()) self.current_ip = Spyder.IP_GC[Spyder.count % len(Spyder.IP_GC)] Spyder.count += 1 # print('current count{}'.format(Spyder.count)) url = 'http://{0}/scrape?asin={1}'.format(self.current_ip, asin.strip()) logger.info('construcuted url {}'.format(url)) return url
def product_state_history_create(self, product, diffAttributes): ''' This method to create an entry in the product state history table ''' logger.info('creating product state history entry') session = Session() product_state_history_info = {'product_id': product.id, 'current_state': product.current_state, 'ideal_state': product.ideal_state, 'state_diff': diffAttributes, 'create_dt': datetime.utcnow(), 'tenant_id': product.tenant_id} product_state_history = ProductStateHistory( **product_state_history_info) session.add(product_state_history)
def find_image(self, output_images): ''' This method is to find the image in the crawled data Needs to update to get the image apart from the hi res and thumb nail ''' logger.info('Finding the image for the product') for position in range(0, len(output_images)-1): for key in output_images[position].keys(): if key in ['hiRes']: if output_images[position][key] is not None: return output_images[position][key] return None
def alert_create(self, product, diffAttributes, previous_diffAttributes, alert_type, message): ''' This method ot create the alert ''' logger.info('creating the alert for product {0}'.format(product.asin)) session = Session() alert_info = {'product_id': product.id, 'alert_type': alert_type, 'message': message, 'meta_data': json.dumps(diffAttributes), 'status': 'new', 'create_dt': datetime.utcnow(), 'created_by': product.created_by, 'tenant_id': product.tenant_id} alert = Alert(**alert_info) self.send_notification(product.id, product.created_by, product.tenant_id, product.asin, product.title, diffAttributes, previous_diffAttributes, alert_type, message) session.add(alert) logger.info('Alert is created') return alert
def diff_compute_attributes(self, current_state, ideal_state): ''' This method computes the diff by comparing attributes ''' logger.info('computing diff of the attributes') diffAttributes = [] diff_exclude_fieldlist = SpyderDataProcess.diff_exclude_fieldlist for key in current_state.keys(): if key in ideal_state.keys(): if key not in diff_exclude_fieldlist: if current_state[key] != ideal_state[key]: alternate_key = self.aletrnate_names_create(key) diffAttributes.append(alternate_key) return diffAttributes
def kafka_consumer_start(self): ''' This method is to retrive the info from the kafka que Retrieves the info and info is sent to spyder data process ''' logger.info('kafka consumer started') for message in self.consumer: product_info = message.value['product_info'] product = Product(**product_info) data = self.spyder.crawl(product) if data and data['title']: sleep(randint(3, 8)) self.skp.process(data, product) elif data is None or 'ip' in data: if data['ip'] not in self.BAD_IP: self.BAD_IP.append(data['ip']) self.skp.send_admin_alert_email(data['ip']) self.skp.send_admin_alert_mobile(data['ip'])
def producer_call(self, kafka_topic_name, alert_info): ''' This is method to send the info to kafka que ''' if alert_info: ack = self.producer.send(kafka_topic_name, {'alert_info': alert_info}) try: record_metadata = ack.get(timeout=10) except KafkaError: # Decide what to do if produce request failed... logger.info( 'Alert info is sent to kafka que- Ack:{0}'.format(ack)) pass else: logger.error('Alert info is None and cannot be sent to kafka que')
def buy_box_infocheck(self, tenant_id, buy_box_name): ''' This method check if the buy box name exists in the tenant settings ''' logger.info('Checking Buy Box Info') session = Session() tenant = session.query(Tenant).filter( Tenant.tenant_id == tenant_id).one_or_none() if not tenant.settings: return False buy_box_names_tenant_list = [name.strip().lower() for name in json.loads( tenant.settings)['buy_box_names']] if buy_box_names_tenant_list and buy_box_names_tenant_list[0] != "": if buy_box_name.strip().lower() in buy_box_names_tenant_list: return True return False
def send_notification(self, product_id, user_id, tenant_id, asin, title, diffAttributes, previous_diffAttributes, alert_type, message): ''' This is a handler method to send notifications- email and mobile ''' user_email_list = [] user_sms_list = [] selected_attributes_email = [] selected_attributes_sms = [] logger.info( 'working towards sending notifications for the product {0}'.format(asin)) session = Session() # compose user email list and the attributes to be sent for the tenant users = session.query(User).filter(User.tenant_id == tenant_id).filter( User.email_validation == True).filter(User.email_alert == True).all() if users: user_email_list, selected_attributes_email = self.compose_contact_list( users, diffAttributes, previous_diffAttributes, alert_type, 'e-mail') # compose user sms list and attributes to be sent for the tenant users = session.query(User).filter(User.tenant_id == tenant_id).filter( User.sms_alert == True).all() if users: user_sms_list, selected_attributes_sms = self.compose_contact_list( users, diffAttributes, previous_diffAttributes, alert_type, 'sms') # send email to the users under the tenant if user_email_list and selected_attributes_email: self.send_email(product_id, json.dumps(user_email_list), asin, title, json.dumps(selected_attributes_email), message) # send sms to the users under the tenant if user_sms_list and selected_attributes_sms: self.send_sms(product_id, json.dumps(user_sms_list), asin, title, json.dumps(selected_attributes_sms), message)
def compose_contact_list(self, users, diffAttributes, previous_diffAttributes, alert_type, contact_type): ''' This method composes the contact list either mobile or e-mail. It only adds the contact info any attributes in diffAttributes, if users opts to alert the attribute ''' logger.info('composing contact list to send {0}'.format(contact_type)) user_contact_list = [] user_prefered_attributes = [] user_alert_attributes = [] for user in users: for attribute in diffAttributes: # send notfication if they don't have any preference or if the prefered attributes is in diffAttributes if not json.loads(user.alert_preference) or attribute in json.loads(user.alert_preference): if previous_diffAttributes and (attribute not in previous_diffAttributes and attribute not in user_prefered_attributes): user_prefered_attributes.append(attribute) # add to contact list only if diffattribute is in user_alert_preference if contact_type == 'e-mail' and user.email_id and user.email_id not in user_contact_list: user_contact_list.append(user.email_id) elif user.mobile_num and user.mobile_num not in user_contact_list: user_contact_list.append(user.mobile_num) if alert_type == 'Healthy': user_alert_attributes = diffAttributes # if any alert attribute is in user prefered attributes, send the whole alert- send all attributes if user_prefered_attributes: user_alert_attributes = diffAttributes return(user_contact_list, user_alert_attributes)
import os import yaml from flask import Flask from flask_cors import CORS from flask_restful import Resource, Api from flask_jwt_extended import JWTManager from process.globalutils import logger from process.que.kafkaconsumerspyder import KafkaConsumerSpyder APP = Flask(__name__) CORS(APP) API = Api(APP) skp = KafkaConsumerSpyder() skp.kafka_consumer_start() with open('config.yml') as config_input: config = yaml.load(config_input) # Or can import mode from global utils mode = os.environ['EnvMode'] if __name__ == '__main__': logger.info('app is launched') APP.run(host=config[mode]['APP_HOST'], port=config[mode]['APP_PORT'], debug=True, threaded=True)
def _scrapecontent(self, response): root = scrapy.Selector(text=response.content) data = { 'metaLink': self.read_xpath(root, '//link[(@rel = "canonical")]/@href'), 'metaContent': self.read_xpath(root, '//meta[(@name = "description")]/@content'), 'metaTitle': self.read_xpath(root, '//meta[(@name = "title")]/@content'), 'metaKeywords': self.read_xpath(root, '//meta[(@name = "keywords")]/@content'), 'metaPageTitle': self.read_xpath(root, '//title/text()'), 'title': self.read_xpath(root, '//*[(@id = "productTitle")]/text()'), 'bylineInfo': self.read_xpath(root, '//*[(@id = "bylineInfo")]/text()'), 'bylineUrl': self.read_xpath(root, '//*[(@id = "bylineInfo")]/@href'), 'currentReviewRating': self.read_xpath( root, '//*[(@id = "averageCustomerReviews")]/span/span/@title', self._processreviewrating), 'noOfReviews': self.read_xpath(root, '//span[(@id = "acrCustomerReviewText")]/text()', self._processreview), 'listItems': self.read_xpath( root, '//div[(@data-feature-name = "featurebullets")]//span[(@class="a-list-item")]/text()', self._processlist), 'buyboxSellerName': self.read_xpath(root, '//*[(@id = "merchant-info")]/a/text()', self._firstItem), 'fullfilledBy': self.read_xpath(root, '//*[(@id = "merchant-info")]/a/text()'), 'merchantId': self.read_xpath( root, '//form[(@id = "addToCart")]/input[(@id="merchantID")]/@value' ), 'sellingCustomerId': self.read_xpath( root, '//form[(@id = "addToCart")]/input[(@id="sellingCustomerID")]/@value' ), 'isMerchantExclusive': self.read_xpath( root, '//form[(@id = "addToCart")]/input[(@id="isMerchantExclusive")]/@value' ), 'sellerDescription': self.read_xpath(root, '//*[(@cel_widget_id = "aplus")]'), 'description': self.read_xpath(root, '//*[(@id = "productDescription")]'), 'price': self.read_xpath( root, '//div[@id="cerberus-data-metrics"]/@data-asin-price'), 'bestSeller': self.read_xpath( root, '//*[contains(concat( " ", @class, " " ), concat( " ", "p13n-best-seller-badge", " " ))]/text()' ) } # , (lambda v: v[1]) r = root.xpath('//script').extract() images = [] for s in r: if 'ImageBlockATF' in s and '\'colorImages\'' in s: start = s.find('\'colorImages\':') + 14 end = s.find('\'colorToAsin\'', start) imagesScript = s[start:end] f = imagesScript.strip()[13:-2] images = json.loads(f) data['images'] = images # 1. sponsored products # 2. Need more work to format this review, extract the review, rating, person, date, verified purchase, desc # commenting out the reviews temp '''revs = root.xpath( '//div[contains(@data-hook, "review-collapsed")]/text()') prevs = [] for rev in revs: prevs.append({ 'review': rev.extract() }) data['reviews'] = prevs''' if data['title']: logger.info('The title for the scrapped product is {0}'.format( data['title'])) else: data['title'] data['ip'] = self.current_ip logger.debug(json.dumps(data)) logger.warn('scraping might have messed up!') # Extract seller name pd_headings = root.xpath( '//*[(@id = "productDetails_detailBullets_sections1")]//th/text()' ).extract() pd_tds = root.xpath( '//*[(@id = "productDetails_detailBullets_sections1")]//td' ).extract() if (pd_headings and pd_tds): for i in range(0, len(pd_headings)): heading = self._strval(pd_headings[i]) root = scrapy.Selector(text=pd_tds[i]) td_val = self._strval(root.xpath('//text()').extract()) if (heading == 'Product Dimensions' or heading == 'Package Dimensions'): data['productDimension'] = td_val elif (heading == 'Best Sellers Rank'): data['bestSellerRank'] = td_val return data
def state_update(self, output, product): ''' This method is a handler to compute the diff, updating the product and creating alert ''' session = Session() # variables diff = {} diff['diff'] = {} ideal_state = {} updated_diffAttributes = [] diffAttributes = [] previous_diffAttributes = [] # retrieving the product info # use product id product_new = session.query(Product).filter( Product.asin == product.asin).filter( Product.tenant_id == product.tenant_id).one_or_none() logger.info('product health status is {0}'.format(product_new)) logger.info('product health status is {0}'.format(product_new.asin)) health_status_previous = product_new.health_status logger.info('printing the state_diff {0}'.format( product_new.state_diff)) if product_new.state_diff: previous_diffAttributes = json.loads(product_new.state_diff) # If the product is newly added load the output to ideal state # else compute diffs,create aleters, and send notifications if not json.loads(product_new.ideal_state): logger.info( 'Updating the product for the first time: Asin {0}'.format(product_new.asin)) ideal_state = json.dumps(output) product_new.ideal_state = ideal_state product_new.current_state = ideal_state product_new.title = output['title'] product_new.update_dt = datetime.utcnow() product_new.refresh_dt = datetime.utcnow() if output['images']: product_new.image = self.find_image(output['images']) # buy box check if output['buyboxSellerName']: buy_box_check = self.buy_box_infocheck( product.tenant_id, output['buyboxSellerName']) if not buy_box_check: alternate_name = self.aletrnate_names_create( 'buyboxSellerName') diffAttributes.append(alternate_name) else: logger.info('Updating the details for already exisisting product: Asin {0}'.format( product_new.asin)) product_new.current_state = json.dumps(output) product_new.update_dt = datetime.utcnow() product_new.refresh_dt = datetime.utcnow() # persisting ideal_state ideal_state = product_new.ideal_state diffAttributes = self.diff_compute(product_new, output) if len(diffAttributes) > 0 and product_new.health_status == 'Healthy': product_new.state_diff = json.dumps(diffAttributes) product_new.health_status = 'Unhealthy' self.alert_create( product_new, diffAttributes, previous_diffAttributes, 'Unhealthy', 'The following product attributes have changed') elif len(diffAttributes) > 0: updated_diffAttributes = self.alert_diff_previous( product_new.id, product_new.tenant_id, diffAttributes) product_new.state_diff = json.dumps(diffAttributes) product_new.health_status = 'Unhealthy' if len(updated_diffAttributes) > 0: self.alert_create( product_new, diffAttributes, previous_diffAttributes, 'Unhealthy', 'The following product attributes have changed') else: product_new.state_diff = json.dumps(diffAttributes) product_new.health_status = 'Healthy' # if product_new.health_status == 'Unhealthy': # checking previous and current health status if health_status_previous == 'Unhealthy' and product_new.health_status == 'Healthy': message = 'Great, status changed from Unhealthy to Healthy. The following attributes have been corrected' # when the status is changed to healthy, previous_diffattributes have to be sent to the user self.alert_create( product_new, previous_diffAttributes, None, 'Healthy', message) # creating product state history info self.product_state_history_create( product_new, json.dumps(diffAttributes)) product_new.product_info_status = 'Updated'
def kafka_producer_que(self, kafka_topic_name, alert_info): ''' This method is to send the topic name and alert info to kafka producer ''' logger.info('alert info being sent to kafka producer') self.kafka_que.producer_call(kafka_topic_name, alert_info)