def refresh_merchants():
    section("Loading Merchants")
    fh = _download_content(
        "http://services.formetocoupon.com/getMerchants?key=%s" % settings.FMTC_ACCESS_KEY,
        "Merchants_Content_%s" % datetime.datetime.now().strftime(DATETIME_FORMAT))
    data = etree.iterparse(fh, tag='merchant')
    for event, merchant in data:
        try:
            name = unescape_html(merchant.find("name").text)
            id = merchant.find("id").text
            print "\t%s,%s" % (id, name)
            print '=' * 40
            link = merchant.find('link').text
            skimlinks = merchant.find('skimlinks').text
            homepageurl = merchant.find('homepageurl').text
            model, created = Merchant.objects.get_or_create(name=name)
            model.name = name.strip()
            model.directlink = homepageurl
            model.skimlinks = skimlinks
            model.link = homepageurl
            model.save()
            affiliate_data, created = MerchantAffiliateData.objects.get_or_create(
                ref_id=id, merchant=model)
            affiliate_data.network = merchant.find('network').text
            affiliate_data.networkid = merchant.find('networkid').text
            affiliate_data.networknote = merchant.find('networknote').text
            affiliate_data.link = link
            if merchant.find('network').text == 'CJ':
                affiliate_data.primary = True
            affiliate_data.save()
        except:
            print_stack_trace()
def go_validate((coupon_model, last_validate_end_time, firsttime, pulseonly)):
    from core.signals import update_object
    try:
        print show_time(), coupon_model.directlink
        sqoot_url = coupon_model.directlink
        is_bad_link, response = fetch_page(sqoot_url)
        if is_bad_link:
            coupon_model.status = 'confirmed-inactive'
            coupon_model.save()
            handle_exceptions(update_object.send(sender=Coupon, instance=coupon_model))
            return
        is_deal_dead = check_if_deal_dead(coupon_model, response, sqoot_url)
        if is_deal_dead:
            coupon_model.status = 'confirmed-inactive'
        else:
            coupon_model.status = 'considered-active'
        coupon_model.save()
        handle_exceptions(update_object.send(sender=Coupon, instance=coupon_model))
        reset_db_queries()
        # Note: Commenting out address/category correction logic (not implemented yet)
        # if firsttime:
        #     confirm_or_correct_deal_data(coupon_model, response)
        # else:
        #     if pulseonly:
        #         return
        #     if last_validate_end_time and (last_validate_end_time > coupon_model.date_added):
        #         return  # Data check only the newly added deals.
        #     confirm_or_correct_deal_data(coupon_model, response)
    except:
        print_stack_trace()
def create_localinfo_index_if_doesnt_exist(self):
    if not self.es.indices.exists(index='localinfo'):
        try:
            settings_and_mappings = {
                "mappings": {
                    "populars": {
                        "properties": {
                            "user_uuid": {"type": "string"},
                            "location": {"type": "geo_point"},
                            "search_keyword": {"type": "string"},
                            "search_category": {"type": "string"}
                        }
                    }
                }
            }
            self.es.indices.create(index='localinfo', body=settings_and_mappings)
        except:
            print_stack_trace()
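# Hedged usage sketch: how a "populars" document might be written into the
# 'localinfo' index created above, using the same elasticsearch-py client the
# class wraps. The field values are illustrative, not from the source.
def _example_index_popular(es):
    es.index(index='localinfo', doc_type='populars', body={
        'user_uuid': 'abc-123',                     # hypothetical user id
        'location': {'lat': 40.71, 'lon': -74.00},  # geo_point as a lat/lon dict
        'search_keyword': 'pizza',
        'search_category': 'restaurants',
    })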
def read_sqoot_log(current_stage):
    row_to_lookup, column_to_lookup = LOOKUP_PER_STAGE[current_stage]
    try:
        f = open(SQOOT_LOG_PATH, 'r')
    except IOError:
        print_stack_trace()
        return None  # no log file yet; nothing to read
    all_rows = f.readlines()
    if len(all_rows) == 1:
        f.close()
        return None
    else:
        last_ten_rows = all_rows[-10:]
        latest_runs_of_this_step = [r for r in last_ten_rows
                                    if r.replace('\r\n', '').split(',')[0] == row_to_lookup]
        if len(latest_runs_of_this_step) == 0:
            f.close()
            return None
        very_last_run = latest_runs_of_this_step[-1]
        timestamp_string = very_last_run.replace('\r\n', '').split(',')[column_to_lookup]
        timestamp_wanted = parse(timestamp_string)
        f.close()
        return timestamp_wanted
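# Hedged sketch of reading the last recorded run for a stage. The stage key
# 'savedown' is an assumption for illustration; the real keys live in
# LOOKUP_PER_STAGE.
def _example_last_run():
    last_end = read_sqoot_log('savedown')  # hypothetical stage key
    if last_end is None:
        print "No prior run recorded; doing a full pass"
    else:
        print "Last run finished at", last_end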
def email_subscribe(request):
    try:
        e = EmailSubscription()
        e.app = request.POST["app"]
        e.email = request.POST["email"]
        e.session_key = request.POST["session_key"]
        # Optional fields: only set the ones the client actually submitted.
        for field in ("first_name", "last_name", "full_name", "context"):
            if field in request.POST:
                setattr(e, field, request.POST[field])
        e.save()
        return HttpResponse('1')
    except:
        print_stack_trace()
def create_from_skimlinks_commissions(self, commissions):
    default_to_empty_string = lambda x: "" if x is None else x
    for c in commissions["skimlinksAccount"]["commissions"].keys():
        try:
            commission = commissions["skimlinksAccount"]["commissions"][c]
            if self.filter(commissionID=commission["commissionID"]).count() > 0:
                continue  # commission already recorded
            comm = Commission(
                commissionID=commission["commissionID"],
                commissionType="skimlinks",
                # values come in cents - we convert to dollars
                commissionValue=float(commission["commissionValue"]) / 100,
                orderValue=float(commission["orderValue"]) / 100,
                currency=default_to_empty_string(commission["currency"]),
                customID=default_to_empty_string(commission["customID"]),
                date=datetime.strptime(commission["date"], "%Y-%m-%d").date(),
                domainID=default_to_empty_string(commission["domainID"]),
                merchantID=default_to_empty_string(commission["merchantID"]),
                publisherID=default_to_empty_string(commission["publisherID"]),
                items=int(commission["items"]) if commission["items"] is not None else 0,
                sales=int(commission["sales"]) if commission["sales"] is not None else 0,
                remoteReferer=default_to_empty_string(commission["remoteReferer"]),
                remoteUserAgent=default_to_empty_string(commission["remoteUserAgent"]),
                url=default_to_empty_string(commission["url"]),
                domain=default_to_empty_string(
                    shorten_to_domain(commission["url"]) if commission["url"] else ""),
                status=default_to_empty_string(commission["status"]))
            comm.save()
        except:
            print json.dumps(commissions["skimlinksAccount"]["commissions"][c], indent=4)
            print_stack_trace()
def save(self, *args, **kwargs):
    super(Visitor, self).save(*args, **kwargs)
    try:
        # re-save any rows that hang off this visitor via the reverse relation
        if self.rev_visitor.all().exists():
            for rv in self.rev_visitor.all():
                rv.save()
    except:
        print_stack_trace()
def _remove_skimlinks(skimlinked_url):
    try:
        parsed = urlparse.urlparse(skimlinked_url)
        # undo HTML-escaped ampersands before parsing the query string
        query = parsed.query.replace('&amp;', '&')
        return urlparse.parse_qs(query)["url"][0]
    except:
        print_stack_trace()
        return skimlinked_url
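# Hedged example of unwrapping a skimlinks redirect URL (the sample URL is
# illustrative, not from the source):
# _remove_skimlinks('http://go.redirectingat.com/?id=123&amp;url=http%3A%2F%2Fexample.com%2Fdeal')
# -> 'http://example.com/deal'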
def refresh_deals():
    section("Loading Deals/Coupons")
    fh = _download_content(
        "http://services.formetocoupon.com/getDeals?key=%s" % settings.FMTC_ACCESS_KEY,
        "Deals_Content_%s" % datetime.datetime.now().strftime(DATETIME_FORMAT))
    data = etree.iterparse(fh, tag='item')
    for event, deal in data:
        try:
            id = deal.find('couponid').text
            coupon, created = Coupon.active_objects.get_or_create(ref_id=id)
            if not created:
                continue
            merchant_name = deal.find('merchantname').text
            merchantid = deal.find('merchantid').text
            merchant, created = Merchant.objects.get_or_create(name=merchant_name)
            coupon.merchant = merchant
            coupon.categories.clear()
            for category in deal.find("categories"):
                coupon.categories.add(
                    Category.objects.get(code=category.text, ref_id_source__isnull=True))
            coupon.dealtypes.clear()
            dealtypes = deal.find('dealtypes')
            for dealtype in dealtypes.findall("type"):
                coupon.dealtypes.add(DealType.objects.get(code=dealtype.text))
            coupon.description = unescape_html(deal.find('label').text)
            restrictions = deal.find('restrictions').text or ''
            coupon.restrictions = unescape_html(restrictions)
            coupon_code = deal.find('couponcode').text or ''
            coupon.code = unescape_html(coupon_code)
            coupon.start = get_dt(deal.find('startdate').text)
            coupon.end = get_dt(deal.find('enddate').text)
            coupon.lastupdated = get_dt(deal.find('lastupdated').text)
            coupon.created = get_dt(deal.find('created').text)
            # removing skimlinks prefix from coupon link
            coupon.link = extract_url_from_skimlinks(deal.find('link').text)
            coupon.directlink = deal.find('directlink').text
            coupon.skimlinks = deal.find('skimlinks').text
            coupon.status = deal.find('status').text
            coupon.countries.clear()
            for country in deal.findall("country"):
                c, created = Country.objects.get_or_create(code=country.text)
                c.name = country.text
                c.save()
                coupon.countries.add(c)
            coupon.price = deal.find('price').text
            coupon.discount = deal.find('discount').text
            coupon.listprice = deal.find('listprice').text
            coupon.percent = deal.find('percent').text
            coupon.image = deal.find('image').text
            coupon.save()
        except:
            print_stack_trace()
def get_retailer_link(self):
    """Retrieves the direct link to the retailer page."""
    try:
        parsed = urlparse.urlparse(self.skimlinks)
        # undo HTML-escaped ampersands before parsing the query string
        query = parsed.query.replace('&amp;', '&')
        return urlparse.parse_qs(query)["url"][0]
    except:
        print_stack_trace()
        return self.skimlinks
def create_short_desc(self):
    try:
        short = self.description.lower()
        if not short:
            return "coupon"
        arr = short.split(" ")
        try:
            if "% off" in short:
                for i in range(len(arr)):
                    if arr[i].startswith("off"):
                        break
                return " ".join([arr[i - 1], "off"])
        except:
            pass
        try:
            if self.has_deal_type("percent"):
                for i in range(len(arr)):
                    if arr[i].endswith("%"):
                        return "%s off" % arr[i]
        except:
            pass
        try:
            if self.has_deal_type("dollar"):
                for i in range(len(arr)):
                    if arr[i].startswith("$"):
                        return "%s off" % arr[i]
        except:
            pass
        try:
            if self.discount and self.discount > 0:
                return "$%s off" % int(self.discount)
        except:
            pass
        if self.has_deal_type("gift"):
            return "gift"
        if self.has_deal_type("sale"):
            return "sale"
        if self.has_deal_type("offer"):
            return "offer"
        if self.has_deal_type("freeshipping") or self.has_deal_type("totallyfreeshipping"):
            return "free ship"
    except:
        print self.ref_id, "Description is", self.description
        print_stack_trace()
    return "coupon"
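# Hedged examples of what create_short_desc produces (illustrative inputs):
#   "save 20% off your order" -> "20% off"  (via the "% off" branch)
#   "$15 off orders over $75" -> "$15 off"  (via the "dollar" deal-type branch)
#   a sale-type deal with no parsable amount -> "sale"
# Anything unmatched falls through to the default "coupon".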
def write_sqoot_log(finished_stage, start_time, end_time):
    time_took = end_time - start_time
    try:
        with open(SQOOT_LOG_PATH, 'a') as csvfile:
            log_writer = csv.writer(csvfile)
            log_writer.writerow([finished_stage, start_time, end_time,
                                 time_took.seconds / 60])
    except:
        print_stack_trace()
        print "^-- WARNING: Problem logging it: {},{},{},{}"\
            .format(finished_stage, start_time, end_time, time_took.seconds / 60)
def embedly(args):
    _from = 0
    _to = Merchant.objects.all().count()
    if len(args) == 2:
        _from = int(args[0])
        _to = int(args[1])
        if _to == 1:
            _to = Merchant.objects.all().count()
    print "loading from", _from, "to", _to
    for merchant in Merchant.objects.all()[_from:_to]:
        try:
            EmbedlyMerchant(merchant).update_coupons()
        except:
            print_stack_trace()
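# Hedged usage sketch, following the argument handling above: pass a slice of
# merchant offsets, use _to == 1 to mean "through the last merchant", or pass
# nothing to process everyone.
# embedly(['100', '200'])  # merchants 100..200
# embedly(['500', '1'])    # merchants 500 through the end
# embedly([])              # all merchants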
def refresh_calculated_fields():
    section("Refresh Calculated Fields")
    for m in Merchant.objects.all():
        print 'Calculating coupons for %s' % m.name
        try:
            m.refresh_coupon_count()
        except:
            print "Error with: ", m.name, m.id
            print_stack_trace()
    regex = r'/o/(?P<coupon_id>[\d]+)/$'
    for ct in ClickTrack.objects.filter(coupon__isnull=True):
        print 'Processing click track %s' % ct.id
        r = re.search(regex, ct.target_url)
        if r:
            try:
                coupon = Coupon.objects.get(pk=r.groups()[0])
                ct.coupon = coupon
            except Coupon.DoesNotExist:
                ct.coupon = None
            ct.save()
    for ct in ClickTrack.objects.filter(merchant__isnull=True):
        print 'Processing click track %s' % ct.id
        if ct.coupon:
            # backfill the click track's merchant from its coupon
            ct.merchant = ct.coupon.merchant
            ct.save()
    tracks = ClickTrack.objects.exclude(coupon__isnull=True).values('coupon_id')\
        .annotate(popularity=Count('coupon__id'))
    for track in tracks:
        Coupon.objects.filter(id=track['coupon_id']).update(popularity=track['popularity'])
    tracks = ClickTrack.objects.exclude(merchant__isnull=True).values('merchant_id')\
        .annotate(popularity=Count('merchant__id'))
    for track in tracks:
        print 'Processing click track %s' % track['merchant_id']
        Merchant.objects.filter(id=track['merchant_id']).update(popularity=track['popularity'])
    for c in Coupon.objects.filter(coupon_type__isnull=True).only('id', 'categories', 'dealtypes'):
        print 'Calculating coupon type for coupon %s' % c.id
        c.coupon_type = c.get_coupon_type()
        c.save()
    Coupon.objects.exclude(Q(end__gt=datetime.datetime.now()) | Q(end__isnull=True)).update(is_active=False)
def prepare_list_of_deals_to_scrub():
    start_time = time.time()
    deals_to_scrub = Coupon.all_objects.filter(pk__in=SCRUB_LIST).order_by('merchant__name')
    probably_dup_deals_list = []  # List of coupon pks that look like a duplicate.
    probably_dup_deals_list = crosscheck_by_field(deals_to_scrub, probably_dup_deals_list, 'coupon_directlink')
    probably_dup_deals_list = crosscheck_by_field(deals_to_scrub, probably_dup_deals_list, 'merchant_name')
    probably_dup_deals_list = list(set(probably_dup_deals_list))
    print "merchant_pk^merchant_ref_id^merchant_name^address^locality^region^postal_code^coupon_pk^coupon_ref_id^coupon_title^coupon_short_title^parent_category^child_category^deal_price^deal_value^provider^link^is_duplicate?"
    for d in deals_to_scrub:
        categories = d.categories.all()
        parent_category = [cat for cat in categories if cat.parent is None]
        parent_category = parent_category[0].name if parent_category else None
        child_category = [cat for cat in categories if cat.parent is not None]
        child_category = child_category[0].name if child_category else None
        address = d.merchant_location.address if d.merchant_location.address else ""
        locality = d.merchant_location.locality if d.merchant_location.locality else ""
        region = d.merchant_location.region if d.merchant_location.region else ""
        postal_code = d.merchant_location.postal_code if d.merchant_location.postal_code else ""
        is_duplicate = 1 if d.pk in probably_dup_deals_list else 0
        try:
            print "%s^%s^%s^%s^%s^%s^%s^%s^%s^%s^%s^%s^%s^%s^%s^%s^%s^%s" %\
                (d.merchant.pk, d.merchant.ref_id, d.merchant.name.lower(), address, locality,
                 region, postal_code, d.pk, d.ref_id, d.embedly_title, d.embedly_description,
                 parent_category, child_category, d.price, d.listprice, d.coupon_network.name,
                 d.directlink, is_duplicate)
        except:
            print "!!!ERROR: merchant_pk == {}".format(d.merchant.pk)
            print_stack_trace()
            continue
    end_time = time.time()
    time_elapsed = end_time - start_time
    print time_elapsed
def fetch_page(sqoot_url, tries=1):
    '''
    Check whether the url is bad; returns (is_bad_link, response).
    '''
    try:
        if not sqoot_url:
            return True, None
        response = requests.get(sqoot_url, timeout=5)
        if response.status_code != 200:
            return True, None
        return False, response
    except Exception, e:
        print_stack_trace()
        print "^---- Offending URL: ", sqoot_url
        if tries < 3:
            print "Retrying in 5 seconds, maybe the server just needs a break"
            time.sleep(5)
            return fetch_page(sqoot_url, tries + 1)
        else:
            raise e  # reraise exception
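# Hedged usage sketch: callers treat the first element of the tuple as "this
# link is bad". The URL is illustrative only.
def _example_check_link():
    is_bad_link, response = fetch_page('http://example.com/deal')
    if is_bad_link:
        print "dead link"
    else:
        print "status:", response.status_code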
def get_visitor_tag(url, visitor_id):
    from core.util import print_stack_trace
    try:
        if 'go.redirectingat.com' in url:
            parsed = urlparse(url)
            query_dict = parse_qs(parsed.query)
            for key in query_dict.keys():
                query_dict[key] = query_dict[key][0]
            # stamp (or overwrite) the custom tracking field with the visitor id
            query_dict['xcust'] = visitor_id
            url = query_dict['url']
            del query_dict['url']
            return 'http://go.redirectingat.com/?%s&%s' % (
                urllib.urlencode(query_dict), urllib.urlencode({'url': url}))
        else:
            return url
    except:
        print_stack_trace()
        return url
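# Hedged example (illustrative URL; the order of the rewritten parameters may
# vary since the query round-trips through a dict):
# get_visitor_tag('http://go.redirectingat.com/?id=123&url=http%3A%2F%2Fexample.com', 42)
# -> 'http://go.redirectingat.com/?id=123&xcust=42&url=http%3A%2F%2Fexample.com'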
def handle(self, *args, **options):
    if options['savedown']:
        try:
            savedown_sqoot_data()
        except:
            print_stack_trace()
    if options['analyze']:
        try:
            analyze_sqoot_deals()
        except:
            print_stack_trace()
    if options['scrubprepare']:
        try:
            prepare_list_of_deals_to_scrub()
        except:
            print_stack_trace()
    if options['scrubexecute']:
        try:
            read_scrub_list_and_update(args)
        except:
            print_stack_trace()
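# Hedged usage sketch: `handle` above is a Django management command entry
# point, so the stages are driven from the CLI. The command name 'sqoot' is
# hypothetical; the flag names match the option keys checked above, and the
# scrub-execute stage reads its filename from args[0].
#   python manage.py sqoot --savedown
#   python manage.py sqoot --analyze
#   python manage.py sqoot --scrubexecute scrub_results.tsv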
def read_scrub_list_and_update(args):
    try:
        filename = args[0]
    except IndexError:
        print "No scrub file given"
        return
    # Thomas' Bing Geocoder api key (free basic access)
    # dotus_geocoder = geocoders.GeocoderDotUS() for consideration as a fallback
    bing_geocoder = geocoders.Bing('AvxLEwiPhVJzf0S3Pozgg01NnUQQX0RR6g9K46VPLlZ8OfZkKS-76gaPyzoV6IHI')
    path = os.path.join(settings.BASE_DIR, 'readonly', filename)
    try:
        f = open(path)
    except IOError:
        print_stack_trace()
        return
    rows = []
    for row in f:
        rows.append(row.replace("\r\n", "").split("\t"))
    for row in rows[1:]:  # Skip the header
        try:
            coupon_pk = int(row[1])
            is_duplicate = row[3] == '1'
            is_inactive = row[4] == '1'
            is_category_wrong = row[5] == '1'
            is_location_wrong = row[8] == '1'
            correction_needed = is_duplicate or is_inactive or is_category_wrong or is_location_wrong
            if correction_needed:
                coupon_obj = Coupon.all_objects.get(pk=coupon_pk)
                if is_duplicate:
                    coupon_obj.is_duplicate = True
                    # print "Correction: ", coupon_pk, " is_duplicate=True" #DEBUG
                if is_inactive:
                    coupon_obj.status = 'confirmed-inactive'
                    # print "Correction: ", coupon_pk, " status=confirmed-inactive" #DEBUG
                if is_category_wrong:
                    coupon_obj.categories.clear()
                    try:
                        parent_category = Category.objects.get(ref_id_source='sqoot', name=row[6])
                        coupon_obj.categories.add(parent_category)
                        # print "Correction: ", coupon_pk, " Parent category -> ", parent_category.name #DEBUG
                    except:
                        pass
                    try:
                        child_category = Category.objects.get(ref_id_source='sqoot', name=row[7])
                        coupon_obj.categories.add(child_category)
                        # print "Correction: ", coupon_pk, " Child category -> ", child_category.name #DEBUG
                    except:
                        pass
                if is_location_wrong:
                    location_obj = coupon_obj.merchant_location
                    address, locality, region, postal_code = row[9], row[10], row[11], row[12]
                    spacer1 = ', ' if address != '' else ''
                    spacer2 = ' ' if locality != '' else ''
                    lookup_text = address + spacer1 + locality + spacer2 + region
                    try:
                        place, (lat, lng) = bing_geocoder.geocode(lookup_text)
                        pnt = 'POINT({} {})'.format(lng, lat)
                        location_obj.geometry = pnt
                    except:
                        pass
                    location_obj.address = address if address != '' else location_obj.address
                    location_obj.locality = locality if locality != '' else location_obj.locality
                    location_obj.region = region if region != '' else location_obj.region
                    location_obj.postal_code = postal_code if postal_code != '' else location_obj.postal_code
                    location_obj.save()
                    # print "Correction: ", coupon_pk, " Location fixed" #DEBUG
                coupon_obj.save()
        except:
            print_stack_trace()
    scrub_list_retrieved = [row[1] for row in rows[1:]]  # list of original coupon pks imported from 'scrub_list.py'
    deals_to_scrub = Coupon.all_objects.filter(pk__in=scrub_list_retrieved)\
        .exclude(Q(status='confirmed-inactive') | Q(status='implied-inactive') | Q(is_duplicate=True))\
        .order_by('merchant__name')
    probably_dup_deals_list = []  # List of coupon pks that look like a duplicate.
    probably_dup_deals_list = crosscheck_by_field(deals_to_scrub, probably_dup_deals_list, 'coupon_directlink')
    probably_dup_deals_list = crosscheck_by_field(deals_to_scrub, probably_dup_deals_list, 'merchant_name')
    probably_dup_deals_list = list(set(probably_dup_deals_list))
    for pk in probably_dup_deals_list:
        try:
            coupon = Coupon.all_objects.get(pk=pk)
            coupon.is_duplicate = True
            coupon.save()
        except:
            print_stack_trace()
def _get_image(user, image_url, specific_height=-1, specific_width=-1):
    """
    Returns the image at image_url with the specific height and width.
    If the image is not locally buffered, downloads the original image at the url;
    if a specific height and width are set, also creates a resized version.
    """
    def _download_image_to_local(src_image_url, src_image_pointer, height, width):
        prefix, ext = os.path.splitext(urlparse.urlparse(src_image_url).path)
        prefix = prefix.replace('/', '').replace('\\', '')
        prefix = url2path(prefix)
        filename = '%s_%s%s' % (uuid.uuid4().hex, uuid.uuid4().hex, ext)
        local_copy = os.path.join(settings.IMAGE_LOCAL_COPY_DIR, filename)
        local_url = os.path.join(settings.IMAGE_LOCAL_COPY_DIR_NO_PREFIX, filename)
        file_saved_path = local_copy
        if height == -1:
            # ensure it exists
            assert src_image_pointer.status_code == 200
            # ensure it is an image!
            content_type = src_image_pointer.headers['content-type'].lower()
            if content_type[:5] != 'image':
                # cloudfront servers typically host images as octet-streams;
                # we need to handle that
                assert 'application/octet-stream' in content_type
                # in these cases we don't allow files greater than 700KB -> 716800 = 700*1024
                # (assumes the server reports a content-length header)
                assert int(src_image_pointer.headers['content-length']) < 716800
            if src_image_pointer.status_code == 200:
                # download remote file
                file_saved_path = local_copy
                local_copy = open(local_copy, 'wb')
                local_copy.write(src_image_pointer.content)
                local_copy.close()
            else:
                raise Http404()
        else:
            img_util.resize(src_image_pointer, (specific_width, specific_height), True, local_copy)
        # s3_url = s3.upload(file_saved_path)
        # store reference in imagestore
        # img = ImageStore(remote_url=image_url, local_url=s3_url, source_user=user, height=height, width=width)
        img = ImageStore(remote_url=image_url, local_url="/%s" % local_url,
                         source_user=user, height=height, width=width)
        img.save()
        return img

    def _download_temp_image(src_image_url, src_image_pointer, height, width):
        prefix, ext = os.path.splitext(urlparse.urlparse(src_image_url).path)
        prefix = prefix.replace('/', '').replace('\\', '')
        prefix = url2path(prefix)
        filename = '%s_%s%s' % (uuid.uuid4().hex, uuid.uuid4().hex, ext)
        local_copy = os.path.join(settings.IMAGE_LOCAL_COPY_DIR, filename)
        local_url = os.path.join(settings.IMAGE_LOCAL_COPY_DIR_NO_PREFIX, filename)
        if height == -1:
            # ensure it exists
            assert src_image_pointer.status_code == 200
            # ensure it is an image!
            content_type = src_image_pointer.headers['content-type'].lower()
            if content_type[:5] != 'image':
                # cloudfront servers typically host images as octet-streams;
                # we need to handle that
                assert 'application/octet-stream' in content_type
                # in these cases we don't allow files greater than 700KB -> 716800 = 700*1024
                # (assumes the server reports a content-length header)
                assert int(src_image_pointer.headers['content-length']) < 716800
            if src_image_pointer.status_code == 200:
                # download remote file
                local_copy = open(local_copy, 'wb')
                local_copy.write(src_image_pointer.content)
                local_copy.close()
            else:
                raise Http404()
        return local_url

    if user.is_anonymous() or not user.is_authenticated():
        try:
            user = User.objects.get(username=IMAGE_ANONYMOUS_USER)
        except:
            print_stack_trace()
            user = User.objects.create_user(username=IMAGE_ANONYMOUS_USER,
                                            email='*****@*****.**',
                                            password=IMAGE_ANONYMOUS_USER)
    image_url = urllib.unquote_plus(image_url)
    original_image = None
    # check if image already exists
    image_url_to_check = image_url
    if ShortenedURL.objects.should_shorten_url(image_url):
        image_url_to_check = ShortenedURL.objects.shorten_url(image_url).shortened_url
    for image in ImageStore.objects.filter(remote_url=image_url_to_check):
        if image.height == specific_height and image.width == specific_width:
            # found the image with the exact dimensions
            return image
        if image.height == image.width == -1:
            # found the original image
            original_image = image
    if not original_image:
        # original image not available: download it
        original_image = _download_image_to_local(image_url, requests.get(image_url), -1, -1)
    else:
        original_image.local_url = _download_temp_image(image_url, requests.get(image_url), -1, -1)
    if specific_height == specific_width == -1:
        # os.remove(os.path.join(settings.IMAGE_LOCAL_COPY_DIR, original_image.local_url[1:].split('/')[-1]))
        return original_image
    else:
        # required size is not available: resize the original image
        resized_image = _download_image_to_local(
            image_url,
            os.path.join(settings.IMAGE_LOCAL_COPY_DIR, original_image.local_url[1:].split('/')[-1]),
            specific_height, specific_width)
        # os.remove(os.path.join(settings.IMAGE_LOCAL_COPY_DIR, resized_image.local_url[1:].split('/')[-1]))
        # os.remove(os.path.join(settings.IMAGE_LOCAL_COPY_DIR, original_image.local_url[1:].split('/')[-1]))
        return resized_image
def process_request(self, request):
    # don't process AJAX requests
    if request.path.startswith("/s/") or request.path.startswith("/static/") or request.path.startswith("/admin/")\
            or request.path.startswith("/favicon.ico") or (request.is_ajax() and not request.path.startswith('/o/')):
        return
    # create some useful variables
    ip_address = utils.get_ip(request)
    user_agent = unicode(request.META.get('HTTP_USER_AGENT', '')[:255], errors='ignore')
    # retrieve untracked user agents from cache
    ua_key = '_tracking_untracked_uas'
    untracked = cache.get(ua_key)
    if untracked is None:
        log.info('Updating untracked user agent cache')
        untracked = UntrackedUserAgent.objects.all()
        cache.set(ua_key, untracked, 3600)
    # see if the user agent is not supposed to be tracked
    for ua in untracked:
        # if the keyword is found in the user agent, stop tracking
        if user_agent.find(ua.keyword) != -1:
            log.debug('Not tracking UA "%s" because of keyword: %s' % (user_agent, ua.keyword))
            return
    if hasattr(request, 'session') and request.session.session_key:
        # use the current session key if we can
        session_key = request.session.session_key
    else:
        # otherwise just fake a session key
        session_key = '%s:%s' % (ip_address, user_agent)
        session_key = session_key[:40]
    # ensure that the request.path does not begin with any of the prefixes
    for prefix in self.prefixes:
        if request.path.startswith(prefix):
            log.debug('Not tracking request to: %s' % request.path)
            return
    # if we get here, the URL needs to be tracked
    # determine what time it is now
    now = datetime.now()
    attrs = {'session_key': session_key, 'ip_address': ip_address}
    visitor_id = request.session.get('visitor_id', None)
    if not visitor_id:
        # for some reason, Visitor.objects.get_or_create was not working here
        try:
            visitor = Visitor.objects.only('id').get(**attrs)
        except Visitor.DoesNotExist:
            request.session.set_test_cookie()
            # see if there's a visitor with the same IP and user agent
            # within the last 5 minutes
            cutoff = now - timedelta(minutes=5)
            visitors = Visitor.objects.only('id').filter(
                ip_address=ip_address, user_agent=user_agent, last_update__gte=cutoff)
            if len(visitors):
                visitor = visitors[0]
                visitor.session_key = session_key
                log.debug('Using existing visitor for IP %s / UA %s: %s' % (ip_address, user_agent, visitor.id))
            else:
                # it's probably safe to assume that the visitor is brand new
                visitor = Visitor(**attrs)
                log.debug('Created a new visitor: %s' % attrs)
            try:
                visitor.save()
            except DatabaseError:
                print_stack_trace()
                log.error('There was a problem saving visitor information:\n%s\n\n%s'
                          % (traceback.format_exc(), locals()))
        except:
            return
        request.session['visitor_id'] = visitor_id = visitor.id
    redis_data = redis.get('visitor_data_%s' % visitor_id) or '{}'
    visitor_data = json.loads(redis_data)
    visitor_data['visitor_id'] = visitor_id
    # update the tracking information
    visitor_data['user_agent'] = user_agent
    # if the visitor record is new, or the visitor hasn't been here for
    # at least an hour, update their referrer URL
    one_hour_ago = pytz.UTC.localize(now - timedelta(hours=1))
    # TODO: ensure that we are on the same time zone - I just put UTC for now to get it working
    last_update = visitor_data.get('last_update', None)
    if not last_update or last_update <= time.mktime(one_hour_ago.timetuple()):
        visitor_data['referrer'] = utils.u_clean(request.META.get('HTTP_REFERER', 'unknown')[:255])
        # reset the number of pages they've been to
        visitor_data['page_views'] = 0
        visitor_data['session_start'] = time.mktime(now.timetuple())
        visitor_data['url'] = request.path
    page_views = visitor_data.get('page_views', 0) + 1
    visitor_data['page_views'] = page_views
    visitor_data['last_update'] = time.mktime(now.timetuple())
    try:
        # Extracting visitor data from GA cookie
        cookie = request.COOKIES.get('__utmz')
        if cookie:
            try:
                data = cookie.split('.', 4)[-1]
                data = dict(match.groups() for match in re.finditer(
                    r'(utm(?:csr|ccn|cmd|ctr))=([^\|]*)', data))
            except (ValueError, IndexError):
                log.error('Malformed GA cookie: {0!r}'.format(cookie))
            else:
                visitor_data['source'] = normalize_ga_value(data.get('utmcsr'))
                visitor_data['medium'] = normalize_ga_value(data.get('utmcmd'))
                visitor_data['campaign'] = normalize_ga_value(data.get('utmccn'))
                visitor_data['keywords'] = normalize_ga_value(data.get('utmctr'))
        utm_source = request.GET.get("utm_source", "unknown")
        request.session['acquisition_source_name'] = utm_source
        if utm_source != "unknown":
            # utm_source: the advertiser, site, publication, etc. sending traffic, e.g. google, citysearch, newsletter4, billboard.
            # utm_medium: the advertising or marketing medium, e.g. cpc, banner, email newsletter.
            # utm_campaign: the individual campaign name, slogan, promo code, etc. for a product.
            # utm_term: paid search keywords; when manually tagging paid keyword campaigns, use utm_term to specify the keyword.
            # utm_content: differentiates similar content or links within the same ad, e.g. two call-to-action links in one email.
            # update the tracking info with the latest and bump the old one into the stored history
            # visitor.bump_past_acquisition_info()
            past_acquisition_info = visitor_data.get('past_acquisition_info', [])
            if visitor_data.get('acquisition_source', None):
                old_visitor_data = {'date_valid_until': time.time()}
                for k in VISITOR_PARAMS_MAPPING.keys():
                    old_visitor_data[k] = visitor_data.get(k, None)
                past_acquisition_info.append(old_visitor_data)
                visitor_data['past_acquisition_info'] = past_acquisition_info
            for k, v in VISITOR_PARAMS_MAPPING.items():
                value = request.GET.get(v, 'unknown')[:255]
                visitor_data[k] = value
    except:
        print_stack_trace()
    redis.set('visitor_data_%s' % visitor_id, json.dumps(visitor_data))
def assign_visitor_tag(context, url):
    try:
        return get_visitor_tag(url, context['visitor'].id)
    except:
        print_stack_trace()
        return url
def click_track(request, clicked_link_path=None):
    try:
        referer = utils.u_clean(request.META.get('HTTP_REFERER', 'unknown')[:255])
        clicked_link = request.POST["clicked"][:255]
        try:
            clicked_link = clicked_link.lower()
        except:
            print_stack_trace()
        source_url_type = 'landing'
        if re.search('/coupons/[a-z0-9-]+/[a-z0-9-]+/[\d]+/', referer):
            source_url_type = 'coupon'
        elif re.search('/coupons/[a-z0-9-]+/', referer):
            source_url_type = 'company'
        elif re.search('/categories/[A-z0-9-]+/', referer):
            source_url_type = 'category'
        coupon = None
        merchant = None
        if "/coupon/" in clicked_link:
            # skimlinks will assume the source url to be the /coupon/ url
            if clicked_link.endswith("/"):
                coupon_id = clicked_link.split("/")[-2]  # assumes trailing '/'
            else:
                coupon_id = clicked_link.split("/")[-1]
            source_url = clicked_link
            coupon = Coupon.active_objects.get(id=int(coupon_id))
            try:
                merchant = Merchant.objects.get(id=coupon.merchant.id)
            except:
                merchant = None
            target_url = coupon.get_retailer_link()
        else:
            source_url = referer
            target_url = clicked_link
            merchant = None
            if 'go.redirectingat.com' in target_url:
                target_url = _remove_skimlinks(target_url)
        merchant_domain = shorten_to_domain(target_url)
        visitor = Visitor.objects.get(pk=request.session['visitor_id'])
        click_track = ClickTrack()
        click_track.visitor = visitor
        click_track.user_agent = visitor.user_agent[:255]
        click_track.referer = referer[:255]
        click_track.target_url = target_url[:255]
        click_track.source_url_type = source_url_type[:255]
        click_track.source_url = source_url[:255]
        click_track.merchant = merchant
        click_track.coupon = coupon
        click_track.merchant_domain = merchant_domain[:255]
        try:
            click_track.acquisition_source = visitor.acquisition_source
            click_track.acquisition_medium = visitor.acquisition_medium
            click_track.acquisition_term = visitor.acquisition_term
            click_track.acquisition_content = visitor.acquisition_content
            click_track.acquisition_campaign = visitor.acquisition_campaign
            click_track.acquisition_gclid = visitor.acquisition_gclid
        except:
            print_stack_trace()
        try:
            click_track.save()
        except:
            try:
                print "Visitor ID", click_track.visitor
                print "User Agent", click_track.user_agent
                print "Referer", click_track.referer
                print "target_url", click_track.target_url
                print "source_url_type", click_track.source_url_type
                print "merchant", click_track.merchant
                print "coupon", click_track.coupon
                print "merchant_domain", click_track.merchant_domain
                print merchant.name, merchant.id
            except:
                pass
            print_stack_trace()
    except:
        print_stack_trace()
    return success()
def log_click_track(request, coupon=None):
    try:
        referer = utils.u_clean(request.META.get('HTTP_REFERER', 'unknown')[:255])
        clicked_link = request.path
        source_url_type = 'landing'
        if re.search('/coupons/[a-z0-9-]+/[a-z0-9-]+/[\d]+/', referer):
            source_url_type = 'coupon'
        elif re.search('/coupons/[a-z0-9-]+/', referer):
            source_url_type = 'company'
        elif re.search('/categories/[A-z0-9-]+/', referer):
            source_url_type = 'category'
        merchant = None
        if "/o/" in clicked_link:
            source_url = clicked_link
            merchant = coupon.merchant
            target_url = coupon.get_retailer_link()
        else:
            source_url = referer
            target_url = clicked_link
            merchant = None
            if 'go.redirectingat.com' in target_url:
                target_url = _remove_skimlinks(target_url)
        merchant_domain = shorten_to_domain(target_url)
        visitor = Visitor.objects.get(pk=request.session['visitor_id'])
        click_track = ClickTrack(visitor=visitor,
                                 user_agent=visitor.user_agent[:255],
                                 referer=referer[:255],
                                 target_url=target_url[:255],
                                 source_url_type=source_url_type[:255],
                                 source_url=source_url[:255],
                                 merchant=merchant,
                                 coupon=coupon,
                                 merchant_domain=merchant_domain[:255])
        try:
            click_track.acquisition_source = visitor.acquisition_source
            click_track.acquisition_medium = visitor.acquisition_medium
            click_track.acquisition_term = visitor.acquisition_term
            click_track.acquisition_content = visitor.acquisition_content
            click_track.acquisition_campaign = visitor.acquisition_campaign
            click_track.acquisition_gclid = visitor.acquisition_gclid
        except:
            print_stack_trace()
        try:
            click_track.save()
        except:
            try:
                print "Visitor ID", click_track.visitor
                print "User Agent", click_track.user_agent
                print "Referer", click_track.referer
                print "target_url", click_track.target_url
                print "source_url_type", click_track.source_url_type
                print "merchant", click_track.merchant
                print "coupon", click_track.coupon
                print "merchant_domain", click_track.merchant_domain
                print merchant.name, merchant.id
            except:
                pass
            print_stack_trace()
    except:
        print_stack_trace()
def update_coupons(self):
    for coupon in self.coupons:
        try:
            coupon.update()
        except:
            print_stack_trace()
def _get_image(user, image_url, specific_height=-1, specific_width=-1):
    """
    Returns the image at image_url at the specific height and width.
    If the image is not locally buffered, downloads the original image at the url.
    If a specific height and width are set, also creates a resized version of the image.
    """
    def _download_image_to_local(src_image_url, src_image_pointer, height, width):
        prefix, ext = os.path.splitext(urlparse.urlparse(src_image_url).path)
        prefix = prefix.replace('/', '').replace('\\', '')
        prefix = url2path(prefix)
        filename = '%s_%s%s' % (uuid.uuid4().hex, uuid.uuid4().hex, ext)
        local_copy = os.path.join(settings.IMAGE_LOCAL_COPY_DIR, filename)
        local_url = os.path.join(settings.IMAGE_LOCAL_COPY_DIR_NO_PREFIX, filename)
        file_saved_path = local_copy
        if height == -1:
            # ensure the remote file exists
            assert src_image_pointer.status_code == 200
            # ensure it is an image
            content_type = src_image_pointer.headers['content-type'].lower()
            if content_type[:5] != 'image':
                # cloudfront servers typically host images as octet-streams;
                # we need to handle that
                assert 'application/octet-stream' in content_type
                # in these cases we don't allow files greater than 700KB -> 716800 = 700*1024
                assert int(src_image_pointer.headers.get('content-length', 0)) < 716800
            if src_image_pointer.status_code == 200:
                # download remote file
                with open(local_copy, 'wb') as fh:
                    fh.write(src_image_pointer.content)
            else:
                raise Http404()
        else:
            img_util.resize(src_image_pointer, (specific_width, specific_height), True, local_copy)
        # s3_url = s3.upload(file_saved_path)
        # store reference in imagestore
        # img = ImageStore(remote_url=image_url, local_url=s3_url, source_user=user, height=height, width=width)
        img = ImageStore(remote_url=image_url, local_url="/%s" % local_url,
                         source_user=user, height=height, width=width)
        img.save()
        return img

    def _download_temp_image(src_image_url, src_image_pointer, height, width):
        prefix, ext = os.path.splitext(urlparse.urlparse(src_image_url).path)
        prefix = prefix.replace('/', '').replace('\\', '')
        prefix = url2path(prefix)
        filename = '%s_%s%s' % (uuid.uuid4().hex, uuid.uuid4().hex, ext)
        local_copy = os.path.join(settings.IMAGE_LOCAL_COPY_DIR, filename)
        local_url = os.path.join(settings.IMAGE_LOCAL_COPY_DIR_NO_PREFIX, filename)
        if height == -1:
            # ensure the remote file exists
            assert src_image_pointer.status_code == 200
            # ensure it is an image
            content_type = src_image_pointer.headers['content-type'].lower()
            if content_type[:5] != 'image':
                # cloudfront servers typically host images as octet-streams;
                # we need to handle that
                assert 'application/octet-stream' in content_type
                # in these cases we don't allow files greater than 700KB -> 716800 = 700*1024
                assert int(src_image_pointer.headers.get('content-length', 0)) < 716800
            if src_image_pointer.status_code == 200:
                # download remote file
                with open(local_copy, 'wb') as fh:
                    fh.write(src_image_pointer.content)
            else:
                raise Http404()
        return local_url

    if user.is_anonymous() or not user.is_authenticated():
        try:
            user = User.objects.get(username=IMAGE_ANONYMOUS_USER)
        except:
            print_stack_trace()
            user = User.objects.create_user(
                username=IMAGE_ANONYMOUS_USER, email='*****@*****.**',
                password=IMAGE_ANONYMOUS_USER)

    image_url = urllib.unquote_plus(image_url)
    original_image = None

    # check if the image already exists
    image_url_to_check = image_url
    if ShortenedURL.objects.should_shorten_url(image_url):
        image_url_to_check = ShortenedURL.objects.shorten_url(image_url).shortened_url
    for image in ImageStore.objects.filter(remote_url=image_url_to_check):
        if image.height == specific_height and image.width == specific_width:
            # found the image with the exact dimensions
            return image
        if image.height == image.width == -1:
            # found the original image
            original_image = image

    if not original_image:
        # original image not available; download it
        original_image = _download_image_to_local(image_url, requests.get(image_url), -1, -1)
    else:
        original_image.local_url = _download_temp_image(image_url, requests.get(image_url), -1, -1)

    if specific_height == specific_width == -1:
        # os.remove(os.path.join(settings.IMAGE_LOCAL_COPY_DIR, original_image.local_url[1:].split('/')[-1]))
        return original_image
    else:
        # required size is not available; resize the original image
        resized_image = _download_image_to_local(
            image_url,
            os.path.join(settings.IMAGE_LOCAL_COPY_DIR, original_image.local_url[1:].split('/')[-1]),
            specific_height, specific_width)
        # os.remove(os.path.join(settings.IMAGE_LOCAL_COPY_DIR, resized_image.local_url[1:].split('/')[-1]))
        # os.remove(os.path.join(settings.IMAGE_LOCAL_COPY_DIR, original_image.local_url[1:].split('/')[-1]))
        return resized_image
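A minimal usage sketch for _get_image, assuming a Django view that proxies remote images through the local ImageStore cache (the view name and query parameters are hypothetical, not part of the original code):

from django.http import HttpResponseRedirect

def image_proxy(request):
    # hypothetical view: serve a cached, optionally resized copy of a remote image
    image_url = request.GET.get('url', '')
    height = int(request.GET.get('h', -1))  # -1 means "original size"
    width = int(request.GET.get('w', -1))
    img = _get_image(request.user, image_url, specific_height=height, specific_width=width)
    return HttpResponseRedirect(img.local_url)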
def crosscheck_by_field(deals_to_dedup, field_name):
    from core.signals import update_object
    duplicate_deals_list = []  # list of duplicate coupon pks
    if field_name == 'coupon_directlink':
        field_list = list(set([d.directlink for d in deals_to_dedup]))
    elif field_name == 'merchant_name':
        field_list = list(set([d.merchant.name for d in deals_to_dedup]))
    else:
        return
    all_active_deals = len(deals_to_dedup)
    num_of_unique_fields = len(field_list)
    try:
        print "\n...Detected {} deals by '{}' field to dedup out of {} total active deals".format(
            num_of_unique_fields, field_name, all_active_deals), show_time()
    except:
        pass
    progress_count = 1
    clear_cache_timer = 1
    for x in field_list:
        try:
            same_looking_deals = Coupon.all_objects.filter(ref_id_source='sqoot', is_duplicate=False,
                                                           is_deleted=False, online=False)\
                                                   .exclude(end__lt=datetime.now(pytz.utc))
            if field_name == 'coupon_directlink':
                same_looking_deals = same_looking_deals.filter(directlink=x)
            elif field_name == 'merchant_name':
                same_looking_deals = same_looking_deals.filter(merchant__name__contains=x)
            if same_looking_deals.count() <= 1:
                print show_time(), '({}/{}) DEDUP-HARD:'.format(
                    progress_count, num_of_unique_fields), '...no duplicate, skipping...'
                progress_count += 1
                clear_cache_timer += 1
                continue
            try:
                print show_time(), '({}/{}) DEDUP-HARD:'.format(
                    progress_count, num_of_unique_fields), 'all deals with {}=={}'.format(field_name, x)
            except:
                pass
            while True:
                current_count = same_looking_deals.count()
                if current_count == 1:
                    break
                # compare the first deal against every other candidate, then exclude it and repeat
                for c in same_looking_deals[1:current_count]:
                    if c.is_duplicate or (c.pk in duplicate_deals_list):
                        continue
                    does_it_look_duplicate, which_deal = compare_location_between(same_looking_deals[0], c)
                    if not does_it_look_duplicate:
                        continue
                    if which_deal == same_looking_deals[0]:
                        duplicate_deals_list.append(which_deal.pk)
                        break
                    else:
                        duplicate_deals_list.append(which_deal.pk)
                same_looking_deals = same_looking_deals.exclude(pk=same_looking_deals[0].pk)
            progress_count += 1
            clear_cache_timer += 1
            if clear_cache_timer >= 100:
                # flush accumulated duplicates in batches of ~100 fields
                duplicate_deals_list = list(set(duplicate_deals_list))
                Coupon.all_objects.filter(pk__in=duplicate_deals_list).update(is_duplicate=True)
                for coupon in Coupon.all_objects.filter(pk__in=duplicate_deals_list):
                    handle_exceptions(update_object.send(sender=Coupon, instance=coupon))
                    print 'Updated %s' % coupon.id
                duplicate_deals_list = []
                clear_cache_timer = 1
        except:
            try:
                print "!!!ERROR: field: {}".format(x)
            except:
                pass
            print_stack_trace()
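A minimal driver sketch for crosscheck_by_field, assuming the caller collects the active Sqoot coupons first (the queryset filter below mirrors the one inside the function; the two-pass ordering is an assumption, not documented behavior):

# hypothetical driver: dedup active sqoot deals first by direct link, then by merchant name
deals = list(Coupon.all_objects.filter(ref_id_source='sqoot', is_duplicate=False,
                                       is_deleted=False, online=False)
                               .exclude(end__lt=datetime.now(pytz.utc)))
crosscheck_by_field(deals, 'coupon_directlink')
crosscheck_by_field(deals, 'merchant_name')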
def click_track(request, clicked_link_path=None):
    try:
        referer = utils.u_clean(request.META.get('HTTP_REFERER', 'unknown')[:255])
        clicked_link = request.POST["clicked"][:255]
        try:
            clicked_link = clicked_link.lower()
        except:
            print_stack_trace()
        # classify the page the click originated from
        source_url_type = 'landing'
        if re.search('/coupons/[a-z0-9-]+/[a-z0-9-]+/[\d]+/', referer):
            source_url_type = 'coupon'
        elif re.search('/coupons/[a-z0-9-]+/', referer):
            source_url_type = 'company'
        elif re.search('/categories/[A-Za-z0-9-]+/', referer):
            source_url_type = 'category'
        coupon = None
        merchant = None
        if "/coupon/" in clicked_link:
            # skimlinks will assume the source url to be the /coupon/ url
            if clicked_link.endswith("/"):
                coupon_id = clicked_link.split("/")[-2]  # assumes trailing '/'
            else:
                coupon_id = clicked_link.split("/")[-1]
            source_url = clicked_link
            coupon = Coupon.active_objects.get(id=int(coupon_id))
            try:
                merchant = Merchant.objects.get(id=coupon.merchant.id)
            except:
                merchant = None
            target_url = coupon.get_retailer_link()
        else:
            source_url = referer
            target_url = clicked_link
            merchant = None
            if 'go.redirectingat.com' in target_url:
                target_url = _remove_skimlinks(target_url)
        merchant_domain = shorten_to_domain(target_url)
        visitor = Visitor.objects.get(pk=request.session['visitor_id'])
        click_track = ClickTrack()
        click_track.visitor = visitor
        click_track.user_agent = visitor.user_agent[:255]
        click_track.referer = referer[:255]
        click_track.target_url = target_url[:255]
        click_track.source_url_type = source_url_type[:255]
        click_track.source_url = source_url[:255]
        click_track.merchant = merchant
        click_track.coupon = coupon
        click_track.merchant_domain = merchant_domain[:255]
        try:
            click_track.acquisition_source = visitor.acquisition_source
            click_track.acquisition_medium = visitor.acquisition_medium
            click_track.acquisition_term = visitor.acquisition_term
            click_track.acquisition_content = visitor.acquisition_content
            click_track.acquisition_campaign = visitor.acquisition_campaign
            click_track.acquisition_gclid = visitor.acquisition_gclid
        except:
            print_stack_trace()
        try:
            click_track.save()
        except:
            try:
                print "Visitor ID", click_track.visitor
                print "User Agent", click_track.user_agent
                print "Referer", click_track.referer
                print "target_url", click_track.target_url
                print "source_url_type", click_track.source_url_type
                print "merchant", click_track.merchant
                print "coupon", click_track.coupon
                print "merchant_domain", click_track.merchant_domain
                print merchant.name, merchant.id
            except:
                pass
            print_stack_trace()
    except:
        print_stack_trace()
    return success()
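click_track expects a POSTed 'clicked' URL plus the visitor_id that the tracking middleware stores in the session; the '/o/' prefix matches the AJAX carve-out in process_request above. A minimal wiring sketch, assuming a pre-1.8 Django urls.py and that the view lives in a tracking.views module (both hypothetical here):

# urls.py -- hypothetical module path and URL name
from django.conf.urls import patterns, url

urlpatterns = patterns('',
    # the front end POSTs {'clicked': <target url>} here on outbound clicks
    url(r'^o/click-track/$', 'tracking.views.click_track', name='click_track'),
)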