def patch2(patch_data):
    """Apply one patch description to an existing model instance.

    `patch_data` is a mapping. This function reads ``patch_data['model']``
    (the name of an attribute on `models`) and ``patch_data['-key']`` (the
    key list handed to `utils.create_or_update`); the whole mapping is passed
    through to `utils.create_or_update` with ``create=False, update=True``.

    Returns:
        True  -- the update call completed without raising.
        False -- ``Model.DoesNotExist`` was raised (nothing to patch).
        None  -- any other exception was raised; it is logged, not re-raised.
    """
    Model = getattr(models, patch_data['model'])
    key_list = patch_data['-key']
    # Only used in log messages. Presumably the subset of patch_data
    # restricted to key_list -- confirm against `subdict`.
    val_list = subdict(patch_data, key_list)
    try:
        utils.create_or_update(Model, patch_data, key_list, create=False, update=True)
    except Model.DoesNotExist:
        # `warning` is the supported spelling; `warn` is a deprecated alias.
        LOG.warning("%s not found, skipping patch", patch_data['model'])
        return False
    except Exception as err:
        # Deliberate best-effort: log and carry on so one bad patch does not
        # abort the caller. The original fell off the end here; the None
        # result is now explicit.
        LOG.error("unhandled exception attempting to patch %s: %s", val_list, err)
        return None
    else:
        LOG.info("successfully patched %s", val_list)
        return True
def scrape(slug, url, name, title=None):
    """Fetch a page, clean it up, and store its main content as a QuickPage.

    Steps, in order:
      1. Download `url` and run it through `tidy_document`.
      2. Parse with html5lib (lxml tree builder) and select ``td#content``.
      3. Replace every ``<img src>`` inside that cell with whatever
         `getimage` returns for it.
      4. Print a 60-character preview and, when the module flag `dbteeth`
         is truthy, create or update the `QuickPage` keyed by `slug`.

    Args:
        slug: page slug; its first ``/``-separated component is also passed
            to `getimage`.
        url: address to download.
        name: stored as the page name, and as the title when `title` is empty.
        title: optional page title.

    Raises:
        AssertionError: the document does not contain exactly one
            ``td#content`` element.
    """
    f = urlopen(url)
    try:
        doc = f.read()
    finally:
        # The response was previously never closed.
        f.close()

    doc, errs = tidy_document(
        doc,
        options={
            "output-html": 1,
            "clean": 1,
            "drop-font-tags": 1,
        },
    )
    if errs:
        # Tidy diagnostics are reported but do not stop the scrape.
        print(errs)

    # NOTE: passing encoding='utf-8' to html5lib.parse did not work here.
    doc = html5lib.parse(doc, treebuilder="lxml")
    html.xhtml_to_html(doc)
    jQuery = PyQuery([doc])

    td = jQuery("td#content")
    # Raised explicitly (same exception type as the former `assert`) so the
    # check still runs under `python -O`.
    if len(td) != 1:
        raise AssertionError("expected exactly one td#content, found %d" % len(td))

    slug_root = slug.split("/")[0]  # loop-invariant, computed once
    for img in td("img"):
        img = PyQuery(img)
        rslt = getimage(img.attr("src"), slug_root)
        img.attr("src", rslt)
        if trace:
            print(rslt)

    # TODO: links inside the content still need fixing.
    content = no_namespaces(PyQuery(td[0]).html())
    print("%s %s" % (slug, content[:60]))

    if dbteeth:
        qp, created = create_or_update(
            QuickPage,
            keys=dict(slug=slug),
            fields=dict(
                name=name,
                title=title or name,
                content=content,
            ),
        )
def scrape(slug, url, name, title=None):
    """Download `url`, tidy it, localize its images and save it as a QuickPage.

    The page is cleaned with `tidy_document`, parsed by html5lib into an lxml
    tree, and the single ``td#content`` cell is extracted. Each image ``src``
    in that cell is replaced by the value `getimage` returns. A short preview
    is printed; the page is written through `create_or_update` only when the
    module flag `dbteeth` is truthy.

    Args:
        slug: page slug (its first ``/``-separated part is given to `getimage`).
        url: address to fetch.
        name: page name; also the title fallback.
        title: optional title; `name` is used when it is empty.

    Raises:
        AssertionError: there is not exactly one ``td#content`` element.
    """
    f = urlopen(url)
    try:
        doc = f.read()
    finally:
        f.close()  # previously leaked: the response was never closed

    tidy_options = {
        'output-html': 1,
        'clean': 1,
        'drop-font-tags': 1,
    }
    doc, errs = tidy_document(doc, options=tidy_options)
    if errs:
        # Report tidy diagnostics but keep going.
        print(errs)

    # NOTE: encoding='utf-8' as an html5lib.parse argument did not work here.
    doc = html5lib.parse(doc, treebuilder='lxml')
    html.xhtml_to_html(doc)
    jQuery = PyQuery([doc])

    td = jQuery('td#content')
    # Explicit raise instead of `assert` so the check survives `python -O`;
    # the exception type callers see is unchanged.
    if len(td) != 1:
        raise AssertionError('expected exactly one td#content, found %d' % len(td))

    slug_root = slug.split('/')[0]  # hoisted out of the image loop
    for img in td('img'):
        img = PyQuery(img)
        src = img.attr('src')
        rslt = getimage(src, slug_root)
        img.attr('src', rslt)
        if trace:
            print(rslt)

    # TODO: links inside the content still need fixing.
    content = PyQuery(td[0])
    content = no_namespaces(content.html())
    print('%s %s' % (slug, content[:60]))

    if dbteeth:
        qp, created = create_or_update(
            QuickPage,
            keys=dict(slug=slug),
            fields=dict(
                name=name,
                title=title or name,
                content=content,
            ),
        )
def load(csvfile):
    """Load the by-city tax table (~2600 entries) into `CataxByCity`.

    `csvfile` is handed to `reader`; the first row is treated as column
    headings and skipped. Every following row must have exactly three
    columns: city, tax, county.

    Asterisks are stripped from the city name before it is used as the key.
    If `create_or_update` raises `DuplicateException` for that key, the row
    is retried under the name ``"<city> (<county> County)"``.

    One "Created:" or "Updated:" line is printed per row.
    """
    rows = reader(csvfile)
    next(rows)  # skip over column headings
    for city, tax, county in rows:
        unique_city = city.replace('*', '')
        try:
            instance, created = create_or_update(
                CataxByCity, dict(city=unique_city), dict(tax=tax, county=county))
        except DuplicateException:
            # The original fallback could never run: the format string
            # '%s %(%s County)' is malformed and `row` is not defined here.
            # NOTE(review): the fallback now uses the star-stripped name,
            # matching the key of the first attempt -- confirm intent.
            unique_city = '%s (%s County)' % (unique_city, county)
            instance, created = create_or_update(
                CataxByCity, dict(city=unique_city), dict(tax=tax, county=county))
        if created:
            print('Created: %s %s' % (unique_city, tax))
        else:
            print('Updated: %s %s' % (unique_city, tax))
def load2(csvfile):
    """Load the normalized by-county-rate table (~110 entries) into `Catax`.

    `csvfile` is handed to `reader`; the first row is skipped as headings and
    each remaining row must be (city, rate, county).

    Rows are grouped as ``{county: {rate: "city, city, ..."}}``. For each
    county the rate shared by the most cities is stored under the bare county
    name; every other rate is stored under ``"<county> (<cities>)"``.

    Nothing is written unless the module flag `dbteeth` is truthy. A failing
    `create_or_update` is printed and skipped so the rest still loads.
    """
    rows = reader(csvfile)
    next(rows)  # skip over column headings

    # First build a dict tree by county and tax rate, listing cities.
    catax = {}
    for city, rate, county in rows:
        city = re.sub(r'(\(.*\))', '', city)  # drop any parenthesized note
        city = city.strip().replace('*', '')
        county = county.strip().replace('*', '')
        rate = '%4g' % float(rate.strip('% '))
        by_rate = catax.setdefault(county, {})
        if rate in by_rate:
            by_rate[rate] += ', ' + city
        else:
            by_rate[rate] = city
    if trace:
        pprint(catax)

    # Now count cities per rate: the county default is the rate with the
    # highest city count, and the remaining rates are listed as exceptions.
    for county, ratesdict in catax.items():
        rates = [[rate, len(cities.split(',')), cities]
                 for (rate, cities) in ratesdict.items()]
        rates.sort(key=lambda r: r[1], reverse=True)
        rates[0].append(county)  # the default entry is named after the county
        for r in rates[1:]:
            r.append('%s (%s)' % (county, r[2]))

        if not dbteeth:
            continue
        for rate, city_count, cities, name in rates:
            try:
                instance, created = create_or_update(
                    Catax, dict(name=name),
                    dict(tax=rate, county=county, cities=cities, count=city_count))
            except Exception as e:
                print('%s %s %s %s %s %s' % (e, name, rate, county, cities, city_count))
                # `created` is unbound (first failure) or stale (later ones)
                # here, so do not fall through to the status report.
                continue
            if created:
                print('Created: %s %s' % (name, rate))
            else:
                print('Updated: %s %s' % (name, rate))
# row ['notes'] = row ['notes'].encode ('utf8', 'ignore') row["notes"] = unicode(row["notes"], "utf8", "ignore") if not row["shipprice"]: row.pop("shipprice") if not order_id.isdigit(): print "NOT imported:", order_id continue # row ['notes'] += 'Original order was %s - imported attached to order #1 as a catchall - JJW' # order_id = 1 # elif int (order_id) <25300: # continue try: ol, created = create_or_update(ImportedOrderLine, dict(line=line, order_id=order_id), row) # print ol, created except Exception, e: print e from pprint import pprint pprint(row) raise continue # old: l = ImportedOrder(id=line.pop("id")) o.shiprate = line.pop("shiprate") o.shipcost = line.pop("shipcost") o.ordernum = line.pop("ordernum")
def scrape_redirect(fm, to):
    """Create or update the site-1 `Redirect` from `fm` to `to` and print it.

    Args:
        fm: old path; with ``site_id=1`` it forms the lookup keys.
        to: new path stored on the redirect.
    """
    lookup = {"old_path": fm, "site_id": 1}
    changes = {"new_path": to}
    redirect, _was_created = create_or_update(Redirect, keys=lookup, fields=changes)
    print(redirect)
def load2(csvfile):
    """Load the normalized by-county-rate table (~110 entries).

    `csvfile` is handed to `reader`; the first row is skipped as headings and
    each remaining row must be (city, rate, county).

    Rows are grouped as ``{county: {rate: "city, city, ..."}}``. For each
    county the rate shared by the most cities becomes the county default
    (named after the county); every other rate is an exception named
    ``"<county> (<cities>)"``.

    Side effects, controlled by module flags:
      * `dbteeth` -- create or update one `Catax` row per (county, rate).
      * `teeth`   -- write the legacy ``calist`` literal to ``catax.py``.
      * `trace`   -- pretty-print the intermediate structures.

    A one-line summary of city, county and rate counts is always printed.
    """
    rows = reader(csvfile)
    next(rows)  # skip over column headings

    # First build a dict tree by county and tax rate, listing cities.
    catax = {}
    total_cities = 0
    for city, rate, county in rows:
        total_cities += 1
        city = re.sub(r'(\(.*\))', '', city)  # drop any parenthesized note
        city = city.strip().replace('*', '')
        county = county.strip().replace('*', '')
        rate = '%4g' % float(rate.strip('% '))
        by_rate = catax.setdefault(county, {})
        if rate in by_rate:
            by_rate[rate] += ', ' + city
        else:
            by_rate[rate] = city
    if trace:
        pprint(catax)

    # Now count cities and build the minimized list: the county default is
    # the rate with the highest city count, then exceptions listed by city.
    catax2 = []
    for county, ratesdict in catax.items():
        rates = [[rate, len(cities.split(',')), cities]
                 for (rate, cities) in ratesdict.items()]
        rates.sort(key=lambda r: r[1], reverse=True)
        catax2.append((county, '', rates[0][0], rates[0][1]))
        rates[0].append(county)  # the default entry is named after the county
        for r in rates[1:]:
            catax2.append((county, r[2], r[0], r[1]))
            r.append('%s (%s)' % (county, r[2]))

        if dbteeth:
            # The loop variable used to be called `count`, which overwrote
            # the running city total reported in the summary below.
            for rate, city_count, cities, name in rates:
                instance, created = create_or_update(
                    Catax, dict(name=name),
                    dict(tax=rate, county=county, cities=cities, count=city_count))
                if created:
                    print('Created: %s %s' % (name, rate))
                else:
                    print('Updated: %s %s' % (name, rate))

    catax2.sort()
    if trace:
        pprint(catax2)
    print('Org len (total CA cities): %s Counties: %s Unique rates: %s'
          % (total_cities, len(catax), len(catax2)))

    # Now create the legacy catax.calist for Zope, as needed.
    if teeth:
        calist = [('', 'Select CA County')]
        for county, city, tax, _city_count in catax2:
            tax = float(tax)
            if city:
                val = county + ' (%s)' % city
                key = '%s.%s' % (county, city.replace(' ', '').replace(',', '_'))
            else:
                val = county
                key = county
            val = val + ' %.4g%%' % (tax)
            nam = "(%s,'%s')" % (tax, key)
            calist.append((nam, val))
        # `with` closes the file even if the write fails.
        with open('catax.py', 'w') as f:
            f.write('calist=' + pformat(calist))
row['notes'] = unicode(row['notes'], 'utf8', 'ignore') if not row['shipprice']: row.pop('shipprice') if not order_id.isdigit(): print 'NOT imported:', order_id continue #row ['notes'] += 'Original order was %s - imported attached to order #1 as a catchall - JJW' #order_id = 1 #elif int (order_id) <25300: # continue try: ol, created = create_or_update(ImportedOrderLine, dict(line=line, order_id=order_id), row) #print ol, created except Exception, e: print e from pprint import pprint pprint(row) raise continue # old: l = ImportedOrder(id=line.pop('id')) o.shiprate = line.pop('shiprate') o.shipcost = line.pop('shipcost') o.ordernum = line.pop('ordernum')
def update_ca_tax(self):
    """Download the CA city-rate CSV and refresh the by-county-rate table.

    Lines are read from the hard-coded URL. Everything up to and including
    the first line containing "City" is treated as headings. Each following
    line is split on commas into (city, rate, county); reading stops at the
    first line that does not have exactly three fields.

    Rows are grouped as ``{county: {rate: "city, city, ..."}}``. For each
    county the rate shared by the most cities becomes the county default
    (named after the county); every other rate is an exception named
    ``"<county> (<cities>)"``.

    Side effects, controlled by module flags:
      * `dbteeth` -- create or update one `Catax` row per (county, rate).
      * `teeth`   -- write the legacy ``calist`` literal to ``./test.py``.
      * `trace`   -- pretty-print the intermediate structures.

    Raises:
        ValueError: the download ends before any heading line is found.
    """
    url = "http://www.boe.ca.gov/sutax/files/city_rates.csv"
    data = urllib2.urlopen(url)
    try:
        # Skip over the column headings.
        while True:
            each_line = data.readline()
            if not each_line:
                # readline() returns '' at EOF; without this check a
                # response lacking a "City" heading looped forever.
                raise ValueError('no "City" heading line found in %s' % url)
            if "City" in each_line:
                break

        # First build a dict tree by county and tax rate, listing cities.
        catax = {}
        total_cities = 0
        while True:
            each_line = data.readline()
            words = each_line.split(",")
            if len(words) != 3:
                break  # EOF or a line that is not city,rate,county
            city, rate, county = words
            total_cities += 1
            city = re.sub(r'(\(.*\))', '', city)  # drop any parenthesized note
            city = city.strip().replace('*', '')
            county = county.strip().replace('*', '')
            rate = '%4g' % float(rate.strip('% '))
            by_rate = catax.setdefault(county, {})
            if rate in by_rate:
                by_rate[rate] += ', ' + city
            else:
                by_rate[rate] = city
    finally:
        # The response was previously never closed.
        data.close()
    if trace:
        pprint(catax)

    # The county default is the rate with the highest city count; the
    # remaining rates are listed as exceptions with their cities.
    catax2 = []
    for county, ratesdict in catax.items():
        rates = [[rate, len(cities.split(',')), cities]
                 for (rate, cities) in ratesdict.items()]
        rates.sort(key=lambda r: r[1], reverse=True)
        catax2.append((county, '', rates[0][0], rates[0][1]))
        rates[0].append(county)  # the default entry is named after the county
        for r in rates[1:]:
            catax2.append((county, r[2], r[0], r[1]))
            r.append('%s (%s)' % (county, r[2]))

        if dbteeth:
            # The loop variable used to be called `count`, which overwrote
            # the running city total reported in the summary below.
            for rate, city_count, cities, name in rates:
                instance, created = create_or_update(
                    Catax, dict(name=name),
                    dict(tax=rate, county=county, cities=cities, count=city_count))
                if created:
                    print('Created: %s %s' % (name, rate))
                else:
                    print('Updated: %s %s' % (name, rate))

    catax2.sort()
    if trace:
        pprint(catax2)
    print('Org len (total CA cities): %s Counties: %s Unique rates: %s'
          % (total_cities, len(catax), len(catax2)))

    # Now create the legacy catax.calist for Zope, as needed.
    if teeth:
        calist = [('', 'Select CA County')]
        for county, city, tax, _city_count in catax2:
            tax = float(tax)
            if city:
                val = county + ' (%s)' % city
                key = '%s.%s' % (county, city.replace(' ', '').replace(',', '_'))
            else:
                val = county
                key = county
            val = val + ' %.4g%%' % (tax)
            nam = "(%s,'%s')" % (tax, key)
            calist.append((nam, val))
        # `with` closes the file even if the write fails.
        with open("./test.py", 'w') as f:
            f.write('calist=' + pformat(calist))
def scrape_redirect(fm, to):
    """Point the old path `fm` at `to` for site 1, then print the redirect.

    The `Redirect` is looked up by ``old_path``/``site_id`` and created or
    updated through `create_or_update` with the new path.
    """
    result = create_or_update(
        Redirect,
        keys={'old_path': fm, 'site_id': 1},
        fields={'new_path': to},
    )
    redirect, _was_created = result
    print(redirect)