Пример #1
0
def patch2(patch_data):
    """Apply a patch to an existing model instance; never create one.

    ``patch_data`` must contain a ``'model'`` entry naming an attribute of
    ``models`` and a ``'-key'`` entry listing the keys that identify the row.
    The whole dict is handed to ``utils.create_or_update`` with
    ``create=False, update=True``.

    Returns:
        True when the update call succeeded, False when the model raised
        ``DoesNotExist``, and None when any other exception was raised (the
        error is logged, not propagated).
    """
    Model = getattr(models, patch_data['model'])
    key_list = patch_data['-key']
    # Only used for log messages: the identifying subset of the patch.
    val_list = subdict(patch_data, key_list)
    try:
        utils.create_or_update(Model, patch_data, key_list, create=False, update=True)
    except Model.DoesNotExist:
        # ``warn`` is a deprecated alias of ``warning`` on stdlib loggers.
        LOG.warning("%s not found, skipping patch", patch_data['model'])
        return False
    except Exception as err:
        LOG.error("unhandled exception attempting to patch %s: %s", val_list, err)
        # Deliberately distinct from the "not found" result above.
        return None
    LOG.info("successfully patched %s", val_list)
    return True
Пример #2
0
def scrape(slug, url, name, title=None):
    f = urlopen(url)
    doc = f.read()

    doc, errs = tidy_document(
        doc,
        options={
            "output-html": 1,
            #'indent':1,
            "clean": 1,
            "drop-font-tags": 1,
        },
    )
    if errs:
        # raise Exception, errs
        print errs

    doc = html5lib.parse(doc, treebuilder="lxml")  # this didn't work, but above three lines did: encoding='utf-8',
    html.xhtml_to_html(doc)
    jQuery = PyQuery([doc])

    td = jQuery("td#content")
    assert len(td) == 1

    for img in td("img"):
        # print 'img:', PyQuery (img)
        img = PyQuery(img)
        src = img.attr("src")
        # alt = img.attr('alt')

        # if src.startswith ('/image'):
        rslt = getimage(src, slug.split("/")[0])
        img.attr("src", rslt)
        if trace:
            print rslt

    # td =
    # no_fonts (td)

    # need to fix links here

    content = PyQuery(td[0])
    # content = content.html()
    content = no_namespaces(content.html())

    print slug, content[:60]  # .html()  # [:60]

    if dbteeth:
        # q, created = QuickPage.objects.get_or_create (

        qp, created = create_or_update(
            QuickPage,
            keys=dict(slug=slug),
            fields=dict(
                name=name,
                title=title if title else name,
                content=content,
                # defaults = dict (sortorder = sortorder),
            ),
        )
Пример #3
0
def scrape(slug, url, name, title=None):
    f = urlopen(url)
    doc = f.read()

    doc, errs = tidy_document(
        doc,
        options={
            'output-html': 1,
            #'indent':1,
            'clean': 1,
            'drop-font-tags': 1,
        })
    if errs:
        #raise Exception, errs
        print errs

    doc = html5lib.parse(
        doc, treebuilder='lxml'
    )  # this didn't work, but above three lines did: encoding='utf-8',
    html.xhtml_to_html(doc)
    jQuery = PyQuery([doc])

    td = jQuery('td#content')
    assert len(td) == 1

    for img in td('img'):
        #print 'img:', PyQuery (img)
        img = PyQuery(img)
        src = img.attr('src')
        #alt = img.attr('alt')

        #if src.startswith ('/image'):
        rslt = getimage(src, slug.split('/')[0])
        img.attr('src', rslt)
        if trace: print rslt

    #td =
    #no_fonts (td)

    # need to fix links here

    content = PyQuery(td[0])
    #content = content.html()
    content = no_namespaces(content.html())

    print slug, content[:60]  #.html()  # [:60]

    if dbteeth:
        #q, created = QuickPage.objects.get_or_create (

        qp, created = create_or_update(
            QuickPage,
            keys=dict(slug=slug),
            fields=dict(
                name=name,
                title=title if title else name,
                content=content,
                #defaults = dict (sortorder = sortorder),
            ))
Пример #4
0
def load (csvfile):  # for a by-city table with ~2600 entries
    rows = reader (csvfile)
    rows.next()  # skip over column headings

    for city, tax, county in rows:
        unique_city = city.replace ('*','')

        try:
            instance, created = create_or_update (CataxByCity, dict (city=unique_city), dict (tax=tax, county=county))
        except DuplicateException, e:
            unique_city = '%s %(%s County)' % (city, row.county)
            instance, created = create_or_update (CataxByCity, dict(city=unique_city), dict (tax=tax, county=county))

        if created:
            print 'Created:', unique_city, tax
        else:
            print 'Updated:', unique_city, tax
Пример #5
0
def load (csvfile):  # for a by-city table with ~2600 entries
    rows = reader (csvfile)
    rows.next()  # skip over column headings

    for city, tax, county in rows:
        unique_city = city.replace ('*','')

        try:
            instance, created = create_or_update (CataxByCity, dict (city=unique_city), dict (tax=tax, county=county))
        except DuplicateException, e:
            unique_city = '%s %(%s County)' % (city, row.county)
            instance, created = create_or_update (CataxByCity, dict(city=unique_city), dict (tax=tax, county=county))

        if created:
            print 'Created:', unique_city, tax
        else:
            print 'Updated:', unique_city, tax
Пример #6
0
def load2 (csvfile):  # loads the normalized-by-county-rate table, ~110 entries
    rows = reader (csvfile)
    rows.next()  # skip over column headings


    #### First build unique dict tree by county & tax rate, listing cities:

    catax = {}
    count = 0

    for city, rate, county in rows:
        count += 1
        city = re.sub('(\(.*\))', '', city)
        city = city.strip().replace ('*', '')
        county = county.strip().replace ('*', '')
        rate = '%4g' % float (rate.strip('% '))

        if county in catax:
            if rate in catax [county]:
                catax [county][rate] += ', ' + city
            else:
                catax [county][rate] = city
        else:
            catax [county] = { rate:city }

    if trace:
        pprint (catax)


    #### Now count cities and build minimized list - county default is highest city count, then exceptions listed with cities

    catax2 = []

    for county, ratesdict in catax.items():
        #print len (ratesdict)
        rates = [[rate, len (cities.split(',')), cities] for (rate, cities) in ratesdict.items()]
        rates.sort(key=lambda r: r[1], reverse=1)
        #print rates
        catax2 += [(county, '', rates [0][0], rates [0][1])]
        rates [0].append (county)  # append name

        for r in rates [1:]:
            catax2 += [(county, r [2], r [0], r [1])]
            r.append ('%s (%s)' % (county, r [2]))

        if dbteeth:
            for rate, count, cities, name in rates:
                #for (rate, cities) in ratesdict.items()

                try:
                    instance, created = create_or_update (Catax, dict(name=name), dict (tax=rate, county=county, cities=cities, count=count))
                except Exception, e:
                    print e, name, rate, county, cities, count

                if created:
                    print 'Created:', name, rate
                else:
                    print 'Updated:', name, rate
        # row ['notes'] = row ['notes'].encode ('utf8', 'ignore')
        row["notes"] = unicode(row["notes"], "utf8", "ignore")

        if not row["shipprice"]:
            row.pop("shipprice")

        if not order_id.isdigit():
            print "NOT imported:", order_id
            continue
            # row ['notes'] += 'Original order was %s - imported attached to order #1 as a catchall - JJW'
            # order_id = 1
        # elif int (order_id) <25300:
        #    continue

        try:
            ol, created = create_or_update(ImportedOrderLine, dict(line=line, order_id=order_id), row)
            # print ol, created
        except Exception, e:
            print e
            from pprint import pprint

            pprint(row)
            raise

        continue

        # old:
        l = ImportedOrder(id=line.pop("id"))
        o.shiprate = line.pop("shiprate")
        o.shipcost = line.pop("shipcost")
        o.ordernum = line.pop("ordernum")
Пример #8
0
def scrape_redirect(fm, to):
    redir, created = create_or_update(Redirect, keys=dict(old_path=fm, site_id=1), fields=dict(new_path=to))
    print redir
Пример #9
0
def load2 (csvfile):  # loads the normalized-by-county-rate table, ~110 entries
    rows = reader (csvfile)
    rows.next()  # skip over column headings


    #### First build unique dict tree by county & tax rate, listing cities:

    catax = {}
    count = 0

    for city, rate, county in rows:
        count += 1
        city = re.sub('(\(.*\))', '', city)
        city = city.strip().replace ('*', '')
        county = county.strip().replace ('*', '')
        rate = '%4g' % float (rate.strip('% '))

        if county in catax:
            if rate in catax [county]:
                catax [county][rate] += ', ' + city
            else:
                catax [county][rate] = city
        else:
            catax [county] = { rate:city }

    if trace:
        pprint (catax)


    #### Now count cities and build minimized list - county default is highest city count, then exceptions listed with cities

    catax2 = []

    for county, ratesdict in catax.items():
        #print len (ratesdict)
        rates = [[rate, len (cities.split(',')), cities] for (rate, cities) in ratesdict.items()]
        rates.sort(key=lambda r: r[1], reverse=1)
        #print rates
        catax2 += [(county, '', rates [0][0], rates [0][1])]
        rates [0].append (county)  # append name

        for r in rates [1:]:
            catax2 += [(county, r [2], r [0], r [1])]
            r.append ('%s (%s)' % (county, r [2]))

        if dbteeth:
            for rate, count, cities, name in rates:
                #for (rate, cities) in ratesdict.items()

                instance, created = create_or_update (Catax, dict(name=name), dict (tax=rate, county=county, cities=cities, count=count))

                if created:
                    print 'Created:', name, rate
                else:
                    print 'Updated:', name, rate


    catax2.sort()
    if trace:
        pprint (catax2)
        #print "Org len (total CA cities):", len (catax.items()), "Normalized:", len (catax2)
        print "Org len (total CA cities):", count, "Counties:", len (catax), "Unique rates:", len (catax2)


    #### Now create legacy catax.calist for Zope, as needed

    if teeth:
        calist = [ ('', 'Select CA County') ]

        for (county, city, tax, count) in catax2:
            tax = float (tax) #+ 1.25

            if city:
                val = county + ' (%s)' % city
            else:
                val = county

            val = val + ' %.4g%%' % (tax)

            if city:
                county = '%s.%s' % (county, city.replace(' ','').replace(',','_'))

            nam = "(%s,'%s')" % (tax, county)
            calist.append ( (nam, val) )

        f = open ('catax.py','w')
        f.write ('calist=' + pformat (calist))
        f.close()
Пример #10
0
        row['notes'] = unicode(row['notes'], 'utf8', 'ignore')

        if not row['shipprice']:
            row.pop('shipprice')

        if not order_id.isdigit():
            print 'NOT imported:', order_id
            continue
            #row ['notes'] += 'Original order was %s - imported attached to order #1 as a catchall - JJW'
            #order_id = 1
        #elif int (order_id) <25300:
        #    continue

        try:
            ol, created = create_or_update(ImportedOrderLine,
                                           dict(line=line, order_id=order_id),
                                           row)
            #print ol, created
        except Exception, e:
            print e
            from pprint import pprint
            pprint(row)
            raise

        continue

        # old:
        l = ImportedOrder(id=line.pop('id'))
        o.shiprate = line.pop('shiprate')
        o.shipcost = line.pop('shipcost')
        o.ordernum = line.pop('ordernum')
Пример #11
0
    def update_ca_tax (self):  # loads the normalized-by-county-rate table, ~110 entries
        url="http://www.boe.ca.gov/sutax/files/city_rates.csv"
        data = urllib2.urlopen(url)
        while True:
            each_line = data.readline()
            if "City" in each_line:
                break# skip over column headings
        #### First build unique dict tree by county & tax rate, listing cities:
        catax = {}
        count = 0
        while True:
            each_line = data.readline()
            words = each_line.split(",")
            if len(words)!=3:
                break
            city = words[0]
            rate = words[1]
            county = words[2]
            count += 1
            city = re.sub('(\(.*\))', '', city)
            city = city.strip().replace ('*', '')
            county = county.strip().replace ('*', '')
            rate = '%4g' % float (rate.strip('% '))

            if county in catax:
                if rate in catax [county]:
                    catax [county][rate] += ', ' + city
                else:
                    catax [county][rate] = city
            else:
                catax [county] = { rate:city }

        if trace:
            pprint(catax)
        catax2 = []
        for county, ratesdict in catax.items():
            rates = [[rate, len (cities.split(',')), cities] for (rate, cities) in ratesdict.items()]
            rates.sort(key=lambda r: r[1], reverse=1)
            #print rates
            catax2 += [(county, '', rates [0][0], rates [0][1])]
            rates [0].append (county)  # append name
            for r in rates [1:]:
                catax2 += [(county, r [2], r [0], r [1])]
                r.append ('%s (%s)' % (county, r [2]))

            if dbteeth:
                for rate, count, cities, name in rates:
                        instance, created = create_or_update (Catax, dict(name=name), dict (tax=rate, county=county, cities=cities, count=count))
                        if created:
                            print 'Created:', name, rate
                        else:
                            print 'Updated:', name, rate

        catax2.sort()
        if trace:
            pprint (catax2)
            #print "Org len (total CA cities):", len (catax.items()), "Normalized:", len (catax2)
            print "Org len (total CA cities):", count, "Counties:", len (catax), "Unique rates:", len (catax2)
            #### Now create legacy catax.calist for Zope, as needed
        if teeth:
            calist = [ ('', 'Select CA County') ]
            for (county, city, tax, count) in catax2:
                tax = float (tax) #+ 1.25
                if city:
                    val = county + ' (%s)' % city
                else:
                    val = county
                val = val + ' %.4g%%' % (tax)
                if city:
                    county = '%s.%s' % (county, city.replace(' ','').replace(',','_'))
                nam = "(%s,'%s')" % (tax, county)
                calist.append ( (nam, val) )
            f = open ("./test.py",'w')
            f.write ('calist=' + pformat (calist))
            f.close()
Пример #12
0
def scrape_redirect (fm, to):
    redir, created = create_or_update (Redirect, keys = dict(old_path = fm, site_id=1), fields = dict(new_path = to))
    print redir