quantity = int(re.sub('[a-z,]+', '', temp[0][0]))
        else:
            quantity = 1
    except:
        units = ""
        amount = 0.0
        quantity = 0

    # Translate the category name in t[2] into its 1-based position within
    # `categories`; list.index raises ValueError when the name is absent.
    category = 1 + categories.index(t[2])

    # Custom stuff: a keyword in the title (t[0]) overrides the category.
    # Both checks always run, so when both keywords occur the second wins.
    if 'benzocaine' in t[0]:
        category = 1 + categories.index('custom.benzocaine')
    if 'coca leave' in t[0]:
        category = 1 + categories.index('custom.cocaleaves')

    # Bind the values as parameters rather than formatting them into the SQL
    # text: a quote character inside a title or another text field can then
    # neither break the statement nor inject SQL.
    # NOTE(review): '?' placeholders assume write_cur is a sqlite3 cursor
    # (qmark paramstyle) -- confirm against where write_cur is created.
    write_cur.execute(
        "INSERT INTO listings VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?)",
        (t[0], t[1], category, 1 + ships_from.index(t[3]),
         1 + ships_to.index(t[4]), units, amount, quantity, t[5]))

    # Commit in batches: only once more than buffer_limit rows have been
    # inserted since the previous commit.
    buf += 1
    if buf > buffer_limit:
        buf = 0
        write.commit()
    count += 1
    update_progress(count, tot_count)

# Clean out to just get titles
titles = [t[0] for t in titles]
count = 0
for d in dirs:
    for root, dirnames, filenames in os.walk(d):
        for f in filenames:
            # Skip files that are already listed in existing_files
            if f in existing_files:
                continue

            try:
                # No idea why I have to do this!
                dests = copy.deepcopy(destinations)

                # Identify the final destination of this file: each entry is
                # indexed as (directory, marker); the last entry whose marker
                # occurs in the file name wins, otherwise 'remaining' is used.
                dest_dir = 'remaining'
                for dest in dests:
                    dp = dest[0]
                    mp = dest[1]
                    if mp in f:
                        dest_dir = dp

                # Read it, remove CSS, and write it out
                with open(os.path.join(root, f)) as open_file:
                    value = open_file.read()
                # Raw string: '\/' is not a valid escape in a normal string
                # literal; the pattern itself is unchanged.
                # NOTE(review): '.' does not match newlines without re.DOTALL,
                # so a <style> block spanning several lines is left in place --
                # confirm whether that is intended.
                value = re.sub(r'(<style ?type ?= ?"text/css">)(.*?)(<\/style>)', '', value)
                with open(os.path.join('raw_by_site', market, dest_dir, f), "w") as open_file:
                    open_file.write(value)
            except Exception:
                # Best effort: report and move on to the next file.
                # `Exception` (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate so the run can still be aborted.
                print_progress("Warning, read/write error.")
    # Progress advances once per top-level directory in `dirs`
    count = count + 1
    update_progress(count, tot_count)

print_progress("Finished sorting " + market)
# Example #3
0
except lite.Error as e:
    print_progress("Failed to clean " + market +
                   " listings, error %s:" % e.args[0])

# Number of entries in `path`; passed to update_progress as the total
size = len(os.listdir(path))

# Progress counter starts at 1, the scraped tally at 0
count, tot_scraped = 1, 0
try:
    con = lite.connect(output_path + output_file)

    buf = 0
    for f in listdir(path):

        # Report the current position, then advance the counter
        update_progress(count, size)
        count += 1

        # Read the whole file into memory
        with open(path + f, "r") as handle:
            file_string = handle.read()

        # Parse the markup and collect the text of every <strong> that is a
        # direct child of a <div>
        tree = html.fromstring(file_string)
        name = tree.xpath('//div/strong/text()')

        # Anything other than exactly one match is skipped
        if len(name) != 1:
            continue
        name = name[0]
    # Runs once, after every file has been visited
    con.cursor().execute("CREATE TABLE vendors( dat INT, name TEXT, rating TEXT, ratings TEXT )")
except lite.Error as err:
    # Database errors are reported through print_progress, not re-raised
    print_progress("Failed to clean " + market + " listings, error %s:" % err.args[0])

# Number of entries in `path`; passed to update_progress as the total
size = len(os.listdir(path))

# Progress counter starts at 1, the scraped tally at 0
count, tot_scraped = 1, 0
try:
    con = lite.connect(output_path + output_file)

    buf = 0;
    for f in listdir(path):

        # Update the progress
        update_progress(count, size)
        count = count + 1

        # Load the file into a string
        with open(path + f, "r") as file:
            file_string = file.read()

        # Parse the HTML
        tree = html.fromstring(file_string)

        name = tree.xpath('//div/strong/text()')

        if len(name) != 1:
            continue
        else:
            name = name[0]