quantity = int(re.sub('[a-z,]+', '', temp[0][0])) else: quantity = 1 except: units = "" amount = 0.0 quantity = 0 category = 1 + categories.index(t[2]) # Custom stuff if 'benzocaine' in t[0]: category = 1 + categories.index('custom.benzocaine') if 'coca leave' in t[0]: category = 1 + categories.index('custom.cocaleaves') write_cur.execute( "INSERT INTO listings VALUES('{0}', '{1}', {2}, {3}, {4}, '{5}', {6}, {7}, '{8}')" .format(t[0], t[1], category, 1 + ships_from.index(t[3]), 1 + ships_to.index(t[4]), units, amount, quantity, t[5])) buf = buf + 1 if (buf > buffer_limit): buf = 0 write.commit() count = count + 1 update_progress(count, tot_count) # Clean out to just get titles titles = [t[0] for t in titles]
# Walk every directory in `dirs` and, for each file not already listed in
# `existing_files`, strip its inline CSS and write the result to
# raw_by_site/<market>/<dest_dir>/<file name>.
count = 0

# Compiled once instead of per file.  DOTALL lets `.` span newlines so that
# multi-line <style> blocks are removed as well, and the raw string avoids
# the invalid "\/" escape of the original pattern.
css_pattern = re.compile(
    r'(<style ?type ?= ?"text/css">)(.*?)(</style>)', re.DOTALL)

for d in dirs:
    for root, dirnames, filenames in os.walk(d):
        for f in filenames:
            if f not in existing_files:
                try:
                    # No idea why I have to do this!
                    # NOTE(review): presumably `destinations` is consumed or
                    # mutated by iteration -- confirm before removing the copy.
                    dests = copy.deepcopy(destinations)

                    # Identify the final destination of this file.  There is
                    # no `break`, so the last matching pattern wins; files
                    # matching nothing go to 'remaining'.
                    dest_dir = 'remaining'
                    for dest in dests:
                        dp = dest[0]
                        mp = dest[1]
                        if mp in f:
                            dest_dir = dp

                    # Read it, remove CSS, and write it out
                    with open(os.path.join(root, f)) as in_file:
                        value = in_file.read()
                    value = css_pattern.sub('', value)
                    out_path = os.path.join('raw_by_site', market, dest_dir, f)
                    with open(out_path, "w") as out_file:
                        out_file.write(value)
                except Exception:
                    # Best effort: one unreadable/unwritable file must not
                    # abort the run.  `Exception` (rather than a bare except)
                    # still lets KeyboardInterrupt/SystemExit propagate.
                    print_progress("Warning, read/write error.")
            # NOTE(review): progress is advanced for every file, including
            # skipped ones -- confirm this matches how tot_count is computed.
            count = count + 1
            update_progress(count, tot_count)

print_progress("Finished sorting " + market)
except lite.Error as e: print_progress("Failed to clean " + market + " listings, error %s:" % e.args[0]) size = len([name for name in os.listdir(path)]) count = 1 tot_scraped = 0 try: con = lite.connect(output_path + output_file) buf = 0 for f in listdir(path): # Update the progress update_progress(count, size) count = count + 1 # Load the file into a string with open(path + f, "r") as file: file_string = file.read() # Parse the HTML tree = html.fromstring(file_string) name = tree.xpath('//div/strong/text()') if len(name) != 1: continue else: name = name[0]
con.cursor().execute("CREATE TABLE vendors( dat INT, name TEXT, rating TEXT, ratings TEXT )") except lite.Error as e: print_progress("Failed to clean " + market + " listings, error %s:" % e.args[0]) size = len([name for name in os.listdir(path)]) count = 1 tot_scraped = 0 try: con = lite.connect(output_path + output_file) buf = 0; for f in listdir(path): # Update the progress update_progress(count, size) count = count + 1 # Load the file into a string with open(path + f, "r") as file: file_string = file.read() # Parse the HTML tree = html.fromstring(file_string) name = tree.xpath('//div/strong/text()') if len(name) != 1: continue else: name = name[0]