) write_cur.execute( 'CREATE TABLE reviews(dat INT, listing INT, review TEXT, val INT, price REAL, scraped_at INT, user_rating REAL, user_deals INT)' ) write.commit() categories = [c[0] for c in categories] ships_from = [c[0] for c in ships_from] ships_to = [c[0] for c in ships_to] # Custom categories categories.append('custom.benzocaine') categories.append('custom.cocaleaves') # Add all the categories print_progress("Writing categories...") for c in categories: write_cur.execute("INSERT INTO categories VALUES('{0}')".format(c)) write.commit() print_progress("Writing shipping locations...") for c in ships_from: write_cur.execute("INSERT INTO ships_from VALUES('{0}')".format(c)) write.commit() print_progress("Writing ship to locations...") for c in ships_to: write_cur.execute("INSERT INTO ships_to VALUES('{0}')".format(c)) write.commit() # Sort all the titles
del temp_doc[d][s] doc_json = temp_doc with open(doc_file, "w") as file: file.write( json.dumps(doc_json, indent=4, sort_keys=True, separators=(',', ': '))) # Complain! bad = re.findall( '\[MISSING', json.dumps(doc_json, indent=4, sort_keys=True, separators=(',', ': '))) if len(bad) > 0: print_progress('Parsed ' + doc_file) print_progress('') print_progress('') print_progress( '------------------------- WARNING! -------------------------') print_progress( str(len(bad)) + ' instances of undocumented columns detected.') print_progress('') print_progress( ' DOCUMENT THEM IMMEDIATELY OR YOU WILL BE PUNISHED') print_progress('') print_progress('') else: print_progress('Parsed ' + doc_file) else:
# Find all directories with abraxas in the title and pipe all files it
# contains to the appropriate directory.
import os
import re
import copy
from update_progress import update_progress
from update_progress import print_progress

print_progress("Sorting and cleaning " + market + "...")

dests = copy.deepcopy(destinations)

# Create the destination folders.  exist_ok=True replaces the original
# os.path.exists() check, which was a check-then-create race.
for d, m in dests:  # m (second element) is unused here, kept for unpacking
    os.makedirs(os.path.join('raw_by_site', market, d), exist_ok=True)
os.makedirs(os.path.join('raw_by_site', market, 'remaining'), exist_ok=True)

print_progress("Finding all existing files... ")

# Collect every filename already sorted under this market, echoing a
# sample name every 1000 files so long runs show progress.
existing_files = []
count = 0
for root, dirnames, filenames in os.walk('raw_by_site/' + market):
    for f in filenames:
        existing_files.append(f)
        count = count + 1
        if (count > 1000):
            print_progress(" ..." + f)
            count = 0
# Remove any stale temp database from a previous run (EAFP: ignore the
# error if the file is not there).
try:
    os.remove(output_path + output_file)
except OSError:
    pass
if not os.path.exists(output_path):
    os.makedirs(output_path)

# Create the output database and its vendors table.
# NOTE(review): on lite.Error this only prints and falls through with no
# table created -- presumably it should abort; confirm intent.
try:
    con = lite.connect(output_path + output_file)
    con.cursor().execute(
        "CREATE TABLE vendors( dat INT, name TEXT, rating TEXT, ratings TEXT )"
    )
except lite.Error as e:
    print_progress("Failed to clean " + market +
                   " listings, error %s:" % e.args[0])

# Number of raw files to process, for the progress indicator.
size = len([name for name in os.listdir(path)])
count = 1
tot_scraped = 0

# NOTE(review): second connect to the same database file; the first
# connection above is never closed.  Verify this is intentional.
try:
    con = lite.connect(output_path + output_file)
    buf = 0
    for f in listdir(path):
        # Update the progress
        update_progress(count, size)
        count = count + 1
import sqlite3 as lite
import sys
from clean_text import clean
from dateutil.parser import parse
import calendar
import time
import datetime

# Paths
path = 'raw_by_site/' + market + '/listings/'
output_path = 'clean_listings/'
output_file = 'temp.db'
final_output = market + '.db'
buffer_limit = 10000

# Remove any stale temp database from a previous run (EAFP: ignore the
# error if the file is not there).
try:
    os.remove(output_path + output_file)
except OSError:
    pass
if not os.path.exists(output_path):
    os.makedirs(output_path)

# len(os.listdir(...)) directly -- the original wrapped it in a
# pointless list comprehension.
size = len(os.listdir(path))

print_progress(
    "Cleaning html files and putting information in sql format for " +
    market + " market.")
print_progress("Connecting to " + output_file)
read_cur = read.cursor() read_cur.execute('SELECT DISTINCT name FROM vendors') names = read_cur.fetchall() write = lite.connect(os.path.join('extract_data_vendors', 'temp.db')) write_cur = write.cursor() write_cur.execute('CREATE TABLE vendors(name TEXT)') write_cur.execute('CREATE TABLE reviews(vendor INT, val INT, content TEXT, product TEXT, dat INT, scraped_at INT, user_rating REAL, min_user_sales INT, max_user_sales INT)') write_cur.execute('CREATE TABLE ratings(vendor INT, val REAL, dat INT)') write_cur.execute('CREATE TABLE sales(vendor INT, val INT, dat INT)') write.commit() vendors = [c[0] for c in names] # Add all the categories print_progress("Writing vendors...") for c in vendors: write_cur.execute("INSERT INTO vendors VALUES('{0}')".format(c)) write.commit() # Get # of rows to sort read_cur.execute("SELECT Count(*) FROM vendors") row_count = read_cur.fetchall()[0][0] # Sort all the titles print_progress("Sorting reviews and ratings by vendor...") tot_count, count, tot_aggregated = row_count, 0, 0 buf = 0 for i in range(1, row_count):
if d not in names and d != 'data_doc': del temp_doc[d] elif d != 'data_doc': ind = names.index(d) for s in doc_json[d]: if s not in schemas[ind] and s != 'table_doc': del temp_doc[d][s] doc_json = temp_doc with open(doc_file, "w") as file: file.write(json.dumps(doc_json, indent = 4, sort_keys=True, separators=(',', ': '))) # Complain! bad = re.findall('\[MISSING', json.dumps(doc_json, indent = 4, sort_keys=True, separators=(',', ': '))) if len(bad) > 0: print_progress('Parsed ' + doc_file) print_progress('') print_progress('') print_progress('------------------------- WARNING! -------------------------') print_progress(str(len(bad)) + ' instances of undocumented columns detected.') print_progress('') print_progress(' DOCUMENT THEM IMMEDIATELY OR YOU WILL BE PUNISHED') print_progress('') print_progress('') else: print_progress('Parsed ' + doc_file) else: print("Unrecognized file extension: " + extension) quit(1)
buffer_limit = 10000 try: os.remove(output_path + output_file) except OSError: pass if not os.path.exists(output_path): os.makedirs(output_path) try: con = lite.connect(output_path + output_file) con.cursor().execute("CREATE TABLE vendors( dat INT, name TEXT, rating TEXT, ratings TEXT )") except lite.Error as e: print_progress("Failed to clean " + market + " listings, error %s:" % e.args[0]) size = len([name for name in os.listdir(path)]) count = 1 tot_scraped = 0 try: con = lite.connect(output_path + output_file) buf = 0; for f in listdir(path): # Update the progress update_progress(count, size) count = count + 1