)
write_cur.execute(
    'CREATE TABLE reviews(dat INT, listing INT, review TEXT, val INT, price REAL, scraped_at INT, user_rating REAL, user_deals INT)'
)
write.commit()
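
# A parameterized insert matching the reviews schema above; these row
# values are illustrative placeholders, not data from the source.
write_cur.execute(
    'INSERT INTO reviews VALUES(?, ?, ?, ?, ?, ?, ?, ?)',
    (1438387200, 42, 'Fast shipping.', 5, 0.013, 1438390000, 4.9, 17))
write.commit()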

categories = [c[0] for c in categories]
ships_from = [c[0] for c in ships_from]
ships_to = [c[0] for c in ships_to]

# Custom categories
categories.append('custom.benzocaine')
categories.append('custom.cocaleaves')

# Add all the categories
print_progress("Writing categories...")
for c in categories:
    # Parameter binding handles quotes in names and avoids SQL injection
    write_cur.execute("INSERT INTO categories VALUES(?)", (c,))
write.commit()

print_progress("Writing shipping locations...")
for c in ships_from:
    write_cur.execute("INSERT INTO ships_from VALUES('{0}')".format(c))
write.commit()

print_progress("Writing ship to locations...")
for c in ships_to:
    write_cur.execute("INSERT INTO ships_to VALUES('{0}')".format(c))
write.commit()
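
# The per-row loops above could also be batched; executemany submits all
# rows in one call (an alternative sketch, not part of the original script):
write_cur.executemany("INSERT INTO categories VALUES(?)",
                      [(c,) for c in categories])
write.commit()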

# Sort all the titles
Example #4
try:
    os.remove(output_path + output_file)
except OSError:
    pass

if not os.path.exists(output_path):
    os.makedirs(output_path)

try:
    con = lite.connect(output_path + output_file)
    con.cursor().execute(
        "CREATE TABLE vendors( dat INT, name TEXT, rating TEXT, ratings TEXT )"
    )
except lite.Error as e:
    print_progress("Failed to clean " + market +
                   " listings, error %s:" % e.args[0])

size = len(os.listdir(path))

count = 1
tot_scraped = 0
try:
    con = lite.connect(output_path + output_file)

    buf = 0
    for f in os.listdir(path):

        # Update the progress
        update_progress(count, size)
        count = count + 1
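
        # The example is cut off here; a hedged sketch of a plausible loop
        # body (parse_vendor_page is hypothetical, not from the source; the
        # column order follows the CREATE TABLE vendors statement above):
        dat, name, rating, ratings = parse_vendor_page(os.path.join(path, f))
        con.cursor().execute("INSERT INTO vendors VALUES(?, ?, ?, ?)",
                             (dat, name, rating, ratings))
        tot_scraped = tot_scraped + 1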
Example #5
import os
import sqlite3 as lite
import sys
from clean_text import clean
from dateutil.parser import parse
import calendar
import time
import datetime

# Paths
path = 'raw_by_site/' + market + '/listings/'
output_path = 'clean_listings/'
output_file = 'temp.db'
final_output = market + '.db'

buffer_limit = 10000
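
# buffer_limit caps how many parsed rows accumulate between commits. A
# minimal sketch of that batching pattern (flush_rows and the listings
# schema are assumptions; the fragment is cut off before the buffer is used):
def flush_rows(con, rows):
    if rows:
        con.executemany('INSERT INTO listings VALUES(?, ?, ?)', rows)
        con.commit()
        rows.clear()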

try:
    os.remove(output_path + output_file)
except OSError:
    pass

if not os.path.exists(output_path):
    os.makedirs(output_path)

size = len(os.listdir(path))

print_progress(
    "Cleaning html files and putting information in sql format for " + market +
    " market.")
print_progress("Connecting to " + output_file)
Example #6
    read_cur = read.cursor()
    read_cur.execute('SELECT DISTINCT name FROM vendors')
    names = read_cur.fetchall()

    write = lite.connect(os.path.join('extract_data_vendors', 'temp.db'))
    write_cur = write.cursor()
    write_cur.execute('CREATE TABLE vendors(name TEXT)')
    write_cur.execute(
        'CREATE TABLE reviews(vendor INT, val INT, content TEXT, product TEXT, dat INT, scraped_at INT, user_rating REAL, min_user_sales INT, max_user_sales INT)'
    )
    write_cur.execute('CREATE TABLE ratings(vendor INT, val REAL, dat INT)')
    write_cur.execute('CREATE TABLE sales(vendor INT, val INT, dat INT)')
    write.commit()
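
    # A parameterized insert matching the ratings schema above (the values
    # are illustrative placeholders, not data from the source):
    write_cur.execute('INSERT INTO ratings VALUES(?, ?, ?)',
                      (0, 4.75, 1438387200))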

    vendors = [c[0] for c in names]

    # Add all the vendors
    print_progress("Writing vendors...")
    for c in vendors:
        write_cur.execute("INSERT INTO vendors VALUES(?)", (c,))
    write.commit()

    # Get # of rows to sort
    read_cur.execute("SELECT Count(*) FROM vendors")
    row_count = read_cur.fetchall()[0][0]

    # Sort all the titles
    print_progress("Sorting reviews and ratings by vendor...")
    tot_count, count, tot_aggregated = row_count, 0, 0
    buf = 0

    for i in range(1, row_count):
Example #7
# Find all directories with abraxas in the title and pipe the files they contain to the appropriate directory

import os
import re
import copy
from update_progress import update_progress
from update_progress import print_progress

print_progress("Sorting and cleaning " + market + "...")

dests = copy.deepcopy(destinations)

# Create the destination folders
for d, m in dests:
    if not os.path.exists(os.path.join('raw_by_site', market, d)):
        os.makedirs(os.path.join('raw_by_site', market, d))
if not os.path.exists(os.path.join('raw_by_site', market, 'remaining')):
    os.makedirs(os.path.join('raw_by_site', market, 'remaining'))

print_progress("Finding all existing files... ")

# Find the set of existing files and convert to hashed list
existing_files = []
count = 0
for root, dirnames, filenames in os.walk('raw_by_site/' + market):
    for f in filenames:
        existing_files.append(f)
        count += 1
        if count > 1000:
            print_progress(" ..." + f)
            count = 0
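
# The fragment ends here; downstream code presumably checks which files are
# already sorted. A set makes that membership test O(1) (a hedged sketch;
# nothing below comes from the source):
existing = set(existing_files)
# e.g. skip a file that was already sorted:
# if f in existing: continue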
Example #8
            if d not in names and d != 'data_doc':
                del temp_doc[d]
            elif d != 'data_doc':
                ind = names.index(d)
                for s in doc_json[d]:
                    if s not in schemas[ind] and s != 'table_doc':
                        del temp_doc[d][s]
        doc_json = temp_doc

    with open(doc_file, "w") as file:
        serialized = json.dumps(doc_json, indent=4, sort_keys=True, separators=(',', ': '))
        file.write(serialized)

    # Complain!
    bad = re.findall(r'\[MISSING', serialized)
    if len(bad) > 0:
        print_progress('Parsed ' + doc_file)
        print_progress('')
        print_progress('')
        print_progress('------------------------- WARNING! -------------------------')
        print_progress(str(len(bad)) + ' instances of undocumented columns detected.')
        print_progress('')
        print_progress('     DOCUMENT THEM IMMEDIATELY OR YOU WILL BE PUNISHED')
        print_progress('')
        print_progress('')
    else:
        print_progress('Parsed ' + doc_file)

else:
    print("Unrecognized file extension: " + extension)
    quit(1)