Example #1
    def query_craig(self):

        if self.verbose: print('# query_craig()')
        if self.veryverbose: print(self.info['filters'])

        cl_a = CraigslistForSale(
                    site=self.info['site'], 
                    area=self.info['area'], 
                    category=self.info['category'], 
                    filters=self.info['filters']
                    )
        limit = 0
        for result in cl_a.get_results(sort_by='newest'):
            record_time = car_util.time_object( result['datetime'],'%Y-%m-%d %H:%M' )
            clean_time = car_util.time_object( self.info['since-date'],'%Y-%m-%dT%H:%M:%SZ' )
            if record_time > clean_time:
                xCar = car_info.car( {'url': result['url']} ).update_info()
                # print('x' + str(xCar.info) )
                while xCar.error != '':
                    print('Error ' + str(xCar.error) + ' ' + result['url'])
                    if xCar.error == 408: # Request Timeout
                        print( ' time problem, wait 3 and try again ' )
                        time.sleep(3)
                        xCar = car_info.car( {'url': result['url']} ).update_info()
                    elif xCar.error == 404: # Not Found: treat the listing as sold
                        print( ' not found!! ')
                        xCar.info['sold-date'] = datetime.datetime.utcnow()
                        xCar.save_me()
                        xCar.post_solr()
                        break
                    else:
                        break  # unknown error: bail out instead of looping forever

            time.sleep(1)
            limit += 1
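Example #1 leans on project-specific helpers (car_util, car_info.car) that are not shown. Judging only from its call sites, car_util.time_object() presumably wraps strptime; a hypothetical sketch:

# Hypothetical reconstruction of car_util.time_object(), inferred from the
# calls above; the real helper is not part of this example.
import datetime

def time_object(raw, fmt):
    # e.g. time_object('2020-01-05 14:30', '%Y-%m-%d %H:%M')
    return datetime.datetime.strptime(raw, fmt)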
Example #2
def scrape_area(area):
    """
    Scrapes craigslist for a certain geographic area, and finds the latest listings.
    :param area:
    :return: A list of results.
    """
    cl_fs = CraigslistForSale(site=settings.CRAIGSLIST_SITE, area=area,
                              category=settings.CRAIGSLIST_FORSALE_SECTION,
                              filters={'make': 'triumph'})

    results = []
    gen = cl_fs.get_results(sort_by='newest', limit=150)
    while True:
        try:
            result = next(gen)
        except StopIteration:
            break
        except Exception:
            continue
        listing = session.query(Listing).filter_by(cl_id=result["id"]).first()

        # Don't store the listing if it already exists.
        if listing is None:
            
            # Try parsing the price.
            price = 0
            try:
                price = float(result["price"].replace("$", ""))
            except Exception:
                pass

            # Create the listing object.
            listing = Listing(
                link=result["url"],
                created=parse(result["datetime"]),
##                lat=lat,
##                lon=lon,
                name=result["name"],
                price=price,
##                location=result["where"],
                cl_id=result["id"],
##                area=result["area"],
##                bart_stop=result["bart"]
            )

            # Save the listing so we don't grab it again.
            session.add(listing)
            session.commit()

            if len(result["name"]) > 0:
                results.append(result)

    return results
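Examples #2, #8, and #11 all assume a SQLAlchemy Listing model and a session bound to some database. A minimal sketch with columns inferred from usage; the names, types, and engine URL here are guesses, not the original schema:

from sqlalchemy import Column, DateTime, Float, Integer, String, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()

class Listing(Base):
    # Columns inferred from the keyword arguments used in the examples.
    __tablename__ = 'listings'
    id = Column(Integer, primary_key=True)
    cl_id = Column(Integer, unique=True)
    link = Column(String)
    created = Column(DateTime)
    name = Column(String)
    price = Column(Float)
    lat = Column(Float)
    lon = Column(Float)
    location = Column(String)
    area = Column(String)
    bart_stop = Column(String)

engine = create_engine('sqlite:///listings.db')
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()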
Example #3
from craigslist import CraigslistForSale

cl_h = CraigslistForSale(site='denver',
                         filters={'query': 'wurlitzer 200'})


for result in cl_h.get_results(sort_by='newest', geotagged=True):
    print(result)
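Each result yielded by get_results() is a plain dict. Across these examples the keys in use are 'id', 'name', 'url', 'price', 'datetime', 'where', 'has_image', and, when geotagged=True is passed, 'geotag' with a coordinate pair (note that Example #8 reads it as (lon, lat) while Example #11 reads it as (lat, lon), so check your version of python-craigslist). For instance:

for result in cl_h.get_results(sort_by='newest', geotagged=True, limit=5):
    # 'geotag' may be None when the post has no map coordinates
    print(result['price'], result['name'], result.get('geotag'))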
Example #4
import os
import sqlite3
from sqlite3 import Error

import numpy as np
from craigslist import CraigslistForSale


class Shark:
    def __init__(self, query=None):
        prev = os.path.dirname(os.getcwd())
        db = os.path.join(prev, 'database', 'craigslist_results.db')
        self.conn = self.connect_db(db)
        if query is not None:
            self.craig = CraigslistForSale(site='sandiego', filters={'query' : query})

            # Fill db with queried items now

            self.sql_init(query)


    def close_db(self):
        '''
        Closes the connection to the DB
        :return:
        '''
        try:
            self.conn.close()
        except Error as e:
            print(e)


    def sql_init(self, query):
        '''
        Initializes all the sql functions to their initial state
        :return:
        '''
        result_set = self.get_query(limit=50)
        for result in result_set:
            self.insert_db(result, query)

        data = self.select_price_from_db()
        # print(data)
        outliers = self.filter_data(data)
        self.remove_filtered_from_db(outliers)


    def connect_db(self, db_file):
        '''
        Make a connection to our DB
        :param db_file:
        :return: conn object or None
        '''
        try:
            conn = sqlite3.connect(db_file)
            conn.execute('pragma journal_mode=wal')
            return conn
        except Error as e:
            print(e)

        return None

    def insert_db(self, item, query):
        '''
        Updates DB and inserts new items into it
        :param item: Item to be inserted
        :param query: User input, will be hashed into a query ID for distinction between queries
        :return:
        '''
        id = int(item['id'])
        name = item['name']
        url = item['url']
        time = item['datetime']
        price = int(item['price'][1:])
        q_id = str(hash(query))
        insert_stmt = 'INSERT or IGNORE INTO computers (id, name, url, time, price, query_id) ' \
                      'VALUES (?, ?, ?, ?, ?, ?)'
        entry = (id, name, url, time, price, q_id)

        try:
            c = self.conn.cursor()
            with self.conn:
                c.execute(insert_stmt, entry)
        except Error as e:
            print(e)

    def remove_filtered_from_db(self, outliers):
        '''
        Removes all outliers from the DB
        :param outliers:
        '''
        cur = self.conn.cursor()
        with self.conn:  # commit the deletes
            for item in outliers:
                cur.execute('DELETE FROM computers WHERE price = ?', item)

    def select_all_from_db(self):
        '''
        Fetches all items from db
        :param conn:
        :return: rows -> all rows from db
        '''
        cur = self.conn.cursor()
        cur.execute('SELECT * FROM computers')

        rows = cur.fetchall()
        return rows

    def select_price_from_db(self):
        cur = self.conn.cursor()
        cur.execute('SELECT price FROM computers')

        rows = cur.fetchall()
        return rows

    def price_with_query(self, query):
        h = str(hash(query))
        cur = self.conn.cursor()
        cur.execute('SELECT price FROM computers WHERE query_id = ?', (h,))

        rows = cur.fetchall()
        return rows

    def select_by_hash_from_db(self, item):
        h = hash(item)
        cur = self.conn.cursor()
        cur.execute('SELECT * FROM computers WHERE query_id = ?', (str(h),))

        rows = cur.fetchall()
        return rows


    def get_query(self, limit=0, year=None):
        '''
        Gets results back from web search
        :param limit:
        :param year:
        :return:
        '''
        results = []
        for result in self.craig.get_results(limit=limit):
            if year is not None:
                if year in result['name']:
                    results.append(result)
            else:
                results.append(result)
        return results

    def filter_data(self, data):
        '''
        Filters out values that are too low to be valid electronics
        :param data: list of (price,) rows to filter
        :return outliers:
        '''
        # TODO Fix so it actually filters data with a better algorithm
        prices = [row[0] for row in data]  # fetchall() returns 1-tuples
        mean = np.mean(prices)
        std = np.std(prices)
        outliers = []

        for row in data:
            if row[0] < (mean - std):
                outliers.append(row)  # keep the tuple for the DELETE binding

        return outliers
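The INSERT statement in insert_db() implies a computers table that the class never creates. A plausible schema matching those six columns (the types are guesses from how the values are built):

import sqlite3

conn = sqlite3.connect('craigslist_results.db')
conn.execute('''CREATE TABLE IF NOT EXISTS computers (
                    id INTEGER PRIMARY KEY,
                    name TEXT,
                    url TEXT,
                    time TEXT,
                    price INTEGER,
                    query_id TEXT)''')
conn.commit()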
Example #5
query = [
    'purple', 'schwinn', 'bike', 'bicycle', 'hybrid', 'Bike', 'Bicycle',
    'Schwinn', 'Purple', 'womens', 'ladies'
]
cl_e = CraigslistForSale(site='sacramento',
                         filters={
                             'search_titles': True,
                             'query': 'bike',
                             'has_image': True
                         })

print(
    '/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////\n'
)
print('Results:\n')

for result in cl_e.get_results(sort_by='newest', limit=3000):
    if result['where'] in ('Davis', 'Davis, CA', 'davis'):
        for search in query:
            if search in result['name']:
                print('Location: {}\n'.format(result['where']))
                print('Date: {}\n'.format(result['datetime']))
                print('Post Title: {}\n'.format(result['name']))
                print('URL: {}\n'.format(result['url']))
                print('Price: {}\n'.format(result['price']))
                print('Has an Image Available (True or False): {}\n'.format(
                    result['has_image']))
        if query[0] in result['name'] and query[1] in result['name']:
            print('Red Alert')
            badmessage = twilioCli.messages.create(
                body='Suspicious Sale posted',
                from_=myTwilioNumber,
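The snippet cuts off inside twilioCli.messages.create(). For comparison, a complete minimal Twilio send looks like this; the SID, token, and phone numbers below are placeholders, not values from the example:

from twilio.rest import Client

twilioCli = Client('ACxxxxxxxxxxxxxxxx', 'your_auth_token')
badmessage = twilioCli.messages.create(body='Suspicious Sale posted',
                                       from_='+15551230000',
                                       to='+15559870000')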
Example #6
#hour = int(time.strftime('%H', time.localtime(time.time())))
#day = int(time.strftime('%d', time.localtime(time.time())))
#when = ""

#os.system(stri)
#os.system("osascript -e \'tell application \"Safari\" to activate\'")

i = 1
added = 0
sofar = ""

with open('cars.html', 'r') as myfile:
    data = myfile.read().replace('\n', '')

#took out geotagged=True
for result in search.get_results(sort_by='price_asc'):  #,limit = 50
    if (data.find(result['name']) == -1
            and sofar.find(result['name'].lower()) == -1):
        with open("cars.html", "a") as myfile:  #append the listing
            myfile.write("<a href = \"" + result['url'] + "\">" + str(i) +
                         ": " + result['price'] + " " + result['name'] + " " +
                         result['url'] + "\n" + "</a><br><br>")
            added = added + 1
            sofar += result['name'].lower()
            i += 1
        print("new listing \"" + result['name'] + "\" added.")
    #if day - int(result['datetime'][8:10]) == 0:
#    when = str(hour - int(result['datetime'][11:13]))
#else:
Example #7
strj = "\"})\n\t end tell \nend tell\'"

hour = int(time.strftime('%H', time.localtime(time.time())))
day = int(time.strftime('%d', time.localtime(time.time())))
when = ""

#os.system(stri)
#os.system("osascript -e \'tell application \"Safari\" to activate\'")

i = 1

with open('porsche.html', 'r') as myfile:
    data = myfile.read().replace('\n', '')

#took out geotagged=True
for result in search.get_results(sort_by='newest'):  #,limit = 50
    if data.find(result['name']) == -1:
        with open("porsche.html", "a") as myfile:
            myfile.write("<a href = \"" + result['url'] + "\">" + str(i) +
                         ": " + result['price'] + " " + result['name'] + " " +
                         result['url'] + "\n" + "</a><br><br>")
        print("new listing " + result['name'] + " added.")
    if day - int(result['datetime'][8:10]) == 0:
        when = str(hour - int(result['datetime'][11:13]))
    else:
        when = str(hour + (24 * abs(day - int(result['datetime'][8:10]))) -
                   int(result['datetime'][11:13]))
    print(str(i) + ": " + result['price'] + ' ' + result['name'][:30] + ": " +
          result['url'])
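The hour arithmetic above breaks across month boundaries. A sturdier way to compute hours since posting, assuming the same 'YYYY-MM-DD HH:MM' timestamp format python-craigslist returns:

from datetime import datetime

posted = datetime.strptime(result['datetime'], '%Y-%m-%d %H:%M')
hours_ago = (datetime.now() - posted).total_seconds() / 3600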
Example #8
def scrape_for_sale():
    """
    Searching target object in craiglist for sale
    """

    cl_h = CraigslistForSale(site=settings.CRAIGSLIST_SITE, category=settings.CRAIGSLIST_CATEGORY,
                             filters={'query': settings.CRAIGSLIST_FORSALE_SECTION,
                                      'search_titles': True,
                                      'max_price': settings.MAX_PRICE})

    gen = cl_h.get_results(sort_by='newest', geotagged=True)
    results = []

    while True:
        try:
            result = next(gen)
        except StopIteration:
            break
        except Exception:
            continue

        # Search the result in database, if it is already there, skip it.
        listing = session.query(Listing).filter_by(cl_id=result["id"]).first()
        if listing is None:
            geotag = result["geotag"]
            area_found = False
            area = ""
            lat = 0
            lon = 0
            if result["geotag"] is not None:
                lat = geotag[1]
                lon = geotag[0]
            coords = (lat, lon)
            for a, box in settings.BOXES.items():
                if in_box(coords, box):
                    area = a
                    area_found = True
            result["area"] = area

            # Try parsing the price.
            price = 0
            try:
                price = float(result["price"].replace("$", ""))
            except Exception:
                pass

            # Create the listing object.
            listing = Listing(
                link=result["url"],
                created=parse(result["datetime"]),
                lat=lat,
                lon=lon,
                name=result["name"],
                price=price,
                location=result["where"],
                cl_id=result["id"],
                area=result["area"],
            )

            # Save the listing so we don't grab it again.
            session.add(listing)
            session.commit()

            # if the location can be found in the box, append it into the results
            if area_found:
                results.append(result)

    return results
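Example #8 also assumes an in_box() helper and a settings.BOXES dict of bounding boxes. A hypothetical implementation consistent with how it is called; the ((min_lat, min_lon), (max_lat, max_lon)) layout is an assumption:

def in_box(coords, box):
    # box is assumed to be ((min_lat, min_lon), (max_lat, max_lon))
    return (box[0][0] < coords[0] < box[1][0] and
            box[0][1] < coords[1] < box[1][1])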
Example #9
        # run the search on the specific site
        cl_fs_car = CraigslistForSale(
            site=j['site'],
            category=cat_use,
            filters={
                'query': query_use,
                'has_image': True,
                'search_titles': True,  #, 'auto_transmission':'manual'
                'auto_fuel_type': ['gas', 'hybrid', 'electric', 'other'],
                'max_year': '2000',
                'auto_title_status': ['clean', 'salvage', 'rebuilt', 'lien']
            })

        for result in cl_fs_car.get_results():
            try:
                # convert the string time from the ad to datetime format
                date_time_use = convert(result['datetime'])
                logging.info('found result!')
                # check that the posting was within the last 24 hours of the run
                if date_time_use > datetime_limit:
                    # grab the first image on the ad; shouldn't error out,
                    # since we filter to ads with images
                    the_image = get_image_from_page(result['url'])
                    logging.info(the_image)

                    #set the caption
                    caption = result['name'] + '\nListing Price: ' + result[
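Example #9 calls two helpers that are not shown. Hypothetical sketches of both, inferred from their call sites; the image selector in particular is a guess, not the author's code:

from datetime import datetime

import requests
from bs4 import BeautifulSoup

def convert(raw):
    # python-craigslist timestamps look like '2020-01-05 14:30'
    return datetime.strptime(raw, '%Y-%m-%d %H:%M')

def get_image_from_page(url):
    # Return the first image URL found on the ad page, if any.
    soup = BeautifulSoup(requests.get(url).text, 'html.parser')
    img = soup.find('img')
    return img['src'] if img else None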
Example #10
def main(argv):

    if len(argv) < 4:
        print('please enter $python clvehicles.py [city] [make] [model]')
        sys.exit()

    location = argv[1]
    make = argv[2]
    model = argv[3]

    print('Searching the ' + location + ' craigslist site for ' + make + ' ' +
          model)
    cl_s = CraigslistForSale(site=location,
                             filters={
                                 'make': make,
                                 'model': model,
                                 'min_price': 2000
                             })
    i = 0
    urls = []

    vehicles = []

    for result in cl_s.get_results():
        #print result
        urls.append(result['url'])
        veh = Vehicle(result['name'])
        veh.setPrice(result['price'])
        veh.setTimestamp(result['datetime'])
        veh.setURL(result['url'])
        veh.setID(result['id'])
        vehicles.append(veh)

        i = i + 1
    print(i)

    print('Parsing the ads')
    for k in range(len(vehicles)):
        rsp = requests.get(urls[k])

        html = bs4(rsp.text, 'html.parser')
        vehresults = html.body.find_all('p', attrs={'class': 'attrgroup'})
        #find a way of turning this into a dictionary
        try:
            vyear = vehresults[0].find_all('span')[0].get_text()[0:4]
            vehicles[k].setYear(vyear)
            vehicle_info = vehresults[1].find_all('span')
            for l in range(len(vehicle_info)):
                attribute = vehicle_info[l].get_text().split(':')
                if attribute[0] == 'condition':
                    vehicles[k].setCondition(attribute[1])
                elif attribute[0] == 'cylinders':
                    vehicles[k].setCylinders(attribute[1])
                elif attribute[0] == 'drive':
                    vehicles[k].setDrive(attribute[1])
                elif attribute[0] == 'fuel':
                    vehicles[k].setFuel(attribute[1])
                elif attribute[0] == 'odometer':
                    vehicles[k].setMilage(attribute[1])
                elif attribute[0] == 'paint color':
                    vehicles[k].setColor(attribute[1])
                elif attribute[0] == 'title status':
                    vehicles[k].setTitleStatus(attribute[1])
                elif attribute[0] == 'transmission':
                    vehicles[k].setTranstype(attribute[1])
                elif attribute[0] == 'type':
                    vehicles[k].setVehicletype(attribute[1])

        except IndexError:
            print('Post %d was likely deleted' % k)

    headers = [
        'name', 'price', 'year', 'condition', 'milage', 'title status',
        'transmission', 'drive', 'cylinders', 'fuel', 'color', 'location',
        'timestamp', 'url'
    ]

    print('writing to .csv')
    fname = location + model + '.csv'
    with open(fname, 'w', newline='') as f:
        w = csv.writer(f)
        w.writerow(headers)

        # getattr with a default collapses the long chain of per-field
        # try/except AttributeError blocks; behavior is unchanged.
        for v in vehicles:
            row = [getattr(v, a, 'N/A') for a in
                   ('name', 'price', 'year', 'condition', 'milage',
                    'titleStatus', 'transtype', 'drive', 'cylinders',
                    'fuel', 'color')]
            row.append(location)
            row += [getattr(v, a, 'N/A') for a in ('timestamp', 'url')]
            try:
                w.writerow(row)
            except UnicodeEncodeError:
                print("weird character")
Example #11
def scrape_area(area):
    """
    Scrapes craigslist for a certain geographic area, and finds the latest listings.
    :param area:
    :return: A list of results.
    """
    cl_h = CraigslistForSale(site=settings.CRAIGSLIST_SITE,
                             area=area,
                             category=settings.CRAIGSLIST_HOUSING_SECTION,
                             filters={
                                 'max_price': settings.MAX_PRICE,
                                 "min_price": settings.MIN_PRICE
                             })

    results = []
    gen = cl_h.get_results(sort_by='newest', geotagged=True, limit=20)
    while True:
        try:
            result = next(gen)
        except StopIteration:
            break
        except Exception:
            continue
        listing = session.query(Listing).filter_by(cl_id=result["id"]).first()

        # Don't store the listing if it already exists.
        if listing is None:
            # if result["where"] is None:
            #     # If there is no string identifying which neighborhood the result is from, skip it.
            #     continue

            lat = 0
            lon = 0
            if result["where"] is not None and result["geotag"] is not None:
                # Assign the coordinates.
                lat = result["geotag"][0]
                lon = result["geotag"][1]

                # Annotate the result with information about the area it's in and points of interest near it.
                geo_data = find_points_of_interest(result["geotag"],
                                                   result["where"])
                result.update(geo_data)
            else:
                result["area"] = ""
                result["bart"] = ""

            # Try parsing the price.
            price = 0
            try:
                price = float(result["price"].replace("$", ""))
            except Exception:
                pass

            # Create the listing object.
            listing = Listing(link=result["url"],
                              created=parse(result["datetime"]),
                              lat=lat,
                              lon=lon,
                              name=result["name"],
                              price=price,
                              location=result["where"],
                              cl_id=result["id"],
                              area=result["area"],
                              bart_stop=result["bart"])

            # Save the listing so we don't grab it again.
            session.add(listing)
            session.commit()

            # Return the result if it's near a bart station, or if it is in an area we defined.
            #if len(result["bart"]) > 0 or len(result["area"]) > 0:
            results.append(result)

    return results
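find_points_of_interest() is imported from elsewhere in that project. Judging by how its return value is merged into the result and then read back as result["area"] and result["bart"], its shape is roughly as follows (a stub showing the expected return value, not the real implementation):

def find_points_of_interest(geotag, where):
    # Stub: the real version maps coordinates to a named area and the
    # nearest BART stop; only the return shape is shown here.
    return {'area': '', 'bart': ''}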
Example #12
import redis
import time
from twilio.rest import Client

r = redis.Redis()

#Auth credentials for twilio
account_sid = ''
auth_token = ''
client = Client(account_sid, auth_token)

while True:
    cl_auto = CraigslistForSale(site='sandiego',
                                category='cto',
                                filters={'query': 'frontier'})

    for result in cl_auto.get_results(sort_by='newest'):
        id = result['id']
        name = result['name']
        url = result['url']

        if r.exists(id):
            continue

        print(result)
        r.set(id, 'true')

        # send the text message
        message = client.messages.create(body=name + ' ' + url,
                                         from_='+16199999999',
                                         to='+16199999999')
    time.sleep(60 * 10)
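A slightly tidier deduplication for Example #12 keeps one Redis set instead of a key per listing; sadd() returns 0 when the member already exists, so it can replace the exists()/set() pair:

if not r.sadd('seen_listing_ids', id):
    continue  # already notified about this listing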
Example #13
import props
from craigslist import CraigslistForSale
from geopy.geocoders import Nominatim
from geopy.distance import vincenty

geolocator = Nominatim()
orig_coord = (0, 0)

# initializing the originating location
def init_orig():
    orig_loc = geolocator.geocode(props.ORIG_LOC)
    global orig_coord
    orig_coord = (orig_loc.latitude, orig_loc.longitude)

# calculates the distance in miles between two locations
def get_distance(orig_coord, dest):
    dest_loc = geolocator.geocode(dest)
    dest_coord = (dest_loc.latitude, dest_loc.longitude)
    return round(vincenty(orig_coord, dest_coord).miles, 1)

# display formatted results
def display_results(result):
    dest = result['where'] + ", " + props.ORIG_STATE
    print(result['name'] + " " + result['price'] + " " + result['where'])
    print("distance: " + str(get_distance(orig_coord, dest)) + " miles\n")

init_orig()
cl_fs = CraigslistForSale(site=props.CRAIG_SITE, category=props.CRAIG_CATEGORY,
                          filters={'max_price': props.CRAIG_PRICE, 'has_image': props.CRAIG_IMAGE})

for result in cl_fs.get_results(sort_by=props.CRAIG_SORTBY):
    if props.CRAIG_SITE in result['url']:
        display_results(result)
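Example #13 reads its configuration from a props module it never shows. A hypothetical props.py with the attributes it references; every value below is a placeholder:

# props.py (placeholder values)
ORIG_LOC = 'Sacramento, CA'
ORIG_STATE = 'CA'
CRAIG_SITE = 'sacramento'
CRAIG_CATEGORY = 'sss'   # for-sale "all" category
CRAIG_PRICE = 500        # max_price filter
CRAIG_IMAGE = True       # has_image filter
CRAIG_SORTBY = 'newest'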
Example #14
        saved_ids = set(historical.keys())

found_posts = {}
for query in queries['queries']:
    sites = query['sites']
    category = query['category']
    filters = query['filters']

    # Crawl the craigslist
    for site in sites:
        CL_query = CraigslistForSale(site=site,
                                     category=category,
                                     filters=filters)
        found_posts.update(
            {result['id']: result
             for result in CL_query.get_results()})

# fun set logic
post_ids = set(found_posts.keys())
old_ids = saved_ids & post_ids  # posts which no longer appear in searches shouldn't be notified
new_ids = post_ids - old_ids

postings_to_notify = {post_id: found_posts[post_id] for post_id in new_ids}
postings_to_remind = {post_id: found_posts[post_id] for post_id in old_ids}

# Send an email
if postings_to_notify:
    receiver_email = queries['email']
    sender_email = config['sender_email']
    sender_password = config['sender_password']  # should really switch to google 2fa instead of plaintext passwords lol
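The snippet ends before the message is actually sent. A sketch of the send step it is building toward, using the standard library and an app password; the SMTP host and message body are assumptions:

import smtplib
from email.message import EmailMessage

msg = EmailMessage()
msg['Subject'] = 'New craigslist postings'
msg['From'] = sender_email
msg['To'] = receiver_email
msg.set_content('\n'.join(post['url'] for post in postings_to_notify.values()))

with smtplib.SMTP_SSL('smtp.gmail.com', 465) as server:
    server.login(sender_email, sender_password)
    server.send_message(msg)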