def get_page(url, timeout=None):
    """Fetch one page from the dice API and return its decoded JSON body.

    Args:
        url: Full URL of the API page to request.
        timeout: Optional timeout (seconds) forwarded to ``requests.get``.
            The default ``None`` keeps the previous behaviour of waiting
            without a time limit.

    Returns:
        The response body parsed as JSON (``r.json()``).

    Raises:
        RuntimeError: If the response status code is not 200. The response
            text is logged at error level before raising.
    """
    log.info("requesting dice api {}".format(url))
    r = requests.get(url, timeout=timeout)
    log.info("response code is {}".format(r.status_code))
    if r.status_code != 200:
        log.error(r.text)
        # Carry the URL and status in the exception so the failure is
        # diagnosable from the traceback alone, not only from the log.
        raise RuntimeError(
            "dice api request to {} failed with status {}".format(
                url, r.status_code))
    return r.json()
def save_page(page, page_num):
    """Store every job listed in one API result page as a ``Job`` row.

    All inserts are queued on a single ``BatchQuery`` that is executed when
    the ``with`` block exits.

    Args:
        page: Decoded API response; must contain a ``'resultItemList'`` list
            whose items have the keys ``'date'`` (``YYYY-MM-DD``),
            ``'location'``, ``'jobTitle'``, ``'company'`` and ``'detailUrl'``.
        page_num: Page number, used only for logging.

    Items that fail model validation are logged and skipped; the remaining
    items of the page are still saved.
    """
    log.info("saving page {}".format(page_num))
    with BatchQuery() as b:
        for job_item in page['resultItemList']:
            try:
                # The API date string is converted to a POSIX timestamp
                # (time.mktime interprets the parsed date as local time).
                date = time.mktime(
                    datetime.datetime.strptime(job_item['date'],
                                               "%Y-%m-%d").timetuple())
                # Queue the insert on the batch only. Calling Job.create()
                # directly would write the row immediately and a following
                # .batch(b).save() would write the same row a second time.
                Job.batch(b).create(date=date,
                                    location_text=job_item['location'],
                                    title=job_item['jobTitle'],
                                    company=job_item['company'],
                                    url=job_item['detailUrl'],
                                    source='dice')
            except ValidationError as e:
                log.warn("Problem loading {}: {}".format(job_item, e))
class Job(Model):
    """A single job posting, partitioned by its free-text location.

    The primary key is made of ``location_text`` (partition key) followed by
    ``date``, ``company``, ``title`` and ``source``, in that declaration
    order. Do not reorder the column declarations without checking the
    effect on the table schema.
    """
    # Runs once, when the class body is executed (i.e. at import time).
    log.info("Define Job model")
    # Partition key: the location string as delivered by the job source.
    location_text = columns.Text(partition_key=True, min_length=1)
    # Posting date; declared with descending clustering order.
    date = columns.DateTime(primary_key=True, clustering_order='DESC')
    # Remaining primary-key columns; each must be a non-empty string.
    company = columns.Text(primary_key=True, min_length=1)
    title = columns.Text(primary_key=True, min_length=1)
    source = columns.Text(primary_key=True, min_length=1)
    # Static (one value per partition) and optional.
    # NOTE(review): presumably holds the geohash written by update_all()
    # for this location_text -- confirm against update_one_partition.
    location = columns.Text(required=False, static=True)
    # Link to the posting.
    url = columns.Text()
    # Optional free-form fields.
    description = columns.Text(required=False)
    keywords = columns.List(value_type=columns.Text, required=False)
def setup():
    """Connect to Cassandra, create the main keyspace and sync the Job table.

    The keyspace is created with the strategy selected by
    ``settings.TOPOLOGY``: ``"simple"`` uses ``create_keyspace_simple`` with
    ``settings.REPLICATION_FACTOR``; ``"network"`` uses
    ``create_keyspace_network_topology`` with that factor for the single
    data center ``settings.DC``.

    Raises:
        KeyError: If ``settings.TOPOLOGY`` is neither ``"simple"`` nor
            ``"network"``.
    """

    def _create_simple():
        return create_keyspace_simple(settings.MAIN_KEYSPACE,
                                      settings.REPLICATION_FACTOR)

    def _create_network():
        return create_keyspace_network_topology(
            settings.MAIN_KEYSPACE,
            {settings.DC: settings.REPLICATION_FACTOR})

    log.info("Setup cassandra connection")
    connection.setup(settings.CASSANDRA_ENDPOINT, settings.MAIN_KEYSPACE)

    log.info("Create keyspace")
    # Settings are read only when the chosen creator is actually called.
    keyspace_creators = {
        "simple": _create_simple,
        "network": _create_network,
    }
    create_keyspace = keyspace_creators[settings.TOPOLOGY]
    create_keyspace()

    log.info("Sync table")
    sync_table(Job)
def update_all():
    """Geocode every job partition that has no ``location`` value yet.

    Runs ``models.setup()``, then reads the distinct
    ``(location_text, location)`` pairs of the ``job`` table in
    ``settings.MAIN_KEYSPACE`` using a statement fetch size of 50.
    Partitions whose ``location`` is already set are skipped. For the
    others, ``geocode`` is called on ``location_text`` and, when it returns
    a truthy value, that value is passed to ``update_one_partition``.

    Any exception raised while geocoding or updating a partition is logged
    and processing continues with the next partition.
    """
    models.setup()

    cql = "SELECT DISTINCT location_text, location FROM {}.job".format(
        settings.MAIN_KEYSPACE)
    partitions = connection.session.execute(
        SimpleStatement(cql, fetch_size=50))

    for partition in partitions:
        already_geocoded = partition['location'] is not None
        place = partition['location_text']

        if already_geocoded:
            log.info("skipping {}".format(place))
            continue

        log.info("geocoding {}".format(place))
        try:
            geohash = geocode(place)
            if geohash:
                update_one_partition(place, geohash)
        except Exception as err:
            # Best effort: one failing location must not stop the run.
            log.info("problem geocoding {}: {}".format(place, err))