示例#1
0
文件: load.py 项目: DBarthe/joby
def get_page(url, timeout=30):
    """Fetch ``url`` with an HTTP GET and return the decoded JSON body.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``; without it a stalled connection
            would block forever.

    Returns:
        The response body parsed by ``Response.json()``.

    Raises:
        RuntimeError: If the response status code is not 200. The response
            text is logged at error level before raising.
    """
    log.info("requesting dice api {}".format(url))
    r = requests.get(url, timeout=timeout)
    log.info("response code is {}".format(r.status_code))
    if r.status_code != 200:
        log.error(r.text)
        # Carry the failing URL and status in the exception so callers (and
        # tracebacks) can tell which request went wrong.
        raise RuntimeError(
            "request to {} failed with status code {}".format(
                url, r.status_code))
    return r.json()
示例#2
0
文件: load.py 项目: DBarthe/joby
def save_page(page, page_num):
    """Store every job listed in ``page`` as a ``Job`` row in one batch.

    Args:
        page: Mapping with a ``'resultItemList'`` entry; each item is read
            for the keys ``'date'`` (``YYYY-MM-DD``), ``'location'``,
            ``'jobTitle'``, ``'company'`` and ``'detailUrl'``.
        page_num: Page number, used only for the log message.

    Items that fail model validation are logged as warnings and skipped;
    the remaining items are still written.
    """
    log.info("saving page {}".format(page_num))
    with BatchQuery() as b:
        for job_item in page['resultItemList']:
            try:
                # NOTE(review): time.mktime interprets the parsed date in the
                # machine's local timezone -- confirm that this is the
                # intended reference for the stored timestamp.
                date = time.mktime(
                    datetime.datetime.strptime(job_item['date'],
                                               "%Y-%m-%d").timetuple())
                # Create through the batch: a plain Job.create() would insert
                # the row immediately and a later batch save would write it a
                # second time.
                Job.batch(b).create(date=date,
                                    location_text=job_item['location'],
                                    title=job_item['jobTitle'],
                                    company=job_item['company'],
                                    url=job_item['detailUrl'],
                                    source='dice')
            except ValidationError as e:
                log.warning("Problem loading {}: {}".format(job_item, e))
示例#3
0
文件: models.py 项目: DBarthe/joby
class Job(Model):
    """Table model for a job posting, partitioned by its location text.

    The declaration order of the key columns below is significant: it is
    the order in which they make up the primary key.
    """
    # Runs once, when the class body is executed (i.e. at definition time).
    log.info("Define Job model")
    # Partition key; must be at least one character long.
    location_text = columns.Text(partition_key=True, min_length=1)
    # Remaining primary-key columns; date is clustered in descending order.
    date = columns.DateTime(primary_key=True, clustering_order='DESC')
    company = columns.Text(primary_key=True, min_length=1)
    title = columns.Text(primary_key=True, min_length=1)
    source = columns.Text(primary_key=True, min_length=1)
    # Optional column declared static.
    location = columns.Text(required=False, static=True)
    url = columns.Text()
    # Optional, non-key columns.
    description = columns.Text(required=False)
    keywords = columns.List(value_type=columns.Text, required=False)
示例#4
0
文件: models.py 项目: DBarthe/joby
def setup():
    """Open the connection, create the main keyspace and sync ``Job``.

    The keyspace is created according to ``settings.TOPOLOGY``:
    ``"simple"`` uses ``create_keyspace_simple`` and ``"network"`` uses
    ``create_keyspace_network_topology`` with a single data centre
    (``settings.DC``). Any other value raises ``KeyError``.
    """
    log.info("Setup cassandra connection")
    connection.setup(settings.CASSANDRA_ENDPOINT, settings.MAIN_KEYSPACE)

    log.info("Create keyspace")
    topology = settings.TOPOLOGY
    if topology == "simple":
        create_keyspace_simple(settings.MAIN_KEYSPACE,
                               settings.REPLICATION_FACTOR)
    elif topology == "network":
        replication_by_dc = {settings.DC: settings.REPLICATION_FACTOR}
        create_keyspace_network_topology(settings.MAIN_KEYSPACE,
                                         replication_by_dc)
    else:
        # Unknown topology names fail the same way a missing mapping key would.
        raise KeyError(topology)

    log.info("Sync table")
    sync_table(Job)
示例#5
0
def update_all():
    """Geocode every job partition that has no ``location`` value yet.

    Calls ``models.setup()``, then reads the distinct
    ``(location_text, location)`` pairs from the ``job`` table of the main
    keyspace, 50 rows per fetch. Rows whose ``location`` is already set are
    skipped. For the others ``geocode`` is called on the location text and,
    when it returns a truthy value, that value is handed to
    ``update_one_partition``. Any exception raised while geocoding or
    updating is logged at info level and the loop moves on to the next row.
    """
    models.setup()
    cql = "SELECT DISTINCT location_text, location FROM {}.job".format(
        settings.MAIN_KEYSPACE)
    rows = connection.session.execute(SimpleStatement(cql, fetch_size=50))
    for row in rows:
        # Guard clause: nothing to do when the partition is already located.
        if row['location'] is not None:
            log.info("skipping {}".format(row['location_text']))
            continue

        place = row['location_text']
        log.info("geocoding {}".format(place))
        try:
            geohash = geocode(place)
            if geohash:
                update_one_partition(place, geohash)
        except Exception as err:
            log.info("problem geocoding {}: {}".format(place, err))