def crawl(url_list):
    # Breadth-first crawl over ourcampaigns.com starting from the seed URLs.
    q = deque(url_list)
    processed = set(url_list)
    domain = 'http://www.ourcampaigns.com/'
    count = 0
    sw = StopWatch()
    while q:
        current_url = q.popleft()
        result = html_to_json(domain + current_url)
        if result is None:
            print ' skip', current_url
            continue
        category, uid = tokenize(current_url)
        if category == 'race':
            components = result['RACE DETAILS']['Parents'][0]['text'].split('>')
            if len(components) <= 2:
                print ' Bad', components, current_url
                continue
            if components[1].strip() != 'United States':
                continue
            position = campactify(components[-2] + components[-1])
            year = int(result['RACE DETAILS']['Term Start'][0]['text'].split(
                '-')[0].split(',')[-1].strip())
            if year > 2017 or year < 1900:
                continue
            description = 'race_{}_{}'.format(position, year)
        elif category == 'candidate':
            name = campactify(result['CANDIDATE DETAILS']['Name'][0]['text'])
            description = 'candidate_{}'.format(name)
        elif category == 'container':
            name = campactify(result['INCUMBENT']['Name'][0]['text'])
            year = result['INCUMBENT']['Won'][0]['text'].split('/')[-1].strip()
            description = 'container_{}_{}'.format(name, year)
        count += 1
        if count % 500 == 0:
            print '{}, crawling {}'.format(count, description)
        for table_title, table in result.iteritems():
            camel_title = to_camel(table_title)
            # Skip tables that duplicate information captured elsewhere.
            if camel_title not in ['LastGeneralElection', 'PrimaryOtherSchedule']:
                with open(os.path.join(
                        JSON_DIR,
                        '{}_{}_{}.json'.format(description, uid, camel_title)),
                        'wb') as fp:
                    json.dump(table, fp)
            # Only follow outgoing links from Governor and Mayor race pages.
            if (category == 'race' and 'Governor' not in description
                    and 'Mayor' not in description):
                continue
            for row_title, row in table.iteritems():
                for cell in row:
                    link = cell['link']
                    if link not in processed and is_valid_url(link):
                        q.append(link)
                        processed.add(link)
    sw.tic('crawl {} urls'.format(count))
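# A minimal sketch of the tokenize() helper assumed by crawl() above
# (an illustrative guess, not the project's actual implementation): it maps
# an ourcampaigns.com path such as 'RaceDetail.html?RaceID=59539' to the
# (category, uid) pair the crawler switches on.
def tokenize_sketch(url):
    page, _, query = url.partition('?')
    category = {'RaceDetail.html': 'race',
                'CandidateDetail.html': 'candidate',
                'ContainerDetail.html': 'container'}.get(page)
    uid = query.split('=')[-1]
    return category, uid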
def save_capnp(data, file_name='data.capnp', test=True):
    import capnp
    capnp.remove_import_hook()
    schemas_path = str(DATA_ROOT / 'schemas.capnp')
    schemas = capnp.load(schemas_path)
    flight_book = schemas.FlightBook.new_message()
    flights = flight_book.init('flights', len(data))
    for i, item in enumerate(data):
        flight = flights[i]
        for key, value in item.items():
            key = to_camel(key)
            if value is None:
                # Cap'n Proto fields have no null: use -1 for numeric fields
                # and fall back to the empty string for text fields.
                try:
                    setattr(flight, key, -1)
                except capnp.lib.capnp.KjException:
                    setattr(flight, key, "")
            else:
                setattr(flight, key, value)
    if test:
        # Make the file name unique per process so parallel test runs
        # do not clobber each other.
        file_name = "{}.{}".format(file_name, os.getpid())
    path = str(DATA_ROOT / file_name)
    try:
        with open(path, 'wb') as f:
            flight_book.write(f)
    finally:
        if test:
            os.remove(path)
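# Usage sketch for save_capnp(). The field names here are hypothetical;
# they must match the Flight struct in schemas.capnp once run through
# to_camel(). The None value exercises the -1 / "" fallback above.
# save_capnp([{'carrier': 'AA', 'dep_delay': 12, 'tail_num': None}],
#            file_name='data.capnp', test=True)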
def transform_klass(self, model_klass):
    """
    Transforms the model type into an output string to save the file
    """
    # Jinja2 setup
    env = Environment(loader=PackageLoader('contrib.adapters.objc_mantle'),
                      lstrip_blocks=True,
                      trim_blocks=True)

    # Template vars
    superclass = model_klass.__bases__[0]
    superclass_name = superclass.__name__
    # Is klass a Model, or did it inherit from something else?
    # (Use ==, not `is`: string identity only works by CPython interning.)
    superclass_is_model = superclass_name == 'Model'

    fields = []
    for field_name, field in model_klass._fields.items():
        fields.append('%s%s;' % (FIELD_PROPERTIES[field.__class__],
                                 to_camel(field_name)))

    # Run the template
    header_template = env.get_template('header.txt')
    out = header_template.render(filename=self.get_filename_for_klass(model_klass),
                                 app=APP,
                                 author=AUTHOR,
                                 today=datetime.date.today().strftime('%x'),
                                 this_year=datetime.date.today().year,
                                 company=COMPANY,
                                 superclass_is_model=superclass_is_model,
                                 superclass=superclass_name,
                                 class_name=model_klass.__name__,
                                 fields=fields)

    # Returns string to write to file
    return out
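# Hypothetical shape of the FIELD_PROPERTIES table consumed above: it maps a
# schema field class to an Objective-C @property prefix, so the loop in
# transform_klass() emits lines like
#   '@property (nonatomic, copy) NSString *userName;'
# The concrete field class names below are assumptions, not the project's
# real identifiers.
# FIELD_PROPERTIES = {
#     StringField: '@property (nonatomic, copy) NSString *',
#     IntField: '@property (nonatomic, assign) NSInteger ',
#     BooleanField: '@property (nonatomic, assign) BOOL ',
# }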
def get_value(udt, is_default=False, options=None, fake=Factory.create()):
    # The `fake` default is created once at definition time and shared
    # across calls, which avoids re-seeding a Faker instance per value.
    options = options or []
    if is_default:
        return BaseFaker.default()
    if len(options) > 0:
        if udt == 'timestamp':
            return BaseFaker.timestamp(start=options[0], end=options[1])
        else:
            return BaseFaker.random(options=options)
    if 'varchar' in udt:
        # Parse the declared length out of e.g. 'varchar(32)'.
        length = int(udt[8:-1])
        seed = BaseFaker.bs() if length <= 32 else BaseFaker.sentence()
        return to_camel(seed)[:length]
    value = {
        'serial': BaseFaker.serial(),
        'name': BaseFaker.name(fake=fake),
        'first_name': BaseFaker.first_name(fake=fake),
        'last_name': BaseFaker.last_name(fake=fake),
        'email': BaseFaker.email(fake=fake),
        'text': BaseFaker.text(fake=fake),
        'bs': BaseFaker.bs(fake=fake),
        'address': BaseFaker.address(fake=fake),
        'city': BaseFaker.city(fake=fake),
        'state': BaseFaker.state(fake=fake),
        'uuid': BaseFaker.uuid(),
        'default': BaseFaker.default(),
        'timestamp': BaseFaker.timestamp()
    }.get(udt, BaseFaker.default())
    return value
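# Usage sketch for get_value(); `udt` appears to be a PostgreSQL-style type
# name (e.g. information_schema's udt_name), which is an assumption here.
# get_value('varchar(16)')                                   # camel-cased text, <= 16 chars
# get_value('timestamp', options=['2015-01-01', '2016-01-01'])  # constrained range
# get_value('email')                                         # hits the lookup table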
def crawl(url):
    q = deque([url])
    # Resume from the set of URLs processed by earlier runs.
    with open('processed.txt', 'rb') as fp:
        processed = set(fp.read().split())
    processed.add(url)
    domain = 'http://www.ourcampaigns.com/'
    while q:
        current_url = q.popleft()
        # Normalize absolute URLs to paths relative to the domain.
        if current_url.startswith(domain):
            current_url = current_url[len(domain):]
        result = html_to_json(domain + current_url)
        if result is None:
            print ' skip', current_url
            continue
        category, uid = tokenize(current_url)
        if category == 'race':
            components = result['RACE DETAILS']['Parents'][0]['text'].split('>')
            if len(components) <= 2:
                print ' Bad', components, current_url
                continue
            if components[1].strip() != 'United States':
                continue
            position = campactify(components[-2] + components[-1])
            year = int(result['RACE DETAILS']['Term Start'][0]['text'].split(
                '-')[0].split(',')[-1].strip())
            if year > 2016 or year < 1950:
                continue
            description = 'race_{}_{}'.format(position, year)
        elif category == 'candidate':
            name = campactify(result['CANDIDATE DETAILS']['Name'][0]['text'])
            description = 'candidate_{}'.format(name)
        elif category == 'container':
            name = campactify(result['INCUMBENT']['Name'][0]['text'])
            year = result['INCUMBENT']['Won'][0]['text'].split('/')[-1].strip()
            description = 'container_{}_{}'.format(name, year)
        # print ' ' + description, current_url
        for table_title, table in result.iteritems():
            camel_title = to_camel(table_title)
            if camel_title not in ['LastGeneralElection', 'PrimaryOtherSchedule']:
                with open('data/{}_{}_{}.json'.format(description, uid, camel_title),
                          'wb') as fp:
                    json.dump(table, fp)
            # Only follow outgoing links from Governor race pages.
            if category == 'race' and 'Governor' not in description:
                continue
            for row_title, row in table.iteritems():
                for cell in row:
                    link = cell['link']
                    if is_valid_url(link) and link not in processed:
                        q.append(link)
                        processed.add(link)
    # Persist the processed set so a later run can resume where this left off.
    with open('processed.txt', 'wb') as fp:
        fp.write('\n'.join(processed))
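# A minimal sketch of the is_valid_url() filter assumed by both crawl()
# variants (an illustrative guess under the tokenize_sketch() page names
# above): keep only relative links to the three detail pages the crawler
# knows how to classify.
def is_valid_url_sketch(link):
    return bool(link) and link.startswith(
        ('RaceDetail.html', 'CandidateDetail.html', 'ContainerDetail.html'))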