예제 #1
0
def crawl(url_list):
    q = deque(url_list)
    processed = set(url_list)
    domain = 'http://www.ourcampaigns.com/'
    count = 0
    sw = StopWatch()
    while q:
        current_url = q.popleft()
        result = html_to_json(domain + current_url)
        if result is None:
            print '  skip', current_url
            continue
        category, uid = tokenize(current_url)

        if category == 'race':
            components = result['RACE DETAILS']['Parents'][0]['text'].split(
                '>')
            if len(components) <= 2:
                print '  Bad', components, current_url
                continue
            if components[1].strip() != 'United States':
                continue
            position = campactify(components[-2] + components[-1])
            year = int(result['RACE DETAILS']['Term Start'][0]['text'].split(
                '-')[0].split(',')[-1].strip())
            if year > 2017 or year < 1900:
                continue
            description = 'race_{}_{}'.format(position, year)
        elif category == 'candidate':
            name = campactify(result['CANDIDATE DETAILS']['Name'][0]['text'])
            description = 'candidate_{}'.format(name)
        elif category == 'container':
            name = campactify(result['INCUMBENT']['Name'][0]['text'])
            year = result['INCUMBENT']['Won'][0]['text'].split('/')[-1].strip()
            description = 'container_{}_{}'.format(name, year)

        count += 1
        if count % 500 == 0:
            print '{}, crawling {}'.format(count, description)

        for table_title, table in result.iteritems():
            camel_title = to_camel(table_title)
            if camel_title not in [
                    'LastGeneralElection', 'PrimaryOtherSchedule'
            ]:
                with open(
                        os.path.join(
                            JSON_DIR,
                            '{}_{}_{}.json'.format(description, uid,
                                                   camel_title)), 'wb') as fp:
                    json.dump(table, fp)
            if category == 'race' and 'Governor' not in description and 'Mayor' not in description:
                continue
            for row_title, row in table.iteritems():
                for cell in row:
                    link = cell['link']
                    if link not in processed and is_valid_url(link):
                        q.append(link)
                        processed.add(link)
    sw.tic('crawl {} urls'.format(count))
예제 #2
0
def save_capnp(data, file_name='data.capnp', test=True):
    """Serialize ``data`` into a Cap'n Proto ``FlightBook`` and write it out.

    The schema is loaded from ``DATA_ROOT / 'schemas.capnp'``. One entry of
    the ``flights`` list is filled per item of ``data``.

    Args:
        data: sized sequence of mappings. Each key is converted with
            ``to_camel`` and set as an attribute on the flight entry.
        file_name: output file name, resolved under ``DATA_ROOT``.
        test: when true, the process id is appended to the file name and
            the file is deleted again once the write has finished.
    """
    import capnp
    capnp.remove_import_hook()

    schemas = capnp.load(str(DATA_ROOT / 'schemas.capnp'))

    flight_book = schemas.FlightBook.new_message()
    flights = flight_book.init('flights', len(data))

    for i, item in enumerate(data):
        flight = flights[i]
        for key, value in item.items():
            key = to_camel(key)

            if value is not None:
                setattr(flight, key, value)
                continue

            # None cannot be stored: try -1 first and fall back to "" when
            # capnp rejects it. Presumably this distinguishes numeric from
            # text fields -- confirm against schemas.capnp.
            try:
                setattr(flight, key, -1)
            except capnp.lib.capnp.KjException:
                setattr(flight, key, "")

    if test:
        # Per-process suffix for the throwaway file written in test mode.
        file_name = "{}.{}".format(file_name, os.getpid())

    path = str(DATA_ROOT / file_name)

    try:
        with open(path, 'wb') as f:
            flight_book.write(f)
    finally:
        # Only remove what was actually created: if open() itself failed, an
        # unconditional os.remove() would raise a second error that replaces
        # the real one.
        if test and os.path.exists(path):
            os.remove(path)
예제 #3
0
    def transform_klass(self, model_klass):
        """
        Render the ``header.txt`` template for ``model_klass``.

        The template receives the file name from
        ``get_filename_for_klass``, the module-level ``APP``, ``AUTHOR`` and
        ``COMPANY`` values, today's date and year, the name of the first
        base class, the class name, and one property line per entry of
        ``model_klass._fields``.

        :param model_klass: model class to transform; must expose
            ``__bases__`` and a ``_fields`` mapping of name -> field.
        :return: the rendered text, ready to be written to a file.
        """
        # Jinja2 setup
        env = Environment(loader=PackageLoader('contrib.adapters.objc_mantle'),
                          lstrip_blocks=True, trim_blocks=True)

        # Is klass a direct Model subclass, or did it inherit from something
        # else? Compare by value: `is` tests object identity and only matched
        # here by accident of string interning.
        superclass_name = model_klass.__bases__[0].__name__
        superclass_is_model = superclass_name == 'Model'

        # One property line per field, keyed by the field's class.
        fields = [
            '%s%s;' % (FIELD_PROPERTIES[field.__class__], to_camel(field_name))
            for field_name, field in model_klass._fields.items()
        ]

        # Read the date once so `today` and `this_year` cannot disagree.
        today = datetime.date.today()

        # Run the template and return the string to write to file
        header_template = env.get_template('header.txt')
        return header_template.render(
            filename=self.get_filename_for_klass(model_klass),
            app=APP, author=AUTHOR,
            today=today.strftime('%x'),
            this_year=today.year,
            company=COMPANY,
            superclass_is_model=superclass_is_model,
            superclass=superclass_name,
            class_name=model_klass.__name__,
            fields=fields)
예제 #4
0
 def get_value(udt, is_default=False, options=(), fake=Factory.create()):
   """Return a fake value for the column type ``udt``.

   Rules are applied in this order:

   1. ``is_default`` true -> ``BaseFaker.default()``.
   2. ``options`` non-empty -> for ``'timestamp'`` a timestamp between
      ``options[0]`` and ``options[1]``; otherwise ``BaseFaker.random``
      over the options.
   3. ``udt`` contains ``'varchar'`` -> camel-cased text cut to the
      declared length.
   4. Otherwise the generator registered for ``udt``, falling back to
      ``BaseFaker.default()`` for unknown types.

   Args:
     udt: type name of the column.
     is_default: when true, short-circuit to the default value.
     options: candidate values (or a start/end pair for timestamps).
     fake: faker instance handed to the generators. The default is created
       once, when the function is defined, and shared by every call.
   """
   if is_default:
     return BaseFaker.default()
   if len(options) > 0:
     if udt == 'timestamp':
       return BaseFaker.timestamp(start=options[0], end=options[1])
     return BaseFaker.random(options=options)
   if 'varchar' in udt:
     # Characters 8..-1 hold N when udt is spelled 'varchar(N)'.
     length = int(udt[8:-1])
     seed = BaseFaker.bs() if length <= 32 else BaseFaker.sentence()
     return to_camel(seed)[:length]
   # Map each type to a callable so only the requested generator runs;
   # building a dict of already-computed values would invoke every
   # generator on every call.
   generators = {
     'serial': BaseFaker.serial,
     'name': lambda: BaseFaker.name(fake=fake),
     'first_name': lambda: BaseFaker.first_name(fake=fake),
     'last_name': lambda: BaseFaker.last_name(fake=fake),
     'email': lambda: BaseFaker.email(fake=fake),
     'text': lambda: BaseFaker.text(fake=fake),
     'bs': lambda: BaseFaker.bs(fake=fake),
     'address': lambda: BaseFaker.address(fake=fake),
     'city': lambda: BaseFaker.city(fake=fake),
     'state': lambda: BaseFaker.state(fake=fake),
     'uuid': BaseFaker.uuid,
     'default': BaseFaker.default,
     'timestamp': BaseFaker.timestamp,
   }
   return generators.get(udt, BaseFaker.default)()
예제 #5
0
def crawl(url):
    q = deque([url])
    with open('processed.txt', 'rb') as fp:
        processed = set(fp.read().split())
    processed.add(url)
    domain = 'http://www.ourcampaigns.com/'
    while q:
        current_url = q.popleft()
        if current_url.startswith(domain):
            current_url = current_url[len(domain):]
        result = html_to_json(domain + current_url)
        if result is None:
            print '  skip', current_url
            continue
        category, uid = tokenize(current_url)

        if category == 'race':
            components = result['RACE DETAILS']['Parents'][0]['text'].split(
                '>')
            if len(components) <= 2:
                print '  Bad', components, current_url
                continue
            if components[1].strip() != 'United States':
                continue
            position = campactify(components[-2] + components[-1])
            year = int(result['RACE DETAILS']['Term Start'][0]['text'].split(
                '-')[0].split(',')[-1].strip())
            if year > 2016 or year < 1950:
                continue
            description = 'race_{}_{}'.format(position, year)
        elif category == 'candidate':
            name = campactify(result['CANDIDATE DETAILS']['Name'][0]['text'])
            description = 'candidate_{}'.format(name)
        elif category == 'container':
            name = campactify(result['INCUMBENT']['Name'][0]['text'])
            year = result['INCUMBENT']['Won'][0]['text'].split('/')[-1].strip()
            description = 'container_{}_{}'.format(name, year)
        # print '    ' + description, current_url
        for table_title, table in result.iteritems():
            camel_title = to_camel(table_title)
            if camel_title not in [
                    'LastGeneralElection', 'PrimaryOtherSchedule'
            ]:
                with open(
                        'data/{}_{}_{}.json'.format(description, uid,
                                                    camel_title), 'wb') as fp:
                    json.dump(table, fp)
            if category == 'race' and 'Governor' not in description:
                continue
            for row_title, row in table.iteritems():
                for cell in row:
                    link = cell['link']
                    if is_valid_url(link) and link not in processed:
                        q.append(link)
                        processed.add(link)
    with open('processed.txt', 'wb') as fp:
        fp.write('\n'.join(processed))