def encaminhar(url):
    s = Site(url_curta=url)
    s.recuperar_url_longa()
    # if there is no long URL, go back to the root page
    if s.url_longa == "":
        return redirect(url_for('index'))
    return redirect(f'{s.url_longa}')
def index():
    if request.method == 'POST':
        s = Site(url_longa=request.form['url-longa'])
        s.cadastrar()
        short_url = request.url + s.url_curta
        return f"""Your short URL is: <a href="{short_url}">{short_url}</a>"""
    return render_template("base.html")
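# A minimal sketch, assuming the Site helper the two views above rely on.
# Everything here except the attribute/method names (url_curta, url_longa,
# cadastrar, recuperar_url_longa) is hypothetical; a real app would back
# this with a database instead of an in-memory dict.
import secrets

_SHORT_URLS = {}  # short token -> long URL (stand-in for the datastore)

class Site:
    def __init__(self, url_curta="", url_longa=""):
        self.url_curta = url_curta
        self.url_longa = url_longa

    def cadastrar(self):
        # Register the long URL under a freshly generated short token.
        self.url_curta = secrets.token_urlsafe(4)
        _SHORT_URLS[self.url_curta] = self.url_longa

    def recuperar_url_longa(self):
        # Resolve this short token; leave url_longa empty when unknown.
        self.url_longa = _SHORT_URLS.get(self.url_curta, "")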
def _get_site(self):
    site = ndb.Key(Site, '_site').get()
    if not site:
        logging.error('Saving site')
        site = Site(id='_site')
        site.github_token = ''
        site.put()
    return site
def setUp(self):
    """Set up dependencies for web pages' loading."""
    self.client = server.app.test_client()
    server.app.config['TESTING'] = True
    # server._old_sites_list = server.sites_list
    server.sites_list = [
        Site(name="Berkeley Arts Magnet"),
        Site(name="Young Adult Project")
    ]
def post(self):
    url = self.request.POST.get('url')
    try:
        result = urllib2.urlopen(url)
        content = result.read()
        site = Site()
        site.url = url
        site.count = self.__count_imgs(content)
        site.put()
        self.__grab_urls(content)
    except:
        # Silently ignore fetch/parse failures for unreachable URLs.
        pass
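# A hedged sketch of the two private helpers the handler above assumes; the
# bodies are hypothetical (only the names __count_imgs and __grab_urls come
# from the original), written as methods of the same handler class.
import re

def __count_imgs(self, content):
    # Count <img> tags in the fetched page source.
    return len(re.findall(r'<img\b', content, re.IGNORECASE))

def __grab_urls(self, content):
    # Collect absolute link targets, e.g. to enqueue follow-up fetches.
    return re.findall(r'href=["\'](https?://[^"\']+)["\']', content)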
def get(self):
    sites_data = Site.query().fetch(keys_only=True)
    sites_data = ndb.get_multi(sites_data)
    context = {'data': sites_data}
    content = self.jinja2.render_template('sites.html', **context)
    self.response.headers.add_header('Access-Control-Allow-Origin', '*')
    self.response.write(content)
def process_job():
    """Processes a URL web-scraping request.

    If the request has been processed before, the result is already in the
    database; otherwise, process it and add the data to the database."""
    r = redis.StrictRedis()
    while True:
        curr_job = r.blpop('job_queue', 0)[1]
        r.hset('status', curr_job, 'processing')
        print('current job ID:', curr_job)
        # convert bytes to string
        url = r.hget('urls', curr_job).decode("utf-8")
        print('Current URL:', url)
        if Site.query.filter_by(url=url).first():
            # this url was requested before and is already in the db
            r.hset('status', curr_job, 'complete')
            print('Job', curr_job, 'Completed')
        else:
            try:
                # fetch the url's page source
                html = str(get_html(url))
                print('Successfully retrieved HTML')
                # add results to database
                db.session.add(Site(url=url, html=html))
                db.session.commit()
                print('Added to database')
                r.hset('status', curr_job, 'complete')
                print('Job', curr_job, 'Completed')
            except ValueError:
                r.hset('status', curr_job, 'abort')
                print('Job', curr_job, 'Aborted')
            except TimeoutError:
                r.hset('status', curr_job, 'timeout')
                print('Job', curr_job, 'Timed Out')
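# A hedged sketch of the producer side this worker assumes: the job ID is
# pushed onto 'job_queue', the URL lives in the 'urls' hash, and 'status'
# starts out queued. The uuid4 job ID is an assumption; the original only
# shows the consumer.
import uuid
import redis

def submit_job(url):
    r = redis.StrictRedis()
    job_id = str(uuid.uuid4())
    r.hset('urls', job_id, url)
    r.hset('status', job_id, 'queued')
    r.rpush('job_queue', job_id)  # the worker's blpop pops from the left
    return job_id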
def get_site(self, required=False, create_if_missing=False):
    """Returns the Site object given by the URL."""
    if self.response_dict().site:
        return self.response_dict().site
    domain = self.url_arg(0)
    if domain:
        key_name = Site.key_name_from_domain(domain)
        site = Site.get_by_key_name(key_name)
        if not site and create_if_missing:
            # create a site (but don't save it)
            site = Site(key_name=key_name, domain=domain)
        if site:
            self.response_dict(site=site)  # for the template
            return site
    if required:
        raise NotFoundException("site not found")
def get(self):
    templatevars = {"title": "PREOMR - sites"}
    sites = [(s, get_count("Work-%s" % s.name))
             for s in Site.all().fetch(100)]
    templatevars['sites'] = sites
    total = sum(s[1] for s in sites)
    templatevars["totalhere"] = total
    templatevars["overalltotal"] = get_count("Work")
    self.generate("sites.html", templatevars)
def parse_site_cell(line: str, coverage_radius: float):
    # Split off the two-character prefix, then separate the site point
    # from the corner list on the "|" delimiter.
    s = line[2:].split("|")
    point = s[0]
    corners = s[1][1:len(s[1]) - 2]  # strip the surrounding brackets
    p = point.split(":")
    site_x = p[0][1:]            # drop the leading "("
    site_y = p[1][:len(p[1]) - 1]  # drop the trailing ")"
    str_points = corners.split(", ")
    points = []
    for point in str_points:
        coordinates = point[1:len(point) - 1].split(":")
        points.append((float(coordinates[0]), float(coordinates[1])))
    return Site(float(site_x), float(site_y), coverage_radius, points)
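# Example input inferred from the slicing above (an assumption: a
# two-character prefix, a "(x:y)" site point, then a "|", then a
# "[(x:y), ...]" corner list followed by a trailing newline).
line = "S (356.2:127.4)|[(300.0:100.0), (400.0:100.0), (356.2:160.0)]\n"
site = parse_site_cell(line, coverage_radius=100.0)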
def generate_random_solution(nb_sensor: int, sensor_coverage_distance: float,
                             x_plane_size, y_plane_size):
    sites = []
    for i in range(nb_sensor):
        realistic = False
        while not realistic:
            site = Site(random.uniform(0, x_plane_size),
                        random.uniform(0, y_plane_size),
                        sensor_coverage_distance)
            if len(sites) > 0:
                temp = [copy.copy(existing) for existing in sites]
                temp.append(site)
                realistic = solution_is_realistic(temp)
            else:
                realistic = True
        sites.append(site)
    return sites
def generate_random_solution_b(nb_sensor: int, sensor_coverage_distance: float,
                               x_plane_size, y_plane_size):
    min_distance_between_sites = sensor_coverage_distance * 0.75
    sites = []
    for i in range(nb_sensor):
        min_distance_is_satisfied = False
        while not min_distance_is_satisfied:
            site = Site(random.uniform(0, x_plane_size),
                        random.uniform(0, y_plane_size),
                        sensor_coverage_distance)
            if len(sites) > 0:
                previous_point = sites[len(sites) - 1]
                point_distance = compute_point_distance(
                    previous_point.x, previous_point.y, site.x, site.y)
                if min_distance_between_sites <= point_distance <= sensor_coverage_distance:
                    min_distance_is_satisfied = True
            else:
                min_distance_is_satisfied = True
        sites.append(site)
    return sites
def sites_list():
    """Return list of sites based on the location the user searched for."""
    user_location = request.args.get('search')
    places = search_by_text(user_location)
    final_places = []
    for place in places:
        site_in_db = Site.query.filter_by(site_id=place['place_id']).first()
        if site_in_db:
            final_places.append((place, site_in_db.thumbnail))
        elif 'photos' in place:
            # get photo via the photo reference
            thumb_url = get_thumnail_url(place['photos'][0]['photo_reference'])
            # store new site to db
            geo_location = place['geometry']['location']
            new_site = Site(site_id=place['place_id'],
                            name=place['name'],
                            address=place['formatted_address'],
                            thumbnail=thumb_url,
                            lat=geo_location['lat'],
                            lng=geo_location['lng'])
            db.session.add(new_site)
            final_places.append((place, thumb_url))
    # commit the new places to the db
    db.session.commit()
    if not final_places:
        flash("Sorry, no results were found for your search! Please try again")
    return render_template("sites.html", places=final_places,
                           api_key=GOOGLE_API_KEY)
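# A plausible sketch of the search_by_text helper used above, assuming it
# wraps the Google Places Text Search API; the real implementation is not
# shown in the original.
import requests

def search_by_text(query):
    # Return the raw list of place results for a free-text location search.
    resp = requests.get(
        'https://maps.googleapis.com/maps/api/place/textsearch/json',
        params={'query': query, 'key': GOOGLE_API_KEY})
    return resp.json().get('results', [])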
def update_site_pos_with_cell_centroid(site: Site):
    centroid = polygon_centre_area(site.cell_corners)
    updated_pos = find_weighted_updated_site_location((site.x, site.y),
                                                      centroid, WEIGHT)
    site.x = updated_pos[0]
    site.y = updated_pos[1]
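# A hedged sketch of polygon_centre_area, assuming it returns the area
# centroid of the cell polygon as an (x, y) tuple via the shoelace formula;
# only the name and call shape come from the original.
def polygon_centre_area(corners):
    a = cx = cy = 0.0
    n = len(corners)
    for i in range(n):
        x0, y0 = corners[i]
        x1, y1 = corners[(i + 1) % n]
        cross = x0 * y1 - x1 * y0
        a += cross
        cx += (x0 + x1) * cross
        cy += (y0 + y1) * cross
    a *= 0.5  # signed polygon area
    return (cx / (6.0 * a), cy / (6.0 * a))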
def find_weighted_updated_site_location(initial_point, target_point, weight):
    # (Signature and the `vector` definition reconstructed from the call
    # site in update_site_pos_with_cell_centroid above.)
    vector = (target_point[0] - initial_point[0],
              target_point[1] - initial_point[1])
    vector_distance = math.sqrt((target_point[0] - initial_point[0])**2 +
                                (target_point[1] - initial_point[1])**2)
    u = vector[0] / vector_distance, vector[1] / vector_distance
    weighted_point = (initial_point[0] + u[0] * vector_distance * weight,
                      initial_point[1] + u[1] * vector_distance * weight)
    return weighted_point


if __name__ == "__main__":
    WEIGHT = 1
    coverage_radius = 100
    start = time.time()
    # iterations = voronoi_relaxation(generate_random_solution(34, 100, 500, 500), 500, 500, 15)
    sites = [
        Site(356.19560084901246, 127.42318033479866, coverage_radius),
        Site(418.7012015568561, 81.20783302622547, coverage_radius),
        Site(282.2647422336893, 139.04607145486347, coverage_radius),
        Site(462.71824390613375, 138.61894677935322, coverage_radius),
        Site(495.8338855452489, 50.272726680936685, coverage_radius),
        Site(374.599719019584, 108.28343578447591, coverage_radius),
        Site(372.24584006522247, 129.03846814142344, coverage_radius),
        Site(406.17483780539385, 140.24310280444857, coverage_radius),
        Site(466.5597007333434, 61.28632635196157, coverage_radius),
        Site(233.796901623617, 120.66404604823155, coverage_radius)
    ]
    iterations = voronoi_relaxation(sites, 500, 500, 20)
    print(str(time.time() - start) + " seconds to run for " +
          str(iterations) + " iterations.")
def get(self):
    self.enforce_admin()
    name = self.request.get("name")
    if name is None or "" == name:
        self.jsonout(status="error", msg="No name specified")
        return
    name = name.strip()
    sq = Site.gql("WHERE name = :1", name)
    s = sq.get()
    if s is None:
        s = Site(name=name)
        s.put()
        increment("Site")
        msg = "site %s added with id %d"
        format = (s.name, s.key().id())
        self.jsonout(status="ok", msg=msg, format=format,
                     key=str(s.key()), id=s.key().id())
    else:
        self.jsonout(status="dup",
                     msg="%s already existed as site with id %d",
                     format=(name, s.key().id()),
                     id=s.key().id(),
                     key=str(s.key()))
def post(self):
    '''
    Create new sites to be included in username searches.

    **Example Request**

    .. sourcecode:: json

        {
            "sites": [
                {
                    "name": "about.me",
                    "url": "http://about.me/%s",
                    "status_code": 200,
                    "match_type": "text",
                    "match_expr": "Foo Bar Baz",
                    "test_username_pos": "john",
                    "test_username_neg": "dPGMFrf72SaS",
                    "headers": {"referer": "http://www.google.com"},
                    "censor_images": false,
                    "wait_time": 5,
                    "use_proxy": false
                },
                ...
            ]
        }

    **Example Response**

    .. sourcecode:: json

        {
            "message": "1 new sites created"
        }

    :<header Content-Type: application/json
    :<header X-Auth: the client's auth token
    :<json list sites: a list of sites to create
    :<json string sites[n].name: name of site
    :<json string sites[n].url: username search url for the site
    :<json int sites[n].status_code: the status code to check for
        determining a match (nullable)
    :<json string sites[n].match_type: type of match (see get_match_types()
        for valid match types) (nullable)
    :<json string sites[n].match_expr: expression to use for determining a
        page match (nullable)
    :<json string sites[n].test_username_pos: username that exists on site
        (used for testing)
    :<json string sites[n].test_username_neg: username that does not exist
        on site (used for testing)
    :<json array sites[n].headers: custom headers
    :<json bool sites[n].censor_images: whether to censor images from this
        profile
    :<json int sites[n].wait_time: time (in seconds) to wait for updates
        after page is loaded
    :<json bool sites[n].use_proxy: whether to proxy requests for this
        profile URL

    :>header Content-Type: application/json
    :>json string message: API response message

    :status 202: sites created
    :status 400: invalid request body
    :status 401: authentication required
    '''
    request_json = request.get_json()
    sites = []

    # Ensure all data is valid before db operations
    for site_json in request_json['sites']:
        validate_request_json(site_json, _site_attrs)

        if ((site_json['match_type'] is None or
             site_json['match_expr'] is None) and
                site_json['status_code'] is None):
            raise BadRequest('At least one of the following is required: '
                             'status code or page match.')

        if '%s' not in site_json['url']:
            raise BadRequest('URL must contain replacement character %s')

    # Save sites
    for site_json in request_json['sites']:
        test_username_pos = site_json['test_username_pos'].lower().strip()
        site = Site(name=site_json['name'].strip(),
                    url=site_json['url'].lower().strip(),
                    test_username_pos=test_username_pos)
        site.status_code = site_json['status_code']
        site.match_expr = site_json['match_expr']
        site.match_type = site_json['match_type']

        if 'test_username_neg' in site_json:
            site.test_username_neg = site_json['test_username_neg'] \
                .lower().strip()

        if 'headers' in site_json:
            site.headers = site_json['headers']

        g.db.add(site)

        try:
            g.db.flush()
            sites.append(site)
        except IntegrityError:
            g.db.rollback()
            raise BadRequest('Site URL {} already exists.'.format(site.url))

    g.db.commit()

    # Send redis notifications
    for site in sites:
        notify_mask_client(channel='site',
                           message={'id': site.id,
                                    'name': site.name,
                                    'status': 'created',
                                    'resource': None})

    message = '{} new sites created'.format(len(request_json['sites']))
    response = jsonify(message=message)
    response.status_code = 202

    return response
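# A hedged client-side sketch for the endpoint above. The URL path and token
# value are hypothetical; the header names and body shape come from the
# docstring.
import json
import requests

payload = {'sites': [{
    'name': 'about.me', 'url': 'http://about.me/%s', 'status_code': 200,
    'match_type': None, 'match_expr': None, 'test_username_pos': 'john',
    'test_username_neg': 'dPGMFrf72SaS', 'headers': {},
    'censor_images': False, 'wait_time': 5, 'use_proxy': False,
}]}
response = requests.post('https://example.com/api/site/',
                         headers={'X-Auth': '<auth token>',
                                  'Content-Type': 'application/json'},
                         data=json.dumps(payload))
print(response.json()['message'])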
def _create_fixture_sites(self, config):
    ''' Create site fixtures. '''
    session = app.database.get_session(self._db)
    about_me = Site(name='About.me', url='http://about.me/%s',
                    category='social', status_code=200,
                    test_username_pos='bob')
    session.add(about_me)
    anobii = Site(name='Anobii', url='http://www.anobii.com/%s/books',
                  category='books', match_type='css',
                  match_expr='h1.person_heading', test_username_pos='bob')
    session.add(anobii)
    ask_fm = Site(name='Ask FM', url='http://ask.fm/%s', category='social',
                  status_code=200, test_username_pos='tipsofschool')
    session.add(ask_fm)
    audioboom = Site(name='Audioboom', url='http://audioboom.com/%s',
                     category='music', status_code=200,
                     test_username_pos='bob')
    session.add(audioboom)
    authorstream = Site(name='Authorstream',
                        url='http://www.authorstream.com/%s/',
                        category='social', status_code=200,
                        test_username_pos='tiikmconferences')
    session.add(authorstream)
    badoo = Site(name='Badoo', url='http://badoo.com/%s/', category='dating',
                 status_code=200, test_username_pos='dave')
    session.add(badoo)
    behance = Site(name='Behance', url='https://www.behance.net/%s',
                   category='social', status_code=200,
                   test_username_pos='juste')
    session.add(behance)
    bitbucket = Site(name='Bitbucket', url='https://bitbucket.org/%s',
                     category='coding', status_code=200,
                     test_username_pos='jespern')
    session.add(bitbucket)
    blip_fm = Site(name='Blip FM', url='http://blip.fm/%s', category='music',
                   status_code=200, test_username_pos='mark_till')
    session.add(blip_fm)
    blogmarks = Site(name='Blogmarks', url='http://blogmarks.net/user/%s',
                     category='social', match_type='css',
                     match_expr='div#infos-user', test_username_pos='Krome')
    session.add(blogmarks)
    blogspot = Site(name='Blogspot', url='http://%s.blogspot.co.uk',
                    category='social', status_code=200,
                    test_username_pos='takingroot-jr')
    session.add(blogspot)
    bodybuilding = Site(name='Bodybuilding',
                        url='http://bodyspace.bodybuilding.com/%s/',
                        category='health', match_type='css',
                        match_expr='div.BodyBanner',
                        test_username_pos='Scarfdaddy')
    session.add(bodybuilding)
    break_com = Site(name='Break', url='http://www.break.com/user/%s',
                     category='video', match_type='css',
                     match_expr='section.profile-head',
                     test_username_pos='jenny')
    session.add(break_com)
    cafemom = Site(name='Cafemom', url='http://www.cafemom.com/home/%s',
                   category='social', match_type='css',
                   match_expr='div#member-info', test_username_pos='jane')
    session.add(cafemom)
    car_domain = Site(name='Car Domain',
                      url='http://www.cardomain.com/member/%s/',
                      category='automotive', status_code=200,
                      test_username_pos='dan')
    session.add(car_domain)
    codeplex = Site(name='Codeplex',
                    url='http://www.codeplex.com/site/users/view/%s',
                    category='coding', match_type='css',
                    match_expr='h1.user_name', test_username_pos='dan')
    session.add(codeplex)
    colour_lovers = Site(name='Colour Lovers',
                         url='http://www.colourlovers.com/lover/%s',
                         category='art and design', match_type='css',
                         match_expr='div.column-container',
                         test_username_pos='bob')
    session.add(colour_lovers)
    conferize = Site(name='Conferize', url='https://www.conferize.com/u/%s/',
                     category='business', match_type='css',
                     match_expr='div.hero--user', test_username_pos='dan')
    session.add(conferize)
    copy_taste = Site(name='Copytaste', url='http://copytaste.com/profile/%s',
                      category='social', status_code=200,
                      test_username_pos='metegulec')
    session.add(copy_taste)
    cruisemates = Site(name='Cruisemates',
                       url='http://www.cruisemates.com/forum/members/%s.html',
                       category='travel', match_type='css',
                       match_expr='div#main_userinfo',
                       test_username_pos='trip')
    session.add(cruisemates)
    daily_motion = Site(name='Dailymotion',
                        url='http://www.dailymotion.com/%s',
                        category='video', status_code=200,
                        test_username_pos='fanreviews')
    session.add(daily_motion)
    delicious = Site(name='Delicious', url='https://del.icio.us/%s',
                     category='social', status_code=200,
                     test_username_pos='john')
    session.add(delicious)
    deviant_art = Site(name='DeviantArt', url='http://%s.deviantart.com/',
                       category='image', status_code=200,
                       test_username_pos='marydoodles')
    session.add(deviant_art)
    diigo = Site(name='Diigo', url='https://www.diigo.com/profile/%s',
                 category='bookmarking', match_type='css',
                 match_expr='div#avatarSection', test_username_pos='hunter53')
    session.add(diigo)
    disqus = Site(name='Disqus', url='https://disqus.com/by/%s/',
                  category='social', status_code=200,
                  test_username_pos='willwillibe')
    session.add(disqus)
    diy = Site(name='DIY', url='https://diy.org/%s',
               category='home improvement', status_code=200,
               test_username_pos='bob')
    session.add(diy)
    dribble = Site(name='Dribble', url='https://www.dribbble.com/%s',
                   category='art and design', status_code=200,
                   test_username_pos='kirp')
    session.add(dribble)
    ebay = Site(name='Ebay', url='http://www.ebay.com/usr/%s',
                category='shopping', match_type='css',
                match_expr='div#user_image', test_username_pos='max')
    session.add(ebay)
    etsy = Site(name='Etsy', url='https://www.etsy.com/people/%s',
                category='shopping', status_code=200,
                test_username_pos='betsy')
    session.add(etsy)
    families = Site(name='Families', url='http://www.families.com/author/%s',
                    category='lifestyle', match_type='css',
                    match_expr='div#author-description',
                    test_username_pos='JenThorpe')
    session.add(families)
    fanpop = Site(name='Fanpop', url='http://www.fanpop.com/fans/%s',
                  category='entertainment', match_type='css',
                  match_expr='div.user-header', test_username_pos='dan')
    session.add(fanpop)
    ffffound = Site(name='FFFFound', url='http://ffffound.com/home/%s/found/',
                    category='image', status_code=200,
                    test_username_pos='tobbz')
    session.add(ffffound)
    flavours = Site(name='Flavours', url='http://%s.flavors.me',
                    category='social', status_code=200,
                    test_username_pos='john')
    session.add(flavours)
    flickr = Site(name='Flickr', url='https://www.flickr.com/photos/%s/',
                  category='image', status_code=200, test_username_pos='adam')
    session.add(flickr)
    foodspotting = Site(
        name='Foodspotting',
        url='http://www.foodspotting.com/%s',
        category='lifestyle',
        status_code=200,
        test_username_pos='dylan',
        # This site handles names with leading numerics strangely, so we
        # need to force an alphabetic negative case.
        test_username_neg='asdfqwerasdfqwer')
    session.add(foodspotting)
    fotolog = Site(name='Fotolog', url='http://www.fotolog.com/%s/',
                   category='image', status_code=200,
                   test_username_pos='anna')
    session.add(fotolog)
    foursquare = Site(name='Foursquare', url='https://foursquare.com/%s',
                      category='social', status_code=200,
                      test_username_pos='john')
    session.add(foursquare)
    freesound = Site(name='Freesound',
                     url='http://www.freesound.org/people/%s/',
                     category='music', status_code=200,
                     test_username_pos='john')
    session.add(freesound)
    friend_finder_x = Site(name='FriendFinder-X',
                           url='http://www.friendfinder-x.com/profile/%s',
                           category='dating', match_type='css',
                           match_expr='div#tmpl_member_profile_header',
                           test_username_pos='daniel')
    session.add(friend_finder_x)
    funny_or_die = Site(name='Funny or Die',
                        url='http://www.funnyordie.com/%s',
                        category='video', status_code=200,
                        test_username_pos='bob')
    session.add(funny_or_die)
    get_it_on = Site(name='GETitOn', url='http://getiton.com/profile/%s',
                     category='dating', match_type='css',
                     match_expr='div#profile_page_wrapper',
                     test_username_pos='chris')
    session.add(get_it_on)
    github = Site(name='Github', url='https://github.com/%s',
                  category='coding', status_code=200,
                  test_username_pos='google')
    session.add(github)
    godtube = Site(name='GodTube', url='http://www.godtube.com/%s/',
                   category='video', status_code=200,
                   test_username_pos='bball1989')
    session.add(godtube)
    gogobot = Site(name='Gogobot', url='http://www.gogobot.com/user/%s',
                   category='travel', status_code=200,
                   test_username_pos='dan')
    session.add(gogobot)
    goodreads = Site(name='Goodreads', url='http://www.goodreads.com/%s',
                     category='entertainment', status_code=200,
                     test_username_pos='seal')
    session.add(goodreads)
    gravatar = Site(name='Gravatar', url='http://en.gravatar.com/profiles/%s',
                    category='social', status_code=200,
                    test_username_pos='simon')
    session.add(gravatar)
    hubpages = Site(name='Hubpages', url='http://%s.hubpages.com/',
                    category='blog', status_code=200, test_username_pos='bob')
    session.add(hubpages)
    i_am_pregnant = Site(name='i-am-pregnant',
                         url='http://www.i-am-pregnant.com/members/%s/',
                         category='health', status_code=200,
                         test_username_pos='shiv77')
    session.add(i_am_pregnant)
    if_this_then_that = Site(name='IFTTT', url='https://ifttt.com/p/%s/shared',
                             category='technology', status_code=200,
                             test_username_pos='bsaren')
    session.add(if_this_then_that)
    image_shack = Site(name='ImageShack', url='https://imageshack.com/user/%s',
                       category='image', match_type='css',
                       match_expr='header.user-profile',
                       test_username_pos='Nicholas230')
    session.add(image_shack)
    imgur = Site(name='imgur', url='http://imgur.com/user/%s',
                 category='image', status_code=200,
                 test_username_pos='ThatPervert')
    session.add(imgur)
    instagram = Site(name='Instagram', url='https://www.instagram.com/%s/',
                     category='social', status_code=200,
                     test_username_pos='kensingtonroyal')
    session.add(instagram)
    instructables = Site(name='instructables',
                         url='http://www.instructables.com/member/%s/',
                         category='learning', status_code=200,
                         test_username_pos='shags_j')
    session.add(instructables)
    interpals = Site(name='InterPals', url='https://www.interpals.net/%s',
                     category='social', match_type='css',
                     match_expr='div.profile', test_username_pos='Seven89')
    session.add(interpals)
    keybase = Site(name='Keybase', url='https://keybase.io/%s',
                   category='crypto', status_code=200,
                   test_username_pos='mehaase')
    session.add(keybase)
    kongregate = Site(name='Kongregate',
                      url='http://www.kongregate.com/accounts/%s',
                      category='gaming', status_code=200,
                      test_username_pos='Truestrike')
    session.add(kongregate)
    lanyrd = Site(name='Lanyrd', url='http://lanyrd.com/profile/%s/',
                  category='social', status_code=200,
                  test_username_pos='shanselman')
    session.add(lanyrd)
    last_fm = Site(name='Last.fm', url='http://www.last.fm/user/%s',
                   category='music', status_code=200,
                   test_username_pos='FrancaesG')
    session.add(last_fm)
    law_of_attraction = Site(
        name='Law of Attraction',
        url='http://www.lawofattractionsingles.com/%s',
        category='dating',
        match_type='css',
        match_expr='div.prof_top_block',
        test_username_pos='Jenniferlynnmaui')
    session.add(law_of_attraction)
    library_thing = Site(name='LibraryThing',
                         url='https://www.librarything.com/profile/%s',
                         category='learning', match_type='css',
                         match_expr='div.profile',
                         test_username_pos='Medievalgirl')
    session.add(library_thing)
    lifeboat = Site(name='lifeboat', url='https://oc.tc/%s',
                    category='gaming', status_code=200,
                    test_username_pos='Matilaina')
    session.add(lifeboat)
    linked_in = Site(name='LinkedIn', url='https://www.linkedin.com/in/%s',
                     category='social', status_code=200,
                     test_username_pos='markhaase')
    session.add(linked_in)
    marketing_land = Site(name='Marketing Land',
                          url='http://marketingland.com/author/%s',
                          category='business', status_code=200,
                          test_username_pos='barb-palser')
    session.add(marketing_land)
    mate1 = Site(name='Mate1.com', url='http://www.mate1.com/profiles/%s',
                 category='dating', status_code=200,
                 test_username_pos='janedoe')
    session.add(mate1)
    medium = Site(name='Medium', url='https://medium.com/@%s',
                  category='social', status_code=200,
                  test_username_pos='erinshawstreet')
    session.add(medium)
    meetzur = Site(name='Meetzur', url='http://www.meetzur.com/%s',
                   category='social', match_type='css',
                   match_expr='div.profile-left',
                   test_username_pos='sachin99')
    session.add(meetzur)
    mixcloud = Site(name='Mixcloud', url='https://www.mixcloud.com/%s/',
                    category='music', status_code=200,
                    test_username_pos='dublab')
    session.add(mixcloud)
    # This site was out of service at the time I tried testing it, so I
    # could not test this criteria.
    mixcrate = Site(name='mixcrate', url='http://www.mixcrate.com/%s',
                    category='music', status_code=200,
                    test_username_pos='kennyrock')
    session.add(mixcrate)
    mixlr = Site(name='Mixlr', url='http://mixlr.com/%s/', category='music',
                 status_code=200, test_username_pos='therwandan')
    session.add(mixlr)
    mod_db = Site(name='Mod DB', url='http://www.moddb.com/members/%s',
                  category='gaming', status_code=200,
                  test_username_pos='hugebot')
    session.add(mod_db)
    muck_rack = Site(name='Muck Rack', url='https://muckrack.com/%s',
                     category='gaming', status_code=200,
                     test_username_pos='scottkleinberg')
    session.add(muck_rack)
    mybuilder_com = Site(name='MyBuilder.com',
                         url='https://www.mybuilder.com/profile/view/%s',
                         category='business', status_code=200,
                         test_username_pos='kdbuildingservices')
    session.add(mybuilder_com)
    mylot = Site(name='myLot', url='http://www.mylot.com/%s',
                 category='social', status_code=200,
                 test_username_pos='LovingMyBabies')
    session.add(mylot)
    myspace = Site(name='Myspace', url='https://myspace.com/%s',
                   category='social', status_code=200,
                   test_username_pos='kesha')
    session.add(myspace)
    netvibes = Site(name='Netvibes', url='http://www.netvibes.com/%s',
                    category='business', status_code=200,
                    test_username_pos='grade3kis')
    session.add(netvibes)
    pandora = Site(name='Pandora', url='https://www.pandora.com/profile/%s',
                   category='music', match_type='css',
                   match_expr='div#user_info_container',
                   test_username_pos='mehaase')
    session.add(pandora)
    photoblog = Site(name='PhotoBlog', url='https://www.photoblog.com/%s',
                     category='social', status_code=200,
                     test_username_pos='canon6d')
    session.add(photoblog)
    photobucket = Site(name='Photobucket',
                       url='http://photobucket.com/user/%s/library/',
                       category='image', status_code=200,
                       test_username_pos='darkgladir')
    session.add(photobucket)
    picture_trail = Site(name='PictureTrail',
                         url='http://www.picturetrail.com/%s',
                         category='image', match_type='css',
                         match_expr='td.IntroTitle-text-wt',
                         test_username_pos='victoria15')
    session.add(picture_trail)
    pink_bike = Site(name='Pinkbike', url='http://www.pinkbike.com/u/%s/',
                     category='entertainment', status_code=200,
                     test_username_pos='mattwragg')
    session.add(pink_bike)
    pinterest = Site(name='Pinterest', url='https://www.pinterest.com/%s/',
                     category='social', status_code=200,
                     test_username_pos='mehaase')
    session.add(pinterest)
    playlists_net = Site(name='Playlists.Net',
                         url='http://playlists.net/members/%s',
                         category='music', status_code=200,
                         test_username_pos='WhatisSoul')
    session.add(playlists_net)
    plurk = Site(name='Plurk', url='http://www.plurk.com/%s',
                 category='social', match_type='css',
                 match_expr='span.nick_name',
                 test_username_pos='xxSaltandPepperxx')
    session.add(plurk)
    rapid7_community = Site(name='Rapid7 Community',
                            url='https://community.rapid7.com/people/%s',
                            category='technology', status_code=200,
                            test_username_pos='dabdine')
    session.add(rapid7_community)
    # This site has banned our Splash IP so I cannot test it.
    rate_your_music = Site(name='Rate Your Music',
                           url='http://rateyourmusic.com/~%s',
                           category='music', status_code=200,
                           test_username_pos='silvioporto')
    session.add(rate_your_music)
    readability = Site(name='Readability', url='https://readability.com/%s/',
                       category='entertainment', status_code=200,
                       test_username_pos='adam')
    session.add(readability)
    reddit = Site(name='Reddit', url='https://www.reddit.com/user/%s',
                  category='social', status_code=200,
                  test_username_pos='mehaase')
    session.add(reddit)
    scratch = Site(name='Scratch', url='https://scratch.mit.edu/users/%s/',
                   category='social', status_code=200,
                   test_username_pos='MeTwo')
    session.add(scratch)
    setlist_fm = Site(name='setlist.fm', url='http://www.setlist.fm/user/%s',
                      category='music', status_code=200,
                      test_username_pos='tw21')
    session.add(setlist_fm)
    shopcade = Site(name='Shopcade', url='https://www.shopcade.com/%s',
                    category='social', status_code=200,
                    test_username_pos='salonidahake')
    session.add(shopcade)
    # This site occasionally throws errors when testing. Maybe it doesn't
    # like having two requests so fast?
    single_muslim = Site(
        name='SingleMuslim',
        url='https://www.singlemuslim.com/searchuser/%s/abc',
        category='dating',
        match_type='css',
        match_expr='div.userProfileView',
        test_username_pos='YoghurtTub')
    session.add(single_muslim)
    slashdot = Site(name='Slashdot', url='https://slashdot.org/~%s',
                    category='technology', match_type='css',
                    match_expr='article#user_bio',
                    test_username_pos='Locke2005')
    session.add(slashdot)
    slideshare = Site(name='SlideShare', url='http://www.slideshare.net/%s',
                      category='technology', status_code=200,
                      test_username_pos='dmc500hats')
    session.add(slideshare)
    smite_guru = Site(name='SmiteGuru',
                      url='http://smite.guru/stats/xb/%s/summary',
                      category='gaming', match_type='css',
                      match_expr='div.header.panel',
                      test_username_pos='WatsonV3')
    session.add(smite_guru)
    smug_mug = Site(name='SmugMug', url='https://%s.smugmug.com/',
                    category='image', status_code=200,
                    test_username_pos='therescueddog')
    session.add(smug_mug)
    smule = Site(name='Smule', url='http://www.smule.com/%s',
                 category='music', status_code=200,
                 test_username_pos='AbsurdJoker')
    session.add(smule)
    snooth = Site(name='Snooth', url='http://www.snooth.com/profiles/%s/',
                  category='music', match_type='css',
                  match_expr='div.profile-header',
                  test_username_pos='dvogler')
    session.add(snooth)
    soldier_x = Site(name='SoldierX', url='https://www.soldierx.com/hdb/%s',
                     category='technology', match_type='css',
                     match_expr='div.field-field-hdb-photo',
                     test_username_pos='achillean')
    session.add(soldier_x)
    sound_cloud = Site(name='SoundCloud', url='https://soundcloud.com/%s',
                       category='music', status_code=200,
                       test_username_pos='youngma')
    session.add(sound_cloud)
    soup = Site(name='Soup', url='http://%s.soup.io/', category='social',
                match_type='css', match_expr='div#userinfo',
                test_username_pos='nattaly')
    session.add(soup)
    source_forge = Site(name='SourceForge',
                        url='https://sourceforge.net/u/%s/profile/',
                        category='technology', status_code=200,
                        test_username_pos='ronys')
    session.add(source_forge)
    speaker_deck = Site(name='Speaker Deck', url='https://speakerdeck.com/%s',
                        category='technology', status_code=200,
                        test_username_pos='rocio')
    session.add(speaker_deck)
    sporcle = Site(name='Sporcle', url='http://www.sporcle.com/user/%s',
                   category='entertainment', match_type='css',
                   match_expr='div#UserBox', test_username_pos='lolshortee')
    session.add(sporcle)
    steam = Site(name='Steam', url='http://steamcommunity.com/id/%s',
                 category='gaming', match_type='css',
                 match_expr='div.profile_page', test_username_pos='tryh4rdz')
    session.add(steam)
    stupid_cancer = Site(
        name='Stupidcancer',
        url='http://stupidcancer.org/community/profile/%s',
        category='social',
        status_code=200,
        test_username_pos='CatchMeYes')
    session.add(stupid_cancer)
    # Tribe.net was down when I was testing. I could not verify that these
    # settings work.
    tribe = Site(name='Tribe', url='http://people.tribe.net/%s',
                 category='social', status_code=200,
                 test_username_pos='violetta')
    session.add(tribe)
    trip_advisor = Site(name='TripAdvisor',
                        url='https://www.tripadvisor.com/members/%s',
                        category='social', status_code=200,
                        test_username_pos='scrltd16')
    session.add(trip_advisor)
    tumblr = Site(name='Tumblr', url='http://%s.tumblr.com/',
                  category='social', status_code=200,
                  test_username_pos='seanjacobcullen')
    session.add(tumblr)
    twitter = Site(name='Twitter', url='https://twitter.com/%s',
                   category='social', status_code=200,
                   test_username_pos='mehaase')
    session.add(twitter)
    untappd = Site(name='Untappd', url='https://untappd.com/user/%s',
                   category='entertainment', status_code=200,
                   test_username_pos='samelawrence')
    session.add(untappd)
    vimeo = Site(name='Vimeo', url='https://vimeo.com/%s', category='image',
                 status_code=200, test_username_pos='mikeolbinski')
    session.add(vimeo)
    visualize_us = Site(name='VisualizeUs', url='http://vi.sualize.us/%s/',
                        category='social', status_code=200,
                        test_username_pos='emilybusiness')
    session.add(visualize_us)
    voices_com = Site(name='Voices.com',
                      url='https://www.voices.com/people/%s',
                      category='business', match_type='css',
                      match_expr='div.voices-profile-title',
                      test_username_pos='johncavanagh')
    session.add(voices_com)
    wanelo = Site(name='Wanelo', url='https://wanelo.com/%s',
                  category='social', status_code=200,
                  test_username_pos='tsingeli')
    session.add(wanelo)
    wattpad = Site(name='Wattpad', url='https://www.wattpad.com/user/%s',
                   category='social', status_code=200,
                   test_username_pos='Weirdly_Sarcastic')
    session.add(wattpad)
    wishlistr = Site(name='Wishlistr',
                     url='http://www.wishlistr.com/profile/%s/',
                     category='social', match_type='css',
                     match_expr='div#people', test_username_pos='seventy7')
    session.add(wishlistr)
    wordpress = Site(name='WordPress',
                     url='https://profiles.wordpress.org/%s/',
                     category='business', match_type='css',
                     match_expr='ul#user-meta', test_username_pos='sivel')
    session.add(wordpress)
    xbox_gamertag = Site(name='Xbox Gamertag',
                         url='https://www.xboxgamertag.com/search/%s/',
                         category='gaming', status_code=200,
                         test_username_pos='masterrshake')
    session.add(xbox_gamertag)
    youtube = Site(name='YouTube', url='https://www.youtube.com/user/%s',
                   category='image', status_code=200,
                   test_username_pos='vlogdozack')
    session.add(youtube)
    session.commit()
    tempo = tempo + (random.random() * 120)
    time.sleep(tempo)
    print("slept " + str(tempo / 60) + " min")

    print("running web crawl")
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    soup = BeautifulSoup(urlopen(req).read(), "html.parser")

    # regex that keeps everything up to "&sa=" in a Google result link
    reg = re.compile(".*&sa=")

    # parse the result urls (href starts with the 7-character "/url?q=")
    lista = []
    for item in soup.find_all('h3', attrs={'class': 'r'}):
        line = reg.match(item.a['href'][7:]).group()
        temp = Site(line, item.a.text, word)
        lista.append(temp)

    print("web crawl records: " + str(len(lista)))
    if len(lista) == 0:
        print("done")
        break

    json_string = ""
    if os.path.exists("data.json"):
        with open("data.json", "r", encoding="utf-8") as f:
            json_string = f.read()

    if len(json_string) > 0:
        data = json.loads(json_string)
        for item in data:
def post(self):
    '''
    Create new sites to be included in username searches.

    **Example Request**

    .. sourcecode:: json

        {
            "sites": [
                {
                    "name": "about.me",
                    "url": "http://about.me/%s",
                    "category": "social",
                    "status_code": 200,
                    "match_type": "text",
                    "match_expr": "Foo Bar Baz",
                    "test_username_pos": "john",
                    "test_username_neg": "dPGMFrf72SaS"
                },
                ...
            ]
        }

    **Example Response**

    .. sourcecode:: json

        {
            "message": "1 new sites created"
        }

    :<header Content-Type: application/json
    :<header X-Auth: the client's auth token
    :<json list sites: a list of sites to create
    :<json string sites[n].name: name of site
    :<json string sites[n].url: username search url for the site
    :<json string sites[n].category: category of the site
    :<json int sites[n].status_code: the status code to check for
        determining a match (nullable)
    :<json string sites[n].match_type: type of match (see get_match_types()
        for valid match types) (nullable)
    :<json string sites[n].match_expr: expression to use for determining a
        page match (nullable)
    :<json string sites[n].test_username_pos: username that exists on site
        (used for testing)
    :<json string sites[n].test_username_neg: username that does not exist
        on site (used for testing)

    :status 202: sites created
    :status 400: invalid request body
    :status 401: authentication required
    '''
    request_json = request.get_json()
    sites = []

    # Ensure all data is valid before db operations
    for site_json in request_json['sites']:
        validate_request_json(site_json, SITE_ATTRS)

        if ((site_json['match_type'] is None or
             site_json['match_expr'] is None) and
                site_json['status_code'] is None):
            raise BadRequest('At least one of the following is required: '
                             'status code or page match.')

    # Save sites
    for site_json in request_json['sites']:
        test_username_pos = site_json['test_username_pos'].lower().strip()
        site = Site(name=site_json['name'].lower().strip(),
                    url=site_json['url'].lower().strip(),
                    category=site_json['category'].lower().strip(),
                    test_username_pos=test_username_pos)
        site.status_code = site_json['status_code']
        site.match_expr = site_json['match_expr']
        site.match_type = site_json['match_type']

        if 'test_username_neg' in site_json:
            site.test_username_neg = site_json['test_username_neg'] \
                .lower().strip()

        g.db.add(site)

        try:
            g.db.flush()
            sites.append(site)
        except IntegrityError:
            g.db.rollback()
            raise BadRequest('Site URL {} already exists.'.format(site.url))

    g.db.commit()

    # Send redis notifications
    for site in sites:
        notify_mask_client(channel='site',
                           message={'id': site.id,
                                    'name': site.name,
                                    'status': 'created',
                                    'resource': None})

    message = '{} new sites created'.format(len(request_json['sites']))
    response = jsonify(message=message)
    response.status_code = 202

    return response