예제 #1
0
def encaminhar(url):
    """Resolve a short URL and redirect the visitor to its long form."""
    site = Site(url_curta=url)
    site.recuperar_url_longa()
    destino = site.url_longa
    # no long URL on record: send the visitor back to the index page
    if destino == "":
        return redirect(url_for('index'))
    return redirect(destino)
예제 #2
0
def index():
    """Render the landing page; on POST, shorten the submitted URL."""
    if request.method != 'POST':
        return render_template("base.html")

    site = Site(url_longa=request.form['url-longa'])
    site.cadastrar()
    link = request.url + site.url_curta
    return f"""Sua url curta é:
        <a href = {link}>{link} </a>"""
예제 #3
0
    def _get_site(self):
        """Fetch the '_site' singleton entity, creating it on first access."""
        site = ndb.Key(Site, '_site').get()
        if site:
            return site

        # first run: persist an empty singleton record
        logging.error('Saving site')
        site = Site(id='_site')
        site.github_token = ''
        site.put()
        return site
예제 #4
0
    def _get_site(self):
        """Return the singleton Site record, lazily creating an empty one."""
        singleton_key = ndb.Key(Site, '_site')
        site = singleton_key.get()

        if not site:
            # nothing stored yet — write an empty placeholder entity
            logging.error('Saving site')
            site = Site(id='_site')
            site.github_token = ''
            site.put()

        return site
예제 #5
0
    def setUp(self):
        """Set up dependencies for web pages' loading.

        Creates a Flask test client, enables TESTING mode, and replaces
        the server's site list with a small, predictable fixture.
        """
        self.client = server.app.test_client()
        server.app.config['TESTING'] = True

        # NOTE(review): the original kept a commented-out backup of
        # server.sites_list here; removed as dead code. If a tearDown
        # elsewhere restores server._old_sites_list, re-add the backup.
        server.sites_list = [
            Site(name="Berkeley Arts Magnet"),
            Site(name="Young Adult Project")
        ]
예제 #6
0
File: handlers.py  Project: PyStok/PyStok-3
    def post(self):
        """Fetch the submitted URL, store its image count, and crawl links.

        Best-effort: any failure (bad URL, network error, datastore error)
        is swallowed so the handler never surfaces an error to the client.
        """
        url = self.request.POST.get('url')

        try:
            result = urllib2.urlopen(url)
            content = result.read()

            site = Site()
            site.url = url
            site.count = self.__count_imgs(content)
            site.put()
            self.__grab_urls(content)
        except Exception:
            # Deliberate best-effort swallow. A bare `except:` (as the
            # original had) would also trap KeyboardInterrupt/SystemExit,
            # which must be allowed to propagate.
            pass
예제 #7
0
File: handlers.py  Project: PyStok/PyStok-3
 def get(self):
     """Render the sites page and allow cross-origin reads."""
     site_keys = Site.query().fetch(keys_only=True)
     site_entities = ndb.get_multi(site_keys)
     content = self.jinja2.render_template('sites.html', data=site_entities)
     self.response.headers.add_header('Access-Control-Allow-Origin', '*')
     self.response.write(content)
예제 #8
0
def process_job():
    """processes url web scraping request
    If process has been done before, get result from database
    Otherwise, process and add data to database"""
    r = redis.StrictRedis()
    while True:
        # blpop blocks (timeout 0 = forever) until a job ID is pushed
        # onto 'job_queue'; [1] drops the queue name from the reply pair
        curr_job = r.blpop('job_queue', 0)[1]
        r.hset('status', curr_job, 'processing')
        print('current job ID:', curr_job)
        # convert byte to string
        url = r.hget('urls', curr_job).decode("utf-8")
        print('Current URL:', url)

        # a Site row already exists for this url: the result is cached,
        # so mark the job complete without re-scraping
        if Site.query.filter_by(url=url).first():
            r.hset('status', curr_job, 'complete')
            print('Job', curr_job, 'Completed')
        else:
            # fetches url page source
            try:
                html = str(get_html(url))
                print('Successfully retrieved HTML')
            # add results to database
                db.session.add(Site(url=url, html=html))
                db.session.commit()
                print('Added to database')
                r.hset('status', curr_job, 'complete')
                print('Job', curr_job, 'Completed')
            # presumably raised by get_html for an invalid URL — verify
            except ValueError:
                r.hset('status', curr_job, 'abort')
                print('Job', curr_job, 'Aborted')
            except TimeoutError:
                r.hset('status', curr_job, 'timeout')
                print('Job', curr_job, 'Timed Out')
    # NOTE: unreachable — the `while True` loop above never exits
    return
예제 #9
0
 def get_site(self, required=False, create_if_missing=False):
   """Returns the Site object given by the URL."""
   if self.response_dict().site:
     return self.response_dict().site
   domain = self.url_arg(0)
   site = None
   if domain:
     key_name = Site.key_name_from_domain(domain)
     site = Site.get_by_key_name(key_name)
     if not site and create_if_missing:
       # build an unsaved placeholder entity for the caller to fill in
       site = Site(key_name=key_name, domain=domain)
   if site:
     self.response_dict(site = site) # for the template
     return site
   if required:
     raise NotFoundException("site not found")
예제 #10
0
 def get(self):
     """Render the sites overview with per-site and overall work counts."""
     ctx = {"title":"PREOMR - sites"}
     rows = []
     for site in Site.all().fetch(100):
         rows.append((site, get_count("Work-%s"%site.name)))
     ctx['sites'] = rows
     subtotal = 0
     for row in rows:
         subtotal += row[1]
     ctx["totalhere"] = subtotal
     ctx["overalltotal"] = get_count("Work")
     self.generate("sites.html",ctx)
예제 #11
0
def parse_site_cell(line: str, coverage_radius: float):
    """Parse one serialized cell line into a Site with its corner points."""
    payload = line[2:].split("|")
    point_part = payload[0]
    # strip the surrounding bracket/terminator characters
    corners_part = payload[1][1:-2]

    xy = point_part.split(":")
    site_x = xy[0][1:]
    site_y = xy[1][:-1]

    corner_points = []
    for token in corners_part.split(", "):
        coords = token[1:-1].split(":")
        corner_points.append((float(coords[0]), float(coords[1])))

    return Site(float(site_x), float(site_y), coverage_radius, corner_points)
def generate_random_solution(nb_sensor: int, sensor_coverage_distance: float, x_plane_size, y_plane_size):
    """Randomly place nb_sensor sites, rejecting any placement that makes
    the partial solution unrealistic."""
    placed = []
    for _ in range(nb_sensor):
        while True:
            candidate = Site(random.uniform(0, x_plane_size), random.uniform(0, y_plane_size), sensor_coverage_distance)
            if not placed:
                break
            # check the candidate against copies of the already-placed sites
            trial = [copy.copy(existing) for existing in placed]
            trial.append(candidate)
            if solution_is_realistic(trial):
                break
        placed.append(candidate)

    return placed
def generate_random_solution_b(nb_sensor: int, sensor_coverage_distance: float, x_plane_size, y_plane_size):
    """Randomly place nb_sensor sites so each new site lies between 75% and
    100% of the coverage distance from the previously placed one."""
    lower_bound = sensor_coverage_distance * 0.75
    placed = []
    for _ in range(nb_sensor):
        while True:
            candidate = Site(random.uniform(0, x_plane_size), random.uniform(0, y_plane_size), sensor_coverage_distance)
            if not placed:
                break
            anchor = placed[-1]
            gap = compute_point_distance(anchor.x, anchor.y, candidate.x, candidate.y)
            if lower_bound <= gap <= sensor_coverage_distance:
                break
        placed.append(candidate)

    return placed
예제 #14
0
def sites_list():
    """Return list of sites based on the location the user searched for."""

    searched_location = request.args.get('search')
    results = []

    for place in search_by_text(searched_location):
        known = Site.query.filter_by(site_id=place['place_id']).first()

        if known:
            results.append((place, known.thumbnail))

        elif 'photos' in place:
            # fetch a thumbnail via the photo reference
            thumb_url = get_thumnail_url(place['photos'][0]['photo_reference'])

            # persist the newly discovered site
            location = place['geometry']['location']
            db.session.add(Site(site_id=place['place_id'],
                                name=place['name'],
                                address=place['formatted_address'],
                                thumbnail=thumb_url,
                                lat=location['lat'],
                                lng=location['lng']))
            results.append((place, thumb_url))

    # commit the new places to the db
    db.session.commit()

    if not results:
        flash("Sorry, no results were found for your search! Please try again")

    return render_template("sites.html",
                           places=results,
                           api_key=GOOGLE_API_KEY)
예제 #15
0
def update_site_pos_with_cell_centroid(site: Site):
    """Move a site toward the centroid of its cell, weighted by WEIGHT."""
    cell_centroid = polygon_centre_area(site.cell_corners)
    moved = find_weighted_updated_site_location((site.x, site.y),
                                                cell_centroid, WEIGHT)
    site.x = moved[0]
    site.y = moved[1]
예제 #16
0
    vector_distance = math.sqrt((target_point[0] - initial_point[0])**2 +
                                (target_point[1] - initial_point[1])**2)
    u = vector[0] / vector_distance, vector[1] / vector_distance
    weighted_point = initial_point[
        0] + u[0] * vector_distance * weight, initial_point[
            1] + u[1] * vector_distance * weight
    return weighted_point


if __name__ == "__main__":
    # Run Voronoi relaxation on a fixed set of starting positions and
    # report how long it took and how many iterations were performed.
    WEIGHT = 1
    coverage_radius = 100
    start = time.time()
    initial_positions = [
        (356.19560084901246, 127.42318033479866),
        (418.7012015568561, 81.20783302622547),
        (282.2647422336893, 139.04607145486347),
        (462.71824390613375, 138.61894677935322),
        (495.8338855452489, 50.272726680936685),
        (374.599719019584, 108.28343578447591),
        (372.24584006522247, 129.03846814142344),
        (406.17483780539385, 140.24310280444857),
        (466.5597007333434, 61.28632635196157),
        (233.796901623617, 120.66404604823155),
    ]
    sites = [Site(x, y, coverage_radius) for x, y in initial_positions]
    iterations = voronoi_relaxation(sites, 500, 500, 20)
    elapsed = time.time() - start
    print(f"{elapsed} seconds to run for {iterations} iterations.")
예제 #17
0
 def get(self):
     """Admin endpoint: create a Site by name unless one already exists."""
     self.enforce_admin()
     name = self.request.get("name")
     if name is None or "" == name:
         self.jsonout(status="error",
                 msg="No name specified",
                )
         return
     name = name.strip()
     existing = Site.gql("WHERE name = :1",name).get()
     if existing is not None:
         # duplicate: report the id of the already-stored site
         self.jsonout(status="dup",
                      msg="%s already existed as site with id %d",
                      format=(name,existing.key().id()),
                      id=existing.key().id(),
                      key=str(existing.key())
                     )
         return
     created = Site(name=name)
     created.put()
     increment("Site")
     self.jsonout(status="ok",
                  msg="site %s added with id %d",
                  format=(created.name,created.key().id()),
                  key=str(created.key()),
                  id=created.key().id()
                 )
예제 #18
0
    def post(self):
        '''
        Create new sites to included in username searches.

        **Example Request**

        .. sourcecode:: json

            {
                "sites": [
                    {
                        "name": "about.me",
                        "url": "http://about.me/%s",
                        "status_code": 200,
                        "match_type": "text",
                        "match_expr": "Foo Bar Baz",
                        "test_username_pos": "john",
                        "test_username_neg": "dPGMFrf72SaS",
                        "headers": {"referer": "http://www.google.com"},
                        "censor_images": false,
                        "wait_time": 5,
                        "use_proxy": false
                    },
                    ...
                ]
            }

        **Example Response**

        .. sourcecode:: json

            {
                "message": "1 site created."
            }

        :<header Content-Type: application/json
        :<header X-Auth: the client's auth token
        :<json list sites: a list of sites to create
        :<json string sites[n].name: name of site
        :<json string sites[n].url: username search url for the site
        :<json int sites[n].status_code: the status code to check for
           determining a match (nullable)
        :<json string sites[n].match_type: type of match (see get_match_types()
           for valid match types) (nullable)
        :<json string sites[n].match_expr: expression to use for determining
           a page match (nullable)
        :<json string sites[n].test_username_pos: username that exists on site
           (used for testing)
        :<json string sites[n].test_username_neg: username that does not exist
           on site (used for testing)
        :<json array sites[n].headers: custom headers
        :<json bool sites[n].censor_images: whether to censor images
            from this profile
        :<json int sites[n].wait_time: time (in seconds) to wait for updates
            after page is loaded
        :<json bool sites[n].use_proxy: whether to proxy requests
            for this profile URL

        :>header Content-Type: application/json
        :>json string message: API response message

        :status 202: created
        :status 400: invalid request body
        :status 401: authentication required
        '''
        request_json = request.get_json()
        sites = []

        # Ensure all data is valid before db operations
        for site_json in request_json['sites']:
            validate_request_json(site_json, _site_attrs)

            # A site is unusable without at least one match criterion.
            if (site_json['match_type'] is None or
                site_json['match_expr'] is None) and \
                    site_json['status_code'] is None:
                raise BadRequest('At least one of the '
                                 'following is required: '
                                 'status code or page match.')

            if '%s' not in site_json['url']:
                raise BadRequest('URL must contain replacement character %s')

        # Save sites
        for site_json in request_json['sites']:
            test_username_pos = site_json['test_username_pos'].lower().strip()
            site = Site(name=site_json['name'].strip(),
                        url=site_json['url'].lower().strip(),
                        test_username_pos=test_username_pos)

            site.status_code = site_json['status_code']
            site.match_expr = site_json['match_expr']
            site.match_type = site_json['match_type']

            if 'test_username_neg' in site_json:
                # BUG FIX: the original line ended with a trailing comma,
                # which assigned a 1-tuple instead of the stripped string.
                site.test_username_neg = site_json['test_username_neg'] \
                    .lower().strip()

            if 'headers' in site_json:
                site.headers = site_json['headers']

            g.db.add(site)

            try:
                # Flush per site so a duplicate URL is reported precisely.
                g.db.flush()
                sites.append(site)
            except IntegrityError:
                g.db.rollback()
                raise BadRequest('Site URL {} already exists.'.format(
                    site.url))

        g.db.commit()

        # Send redis notifications
        for site in sites:
            notify_mask_client(channel='site',
                               message={
                                   'id': site.id,
                                   'name': site.name,
                                   'status': 'created',
                                   'resource': None
                               })

        message = '{} new sites created'.format(len(request_json['sites']))
        response = jsonify(message=message)
        response.status_code = 202

        return response
예제 #19
0
    def _create_fixture_sites(self, config):
        ''' Create site fixtures. '''

        session = app.database.get_session(self._db)

        about_me = Site(name='About.me',
                        url='http://about.me/%s',
                        category='social',
                        status_code=200,
                        test_username_pos='bob')
        session.add(about_me)

        anobii = Site(name='Anobii',
                      url='http://www.anobii.com/%s/books',
                      category='books',
                      match_type='css',
                      match_expr='h1.person_heading',
                      test_username_pos='bob')
        session.add(anobii)

        ask_fm = Site(name='Ask FM',
                      url='http://ask.fm/%s',
                      category='social',
                      status_code=200,
                      test_username_pos='tipsofschool')
        session.add(ask_fm)

        audioboom = Site(name='Audioboom',
                         url='http://audioboom.com/%s',
                         category='music',
                         status_code=200,
                         test_username_pos='bob')
        session.add(audioboom)

        authorstream = Site(name='Authorstream',
                            url='http://www.authorstream.com/%s/',
                            category='social',
                            status_code=200,
                            test_username_pos='tiikmconferences')
        session.add(authorstream)

        badoo = Site(name='Badoo',
                     url='http://badoo.com/%s/',
                     category='dating',
                     status_code=200,
                     test_username_pos='dave')
        session.add(badoo)

        behance = Site(name='Behance',
                       url='https://www.behance.net/%s',
                       category='social',
                       status_code=200,
                       test_username_pos='juste')
        session.add(behance)

        bitbucket = Site(name='Bitbucket',
                         url='https://bitbucket.org/%s',
                         category='coding',
                         status_code=200,
                         test_username_pos='jespern')
        session.add(bitbucket)

        blip_fm = Site(name='Blip FM',
                       url='http://blip.fm/%s',
                       category='music',
                       status_code=200,
                       test_username_pos='mark_till')
        session.add(blip_fm)

        blogmarks = Site(name='Blogmarks',
                         url='http://blogmarks.net/user/%s',
                         category='social',
                         match_type='css',
                         match_expr='div#infos-user',
                         test_username_pos='Krome')
        session.add(blogmarks)

        blogspot = Site(name='Blogspot',
                        url='http://%s.blogspot.co.uk',
                        category='social',
                        status_code=200,
                        test_username_pos='takingroot-jr')
        session.add(blogspot)

        bodybuilding = Site(name='Bodybuilding',
                            url='http://bodyspace.bodybuilding.com/%s/',
                            category='health',
                            match_type='css',
                            match_expr='div.BodyBanner',
                            test_username_pos='Scarfdaddy')
        session.add(bodybuilding)

        break_com = Site(name='Break',
                         url='http://www.break.com/user/%s',
                         category='video',
                         match_type='css',
                         match_expr='section.profile-head',
                         test_username_pos='jenny')
        session.add(break_com)

        cafemom = Site(name='Cafemom',
                       url='http://www.cafemom.com/home/%s',
                       category='social',
                       match_type='css',
                       match_expr='div#member-info',
                       test_username_pos='jane')
        session.add(cafemom)

        car_domain = Site(name='Car Domain',
                          url='http://www.cardomain.com/member/%s/',
                          category='automotive',
                          status_code=200,
                          test_username_pos='dan')
        session.add(car_domain)

        codeplex = Site(name='Codeplex',
                        url='http://www.codeplex.com/site/users/view/%s',
                        category='coding',
                        match_type='css',
                        match_expr='h1.user_name',
                        test_username_pos='dan')
        session.add(codeplex)

        colour_lovers = Site(name='Colour Lovers',
                             url='http://www.colourlovers.com/lover/%s',
                             category='art and design',
                             match_type='css',
                             match_expr='div.column-container',
                             test_username_pos='bob')
        session.add(colour_lovers)

        conferize = Site(name='Conferize',
                         url='https://www.conferize.com/u/%s/',
                         category='business',
                         match_type='css',
                         match_expr='div.hero--user',
                         test_username_pos='dan')
        session.add(conferize)

        copy_taste = Site(name='Copytaste',
                          url='http://copytaste.com/profile/%s',
                          category='social',
                          status_code=200,
                          test_username_pos='metegulec')
        session.add(copy_taste)

        cruisemates = Site(
            name='Cruisemates',
            url='http://www.cruisemates.com/forum/members/%s.html',
            category='travel',
            match_type='css',
            match_expr='div#main_userinfo',
            test_username_pos='trip')
        session.add(cruisemates)

        daily_motion = Site(name='Dailymotion',
                            url='http://www.dailymotion.com/%s',
                            category='Video',
                            status_code=200,
                            test_username_pos='fanreviews')
        session.add(daily_motion)

        delicious = Site(name='Delicious',
                         url='https://del.icio.us/%s',
                         category='Social',
                         status_code=200,
                         test_username_pos='john')
        session.add(delicious)

        deviant_art = Site(name='DeviantArt',
                           url='http://%s.deviantart.com/',
                           category='image',
                           status_code=200,
                           test_username_pos='marydoodles')
        session.add(deviant_art)

        diigo = Site(name='Diigo',
                     url='https://www.diigo.com/profile/%s',
                     category='bookmarking',
                     match_type='css',
                     match_expr='div#avatarSection',
                     test_username_pos='hunter53')
        session.add(diigo)

        disqus = Site(name='Disqus',
                      url='https://disqus.com/by/%s/',
                      category='social',
                      status_code=200,
                      test_username_pos='willwillibe')
        session.add(disqus)

        diy = Site(name='DIY',
                   url='https://diy.org/%s',
                   category='home improvement',
                   status_code=200,
                   test_username_pos='bob')
        session.add(diy)

        dribble = Site(name='Dribble',
                       url='https://www.dribbble.com/%s',
                       category='art and design',
                       status_code=200,
                       test_username_pos='kirp')
        session.add(dribble)

        ebay = Site(name='Ebay',
                    url='http://www.ebay.com/usr/%s',
                    category='shopping',
                    match_type='css',
                    match_expr='div#user_image',
                    test_username_pos='max')
        session.add(ebay)

        etsy = Site(name='Etsy',
                    url='https://www.etsy.com/people/%s',
                    category='shopping',
                    status_code=200,
                    test_username_pos='betsy')
        session.add(etsy)

        families = Site(name='Families',
                        url='http://www.families.com/author/%s',
                        category='lifestyle',
                        match_type='css',
                        match_expr='div#author-description',
                        test_username_pos='JenThorpe')
        session.add(families)

        fanpop = Site(name='Fanpop',
                      url='http://www.fanpop.com/fans/%s',
                      category='entertainment',
                      match_type='css',
                      match_expr='div.user-header',
                      test_username_pos='dan')
        session.add(fanpop)

        ffffound = Site(name='FFFFound',
                        url='http://ffffound.com/home/%s/found/',
                        category='image',
                        status_code=200,
                        test_username_pos='tobbz')
        session.add(ffffound)

        flavours = Site(name='Flavours',
                        url='http://%s.flavors.me',
                        category='social',
                        status_code=200,
                        test_username_pos='john')
        session.add(flavours)

        flickr = Site(name='Flickr',
                      url='https://www.flickr.com/photos/%s/',
                      category='image',
                      status_code=200,
                      test_username_pos='adam')
        session.add(flickr)

        foodspotting = Site(
            name='Foodspotting',
            url='http://www.foodspotting.com/%s',
            category='lifestyle',
            status_code=200,
            test_username_pos='dylan',
            # This site handles names with leading numerics strangely, so we
            # need to force an alphabetic negative case.
            test_username_neg='asdfqwerasdfqwer')
        session.add(foodspotting)

        fotolog = Site(name='Fotolog',
                       url='http://www.fotolog.com/%s/',
                       category='image',
                       status_code=200,
                       test_username_pos='anna')
        session.add(fotolog)

        foursquare = Site(name='Foursquare',
                          url='https://foursquare.com/%s',
                          category='social',
                          status_code=200,
                          test_username_pos='john')
        session.add(foursquare)

        freesound = Site(name='Freesound',
                         url='http://www.freesound.org/people/%s/',
                         category='music',
                         status_code=200,
                         test_username_pos='john')
        session.add(freesound)

        friend_finder_x = Site(name='FriendFinder-X',
                               url='http://www.friendfinder-x.com/profile/%s',
                               category='dating',
                               match_type='css',
                               match_expr='div#tmpl_member_profile_header',
                               test_username_pos='daniel')
        session.add(friend_finder_x)

        funny_or_die = Site(name='Funny or Die',
                            url='http://www.funnyordie.com/%s',
                            category='video',
                            status_code=200,
                            test_username_pos='bob')
        session.add(funny_or_die)

        get_it_on = Site(name='GETitOn',
                         url='http://getiton.com/profile/%s',
                         category='dating',
                         match_type='css',
                         match_expr='div#profile_page_wrapper',
                         test_username_pos='chris')
        session.add(get_it_on)

        github = Site(name='Github',
                      url='https://github.com/%s',
                      category='coding',
                      status_code=200,
                      test_username_pos='google')
        session.add(github)

        godtube = Site(name='GodTube',
                       url='http://www.godtube.com/%s/',
                       category='video',
                       status_code=200,
                       test_username_pos='bball1989')
        session.add(godtube)

        gogobot = Site(name='Gogobot',
                       url='http://www.gogobot.com/user/%s',
                       category='travel',
                       status_code=200,
                       test_username_pos='dan')
        session.add(gogobot)

        goodreads = Site(name='Goodreads',
                         url='http://www.goodreads.com/%s',
                         category='entertainment',
                         status_code=200,
                         test_username_pos='seal')
        session.add(goodreads)

        gravatar = Site(name='Gravatar',
                        url='http://en.gravatar.com/profiles/%s',
                        category='social',
                        status_code=200,
                        test_username_pos='simon')
        session.add(gravatar)

        hubpages = Site(name='Hubpages',
                        url='http://%s.hubpages.com/',
                        category='blog',
                        status_code=200,
                        test_username_pos='bob')
        session.add(hubpages)

        i_am_pregnant = Site(name='i-am-pregnant',
                             url='http://www.i-am-pregnant.com/members/%s/',
                             category='health',
                             status_code=200,
                             test_username_pos='shiv77')
        session.add(i_am_pregnant)

        if_this_then_that = Site(name='IFTTT',
                                 url='https://ifttt.com/p/%s/shared',
                                 category='technology',
                                 status_code=200,
                                 test_username_pos='bsaren')
        session.add(if_this_then_that)

        image_shack = Site(name='ImageShack',
                           url='https://imageshack.com/user/%s',
                           category='image',
                           match_type='css',
                           match_expr='header.user-profile',
                           test_username_pos='Nicholas230')
        session.add(image_shack)

        # Each Site row pairs a profile-URL template (%s = username) with a
        # detection rule: either an expected HTTP status code or a CSS
        # match expression. test_username_pos is a known-existing account
        # used to verify the detection rule.
        imgur = Site(name='imgur',
                     url='http://imgur.com/user/%s',
                     category='image',
                     status_code=200,
                     test_username_pos='ThatPervert')
        session.add(imgur)

        instagram = Site(name='Instagram',
                         url='https://www.instagram.com/%s/',
                         category='social',
                         status_code=200,
                         test_username_pos='kensingtonroyal')
        session.add(instagram)

        instructables = Site(name='instructables',
                             url='http://www.instructables.com/member/%s/',
                             category='learning',
                             status_code=200,
                             test_username_pos='shags_j')
        session.add(instructables)

        interpals = Site(name='InterPals',
                         url='https://www.interpals.net/%s',
                         category='social',
                         match_type='css',
                         match_expr='div.profile',
                         test_username_pos='Seven89')
        session.add(interpals)

        keybase = Site(name='Keybase',
                       url='https://keybase.io/%s',
                       category='crypto',
                       status_code=200,
                       test_username_pos='mehaase')
        session.add(keybase)

        # Display name fixed: was misspelled 'Kongregrate'.
        kongregate = Site(name='Kongregate',
                          url='http://www.kongregate.com/accounts/%s',
                          category='gaming',
                          status_code=200,
                          test_username_pos='Truestrike')
        session.add(kongregate)

        lanyrd = Site(name='Lanyrd',
                      url='http://lanyrd.com/profile/%s/',
                      category='social',
                      status_code=200,
                      test_username_pos='shanselman')
        session.add(lanyrd)

        last_fm = Site(name='Last.fm',
                       url='http://www.last.fm/user/%s',
                       category='music',
                       status_code=200,
                       test_username_pos='FrancaesG')
        session.add(last_fm)

        law_of_attraction = Site(
            name='Law of Attraction',
            url='http://www.lawofattractionsingles.com/%s',
            category='dating',
            match_type='css',
            match_expr='div.prof_top_block',
            test_username_pos='Jenniferlynnmaui')
        session.add(law_of_attraction)

        library_thing = Site(name='LibraryThing',
                             url='https://www.librarything.com/profile/%s',
                             category='learning',
                             match_type='css',
                             match_expr='div.profile',
                             test_username_pos='Medievalgirl')
        session.add(library_thing)

        lifeboat = Site(name='lifeboat',
                        url='https://oc.tc/%s',
                        category='gaming',
                        status_code=200,
                        test_username_pos='Matilaina')
        session.add(lifeboat)

        linked_in = Site(name='LinkedIn',
                         url='https://www.linkedin.com/in/%s',
                         category='social',
                         status_code=200,
                         test_username_pos='markhaase')
        session.add(linked_in)

        marketing_land = Site(name='Marketing Land',
                              url='http://marketingland.com/author/%s',
                              category='business',
                              status_code=200,
                              test_username_pos='barb-palser')
        session.add(marketing_land)

        mate1 = Site(name='Mate1.com',
                     url='http://www.mate1.com/profiles/%s',
                     category='dating',
                     status_code=200,
                     test_username_pos='janedoe')
        session.add(mate1)

        medium = Site(name='Medium',
                      url='https://medium.com/@%s',
                      category='social',
                      status_code=200,
                      test_username_pos='erinshawstreet')
        session.add(medium)

        meetzur = Site(name='Meetzur',
                       url='http://www.meetzur.com/%s',
                       category='social',
                       match_type='css',
                       match_expr='div.profile-left',
                       test_username_pos='sachin99')
        session.add(meetzur)

        mixcloud = Site(name='Mixcloud',
                        url='https://www.mixcloud.com/%s/',
                        category='music',
                        status_code=200,
                        test_username_pos='dublab')
        session.add(mixcloud)

        # This site was out of service at the time I tried testing it, so I
        # could not test this criteria.
        mixcrate = Site(name='mixcrate',
                        url='http://www.mixcrate.com/%s',
                        category='music',
                        status_code=200,
                        test_username_pos='kennyrock')
        session.add(mixcrate)

        mixlr = Site(name='Mixlr',
                     url='http://mixlr.com/%s/',
                     category='music',
                     status_code=200,
                     test_username_pos='therwandan')
        session.add(mixlr)

        mod_db = Site(name='Mod DB',
                      url='http://www.moddb.com/members/%s',
                      category='gaming',
                      status_code=200,
                      test_username_pos='hugebot')
        session.add(mod_db)

        # NOTE(review): category 'gaming' looks copy-pasted from a nearby
        # entry — Muck Rack is a journalism/PR directory; confirm intended
        # category before changing stored data.
        muck_rack = Site(name='Muck Rack',
                         url='https://muckrack.com/%s',
                         category='gaming',
                         status_code=200,
                         test_username_pos='scottkleinberg')
        session.add(muck_rack)

        mybuilder_com = Site(name='MyBuilder.com',
                             url='https://www.mybuilder.com/profile/view/%s',
                             category='business',
                             status_code=200,
                             test_username_pos='kdbuildingservices')
        session.add(mybuilder_com)

        mylot = Site(name='myLot',
                     url='http://www.mylot.com/%s',
                     category='social',
                     status_code=200,
                     test_username_pos='LovingMyBabies')
        session.add(mylot)

        myspace = Site(name='Myspace',
                       url='https://myspace.com/%s',
                       category='social',
                       status_code=200,
                       test_username_pos='kesha')
        session.add(myspace)

        netvibes = Site(name='Netvibes',
                        url='http://www.netvibes.com/%s',
                        category='business',
                        status_code=200,
                        test_username_pos='grade3kis')
        session.add(netvibes)

        pandora = Site(name='Pandora',
                       url='https://www.pandora.com/profile/%s',
                       category='music',
                       match_type='css',
                       match_expr='div#user_info_container',
                       test_username_pos='mehaase')
        session.add(pandora)

        photoblog = Site(name='PhotoBlog',
                         url='https://www.photoblog.com/%s',
                         category='social',
                         status_code=200,
                         test_username_pos='canon6d')
        session.add(photoblog)

        # Display name fixed: was misspelled 'Photobucker' (URL confirms
        # the site is Photobucket).
        photobucket = Site(name='Photobucket',
                           url='http://photobucket.com/user/%s/library/',
                           category='image',
                           status_code=200,
                           test_username_pos='darkgladir')
        session.add(photobucket)

        picture_trail = Site(name='PictureTrail',
                             url='http://www.picturetrail.com/%s',
                             category='image',
                             match_type='css',
                             match_expr='td.IntroTitle-text-wt',
                             test_username_pos='victoria15')
        session.add(picture_trail)

        pink_bike = Site(name='Pinkbike',
                         url='http://www.pinkbike.com/u/%s/',
                         category='entertainment',
                         status_code=200,
                         test_username_pos='mattwragg')
        session.add(pink_bike)

        pinterest = Site(name='Pinterest',
                         url='https://www.pinterest.com/%s/',
                         category='social',
                         status_code=200,
                         test_username_pos='mehaase')
        session.add(pinterest)

        playlists_net = Site(name='Playlists.Net',
                             url='http://playlists.net/members/%s',
                             category='music',
                             status_code=200,
                             test_username_pos='WhatisSoul')
        session.add(playlists_net)

        plurk = Site(name='Plurk',
                     url='http://www.plurk.com/%s',
                     category='social',
                     match_type='css',
                     match_expr='span.nick_name',
                     test_username_pos='xxSaltandPepperxx')
        session.add(plurk)

        rapid7_community = Site(name='Rapid7 Community',
                                url='https://community.rapid7.com/people/%s',
                                category='technology',
                                status_code=200,
                                test_username_pos='dabdine')
        session.add(rapid7_community)

        # This site has banned our Splash IP so I cannot test it.
        rate_your_music = Site(name='Rate Your Music',
                               url='http://rateyourmusic.com/~%s',
                               category='music',
                               status_code=200,
                               test_username_pos='silvioporto')
        session.add(rate_your_music)

        readability = Site(name='Readability',
                           url='https://readability.com/%s/',
                           category='entertainment',
                           status_code=200,
                           test_username_pos='adam')
        session.add(readability)

        reddit = Site(name='Reddit',
                      url='https://www.reddit.com/user/%s',
                      category='social',
                      status_code=200,
                      test_username_pos='mehaase')
        session.add(reddit)

        scratch = Site(name='Scratch',
                       url='https://scratch.mit.edu/users/%s/',
                       category='social',
                       status_code=200,
                       test_username_pos='MeTwo')
        session.add(scratch)

        setlist_fm = Site(name='setlist.fm',
                          url='http://www.setlist.fm/user/%s',
                          category='music',
                          status_code=200,
                          test_username_pos='tw21')
        session.add(setlist_fm)

        shopcade = Site(name='Shopcade',
                        url='https://www.shopcade.com/%s',
                        category='social',
                        status_code=200,
                        test_username_pos='salonidahake')
        session.add(shopcade)

        # This site occasionally throws errors when testing. Maybe it doesn't
        # like having two requests so fast?
        single_muslim = Site(
            name='SingleMuslim',
            url='https://www.singlemuslim.com/searchuser/%s/abc',
            category='dating',
            match_type='css',
            match_expr='div.userProfileView',
            test_username_pos='YoghurtTub')
        session.add(single_muslim)

        slashdot = Site(name='Slashdot',
                        url='https://slashdot.org/~%s',
                        category='technology',
                        match_type='css',
                        match_expr='article#user_bio',
                        test_username_pos='Locke2005')
        session.add(slashdot)

        slideshare = Site(name='SlideShare',
                          url='http://www.slideshare.net/%s',
                          category='technology',
                          status_code=200,
                          test_username_pos='dmc500hats')
        session.add(slideshare)

        smite_guru = Site(name='SmiteGuru',
                          url='http://smite.guru/stats/xb/%s/summary',
                          category='gaming',
                          match_type='css',
                          match_expr='div.header.panel',
                          test_username_pos='WatsonV3')
        session.add(smite_guru)

        smug_mug = Site(name='SmugMug',
                        url='https://%s.smugmug.com/',
                        category='image',
                        status_code=200,
                        test_username_pos='therescueddog')
        session.add(smug_mug)

        smule = Site(name='Smule',
                     url='http://www.smule.com/%s',
                     category='music',
                     status_code=200,
                     test_username_pos='AbsurdJoker')
        session.add(smule)

        snooth = Site(name='Snooth',
                      url='http://www.snooth.com/profiles/%s/',
                      category='music',
                      match_type='css',
                      match_expr='div.profile-header',
                      test_username_pos='dvogler')
        session.add(snooth)

        soldier_x = Site(name='SoldierX',
                         url='https://www.soldierx.com/hdb/%s',
                         category='technology',
                         match_type='css',
                         match_expr='div.field-field-hdb-photo',
                         test_username_pos='achillean')
        session.add(soldier_x)

        sound_cloud = Site(name='SoundCloud',
                           url='https://soundcloud.com/%s',
                           category='music',
                           status_code=200,
                           test_username_pos='youngma')
        session.add(sound_cloud)

        soup = Site(name='Soup',
                    url='http://%s.soup.io/',
                    category='social',
                    match_type='css',
                    match_expr='div#userinfo',
                    test_username_pos='nattaly')
        session.add(soup)

        source_forge = Site(name='SourceForge',
                            url='https://sourceforge.net/u/%s/profile/',
                            category='technology',
                            status_code=200,
                            test_username_pos='ronys')
        session.add(source_forge)

        speaker_deck = Site(name='Speaker Deck',
                            url='https://speakerdeck.com/%s',
                            category='technology',
                            status_code=200,
                            test_username_pos='rocio')
        session.add(speaker_deck)

        sporcle = Site(name='Sporcle',
                       url='http://www.sporcle.com/user/%s',
                       category='entertainment',
                       match_type='css',
                       match_expr='div#UserBox',
                       test_username_pos='lolshortee')
        session.add(sporcle)

        steam = Site(name='Steam',
                     url='http://steamcommunity.com/id/%s',
                     category='gaming',
                     match_type='css',
                     match_expr='div.profile_page',
                     test_username_pos='tryh4rdz')
        session.add(steam)

        stupid_cancer = Site(
            name='Stupidcancer',
            url='http://stupidcancer.org/community/profile/%s',
            category='social',
            status_code=200,
            test_username_pos='CatchMeYes')
        session.add(stupid_cancer)

        # Tribe.net was down when I was testing. I could not verify that these
        # settings work.
        tribe = Site(name='Tribe',
                     url='http://people.tribe.net/%s',
                     category='social',
                     status_code=200,
                     test_username_pos='violetta')
        session.add(tribe)

        trip_advisor = Site(name='TripAdvisor',
                            url='https://www.tripadvisor.com/members/%s',
                            category='social',
                            status_code=200,
                            test_username_pos='scrltd16')
        session.add(trip_advisor)

        tumblr = Site(name='Tumblr',
                      url='http://%s.tumblr.com/',
                      category='social',
                      status_code=200,
                      test_username_pos='seanjacobcullen')
        session.add(tumblr)

        twitter = Site(name='Twitter',
                       url='https://twitter.com/%s',
                       category='social',
                       status_code=200,
                       test_username_pos='mehaase')
        session.add(twitter)

        untappd = Site(name='Untappd',
                       url='https://untappd.com/user/%s',
                       category='entertainment',
                       status_code=200,
                       test_username_pos='samelawrence')
        session.add(untappd)

        vimeo = Site(name='Vimeo',
                     url='https://vimeo.com/%s',
                     category='image',
                     status_code=200,
                     test_username_pos='mikeolbinski')
        session.add(vimeo)

        visualize_us = Site(name='VisualizeUs',
                            url='http://vi.sualize.us/%s/',
                            category='social',
                            status_code=200,
                            test_username_pos='emilybusiness')
        session.add(visualize_us)

        voices_com = Site(name='Voices.com',
                          url='https://www.voices.com/people/%s',
                          category='business',
                          match_type='css',
                          match_expr='div.voices-profile-title',
                          test_username_pos='johncavanagh')
        session.add(voices_com)

        wanelo = Site(name='Wanelo',
                      url='https://wanelo.com/%s',
                      category='social',
                      status_code=200,
                      test_username_pos='tsingeli')
        session.add(wanelo)

        wattpad = Site(name='Wattpad',
                       url='https://www.wattpad.com/user/%s',
                       category='social',
                       status_code=200,
                       test_username_pos='Weirdly_Sarcastic')
        session.add(wattpad)

        wishlistr = Site(name='Wishlistr',
                         url='http://www.wishlistr.com/profile/%s/',
                         category='social',
                         match_type='css',
                         match_expr='div#people',
                         test_username_pos='seventy7')
        session.add(wishlistr)

        wordpress = Site(name='WordPress',
                         url='https://profiles.wordpress.org/%s/',
                         category='business',
                         match_type='css',
                         match_expr='ul#user-meta',
                         test_username_pos='sivel')
        session.add(wordpress)

        xbox_gamertag = Site(name='Xbox Gamertag',
                             url='https://www.xboxgamertag.com/search/%s/',
                             category='gaming',
                             status_code=200,
                             test_username_pos='masterrshake')
        session.add(xbox_gamertag)

        youtube = Site(name='YouTube',
                       url='https://www.youtube.com/user/%s',
                       category='image',
                       status_code=200,
                       test_username_pos='vlogdozack')
        session.add(youtube)

        # Persist all seeded rows in a single transaction.
        session.commit()
예제 #20
0
                tempo = tempo + (random.random() * 120)
            time.sleep(tempo)
            print("dorme " + str(tempo / 60) + " min")

            print("realiza web crawling")
            req = urllib.request.Request(url,
                                         headers={'User-Agent': 'Mozilla/5.0'})
            soup = BeautifulSoup(urlopen(req).read(), "html.parser")

            #regex
            reg = re.compile(".*&sa=")
            #Parsing web urls
            lista = []
            for item in soup.find_all('h3', attrs={'class': 'r'}):
                line = (reg.match(item.a['href'][7:]).group())
                temp = Site(line, item.a.text, word)
                lista.append(temp)
            print("web crawl registros: " + str(len(lista)))

            if len(lista) == 0:
                print("fim")
                break

            json_string = ""
            if os.path.exists("data.json"):
                with open("data.json", "r", encoding="utf-8") as f:
                    json_string = f.read()

            if len(json_string) > 0:
                data = json.loads(json_string)
                for item in data:
예제 #21
0
    def post(self):
        '''
        Create new sites to be included in username searches.

        **Example Request**

        .. sourcecode:: json

            {
                "sites": [
                    {
                        "name": "about.me",
                        "url": "http://about.me/%s",
                        "category": "social",
                        "status_code": 200,
                        "match_type": "text",
                        "match_expr": "Foo Bar Baz",
                        "test_username_pos": "john",
                        "test_username_neg": "dPGMFrf72SaS"
                    },
                    ...
                ]
            }

        **Example Response**

        .. sourcecode:: json

            {
                "message": "1 site created."
            }

        :<header Content-Type: application/json
        :<header X-Auth: the client's auth token
        :<json list sites: a list of sites to create
        :<json string sites[n].name: name of site
        :<json string sites[n].url: username search url for the site
        :<json string sites[n].category: category of the site
        :<json int sites[n].status_code: the status code to check for
            determining a match (nullable)
        :<json string sites[n].match_type: type of match (see get_match_types()
            for valid match types) (nullable)
        :<json string sites[n].match_expr: expression to use for determining
            a page match (nullable)
        :<json string sites[n].test_username_pos: username that exists on site
            (used for testing)
        :<json string sites[n].test_username_neg: username that does not exist
            on site (used for testing)

        :status 202: accepted for creation
        :status 400: invalid request body
        :status 401: authentication required
        '''
        request_json = request.get_json()
        sites = []

        # Validate every site before any db operation so a bad entry late in
        # the list cannot leave earlier rows half-created.
        for site_json in request_json['sites']:
            validate_request_json(site_json, SITE_ATTRS)

            # A site is unusable without at least one detection rule: either
            # an expected status code or a complete (type + expr) page match.
            if (site_json['match_type'] is None or \
                site_json['match_expr'] is None) and \
                site_json['status_code'] is None:
                raise BadRequest('At least one of the following is required: '
                    'status code or page match.')

        # Save sites
        for site_json in request_json['sites']:
            test_username_pos = site_json['test_username_pos'].lower().strip()
            site = Site(name=site_json['name'].lower().strip(),
                        url=site_json['url'].lower().strip(),
                        category=site_json['category'].lower().strip(),
                        test_username_pos=test_username_pos)

            site.status_code = site_json['status_code']
            site.match_expr = site_json['match_expr']
            site.match_type = site_json['match_type']

            if 'test_username_neg' in site_json:
                # Bug fix: the original assignment ended with a stray trailing
                # comma, which stored a 1-tuple instead of the string.
                site.test_username_neg = site_json['test_username_neg'] \
                    .lower().strip()

            g.db.add(site)

            try:
                # Flush (not commit) so a duplicate URL raises per-site while
                # keeping the whole request inside one transaction.
                g.db.flush()
                sites.append(site)
            except IntegrityError:
                g.db.rollback()
                raise BadRequest(
                    'Site URL {} already exists.'.format(site.url)
                )

        g.db.commit()

        # Send redis notifications
        for site in sites:
            notify_mask_client(
                channel='site',
                message={
                    'id': site.id,
                    'name': site.name,
                    'status': 'created',
                    'resource': None
                }
            )

        message = '{} new sites created'.format(len(request_json['sites']))
        response = jsonify(message=message)
        response.status_code = 202

        return response