Example #1
def random_proxy(db_session=None):
    """
    Return a random proxy as URL string.
    """
    if db_session is None:
        db_session = worker.get_session()

    try:
        proxy = db_session.query(Proxy).filter(Proxy.active == True) \
            .order_by(func.random()).limit(1).one() # noqa
    except NoResultFound:
        return None

    proxy_url = '{}://'.format(proxy.protocol)

    if proxy.username:
        proxy_url += '{}:'.format(proxy.username)

        if proxy.password:
            proxy_url += proxy.password

        proxy_url += '@'

    proxy_url += '{}:{}'.format(proxy.host, proxy.port)

    return proxy_url
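
The URL built above follows the scheme://user:password@host:port form that requests accepts in its proxies mapping. A minimal usage sketch, assuming random_proxy() is importable from this module; the target URL is a placeholder:

import requests

# Hedged usage sketch: route a request through a randomly selected proxy.
proxy_url = random_proxy()  # e.g. 'http://user:pass@10.0.0.5:3128', or None

if proxy_url is not None:
    proxies = {'http': proxy_url, 'https': proxy_url}
else:
    proxies = None  # no active proxy in the database; connect directly

response = requests.get('https://example.com', proxies=proxies, timeout=10)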
Example #2
    def _get_label_id(self, name):
        """
        Get or create a database label object, return the ID.
        """
        print('Label name to add:{}'.format(name), flush=True)
        db_session = worker.get_session()
        redis = worker.get_redis()
        label = db_session.query(Label).filter_by(name=name.lower().strip()).first()

        if label:
            print('Found Label :{}'.format(label.id), flush=True)
            return label.id
        else:
            label = Label(name=name.lower().strip())
            db_session.add(label)

            try:
                db_session.commit()
            except IntegrityError:
                db_session.rollback()
                raise
            except AssertionError:
                db_session.rollback()
                raise ValueError(
                    'Label "{}" contains non-alphanumeric character'
                    .format(name)
                )

            redis.publish('label', json.dumps(label.as_dict()))
            print('Created label :{}'.format(label.id), flush=True)
            return label.id
Example #3
def delete_expired_results():
    """
    Delete results older than _days_to_keep_result days.

    Sites with expired results are retested.
    """
    worker.start_job()
    db_session = worker.get_session()
    tested_sites = set()
    expiry = datetime.utcnow() - timedelta(days=_days_to_keep_result)
    expired_results = db_session.query(Result).filter(
        Result.created_at < expiry).all()

    for result in expired_results:
        # Don't delete permanent image files
        if result.image_file.name in _permanent_images:
            result.image_file = None
            result.image_file_id = None
            db_session.flush()

        db_session.delete(result)

        if result.site_id not in tested_sites:
            tracker_id = 'tracker.{}'.format(random_string(10))
            test_site.enqueue(result.site_id, tracker_id)
            tested_sites.add(result.site_id)

    db_session.commit()
    worker.finish_job()
Example #4
def splash_request(target_url, headers=None, request_timeout=10):
    ''' Ask splash to render a page. '''
    # Avoid a mutable default argument: the headers dict is modified below.
    headers = headers if headers is not None else {}
    db_session = worker.get_session()
    splash_url = get_config(db_session, 'splash_url', required=True).value
    splash_user = get_config(db_session, 'splash_user', required=True).value
    splash_pass = get_config(db_session, 'splash_password',
                             required=True).value
    auth = (splash_user, splash_pass)
    splash_headers = {'content-type': 'application/json'}

    if 'user-agent' not in [header.lower() for header in headers.keys()]:
        headers['user-agent'] = USER_AGENT

    payload = {
        'url': target_url,
        'html': 1,
        'jpeg': 1,
        'har': 1,
        'history': 1,
        'timeout': request_timeout,
        'resource_timeout': 5,
        'headers': headers
    }

    splash_response = requests.post(urljoin(splash_url, 'render.json'),
                                    headers=splash_headers,
                                    json=payload,
                                    auth=auth)

    return splash_response
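
A hedged sketch of consuming the response above: Splash's render.json endpoint returns the fields requested in the payload ('html', 'jpeg', 'har', 'history') as a JSON object. The handling below is an assumption for illustration, not code from the project:

# Usage sketch (assumed): render a page and read back the requested fields.
splash_response = splash_request('https://example.com')
splash_response.raise_for_status()
rendered = splash_response.json()

html = rendered.get('html')            # rendered page source
screenshot_b64 = rendered.get('jpeg')  # base64-encoded JPEG screenshot
har = rendered.get('har')              # HTTP archive of the page load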
Example #5
def test_site(site_id, tracker_id, request_timeout=10):
    """
    Perform postive and negative test of site.

    Postive test: check_username() return True for existing username.
    Negative test: check_username() returns False for non-existent username.

    Site is valid if:

        positive result  = 'f' (found)
        negative result = 'n' (not found)
    """
    worker.start_job()
    redis = worker.get_redis()
    db_session = worker.get_session()
    site = db_session.query(Site).get(site_id)

    # Do positive test.
    result_pos_id = check_username(username=site.test_username_pos,
                                   site_id=site_id,
                                   category_id=None,
                                   total=2,
                                   tracker_id=tracker_id + '-1',
                                   test=True)

    result_pos = db_session.query(Result).get(result_pos_id)

    # Do negative test.
    result_neg_id = check_username(username=site.test_username_neg,
                                   site_id=site_id,
                                   category_id=None,
                                   total=2,
                                   tracker_id=tracker_id + '-2',
                                   test=True)

    result_neg = db_session.query(Result).get(result_neg_id)

    # Update site with test results
    site.test_result_pos = result_pos
    site.test_result_neg = result_neg

    # Set site validity based on results
    # of both tests.
    if result_pos.status == 'f' and \
            result_neg.status == 'n':
        site.valid = True
    else:
        site.valid = False

    site.tested_at = datetime.utcnow()
    db_session.commit()

    # Send redis notification
    msg = {
        'tracker_id': tracker_id,
        'status': 'tested',
        'site': site.as_dict(),
        'resource': None,
    }
    redis.publish('site', json.dumps(msg))
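
test_site is typically queued rather than called inline; example #3 above shows the pattern. A condensed sketch of that call, where random_string is the project's helper used there and site_id identifies the site to retest:

# Queue a site test with a fresh tracker ID (pattern from example #3).
tracker_id = 'tracker.{}'.format(random_string(10))
test_site.enqueue(site_id, tracker_id)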
Example #6
def splash_request(target_url,
                   headers=None,
                   request_timeout=None,
                   wait=1,
                   use_proxy=False):
    ''' Ask splash to render a page. '''
    # Avoid a mutable default argument: the headers dict is modified below.
    headers = headers if headers is not None else {}
    db_session = worker.get_session()
    splash_url = get_config(db_session, 'splash_url', required=True).value
    splash_user = get_config(db_session, 'splash_user', required=True).value
    splash_pass = get_config(db_session, 'splash_password',
                             required=True).value
    splash_user_agent = get_config(db_session,
                                   'splash_user_agent',
                                   required=True).value
    proxy = None

    if request_timeout is None:
        config_value = get_config(db_session, 'splash_request_timeout',
                                  required=True).value
        try:
            request_timeout = int(config_value)
        except (TypeError, ValueError):
            raise ScrapeException('Request timeout must be an integer: {}'
                                  .format(config_value))

    auth = (splash_user, splash_pass)
    splash_headers = {'content-type': 'application/json'}

    if 'user-agent' not in [header.lower() for header in headers.keys()]:
        headers['user-agent'] = splash_user_agent

    payload = {
        'url': target_url,
        'html': 1,
        'jpeg': 1,
        'har': 1,
        'history': 1,
        'wait': wait,
        'render_all': 1,
        'width': 1024,
        'height': 768,
        'timeout': request_timeout,
        'resource_timeout': 5,
        'headers': headers
    }

    # Use proxy if enabled
    if use_proxy:
        proxy = random_proxy(db_session)

    if proxy:
        payload['proxy'] = proxy

    splash_response = requests.post(urljoin(splash_url, 'render.json'),
                                    headers=splash_headers,
                                    json=payload,
                                    auth=auth)

    return splash_response
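
With use_proxy=True the payload gains a 'proxy' entry taken from random_proxy() (example #1), so callers only toggle the flag. A one-line usage sketch with a placeholder URL:

# Sketch: render through a randomly selected active proxy, if one is configured.
splash_response = splash_request('https://example.com', use_proxy=True)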
Example #7
File: index.py  Project: zanachka/quickpin
def delete_profile_posts(profile_id):
    ''' Delete profile posts. '''

    worker.start_job()
    session = worker.get_session()
    solr = worker.get_solr()
    query = solr.Q(solr.Q(type_s='Post') & solr.Q(profile_id_i=profile_id))
    solr.delete_by_query(query=query)
    solr.commit()
    worker.finish_job()
Example #8
def delete_expired_archives():
    """
    Delete archives older than _days_to_keep_archive.
    """
    worker.start_job()
    db_session = worker.get_session()
    expiry = datetime.utcnow() - timedelta(days=_days_to_keep_archive)
    db_session.query(Archive).filter(Archive.created_at < expiry).delete()
    db_session.commit()
    worker.finish_job()
Example #9
def index_profile(profile_id):
    ''' Index a profile. '''

    worker.start_job()
    session = worker.get_session()
    solr = worker.get_solr()

    profile = session.query(Profile).filter(Profile.id == profile_id).one()
    solr.add(app.index.make_profile_doc(profile))
    solr.commit()
    worker.finish_job()
Example #10
File: index.py  Project: zanachka/quickpin
def index_profile(profile_id):
    ''' Index a profile. '''

    worker.start_job()
    session = worker.get_session()
    solr = worker.get_solr()

    profile = session.query(Profile).filter(Profile.id == profile_id).one()
    solr.add(app.index.make_profile_doc(profile))
    solr.commit()
    worker.finish_job()
Example #11
File: scrape.py  Project: zanachka/quickpin
def scrape_avatar(id_, site, url):
    """
    Get an twitter avatar from ``url`` and save it to the Profile identified by
    ``id_``.
    """

    worker.start_job()
    redis = worker.get_redis()
    db_session = worker.get_session()
    avatar = None
    profile = db_session.query(Profile).filter(Profile.id == id_).first()

    if profile is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    if site == 'twitter':
        # Twitter points you to a scaled image by default, but we can
        # get the original resolution by removing "_normal" from the URL.
        #
        # See: https://dev.twitter.com/overview/general/user-profile-images-and-banners
        url = url.replace('_normal', '')

    # Update Avatar if it's already stored in the db
    for profile_avatar in profile.avatars:
        if profile_avatar.upstream_url == url:
            profile_avatar.end_date = datetime.today()
            avatar = profile_avatar
            break

    # Otherwise, scrape the new Avatar and append to the profile
    if avatar is None:

        response = requests.get(url)
        response.raise_for_status()

        if 'content-type' in response.headers:
            mime = response.headers['content-type']
        else:
            mime = 'application/octet-stream'

        image = response.content
        avatar = Avatar(url, mime, image)
        profile.avatars.append(avatar)
        profile.current_avatar = avatar

    db_session.commit()
    worker.finish_job()

    redis.publish('avatar', json.dumps({
        'id': id_,
        'thumb_url': '/api/file/' + str(avatar.thumb_file.id),
        'url': '/api/file/' + str(avatar.file.id),
    }))
Example #12
File: scrape.py  Project: zanachka/quickpin
def scrape_instagram_account_by_id(upstream_id, stub=False):
    """ Scrape instagram bio data for upstream ID and update a profile. """

    db_session = worker.get_session()
    proxies = _get_proxies(db_session)

    # Instagram API request.
    api_url = 'https://api.instagram.com/v1/users/{}'.format(upstream_id)

    response = requests.get(
        api_url,
        proxies=proxies,
        verify=False
    )

    response.raise_for_status()
    data = response.json()['data']

    # Create (or update) the profile.
    profile = Profile('instagram', upstream_id, data['username'])
    db_session.add(profile)

    try:
        db_session.commit()
    except IntegrityError:
        # Already exists: use the existing profile.
        db_session.rollback()
        profile = db_session.query(Profile) \
                            .filter(Profile.site == 'instagram') \
                            .filter(Profile.upstream_id == upstream_id) \
                            .one()

    # Update profile
    profile.last_update = datetime.now()
    profile.description = data['bio']
    profile.follower_count = int(data['counts']['followed_by'])
    profile.friend_count = int(data['counts']['follows'])
    profile.homepage = data['website']
    profile.name = data['full_name']
    profile.post_count = int(data['counts']['media'])
    profile.is_stub = stub
    db_session.commit()

    # Schedule followup jobs.
    app.queue.schedule_index_profile(profile) # index all profiles, inc stubs
    if not stub:
        app.queue.schedule_avatar(profile, data['profile_picture'])
        app.queue.schedule_posts(profile, recent=True)
        app.queue.schedule_relations(profile)

    return profile.as_dict()
Example #13
def create_archive(username, category_id, tracker_id, user_id):
    """
    Archive summary of results in the database and store
    a zip archive in the data directory.
    """

    redis = worker.get_redis()
    worker.start_job()
    db_session = worker.get_session()
    found_count = 0
    not_found_count = 0
    error_count = 0

    results = (db_session.query(Result).options(subqueryload(
        Result.image_file)).filter(Result.tracker_id == tracker_id).all())
    site_count = len(results)

    # Generate zip file
    filename = re.sub(r'[\W_]+', '', username)  # Strip non-alphanumeric chars
    zip_file_id = create_zip(filename, results, user_id)

    for result in results:
        if result.status == 'e':
            error_count += 1
        elif result.status == 'f':
            found_count += 1
        elif result.status == 'n':
            not_found_count += 1

    archive = Archive(tracker_id=tracker_id,
                      username=username,
                      category_id=category_id,
                      site_count=site_count,
                      found_count=found_count,
                      not_found_count=not_found_count,
                      error_count=error_count,
                      zip_file_id=zip_file_id,
                      user_id=user_id)

    # Write to db
    db_session.add(archive)
    db_session.commit()

    # Publish
    message = {
        'id': archive.id,
        'name': archive.username,
        'status': 'created',
        'archive': archive.as_dict(),
    }
    redis.publish('archive', json.dumps(message))
    worker.finish_job()
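
This job is normally enqueued from check_username once a search finishes; example #28 below shows the full call. A condensed sketch of that enqueue, where _redis_worker is the project's worker configuration dict:

# Enqueue the archive job for a completed username search
# (pattern from check_username in example #28).
worker.archive.create_archive.enqueue(
    username=username,
    category_id=category_id,
    tracker_id=tracker_id,
    jobdesc='Archiving results for username "{}"'.format(username),
    timeout=_redis_worker['archive_timeout'],
    user_id=user_id)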
Example #14
def scrape_avatar(id_, site, url):
    '''
    Get a twitter avatar from ``url`` and save it to the Profile identified by
    ``id_``.
    '''

    worker.start_job()
    redis = worker.get_redis()
    db_session = worker.get_session()
    avatar = None
    profile = db_session.query(Profile).filter(Profile.id==id_).first()

    if profile is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    if site == 'twitter':
        # Twitter points you to a scaled image by default, but we can
        # get the original resolution by removing "_normal" from the URL.
        #
        # See: https://dev.twitter.com/overview/general/user-profile-images-and-banners
        url = url.replace('_normal', '')

    # Update Avatar if it's already stored in the db
    for profile_avatar in profile.avatars:
        if profile_avatar.upstream_url == url:
            profile_avatar.end_date = datetime.today()
            avatar = profile_avatar

    # Otherwise, scrape the new Avatar and append to the profile
    if avatar is None:

        response = requests.get(url)
        response.raise_for_status()

        if 'content-type' in response.headers:
            mime = response.headers['content-type']
        else:
            mime = 'application/octet-stream'

        image = response.content
        avatar = Avatar(url, mime, image)
        profile.avatars.append(avatar)

    db_session.commit()
    worker.finish_job()

    redis.publish('avatar', json.dumps({
        'id': id_,
        'thumb_url': '/api/file/' + str(avatar.thumb_file.id),
        'url': '/api/file/' + str(avatar.file.id),
    }))
Example #15
def check_username(username,
                   site_id,
                   category_id,
                   total,
                   tracker_id,
                   request_timeout=10,
                   test=False):
    """
    Check if `username` exists on the specified site.
    """

    worker.start_job()
    redis = worker.get_redis()
    db_session = worker.get_session()

    # Make a splash request.
    site = db_session.query(Site).get(site_id)

    # Check site.
    splash_result = _splash_username_request(username, site, request_timeout)
    image_file = _save_image(db_session, splash_result)

    # Save result to DB.
    result = Result(tracker_id=tracker_id,
                    site_name=splash_result['site']['name'],
                    site_url=splash_result['url'],
                    status=splash_result['status'],
                    image_file_id=image_file.id,
                    error=splash_result['error'])
    db_session.add(result)
    db_session.commit()

    if not test:
        # Notify clients of the result.
        current = redis.incr(tracker_id)
        result_dict = result.as_dict()
        result_dict['current'] = current
        # result_dict['image_file_url'] = image_file.url()
        # result_dict['image_name'] = image_file.name
        result_dict['total'] = total
        redis.publish('result', json.dumps(result_dict))

        # If this username search is complete, then queue an archive job.
        if current == total:
            app.queue.schedule_archive(username, category_id, tracker_id)

    worker.finish_job()

    return result.id
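
Each result published to the 'result' channel carries 'current' and 'total', so a client can track progress of a username search. A minimal subscriber sketch using redis-py; this is an assumption about how a consumer might look, not project code. The connection details are placeholders, and the 'site_name'/'status' fields are assumed to appear in Result.as_dict() based on the columns shown above:

import json

import redis

# Sketch: follow username-check progress on the 'result' channel.
client = redis.StrictRedis(host='localhost', port=6379, db=0)
pubsub = client.pubsub(ignore_subscribe_messages=True)
pubsub.subscribe('result')

for message in pubsub.listen():
    result = json.loads(message['data'])
    print('{}/{} checked: {} -> {}'.format(
        result['current'], result['total'],
        result['site_name'], result['status']))
    if result['current'] == result['total']:
        break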
Example #16
def scrape_twitter_account(username):
    '''
    Scrape twitter bio data and create (or update) a profile.

    TODO The API call used here supports up to 100 usernames at a time. We
    could easily modify this function to populate many profiles at once.
    '''

    # Request from Twitter API.
    db_session = worker.get_session()

    api_url = 'https://api.twitter.com/1.1/users/lookup.json'
    params = {'screen_name': username}
    response = requests.get(
        api_url,
        params=params,
        proxies=_get_proxies(db_session),
        verify=False
    )
    response.raise_for_status()

    # Get Twitter ID and upsert the profile.
    data = response.json()[0] # TODO Only supports getting 1 profile right now...
    user_id = data['id_str']
    profile = Profile('twitter', user_id, data['screen_name'])
    db_session.add(profile)

    try:
        db_session.commit()
    except IntegrityError:
        # Already exists: use the existing profile.
        db_session.rollback()
        profile = db_session.query(Profile) \
                            .filter(Profile.site=='twitter') \
                            .filter(Profile.upstream_id==user_id) \
                            .one()

    _twitter_populate_profile(data, profile)
    profile.is_stub = False
    db_session.commit()

    # Schedule followup jobs.
    app.queue.schedule_avatar(profile, data['profile_image_url_https'])
    app.queue.schedule_index_profile(profile)
    app.queue.schedule_posts(profile, recent=True)
    app.queue.schedule_relations(profile)

    return profile.as_dict()
Example #17
File: index.py  Project: zanachka/quickpin
def index_posts(post_ids):
    ''' Index a collection of posts. '''

    worker.start_job()
    session = worker.get_session()
    solr = worker.get_solr()

    post_query = session.query(Post, Profile) \
                        .join(Post.author) \
                        .filter(Post.id.in_(post_ids))

    for post, author in post_query:
        solr.add(app.index.make_post_doc(post, author))

    solr.commit()
    worker.finish_job()
Example #18
def index_posts(post_ids):
    ''' Index a collection of posts. '''

    worker.start_job()
    session = worker.get_session()
    solr = worker.get_solr()

    post_query = session.query(Post, Profile) \
                        .join(Post.author) \
                        .filter(Post.id.in_(post_ids))

    for post, author in post_query:
        solr.add(app.index.make_post_doc(post, author))

    solr.commit()
    worker.finish_job()
Example #19
def create_zip(filename, results, user_id):
    '''
    Generate zip archive of results and return the file id.

    Adds screenshots and HTML for found results.
    Adds csv result summary.
    '''

    db_session = worker.get_session()
    files = []
    str_files = []

    # Get images and HTML
    for result in results:
        if result.status == 'f':
            # Add the image file
            files.append((result.image_file.name, result.image_file.relpath()))
            # Add the HTML as a string file
            html_filename = '{}.html'.format(result.site_name.replace(' ', ''))
            html_file = (html_filename, result.html)
            str_files.append(html_file)

    # Generate in-memory results csv
    csv_string = results_csv_string(results)
    str_file = ('{}.csv'.format(filename), csv_string)
    str_files.append(str_file)

    zip_file = File(name='{}.zip'.format(filename),
                    mime='application/zip',
                    zip_archive=True,
                    zip_files=files,
                    zip_str_files=str_files,
                    user_id=user_id)

    db_session.add(zip_file)

    try:
        db_session.commit()
    except Exception as e:
        raise ArchiveException(e)

    return zip_file.id
Example #20
def scrape_instagram_account_by_id(upstream_id):
    ''' Scrape instagram bio data for upstream ID and update a profile. '''

    db_session = worker.get_session()
    proxies = _get_proxies(db_session)

    profile = db_session.query(Profile) \
                        .filter(Profile.site=='instagram') \
                        .filter(Profile.upstream_id==upstream_id) \
                        .one()

    # Instagram API request.
    api_url = 'https://api.instagram.com/v1/users/{}'.format(upstream_id)

    response = requests.get(
        api_url,
        proxies=proxies,
        verify=False
    )

    response.raise_for_status()
    data = response.json()['data']

    # Update profile
    profile.description = data['bio']
    profile.follower_count = int(data['counts']['followed_by'])
    profile.friend_count = int(data['counts']['follows'])
    profile.homepage = data['website']
    profile.name = data['full_name']
    profile.post_count = int(data['counts']['media'])
    profile.is_stub = False
    db_session.commit()

    # Schedule followup jobs.
    app.queue.schedule_avatar(profile, data['profile_picture'])
    app.queue.schedule_index_profile(profile)
    app.queue.schedule_posts(profile, recent=True)
    app.queue.schedule_relations(profile)

    return profile.as_dict()
Example #21
def scrape_twitter_account_by_id(upstream_id):
    '''
    Scrape twitter bio data for upstream ID and update a profile.
    Accepts twitter ID rather than username.
    '''

    db_session = worker.get_session()

    profile = db_session.query(Profile) \
                        .filter(Profile.site=='twitter') \
                        .filter(Profile.upstream_id==upstream_id) \
                        .one()

    # Request from Twitter API.
    api_url = 'https://api.twitter.com/1.1/users/lookup.json'
    params = {'user_id': upstream_id}
    response = requests.post(
        api_url,
        params=params,
        proxies=_get_proxies(db_session),
        verify=False
    )
    response.raise_for_status()

    # Update the profile.
    data = response.json()[0]
    _twitter_populate_profile(data, profile)
    profile.is_stub = False
    db_session.commit()

    # Schedule followup jobs.
    app.queue.schedule_avatar(profile, data['profile_image_url_https'])
    app.queue.schedule_index_profile(profile)
    app.queue.schedule_posts(profile, recent=True)
    app.queue.schedule_relations(profile)

    return profile.as_dict()
Example #22
def create_zip(filename, results):
    '''
    Generate zip archive of results and return the file id.

    Adds all images for results that have screenshots.
    Adds csv result summary created on the fly (as IOString).
    '''

    db_session = worker.get_session()
    files = []
    str_files = []

    # Create list of images
    for result in results:
        # Add the name to results for the csv output
        files.append((result.image_file.name, result.image_file.relpath()))

    # Generate in-memory results csv
    csv_string = results_csv_string(results)
    str_file = ('{}.csv'.format(filename), csv_string)
    str_files.append(str_file)

    zip_file = File(name='{}.zip'.format(filename),
                    mime='application/zip',
                    zip_archive=True,
                    zip_files=files,
                    zip_str_files=str_files)

    db_session.add(zip_file)

    try:
        db_session.commit()
    except Exception as e:
        raise ArchiveException(e)

    return zip_file.id
Example #23
def scrape_instagram_relations(id_):
    '''
    Fetch friends and followers for the Instagram user identified by `id_`.
    The number of friends and followers to fetch is configured in Admin.
    '''
    redis = worker.get_redis()
    db = worker.get_session()
    profile = db.query(Profile).filter(Profile.id==id_).first()
    proxies = _get_proxies(db)
    friends_results = 0
    followers_results = 0
    #max_results = _get_max_relations(db)['instagram']
    max_results = get_config(db, 'max_relations_instagram', required=True).value

    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        raise ScrapeException(
            'Value of max_relations_instagram must be an integer'
        )

    friends_params = {}
    followers_params = {}
    total_results = max_results*2

    if profile is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    # Get friends currently stored in db for this profile.
    friends_query = \
        db.query(Profile.upstream_id) \
            .join(\
                profile_join_self, \
                (profile_join_self.c.friend_id == Profile.id)
            ) \
            .filter(profile_join_self.c.follower_id == id_)
    current_friends_ids = [friend.upstream_id for friend in friends_query]

    # Get followers currently stored in db for this profile.
    followers_query = \
        db.query(Profile.upstream_id) \
            .join(\
                profile_join_self, \
                (profile_join_self.c.follower_id == Profile.id)
            ) \
            .filter(profile_join_self.c.friend_id == id_)
    current_followers_ids = [follower.upstream_id for follower in followers_query]

    worker.start_job(total=total_results)

    # Get friend IDs.
    friends_url = 'https://api.instagram.com/v1/users/{}/follows' \
                  .format(profile.upstream_id)

    while friends_results < max_results:
        # Get friends from Instagram API
        friends_response = requests.get(
            friends_url,
            params=friends_params,
            proxies=proxies,
            verify=False
        )
        friends_response.raise_for_status()
        pagination = friends_response.json()['pagination']

        for friend in friends_response.json()['data']:
            # Only store friends that are not already in db.
            if friend['id'] not in current_friends_ids:
                related_profile = Profile(
                    'instagram',
                    friend['id'],
                    friend['username'],
                    is_stub=True
                )

                db.add(related_profile)

                try:
                    db.commit()
                except IntegrityError:
                    db.rollback()
                    related_profile = db \
                            .query(Profile) \
                            .filter(Profile.site=='instagram') \
                            .filter(Profile.upstream_id==friend['id']) \
                            .one()

                related_profile.name = friend['full_name']
                profile.friends.append(related_profile)
                friends_results += 1
                worker.update_job(current=friends_results)

                if friends_results == max_results:
                    break

        # If there are more results, set the cursor parameter, otherwise finish
        if 'next_cursor' in pagination:
            friends_params['cursor'] = pagination['next_cursor']
        else:
            break # No more results

    # Get follower IDs.
    followers_url = 'https://api.instagram.com/v1/users/{}/followed-by' \
                    .format(profile.upstream_id)

    # Get followers from Instagram API
    while followers_results < max_results:
        # Get friends from Instagram API
        followers_response = requests.get(
            followers_url,
            params=followers_params,
            proxies=proxies,
            verify=False
        )
        followers_response.raise_for_status()
        pagination = followers_response.json()['pagination']

        for follower in followers_response.json()['data']:
            # Only store followers that are not already in db.
            if follower['id'] not in current_followers_ids:
                related_profile = Profile(
                    'instagram',
                    follower['id'],
                    follower['username'],
                    is_stub=True
                )

                db.add(related_profile)

                try:
                    db.commit()
                except IntegrityError:
                    db.rollback()
                    related_profile = db \
                            .query(Profile) \
                            .filter(Profile.site=='instagram') \
                            .filter(Profile.upstream_id==follower['id']) \
                            .one()

                related_profile.name = follower['full_name']
                profile.followers.append(related_profile)
                followers_results += 1
                worker.update_job(current=friends_results + followers_results)

                if followers_results == max_results:
                    break

        # If there are more results, set the cursor parameter, otherwise finish
        if 'next_cursor' in pagination:
            followers_params['cursor'] = pagination['next_cursor']
        else:
            break # No more results

    worker.finish_job()
    redis.publish('profile_relations', json.dumps({'id': id_}))
Example #24
File: scrape.py  Project: zanachka/quickpin
def scrape_twitter_account(usernames, stub=False, labels=None):
    """
    Scrape twitter bio data and create (or update) a list of profile
    usernames.

    Keyword arguments:
    stub -- add the profile in stub mode (default False)
    labels -- dictionary of username labels (default None)
    """

    if len(usernames) > 100:
        raise ScrapeException('Twitter API max is 100 user IDs per request.')

    profiles = []
    # Request from Twitter API.
    db_session = worker.get_session()

    api_url = 'https://api.twitter.com/1.1/users/lookup.json'
    payload = {'screen_name': ','.join(usernames)}
    headers = {'ACCEPT-ENCODING': None}
    response = requests.post(
        api_url,
        data=payload,
        proxies=_get_proxies(db_session),
        verify=False,
        headers=TWITTER_HEADERS
    )
    response.raise_for_status()

    # Get Twitter ID and upsert the profile.
    for profile_json in response.json():
        user_id = profile_json['id_str']
        profile = Profile('twitter', user_id, profile_json['screen_name'])
        profile.is_stub = stub
        profile.private = profile_json['protected']
        db_session.add(profile)

        try:
            db_session.commit()
        except IntegrityError:
            # Already exists: use the existing profile.
            db_session.rollback()
            profile = db_session.query(Profile) \
                                .filter(Profile.site=='twitter') \
                                .filter(Profile.upstream_id==user_id) \
                                .one()
            # Profiles already in the system are either not stubs or
            # being updated to full profiles
            profile.is_stub = False

        _twitter_populate_profile(profile_json, profile)

        if labels and profile.username.lower() in labels:
            print('Labels: {}'.format(labels), flush=True)
            _label_profile(db_session, profile, labels[profile.username.lower()])

        profile.last_update = datetime.now()
        db_session.commit()
        profiles.append(profile.as_dict())

        # Schedule followup jobs.
        app.queue.schedule_index_profile(profile)

        if not stub:
            app.queue.schedule_avatar(profile, profile_json['profile_image_url_https'])

            # Only get tweets and relations for unprotected profiles
            if not profile.private:
                app.queue.schedule_posts(profile, recent=True)
                app.queue.schedule_relations(profile)

    return profiles
Example #25
def scrape_twitter_relations(id_):
    '''
    Fetch friends and followers for the Twitter user identified by `id_`.
    The number of friends and followers to fetch is configured in Admin.
    '''
    redis = worker.get_redis()
    db = worker.get_session()
    profile = db.query(Profile).filter(Profile.id==id_).first()
    proxies = _get_proxies(db)
    #max_results = _get_max_relations(db)['twitter']
    max_results = get_config(db, 'max_relations_twitter', required=True).value

    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        raise ScrapeException(
            'Value of max_relations_twitter must be an integer'
        )

    friends_results = 0
    friends_ids = []
    followers_results = 0
    followers_ids = []
    friends_cursor = -1
    followers_cursor = -1

    if profile is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    params = {
        'count': 5000,
        'user_id': profile.upstream_id,
        'stringify_ids': True,
    }

    # Get friends currently stored in db for this profile.
    friends_query = \
        db.query(Profile.upstream_id) \
            .join(\
                profile_join_self, \
                (profile_join_self.c.friend_id == Profile.id)
            ) \
            .filter(profile_join_self.c.follower_id == id_)
    current_friends_ids = [friend.upstream_id for friend in friends_query]


    # Get followers currently stored in db for this profile.
    followers_query = \
        db.query(Profile.upstream_id) \
            .join(\
                profile_join_self, \
                (profile_join_self.c.follower_id == Profile.id)
            ) \
            .filter(profile_join_self.c.friend_id == id_)
    current_followers_ids = [follower.upstream_id for follower in followers_query]

    ## Get friend IDs.
    friends_url = 'https://api.twitter.com/1.1/friends/ids.json'
    params['cursor'] = friends_cursor

    while friends_results < max_results:
        friends_response = requests.get(
            friends_url,
            params=params,
            proxies=proxies,
            verify=False
        )
        friends_response.raise_for_status()

        # Ignore friends already in the db
        for friend_id in friends_response.json()['ids']:
            if friend_id not in current_friends_ids:
                friends_ids.append(friend_id)
                friends_results += 1
                if friends_results == max_results:
                    break

        friends_cursor = friends_response.json()['next_cursor']

        if friends_cursor == 0:
            break # No more results
        else:
            params['cursor'] = friends_cursor

    # Get follower IDs.
    followers_url = 'https://api.twitter.com/1.1/followers/ids.json'
    params['cursor'] = followers_cursor

    while followers_results < max_results:
        followers_response = requests.get(
            followers_url,
            params=params,
            proxies=proxies,
            verify=False
        )
        followers_response.raise_for_status()

        # Ignore followers already in the db
        for follower_id in followers_response.json()['ids']:
            if follower_id not in current_followers_ids:
                followers_ids.append(follower_id)
                followers_results += 1
                if followers_results == max_results:
                    break

        followers_cursor = followers_response.json()['next_cursor']

        if followers_cursor == 0:
            break # No more results
        else:
            params['cursor'] = followers_cursor

    # Get username for each of the friend/follower IDs and create
    # a relationship in QuickPin.
    user_ids = [(uid, 'friend') for uid in friends_ids] + \
               [(uid, 'follower') for uid in followers_ids]
    worker.start_job(total=len(user_ids))
    chunk_size = 100
    for chunk_start in range(0, len(user_ids), chunk_size):
        chunk_end = chunk_start + chunk_size
        chunk = user_ids[chunk_start:chunk_end]
        chunk_lookup = {id_:relation for id_,relation in chunk}

        lookup_url = 'https://api.twitter.com/1.1/users/lookup.json'
        lookup_response = requests.post(
            lookup_url,
            proxies=_get_proxies(db),
            verify=False,
            data={'user_id': ','.join(chunk_lookup.keys())}
        )
        lookup_response.raise_for_status()
        relations = lookup_response.json()

        for related_dict in relations:
            uid = related_dict['id_str']
            username = related_dict['screen_name']
            related_profile = Profile('twitter', uid, username, is_stub=True)
            db.add(related_profile)

            try:
                db.commit()
            except IntegrityError:
                # Already exists: use the existing profile.
                db.rollback()
                related_profile = db \
                    .query(Profile) \
                    .filter(Profile.site=='twitter') \
                    .filter(Profile.upstream_id==uid) \
                    .one()

            _twitter_populate_profile(related_dict, related_profile)
            relation = chunk_lookup[uid]

            if relation == 'friend':
                profile.friends.append(related_profile)
            else: # relation == 'follower':
                profile.followers.append(related_profile)

            db.commit()

        worker.update_job(current=chunk_end)

    db.commit()
    worker.finish_job()
    redis.publish('profile_relations', json.dumps({'id': id_}))
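
The two loops above follow the Twitter API 1.1 cursoring convention: start at cursor=-1, follow next_cursor, and stop when it comes back as 0. A stripped-down sketch of just that pattern; fetch_page is a hypothetical stand-in for the authenticated requests.get() call used above:

def iter_cursored_ids(fetch_page):
    """Yield IDs from a cursored Twitter 1.1 endpoint.

    ``fetch_page`` is a hypothetical callable that takes a cursor and
    returns the decoded JSON body (like friends/ids.json above).
    """
    cursor = -1
    while True:
        page = fetch_page(cursor)
        for user_id in page['ids']:
            yield user_id
        cursor = page['next_cursor']
        if cursor == 0:  # Twitter signals the final page with cursor 0
            break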
Example #26
File: scrape.py  Project: zanachka/quickpin
def scrape_instagram_account(username, stub=False):
    """ Scrape instagram bio data and create (or update) a profile. """
    # Getting a user ID is more difficult than it ought to be: you need to
    # search for the username and iterate through the search results to
    # find an exact match.
    db_session = worker.get_session()
    proxies = _get_proxies(db_session)

    api_url = 'https://api.instagram.com/v1/users/search'
    params = {'q': username}

    response = requests.get(
        api_url,
        params=params,
        proxies=proxies,
        verify=False
    )

    response.raise_for_status()
    search_results = response.json()
    username_lower = username.lower()
    user_id = None

    for user_result in search_results['data']:
        if user_result['username'].lower() == username_lower:
            user_id = user_result['id']
            break

    if user_id is None:
        raise ScrapeException('Can\'t find Instagram user named {}.'
                              .format(username))

    # Now make another request to get this user's profile data.
    api_url = 'https://api.instagram.com/v1/users/{}'.format(user_id)

    response = requests.get(
        api_url,
        proxies=proxies,
        verify=False
    )

    response.raise_for_status()
    data = response.json()['data']
    profile = Profile('instagram', user_id, data['username'])
    db_session.add(profile)

    try:
        db_session.commit()
    except IntegrityError:
        # Already exists: use the existing profile.
        db_session.rollback()
        profile = db_session.query(Profile) \
                            .filter(Profile.site == 'instagram') \
                            .filter(Profile.upstream_id == user_id) \
                            .one()

    profile.last_update = datetime.now()
    profile.description = data['bio']
    profile.follower_count = int(data['counts']['followed_by'])
    profile.friend_count = int(data['counts']['follows'])
    profile.homepage = data['website']
    profile.name = data['full_name']
    profile.post_count = int(data['counts']['media'])
    profile.is_stub = stub
    db_session.commit()

    # Schedule followup jobs.
    app.queue.schedule_index_profile(profile) # index all profiles, inc stubs
    if not stub:
        app.queue.schedule_avatar(profile, data['profile_picture'])
        app.queue.schedule_posts(profile, recent=True)
        app.queue.schedule_relations(profile)

    return profile.as_dict()
Example #27
File: scrape.py  Project: zanachka/quickpin
def scrape_instagram_posts(id_, recent):
    """
    Fetch instagram posts for the user identified by id_.
    Checks posts already stored in db, and will only fetch older or newer
    posts depending on value of the boolean argument 'recent',
    e.g. recent=True will return recent posts not already stored in the db.
    The number of posts to fetch is configured in the Admin.
    """
    redis = worker.get_redis()
    db = worker.get_session()
    author = db.query(Profile).filter(Profile.id == id_).first()
    proxies = _get_proxies(db)
    max_results = get_config(db, 'max_posts_instagram', required=True).value
    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        raise ScrapeException('Value of max_posts_instagram must be an integer')

    min_id = None
    results = 0
    params = {}

    if author is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    url = 'https://api.instagram.com/v1/users/{}/media/recent' \
          .format(author.upstream_id)

    # Get last post currently stored in db for this profile.
    post_query = db.query(Post) \
        .filter(Post.author_id == id_) \
        .order_by(Post.upstream_created.desc())

    if post_query.count() > 0:
        # Only fetch posts newer than those already stored in db
        if recent:
            min_id = post_query[0].upstream_id
            params['min_id'] = str(min_id)
        # Only fetch posts older than those already stored in db
        else:
            max_id = post_query[post_query.count() - 1].upstream_id
            params['max_id'] = str(max_id)

    worker.start_job(total=max_results)
    while results < max_results:
        response = requests.get(
            url,
            params=params,
            proxies=proxies,
            verify=False
        )

        response.raise_for_status()
        post_ids = list()
        response_json = response.json()['data']
        pagination = response.json()['pagination']

        # Instagram API result includes post with min_id so remove it
        response_json[:] = [d for d in response_json if d.get('id') != min_id]

        for gram in response_json:
            if gram['caption'] is not None:
                text = gram['caption']['text']
            else:
                text = None

            post = Post(
                author,
                gram['id'],
                datetime.fromtimestamp(int(gram['created_time'])),
                text
            )

            if gram['location'] is not None:
                if 'latitude' in gram['location']:
                    post.latitude = gram['location']['latitude']
                    post.longitude = gram['location']['longitude']

                if 'name' in gram['location']:
                    post.location = gram['location']['name']

                    if 'street_address' in gram['location']:
                        post.location += ' ' + gram['location']['street_address']

            if 'images' in gram:
                image_url = gram['images']['standard_resolution']['url']
                name = os.path.basename(urlparse(image_url).path)
                img_response = requests.get(image_url, verify=False)
                mime = img_response.headers['Content-type']
                image = img_response.content
                post.attachments.append(File(name, mime, image))

            db.add(post)
            db.flush()
            post_ids.append(post.id)
            worker.update_job(current=results)
            results += 1
            if results == max_results:
                break

        # If there are more results, set the max_id param, otherwise finish
        if 'next_max_id' in pagination:
            params['max_id'] = pagination['next_max_id']
        else:
            break

    db.commit()
    worker.finish_job()
    redis.publish('profile_posts', json.dumps({'id': id_}))
    app.queue.schedule_index_posts(post_ids)
Example #28
def check_username(username,
                   site_id,
                   category_id,
                   total,
                   tracker_id,
                   user_id,
                   test=False):
    """
    Check if `username` exists on the specified site.
    """

    worker.start_job()
    redis = worker.get_redis()
    db_session = worker.get_session()

    # Get site
    site = db_session.query(Site).get(site_id)

    # Check site for username
    splash_result = _splash_username_request(username, site)
    # Save image file
    image_file = _save_image(db_session=db_session,
                             scrape_result=splash_result,
                             user_id=user_id,
                             censor=site.censor_images)

    # Save result to DB.
    result = Result(tracker_id=tracker_id,
                    site_id=splash_result['site']['id'],
                    site_name=splash_result['site']['name'],
                    site_url=splash_result['url'],
                    status=splash_result['status'],
                    image_file_id=image_file.id,
                    username=username,
                    error=splash_result['error'],
                    user_id=user_id)

    if result.status == 'f':
        result.html = splash_result['html']

    db_session.add(result)
    db_session.commit()

    if not test:
        # Notify clients of the result.
        current = redis.incr(tracker_id)
        result_dict = result.as_dict()
        result_dict['current'] = current
        # result_dict['image_file_url'] = image_file.url()
        # result_dict['image_name'] = image_file.name
        result_dict['total'] = total
        redis.publish('result', json.dumps(result_dict))

        # If this username search is complete, then queue an archive job.
        if current == total:
            description = 'Archiving results ' \
                          'for username "{}"'.format(username)
            worker.archive.create_archive.enqueue(
                username=username,
                category_id=category_id,
                tracker_id=tracker_id,
                jobdesc=description,
                timeout=_redis_worker['archive_timeout'],
                user_id=user_id)

    worker.finish_job()
    return result.id
Example #29
File: scrape.py  Project: zanachka/quickpin
def scrape_twitter_account_by_id(upstream_ids, stub=False, labels={}):
    """
    Scrape twitter bio data for upstream IDs and/or updates a profile.
    Accepts twitter ID rather than username.
    """
    if len(upstream_ids) > 100:
        raise ScrapeException('Twitter API max is 100 user IDs per request.')

    db_session = worker.get_session()
    profiles = []

    # Request from Twitter API.
    api_url = 'https://api.twitter.com/1.1/users/lookup.json'
    payload = {'user_id': ','.join(upstream_ids)}
    headers = {'ACCEPT-ENCODING': None}
    response = requests.post(
        api_url,
        data=payload,
        proxies=_get_proxies(db_session),
        verify=False,
        headers=TWITTER_HEADERS
    )
    response.raise_for_status()

    # Update the profile.
    for profile_json in response.json():
        profile = Profile(
            'twitter',
            profile_json['id_str'],
            profile_json['screen_name']
        )
        profile.is_stub = stub
        profile.private = profile_json['protected']
        db_session.add(profile)

        try:
            db_session.commit()
        except IntegrityError:
            # Already exists: use the existing profile.
            db_session.rollback()
            profile = db_session.query(Profile) \
                                .filter(Profile.site=='twitter') \
                                .filter(
                                    Profile.upstream_id==profile_json['id_str']
                                )\
                                .one()
            # Profiles already in the system are either not stubs or
            # being updated to full profiles
            profile.is_stub = False


        _twitter_populate_profile(profile_json, profile)

        if profile.upstream_id in labels:
            _label_profile(db_session, profile, labels[profile.upstream_id])

        profile.last_update = datetime.now()
        db_session.commit()
        profiles.append(profile.as_dict())

        # Schedule followup jobs.
        app.queue.schedule_index_profile(profile)
        if not stub:
            app.queue.schedule_avatar(
                profile, profile_json['profile_image_url_https']
            )
            # Only get tweets and relations for unprotected profiles
            if not profile.private:
                app.queue.schedule_posts(profile, recent=True)
                app.queue.schedule_relations(profile)

    return profiles
Example #30
def scrape_instagram_posts(id_, recent):
    '''
    Fetch instagram posts for the user identified by id_.
    Checks posts already stored in db, and will only fetch older or newer
    posts depending on value of the boolean argument 'recent',
    e.g. recent=True will return recent posts not already stored in the db.
    The number of posts to fetch is configured in the Admin.
    '''
    redis = worker.get_redis()
    db = worker.get_session()
    author = db.query(Profile).filter(Profile.id==id_).first()
    proxies = _get_proxies(db)
    max_results = get_config(db, 'max_posts_instagram', required=True).value
    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        raise ScrapeException('Value of max_posts_instagram must be an integer')

    min_id = None
    more_results = True
    results = 0
    params = {}

    if author is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    url = 'https://api.instagram.com/v1/users/{}/media/recent' \
          .format(author.upstream_id)

    # Get last post currently stored in db for this profile.
    post_query = db.query(Post) \
                        .filter(Post.author_id == id_) \
                        .order_by(Post.upstream_created.desc())

    if post_query.count() > 0:
        # Only fetch posts newer than those already stored in db
        if recent:
            min_id = post_query[0].upstream_id
            params['min_id'] = str(min_id)
        # Only fetch posts older than those already stored in db
        else:
            max_id = post_query[post_query.count() -1].upstream_id
            params['max_id'] = str(max_id)

    worker.start_job(total=max_results)
    logging.warning('WORKER max results: {}'.format(max_results))
    while results < max_results:
        response = requests.get(
            url,
            params=params,
            proxies=proxies,
            verify=False
        )

        response.raise_for_status()
        post_ids = list()
        response_json = response.json()['data']
        pagination = response.json()['pagination']

        # Instagram API result includes post with min_id so remove it
        response_json[:] = [d for d in response_json if d.get('id') != min_id]

        for gram in response_json:
            if gram['caption'] is not None:
                text = gram['caption']['text']
            else:
                text = None

            post = Post(
                author,
                gram['id'],
                datetime.fromtimestamp(int(gram['created_time'])),
                text
            )

            if gram['location'] is not None:
                if 'latitude' in gram['location']:
                    post.latitude = gram['location']['latitude']
                    post.longitude = gram['location']['longitude']

                if 'name' in gram['location']:
                    post.location = gram['location']['name']

                    if 'street_address' in gram['location']:
                        post.location += ' ' + gram['location']['street_address']

            if 'images' in gram:
                image_url = gram['images']['standard_resolution']['url']
                name = os.path.basename(urlparse(image_url).path)
                img_response = requests.get(image_url, verify=False)
                mime = img_response.headers['Content-type']
                image = img_response.content
                post.attachments.append(File(name, mime, image))

            db.add(post)
            db.flush()
            post_ids.append(post.id)
            worker.update_job(current=results)
            results += 1
            if results == max_results:
                break

        # If there are more results, set the max_id param, otherwise finish
        if 'next_max_id' in pagination:
            params['max_id'] = pagination['next_max_id']
        else:
            break

    db.commit()
    worker.finish_job()
    redis.publish('profile_posts', json.dumps({'id': id_}))
    app.queue.schedule_index_posts(post_ids)
Example #31
def scrape_twitter_posts(id_, recent):
    '''
    Fetch tweets for the user identified by id_.
    Checks tweets already stored in db, and will only fetch older or newer
    tweets depending on value of the boolean argument 'recent',
    e.g. recent=True will return recent tweets not already stored in the db.
    The number of tweets to fetch is configured in the Admin.
    '''
    db = worker.get_session()
    #max_results = _get_max_posts(db)['twitter']
    max_results = get_config(db, 'max_posts_twitter', required=True).value

    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        raise ScrapeException('Value of max_posts_twitter must be an integer')

    worker.start_job(total=max_results)
    redis = worker.get_redis()
    author = db.query(Profile).filter(Profile.id==id_).first()
    proxies = _get_proxies(db)
    results = 0
    max_id = None
    more_results = True
    count = 200

    if author is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    # Get posts currently stored in db for this profile.
    post_query = db.query(Post) \
                        .filter(Post.author_id == id_) \
                        .order_by(Post.upstream_created.desc())

    url = 'https://api.twitter.com/1.1/statuses/user_timeline.json'
    params = {'count': count, 'user_id': author.upstream_id}

    if post_query.count() > 0:
        # Only fetch posts newer than those already stored in db
        if recent:
            since_id = post_query[0].upstream_id
            params['since_id'] = str(since_id)
        # Only fetch posts older than those already stored in db
        else:
            max_id = post_query[post_query.count() -1].upstream_id
            params['max_id'] = str(max_id)

    while more_results:
        response = requests.get(
            url,
            params=params,
            proxies=proxies,
            verify=False
        )
        response.raise_for_status()

        post_ids = list()

        tweets = response.json()
        if len(tweets) == 0:
            more_results = False

        if len(tweets) < count:
            more_results = False

        for tweet in tweets:
            # Twitter API result set includes the tweet with the max_id/since_id
            # so ignore it.
            if tweet['id_str'] != max_id:
                post = Post(
                    author,
                    tweet['id_str'],
                    dateutil.parser.parse(tweet['created_at']),
                    tweet['text']
                )

                if tweet['lang'] is not None:
                    post.language = tweet['lang']

                if tweet['coordinates'] is not None:
                    # GeoJSON point is ordered [longitude, latitude].
                    post.longitude, post.latitude = \
                        tweet['coordinates']['coordinates']

                place = tweet['place']

                if place is not None:
                    # Set longitude/latitude to the center of the bounding polygon.
                    total_lon = 0
                    total_lat = 0
                    num_coords = 0

                    for lon, lat in place['bounding_box']['coordinates'][0]:
                        total_lon += lon
                        total_lat += lat
                        num_coords += 1

                    post.longitude = total_lon / num_coords
                    post.latitude = total_lat / num_coords

                    # Set location to string identifying the place.
                    post.location = '{}, {}'.format(
                        place['full_name'],
                        place['country']
                    )

                db.add(post)
                db.flush()
                post_ids.append(post.id)
                # Set the max_id to the last tweet to get the next set of
                # results
                max_id = tweet['id_str']
                params['max_id'] = max_id
                results += 1
                worker.update_job(current=results)

                if results == max_results:
                    more_results = False
                    break


    db.commit()
    worker.finish_job()
    redis.publish('profile_posts', json.dumps({'id': id_}))
    app.queue.schedule_index_posts(post_ids)
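As the docstring says, the recent flag decides whether the scraper walks forward from the newest stored tweet (since_id) or backward from the oldest (max_id). A small sketch of just that decision, assuming the profile's stored tweet upstream IDs are available newest-first:

# Sketch of the since_id / max_id choice; stored_ids is assumed to be the
# profile's tweet upstream IDs ordered newest-first.
def timeline_params(user_id, stored_ids, recent, count=200):
    params = {'count': count, 'user_id': user_id}

    if stored_ids:
        if recent:
            # Walk forward: only tweets newer than the newest stored one.
            params['since_id'] = str(stored_ids[0])
        else:
            # Walk backward: only tweets at or before the oldest stored one.
            params['max_id'] = str(stored_ids[-1])

    return params

Note that the Twitter API treats max_id as inclusive, which is why the loop above skips the tweet whose id_str equals the current max_id.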
Example #32
File: scrape.py  Project: zanachka/quickpin
def scrape_twitter_posts(id_, recent):
    """
    Fetch tweets for the user identified by id_.
    Checks tweets already stored in db, and will only fetch older or newer
    tweets depending on the value of the boolean argument 'recent',
    e.g. recent=True will return recent tweets not already stored in the db.
    The number of tweets to fetch is configured in the Admin.
    """
    db = worker.get_session()
    max_results = get_config(db, 'max_posts_twitter', required=True).value

    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        raise ScrapeException('Value of max_posts_twitter must be an integer')

    worker.start_job(total=max_results)
    redis = worker.get_redis()
    author = db.query(Profile).filter(Profile.id==id_).first()
    proxies = _get_proxies(db)
    results = 0
    max_id = None
    more_results = True
    count = 200
    # Accumulate new post IDs across all pages so they can be indexed below.
    post_ids = []

    if author is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    # Get posts currently stored in db for this profile.
    post_query = db.query(Post) \
                        .filter(Post.author_id == id_) \
                        .order_by(Post.upstream_created.desc())

    url = 'https://api.twitter.com/1.1/statuses/user_timeline.json'
    params = {'count': count, 'user_id': author.upstream_id}

    if post_query.count() > 0:
        # Only fetch posts newer than those already stored in db
        if recent:
            since_id = post_query[0].upstream_id
            params['since_id'] = str(since_id)
        # Only fetch posts older than those already stored in db
        else:
            max_id = post_query[post_query.count() - 1].upstream_id
            params['max_id'] = str(max_id)

    while more_results:
        response = requests.get(
            url,
            params=params,
            proxies=proxies,
            verify=False,
            headers=TWITTER_HEADERS,
        )
        response.raise_for_status()

        tweets = response.json()
        if len(tweets) == 0:
            more_results = False

        if len(tweets) < count:
            more_results = False

        for tweet in tweets:
            # Twitter API result set includes the tweet with the max_id/since_id
            # so ignore it.
            if tweet['id_str'] != max_id:
                post = Post(
                    author,
                    tweet['id_str'],
                    dateutil.parser.parse(tweet['created_at']),
                    tweet['text']
                )

                if tweet['lang'] is not None:
                    post.language = tweet['lang']

                if tweet['coordinates'] is not None:
                    # Twitter's coordinates field is GeoJSON: longitude first.
                    post.longitude, post.latitude = \
                        tweet['coordinates']['coordinates']

                place = tweet['place']

                if place is not None:
                    # Set longitude/latitude to the center of the bounding polygon.
                    total_lon = 0
                    total_lat = 0
                    num_coords = 0

                    for lon, lat in place['bounding_box']['coordinates'][0]:
                        total_lon += lon
                        total_lat += lat
                        num_coords += 1

                    post.longitude = total_lon / num_coords
                    post.latitude = total_lat / num_coords

                    # Set location to string identifying the place.
                    post.location = '{}, {}'.format(
                        place['full_name'],
                        place['country']
                    )

                db.add(post)
                db.flush()
                post_ids.append(post.id)
                # Set the max_id to the last tweet to get the next set of
                # results
                max_id = tweet['id_str']
                params['max_id'] = max_id
                results += 1
                worker.update_job(current=results)

                if results == max_results:
                    more_results = False
                    break


    db.commit()
    worker.finish_job()
    redis.publish('profile_posts', json.dumps({'id': id_}))
    app.queue.schedule_index_posts(post_ids)
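This variant also sends a module-level TWITTER_HEADERS constant that is not shown in the excerpt. For app-only Twitter API access it would plausibly be a bearer-token Authorization header; the sketch below is written under that assumption, and the helper name and credential parameters are invented for illustration, not taken from the original module.

import base64

import requests


# Hypothetical builder for a bearer-token header dict (app-only OAuth 2
# access); the original TWITTER_HEADERS may be constructed differently.
def make_twitter_headers(consumer_key, consumer_secret):
    credentials = '{}:{}'.format(consumer_key, consumer_secret)
    encoded = base64.b64encode(credentials.encode('utf-8')).decode('utf-8')

    response = requests.post(
        'https://api.twitter.com/oauth2/token',
        headers={'Authorization': 'Basic {}'.format(encoded)},
        data={'grant_type': 'client_credentials'}
    )
    response.raise_for_status()
    token = response.json()['access_token']

    return {'Authorization': 'Bearer {}'.format(token)}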
Example #33
File: scrape.py  Project: zanachka/quickpin
def scrape_instagram_relations(id_):
    """
    Fetch friends and followers for the Instagram user identified by `id_`.
    The number of friends and followers to fetch is configured in Admin.
    """
    redis = worker.get_redis()
    db = worker.get_session()
    profile = db.query(Profile).filter(Profile.id==id_).first()
    proxies = _get_proxies(db)
    friends_results = 0
    followers_results = 0
    max_results = get_config(db, 'max_relations_instagram', required=True).value

    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        raise ScrapeException(
            'Value of max_relations_instagram must be an integer'
        )

    friends_params = {}
    followers_params = {}
    total_results = max_results*2

    if profile is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    # Get friends currently stored in db for this profile.
    friends_query = (
        db.query(Profile.upstream_id)
        .join(
            profile_join_self,
            profile_join_self.c.friend_id == Profile.id
        )
        .filter(profile_join_self.c.follower_id == id_)
    )
    current_friends_ids = [friend.upstream_id for friend in friends_query]

    # Get followers currently stored in db for this profile.
    followers_query = (
        db.query(Profile.upstream_id)
        .join(
            profile_join_self,
            profile_join_self.c.follower_id == Profile.id
        )
        .filter(profile_join_self.c.friend_id == id_)
    )
    current_followers_ids = [follower.upstream_id for follower in followers_query]

    worker.start_job(total=total_results)

    # Get friend IDs.
    friends_url = 'https://api.instagram.com/v1/users/{}/follows' \
                  .format(profile.upstream_id)

    while friends_results < max_results:
        # Get friends from Instagram API
        friends_response = requests.get(
            friends_url,
            params=friends_params,
            proxies=proxies,
            verify=False
        )
        friends_response.raise_for_status()
        pagination = friends_response.json()['pagination']

        for friend in friends_response.json()['data']:
            # Only store friends that are not already in db.
            if friend['id'] not in current_friends_ids:
                related_profile = Profile(
                    'instagram',
                    friend['id'],
                    friend['username'],
                    is_stub=True
                )

                db.add(related_profile)

                try:
                    db.commit()
                except IntegrityError:
                    db.rollback()
                    related_profile = db \
                            .query(Profile) \
                            .filter(Profile.site=='instagram') \
                            .filter(Profile.upstream_id==friend['id']) \
                            .one()

                related_profile.name = friend['full_name']
                profile.friends.append(related_profile)
                friends_results += 1
                worker.update_job(current=friends_results)

                if friends_results == max_results:
                    break

        # If there are more results, set the cursor parameter, otherwise finish
        if 'next_cursor' in pagination:
            friends_params['cursor'] = pagination['next_cursor']
        else:
            break # No more results

    # Get follower IDs.
    followers_url = 'https://api.instagram.com/v1/users/{}/followed-by' \
                    .format(profile.upstream_id)

    # Get followers from Instagram API
    while followers_results < max_results:
        # Get followers from the Instagram API
        followers_response = requests.get(
            followers_url,
            params=followers_params,
            proxies=proxies,
            verify=False
        )
        followers_response.raise_for_status()
        pagination = followers_response.json()['pagination']

        for follower in followers_response.json()['data']:
            # Only store followers that are not already in db.
            if follower['id'] not in current_followers_ids:
                related_profile = Profile(
                    'instagram',
                    follower['id'],
                    follower['username'],
                    is_stub=True
                )

                db.add(related_profile)

                try:
                    db.commit()
                except IntegrityError:
                    db.rollback()
                    related_profile = db \
                            .query(Profile) \
                            .filter(Profile.site=='instagram') \
                            .filter(Profile.upstream_id==follower['id']) \
                            .one()

                related_profile.name = follower['full_name']
                profile.followers.append(related_profile)
                followers_results += 1
                worker.update_job(current=friends_results + followers_results)

                if followers_results == max_results:
                    break

        # If there are more results, set the cursor parameter, otherwise finish
        if 'next_cursor' in pagination:
            followers_params['cursor'] = pagination['next_cursor']
        else:
            break # No more results

    worker.finish_job()
    redis.publish('profile_relations', json.dumps({'id': id_}))
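The add / commit / rollback-on-IntegrityError sequence around related_profile is a get-or-create pattern that recurs throughout these examples. A generic sketch of the same idea, assuming the SQLAlchemy session and the Profile model used above:

from sqlalchemy.exc import IntegrityError


# Get-or-create for Profile rows keyed by (site, upstream_id); a sketch of
# the pattern above, assuming Profile is imported from the same module.
def get_or_create_profile(db, site, upstream_id, username):
    profile = Profile(site, upstream_id, username, is_stub=True)
    db.add(profile)

    try:
        db.commit()
    except IntegrityError:
        # Another worker inserted the same profile first: reuse that row.
        db.rollback()
        profile = db.query(Profile) \
            .filter(Profile.site == site) \
            .filter(Profile.upstream_id == upstream_id) \
            .one()

    return profile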
Example #34
File: scrape.py  Project: zanachka/quickpin
def scrape_twitter_relations(id_):
    """
    Fetch friends and followers for the Twitter user identified by `id_`.
    The number of friends and followers to fetch is configured in Admin.
    """
    redis = worker.get_redis()
    db = worker.get_session()
    profile = db.query(Profile).filter(Profile.id==id_).first()
    proxies = _get_proxies(db)
    max_results = get_config(db, 'max_relations_twitter', required=True).value

    try:
        max_results = int(max_results)
    except (TypeError, ValueError):
        raise ScrapeException(
            'Value of max_relations_twitter must be an integer'
        )

    friends_results = 0
    friends_ids = []
    followers_results = 0
    followers_ids = []
    friends_cursor = -1
    followers_cursor = -1

    if profile is None:
        raise ValueError('No profile exists with id={}'.format(id_))

    params = {
        'count': 5000,
        'user_id': profile.upstream_id,
        'stringify_ids': True,
    }

    # Get friends currently stored in db for this profile.
    friends_query = (
        db.query(Profile.upstream_id)
        .join(
            profile_join_self,
            profile_join_self.c.friend_id == Profile.id
        )
        .filter(profile_join_self.c.follower_id == id_)
    )
    current_friends_ids = [friend.upstream_id for friend in friends_query]


    # Get followers currently stored in db for this profile.
    followers_query = (
        db.query(Profile.upstream_id)
        .join(
            profile_join_self,
            profile_join_self.c.follower_id == Profile.id
        )
        .filter(profile_join_self.c.friend_id == id_)
    )
    current_followers_ids = [follower.upstream_id for follower in followers_query]

    # Get friend IDs.
    friends_url = 'https://api.twitter.com/1.1/friends/ids.json'
    params['cursor'] = friends_cursor

    while friends_results < max_results:
        friends_response = requests.get(
            friends_url,
            params=params,
            proxies=proxies,
            verify=False,
            headers=TWITTER_HEADERS
        )
        friends_response.raise_for_status()

        # Ignore friends already in the db
        for friend_id in friends_response.json()['ids']:
            if friend_id not in current_friends_ids:
                friends_ids.append(friend_id)
                friends_results += 1
                if friends_results == max_results:
                    break

        friends_cursor = friends_response.json()['next_cursor']

        if friends_cursor == 0:
            break # No more results
        else:
            params['cursor'] = friends_cursor

    # Get follower IDs.
    followers_url = 'https://api.twitter.com/1.1/followers/ids.json'
    params['cursor'] = followers_cursor

    while followers_results < max_results:
        followers_response = requests.get(
            followers_url,
            params=params,
            proxies=proxies,
            verify=False,
            headers=TWITTER_HEADERS,
        )
        followers_response.raise_for_status()

        # Ignore followers already in the db
        for follower_id in followers_response.json()['ids']:
            if follower_id not in current_followers_ids:
                followers_ids.append(follower_id)
                followers_results += 1
                if followers_results == max_results:
                    break

        followers_cursor = followers_response.json()['next_cursor']

        if followers_cursor == 0:
            break # No more results
        else:
            params['cursor'] = followers_cursor

    # Get username for each of the friend/follower IDs and create
    # a relationship in QuickPin.
    user_ids = [(uid, 'friend') for uid in friends_ids] + \
               [(uid, 'follower') for uid in followers_ids]
    worker.start_job(total=len(user_ids))
    chunk_size = 100
    for chunk_start in range(0, len(user_ids), chunk_size):
        chunk_end = chunk_start + chunk_size
        chunk = user_ids[chunk_start:chunk_end]
        chunk_lookup = {uid: relation for uid, relation in chunk}

        lookup_url = 'https://api.twitter.com/1.1/users/lookup.json'
        lookup_response = requests.post(
            lookup_url,
            proxies=_get_proxies(db),
            verify=False,
            headers=TWITTER_HEADERS,
            data={'user_id': ','.join(chunk_lookup.keys())}
        )
        lookup_response.raise_for_status()
        relations = lookup_response.json()

        for related_dict in relations:
            uid = related_dict['id_str']
            username = related_dict['screen_name']
            related_profile = Profile('twitter', uid, username, is_stub=True)
            db.add(related_profile)

            try:
                db.commit()
            except IntegrityError:
                # Already exists: use the existing profile.
                db.rollback()
                related_profile = db \
                    .query(Profile) \
                    .filter(Profile.site=='twitter') \
                    .filter(Profile.upstream_id==uid) \
                    .one()

            _twitter_populate_profile(related_dict, related_profile)
            relation = chunk_lookup[uid]

            if relation == 'friend':
                profile.friends.append(related_profile)
            else: # relation == 'follower':
                profile.followers.append(related_profile)

            db.commit()

        worker.update_job(current=chunk_end)

    db.commit()
    worker.finish_job()
    redis.publish('profile_relations', json.dumps({'id': id_}))
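users/lookup.json accepts at most 100 user IDs per request, which is why the relation IDs are processed in chunks of 100. That chunking step in isolation, as a reusable generator:

# Yield fixed-size slices of a list of (user_id, relation) pairs; 100 is
# the per-request limit for the users/lookup.json endpoint.
def chunked(pairs, size=100):
    for start in range(0, len(pairs), size):
        yield pairs[start:start + size]

# Usage sketch:
#   for chunk in chunked(user_ids):
#       data = {'user_id': ','.join(uid for uid, _ in chunk)}
#       ...POST data to users/lookup.json as above...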
Example #35
def scrape_instagram_account(username):
    ''' Scrape instagram bio data and create (or update) a profile. '''

    # Getting a user ID is more difficult than it ought to be: you need to
    # search for the username and iterate through the search results to
    # find an exact match.
    db_session = worker.get_session()
    proxies = _get_proxies(db_session)

    api_url = 'https://api.instagram.com/v1/users/search'
    params = {'q': username}

    response = requests.get(
        api_url,
        params=params,
        proxies=proxies,
        verify=False
    )

    response.raise_for_status()
    search_results = response.json()
    username_lower = username.lower()
    user_id = None

    for user_result in search_results['data']:
        if user_result['username'].lower() == username_lower:
            user_id = user_result['id']
            break

    if user_id is None:
        raise ScrapeException('Can\'t find Instagram user named {}.'
                              .format(username))

    # Now make another request to get this user's profile data.
    api_url = 'https://api.instagram.com/v1/users/{}'.format(user_id)

    response = requests.get(
        api_url,
        proxies=proxies,
        verify=False
    )

    response.raise_for_status()
    data = response.json()['data']
    profile = Profile('instagram', user_id, data['username'])
    db_session.add(profile)

    try:
        db_session.commit()
    except IntegrityError:
        # Already exists: use the existing profile.
        db_session.rollback()
        profile = db_session.query(Profile) \
                            .filter(Profile.site=='instagram') \
                            .filter(Profile.upstream_id==user_id) \
                            .one()

    profile.description = data['bio']
    profile.follower_count = int(data['counts']['followed_by'])
    profile.friend_count = int(data['counts']['follows'])
    profile.homepage = data['website']
    profile.name = data['full_name']
    profile.post_count = int(data['counts']['media'])
    profile.is_stub = False
    db_session.commit()

    # Schedule followup jobs.
    app.queue.schedule_avatar(profile, data['profile_picture'])
    app.queue.schedule_index_profile(profile)
    app.queue.schedule_posts(profile, recent=True)
    app.queue.schedule_relations(profile)

    return profile.as_dict()
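The first half of this function only resolves a username to an Instagram user ID by searching and taking the exact, case-insensitive match. That lookup, factored into a standalone sketch that works on the parsed search response:

# Return the user ID whose username exactly matches (case-insensitive),
# or None; search_results is the parsed JSON from /v1/users/search.
def find_user_id(search_results, username):
    username_lower = username.lower()

    for user_result in search_results['data']:
        if user_result['username'].lower() == username_lower:
            return user_result['id']

    return None

Returning None and letting the caller raise ScrapeException keeps the error message in one place.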