Example #1
# Imports assumed by this example; InstagramScraper, DEFAULT_DOWNLOAD_LIMIT,
# MAX_RETRIES, overlay_transparent and UploadingImage are expected to be
# defined elsewhere in the surrounding module.
import argparse
import glob
import os
import random
import re
import sys
import textwrap
from multiprocessing import Pool
from random import randrange

import cv2
import numpy as np


def main():
    parser = argparse.ArgumentParser(
        description=
        "instagram-scraper scrapes and downloads an instagram user's photos and videos.",
        epilog=textwrap.dedent("""
        You can hide your credentials from the shell history by reading your
        username from a local file:

        $ instagram-scraper @insta_args.txt user_to_scrape

        with insta_args.txt looking like this:
        -u=my_username
        -p=my_password

        You can add any arguments you want to that file, just remember to put
        one argument per line.

        Customize the filename by adding the option --template or -T.
        Default is: {urlname}
        The available placeholders are:
        {username}: Instagram user(s) to scrape.
        {shortcode}: post shortcode; empty for profile_pic and story.
        {urlname}: filename from the URL.
        {mediatype}: type of media.
        {datetime}: date and time the photo/video was posted,
                     format: 20180101 01h01m01s
        {date}: date the photo/video was posted,
                 format: 20180101
        {year}: format: 2018
        {month}: format: 01-12
        {day}: format: 01-31
        {h}: hour, format: 00-23h
        {m}: minute, format: 00-59m
        {s}: second, format: 00-59s

        """),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        fromfile_prefix_chars='@')
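    # fromfile_prefix_chars='@' makes argparse expand an "@file" argument by
    # reading one argument per line from that file; this is what powers the
    # credentials-file example shown in the epilog.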

    parser.add_argument('username',
                        help='Instagram user(s) to scrape',
                        nargs='*')
    parser.add_argument('--limit',
                        '-l',
                        help='Number of files to generate (default: %s)' %
                        DEFAULT_DOWNLOAD_LIMIT,
                        type=int)
    parser.add_argument('--destination',
                        '-d',
                        default='./',
                        help='Download destination')
    parser.add_argument('--login-user',
                        '--login_user',
                        '-u',
                        default=None,
                        help='Instagram login user')
    parser.add_argument('--login-pass',
                        '--login_pass',
                        '-p',
                        default=None,
                        help='Instagram login password')
    parser.add_argument(
        '--followings-input',
        '--followings_input',
        action='store_true',
        default=False,
        help='Compile list of profiles followed by login-user to use as input')
    parser.add_argument('--followings-output',
                        '--followings_output',
                        help='Output followings-input to file in destination')
    parser.add_argument(
        '--filename',
        '-f',
        help='Path to a file containing a list of users to scrape')
    parser.add_argument('--quiet',
                        '-q',
                        default=False,
                        action='store_true',
                        help='Be quiet while scraping')
    parser.add_argument('--maximum',
                        '-m',
                        type=int,
                        default=0,
                        help='Maximum number of items to scrape')
    parser.add_argument(
        '--retain-username',
        '--retain_username',
        '-n',
        action='store_true',
        default=False,
        help='Creates username subdirectory when destination flag is set')
    parser.add_argument('--media-metadata',
                        '--media_metadata',
                        action='store_true',
                        default=False,
                        help='Save media metadata to json file')
    parser.add_argument('--profile-metadata',
                        '--profile_metadata',
                        action='store_true',
                        default=False,
                        help='Save profile metadata to json file')
    parser.add_argument('--proxies',
                        default={},
                        help='Enable use of proxies; pass a valid JSON object '
                        'with http and/or https URLs.')
    parser.add_argument(
        '--include-location',
        '--include_location',
        action='store_true',
        default=False,
        help='Include location data when saving media metadata')
    parser.add_argument('--media-types',
                        '--media_types',
                        '-t',
                        nargs='+',
                        default=['image', 'video', 'story'],
                        help='Specify media types to scrape')
    parser.add_argument('--latest',
                        action='store_true',
                        default=False,
                        help='Scrape new media since the last scrape')
    parser.add_argument(
        '--latest-stamps',
        '--latest_stamps',
        default=None,
        help='Scrape new media since timestamps by user in specified file')
    parser.add_argument(
        '--cookiejar',
        '--cookierjar',  # sic: misspelled alias kept as in the original code
        default=None,
        help=
        'File in which to store cookies so that they can be reused between runs.'
    )
    parser.add_argument('--tag',
                        action='store_true',
                        default=False,
                        help='Scrape media using a hashtag')
    parser.add_argument('--filter',
                        default=None,
                        help='Filter by tags in user posts',
                        nargs='*')
    parser.add_argument('--location',
                        action='store_true',
                        default=False,
                        help='Scrape media using a location-id')
    parser.add_argument('--search-location',
                        action='store_true',
                        default=False,
                        help='Search for locations by name')
    parser.add_argument('--comments',
                        action='store_true',
                        default=False,
                        help='Save post comments to json file')
    parser.add_argument('--no-check-certificate',
                        action='store_true',
                        default=False,
                        help='Do not verify SSL certificates')
    parser.add_argument('--interactive',
                        '-i',
                        action='store_true',
                        default=False,
                        help='Enable interactive login challenge solving')
    parser.add_argument(
        '--retry-forever',
        action='store_true',
        default=False,
        help='Retry download attempts endlessly when errors are received')
    parser.add_argument('--verbose',
                        '-v',
                        type=int,
                        default=0,
                        help='Logging verbosity level')
    parser.add_argument('--template',
                        '-T',
                        type=str,
                        default='{urlname}',
                        help='Customize filename template')

    args = parser.parse_args()

    # exactly one of user/password was supplied
    if bool(args.login_user) != bool(args.login_pass):
        parser.print_help()
        raise ValueError('Must provide login user AND password')

    if not args.username and args.filename is None and not args.followings_input:
        parser.print_help()
        raise ValueError(
            'Must provide username(s) OR a file containing a list of username(s) OR pass --followings-input'
        )
    elif (args.username
          and args.filename) or (args.username and args.followings_input) or (
              args.filename and args.followings_input):
        parser.print_help()
        raise ValueError(
            'Must provide only one of the following: username(s) OR a filename containing username(s) OR --followings-input'
        )

    if args.tag and args.location:
        parser.print_help()
        raise ValueError(
            'Must provide only one of the following: hashtag OR location')

    if args.tag and args.filter:
        parser.print_help()
        raise ValueError('Filters apply to user posts only, not hashtag scrapes')

    if args.filename:
        args.usernames = InstagramScraper.parse_file_usernames(args.filename)
    else:
        args.usernames = InstagramScraper.parse_delimited_str(','.join(
            args.username))

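    # A single --media-types value such as "image,video" is split on commas,
    # semicolons, or whitespace into separate entries.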
    if args.media_types and len(
            args.media_types) == 1 and re.compile(r'[,;\s]+').findall(
                args.media_types[0]):
        args.media_types = InstagramScraper.parse_delimited_str(
            args.media_types[0])

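    # --retry-forever effectively disables the retry cap by raising the
    # module-level MAX_RETRIES to sys.maxsize.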
    if args.retry_forever:
        global MAX_RETRIES
        MAX_RETRIES = sys.maxsize

    scraper = InstagramScraper(**vars(args))

    if args.login_user and args.login_pass:
        scraper.authenticate_with_login()
    else:
        scraper.authenticate_as_guest()

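    # With --followings-input, the accounts followed by the logged-in user
    # become the scrape targets, optionally written out to --followings-output.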
    if args.followings_input:
        scraper.usernames = list(
            scraper.query_followings_gen(scraper.login_user))
        if args.followings_output:
            # join the path portably; plain concatenation breaks when the
            # destination lacks a trailing separator
            followings_path = os.path.join(scraper.destination,
                                           scraper.followings_output)
            with open(followings_path, 'w') as file:
                for username in scraper.usernames:
                    file.write(username + "\n")
            # If not requesting anything else, exit
            if args.media_types == ['none'] and args.media_metadata is False:
                scraper.logout()
                return

    if args.tag:
        scraper.scrape_hashtag()
    elif args.location:
        scraper.scrape_location()
    elif args.search_location:
        scraper.search_locations()
    else:
        scraper.scrape()

    scraper.save_cookies()
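    # Post-processing: overlay random emoji stickers onto each downloaded
    # photo, upload the variants in parallel, and accumulate the resulting
    # URLs in imgurls.txt as brace-delimited, "|"-separated groups.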
    with open("imgurls.txt", "a", encoding="utf8") as f:
        f.write("{")
    number = args.limit
    if number is None or number == 0:
        number = 1000
    for username in args.usernames:
        org_path = username
        all_img = glob.glob(org_path + "/*.jpg")
        for org_img in all_img:
            # strip the directory prefix portably (original assumed Windows "\\")
            outfile = os.path.basename(org_img)
            path = r'emoji'  # use your path
            all_emoji = glob.glob(path + "/*.png")
            all_count = number
            while all_count > 0:

                img = cv2.imread(org_img)
                img_cnt = random.randint(1, 6)
                height, width, channels = img.shape

                while img_cnt > 0:
                    # pick a random emoji without shadowing the built-in id()
                    # or hardcoding the number of emoji files
                    idx = randrange(len(all_emoji))
                    choose_emoji = all_emoji[idx]
                    # IMREAD_UNCHANGED keeps the PNG alpha channel
                    overlay_t = cv2.imread(choose_emoji, cv2.IMREAD_UNCHANGED)
                    img = overlay_transparent(img, overlay_t,
                                              random.randint(0, width - 75),
                                              random.randint(0, height - 75),
                                              (75, 75))
                    img_cnt -= 1

                output = "result/" + str(outfile) + "-" + str(
                    all_count) + ".png"
                cv2.imwrite(output, img)
                all_count -= 1

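            # Split the generated variants into 5 chunks and upload them in
            # parallel worker processes; UploadingImage (defined elsewhere) is
            # assumed to append each uploaded URL plus a delimiter to
            # imgurls.txt.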
            org_path = 'result'
            all_images = glob.glob(org_path + "/*.png")
            arr = np.array_split(all_images, 5)
            with open("imgurls.txt", "a", encoding="utf8") as f:
                f.write("{")
            with Pool(processes=5) as pool:
                pool.map(UploadingImage, arr)
            files = glob.glob('result/*')
            for f in files:
                os.remove(f)
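            # remove the trailing delimiter byte, then close this image's
            # group with "}|"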
            with open("imgurls.txt", 'rb+') as filehandle:
                filehandle.seek(-1, os.SEEK_END)
                filehandle.truncate()
            with open("imgurls.txt", "a", encoding="utf8") as f:
                f.write("}|")
            print("image done: " + org_img)

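        # trim the trailing "|" and terminate this user's output with "}"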
        with open("imgurls.txt", 'rb+') as filehandle:
            filehandle.seek(-1, os.SEEK_END)
            filehandle.truncate()
        with open("imgurls.txt", "a", encoding="utf8") as f:
            f.write("}")
Example #2
# Imports assumed by this example; InstagramScraper and MAX_RETRIES are
# expected to be defined elsewhere in the surrounding module.
import argparse
import os
import re
import sys
import textwrap


def scrape_photos(sourceUserFolder):
    parser = argparse.ArgumentParser(
        description=
        "instagram-scraper scrapes and downloads an instagram user's photos and videos.",
        epilog=textwrap.dedent("""
        You can hide your credentials from the shell history by reading your
        username from a local file:
        $ instagram-scraper @insta_args.txt user_to_scrape
        with insta_args.txt looking like this:
        -u=my_username
        -p=my_password
        You can add any arguments you want to that file, just remember to put
        one argument per line.
        Customize the filename by adding the option --template or -T.
        Default is: {urlname}
        The available placeholders are:
        {username}: Instagram user(s) to scrape.
        {shortcode}: post shortcode; empty for profile_pic and story.
        {urlname}: filename from the URL.
        {mediatype}: type of media.
        {datetime}: date and time the photo/video was posted,
                     format: 20180101 01h01m01s
        {date}: date the photo/video was posted,
                 format: 20180101
        {year}: format: 2018
        {month}: format: 01-12
        {day}: format: 01-31
        {h}: hour, format: 00-23h
        {m}: minute, format: 00-59m
        {s}: second, format: 00-59s
        """),
        formatter_class=argparse.RawDescriptionHelpFormatter,
        fromfile_prefix_chars='@')

    parser.add_argument('username',
                        help='Instagram user(s) to scrape',
                        nargs='*')
    parser.add_argument('--destination',
                        '-d',
                        default=sourceUserFolder,
                        help='Download destination')
    parser.add_argument('--login-user',
                        '--login_user',
                        '-u',
                        default=None,
                        help='Instagram login user')
    parser.add_argument('--login-pass',
                        '--login_pass',
                        '-p',
                        default=None,
                        help='Instagram login password')
    parser.add_argument(
        '--followings-input',
        '--followings_input',
        action='store_true',
        default=False,
        help='Compile list of profiles followed by login-user to use as input')
    parser.add_argument('--followings-output',
                        '--followings_output',
                        help='Output followings-input to file in destination')
    parser.add_argument(
        '--filename',
        '-f',
        help='Path to a file containing a list of users to scrape')
    parser.add_argument('--quiet',
                        '-q',
                        default=False,
                        action='store_true',
                        help='Be quiet while scraping')
    parser.add_argument('--maximum',
                        '-m',
                        type=int,
                        default=0,
                        help='Maximum number of items to scrape')
    parser.add_argument(
        '--retain-username',
        '--retain_username',
        '-n',
        action='store_true',
        default=False,
        help='Creates username subdirectory when destination flag is set')
    parser.add_argument('--media-metadata',
                        '--media_metadata',
                        action='store_true',
                        default=False,
                        help='Save media metadata to json file')
    parser.add_argument('--profile-metadata',
                        '--profile_metadata',
                        action='store_true',
                        default=False,
                        help='Save profile metadata to json file')
    parser.add_argument(
        '--proxies',
        default={},
        help='Enable use of proxies; pass a valid JSON object with http '
        'and/or https URLs.')
    parser.add_argument(
        '--include-location',
        '--include_location',
        action='store_true',
        default=False,
        help='Include location data when saving media metadata')
    parser.add_argument('--media-types',
                        '--media_types',
                        '-t',
                        nargs='+',
                        default=['image', 'video', 'story'],
                        help='Specify media types to scrape')
    parser.add_argument('--latest',
                        action='store_true',
                        default=False,
                        help='Scrape new media since the last scrape')
    parser.add_argument(
        '--latest-stamps',
        '--latest_stamps',
        default=None,
        help='Scrape new media since timestamps by user in specified file')
    parser.add_argument(
        '--cookiejar',
        '--cookierjar',  # sic: misspelled alias kept as in the original code
        default=None,
        help=
        'File in which to store cookies so that they can be reused between runs.'
    )
    parser.add_argument('--tag',
                        action='store_true',
                        default=False,
                        help='Scrape media using a hashtag')
    parser.add_argument('--filter',
                        default=None,
                        help='Filter by tags in user posts',
                        nargs='*')
    parser.add_argument(
        '--filter_location',
        default=None,
        nargs="*",
        help='Only accept media whose location matches one of the given '
        'location IDs')
    parser.add_argument(
        '--filter_location_file',
        default=None,
        type=str,
        help='File containing a list of location IDs to filter by')
    parser.add_argument('--location',
                        action='store_true',
                        default=False,
                        help='Scrape media using a location-id')
    parser.add_argument('--search-location',
                        action='store_true',
                        default=False,
                        help='Search for locations by name')
    parser.add_argument('--comments',
                        action='store_true',
                        default=False,
                        help='Save post comments to json file')
    parser.add_argument('--no-check-certificate',
                        action='store_true',
                        default=False,
                        help='Do not verify SSL certificates')
    parser.add_argument('--interactive',
                        '-i',
                        action='store_true',
                        default=False,
                        help='Enable interactive login challenge solving')
    parser.add_argument(
        '--retry-forever',
        action='store_true',
        default=False,
        help='Retry download attempts endlessly when errors are received')
    parser.add_argument('--verbose',
                        '-v',
                        type=int,
                        default=0,
                        help='Logging verbosity level')
    parser.add_argument('--template',
                        '-T',
                        type=str,
                        default='{urlname}',
                        help='Customize filename template')
    parser.add_argument(
        '--log_destination',
        '-l',
        type=str,
        default='',
        help='Destination folder for the instagram-scraper.log file')

    args = parser.parse_args()

    # The destination folder already defaults to sourceUserFolder (see
    # --destination above)

    # exactly one of user/password was supplied
    if bool(args.login_user) != bool(args.login_pass):
        parser.print_help()
        raise ValueError('Must provide login user AND password')

    if not args.username and args.filename is None and not args.followings_input:
        parser.print_help()
        raise ValueError(
            'Must provide username(s) OR a file containing a list of username(s) OR pass --followings-input'
        )
    elif (args.username
          and args.filename) or (args.username and args.followings_input) or (
              args.filename and args.followings_input):
        parser.print_help()
        raise ValueError(
            'Must provide only one of the following: username(s) OR a filename containing username(s) OR --followings-input'
        )

    if args.tag and args.location:
        parser.print_help()
        raise ValueError(
            'Must provide only one of the following: hashtag OR location')

    if args.tag and args.filter:
        parser.print_help()
        raise ValueError('Filters apply to user posts only, not hashtag scrapes')

    if (args.filter_location
            or args.filter_location_file) and not args.include_location:
        parser.print_help()
        raise ValueError(
            'Location filter needs locations in metadata to filter properly')

    if args.filename:
        args.usernames = InstagramScraper.get_values_from_file(args.filename)
    else:
        args.usernames = InstagramScraper.parse_delimited_str(','.join(
            args.username))

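    # Location filters may come either from a file or from the command line
    # as a delimited string.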
    if args.filter_location_file:
        args.filter_locations = InstagramScraper.get_values_from_file(
            args.filter_location_file)
    elif args.filter_location:
        args.filter_locations = InstagramScraper.parse_delimited_str(','.join(
            args.filter_location))

    if args.media_types and len(
            args.media_types) == 1 and re.compile(r'[,;\s]+').findall(
                args.media_types[0]):
        args.media_types = InstagramScraper.parse_delimited_str(
            args.media_types[0])

    if args.retry_forever:
        global MAX_RETRIES
        MAX_RETRIES = sys.maxsize

    scraper = InstagramScraper(**vars(args))

    if args.login_user and args.login_pass:
        scraper.authenticate_with_login()
    else:
        scraper.authenticate_as_guest()

    if args.followings_input:
        scraper.usernames = list(
            scraper.query_followings_gen(scraper.login_user))
        if args.followings_output:
            # join the path portably; plain concatenation breaks when the
            # destination lacks a trailing separator
            followings_path = os.path.join(scraper.destination,
                                           scraper.followings_output)
            with open(followings_path, 'w') as file:
                for username in scraper.usernames:
                    file.write(username + "\n")
            # If not requesting anything else, exit
            if args.media_types == ['none'] and args.media_metadata is False:
                scraper.logout()
                return

    if args.tag:
        scraper.scrape_hashtag()
    elif args.location:
        scraper.scrape_location()
    elif args.search_location:
        scraper.search_locations()
    else:
        scraper.scrape()

    scraper.save_cookies()
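
A minimal sketch of how scrape_photos might be driven, assuming the module context above; the staged argument values are hypothetical:

import sys

# Hypothetical driver: scrape_photos() parses options from sys.argv via
# argparse, so a caller can stage the CLI arguments before invoking it.
if __name__ == '__main__':
    sys.argv = ['instagram-scraper', 'user_to_scrape',
                '-u', 'my_username', '-p', 'my_password',
                '--media-types', 'image']
    scrape_photos('./downloads')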