def scrapeVideos(username = "",
                 password = "",
                 output_folder = "",
                 days = 1):
        
    print("Starting Scraping")

    L = instaloader.Instaloader()

    # Log in with the provided credentials
    L.login(username, password)
    profile = instaloader.Profile.from_username(L.context, username)
    following = profile.get_followees()
    print("Found " + str(profile.followees) + " followed accounts")

    for followee in following:
        acc = followee.username
        looter = ProfileLooter(acc, videos_only=True, template="{id}-{username}-{width}-{height}")
        if not looter.logged_in():
            looter.login(username, password)
        print("Scraping From Account: " + acc)

        today = datetime.date.today()
        timeframe = (today, today - dateutil.relativedelta.relativedelta(days=days))
        numDownloaded = looter.download(output_folder, media_count=30, timeframe=timeframe)
        print("Downloaded " + str(numDownloaded) + " videos successfully")
        print("")
Example #2
import unittest

import fs.memoryfs

from instalooter.looters import ProfileLooter

# USERNAME and PASSWORD are test credentials defined elsewhere in the test module.


class TestLogin(unittest.TestCase):
    def setUp(self):
        self.looter = ProfileLooter(USERNAME, template="test")
        self.destfs = fs.memoryfs.MemoryFS()

    def tearDown(self):
        self.destfs.close()

    def test_login(self):

        self.assertFalse(self.looter.logged_in())
        self.assertRaises(RuntimeError, self.looter.medias)
        self.assertFalse(self.looter._cachefs.exists(self.looter._COOKIE_FILE))

        try:
            self.looter.login(USERNAME, PASSWORD)
            self.assertTrue(self.looter.logged_in())
            self.assertTrue(
                self.looter._cachefs.exists(self.looter._COOKIE_FILE))
            self.assertTrue(next(self.looter.medias()))
        finally:
            self.looter.logout()
            self.assertFalse(
                self.looter._cachefs.exists(self.looter._COOKIE_FILE))

    def test_download(self):

        try:
            self.looter.login(USERNAME, PASSWORD)
            self.looter.download(self.destfs)
            self.assertTrue(self.destfs.exists('test.jpg'))
            self.assertEqual(self.destfs.getbytes('test.jpg')[6:10], b'JFIF')
        finally:
            self.looter.logout()
Example #3
import unittest

import fs.memoryfs
import requests

from instalooter.looters import InstaLooter, ProfileLooter

# USERNAME and PASSWORD are test credentials supplied by the surrounding test module.


class TestLogin(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.session = requests.Session()
        InstaLooter._user_agent = cls.session.headers["User-Agent"]

    @classmethod
    def tearDownClass(cls):
        cls.session.close()
        del InstaLooter._user_agent

    def setUp(self):
        self.looter = ProfileLooter(USERNAME, template="test")
        self.destfs = fs.memoryfs.MemoryFS()

    def tearDown(self):
        self.destfs.close()

    def test_login(self):

        self.assertFalse(self.looter.logged_in())
        self.assertRaises(RuntimeError, self.looter.medias)
        self.assertFalse(self.looter._cachefs.exists(self.looter._COOKIE_FILE))

        try:
            self.looter.login(USERNAME, PASSWORD)
            self.assertTrue(self.looter.logged_in())
            self.assertTrue(
                self.looter._cachefs.exists(self.looter._COOKIE_FILE))
            self.assertTrue(next(self.looter.medias()))
        finally:
            self.looter.logout()
            self.assertFalse(
                self.looter._cachefs.exists(self.looter._COOKIE_FILE))

    def test_download(self):
        try:
            self.looter.login(USERNAME, PASSWORD)
            self.looter.download(self.destfs)
            self.assertTrue(self.destfs.exists('test.jpg'))
            self.assertEqual(self.destfs.getbytes('test.jpg')[6:10], b'JFIF')
        finally:
            self.looter.logout()
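
# A minimal sketch of running this test case directly (the environment variable
# names below are assumptions, not part of the original suite):
if __name__ == "__main__":
    import os
    USERNAME = os.environ.get("INSTALOOTER_TEST_USERNAME", "")
    PASSWORD = os.environ.get("INSTALOOTER_TEST_PASSWORD", "")
    unittest.main()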
Example #4
from instalooter.looters import ProfileLooter
import datetime
import dateutil.relativedelta

# instalooter_test downloads videos posted by daquan in the last month

# Instantiate the looter for the target profile
looter = ProfileLooter("daquan",
                       videos_only=True,
                       template="{id}-{username}-{width}-{height}")
looter.login("", "")

today = datetime.date.today()
thismonth = (today, today - dateutil.relativedelta.relativedelta(days=28))

looter.download('./Memes_December_4', media_count=50, timeframe=thismonth)
Example #5
class InstagramFeedMediaChannelMixin(object):

    LISTING_CLASS = InstagramMediaListing

    POST_TYPE_MAP = {
        "GraphImage": "image",
        "GraphVideo": "video",
        "GraphSidecar": "carousel"
    }

    looter_ : typing.Any = None

    @property
    @db_session
    def end_cursor(self):
        return self.attrs.get("end_cursor", None)

    @db_session
    def save_end_cursor(self, timestamp, end_cursor):
        self.attrs["end_cursor"] = [timestamp, end_cursor]
        commit()

    @property
    def looter(self):
        if not hasattr(self, "looter_") or not self.looter_ or self.looter_._username != self.locator[1:]:
            self.looter_ = ProfileLooter(self.locator[1:])
            if self.provider.config.credentials and not self.looter_.logged_in():
                self.looter_.login(**self.provider.session_params)
        return self.looter_


    def get_post_info(self, shortcode):

        return self.looter.get_post_info(shortcode)

    @property
    def posts(self):

        url = f"https://www.instagram.com/{self.locator[1:]}/?__a=1"
        data = self.looter.session.get(url).json()
        return data["graphql"]["user"]["edge_owner_to_timeline_media"]["count"]

    def extract_content(self, post):

        media_type = self.POST_TYPE_MAP[post["__typename"]]

        if media_type == "image":
            content = [
                dict(
                    url = post.display_url,
                    media_type = media_type,
                    shortcode = post.shortcode
                )
            ]
        elif media_type == "video":
            if post.get("video_url"):
                content = [
                    dict(
                        url = post.video_url,
                        url_thumbnail = post.display_url,
                        media_type = media_type,
                        shortcode = post.shortcode
                    )
                ]
            else:
                content = [
                    dict(
                        url = None,
                        url_thumbnail = post.display_url,
                        media_type = media_type,
                        shortcode = post.shortcode
                    )
                ]

        elif media_type == "carousel":
            if post.get('edge_sidecar_to_children'):
                content = [
                    dict(
                        url = s.video_url if s.is_video else s.display_url,
                        url_thumbnail = s.display_url,
                        media_type = "video" if s.is_video else "image",
                        shortcode = post.shortcode
                    )
                    for s in [AttrDict(e['node']) for e in post['edge_sidecar_to_children']['edges']]
                ]
            else:
                content = [
                    dict(
                        url = None,
                        url_thumbnail = post.display_url,
                        media_type = media_type
                    )
                ]

        else:
            raise Exception(f"invalid media type: {media_type}")

        return content


    async def fetch(self, limit=None, resume=False, replace=False):

        logger.info(f"fetching {self.locator} {resume}, {replace}")

        # update cached post count
        with db_session:
            self.attrs["posts"] = self.posts

        try:
            (_, end_cursor) = self.end_cursor if resume else None
        except TypeError:
            end_cursor = None

        logger.info(f"cursor: {end_cursor}")
        try:
            self.pages = self.looter.pages(cursor=end_cursor)
        except ValueError:
            self.looter_.logout()
            self.looter_.login(
                username=self.provider.session_params["username"],
                password=self.provider.session_params["password"],
            )
            self.pages = self.looter.pages(cursor=end_cursor)

        # def get_posts(pages):
        #     posts = list()
        #     for page in pages:
        #         cursor = page["edge_owner_to_timeline_media"]["page_info"]["end_cursor"]
        #         for media in self.looter._medias(iter([page])):
        #             posts.append((cursor, AttrDict(media)))
        #     return posts
        #
        def get_posts(pages):
            try:
                for page in pages:
                    cursor = page["edge_owner_to_timeline_media"]["page_info"]["end_cursor"]
                    for media in self.looter._medias(iter([page])):
                        yield (cursor, AttrDict(media))
            except json.decoder.JSONDecodeError:
                logger.error("".join(traceback.format_exc()))
                # PEP 479: raising StopIteration inside a generator becomes a
                # RuntimeError, so end the generator by returning instead
                return

        count = 0
        new_count = 0

        # run_in_executor only constructs the generator off the event loop;
        # the posts themselves are pulled lazily by the loop below
        posts = state.event_loop.run_in_executor(
            None, get_posts, self.pages
        )

        for end_cursor, post in await posts:

            count += 1

            logger.info(f"cursor: {end_cursor}")

            logger.debug(f"{count} {new_count} {limit}")

            if limit is not None and (new_count >= limit or (new_count == 0 and count >= limit)):
                break

            created_timestamp = post.get(
                "date", post.get("taken_at_timestamp")
            )

            if end_cursor and (self.end_cursor is None or created_timestamp < self.end_cursor[0]):
                logger.info(f"saving end_cursor: {created_timestamp}, {self.end_cursor[0] if self.end_cursor else None}")
                self.save_end_cursor(created_timestamp, end_cursor)

            created = datetime.utcfromtimestamp(created_timestamp)

            i = self.items.select(lambda i: i.guid == post.shortcode).first()

            if i and not replace:
                logger.debug(f"old: {created}")
                return
            else:
                logger.debug(f"new: {created}")
                caption = (
                    post["edge_media_to_caption"]["edges"][0]["node"]["text"]
                    if "edge_media_to_caption" in post and post["edge_media_to_caption"]["edges"]
                    else  post["caption"]
                    if "caption" in post
                    else None
                )

                try:
                    media_type = self.POST_TYPE_MAP[post["__typename"]]
                except KeyError:
                    logger.warning(f"unknown post type: {post.__typename}")
                    continue

                content = self.extract_content(post)

                i = dict(
                    channel = self,
                    guid = post.shortcode,
                    title = (caption or "(no caption)").replace("\n", " "),
                    created = created,
                    media_type = media_type,
                    sources =  content,
                    attrs = dict(
                        short_code = post.shortcode
                    ),
                    is_inflated = media_type == "image"
                )
                new_count += 1
                yield i

    @db_session
    def reset(self):
        super().reset()
        if "post_iter" in self.attrs:
            del self.attrs["post_iter"]
            commit()
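
# A minimal sketch (not part of the original mixin) of the lookup that
# extract_content performs: the GraphQL "__typename" of a post is translated to
# a simple media type, and posts are wrapped so keys can be read as attributes.
# _DemoAttrDict is a hypothetical stand-in for the AttrDict used above.
class _DemoAttrDict(dict):
    """Dict that also exposes its keys as attributes."""
    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)


if __name__ == "__main__":
    post = _DemoAttrDict({
        "__typename": "GraphImage",
        "display_url": "https://example.com/photo.jpg",
        "shortcode": "ABC123",
    })
    media_type = InstagramFeedMediaChannelMixin.POST_TYPE_MAP[post["__typename"]]
    print(media_type, post.display_url, post.shortcode)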
Example #6
if not os.path.exists(ThumbsFilePath):
	os.makedirs(ThumbsFilePath)
UserFilePath='./users/'
if not os.path.exists(UserFilePath):
	os.makedirs(UserFilePath)
img_src=[]
#Grab all the thumbnails
for i in range(0,len(img)):
	img_src.append(img[i].get_attribute('src'))
	# print(img_src)
	os.system('wget -q -O '+ThumbsFilePath+followinglist[i]+'.jpg '+img_src[i]+' &')

# Close selenium
driver.quit()
# Log in to Instagram; instalooter caches the session cookie, so the
# per-account looters created below reuse this login
looter = ProfileLooter("instagram")
looter.login(username_, password_)
#Loop through all the people who are being followed and grab their photo urls
for i in followinglist:
	try:
		print(i)
		i = i.strip()
		looter=ProfileLooter(i)
		with open(UserFilePath+i+".txt", "a") as output:
			for media in looter.medias():
				for link in instalinks(media,looter):
					if not (os.path.isfile(UserFilePath+i+"/"+link.split('/')[-1])):
						print(link)
						output.write("{}\n".format(link))
					else:
						print("Image already exists")
		#Wget from the file