Example #1
def extract_user_posts(browser, num_of_posts_to_do):
    links2, preview_imgs = get_num_posts(browser, num_of_posts_to_do)

    post_infos = []

    counter = 1
    # user_commented_total_list collects the usernames of everyone who commented on any of this user's posts
    user_commented_total_list = []

    for postlink in links2:

        InstaLogger.logger().info(f"\n {counter} / {len(links2)}")
        counter = counter + 1

        try:
            instagram_post = InstagramPost(browser, postlink)
            instagram_post.extract_post_info()

            location = {
                'location_url': instagram_post.location_url,
                'location_name': instagram_post.location_name,
                'location_id': instagram_post.location_id,
                'latitude': instagram_post.lat,
                'longitude': instagram_post.lng,
            }

            post_infos.append({
                'caption': instagram_post.caption,
                'location': location,
                'imgs': instagram_post.imgs,
                'imgdesc': instagram_post.imgdesc,
                'preview_img': preview_imgs.get(instagram_post.postlink, None),
                'date': instagram_post.date,
                'tags': instagram_post.tags,
                'likes': {
                    'count': instagram_post.likes,
                    'list': instagram_post.user_liked_list
                },
                'views': instagram_post.views,
                'url': instagram_post.postlink,
                'comments': {
                    'count': instagram_post.commentscount,
                    'list': instagram_post.user_comments
                },
                'mentions': instagram_post.mentions
            })
            user_commented_total_list = user_commented_total_list + instagram_post.user_commented_list
        except NoSuchElementException as err:
            InstaLogger.logger().error("Could not get information from post: " + instagram_post.postlink)
            InstaLogger.logger().error(err)
    return post_infos, user_commented_total_list
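
A minimal usage sketch for extract_user_posts, assuming a logged-in Selenium session with the target profile page already open (get_num_posts scrolls that page to collect the post links); the browser setup shown here is illustrative, not part of the project:

from selenium import webdriver

# assumption: any Selenium driver the rest of this module's helpers can use
browser = webdriver.Firefox()
browser.get("https://www.instagram.com/instagram/")  # hypothetical profile, log in beforehand if required

posts, commenters = extract_user_posts(browser, num_of_posts_to_do=12)
print(len(posts), "posts scraped,", len(commenters), "commenter entries")
browser.quit()
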
def extract_post_caption(user_comments, username):
    tags = []
    caption = ''
    try:
        if len(user_comments) > 0:
            user_commented = user_comments[0]
            if username == user_commented['user']:
                caption = user_commented['comment']
                InstaLogger.logger().info("caption: " + caption)
                tags = findall(r'#[A-Za-z0-9]*', caption)
    except Exception as err:
        InstaLogger.logger().error("Error - getting caption")
        InstaLogger.logger().error(err)
    return caption, tags
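
To illustrate the hashtag extraction above, the same findall pattern can be run standalone; note that it only matches ASCII letters and digits, so underscores or accented characters cut a hashtag short:

from re import findall

caption = "Sunset in Lisboa #travel #sunset2024 #boa_vida"
print(findall(r'#[A-Za-z0-9]*', caption))
# ['#travel', '#sunset2024', '#boa']  <- '#boa_vida' is truncated at the underscore
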
Example #3
    def extract_image_data(self):
        img_tags = []
        imgs = []
        imgdesc = []

        img_tags = self.post.find_elements_by_class_name('FFVAD')
        InstaLogger.logger().info("number of images: " + str(len(img_tags)))

        for i in img_tags:
            imgs.append(i.get_attribute('src'))
            imgdesc.append(i.get_attribute('alt'))
            InstaLogger.logger().info(f"post image: {imgs[-1]}")
            InstaLogger.logger().info(f"alt text: {imgdesc[-1]}")

        return img_tags, imgs, imgdesc
Example #4
    def __init__(self, browser, postlink):
        self.browser = browser
        self.postlink = postlink

        try:
            InstaLogger.logger().info("Scraping Post Link: " + self.postlink)
            web_adress_navigator(self.browser, self.postlink)
        except PageNotFound404 as e:
            raise NoInstaPostPageFound(e)
        except NoSuchElementException as err:
            InstaLogger.logger().error("Could not get information from post: " + self.postlink)
            InstaLogger.logger().error(err)
            pass

        self.post = self.browser.find_element_by_class_name('ltEKP')
Example #5
def extract_post_mentions(browser, post):
    mentions = []
    if (Settings.mentions is False):
        return mentions

    try:
        if post.find_elements_by_class_name('xUdfV'):  # perhaps JYWcJ
            mention_list = post.find_elements_by_class_name('xUdfV')  # perhaps JYWcJ
            for mention in mention_list:
                user_mention = mention.get_attribute("href").split('/')
                mentions.append(user_mention[3])
            InstaLogger.logger().info("mentions: " + str(len(mentions)))
    except Exception as err:
        InstaLogger.logger().error("Error - getting mentions")
        InstaLogger.logger().error(err)
    return mentions
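
The index [3] above relies on profile links having the form https://www.instagram.com/<username>/; a quick standalone check of that split:

href = "https://www.instagram.com/some_user/"
parts = href.split('/')
# parts == ['https:', '', 'www.instagram.com', 'some_user', '']
print(parts[3])  # some_user
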
Example #6
def _extract_post_info(proxy_browser, post_link):
    try:

        caption, location_url, location_name, location_id, lat, lng, imgs, img_desc, tags, likes, comments_count, \
        date, user_commented_list, user_comments, mentions, user_liked_post, views = extract_post_info(proxy_browser, post_link)

        location = {
            'location_url': location_url,
            'location_name': location_name,
            'location_id': location_id,
            'latitude': lat,
            'longitude': lng,
        }

        return {
            'caption': caption,
            'location': location,
            'imgs': imgs,
            'imgdesc': img_desc,
            'date': date,
            'tags': tags,
            'likes': {
                'count': likes,
                'list': user_liked_post
            },
            'views': views,
            'url': post_link,
            'comments': {
                'count': comments_count,
                'list': user_comments
            },
            'mentions': mentions
        }
    except NoSuchElementException as err:
        InstaLogger.logger().error("Could not get information from post: " +
                                   post_link)
        InstaLogger.logger().error(err)
    except Exception as ex:
        InstaLogger.logger().error("Could not get information from post: " +
                                   post_link)
        InstaLogger.logger().error(ex)
    return None
def extract_post_info(browser, postlink):
    """Get the information from the current post"""

    try:
        InstaLogger.logger().info("Scrapping Post Link: " + postlink)
        web_adress_navigator(browser, postlink)
    except PageNotFound404 as e:
        raise NoInstaPostPageFound(e)
    except NoSuchElementException as err:
        InstaLogger.logger().error("Could not get information from post: " +
                                   postlink)
        InstaLogger.logger().error(err)
        pass

    post = browser.find_element_by_class_name('ltEKP')
    date = ''
    # Get caption
    caption = ''
    username = ''
    try:
        username = post.find_element_by_class_name('e1e1d').text
    except:
        InstaLogger.logger().error("ERROR - getting Post infos (username) ")

    # Get location details
    location_url = ''
    location_name = ''
    location_id = 0
    lat = ''
    lng = ''
    imgs = []
    img = ''

    try:
        # Location url and name
        location_div = post.find_element_by_class_name(
            'M30cS').find_elements_by_tag_name('a')
        if location_div:
            location_url = location_div[0].get_attribute('href')
            location_name = location_div[0].text
            # Longitude and latitude
            location_id = location_url.strip(
                'https://www.instagram.com/explore/locations/').split('/')[0]
            url = 'https://www.instagram.com/explore/locations/' + location_id + '/?__a=1'
            response = requests.get(url)
            data = response.json()
            lat = data['graphql']['location']['lat']
            lng = data['graphql']['location']['lng']
        InstaLogger.logger().info("location_id: " + location_id)
        InstaLogger.logger().info("location_url: " + location_url)
        InstaLogger.logger().info("location_name: " + location_name)
        InstaLogger.logger().info("lat: " + lat)
        InstaLogger.logger().info("lng: " + lng)
    except:
        InstaLogger.logger().error("getting Location Infos  (perhaps not set)")

    try:
        date = post.find_element_by_xpath('//a/time').get_attribute("datetime")
        InstaLogger.logger().info("Post date: " + str(date))
    except:
        InstaLogger.logger().error("ERROR - getting Post Date ")

    try:
        imgs = post.find_elements_by_tag_name('img')
        if len(imgs) >= 2:
            img = imgs[1].get_attribute('src')
        else:
            img = imgs[0].get_attribute('src')
        InstaLogger.logger().info("post image: " + img)
    except:
        InstaLogger.logger().error("ERROR - Post Image ")

    likes = 0

    try:
        #if len(post.find_elements_by_xpath('//article/div/section')) > 2:
        likes_element = post.find_elements_by_xpath(
            '//article/div[2]/section[2]/div/div/a/span')
        if len(likes_element) > 1:
            likes = str(likes_element[1].text)
        else:
            likes = str(likes_element[0].text)

        likes = likes.replace(',', '').replace('.', '')
        likes = likes.replace('k', '00')
        InstaLogger.logger().info("post likes: " + likes)
    except Exception as err:
        InstaLogger.logger().error("ERROR - Getting Post Likes")
        InstaLogger.logger().error(err)
    # if likes is not known, it would cause errors to convert empty string to int

    try:
        likes = int(likes)
    except Exception as err:
        InstaLogger.logger().error(
            "ERROR - Extracting number of likes failed. Saving likes as -1")
        InstaLogger.logger().error(err)
        likes = -1

    user_comments = []
    user_commented_list = []
    user_liked_list = []
    mentions = []
    tags = []
    caption = ''
    commentscount = 0

    try:
        user_comments, user_commented_list, commentscount = extract_post_comments(
            browser, post)
    except:
        InstaLogger.logger().error(
            "ERROR - getting Post comments (extract_post_comments failed)")

    try:
        caption, tags = extract_post_caption(user_comments, username)
        # delete the first comment because it is the caption posted by the user
        if len(caption) > 0:
            user_comments.pop(0)
    except:
        InstaLogger.logger().error(
            "ERROR - getting Post caption/tags function")

    try:
        mentions = extract_post_mentions(browser, post)
    except:
        InstaLogger.logger().error("ERROR - getting Post Mentions function")

    try:
        user_liked_list = extract_post_likers(browser, post, postlink, likes)
    except:
        InstaLogger.logger().error("ERROR - getting Post Likers function")

    return caption, location_url, location_name, location_id, lat, lng, img, tags, int(
        likes
    ), commentscount, date, user_commented_list, user_comments, mentions, user_liked_list
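
The like-count handling above is a rough string normalization; as a standalone sketch it behaves as follows (the 'k' -> '00' substitution is only an approximation and undercounts values such as "10k"):

def normalize_likes(text):
    # same transformation used in extract_post_info: strip separators, expand 'k'
    text = text.replace(',', '').replace('.', '')
    text = text.replace('k', '00')
    try:
        return int(text)
    except ValueError:
        return -1  # same fallback the scraper uses when parsing fails

print(normalize_likes("1,234"))  # 1234
print(normalize_likes("1.2k"))   # 1200
print(normalize_likes("10k"))    # 1000  <- approximation artifact
print(normalize_likes(""))       # -1
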
Example #8
def extract_post_likers(browser, post, postlink, likes):
    user_liked_list = []

    xpath_identifier_user = "******"

    if (Settings.scrape_posts_likers is False):
        return user_liked_list
    else:
        InstaLogger.logger().info("GETTING LIKERS FROM POST")

    postlink = postlink + "liked_by/"
    tried_catch_likers = 0
    likers_list_before = 0
    try:

        # post.find_element_by_xpath("//a[contains(@class, 'zV_Nj')]").click()
        elementToClick = post.find_element_by_xpath(
            "//a[contains(@class, 'zV_Nj')]")
        browser.execute_script("arguments[0].click();", elementToClick)
        sleep(3)
        # likers_list = post.find_elements_by_xpath("//li[@class='wo9IH']//a[contains(@class, 'FPmhX')]")
        likers_list = post.find_elements_by_xpath(xpath_identifier_user)
        print("LÄNGE " + str(len(likers_list)) + "")
        while len(likers_list) < likes:

            InstaLogger.logger().info("new likers in actual view: " +
                                      str(len(likers_list)) + " - list: " +
                                      str(len(user_liked_list)) +
                                      " should be " + str(likes) +
                                      " -- scroll for more")
            try:
                div_likebox_elem = browser.find_element_by_xpath(
                    "//div[contains(@class, 'i0EQd')]/div/div/div[last()]"
                )  # old:wwxN2
                # browser.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight", div_likebox_elem)
                browser.execute_script("arguments[0].scrollIntoView(true);",
                                       div_likebox_elem)
            except BaseException as e:
                tried_catch_likers = tried_catch_likers + 1
                div_likebox_elem = browser.find_element_by_xpath(
                    "//div[contains(@class, 'i0EQd')]/div/div/div[1]")
                browser.execute_script("arguments[0].scrollIntoView(true);",
                                       div_likebox_elem)
                print("error on scrolling - next try (tried: " +
                      str(tried_catch_likers) + ") Message:" + e)

            sleep(Settings.sleep_time_between_post_scroll)
            # likers_list = post.find_elements_by_xpath(" //li[@class='wo9IH']//a[contains(@class, 'FPmhX')]")
            likers_list = post.find_elements_by_xpath(xpath_identifier_user)
            for liker in likers_list:
                user_like = liker.get_attribute("href").split('/')
                username_liked_post = user_like[3]
                if username_liked_post not in user_liked_list:
                    user_liked_list.append(username_liked_post)

            if (likers_list_before == len(user_liked_list)):
                tried_catch_likers = tried_catch_likers + 1
                print("error on scrolling - next try (tried: " +
                      str(tried_catch_likers) + ")")
                sleep(Settings.sleep_time_between_post_scroll * 1.5)
                div_likebox_elem = browser.find_element_by_xpath(
                    "//div[contains(@class, 'i0EQd')]/div/div/div[1]")
                browser.execute_script("arguments[0].scrollIntoView(true);",
                                       div_likebox_elem)

            if tried_catch_likers > 10:
                InstaLogger.logger().error("exit scrolling likers " +
                                           str(tried_catch_likers) +
                                           "x tries - liker list: " +
                                           str(len(user_liked_list)) +
                                           " should be " + str(likes) + "")
                break
            likers_list_before = len(user_liked_list)

        InstaLogger.logger().info('likers: ' + str(len(user_liked_list)))

    except BaseException as e:
        InstaLogger.logger().error("Error - getting post likers")
        InstaLogger.logger().error(e)
    return user_liked_list
Example #9
def quick_post_extract(browser, num_of_posts_to_do):
    body_elem = browser.find_element_by_tag_name('body')

    previouslen = 0
    breaking = 0

    num_of_posts_to_scroll = 12 * math.ceil(num_of_posts_to_do / 12)

    post_infos = []
    posts_set = set()
    posts_set_len = 0

    while (posts_set_len < num_of_posts_to_do):
        print(posts_set_len)

        JSGetPostsFromReact = """
            var feed = document.getElementsByTagName('article')[0];
            var __reactInternalInstanceKey = Object.keys(feed).filter(k=>k.startsWith('__reactInternalInstance'))[0]
            var posts = feed[__reactInternalInstanceKey].return.stateNode.state.combinedPosts
            return posts;
        """
        posts_json = browser.execute_script(JSGetPostsFromReact)

        for post_json in posts_json:
            # TODO: Convert to InstagramPost
            # instagram_post = InstagramPost.from_react_json(post_json)
            post_code = post_json['code']
            if post_code in posts_set:
                continue

            posts_set.add(post_code)

            location = {}
            if post_json.get('location'):
                loc_id = post_json['location']['id']
                loc_slug = post_json['location']['slug']

                location = {
                    'location_url':
                    f"https://www.instagram.com/explore/locations/{loc_id}/{loc_slug}/",
                    'location_name': post_json['location']['name'],
                    'location_id': loc_id,
                    'latitude': post_json['location']['lat'],
                    'longitude': post_json['location']['lng'],
                }

            num_comments = post_json['numComments']
            num_likes = post_json.get('numLikes') or post_json.get(
                'numPreviewLikes', -1)

            post_infos.append({
                'caption': post_json.get('caption'),
                'location': location,
                'imgs': [],
                'imgdesc': [],
                'preview_img': post_json['thumbnailResources'],
                'date': post_json['postedAt'],
                'tags': [],
                'likes': {
                    'count': num_likes,
                    'list': []
                },
                'views': post_json.get('videoViews', -1),
                'url': f"https://www.instagram.com/p/{post_code}/",
                'comments': {
                    'count': num_comments,
                    'list': []
                },
                'mentions': []
            })

        body_elem.send_keys(Keys.END)
        sleep(Settings.sleep_time_between_post_scroll)

        posts_set_len = len(posts_set)
        ##remove below part to never break the scrolling script before reaching the num_of_posts
        if (posts_set_len == previouslen):
            breaking += 1
            InstaLogger.logger().info(
                f"breaking in {4 - breaking}...\nIf you believe this is only caused by slow internet, increase sleep time 'sleep_time_between_post_scroll' in settings.py"
            )
        else:
            breaking = 0

        if breaking > 3:
            InstaLogger.logger().info(
                "Not getting any more posts, ending scrolling")
            sleep(2)
            break

        previouslen = posts_set_len

    return post_infos, []
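
A hypothetical usage sketch for this lightweight variant: it reads post data straight from the page's React state, so the browser must already be on the profile page, no individual post pages are opened, and likes/comments come back as counts only (the lists stay empty):

# assumes `browser` is a Selenium driver with the profile page already loaded
posts, _ = quick_post_extract(browser, num_of_posts_to_do=24)
for p in posts:
    print(p['url'], p['likes']['count'], p['comments']['count'])
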
Example #10
def extract_followers(browser, username):
    InstaLogger.logger().info('Extracting followers from ' + username)
    try:
        user_link = "https://www.instagram.com/{}".format(username)
        web_adress_navigator(browser, user_link)
    except PageNotFound404 as e:
        raise NoInstaProfilePageFound(e)
    sleep(5)

    followers = []

    # find number of followers
    elem = browser.find_element_by_xpath(
        "//div[@id='react-root']//header[@class='vtbgv ']//ul[@class='k9GMp ']/child::li[2]/a/span"
    )
    elem.click()
    sleep(15)

    # remove suggestion list and load 24 list elements after this
    browser.execute_script(
        "document.getElementsByClassName('isgrP')[0].scrollTo(0,500)")
    sleep(10)

    elems = browser.find_elements_by_xpath(
        "//body//div[@class='PZuss']//a[@class='FPmhX notranslate  _0imsa ']")
    for i in range(12):
        val = elems[i].get_attribute('innerHTML')
        followers.append(val)

    for i in range(12):
        browser.execute_script(
            "document.getElementsByClassName('PZuss')[0].children[0].remove()")

    isDone = False

    while 1:
        try:

            start = time()
            browser.execute_script(
                "document.getElementsByClassName('isgrP')[0].scrollTo(0,document.getElementsByClassName('isgrP')[0].scrollHeight)"
            )

            while 1:
                try:
                    if int(
                            browser.execute_script(
                                "return document.getElementsByClassName('PZuss')[0].children.length"
                            )) == 24:
                        break
                except (KeyboardInterrupt, SystemExit):
                    # f.close()
                    raise
                except:
                    continue
                if time() - start > 10:
                    isDone = True
                    break

            if isDone:
                break

            elems = browser.find_elements_by_xpath(
                "//body//div[@class='PZuss']//a[@class='FPmhX notranslate  _0imsa ']"
            )
            list_segment = ""
            for i in range(12):
                val = elems[i].get_attribute('innerHTML')
                list_segment += (val + '\n')
                followers.append(val)

            for i in range(12):
                browser.execute_script(
                    "document.getElementsByClassName('PZuss')[0].children[0].remove()"
                )

            InstaLogger.logger().info(time() - start)

        except (KeyboardInterrupt, SystemExit):
            # f.close()
            raise
        except:
            continue

    list_segment = ""
    elems = browser.find_elements_by_xpath(
        "//body//div[@class='PZuss']//a[@class='FPmhX notranslate  _0imsa ']")
    for i in range(len(elems)):
        val = elems[i].get_attribute('innerHTML')
        list_segment += (val + '\n')
        followers.append(val)

    return followers
Example #11
def get_user_info(browser, username):
    """Get the basic user info from the profile screen"""
    num_of_posts = 0
    followers = {'count': 0}
    following = {'count': 0}
    prof_img = ""
    bio = ""
    bio_url = ""
    alias = ""
    container = browser.find_element_by_class_name('v9tJq')
    isprivate = False
    try:
        if container.find_element_by_class_name('Nd_Rl'):
            isprivate = True
    except:
        isprivate = False

    try:
        alias = container.find_element_by_class_name(
            '-vDIg').find_element_by_tag_name('h1').text
    except:
        InstaLogger.logger().info("alias is empty")

    try:
        bio = container.find_element_by_class_name('-vDIg') \
            .find_element_by_tag_name('span').text
    except:
        InstaLogger.logger().info("Bio is empty")

    try:
        bio_url = container.find_element_by_class_name('yLUwa').text
    except:
        InstaLogger.logger().info("Bio Url is empty")

    try:
        img_container = browser.find_element_by_class_name('RR-M-')
        prof_img = img_container.find_element_by_tag_name('img').get_attribute(
            'src')
    except:
        InstaLogger.logger().info("image is empty")

    try:
        infos = container.find_elements_by_class_name('Y8-fY')

        try:
            num_of_posts = extract_exact_info(infos[0])
        except:
            InstaLogger.logger().error("Number of Posts empty")

        try:
            following = {'count': extract_exact_info(infos[2])}
        except:
            InstaLogger.logger().error("Following is empty")

        try:
            followers = {'count': extract_exact_info(infos[1])}

            try:
                if Settings.scrape_follower == True:
                    if isprivate == True:
                        InstaLogger.logger().info(
                            "Cannot get Follower List - private account")
                    else:
                        followers['list'] = extract_followers(
                            browser, username)
            except Exception as exception:
                # Output unexpected Exceptions.
                print("Unexpected error:", sys.exc_info()[0])
                print(exception)

                InstaLogger.logger().error("Cannot get Follower List")
        except:
            InstaLogger.logger().error("Follower is empty")
    except:
        InstaLogger.logger().error("Infos (Following, Abo, Posts) is empty")

    information = {
        'alias': alias,
        'username': username,
        'bio': bio,
        'prof_img': prof_img,
        'num_of_posts': num_of_posts,
        'followers': followers,
        'following': following,
        'bio_url': bio_url,
        'isprivate': isprivate,
    }

    InstaLogger.logger().info("alias name: " + information['alias'])
    InstaLogger.logger().info("bio: " + information['bio'])
    InstaLogger.logger().info("url: " + information['bio_url'])
    InstaLogger.logger().info("Posts: " + str(information['num_of_posts']))
    InstaLogger.logger().info("Follower: " +
                              str(information['followers']['count']))
    InstaLogger.logger().info("Following: " + str(information['following']))
    InstaLogger.logger().info("isPrivate: " + str(information['isprivate']))
    return information
Example #12
def get_user_info(browser):
    """Get the basic user info from the profile screen"""
    num_of_posts = 0
    followers = 0
    following = 0
    prof_img = ""
    bio = ""
    bio_url = ""
    alias_name = ""
    container = browser.find_element_by_class_name('v9tJq')
    isprivate = False

    try:
        infos = container.find_elements_by_class_name('Y8-fY')
        num_of_posts = extract_exact_info(infos[0])
        followers = extract_exact_info(infos[1])
        following = extract_exact_info(infos[2])
    except:
        InstaLogger.logger().error("Infos (Follower, Abo, Posts) is empty")
        infos = ""

    try:
        alias_name = container.find_element_by_class_name('-vDIg').find_element_by_tag_name('h1').text
    except:
        InstaLogger.logger().info("alias is empty")

    try:
        bio = container.find_element_by_class_name('-vDIg') \
            .find_element_by_tag_name('span').text
    except:
        InstaLogger.logger().info("Bio is empty")

    try:
        bio_url = container.find_element_by_class_name('yLUwa').text
    except:
        InstaLogger.logger().info("Bio Url is empty")

    try:
        img_container = browser.find_element_by_class_name('RR-M-')
        prof_img = img_container.find_element_by_tag_name('img').get_attribute('src')
    except:
        InstaLogger.logger().info("image is empty")

    try:
        if container.find_element_by_class_name('Nd_Rl'):
            isprivate = True
    except:
        isprivate = False

    InstaLogger.logger().info("alias name: " + alias_name)
    InstaLogger.logger().info("bio: " + bio)
    InstaLogger.logger().info("url: " + bio_url)
    InstaLogger.logger().info("Posts: " + str(num_of_posts))
    InstaLogger.logger().info("Follower: " + str(followers))
    InstaLogger.logger().info("Following: " + str(following))
    InstaLogger.logger().info("isPrivate: " + str(isprivate))
    return alias_name, bio, prof_img, num_of_posts, followers, following, bio_url, isprivate
Example #13
    def extract_location(self):
        # Get location details
        location_url = ''
        location_name = ''
        location_id = 0
        lat = ''
        lng = ''

        try:
            # Location url and name
            location_div = self.post.find_element_by_class_name(
                'M30cS').find_elements_by_tag_name('a')
            if location_div:
                location_url = location_div[0].get_attribute('href')
                location_name = location_div[0].text
                # Longitude and latitude
                location_id = location_url.strip(
                    'https://www.instagram.com/explore/locations/').split(
                        '/')[0]
                url = 'https://www.instagram.com/explore/locations/' + location_id + '/?__a=1'
                response = requests.get(url)
                data = response.json()
                lat = data['graphql']['location']['lat']
                lng = data['graphql']['location']['lng']
            InstaLogger.logger().info("location_id: " + str(location_id))
            InstaLogger.logger().info("location_url: " + str(location_url))
            InstaLogger.logger().info("location_name: " + str(location_name))
            InstaLogger.logger().info("lat: " + str(lat))
            InstaLogger.logger().info("lng: " + str(lng))
        except Exception as err:
            InstaLogger.logger().warning(
                "getting Location Infos (perhaps not set)")

        return location_url, location_name, location_id, lat, lng
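
The coordinates above come from Instagram's undocumented ?__a=1 location endpoint, which may change or start requiring authentication at any time; a minimal standalone sketch of that lookup with a hypothetical location id:

import requests

location_id = "213385402"  # hypothetical location id
url = 'https://www.instagram.com/explore/locations/' + location_id + '/?__a=1'
response = requests.get(url)
if response.ok:
    data = response.json()
    lat = data['graphql']['location']['lat']
    lng = data['graphql']['location']['lng']
    print(lat, lng)
else:
    print("location endpoint not available:", response.status_code)
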
Example #14
def extract_post_info(browser, postlink):
    """Get the information from the current post"""

    try:
        InstaLogger.logger().info("Scrapping Post Link: " + postlink)
        web_adress_navigator(browser, postlink)
    except PageNotFound404 as e:
        raise NoInstaPostPageFound(e)
    except NoSuchElementException as err:
        InstaLogger.logger().error("Could not get information from post: " +
                                   postlink)
        InstaLogger.logger().error(err)
        pass

    post = browser.find_element_by_class_name('ltEKP')
    date = ''
    # Get caption
    caption = ''
    username = ''
    try:
        username = post.find_element_by_class_name('e1e1d').text
    except:
        InstaLogger.logger().error("ERROR - getting Post infos (username) ")

    # Get location details
    location_url = ''
    location_name = ''
    location_id = 0
    lat = ''
    lng = ''

    try:
        # Location url and name
        location_div = post.find_element_by_class_name(
            'M30cS').find_elements_by_tag_name('a')
        if location_div:
            location_url = location_div[0].get_attribute('href')
            location_name = location_div[0].text
            # Longitude and latitude
            location_id = location_url.strip(
                'https://www.instagram.com/explore/locations/').split('/')[0]
            url = 'https://www.instagram.com/explore/locations/' + location_id + '/?__a=1'
            response = requests.get(url)
            data = response.json()
            lat = data['graphql']['location']['lat']
            lng = data['graphql']['location']['lng']
        InstaLogger.logger().info("location_id:" + location_id)
        InstaLogger.logger().info("location_url:" + location_url)
        InstaLogger.logger().info("location_name:" + location_name)
        InstaLogger.logger().info("lat:" + lat)
        InstaLogger.logger().info("lng:" + lng)
    except:
        InstaLogger.logger().error("getting Location Infos  (perhaps not set)")

    try:
        date = post.find_element_by_xpath('//a/time').get_attribute("datetime")
        InstaLogger.logger().info("date:" + str(date))
    except:
        InstaLogger.logger().error("ERROR - getting Post Date ")

    imgs = post.find_elements_by_tag_name('img')
    img = ''

    if len(imgs) >= 2:
        img = imgs[1].get_attribute('src')
    else:
        img = imgs[0].get_attribute('src')

    likes = 0

    if len(post.find_elements_by_tag_name('section')) > 2:
        likes = post.find_elements_by_tag_name('section')[1] \
            .find_element_by_tag_name('div').text

        likes = likes.split(' ')

        # count the names if there is no number displayed
        if len(likes) > 2:
            likes = len([
                word for word in likes if word not in ['and', 'like', 'this']
            ])
        else:
            likes = likes[0]
            likes = likes.replace(',', '').replace('.', '')
            likes = likes.replace('k', '00')
        InstaLogger.logger().info("post-likes:" + likes)

    user_comments = []
    user_commented_list = []
    mentions = []
    tags = []
    caption = ''
    commentscount = 0

    try:
        user_comments, user_commented_list, commentscount = extract_post_comments(
            browser, post)
    except:
        InstaLogger.logger().error("trying to get comments (function)")

    try:
        caption, tags = extract_post_caption(user_comments, username)
        # delete the first comment because it is the caption posted by the user
        if len(caption) > 0:
            user_comments.pop(0)
    except:
        InstaLogger.logger().error("trying to get caption/tags (function)")

    try:
        mentions = extract_post_mentions(browser, post)
    except:
        InstaLogger.logger().error("trying to get mentions (function)")

    return caption, location_url, location_name, location_id, lat, lng, img, tags, int(
        likes
    ), commentscount, date, user_commented_list, user_comments, mentions
def extract_user_posts(browser, ig_user, num_of_posts_to_do):
    links2, preview_imgs = get_num_posts(browser, ig_user, num_of_posts_to_do)

    post_infos = []

    counter = 1
    # user_commented_total_list collects the usernames of everyone who commented on any of this user's posts
    user_commented_total_list = []

    with open(ig_user.username + "/" + ig_user.username + "_posts.txt", "r") as f:
        link_temp = [item.strip() for item in f.readlines()]

    for postlink in links2:
        
        print(postlink)
        if([item.strip() for item in link_temp if postlink in item][0].startswith("#")):
            print("Post: ", postlink.split("/")[-2], " already fetched.")
            continue


        InstaLogger.logger().info(f"\n {counter} / {len(links2)}")
        counter = counter + 1

        try:
            instagram_post = InstagramPost(browser, postlink)
            instagram_post.extract_post_info()

            location = {
                'location_url': instagram_post.location_url,
                'location_name': instagram_post.location_name,
                'location_id': instagram_post.location_id,
                'latitude': instagram_post.lat,
                'longitude': instagram_post.lng,
            }

            post_infos.append({
                'caption': instagram_post.caption, #instagram_post.get('caption', 'None'),
                'location': location,
                'imgs': instagram_post.imgs,
                'imgdesc': instagram_post.imgdesc,
                'preview_img': preview_imgs.get(instagram_post.postlink, None),
                'date': instagram_post.date,
                'tags': instagram_post.tags,
                'likes': {
                    'count': instagram_post.likes,
                    'list': instagram_post.user_liked_list
                },
                'views': instagram_post.views,
                'url': instagram_post.postlink,
                'comments': {
                    'count': instagram_post.commentscount,
                    'list': instagram_post.user_comments
                },
                'mentions': instagram_post.mentions
            })
            user_commented_total_list = user_commented_total_list + instagram_post.user_commented_list
        except NoSuchElementException as err:
            InstaLogger.logger().error("Could not get information from post: " + instagram_post.postlink)
            InstaLogger.logger().error(err)

        #RISHABH
        print("Post: ", postlink.split("/")[-2], " fetching complete.")
        
        # save # in the file
        link_temp[link_temp.index(postlink)] = "# " + postlink
        with open(ig_user.username + "/" + ig_user.username + "_posts.txt", "w") as f:
            f.write("\n".join(link_temp))

    return post_infos, user_commented_total_list
def get_num_posts(browser, ig_user, num_of_posts_to_do):
    """Get all posts from user"""
    links = []
    links2 = []
    preview_imgs = {}

    #Rishabh
    removed_hurdles = False
    #check for existing posts list
    import os.path
    from pathlib import Path
    Path(ig_user.username).mkdir(parents=True, exist_ok=True)
    if (os.path.isfile(ig_user.username + "/" + ig_user.username + "_posts.txt")):
        print("Posts list exist, not fetching again.")
        with open(ig_user.username + "/" + ig_user.username + "_posts.txt", "r") as f:
            links2 = f.readlines()
        links2 = [l.strip() for l in links2]
        return links2, preview_imgs

    print("Fetching posts list...")
    # list links contains 30 links from the current view, as that is the maximum Instagram is showing at one time
    # list links2 contains all the links collected so far
    # preview_imgs dictionary maps link in links2 to link's post's preview image src
    try:
        body_elem = browser.find_element_by_tag_name('body')

        # load_button = body_elem.find_element_by_xpath\
        #  ('//a[contains(@class, "_1cr2e _epyes")]')
        # body_elem.send_keys(Keys.END)
        # sleep(3)
        try:
            more_posts_btn = body_elem.find_element_by_xpath("//*[contains(text(), 'Show More Posts')]")
            browser.execute_script("arguments[0].click();", more_posts_btn)
            print("clicked to get more posts")
        except Exception as e:
            print("no more posts")

        previouslen = 0
        breaking = 0

        InstaLogger.logger().info(f"number of posts to do: {num_of_posts_to_do}")
        num_of_posts_to_scroll = 12 * math.ceil(num_of_posts_to_do / 12)
        InstaLogger.logger().info(
            f"Getting first {num_of_posts_to_scroll} posts but checking {num_of_posts_to_do} posts only, if you want to change this limit, change limit_amount value in crawl_profile.py\n")
        while (len(links2) < num_of_posts_to_do):

            prev_divs = browser.find_elements_by_tag_name('main')
            links_elems = [div.find_elements_by_tag_name('a') for div in prev_divs]
            links = sum([[link_elem.get_attribute('href')
                          for link_elem in elems] for elems in links_elems], [])

            for elems in links_elems:
                for link_elem in elems:

                    href = link_elem.get_attribute('href')
                    if "/p/" in href:
                        img = link_elem.find_element_by_tag_name('img')
                        src = img.get_attribute('src')
                        preview_imgs[href] = src

            for link in links:
                if "/p/" in link:
                    if (len(links2) < num_of_posts_to_do):
                        links2.append(link)

            links2 = list(set(links2))
            InstaLogger.logger().info(f"Scrolling profile {len(links2)} / {num_of_posts_to_scroll}")
            body_elem.send_keys(Keys.END)
            sleep(Settings.sleep_time_between_post_scroll)

            ## remove the part below to never break the scrolling script before reaching num_of_posts
            if (len(links2) == previouslen):
                breaking += 1
                InstaLogger.logger().info(
                    f"breaking in {4 - breaking}...\nIf you believe this is only caused by slow internet, increase sleep time 'sleep_time_between_post_scroll' in settings.py")
                # remove_log_in() - RISHABH
                if(not removed_hurdles):
                    sleep(Settings.sleep_time_between_post_scroll)
                    print("removing hurdles...")
                    pop_up = body_elem.find_element_by_class_name('RnEpo')
                    # print(pop_up)
                    browser.execute_script("""
                    var element = arguments[0];
                    element.parentNode.removeChild(element);
                    """, pop_up)
                    # print(pop_up)
                    # set body style overflow to none
                    browser.execute_script("arguments[0].style = 'overflow:none'", body_elem)
                    removed_hurdles = True
                    sleep(Settings.sleep_time_between_post_scroll)

            else:
                breaking = 0
            if breaking > 3:
                InstaLogger.logger().info("Not getting any more posts, ending scrolling")
                sleep(2)
                break
            previouslen = len(links2)
            ##

    except NoSuchElementException as err:
        InstaLogger.logger().error('Something went terribly wrong')
        InstaLogger.logger().error(err)

    #save to list
    print("Saving posts list to file " + ig_user.username + "/" + ig_user.username + "_posts.txt")
    with open(ig_user.username + "/" + ig_user.username + "_posts.txt", "w+") as f:
        f.write("\n".join(links2))
    return links2, preview_imgs
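
The <username>_posts.txt file written above doubles as a resume marker: extract_user_posts prefixes a line with "# " once that post has been fetched and skips such lines on the next run. A simplified, hypothetical standalone sketch of that bookkeeping:

# hypothetical, simplified version of the resume logic used above
links = [
    "# https://www.instagram.com/p/AAA/",  # already fetched on a previous run
    "https://www.instagram.com/p/BBB/",
]

for i, line in enumerate(links):
    if line.startswith("#"):
        continue              # skip posts already marked as done
    # ... scrape the post here ...
    links[i] = "# " + line    # mark it as fetched

print(links)  # every entry now starts with "# "
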
def extract_post_comments(browser, post):
    # if there are more than 22 comment elements, use the second one to see
    # how many comments there are, else count the li's

    # first element is the text, second either the first comment
    # or the button to display all the comments

    comments = []
    user_commented_list = []
    user_comments = []
    try:
        if post.find_elements_by_tag_name('ul'):
            comment_list = post.find_element_by_tag_name('ul')
            comments = comment_list.find_elements_by_tag_name('li')

            if len(comments) > 1:
                # load hidden comments
                tried_catch_comments = 0
                while (comments[1].text.lower() == 'load more comments'
                       or comments[1].text.lower().startswith('view all')):
                    try:
                        if comments[1].find_element_by_tag_name('button'):
                            print("click button for loading more")
                            comments[1].find_element_by_tag_name(
                                'button').click()
                        elif comments[1].find_element_by_tag_name('a'):
                            print("click a for loading more")
                            comments[1].find_element_by_tag_name('a').click()
                        sleep(Settings.sleep_time_between_comment_loading)
                    except:
                        print("error on clicking - next try (tried: " +
                              str(tried_catch_comments) + ") comments:" +
                              str(len(comments)) + ")")
                        tried_catch_comments = tried_catch_comments + 1
                        if tried_catch_comments > 10:
                            print("exit getting comments")
                            break
                        sleep(Settings.sleep_time_between_comment_loading)

                    comment_list = post.find_element_by_tag_name('ul')
                    comments = comment_list.find_elements_by_tag_name('li')
                # adding who commented into user_commented_list
                InstaLogger.logger().info("found comments: " +
                                          str(len(comments)))
            else:
                print("found comment: 1")

            for comm in comments:
                try:
                    user_commented = comm.find_element_by_tag_name(
                        'a').get_attribute("href").split('/')
                    user_commented_list.append(user_commented[3])
                except:
                    InstaLogger.logger().error(
                        "ERROR something went wrong getting user_commented")
                # the first comment always has to be loaded to get the caption and tags from the post
                if (Settings.output_comments is True
                        or len(user_comments) < 1):
                    user_comment = {}
                    try:
                        user_comment = {
                            'user': user_commented[3],
                            'comment':
                            comm.find_element_by_tag_name('span').text
                        }
                        InstaLogger.logger().info(
                            user_commented[3] + " -- " +
                            comm.find_element_by_tag_name('span').text)
                        user_comments.append(user_comment)
                    except:
                        InstaLogger.logger().error(
                            "ERROR something went wrong getting comment")

        InstaLogger.logger().info(str(len(user_commented_list)) + " comments.")
    except BaseException as e:
        InstaLogger.logger().error("Error - getting comments")
        InstaLogger.logger().error(e)

    return user_comments, user_commented_list, int(len(comments) - 1)
Example #18
    def load_more_comments(self, comments):
        tried_catch_comments = 0
        comments_found_last_run = len(comments)
        comments_run_same_length = 0

        while (comments[1].text.lower() == 'load more comments'
               or comments[1].text.lower().startswith('view all')):
            try:
                if comments[1].find_element_by_tag_name('button'):
                    InstaLogger.logger().info(
                        "clicking button for loading more comments")
                    self.browser.execute_script(
                        "arguments[0].click();",
                        comments[1].find_element_by_tag_name('button'))
                elif comments[1].find_element_by_tag_name('a'):
                    InstaLogger.logger().info("clicking a for loading more")
                    self.browser.execute_script(
                        "arguments[0].click();",
                        comments[1].find_element_by_tag_name('a'))

                sleep(Settings.sleep_time_between_comment_loading)

                comment_list = self.post.find_element_by_tag_name('ul')
                comments = comment_list.find_elements_by_tag_name('li')
                InstaLogger.logger().info(
                    f"comments (loaded: {len(comments)} /lastrun: {comments_found_last_run})"
                )

                if (comments_found_last_run == len(comments)):
                    comments_run_same_length = comments_run_same_length + 1
                    if comments_run_same_length > 10:
                        InstaLogger.logger().error(
                            f"exit getting comments: {comments_run_same_length} x same length of comments, perhaps endless loop"
                        )
                        break
                else:
                    comments_run_same_length = 0

                comments_found_last_run = len(comments)
            except:
                InstaLogger.logger().error(
                    f"error clicking - next try (tried: {tried_catch_comments}) comments: {len(comments)}"
                )
                tried_catch_comments = tried_catch_comments + 1
                if tried_catch_comments > 10:
                    InstaLogger.logger().error(
                        f"exit getting comments, {tried_catch_comments}x tried to get comments"
                    )
                    break
                sleep(Settings.sleep_time_between_comment_loading)

        return comments
Example #19
def extract_information(browser, username, limit_amount):
    """Get all the information for the given username"""
    InstaLogger.logger().info('Extracting information from ' + username)
    isprivate = False
    try:
        user_link = "https://www.instagram.com/{}/".format(username)
        web_adress_navigator(browser, user_link)
    except PageNotFound404 as e:
        raise NoInstaProfilePageFound(e)

    num_of_posts_to_do = 999999
    alias_name = ''
    bio = ''
    prof_img = ''
    num_of_posts = 0
    followers = 0
    following = 0
    bio_url = ''
    try:
        alias_name, bio, prof_img, num_of_posts, followers, following, bio_url, isprivate = get_user_info(browser)
        if limit_amount < 1:
            limit_amount = 999999
        num_of_posts_to_do = min(limit_amount, num_of_posts)
    except Exception as err:
        InstaLogger.logger().error("Couldn't get user profile. - Terminating")
        quit()
    prev_divs = browser.find_elements_by_class_name('_70iju')

    post_infos = []
    user_commented_total_list = []
    if Settings.scrap_posts_infos is True and isprivate is False:
        try:
            post_infos, user_commented_total_list = extract_user_posts(browser, num_of_posts_to_do)
        except:
            InstaLogger.logger().error("Couldn't get user posts.")

    information = {
        'alias': alias_name,
        'username': username,
        'bio': bio,
        'prof_img': prof_img,
        'num_of_posts': num_of_posts,
        'followers': followers,
        'following': following,
        'bio_url': bio_url,
        'isprivate': isprivate,
        'scrapped': datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        'posts': post_infos,
    }

    InstaLogger.logger().info("User " + username + " has " + str(len(user_commented_total_list)) + " comments.")

    # sorts the list by frequencies, so users who comment the most are at the top
    import collections
    from operator import itemgetter, attrgetter
    counter = collections.Counter(user_commented_total_list)
    com = sorted(counter.most_common(), key=itemgetter(1, 0), reverse=True)
    com = map(lambda x: [x[0]] * x[1], com)
    user_commented_total_list = [item for sublist in com for item in sublist]

    # remove duplicates preserving order (that's why not using set())
    user_commented_list = []
    last = ''
    for i in range(len(user_commented_total_list)):
        if username.lower() != user_commented_total_list[i]:
            if last != user_commented_total_list[i]:
                user_commented_list.append(user_commented_total_list[i])
            last = user_commented_total_list[i]

    return information, user_commented_list
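
The commenter post-processing above (frequency sort followed by order-preserving de-duplication) is easier to follow on a toy list; this sketch repeats the same steps with made-up usernames:

import collections
from operator import itemgetter

user_commented_total_list = ['bob', 'alice', 'bob', 'carol', 'bob', 'alice']
username = 'carol'  # the profile owner, whose own comments are dropped

# sort by frequency (username as tie-breaker), then re-expand to a flat list
counter = collections.Counter(user_commented_total_list)
com = sorted(counter.most_common(), key=itemgetter(1, 0), reverse=True)
com = map(lambda x: [x[0]] * x[1], com)
flat = [item for sublist in com for item in sublist]
# flat == ['bob', 'bob', 'bob', 'alice', 'alice', 'carol']

# de-duplicate while preserving order and skip the profile owner
deduped, last = [], ''
for name in flat:
    if username.lower() != name:
        if last != name:
            deduped.append(name)
        last = name

print(deduped)  # ['bob', 'alice']
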
Example #20
    def extract_likes_views(self, img_tags):
        likes = 0
        views = 0

        try:
            # if len(self.post.find_elements_by_xpath('//article/div/section')) > 2:
            # image or video post?
            if len(img_tags) >= 1:
                likes = self.post.find_element_by_xpath(
                    '//article/div[2]/section[2]/div/div/button/span').text
            else:
                try:
                    views = int(
                        self.post.find_element_by_xpath(
                            '//article/div[2]/section[2]/div/span/span').text.
                        replace(",", ""))
                    InstaLogger.logger().info("video views: " + str(views))
                except:
                    InstaLogger.logger().error("ERROR - Getting Video Views")
                # click the view count to get the likes popup
                viewcount_click = self.post.find_element_by_xpath(
                    '//article/div[2]/section[2]/div/span')
                self.browser.execute_script("arguments[0].click();",
                                            viewcount_click)
                likes = self.post.find_element_by_xpath(
                    '//article/div[2]/section[2]/div/div/div[4]/span').text

            likes = likes.replace(',', '').replace('.', '')
            likes = likes.replace('k', '00')
            InstaLogger.logger().info("post likes: " + likes)
        except Exception as err:
            InstaLogger.logger().error("ERROR - Getting Post Likes")
            InstaLogger.logger().error(err)
        # if likes is not known, it would cause errors to convert empty string to int

        try:
            likes = int(likes)
        except Exception as err:
            InstaLogger.logger().error(
                "ERROR - Extracting number of likes failed. Saving likes as -1"
            )
            InstaLogger.logger().error(err)
            likes = -1

        return likes, views
Example #21
def extract_user_posts(browser, num_of_posts_to_do):
    """Get all posts from user"""
    links = []
    links2 = []
    preview_imgs = {}

    # list links contains 30 links from the current view, as that is the maximum Instagram is showing at one time
    # list links2 contains all the links collected so far
    # preview_imgs dictionary maps link in links2 to link's post's preview image src
    try:
        body_elem = browser.find_element_by_tag_name('body')

        # load_button = body_elem.find_element_by_xpath\
        #  ('//a[contains(@class, "_1cr2e _epyes")]')
        # body_elem.send_keys(Keys.END)
        # sleep(3)

        previouslen = 0
        breaking = 0

        print("number of posts to do: ", num_of_posts_to_do)
        num_of_posts_to_scroll = 12 * math.ceil(num_of_posts_to_do / 12)
        print(
            "Getting first", num_of_posts_to_scroll, "posts but checking ",
            num_of_posts_to_do,
            " posts only, if you want to change this limit, change limit_amount value in crawl_profile.py\n"
        )
        while (len(links2) < num_of_posts_to_do):

            prev_divs = browser.find_elements_by_tag_name('main')
            links_elems = [
                div.find_elements_by_tag_name('a') for div in prev_divs
            ]
            links = sum(
                [[link_elem.get_attribute('href') for link_elem in elems]
                 for elems in links_elems], [])

            for elems in links_elems:
                for link_elem in elems:

                    href = link_elem.get_attribute('href')
                    try:
                        if "/p/" in href:
                            try:
                                img = link_elem.find_element_by_tag_name('img')
                                src = img.get_attribute('src')
                                preview_imgs[href] = src
                            except NoSuchElementException:
                                print("img exception 132")
                                continue
                    except Exception as err:
                        print(err)

            for link in links:
                if "/p/" in link:
                    if (len(links2) < num_of_posts_to_do):
                        links2.append(link)
            links2 = list(set(links2))
            print("Scrolling profile ", len(links2), "/",
                  num_of_posts_to_scroll)
            body_elem.send_keys(Keys.END)
            sleep(Settings.sleep_time_between_post_scroll)

            ## remove the part below to never break the scrolling script before reaching num_of_posts
            if (len(links2) == previouslen):
                breaking += 1
                print(
                    "breaking in ", 4 - breaking,
                    "...\nIf you believe this is only caused by slow internet, increase sleep time 'sleep_time_between_post_scroll' in settings.py"
                )
            else:
                breaking = 0
            if breaking > 3:
                print("Not getting any more posts, ending scrolling")
                sleep(2)
                break
            previouslen = len(links2)
            ##

    except NoSuchElementException as err:
        InstaLogger.logger().error('Something went terribly wrong')
        InstaLogger.logger().error(err)

    post_infos = []

    counter = 1
    # user_commented_total_list collects the usernames of everyone who commented on any of this user's posts
    user_commented_total_list = []

    for postlink in links2:

        print("\n", counter, "/", len(links2))
        counter = counter + 1

        try:
            caption, location_url, location_name, location_id, lat, lng, imgs, imgdesc, tags, likes, commentscount, date, user_commented_list, user_comments, mentions, user_liked_post, views = extract_post_info(
                browser, postlink)

            location = {
                'location_url': location_url,
                'location_name': location_name,
                'location_id': location_id,
                'latitude': lat,
                'longitude': lng,
            }

            post_infos.append({
                'caption': caption,
                'location': location,
                'imgs': imgs,
                'imgdesc': imgdesc,
                'preview_img': preview_imgs.get(postlink, None),
                'date': date,
                'tags': tags,
                'likes': {
                    'count': likes,
                    'list': user_liked_post
                },
                'views': views,
                'url': postlink,
                'comments': {
                    'count': commentscount,
                    'list': user_comments
                },
                'mentions': mentions
            })
            user_commented_total_list = user_commented_total_list + user_commented_list
        except NoSuchElementException as err:
            InstaLogger.logger().error(
                "Could not get information from post: " + postlink)
            InstaLogger.logger().error(err)
        except:
            InstaLogger.logger().error(
                "Could not get information from post: " + postlink)
    return post_infos, user_commented_total_list
Example #22
def extract_post_info(browser, postlink):
    """Get the information from the current post"""

    try:
        InstaLogger.logger().info("Scraping Post Link: " + postlink)
        web_adress_navigator(browser, postlink)
    except PageNotFound404 as e:
        raise NoInstaPostPageFound(e)
    except NoSuchElementException as err:
        InstaLogger.logger().error("Could not get information from post: " +
                                   postlink)
        InstaLogger.logger().error(err)
        pass

    post = browser.find_element_by_class_name('ltEKP')
    date = ''
    # Get caption
    caption = ''
    username = ''
    try:
        username = post.find_element_by_class_name(
            'e1e1d').find_element_by_tag_name('a').text
    except:
        InstaLogger.logger().error("ERROR - getting Post infos (username) ")

    # Get location details
    location_url = ''
    location_name = ''
    location_id = 0
    lat = ''
    lng = ''

    img_tags = []
    imgs = []
    imgdesc = []
    views = 0

    try:
        # Location url and name
        location_div = post.find_element_by_class_name(
            'M30cS').find_elements_by_tag_name('a')
        if location_div:
            location_url = location_div[0].get_attribute('href')
            location_name = location_div[0].text
            print(location_name)
            # Longitude and latitude
            # str.strip() removes characters, not a prefix, so split on the locations path instead
            location_id = location_url.split(
                '/explore/locations/')[1].split('/')[0]
            print("location id", location_id)
            url = 'https://www.instagram.com/explore/locations/' + str(
                location_id) + '/?__a=1'
            print(len(location_id))
            response = requests.get(url)
            data = response.json()
            if response:
                print("got data")
            lat = data['graphql']['location']['lat']
            print("latitude", lat)
            lng = data['graphql']['location']['lng']
            print("longitude", lng)
        InstaLogger.logger().info("location_id: " + str(location_id))
        InstaLogger.logger().info("location_url: " + str(location_url))
        InstaLogger.logger().info("location_name: " + str(location_name))
        InstaLogger.logger().info("lat: " + str(lat))
        InstaLogger.logger().info("lng: " + str(lng))
    except Exception as err:
        InstaLogger.logger().warning(
            "Could not get location info (the post may have no location set)")
    try:
        date = post.find_element_by_xpath('//a/time').get_attribute("datetime")
        InstaLogger.logger().info("Post date: " + str(date))
    except:
        InstaLogger.logger().error("ERROR - getting Post Date ")

    try:
        img_tags = post.find_elements_by_class_name('FFVAD')
        InstaLogger.logger().info("number of images: " + str(len(img_tags)))
        for i in img_tags:
            imgs.append(i.get_attribute('src'))
            imgdesc.append(i.get_attribute('alt'))
            InstaLogger.logger().info("post image: " + imgs[-1])
            InstaLogger.logger().info("alt text: " + imgdesc[-1])
    except Exception as err:
        InstaLogger.logger().error("ERROR - Post Image")
        InstaLogger.logger().error(str(err))

    likes = 0

    try:
        # if len(post.find_elements_by_xpath('//article/div/section')) > 2:
        # image or video post?
        if len(img_tags) >= 1:
            likes = post.find_element_by_xpath(
                '//article/div[2]/section[2]/div/div[2]/button/span').text
        else:
            try:
                views = int(
                    post.find_element_by_xpath(
                        '//article/div[2]/section[2]/div/span/span').text.
                    replace(",", ""))
                InstaLogger.logger().info("video views: " + str(views))
            except:
                InstaLogger.logger().error("ERROR - Getting Video Views")
            # click the view count to get the likes popup
            viewcount_click = post.find_element_by_xpath(
                '//article/div[2]/section[2]/div/span')
            browser.execute_script("arguments[0].click();", viewcount_click)
            likes = post.find_element_by_xpath(
                '//article/div[2]/section[2]/div/div/div[4]/span').text

        # normalise counts such as "1,234" or "1.2k" to a plain integer string
        likes = likes.replace(',', '')
        if likes.lower().endswith('k'):
            likes = str(int(float(likes[:-1]) * 1000))
        else:
            likes = likes.replace('.', '')
        InstaLogger.logger().info("post likes: " + likes)
    except Exception as err:
        print(err)
        InstaLogger.logger().error("ERROR - Getting Post Likes")
        InstaLogger.logger().error(err)
    # if likes is unknown, converting an empty string to int would raise an error, so fall back to -1

    try:
        likes = int(likes)
    except Exception as err:
        InstaLogger.logger().error(
            "ERROR - Extracting number of likes failed. Saving likes as -1")
        InstaLogger.logger().error(err)
        likes = -1

    user_comments = []
    user_commented_list = []
    user_liked_list = []
    mentions = []
    tags = []
    caption = ''
    commentscount = 0

    try:
        user_comments, user_commented_list, commentscount = extract_post_comments(
            browser, post)
    except Exception:
        InstaLogger.logger().error(
            "ERROR - getting Post comments failed")

    try:
        caption, tags = extract_post_caption(user_comments, username)
        # delete the first comment because it is the caption the user posted
        if len(caption) > 0:
            user_comments.pop(0)
    except:
        InstaLogger.logger().error(
            "ERROR - getting Post caption/tags function")

    try:
        mentions = extract_post_mentions(browser, post)
    except:
        InstaLogger.logger().error("ERROR - getting Post Mentions function")

    try:
        user_liked_list = extract_post_likers(browser, post, postlink, likes)
    except:
        InstaLogger.logger().error("ERROR - getting Post Likers function")

    return (caption, location_url, location_name, location_id, lat, lng, imgs,
            imgdesc, tags, likes, commentscount, date, user_commented_list,
            user_comments, mentions, user_liked_list, views)
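The location lookup in extract_post_info relies on Instagram's legacy ?__a=1 JSON endpoint; a hedged standalone sketch of that step is shown below, noting that the endpoint and its response layout have changed over time and may require an authenticated session.

import requests

def fetch_location_coordinates(location_id):
    # Assumes the legacy ?__a=1 endpoint still serves the graphql/location
    # structure that extract_post_info reads above.
    url = f"https://www.instagram.com/explore/locations/{location_id}/?__a=1"
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    location = response.json()['graphql']['location']
    return location['lat'], location['lng']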
Example No. 23
def get_num_posts(browser, num_of_posts_to_do):
    """Get all posts from user"""
    links = []
    links2 = []
    preview_imgs = {}

    # list links contains 30 links from the current view, as that is the maximum Instagram is showing at one time
    # list links2 contains all the links collected so far
    # preview_imgs dictionary maps link in links2 to link's post's preview image src
    try:
        body_elem = browser.find_element_by_tag_name('body')

        # load_button = body_elem.find_element_by_xpath\
        #  ('//a[contains(@class, "_1cr2e _epyes")]')
        # body_elem.send_keys(Keys.END)
        # sleep(3)

        previouslen = 0
        breaking = 0

        InstaLogger.logger().info(
            f"number of posts to do: {num_of_posts_to_do}")
        num_of_posts_to_scroll = 12 * math.ceil(num_of_posts_to_do / 12)
        InstaLogger.logger().info(
            f"Getting first {num_of_posts_to_scroll} posts but checking {num_of_posts_to_do} posts only, if you want to change this limit, change limit_amount value in crawl_profile.py\n"
        )
        while (len(links2) < num_of_posts_to_do):

            prev_divs = browser.find_elements_by_tag_name('main')
            links_elems = [
                div.find_elements_by_tag_name('a') for div in prev_divs
            ]
            links = sum(
                [[link_elem.get_attribute('href') for link_elem in elems]
                 for elems in links_elems], [])

            for elems in links_elems:
                for link_elem in elems:

                    href = link_elem.get_attribute('href')
                    if "/p/" in href:
                        img = link_elem.find_element_by_tag_name('img')
                        src = img.get_attribute('src')
                        preview_imgs[href] = src

            for link in links:
                if "/p/" in link:
                    if (len(links2) < num_of_posts_to_do):
                        links2.append(link)

            links2 = list(set(links2))
            InstaLogger.logger().info(
                f"Scrolling profile {len(links2)} / {num_of_posts_to_scroll}")
            body_elem.send_keys(Keys.END)
            sleep(Settings.sleep_time_between_post_scroll)

            ## remove the part below to never break the scrolling loop before reaching num_of_posts
            if (len(links2) == previouslen):
                breaking += 1
                InstaLogger.logger().info(
                    f"breaking in {4 - breaking}...\nIf you believe this is only caused by slow internet, increase sleep time 'sleep_time_between_post_scroll' in settings.py"
                )
            else:
                breaking = 0
            if breaking > 3:
                InstaLogger.logger().info(
                    "Not getting any more posts, ending scrolling")
                sleep(2)
                break
            previouslen = len(links2)
            ##

    except NoSuchElementException as err:
        InstaLogger.logger().error("Could not find expected elements while scrolling the profile")
        InstaLogger.logger().error(err)

    return links2, preview_imgs
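A short usage sketch for get_num_posts, assuming a logged-in Selenium browser that is already on the target profile page; it only illustrates how the returned link list and preview-image map fit together.

links, previews = get_num_posts(browser, num_of_posts_to_do=36)
for link in links:
    # the preview src can be missing if the thumbnail never loaded
    print(link, previews.get(link, 'no preview'))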
Example No. 24
def extract_post_comments(browser, post):
    # if there are more than 22 comment elements, use the second one to see
    # how many comments there are, else count the li's

    # the first element is the caption text, the second is either the first
    # comment or the button that displays all the comments

    # sometimes loading comments ends in an endless loop,
    # therefore limit the number of runs
    comments_found_last_run = 0
    comments_run_same_length = 0
    comments = []
    user_commented_list = []
    user_comments = []
    try:
        if post.find_elements_by_tag_name('ul'):
            comment_list = post.find_element_by_tag_name('ul')
            comments = comment_list.find_elements_by_tag_name('li')

            if len(comments) > 1:
                # load hidden comments
                tried_catch_comments = 0
                while (comments[1].text.lower() == 'load more comments'
                       or comments[1].text.lower().startswith('view all')):
                    try:
                        if comments[1].find_element_by_tag_name('button'):
                            print("clicking button for loading more comments")
                            browser.execute_script(
                                "arguments[0].click();",
                                comments[1].find_element_by_tag_name('button'))
                        elif comments[1].find_element_by_tag_name('a'):
                            print("clicking a for loading more")
                            browser.execute_script(
                                "arguments[0].click();",
                                comments[1].find_element_by_tag_name('a'))
                        sleep(Settings.sleep_time_between_comment_loading)

                        comment_list = post.find_element_by_tag_name('ul')
                        comments = comment_list.find_elements_by_tag_name('li')
                        print("comments (loaded: " + str(len(comments)) +
                              "/lastrun: " + str(comments_found_last_run) +
                              ")")

                        if (comments_found_last_run == len(comments)):
                            comments_run_same_length = comments_run_same_length + 1
                            if comments_run_same_length > 10:
                                InstaLogger.logger().error(
                                    "exit getting comments: " +
                                    str(comments_run_same_length) +
                                    "x same length of comments, perhaps endless loop"
                                )
                                break
                        else:
                            comments_run_same_length = 0

                        comments_found_last_run = len(comments)
                    except:
                        InstaLogger.logger().error(
                            "error clicking - next try (tried: " +
                            str(tried_catch_comments) + ", comments: " +
                            str(len(comments)) + ")")
                        tried_catch_comments = tried_catch_comments + 1
                        if tried_catch_comments > 10:
                            InstaLogger.logger().error(
                                "exit getting comments, " +
                                str(tried_catch_comments) +
                                "x tried to get comments")
                            break
                        sleep(Settings.sleep_time_between_comment_loading)

                InstaLogger.logger().info("found comments: " +
                                          str(len(comments)))
            else:
                print("found comment: 1")

            # adding who commented into user_commented_list
            for comm in comments:
                try:
                    user_commented = comm.find_element_by_tag_name(
                        'a').get_attribute("href").split('/')
                    user_commented_list.append(user_commented[3])
                except:
                    InstaLogger.logger().error(
                        "ERROR something went wrong getting user_commented")
                # the first comment always has to be stored to get the caption and tags from the post
                if (Settings.output_comments is True
                        or len(user_comments) < 1):
                    user_comment = {}
                    try:
                        user_comment = {
                            'user':
                            user_commented[3],
                            'comment':
                            comm.find_element_by_css_selector(
                                'h2 + span, h3 + span').text
                        }
                        print(user_comment)
                        InstaLogger.logger().info(
                            user_commented[3] + " -- " +
                            comm.find_element_by_css_selector(
                                'h2 + span, h3 + span').text)
                        user_comments.append(user_comment)
                    except:
                        InstaLogger.logger().error(
                            "ERROR something went wrong getting comment")

        InstaLogger.logger().info(str(len(user_commented_list)) + " comments.")
    except Exception as err:
        InstaLogger.logger().error("Error - getting comments")
        InstaLogger.logger().error(err)

    # subtract one for the caption entry in the li list
    return user_comments, user_commented_list, max(len(comments) - 1, 0)
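A minimal sketch of how the tuple returned by extract_post_comments might be consumed, assuming browser and post refer to an already opened post page; the variable names are illustrative.

user_comments, commenters, comment_count = extract_post_comments(browser, post)
print(f"{comment_count} comments from {len(set(commenters))} unique users")
for entry in user_comments:
    print(entry['user'] + ': ' + entry['comment'])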
Example No. 25
    def get_user_info(self):
        """Get the basic user info from the profile screen"""

        self.isprivate = self._is_user_private()
        self.alias = self._user_alias()
        self.bio = self._user_bio()
        self.bio_url = self._user_bio_url()
        self.profile_image = self._user_profile_image()

        infos = self.container.find_elements_by_class_name('Y8-fY')
        if infos:
            self.num_of_posts = {'count': extract_exact_info(infos[0])}
            self.following = {'count': extract_exact_info(infos[2])}
            self.followers = {'count': extract_exact_info(infos[1])}

            if Settings.scrape_follower:
                if not self.isprivate:
                    self.followers['list'] = extract_followers(
                        self.browser, self.username)

        InstaLogger.logger().info("Alias name: " + self.alias)
        InstaLogger.logger().info("Bio: " + self.bio)
        InstaLogger.logger().info("Url: " + self.bio_url)
        InstaLogger.logger().info("Posts: " + str(self.num_of_posts))
        InstaLogger.logger().info("Follower: " + str(self.followers['count']))
        InstaLogger.logger().info("Following: " + str(self.following))
        InstaLogger.logger().info("Is private: " + str(self.isprivate))
Example No. 26
def extract_user_posts_links(browser, username, limit_amount):
    """Get all the information for the given username"""
    InstaLogger.logger().info('Extracting information from ' + username)

    try:
        user_link = "https://www.instagram.com/{}/".format(username)
        web_adress_navigator(browser, user_link)
    except PageNotFound404 as e:
        raise NoInstaProfilePageFound(e)

    num_of_posts_to_do = 999999

    user_info = {}

    try:
        user_info = get_user_info(browser, username)
        if limit_amount < 1:
            limit_amount = 999999
        num_of_posts_to_do = min(limit_amount, user_info['num_of_posts'])
    except Exception as err:
        InstaLogger.logger().error("Couldn't get user profile. - Terminating")
        quit()
    """Get all posts from user"""
    indexed_links = dict()
    preview_images = {}

    try:
        body_elem = browser.find_element_by_tag_name('body')

        previouslen = 0
        breaking = 0

        print("number of posts to do: ", num_of_posts_to_do)
        num_of_posts_to_scroll = 12 * math.ceil(num_of_posts_to_do / 12)
        print(
            "Getting first", num_of_posts_to_scroll, "posts but checking ",
            num_of_posts_to_do,
            " posts only, if you want to change this limit, change limit_amount value in crawl_profile.py\n"
        )
        while (len(indexed_links) < num_of_posts_to_do):

            prev_divs = browser.find_elements_by_tag_name('main')
            links_elems = [
                div.find_elements_by_tag_name('a') for div in prev_divs
            ]
            links = sum(
                [[link_elem.get_attribute('href') for link_elem in elems]
                 for elems in links_elems], [])

            for elems in links_elems:
                for link_elem in elems:

                    href = link_elem.get_attribute('href')
                    try:
                        if "/p/" in href:
                            try:
                                img = link_elem.find_element_by_tag_name('img')
                                src = img.get_attribute('src')
                                preview_images[href] = src
                            except NoSuchElementException:
                                print("img exception 132")
                                continue
                    except Exception as err:
                        print(err)

            for link in links:
                if "/p/" in link:
                    if (len(indexed_links) < num_of_posts_to_do):
                        if link not in indexed_links:
                            indexed_links[link] = len(indexed_links)
            print("Scrolling profile ", len(indexed_links), "/",
                  num_of_posts_to_scroll)
            body_elem.send_keys(Keys.END)
            sleep_time = Settings.sleep_time_between_post_scroll
            sleep(random.uniform(sleep_time - 1, sleep_time + 1))

            ## remove the part below to never break the scrolling loop before reaching num_of_posts
            if (len(indexed_links) == previouslen):
                breaking += 1
                print(
                    "breaking in ", 4 - breaking,
                    "...\nIf you believe this is only caused by slow internet, increase sleep time 'sleep_time_between_post_scroll' in settings.py"
                )
            else:
                breaking = 0
            if breaking > 3:
                print("Not getting any more posts, ending scrolling")
                sleep(2)
                break
            previouslen = len(indexed_links)
            ##

    except NoSuchElementException as err:
        InstaLogger.logger().error("Could not find expected elements while scrolling the profile")
        InstaLogger.logger().error(err)

    return user_info, indexed_links, preview_images
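A usage sketch for extract_user_posts_links, assuming a logged-in Selenium browser; the username is a placeholder. The indexed_links dictionary maps each post link to the position in which it was collected, so sorting on the stored index restores the on-profile order.

user_info, indexed_links, preview_images = extract_user_posts_links(
    browser, username='instagram', limit_amount=24)

ordered_links = sorted(indexed_links, key=indexed_links.get)
for link in ordered_links:
    print(link, preview_images.get(link, 'no preview'))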