Exemplo n.º 1
0
    def __init__(self, browser, postlink):
        """Open ``postlink`` in ``browser`` and keep a handle on the post element.

        Args:
            browser: driver object used for navigation and element lookup
                (presumably a Selenium WebDriver -- confirm against caller).
            postlink: URL of the post, as a string.

        Raises:
            NoInstaPostPageFound: if navigation raises ``PageNotFound404``.
                The original exception is attached as the cause.

        Any exception raised while looking up the element with class
        ``'ltEKP'`` is propagated unchanged.
        """
        self.browser = browser
        self.postlink = postlink

        InstaLogger.logger().info("Scraping Post Link: " + self.postlink)
        try:
            web_adress_navigator(self.browser, self.postlink)
        except PageNotFound404 as e:
            # Chain explicitly so the traceback shows the 404 as the cause.
            raise NoInstaPostPageFound(e) from e
        except NoSuchElementException as err:
            # Best effort: log and still try to locate the post element below.
            InstaLogger.logger().error(
                "Could not get information from post: " + self.postlink)
            InstaLogger.logger().error(err)

        self.post = self.browser.find_element_by_class_name('ltEKP')
def _parse_count(text):
    """Convert a displayed counter such as '1,234' or '1.2k' to an int.

    Raises ValueError when ``text`` does not contain a parsable number.
    """
    cleaned = str(text).strip().lower()
    if cleaned.endswith('k'):
        # With a 'k' suffix the separator is a decimal mark: '1.2k' -> 1200,
        # '12k' -> 12000 (appending '00' was only right for one decimal digit).
        return int(round(float(cleaned[:-1].replace(',', '.')) * 1000))
    # Without a suffix, ',' and '.' are treated as thousands separators.
    return int(cleaned.replace(',', '').replace('.', ''))


def extract_post_info(browser, postlink):
    """Get the information from the current post.

    Navigates ``browser`` to ``postlink`` and reads the post's details from
    the page. Each field is collected on a best-effort basis: a failure while
    reading one field is logged and that field keeps its default value.

    Args:
        browser: driver object used for navigation and element lookup
            (presumably a Selenium WebDriver -- confirm against caller).
        postlink: URL of the post, as a string.

    Returns:
        A tuple ``(caption, location_url, location_name, location_id, lat,
        lng, img, tags, likes, commentscount, date, user_commented_list,
        user_comments, mentions, user_liked_list)``.

        ``location_id`` is ``0`` when no location was found, otherwise the id
        string taken from the location URL. ``lat``/``lng`` are ``''`` unless
        the location lookup succeeded. ``likes`` is an int: ``0`` when the
        like counter could not be read and ``-1`` when its text could not be
        parsed.

    Raises:
        NoInstaPostPageFound: if navigation raises ``PageNotFound404``.

    Any exception raised while looking up the element with class ``'ltEKP'``
    is propagated unchanged.
    """
    logger = InstaLogger.logger()

    logger.info("Scrapping Post Link: " + postlink)
    try:
        web_adress_navigator(browser, postlink)
    except PageNotFound404 as e:
        # Chain explicitly so the traceback shows the 404 as the cause.
        raise NoInstaPostPageFound(e) from e
    except NoSuchElementException as err:
        # Best effort: log and still try to locate the post element below.
        logger.error("Could not get information from post: " + postlink)
        logger.error(err)

    post = browser.find_element_by_class_name('ltEKP')

    username = ''
    try:
        username = post.find_element_by_class_name('e1e1d').text
    except Exception:
        logger.error("ERROR - getting Post infos (username) ")

    # Get location details
    location_url = ''
    location_name = ''
    location_id = 0
    lat = ''
    lng = ''

    try:
        # Location url and name
        location_links = post.find_element_by_class_name(
            'M30cS').find_elements_by_tag_name('a')
        if location_links:
            location_url = location_links[0].get_attribute('href')
            location_name = location_links[0].text
            # str.strip() removes a *set of characters*, not a prefix, so
            # take the path segment that follows the locations marker.
            parsed_id = location_url.partition(
                '/explore/locations/')[2].split('/')[0]
            if parsed_id:
                location_id = parsed_id
                # Longitude and latitude
                url = ('https://www.instagram.com/explore/locations/' +
                       location_id + '/?__a=1')
                # A timeout keeps a stalled connection from hanging the run.
                response = requests.get(url, timeout=10)
                data = response.json()
                lat = data['graphql']['location']['lat']
                lng = data['graphql']['location']['lng']
        # str() is required: location_id may be the int 0 and lat/lng come
        # straight from JSON, so they are not guaranteed to be strings.
        logger.info("location_id: " + str(location_id))
        logger.info("location_url: " + str(location_url))
        logger.info("location_name: " + str(location_name))
        logger.info("lat: " + str(lat))
        logger.info("lng: " + str(lng))
    except Exception:
        logger.error("getting Location Infos  (perhaps not set)")

    date = ''
    try:
        # NOTE(review): an XPath starting with '//' is evaluated against the
        # whole document, not relative to ``post`` -- confirm this is intended.
        date = post.find_element_by_xpath('//a/time').get_attribute("datetime")
        logger.info("Post date: " + str(date))
    except Exception:
        logger.error("ERROR - getting Post Date ")

    img = ''
    try:
        imgs = post.find_elements_by_tag_name('img')
        # Use the second <img> when there is one, otherwise the first.
        if len(imgs) >= 2:
            img = imgs[1].get_attribute('src')
        else:
            img = imgs[0].get_attribute('src')
        logger.info("post image: " + str(img))
    except Exception:
        logger.error("ERROR - Post Image ")

    likes = 0
    likes_text = None

    try:
        likes_element = post.find_elements_by_xpath(
            '//article/div[2]/section[2]/div/div/a/span')
        if len(likes_element) > 1:
            likes_text = str(likes_element[1].text)
        else:
            likes_text = str(likes_element[0].text)
    except Exception as err:
        logger.error("ERROR - Getting Post Likes")
        logger.error(err)

    # Only parse when a counter text was actually read; otherwise keep 0.
    if likes_text is not None:
        try:
            likes = _parse_count(likes_text)
            logger.info("post likes: " + str(likes))
        except (ValueError, OverflowError) as err:
            logger.error(
                "ERROR - Extracting number of likes failed. Saving likes as -1")
            logger.error(err)
            likes = -1

    user_comments = []
    user_commented_list = []
    user_liked_list = []
    mentions = []
    tags = []
    caption = ''
    commentscount = 0

    try:
        user_comments, user_commented_list, commentscount = extract_post_comments(
            browser, post)
    except Exception:
        logger.error("ERROR - getting Post comments function trying")

    try:
        caption, tags = extract_post_caption(user_comments, username)
        # delete first comment because its the caption of the user posted
        if len(caption) > 0:
            user_comments.pop(0)
    except Exception:
        logger.error("ERROR - getting Post caption/tags function")

    try:
        mentions = extract_post_mentions(browser, post)
    except Exception:
        logger.error("ERROR - getting Post Mentions function")

    try:
        user_liked_list = extract_post_likers(browser, post, postlink, likes)
    except Exception:
        logger.error("ERROR - getting Post Likers function")

    return (caption, location_url, location_name, location_id, lat, lng, img,
            tags, likes, commentscount, date, user_commented_list,
            user_comments, mentions, user_liked_list)
Exemplo n.º 3
0
def _parse_count(text):
    """Convert a displayed counter such as '1,234' or '1.2k' to an int.

    Raises ValueError when ``text`` does not contain a parsable number.
    """
    cleaned = str(text).strip().lower()
    if cleaned.endswith('k'):
        # With a 'k' suffix the separator is a decimal mark: '1.2k' -> 1200,
        # '12k' -> 12000 (appending '00' was only right for one decimal digit).
        return int(round(float(cleaned[:-1].replace(',', '.')) * 1000))
    # Without a suffix, ',' and '.' are treated as thousands separators.
    return int(cleaned.replace(',', '').replace('.', ''))


def extract_post_info(browser, postlink):
    """Get the information from the current post.

    Navigates ``browser`` to ``postlink`` and reads the post's details from
    the page. Each field is collected on a best-effort basis: a failure while
    reading one field is logged and that field keeps its default value.

    Args:
        browser: driver object used for navigation, element lookup and
            script execution (presumably a Selenium WebDriver -- confirm
            against caller).
        postlink: URL of the post, as a string.

    Returns:
        A tuple ``(caption, location_url, location_name, location_id, lat,
        lng, imgs, imgdesc, tags, likes, commentscount, date,
        user_commented_list, user_comments, mentions, user_liked_list,
        views)``.

        ``imgs`` and ``imgdesc`` hold the ``src`` and ``alt`` attributes of
        the elements with class ``'FFVAD'``. ``location_id`` is ``0`` when no
        location was found, otherwise the id string taken from the location
        URL. ``likes`` is an int: ``0`` when the like counter could not be
        read and ``-1`` when its text could not be parsed. ``views`` stays
        ``0`` unless the video-view counter was read.

    Raises:
        NoInstaPostPageFound: if navigation raises ``PageNotFound404``.

    Any exception raised while looking up the element with class ``'ltEKP'``
    is propagated unchanged.
    """
    logger = InstaLogger.logger()

    logger.info("Scraping Post Link: " + postlink)
    try:
        web_adress_navigator(browser, postlink)
    except PageNotFound404 as e:
        # Chain explicitly so the traceback shows the 404 as the cause.
        raise NoInstaPostPageFound(e) from e
    except NoSuchElementException as err:
        # Best effort: log and still try to locate the post element below.
        logger.error("Could not get information from post: " + postlink)
        logger.error(err)

    post = browser.find_element_by_class_name('ltEKP')

    username = ''
    try:
        username = post.find_element_by_class_name(
            'e1e1d').find_element_by_tag_name('a').text
    except Exception:
        logger.error("ERROR - getting Post infos (username) ")

    # Get location details
    location_url = ''
    location_name = ''
    location_id = 0
    lat = ''
    lng = ''

    try:
        # Location url and name
        location_links = post.find_element_by_class_name(
            'M30cS').find_elements_by_tag_name('a')
        if location_links:
            location_url = location_links[0].get_attribute('href')
            location_name = location_links[0].text
            # str.strip() removes a *set of characters*, not a prefix, so
            # take the path segment that follows the locations marker.
            parsed_id = location_url.partition(
                '/explore/locations/')[2].split('/')[0]
            if parsed_id:
                location_id = parsed_id
                # Longitude and latitude
                url = ('https://www.instagram.com/explore/locations/' +
                       str(location_id) + '/?__a=1')
                # A timeout keeps a stalled connection from hanging the run.
                response = requests.get(url, timeout=10)
                data = response.json()
                lat = data['graphql']['location']['lat']
                lng = data['graphql']['location']['lng']
        logger.info("location_id: " + str(location_id))
        logger.info("location_url: " + str(location_url))
        logger.info("location_name: " + str(location_name))
        logger.info("lat: " + str(lat))
        logger.info("lng: " + str(lng))
    except Exception:
        logger.warning("getting Location Infos (perhaps not set)")

    date = ''
    try:
        # NOTE(review): an XPath starting with '//' is evaluated against the
        # whole document, not relative to ``post`` -- confirm this is intended.
        date = post.find_element_by_xpath('//a/time').get_attribute("datetime")
        logger.info("Post date: " + str(date))
    except Exception:
        logger.error("ERROR - getting Post Date ")

    img_tags = []
    imgs = []
    imgdesc = []
    try:
        img_tags = post.find_elements_by_class_name('FFVAD')
        logger.info("number of images: " + str(len(img_tags)))
        for tag in img_tags:
            imgs.append(tag.get_attribute('src'))
            imgdesc.append(tag.get_attribute('alt'))
            # str() so a missing attribute (None) does not raise TypeError
            # and abort the loop before the remaining images are collected.
            logger.info("post image: " + str(imgs[-1]))
            logger.info("alt text: " + str(imgdesc[-1]))
    except Exception as err:
        logger.error("ERROR - Post Image")
        logger.error(str(err))

    likes = 0
    likes_text = None
    views = 0

    try:
        # image or video post?
        if len(img_tags) >= 1:
            likes_text = post.find_element_by_xpath(
                '//article/div[2]/section[2]/div/div[2]/button/span').text
        else:
            try:
                views = _parse_count(
                    post.find_element_by_xpath(
                        '//article/div[2]/section[2]/div/span/span').text)
                logger.info("video views: " + str(views))
            except Exception:
                logger.error("ERROR - Getting Video Views")
            # click the view count to get the likes popup
            viewcount_click = post.find_element_by_xpath(
                '//article/div[2]/section[2]/div/span')
            browser.execute_script("arguments[0].click();", viewcount_click)
            likes_text = post.find_element_by_xpath(
                '//article/div[2]/section[2]/div/div/div[4]/span').text
    except Exception as err:
        logger.error("ERROR - Getting Post Likes")
        logger.error(err)

    # Only parse when a counter text was actually read; otherwise keep 0.
    if likes_text is not None:
        try:
            likes = _parse_count(likes_text)
            logger.info("post likes: " + str(likes))
        except (ValueError, OverflowError) as err:
            logger.error(
                "ERROR - Extracting number of likes failed. Saving likes as -1")
            logger.error(err)
            likes = -1

    user_comments = []
    user_commented_list = []
    user_liked_list = []
    mentions = []
    tags = []
    caption = ''
    commentscount = 0

    try:
        user_comments, user_commented_list, commentscount = extract_post_comments(
            browser, post)
    except Exception:
        logger.error("ERROR - getting Post comments function trying")

    try:
        caption, tags = extract_post_caption(user_comments, username)
        # delete first comment because its the caption of the user posted
        if len(caption) > 0:
            user_comments.pop(0)
    except Exception:
        logger.error("ERROR - getting Post caption/tags function")

    try:
        mentions = extract_post_mentions(browser, post)
    except Exception:
        logger.error("ERROR - getting Post Mentions function")

    try:
        user_liked_list = extract_post_likers(browser, post, postlink, likes)
    except Exception:
        logger.error("ERROR - getting Post Likers function")

    return (caption, location_url, location_name, location_id, lat, lng, imgs,
            imgdesc, tags, likes, commentscount, date, user_commented_list,
            user_comments, mentions, user_liked_list, views)
Exemplo n.º 4
0
def _parse_count(text):
    """Convert a displayed counter such as '1,234' or '1.2k' to an int.

    Raises ValueError when ``text`` does not contain a parsable number.
    """
    cleaned = str(text).strip().lower()
    if cleaned.endswith('k'):
        # With a 'k' suffix the separator is a decimal mark: '1.2k' -> 1200,
        # '12k' -> 12000 (appending '00' was only right for one decimal digit).
        return int(round(float(cleaned[:-1].replace(',', '.')) * 1000))
    # Without a suffix, ',' and '.' are treated as thousands separators.
    return int(cleaned.replace(',', '').replace('.', ''))


def extract_post_info(browser, postlink):
    """Get the information from the current post.

    Navigates ``browser`` to ``postlink`` and reads the post's details from
    the page. Each field is collected on a best-effort basis: a failure while
    reading one field is logged and that field keeps its default value.

    Args:
        browser: driver object used for navigation and element lookup
            (presumably a Selenium WebDriver -- confirm against caller).
        postlink: URL of the post, as a string.

    Returns:
        A tuple ``(caption, location_url, location_name, location_id, lat,
        lng, img, tags, likes, commentscount, date, user_commented_list,
        user_comments, mentions)``.

        ``location_id`` is ``0`` when no location was found, otherwise the id
        string taken from the location URL. ``img`` is ``''`` when the post
        contains no ``<img>``. ``likes`` is an int: ``0`` when no like text
        was read and ``-1`` when the text could not be parsed.

    Raises:
        NoInstaPostPageFound: if navigation raises ``PageNotFound404``.

    Any exception raised while looking up the element with class ``'ltEKP'``
    is propagated unchanged.
    """
    logger = InstaLogger.logger()

    logger.info("Scrapping Post Link: " + postlink)
    try:
        web_adress_navigator(browser, postlink)
    except PageNotFound404 as e:
        # Chain explicitly so the traceback shows the 404 as the cause.
        raise NoInstaPostPageFound(e) from e
    except NoSuchElementException as err:
        # Best effort: log and still try to locate the post element below.
        logger.error("Could not get information from post: " + postlink)
        logger.error(err)

    post = browser.find_element_by_class_name('ltEKP')

    username = ''
    try:
        username = post.find_element_by_class_name('e1e1d').text
    except Exception:
        logger.error("ERROR - getting Post infos (username) ")

    # Get location details
    location_url = ''
    location_name = ''
    location_id = 0
    lat = ''
    lng = ''

    try:
        # Location url and name
        location_links = post.find_element_by_class_name(
            'M30cS').find_elements_by_tag_name('a')
        if location_links:
            location_url = location_links[0].get_attribute('href')
            location_name = location_links[0].text
            # str.strip() removes a *set of characters*, not a prefix, so
            # take the path segment that follows the locations marker.
            parsed_id = location_url.partition(
                '/explore/locations/')[2].split('/')[0]
            if parsed_id:
                location_id = parsed_id
                # Longitude and latitude
                url = ('https://www.instagram.com/explore/locations/' +
                       location_id + '/?__a=1')
                # A timeout keeps a stalled connection from hanging the run.
                response = requests.get(url, timeout=10)
                data = response.json()
                lat = data['graphql']['location']['lat']
                lng = data['graphql']['location']['lng']
        # str() is required: location_id may be the int 0 and lat/lng come
        # straight from JSON, so they are not guaranteed to be strings.
        logger.info("location_id:" + str(location_id))
        logger.info("location_url:" + str(location_url))
        logger.info("location_name:" + str(location_name))
        logger.info("lat:" + str(lat))
        logger.info("lng:" + str(lng))
    except Exception:
        logger.error("getting Location Infos  (perhaps not set)")

    date = ''
    try:
        # NOTE(review): an XPath starting with '//' is evaluated against the
        # whole document, not relative to ``post`` -- confirm this is intended.
        date = post.find_element_by_xpath('//a/time').get_attribute("datetime")
        logger.info("date:" + str(date))
    except Exception:
        logger.error("ERROR - getting Post Date ")

    img = ''
    try:
        imgs = post.find_elements_by_tag_name('img')
        # Use the second <img> when there is one, otherwise the first; a post
        # without any <img> leaves ``img`` empty instead of raising.
        if len(imgs) >= 2:
            img = imgs[1].get_attribute('src')
        elif imgs:
            img = imgs[0].get_attribute('src')
    except Exception:
        logger.error("ERROR - Post Image ")

    likes = 0
    try:
        sections = post.find_elements_by_tag_name('section')
        if len(sections) > 2:
            words = sections[1].find_element_by_tag_name('div').text.split(' ')

            # count the names if there is no number displayed
            if len(words) > 2:
                likes = len([
                    word for word in words
                    if word not in ['and', 'like', 'this']
                ])
            else:
                try:
                    likes = _parse_count(words[0])
                except (ValueError, OverflowError) as err:
                    logger.error(
                        "ERROR - Extracting number of likes failed. "
                        "Saving likes as -1")
                    logger.error(err)
                    likes = -1
            # likes is an int here, so it must be converted before logging.
            logger.info("post-likes:" + str(likes))
    except Exception as err:
        logger.error("ERROR - Getting Post Likes")
        logger.error(err)

    user_comments = []
    user_commented_list = []
    mentions = []
    tags = []
    caption = ''
    commentscount = 0

    try:
        user_comments, user_commented_list, commentscount = extract_post_comments(
            browser, post)
    except Exception:
        logger.error("trying to get comments (function)")

    try:
        caption, tags = extract_post_caption(user_comments, username)
        # delete first comment because its the caption of the user posted
        if len(caption) > 0:
            user_comments.pop(0)
    except Exception:
        logger.error("trying to get caption/tags (function)")

    try:
        mentions = extract_post_mentions(browser, post)
    except Exception:
        logger.error("trying to get mentions (function)")

    return (caption, location_url, location_name, location_id, lat, lng, img,
            tags, likes, commentscount, date, user_commented_list,
            user_comments, mentions)