def extractNewPostsFromIG(todayHT):
    """Scrape today's Instagram posts for a single hashtag into a DataFrame.

    Parameters
    ----------
    todayHT : str
        The hashtag to search for (extraction is assumed to run once a day,
        so no extra time filtering is applied).

    Returns
    -------
    pd.DataFrame
        One row per post with columns: username, date, time, text, photo,
        is_video, points, hashtags. Sorting is left to the caller.
    """
    # Search the hashtag; extraction is assumed to run once per day.
    looter = HashtagLooter(todayHT)

    # DataFrame that accumulates the new posts.
    appendDF = pd.DataFrame(columns=[
        'username', 'date', 'time', 'text', 'photo', 'is_video', 'points',
        'hashtags'
    ])

    # One row per post returned by the hashtag feed.
    for index, onePost in enumerate(looter.medias()):
        onePostDict = looter.get_post_info(onePost.get('shortcode'))

        # Parse the post timestamp once and reuse it for date and time
        # (the original parsed it twice).
        posted = dt.datetime.utcfromtimestamp(
            int(onePostDict.get('taken_at_timestamp')))

        # Captionless posts have an empty 'edges' list; the original chained
        # [0] lookup raised IndexError on them.
        caption_edges = onePostDict.get('edge_media_to_caption',
                                        {}).get('edges', [])
        text = caption_edges[0]['node'].get('text') if caption_edges else ''

        appendDF.at[index, 'username'] = onePostDict.get('owner').get(
            'username')
        appendDF.at[index, 'date'] = posted.strftime("%Y/%m/%d")
        appendDF.at[index, 'time'] = posted.strftime("%H:%M:%S")
        appendDF.at[index, 'text'] = text
        appendDF.at[index, 'photo'] = onePostDict.get('display_url')
        appendDF.at[index, 'is_video'] = onePostDict.get(
            'is_video')  # returns True or False
        # Default points to 0 for consistency with extractAllPostsFromIG;
        # real points are crawled from the other database later.
        appendDF.at[index, 'points'] = 0
        appendDF.at[index, 'hashtags'] = todayHT

    # We will do sorting later
    return appendDF
# ---- Example 2 ----
def scrape(hashtag):
    """Write every page entry for *hashtag* to output/output.txt, one per line.

    Bug fixes vs. original: the inner loop printed the whole ``pages``
    object to stdout on every iteration and never used the opened file
    handle or the loop variable ``p``; each entry is now written to the
    file instead.
    """
    looter = HashtagLooter(hashtag)
    with open("output/output.txt", "w") as f:
        for page in looter.pages():
            for entry in page:
                f.write("{}\n".format(entry))
# ---- Example 3 ----
    def post(self, hashtag_):
        """Dump all media links for *hashtag_* to hashtag/<hashtag_>.txt.

        Returns the tuple ("ok", 201) on completion (Flask-style response).

        Bug fix vs. original: the looter was constructed with the literal
        string "hashtag_" instead of the hashtag_ argument, so every request
        scraped the same (wrong) hashtag.
        """
        looter = HashtagLooter(hashtag_)

        with open("hashtag/" + hashtag_ + ".txt", "w") as f:
            for media in looter.medias():
                for link in links(media, looter):
                    f.write("{}\n".format(link))
        return "ok", 201
# ---- Example 4 ----
    def test_timeframe_datetime(self):
        """A media fetched with a datetime timeframe must fall inside it."""
        looter = HashtagLooter("protein")
        now = datetime.datetime.now()
        # Window between 7 and 5 days ago; ordering is normalised below.
        timeframe = now - datetime.timedelta(5), now - datetime.timedelta(7)

        media = next(looter.medias(timeframe=timeframe))
        posted_at = datetime.datetime.fromtimestamp(
            media["taken_at_timestamp"])

        lower, upper = min(timeframe), max(timeframe)
        self.assertGreaterEqual(posted_at, lower)
        self.assertLessEqual(posted_at, upper)
# ---- Example 5 ----
def extractAllPostsFromIG(mainHT, commonHT, allHT):
    """Scrape posts for *mainHT* and score them against all known hashtags.

    Parameters
    ----------
    mainHT : str
        Hashtag to search on Instagram.
    commonHT : str
        Common substring shared by challenge hashtags; a caption containing
        it more than once is treated as a multi-hashtag post.
    allHT : iterable of str
        All known challenge hashtags to match against the caption.

    Returns
    -------
    pd.DataFrame
        One row per post with columns: username, date, time, text, photo,
        is_video, points, hashtags. Sorting is left to the caller.
    """
    # Search hashtag
    looter = HashtagLooter(mainHT)

    # DataFrame that accumulates the new posts.
    appendDF = pd.DataFrame(columns=[
        'username', 'date', 'time', 'text', 'photo', 'is_video', 'points',
        'hashtags'
    ])

    # One row per post returned by the hashtag feed.
    for index, onePost in enumerate(looter.medias()):
        onePostDict = looter.get_post_info(onePost.get('shortcode'))

        # Parse the timestamp once and reuse it (the original parsed twice).
        posted = dt.datetime.utcfromtimestamp(
            int(onePostDict.get('taken_at_timestamp')))

        # Captionless posts have an empty 'edges' list; the original chained
        # [0] lookup raised IndexError on them and also extracted the caption
        # a second time further down.
        caption_edges = onePostDict.get('edge_media_to_caption',
                                        {}).get('edges', [])
        text = caption_edges[0]['node'].get('text') if caption_edges else ''

        appendDF.at[index, 'username'] = onePostDict.get('owner').get(
            'username')
        appendDF.at[index, 'date'] = posted.strftime("%Y/%m/%d")
        appendDF.at[index, 'time'] = posted.strftime("%H:%M:%S")
        appendDF.at[index, 'text'] = text
        appendDF.at[index, 'photo'] = onePostDict.get('display_url')
        appendDF.at[index, 'is_video'] = onePostDict.get(
            'is_video')  # returns True or False
        appendDF.at[index, 'points'] = 0  # default; overwritten below

        # Every known challenge hashtag that appears in the caption.
        matchedHT = [HT for HT in allHT if HT in text]

        if text.count(commonHT) > 1:  # many hashtags in one caption
            # Sum the daily-challenge points of every matched hashtag.
            totalPoints = sum(
                extractTodayPointsFromGS('DailyChallenges', HT)
                for HT in matchedHT)
            appendDF.at[index, 'points'] = totalPoints
            appendDF.at[index, 'hashtags'] = matchedHT
        elif matchedHT:
            # Single hashtag: the original looped and let the last match win;
            # keep that semantics but fetch its points only once.
            HT = matchedHT[-1]
            appendDF.at[index, 'hashtags'] = HT
            appendDF.at[index, 'points'] = extractTodayPointsFromGS(
                'DailyChallenges', HT)

    # We will do sorting later
    return appendDF
# ---- Example 6 ----
    def test_issue_076(self):
        """Thanks to @zeshuaro for reporting this bug.

        Check that when downloading hashtags, the downloader
        actually stops.
        """
        looter = HashtagLooter("oulianov", session=self.session)

        medias_it = looter.medias()
        expected_count = length_hint(medias_it)

        # If iteration yields more items than the hint, the looter never
        # terminated -- fail explicitly instead of hanging.
        for seen, _media in enumerate(medias_it):
            if seen > expected_count:
                self.fail("looter.medias() did not stop.")
# ---- Example 7 ----
    def test_issue_009(self):
        """Thanks to @kurtmaia for reporting this bug.

        Checks that adding metadata to pictures downloaded from a hashtag
        works as well.
        """
        looter = HashtagLooter("fluoxetine", add_metadata=True, session=self.session)

        # Patch the page iterator so the test runs against canned data.
        with contexter.Contexter() as ctx:
            ctx << mock.patch.object(looter, 'pages', MockPages('fluoxetine'))
            looter.download(self.destfs, media_count=10)

        # Every downloaded picture must carry EXIF metadata.
        for filename in self.destfs.listdir("/"):
            exif = piexif.load(self.destfs.getbytes(filename))
            self.assertTrue(exif['Exif'])  # Date & Caption
            self.assertTrue(exif['0th'])  # Image creator
# ---- Example 8 ----
 def test_hashtag(self, hashtag, **kwargs):
     """Download MEDIA_COUNT medias for *hashtag* and verify the file count."""
     looter = HashtagLooter(hashtag, session=self.session, **kwargs)
     looter.download(self.destfs, media_count=self.MEDIA_COUNT)
     downloaded = len(self.destfs.listdir("/"))
     self.assertGreaterEqual(downloaded, self.MEDIA_COUNT)
# ---- Example 9 ----
def get_hashtag(hashtag):
    """Fetch up to 100 medias for *hashtag* and serialize them to dicts."""
    print("get hashtag : %s" % hashtag)
    looter = HashtagLooter(hashtag)
    first_hundred = islice(looter.medias(), 100)
    serialized = [media_to_dict(media, hashtag) for media in first_hundred]
    return {"res": serialized}
# ---- Example 10 ----
 def test_timeframe_date(self):
     """medias() must accept a timeframe given as plain ``date`` objects."""
     looter = HashtagLooter("protein")
     start, end = datetime.date(2019, 12, 27), datetime.date(2019, 12, 20)
     media = next(looter.medias(timeframe=(start, end)))
# ---- Example 11 ----
def instaLooter(hashtag, count):
    """Download *count* medias for *hashtag* into a directory named after it."""
    from instalooter.looters import HashtagLooter

    # The hashtag doubles as the destination directory name.
    looter = HashtagLooter(hashtag)
    looter.download(hashtag, media_count=count)
# ---- Example 12 ----
from instalooter.looters import HashtagLooter
import os

# Hashtags to loot and how many medias to grab for each one.
hashtags = ['ramones']
loot_count = 1000

for hashtag in hashtags:
    looter = HashtagLooter(hashtag)
    download_dir = '/data/danieltc/{}'.format(hashtag)
    # exist_ok avoids the check-then-create race of the original
    # os.path.exists() + os.makedirs() pair.
    os.makedirs(download_dir, exist_ok=True)
    print("Looting hashtag {} into dir {}".format(hashtag, download_dir))
    looter.download_pictures(download_dir, media_count=loot_count)
print("Exit gracefully")