def extractNewPostsFromIG(todayHT):
    # Search the hashtag. Extraction is assumed to run once a day, so no
    # timeframe filtering is done here (handling timeframes would be too troublesome).
    looter = HashtagLooter(todayHT)

    # Create a df that contains new posts
    appendDF = pd.DataFrame(columns=[
        'username', 'date', 'time', 'text', 'photo', 'is_video', 'points',
        'hashtags'
    ])
    index = 0

    # Make each new post a new row
    for onePost in looter.medias():
        onePostDict = looter.get_post_info(onePost.get('shortcode'))
        appendDF.at[index, 'username'] = onePostDict.get('owner').get('username')
        appendDF.at[index, 'date'] = dt.datetime.utcfromtimestamp(
            int(onePostDict.get('taken_at_timestamp'))).strftime("%Y/%m/%d")
        appendDF.at[index, 'time'] = dt.datetime.utcfromtimestamp(
            int(onePostDict.get('taken_at_timestamp'))).strftime("%H:%M:%S")
        appendDF.at[index, 'text'] = onePostDict.get('edge_media_to_caption').get(
            'edges')[0].get('node').get('text')
        appendDF.at[index, 'photo'] = onePostDict.get('display_url')
        appendDF.at[index, 'is_video'] = onePostDict.get('is_video')  # returns True or False
        # This is crawled from the other database
        appendDF.at[index, 'hashtags'] = todayHT
        index += 1

    # We will do sorting later
    return appendDF
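# A hypothetical follow-up to the "sorting later" note above: the extracted
# DataFrame could be ordered newest-first using the date and time columns it
# already carries. The name sortPostsByTime is illustrative only and is not
# part of the original module.
import pandas as pd

def sortPostsByTime(newPosts):
    # Sort by date, then time, most recent posts first
    return newPosts.sort_values(by=['date', 'time'],
                                ascending=False).reset_index(drop=True)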
def post(self, hashtag_):
    looter = HashtagLooter(hashtag_)
    # Write one media link per line to hashtag/<hashtag>.txt
    with open("hashtag/" + hashtag_ + ".txt", "w") as f:
        for media in looter.medias():
            for link in links(media, looter):
                f.write("{}\n".format(link))
    return "ok", 201
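# The links() helper used above is not defined in this snippet. A minimal
# sketch of what it might look like, assuming the usual Instagram media fields
# (display_url, video_url, edge_sidecar_to_children) and instalooter's
# get_post_info(); this is an assumption, not the original implementation.
def links(media, looter):
    if media.get("__typename") == "GraphSidecar":
        # Sidecar posts need a second request to list every child image/video
        post = looter.get_post_info(media["shortcode"])
        nodes = [edge["node"] for edge in post["edge_sidecar_to_children"]["edges"]]
        return [node.get("video_url") or node.get("display_url") for node in nodes]
    elif media.get("is_video"):
        post = looter.get_post_info(media["shortcode"])
        return [post["video_url"]]
    else:
        return [media["display_url"]]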
def test_timeframe_datetime(self):
    looter = HashtagLooter("protein")
    now = datetime.datetime.now()
    # Timeframe is given as (most recent, oldest): between 7 and 5 days ago
    timeframe = now - datetime.timedelta(5), now - datetime.timedelta(7)
    media = next(looter.medias(timeframe=timeframe))
    taken_at = datetime.datetime.fromtimestamp(media["taken_at_timestamp"])
    self.assertLessEqual(taken_at, max(timeframe))
    self.assertGreaterEqual(taken_at, min(timeframe))
def extractAllPostsFromIG(mainHT, commonHT, allHT):
    # Search the main hashtag
    looter = HashtagLooter(mainHT)

    # Create a df that contains new posts
    appendDF = pd.DataFrame(columns=[
        'username', 'date', 'time', 'text', 'photo', 'is_video', 'points',
        'hashtags'
    ])
    index = 0

    # Make each new post a new row
    for onePost in looter.medias():
        onePostDict = looter.get_post_info(onePost.get('shortcode'))
        appendDF.at[index, 'username'] = onePostDict.get('owner').get('username')
        appendDF.at[index, 'date'] = dt.datetime.utcfromtimestamp(
            int(onePostDict.get('taken_at_timestamp'))).strftime("%Y/%m/%d")
        appendDF.at[index, 'time'] = dt.datetime.utcfromtimestamp(
            int(onePostDict.get('taken_at_timestamp'))).strftime("%H:%M:%S")
        appendDF.at[index, 'text'] = onePostDict.get('edge_media_to_caption').get(
            'edges')[0].get('node').get('text')
        appendDF.at[index, 'photo'] = onePostDict.get('display_url')
        appendDF.at[index, 'is_video'] = onePostDict.get('is_video')  # returns True or False
        # Points are crawled from the other database using the hashtag
        appendDF.at[index, 'points'] = 0

        # =====================================================================
        # If the single post contains more than one daily hashtag
        # =====================================================================
        text = onePostDict.get('edge_media_to_caption').get(
            'edges')[0].get('node').get('text')
        totalPoints = 0
        if text.count(commonHT) > 1:  # many hashtags
            manyHT = []  # list to store the hashtags found in the caption
            for HT in allHT:  # loop through all the hashtags
                if HT in text:
                    manyHT.append(HT)
                    pointsHT = extractTodayPointsFromGS('DailyChallenges', HT)
                    totalPoints += pointsHT
            appendDF.at[index, 'points'] = totalPoints
            appendDF.at[index, 'hashtags'] = manyHT
        else:  # one hashtag
            for HT in allHT:  # loop through all the hashtags
                if HT in text:
                    appendDF.at[index, 'hashtags'] = HT
                    pointsHT = extractTodayPointsFromGS('DailyChallenges', HT)
                    appendDF.at[index, 'points'] = pointsHT
        index += 1

    # We will do sorting later
    return appendDF
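# extractTodayPointsFromGS() is referenced above but not defined here. A
# minimal sketch of one possible implementation, assuming "GS" means a Google
# Sheet read with gspread and that the sheet stores one (date, hashtag, points)
# record per row; the sheet layout and helper behaviour are assumptions, not
# the original code.
import datetime as dt

import gspread

def extractTodayPointsFromGS(sheetName, hashtag):
    gc = gspread.service_account()           # credentials from the default service-account file
    worksheet = gc.open(sheetName).sheet1    # first worksheet of the named spreadsheet
    today = dt.datetime.utcnow().strftime("%Y/%m/%d")
    for row in worksheet.get_all_records():  # rows as dicts keyed by the header row
        if row.get('date') == today and row.get('hashtag') == hashtag:
            return int(row.get('points', 0))
    return 0                                 # no points scheduled for this hashtag today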
def test_issue_076(self):
    """Thanks to @zeshuaro for reporting this bug.

    Check that when downloading hashtags, the downloader actually stops.
    """
    looter = HashtagLooter("oulianov", session=self.session)
    medias_it = looter.medias()
    postcount = length_hint(medias_it)
    for i, m in enumerate(medias_it):
        if i > postcount:
            self.fail("looter.medias() did not stop.")
def get_hashtag(hashtag):
    print("get hashtag : %s" % hashtag)
    looter = HashtagLooter(hashtag)
    # Only look at the first 100 medias for this hashtag
    medias = islice(looter.medias(), 100)
    res = list(map(lambda m: media_to_dict(m, hashtag), medias))
    return {"res": res}
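# media_to_dict() is referenced above but not shown. A minimal sketch of such a
# helper, assuming it only keeps the media fields already used elsewhere in
# these snippets (shortcode, display_url, is_video, taken_at_timestamp); the
# exact field set is an assumption, not the original implementation.
def media_to_dict(media, hashtag):
    return {
        "shortcode": media.get("shortcode"),
        "display_url": media.get("display_url"),
        "is_video": media.get("is_video"),
        "taken_at_timestamp": media.get("taken_at_timestamp"),
        "hashtag": hashtag,
    }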
def test_timeframe_date(self):
    looter = HashtagLooter("protein")
    # Timeframe is given as (most recent, oldest) date objects
    timeframe = datetime.date(2019, 12, 27), datetime.date(2019, 12, 20)
    media = next(looter.medias(timeframe=timeframe))