# Imports inferred from usage in this module
import json
import logging
import time
from datetime import datetime, timedelta
from random import uniform

import pandas as pd
from flask import jsonify, request

import s3_mongo_helper
import sharechat_helper


def get_metadata():
    # Look up a scraped post's metadata in Mongo DB by its filename (doc_id)
    data = request.json
    filename = data["doc_id"]
    coll = s3_mongo_helper.initialize_mongo()
    # Default to an empty dict so pop()/jsonify() are safe when no match is found
    result = {}
    for i in coll.find({"filename": filename}):
        result = i
    # Drop the Mongo ObjectId, which is not JSON serializable
    result.pop("_id", None)
    return jsonify(result)
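
# --- Usage sketch (assumption, not part of the original module) ---
# get_metadata() reads request.json and returns jsonify(), so it is
# presumably registered on a Flask app elsewhere in the repo. A minimal
# wiring could look like this; the factory name and route path are
# hypothetical.
def create_app():  # hypothetical factory
    from flask import Flask
    app = Flask(__name__)
    # POST {"doc_id": "<filename>"} -> metadata JSON for that file
    app.add_url_rule("/get_metadata", view_func=get_metadata, methods=["POST"])
    return app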
def virality_scraper(USER_ID=None, PASSCODE=None, virality_job=None):
    start_time = time.time()
    # Initialize Mongo DB
    print("Initializing Mongo DB ...")
    initializationSuccess = False
    try:
        coll = s3_mongo_helper.initialize_mongo()
        initializationSuccess = True
        print("Initialized successfully")
    except Exception:
        print("Initialization failure")
        print(logging.traceback.format_exc())
    if initializationSuccess:
        updates = 0
        failed = 0
        today = datetime.utcnow()
        if virality_job == 1:
            # Get metrics for t+1 & t+2
            start = today - timedelta(days=2)
            end = today - timedelta(days=1)
            print("# Updating virality metrics for 1 & 2 day old posts ...")
        elif virality_job == 2:
            # Get metrics for t+3 ... t+5
            start = today - timedelta(days=5)
            end = today - timedelta(days=3)
            print("# Updating virality metrics for 3, 4 & 5 day old posts ...")
        else:
            # Guard: start/end would otherwise be undefined below
            raise ValueError("virality_job must be 1 or 2")
        cursor = coll.find({
            "scraped_date": {"$gte": start, "$lte": end},
            "scraper_type": "fresh",
        })
        for doc in cursor:
            try:
                # Get timestamp for day t
                timestamp = pd.to_datetime(doc["timestamp"])
                # Calculate days since t
                diff = str((today - timestamp).days)
                # Only update posts that are at most 5 days old
                if int(diff) <= 5:
                    # Get current virality metrics
                    result = sharechat_helper.get_current_metrics(
                        USER_ID, PASSCODE, doc["post_permalink"])
                    # update_one() replaces the deprecated Collection.update()
                    coll.update_one(
                        {"_id": doc["_id"]},
                        {"$set": {
                            "comments_t+" + diff: result[0],
                            "external_shares_t+" + diff: result[1],
                            "likes_t+" + diff: result[2],
                            "reposts_t+" + diff: result[3],
                            "views_t+" + diff: result[4],
                        }},
                    )
                    updates += 1
                # If for some reason the post is older, skip it
            except Exception:
                failed += 1
        print("Scraping complete")
        print("Updated virality metrics for {} posts".format(updates))
        print("{} updates failed".format(failed))
        print("Time taken: %s seconds" % (time.time() - start_time))
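
# --- Usage sketch (illustrative, not part of the original module) ---
# The two virality jobs cover complementary windows (t+1/t+2 and t+3..t+5),
# so a daily scheduler would typically run both. The env var names below are
# hypothetical.
def run_daily_virality_jobs():  # hypothetical helper
    import os
    user_id = os.environ.get("SHARECHAT_USER_ID")    # hypothetical env var
    passcode = os.environ.get("SHARECHAT_PASSCODE")  # hypothetical env var
    virality_scraper(USER_ID=user_id, PASSCODE=passcode, virality_job=1)
    virality_scraper(USER_ID=user_id, PASSCODE=passcode, virality_job=2)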
def trending_content_scraper(USER_ID=None,
                             PASSCODE=None,
                             tag_hashes=None,
                             bucket_ids=None,
                             pages=None,
                             mode=None,
                             targeting=None):
    if targeting == "bucket":
        tag_hashes = sharechat_helper.get_tag_hashes(USER_ID, PASSCODE, bucket_ids)
        delay = uniform(10, 15)
    elif targeting == "tag":
        delay = uniform(30, 35)
    else:
        # Guard: delay would otherwise be undefined below
        raise ValueError("targeting must be 'bucket' or 'tag'")
    if mode == "archive":
        print("Scraping in archive mode")
        start_time = time.time()
        # Initialize S3 and Mongo DB
        print("Initializing ...")
        initializationSuccess = False
        try:
            aws, bucket, s3 = s3_mongo_helper.initialize_s3()
            coll = s3_mongo_helper.initialize_mongo()
            initializationSuccess = True
            print("Initialized successfully")
        except Exception:
            print("Initialization failure")
            print(logging.traceback.format_exc())
        # Scrape data from Sharechat tags
        if initializationSuccess:
            print("Scraping in progress ...")
            sharechat_df = sharechat_helper.get_trending_data(
                USER_ID, PASSCODE, tag_hashes, pages, delay)
            if len(sharechat_df) < 1:
                raise ValueError(
                    "get_trending_data() returned an empty dataframe. No posts were scraped.")
            else:
                # Save data locally
                sharechat_df.to_pickle("sharechat_df.pkl")
                # Save data to S3 & Mongo DB
                s3UploadSuccess = False
                try:
                    print("S3 upload in progress ...")
                    # The returned df includes S3 URLs
                    sharechat_df = sharechat_helper.sharechat_s3_upload(
                        sharechat_df, aws, bucket, s3)
                    s3UploadSuccess = True
                    print("Data uploaded to S3")
                except Exception:
                    print("S3 upload failed")
                    print(logging.traceback.format_exc())
                if s3UploadSuccess:
                    try:
                        print("HTML preview file creation in progress ...")
                        sharechat_df, sharechat_df_html = \
                            sharechat_helper.get_thumbnails_from_s3(sharechat_df)
                        with open("sharechat_trending_data_preview.html", "w") as f:
                            f.write(sharechat_df_html.data)
                        print("HTML preview file created")
                    except Exception:
                        print("HTML preview file creation failed")
                        print(logging.traceback.format_exc())
                    try:
                        print("MongoDB upload in progress ...")
                        sharechat_helper.sharechat_mongo_upload(sharechat_df, coll)
                        print("Data uploaded to MongoDB")
                    except Exception:
                        print("MongoDB upload failed")
                        print(logging.traceback.format_exc())
                try:
                    print("CSV file creation in progress ...")
                    sharechat_df.to_csv("sharechat_trending_data.csv")
                    print("CSV file created")
                    print("{} posts scraped".format(len(sharechat_df)))
                except Exception:
                    print("CSV file creation failed")
                    print(logging.traceback.format_exc())
                print("Scraping complete")
                print("Time taken: %s seconds" % (time.time() - start_time))
                return sharechat_df
    elif mode == "local":
        print("Scraping in local mode")
        start_time = time.time()
        print("Scraping in progress ...")
        sharechat_df = sharechat_helper.get_trending_data(
            USER_ID, PASSCODE, tag_hashes, pages, delay)
        if len(sharechat_df) < 1:
            raise ValueError(
                "get_trending_data() returned an empty dataframe. No posts were scraped.")
        else:
            # Save data locally
            sharechat_df.to_pickle("sharechat_df.pkl")
            try:
                print("HTML preview file creation in progress ...")
                sharechat_df, sharechat_df_html = \
                    sharechat_helper.get_thumbnails_from_sharechat(sharechat_df)
                with open("sharechat_trending_data_preview.html", "w") as f:
                    f.write(sharechat_df_html.data)
                print("HTML preview file created")
            except Exception:
                print("HTML preview file creation failed")
                print(logging.traceback.format_exc())
            try:
                print("CSV file creation in progress ...")
                sharechat_df.to_csv("sharechat_trending_data.csv")
                print("CSV file created")
                print("{} posts scraped".format(len(sharechat_df)))
            except Exception:
                print("CSV file creation failed")
                print(logging.traceback.format_exc())
            print("Scraping complete")
            print("Time taken: %s seconds" % (time.time() - start_time))
            return sharechat_df
def fresh_content_scraper(USER_ID=None,
                          PASSCODE=None,
                          tag_hashes=None,
                          bucket_ids=None,
                          pages=None,
                          unix_timestamp=None,
                          mode=None,
                          targeting=None):
    if targeting == "bucket":
        tag_hashes = sharechat_helper.get_tag_hashes(USER_ID, PASSCODE, bucket_ids)
        delay = uniform(10, 15)
    elif targeting == "tag":
        delay = uniform(30, 35)
    else:
        # Guard: delay would otherwise be undefined below
        raise ValueError("targeting must be 'bucket' or 'tag'")
    if mode == "archive":
        print("Scraping in archive mode")
        start_time = time.time()
        # Initialize S3 and Mongo DB
        print("Initializing ...")
        initializationSuccess = False
        try:
            aws, bucket, s3 = s3_mongo_helper.initialize_s3()
            coll = s3_mongo_helper.initialize_mongo()
            initializationSuccess = True
            print("Initialized successfully")
        except Exception:
            print("Initialization failure")
            print(logging.traceback.format_exc())
        # Scrape data from Sharechat tags
        if initializationSuccess:
            print("Scraping in progress ...")
            sharechat_df = sharechat_helper.get_fresh_data(
                USER_ID, PASSCODE, tag_hashes, pages, unix_timestamp, delay)
            if len(sharechat_df) < 1:
                raise ValueError(
                    "get_fresh_data() returned an empty dataframe. No posts were scraped.")
            else:
                # Save data to S3 & Mongo DB
                s3UploadSuccess = False
                try:
                    print("S3 upload in progress ...")
                    # The returned df includes S3 URLs
                    sharechat_df, tagwise_duplicates = sharechat_helper.sharechat_s3_upload(
                        sharechat_df, aws, bucket, s3, coll)
                    s3UploadSuccess = True
                    print("Data uploaded to S3")
                except Exception:
                    print("S3 upload failed")
                    print(logging.traceback.format_exc())
                if s3UploadSuccess:
                    aws, logbucket, s3 = sharechat_helper.initialize_s3_logbucket()
                    today = datetime.utcnow().strftime("%Y%m%d")
                    try:
                        print("HTML file creation in progress ...")
                        sharechat_df, sharechat_df_html = \
                            sharechat_helper.get_thumbnails_from_s3(sharechat_df)
                        with open("sharechat_fresh_data_preview.html", "w") as f:
                            f.write(sharechat_df_html.data)
                        print("HTML file created")
                        print("Uploading HTML file to S3 ...")
                        sharechat_helper.upload_logs(
                            s3=s3,
                            filename="sharechat_fresh_data_preview.html",
                            key="fresh_preview_" + today,
                            bucket=logbucket,
                        )
                        print("HTML file uploaded")
                    except Exception:
                        print("HTML file upload failed")
                        print(logging.traceback.format_exc())
                    try:
                        print("Duplicates log creation in progress ...")
                        with open("tagwise_duplicates.json", "w") as fp:
                            json.dump(tagwise_duplicates, fp)
                        print("Duplicates log created")
                        print("Uploading duplicates log to S3 ...")
                        sharechat_helper.upload_logs(
                            s3=s3,
                            filename="tagwise_duplicates.json",
                            key="fresh_duplicates_" + today,
                            bucket=logbucket,
                        )
                        print("Duplicates log uploaded")
                    except Exception:
                        print("Duplicates log upload failed")
                        print(logging.traceback.format_exc())
                    try:
                        print("CSV file creation in progress ...")
                        sharechat_df.to_csv("sharechat_fresh_data.csv")
                        print("CSV file created")
                        print("Uploading CSV file to S3 ...")
                        sharechat_helper.upload_logs(
                            s3=s3,
                            filename="sharechat_fresh_data.csv",
                            key="fresh_posts_" + today,
                            bucket=logbucket,
                        )
                        print("CSV file uploaded")
                    except Exception:
                        print("CSV file upload failed")
                        print(logging.traceback.format_exc())
                    try:
                        print("MongoDB upload in progress ...")
                        sharechat_helper.sharechat_mongo_upload(sharechat_df, coll)
                        print("Data uploaded to MongoDB")
                        print("{} posts saved".format(len(sharechat_df)))
                    except Exception:
                        print("MongoDB upload failed")
                        print(logging.traceback.format_exc())
                print("Scraping complete")
                print("Time taken: %s seconds" % (time.time() - start_time))
                return sharechat_df
    elif mode == "local":
        print("Scraping in local mode")
        start_time = time.time()
        print("Scraping in progress ...")
        sharechat_df = sharechat_helper.get_fresh_data(
            USER_ID, PASSCODE, tag_hashes, pages, unix_timestamp, delay)
        if len(sharechat_df) < 1:
            raise ValueError(
                "get_fresh_data() returned an empty dataframe. No posts were scraped.")
        else:
            # Save data locally
            sharechat_df.to_pickle("sharechat_df.pkl")
            try:
                print("HTML preview file creation in progress ...")
                sharechat_df, sharechat_df_html = \
                    sharechat_helper.get_thumbnails_from_sharechat(sharechat_df)
                with open("sharechat_fresh_data_preview.html", "w") as f:
                    f.write(sharechat_df_html.data)
                print("HTML preview file created")
            except Exception:
                print("HTML preview file creation failed")
                print(logging.traceback.format_exc())
            try:
                print("CSV file creation in progress ...")
                sharechat_df.to_csv("sharechat_fresh_data.csv")
                print("CSV file created")
                print("{} posts saved".format(len(sharechat_df)))
            except Exception:
                print("CSV file creation failed")
                print(logging.traceback.format_exc())
            print("Scraping complete")
            print("Time taken: %s seconds" % (time.time() - start_time))
            return sharechat_df
"s3URL":["NULL"], "scrapedTime":["NULL"], }) #temp = pd.DataFrame([ts,number,fn,"NULL"]) print(temp) ## Append rows df = df.append(temp,ignore_index = True) print(df) fileTab = pd.DataFrame(fileList) fileTab.to_csv('temp.csv') ''' Join fileTab with df ''' test = pd.merge(df, fileTab, how='inner', left_on='filename', right_on='title') print(test) for index, row in test.iterrows(): file_block = drive.CreateFile({'id': row["id"]}) filename = row["filename"] print(filename) file_block.GetContentFile(filename) #s3_mongo_helper.upload_to_s3(s3,file=filename,filename=filename,bucket=bucket,content_type=row["mimeType"]) coll = s3_mongo_helper.initialize_mongo() for i in df.to_dict("records"): s3_mongo_helper.upload_to_mongo(data=i, coll=coll)