示例#1
0
def get_thumbnails_from_s3(df):
    """Add a ``thumbnail`` column to *df* and render the frame as HTML.

    Each entry of ``df["s3_url"]`` is mapped to a thumbnail URL:

    * links ending in ``.mp4``: the first frame is extracted with ``ffmpeg``
      into a temporary directory, uploaded as a JPEG through
      ``s3_mongo_helper.upload_to_s3`` and the uploaded object's URL
      (``aws + bucket + "/" + filename``) is used;
    * links ending in ``.txt`` and missing values (``None``/NaN): ``None``;
    * anything else (expected to be jpg/jpeg/png): the link itself.

    Args:
        df: DataFrame with an ``s3_url`` column. It is modified in place.

    Returns:
        A ``(df, df_html)`` tuple: the same DataFrame with the new
        ``thumbnail`` column, and ``HTML(...)`` wrapping ``df.to_html`` output
        in which thumbnails are rendered as ``<img>`` tags.
    """
    def path_to_image_html(path):
        return '<img src="' + path + '"width="200" >'

    thumbnail = []
    aws, bucket, s3 = s3_mongo_helper.initialize_s3()
    temp_dir = tempfile.mkdtemp(dir=os.getcwd())
    try:
        for link in df["s3_url"]:
            # Missing values may arrive as None or as float NaN; neither has
            # .split(), so treat every non-string entry as "no thumbnail".
            if not isinstance(link, str):
                thumbnail.append(None)
                continue

            extension = link.split(".")[-1]
            if extension == "mp4":
                # Thumbnail name: basename of the link up to its first dot.
                filename = link.split("/")[-1].split(".")[0] + ".jpg"
                img_output_path = os.path.join(temp_dir, filename)
                # Grab a single frame at t=0; ffmpeg's output is discarded.
                subprocess.call(
                    [
                        "ffmpeg",
                        "-i",
                        link,
                        "-ss",
                        "00:00:00.000",
                        "-vframes",
                        "1",
                        img_output_path,
                    ],
                    stderr=subprocess.DEVNULL,
                    stdout=subprocess.DEVNULL,
                )
                s3_mongo_helper.upload_to_s3(
                    s3=s3,
                    file=img_output_path,
                    filename=filename,
                    bucket=bucket,
                    content_type="image/jpeg",
                )
                thumbnail.append(aws + bucket + "/" + filename)
            elif extension == "txt":
                thumbnail.append(None)
            else:  # if jpg/jpeg/png
                thumbnail.append(link)
    finally:
        # Always remove the scratch directory, even if ffmpeg or the upload
        # raised part-way through the loop.
        shutil.rmtree(temp_dir)

    df["thumbnail"] = np.array(thumbnail)
    # Do not truncate long URLs in the rendered table. ``None`` is the
    # supported "no limit" value; older pandas releases only accept ``-1``.
    try:
        pd.set_option("display.max_colwidth", None)
    except ValueError:
        pd.set_option("display.max_colwidth", -1)
    df_html = HTML(
        df.to_html(
            index=False,
            escape=False,
            formatters=dict(thumbnail=path_to_image_html),
            render_links=True,
        ))
    return df, df_html
def save_to_server(all_msgs: List[Msg], merged_msgs: List[Msg],
                   media_files: list, drive_id: str) -> None:
    """
    Save msgs and media to the Tattle server.
    This requires setting environment variables

    Steps:
      1. Insert the raw msgs, grouped by file, into the "whatsapp_all"
         collection (one document per file).
      2. Insert the merged msgs into the "whatsapp_merged" collection, after
         reconciling them with any msgs already stored for the same group ids.
      3. Upload every media file to S3, keyed by its hash.

    Args:
        all_msgs: raw msgs; grouped with ``group_by_file`` before insertion.
        merged_msgs: merged msgs; each must expose ``group_id`` and
            ``as_dict()``.
        media_files: dicts with ``'hash'``, ``'content'`` and
            ``'media_mime_type'`` keys.
        drive_id: stored as ``source_loc`` on every raw document.
    """

    # 0. Initialize
    _, bucket, s3 = initialize_s3()

    # 1. Insert all "raw" msgs
    all_coll = initialize_mongo(var_prefix="whatsapp_all")
    msgs_by_file = group_by_file(all_msgs)
    insert_dt = datetime.datetime.utcnow().isoformat()
    to_insert = [{
        'scrape_datetime': insert_dt,
        'source': GOOGLE_DRIVE,
        'source_loc': drive_id,
        'msgs': [m.as_dict() for m in msgs]
    } for msgs in msgs_by_file.values()]
    # insert_many() rejects an empty document list, so skip it when there is
    # nothing to write.
    if to_insert:
        all_coll.insert_many(to_insert)

    # 2. Upsert merged msgs
    merged_coll = initialize_mongo(var_prefix="whatsapp_merged")
    msg_gids = [msg.group_id for msg in merged_msgs]
    # find() returns a lazy cursor, which is always truthy and has no len();
    # materialize it so the emptiness check and the count below are valid.
    existing_msgs = list(merged_coll.find({"group_id": {"$in": msg_gids}}))
    if existing_msgs:
        logging.warning("Not overwriting %d msgs already in server.",
                        len(existing_msgs))
        merge_msgs_from_server(merged_msgs, existing_msgs)
    merged_docs = [m.as_dict() for m in merged_msgs]
    if merged_docs:
        merged_coll.insert_many(merged_docs)

    # 3. Upload media files to s3
    for fl in media_files:
        logging.info("Uploading %r", fl['hash'])
        upload_to_s3(s3, fl['content'], fl['hash'], bucket, fl['media_mime_type'])
    logging.info("Wrote %d files to S3. Done", len(media_files))
示例#3
0
def fresh_content_scraper(
    USER_ID=None,
    PASSCODE=None,
    tag_hashes=None,
    bucket_ids=None,
    pages=None,
    unix_timestamp=None,
    mode=None,
    targeting=None,
):
    """Scrape fresh Sharechat posts and store them remotely or locally.

    Args:
        USER_ID, PASSCODE: credentials forwarded to ``sharechat_helper``.
        tag_hashes: tags to scrape; replaced by the result of
            ``sharechat_helper.get_tag_hashes`` when ``targeting="bucket"``.
        bucket_ids: buckets to resolve into tag hashes (bucket targeting).
        pages, unix_timestamp: forwarded to ``sharechat_helper.get_fresh_data``.
        mode: ``"archive"`` uploads the data to S3 and MongoDB and uploads an
            HTML preview, a duplicates log and a CSV to the log bucket;
            ``"local"`` writes a pickle, an HTML preview and a CSV to the
            working directory. Any other value does nothing.
        targeting: ``"bucket"`` or ``"tag"``; selects the request delay range.

    Returns:
        The scraped DataFrame, or ``None`` when ``mode`` is not recognised or
        when archive-mode initialization fails.

    Raises:
        ValueError: if a scrape is requested with an unsupported ``targeting``
            value, or if no posts were scraped.
    """
    if targeting == "bucket":
        tag_hashes = sharechat_helper.get_tag_hashes(USER_ID, PASSCODE,
                                                     bucket_ids)
        delay = uniform(10, 15)
    elif targeting == "tag":
        delay = uniform(30, 35)
    else:
        # Checked below, and only when a scrape is actually attempted.
        delay = None

    if mode == "archive":
        print("Scraping in archive mode")
        if delay is None:
            raise ValueError(
                "targeting must be 'bucket' or 'tag', got %r" % (targeting,))
        start_time = time.time()
        # Initialize S3 and Mongo DB
        print("Initializing ...")
        try:
            aws, bucket, s3 = s3_mongo_helper.initialize_s3()
            coll = s3_mongo_helper.initialize_mongo()
        except Exception:
            print("Initialization failure")
            print(logging.traceback.format_exc())
            # Nothing can be scraped or stored without S3/Mongo handles.
            return None
        print("Initialized successfully")

        # Scrape data from Sharechat tags
        print("Scraping in progress ...")
        sharechat_df = sharechat_helper.get_fresh_data(
            USER_ID, PASSCODE, tag_hashes, pages, unix_timestamp, delay)
        if len(sharechat_df) < 1:
            raise ValueError(
                "Returned empty dataframe. No posts were scraped.")

        # Save data to S3 & Mongo DB
        s3UploadSuccess = False
        try:
            print("S3 upload in progress ...")
            sharechat_df, tagwise_duplicates = sharechat_helper.sharechat_s3_upload(
                sharechat_df, aws, bucket, s3,
                coll)  # the returned df includes s3 urls
            s3UploadSuccess = True
            print("Data uploaded to S3")
        except Exception:
            print("S3 upload failed")
            print(logging.traceback.format_exc())

        # The preview, logs and Mongo records all depend on the S3 upload, so
        # they are skipped entirely when it failed. Each of them is
        # best-effort: a failure is reported and the next step still runs.
        if s3UploadSuccess:
            aws, logbucket, s3 = sharechat_helper.initialize_s3_logbucket()
            today = datetime.utcnow().strftime("%Y%m%d")
            try:
                print("HTML file creation in progress ...")
                (
                    sharechat_df,
                    sharechat_df_html,
                ) = sharechat_helper.get_thumbnails_from_s3(sharechat_df)
                with open("sharechat_fresh_data_preview.html", "w") as f:
                    f.write(sharechat_df_html.data)
                    print("HTML file created")
                print("Uploading HTML file to S3 ...")
                sharechat_helper.upload_logs(
                    s3=s3,
                    filename="sharechat_fresh_data_preview.html",
                    key="fresh_preview_" + today,
                    bucket=logbucket,
                )
                print("HTML file uploaded")
            except Exception:
                print("HTML file upload failed")
                print(logging.traceback.format_exc())
            try:
                print("Duplicates log creation in progress ...")
                with open("tagwise_duplicates.json", "w") as fp:
                    json.dump(tagwise_duplicates, fp)
                print("Duplicates log created")
                print("Uploading duplicates log to S3 ...")
                sharechat_helper.upload_logs(
                    s3=s3,
                    filename="tagwise_duplicates.json",
                    key="fresh_duplicates_" + today,
                    bucket=logbucket,
                )
                print("Duplicates log uploaded")
            except Exception:
                print("Duplicates log upload failed")
                print(logging.traceback.format_exc())
            try:
                print("CSV file creation in progress ... ")
                sharechat_df.to_csv("sharechat_fresh_data.csv")
                print("CSV file created")
                print("Uploading CSV file to S3 ...")
                sharechat_helper.upload_logs(
                    s3=s3,
                    filename="sharechat_fresh_data.csv",
                    key="fresh_posts_" + today,
                    bucket=logbucket,
                )
                print("CSV file uploaded")
            except Exception:
                print("CSV file upload failed")
                print(logging.traceback.format_exc())
            try:
                print("MongoDB upload in progress ...")
                sharechat_helper.sharechat_mongo_upload(sharechat_df, coll)
                print("Data uploaded to MongoDB")
                print("{} posts saved".format(len(sharechat_df)))
            except Exception:
                print("MongoDB upload failed")
                print(logging.traceback.format_exc())
        print("Scraping complete")
        print("Time taken: %s seconds" % (time.time() - start_time))
        return sharechat_df

    if mode == "local":
        print("Scraping in local mode")
        if delay is None:
            raise ValueError(
                "targeting must be 'bucket' or 'tag', got %r" % (targeting,))
        start_time = time.time()
        print("Scraping in progress ...")
        sharechat_df = sharechat_helper.get_fresh_data(USER_ID, PASSCODE,
                                                       tag_hashes, pages,
                                                       unix_timestamp, delay)
        if len(sharechat_df) < 1:
            raise ValueError(
                "Returned empty dataframe. No posts were scraped.")
        # Save data locally
        sharechat_df.to_pickle("sharechat_df.pkl")
        try:
            print("HTML preview file creation in progress ...")
            (
                sharechat_df,
                sharechat_df_html,
            ) = sharechat_helper.get_thumbnails_from_sharechat(sharechat_df)
            with open("sharechat_fresh_data_preview.html", "w") as f:
                f.write(sharechat_df_html.data)
                print("HTML preview file created")
        except Exception:
            print("HTML preview file creation failed")
            print(logging.traceback.format_exc())
        try:
            print("CSV file creation in progress ... ")
            sharechat_df.to_csv("sharechat_fresh_data.csv")
            print("CSV file created")
            print("{} posts saved".format(len(sharechat_df)))
        except Exception:
            print("CSV file creation failed")
            print(logging.traceback.format_exc())
        print("Scraping complete")
        print("Time taken: %s seconds" % (time.time() - start_time))
        return sharechat_df

    # Unrecognised mode: nothing to do.
    return None
示例#4
0
def trending_content_scraper(USER_ID=None,
                             PASSCODE=None,
                             tag_hashes=None,
                             bucket_ids=None,
                             pages=None,
                             mode=None,
                             targeting=None):
    """Scrape trending Sharechat posts and store them remotely or locally.

    Args:
        USER_ID, PASSCODE: credentials forwarded to ``sharechat_helper``.
        tag_hashes: tags to scrape; replaced by the result of
            ``sharechat_helper.get_tag_hashes`` when ``targeting="bucket"``.
        bucket_ids: buckets to resolve into tag hashes (bucket targeting).
        pages: forwarded to ``sharechat_helper.get_trending_data``.
        mode: ``"archive"`` pickles the data locally, uploads it to S3 and
            MongoDB and writes an HTML preview and a CSV; ``"local"`` writes
            the pickle, HTML preview and CSV only. Any other value does
            nothing.
        targeting: ``"bucket"`` or ``"tag"``; selects the request delay range.

    Returns:
        The scraped DataFrame, or ``None`` when ``mode`` is not recognised or
        when archive-mode initialization fails.

    Raises:
        ValueError: if a scrape is requested with an unsupported ``targeting``
            value, or if no posts were scraped.
    """
    if targeting == "bucket":
        tag_hashes = sharechat_helper.get_tag_hashes(USER_ID, PASSCODE,
                                                     bucket_ids)
        delay = uniform(10, 15)
    elif targeting == "tag":
        delay = uniform(30, 35)
    else:
        # Checked below, and only when a scrape is actually attempted.
        delay = None

    if mode == "archive":
        print("Scraping in archive mode")
        if delay is None:
            raise ValueError(
                "targeting must be 'bucket' or 'tag', got %r" % (targeting,))
        start_time = time.time()
        # Initialize S3 and Mongo DB
        print("Initializing ...")
        try:
            aws, bucket, s3 = s3_mongo_helper.initialize_s3()
            coll = s3_mongo_helper.initialize_mongo()
        except Exception:
            print("Initialization failure")
            print(logging.traceback.format_exc())
            # Nothing can be scraped or stored without S3/Mongo handles.
            return None
        print("Initialized successfully")

        # Scrape data from Sharechat tags
        print("Scraping in progress ...")
        sharechat_df = sharechat_helper.get_trending_data(
            USER_ID, PASSCODE, tag_hashes, pages, delay)
        if len(sharechat_df) < 1:
            raise ValueError(
                "get_data() returned empty dataframe. No posts were scraped."
            )

        # Save data locally
        sharechat_df.to_pickle("sharechat_df.pkl")
        # Save data to S3 & Mongo DB
        s3UploadSuccess = False
        try:
            print("S3 upload in progress ...")
            # NOTE(review): fresh_content_scraper passes `coll` as a fifth
            # argument to this helper and unpacks two return values; confirm
            # which call shape matches the current helper signature.
            sharechat_df = sharechat_helper.sharechat_s3_upload(
                sharechat_df, aws, bucket,
                s3)  # the returned df includes s3 urls
            s3UploadSuccess = True
            print("Data uploaded to S3")
        except Exception:
            print("S3 upload failed")
            print(logging.traceback.format_exc())

        # The preview and the Mongo records depend on the S3 upload; each is
        # best-effort, so a failure is reported and the next step still runs.
        if s3UploadSuccess:
            try:
                print("HTML preview file creation in progress ...")
                sharechat_df, sharechat_df_html = sharechat_helper.get_thumbnails_from_s3(
                    sharechat_df)
                with open("sharechat_trending_data_preview.html",
                          "w") as f:
                    f.write(sharechat_df_html.data)
                    print("HTML preview file created")
            except Exception:
                print("HTML preview file creation failed")
                print(logging.traceback.format_exc())
            try:
                print("MongoDB upload in progress ...")
                sharechat_helper.sharechat_mongo_upload(
                    sharechat_df, coll)
                print("Data uploaded to MongoDB")
            except Exception:
                print("MongoDB upload failed")
                print(logging.traceback.format_exc())

        # The CSV is written whether or not the S3 upload succeeded.
        try:
            print("CSV file creation in progress ... ")
            sharechat_df.to_csv("sharechat_trending_data.csv")
            print("CSV file created")
            print("{} posts scraped".format(len(sharechat_df)))
        except Exception:
            print("CSV file creation failed")
            print(logging.traceback.format_exc())
        print("Scraping complete")
        print("Time taken: %s seconds" % (time.time() - start_time))
        return sharechat_df

    if mode == "local":
        print("Scraping in local mode")
        if delay is None:
            raise ValueError(
                "targeting must be 'bucket' or 'tag', got %r" % (targeting,))
        start_time = time.time()
        print("Scraping in progress ...")
        sharechat_df = sharechat_helper.get_trending_data(
            USER_ID, PASSCODE, tag_hashes, pages, delay)
        if len(sharechat_df) < 1:
            raise ValueError(
                "get_data() returned empty dataframe. No posts were scraped.")
        # Save data locally
        sharechat_df.to_pickle("sharechat_df.pkl")
        try:
            print("HTML preview file creation in progress ...")
            sharechat_df, sharechat_df_html = sharechat_helper.get_thumbnails_from_sharechat(
                sharechat_df)
            with open("sharechat_trending_data_preview.html", "w") as f:
                f.write(sharechat_df_html.data)
                print("HTML preview file created")
        except Exception:
            print("HTML preview file creation failed")
            print(logging.traceback.format_exc())
        try:
            print("CSV file creation in progress ... ")
            sharechat_df.to_csv("sharechat_trending_data.csv")
            print("CSV file created")
            print("{} posts scraped".format(len(sharechat_df)))
        except Exception:
            print("CSV file creation failed")
            print(logging.traceback.format_exc())
        print("Scraping complete")
        print("Time taken: %s seconds" % (time.time() - start_time))
        return sharechat_df

    # Unrecognised mode: nothing to do.
    return None