Example #1
    def prep_redditor(data):
        """
        Prepare Redditor data.

        Calls previously defined public methods:

            CleanData.count_words()
            PrepMutts.prep_mutts()

        Parameters
        ----------
        data: dict
            Dictionary containing extracted scrape data

        Returns
        -------
        frequencies: dict
            Dictionary containing finalized word frequencies
        """

        status = Status("Finished Redditor analysis.",
                        "Analyzing Redditor scrape.", "white")

        plt_dict = dict()

        status.start()
        for interactions in data["interactions"].values():
            PrepMutts.prep_mutts(interactions, plt_dict)

        status.succeed()
        return plt_dict
Example #2
    def _prep_raw(data, plt_dict):
        """
        Prepare raw submission comments.

        Calls previously defined public method:

            CleanData.count_words()

        Parameters
        ----------
        data: list
            List containing extracted scrape data
        plt_dict: dict
            Dictionary containing frequency data

        Returns
        -------
        None
        """

        status = Status("Finished raw submission comments analysis.",
                        "Analyzing raw submission comments scrape.", "white")

        status.start()
        for comment in data:
            CleanData.count_words("body", comment, plt_dict)

        status.succeed()
Example #3
    def prep_subreddit(data):
        """
        Prepare Subreddit data.

        Calls previously defined public method:

            CleanData.count_words()

        Parameters
        ----------
        data: list
            List containing extracted scrape data

        Returns
        -------
        frequencies: dict
            Dictionary containing finalized word frequencies
        """

        status = Status("Finished Subreddit analysis.",
                        "Analyzing Subreddit scrape.", "white")

        plt_dict = dict()

        status.start()
        for submission in data:
            CleanData.count_words("selftext", submission, plt_dict)
            CleanData.count_words("title", submission, plt_dict)

        status.succeed()
        return plt_dict
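CleanData.count_words() is defined elsewhere in URS, so a minimal stand-in helps show what these prep methods accumulate. The helper below is hypothetical: it only tallies whitespace-separated words, while the real method presumably also normalizes the text.

    def count_words(field, obj, plt_dict):
        # Tally each whitespace-separated word from one field of a scraped object.
        for word in obj[field].split():
            plt_dict[word] = plt_dict.get(word, 0) + 1

    plt_dict = dict()
    submission = {"selftext": "hello world", "title": "hello again"}

    count_words("selftext", submission, plt_dict)
    count_words("title", submission, plt_dict)

    print(plt_dict)   # {'hello': 2, 'world': 1, 'again': 1}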
Example #4
    def __init__(self, args, submission, url):
        """
        Initialize variables used in later methods:

            self._submission: PRAW submission object

        Calls replace_more() method on submission object to get nested comments.

        Parameters
        ----------
        args: Namespace
            Namespace object containing all arguments that were defined in the CLI
        submission: PRAW submission object
        url: str
            String denoting the submission's url

        Returns
        -------
        None
        """

        self._args = args
        self._url = url

        more_comments_status = Status(
            "Finished resolving instances of MoreComments.",
            Fore.CYAN + Style.BRIGHT + "Resolving instances of MoreComments. This may take a while. Please wait.",
            "cyan"
        )

        more_comments_status.start()
        self._submission = submission
        self._submission.comments.replace_more(limit = None)
        more_comments_status.succeed()
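replace_more() is a real PRAW call: with limit = None it keeps resolving MoreComments placeholders until the full comment tree is loaded, which is why it can take a while. A minimal sketch of the same step outside URS, assuming valid API credentials (the credentials and URL below are placeholders):

    import praw

    reddit = praw.Reddit(
        client_id = "YOUR_CLIENT_ID",          # placeholder credentials
        client_secret = "YOUR_CLIENT_SECRET",
        user_agent = "demo script"
    )

    submission = reddit.submission(url = "https://www.reddit.com/r/redditdev/comments/abc123/example/")

    # Resolve every MoreComments instance so comments.list() returns the full tree.
    submission.comments.replace_more(limit = None)
    print(len(submission.comments.list()))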
Example #5
    def format_json(args, skeleton, submissions, subreddit):
        """
        Format submission metadata for JSON export.

        Parameters
        ----------
        args: Namespace
            Namespace object containing all arguments that were defined in the CLI
        skeleton: dict
            Dictionary containing all Subreddit scrape data
        submissions: list
            List containing submission objects
        subreddit: PRAW Subreddit object

        Returns
        -------
        None
        """

        format_status = Status("Finished formatting data for JSON export.",
                               "Formatting data for JSON export.", "white")

        format_status.start()
        skeleton["data"] = submissions

        if args.rules:
            FormatJSON._add_subreddit_rules(skeleton, subreddit)

        format_status.succeed()
Example #6
    def format_csv(submissions):
        """
        Format submission metadata for CSV export.

        Parameters
        ----------
        submissions: list
            List containing submission objects

        Returns
        -------
        overview: dict
            Dictionary containing submission data
        """

        format_status = Status("Finished formatting data for CSV export.",
                               "Formatting data for CSV export.", "white")

        overview = dict()

        format_status.start()
        for submission in submissions:
            for field, metadata in submission.items():
                if field not in overview:
                    overview[field] = []

                overview[field].append(metadata)

        format_status.succeed()
        return overview
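The loop pivots a list of per-submission dictionaries into one dictionary of columns, which maps directly onto CSV output. A toy run with made-up submissions:

    submissions = [
        {"title": "First post", "score": 10},
        {"title": "Second post", "score": 25}
    ]

    overview = dict()
    for submission in submissions:
        for field, metadata in submission.items():
            # setdefault() is equivalent to the membership check above.
            overview.setdefault(field, []).append(metadata)

    print(overview)   # {'title': ['First post', 'Second post'], 'score': [10, 25]}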
Example #7
    def prep_subreddit(data, file):
        """
        Prepare Subreddit data.

        Calls previously defined public method:

            CleanData.count_words()

        Parameters
        ----------
        data: dict
            Dictionary containing extracted scrape data
        file: str
            String denoting the filepath

        Returns
        -------
        frequencies: dict
            Dictionary containing finalized word frequencies
        """

        status = Status("Finished Subreddit analysis.",
                        "Analyzing Subreddit scrape.", "white")

        plt_dict = dict()

        status.start()
        for submission in data:
            CleanData.count_words("selftext", submission, plt_dict)
            CleanData.count_words("title", submission, plt_dict)

        status.succeed()
        return dict(
            sorted(plt_dict.items(), key=lambda item: item[1], reverse=True))
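The final sorted() call is what finalizes the frequencies: it orders the dictionary from most to least frequent, relying on dict preserving insertion order (Python 3.7+). For example:

    plt_dict = {"cat": 3, "dog": 7, "bird": 1}

    frequencies = dict(
        sorted(plt_dict.items(), key=lambda item: item[1], reverse=True))

    print(frequencies)   # {'dog': 7, 'cat': 3, 'bird': 1}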
Example #8
    def generate(args):
        """
        Generate frequencies.

        Calls previously defined public methods:

            ExportFrequencies.export()
            PrintConfirm().confirm()
            Sort().create_csv()
            Sort().create_json()
            Sort().get_data()
            Sort().name_and_create_dir()
        
        Calls public methods from external modules:

            AnalyticsTitles.f_title()

        Parameters
        ----------
        args: Namespace
            Namespace object containing all arguments used in the CLI

        Returns
        -------
        None
        """

        AnalyticsTitles.f_title()

        for file in args.frequencies:
            f_type, filename = Sort().name_and_create_dir(args, file)
            plt_dict = Sort().get_data(file)

            Halo().info("Generating frequencies.")
            print()
            data = Sort().create_csv(plt_dict) \
                if args.csv \
                else Sort().create_json(file, plt_dict)

            export_status = Status(
                Style.BRIGHT + Fore.GREEN +
                "Frequencies exported to %s." % "/".join(
                    filename.split("/")
                    [filename.split("/").index("scrapes"):]),
                "Exporting frequencies.", "white")

            export_status.start()
            ExportFrequencies.export(data, f_type, filename)
            export_status.succeed()
            print()
Example #9
    def save_wordcloud(self, analytics_dir, scrape_file, wc):
        """
        Save wordcloud to file.

        Calls a public method from an external module:

            GetPath.name_file()

        Parameters
        ----------
        analytics_dir: str
            String denoting the path to the directory in which the analytical
            data will be written
        scrape_file: list
            List containing scrape files and file formats to generate wordcloud with
        wc: WordCloud
            Wordcloud instance

        Returns
        -------
        new_filename: str
            String denoting the filename for the exported wordcloud
        """

        filename = GetPath.name_file(analytics_dir, scrape_file[0])

        split_path = list(Path(filename).parts)

        split_filename = split_path[-1].split(".")
        split_filename[-1] = scrape_file[-1]

        split_path[-1] = ".".join(split_filename)
        new_filename = "/".join(split_path)

        export_status = Status(
            Style.BRIGHT + Fore.GREEN + f"Wordcloud exported to {new_filename}.",
            "Exporting wordcloud.",
            "white"
        )

        export_status.start()
        wc.to_file(new_filename)
        export_status.succeed()
        print()
        
        return new_filename
Example #10
    def _create_directory_tree(date_dir, tree):
        """
        Create the directory Tree based on the date_dir Path using iterative 
        depth-first search.

        Parameters
        ----------
        date_dir: str
            String denoting the path to the date directory
        tree: Tree
            Tree instance

        Returns
        -------
        None
        """

        build_tree_status = Status("Displaying directory tree.",
                                   f"Building directory tree for {date_dir}.",
                                   "cyan")

        stack = DateTree._create_stack(date_dir, tree)

        visited = set()
        visited.add(Path(date_dir))

        build_tree_status.start()
        while stack:
            current = stack.pop(0)
            current_path, current_tree = current[0], current[1]

            if current_path in visited:
                continue
            elif current_path.is_dir():
                sub_tree = current_tree.add(f"[bold blue]{current_path.name}")
                sub_paths = DateTree._create_stack(current_path, sub_tree)

                stack = sub_paths + stack
            elif current_path.is_file():
                file_size = current_path.stat().st_size
                current_tree.add(
                    f"[bold]{current_path.name} [{decimal(file_size)}]")

                visited.add(current_path)

        build_tree_status.succeed()
        print()
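Stripped of the Tree rendering, the traversal reduces to a stack where a directory's children are prepended, so the walk goes deep before wide. A self-contained sketch using only pathlib:

    from pathlib import Path

    def walk_dfs(root):
        stack = sorted(Path(root).iterdir())

        while stack:
            current = stack.pop(0)
            if current.is_dir():
                # Prepending children makes this depth-first; appending
                # them instead would make it breadth-first.
                stack = sorted(current.iterdir()) + stack

            print(current)

    walk_dfs(".")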
Example #11
    def sort_structured(submission, url):
        """
        Sort all comments in structured format. 
        
        Calls previously defined public methods:

            CommentNode()
            Forest()
            Forest().seed()
            CreateComment.create()

        Calls a public method from an external module:

            EncodeNode().encode()

        Parameters
        ----------
        submission: PRAW submission object
        url: str
            String denoting the submission's url

        Returns
        -------
        replies: list
            List containing `CommentNode`s
        """
        
        forest = Forest(submission, url)

        seed_status = Status(
            "Forest has fully matured.",
            Fore.CYAN + Style.BRIGHT + "Seeding Forest.",
            "cyan"
        )

        seed_status.start()
        for comment in submission.comments.list():
            comment_node = CommentNode(CreateComment.create(comment))
            EncodeNode().encode(comment_node)

            forest.seed(comment_node)

        seed_status.succeed()
        return forest.root.replies
Example #12
    def prep_redditor(data, file):
        """
        Prepare Redditor data.

        Calls previously defined public method:

            CleanData.count_words()

        Parameters
        ----------
        data: dict
            Dictionary containing extracted scrape data
        file: str
            String denoting the filepath

        Returns
        -------
        frequencies: dict
            Dictionary containing finalized word frequencies
        """

        status = Status("Finished Redditor analysis.",
                        "Analyzing Redditor scrape.", "white")

        plt_dict = dict()

        status.start()
        for interactions in data["interactions"].values():
            for obj in interactions:
                ### Indicates there is valid data in this field.
                if isinstance(obj, dict):
                    if obj["type"] == "submission":
                        CleanData.count_words("selftext", obj, plt_dict)
                        CleanData.count_words("title", obj, plt_dict)
                    elif obj["type"] == "comment":
                        CleanData.count_words("body", obj, plt_dict)
                ### Indicates this field is forbidden.
                elif isinstance(obj, str):
                    continue

        status.succeed()
        return dict(
            sorted(plt_dict.items(), key=lambda item: item[1], reverse=True))
Example #13
    def _prep_structured(data, plt_dict):
        """
        An iterative implementation of depth-first search to prepare structured
        comments.

        Parameters
        ----------
        data: list
            List containing extracted scrape data
        plt_dict: dict
            Dictionary containing frequency data

        Returns
        -------
        None
        """

        status = Status("Finished structured submission comments analysis.",
                        "Analyzing structured submission comments scrape.",
                        "white")

        status.start()
        for comment in data:
            CleanData.count_words("body", comment, plt_dict)

            stack = []
            stack.append(comment)

            visited = []
            visited.append(comment)

            while stack:
                current_comment = stack.pop(0)

                for reply in current_comment["replies"]:
                    CleanData.count_words("body", reply, plt_dict)

                    if reply not in visited:
                        stack.insert(0, reply)
                        visited.append(reply)

        status.succeed()
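A toy run of the same traversal, using a hypothetical nested comment structure and a plain word tally in place of CleanData.count_words():

    comment = {
        "body": "parent comment",
        "replies": [
            {"body": "first reply", "replies": [
                {"body": "nested reply", "replies": []}
            ]},
            {"body": "second reply", "replies": []}
        ]
    }

    plt_dict = dict()

    stack = [comment]
    while stack:
        current = stack.pop(0)
        for word in current["body"].split():
            plt_dict[word] = plt_dict.get(word, 0) + 1

        # Prepend replies so nested comments are visited before siblings.
        stack = current["replies"] + stack

    print(plt_dict)   # {'parent': 1, 'comment': 1, 'first': 1, 'reply': 3, ...}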
Example #14
    def prep_livestream(data):
        """
        Prepare livestream data.

        Parameters
        ----------
        data: list
            List containing extracted scrape data

        Returns
        -------
        frequencies: dict
            Dictionary containing word frequencies
        """

        status = Status("Finished livestream analysis.",
                        "Analyzing livestream scrape.", "white")

        plt_dict = {}

        status.start()
        PrepMutts.prep_mutts(data, plt_dict)
        status.succeed()

        return plt_dict
Example #15
    def initialize_wordcloud(file, scrape_type):
        """
        Initialize wordcloud by setting dimensions, max font size, and generating
        it from word frequencies.

        Calls a public method from an external module:

            PrepData.prep()

        Parameters
        ----------
        file: list
            List containing scrape files and file formats to generate wordcloud with
        scrape_type: str
            String denoting the scrape type

        Returns
        -------
        wc: WordCloud
            WordCloud instance
        """

        frequencies = PrepData.prep(file[0], scrape_type)

        initialize_status = Status(
            "Generated wordcloud.",
            "Generating wordcloud.",
            "white"
        )

        initialize_status.start()
        wordcloud = WordCloud(
            height = 1200,
            max_font_size = 400,
            width = 1600
        ).generate_from_frequencies(frequencies)
        initialize_status.succeed()

        return wordcloud
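WordCloud and generate_from_frequencies() come from the wordcloud package. A standalone sketch with made-up frequencies, assuming the package is installed:

    from wordcloud import WordCloud

    frequencies = {"python": 42, "reddit": 17, "scraper": 9}

    wordcloud = WordCloud(
        height = 1200,
        max_font_size = 400,
        width = 1600
    ).generate_from_frequencies(frequencies)

    wordcloud.to_file("demo_wordcloud.png")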
Example #16
    def save_wordcloud(self, file, wc):
        """
        Save wordcloud to file.

        Calls public methods from external modules:

            GetPath.name_file()
            InitializeDirectory.make_analytics_directory()

        Parameters
        ----------
        file: list
            List containing scrape files and file formats to generate wordcloud with
        wc: WordCloud
            Wordcloud instance

        Returns
        -------
        filename: str
            String denoting the filename for the exported wordcloud
        """

        date_dir, filename = GetPath.name_file(file[1], file[0], "wordclouds")
        
        export_status = Status(
            Style.BRIGHT + Fore.GREEN +
            "Wordcloud exported to %s." % "/".join(
                filename.split("/")
                [filename.split("/").index("scrapes"):]),
            "Exporting wordcloud.",
            "white"
        )

        export_status.start()
        InitializeDirectory.make_analytics_directory(date_dir, "wordclouds")
        wc.to_file(filename)
        export_status.succeed()
        print()
        
        return filename
Example #17
    def validate(object_list, reddit, scraper_type):
        """
        Check if Subreddit(s), Redditor(s), or submission(s) exist and catch PRAW 
        exceptions. Log invalid Reddit objects to `urs.log` if applicable.

        Calls previously defined public method:

            Validation.check_existence()

        Parameters
        ----------
        object_list: list
            List of Reddit objects to check
        reddit: Reddit object
            Reddit instance created by PRAW API credentials
        scraper_type: str
            String denoting the scraper type

        Returns
        -------
        invalid: list
            List of invalid Reddit objects
        valid: list
            List of valid Reddit objects
        """

        object_type = "submission" \
            if scraper_type == "comments" \
            else scraper_type.capitalize()

        check_status = Status(
            "Finished %s validation." % object_type,
            "Validating %s(s)" % object_type,
            "white"
        )

        check_status.start()

        logging.info("Validating %s(s)..." % object_type)
        logging.info("")

        invalid, valid = Validation.check_existence(object_list, reddit, scraper_type)
        
        check_status.succeed()
        print()

        if invalid:
            warning_message = "The following %ss were not found and will be skipped:" % object_type

            print(Fore.YELLOW + Style.BRIGHT + warning_message)
            print(Fore.YELLOW + Style.BRIGHT + "-" * len(warning_message))
            print(*invalid, sep = "\n")

            logging.warning("Failed to validate the following %ss:" % object_type)
            logging.warning("%s" % (invalid))
            logging.warning("Skipping.")
            logging.info("")

        if not valid:
            logging.critical("ALL %sS FAILED VALIDATION." % object_type.upper())
            Errors.n_title(object_type + "s")
            logging.critical("NO %sS LEFT TO SCRAPE." % object_type.upper())
            logging.critical("ABORTING URS.\n")
            
            quit()

        return invalid, valid
Example #18
    def _make_json_skeleton(args, limit, submission, url):
        """
        Create a skeleton for JSON export. Include scrape details at the top.

        Parameters
        ----------
        args: Namespace
            Namespace object containing all arguments that were defined in the CLI
        limit: str
            Integer in string form denoting n_results, or the RAW format
        submission: PRAW submission object
        url: str
            String denoting the submission's url

        Returns
        -------
        skeleton: dict
            Dictionary containing scrape settings and all scrape data
        """

        metadata_status = Status(
            "Extracted submission metadata.",
            "Extracting submission metadata.",
            "white"
        )

        metadata_status.start()
        skeleton = {
            "scrape_settings": {
                "n_results": int(limit) \
                    if int(limit) > 0 \
                    else "all",
                "style": "structured" \
                    if not args.raw \
                    else "raw",
                "url": url
            },
            "data": {
                "submission_metadata": {
                    "author": "u/" + submission.author.name \
                        if hasattr(submission.author, "name") \
                        else "[deleted]",
                    "created_utc": convert_time(submission.created_utc),
                    "distinguished": submission.distinguished,
                    "edited": submission.edited \
                        if submission.edited == False \
                        else convert_time(submission.edited),
                    "is_original_content": submission.is_original_content,
                    "is_self": submission.is_self,
                    "link_flair_text": submission.link_flair_text,
                    "locked": submission.locked,
                    "num_comments": submission.num_comments,
                    "nsfw": submission.over_18,
                    "permalink": submission.permalink,
                    "score": submission.score,
                    "selftext": submission.selftext,
                    "spoiler": submission.spoiler,
                    "stickied": submission.stickied,
                    "subreddit": submission.subreddit.display_name,
                    "title": submission.title,
                    "upvote_ratio": submission.upvote_ratio
                },
                "comments": None
            }
        }
        metadata_status.succeed()

        return skeleton
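For reference, a rough sketch of the returned skeleton once populated, with placeholder values and the metadata fields abbreviated:

    import json

    skeleton = {
        "scrape_settings": {
            "n_results": "all",
            "style": "structured",
            "url": "https://www.reddit.com/r/redditdev/comments/abc123/example/"
        },
        "data": {
            "submission_metadata": {
                "author": "u/someone",
                "created_utc": "2021-06-01 12:00:00",
                "num_comments": 42,
                "nsfw": False,
                "score": 128,
                "title": "An example title"
            },
            "comments": None
        }
    }

    print(json.dumps(skeleton, indent = 4))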