Example #1
def funnyshortjokes_test():
    joke_parser = RawFunnyShortJokesJokeReader()

    # Build the processing pipeline: clean the text, extract nouns,
    # lowercase, then attach GloVe embeddings.
    pipeline = Pipeline()
    pipeline.add(Clean())
    pipeline.add(AddNouns())
    pipeline.add(Lowercase())
    pipeline.add(AddGloveEmbeddings())

    for file in get_raw_funnyshortjokes_joke_files():
        # os.path.basename is portable, unlike splitting on "/".
        filename = os.path.basename(file)
        output = os.path.join(get_project_data_path(),
                              "funnyshortjokes_processed", filename)
        writer = ProcessedFunnyShortJokesJokeWriter()

        jokes = joke_parser.read(file)
        jokes = (pipeline.process(joke) for joke in jokes)
        writer.write(jokes, output)

        reader = ProcessedFunnyShortJokesJokeReader()
        jokes = reader.read(output)

        # jokes = list(filter(lambda joke: joke is not None, jokes))

        for joke in jokes:
            print(joke)
            print("Nouns:", joke.nouns_)
            print("Embeddings:", joke.embeddings_)

        break  # smoke test: stop after the first file
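Example #1 presupposes a sequential Pipeline in which each stage transforms a joke and hands it to the next. The project's actual implementation is not shown on this page; the following is a minimal sketch of what such an interface might look like, where the stage protocol and the joke's text attribute are assumptions, not code taken from the source:

class Pipeline:
    """Apply a list of processing stages to a joke, in order."""

    def __init__(self):
        self._stages = []

    def add(self, stage):
        self._stages.append(stage)

    def process(self, joke):
        # Each stage takes a joke and returns the (possibly modified) joke.
        for stage in self._stages:
            joke = stage.process(joke)
        return joke


class Lowercase:
    """Hypothetical stage: lowercase the joke text."""

    def process(self, joke):
        joke.text = joke.text.lower()  # attribute name is an assumption
        return joke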
Example #2
    def spider_closed(self, reason):
        # Scrapy signal handler: on shutdown, dump each category's jokes
        # into its own JSON file, named after the category.
        filename_template = os.path.join(get_project_data_path(),
                                         "funnyshortjokes_raw", "%s.json")

        for category, jokes in self.jokes.items():
            filename = filename_template % category.replace(" ", "_")

            with open(filename, "w", encoding="utf-8") as outfile:
                json.dump(jokes, outfile, indent=4, sort_keys=True)
Example #3
    def _write_jokes_to_file(self, jokes, filename):
        # Number the jokes sequentially from 1, keeping the source id
        # alongside the other fields; enumerate replaces the manual
        # counter, and "source_id" avoids shadowing the id() builtin.
        jokes_dict = {}
        for joke_id, (source_id, premise, punchline, subreddit) in \
                enumerate(jokes, start=1):
            jokes_dict[joke_id] = {"id": source_id,
                                   "premise": premise,
                                   "punchline": punchline,
                                   "subreddit": subreddit}

        file = os.path.join(get_project_data_path(), "reddit_raw", filename)
        with open(file, "w", encoding="utf-8") as outfile:
            json.dump(jokes_dict, outfile, indent=4, sort_keys=True)
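Because JSON object keys must be strings, json.dump writes the integer keys of jokes_dict out as strings. A self-contained illustration (the joke fields are made up):

import json

jokes_dict = {1: {"id": "abc123",
                  "premise": "Why did the chicken cross the road?",
                  "punchline": "To get to the other side.",
                  "subreddit": "Jokes"}}

# The integer key 1 becomes the string "1" in the serialized output.
print(json.dumps(jokes_dict, indent=4, sort_keys=True))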
Example #4
    def start_requests(self):
        # Recover already-scraped category names from the JSON files on
        # disk (file names use underscores in place of spaces).
        data_directory = os.path.join(get_project_data_path(),
                                      "funnyshortjokes_raw")
        parsed_categories = {
            category.split(".")[0].replace("_", " ")
            for category in os.listdir(data_directory)
        }

        # Only request categories that have not been scraped yet.
        urls = [
            self.base_url + "/c/" + category.lower().replace(" ", "-")
            for category in self.categories
            if category not in parsed_categories
        ]

        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)
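Examples #2 and #4 rely on a reversible mapping between category names, URL slugs, and file names: spaces become hyphens in the request URL and underscores on disk, and start_requests recovers the original name by reversing the underscore substitution. A self-contained illustration (the category name is made up):

category = "Yo Momma"

url_slug = category.lower().replace(" ", "-")        # "yo-momma", used in the request URL
stored = "%s.json" % category.replace(" ", "_")      # "Yo_Momma.json", written by spider_closed
recovered = stored.split(".")[0].replace("_", " ")   # "Yo Momma", parsed back in start_requests

assert recovered == category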
Example #5
def run_joke_scraper():
    base_url = "https://www.reddit.com/r/"
    subreddits = ["Jokes", "DirtyJokes", "cleanjokes", "AntiJokes", "Antihumor",
                  "darkjokes", "MeanJokes", "AntiAntiJokes", "dadjokes", "ProgrammerHumor",
                  "MathJokes", "MommaJokes", "3amjokes", "ShortCleanFunny", "badjokes",
                  "deadbabyjokes", "DarkHumor", "Punny", "pun", "ScienceJokes", "chemistryjokes",
                  "intellectualdadjokes", "ProgrammerDadJokes", "nsfwdadjokes",
                  "dadjokesinhistory", "Hearthstonedadjokes", "dadsouls", "warcraftdadjokes",
                  "dota2dadjokes", "DestinyDadJokes", "FFXIVDadjokes", "Falloutdadjokes", "DMDadJokes",
                  "skyrimdadjokes", "OverwatchDadjokes", "DarkDadJokes", "CivDadJokes", "TrahearneJokes",
                  "StarWarsDadJokes", "eu4dadjokes", "shubreddit", "momjokes"]

    # A subreddit counts as scraped once its JSON dump exists on disk.
    subreddits_scraped = {filename.split(".")[0] for filename in
                          os.listdir(os.path.join(get_project_data_path(), "reddit_raw"))}

    start_urls = [base_url + subreddit for subreddit in subreddits if subreddit not in subreddits_scraped]

    # With n_jobs=1 the scrape calls run one after another; see the
    # illustration below.
    scraper = RedditJokeScraper(subreddits, 1000)
    jokes = Parallel(n_jobs=1)(delayed(scraper.scrape)(start_url) for start_url in start_urls)
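Note that joblib's Parallel(n_jobs=1) executes the delayed calls sequentially in the current process, so the last line above is effectively a plain loop; raising n_jobs spreads the calls across workers. A minimal, self-contained illustration of the pattern (the square function is just a stand-in for scraper.scrape):

from joblib import Parallel, delayed

def square(x):
    return x * x

# n_jobs=1 degenerates to a sequential loop; n_jobs=2 uses two workers.
results = Parallel(n_jobs=2)(delayed(square)(x) for x in range(5))
print(results)  # [0, 1, 4, 9, 16]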
Example #6
def main():
    id_to_category = {
        1: "funny_chuck_norris_jokes",
        2: "funny_yo_momma_jokes",
        3: "funny_blonde_jokes",
        4: "funny_one_liner_jokes",
        5: "funny_short_jokes",
        6: "funny_long_jokes",
        7: "funny_redneck_jokes",
        9: "funny_dirty_jokes",
        10: "funny_racial_jokes",
        12: "funny_comebacks",
        14: "funny_pick_up_lines",
        15: "funny_celebrity_jokes",
        16: "funny_anti_humor_jokes",
        17: "funny_animal_jokes",
        18: "funny_puns"
    }

    # Skip categories that already have a raw JSON dump on disk.
    scraped_categories = {
        category.split(".")[0]
        for category in os.listdir(
            os.path.join(get_project_data_path(), "kickasshumor_raw"))
    }

    category_ids = [
        id for id, category in id_to_category.items()
        if category not in scraped_categories
    ]

    base_url = "https://www.kickasshumor.com/c/%i"
    urls = [base_url % id for id in category_ids]

    for url in urls:
        scraper = KickassHumorJokeScraper()
        jokes = scraper.scrape(url)
        print(len(jokes))
Example #7
def get_raw_reddit_joke_files():
    directory = os.path.join(get_project_data_path(), "reddit_raw")
    yield from _get_joke_files(directory)
Example #8
def get_raw_kickasshumor_joke_files():
    directory = os.path.join(get_project_data_path(), "kickasshumor_raw")
    yield from _get_joke_files(directory)
Example #9
def get_raw_funnyshortjokes_joke_files():
    directory = os.path.join(get_project_data_path(), "funnyshortjokes_raw")
    yield from _get_joke_files(directory)
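Examples #7-#9 all delegate to a shared _get_joke_files helper that is not shown on this page. A plausible sketch, assuming it yields the full paths of the JSON files in a directory (this is a guess, not the project's code):

import os

def _get_joke_files(directory):
    # Yield the path of every .json file in the directory.
    for filename in sorted(os.listdir(directory)):
        if filename.endswith(".json"):
            yield os.path.join(directory, filename)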
Example #10
    def _write_jokes_to_file(self):
        # One JSON file per category; dashes in the category slug become
        # underscores in the file name.
        file = os.path.join(get_project_data_path(), "kickasshumor_raw",
                            "%s.json" % self.category.replace("-", "_"))

        with open(file, "w", encoding="utf-8") as outfile:
            json.dump(self.jokes, outfile, indent=4, sort_keys=True)