Example #1
 def __init__(self):
     self.api = psaw.PushshiftAPI()
     self._subreddit = ''
     self._limit = 10
     self._start = int(dt.datetime(2020, 1, 1).timestamp())
     self._end = 0
     self._query = ''
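A minimal sketch (not part of the original class) of the kind of query these defaults would drive, assuming import datetime as dt and import psaw at module level; the subreddit name is illustrative:

api = psaw.PushshiftAPI()
start = int(dt.datetime(2020, 1, 1).timestamp())
# 'learnpython' is a stand-in for whatever self._subreddit would eventually hold
submissions = list(api.search_submissions(after=start, subreddit='learnpython', limit=10))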
Example #2
def get_urls(subreddit='gifs', max_urls=10000, end_time=None, score_threshold=5):

    api = psaw.PushshiftAPI()

    if end_time is None:
        end_time = int(datetime.datetime.now().timestamp())

    query = api.search_submissions(before=end_time,
                                   subreddit=subreddit,
                                   filter=['url', 'score', 'title', 'permalink', 'subreddit'],
                                   limit=max_urls,
                                   score='>%d' % score_threshold,
                                   is_self=False,
                                   over_18=False)
    seen = {}

    for i, subm in enumerate(tqdm.tqdm(query, total=max_urls)):
        url = subm.url

        if url in seen:
            continue

        seen[url] = True

        # weird issue with psaw/pushshift that breaks score=">2"
        if subm.score < score_threshold:
            continue

        entry = { 'url': url, 'score': subm.score, 'title': subm.title, 'permalink': subm.permalink, 'subreddit': subm.subreddit }

        yield entry
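A hedged usage sketch for the generator above (the module-level imports of psaw, datetime and tqdm are assumed and not shown in the snippet):

for entry in get_urls(subreddit='gifs', max_urls=100, score_threshold=10):
    print(entry['score'], entry['url'])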
Example #3
def get_comment_mapping(author: str, num_comments: int):
    fic_id_to_submissions, submissions_to_fics = defaultdict(set), defaultdict(
        set)
    bar = progressbar.ProgressBar(max_value=num_comments)
    api = psaw.PushshiftAPI()
    errors = []
    for i, comment in enumerate(
            api.search_comments(
                author=author,
                filter=['score', 'id', 'link_id', 'body', 'permalink'],
                limit=num_comments)):
        bar.update(i)
        for fic_name, fic_id in re.findall(comment_regex, comment.body):
            try:
                # Validate that all attributes exist (some of these will not for removed/deleted submissions)
                _, _, _, _, _ = comment.score, comment.id, comment.link_id, comment.body, comment.permalink
                f = FicComment(name=fic_name,
                               id=fic_id,
                               score=comment.score,
                               permalink=comment.permalink)
                fic_id_to_submissions[fic_id].add(comment.link_id)
                submissions_to_fics[comment.link_id].add(f)
            except Exception as e:
                errors.append(str(e))
                continue
    bar.finish()

    if len(errors) > 0:
        print("Errors:\n" + "\n".join(errors))
        print(f"{len(errors)} errors.")

    return fic_id_to_submissions, submissions_to_fics
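comment_regex and FicComment are module-level names that are not shown in this snippet. A purely hypothetical sketch of definitions that would satisfy the function (the real pattern and class may differ):

import collections
import re

# Hypothetical: a hashable record type, since FicComment instances are added to a set above
FicComment = collections.namedtuple('FicComment', ['name', 'id', 'score', 'permalink'])
# Hypothetical: any pattern with two capture groups (name, id) fits the findall() usage above
comment_regex = re.compile(r'\[([^\]]+)\]\(https://www\.fanfiction\.net/s/(\d+)')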
Example #4
def reddit_scr(keyword):
    '''
    this function scrapes Reddit submission data for a keyword using psaw
    :param keyword: str, keyword used for searching
    :return: str
    '''
    # use psaw's API
    api = psaw.PushshiftAPI()
    start_time = int(dt.datetime(2020, 1, 1).timestamp())
    output_raw = list(
        api.search_submissions(after=start_time, q=keyword, limit=100000000))
    # output = api.search_comments(after=start_time, q=keyword, limit=1)
    output = []
    curr = []  # this list is used for holding an entry before putting it into the final csv file
    for obj in output_raw:
        if obj.subreddit == 'Comcast_Xfinity':
            # convert the timestamp to a more convenient format
            t = time.localtime(int(obj.created_utc))
            t2 = time.strftime("%Y-%m-%d %H:%M:%S", t)
            tf = dt.datetime.strptime(t2, "%Y-%m-%d %H:%M:%S")
            # combine the attributes to form an entry
            curr.append(tf)
            curr.append(obj.subreddit)
            curr.append(obj.title)
            curr.append(obj.selftext)
            # append the entry into output
            output.append(curr)
            curr = []
    # form the csv file
    with open('reddit_data4.csv', 'a+', newline='') as file:
        write = csv.writer(file)
        write.writerows(output)
    return 'Done'
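Note that the keyword only drives the Pushshift search; the subreddit filter is hardcoded to 'Comcast_Xfinity'. A hedged usage sketch (the keyword is illustrative):

reddit_scr('outage')  # appends matching Comcast_Xfinity submissions to reddit_data4.csv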
Example #5
 def __init__(self, secrets_manager: RedditSecretsManager):
     secrets = secrets_manager.get_secrets()
     reddit = praw.Reddit(
         user_agent="Comment Extraction (by /u/balindwalinstalin)",
         client_id=secrets["REDDIT_CLIENT_ID"],
         client_secret=secrets["REDDIT_CLIENT_SECRET"],
     )
     self.reddit = psaw.PushshiftAPI(reddit)
Example #6
 def get_comments(self, submission):
     comment_search = psaw.PushshiftAPI(r=self.reddit).search_comments(
         submission_id=submission.id, return_batch=True)
     for comment in comment_search:
         author = comment.author.name
         if author == 'AutoModerator':
             continue
         yield {
             'comment_author': author,
             'comment_body': comment.body,
             'comment_score': comment.score,
             'comment_created_utc': comment.created_utc,
             'comment_id': comment.id,
             'comment_parent_id': comment.parent_id
         }
Example #7
    def __init__(self, client_id, client_secret, username, password, bot_name):

        self.client_id = client_id
        self.client_secret = client_secret
        self.username = username
        self.password = password
        self.bot_name = bot_name
        self.reddit = praw.Reddit(
            client_id=self.client_id,
            client_secret=self.client_secret,
            user_agent=f"ChangeMeClient/0.1 by /u/{self.bot_name}",
            username=self.username,
            password=self.password)
        self.PS = psaw.PushshiftAPI()

        self.frame = wx.Frame(parent=None, title="Reddit", size=(325, 255))
        panel = wx.Panel(self.frame)

        subreddit_text = wx.StaticText(panel,
                                       label="Subreddit: ",
                                       pos=(20, 20))
        limit_text = wx.StaticText(panel, label="Limit: ", pos=(20, 50))
        directory_text = wx.StaticText(panel, label="Directory:", pos=(20, 80))
        self.download_text = wx.StaticText(panel,
                                           label="Waiting...",
                                           pos=(20, 140))
        self.progress_text = wx.StaticText(panel, label="0/0", pos=(20, 180))

        self.subreddit_textctrl = wx.TextCtrl(panel, pos=(100, 15))
        self.limit_textctrl = wx.TextCtrl(panel, pos=(100, 45))

        directory_button = wx.Button(panel, label="Select", pos=(100, 75))
        directory_button.Bind(wx.EVT_BUTTON, self._set_directory)
        scrape_button = wx.Button(panel, label="Scrape", pos=(100, 105))
        scrape_button.Bind(wx.EVT_BUTTON, self._start_scrape)

        self.progress_bar = wx.Gauge(panel,
                                     range=100,
                                     pos=(20, 160),
                                     size=(265, 15))
        self.directory = os.getcwd()

        self.scrape_thread = threading.Thread(target=self._scrape, daemon=True)

        self.frame.Show()
Example #8
def main():
    reddit = praw.Reddit(client_id="jSTLDT5NQzi6LA",
                         client_secret="Q2VODbHrd_Zykjj0zcWi0z7M3MA",
                         password="******",
                         user_agent="markover",
                         username="******")
    api = psaw.PushshiftAPI(reddit)
    db = sqlite3.connect('r_france.sqlite')
    gen = api.search_comments(subreddit='france')
    for comment in progressbar.progressbar(gen):
        if comment.author is None:
            author = ''
        else:
            author = comment.author.name
        if not_in(db, comment.id):
            db.cursor().execute('INSERT INTO comments VALUES(?, ?, ?, ?)',
                                (comment.id, author, comment.body,
                                 comment.created_utc)).close()
            db.commit()
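The INSERT above implies a four-column comments table; a hedged sketch of a schema that would satisfy it (the actual schema is not shown in the source):

db = sqlite3.connect('r_france.sqlite')
db.execute('CREATE TABLE IF NOT EXISTS comments '
           '(id TEXT PRIMARY KEY, author TEXT, body TEXT, created_utc INTEGER)')
db.commit()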
Example #9
def init_watch_pushshift(subreddit: str, hours: int) -> str:
    """
    Initiate watch of subreddit using Pushshift, create CSV, return filename.
    """

    import psaw

    print(f"\nInitializing watch on {subreddit}")
    hours_ago = NOW.subtract(hours=hours)
    hours_ago_as_timestamp = hours_ago.int_timestamp
    print(f"fetching initial posts from {subreddit}")
    pushshift = psaw.PushshiftAPI()
    submissions = pushshift.search_submissions(
        after=hours_ago_as_timestamp,
        subreddit=subreddit,
        filter=["id", "subreddit", "author", "created_utc"],
    )

    submissions_d = collections.defaultdict(list)
    for submission in submissions:
        created_utc_human = pendulum.from_timestamp(
            submission.created_utc).format("YYYYMMDD HH:mm:ss")

        submissions_d["id"].append(submission.id)
        submissions_d["subreddit"].append(submission.subreddit)
        submissions_d["author_p"].append(submission.author)
        submissions_d["del_author_p"].append("FALSE")
        submissions_d["created_utc"].append(created_utc_human)
        submissions_d["found_utc"].append(NOW_STR)
        submissions_d["del_author_r"].append("FALSE")
        submissions_d["del_author_r_utc"].append("NA")
        submissions_d["del_text_r"].append("FALSE")
        submissions_d["del_text_r_utc"].append("NA")
        submissions_d["rem_text_r"].append("FALSE")
        submissions_d["rem_text_r_utc"].append("NA")
        submissions_d["removed_by_category_r"].append("FALSE")

    watch_fn = (f"{DATA_DIR}/watch-{subreddit}-{NOW.format('YYYYMMDD')}"
                f"_n{len(submissions_d['id'])}.csv")
    watch_df = pd.DataFrame.from_dict(submissions_d)
    watch_df.to_csv(watch_fn, index=True, encoding="utf-8-sig", na_rep="NA")
    return watch_fn
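The function relies on module-level constants (NOW, NOW_STR, DATA_DIR) that are not shown. A minimal sketch of plausible definitions, followed by a call (all values are assumptions):

import pendulum

NOW = pendulum.now("UTC")                    # assumed pendulum DateTime
NOW_STR = NOW.format("YYYYMMDD HH:mm:ss")    # assumed timestamp string
DATA_DIR = "data"                            # assumed output directory

watch_csv = init_watch_pushshift("AskReddit", hours=24)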
Example #10
def download_subreddit_comments(subreddit, start, count, destination, min_word_count=None):
    api = psaw.PushshiftAPI()

    fields = [
        'author',
        'author_flair_text',
        'body',
        'created_utc',
        'gildings',
        'id',
        'parent_id',
        'permalink',
        'score',
        'subreddit',
        'subreddit_id',
    ]
    start_epoch = int(start.timestamp())

    gen = api.search_comments(subreddit=subreddit, filter=fields, after=start_epoch)

    output_path = destination / subreddit
    if not output_path.exists():
        print("creating output directory: {}".format(output_path))
        output_path.mkdir()

    comments_saved = 0
    while comments_saved < count:
        comment = next(gen)
        if min_word_count and len(comment.body.split()) < min_word_count:
            continue

        if comments_saved % 50 == 0:
            print("comment {}: {}".format(comments_saved, comment.body))

        output_file = output_path / "{}.json".format(comment.id)
        json.dump(comment.d_, output_file.open('w', encoding='utf8'))
        comments_saved += 1
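A hedged usage sketch; start must be a datetime and destination a pathlib.Path, as implied by .timestamp() and the / operator above:

import datetime
import pathlib

download_subreddit_comments(
    subreddit='askscience',
    start=datetime.datetime(2020, 1, 1),
    count=200,
    destination=pathlib.Path('.'),  # comments land in ./askscience/<comment_id>.json
    min_word_count=20,
)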
Example #11
def get_psaw_api():
    return psaw.PushshiftAPI()
Example #12
def main():
    arg_parser = argparse.ArgumentParser(
        description='Reddit canned response bot')
    arg_parser.add_argument(
        dest='bot_config_file',
        type=str,
        help=
        'json bot config file (see examples/minimal_example_bot_config.json)')
    arg_parser.add_argument(
        '--dry-run',
        dest='dry_run',
        type=int,
        const=0,
        default=None,
        nargs='?',
        help='Doesn\'t actually reply, just prints what it would\'ve sent.'
        ' A number of hours prior to "now" may also be supplied to '
        'iterate over old comments first e.g. "--dry-run=168"')
    arg_parser.add_argument('--verbose',
                            dest='verbose',
                            action='store_true',
                            help='Display additional debug messages')
    arg_parser.add_argument('--skip-tests',
                            dest='skip_tests',
                            action='store_true',
                            help='Skips tests')
    arguments = arg_parser.parse_args()

    # Reddit credentials should be supplied via praw.ini file.
    reddit = praw.Reddit()
    pushshift = psaw.PushshiftAPI(reddit)

    # Setup logging to stdout and rotating files
    log_stream_handler = logging.StreamHandler(sys.stdout)
    log_stream_handler.setLevel(
        logging.DEBUG if arguments.verbose else logging.INFO)
    log_stream_handler.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    log.addHandler(log_stream_handler)
    # Rotate file log through 3 * 10MiB files
    log_file_handler = logging.handlers.RotatingFileHandler(
        pathlib.Path(reddit.config.username).with_suffix('.log'),
        maxBytes=10 * 1048576,
        backupCount=2)
    log_file_handler.setLevel(logging.DEBUG)
    log_file_handler.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    log.addHandler(log_file_handler)

    with open(arguments.bot_config_file) as f:
        bot_config = json.load(f)  # type: Dict[str, typing.Any]

    canned_responses = [
        CannedResponse(**kwargs) for kwargs in bot_config['canned_responses']
    ]
    reply_generator = ReplyGenerator(
        canned_responses, bot_config.get('comment_mention_reply', None),
        bot_config['postfix'])

    tests = bot_config.get('tests', None)
    if tests and not arguments.skip_tests:
        # Run tests
        log.setLevel(logging.WARNING)  # Hide test log output
        suite = unittest.TestSuite()
        suite.addTest(BotTests(tests, reply_generator))
        unittest.TextTestRunner().run(suite)
        log.setLevel(logging.DEBUG)  # Restore log output

    max_comments_per_submission = bot_config.get(
        'max_comments_per_submission', DEFAULT_MAX_COMMENTS_PER_SUBMISSION)
    delete_unliked_comment_score = bot_config.get(
        'delete_unliked_comment_score', DEFAULT_DELETE_UNLIKED_COMMENT_SCORE)
    dry_run = arguments.dry_run is not None
    start_time_offset_hours = 0 if arguments.dry_run is None else -arguments.dry_run
    bot = Bot(pushshift,
              reply_generator,
              bot_config['subreddits'],
              max_comments_per_submission,
              delete_unliked_comment_score,
              dry_run=dry_run,
              start_time_offset_hours=start_time_offset_hours)
    bot.run()
Example #13
client_id = args.client_id
client_secret = args.client_secret
if args.credentials.exists() and args.credentials.is_file():
    with open(args.credentials) as credentials_file:
        credentials = json.load(credentials_file)
        client_id = credentials.get('client_id', client_id)
        client_secret = credentials.get('client_secret', client_secret)

# Make sure that we have a client id AND a client secret.
assert client_id is not None and client_secret is not None

reddit = praw.Reddit(client_id=client_id,
                     client_secret=client_secret,
                     user_agent=args.user_agent)
api = psaw.PushshiftAPI(reddit)

SUBMISSION_SERIALIZE_ATTRIBUTES = [
    'created_utc', 'id', 'name', 'permalink', 'score', 'title', 'upvote_ratio',
    'url', 'selftext'
]

COMMENT_SERIALIZE_ATTRIBUTES = [
    'body', 'created_utc', 'id', 'is_submitter', 'link_id', 'parent_id',
    'permalink', 'score', 'subreddit_id'
]


def _serialize_reddit_object(obj, attributes, print_func=print):
    data = {attribute: getattr(obj, attribute) for attribute in attributes}
    if obj.author is not None:
Example #14
import simplejson as json
import psaw
import requests

from flask import Flask
from flask import request, jsonify, abort

import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore
from google.cloud import pubsub_v1

app = Flask(__name__)
firebase_admin.initialize_app(credentials.ApplicationDefault())
reddit = psaw.PushshiftAPI()
db = firestore.client()
session = requests.Session()
publisher = pubsub_v1.PublisherClient()


@app.route('/r/<subreddit>')
def run(subreddit):
  before = request.args.get('before', type=int)

  submissions = reddit.search_submissions(
    before=before,
    subreddit=subreddit,
    limit=500,
    filter=[
      'id',
Example #15
 def __init__(self, credentials, size, fields=None):
     self.ps_api = psaw.PushshiftAPI(max_results_per_request=size)
     self.reddit = praw.Reddit(**credentials)
     if fields is None:
         self.fields = self.default_fields
Example #16
    and save the urls to a txt
    
    Arguments:
        query {instance of api.search_submissions} -- where: api = psaw.PushshiftAPI()
        save_urls_path {str} -- path to save urls
    """
    reddit_to_db = RedditToDb(save_to_table)
    with tqdm.tqdm() as pbar:
        for subm in query:
            reddit_to_db.insert_submission_into_db(subm)
            pbar.update(1)


if __name__ == "__main__":

    api = psaw.PushshiftAPI()
    end_time = int(datetime.datetime(2019, 6, 1).timestamp())
    # start_time = int(datetime.datetime(2000, 6, 1).timestamp())
    # search = "tl & dr"
    # search = "selftext:(tl & dr)"
    # search = "selftext:tl & dr"
    # search = "selftext:tl" 0
    # self_text_search = "tl & dr" bunch of meaningless results
    # self_text_search = "' tl' & 'dr'" 3 results

    #-------------self text search
    # TLDRs in self text (=> usually summarizing the reddit data)
    # self_text_search = "'tl' & 'dr'"
    # query = api.search_submissions(
    #     # q=search,
    #     selftext=self_text_search,
Example #17
def collect_comment_star_citizen(save_file, limit=None, used_saved=False, append=False,
                                do_roadmap=True, flair_list=["OFFICIAL"]):

    comments_list = []
    if used_saved and not append:
        try:
            comments_list = load_all_comments(db_name=save_file)
            return comments_list
        except Exception:
            print("Could not retrieve saved comments, getting comments normally")

    with open("credentials.json") as f:
        credentials = json.loads(f.read())

    reddit = praw.Reddit(client_id=credentials["id"],
                         client_secret=credentials["secret"],
                         user_agent="Comment Extraction")
    api = psaw.PushshiftAPI(reddit)

    title_filter = re.compile(".*(Star Citizen Roadmap Update|Squadron 42 Roadmap Update).*")
    official_title_filter = re.compile(".*([Ee]vocati +[Pp]atch|([Pp]atch|Release|P[Tt][Uu])[ -]+[nN]otes).*")

    initial_epoch = int(dt.datetime(2012, 10, 20).timestamp())

    raw_comments_list = []
    num_submission = 0

    try:
        if do_roadmap:
            for submission in api.search_submissions(author="Odysseus-Ithaca", subreddit="starcitizen", after=initial_epoch):
                if submission is None:
                    break
                if title_filter.match(submission.title) is None:
                    continue

                print("submission {} - flair {}".format(submission.title, submission.link_flair_text))
        
                submission.comments.replace_more(limit=None, threshold=1)
                raw_comments_list += submission.comments.list()

                num_submission += 1
                if limit is not None and len(raw_comments_list) >= limit:
                    break

            print("{} submission for odysseus done".format(num_submission))

        if len(flair_list) > 0:

            raw_comments_list, num_submission = retrieve_flair(api, raw_comments_list, num_submission, flair_list,
                                                        lambda s:official_title_filter.match(s.title), after_cond=initial_epoch)

    except KeyboardInterrupt:
        print("Received keyboard interrupt - stopping scraping")
    finally:
        print("{} submission done in total".format(num_submission))


        #Retrieve all attributes
        tmp_list = []
        for c in raw_comments_list:
            attributes_raw = vars(c)
            attributes = {}
            #filter lazy attributes
            attributes["submission_title"] = c.submission.title
            attributes["submission_name"] = c.submission.name
            attributes["submission_flair"] = c.submission.link_flair_text
            for key, value in attributes_raw.items():
                if key.startswith("_"):
                    continue
                elif key == "subreddit" or key == "author":
                    continue

                attributes[key] = value

            tmp_list.append(attributes)

        comments_list += tmp_list

        #Cache results
        save_comments(comments_list, append, db_name=save_file)

        return comments_list
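A hedged usage sketch; a credentials.json file with Reddit "id" and "secret" keys must exist next to the script, and load_all_comments, save_comments and retrieve_flair are module-level helpers not shown here:

comments = collect_comment_star_citizen("star_citizen_comments.db", limit=500)
print("collected {} comments".format(len(comments)))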