def test_class_ops_dateutil(self):
    """Check Timestamp's alternate constructors against their datetime twins.

    Each ``Timestamp`` classmethod exercised here (``now``, ``utcnow``,
    ``today``, ``utcfromtimestamp``, ``fromtimestamp``, ``combine``) is called
    next to the same-named ``datetime`` classmethod, and the two results must
    be less than one second apart.
    """

    def compare(x, y):
        # x and y usually come from two separate clock reads, so they can fall
        # on opposite sides of a rounding boundary.  Checking the truncated
        # difference (|x - y| < 1s) avoids the sporadic failure that comparing
        # independently rounded seconds produces.
        assert int((Timestamp(x).value - Timestamp(y).value) / 1e9) == 0

    compare(Timestamp.now(), datetime.now())
    compare(Timestamp.now("UTC"), datetime.now(tzutc()))
    compare(Timestamp.utcnow(), datetime.utcnow())
    compare(Timestamp.today(), datetime.today())

    # One fixed epoch value so both libraries convert the same instant.
    current_time = calendar.timegm(datetime.now().utctimetuple())
    compare(
        Timestamp.utcfromtimestamp(current_time),
        datetime.utcfromtimestamp(current_time),
    )
    compare(
        Timestamp.fromtimestamp(current_time), datetime.fromtimestamp(current_time)
    )

    date_component = datetime.utcnow()
    time_component = (date_component + timedelta(minutes=10)).time()
    compare(
        Timestamp.combine(date_component, time_component),
        datetime.combine(date_component, time_component),
    )
def test_class_ops_dateutil(self):
    """Check Timestamp's alternate constructors against their datetime twins.

    Each ``Timestamp`` classmethod exercised here is called next to the
    same-named ``datetime`` classmethod, and the two results must be less than
    one second apart.  ``Timestamp.utcfromtimestamp`` is additionally expected
    to emit a FutureWarning mentioning "timezone-aware Timestamp with UTC".
    """

    def compare(x, y):
        # x and y usually come from two separate clock reads, so they can fall
        # on opposite sides of a rounding boundary.  Checking the truncated
        # difference (|x - y| < 1s) avoids the sporadic failure that comparing
        # independently rounded seconds produces.
        assert int((Timestamp(x).value - Timestamp(y).value) / 1e9) == 0

    compare(Timestamp.now(), datetime.now())
    compare(Timestamp.now("UTC"), datetime.now(tzutc()))
    compare(Timestamp.utcnow(), datetime.utcnow())
    compare(Timestamp.today(), datetime.today())

    # One fixed epoch value so both libraries convert the same instant.
    current_time = calendar.timegm(datetime.now().utctimetuple())

    msg = "timezone-aware Timestamp with UTC"
    with tm.assert_produces_warning(FutureWarning, match=msg):
        # GH#22451
        ts_utc = Timestamp.utcfromtimestamp(current_time)
    compare(
        ts_utc,
        datetime.utcfromtimestamp(current_time),
    )
    compare(
        Timestamp.fromtimestamp(current_time), datetime.fromtimestamp(current_time)
    )

    date_component = datetime.utcnow()
    time_component = (date_component + timedelta(minutes=10)).time()
    compare(
        Timestamp.combine(date_component, time_component),
        datetime.combine(date_component, time_component),
    )
def to_json(line):
    """Clean up one decoded JSON record and return it as a new dict.

    The record is updated in place first (timestamps converted, deleted
    markers blanked) and then copied without its 'retrieved_on' key.
    """
    def _from_epoch(seconds):
        # Epoch seconds (possibly given as a string) -> pandas Timestamp.
        return Timestamp.utcfromtimestamp(int(seconds))

    line['created_utc'] = _from_epoch(line['created_utc'])

    # A falsy 'edited' value means there is no edit time to convert.
    edited_at = line['edited']
    if edited_at:
        line['edited'] = _from_epoch(edited_at)
    else:
        line['edited'] = NaT

    # '[deleted]' placeholders become None (missing text data).
    for field in ('author', 'body'):
        if line[field] == '[deleted]':
            line[field] = None

    # Only 'retrieved_on' is dropped from the returned record.
    return dissoc(line, 'retrieved_on')
def to_json(line):
    """Clean up one decoded JSON record and return it as a new dict.

    The record is updated in place first (timestamps converted, deleted
    markers blanked) and then copied without its 'retrieved_on' key.
    """
    def _from_epoch(seconds):
        # Epoch seconds (possibly given as a string) -> pandas Timestamp.
        return Timestamp.utcfromtimestamp(int(seconds))

    line['created_utc'] = _from_epoch(line['created_utc'])

    # A falsy 'edited' value means there is no edit time to convert.
    edited_at = line['edited']
    if edited_at:
        line['edited'] = _from_epoch(edited_at)
    else:
        line['edited'] = NaT

    # '[deleted]' placeholders become None (missing text data).
    for field in ('author', 'body'):
        if line[field] == '[deleted]':
            line[field] = None

    # Only 'retrieved_on' is dropped from the returned record.
    return dissoc(line, 'retrieved_on')
def time_to_time_stamp(time):
    """Convert a RFC3339 or UNIX timestamp to pandas Timestamp.

    ``time`` is first handed to ``Timestamp`` directly, which handles RFC3339
    strings.  If that raises ValueError and ``time`` is a string, it is read
    as UNIX time written in decimal seconds with an optional fractional part
    (e.g. ``"1490000000.123456789"``) and converted with exact integer
    arithmetic, so nanosecond digits are preserved.

    Returns a pandas Timestamp; for UNIX input it is timezone-aware in UTC.

    Raises ValueError when the input matches neither format.
    """
    try:
        # This will work with RFC3339
        return Timestamp(time)
    except ValueError:
        if not isinstance(time, str):
            raise

    # This will work with UNIX.  The value is assembled from integers rather
    # than via float() + utcfromtimestamp(): a float cannot hold nanosecond
    # digits at this magnitude, and utcfromtimestamp() is already
    # timezone-aware on recent pandas, so localizing its result fails.
    whole, _, fraction = time.strip().partition('.')
    if fraction and not fraction.isdigit():
        raise ValueError("Invalid UNIX timestamp: %r" % (time,))
    # Right-pad (or truncate) the fraction to nine digits, i.e. nanoseconds.
    fraction_ns = int(fraction.ljust(9, '0')[:9]) if fraction else 0
    if whole.startswith('-'):
        # The fraction carries the sign of the whole-seconds part.
        fraction_ns = -fraction_ns
    total_ns = int(whole) * 10 ** 9 + fraction_ns
    return Timestamp(total_ns, unit='ns', tz='UTC')
def test_class_ops_pytz(self):
    """Check Timestamp's alternate constructors against their datetime twins.

    Each ``Timestamp`` classmethod exercised here (``now``, ``utcnow``,
    ``today``, ``utcfromtimestamp``, ``fromtimestamp``, ``combine``) is called
    next to the same-named ``datetime`` classmethod, and the two results must
    be less than one second apart.
    """
    def compare(x, y):
        # x and y usually come from two separate clock reads, so they can fall
        # on opposite sides of a whole-second boundary.  Checking the
        # truncated difference (|x - y| < 1s) avoids the sporadic failure that
        # comparing independently truncated seconds produces.
        assert int((Timestamp(x).value - Timestamp(y).value) / 1e9) == 0

    compare(Timestamp.now(), datetime.now())
    compare(Timestamp.now('UTC'), datetime.now(timezone('UTC')))
    compare(Timestamp.utcnow(), datetime.utcnow())
    compare(Timestamp.today(), datetime.today())

    # One fixed epoch value so both libraries convert the same instant.
    current_time = calendar.timegm(datetime.now().utctimetuple())
    compare(Timestamp.utcfromtimestamp(current_time),
            datetime.utcfromtimestamp(current_time))
    compare(Timestamp.fromtimestamp(current_time),
            datetime.fromtimestamp(current_time))

    date_component = datetime.utcnow()
    time_component = (date_component + timedelta(minutes=10)).time()
    compare(Timestamp.combine(date_component, time_component),
            datetime.combine(date_component, time_component))
def test_class_ops_dateutil(self):
    """Check Timestamp's alternate constructors against their datetime twins.

    Each ``Timestamp`` classmethod exercised here (``now``, ``utcnow``,
    ``today``, ``utcfromtimestamp``, ``fromtimestamp``, ``combine``) is called
    next to the same-named ``datetime`` classmethod, and the two results must
    be less than one second apart.
    """
    def compare(x, y):
        # x and y usually come from two separate clock reads, so they can fall
        # on opposite sides of a rounding boundary.  Checking the truncated
        # difference (|x - y| < 1s) avoids the sporadic failure that comparing
        # independently rounded seconds produces.
        assert int((Timestamp(x).value - Timestamp(y).value) / 1e9) == 0

    compare(Timestamp.now(), datetime.now())
    compare(Timestamp.now('UTC'), datetime.now(tzutc()))
    compare(Timestamp.utcnow(), datetime.utcnow())
    compare(Timestamp.today(), datetime.today())

    # One fixed epoch value so both libraries convert the same instant.
    current_time = calendar.timegm(datetime.now().utctimetuple())
    compare(Timestamp.utcfromtimestamp(current_time),
            datetime.utcfromtimestamp(current_time))
    compare(Timestamp.fromtimestamp(current_time),
            datetime.fromtimestamp(current_time))

    date_component = datetime.utcnow()
    time_component = (date_component + timedelta(minutes=10)).time()
    compare(Timestamp.combine(date_component, time_component),
            datetime.combine(date_component, time_component))
def test_class_ops_pytz(self):
    """Timestamp's alternate constructors must agree with datetime's.

    Every pair of results is required to differ by less than one second.
    """

    def assert_within_one_second(left, right):
        delta_ns = Timestamp(left).value - Timestamp(right).value
        assert int(delta_ns / 1e9) == 0

    # Clock-based constructors.
    assert_within_one_second(Timestamp.now(), datetime.now())
    assert_within_one_second(Timestamp.now("UTC"), datetime.now(timezone("UTC")))
    assert_within_one_second(Timestamp.utcnow(), datetime.utcnow())
    assert_within_one_second(Timestamp.today(), datetime.today())

    # Epoch-based constructors all share one fixed epoch value.
    epoch_seconds = calendar.timegm(datetime.now().utctimetuple())

    # GH#22451
    with tm.assert_produces_warning(
        FutureWarning, match="timezone-aware Timestamp with UTC"
    ):
        pd_utc = Timestamp.utcfromtimestamp(epoch_seconds)
    assert_within_one_second(pd_utc, datetime.utcfromtimestamp(epoch_seconds))

    pd_local = Timestamp.fromtimestamp(epoch_seconds)
    assert_within_one_second(pd_local, datetime.fromtimestamp(epoch_seconds))

    # Support tz kwarg in Timestamp.fromtimestamp (passed positionally)
    pd_positional_tz = Timestamp.fromtimestamp(epoch_seconds, "UTC")
    assert_within_one_second(
        pd_positional_tz, datetime.fromtimestamp(epoch_seconds, utc)
    )

    # Support tz kwarg in Timestamp.fromtimestamp (passed by keyword)
    pd_keyword_tz = Timestamp.fromtimestamp(epoch_seconds, tz="UTC")
    assert_within_one_second(
        pd_keyword_tz, datetime.fromtimestamp(epoch_seconds, utc)
    )

    # combine(): a date plus a time taken ten minutes later.
    base = datetime.utcnow()
    later_time = (base + timedelta(minutes=10)).time()
    assert_within_one_second(
        Timestamp.combine(base, later_time),
        datetime.combine(base, later_time),
    )
def top_comments(drug=None,n=5):
    """retrieves top n comments for given drug, returns parseable format.

    Ranks top comments by score normalized to subreddit population: each
    comment's score is divided by its subreddit's subscriber count raised
    to the power 0.33, and the n highest-ranked rows are yielded.

    KWARGS:
        drug: string or None. Drug selector (case-insensitive).
            * None: all comments in database, regardless of drug.
            * 'antidepressant': currently runs the same unfiltered query
                as None (see the NOTE(review) in the body).
            * [drug name]: name is looked up in _drug_dict; only comments
                joined to a Mentions row whose column for that drug is
                True are selected.
            Default None.
        n: int. number of top comments to return. Default 5.

    YIELDS:
        Tuples (author, body, created_utc, score, subreddit), where
        created_utc is the string form of a pandas Timestamp.

    RAISES:
        ValueError: On invalid drug selection. Because this function is a
            generator, it is raised on first iteration, not at call time.
    """
    base_query = ("SELECT c.author,c.body,c.created_utc,c.score,c.subreddit,"
        "s.subscribers FROM Comments c "
        "JOIN Subreddits s ON s.subreddit=c.subreddit")

    # Build and validate the query before connecting, so an invalid drug
    # selection cannot leave an open connection behind.
    if drug is None or drug.upper() == 'ANTIDEPRESSANT':
        # NOTE(review): 'antidepressant' is not filtered here, whereas
        # top_comments_simple restricts it to Mentions rows with count=0.
        # Confirm which behavior is intended.
        query = base_query
    else:
        gen = _drug_dict.get(drug.upper(),None)
        if gen is None:
            raise ValueError('Invalid drug selection.')
        # gen comes from the _drug_dict lookup, not directly from the caller.
        query = (base_query + " JOIN Mentions m ON m.id=c.id "
            "WHERE m.%s=True" % gen.lower())

    conn = pms.connect(
        host='localhost',
        user='******',
        passwd='',
        db='empath',
        charset='utf8',
        init_command='SET NAMES UTF8')
    try:
        data = pandas.read_sql(query,conn)
    finally:
        # Release the connection even when the query fails.
        conn.close()

    data['normscore'] = data.score/data.subscribers**.33
    data = data.sort_values(by='normscore',ascending=False)[:n]

    for _, entry in data.iterrows():
        created_utc = str(Timestamp.utcfromtimestamp(int(entry.created_utc)))
        yield (entry.author,entry.body,created_utc,entry.score,entry.subreddit)
def top_comments_simple(drug=None,n=5):
    """retrieves top n comments for given drug, returns parseable format.

    Ranks top comments by raw score (ordering and LIMIT are done by the SQL
    query).

    KWARGS:
        drug: string or None. Drug selector (case-insensitive).
            * None: all comments in database, regardless of drug.
            * 'antidepressant': comments joined to a Mentions row with
                count=0.
            * [drug name]: name is looked up in _drug_dict; only comments
                joined to a Mentions row whose column for that drug is
                True are selected.
            Default None.
        n: int. number of top comments to return; interpolated directly
            into the LIMIT clause. Default 5.

    YIELDS:
        Tuples (author, body, created_utc, score, subreddit), where
        created_utc is the string form of a pandas Timestamp.

    RAISES:
        ValueError: On invalid drug selection. Because this function is a
            generator, it is raised on first iteration, not at call time.
    """
    # Build and validate the query before connecting, so an invalid drug
    # selection cannot leave an open connection behind.
    if drug is None:
        query = ("SELECT author,body,created_utc,score,subreddit from Comments "
            "ORDER BY score DESC "
            "LIMIT %s" % n)
    elif drug.upper() == 'ANTIDEPRESSANT':
        query = ("SELECT c.author,c.body,c.created_utc,c.score,c.subreddit "
            "FROM Comments c "
            "JOIN Mentions m on m.id=c.id "
            "WHERE m.count=0 "
            "ORDER BY c.score DESC "
            "LIMIT %s" % n)
    else:
        gen = _drug_dict.get(drug.upper(),None)
        if gen is None:
            raise ValueError('Invalid drug selection.')
        # gen comes from the _drug_dict lookup, not directly from the caller.
        query = ("SELECT c.author,c.body,c.created_utc,c.score,c.subreddit "
            "FROM Comments c "
            "JOIN Mentions m on m.id=c.id "
            "WHERE m.%s=True "
            "ORDER BY c.score DESC "
            "LIMIT %s" % (gen.lower(),n))

    conn = pms.connect(
        host='localhost',
        user='******',
        passwd='',
        db='empath',
        charset='utf8',
        init_command='SET NAMES UTF8')
    try:
        cur = conn.cursor()
        cur.execute(query)
        # Pull every row before closing, so iteration never depends on a
        # cursor whose connection is already closed.
        rows = cur.fetchall()
    finally:
        # Release the connection even when the query fails.
        conn.close()

    for author,body,created_utc,score,subreddit in rows:
        created_utc = str(Timestamp.utcfromtimestamp(int(created_utc)))
        yield (author,body,created_utc,score,subreddit)
def top_comments(drug=None, n=5):
    """retrieves top n comments for given drug, returns parseable format.

    Ranks top comments by score normalized to subreddit population: each
    comment's score is divided by its subreddit's subscriber count raised
    to the power 0.33, and the n highest-ranked rows are yielded.

    KWARGS:
        drug: string or None. Drug selector (case-insensitive).
            * None: all comments in database, regardless of drug.
            * 'antidepressant': currently runs the same unfiltered query
                as None (see the NOTE(review) in the body).
            * [drug name]: name is looked up in _drug_dict; only comments
                joined to a Mentions row whose column for that drug is
                True are selected.
            Default None.
        n: int. number of top comments to return. Default 5.

    YIELDS:
        Tuples (author, body, created_utc, score, subreddit), where
        created_utc is the string form of a pandas Timestamp.

    RAISES:
        ValueError: On invalid drug selection. Because this function is a
            generator, it is raised on first iteration, not at call time.
    """
    base_query = (
        "SELECT c.author,c.body,c.created_utc,c.score,c.subreddit,"
        "s.subscribers FROM Comments c "
        "JOIN Subreddits s ON s.subreddit=c.subreddit"
    )

    # Build and validate the query before connecting, so an invalid drug
    # selection cannot leave an open connection behind.
    if drug is None or drug.upper() == "ANTIDEPRESSANT":
        # NOTE(review): "antidepressant" is not filtered here, whereas
        # top_comments_simple restricts it to Mentions rows with count=0.
        # Confirm which behavior is intended.
        query = base_query
    else:
        gen = _drug_dict.get(drug.upper(), None)
        if gen is None:
            raise ValueError("Invalid drug selection.")
        # gen comes from the _drug_dict lookup, not directly from the caller.
        query = (
            base_query + " JOIN Mentions m ON m.id=c.id "
            "WHERE m.%s=True" % gen.lower()
        )

    conn = pms.connect(
        host="localhost",
        user="******",
        passwd="",
        db="empath",
        charset="utf8",
        init_command="SET NAMES UTF8",
    )
    try:
        data = pandas.read_sql(query, conn)
    finally:
        # Release the connection even when the query fails.
        conn.close()

    data["normscore"] = data.score / data.subscribers ** 0.33
    data = data.sort_values(by="normscore", ascending=False)[:n]

    for _, entry in data.iterrows():
        created_utc = str(Timestamp.utcfromtimestamp(int(entry.created_utc)))
        yield (entry.author, entry.body, created_utc, entry.score, entry.subreddit)
def top_comments_simple(drug=None, n=5):
    """retrieves top n comments for given drug, returns parseable format.

    Ranks top comments by raw score (ordering and LIMIT are done by the SQL
    query).

    KWARGS:
        drug: string or None. Drug selector (case-insensitive).
            * None: all comments in database, regardless of drug.
            * 'antidepressant': comments joined to a Mentions row with
                count=0.
            * [drug name]: name is looked up in _drug_dict; only comments
                joined to a Mentions row whose column for that drug is
                True are selected.
            Default None.
        n: int. number of top comments to return; interpolated directly
            into the LIMIT clause. Default 5.

    YIELDS:
        Tuples (author, body, created_utc, score, subreddit), where
        created_utc is the string form of a pandas Timestamp.

    RAISES:
        ValueError: On invalid drug selection. Because this function is a
            generator, it is raised on first iteration, not at call time.
    """
    # Build and validate the query before connecting, so an invalid drug
    # selection cannot leave an open connection behind.
    if drug is None:
        query = (
            "SELECT author,body,created_utc,score,subreddit from Comments "
            "ORDER BY score DESC "
            "LIMIT %s" % n
        )
    elif drug.upper() == "ANTIDEPRESSANT":
        query = (
            "SELECT c.author,c.body,c.created_utc,c.score,c.subreddit "
            "FROM Comments c "
            "JOIN Mentions m on m.id=c.id "
            "WHERE m.count=0 "
            "ORDER BY c.score DESC "
            "LIMIT %s" % n
        )
    else:
        gen = _drug_dict.get(drug.upper(), None)
        if gen is None:
            raise ValueError("Invalid drug selection.")
        # gen comes from the _drug_dict lookup, not directly from the caller.
        query = (
            "SELECT c.author,c.body,c.created_utc,c.score,c.subreddit "
            "FROM Comments c "
            "JOIN Mentions m on m.id=c.id "
            "WHERE m.%s=True "
            "ORDER BY c.score DESC "
            "LIMIT %s" % (gen.lower(), n)
        )

    conn = pms.connect(
        host="localhost",
        user="******",
        passwd="",
        db="empath",
        charset="utf8",
        init_command="SET NAMES UTF8",
    )
    try:
        cur = conn.cursor()
        cur.execute(query)
        # Pull every row before closing, so iteration never depends on a
        # cursor whose connection is already closed.
        rows = cur.fetchall()
    finally:
        # Release the connection even when the query fails.
        conn.close()

    for author, body, created_utc, score, subreddit in rows:
        created_utc = str(Timestamp.utcfromtimestamp(int(created_utc)))
        yield (author, body, created_utc, score, subreddit)