示例#1
0
    def test_class_ops_dateutil(self):
        """Check Timestamp's class-level constructors against datetime's.

        Each ``Timestamp`` classmethod used below (``now``, ``utcnow``,
        ``today``, ``utcfromtimestamp``, ``fromtimestamp``, ``combine``) is
        called next to the ``datetime`` method of the same name, and the two
        results must be less than one second apart.  The UTC zone is given
        to ``datetime`` as a dateutil ``tzutc`` instance.
        """

        def compare(x, y):
            # Compare the signed difference rather than rounding each side on
            # its own: two readings taken microseconds apart can fall on
            # opposite sides of a rounding boundary and then differ by a
            # whole second, which made the check intermittently fail.
            delta_ns = Timestamp(x).value - Timestamp(y).value
            assert int(delta_ns / 1e9) == 0

        compare(Timestamp.now(), datetime.now())
        compare(Timestamp.now("UTC"), datetime.now(tzutc()))
        compare(Timestamp.utcnow(), datetime.utcnow())
        compare(Timestamp.today(), datetime.today())

        # One integer number of seconds, fed to both libraries below.
        current_time = calendar.timegm(datetime.now().utctimetuple())
        compare(
            Timestamp.utcfromtimestamp(current_time),
            datetime.utcfromtimestamp(current_time),
        )
        compare(
            Timestamp.fromtimestamp(current_time), datetime.fromtimestamp(current_time)
        )

        # combine(): same date, with a time of day ten minutes later.
        date_component = datetime.utcnow()
        time_component = (date_component + timedelta(minutes=10)).time()
        compare(
            Timestamp.combine(date_component, time_component),
            datetime.combine(date_component, time_component),
        )
示例#2
0
    def test_class_ops_dateutil(self):
        """Check Timestamp's class-level constructors against datetime's.

        Each ``Timestamp`` classmethod used below is called next to the
        ``datetime`` method of the same name, and the two results must be
        less than one second apart.  ``Timestamp.utcfromtimestamp`` is
        additionally required to emit a ``FutureWarning`` (GH#22451).  The
        UTC zone is given to ``datetime`` as a dateutil ``tzutc`` instance.
        """

        def compare(x, y):
            # Compare the signed difference rather than rounding each side on
            # its own: two readings taken microseconds apart can fall on
            # opposite sides of a rounding boundary and then differ by a
            # whole second, which made the check intermittently fail.
            delta_ns = Timestamp(x).value - Timestamp(y).value
            assert int(delta_ns / 1e9) == 0

        compare(Timestamp.now(), datetime.now())
        compare(Timestamp.now("UTC"), datetime.now(tzutc()))
        compare(Timestamp.utcnow(), datetime.utcnow())
        compare(Timestamp.today(), datetime.today())

        # One integer number of seconds, fed to both libraries below.
        current_time = calendar.timegm(datetime.now().utctimetuple())

        msg = "timezone-aware Timestamp with UTC"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            # GH#22451
            ts_utc = Timestamp.utcfromtimestamp(current_time)

        compare(
            ts_utc,
            datetime.utcfromtimestamp(current_time),
        )
        compare(
            Timestamp.fromtimestamp(current_time),
            datetime.fromtimestamp(current_time),
        )

        # combine(): same date, with a time of day ten minutes later.
        date_component = datetime.utcnow()
        time_component = (date_component + timedelta(minutes=10)).time()
        compare(
            Timestamp.combine(date_component, time_component),
            datetime.combine(date_component, time_component),
        )
def to_json(line):
    """Clean up one comment record and return it without 'retrieved_on'.

    ``line`` is indexed like a dict and must provide the keys
    'created_utc', 'edited', 'author' and 'body'.  Those four entries are
    rewritten on ``line`` itself before the result is built with ``dissoc``.

    NOTE(review): only 'retrieved_on' is dropped here; 'id' and
    'subreddit_id' are kept -- confirm that this is intended.
    """
    # Creation time becomes a Timestamp.
    created_raw = line['created_utc']
    line['created_utc'] = Timestamp.utcfromtimestamp(int(created_raw))

    # A falsy 'edited' value is stored as NaT, anything else as a Timestamp.
    edited_raw = line['edited']
    if edited_raw:
        line['edited'] = Timestamp.utcfromtimestamp(int(edited_raw))
    else:
        line['edited'] = NaT

    # The '[deleted]' placeholder is treated as missing text.
    for field in ('author', 'body'):
        if line[field] == '[deleted]':
            line[field] = None

    return dissoc(line, 'retrieved_on')
示例#4
0
def to_json(line):
    """Clean up one comment record and return it without 'retrieved_on'.

    ``line`` is indexed like a dict and must provide the keys
    'created_utc', 'edited', 'author' and 'body'.  Those four entries are
    rewritten on ``line`` itself before the result is built with ``dissoc``.

    NOTE(review): only 'retrieved_on' is dropped here; 'id' and
    'subreddit_id' are kept -- confirm that this is intended.
    """
    # Creation time becomes a Timestamp.
    created_raw = line['created_utc']
    line['created_utc'] = Timestamp.utcfromtimestamp(int(created_raw))

    # A falsy 'edited' value is stored as NaT, anything else as a Timestamp.
    edited_raw = line['edited']
    if edited_raw:
        line['edited'] = Timestamp.utcfromtimestamp(int(edited_raw))
    else:
        line['edited'] = NaT

    # The '[deleted]' placeholder is treated as missing text.
    for field in ('author', 'body'):
        if line[field] == '[deleted]':
            line[field] = None

    return dissoc(line, 'retrieved_on')
示例#5
0
def time_to_time_stamp(time):
    """Convert an RFC 3339 string or a UNIX timestamp to a pandas Timestamp.

    Args:
        time: Either a date-time string that ``Timestamp`` can parse
            directly (e.g. RFC 3339), or a UNIX timestamp in seconds written
            as a decimal string such as ``"1500000000.123456789"``.

    Returns:
        For date-time strings, whatever ``Timestamp(time)`` produces.  For
        UNIX timestamps, a UTC-aware Timestamp; up to nine fractional
        digits (nanoseconds) are preserved exactly.

    Raises:
        ValueError: If ``time`` is neither parseable as a date-time nor a
            finite decimal number.
    """
    # Local import keeps this block self-contained.
    from decimal import Decimal, InvalidOperation

    try:
        # This will work with RFC3339
        return Timestamp(time)
    except ValueError:
        # Not a date-time string; treat it as a UNIX timestamp below.
        pass

    try:
        seconds = Decimal(time)
    except InvalidOperation as err:
        raise ValueError("Invalid timestamp: %r" % (time,)) from err
    if not seconds.is_finite():
        raise ValueError("Invalid timestamp: %r" % (time,))

    # Decimal arithmetic keeps every digit.  Going through float drops
    # nanosecond precision, and adding the last three characters back as
    # nanoseconds is only right when there are exactly nine fractional
    # digits.  Building the value tz-aware in one step also avoids calling
    # tz_localize on a Timestamp that may already carry a time zone.
    nanoseconds = int(seconds * 10 ** 9)
    return Timestamp(nanoseconds, unit='ns', tz='UTC')
示例#6
0
    def test_class_ops_pytz(self):
        """Check Timestamp's class-level constructors against datetime's.

        Each ``Timestamp`` classmethod used below (``now``, ``utcnow``,
        ``today``, ``utcfromtimestamp``, ``fromtimestamp``, ``combine``) is
        called next to the ``datetime`` method of the same name, and the two
        results must be less than one second apart.  The UTC zone is given
        to ``datetime`` via ``timezone('UTC')``.
        """

        def compare(x, y):
            # Compare the signed difference rather than truncating each side
            # to whole seconds: two readings taken microseconds apart can
            # fall on opposite sides of a second boundary, which made the
            # check intermittently fail.
            delta_ns = Timestamp(x).value - Timestamp(y).value
            assert int(delta_ns / 1e9) == 0

        compare(Timestamp.now(), datetime.now())
        compare(Timestamp.now('UTC'), datetime.now(timezone('UTC')))
        compare(Timestamp.utcnow(), datetime.utcnow())
        compare(Timestamp.today(), datetime.today())

        # One integer number of seconds, fed to both libraries below.
        current_time = calendar.timegm(datetime.now().utctimetuple())
        compare(Timestamp.utcfromtimestamp(current_time),
                datetime.utcfromtimestamp(current_time))
        compare(Timestamp.fromtimestamp(current_time),
                datetime.fromtimestamp(current_time))

        # combine(): same date, with a time of day ten minutes later.
        date_component = datetime.utcnow()
        time_component = (date_component + timedelta(minutes=10)).time()
        compare(Timestamp.combine(date_component, time_component),
                datetime.combine(date_component, time_component))
示例#7
0
    def test_class_ops_dateutil(self):
        """Check Timestamp's class-level constructors against datetime's.

        Each ``Timestamp`` classmethod used below (``now``, ``utcnow``,
        ``today``, ``utcfromtimestamp``, ``fromtimestamp``, ``combine``) is
        called next to the ``datetime`` method of the same name, and the two
        results must be less than one second apart.  The UTC zone is given
        to ``datetime`` as a dateutil ``tzutc`` instance.
        """

        def compare(x, y):
            # Compare the signed difference rather than rounding each side on
            # its own: two readings taken microseconds apart can fall on
            # opposite sides of a rounding boundary and then differ by a
            # whole second, which made the check intermittently fail.
            delta_ns = Timestamp(x).value - Timestamp(y).value
            assert int(delta_ns / 1e9) == 0

        compare(Timestamp.now(), datetime.now())
        compare(Timestamp.now('UTC'), datetime.now(tzutc()))
        compare(Timestamp.utcnow(), datetime.utcnow())
        compare(Timestamp.today(), datetime.today())

        # One integer number of seconds, fed to both libraries below.
        current_time = calendar.timegm(datetime.now().utctimetuple())
        compare(Timestamp.utcfromtimestamp(current_time),
                datetime.utcfromtimestamp(current_time))
        compare(Timestamp.fromtimestamp(current_time),
                datetime.fromtimestamp(current_time))

        # combine(): same date, with a time of day ten minutes later.
        date_component = datetime.utcnow()
        time_component = (date_component + timedelta(minutes=10)).time()
        compare(Timestamp.combine(date_component, time_component),
                datetime.combine(date_component, time_component))
示例#8
0
    def test_class_ops_pytz(self):
        """Verify Timestamp's alternate constructors against datetime's.

        Every pair below calls a ``Timestamp`` classmethod and the
        ``datetime`` method of the same name; the two results must lie less
        than one second apart.  ``Timestamp.utcfromtimestamp`` must also
        raise a ``FutureWarning`` (GH#22451).
        """

        def compare(x, y):
            # Truncating the signed gap to whole seconds tolerates any
            # difference below one second in either direction.
            gap_ns = Timestamp(x).value - Timestamp(y).value
            assert int(gap_ns / 1e9) == 0

        # "Current time" constructors.
        compare(Timestamp.now(), datetime.now())
        compare(Timestamp.now("UTC"), datetime.now(timezone("UTC")))
        compare(Timestamp.utcnow(), datetime.utcnow())
        compare(Timestamp.today(), datetime.today())

        # One integer number of seconds shared by the checks that follow.
        epoch_seconds = calendar.timegm(datetime.now().utctimetuple())

        # GH#22451
        msg = "timezone-aware Timestamp with UTC"
        with tm.assert_produces_warning(FutureWarning, match=msg):
            ts_utc = Timestamp.utcfromtimestamp(epoch_seconds)
        compare(ts_utc, datetime.utcfromtimestamp(epoch_seconds))

        compare(
            Timestamp.fromtimestamp(epoch_seconds),
            datetime.fromtimestamp(epoch_seconds),
        )

        # Support tz kwarg in Timestamp.fromtimestamp: positional form ...
        positional_tz = Timestamp.fromtimestamp(epoch_seconds, "UTC")
        compare(positional_tz, datetime.fromtimestamp(epoch_seconds, utc))
        # ... and keyword form.
        keyword_tz = Timestamp.fromtimestamp(epoch_seconds, tz="UTC")
        compare(keyword_tz, datetime.fromtimestamp(epoch_seconds, utc))

        # combine(): same date, with a time of day ten minutes later.
        day = datetime.utcnow()
        later_time = (day + timedelta(minutes=10)).time()
        compare(Timestamp.combine(day, later_time), datetime.combine(day, later_time))
示例#9
0
def top_comments(drug=None,n=5):
	"""Yield the top n comments for a drug, ranked by normalized score.

	Comments are ranked by ``score / subscribers ** 0.33``, i.e. the score
	normalized to subreddit population.

	KWARGS:
		drug: string or None.
			Drug selector.  Allows three cases:
			* None: use all comments in database, regardless of drug.
			* 'antidepressant': select comments speaking generically about
				drug, not referencing specific drug (Mentions.count = 0).
			* [drug name]: comments referencing specific drug; the name is
				looked up (upper-cased) in _drug_dict.
			Default None.
		n: int.
			number of top comments to return.

	YIELDS:
		Tuples (author, body, created_utc, score, subreddit), where
		created_utc is the string form of a pandas Timestamp.

	RAISES:
		ValueError:
			On invalid drug selection.  Because this is a generator, the
			error surfaces when iteration starts.
	"""
	select = ("SELECT c.author,c.body,c.created_utc,c.score,c.subreddit,"
		"s.subscribers FROM Comments c "
		"JOIN Subreddits s ON s.subreddit=c.subreddit")

	# Build and validate the query before connecting, so an invalid drug
	# never leaves an open connection behind.
	if drug is None:
		query = select
	elif drug.upper() == 'ANTIDEPRESSANT':
		# Generic mentions only: same filter top_comments_simple applies.
		query = select + (" JOIN Mentions m ON m.id=c.id"
			" WHERE m.count=0")
	else:
		gen = _drug_dict.get(drug.upper(),None)
		if gen is None:
			raise ValueError('Invalid drug selection.')
		# The column name comes from the _drug_dict lookup, not directly
		# from the caller's string.
		query = select + (" JOIN Mentions m ON m.id=c.id"
			" WHERE m.%s=True" % gen.lower())

	conn = pms.connect(
		host='localhost',
		user='******',
		passwd='',
		db='empath',
		charset='utf8',
		init_command='SET NAMES UTF8')
	try:
		data = pandas.read_sql(query,conn)
	finally:
		# Close even when the query fails.
		conn.close()

	data['normscore'] = data.score/data.subscribers**.33
	top = data.sort_values(by='normscore',ascending=False)[:n]

	for _,entry in top.iterrows():
		created_utc = str(Timestamp.utcfromtimestamp(int(entry.created_utc)))
		yield (entry.author,entry.body,created_utc,entry.score,entry.subreddit)
示例#10
0
def top_comments_simple(drug=None,n=5):
	"""Yield the top n comments for a drug, ranked by raw score.

	Ordering and limiting are done by the SQL query itself.

	KWARGS:
		drug: string or None.
			Drug selector.  Allows three cases:
			* None: use all comments in database, regardless of drug.
			* 'antidepressant': select comments speaking generically about
				drug, not referencing specific drug (Mentions.count = 0).
			* [drug name]: comments referencing specific drug; the name is
				looked up (upper-cased) in _drug_dict.
			Default None.
		n: int.
			number of top comments to return.

	YIELDS:
		Tuples (author, body, created_utc, score, subreddit), where
		created_utc is the string form of a pandas Timestamp.

	RAISES:
		ValueError:
			On invalid drug selection, or when n cannot be converted to
			an integer.  Because this is a generator, the error surfaces
			when iteration starts.
	"""
	# Coerce up front so only an integer can ever reach the SQL text.
	limit = int(n)

	# Build and validate the query before connecting, so an invalid drug
	# never leaves an open connection behind.
	if drug is None:
		query = ("SELECT author,body,created_utc,score,subreddit from Comments "
			"ORDER BY score DESC "
			"LIMIT %d" % limit)
	elif drug.upper() == 'ANTIDEPRESSANT':
		query = ("SELECT c.author,c.body,c.created_utc,c.score,c.subreddit "
			"FROM Comments c "
			"JOIN Mentions m on m.id=c.id "
			"WHERE m.count=0 "
			"ORDER BY c.score DESC "
			"LIMIT %d" % limit)
	else:
		gen = _drug_dict.get(drug.upper(),None)
		if gen is None:
			raise ValueError('Invalid drug selection.')
		# The column name comes from the _drug_dict lookup, not directly
		# from the caller's string.
		query = ("SELECT c.author,c.body,c.created_utc,c.score,c.subreddit "
			"FROM Comments c "
			"JOIN Mentions m on m.id=c.id "
			"WHERE m.%s=True "
			"ORDER BY c.score DESC "
			"LIMIT %d" % (gen.lower(),limit))

	conn = pms.connect(
		host='localhost',
		user='******',
		passwd='',
		db='empath',
		charset='utf8',
		init_command='SET NAMES UTF8')
	try:
		cur = conn.cursor()
		cur.execute(query)
		# Pull every row while the connection is still open instead of
		# iterating the cursor after close().
		rows = cur.fetchall()
	finally:
		conn.close()

	for row in rows:
		author,body,created_utc,score,subreddit = row[:5]
		created_utc = str(Timestamp.utcfromtimestamp(int(created_utc)))
		yield (author,body,created_utc,score,subreddit)
示例#11
0
def top_comments(drug=None, n=5):
    """Yield the top n comments for a drug, ranked by normalized score.

    Comments are ranked by ``score / subscribers ** 0.33``, i.e. the score
    normalized to subreddit population.

    KWARGS:
        drug: string or None.
            Drug selector.  Allows three cases:
            * None: use all comments in database, regardless of drug.
            * 'antidepressant': select comments speaking generically about
                drug, not referencing specific drug (Mentions.count = 0).
            * [drug name]: comments referencing specific drug; the name is
                looked up (upper-cased) in _drug_dict.
            Default None.
        n: int.
            number of top comments to return.

    YIELDS:
        Tuples (author, body, created_utc, score, subreddit), where
        created_utc is the string form of a pandas Timestamp.

    RAISES:
        ValueError:
            On invalid drug selection.  Because this is a generator, the
            error surfaces when iteration starts.
    """
    select = (
        "SELECT c.author,c.body,c.created_utc,c.score,c.subreddit,"
        "s.subscribers FROM Comments c "
        "JOIN Subreddits s ON s.subreddit=c.subreddit"
    )

    # Build and validate the query before connecting, so an invalid drug
    # never leaves an open connection behind.
    if drug is None:
        query = select
    elif drug.upper() == "ANTIDEPRESSANT":
        # Generic mentions only: same filter top_comments_simple applies.
        query = select + " JOIN Mentions m ON m.id=c.id WHERE m.count=0"
    else:
        gen = _drug_dict.get(drug.upper(), None)
        if gen is None:
            raise ValueError("Invalid drug selection.")
        # The column name comes from the _drug_dict lookup, not directly
        # from the caller's string.
        query = select + " JOIN Mentions m ON m.id=c.id WHERE m.%s=True" % gen.lower()

    conn = pms.connect(
        host="localhost", user="******", passwd="", db="empath", charset="utf8", init_command="SET NAMES UTF8"
    )
    try:
        data = pandas.read_sql(query, conn)
    finally:
        # Close even when the query fails.
        conn.close()

    data["normscore"] = data.score / data.subscribers ** 0.33
    top = data.sort_values(by="normscore", ascending=False)[:n]

    for _, entry in top.iterrows():
        created_utc = str(Timestamp.utcfromtimestamp(int(entry.created_utc)))
        yield (entry.author, entry.body, created_utc, entry.score, entry.subreddit)
示例#12
0
def top_comments_simple(drug=None, n=5):
    """Yield the top n comments for a drug, ranked by raw score.

    Ordering and limiting are done by the SQL query itself.

    KWARGS:
        drug: string or None.
            Drug selector.  Allows three cases:
            * None: use all comments in database, regardless of drug.
            * 'antidepressant': select comments speaking generically about
                drug, not referencing specific drug (Mentions.count = 0).
            * [drug name]: comments referencing specific drug; the name is
                looked up (upper-cased) in _drug_dict.
            Default None.
        n: int.
            number of top comments to return.

    YIELDS:
        Tuples (author, body, created_utc, score, subreddit), where
        created_utc is the string form of a pandas Timestamp.

    RAISES:
        ValueError:
            On invalid drug selection, or when n cannot be converted to
            an integer.  Because this is a generator, the error surfaces
            when iteration starts.
    """
    # Coerce up front so only an integer can ever reach the SQL text.
    limit = int(n)

    # Build and validate the query before connecting, so an invalid drug
    # never leaves an open connection behind.
    if drug is None:
        query = (
            "SELECT author,body,created_utc,score,subreddit from Comments "
            "ORDER BY score DESC "
            "LIMIT %d" % limit
        )
    elif drug.upper() == "ANTIDEPRESSANT":
        query = (
            "SELECT c.author,c.body,c.created_utc,c.score,c.subreddit "
            "FROM Comments c "
            "JOIN Mentions m on m.id=c.id "
            "WHERE m.count=0 "
            "ORDER BY c.score DESC "
            "LIMIT %d" % limit
        )
    else:
        gen = _drug_dict.get(drug.upper(), None)
        if gen is None:
            raise ValueError("Invalid drug selection.")
        # The column name comes from the _drug_dict lookup, not directly
        # from the caller's string.
        query = (
            "SELECT c.author,c.body,c.created_utc,c.score,c.subreddit "
            "FROM Comments c "
            "JOIN Mentions m on m.id=c.id "
            "WHERE m.%s=True "
            "ORDER BY c.score DESC "
            "LIMIT %d" % (gen.lower(), limit)
        )

    conn = pms.connect(
        host="localhost", user="******", passwd="", db="empath", charset="utf8", init_command="SET NAMES UTF8"
    )
    try:
        cur = conn.cursor()
        cur.execute(query)
        # Pull every row while the connection is still open instead of
        # iterating the cursor after close().
        rows = cur.fetchall()
    finally:
        conn.close()

    for row in rows:
        author, body, created_utc, score, subreddit = row[:5]
        created_utc = str(Timestamp.utcfromtimestamp(int(created_utc)))
        yield (author, body, created_utc, score, subreddit)