Example #1
    def __init__(self):
        # Chinese-language analyzer for tokenizing Sina Weibo status text
        smartcn = lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33)
        self.analyzers = {"smartcn": smartcn}
        # Open the existing on-disk index read-only for searching
        directory = lucene.SimpleFSDirectory(lucene.File(self.STORE_DIR))
        self.searcher = lucene.IndexSearcher(directory, True)
        self.pgconn = mypass.getConn()
        self.sw = sinaweibooauth.SinaWeiboOauth()
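A minimal sketch of how a searcher initialized this way could be queried with the PyLucene 3.x API, assuming it is added as a method of the same class; the field name "text" and the result limit are illustrative assumptions, not taken from the original code:

    def search(self, querystring, limit=50):
        # Parse the query with the same analyzer that was used at index time;
        # "text" is a hypothetical field name
        parser = lucene.QueryParser(lucene.Version.LUCENE_33, "text",
                                    self.analyzers["smartcn"])
        query = parser.parse(querystring)
        hits = self.searcher.search(query, limit)
        # Resolve each scored hit back to its stored document
        return [self.searcher.doc(sd.doc) for sd in hits.scoreDocs]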
Example #2
    def __init__(self):
        # Chinese-language analyzer for tokenizing Sina Weibo status text
        smartcn = lucene.SmartChineseAnalyzer(lucene.Version.LUCENE_33)
        #analyzer = lucene.StandardAnalyzer(lucene.Version.LUCENE_33)
        analyzers = {"smartcn": smartcn}
        self.pgconn = mypass.getConn()
        self.sw = sinaweibooauth.SinaWeiboOauth()
        if not os.path.exists(self.storeDir):
            os.mkdir(self.storeDir)
        store = lucene.SimpleFSDirectory(lucene.File(self.storeDir))
        # Writer configuration tuned for long-running, multi-threaded indexing
        writerconfig = lucene.IndexWriterConfig(lucene.Version.LUCENE_33,
                                                analyzers["smartcn"])
        writerconfig.setWriteLockTimeout(600000L)
        writerconfig.setMaxThreadStates(50)
        writerconfig.setRAMBufferSizeMB(128.0)
        self.writer = lucene.IndexWriter(store, writerconfig)
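A minimal sketch of how a writer configured this way could index a single status, assuming it is added as a method of the same class; the field names "id" and "text" and the per-document commit are illustrative assumptions, not taken from the original code:

    def indexStatus(self, status_id, text):
        doc = lucene.Document()
        # Hypothetical field layout: store the id untokenized, analyze the text
        doc.add(lucene.Field("id", str(status_id),
                             lucene.Field.Store.YES,
                             lucene.Field.Index.NOT_ANALYZED))
        doc.add(lucene.Field("text", text,
                             lucene.Field.Store.YES,
                             lucene.Field.Index.ANALYZED))
        self.writer.addDocument(doc)
        self.writer.commit()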
Example #3
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
import pg
import json
import mypass
import time

import sinaweibooauth

if __name__ == "__main__":
    sw = sinaweibooauth.SinaWeiboOauth()
    sw.setToken(sw.sinaweiboOauth["oauth_token"],
                sw.sinaweiboOauth["oauth_token_secret"])
    # Load yesterday's most-retweeted statuses for the HK list
    f = open(
        "/var/data/sinaweibo/mostretweeted/mostretweeted-hk_list1_yesterday.json",
        "r")
    js = json.loads(f.read())
    f.close()
    # Run dispatch action 9 on each status id listed in the file
    for x in js["rts"]:
        print x["id"]
        out = sw.dispatch(9, x["id"])
        print out
        #time.sleep(1)
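The loop only relies on a top-level "rts" array whose items carry an "id" field. Below is a hedged variant of the same loop with the rate-limit sleep re-enabled and basic error handling; the broad except is an assumption, since the failure modes of sw.dispatch are not shown here:

    for x in js["rts"]:
        try:
            out = sw.dispatch(9, x["id"])
            print x["id"], out
        except Exception as e:
            # hypothetical handling: log the failure and keep going
            print >> sys.stderr, "dispatch failed for %s: %s" % (x["id"], e)
        time.sleep(1)  # stay clear of API rate limits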
Example #4
def sinatrace(tid,
              minimal=False,
              extra_fields=False,
              get_users=False,
              outformat="json"):
    # For RP (range partitioning): should try to find created_at when it is not known or given as an argument.
    sw = sinaweibooauth.SinaWeiboOauth()
    sw.setToken(sw.sinaweiboOauth["oauth_token"],
                sw.sinaweiboOauth["oauth_token_secret"])
    try:
        tid = long(tid)
    except ValueError:
        print usage
        sys.exit()
    if extra_fields:
        extra_fields = ", u.name user_name, u.screen_name user_screen_name, u.domain user_domain, \
u.province user_province, u.city user_city, u.gender user_gender, \
u.followers_count user_followers_count, u.friends_count user_friends_count, u.retrieved user_retrieved "

    else:
        extra_fields = ""
    '''
    rps = sw.getRangePartitionByIds([tid])
    for rp in rps:
	x = rp.split(",")
	year = int(x[0])
	week = int(x[1])
	break
    isocal = datetime.datetime.now().isocalendar()
    year_now = isocal[0]
    week_now = isocal[1]
    sw_tables_arr = list()
    for x in range(year,year_now+1):
	if year == year_now:
	    myrange = range(week,week_now+1)
	elif x == year:
	    myrange = range(week,54)
	elif x == year_now:
	    myrange = range(1,week)
	for y in myrange:
	    sw_tables_arr.append("SELECT * FROM rp_sinaweibo_y%(year)dw%(week)d" % { "year": x, "week": y })
    sw_tables = " UNION ".join(sw_tables_arr)
    '''
    sql = "SELECT s.id, s.created_at, s.user_id, s.screen_name, s.text, u.id AS user_id_ref %(extra_fields)s \
FROM rp_sinaweibo s LEFT JOIN sinaweibo_users u ON s.user_id = u.id \
WHERE retweeted_status = %(tid)d ORDER BY s.id " % {
        "tid": tid,
        "extra_fields": extra_fields
    }  #, "sw_tables": sw_tables}
    #print sql
    rows = pgconn.query(sql).dictresult()
    out = dict()
    rts = list()
    count = 0
    out["generated_start"] = datetime.datetime.now().strftime("%c")
    ids_cache = dict()
    missing_users = list()
    missing_users_ids = list()
    for r in rows:
        if r["screen_name"] not in ids_cache:
            ids_cache[r["screen_name"]] = r["user_id"]
        m = re.findall(u"//@([^\::。\.\:/@\-,,【\[ ]*)", r["text"])
        #m = re.findall(u"//@([^:/@:, ]*)", r["text"])
        refs = list()
        for refname in m:
            refname = refname.split(":")[0]
            ref = dict()
            if refname in ids_cache:
                ref["id"] = ids_cache[refname]
            else:
                #sql_ref = "SELECT u.id FROM sinaweibo_users u WHERE u.screen_name = '%(ref)s' " % { 'ref': refname }
                sql_ref = "SELECT s.user_id as id FROM sinaweibo s WHERE s.retweeted_status = %(tid)d AND s.created_at < '%(created_at)s' AND s.screen_name = '%(ref)s' ORDER BY s.created_at DESC LIMIT 1 " % {
                    'tid': tid,
                    "created_at": r["created_at"],
                    'ref': refname.replace("'", "''")
                }
                #print sql_ref
                try:
                    rows_ref = pgconn.query(sql_ref).dictresult()
                except pg.ProgrammingError as e:
                    print e
                    print refname
                    continue
                if len(rows_ref) == 0:  # fall back to the users table
                    sql_ref_fromusers = "SELECT u.id FROM sinaweibo_users u WHERE u.screen_name = '%(ref)s' " % {
                        'ref': refname.replace("'", "''")
                    }
                    rows_ref = pgconn.query(sql_ref_fromusers).dictresult()
                if len(rows_ref) > 0:
                    ref["id"] = rows_ref[0]["id"]
                    ids_cache[refname] = ref["id"]
                else:
                    if get_users:
                        resp = sw.user(None, refname)
                        try:
                            ref["id"] = resp["data"][0]["id"]
                            ids_cache[refname] = ref["id"]
                        except KeyError:
                            ref["id"] = None
                            ids_cache[refname] = None
                            missing_users.append(refname)
                    else:
                        ref["id"] = None
                        ids_cache[refname] = None
                        missing_users.append(refname)
            ref["name"] = refname
            refs.append(ref)
            count += 1
        r["references"] = refs
        # users who reposted, but are not in our DB yet
        if get_users and r["user_id_ref"] is None:
            missing_users_ids.append(r["user_id"])
        rts.append(r)
    out["missing_users"] = missing_users
    out["missing_users_count"] = len(missing_users)
    out["reposts"] = rts
    out["reposts_count"] = len(rts)
    out["generated"] = long(time.mktime(
        datetime.datetime.now().timetuple())) * 1000
    out["generated_end"] = datetime.datetime.now().strftime("%c")
    if get_users and len(missing_users_ids) > 0:
        f = open(str(tid) + "_missing_users_ids.txt", "w")
        for x in missing_users_ids:
            f.write(str(x) + "\n")
        f.close()
    if get_users and len(missing_users) > 0:  # write the missing users list to a file
        f = open(str(tid) + "_missing_users.txt", "w")
        for x in missing_users:
            f.write(str(x) + "\n")
        f.close()
    if outformat == "json":
        if minimal:
            return json.dumps(out)
        else:
            return json.dumps(out, sort_keys=True, indent=4)
    else:
        return out
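A hedged usage sketch for calling sinatrace from the command line; the argument handling below and the module-level usage string it prints are assumptions based on how the function body references them:

if __name__ == "__main__":
    if len(sys.argv) < 2:
        print usage
        sys.exit(1)
    # e.g. python sinatrace.py 3412345678901234 --users
    print sinatrace(sys.argv[1], extra_fields=True,
                    get_users="--users" in sys.argv)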
Example #5
def gviz_trends(tid,
                req_id=0,
                interval="",
                period="",
                province=0,
                listid=0,
                outformat="json"):
    try:
        tid = long(tid)
    except ValueError:
        sys.exit()
    import gviz_api
    sql_interval = "to_char(date_trunc('hour', s.created_at), 'YYYY-MM-DD HH24:MI')"
    # Interval part
    dateformat = "%Y-%m-%d %H:%M"
    if interval == "":
        delta_t = datetime.timedelta(hours=1)
    elif string.lower(interval) == "10m":
        delta_t = datetime.timedelta(minutes=10)
        sql_interval = "substring(to_char(s.created_at, 'YYYY-MM-DD HH24:MI') for 15)||0"
    elif string.lower(interval[len(interval) - 1]) == "m":
        # e.g. "5m", "15m": bucket by an arbitrary number of minutes
        mins_interval = int(interval[0:len(interval) - 1])
        delta_t = datetime.timedelta(minutes=mins_interval)
        sql_interval = "to_char(date_trunc('hour', s.created_at) + \
INTERVAL '%(mins_interval)s min' * ROUND(date_part('minute', s.created_at) \
/ %(mins_interval)s.0), 'YYYY-MM-DD HH24:MI')" % {
            "mins_interval": mins_interval
        }
    elif string.lower(interval) == "d":
        dateformat = "%Y-%m-%d"
        delta_t = datetime.timedelta(days=1)
        sql_interval = "date(s.created_at) "
    else:
        delta_t = datetime.timedelta(hours=1)
    # Period part
    if len(period) > 0:
        measure = string.lower(period[len(period) - 1])
        try:
            nb = int(str(period[0:len(period) - 1]))
        except ValueError:
            nb = 1
        today = datetime.date.today()
        datedatatype = "string"
        if measure == "d":
            basetime = today - datetime.timedelta(days=nb)
        elif measure == "w":
            basetime = today - datetime.timedelta(weeks=nb)
        elif measure == "m":
            basetime = datetime.date(today.year + (today.month - nb - 1) / 12,
                                     (today.month - nb - 1) % 12 + 1,
                                     today.day)
    else:
        basetime = None
    if basetime is None:
        sql_period = ""
        sw_tables = "sinaweibo"
    else:
        basetime = datetime.datetime.combine(basetime, datetime.time())
        sql_period = " AND s.created_at >= '%s' " % basetime.strftime(
            "%Y-%m-%d")
        import sinaweibooauth
        sw = sinaweibooauth.SinaWeiboOauth()
        sw_tables = "(%s)" % sw.getRangePartitionSQL(basetime)
    sql_location = ""
    sql_listidjoin = ""
    sql_listid = ""
    if int(listid) > 0:
        sql_listidjoin = "LEFT JOIN sinaweibo_userlist ul ON u.id = ul.user_id "
        sql_listid = " AND ul.list_id = %d " % int(listid)
    if int(province) > 0:
        sql_location = " AND u.province = %d " % int(province)
    sql = "SELECT %(interval)s AS time, COUNT(*) AS count, COUNT(DISTINCT s.user_id) AS users \
FROM %(sw_tables)s s LEFT JOIN sinaweibo_users u ON s.user_id = u.id %(sql_listidjoin)s WHERE retweeted_status = %(tid)d %(sql_period)s %(sql_location)s %(sql_listid)s GROUP BY time ORDER BY time " \
% {"tid": tid, "interval": sql_interval, "sql_period": sql_period, "sql_location": sql_location, "sql_listidjoin": sql_listidjoin, "sql_listid": sql_listid, "sw_tables": sw_tables }
    rows = pgconn.query(sql).dictresult()
    description = {
        "time": ("string", "Time"),
        "count": ("number", "statuses"),
        "users": ("number", "distinct users")
    }
    columns_order = "time", "count", "users"
    order_by = "time"
    data = []
    if basetime is None and len(rows) > 0:
        basetime = datetime.datetime.strptime(rows[0]["time"], dateformat)
    elif len(rows) > 0:
        try:
            datastart = datetime.datetime.strptime(rows[0]["time"], dateformat)
        except ValueError:
            # Daily buckets come back as 'YYYY-MM-DD' without a time component
            dateformat = "%Y-%m-%d"
            datastart = datetime.datetime.strptime(rows[0]["time"], dateformat)
        # Pad with zero rows from the start of the period up to the first data point
        while basetime + delta_t < datastart:
            data.append({"time": datetime.datetime.strftime(basetime, dateformat),
                         "count": 0, "users": 0})
            basetime += delta_t
    for r in rows:
        thistime = datetime.datetime.strptime(r["time"], dateformat)
        # Fill any gap between the previous bucket and this row with zero counts
        while basetime + delta_t < thistime:
            data.append({"time": datetime.datetime.strftime(basetime, dateformat),
                         "count": 0, "users": 0})
            basetime += delta_t
        data.append({"time": r["time"], "count": r["count"], "users": r["users"]})
        basetime = thistime + delta_t
    data_table = gviz_api.DataTable(description)
    data_table.LoadData(data)
    if outformat == "json":
        return data_table.ToJSon(columns_order, order_by, req_id)
    elif outformat == "jsonresp":
        return data_table.ToJSonResponse(columns_order, order_by, req_id)
    else:
        return data_table.ToJSCode("jscode_data", columns_order, order_by,
                                   req_id)
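A hedged usage sketch for gviz_trends; the hourly interval, seven-day period, and jsonresp output chosen below are illustrative, and the web handler that would normally supply req_id is omitted:

if __name__ == "__main__":
    # Hourly buckets over the last seven days, emitted as a Google Visualization JSON response
    print gviz_trends(sys.argv[1], req_id=1, interval="", period="7d",
                      outformat="jsonresp")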