def queryEntry(conn, tar, tab, cond=None, agg=None, aggcond=None):
    '''Run a SELECT statement assembled from the given clause fragments.

    The fragments are concatenated into the SQL text as they are (no
    escaping, no parameter binding), so only trusted values may be passed.

    Args:
        conn: The connection to the database
        tar: List of target expressions for the "select" clause
        tab: List of tables for the "from" clause
        cond: "where" condition string, left out when None
        agg: List of "group by" expressions, left out when None
        aggcond: "having" condition string, left out when None

    Returns:
        result: Cursor holding the query results, or None when an
            exception was raised (the exception is handed to
            csplog.logexc and the statement text is printed)
    '''
    execstr = ""
    try:
        cursor = conn.cursor()
        # every clause carries its own trailing blank, except "having"
        clauses = [
            "select " + ",".join(tar) + " ",
            "from (" + ",".join(tab) + ") ",
        ]
        if cond is not None:
            clauses.append("where (" + cond + ") ")
        if agg is not None:
            clauses.append("group by (" + ",".join(agg) + ") ")
        if aggcond is not None:
            clauses.append("having (" + aggcond + ")")
        execstr = "".join(clauses)
        cursor.execute(execstr)
        return cursor
    except Exception:
        csplog.logexc(sys.exc_info())
        print(execstr)
        return None
def getParticipation(commInfo, insert=True):
    '''Scrape committee membership from govtrack and optionally store it.

    For every committee in commInfo the "committee_member" endpoint is
    requested and each returned object becomes one
    (id, person id, committee id, role) row of the "participates" table.

    Args:
        commInfo: Iterable of committee records; only the first element of
            each record (the committee id) is used
        insert: Whether the rows should be inserted into the database

    Returns:
        bool: True when every committee was processed, False when an
            exception was raised (it is handed to csplog.logexc)
    '''
    conn = None
    dbc = None
    try:
        conn = httplib.HTTPSConnection(govhost)
        dbc = dbmngr.connectDB("./data/", "cspdb", False)
        for c in commInfo:
            endpoint = ("/api/v2/committee_member?committee=" + str(c[0]) +
                        "&limit=300")
            conn.request("GET", endpoint)
            res = conn.getresponse()
            checkResponse(res, endpoint)
            jd = json.loads(res.read())
            formatted = [(gUID(p[u'person'][u'id'], p[u'committee'][u'id']),
                          p[u'person'][u'id'], p[u'committee'][u'id'],
                          p[u'role']) for p in jd[u'objects']]
            # a committee without members has nothing to insert and must
            # not abort the remaining committees
            if insert and formatted:
                if not dbmngr.insertMany(dbc, "participates",
                                         ["id", "lid", "cid", "role"],
                                         formatted):
                    raise Exception("Database Insertion Error")
        return True
    except Exception:
        csplog.logexc(sys.exc_info())
        return False
    finally:
        # release both connections on the error path as well
        if dbc is not None:
            dbc.close()
        if conn is not None:
            conn.close()
def genLikes(contentid, idlist, authorideo=None, ideolist=None):
    '''Randomly decide which users like a piece of content and store it.

    Each entry of idlist likes the content with a probability of
    0.01 * (4 - |ideology - authorideo|) when both authorideo and ideolist
    are given (ideolist[i] belongs to idlist[i]), and 0.05 otherwise.

    Args:
        contentid: Id of the content being liked
        idlist: Sequence of candidate user ids
        authorideo: Ideology value of the content's author, or None
        ideolist: Ideology values matching idlist by position, or None

    Returns:
        list: The ids that liked the content (possibly empty), or None
            when an exception was raised (it is handed to csplog.logexc)
    '''
    dbc = None
    try:
        dbc = dbmngr.connectDB("./data/", "cspdb", False)
        dbc.text_factory = str
        collist = ["id", "lid", "cid"]
        if authorideo is not None and ideolist is not None:
            problist = [0.01 * (4 - abs(i - authorideo)) for i in ideolist]
        else:
            problist = [0.05] * len(idlist)
        likes = [
            uid for pos, uid in enumerate(idlist)
            if random.random() < problist[pos]
        ]
        likelist = [[
            unicode(
                uuid.uuid3(uuid.NAMESPACE_DNS,
                           str(uid) + str(contentid) + 'l')), uid, contentid
        ] for uid in likes]
        # nobody liking the content is a valid outcome, not a database error
        if likelist and not dbmngr.insertMany(dbc, "likes", collist,
                                              likelist):
            raise Exception("Database Insertion Error at genLikes")
        return likes
    except Exception:
        csplog.logexc(sys.exc_info())
        return None
    finally:
        if dbc is not None:
            dbc.close()
def genReplies(num, iden, replyto):
    '''Generate replies to a piece of content and insert them.

    The texts come from genTweets(num, iden, False); each one is stored in
    the "contents" table as a row of type "reply" that points at replyto.

    Args:
        num: Number of replies to generate
        iden: Record of the replying author; iden[0] is used as author id
        replyto: Id of the content being replied to

    Returns:
        list: The inserted rows [id, time, type, contents, author,
            replyto], or None when an exception was raised (it is handed
            to csplog.logexc)
    '''
    dbc = None
    try:
        dbc = dbmngr.connectDB("./data/", "cspdb", False)
        dbc.text_factory = str
        # generate with genTweets, without inserting them as posts
        replies = genTweets(num, iden, False)
        if replies is None:
            raise Exception("Tweet generation Error at genReplies")
        collist = ["id", "time", "type", "contents", "author", "replyto"]
        # t[-2] is the generated text of a genTweets row
        contentlist = [[
            unicode(uuid.uuid3(uuid.NAMESPACE_DNS, t[-2])),
            unicode(datetime.datetime.now()), u"reply", t[-2], iden[0],
            replyto
        ] for t in replies]
        # an empty batch has nothing to insert
        if contentlist and not dbmngr.insertMany(dbc, "contents", collist,
                                                 contentlist):
            raise Exception("Database Insertion Error at genReplies")
        return contentlist
    except Exception:
        csplog.logexc(sys.exc_info())
        return None
    finally:
        if dbc is not None:
            dbc.close()
def createTable(conn, name, col, foreign=None):
    '''Create a table through the given connection.

    The statement is assembled by plain string concatenation, so table and
    column names must come from trusted input.

    Example:
        createTable(conn, "example",
                    {"col1": ["INT"], "col2": ["int", "primary", "not null"]})

    Args:
        conn: The connection to the database
        name: The name of the table
        col: Dictionary of the columns; keys are column names, values are
            iterables holding the type and keywords such as "PRIMARY" or
            "NOT NULL"
        foreign: Dictionary of foreign keys; keys are local columns,
            values are the referenced foreign columns

    Returns:
        bool: Whether the operation was successful
    '''
    execstr = ""
    try:
        cursor = conn.cursor()
        execstr = "create table " + name + "("
        # every definition ends with a comma; the last one is cut off below
        execstr += "".join(
            column + " " + " ".join(spec) + "," for column, spec in col.items())
        if foreign is not None:
            execstr += "".join(
                "foreign key(" + local + ") references " + remote + ","
                for local, remote in foreign.items())
        execstr = execstr[:-1] + ")"
        cursor.execute(execstr)
        conn.commit()
        return True
    except Exception:
        csplog.logexc(sys.exc_info())
        print(execstr)
        return False
def insertEntry(conn, table, entry):
    '''Insert an entry into the database referred to by the connection.

    The values are bound as statement parameters, so they may be of any
    type sqlite3 accepts and may contain quote characters. The table and
    column names are still concatenated into the statement and must come
    from trusted input.

    Example:
        insertEntry(conn, "sampleTable", {"col1": 2, "col2": 0})

    Args:
        conn: The connection to the database
        table: The name of the table
        entry: A dictionary of the columns that the entry has, and the
            respective values

    Returns:
        bool: Whether the operation was successful
    '''
    execstr = ""
    try:
        c = conn.cursor()
        columns = list(entry.keys())
        values = [entry[column] for column in columns]
        placeholders = ",".join(["?"] * len(columns))
        execstr = ("insert into " + table + " (" + ",".join(columns) +
                   ") values (" + placeholders + ");")
        c.execute(execstr, values)
        conn.commit()
        return True
    except Exception:
        print(execstr)
        csplog.logexc(sys.exc_info())
        return False
def getAllTweets(screen_name, filename):
    '''Download recent tweets of several accounts into one text file.

    For every account up to 200 tweets are requested through tweepy. The
    texts are UTF-8 encoded, passed through removeLinks and written to
    filename, one per line. An account whose download fails is reported on
    stdout and skipped.

    Args:
        screen_name: List of Twitter screen names
        filename: Path of the file to (over)write

    Returns:
        bool: True when the file was written, False when an exception was
            raised (it is handed to csplog.logexc)
    '''
    try:
        # Twitter only allows access to a user's most recent 3240 tweets
        # with this method
        consumer_key, consumer_secret, access_key, access_secret = twcred.auth()
        # authorize twitter, initialize tweepy
        handler = tweepy.OAuthHandler(consumer_key, consumer_secret)
        handler.set_access_token(access_key, access_secret)
        api = tweepy.API(handler)

        collected = []
        finished = 0
        for account in screen_name:
            try:
                timeline = api.user_timeline(screen_name=account, count=200)
                collected.extend(timeline)
                finished += 1
                print(str(finished) + "/" + str(len(screen_name)) + " done")
            except Exception as err:
                print("%s %s" % (account, err))

        encoded = [status.text.encode("utf-8") for status in collected]
        cleaned = [removeLinks(text) for text in encoded]
        with open(filename, 'w+') as out:
            out.writelines(text + '\n' for text in cleaned)
        return True
    except Exception:
        csplog.logexc(sys.exc_info())
        return False
def genTweetBlobs(twaccnts):
    '''Train one tweet model per ideology value 0-4.

    For each ideology value the Twitter accounts of the matching
    legislators are downloaded with getAllTweets into
    ./data/twblobs/<ideology>/input.txt, then ./rnn/train.py is run on
    that directory.

    Args:
        twaccnts: Mapping from legislator id (the "id" column of the
            "legislators" table) to Twitter screen name

    Returns:
        bool: True when all five models were trained, False when an
            exception was raised (it is handed to csplog.logexc)
    '''
    try:
        dbc = dbmngr.connectDB("./data/", "cspdb", False)
        try:
            idcur = dbmngr.queryEntry(dbc, ["id", "ideology"],
                                      ["legislators"])
            if idcur is None:
                raise Exception("Query Error at genTweetBlobs")
            idlist = idcur.fetchall()
        finally:
            # the rows are in memory; do not hold the database open while
            # downloading and training
            if dbc is not None:
                dbc.close()
        print("query for id, ideology done")
        for i in range(5):
            acclist = [twaccnts[k[0]] for k in idlist if int(k[1]) == i]
            datadir = './data/twblobs/' + str(i) + '/'
            # do not train on a missing or stale input file
            if not getAllTweets(acclist, datadir + 'input.txt'):
                raise Exception("Tweet download Error at genTweetBlobs")
            print("input file generation for ideology " + str(i) + " success")
            res = subprocess.call([
                "python", "./rnn/train.py", "--data_dir=" + datadir,
                "--save_dir=" + datadir + 'model/', "--rnn_size=" + str(32),
                "--num_epochs=" + str(1), "--seq_length=" + str(10),
                "--learning_rate=" + str(0.003), "--model=lstm"
            ])
            if res != 0:
                raise Exception(
                    "Training subprocess call Error at genTweetBlobs")
            print("model trained for ideology " + str(i))
        return True
    except Exception:
        csplog.logexc(sys.exc_info())
        return False
def updateMany(conn, table, cols, vals):
    '''Update certain columns of certain entries in the given table.

    One "update ... where id = ?" statement is executed per tuple of vals.

    Example:
        updateMany(conn, "sampleTable", ["col1", "col2"],
                   [(1, "a", 10), (2, "b", 11)])

    Args:
        conn: connection to the database
        table: the table to be targeted for updates
        cols: the columns to be updated
        vals: the values to be written, one tuple per row in the format
            (value1, value2, ..., valuen, id)

    Returns:
        bool: whether the operation was successful; an empty vals is a
            successful no-op, a tuple width that does not match cols is a
            failure
    '''
    execstr = ""
    try:
        # nothing to update is not an error
        if not vals:
            return True
        if len(cols) != len(vals[0]) - 1:
            return False
        c = conn.cursor()
        assignments = ",".join(col + "= ?" for col in cols)
        execstr = "update " + table + " set " + assignments + " where id = ?;"
        c.executemany(execstr, vals)
        conn.commit()
        return True
    except Exception:
        csplog.logexc(sys.exc_info())
        print(execstr)
        return False
def _splitMemeText(text):
    '''Insert the "<MEME>" marker at one random word boundary of text.'''
    words = text.split(" ")
    if len(words) < 2:
        # a single word cannot be split: everything is top text
        return text + "<MEME>"
    upper = len(words) - 2
    if upper > 1:
        # keep at least one word on top and two at the bottom
        k = random.choice(range(1, upper))
    else:
        k = len(words) // 2
    return " ".join(words[:k]) + "<MEME>" + " ".join(words[k:])


def genMemes(num, iden, background, insert=True, primer=None):
    '''Generate meme captions under iden's name and optionally store them.

    The texts come from genTweets(num, iden, False, primer). Each text is
    cut into a top and a bottom part at one random word boundary; the two
    parts are stored as one string separated by "<MEME>". All words of the
    text are kept.

    Args:
        num: Number of memes to generate
        iden: Record of the author; iden[0] is used as author id
        background: Value stored in the "memebg" column
        insert: Whether the rows should be inserted into the database
        primer: Optional priming text handed to genTweets

    Returns:
        list: The rows [id, time, type, contents, author, memebg], or None
            when an exception was raised (it is handed to csplog.logexc)
    '''
    dbc = None
    try:
        # generate tweets with genTweets(num,iden)
        tw = genTweets(num, iden, False, primer)
        if tw is None:
            raise Exception("Tweet generation Error at genMemes")
        # break up tweets into 2 parts randomly; t[-2] is the tweet text
        captions = [_splitMemeText(t[-2]) for t in tw]
        print("top/bottom text generated, separated by string <MEME>")
        collist = ["id", "time", "type", "contents", "author", "memebg"]
        contentlist = [[
            unicode(uuid.uuid3(uuid.NAMESPACE_DNS, caption)),
            unicode(datetime.datetime.now()), u'meme', caption, iden[0],
            background
        ] for caption in captions]
        # insert into database; an empty batch has nothing to insert
        if insert and contentlist:
            dbc = dbmngr.connectDB("./data/", "cspdb", False)
            dbc.text_factory = str
            if not dbmngr.insertMany(dbc, "contents", collist, contentlist):
                raise Exception("Database Insertion Error at genMemes")
        print("Meme generation completed")
        return contentlist
    except Exception:
        csplog.logexc(sys.exc_info())
        return None
    finally:
        if dbc is not None:
            dbc.close()
def insertMany(conn, table, cols, entrylist):
    '''Insert many rows into a table with one parameterized statement.

    Args:
        conn: The connection to the database
        table: The table to be inserted into
        cols: List of columns related to the entry
        entrylist: List of tuples of entry values to be inserted; every
            tuple holds one value per column, in the order of cols

    Returns:
        bool: Whether the operation was successful; an empty entrylist is
            a successful no-op, a first tuple whose width does not match
            cols is a failure
    '''
    execstr = ""
    try:
        # nothing to insert is not an error
        if not entrylist:
            return True
        if len(cols) != len(entrylist[0]):
            return False
        c = conn.cursor()
        colstr = "(" + ",".join(cols) + ")"
        qmstr = "(" + ",".join(["?"] * len(cols)) + ")"
        execstr = "insert into " + table + " " + colstr + " values " + qmstr + ";"
        c.executemany(execstr, entrylist)
        conn.commit()
        return True
    except Exception:
        csplog.logexc(sys.exc_info())
        print(execstr)
        return False
def genTweets(num, iden, insert=True, prime=None):
    '''Generate tweets under iden's name, according to iden's ideology.

    The ideology of iden[0] is read from the "legislators" table and picks
    the model directory ./data/twblobs/<ideology>/model/. For every tweet
    ./rnn/sample.py is run with a random word count between 15 and 29 and
    the second line of its output is taken as the tweet text.

    Args:
        num: Number of tweets to generate
        iden: Record of the author; iden[0] is used as legislator id
        insert: Whether the tweets should be inserted into the "contents"
            table as rows of type "post"
        prime: Optional priming text handed to the sampler via "--prime"

    Returns:
        list: The rows [id, time, type, contents, author], or None when an
            exception was raised (it is handed to csplog.logexc)
    '''
    dbc = None
    try:
        # acquire ideology blob
        dbc = dbmngr.connectDB("./data/", "cspdb", False)
        idcur = dbmngr.queryEntry(dbc, ["id", "ideology"], ["legislators"])
        if idcur is None:
            raise Exception("Query Error at genTweets")
        idict = {row[0]: row[1] for row in idcur.fetchall()}
        modeldir = './data/twblobs/' + str(int(idict[iden[0]])) + '/model/'
        # do not hold the database open while the sampler runs
        dbc.close()
        dbc = None

        # generate tweets
        gentweets = []
        for _ in range(num):
            numwords = random.choice(range(15, 30))
            params = [
                "python", "./rnn/sample.py", "--save_dir", modeldir, "-n",
                str(numwords), "--sample",
                str(1)
            ]
            if prime is not None:
                params += ["--prime", prime]
            gentweets.append(subprocess.check_output(params).split("\n")[1])
        print(("tweets" if insert else "reply") +
              " generation from model complete")

        collist = ["id", "time", "type", "contents", "author"]
        contentlist = [[
            unicode(uuid.uuid3(uuid.NAMESPACE_DNS, t)),
            unicode(datetime.datetime.now()), u'post', t, iden[0]
        ] for t in gentweets]
        # insert into database; an empty batch has nothing to insert
        if insert and contentlist:
            dbc = dbmngr.connectDB("./data/", "cspdb", False)
            dbc.text_factory = str
            if not dbmngr.insertMany(dbc, "contents", collist, contentlist):
                raise Exception("Database Insertion Error at genTweets")
        return contentlist
    except Exception:
        csplog.logexc(sys.exc_info())
        return None
    finally:
        if dbc is not None:
            dbc.close()
def genBillBlobs():
    '''Train the bill-title model by running ./rnn/train.py.

    The trainer reads from ./data/bills/ and saves to ./data/bills/model/.

    Returns:
        bool: True when the trainer exited with status 0, False when it
            did not or when an exception was raised (the exception is
            handed to csplog.logexc)
    '''
    try:
        datadir = './data/bills/'
        command = [
            "python",
            "./rnn/train.py",
            "--data_dir=" + datadir,
            "--save_dir=" + datadir + 'model/',
            "--rnn_size=64",
            "--num_epochs=3",
            "--seq_length=10",
            "--learning_rate=0.003",
            "--model=lstm",
        ]
        status = subprocess.call(command)
        if status != 0:
            raise Exception("Training subprocess call Error at genBillBlobs")
        print("model trained for bills")
        return True
    except Exception:
        csplog.logexc(sys.exc_info())
        return False
def genVotes(billid, voters):
    '''Cast a random "yea"/"nay" vote on a bill for every voter and store it.

    Args:
        billid: Id of the bill being voted on
        voters: Iterable of voter ids

    Returns:
        list: The rows [id, lid, cid, votes] written to the "votes" table
            (empty when there are no voters), or None when an exception
            was raised (it is handed to csplog.logexc)
    '''
    dbc = None
    try:
        dbc = dbmngr.connectDB("./data/", "cspdb", False)
        dbc.text_factory = str
        collist = ["id", "lid", "cid", "votes"]
        voteres = [[
            unicode(uuid.uuid3(uuid.NAMESPACE_DNS,
                               str(i) + str(billid))), i, billid,
            "yea" if random.choice(range(2)) > 0 else "nay"
        ] for i in voters]
        # no voters means nothing to insert, not a database error
        if voteres and not dbmngr.insertMany(dbc, "votes", collist, voteres):
            raise Exception("Database Insertion Error at genVotes")
        return voteres
    except Exception:
        csplog.logexc(sys.exc_info())
        return None
    finally:
        if dbc is not None:
            dbc.close()
def genBills(num, committee, iden):
    '''Generate bill titles and insert them as contents of type "bill".

    A title is made of one to five literals sampled with ./rnn/sample.py
    from the model in ./data/bills/model/ (one to three words each, the
    second output line, capitalized), joined as "A, B and C", followed by
    " Act of 2017".

    Args:
        num: Number of bills to generate
        committee: Value stored in the "committees" column
        iden: Record of the author; iden[0] is used as author id

    Returns:
        list: The rows [id, time, type, contents, author, committees], or
            None when an exception was raised (it is handed to
            csplog.logexc)
    '''
    dbc = None
    try:
        modeldir = "./data/bills/model/"
        res = []
        for i in range(num):
            # generate a random number k, k in [1,5]
            k = random.choice(range(1, 6))
            # generate k bill title literals from bill blob
            genlits = []
            for _ in range(k):
                numwords = random.choice(range(1, 4))
                output = subprocess.check_output([
                    "python", "./rnn/sample.py", "--save_dir", modeldir,
                    "-n",
                    str(numwords), "--sample",
                    str(1)
                ])
                genlits.append(output.split("\n")[1].capitalize())
            # concatenate literals
            prefix = (", ".join(genlits[:-1]) + " and ") if k > 1 else ""
            res.append(prefix + genlits[-1] + " Act of 2017")
            print(res)
            print("{0}/{1} bills generated".format(i + 1, num))
        # insert into database
        collist = ["id", "time", "type", "contents", "author", "committees"]
        contentlist = [[
            unicode(uuid.uuid3(uuid.NAMESPACE_DNS, r)),
            unicode(datetime.datetime.now()), u'bill', r, iden[0], committee
        ] for r in res]
        dbc = dbmngr.connectDB("./data/", "cspdb", False)
        dbc.text_factory = str
        # an empty batch has nothing to insert
        if contentlist and not dbmngr.insertMany(dbc, "contents", collist,
                                                 contentlist):
            raise Exception("Database Insertion Error at genBills")
        print("Bills insertion complete")
        return contentlist
    except Exception:
        csplog.logexc(sys.exc_info())
        return None
    finally:
        if dbc is not None:
            dbc.close()
def getIdeology():
    '''Get all of the legislators' ideology score

    Args:
        None

    Returns:
        dict: A dictionary with {id->ideology}, None if operation failed
            (the exception is handed to csplog.logexc)
    '''
    dbc = None
    try:
        dbc = dbmngr.connectDB('./data/', 'cspdb', False)
        idcur = dbmngr.queryEntry(dbc, ["id", "ideology"], ["legislators"])
        if idcur is None:
            raise Exception("Query Error at getIdeology")
        iddict = {row[0]: row[1] for row in idcur.fetchall()}
        csplog.logevent("query", "queried all ideologies")
        return iddict
    except Exception:
        csplog.logexc(sys.exc_info())
        return None
    finally:
        # close the connection on the failure path as well
        if dbc is not None:
            dbc.close()
def populate(t, r, insert=True):
    '''Fill the database with generated tweets and replies.

    Every legislator gets t tweets from genTweets; every one of those
    tweets gets r replies from genReplies, written by a randomly chosen
    legislator.

    Args:
        t: Number of tweets per legislator; 0 generates nothing
        r: Number of replies per tweet; 0 generates no replies
        insert: Accepted for compatibility; it is not used, tweets and
            replies are always inserted

    Returns:
        bool: True on success, False when an exception was raised (it is
            handed to csplog.logexc)
    '''
    dbc = None
    try:
        idlist = None
        # the legislators are needed for the tweets as well as for the
        # replies, so fetch them whenever anything will be generated
        if r > 0 or t != 0:
            dbc = dbmngr.connectDB("./data/", "cspdb", False)
            idcur = dbmngr.queryEntry(dbc, ["id"], ["legislators"])
            if idcur is None:
                raise Exception("Query Error at populate")
            idlist = idcur.fetchall()
            dbc.close()
            dbc = None
        if t == 0:
            return True
        for legislator in idlist:
            tweets = genTweets(t, legislator, True)
            if r == 0:
                continue
            if tweets is None:
                raise Exception("Tweet generation Error at populate")
            for tw in tweets:
                reper = random.choice(idlist)
                # tw[0] is the id of the generated tweet
                genReplies(r, reper, tw[0])
        return True
    except Exception:
        csplog.logexc(sys.exc_info())
        return False
    finally:
        if dbc is not None:
            dbc.close()
def updateLegisImg():
    '''Download every legislator's photo from govtrack into the database.

    For each id in the "legislators" table the 200px photo is requested
    and written to the "image" column. When the photo cannot be fetched
    (non-200 status or a request error) the placeholder
    ./data/noimg.jpeg is stored instead and the endpoint is printed.

    Returns:
        bool: True when the images were updated, False when an exception
            was raised (it is handed to csplog.logexc)
    '''

    def nonexist():
        '''handle the situation where person doesn't have an image on govtrack'''
        with open("./data/noimg.jpeg", "rb") as placeholder:
            return placeholder.read()

    def getImg(conn, iden):
        '''Gets the person 'iden's image from govtrack'''
        endpoint = "/data/photos/" + str(iden[0]) + "-200px.jpeg"
        try:
            conn.request("GET", endpoint)
            res = conn.getresponse()
            # always read the body, also for error statuses: the reused
            # connection cannot serve the next request until the previous
            # response has been consumed
            body = res.read()
            if res.status == 200:
                return body
        except Exception:
            # closing resets the connection; it is reopened automatically
            # by the next request
            conn.close()
        # no usable image (e.g. 404: person has no photo on govtrack)
        print(endpoint)
        return nonexist()

    conn = None
    dbc = None
    try:
        conn = httplib.HTTPSConnection(govhost)
        dbc = dbmngr.connectDB("./data/", "cspdb", False)
        idcur = dbmngr.queryEntry(dbc, ["id"], ["legislators"])
        if idcur is None:
            raise Exception("Query Error at updateLegisImg")
        idlist = idcur.fetchall()
        updlist = [(sqlite3.Binary(getImg(conn, p)), p[0]) for p in idlist]
        if not dbmngr.updateMany(dbc, "legislators", ["image"], updlist):
            raise Exception("Update Error at updateLegisImg")
        return True
    except Exception:
        csplog.logexc(sys.exc_info())
        return False
    finally:
        if conn is not None:
            conn.close()
        if dbc is not None:
            dbc.close()
def removeEntry(conn, table, cond):
    '''Delete the entries of a table that satisfy a condition.

    The statement is assembled by plain string concatenation, so table and
    cond must come from trusted input.

    Args:
        conn: The connection to the database
        table: Name of the table to delete from
        cond: "where" clause condition; None deletes every entry

    Returns:
        bool: Whether the operation was successful
    '''
    execstr = ""
    try:
        cursor = conn.cursor()
        execstr = "delete from " + table
        if cond is not None:
            execstr = "".join((execstr, " where ", cond))
        cursor.execute(execstr)
        conn.commit()
        return True
    except Exception:
        csplog.logexc(sys.exc_info())
        print(execstr)
        return False
def connectDB(directory, name, new=False):
    """Open the sqlite database stored at directory + name + ".db".

    Args:
        directory: The directory of the database, including the trailing
            path separator (it is concatenated with name as is)
        name: The name (or intended name) of the database
        new: Whether a fresh database should be created; an existing file
            is removed first

    Returns:
        conn: The connection to the database, None if the database
            doesn't exist and new is false, or if there was an exception
            (it is handed to csplog.logexc)
    """
    try:
        path = directory + name + ".db"
        present = os.path.isfile(path)
        if not (present or new):
            # nothing to open and not allowed to create
            return None
        if present and new:
            # start from scratch
            os.remove(path)
        conn = sqlite3.connect(path)
    except Exception:
        csplog.logexc(sys.exc_info())
        return None
    return conn
def getCommInfo(insert=True):
    '''Scrape the committees from govtrack and optionally store them.

    Requests the "committee" endpoint and turns every returned object into
    an (id, name, jurisdiction, committee_type) tuple; with insert these
    are written to the columns id, name, desc and floor of the
    "committees" table.

    Args:
        insert: Whether the rows should be inserted into the database

    Returns:
        list: The committee tuples, or None when an exception was raised
            (it is handed to csplog.logexc)
    '''
    conn = None
    dbc = None
    try:
        conn = httplib.HTTPSConnection(govhost)
        endpoint = "/api/v2/committee?obsolete=false&committee=null&limit=300"
        conn.request("GET", endpoint)
        res = conn.getresponse()
        jd = json.loads(res.read())
        formatted = [(c[u'id'], c[u'name'], c[u'jurisdiction'],
                      c[u'committee_type']) for c in jd[u'objects']]
        if insert:
            dbc = dbmngr.connectDB("./data/", "cspdb", False)
            # an empty result has nothing to insert
            if formatted and not dbmngr.insertMany(
                    dbc, "committees", ["id", "name", "desc", "floor"],
                    formatted):
                raise Exception("Database Insertion Error")
        return formatted
    except Exception:
        csplog.logexc(sys.exc_info())
        return None
    finally:
        # the database is only opened when inserting; close what was opened
        if dbc is not None:
            dbc.close()
        if conn is not None:
            conn.close()
def _binIdeology(lines):
    '''Turn sponsorship-analysis lines into a {person id: bin} dictionary.

    Each line is split on commas; the first field is read as the person id
    and the second as the numeric ideology score. The rows are sorted by
    score and cut into five bins of len // 5 rows numbered 0-4; the rows
    left over go to bin 4.
    '''
    rows = [line.split(",") for line in lines]
    # compare the scores as numbers, not as text
    rows.sort(key=lambda row: float(row[1]))
    binsize = len(rows) // 5
    bins = {}
    for pos, row in enumerate(rows):
        # fewer than five rows: everything falls into the last bin
        bins[int(row[0])] = min(pos // binsize, 4) if binsize else 4
    return bins


def getBasicInfo(insert=True):
    '''Scrape the current legislators and their ideology from govtrack.

    Requests the current senator/representative roles plus the house and
    senate sponsorship analysis files (first and last line of each file
    are dropped), bins the ideology scores into the values 0-4 and builds
    one (id, "firstname lastname", name, role_type, party, state,
    ideology bin, None) tuple per role.

    Args:
        insert: Whether the tuples should be inserted into the
            "legislators" table

    Returns:
        dict: {person id: twitter id} when insert is true
        list: the legislator tuples when insert is false
        None: when an exception was raised (it is handed to csplog.logexc)
    '''
    conn = None
    dbc = None

    def fetch(endpoint):
        '''Request endpoint on conn, check the response and return its body'''
        conn.request("GET", endpoint)
        res = conn.getresponse()
        checkResponse(res, endpoint)
        return res.read()

    try:
        conn = httplib.HTTPSConnection(govhost)
        endpoint = "/api/v2/role?"+\
                   "current=true&"+\
                   "role_type__in=senator|representative&"+\
                   "fields=person__firstname,person__lastname,state,person__twitterid,person__id,person__name,party,role_type&"+\
                   "limit=600"
        data = json.loads(fetch(endpoint))
        print("scraped basic info from govtrack")

        statsdir = "/data/us/" + caucusnum + "/stats/"
        ideo = fetch(statsdir + "sponsorshipanalysis_h.txt").split("\n")[1:-1]
        print("scraped house ideology")
        ideo.extend(
            fetch(statsdir + "sponsorshipanalysis_s.txt").split("\n")[1:-1])
        print("scraped senate ideology")

        ideo = _binIdeology(ideo)
        print("ideology formatted")

        formatted = [
            (p[u'person'][u'id'],
             (p[u'person'][u'firstname'] + " " + p[u'person'][u'lastname']),
             p[u'person'][u'name'], p[u'role_type'], p[u'party'],
             p[u'state'], ideo[p[u'person'][u'id']], None)
            for p in data[u'objects']
        ]
        if not insert:
            return formatted
        dbc = dbmngr.connectDB("./data/", "cspdb", False)
        # an empty result has nothing to insert
        if formatted and not dbmngr.insertMany(dbc,"legislators",\
                ["id","name","desc","role","party","state","ideology","image"],formatted):
            raise Exception("Database Insertion Error")
        return {
            p[u'person'][u'id']: p[u'person'][u'twitterid']
            for p in data[u'objects']
        }
    except Exception:
        csplog.logexc(sys.exc_info())
        return None
    finally:
        if dbc is not None:
            dbc.close()
        if conn is not None:
            conn.close()