def backward(s, transitions, emissions):
    s_length = len(s)  # n. Rows
    num_of_states = len(emissions)  # k. Columns
    b = np.zeros((num_of_states, s_length), dtype=float)
    # initialize b[i, n-1]: the rightmost column is log(1) = 0
    for i in range(0, num_of_states):
        b[i, s_length - 1] = math.log(1)
    for i in reversed(range(0, s_length - 1)):
        for j in range(0, num_of_states):
            # log-space scores are negative, so start the running max at -inf
            a_max = -math.inf
            a_l = []
            for l in range(0, num_of_states):
                # the emission depends on l, the state of the next column
                emission = emissions[l].get(s[i + 1])
                curr = b[l, i + 1] + mylog(transitions[j, l]) + mylog(emission)
                if curr > a_max:
                    a_max = curr
                a_l.append(curr)
            # Regular (non-log) recurrence:
            # b[j, i] += b[l, i + 1] * transition * emission
            # log-sum-exp over the candidates computed above
            b[j, i] = 0
            for l in range(0, num_of_states):
                b_l = a_l[l] - a_max
                b[j, i] += myexp(b_l)
            b[j, i] = mylog(b[j, i]) + a_max
    return b.T
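# backward() above (and viterbi()/forward() below) rely on the project's
# mylog/myexp helpers, which are not defined in this file. A minimal sketch of
# what they presumably do, assuming log(0) (or a missing emission, i.e. None)
# should map to -inf rather than raise, and exp(-inf) should map back to 0:
import math


def mylog(x):
    # treat 0 / None as probability zero -> log-space -inf
    if not x:
        return -math.inf
    return math.log(x)


def myexp(x):
    # exp of -inf is exactly 0
    if x == -math.inf:
        return 0.0
    return math.exp(x)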
def main(srcdir, destdir):
    def _dest_name(fname):
        rx = re.search(r"(.+?)(\d+)\.(\w+)$", fname)
        if rx:
            rx = rx.groups()
            # we need to rename/renumber the file for its destpath
            fname = "{}-{}.{}".format(rx[0], rx[1].rjust(3, "0"), rx[2])
        return fname

    zipnames = sorted(srcdir.glob("*.zip"))
    myinfo(f"Found {len(zipnames)} zipfiles")
    for zn in zipnames:
        # we actually make a subdir for each zip file
        _zsub = re.search(r"(\w+)_\d{8}", zn.stem).groups()[0]
        zdir = destdir.joinpath(_zsub)
        zdir.mkdir(exist_ok=True, parents=True)
        mylog(zn, label="Unpacking")
        zfile = ZipFile(zn)
        zlist = sorted(zfile.filelist, key=lambda x: x.filename)
        for _z in zlist:
            zname = _z.filename
            fname = _dest_name(zname) if len(zfile.filelist) > 1 else zname
            destpath = zdir.joinpath(fname)
            destpath.write_bytes(zfile.read(zname))
            mylog(
                destpath.name,
                destpath.parent,
                f"{existed_size(destpath)} bytes",
                label="Extracted",
            )
def main(src_dir):
    def _get_data_paths(src_dir):
        paths = sorted(p for p in src_dir.rglob('*.csv')
                       if all(_sk not in p.name for _sk in SKIPPED_FILES))
        # return [p for p in paths if 'violation_event' in p.name]
        return paths

    def _group_data_paths(paths):
        d = defaultdict(list)
        for p in paths:
            q = p.parent.stem
            d[q].append(p)
        return d

    TARGET_DB_PATH.parent.mkdir(exist_ok=True, parents=True)
    mylog(TARGET_DB_PATH, label="Connecting to")
    conn = connect_to_db(TARGET_DB_PATH)

    mylog("Creating tables")
    create_tables(conn, schema_path=CREATE_PATH)

    allpaths = _get_data_paths(src_dir)
    myinfo(f"{len(allpaths)} total files")
    gpaths = _group_data_paths(allpaths)
    myinfo(f"{len(gpaths.keys())} groups")

    for i, (gname, srcpaths) in enumerate(gpaths.items()):
        myinfo(f"#{i+1} Group {gname} has {len(srcpaths)} files")
        for j, path in enumerate(srcpaths):
            myinfo(f"File {j+1} of {len(srcpaths)} {path}")
            insert_from_csv(conn, path)

    conn.close()
def main():
    TARGET_DB_PATH.parent.mkdir(parents=True, exist_ok=True)
    mylog(TARGET_DB_PATH, label="Connecting to")
    conn = connect_to_db(TARGET_DB_PATH)
    conn = load_custom_functions(conn)

    mylog("Creating tables")
    create_tables(conn, schema_path=CREATE_DB_PATH)

    inserts(conn)
    conn.close()
def fetch_catalog_urls():
    mylog(CATALOG_URL, label="Fetching catalog")
    resp = requests.post(CATALOG_URL, data={"agency": "osha"})
    soup = lxsoup(resp.text)
    urls = soup.xpath('//a[contains(@href, "csv.zip")]/@href')
    """
    each url will look like this:
        https://enfxfr.dol.gov/../data_catalog/OSHA/osha_accident_injury_20200727.csv.zip
    so we tidy it to:
        https://enfxfr.dol.gov/data_catalog/OSHA/osha_accident_injury_20200727.csv.zip
    """
    return [u.replace("../data_catalog", "data_catalog") for u in urls]
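# Assumed helper: lxsoup() is presumably a thin wrapper that parses an HTML
# string into an lxml document so .xpath() can be called on the result.
# A minimal sketch of that assumption:
from lxml import html


def lxsoup(text):
    return html.fromstring(text)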
def main(stash_dir, target_dir):
    def _fix_header(fields):
        """
        we expect every actual data table to have a "load_dt" column as its
        last column, but some tables have load_date or ld_dt
        """
        header = fields.copy()
        if fields[-1] in ('load_date', 'ld_dt',):
            header[-1] = 'load_dt'
            myinfo(f"From {fields[-1]} to {header[-1]}", label='Fixed header')
        return header

    # def init_csv(seriesname):
    #     """seriesname is expected to be a string corresponding to
    #     a subdir like 'osha_violations/'
    #     """

    data_dirs = sorted(d for d in stash_dir.iterdir() if d.is_dir())
    myinfo(f"{stash_dir}", f"{len(data_dirs)} data directories", label="Main stash dir")
    target_dir.mkdir(parents=True, exist_ok=True)

    for datadir in data_dirs:
        src_paths = sorted(datadir.glob('*.csv'))
        myinfo(f"{datadir}", f"{len(src_paths)} files", label="Stash subdir")

        targetpath = target_dir.joinpath(f'{datadir.name}.csv')
        targetfile = open(targetpath, 'w')
        target = csv.writer(targetfile)

        _rowcount = 0
        for series_idx, srcpath in enumerate(src_paths):
            mylog(f"{series_idx}. {srcpath.name} | {existed_size(srcpath)} bytes",
                  label="Reading")
            with open(srcpath) as srcfile:
                src = csv.reader(srcfile)
                header = next(src)
                if series_idx == 0:
                    # first file in series, write header to target
                    xheader = _fix_header(header)
                    target.writerow(xheader)
                for row in src:
                    target.writerow(row)
                    _rowcount += 1

        myinfo(targetpath, f"{_rowcount} rows total (+ header)", label="Wrote")
        targetfile.close()
def viterbi(s, transitions, emissions):
    s_length = len(s)  # n. Rows
    num_of_states = len(emissions)  # k. Columns
    v = np.zeros((num_of_states, s_length), dtype=object)
    # each cell holds a tuple: (score, index of the state in the previous
    # column from which the maximum was chosen)
    v[0, 0] = (math.log(1), -1)
    # initialize v[i, 0]: there is no previous state because this is the
    # leftmost column
    for i in range(1, num_of_states):
        v[i, 0] = (mylog(emissions[0].get(s[0])), -1)

    for i in range(1, s_length):
        for j in range(0, num_of_states):
            curr_max = -math.inf
            max_prev_state_index = -1
            emission = emissions[j].get(s[i])
            for l in range(0, num_of_states):
                score = mylog(emission) + float(v[l, i - 1][0]) + mylog(transitions[l, j])
                if score > curr_max:
                    curr_max = score
                    max_prev_state_index = l
            v[j, i] = (curr_max, max_prev_state_index)

    # Find the max in the last column
    last_column_max = -math.inf
    prev_index = -1
    for idx in range(num_of_states):
        if v[idx, s_length - 1][0] > last_column_max:
            last_column_max = v[idx, s_length - 1][0]
            prev_index = idx

    # Reconstruct the best path by walking the back-pointers
    result = []
    for k in reversed(range(0, s_length)):
        result.append((s[k], prev_index, v[prev_index, k][0]))
        prev_index = v[prev_index, k][1]
    result.reverse()
    return result, last_column_max
def forward(s, transitions, emissions):
    s_length = len(s)  # n. Rows
    num_of_states = len(emissions)  # k. Columns
    f = np.zeros((num_of_states, s_length), dtype=float)
    # initialize f[i, 0], the leftmost column
    # Regular (non-log): f[0, 0] = 1
    f[0, 0] = math.log(1)
    for i in range(1, num_of_states):
        # Regular (non-log): f[i, 0] = 0
        f[i, 0] = mylog(0)

    for i in range(1, s_length):
        for j in range(0, num_of_states):
            emission = emissions[j].get(s[i])
            # log-space scores are negative, so start the running max at -inf
            a_max = -math.inf
            a_l = []
            for l in range(0, num_of_states):
                curr = f[l, i - 1] + mylog(transitions[l, j])
                if curr > a_max:
                    a_max = curr
                a_l.append(curr)
            # Regular (non-log) recurrence:
            # f[j, i] += f[l, i - 1] * transitions[l, j] * emission
            # log-sum-exp over the candidates computed above
            f[j, i] = 0
            for l in range(0, num_of_states):
                b_l = a_l[l] - a_max
                f[j, i] += math.exp(b_l)
            f[j, i] = mylog(f[j, i]) + a_max + mylog(emission)

    likelihood = 0
    for i in range(0, num_of_states):
        curr = f[i, s_length - 1]
        if curr > -math.inf:
            likelihood += curr
    print(f"forward likelihood is: {likelihood}")
    return f
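# A hypothetical toy HMM (2 states, alphabet {A, B}) showing how backward(),
# viterbi() and forward() above are meant to be called. The probabilities are
# made up, and this assumes numpy is imported as np and the mylog/myexp sketch
# given earlier; it is an illustration, not part of the original pipeline.
def _toy_hmm_example():
    toy_transitions = np.array([[0.9, 0.1],
                                [0.2, 0.8]])
    toy_emissions = [{'A': 0.7, 'B': 0.3},   # state 0
                     {'A': 0.1, 'B': 0.9}]   # state 1
    toy_s = "AABBB"

    f = forward(toy_s, toy_transitions, toy_emissions)
    b = backward(toy_s, toy_transitions, toy_emissions)
    path, best_score = viterbi(toy_s, toy_transitions, toy_emissions)
    print(path, best_score)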
def inserts(connection):
    def _get_paths():
        return sorted(INSERTS_DIR.glob("*.sql"))

    cursor = connection.cursor()
    cursor.execute(f"ATTACH DATABASE '{SRC_DB_PATH}' AS src_db;")
    cursor.execute(f"ATTACH DATABASE '{TARGET_DB_PATH}' AS target_db;")

    srcpaths = _get_paths()
    myinfo(f"{len(srcpaths)} INSERT SQL scripts", label="File count")

    for i, insertpath in enumerate(srcpaths):
        tname = re.match(r"insert_(\w+)", insertpath.stem).groups()[0]
        targettbl = f"target_db.{tname}"
        mylog(f"#{i+1} {targettbl}", insertpath, label="Running insert")
        stmt = insertpath.read_text()
        cursor.execute(stmt)
        myinfo(f"{count_rows(cursor, targettbl)} rows in {targettbl}", label="Inserted")
def main(dbpath):
    mylog(dbpath, label="Connecting to")
    conn = connect_to_db(dbpath)
    colindexes = collate_indexes(conn)

    outs = csv.DictWriter(
        stdout,
        fieldnames=("table_name", "rowcount", "colgroup", "pct_total_count"),
    )
    outs.writeheader()

    for tablename, colstrings in colindexes.items():
        total_rows = count_table_rows(conn, tablename)
        outs.writerow({"table_name": tablename, "rowcount": total_rows})
        for colstr in colstrings:
            d = {"table_name": tablename, "colgroup": colstr}
            d["rowcount"] = count_colgroup_rows(conn, tablename, colstr)
            d["pct_total_count"] = floor(100.0 * d["rowcount"] / total_rows)
            outs.writerow(d)

    conn.close()
def fetch_and_save(url, destpath):
    xb = existed_size(destpath)
    purl = Path(url)
    if xb:
        mylog(f"{destpath}", f"{xb} bytes", label="Exists")
        mylog(purl.name, purl.parent, label="Skipping")
    else:
        mylog(purl.name, purl.parent, label="Downloading")
        resp = fetch(url)
        destpath.parent.mkdir(exist_ok=True, parents=True)
        with open(destpath, "wb") as dest:
            for data in resp:
                dest.write(data)
        mylog(destpath, f"{existed_size(destpath)} bytes", label="Saved")
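# Assumed helpers for fetch_and_save(): existed_size() presumably returns a
# path's size in bytes when it already exists (and a falsy 0 otherwise), and
# fetch() presumably streams the response in chunks so the caller can write it
# to disk incrementally. Minimal sketches of those assumptions:
from pathlib import Path

import requests


def existed_size(path):
    p = Path(path)
    return p.stat().st_size if p.exists() else 0


def fetch(url, chunk_size=8192):
    resp = requests.get(url, stream=True)
    resp.raise_for_status()
    return resp.iter_content(chunk_size=chunk_size)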
def create_tables(connection, schema_path):
    def _parse_statements(txt):
        """
        assumes each create statement is delimited by CREATE_TABLE_DELIMITER

        Returns a dict, with table names as keys, create statements as values
        """
        d = {}
        create_stmts = [s.strip("\n ") for s in txt.split(CREATE_TABLE_DELIMITER)]
        create_stmts = [s for s in create_stmts if s]
        for t in create_stmts:
            stmt = t.strip()
            tbl = re.search(r'CREATE TABLE[^"]*?"([^"]+)" *\(', stmt).groups()[0]
            d[tbl] = stmt
        return d

    txt = schema_path.read_text()
    statements = _parse_statements(txt)
    myinfo(f"Read {len(statements.keys())} create table statements")
    for tbl, stmt in statements.items():
        mylog(f'CREATE TABLE "{tbl}"...')
        connection.cursor().execute(stmt)
def index_table(connection):
    mylog(INDEXES_PATH, label="Indexing tables")
    stmt = INDEXES_PATH.read_text()
    connection.cursor().execute(stmt)
def honestParty(pid, N, t, controlChannel, broadcast, receive, send, B=-1):
    # RequestChannel is called by the client and it is the client's duty to
    # broadcast the tx it wants to include
    if B < 0:
        B = int(math.ceil(N * math.log(N)))
    transactionCache = []
    finishedTx = set()
    proposals = []
    receivedProposals = False
    commonSet = []
    locks = defaultdict(lambda: Queue(1))
    doneCombination = defaultdict(lambda: False)
    ENC_THRESHOLD = N - 2 * t
    global finishcount
    encPK, encSKs = getEncKeys()
    encCounter = defaultdict(lambda: {})
    includeTransactionChannel = Queue()

    def probe(i):
        if len(encCounter[i]) >= ENC_THRESHOLD and receivedProposals and \
                not locks[i].full() and not doneCombination[i]:
            # by == this part only executes once.
            oriM = encPK.combine_shares(
                deserializeEnc(proposals[i][:ENC_SERIALIZED_LENGTH]),
                dict(itertools.islice(encCounter[i].iteritems(), ENC_THRESHOLD)))
            doneCombination[i] = True
            locks[i].put(oriM)

    def listener():
        while True:
            sender, msgBundle = receive()
            if msgBundle[0] == 'O':
                encCounter[msgBundle[1]][sender] = msgBundle[2]
                probe(msgBundle[1])
            else:
                # redirect to includeTransaction
                includeTransactionChannel.put((sender, msgBundle))

    Greenlet(listener).start()

    while True:
        op, msg = controlChannel.get()
        if op == "IncludeTransaction":
            if isinstance(msg, Transaction):
                # transactionCache.add(msg)
                transactionCache.append(msg)
            elif isinstance(msg, set):
                for tx in msg:
                    transactionCache.append(tx)
            elif isinstance(msg, list):
                transactionCache.extend(msg)
        elif op == "Halt":
            break
        elif op == "Msg":
            broadcast(eval(msg))  # now the msg is something we manually send

        mylog("timestampB (%d, %lf)" % (pid, time.time()), verboseLevel=-2)
        if len(transactionCache) < B:  # Let's wait for many transactions. : )
            time.sleep(0.5)
            print "Not enough transactions", len(transactionCache)
            continue

        oldest_B = transactionCache[:B]
        selected_B = random.sample(oldest_B, min(B / N, len(oldest_B)))
        print "[%d] proposing %d transactions" % (pid, min(B / N, len(oldest_B)))
        aesKey = random._urandom(32)
        # restored: encrypted_B is used to build the proposal below
        encrypted_B = encrypt(aesKey, ''.join(selected_B))
        encryptedAESKey = encPK.encrypt(aesKey)
        proposal = serializeEnc(encryptedAESKey) + encrypted_B

        mylog("timestampIB (%d, %lf)" % (pid, time.time()), verboseLevel=-2)
        commonSet, proposals = includeTransaction(
            pid, N, t, proposal, broadcast, includeTransactionChannel.get, send)
        mylog("timestampIE (%d, %lf)" % (pid, time.time()), verboseLevel=-2)
        receivedProposals = True
        for i in range(N):
            probe(i)

        for i, c in enumerate(commonSet):  # stx is the same for every party
            if c:
                share = encSKs[pid].decrypt_share(
                    deserializeEnc(proposals[i][:ENC_SERIALIZED_LENGTH]))
                broadcast(('O', i, share))
        mylog("timestampIE2 (%d, %lf)" % (pid, time.time()), verboseLevel=-2)

        recoveredSyncedTxList = []

        def prepareTx(i):
            rec = locks[i].get()
            encodedTxSet = decrypt(rec, proposals[i][ENC_SERIALIZED_LENGTH:])
            assert len(encodedTxSet) % TR_SIZE == 0
            recoveredSyncedTx = [
                encodedTxSet[i:i + TR_SIZE]
                for i in range(0, len(encodedTxSet), TR_SIZE)
            ]
            recoveredSyncedTxList.append(recoveredSyncedTx)

        thList = []
        for i, c in enumerate(commonSet):  # stx is the same for every party
            if c:
                s = Greenlet(prepareTx, i)
                thList.append(s)
                s.start()
        gevent.joinall(thList)
        mylog("timestampE (%d, %lf)" % (pid, time.time()), verboseLevel=-2)

        for rtx in recoveredSyncedTxList:
            finishedTx.update(set(rtx))
        mylog("[%d] %d distinct tx synced and %d tx left in the pool."
              % (pid, len(finishedTx), len(transactionCache) - len(finishedTx)),
              verboseLevel=-2)
        lock.get()
        finishcount += 1
        lock.put(1)
        if finishcount >= N - t:  # convenient for local experiments
            sys.exit()

    mylog("[%d] Now halting..." % (pid))
def _init_model(self):
    # Placeholders for the input values
    self.X = Input(shape=(self.params['img_height'], self.params['img_width'],
                          self.params['img_channels']))
    self.z = Input(shape=(self.params['z_dim'],))
    self.c = Input(shape=(self.params['c_dim'],))

    self.D = self.create_discriminator()
    self.E = self.create_encoder()
    self.G = self.create_generator()

    # Encoding of real image
    self.Z_hat = self.encode(self.X)
    # Fake image generated by G
    self.X_hat = self.generate(Concatenate()([self.z, self.c]))
    # Encoding of fake image
    self.Z_gen = self.encode(self.X_hat)

    # D prediction for real images
    D_enc = self.discriminate(self.X, self.Z_hat)
    # D prediction for generated images
    D_gen = self.discriminate(self.X_hat, Concatenate()([self.z, self.c]))

    # Get disentangled components of the encoding
    c_gen = self.Z_gen[:, self.params['z_dim']:]
    c_gen_cont = c_gen[:, self.params['num_disc_vars']:]
    c_cont = self.c[:, self.params['num_disc_vars']:]
    c_gen_cat = c_gen[:, :self.params['num_disc_vars']]
    c_cat = self.c[:, :self.params['num_disc_vars']]

    # Crossentropy in continuous variables
    cont_stddev_c_gen = K.ones_like(c_gen_cont)
    eps_c_gen = (c_cont - c_gen_cont) / (cont_stddev_c_gen + 1e-8)
    crossent_c_gen_cont = K.mean(
        -K.sum(0.5 * np.log(2 * np.pi) - mylog(cont_stddev_c_gen)
               - 0.5 * K.square(eps_c_gen), 1))
    # Crossentropy in categorical variables
    crossent_c_gen_cat = K.mean(-K.sum(mylog(c_gen_cat) * c_cat, 1))

    # Loss for Discriminator and Generator/Encoder
    D_loss = -K.mean(mylog(D_enc) + mylog(1 - D_gen))
    G_loss = -K.mean(mylog(D_gen) + mylog(1 - D_enc)) + \
        crossent_c_gen_cat + crossent_c_gen_cont

    # Collect the trainable weights
    weights_D = self.D.trainable_weights
    weights_GE = self.G.trainable_weights + self.E.trainable_weights

    training_updates_D = Adam(
        lr=self.params['lr_D'],
        beta_1=self.params['beta1_D'],
        decay=self.params['ld_D']).get_updates(weights_D, [], D_loss)
    training_updates_GE = Adam(
        lr=self.params['lr_G'],
        beta_1=self.params['beta1_G'],
        decay=self.params['ld_G']).get_updates(weights_GE, [], G_loss)

    self.train_D_fn = K.function(inputs=[self.X, self.z, self.c],
                                 outputs=[D_loss],
                                 updates=training_updates_D)
    self.train_GE_fn = K.function(inputs=[self.X, self.z, self.c],
                                  outputs=[G_loss],
                                  updates=training_updates_GE)
def insert_from_csv(connection, src_path):
    NULL_CELL_COUNT = 0
    NULL_ROW_COUNT = 0

    def _convert_blank_to_null(iterdata):
        """
        every cell is expected to be a string
        """
        nonlocal NULL_CELL_COUNT
        nonlocal NULL_ROW_COUNT
        for row in iterdata:
            _row_nulled = False
            for i, v in enumerate(row):
                val = v.strip()
                if not val:
                    row[i] = None
                    NULL_CELL_COUNT += 1
                    if _row_nulled is False:
                        _row_nulled = True
                        NULL_ROW_COUNT += 1
                else:
                    row[i] = val
            yield row

    def _get_insert_statement(tablename, fields):
        fields_qstr = ', '.join(fields)
        vals_qstr = ', '.join('?' for f in fields)
        return f"INSERT INTO {tablename}({fields_qstr}) VALUES ({vals_qstr})"

    def _get_table_name(path):
        """
        path can be anything from:
            osha_violation.csv
        to:
            osha_violation-004.csv

        Returns: violation
        """
        mx = re.match(r'osha_(\w+)(?:-\d+)?', path.stem)
        return mx.groups()[0]

    # ---------------------------------------------------------
    mylog(src_path.name, src_path.parent, label="Reading")
    srcfile = src_path.open()
    records = csv.reader(srcfile)
    tablename = _get_table_name(src_path)
    fieldnames = next(records)

    mylog(tablename, label="Inserting into table")
    iq = _get_insert_statement(tablename, fieldnames)
    # myinfo(iq, label="INSERT query")
    xrecords = _convert_blank_to_null(records)

    # cursor = connection.cursor()
    # cursor.executemany(iq, xrecords)
    # myinfo(f"{count_rows(cursor, tablename)} rows in table: {tablename}", label="Row count")
    with connection as conn:  # context helper provides MASSIVE speed boost
        cursor = conn.cursor()
        cursor.executemany(iq, xrecords)
        myinfo(f"{count_rows(cursor, tablename)} rows in table: {tablename}",
               label="Row count")

    myinfo(NULL_ROW_COUNT, label="Empty rows NULLED")
    myinfo(NULL_CELL_COUNT, label="Empty cells NULLED")
    srcfile.close()