def process_input(tweet):
    """Process input data to tokenize and make labels.

    Args:
        tweet: object with ``id``, ``text`` and ``topic`` attributes.

    Returns:
        (tweet.id, tokens, label) where ``label`` is the index of the
        tweet's topic in the module-level ``LABELS`` list.

    Raises:
        ValueError: if ``tweet.topic`` is not present in ``LABELS``.
    """
    tokens = tokenize(to_ascii(tweet.text))
    # list.index() never returns a negative value -- it raises ValueError for
    # unknown topics -- so the previous `assert label >= 0` was dead code
    # (and would have been stripped under `python -O` anyway).
    label = LABELS.index(tweet.topic)
    return tweet.id, tokens, label
def consume_l2(self, q):
    """Print a one-line summary of a layer-2 frame: timestamp, direction,
    pre-header (hex) and escaped payload, written to the module-level outfile."""
    stamp = datetime.datetime.fromtimestamp(q.time).strftime("%Y-%m-%dT%H:%M:%S")
    direction = "UL" if q.ul else "DL"
    payload = to_ascii(q.data, escape=True)
    print("%s %s <%-20s> %s" % (stamp, direction, q.prehdr.hex(":"), payload),
          file=outfile)
def search(origin, destination, date):
    """Query megabus.com for one-way journeys and scrape the result page.

    Args:
        origin: megabus origin station code (int or str).
        destination: megabus destination station code (int or str).
        date: outbound departure date as an "MM/DD/YYYY" string.

    Returns:
        list of Trip objects, one per journey found on the results page.
    """
    url = "http://us.megabus.com/JourneyResults.aspx"
    payload = {
        "originCode": str(origin),
        "destinationCode": str(destination),
        "outboundDepartureDate": str(date),
        "inboundDepartureDate": "",
        "passengerCount": "1",
        "transportType": "0",
        "concessionCount": "0",
        "nusCount": "0",
        "outboundWheelchairSeated": "0",
        "outboundOtherDisabilityCount": "0",
        "inboundWheelchairSeated": "0",
        "inboundOtherDisabilityCount": "0",
        "outboundPcaCount": "0",
        "inboundPcaCount": "0",
        "promotionCode": "",
        "withReturn": "0",
    }
    # timeout added so a stalled server can't hang the caller forever
    result = requests.get(url, params=payload, timeout=30)
    complete_url = result.url
    page = to_ascii(result.content).encode("ascii", "ignore")
    # Explicit parser: the no-argument form picks whatever parser happens to
    # be installed (and emits a warning), making output environment-dependent.
    soup = BeautifulSoup(page, "html.parser")

    # The requested date is loop-invariant; parse it once.
    d = datetime.datetime.strptime(date, "%m/%d/%Y").date()

    results = []
    # enumerate from 1: the original emitted url=(id + 1, ...) with a 0-based
    # index, so start=1 preserves the exact same numbering without shadowing
    # the `id` builtin.
    for idx, r in enumerate(soup.find_all(class_="journey standard"), start=1):
        second = r.find(class_="two")
        departs = list(second.p.children)
        depart_time = str(departs[2].strip())
        depart_city = str(departs[4].strip())
        depart_station = str(departs[8].strip())
        arrives = list(second.find(class_="arrive").children)
        arrive_time = str(arrives[2].strip())
        arrive_city = str(arrives[4].strip())
        arrive_station = str(arrives[8].strip())
        duration = r.find(class_="three").text.strip()
        cost = r.find(class_="five").text.strip()

        dt = datetime.datetime.strptime(depart_time, "%I:%M%p").time()
        at = datetime.datetime.strptime(arrive_time, "%I:%M%p").time()
        depart_time = datetime.datetime.combine(d, dt)
        if at < dt:
            # Arrival clock-time before departure means the trip crosses
            # midnight -- roll the arrival over to the next day.
            arrive_time = datetime.datetime.combine(d, at) + datetime.timedelta(days=1)
        else:
            arrive_time = datetime.datetime.combine(d, at)
        results.append(
            Trip(
                Stop(depart_time, depart_city, depart_station),
                Stop(arrive_time, arrive_city, arrive_station),
                duration,
                cost,
                url=(idx, complete_url),
            )
        )
    return results
def consume(self, msg):
    """Pretty-print one pager message to the module-level outfile.

    Format 5 payloads are printed as escaped ASCII with their checksum;
    format 3 payloads are printed raw and tagged "BCD".
    """
    date = datetime.datetime.fromtimestamp(
        msg.time).strftime("%Y-%m-%dT%H:%M:%S")
    # Renamed from `str`, which shadowed the builtin.
    line = "Message %07d %02d @%s (len:%d)" % (msg.ric, msg.seq, date, msg.pcnt)
    txt = msg.content
    # NOTE(review): for fmt values other than 5 and 3, `out` is never bound
    # and the final format below raises NameError -- presumably those formats
    # never reach this consumer; confirm against the producer.
    if msg.fmt == 5:
        out = to_ascii(txt, escape=True)
        line += " %3d" % msg.csum
    elif msg.fmt == 3:
        out = txt
        line += " BCD"
    # msg.correct indexes into (fail, OK) -- False selects " fail:".
    line += (" fail:", " OK:")[msg.correct]
    line += " %s" % (out)
    print(line, file=outfile)
def consume(self, q):
    """Print one decoded line: timestamp, channel|offset, direction,
    raw hex payload and its ASCII rendering, to the module-level outfile."""
    (data, time, ul, level, freq) = q
    # Renamed from `ul`-reassignment / `str`-shadowing in the original:
    # keep the boolean and the builtin intact.
    direction = "UL" if ul else "DL"
    text = to_ascii(data, True)
    # Split absolute frequency into channel number and in-channel offset.
    fbase = freq - base_freq
    fchan = int(fbase / channel_width)
    foff = fbase % channel_width
    freq_print = "%3d|%05d" % (fchan, foff)
    print("%15.6f %s %s %s | %s" % (time, freq_print, direction,
                                    data.hex(" "), text), file=outfile)
def do_run(args):
    """ Run the neural net to predict on new data.

    Reads tab-separated tweets from args.input, embeds their tokenized text
    with the word-vector model, and writes one row per tweet (id followed by
    one probability per label) to args.output.
    """
    # Load the model and weights
    model = load_model(args.model, args.weights)
    wvecs = WordVectorModel.from_file(args.wvecs, False, '*UNKNOWN*')
    # Lazy stream of (id, tokens) pairs -- nothing is materialized up front.
    data = ((tweet.id, tokenize(to_ascii(tweet.text)))
            for tweet in RowObjectFactory.from_stream(
                csv.reader(args.input, delimiter="\t")))
    writer = csv.writer(args.output, delimiter='\t')
    writer.writerow(['id'] + LABELS)
    for batch in tqdm(grouper(args.batch_size, data)):
        ids_batch, X_batch = zip(*batch)
        X_batch = wvecs.embed_sentences(X_batch)
        labels = model.predict_on_batch(X_batch)
        # `tweet_id` instead of `id`: don't shadow the builtin.
        for tweet_id, label in zip(ids_batch, labels):
            writer.writerow([tweet_id] + [float(l) for l in label])
def prepare_data(tweets):
    """Build training matrices from a stream of Tweet objects.

    Each tweet must expose ``text`` plus the per-candidate score attributes
    ``hc``, ``bs``, ``dt`` and ``tc``; those four scores form the label row
    (a one-hot-style vector).

    Returns:
        (X, y) as two numpy arrays: feature rows and label rows.
    """
    feature_rows = []
    label_rows = []
    for tw in tqdm(tweets):
        feature_rows.append(to_features(to_ascii(tw.text)))
        label_rows.append(
            [float_(tw.hc), float_(tw.bs), float_(tw.dt), float_(tw.tc)])
    return np.array(feature_rows), np.array(label_rows)
def process_l2(self, q):
    """Validate one SBD layer-2 frame and feed it through the reassembler.

    Returns the SBDObject for complete messages (single-fragment, or the
    final fragment of a multi-fragment message), and None when the fragment
    was queued for reassembly or dropped.  Updates the self.sbd_* counters
    and the self.multi list of in-progress multi-fragment messages.
    """
    (data, time, ul, _, _) = q  # level, freq
    # check for SBD
    if data[0] == 0x76:
        pass
    elif data[0] == 0x06 and data[1] == 0:
        pass
    else:
        return
    # corrupt / no data
    if len(data) < 5:
        return
    # uninteresing (unclear)
    if data[0] == 0x76 and data[1] == 5:
        return
    # Sanity-check the sub-type against the link direction:
    # uplink SBD uses 0x0c-0x0e, downlink uses 0x08-0x0b.
    if data[0] == 0x76:
        if ul:
            if data[1] < 0x0c or data[1] > 0x0e:
                print("WARN: SBD: ul pkt with unclear type", data.hex(":"), file=sys.stderr)
                return
        else:
            if data[1] < 0x08 or data[1] > 0x0b:
                print("WARN: SBD: dl pkt with unclear type", data.hex(":"), file=sys.stderr)
                return
    if data[0] == 0x06:
        if data[1] != 0x00:
            print("WARN: SBD: HELLO pkt with unclear type", data.hex(":"), file=sys.stderr)
            return
        elif data[2] not in (0x10, 0x20, 0x40, 0x50, 0x70):
            print("WARN: SBD: HELLO pkt with unknown sub-type", data.hex(":"), file=sys.stderr)
            return
    self.sbd_cnt += 1
    # Two-byte type id as hex string ("0600", "7608", "760c", ...).
    typ = "%02x%02x" % (data[0], data[1])
    data = data[2:]
    if typ == "0600":
        # Register/HELLO: fixed 29-byte pre-header; byte 15 carries the
        # message count (0 means no message payload follows).
        prehdr = data[:29]
        data = data[29:]
        msgcnt = prehdr[15]
        msgno = 1
        if msgcnt == 0:
            msgno = 0
        hdr = bytes()
    else:
        if typ == "7608":
            # First downlink fragment: pre-header length depends on its
            # first byte (0x26 -> 7 bytes, 0x20 -> 5 bytes); byte 3 is the
            # fragment count for the whole message.
            if data[0] == 0x26:
                prehdr = data[:7]
                data = data[7:]
            elif data[0] == 0x20:
                prehdr = data[:5]
                data = data[5:]
            else:
                print("WARN: SBD: DL pkt with unclear header", data.hex(":"), file=sys.stderr)
                prehdr = data[:7]
                data = data[7:]
            msgcnt = prehdr[3]
        else:
            prehdr = bytes()
            msgcnt = -1
        # Uplink fragments may carry a 3-byte 0x50 echo header; strip it.
        if ul and len(data) >= 3 and data[0] == 0x50:
            prehdr = data[:3]  # remove
            data = data[3:]
        if len(data) == 0:
            hdr = bytes()
            msgno = 0
        elif len(data) > 3 and data[0] == 0x10:
            hdr = data[:3]  # hdr: 0x10 len msg-cnt
            data = data[3:]
            msgno = hdr[2]
            # hdr[1] announces the payload length; reject short packets,
            # trim long ones down to the announced length.
            if len(data) < hdr[1]:
                if verb2:
                    print("SBD: Pkt too short", end=" ")
                    print("[%f] %2d/%2d %s <%s> <%s> %s" % (time, msgno, msgcnt, typ, prehdr.hex(":"), hdr.hex(":"), data.hex(":")))
                return
            elif len(data) > hdr[1]:
                if verb2:
                    print("SBD: Pkt too long", end=" ")
                    print("[%f] %2d/%2d %s <%s> <%s> %s" % (time, msgno, msgcnt, typ, prehdr.hex(":"), hdr.hex(":"), data.hex(":")))
                data = data[:hdr[1]]
        else:
            hdr = bytes()
            msgno = 0
            if verb2:
                print("SBD: Pkt weird:", end=" ")
                print("[%f] %2d/%2d %s <%s> <%s> %s" % (time, msgno, msgcnt, typ, prehdr.hex(":"), hdr.hex(":"), data.hex(":")))
    pkt = SBDObject(typ, time, ul, prehdr, data)
    if verb2 and (msgno > 1 or msgcnt > 1):
        print("[%f] %2d/%2d %s <%s> <%s> %s" % (time, msgno, msgcnt, typ, prehdr.hex(":"), hdr.hex(":"), to_ascii(data, escape=True)))
    # Expire partially-assembled messages older than 5 seconds.
    for (idx, (_, _, _, t)) in reversed(list(enumerate(self.multi[:]))):
        if t + 5 < time:
            if verb2:
                print("Expired one:", idx)
            self.sbd_broken += 1
            self.multi.pop(idx)
    if msgno == 0:  # mboxcheck
        self.sbd_short += 1
        return pkt
    elif msgcnt == 1 and msgno == 1:  # single-message
        self.sbd_single += 1
        return pkt
    elif msgcnt > 1:  # first new multi-packet
        # self.multi entries are [next-expected-msgno, total, pkt, time].
        self.multi.append([msgno, msgcnt, pkt, time])
        self.sbd_assembled += 1
        return None
    elif msgno > 1:  # addon
        ok = False
        for (idx, (no, cnt, p, t)) in reversed(list(enumerate(self.multi[:]))):
            if msgno == no + 1 and msgno < cnt and p.ul == ul:
                # could check if "typ" seems right.
                # Middle fragment: append payload, advance expected number.
                self.multi[idx][2].data += data
                self.multi[idx][0] += 1
                self.sbd_assembled += 1
                if verb2:
                    print("Merged: %f s" % (time - t))
                return None
            elif msgno == no + 1 and msgno == cnt and p.ul == ul:
                # could check if "typ" seems right.
                # Final fragment: finish the message and hand it back.
                p.data += data
                p.typ += typ
                self.multi.pop(idx)
                if verb2:
                    print("Merged & finished: %f s" % (time - t))
                self.sbd_assembled += 1
                self.sbd_multi += 1
                return p
        self.sbd_broken += 1
        if verb2:
            print("Couldn't attach subpkt.")
        return None
    else:
        raise Exception("Shouldn't happen:" + str(msgno) + str(msgcnt) + str(pkt.__dict__))
def consume_l2(self, q):
    """Decode an ACARS message out of a layer-2 frame and print it.

    Strips the framing (checksum, optional header), verifies CRC and 7-bit
    parity, splits the addressing fields, then pretty-prints either JSON or
    a one-line ASCII rendering to the module-level outfile, depending on
    config.args.  Decorates `q` with the decoded fields as attributes.
    """
    if len(q.data) == 0:
        # Currently not interested :)
        return
    if q.data[0] != 1:  # prelim. check for ACARS
        return

    def parity7(data):
        # Check odd parity on every byte, then strip the parity bit.
        ok = True
        for c in data:
            bits = bin(c).count("1")
            if bits % 2 == 0:
                ok = False
        return ok, bytes([x & 0x7f for x in data])

    # NOTE(review): q.errors is immediately overwritten with [] below;
    # the `= 0` assignment is dead.
    q.errors = 0
    csum = bytes()
    q.hdr = bytes()
    q.errors = []
    q.data = q.data[1:]
    # Trailing 0x7f marks a frame with a 2-byte checksum before it.
    if q.data[-1] == 0x7f:
        csum = q.data[-3:-1]
        q.data = q.data[:-3]
    if q.data[0] == 0x3:  # header of unknown meaning
        q.hdr = q.data[0:8]
        q.data = q.data[8:]
    if len(csum) > 0:
        # CRC over payload+checksum must come out zero.
        q.the_crc = self.acars_crc16(q.data + csum)
        if q.the_crc != 0:
            q.errors.append("CRC_FAIL")
    else:
        q.errors.append("CRC_MISSING")
    if len(q.data) < 13:
        q.errors.append("TRUNCATED")
        return  # throw away for now
    ok, data = parity7(q.data)
    if not ok:
        q.errors.append("PARITY_FAIL")
    # Fixed-position ACARS addressing fields.
    q.mode = data[0:1]
    q.f_reg = data[1:8]  # address / aircraft registration
    q.ack = data[8:9]
    q.label = data[9:11]
    q.b_id = data[11:12]  # block id
    data = data[12:]
    q.cont = False
    if data[-1] == 0x03:  # ETX
        data = data[:-1]
    elif data[-1] == 0x17:  # ETB
        # ETB instead of ETX: the message continues in another block.
        q.cont = True
        data = data[:-1]
    else:
        q.errors.append("ETX incorrect")
    if len(data) > 0 and data[0] == 2:  # Additional content
        # NOTE(review): the inner `data[0] == 2` check repeats the outer
        # condition, so the "STX missing" branch is unreachable as written.
        if data[0] == 2:
            if q.ul:
                q.seqn = data[1:5]  # sequence number
                q.f_no = data[5:11]  # flight number
                q.txt = data[11:]
            else:
                q.txt = data[1:]
        else:
            q.txt = data
            q.errors.append("STX missing")
    else:
        q.txt = bytes()
    # Unless 'showerrs' is configured, drop messages with any error.
    if len(q.errors) > 0 and not 'showerrs' in config.args:
        return
    q.timestamp = datetime.datetime.fromtimestamp(
        q.time).strftime("%Y-%m-%dT%H:%M:%S%z")
    # Strip leading padding dots from the tail/registration.
    while len(q.f_reg) > 0 and q.f_reg[0:1] == b'.':
        q.f_reg = q.f_reg[1:]
    # PRETTY-PRINT (json)
    if 'json' in config.args:
        out = {}
        out['header'] = q.hdr.hex()
        out['errors'] = " ".join(q.errors)
        # Map attribute names to JSON keys ("attr:jsonkey", or same name).
        for key in ('timestamp', 'mode', 'f_reg:tail', 'ack', 'label',
                    'b_id:block_id', 'txt:text', 'cont:continues',
                    'seqn:sequence_no', 'f_no:flight_no', 'ul:uplink'):
            okey, _, jkey = key.partition(':')
            if jkey == '':
                jkey = okey
            if okey in q.__dict__:
                val = q.__dict__[okey]
                if isinstance(val, bytes):
                    val = val.decode('ascii')
                out[jkey] = val
        out['source'] = {'transport': 'iridium', 'protocol': 'acars'}
        if config.station:
            out['source']['station_id'] = config.station
        print(json.dumps(out), file=outfile)
        return
    # PRETTY-PRINT (ascii)
    out = ""
    out += q.timestamp + " "
    if len(q.hdr) > 0:
        out += "[hdr: %s]" % q.hdr.hex()
    else:
        out += "%-23s" % ""
    out += " "
    if q.ul:
        out += "Dir:%s" % "UL"
    else:
        out += "Dir:%s" % "DL"
    out += " "
    out += "Mode:%s" % q.mode.decode('latin-1')
    out += " "
    out += "REG:%-7s" % q.f_reg.decode('latin-1')
    out += " "
    # 21 == NAK control character.
    if q.ack[0] == 21:
        out += "NAK "
    else:
        out += "ACK:%s" % q.ack.decode('latin-1')
    out += " "
    out += "Label:"
    if q.label == b'_\x7f':
        out += '_?'
    else:
        out += to_ascii(q.label, escape=True)
    out += " "
    if q.label in acars_labels:
        out += "(%s)" % acars_labels[q.label]
    else:
        out += "(?)"
    out += " "
    out += "bID:%s" % (to_ascii(q.b_id, escape=True))
    out += " "
    if q.ul:
        # seqn/f_no only exist on uplink (set above).
        out += "SEQ: %s, FNO: %s" % (to_ascii(
            q.seqn, escape=True), to_ascii(q.f_no, escape=True))
        out += " "
    if len(q.txt) > 0:
        out += "[%s]" % to_ascii(q.txt, escape=True)
    if q.cont:
        out += " CONT'd"
    if len(q.errors) > 0:
        out += " " + " ".join(q.errors)
    print(out, file=outfile)
def consume(self, q):
    """Pretty-print one decoded GSM-style layer-3 message (CC/MM/SMS/SBD).

    Classifies the message by its first two bytes, prints timestamp,
    frequency, direction and message type, then type-specific decoding of
    the remaining payload, all to the module-level outfile.
    """
    (data, time, ul, _, freq) = q
    if len(data) <= 2:
        return
    # Split absolute frequency into channel number and in-channel offset.
    fbase = freq - base_freq
    fchan = int(fbase / channel_width)
    foff = fbase % channel_width
    freq_print = "%3d|%05d" % (fchan, foff)
    if ul:
        ul = "UL"
    else:
        ul = "DL"
    tmaj = "%02x" % (data[0])
    tmin = "%02x%02x" % (data[0], data[1])
    if tmaj == "83" or tmaj == "89":  # Transaction Identifier set (destination side)
        # Mask the TI flag so destination-side messages map to the same minor type.
        tmin = "%02x%02x" % (data[0] & 0x7f, data[1])
    data = data[2:]
    # Protocol-discriminator byte -> human-readable protocol name.
    majmap = {
        "03": "CC",
        "83": "CC(dest)",
        "05": "MM",
        "06": "06",
        "08": "08",
        "09": "SMS",
        "89": "SMS(dest)",
        "76": "SBD",
    }
    # (discriminator, message-type) -> human-readable message name.
    minmap = {
        "0301": "Alerting",
        "0302": "Call Proceeding",
        "0303": "Progress",
        "0305": "Setup",
        "030f": "Connect Acknowledge",
        "0325": "Disconnect",
        "032a": "Release Complete",
        "032d": "Release",
        "0502": "Location Updating Accept",
        "0504": "Location Updating Reject",
        "0508": "Location Updating Request",
        "0512": "Authentication Request",
        "0514": "Authentication Response",
        "0518": "Identity request",
        "0519": "Identity response",
        "051a": "TMSI Reallocation Command",
        "0600": "Register/SBD:uplink",
        "0901": "CP-DATA",
        "0904": "CP-ACK",
        "0910": "CP-ERROR",
        "7605": "7605",
        "7608": "downlink #1",
        "7609": "downlink #2",
        "760a": "downlink #3+",
        "760c": "uplink initial",
        "760d": "uplink #2",
        "760e": "uplink #3",
    }
    if tmin in minmap:
        tstr = "[" + majmap[tmaj] + ": " + minmap[tmin] + "]"
    else:
        if tmaj in majmap:
            tstr = "[" + majmap[tmaj] + ": ?]"
        else:
            tstr = "[?]"
    typ = tmin
    # print >>outfile, "%15.6f"%(time),
    strtime = datetime.datetime.fromtimestamp(time, tz=Z).strftime(
        "%Y-%m-%dT%H:%M:%S.{:02.0f}Z".format(int((time % 1) * 100)))
    print("%s" % strtime, end=' ', file=outfile)
    print("%s %s [%s] %-36s" % (freq_print, ul, typ, tstr), end=' ', file=outfile)
    if typ in ("0600", "760c", "760d", "760e", "7608", "7609", "760a"):  # SBD
        prehdr = ""
        hdr = ""
        addlen = None
        # NOTE(review): `typ in ("0600")` is a substring test on the string
        # "0600", not tuple membership (missing comma); it happens to behave
        # correctly for 4-character type ids.
        if ul == 'UL' and typ in ("0600"):
            # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
            # 20:13:f0:10:02|IMEI |MOMSN|MC|_c|LEN| |TIME
            # 10:13:f0:10|TMSI? |LAC? |LAC? |00:00:00|MC| |TIME
            hdr = data[:29]
            if len(hdr) < 29:  # packet too short
                print("ERR:short", file=outfile)
                return
            data = data[29:]
            prehdr = "<" + hdr[0:4].hex(":")
            if hdr[0] == 0x20:
                prehdr += ",%02x" % hdr[4]
                # IMEI digits are BCD-packed, low nibble first.
                bcd = [
                    "%x" % (x >> s & 0xf) for x in hdr[5:13] for s in (0, 4)
                ]
                prehdr += "," + bcd[0] + ",imei:" + "".join(bcd[1:])
                prehdr += " MOMSN=%02x%02x" % (hdr[13], hdr[14])
                addlen = hdr[17]
            elif hdr[0] in (0x10, 0x40, 0x50, 0x70):
                prehdr += "," + "".join(["%02x" % x for x in hdr[4:8]])
                prehdr += ",%02x%02x" % (hdr[8], hdr[9])
                prehdr += ",%02x%02x" % (hdr[10], hdr[11])
                prehdr += ",%02x%02x%02x" % (hdr[12], hdr[13], hdr[14])
            else:
                prehdr += "[ERR:hdrtype]"
                prehdr += " " + hdr[4:15].hex(":")
            prehdr += " msgct:%d" % hdr[15]
            prehdr += " " + hdr[16:25].hex(":")
            # Trailing 4 bytes carry an Iridium timestamp.
            ts = hdr[25:]
            tsi = int(ts.hex(), 16)
            _, strtime = fmt_iritime(tsi)
            prehdr += " t:" + strtime
            prehdr += ">"
            hdr = ""
        elif ul == 'UL' and typ in ("760c", "760d", "760e"):
            if data[0] == 0x50:
                # <50:xx:xx> MTMSN echoback?
                prehdr = data[:3]
                data = data[3:]
                prehdr = "<" + prehdr.hex(":") + ">"
        elif ul == 'DL' and typ in ("7608", "7609", "760a"):
            if typ == "7608":
                # <26:44:9a:01:00:ba:85>
                # 1: always? 26
                # 2+3: sequence number (MTMSN)
                # 4: number of packets in message
                # 5: number of messages waiting to be delivered / backlog
                # 6+7: unknown / maybe MOMSN?
                #
                # <20:33:17:03:01>
                # fields same as above except 6+7
                if data[0] == 0x26:
                    prehdr = data[:7]
                    data = data[7:]
                    prehdr = "<" + prehdr.hex(":") + ">"
                elif data[0] == 0x20:
                    prehdr = data[:5]
                    data = data[5:]
                    prehdr = "<" + prehdr.hex(":") + ">"
                else:
                    prehdr = "<ERR:prehdr_type?>"
        else:
            prehdr = "<ERR:nomatch>"
        print("%-22s %-10s " % (prehdr, hdr), end=' ', file=outfile)
        if typ != "0600" and len(data) > 0:
            if data[0] == 0x10:
                # <10:87:01>
                # 1: always 10
                # 2: length in bytes of message
                # 3: number of packet (760c => 2, 760d => 3, 760e => 4)
                #    (7608 => 1, 7609 => 2, 760a => 3+)
                hdr = data[:3]
                data = data[3:]
                addlen = hdr[1]
                hdr = "<" + hdr.hex(":") + ">"
            else:
                print("ERR:no_0x10", end=" ", file=outfile)
        # Cross-check payload length against the announced length, if any.
        if addlen is not None and len(data) != addlen:
            print("ERR:len(%d!=%d)" % (len(data), addlen), end=" ", file=outfile)
        # > 0600 / 10:13:f0:10: tmsi+lac+lac+00 +bytes
        # < 0605 ?
        # > 0508 Location Updating Request
        # < 0512 Authentication Request
        # > 0514 Authentication Response
        # < 051a TMSI reallocation command [09 f1 30](MCC/MNC/LAC) + [08 f4]TMSI
        # < 0518 Identity request 02: IMEI
        # > 0519 Identity response (IMEI)
        # < 0502 Location Updating Accept (MCC/MNC/LAC)
        # > 0600 / 20:13:f0:10: 02 imei + momsn + msgcnt + XC + len + bytes + time + (len>0: msg)
        # < 7608 <26:00:00:00:00:xx:xx> 0 messages (xx=MTMSN?)
        # > 760c <50:xx:xx> MTMSN echoback?
        # < 7605 ?
    elif typ == "032d":  # CC Release
        if len(data) == 4 and data[0] == 8:
            data = data[1:]
            (rv, data) = p_disc(data)
            print("%s" % (rv), end=' ', file=outfile)
    elif typ == "032a":  # CC Release Complete
        if len(data) == 4 and data[0] == 8:
            data = data[1:]
            (rv, data) = p_disc(data)
            print("%s" % (rv), end=' ', file=outfile)
    elif typ == "0325":  # CC Disconnect
        (rv, data) = p_disc(data)
        print("%s" % (rv), end=' ', file=outfile)
    elif typ == "0502":  # Loc up acc.
        (rv, data) = p_lai(data)
        print("%s" % (rv), end=' ', file=outfile)
        # Optional mobile identity IE (0x17) and Follow-on Proceed (0xa1).
        if len(data) >= 1 and data[0] == 0x17:
            data = data[1:]
            (rv, data) = p_mi_iei(data)
            print("%s" % (rv), end=' ', file=outfile)
        if len(data) >= 1 and data[0] == 0xa1:
            print("Follow-on Proceed", end=' ', file=outfile)
            data = data[1:]
    elif typ == "0508":  # Loc up req.
        if data[0] & 0xf == 0 and data[
                6] == 0x28:  # 6 == Mobile station classmark
            # High nibble of byte 0 is the ciphering key sequence number.
            if data[0] >> 4 == 7:
                print("key=none", end=' ', file=outfile)
            else:
                print("key=%d" % (data[0] >> 4), end=' ', file=outfile)
            data = data[1:]
            (rv, data) = p_lai(data)
            print("%s" % (rv), end=' ', file=outfile)
            data = data[1:]  # skip classmark
            (rv, data) = p_mi_iei(data)
            print("%s" % (rv), end=' ', file=outfile)
    elif typ == "051a":  # TMSI realloc.
        (rv, data) = p_lai(data)
        print("%s" % (rv), end=' ', file=outfile)
        (rv, data) = p_mi_iei(data)
        print("%s" % (rv), end=' ', file=outfile)
    elif typ == "0504":  # Loc up rej.
        if data[0] == 2:
            print("02(IMSI unknown in HLR)", end=' ', file=outfile)
            data = data[1:]
    elif typ == "0518":  # Identity Req
        if data[0] == 2:
            print("02(IMEI)", end=' ', file=outfile)
            data = data[1:]
        elif data[0] == 1:
            print("01(IMSI)", end=' ', file=outfile)
            data = data[1:]
    elif typ == "0519":  # Identity Resp.
        (rv, data) = p_mi_iei(data)
        print("[%s]" % (rv), end=' ', file=outfile)
    # Whatever remains undecoded: dump as hex plus dotted ASCII.
    if len(data) > 0:
        print(" ".join("%02x" % x for x in data), end=' ', file=outfile)
        print(" | %s" % to_ascii(data, dot=True), file=outfile)
    else:
        print("", file=outfile)
    return