        Successes.value += 1
        Successes.release()
        return 'S'
    except KeyboardInterrupt:
        return '-'
    except superfastmatch.SuperFastMatchError, e:
        Failures.acquire()
        Failures.value += 1
        Failures.release()
        if e.status in (502, '502'):
            warn("The SuperFastMatch server is down.")
        else:
            warn("Exception {2!r} caught while searching for {0} character string: {1!r}".format(
                len(searchtext), searchtext[:40] + '...', e))
        return 'F'
    except Exception, e:
        Failures.acquire()
        Failures.value += 1
        Failures.release()
        warn("Exception {2!r} caught while searching for {0} character string: {1!r}".format(
            len(searchtext), searchtext[:40] + '...', e))
        return 'F'
    except:
        Failures.acquire()
        Failures.value += 1
        Failures.release()
        warn("Untyped exception caught while searching for {0} character string: {1!r}".format(
            len(searchtext), searchtext[:40] + '...'))
        return 'F'


def go(url, dirname, statuschar, workernum, modulo, textrange):
    def info(msg):
        print u"[{0}] {1}".format(workernum, msg)
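# A minimal sketch (not from the original source) of the shared-counter
# pattern above: a multiprocessing.Value guarded by its own lock so that
# concurrent workers can increment it safely; try/finally guarantees the
# release even if the increment raises.
from multiprocessing import Value

Failures = Value('i', 0)

def record_failure():
    Failures.acquire()
    try:
        Failures.value += 1
    finally:
        Failures.release()
    # equivalently: with Failures.get_lock(): Failures.value += 1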
class ApiWorker(Process, TankWorker):
    SECTION = 'core'
    FINISH_FILENAME = 'finish_status.yaml'

    def __init__(self, manager, config_paths, cli_options=None, cfg_patches=None,
                 cli_args=None, no_local=False, log_handlers=None, wait_lock=False,
                 files=None, ammo_file=None):
        Process.__init__(self)
        TankWorker.__init__(self, configs=config_paths, cli_options=cli_options,
                            cfg_patches=cfg_patches, cli_args=cli_args, no_local=no_local,
                            log_handlers=log_handlers, wait_lock=wait_lock, files=files,
                            ammo_file=ammo_file, api_start=True, manager=manager)
        self._status = Value(ctypes.c_char_p, Status.TEST_INITIATED)
        self._test_id = Value(ctypes.c_char_p, self.core.test_id.encode('utf8'))
        self._retcode = Value(ctypes.c_int, 0)
        self._msg = Value(ctypes.c_char_p, b'')

    @property
    def test_id(self):
        return self._test_id.value.decode('utf8')

    @property
    def status(self):
        self._status.acquire()
        res = self._status.value
        self._status.release()
        return res

    @status.setter
    def status(self, val):
        self._status.acquire()
        self._status.value = val
        self._status.release()

    @property
    def retcode(self):
        return self._retcode.value

    @retcode.setter
    def retcode(self, val):
        self._retcode.value = val

    @property
    def msg(self):
        self._msg.acquire()
        res = self._msg.value.decode('utf8')
        self._msg.release()
        return res

    @msg.setter
    def msg(self, val):
        value = val.encode('utf8')
        self._msg.acquire()
        self._msg.value = value
        self._msg.release()

    def run(self):
        with Cleanup(self) as add_cleanup:
            lock = self.get_lock()
            add_cleanup('release lock', lock.release)
            self.status = Status.TEST_PREPARING
            logger.info('Created a folder for the test. %s' % self.folder)
            self.core.plugins_configure()
            add_cleanup('plugins cleanup', self.core.plugins_cleanup)
            self.core.plugins_prepare_test()
            with Finish(self):
                self.status = Status.TEST_RUNNING
                self.core.plugins_start_test()
                self.retcode = self.core.wait_for_finish()
            self.status = Status.TEST_POST_PROCESS
            self.retcode = self.core.plugins_post_process(self.retcode)
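# Hedged sketch (not part of the original worker): a ctypes.c_char_p Value
# stores a raw pointer in shared memory, and the multiprocessing docs warn
# that such a pointer may be invalid when dereferenced from another process.
# A fixed-size c_char Array is a safer way to share short strings; the
# 256-byte size and the helper names here are illustrative assumptions.
import ctypes
from multiprocessing import Array

_msg_buf = Array(ctypes.c_char, 256)  # zero-initialized shared byte buffer

def set_msg(val):
    data = val.encode('utf8')[:255]  # leave room for the NUL terminator
    with _msg_buf.get_lock():
        _msg_buf.value = data  # c_char arrays expose bytes via .value

def get_msg():
    with _msg_buf.get_lock():
        return _msg_buf.value.decode('utf8')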
class SSD1306_I2C:
    def __init__(self, height=64, width=128, pageTime=1):
        self.i2c = busio.I2C(board.SCL, board.SDA)
        self.height = height
        self.width = width
        self.oled = OledText(self.i2c, self.width, self.height)
        self.pageTime = pageTime
        self.oled.auto_show = False  # disable auto write on the oled_text module
        # 0xdeadbeef marks "no data yet"; note that c_int stores it truncated,
        # so the freshness checks below always test the c_double timestamps.
        self.lastDetection = Value(ctypes.c_bool, False)
        self.lastTime = Value(ctypes.c_double, 0xdeadbeef)
        self.lastTemperature = Value(ctypes.c_double, 0xdeadbeef)
        self.lastHumidity = Value(ctypes.c_double, 0xdeadbeef)
        self.lastTempUpdate = Value(ctypes.c_double, 0xdeadbeef)
        self.lastHumUpdate = Value(ctypes.c_double, 0xdeadbeef)
        self.lastPotentiometer = Value(ctypes.c_int, 0xdeadbeef)
        self.lastPotentiometerUpdate = Value(ctypes.c_double, 0xdeadbeef)

    def __write(self, text, line):
        self.oled.text(text, line)

    def __clearAll(self):
        self.oled.clear()

    def __clear(self, line):
        self.oled.text("", line)

    def __setLayout(self, layout):
        self.oled.layout = layout

    def __show(self):
        self.oled.show()

    def updatePIR(self, data):
        self.lastDetection.acquire()
        self.lastTime.acquire()
        self.lastDetection.value = data['detection']
        self.lastTime.value = data['timestamp']
        self.lastTime.release()
        self.lastDetection.release()

    def updateDHT22(self, data):
        self.lastTemperature.acquire()
        self.lastHumidity.acquire()
        self.lastTempUpdate.acquire()
        self.lastHumUpdate.acquire()
        self.lastTemperature.value = data['temperature']
        self.lastHumidity.value = data['humidity']
        self.lastTempUpdate.value = data['temperatureTimestamp']
        self.lastHumUpdate.value = data['humidityTimestamp']
        self.lastHumUpdate.release()
        self.lastTemperature.release()
        self.lastHumidity.release()
        self.lastTempUpdate.release()

    def updateSerial(self, data):
        self.lastPotentiometer.acquire()
        self.lastPotentiometerUpdate.acquire()
        self.lastPotentiometer.value = data['potentiometer']
        self.lastPotentiometerUpdate.value = data['timestamp']
        self.lastPotentiometerUpdate.release()
        self.lastPotentiometer.release()

    def start(self, DHT22Connection, PIRConnection, SerialConnection):
        self.displayProcess = Process(target=self.__updateDisplay,
                                      args=(DHT22Connection, PIRConnection, SerialConnection,))
        self.displayProcess.start()

    def stop(self):
        if self.displayProcess.is_alive():
            self.displayProcess.join()

    def __showDHT22(self):
        # Set a four-line layout for the DHT22 page
        self.__setLayout({
            1: SmallLine(1, 1, font="FreeSans.ttf", size=14),
            2: SmallLine(1, 17, font="FreeSans.ttf", size=14),
            3: SmallLine(1, 33, font="FreeSans.ttf", size=14),
            4: SmallLine(1, 49, font="FreeSans.ttf", size=14)
        })
        self.lastTemperature.acquire()
        self.lastHumidity.acquire()
        self.lastTempUpdate.acquire()
        self.lastHumUpdate.acquire()
        if int(self.lastTempUpdate.value) == 0xdeadbeef or int(self.lastHumUpdate.value) == 0xdeadbeef:
            # no data has been received yet
            self.lastHumUpdate.release()
            self.lastTempUpdate.release()
            self.lastHumidity.release()
            self.lastTemperature.release()
            return
        tempTime = time.localtime(self.lastTempUpdate.value)
        tempTime = time.strftime('%H:%M', tempTime)
        humTime = time.localtime(self.lastHumUpdate.value)
        humTime = time.strftime('%H:%M', humTime)
        self.__write(f'Temperature: {self.lastTemperature.value:.2f}', 1)
        self.__write(f'At time: {tempTime}', 2)
        self.__write(f'Humidity: {self.lastHumidity.value:.2f}', 3)
        self.__write(f'At time: {humTime}', 4)
        self.__show()
        self.lastTemperature.release()
        self.lastHumidity.release()
        self.lastTempUpdate.release()
        self.lastHumUpdate.release()

    def __showPIR(self):
        # Set a two-line layout for the PIR page
        self.__setLayout({
            1: SmallLine(1, 2, font="FreeSans.ttf", size=14),
            2: SmallLine(1, 18, font="FreeSans.ttf", size=14)
        })
        self.lastDetection.acquire()
        self.lastTime.acquire()
        if int(self.lastTime.value) == 0xdeadbeef:  # check if data has been received
            self.lastTime.release()
            self.lastDetection.release()
            return
        readTime = time.localtime(self.lastTime.value)
        readTime = time.strftime('%H:%M', readTime)
        self.__write(f'Position: {self.lastDetection.value}', 1)
        self.__write(f'At time: {readTime}', 2)
        self.__show()
        self.lastTime.release()
        self.lastDetection.release()

    def __showSerial(self):
        # Set a two-line layout for the serial page
        self.__setLayout({
            1: SmallLine(1, 2, font="FreeSans.ttf", size=14),
            2: SmallLine(1, 18, font="FreeSans.ttf", size=14)
        })
        self.lastPotentiometer.acquire()
        self.lastPotentiometerUpdate.acquire()
        if int(self.lastPotentiometerUpdate.value) == 0xdeadbeef:  # check if data has been received
            self.lastPotentiometerUpdate.release()
            self.lastPotentiometer.release()
            return
        readTime = time.localtime(self.lastPotentiometerUpdate.value)
        readTime = time.strftime('%H:%M', readTime)
        self.__write(f'Potentiometer: {self.lastPotentiometer.value}', 1)
        self.__write(f'At time: {readTime}', 2)
        self.__show()
        self.lastPotentiometerUpdate.release()
        self.lastPotentiometer.release()

    def __updateDisplay(self, DHT22Connection, PIRConnection, SerialConnection):
        try:
            # Count how many pages need to be shown
            cycles = 0
            if DHT22Connection:
                cycles = cycles + 1
            if PIRConnection:
                cycles = cycles + 1
            if SerialConnection:
                cycles = cycles + 1
            while True:
                try:
                    if DHT22Connection:
                        self.__showDHT22()
                        time.sleep(self.pageTime / cycles)
                    if PIRConnection:
                        self.__showPIR()
                        time.sleep(self.pageTime / cycles)
                    if SerialConnection:
                        self.__showSerial()
                        time.sleep(self.pageTime / cycles)
                except OSError:
                    print("display disconnected", file=sys.stderr)
                    try:
                        self.oled = OledText(self.i2c, self.width, self.height)
                        time.sleep(1)
                    except Exception:
                        print("no display found", file=sys.stderr)
        except KeyboardInterrupt:
            sys.exit(0)
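# Hedged usage sketch (not part of the original module): driving the display
# from the parent process. The connection flags and the sample payload are
# assumptions; real payloads come from the sensor classes later in this section.
if __name__ == '__main__':
    display = SSD1306_I2C(height=64, width=128, pageTime=3)
    display.start(DHT22Connection=False, PIRConnection=True, SerialConnection=False)
    # Feed a fresh reading into the shared Values the display process reads.
    display.updatePIR({'detection': True, 'timestamp': time.time()})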
def process_project(queue, args):
    nodes = args.nodes.replace(" ", "").split(',')
    i = 0
    for file_path in queue['files_to_process']:
        print ""
        print "### File " + str(i + 1) + " of " + str(queue['count']) + ": " + file_path
        start_time = datetime.now()
        print "### " + str(start_time)
        print ""

        # First, let's instantiate the EVTX object
        evtxObject = evtx.Evtx(file_path)

        # Count the records across every chunk header
        total_records = 0
        with evtxObject as log:
            chunks = iter(log.chunks())
            try:
                while True:
                    chunk = next(chunks)
                    if chunk.log_last_record_number() != 0 and chunk.log_first_record_number() != 0:
                        records = (chunk.log_last_record_number() - chunk.log_first_record_number()) + 1
                        total_records = total_records + records
            except StopIteration:
                pass

        if total_records == 0:
            print "No logs in file, skipping..."
            i = i + 1
            continue

        # We need to calculate how many records to assign to each process
        record_batch = (total_records / args.cores)
        remainder = total_records - (record_batch * args.cores)
        store_remainder = remainder

        # If record_batch is not zero, there is at least one record per task
        Tasks = {}
        start = 0
        if record_batch > 0:
            for task in range(args.cores):
                if remainder > 0:
                    task_batch = record_batch + 1
                    remainder = remainder - 1
                else:
                    task_batch = record_batch
                Tasks[task] = {}
                Tasks[task]['count'] = task_batch
                Tasks[task]['start'] = start
                start = start + task_batch
        # Log files with no records have already been caught and skipped above
        else:
            # There is less than one record per task
            for task in range(args.cores):
                if remainder > 0:
                    task_batch = record_batch + 1
                    remainder = remainder - 1
                else:
                    task_batch = 0
                Tasks[task] = {}
                Tasks[task]['count'] = task_batch
                Tasks[task]['start'] = start
                start = start + task_batch

        print "There are " + str(total_records) + " logs in total."
        print "Allocating " + str(record_batch) + " logs per process with " + str(store_remainder) + " remainder."

        procs = []
        proc = []

        # Shared values for processing statistics
        GlobalRecordCount = Value('i', lock=True)  # Will store count
        GlobalPercentageComplete = Value('i', lock=True)
        GlobalTiming = Value('f', lock=True)
        TooShortToTime = Value('i', lock=True)

        # Some of these values need to be set
        try:
            if GlobalTiming.acquire(True, 1000):
                GlobalTiming.value = time.time()
            if GlobalPercentageComplete.acquire(True, 1000):
                GlobalPercentageComplete.value = 0
            if TooShortToTime.acquire(True, 1000):
                TooShortToTime.value = 0
        except:
            pass
        finally:
            GlobalTiming.release()
            GlobalPercentageComplete.release()
            TooShortToTime.release()

        # TODO: Create a single dedicated process to handle web submissions and reporting
        # Form the queue!
        support_queue = Queue()
        supportproc = Process(target=elastic.core_posting_worker,
                              args=(support_queue, GlobalRecordCount, GlobalPercentageComplete,
                                    GlobalTiming, TooShortToTime),
                              name="support")
        supportproc.start()

        # Start the processes for parsing event data
        for process in range(args.cores):
            proc_name = str(process)
            proc = Process(target=parserecord,
                           args=(Tasks[process], file_path, total_records, int(args.buffer),
                                 args.index, nodes, GlobalRecordCount, GlobalPercentageComplete,
                                 GlobalTiming, TooShortToTime, args.debug, support_queue,
                                 args.token),
                           name=proc_name)
            procs.append(proc)
            proc.start()

        # Wait for all processes to complete
        for proc in procs:
            proc.join()

        # Once all processes are joined, send the stop command to the support core
        support_queue.put("STOP")
        supportproc.join()

        i = i + 1
        end_time = datetime.now()
        duration = end_time - start_time
        print ""
        print "File " + str(i) + " completed in: " + str(duration)
        print ""
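# Hedged sketch (illustrative, not from the original): the batch-splitting
# arithmetic above isolated as a helper, written in Python 3 (// matches the
# Python 2 integer division used in the function). It distributes
# total_records across `cores` tasks, spreading the remainder one record at
# a time so no task gets more than one extra.
def split_records(total_records, cores):
    record_batch = total_records // cores
    remainder = total_records - record_batch * cores
    tasks, start = {}, 0
    for task in range(cores):
        task_batch = record_batch
        if remainder > 0:
            task_batch += 1
            remainder -= 1
        tasks[task] = {'count': task_batch, 'start': start}
        start += task_batch
    return tasks

# e.g. split_records(10, 4) -> counts 3, 3, 2, 2 starting at 0, 3, 6, 8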
class MuseMonitor():
    def __init__(self, server=None, port=None, debug=False):
        self.server = server
        self.port = port
        self.window_size = 1024
        self.sample_rate = 256
        self._buffer = []
        self._attention_buff = [.5, .5, .5, .5, .5]
        self._meditation_buff = [.5, .5, .5, .5, .5]
        self._running_stats = {'att': RunningStats(), 'med': RunningStats()}
        self.scaler = joblib.load('res/scaler')
        self.raw = Value('d', 0)
        self.attention = Value('d', 0)
        self.meditation = Value('d', 0)
        self.waves = Manager().dict()
        if not debug:
            process = Process(target=self._run)
            process.daemon = True
            process.start()

    def _get_dispatcher(self):
        d = dispatcher.Dispatcher()
        d.map("/debug", print)
        d.map("/muse/eeg", self._eeg_handler, "EEG")
        return d

    def _get_fft(self, raw_list):
        fft = np.abs(np.fft.rfft(raw_list - np.mean(raw_list))) * 2 / self.window_size
        freqs = np.fft.rfftfreq(self.window_size, 1 / self.sample_rate)
        return [freqs, fft]

    def _get_bands(self, raw_list):
        band_list = {b: [] for b in BAND_RANGE}
        freqs, fft = self._get_fft(raw_list)
        for freq, amps in zip(freqs, fft):
            for b in BAND_RANGE:
                low, high = BAND_RANGE[b]
                if low <= freq < high:
                    band_list[b] += [amps]
        for b in band_list:
            band_list[b] = np.mean(band_list[b]) ** 2
        return band_list

    def _reject_outliers(self, data, m=3):
        data = np.array(data)
        Q3 = np.percentile(data, 75)
        Q1 = np.percentile(data, 25)
        IQR = (Q3 - Q1) * m
        return data[(data > Q1 - IQR) & (data < Q3 + IQR)]

    def _is_wearing(self, key, val):
        return val < 5 and self._running_stats[key].get_std() < 0.5

    def _calibrate(self, key, val):
        self._running_stats[key].update(val)
        if not self._is_wearing(key, val):
            self._running_stats[key].clear()
            val = 0
        if self._running_stats[key].get_count() > 5:
            val = (val - self._running_stats[key].get_mean()) / self._running_stats[key].get_std() * 0.23 + 0.5
        return min(1, max(1e-5, val))

    def _convert_to_mindwave(self, band, value):
        mind_c, muse_c, mind_mean, muse_mean = CONVERT_MAP[band]
        value = value / 1.8 * 4096 * 2
        return ((value ** (1 / muse_c)) - muse_mean + mind_mean) ** mind_c

    def _attention(self, waves):
        waves = waves.copy()
        for band in waves:
            waves[band] = self._convert_to_mindwave(band, waves[band])
        for i in range(5):
            waves[f'attention-{i+1}'] = self._attention_buff[i]
        for i in list(waves):
            waves[f'log2-{i}'] = np.log2(waves[i])
        waves['log2-theta-alpha'] = np.log2(waves['theta'] + waves['low-alpha'] + waves['high-alpha'])
        wave_array = np.array([[val for val in waves.values()]])
        wave_transformed = self.scaler.transform(wave_array)
        att = np.sum(wave_transformed * ATT_COEF) + ATT_INTERCEPT
        att = self._calibrate('att', att)
        if 0 < att < 1:
            self._attention_buff = [att] + self._attention_buff[:-1]
        return att

    def _meditation(self, waves):
        waves = waves.copy()
        for band in waves:
            waves[band] = self._convert_to_mindwave(band, waves[band])
        for i in range(5):
            waves[f'meditation-{i+1}'] = self._meditation_buff[i]
        for i in list(waves):
            waves[f'log2-{i}'] = np.log2(waves[i])
        waves['log2-theta-alpha'] = np.log2(waves['theta'] + waves['low-alpha'] + waves['high-alpha'])
        wave_array = np.array([[val for val in waves.values()]])
        wave_transformed = self.scaler.transform(wave_array)
        med = np.sum(wave_transformed * MED_COEF) + MED_INTERCEPT
        med = self._calibrate('med', med)
        if 0 < med < 1:
            self._meditation_buff = [med] + self._meditation_buff[:-1]
        return med

    def _eeg_handler(self, unused_addr=None, args=None, TP9=None, AF7=None, AF8=None, TP10=None, AUX=None):
        self.raw.acquire()
        self.raw.value = AF7 - TP9  # TP9: behind the left ear, AF7: left forehead
        self.raw.release()
        self._buffer.append(self.raw.value)
        if len(self._buffer) > self.window_size:
            self._buffer = self._buffer[1:]
            self.waves.update(self._get_bands(self._buffer))
            # slide the window forward by one second of samples
            self._buffer = self._buffer[self.sample_rate:]
            new_attention = self._attention(self.waves)
            self.attention.acquire()
            self.attention.value = np.round(new_attention, 2) * 100
            self.attention.release()
            new_meditation = self._meditation(self.waves)
            self.meditation.acquire()
            self.meditation.value = np.round(new_meditation, 2) * 100
            self.meditation.release()

    def _run(self):
        server = osc_server.BlockingOSCUDPServer((self.server, self.port), self._get_dispatcher())
        try:
            server.serve_forever()
        except KeyboardInterrupt:
            server.server_close()
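# Hedged usage sketch (not part of the original): reading the shared
# attention/meditation Values from the parent process while the OSC server
# runs in the daemon child. The host/port values are assumptions.
if __name__ == '__main__':
    import time
    monitor = MuseMonitor(server='0.0.0.0', port=5000)
    while True:
        print(f"attention={monitor.attention.value:.0f} "
              f"meditation={monitor.meditation.value:.0f}")
        time.sleep(1)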
class PIRSensor:
    def __init__(self, pin, frequency, calibrationTime=0):
        self.dataPin = pin
        self.startTime = time.time()
        self.calibrationTime = calibrationTime
        self.valueReaded = Value(ctypes.c_bool, False)
        self.readTime = Value(ctypes.c_double, 0xdeadbeef)
        self.frequency = frequency
        GPIO.setmode(GPIO.BCM)  # Set BCM pin enumeration
        GPIO.setup(self.dataPin, GPIO.IN)

    def read(self):
        if (time.time() - self.startTime) <= self.calibrationTime:  # PIR requires a calibration time
            return
        self.valueReaded.acquire()
        self.readTime.acquire()
        self.valueReaded.value = (GPIO.input(self.dataPin) != GPIO.LOW)
        self.readTime.value = time.time()
        self.readTime.release()
        self.valueReaded.release()

    def getPosition(self):
        self.valueReaded.acquire()
        self.readTime.acquire()
        t = (self.valueReaded.value, self.readTime.value)
        self.readTime.release()
        self.valueReaded.release()
        return t

    def close(self):
        GPIO.cleanup()

    def getData(self):
        self.valueReaded.acquire()
        self.readTime.acquire()
        if int(self.readTime.value) == 0xdeadbeef:  # Do not return data if nothing has been read yet
            self.readTime.release()
            self.valueReaded.release()
            return None
        data = {
            'detection': self.valueReaded.value,
            'timestamp': self.readTime.value
        }
        self.readTime.release()
        self.valueReaded.release()
        return data

    def startUpdater(self):
        self.updater = Process(target=self.__updateData, args=())
        self.updater.start()

    def stopUpdater(self):
        if self.updater.is_alive():
            self.updater.join()
        self.close()

    def __updateData(self):
        try:
            while True:
                self.read()
                time.sleep(self.frequency)
        except KeyboardInterrupt:
            sys.exit(0)
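# Hedged usage sketch (not from the original): polling the sensor in a child
# process and reading a consistent (value, timestamp) pair from the parent.
# BCM pin 17, the 0.5 s poll frequency, and the 30 s calibration are assumptions.
if __name__ == '__main__':
    pir = PIRSensor(pin=17, frequency=0.5, calibrationTime=30)
    pir.startUpdater()
    data = pir.getData()  # None until the first post-calibration read
    if data is not None:
        print(data['detection'], data['timestamp'])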
class SerialReader:
    def __init__(self, device, frequency, speed=9600, parity=serial.PARITY_NONE,
                 stopbits=serial.STOPBITS_ONE, blocksize=serial.EIGHTBITS, timeout=1):
        self.device = device
        self.readValue = Value(ctypes.c_int, 0xdeadbeef)
        self.readTime = Value(ctypes.c_double, 0xdeadbeef)
        self.frequency = frequency
        self.speed = speed
        self.parity = parity
        self.stopbits = stopbits
        self.blocksize = blocksize
        self.timeout = timeout
        # init serial module
        self.serial_module = serial.Serial(port=self.device, baudrate=self.speed,
                                           parity=self.parity, stopbits=self.stopbits,
                                           bytesize=self.blocksize, timeout=self.timeout)

    def read(self):
        acquired = 0
        try:
            msg = self.serial_module.readline().decode().rstrip("\n")
            readDict = json.loads(msg)
            self.readValue.acquire()
            acquired = 1
            self.readTime.acquire()
            acquired = 2
            self.readValue.value = readDict['Potentiometer']
            self.readTime.value = time.time()
            self.readTime.release()
            self.readValue.release()
        except (UnicodeDecodeError, json.decoder.JSONDecodeError):
            print('malformed message from serial', file=sys.stderr)
            # free locks
            if acquired > 1:
                self.readTime.release()
            if acquired > 0:
                self.readValue.release()
        except (serial.SerialException, serial.serialutil.SerialException, KeyError):
            print('Serial device disconnected', file=sys.stderr)
            # free locks (a KeyError can be raised while both locks are held)
            if acquired > 1:
                self.readTime.release()
            if acquired > 0:
                self.readValue.release()
            # try to reconnect
            try:
                self.serial_module = serial.Serial(port=self.device, baudrate=self.speed,
                                                   parity=self.parity, stopbits=self.stopbits,
                                                   bytesize=self.blocksize, timeout=self.timeout)
                time.sleep(1)
            except Exception:
                print('Cannot find serial device', file=sys.stderr)

    def getValue(self):
        self.readValue.acquire()
        self.readTime.acquire()
        t = (self.readValue.value, self.readTime.value)
        self.readTime.release()
        self.readValue.release()
        return t

    def close(self):
        self.serial_module.close()

    def getData(self):
        self.readValue.acquire()
        self.readTime.acquire()
        if int(self.readTime.value) == 0xdeadbeef:
            self.readTime.release()
            self.readValue.release()
            return None
        data = {
            'potentiometer': self.readValue.value,
            'timestamp': self.readTime.value
        }
        self.readTime.release()
        self.readValue.release()
        return data

    def startUpdater(self):
        self.updater = Process(target=self.__updateData, args=())
        self.updater.start()

    def stopUpdater(self):
        if self.updater.is_alive():
            self.updater.join()
        self.close()

    def __updateData(self):
        try:
            while True:
                self.read()
                time.sleep(self.frequency)
        except KeyboardInterrupt:
            sys.exit(0)
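# Hedged sketch (illustrative, not the project's code): the same two-lock
# update written with `with` blocks on the Values' locks, which guarantees
# release even when the JSON payload is malformed or missing the expected key.
import json
import sys
import time

def read_safe(reader):
    try:
        msg = reader.serial_module.readline().decode().rstrip("\n")
        payload = json.loads(msg)['Potentiometer']
    except (UnicodeDecodeError, json.JSONDecodeError, KeyError):
        print('malformed message from serial', file=sys.stderr)
        return
    with reader.readValue.get_lock(), reader.readTime.get_lock():
        reader.readValue.value = payload
        reader.readTime.value = time.time()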
class DHT22Sensor:
    DHT_MAXT = 80
    DHT_MINT = -40
    DHT_MAXH = 100
    DHT_MINH = 0

    def __init__(self, datapin, readFrequency, tHyst=0, hHyst=0):
        self.datapin = datapin
        self.readFrequency = readFrequency
        self.tempHyst = tHyst
        self.humHyst = hHyst
        self.temp = Value(ctypes.c_double, 0xdeadbeef)
        self.hum = Value(ctypes.c_double, 0xdeadbeef)
        self.lastTemperatureTime = Value(ctypes.c_double, 0xdeadbeef)
        self.lastHumidityTime = Value(ctypes.c_double, 0xdeadbeef)

    @staticmethod
    def validTemperature(temp):
        return temp is not None and DHT22Sensor.DHT_MINT <= temp <= DHT22Sensor.DHT_MAXT

    @staticmethod
    def validHumidity(hum):
        return hum is not None and DHT22Sensor.DHT_MINH <= hum <= DHT22Sensor.DHT_MAXH

    def hasChangedTemp(self, temp):
        return temp <= (self.temp.value - self.tempHyst) or temp >= (self.temp.value + self.tempHyst)

    def hasChangedHum(self, hum):
        return hum <= (self.hum.value - self.humHyst) or hum >= (self.hum.value + self.humHyst)

    def read(self):
        humidity, temperature = Adafruit_DHT.read_retry(DHT_SENSOR, self.datapin)
        currentTime = time.time()
        self.temp.acquire()
        self.hum.acquire()
        self.lastTemperatureTime.acquire()
        self.lastHumidityTime.acquire()
        if DHT22Sensor.validTemperature(temperature):
            if self.temp.value == 0xdeadbeef:
                self.temp.value = temperature
            elif self.hasChangedTemp(temperature):
                self.temp.value = temperature
            self.lastTemperatureTime.value = currentTime
        if DHT22Sensor.validHumidity(humidity):
            if self.hum.value == 0xdeadbeef:
                self.hum.value = humidity
            elif self.hasChangedHum(humidity):
                self.hum.value = humidity
            self.lastHumidityTime.value = currentTime
        self.lastHumidityTime.release()
        self.lastTemperatureTime.release()
        self.hum.release()
        self.temp.release()

    def getTemp(self):
        self.temp.acquire()
        self.lastTemperatureTime.acquire()
        t = (self.temp.value, self.lastTemperatureTime.value)
        self.lastTemperatureTime.release()
        self.temp.release()
        return t

    def getHum(self):
        self.hum.acquire()
        self.lastHumidityTime.acquire()
        t = (self.hum.value, self.lastHumidityTime.value)
        self.lastHumidityTime.release()
        self.hum.release()
        return t

    def getData(self):
        self.temp.acquire()
        self.hum.acquire()
        self.lastTemperatureTime.acquire()
        self.lastHumidityTime.acquire()
        if int(self.lastTemperatureTime.value) == 0xdeadbeef or int(self.lastHumidityTime.value) == 0xdeadbeef:
            self.lastHumidityTime.release()
            self.lastTemperatureTime.release()
            self.hum.release()
            self.temp.release()
            return None
        data = {
            'temperature': round(self.temp.value, 3),
            'humidity': round(self.hum.value, 3),
            'temperatureTimestamp': self.lastTemperatureTime.value,
            'humidityTimestamp': self.lastHumidityTime.value
        }
        self.lastHumidityTime.release()
        self.lastTemperatureTime.release()
        self.hum.release()
        self.temp.release()
        return data

    def startUpdater(self):
        self.updater = Process(target=self.__updateData, args=())
        self.updater.start()

    def stopUpdater(self):
        if self.updater.is_alive():
            self.updater.join()

    def __updateData(self):
        try:
            while True:
                self.read()
                time.sleep(self.readFrequency)
        except KeyboardInterrupt:
            sys.exit(0)
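# Hedged sketch (not from the original): the hysteresis check in isolation.
# A new reading is only stored when it moves at least `hyst` away from the
# last stored value, which suppresses sensor jitter inside the dead band.
def has_changed(last, new, hyst):
    return new <= last - hyst or new >= last + hyst

assert not has_changed(21.0, 21.4, 0.5)  # within the dead band, ignored
assert has_changed(21.0, 21.6, 0.5)      # outside the dead band, stored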
def start_derivatives(self, derivative_job, derivative_types):
    """Initiate the derivatives job.

    args:
        derivative_job (derivative job object): job to process
        derivative_types (list of dict): list of the derivative types along
            with their command information
    """
    self.derivative_job = derivative_job
    self.derivative_types = derivative_types
    status_id = None
    job_messages.objects.create(
        job_id=derivative_job.job_id,
        created=timezone.now(),
        message="Initializing Derivative Job."
    )

    # Get the types specified in the job
    derivative_job_types = job_derivatives.objects.filter(derive_id=self.derivative_job)
    types = [d.derive_type for d in derivative_job_types]
    output_endings = [d["output_file"].replace("{0}", "") for d in self.derivative_types]

    # Filter the config settings down to the derivatives selected for this job
    self.derivative_types = [d for d in self.derivative_types if d['derivative_type'] in types]

    # Get the job_derivative objects for each of the types
    derive_objs = {}
    for derivative in self.derivative_types:
        try:
            derive_obj = job_derivatives.objects.get(derive_id=self.derivative_job,
                                                     derive_type=derivative["derivative_type"])
            derive_objs[derivative["derivative_type"]] = derive_obj
        except Exception as e:
            # TODO -- why does it fall in this when the record does exist in the db??
            # Only seems to fall into it sometimes, not consistently
            logging.error("Could not find job_derivatives object for {0}, {1} in DB.".format(
                self.derivative_job.derive_id, derivative["derivative_type"]))
            continue

    # Get the 3 status objects to pass to the threads (success, fail, skip)
    status_objs = {}
    try:
        status_objs['skipped'] = derivative_results.objects.get(label="Skipped")
        status_objs['success'] = derivative_results.objects.get(label="Success")
        status_objs['failure'] = derivative_results.objects.get(label="Failure")
    except Exception as e:
        logging.error("Could not find all 3 status objects in the database.")
        raise

    processes = []
    thread_index = 0
    files_processed = 0
    thread_count = Value('i', 0)

    # Walk the directory tree starting at the source_dir
    for root, dirs, files in os.walk(self.derivative_job.source_dir):
        for f in files:
            # Check if the job has been stopped (i.e. cancelled by the user)
            failure_status = jobs.objects.get(job_id=self.derivative_job.job_id.job_id).status_id.failure
            if failure_status == "manual":
                break
            f = f.replace('.' + self.derivative_job.source_file_extension.upper(),
                          '.' + self.derivative_job.source_file_extension.lower())
            # Determine if the file matches the source extension; skip if not
            if f.endswith('.' + self.derivative_job.source_file_extension.lower()):
                # If the file matches the output file format, skip it since it is itself a derivative
                source_file = os.path.join(root, f)
                if f.endswith(tuple(output_endings)):
                    # logging.debug("Not processing {0}, since this file is a derivative.".format(source_file))
                    continue
                # Check if over the subset count; if so, break
                if self.derivative_job.subset is not None and self.derivative_job.subset > 0 \
                        and files_processed >= self.derivative_job.subset:
                    logging.debug("Reached subset count ({0}), stopping processing.".format(files_processed))
                    break
                # Loop over each derivative type to be created for each object
                for derivative in self.derivative_types:
                    # Check if the job has been stopped (i.e. cancelled by the user)
                    failure_status = jobs.objects.get(job_id=self.derivative_job.job_id.job_id).status_id.failure
                    if failure_status == "manual":
                        job_messages.objects.create(
                            job_id=derivative_job.job_id,
                            created=timezone.now(),
                            message="Job manually stopped by user."
                        )
                        status_id = status.objects.get(status="Cancelled By User").status_id
                        logging.debug("Derivative job manually cancelled by user")
                        break
                    thread_index += 1
                    # If we are over the max thread count, wait until one frees up
                    if thread_count.value >= settings.MAX_THREADS:
                        while thread_count.value >= settings.MAX_THREADS:
                            logging.debug("{0} threads running, waiting before continuing processing".format(thread_count.value))
                            sleep(3)
                    # Get the derive object
                    try:
                        derive_obj = derive_objs[derivative["derivative_type"]]
                    except Exception as e:
                        logging.error("Could not find job_derivatives object for {0}, {1}.".format(
                            self.derivative_job.derive_id, derivative["derivative_type"]))
                        continue
                    # Create a process running create_derivative for the object and derivative type
                    p = multiprocessing.Process(target=self.create_derivative,
                                                args=(self.derivative_job, derive_obj, source_file,
                                                      derivative, status_objs, thread_count,
                                                      thread_index))
                    processes.append(p)
                    # Update the number of running threads
                    thread_count.acquire()
                    thread_count.value += 1
                    logging.debug("Kicking off thread: {0}".format(thread_count.value))
                    thread_count.release()
                    # Start the process
                    db.connections.close_all()
                    p.start()
                # Increment the number of files processed
                files_processed += 1
        # Check if the job has been stopped (i.e. cancelled by the user)
        failure_status = jobs.objects.get(job_id=self.derivative_job.job_id.job_id).status_id.failure
        if failure_status == "manual":
            break
        # Check if over the subset count; if so, break
        if self.derivative_job.subset is not None and self.derivative_job.subset > 0 \
                and files_processed >= self.derivative_job.subset:
            break

    # Re-join all the threads to wait until they are complete
    logging.debug("All derivatives queued, waiting for them to finish")
    for p in processes:
        p.join(timeout=120)  # TODO move to config?
        # Check if the process is still running (i.e. it hung and timed out)
        if p.is_alive():
            logging.debug("Thread {0} still alive after timeout limit, terminating.".format(thread_index))
            p.terminate()
            p.join(timeout=2)
            # Check if the process is still running after being terminated
            if p.is_alive():
                logging.error("Thread {0} still alive after terminate.".format(thread_index))

    if status_id is None:
        status_id = status.objects.get(status="Complete").status_id
    return status_id
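# Hedged sketch (illustrative, not the project's code): the shared-counter
# throttle used above, reduced to its essentials. The parent busy-waits while
# the counter is at the cap; each worker decrements it when it finishes.
# MAX_THREADS and the worker body are assumptions (the original reads
# settings.MAX_THREADS and runs create_derivative).
import multiprocessing
from multiprocessing import Value
from time import sleep

MAX_THREADS = 4

def worker(thread_count):
    try:
        sleep(1)  # stand-in for the real derivative work
    finally:
        with thread_count.get_lock():
            thread_count.value -= 1

def throttle_and_start(thread_count):
    while thread_count.value >= MAX_THREADS:
        sleep(3)  # wait for a slot to free up
    with thread_count.get_lock():
        thread_count.value += 1
    multiprocessing.Process(target=worker, args=(thread_count,)).start()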
class TokenizedCorpus(Dataset):
    def __init__(self,
                 corpus_path: str,
                 vocab: Union[VocabSP, VocabYTTM],
                 seq_len: int,
                 repeat: bool = True):
        self.corpus_fp = open(corpus_path, 'r', encoding='utf-8')
        self.vocab = vocab
        self.seq_len = seq_len
        self.repeat = repeat
        self.buffer = []
        self.buffer_pointer = 0
        self.exit_signal = -999
        self.tmp_buffer = RawArray('i', [self.exit_signal] * 512 * 512 * 8)
        # self.q = Queue()
        self.refill = Value('b', True)
        p = mp.Process(target=self._fill_buffer_mp)
        p.start()
        # self.refill_lock = threading.Lock()
        # self.read_event = threading.Event()
        # self.t = threading.Thread(target=self._fill_buffer_in_bg)
        # self.t.setDaemon(True)
        # self.t.start()
        # self.read_event.set()

    def skip(self, count: int):
        for _ in range(count):
            if not self.corpus_fp.readline():
                # Raise an error when all sequences are fetched.
                if not self.repeat:
                    raise StopIteration()
                # Or, move to the first line of the corpus.
                self.corpus_fp.seek(0)
                self.corpus_fp.readline()

    def _fetch_one(self) -> Dict[str, List[int]]:
        while True:
            # Read a subword-tokenized sequence from the corpus.
            indices = self._read_n_tokens(self.seq_len - 2)
            if len(indices) + 2 > self.seq_len:
                continue

            # Decorate the sequence with additional tokens.
            indices = [self.vocab.bos_idx] + indices + [self.vocab.eos_idx]
            indices += [self.vocab.pad_idx] * (self.seq_len - len(indices) + 1)

            return {'input': indices[:-1], 'output': indices[1:]}

    def _read_n_tokens(self, n: int) -> List[int]:
        if (self.buffer_pointer + n) >= len(self.buffer):
            # print("Asking for data")
            self.refill.acquire()
            # while self.refill.value is True: time.sleep(0.00001)
            # self.refill.value = True
            self.buffer_pointer = 0
            # print("tmp_buffer len = ", len(self.tmp_buffer))
            self.buffer = []
            for idx in self.tmp_buffer:
                if idx == self.exit_signal:
                    # print("saw exit signal")
                    break
                self.buffer.append(idx)
            # self.buffer = [idx for idx in self.tmp_buffer if idx != -5]
            # print("buffer len = ", len(self.buffer))
            # print("q len = ", len(self.q))
            # self.buffer.clear()
            # while True:
            #     idx = self.q.get()
            #     if idx == -999:
            #         break
            #     self.buffer.append(idx)
            # print("buffer len = ", len(self.buffer))
            self.refill.release()
            p = mp.Process(target=self._fill_buffer_mp)
            p.start()
            if (self.buffer_pointer + n) >= len(self.buffer):
                return self._read_n_tokens(n)
            # while self.read_event.is_set(): time.sleep(0.0001)
            # self.buffer = self.tmp_buffer
            # self.buffer_pointer = 0
            # self.read_event.set()
            # print("Got continuing")
        res = self.buffer[self.buffer_pointer:self.buffer_pointer + n]
        self.buffer_pointer += n
        return res
        # count = 0
        # text = ""
        # while True:
        #     if self.buffer_pointer >= len(self.buffer):
        #         self._fill_buffer()
        #         text = ""
        #         count = 0
        #     char = self.buffer[self.buffer_pointer]
        #     self.buffer_pointer += 1
        #     if char.isspace():
        #         count += 1
        #         if count >= n:
        #             return [int(idx) for idx in text.split()]
        #     text += char

    def _fill_buffer_mp(self, char_count: int = 1048576):
        # print("Reading")
        self.refill.acquire()
        try:
            text = self.corpus_fp.read(char_count)
        except:
            self.refill.release()
            self._fill_buffer_mp()
            return
        if len(text) < char_count:
            print("Consumed all of the corpus.")
            # Raise an error when all sequences are read.
            if not self.repeat:
                raise StopIteration()
            print("Rewinding")
            # Or, reset current tokens and move to the beginning of the corpus.
            self.corpus_fp.seek(0)
            # Release before recursing and return afterwards so the lock count
            # stays balanced and the short tail text is not re-encoded over
            # the freshly refilled buffer.
            self.refill.release()
            self._fill_buffer_mp()
            return
        # print("Read")
        for i, token_idx in enumerate(self.vocab.encode(text)):
            # self.q.put(token_idx)
            self.tmp_buffer[i] = token_idx
        # self.q.put(-999)
        # print("Indexed len = ", len(self.tmp_buffer))
        # print("Indexed len = ", len(self.q))
        self.refill.value = False
        self.refill.release()
        # time.sleep(0.000001)

    # def _fill_buffer_in_bg(self, char_count: int = 2097152):
    #     while True:
    #         self.read_event.clear()
    #         self.read_event.wait(60)
    #         print("Reading")
    #         text = self.corpus_fp.read(char_count)
    #         if len(text) < char_count:
    #             print("Consumed all of the corpus.")
    #             # Raise error when all sequences are read.
    #             if not self.repeat:
    #                 raise StopIteration()
    #             print("Rewinding")
    #             # Or, reset current tokens and move to the beginning of the corpus.
    #             self.corpus_fp.seek(0)
    #             continue
    #         print("Read")
    #         self.tmp_buffer = self.vocab.encode(text)
    #         print("Indexed")
    #         time.sleep(0.000001)

    def fetch(self, batch: Optional[int] = None) -> Dict[str, torch.Tensor]:
        if batch is None:
            data = self._fetch_one()
        else:
            data = [self._fetch_one() for _ in range(batch)]
            data = {k: [d[k] for d in data] for k in data[0]}
        return {k: torch.tensor(v, dtype=torch.long) for k, v in data.items()}

    def where(self) -> Dict[str, Any]:
        return {'offset': self.corpus_fp.tell()}

    def assign(self, where: Dict[str, Any]):
        self.corpus_fp.seek(where['offset'])
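# Hedged usage sketch (not part of the original): fetching a batch. The
# load_vocab() helper and corpus path are placeholders/assumptions.
if __name__ == '__main__':
    vocab = load_vocab()  # hypothetical helper returning a VocabSP/VocabYTTM
    dataset = TokenizedCorpus('corpus.txt', vocab, seq_len=128)
    batch = dataset.fetch(batch=8)
    print(batch['input'].shape, batch['output'].shape)  # torch.Size([8, 128]) twice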