def textSpliterStart(self):
    """Read the delimiter field and, when it is non-empty, start a splitter.

    The stripped delimiter text is stored on ``self.delimeter``.  An empty
    delimiter only produces a log message and nothing is started.
    Otherwise a ``TextSplitter`` is built from the current file path, the
    delimiter, the start-index text and the auto-rename checkbox state,
    kept on ``self.spliterThread`` and started.
    """
    delimiter_text = self.ui.delimeter.toPlainText().strip()
    self.delimeter = delimiter_text

    # Guard clause: refuse to start without a delimiter.
    if not delimiter_text:
        self.updateLogMsg("Input Delimeter, plz")
        return

    start_index_text = self.ui.startIndex.toPlainText()
    auto_rename = self.ui.chkAutoRename.isChecked()

    worker = TextSplitter(self, self.filePath, delimiter_text,
                          start_index_text, auto_rename)
    self.spliterThread = worker
    worker.start()

    self.ui.statusBar.showMessage("Processing...")
class unittest_TextSplitter(unittest.TestCase):
    """Checks both TextSplitter break methods against a stored bulletin."""

    def setUp(self):
        """Build a splitter over the first 8192 characters of the fixture."""
        # The context manager closes the fixture even when read() raises,
        # which the previous open()/close() pair did not guarantee.
        with open('db/test-file-bulletin', 'r') as f:
            text = f.read(8192)
        self.splitter = TextSplitter(text, 40, ' ', 5)

    def test_TextSplitter(self):
        """Compare breakLongText() and breakMarker() with known outputs."""
        self.assertEqual(self.splitter.breakLongText(), [
            'FNCN55 CWAO 040800 ',
            'Extended forecasts for Wednesday T ',
            'hursday Friday Saturday and Sunday ',
            'for the Maritimes and Iles de la M ',
            'adeleine issued by Environment ',
            'Canada at 5.00 am adt Monday 4 Jun ',
            'e 2012. ',
            'Mon Jun 4 04:00:01 EDT 2012 ',
        ])
        self.assertEqual(self.splitter.breakMarker(), [
            'FNCN55 CWAO 040800\nExtended forecasts fo',
            'r Wednesday Thursday Friday Saturday and',
            ' Sunday\nfor the Maritimes and Iles de la',
            ' Madeleine issued by Environment\nCanada ',
            'at 5.00 am adt Monday 4 June 2012.\nMon J',
            'un 4 04:00:01 EDT 2012\n\n',
        ])
def readFromDisk(self):
    """Refill the file buffer if needed and send the first part of a bulletin.

    When ``self.dataFromFiles`` is empty the reader is asked for a new
    batch.  If nothing is available afterwards the method returns (after a
    2 second sleep when ``self.slow`` is set).  Otherwise the first buffered
    bulletin is split into parts, the parts are completed by the message
    manager and the first one is written to the socket.
    """
    # Nothing buffered: go to disk for a new batch.
    if len(self.dataFromFiles) == 0:
        self.reader.read()
        self.dataFromFiles = self.reader.getFilenamesAndContent(self.batch)

    # Still nothing: give up for this round.
    if len(self.dataFromFiles) == 0:
        #self.logger.warning("No data to read on the disk")
        if self.slow:
            time.sleep(2)
        return

    # Break the bulletin in the appropriate number of parts (possibly only one).
    bulletin = self.dataFromFiles[0][0]
    splitter = TextSplitter(bulletin,
                            MessageAFTN.MAX_TEXT_SIZE,
                            MessageAFTN.ALIGNMENT,
                            MessageAFTN.TEXT_SPECIFIC_OVERHEAD)
    self.mm.partsToSend = splitter.breakLongText()

    # Will add //END PART 01//\n\r or //END PART 03/03//\n\r
    self.mm.completePartsToSend(self.mm.partsToSend)

    assert self.mm.nextPart == 0, "Next part not equal to zero when sending the first part of a message"
    first_part = [self.mm.partsToSend[0]]
    self._writeMessageToSocket(first_part, False, self.mm.nextPart)
def write_segmented_data(self, data, path):
    """Split a bulletin into segments and send each segment.

    The text up to the first "\n" of ``data`` is taken as the bulletin
    header; the rest is the body.  Bulletins whose body starts with "BUFR"
    or "GRIB", or whose header has exactly four tokens (taken to mean a BBB
    group is already present), are not segmented: an error is logged, the
    file at ``path`` is unlinked and ``(False, 0)`` is returned.

    Otherwise the body is split with ``TextSplitter(...).breakMarker()`` and
    every segment is sent, prefixed with the header, via ``write_data``.

    Args:
        data: full bulletin text, header line first.
        path: path of the bulletin file; its third-from-last "/" component
            is used as the priority.

    Returns:
        ``(True, total_bytes_sent)`` when every segment was sent or skipped
        as a duplicate, ``(False, bytes_sent_so_far)`` as soon as one
        ``write_data`` call fails, ``(False, 0)`` for discarded bulletins.
    """
    # At this point the bulletin is expected to be well formed: the first
    # line is the header, terminated by "\n".  str.partition replaces the
    # Python 2-only string.find() helper and, unlike slicing with find()'s
    # -1, does not chop the last character when no newline is present.
    header, _, body = data.partition("\n")
    lheader = len(header) + 1

    # SHOULD SEGMENT BUT : at the moment BUFR and GRIB are not segmented
    # but discarded.
    for kind in ("BUFR", "GRIB"):
        if body[:4] == kind:
            self.logger.error("Unable to segment and send %s ! Reason : type %s, Size: %s" % (path, kind, len(data)))
            self.unlink_file(path)
            return (False, 0)

    # SHOULD SEGMENT BUT : the bulletin already has a BBB group -> not
    # segmented but discarded.
    # FIXME should validate that the 4th token is really a BBB
    # (AAa-z CCa-z RRa-z Pa-za-z AMD COR RTM)
    tokn = header.split()
    if len(tokn) == 4:
        self.logger.error("Unable to send %s Segmented ! Reason : BBB = %s, Size: %s" % (path, tokn[3], len(data)))
        self.unlink_file(path)
        return (False, 0)

    # Block size limit = maxLength - preamble(22) - endOfMessage(4)
    #                    - bulletin header with "\r\r\n"
    limit = self.maxLength - 26 - (lheader + 2)

    # Replace every "\n" by the AMIS end-of-line separator.
    dataWmo = body.strip().replace("\n", "\r\r\n")

    # Perform the segmentation.
    blocks = TextSplitter(dataWmo, limit, "\n", 0, "=\r\r\n").breakMarker()
    self.logger.info("Bulletin %s segmented in %d parts" % (path, len(blocks)))

    totSent = 0
    priority = path.split('/')[-3]

    for number, part in enumerate(blocks, 1):
        # Segments go out with plain "\n" line ends again.
        rawSegment = (header + "\n" + part).replace("\r\r\n", "\n")

        # Duplicate suppression is skipped for priority '0'.
        if self.client.nodups and priority != '0' and self.in_cache(rawSegment, False, None):
            continue

        succes, nbBytesSent = self.write_data(rawSegment)
        if not succes:
            return (False, totSent)

        totSent += nbBytesSent
        self.logger.info("(%i Bytes) Bulletin Segment number %d sent" % (nbBytesSent, number))

    return (True, totSent)
def setUp(self):
    """Build ``self.splitter`` from the first 8192 characters of the fixture."""
    # The context manager closes the fixture even when read() raises,
    # which the previous open()/close() pair did not guarantee.
    with open('db/test-file-bulletin', 'r') as f:
        text = f.read(8192)
    self.splitter = TextSplitter(text, 40, ' ', 5)
# Word Frequency Analysis
from TextSplitter import TextSplitter
from BatchMaker import BatchMaker
from FrequencyAnalyser import FrequencyAnalyser

if __name__ == '__main__':
    # Scripts to process, in the order they are handled.
    files = [
        "2001_ASpaceOdyssey.txt",
        "BladeRunner.txt",
        "Dune.txt",
        "FightClub.txt",
        "LoremIpsumFiller.txt",
        "MadMax.txt",
        "Matrix.txt",
        "Memento.txt",
        "StarWars_EmpireStrikesBack.txt",
    ]

    # Write the batch and create the folders before any script is touched.
    batch = BatchMaker(files)
    batch.write_batch()
    batch.create_folders()

    # First pass: partition every script.
    for script_name in files:
        print(f"\nScript :: {script_name}")
        movie = TextSplitter(script_name)
        movie.partition_words()
        movie.partition()

    # Second pass: analyse and plot every script.
    for script_name in files:
        print(f"\nScript :: {script_name}")
        analyser = FrequencyAnalyser(script_name)
        analyser.analyse()
        analyser.plot()
class MainApplication(QtWidgets.QMainWindow):
    """Main window: pick a text file, then start or stop a TextSplitter on it."""

    # Signals wired in __init__ to endTask, cancelTask and updateLogMsg.
    threadFinished = pyqtSignal()
    threadCanceled = pyqtSignal()
    updateLog = pyqtSignal(str)

    def __init__(self):
        """Initialise state, connect the signals and wire the UI widgets."""
        super(MainApplication, self).__init__(None)

        self.filePath = ""
        self.log = ""
        self.logRename = ""
        self.spliterThread = None
        self.renameThread = None
        self.delimeter = ""

        self.threadFinished.connect(self.endTask)
        self.threadCanceled.connect(self.cancelTask)
        self.updateLog.connect(self.updateLogMsg)

        self.ui = Ui_MainWindow()
        self.ui.setupUi(self)
        self.ui.btnOpen.clicked.connect(self.showFileOpen)
        self.ui.btnStart.clicked.connect(self.textSpliterStart)
        self.ui.btnStop.clicked.connect(self.textSpliterStop)
        self.ui.menuAbout.triggered.connect(self.showDialogAbout)

    def showDialogAbout(self):
        """Show the About dialog modally."""
        AboutDialog().exec_()

    def toggleStartIndex(self):
        """Disable the start-index field while auto-rename is checked."""
        self.ui.startIndex.setDisabled(self.ui.chkAutoRename.isChecked())

    def textSpliterStart(self):
        """Start a TextSplitter for the current file, if a delimiter is set."""
        self.delimeter = self.ui.delimeter.toPlainText().strip()
        if not self.delimeter:
            self.updateLogMsg("Input Delimeter, plz")
            return

        self.spliterThread = TextSplitter(self, self.filePath, self.delimeter,
                                          self.ui.startIndex.toPlainText(),
                                          self.ui.chkAutoRename.isChecked())
        self.spliterThread.start()
        self.ui.statusBar.showMessage("Processing...")

    def textSpliterStop(self):
        """Ask the running splitter, if any, to stop."""
        if self.spliterThread is not None:
            self.spliterThread.thread_stop()

    # TODO:
    def showFileOpen(self):
        """Let the user pick a .txt file, log its details and enable the controls.

        Logs the path, the total number of lines and the set of encodings
        that chardet reports for the first 100 lines.  Does nothing when
        the dialog is dismissed without a selection.
        """
        fname = QtWidgets.QFileDialog.getOpenFileName(self, 'Open File', os.getcwd(), "Text files (*.txt)")
        if len(fname[0]) == 0:
            return

        self.clearLogMsg()

        # Read the lines once.  Counting with readlines() after consuming
        # the head with readline() left the first 100 lines out of the
        # reported total.
        with open(fname[0], 'rb') as ori_file:
            lines = ori_file.readlines()

        self.ui.textFile.setText(fname[0])

        # Only inspect lines that exist, so short files no longer feed
        # empty end-of-file reads to chardet.
        encodingTypes = set()
        for line in lines[:100]:
            encodingTypes.add(chardet.detect(line).get("encoding"))

        self.filePath = fname[0]
        self.updateLogMsg("============================")
        self.updateLogMsg("File Info")
        self.updateLogMsg("- Path : " + self.filePath)
        self.updateLogMsg("- Lines : " + str(len(lines)))
        self.updateLogMsg("- Encoding types on Top 100 lines : " + str(encodingTypes))
        self.updateLogMsg("============================")

        self.ui.btnStart.setDisabled(False)
        self.ui.btnStop.setDisabled(False)
        self.ui.delimeter.setDisabled(False)
        self.ui.chkAutoRename.setDisabled(False)
        self.ui.startIndex.setDisabled(False)
        self.ui.statusBar.showMessage("File Open Complete.")

    def windowClose(self):
        """Stop any running work, then exit the process."""
        if self.spliterThread is not None:
            self.textSpliterStop()
        if self.renameThread is not None:
            # NOTE(review): renameStop is not defined in this class as shown;
            # renameThread is only ever set to None here, so this branch is
            # not reached by the visible code.
            self.renameStop()
        # NOTE(review): relies on a module-level `app` - confirm it exists.
        sys.exit(app.exec_())

    def updateLogMsg(self, str):
        """Append one message to the log view.

        The parameter name shadows the builtin ``str``; it is kept so that
        existing callers are unaffected.
        """
        self.ui.textLog.append(str)

    def clearLogMsg(self):
        """Reset the stored log text and empty the log view."""
        self.log = ""
        self.ui.textLog.setText(self.log)

    def endTask(self):
        """Slot for threadFinished: drop the splitter and report completion."""
        self.spliterThread = None
        self.ui.statusBar.showMessage("Done.")

    def cancelTask(self):
        """Slot for threadCanceled: drop the splitter and report cancellation."""
        self.spliterThread = None
        self.updateLogMsg("====== Canceled ======")
        self.ui.statusBar.showMessage("Canceled.")
def write_segmented_data(self, data, path):
    """Split a bulletin into segments tagged with a "P??" group and send them.

    The header is obtained from a ``bulletinAm`` built over ``data``.
    Bulletins whose text right after the header starts with "BUFR" or
    "GRIB", or whose header has exactly four tokens (taken to mean a BBB
    group is already present), are not segmented: an error is logged, the
    file at ``path`` is unlinked and ``(False, 0)`` is returned.

    Every other bulletin is split with ``TextSplitter(...).breakMarker()``.
    Segment ``i`` (0-based) is sent as ``header + " P" + l1 + l2`` followed
    by the segment text, where ``l1``/``l2`` are letters derived from
    ``i // 24`` and ``i % 24``; the last segment always gets ``l1 == 'Z'``.

    Args:
        data: full bulletin text.
        path: path of the bulletin file; its third-from-last "/" component
            is used as the priority.

    Returns:
        ``(True, totSent)`` when every segment was sent or skipped as a
        duplicate, ``(False, totSent)`` as soon as one ``write_data`` call
        fails, ``(False, 0)`` for discarded bulletins.
    """
    unBulletinAm = bulletinAm.bulletinAm(data, self.logger, lineSeparator='\r\r\n')

    header = unBulletinAm.getHeader()
    lheader = len(header) + 1

    # SHOULD SEGMENT BUT : at the moment BUFR and GRIB are not segmented
    # but discarded.
    for kind in ("BUFR", "GRIB"):
        if data[lheader:lheader + 4] == kind:
            self.logger.error(
                "Unable to segment and send %s ! Reason : type %s, Size: %s" %
                (path, kind, len(data)))
            self.unlink_file(path)
            return (False, 0)

    # SHOULD SEGMENT BUT : the bulletin already has a BBB group -> not
    # segmented but discarded.
    # FIXME should validate that the 4th token is really a BBB
    # (AAa-z CCa-z RRa-z Pa-za-z AMD COR RTM)
    tokn = header.split()
    if len(tokn) == 4:
        self.logger.error(
            "Unable to send %s Segmented ! Reason : BBB = %s, Size: %s" %
            (path, tokn[3], len(data)))
        self.unlink_file(path)
        return (False, 0)

    # Perform segmentation.  The block size is computed like this:
    # maxLength - 128 (Am struct size) - (len(header) + '\n' + ' ' + BBB)
    limit = self.maxLength - 128 - (lheader + 4)

    blocks = TextSplitter(data[lheader:], limit).breakMarker()

    self.logger.info(
        "(%i Bytes) Bulletin %s delivered segmented in %d parts" %
        (len(data), os.path.basename(path), len(blocks)))
    self.logger.debug("Bulletin is \n%s" % data)

    # NOTE(review): totSent is never incremented in this method; sent bytes
    # are reported through tallyBytes() instead, so the second element of
    # the returned tuple is always 0.  Confirm callers expect that.
    totSent = 0
    priority = path.split('/')[-3]
    alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    last = len(blocks)

    for i, part in enumerate(blocks):
        # Floor division keeps the index an int on Python 3, where "/"
        # yields a float and indexing with it raises TypeError.
        l1 = alpha[i // 24]
        l2 = alpha[i % 24]
        number = i + 1

        # The final segment is always flagged with 'Z'.
        if number == last:
            l1 = 'Z'

        segment_header = header + " P" + l1 + l2
        rawSegment = segment_header + '\n' + part

        # Duplicate suppression is skipped for priority '0'.
        if self.client.nodups and priority != '0' and self.in_cache(rawSegment, False, None):
            continue

        succes, nbBytesSent = self.write_data(rawSegment)
        if not succes:
            return (False, totSent)

        self.tallyBytes(nbBytesSent)
        self.logger.info(
            "(%i Bytes) Bulletin Segment number %d sent (%s)" %
            (nbBytesSent, number, segment_header))
        self.logger.debug("Bulletin is \n%s" % rawSegment)

    return (True, totSent)