Python TextSplitter示例，TextSplitter.TextSplitter Python示例

示例#1

0

显示文件

文件： TextSplitterWindow.py 项目： FidelisLee/BliceTextSplitter

 def textSpliterStart(self):
     """Start a TextSplitter worker using the delimiter typed in the UI.

     The stripped delimiter is stored on ``self.delimeter``. When it is
     empty, a hint is logged and no worker is created.
     """
     raw_delimiter = self.ui.delimeter.toPlainText()
     self.delimeter = raw_delimiter.strip()

     if self.delimeter:
         start_index = self.ui.startIndex.toPlainText()
         auto_rename = self.ui.chkAutoRename.isChecked()
         self.spliterThread = TextSplitter(self, self.filePath, self.delimeter, start_index, auto_rename)
         self.spliterThread.start()
         self.ui.statusBar.showMessage("Processing...")
     else:
         # Nothing to split on: tell the user and leave everything untouched.
         self.updateLogMsg("Input Delimeter, plz")

示例#2

0

显示文件

文件： unittest_TextSplitter.py 项目： nourou6/Sundew

class unittest_TextSplitter(unittest.TestCase):
    """Checks TextSplitter output against the 'db/test-file-bulletin' fixture."""

    def setUp(self):
        # Only the first 8192 characters of the fixture are used.
        # The context manager closes the file even when read() raises,
        # which the previous open()/close() pair did not guarantee.
        with open('db/test-file-bulletin', 'r') as f:
            text = f.read(8192)
        # NOTE(review): the meaning of 40, ' ' and 5 is defined by the
        # TextSplitter constructor, which is not visible here.
        self.splitter = TextSplitter(text, 40, ' ', 5)

    def test_TextSplitter(self):
        # Expected pieces returned by breakLongText() for the fixture.
        self.assertEqual(self.splitter.breakLongText(), [
            'FNCN55 CWAO 040800 ', 'Extended forecasts for Wednesday T ',
            'hursday Friday Saturday and Sunday ',
            'for the Maritimes and Iles de la M ',
            'adeleine issued by Environment ',
            'Canada at 5.00 am adt Monday 4 Jun ', 'e 2012. ',
            'Mon Jun  4 04:00:01 EDT 2012  '
        ])
        # Expected pieces returned by breakMarker() for the same fixture.
        self.assertEqual(self.splitter.breakMarker(), [
            'FNCN55 CWAO 040800\nExtended forecasts fo',
            'r Wednesday Thursday Friday Saturday and',
            ' Sunday\nfor the Maritimes and Iles de la',
            ' Madeleine issued by Environment\nCanada ',
            'at 5.00 am adt Monday 4 June 2012.\nMon J',
            'un  4 04:00:01 EDT 2012\n\n'
        ])

示例#3

0

显示文件

    def readFromDisk(self):
        """Refill the file buffer if needed and send the first part of the next bulletin.

        When nothing is available even after asking the reader, the method
        optionally sleeps (``self.slow``) and returns without sending.
        """
        # Empty buffer: ask the reader for a new batch from disk.
        if len(self.dataFromFiles) == 0:
            self.reader.read()
            self.dataFromFiles = self.reader.getFilenamesAndContent(self.batch)

        # Still empty: nothing to send this round.
        if len(self.dataFromFiles) == 0:
            #self.logger.warning("No data to read on the disk")
            if self.slow:
                time.sleep(2)
            return

        # Break the bulletin in the number of appropriate parts (possibly only one)
        bulletin = self.dataFromFiles[0][0]
        splitter = TextSplitter(bulletin,
                                MessageAFTN.MAX_TEXT_SIZE,
                                MessageAFTN.ALIGNMENT,
                                MessageAFTN.TEXT_SPECIFIC_OVERHEAD)
        self.mm.partsToSend = splitter.breakLongText()

        # Will add //END PART 01//\n\r  or //END PART 03/03//\n\r
        self.mm.completePartsToSend(self.mm.partsToSend)

        assert self.mm.nextPart == 0, "Next part not equal to zero when sending the first part of a message"
        firstPart = self.mm.partsToSend[0]
        self._writeMessageToSocket([firstPart], False, self.mm.nextPart)

示例#4

0

显示文件

文件： senderWmo.py 项目： nourou6/Sundew

    def write_segmented_data(self,data,path):
        """Split a bulletin into segments and send each one through write_data.

        The first line of ``data`` is taken as the bulletin header. BUFR and
        GRIB bulletins, and bulletins whose header already has a fourth
        token, are not segmented: an error is logged, ``path`` is unlinked
        and ``(False, 0)`` is returned.

        Returns a ``(success, totSent)`` tuple, where ``totSent`` is the sum
        of the byte counts reported by ``write_data`` for the segments sent.
        Sending stops at the first failed segment.
        """

        # at this point, I expect the bulletin to be ok
        # first line assumed header... terminated by a newline
        # str.find() replaces string.find(), which only exists on Python 2.
        # NOTE(review): when data holds no newline, find() returns -1 and the
        # header becomes data[:-1]; callers are expected to pass valid bulletins.
        pos     = data.find("\n")
        header  = data[:pos]
        lheader = len(header) + 1

        # SHOULD SEGMENT BUT : At the moment BUFR and GRIB are not segmented but discarded
        dataType = data[lheader:lheader+4]
        if dataType in ("BUFR", "GRIB"):
            self.logger.error("Unable to segment and send %s ! Reason : type %s, Size: %s" % (path, dataType, len(data) ))
            self.unlink_file(path)
            return ( False, 0 )

        # SHOULD SEGMENT BUT : the bulletin already have a BBB group -> not segmented but discarded
        # FIXME should validate that the 4th token is realy a BBB (AAa-z CCa-z RRa-z Pa-za-z AMD COR RTM)
        tokn = header.split()
        if len(tokn) == 4:
            self.logger.error("Unable to send %s Segmented ! Reason : BBB = %s, Size: %s" % (path, tokn[3], len(data) ))
            self.unlink_file(path)
            return ( False, 0 )

        # compute block size limit = maxLength - preamble(22) - endofMessage(4) - bulletinheader with "\r\r\n"
        limit  = self.maxLength - 26 - (lheader + 2)

        # replace all \n by Amis endOfLineSep
        dataWmo = data[lheader:].strip().replace("\n", "\r\r\n" )

        # perform Segmentation
        blocks  = TextSplitter(dataWmo, limit, "\n", 0, "=\r\r\n"  ).breakMarker()
        self.logger.info("Bulletin %s segmented in %d parts" % (path,len(blocks)))

        totSent  = 0
        priority = path.split('/')[-3]

        # Segment numbers start at 1 and also count segments skipped as duplicates.
        for i, part in enumerate(blocks, 1):
            rawSegment = (header + "\n" + part).replace("\r\r\n", "\n" )

            if self.client.nodups and priority != '0' and self.in_cache( rawSegment, False, None ):
                continue

            succes, nbBytesSent = self.write_data(rawSegment)
            if not succes:
                return (False, totSent)

            totSent += nbBytesSent
            self.logger.info("(%i Bytes) Bulletin Segment number %d sent" % (nbBytesSent,i))

        return (True, totSent)

示例#5

0

显示文件

文件： unittest_TextSplitter.py 项目： nourou6/Sundew

 def setUp(self):
     # Only the first 8192 characters of the fixture are used.
     # The context manager closes the file even when read() raises,
     # which the previous open()/close() pair did not guarantee.
     with open('db/test-file-bulletin', 'r') as f:
         text = f.read(8192)
     # NOTE(review): the meaning of 40, ' ' and 5 is defined by the
     # TextSplitter constructor, which is not visible here.
     self.splitter = TextSplitter(text, 40, ' ', 5)

示例#6

0

显示文件

# Word Frequency Analysis
from TextSplitter import TextSplitter
from BatchMaker import BatchMaker
from FrequencyAnalyser import FrequencyAnalyser

if __name__ == '__main__':
    # Script files handed to every stage below.
    files = [
        "2001_ASpaceOdyssey.txt", "BladeRunner.txt", "Dune.txt",
        "FightClub.txt", "LoremIpsumFiller.txt", "MadMax.txt", "Matrix.txt",
        "Memento.txt", "StarWars_EmpireStrikesBack.txt"
    ]

    # Stage 1: batch preparation for the whole file list.
    batch_maker = BatchMaker(files)
    batch_maker.write_batch()
    batch_maker.create_folders()

    # Stage 2: split every script before any analysis starts.
    for script_name in files:
        print()
        print(f"Script :: {script_name}")

        splitter = TextSplitter(script_name)
        splitter.partition_words()
        splitter.partition()

    # Stage 3: analyse and plot each script once all of them are split.
    for script_name in files:
        print()
        print(f"Script :: {script_name}")
        analyser = FrequencyAnalyser(script_name)
        analyser.analyse()
        analyser.plot()

示例#7

0

显示文件

文件： TextSplitterWindow.py 项目： FidelisLee/BliceTextSplitter

class MainApplication(QtWidgets.QMainWindow):
    """Main window: lets the user pick a text file and run a TextSplitter on it."""

    # Signals wired to endTask, cancelTask and updateLogMsg in __init__.
    threadFinished = pyqtSignal()
    threadCanceled = pyqtSignal()
    updateLog = pyqtSignal(str)

    def __init__(self):
        """Initialise state, connect the signals and wire the UI widgets."""
        super(MainApplication, self).__init__(None)
        self.filePath = ""
        self.log = ""
        self.logRename = ""
        self.spliterThread = None
        self.renameThread = None
        self.delimeter = ""
        self.threadFinished.connect(self.endTask)
        self.threadCanceled.connect(self.cancelTask)
        self.updateLog.connect(self.updateLogMsg)

        self.ui = Ui_MainWindow()
        self.ui.setupUi(self)
        self.ui.btnOpen.clicked.connect(self.showFileOpen)
        self.ui.btnStart.clicked.connect(self.textSpliterStart)
        self.ui.btnStop.clicked.connect(self.textSpliterStop)
        self.ui.menuAbout.triggered.connect(self.showDialogAbout)

    def showDialogAbout(self):
        """Show the modal About dialog."""
        AboutDialog().exec_()

    def toggleStartIndex(self):
        """Disable the start-index field while auto-rename is checked."""
        self.ui.startIndex.setDisabled(self.ui.chkAutoRename.isChecked())

    def textSpliterStart(self):
        """Start a TextSplitter worker with the delimiter typed in the UI."""
        self.delimeter = self.ui.delimeter.toPlainText().strip()
        if not self.delimeter:
            self.updateLogMsg("Input Delimeter, plz")
            return
        self.spliterThread = TextSplitter(self, self.filePath, self.delimeter, self.ui.startIndex.toPlainText(), self.ui.chkAutoRename.isChecked())
        self.spliterThread.start()
        self.ui.statusBar.showMessage("Processing...")

    def textSpliterStop(self):
        """Ask the running splitter worker, if any, to stop."""
        if self.spliterThread is not None:
            self.spliterThread.thread_stop()

    # TODO:
    def showFileOpen(self):
        """Let the user pick a text file, log its details and enable the controls.

        Logs the path, the total number of lines and the set of encodings
        chardet reports for the first 100 lines. Does nothing when the
        dialog is dismissed without a selection.
        """
        fname = QtWidgets.QFileDialog.getOpenFileName(self, 'Open File', os.getcwd(), "Text files (*.txt)")
        if len(fname[0]) == 0:
            return

        self.clearLogMsg()

        sampleSize = 100  # number of leading lines used for encoding detection
        with open(fname[0], 'rb') as ori_file:
            self.ui.textFile.setText(fname[0])
            # Read every line once. Counting readlines() after the sampling
            # loop had consumed 100 lines under-reported the total, and
            # sampling past EOF added a spurious None encoding for short files.
            lines = ori_file.readlines()

        encodingTypes = set()
        for line in lines[:sampleSize]:
            encodingTypes.add(chardet.detect(line).get("encoding"))

        self.filePath = fname[0]
        self.updateLogMsg("============================")
        self.updateLogMsg("File Info")
        self.updateLogMsg("- Path : " + self.filePath)
        self.updateLogMsg("- Lines : " + str(len(lines)))
        self.updateLogMsg("- Encoding types on Top 100 lines : " + str(encodingTypes))
        self.updateLogMsg("============================")

        self.ui.btnStart.setDisabled(False)
        self.ui.btnStop.setDisabled(False)
        self.ui.delimeter.setDisabled(False)
        self.ui.chkAutoRename.setDisabled(False)
        self.ui.startIndex.setDisabled(False)
        self.ui.statusBar.showMessage("File Open Complete.")

    def windowClose(self):
        """Stop any running workers, then exit the process."""
        if self.spliterThread is not None:
            self.textSpliterStop()
        if self.renameThread is not None:
            # NOTE(review): renameStop is not defined in the visible part of
            # this class -- confirm it exists before renameThread is ever set.
            self.renameStop()
        # NOTE(review): relies on a module-level `app` object defined elsewhere.
        sys.exit(app.exec_())

    def updateLogMsg(self, str):
        """Append one message to the log widget."""
        # The parameter shadows the builtin `str`; the name is kept so that
        # existing keyword callers continue to work.
        self.ui.textLog.append(str)

    def clearLogMsg(self):
        """Reset the stored log text and clear the log widget."""
        self.log = ""
        self.ui.textLog.setText(self.log)

    def endTask(self):
        """Slot for threadFinished: drop the worker and report completion."""
        self.spliterThread = None
        self.ui.statusBar.showMessage("Done.")

    def cancelTask(self):
        """Slot for threadCanceled: drop the worker and report cancellation."""
        self.spliterThread = None
        self.updateLogMsg("====== Canceled ======")
        self.ui.statusBar.showMessage("Canceled.")

示例#8

0

显示文件

    def write_segmented_data(self, data, path):
        """Split a bulletin into segments and send each one through write_data.

        Every segment is sent as the bulletin header followed by " P" and a
        two-letter segment marker, a newline, and the segment text. BUFR and
        GRIB bulletins, and bulletins whose header already has a fourth
        token, are not segmented: an error is logged, ``path`` is unlinked
        and ``(False, 0)`` is returned.

        Returns a ``(success, totSent)`` tuple. Sent bytes are reported
        through ``self.tallyBytes``; ``totSent`` is never incremented here,
        so the second element is always 0.
        NOTE(review): confirm that no caller relies on the returned count.
        """

        unBulletinAm = bulletinAm.bulletinAm(data,
                                             self.logger,
                                             lineSeparator='\r\r\n')
        header = unBulletinAm.getHeader()
        lheader = len(header) + 1

        # SHOULD SEGMENT BUT : At the moment BUFR and GRIB are not segmented but discarded
        dataType = data[lheader:lheader + 4]
        if dataType in ("BUFR", "GRIB"):
            self.logger.error(
                "Unable to segment and send %s ! Reason : type %s, Size: %s" %
                (path, dataType, len(data)))
            self.unlink_file(path)
            return (False, 0)

        # SHOULD SEGMENT BUT : the bulletin already have a BBB group -> not segmented but discarded
        # FIXME should validate that the 4th token is realy a BBB (AAa-z CCa-z RRa-z Pa-za-z AMD COR RTM)
        tokn = header.split()
        if len(tokn) == 4:
            self.logger.error(
                "Unable to send %s Segmented ! Reason : BBB = %s, Size: %s" %
                (path, tokn[3], len(data)))
            self.unlink_file(path)
            return (False, 0)

        # Perform segmentation
        # segmentation the block size is computed like this :
        # maxLength - 128 (Am struct size) - (len(header) + newline + ' ' + BBB)
        # (an earlier "maxLength - 128" assignment was always overwritten
        # by this one and has been dropped)
        limit = self.maxLength - 128 - (lheader + 4)
        blocks = TextSplitter(data[lheader:], limit).breakMarker()
        self.logger.info(
            "(%i Bytes) Bulletin %s  delivered segmented in %d parts" %
            (len(data), os.path.basename(path), len(blocks)))
        self.logger.debug("Bulletin is \n%s" % data)

        totSent = 0
        priority = path.split('/')[-3]
        alpha = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        nbBlocks = len(blocks)

        for index, part in enumerate(blocks):
            # Floor division keeps the index an int on Python 3 as well;
            # plain "/" yields a float there and makes the lookup fail.
            l1 = alpha[index // 24]
            l2 = alpha[index % 24]
            i = index + 1
            # The last segment always gets 'Z' as its first marker letter.
            if i == nbBlocks:
                l1 = 'Z'
            segHeader = header + " P" + l1 + l2
            rawSegment = segHeader + '\n' + part

            if self.client.nodups and priority != '0' and self.in_cache(
                    rawSegment, False, None):
                continue

            succes, nbBytesSent = self.write_data(rawSegment)
            if not succes:
                return (False, totSent)

            self.tallyBytes(nbBytesSent)
            self.logger.info(
                "(%i Bytes) Bulletin Segment number %d sent (%s)" %
                (nbBytesSent, i, segHeader))
            self.logger.debug("Bulletin is \n%s" % rawSegment)

        return (True, totSent)