示例#1
0
def processText(inFile, outDir, dbDir):
    """Write the words of ``inFile`` that appear in neither reference list.

    A word is kept when it is found neither in the word file located via
    ``sysHandle.getRawPath(inFile, dbDir)`` nor in the list returned by
    ``getWordList()``. Kept words stay in their original order (duplicates
    included) and are written to the path obtained from
    ``sysHandle.getRawPath(inFile, outDir)``. ``outDir`` is then opened and
    the process exits.

    Args:
        inFile: Path of the text file whose words are filtered.
        outDir: Directory used to derive the output path; opened afterwards.
        dbDir: Directory used to derive the path of the database word file.

    Raises:
        SystemExit: Always, through ``sys.exit()`` after the output is written.
    """
    pathRecycleOut = sysHandle.getRawPath(inFile, outDir)
    pathDatabaseIn = sysHandle.getRawPath(inFile, dbDir)
    trashListIn = sysHandle.getWordFromTextFile(inFile)

    # Build the lookups once: a set answers "is this word known?" in O(1),
    # whereas the former list scans cost O(len(list)) for every input word.
    # NOTE(review): assumes the words are hashable (plain strings) - confirm.
    databaseWords = set(sysHandle.getWordFromTextFile(pathDatabaseIn))
    standardWords = set(getWordList())

    # Single pass; same result as filtering by each list in turn.
    newRecycle = [
        item for item in trashListIn
        if item not in databaseWords and item not in standardWords
    ]

    sysHandle.writeListToFile(newRecycle, pathRecycleOut)
    sysHandle.openDir(outDir)
    # Deliberately ends the program once the job is done.
    sys.exit()
def processText(pathIn, dirOut):
    """Read the words of ``pathIn``, run them through ``trim_and_sort`` and save them.

    The result is written to the path returned by
    ``sysHandle.getRawPath(pathIn, dirOut)``; ``dirOut`` is then opened and
    the process exits.

    Args:
        pathIn: Path of the text file to read words from.
        dirOut: Directory used to derive the output path; opened afterwards.

    Raises:
        SystemExit: Always, through ``sys.exit()`` after the output is written.
    """
    destination = sysHandle.getRawPath(pathIn, dirOut)
    tidied = trim_and_sort(sysHandle.getWordFromTextFile(pathIn))
    sysHandle.writeListToFile(tidied, destination)
    sysHandle.openDir(dirOut)
    # Deliberately ends the program once the job is done.
    sys.exit()
def processText(pathIn, dirOut, dirLog):
    """Run ``extractSentences`` on ``pathIn`` and record start/end times in a log.

    The output path comes from ``sysHandle.getRawPath(pathIn, dirOut)`` and
    the log path from ``sysHandle.getDatedFilePath`` with the prefix
    ``"Extract_Sentence_on_"``. After the log is written, ``dirOut`` is
    opened and the process exits.

    Args:
        pathIn: Path of the input file handed to ``extractSentences``.
        dirOut: Directory used to derive the output path; opened afterwards.
        dirLog: Directory in which the dated log file is created.

    Raises:
        SystemExit: Always, through ``sys.exit()`` after the log is written.
    """
    destination = sysHandle.getRawPath(pathIn, dirOut)
    logPath = sysHandle.getDatedFilePath("Extract_Sentence_on_", dirLog)

    # Each entry is stamped at the moment the corresponding step happens.
    logLines = ["Starting to extract sentences at " + sysHandle.getDateStamp()]
    extractSentences(pathIn, destination)
    logLines.append(
        "Sentence extracting completed at " + sysHandle.getDateStamp()
    )

    sysHandle.writeListToFile(logLines, logPath)
    sysHandle.openDir(dirOut)
    # Deliberately ends the program once the job is done.
    sys.exit()
def processTab3(pathClean, dirRaw, dirRecycle):
    """Save the entries of the raw file that are missing from the clean file.

    Both files are loaded with ``loadFormatPairFile``. Entries present in the
    raw file but not in the clean one are passed through ``unpackPairs`` and
    written to the path returned by ``getRawPath(pathClean, dirRecycle)``.
    ``dirRecycle`` is then opened and the process exits.

    Args:
        pathClean: Path of the clean file; also used to derive the other paths.
        dirRaw: Directory used to derive the path of the raw file.
        dirRecycle: Directory used to derive the output path; opened afterwards.

    Raises:
        SystemExit: Always, through ``sys.exit()`` after the output is written.
    """
    rawFile = getRawPath(pathClean, dirRaw)
    recycleFile = getRawPath(pathClean, dirRecycle)

    rawEntries = loadFormatPairFile(rawFile)
    cleanEntries = loadFormatPairFile(pathClean)

    # Keep, in original order, every raw entry that did not survive cleaning.
    discarded = []
    for entry in rawEntries:
        if entry in cleanEntries:
            continue
        discarded.append(entry)

    writeListToFile(unpackPairs(discarded), recycleFile)
    openDir(dirRecycle)
    # Deliberately ends the program once the job is done.
    sys.exit()
示例#5
0
def processText(pathIn, dirOut, dirExclusion, dirLog):
    """Extract words from ``pathIn``, drop excluded ones, and log each step.

    The words read from ``pathIn`` are passed through ``cleanWordList``.
    Empty items and items whose lower-cased form is found in the
    dictionaries loaded from ``dirExclusion`` are then removed. The remaining
    words are written to the path from ``sysHandle.getRawPath(pathIn,
    dirOut)`` and a timestamped progress log is written to a dated file in
    ``dirLog``. Finally ``dirOut`` is opened and the process exits.

    Args:
        pathIn: Path of the text file to read words from.
        dirOut: Directory used to derive the output path; opened afterwards.
        dirExclusion: Directory handed to ``sysHandle.loadDictionaries`` to
            obtain the exclusion entries.
        dirLog: Directory in which the dated log file is created.

    Raises:
        SystemExit: Always, through ``sys.exit()`` after the files are written.
    """
    pathOut = sysHandle.getRawPath(pathIn, dirOut)
    pathLog = sysHandle.getDatedFilePath("Word_Extract_Log_", dirLog)
    logData = []

    def logStep(text):
        # Stamp one progress message, keep it for the log file and echo it.
        message = text + sysHandle.getDateStamp()
        logData.append(message)
        print(message)

    logStep("Starting to extract words at ")

    # STEP 1: read data file and split to get words
    words = sysHandle.getWordFromTextFile(pathIn)
    logStep("Reading word list completed at ")

    # STEP 2: clean the word list (per the original note: trim both sides,
    # remove duplicates and sort - done inside cleanWordList)
    wordList = cleanWordList(words)
    logStep("Trimming word list completed at ")

    # STEP 3: remove empty strings and items found in the exclusion list.
    # A set makes each lookup O(1) instead of a list scan per word.
    # NOTE(review): assumes exclusion entries are hashable strings - confirm.
    exclusionSet = set(sysHandle.loadDictionaries(dirExclusion))
    cleanList = [
        w for w in wordList if w and w.lower() not in exclusionSet
    ]
    logStep("Removing excluded items completed at ")

    sysHandle.writeListToFile(cleanList, pathOut)
    sysHandle.writeListToFile(logData, pathLog)
    sysHandle.openDir(dirOut)
    # Deliberately ends the program once the job is done.
    sys.exit()
def processText(inFile, outDir, dictDir, trashDir, logDir, recycleDir):
    """Dictionary-check the words of ``inFile`` and write clean/trash lists.

    Steps, as performed by the code below:

    1. ``filterList`` splits the converted input into a clean and a trash
       list using the dictionaries loaded from ``dictDir``.
    2. The clean list is split by case with ``splitDictByCase``. From its
       second ("upper") part, items whose lower-cased form is present in the
       first ("lower") part of ``getWordList()`` are dropped.
    3. Trash items that appear in the dictionaries loaded from
       ``recycleDir`` are dropped.
    4. The clean and trash lists are written to paths derived from
       ``inFile`` with ``outDir`` and ``trashDir`` respectively, and a
       timestamped progress log is written to a dated file in ``logDir``.

    Finally ``outDir`` is opened and the process exits.

    Args:
        inFile: Path of the text file to check.
        outDir: Directory used to derive the clean output path; opened last.
        dictDir: Directory of the dictionaries used by ``filterList``.
        trashDir: Directory used to derive the trash output path.
        logDir: Directory in which the dated log file is created.
        recycleDir: Directory of the dictionaries used to prune the trash list.

    Raises:
        SystemExit: Always, through ``sys.exit()`` after the files are written.
    """
    pathLog = sysHandle.getDatedFilePath("Dictionary_Check_Log_", logDir)
    logData = []

    def logStep(text):
        # Stamp one progress message, keep it for the log file and echo it.
        message = text + sysHandle.getDateStamp()
        logData.append(message)
        print(message)

    logStep("Starting to dictionary-check at ")

    pathOutClean = sysHandle.getRawPath(inFile, outDir)
    pathOutTrash = sysHandle.getRawPath(inFile, trashDir)
    rawList = convertList(sysHandle.readTextFile(inFile))
    dicList = sysHandle.loadDictionaries(dictDir)

    # split clean and trash based on dictionary
    listClean, listTrash = filterList(rawList, dicList)

    # split into lower case and upper case parts
    lowerClean, upperClean = splitDictByCase(listClean)

    # Word list from getWordList() (a MySQL database, per the original note);
    # only its lower-case part is needed here.
    lowerDic, _ = splitDictByCase(getWordList())
    logStep("Loading dictionary completed at ")

    # Sets make each lookup O(1) instead of a list scan per item.
    # NOTE(review): assumes all entries are hashable strings - confirm.
    lowerWords = set(lowerDic)
    newUpperClean = [
        item for item in upperClean if item.lower() not in lowerWords
    ]
    newClean = newUpperClean + lowerClean
    logStep("Completed dictionary checking at ")

    recycleWords = set(sysHandle.loadDictionaries(recycleDir))
    newListTrash = [item for item in listTrash if item not in recycleWords]

    sysHandle.writeListToFile(newClean, pathOutClean)
    sysHandle.writeListToFile(newListTrash, pathOutTrash)

    logStep("Finished dictionary checking at ")
    sysHandle.writeListToFile(logData, pathLog)

    sysHandle.openDir(outDir)
    # Deliberately ends the program once the job is done.
    sys.exit()