def processText(inFile, outDir, dbDir):
    """Write the words of ``inFile`` that no known word source contains.

    Words read from ``inFile`` are dropped when they also occur in the
    word file at ``sysHandle.getRawPath(inFile, dbDir)`` or in the list
    returned by ``getWordList()``. The remaining words keep their original
    order (and any duplicates) and are written to
    ``sysHandle.getRawPath(inFile, outDir)``. Afterwards ``outDir`` is
    passed to ``sysHandle.openDir`` and the interpreter exits.

    Args:
        inFile: path of the text file holding the candidate words.
        outDir: directory that receives the filtered word list.
        dbDir: directory holding the database word file matching ``inFile``
            (presumably the same file name -- confirm against
            ``sysHandle.getRawPath``).

    Raises:
        SystemExit: always, because the last statement is ``sys.exit()``.
    """
    pathRecycleOut = sysHandle.getRawPath(inFile, outDir)
    pathDatabaseIn = sysHandle.getRawPath(inFile, dbDir)

    trashListIn = sysHandle.getWordFromTextFile(inFile)

    # Build each lookup collection once as a set so every membership test
    # is O(1) instead of a linear scan of a list.
    # NOTE(review): assumes the words are hashable (plain strings) -- confirm.
    databaseWords = set(sysHandle.getWordFromTextFile(pathDatabaseIn))
    recycleListOut = [
        item for item in trashListIn if item not in databaseWords
    ]

    standardWords = set(getWordList())
    newRecycle = [
        item for item in recycleListOut if item not in standardWords
    ]

    sysHandle.writeListToFile(newRecycle, pathRecycleOut)
    sysHandle.openDir(outDir)
    sys.exit()
def processText(pathIn, dirOut):
    """Clean the word list stored in ``pathIn`` and save it under ``dirOut``.

    The words returned by ``sysHandle.getWordFromTextFile(pathIn)`` are run
    through ``trim_and_sort`` and written to the path produced by
    ``sysHandle.getRawPath(pathIn, dirOut)``. ``dirOut`` is then passed to
    ``sysHandle.openDir`` and the interpreter exits.

    Args:
        pathIn: path of the text file to read words from.
        dirOut: directory that receives the cleaned list.

    Raises:
        SystemExit: always, because the last statement is ``sys.exit()``.
    """
    destination = sysHandle.getRawPath(pathIn, dirOut)
    cleaned = trim_and_sort(sysHandle.getWordFromTextFile(pathIn))

    sysHandle.writeListToFile(cleaned, destination)
    sysHandle.openDir(dirOut)
    sys.exit()
def processText(pathIn, dirOut, dirLog):
    """Run sentence extraction on ``pathIn`` and record a two-line log.

    ``extractSentences`` is called with ``pathIn`` and the output path from
    ``sysHandle.getRawPath(pathIn, dirOut)``. A start message and a
    completion message, each suffixed with ``sysHandle.getDateStamp()``,
    are written to a dated log file under ``dirLog``. ``dirOut`` is then
    passed to ``sysHandle.openDir`` and the interpreter exits.

    Args:
        pathIn: path of the input text file.
        dirOut: directory that receives the extraction output.
        dirLog: directory that receives the log file.

    Raises:
        SystemExit: always, because the last statement is ``sys.exit()``.
    """
    target_path = sysHandle.getRawPath(pathIn, dirOut)
    log_path = sysHandle.getDatedFilePath("Extract_Sentence_on_", dirLog)

    # Each stamp is taken right before / right after the extraction call.
    started = "Starting to extract sentences at " + sysHandle.getDateStamp()
    extractSentences(pathIn, target_path)
    finished = "Sentence extracting completed at " + sysHandle.getDateStamp()

    sysHandle.writeListToFile([started, finished], log_path)
    sysHandle.openDir(dirOut)
    sys.exit()
def processTab3(pathClean, dirRaw, dirRecycle):
    """Save the raw-file entries that are absent from the clean file.

    The raw counterpart of ``pathClean`` is located with
    ``getRawPath(pathClean, dirRaw)``. Both files are loaded with
    ``loadFormatPairFile``; every raw entry not found among the clean
    entries is kept, passed through ``unpackPairs`` and written to
    ``getRawPath(pathClean, dirRecycle)``. ``dirRecycle`` is then passed to
    ``openDir`` and the interpreter exits.

    Args:
        pathClean: path of the clean file.
        dirRaw: directory holding the matching raw file.
        dirRecycle: directory that receives the leftover entries.

    Raises:
        SystemExit: always, because the last statement is ``sys.exit()``.
    """
    raw_path = getRawPath(pathClean, dirRaw)
    recycle_path = getRawPath(pathClean, dirRecycle)

    raw_entries = loadFormatPairFile(raw_path)
    clean_entries = loadFormatPairFile(pathClean)

    # Keep raw entries in their original order, skipping the ones that
    # also appear in the clean file.
    leftovers = []
    for entry in raw_entries:
        if entry in clean_entries:
            continue
        leftovers.append(entry)

    writeListToFile(unpackPairs(leftovers), recycle_path)
    openDir(dirRecycle)
    sys.exit()
def processText(pathIn, dirOut, dirExclusion, dirLog):
    """Extract words from ``pathIn``, drop excluded ones, and log each step.

    The words read from ``pathIn`` are cleaned with ``cleanWordList``;
    every word whose lowercase form appears in the collection returned by
    ``sysHandle.loadDictionaries(dirExclusion)`` is removed, as are empty
    items. The result is written to ``sysHandle.getRawPath(pathIn, dirOut)``
    and the progress messages to a dated log file under ``dirLog``.
    ``dirOut`` is then passed to ``sysHandle.openDir`` and the interpreter
    exits.

    Args:
        pathIn: path of the text file to read words from.
        dirOut: directory that receives the extracted word list.
        dirExclusion: directory handed to ``sysHandle.loadDictionaries`` to
            obtain the exclusion list.
        dirLog: directory that receives the log file.

    Raises:
        SystemExit: always, because the last statement is ``sys.exit()``.
    """
    pathOut = sysHandle.getRawPath(pathIn, dirOut)
    pathLog = sysHandle.getDatedFilePath("Word_Extract_Log_", dirLog)
    logData = []

    def logStep(text):
        # Stamp one progress message, keep it for the log file and echo it.
        message = text + sysHandle.getDateStamp()
        logData.append(message)
        print(message)

    logStep("Starting to extract words at ")

    # STEP 1: read data file and split to get words
    words = sysHandle.getWordFromTextFile(pathIn)
    logStep("Reading word list completed at ")

    # STEP 2: trim left, right, remove overlappings and sort
    wordList = cleanWordList(words)
    logStep("Trimming word list completed at ")

    # STEP 3: remove items found in exclusion list, remove empty string.
    # A set makes each exclusion lookup O(1) instead of a linear scan.
    # NOTE(review): assumes loadDictionaries yields hashable words -- confirm.
    excluded = set(sysHandle.loadDictionaries(dirExclusion))
    # Testing truthiness first also skips empty items before .lower() runs.
    cleanList = [w for w in wordList if w and w.lower() not in excluded]
    # Message text is kept exactly as before in case logs are parsed.
    logStep("Removing exluded items completed at ")

    sysHandle.writeListToFile(cleanList, pathOut)
    sysHandle.writeListToFile(logData, pathLog)
    sysHandle.openDir(dirOut)
    sys.exit()
def processText(inFile, outDir, dictDir, trashDir, logDir, recycleDir):
    """Split the words of ``inFile`` into clean and trash lists and log it.

    The content of ``inFile`` is converted with ``convertList`` and split
    by ``filterList`` against the dictionaries loaded from ``dictDir``.
    The clean part is split by ``splitDictByCase``; items of its second
    ("upper") part are kept only when their lowercase form is not in the
    first ("lower") part of ``splitDictByCase(getWordList())``, and the
    first ("lower") clean part is appended unchanged. Trash items found in
    the collection loaded from ``recycleDir`` are dropped. Both lists are
    written to files under ``outDir`` and ``trashDir``, the progress
    messages to a dated log file under ``logDir``. ``outDir`` is then
    passed to ``sysHandle.openDir`` and the interpreter exits.

    Args:
        inFile: path of the text file to check.
        outDir: directory that receives the clean list.
        dictDir: directory handed to ``sysHandle.loadDictionaries`` for the
            reference dictionaries.
        trashDir: directory that receives the trash list.
        logDir: directory that receives the log file.
        recycleDir: directory handed to ``sysHandle.loadDictionaries`` for
            the recycle list.

    Raises:
        SystemExit: always, because the last statement is ``sys.exit()``.
    """
    pathLog = sysHandle.getDatedFilePath("Dictionary_Check_Log_", logDir)
    logData = []

    def logStep(text):
        # Stamp one progress message, keep it for the log file and echo it.
        message = text + sysHandle.getDateStamp()
        logData.append(message)
        print(message)

    # Message texts are kept exactly as before in case logs are parsed.
    logStep("Starting to directionary-check at ")

    pathOutClean = sysHandle.getRawPath(inFile, outDir)
    pathOutTrash = sysHandle.getRawPath(inFile, trashDir)

    rawList = convertList(sysHandle.readTextFile(inFile))
    dicList = sysHandle.loadDictionaries(dictDir)

    # split clean and trash based on dictionary
    listClean, listTrash = filterList(rawList, dicList)
    # split into lower case and upper case parts
    lowerClean, upperClean = splitDictByCase(listClean)
    # get a list of words from mysql database; only the lower-case part
    # is consulted below
    lowerDic, _ = splitDictByCase(getWordList())
    logStep("Loading dictionary completed at ")

    # Sets make each membership test O(1) instead of a linear scan.
    # NOTE(review): assumes all words involved are hashable strings -- confirm.
    knownLower = set(lowerDic)
    newUpperClean = [
        item for item in upperClean if item.lower() not in knownLower
    ]
    newClean = newUpperClean + lowerClean
    logStep("Completed dictionary checking at ")

    recycled = set(sysHandle.loadDictionaries(recycleDir))
    newListTrash = [item for item in listTrash if item not in recycled]

    sysHandle.writeListToFile(newClean, pathOutClean)
    sysHandle.writeListToFile(newListTrash, pathOutTrash)
    logStep("Finished directionary checking at ")

    sysHandle.writeListToFile(logData, pathLog)
    sysHandle.openDir(outDir)
    sys.exit()