def processJSONDirectory(dataDir, logDir):
    """Run ``processSingleFile`` over every entry of ``dataDir`` and log it.

    Args:
        dataDir: Directory whose entries (as returned by ``os.listdir``) are
            each passed to ``processSingleFile`` together with ``dataDir``.
        logDir: Directory handed to ``getDatedFilePath`` to build the log
            file path.

    The start and finish messages are printed as well as logged. Whatever
    ``processSingleFile`` returns for an entry is appended, item by item, to
    the log lines. The collected lines are written with ``writeListToFile``
    once all entries have been handled.
    """
    logPath = getDatedFilePath('JSON_To_Mongo_Log', logDir)
    print('log path', logPath)

    startLine = 'Started processing JSON at ' + getDateStamp()
    logLines = [startLine]
    print(startLine)

    # Each call contributes its own batch of log lines.
    for entryName in os.listdir(dataDir):
        logLines.extend(processSingleFile(entryName, dataDir))

    endLine = 'Finished processing JSON at ' + getDateStamp()
    logLines.append(endLine)
    print(endLine)

    writeListToFile(logLines, logPath)
def processText(pathIn, dirOut, dirLog):
    """Extract sentences from ``pathIn`` and record start/finish times.

    Args:
        pathIn: Input path passed to ``extractSentences``.
        dirOut: Directory used with ``sysHandle.getRawPath`` to derive the
            output path; also passed to ``sysHandle.openDir`` at the end.
        dirLog: Directory used with ``sysHandle.getDatedFilePath`` to derive
            the log file path.

    This function does not return: it finishes by calling ``sys.exit()``,
    which raises ``SystemExit``.
    """
    pathOut = sysHandle.getRawPath(pathIn, dirOut)
    pathLog = sysHandle.getDatedFilePath("Extract_Sentence_on_", dirLog)

    logLines = ["Starting to extract sentences at " + sysHandle.getDateStamp()]

    extractSentences(pathIn, pathOut)

    logLines.append(
        "Sentence extracting completed at " + sysHandle.getDateStamp()
    )
    sysHandle.writeListToFile(logLines, pathLog)

    sysHandle.openDir(dirOut)
    sys.exit()
def processText(pathIn, dirOut, dirExclusion, dirLog):
    """Extract words from ``pathIn``, drop excluded ones and write the rest.

    Args:
        pathIn: Text file read with ``sysHandle.getWordFromTextFile``.
        dirOut: Directory used with ``sysHandle.getRawPath`` to derive the
            output path; also passed to ``sysHandle.openDir`` at the end.
        dirExclusion: Directory passed to ``sysHandle.loadDictionaries`` to
            obtain the exclusion collection.
        dirLog: Directory used with ``sysHandle.getDatedFilePath`` to derive
            the log file path.

    Each stage is timestamped; the message is both printed and logged.
    This function does not return: it finishes by calling ``sys.exit()``,
    which raises ``SystemExit``.
    """
    pathOut = sysHandle.getRawPath(pathIn, dirOut)
    pathLog = sysHandle.getDatedFilePath("Word_Extract_Log_", dirLog)
    logLines = []

    def _note(prefix):
        # Stamp, record and echo one progress line.
        line = prefix + sysHandle.getDateStamp()
        logLines.append(line)
        print(line)

    _note("Starting to extract words at ")

    # Step 1: read the data file and split it into words.
    rawWords = sysHandle.getWordFromTextFile(pathIn)
    _note("Reading word list completed at ")

    # Step 2: normalise the word list via cleanWordList (per the original
    # notes it trims, removes duplicates and sorts -- not visible here).
    trimmedWords = cleanWordList(rawWords)
    _note("Trimming word list completed at ")

    # Step 3: keep words whose lower-cased form is not excluded and which
    # are non-empty.
    excluded = sysHandle.loadDictionaries(dirExclusion)
    keptWords = [
        word
        for word in trimmedWords
        if word.lower() not in excluded and word
    ]
    _note("Removing exluded items completed at ")

    sysHandle.writeListToFile(keptWords, pathOut)
    sysHandle.writeListToFile(logLines, pathLog)

    sysHandle.openDir(dirOut)
    sys.exit()
def uploadData(pathIn, bookID, dirLog):
    """Upload the sentences in ``pathIn`` for ``bookID`` and log the run.

    Args:
        pathIn: Path forwarded to ``upload_data``.
        bookID: Identifier forwarded to ``upload_data`` and recorded in the
            log (converted with ``str``).
        dirLog: Directory used with ``sysHandle.getDatedFilePath`` to derive
            the log file path.

    The value returned by ``upload_data`` is logged as the total number of
    sentences written. This function does not return: it finishes by calling
    ``sys.exit()``, which raises ``SystemExit``.
    """
    pathLog = sysHandle.getDatedFilePath("Upload_Sentences_To_MySQL_on_", dirLog)

    logLines = [
        "Starting to upload sentences at " + sysHandle.getDateStamp(),
        "BookID being uploaded: " + str(bookID),
    ]

    uploadedCount = upload_data(pathIn, bookID)
    logLines.append("Total sentences written to MySQL " + str(uploadedCount))

    logLines.append(
        "Sentence uploading completed at " + sysHandle.getDateStamp()
    )
    sysHandle.writeListToFile(logLines, pathLog)
    sys.exit()
print('Connection Error with IP ' + str(item)) statList.append('Connection Error with IP ' + str(item)) except urllib3.exceptions.MaxRetryError: print('Max Retry Error with IP ' + str(item)) statList.append('Max Retry Error with IP ' + str(item)) except Exception as e: statList.append('Error verifying IP ' + str(item)) print(e) file.close() dateStamp = sysHand.getDateStamp() statList.append('Finish verifying proxy at ' + dateStamp) return statList if __name__ == '__main__': proxyDir = 'D:/Proxy/List' outPath = 'D:/Proxy/Filter/good_proxy_list.txt' logDir = 'D:/Proxy/Log' initialString = "Proxy_Verification_Log_" logPath = sysHand.getDatedFilePath(initialString, logDir) #clear file contents open(outPath, "w").close() proxyList = sysHand.loadProxyLines(proxyDir) statusList = verifyProxy(proxyList, outPath) sysHand.writeListToFile(statusList, logPath)
return message if __name__ == "__main__": dirIn = 'E:/FULLTEXT/LEXICO/TEXT' dirOut = 'E:/FULLTEXT/LEXICO/TEXT2' dirLog = 'E:/FULLTEXT/LEXICO/LOG' cf = config_handler.ConfigHandler() recentFile = cf.get_config_value(cf.RECENT_OPEN_FILE2) #print(recentFile) fileList = os.listdir(dirIn) lastFile = '' prefix = 'Lexicon_Second_Run_Log_' logData = [] logPath = getDatedFilePath(prefix, dirLog) #print('log path:', logPath) timeStamp = getDateStamp() message = 'Starting processing at ' + timeStamp logData.append(message) print(message) for item in fileList: if (item > recentFile): lastFile = item message = 'Processsing item ' + item logData.append(message) print(message) message = processRawText(item, dirIn, dirOut) logData.append(message) print(message)
def processText(inFile, outDir, dictDir, trashDir, logDir, recycleDir):
    """Split the words of ``inFile`` into clean and trash lists and save both.

    Args:
        inFile: File read with ``sysHandle.readTextFile`` and converted with
            ``convertList``.
        outDir: Directory for the clean output; also passed to
            ``sysHandle.openDir`` at the end.
        dictDir: Directory passed to ``sysHandle.loadDictionaries`` for the
            dictionary used by ``filterList``.
        trashDir: Directory for the trash output.
        logDir: Directory used with ``sysHandle.getDatedFilePath`` to derive
            the log file path.
        recycleDir: Directory passed to ``sysHandle.loadDictionaries`` for
            the items to drop from the trash list.

    Each stage is timestamped; the message is both printed and logged.
    This function does not return: it finishes by calling ``sys.exit()``,
    which raises ``SystemExit``.
    """
    pathLog = sysHandle.getDatedFilePath("Dictionary_Check_Log_", logDir)
    logLines = []

    def _note(prefix):
        # Stamp, record and echo one progress line.
        line = prefix + sysHandle.getDateStamp()
        logLines.append(line)
        print(line)

    _note("Starting to directionary-check at ")

    cleanPath = sysHandle.getRawPath(inFile, outDir)
    trashPath = sysHandle.getRawPath(inFile, trashDir)

    candidates = convertList(sysHandle.readTextFile(inFile))
    dictionary = sysHandle.loadDictionaries(dictDir)

    # Split the candidates into accepted / rejected using the dictionary.
    accepted, rejected = filterList(candidates, dictionary)

    # Separate the accepted words by case.
    lowerAccepted, upperAccepted = splitDictByCase(accepted)

    # Known words from getWordList (per the original note, a MySQL-backed
    # list); only the lower-case half is consulted below.
    lowerKnown, _ = splitDictByCase(getWordList())
    _note("Loading dictionary completed at ")

    # Keep upper-case words only when their lower-cased form is not known.
    unseenUpper = [
        word for word in upperAccepted if word.lower() not in lowerKnown
    ]
    finalClean = unseenUpper + lowerAccepted
    _note("Completed dictionary checking at ")

    recycled = sysHandle.loadDictionaries(recycleDir)
    finalTrash = [word for word in rejected if word not in recycled]

    sysHandle.writeListToFile(finalClean, cleanPath)
    sysHandle.writeListToFile(finalTrash, trashPath)
    _note("Finished directionary checking at ")

    sysHandle.writeListToFile(logLines, pathLog)
    sysHandle.openDir(outDir)
    sys.exit()