def _storeArticle(dbManager, args, lock):
    """Insert the article described by ``args`` into the database under ``lock``.

    Returns whatever ``dbManager.put_article_with_args`` returns; the caller
    treats a falsy result as "duplicate article".
    """
    lock.acquire()
    try:
        return dbManager.put_article_with_args(args)
    finally:
        # Always release, even if the insert raises, so other workers that
        # share this lock are not blocked forever.
        lock.release()


def _cleanWordsInFile(filePath):
    """Rewrite ``filePath`` in place, keeping only the cleaned words.

    Every whitespace-separated word is passed through ``cleaningOfWord``;
    words for which it returns ``None`` are dropped. The surviving words are
    written back on a single line, each followed by one space.
    """
    # TODO it shouldn't be opened again (the content was just written by the
    # caller); kept because write_to_file's exact output is not visible here.
    with open(filePath, 'r+') as articleFile:
        cleanedWords = []
        for line in articleFile:
            for word in line.split():
                cleanedWord = cleaningOfWord(word)
                if cleanedWord is not None:
                    cleanedWords.append(cleanedWord)

        # Single join instead of repeated "+=": same bytes (a trailing space
        # after every word), without quadratic string building.
        newFileContent = "".join(word + " " for word in cleanedWords)

        # Overwrite from the start and cut off any leftover tail of the
        # longer, uncleaned content.
        articleFile.seek(0)
        articleFile.write(newFileContent)
        articleFile.truncate()


def cleanAllWordsFromArticles(listOfArticles, pathToArticles, lock):
    """Structure-clean, store and word-clean every article in ``listOfArticles``.

    For each file name:

    1. ``structureCleaning`` is run on it; a ``None`` result means the file
       is rejected ("Wrong structure").
    2. Otherwise the result is stored through a ``DatabaseManager`` while
       holding ``lock``; a falsy result is reported as a duplicate.
    3. On success ``args[3]`` is written to ``pathToArticles + fileName``
       (presumably the article text -- confirm against ``structureCleaning``)
       and that file is then word-cleaned in place.
    4. The file ``DIR_DUMP + fileName`` is removed.

    Args:
        listOfArticles: sized iterable of article file names.
        pathToArticles: prefix (directory path including the trailing
            separator) prepended to each file name for the output file.
        lock: object with ``acquire()``/``release()`` guarding database
            inserts.

    Returns:
        ``len(listOfArticles)``, i.e. the number of file names handled,
        regardless of how many were accepted.
    """
    dbManager = DatabaseManager()

    for currentFileName in listOfArticles:
        # Start with structure cleaning
        args = structureCleaning(currentFileName)

        if args is None:
            print("Wrong structure, denied file: " + currentFileName)
        elif _storeArticle(dbManager, args, lock):
            # Save the structured content to its new location, then strip
            # unwanted words from that file.
            articlePath = pathToArticles + currentFileName
            write_to_file(args[3], articlePath)
            _cleanWordsInFile(articlePath)
        else:
            print("Duplicate file: " + currentFileName)

        # NOTE(review): the dump file is removed for every processed name,
        # whether it was accepted, rejected or a duplicate -- confirm this is
        # the intended cleanup scope.
        os.remove(DIR_DUMP + currentFileName)

    return len(listOfArticles)