예제 #1
0
def cleanAllWordsFromArticles(listOfArticles, pathToArticles, lock):
    """Structure-clean, register and word-clean every article in a list.

    For each file name the function:

    1. Runs ``structureCleaning`` on it. A ``None`` result means the file
       is rejected: a message is printed and the file is removed from
       ``DIR_DUMP``.
    2. Stores the result through ``DatabaseManager.put_article_with_args``
       while holding ``lock``. A falsy return is reported as a duplicate
       and the file is skipped.
    3. Writes ``args[3]`` to ``pathToArticles + currentFileName``, then
       rewrites that file in place with every whitespace-separated word
       passed through ``cleaningOfWord``. Words for which it returns
       ``None`` are dropped; each kept word is followed by one space.

    Args:
        listOfArticles: Iterable (with ``len``) of article file names.
        pathToArticles: Directory prefix for the output files. It is
            concatenated directly with the file name, so it presumably
            needs a trailing path separator -- confirm with callers.
        lock: Object exposing ``acquire()`` / ``release()``, used to
            serialize the database insert.

    Returns:
        ``len(listOfArticles)``, i.e. the number of names received, not
        the number of articles that were successfully cleaned.
    """
    dbManager = DatabaseManager()

    for currentFileName in listOfArticles:

        # Start with structure cleaning
        args = structureCleaning(currentFileName)
        if args is None:
            print("Wrong structure, denied file: " + currentFileName)
            # NOTE(review): removal targets DIR_DUMP, not pathToArticles;
            # presumably the raw input directory -- confirm.
            os.remove(DIR_DUMP + currentFileName)
            continue

        # Release the lock even if the insert raises, otherwise every
        # other worker sharing this lock would block forever.
        lock.acquire()
        try:
            passed = dbManager.put_article_with_args(args)
        finally:
            lock.release()

        if not passed:
            print("Duplicate file: " + currentFileName)
            continue

        # Save file to new file
        articlePath = pathToArticles + currentFileName
        write_to_file(args[3], articlePath)

        # Open it again
        # TODO it shouldn't be opened again
        # The context manager guarantees the handle is closed on error.
        with open(articlePath, 'r+') as currentFile:
            # Collect pieces and join once instead of repeated string +=.
            cleanedWords = []
            for line in currentFile:
                for word in line.split():
                    cleanedWord = cleaningOfWord(word)
                    if cleanedWord is not None:
                        cleanedWords.append(cleanedWord + " ")

            # Overwrite from the start and cut off any leftover tail of
            # the previous (longer) content.
            currentFile.seek(0)
            currentFile.write("".join(cleanedWords))
            currentFile.truncate()

    return len(listOfArticles)