import gensim
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import MWETokenizer, RegexpTokenizer, sent_tokenize
from tqdm import tqdm

# FileOperations, GV (the project's globals), Preprocessing and
# GetGlossaryTerms come from the surrounding project and are not shown here.

def TokenizeDocs(docs, glossarylist, filename=GV.tokenizedDocumentD2VFile):
    tokenizeddocs = []
    combineddocuments = []
    fo = FileOperations()
    if fo.exists(filename):
        # Load the file
        combineddocuments = fo.LoadFile(filename)

    else:
        tokenizer = MWETokenizer(glossarylist)
        regtokenizer = RegexpTokenizer(r'\w+')
        for doc in tqdm(docs):
            sentences = sent_tokenize(doc)

            tmp = []
            for sentence in sentences:
                # The sentence is lowercased before tokenizing, so the tokens
                # are already lowercase
                tokens = tokenizer.tokenize(regtokenizer.tokenize(sentence.lower()))
                tmp.append(tokens)
            tokenizeddocs.append(tmp)

        # Flatten each document's sentences into a single token list
        for doc in tqdm(tokenizeddocs):
            tokdoc = []
            for sent in doc:
                tokdoc.extend(sent)
            combineddocuments.append(tokdoc)

        # Save the file
        fo.SaveFile(filename, combineddocuments, mode='wb')

    del fo

    return combineddocuments
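
# Why the glossary feeds MWETokenizer: each entry is a tuple of words that
# should be re-joined into a single token after the regex pass splits the
# sentence apart. A minimal sketch with made-up glossary terms:
_glossary = [('blood', 'pressure'), ('heart', 'rate')]
_mwe = MWETokenizer(_glossary)  # the default separator is '_'
_reg = RegexpTokenizer(r'\w+')
print(_mwe.tokenize(_reg.tokenize('resting heart rate and blood pressure')))
# -> ['resting', 'heart_rate', 'and', 'blood_pressure']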
def InitializeGlossary():

    # Create FileOperation object
    fo = FileOperations()

    # Initialize the two lists to None
    glossarylist, synonymlist = None, None

    if fo.exists(GV.healthGlossaryFilePath) and fo.exists(GV.synonymsFilePath):
        # Load both files from disk
        glossarylist = fo.LoadFile(GV.healthGlossaryFilePath)
        synonymlist = fo.LoadFile(GV.synonymsFilePath)

    else:
        # Get all the glossary terms
        glossarylist, synonymlist = GetGlossaryTerms()

        # Save the glossary terms
        fo.SaveFile(GV.healthGlossaryFilePath, glossarylist, mode='wb')

        # Save the synonyms
        fo.SaveFile(GV.synonymsFilePath, synonymlist, mode='wb')

    del fo

    return glossarylist, synonymlist
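
# Hedged usage sketch: build the glossary and synonym lists once, then reuse
# the cached copies on later runs (file paths come from the project's GV
# globals, which are not shown here):
glossarylist, synonymlist = InitializeGlossary()
print(len(glossarylist), 'glossary terms,', len(synonymlist), 'synonym entries')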
def PreprocessData():
    # Parsed article list, initialized to None
    pubmedarticlelists = None

    # Create FileOperations object
    fo = FileOperations()

    # Create Preprocessing object, which parses the XML file
    p = Preprocessing()

    # If parsed file is present then load the file else parse the file
    if fo.exists(GV.parsedDataFile):
        pubmedarticlelists = p.LoadFile(GV.parsedDataFile)

    else:
        # Call the Parse method
        pubmedarticlelists, unsavedpmids = p.parse(GV.inputXmlFile)

        print('Parsed articles:', len(pubmedarticlelists))
        print('Unsaved PMIDs:', len(unsavedpmids))

        # Save the parsed data to a file
        fo.SaveFile(GV.parsedDataFile, pubmedarticlelists, mode='wb')
        fo.SaveFile(GV.unsavedPmidFile, unsavedpmids, mode='w')

        # Reload the parsed data so the returned object matches the saved copy
        pubmedarticlelists = p.LoadFile(GV.parsedDataFile)

    del fo

    return pubmedarticlelists
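
# FileOperations itself is a project module that is not part of these
# examples. A hypothetical minimal stand-in for the interface the snippets
# rely on (exists / LoadFile / SaveFile, with pickle for the 'wb' mode and
# plain text otherwise):
import os
import pickle

class _FileOperationsSketch:
    """Hypothetical stand-in; the real project class is not shown here."""

    def exists(self, path):
        return os.path.exists(path)

    def LoadFile(self, path):
        with open(path, 'rb') as f:
            return pickle.load(f)

    def SaveFile(self, path, data, mode='wb'):
        with open(path, mode) as f:
            if 'b' in mode:
                pickle.dump(data, f)
            else:
                f.write('\n'.join(str(item) for item in data))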
def TokenizeDocsNew(docs, glossarylist, filename=GV.tokenizedDocumentD2VFile):
    tokenizeddocs = []
    combineddocuments = []
    fo = FileOperations()
    if fo.exists(filename):
        # Load the file
        combineddocuments = fo.LoadFile(filename)

    else:
        tokenizer = MWETokenizer(glossarylist)
        regtokenizer = RegexpTokenizer(r'\w+')
        lmtzr = WordNetLemmatizer()
        stemmer = SnowballStemmer("english", ignore_stopwords=True)
        stop_words = stopwords.words('english')
        for doc in tqdm(docs):
            sentences = sent_tokenize(doc)

            tmp = []
            for sentence in sentences:
                # Tokenize the sentence with the regex tokenizer, then re-join
                # multi-word glossary terms with MWETokenizer (the sentence is
                # lowercased first, so no separate lowering pass is needed)
                tokens = tokenizer.tokenize(regtokenizer.tokenize(sentence.lower()))

                # Lemmatize the sentence: find the POS tags, then lemmatize
                tokens_tagged = nltk.pos_tag(tokens)
                lemmatized_sentence = [lmtzr.lemmatize(wrd, pos=get_wordnet_pos(tag))
                                       for wrd, tag in tokens_tagged]

                # Stem the lemmatized tokens
                stemmed_sentence = [stemmer.stem(wrd) for wrd in lemmatized_sentence]

                # Remove the stop words
                processed_sentence = [word for word in stemmed_sentence if word not in stop_words]

                tmp.append(processed_sentence)
            tokenizeddocs.append(tmp)

        # Flatten each document's sentences into a single token list
        for doc in tqdm(tokenizeddocs):
            tokdoc = []
            for sent in doc:
                tokdoc.extend(sent)
            combineddocuments.append(tokdoc)

        # Save the file
        fo.SaveFile(filename, combineddocuments, mode='wb')

    del fo

    return combineddocuments
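
# get_wordnet_pos is not shown in the example above. A common implementation,
# and presumably what this code assumes, maps Penn Treebank tags onto the
# WordNet POS constants that WordNetLemmatizer expects (noun by default):
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN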
def SaveGlossary(glossarylist, synonymlist):
    fo = FileOperations()

    if fo.exists(GV.glossaryFilePath):
        return
    else:
        # Reload both lists from disk (the saved copies take precedence over
        # the arguments passed in)
        glossarylist, synonymlist = fo.LoadFile(GV.healthGlossaryFilePath), fo.LoadFile(GV.synonymsFilePath)

        # Deduplicate the synonym word-tuples and append them to the glossary
        synonymterm2 = set(tuple(term2) for term1, term2 in synonymlist)
        glossarylist += [list(term) for term in synonymterm2]
        fo.SaveFile(GV.glossaryFilePath, glossarylist, mode='wb')
    del fo
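
# The reshaping above assumes synonymlist pairs each glossary term with the
# word tuple of one synonym. An illustration with made-up data:
_synonymlist = [(('blood', 'pressure'), ('arterial', 'pressure'))]
_unique = set(tuple(term2) for _term1, term2 in _synonymlist)
print([list(term) for term in _unique])
# -> [['arterial', 'pressure']], appended to the glossary so MWETokenizer
#    also joins synonym phrases into single tokens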
    def CreateTaggedDocuments(self, tokenizeddocs, ids):
        taggeddocuments = None
        fo = FileOperations()

        if fo.exists(GV.taggedDocumentFile):
            taggeddocuments = fo.LoadFile(GV.taggedDocumentFile)
        else:
            taggeddocuments = [
                gensim.models.doc2vec.TaggedDocument(s, [ids[i]])
                for i, s in tqdm(enumerate(tokenizeddocs))
            ]
            fo.SaveFile(GV.taggedDocumentFile, taggeddocuments, mode='wb')

        del fo

        return taggeddocuments
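
# A hedged sketch of the step that typically follows: training a gensim
# Doc2Vec model on the tagged documents returned above. The hyperparameters
# are illustrative, not taken from the original project.
model = gensim.models.doc2vec.Doc2Vec(vector_size=100, min_count=2, epochs=20)
model.build_vocab(taggeddocuments)
model.train(taggeddocuments, total_examples=model.corpus_count, epochs=model.epochs)
# Each document vector is then retrievable through the id it was tagged with,
# e.g. model.dv[pmid] (model.docvecs in gensim < 4.0).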
Example #7
    def check_depend_then_ren_and_embed_original_metadata(
            self, append_faststart=True, artwork=False, copy_chapters=False):
        """This method will run the "check_dependencies_then_render" method and attempt to embed any artwork from the
		original file into the output (due to how ffmpeg works, the artwork can't always be copied in one command.)\n
		if artwork is True it will try to embed artwork from the input into the output specifically.
		This may happen if ffmpeg tries to output artwork to the first stream of an audio only file."""

        # Run standard command to render output.
        out_file_exists_result = self.check_depend_then_ren(
            append_faststart=append_faststart)

        if isinstance(self.in_path, list):
            in_meta_file = self.in_path[0]
        else:
            in_meta_file = self.in_path

        # If the output file was created, embed the original file's metadata into it.
        if out_file_exists_result is True:
            # NOTE: This import is local to avoid a circular import.
            from FileOperations import FileOperations
            # This will attempt to embed any metadata (mainly for artwork) from the original file into the output.
            # (Due to how ffmpeg works, the artwork can't always be copied in one command.)
            # Create temporary output file with the original metadata embedded, delete the original output without the metadata,
            # and rename this temporary output to the desired output.
            for out_path in self.out_paths_list:
                temp_directory_to_embed_metadata = paths.Path().joinpath(
                    out_path.parent, '--temp_dir_to_embed_metadata_silently')
                paths.Path.mkdir(temp_directory_to_embed_metadata)
                temp_out_file = paths.Path().joinpath(
                    temp_directory_to_embed_metadata,
                    out_path.stem + out_path.suffix)
                FileOperations(out_path, temp_directory_to_embed_metadata,
                               False, self.print_ren_info, False,
                               False).copy_over_metadata(
                                   in_meta_file, copy_chapters)
                if temp_out_file.exists() is False:
                    if self.print_err is True:
                        print(
                            f'Error: temporary copy of "{out_path}" with the '
                            f'original metadata embedded was not created.'
                        )
                    paths.Path(temp_directory_to_embed_metadata).rmdir()
                else:
                    out_path.unlink()
                    temp_out_file.rename(out_path)
                if artwork is True:
                    temp_art = FileOperations(
                        in_meta_file, temp_directory_to_embed_metadata, False,
                        self.print_ren_info, False, False).extract_artwork()
                    if temp_art is not False:
                        if temp_art.exists():
                            FileOperations(out_path,
                                           temp_directory_to_embed_metadata,
                                           False, self.print_ren_info, False,
                                           False).embed_artwork(temp_art)
                            temp_art.unlink()
                            out_path.unlink()
                            temp_out_file.rename(out_path)
                temp_directory_to_embed_metadata.rmdir()

            # Return after every output path has been processed; an early
            # return inside the loop would skip the remaining output paths.
            return True

        else:
            # A problem occurred while rendering and no output file was created so quit.
            return False
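
# The swap performed above (write a temp copy with the metadata embedded,
# delete the original output, rename the temp copy into place) is a general
# pattern; a minimal pathlib sketch, independent of ffmpeg and of this class:
import pathlib

def _replace_with_temp(out_path: pathlib.Path, temp_out: pathlib.Path) -> None:
    if temp_out.exists():
        out_path.unlink()          # drop the version without metadata
        temp_out.rename(out_path)  # move the enriched copy into place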