import gensim
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import MWETokenizer, RegexpTokenizer, sent_tokenize
from tqdm import tqdm

# Assumed local modules: FileOperations wraps file/pickle I/O, Preprocessing
# parses the PubMed XML, and GV holds the global file-path constants.
from FileOperations import FileOperations
from Preprocessing import Preprocessing
import GlobalVariables as GV


def TokenizeDocs(docs, glossarylist, filename=GV.tokenizedDocumentD2VFile):
    """Tokenize every document sentence by sentence, merging multi-word
    glossary terms into single tokens, and cache the result on disk."""
    tokenizeddocs = []
    combineddocuments = []
    fo = FileOperations()

    if fo.exists(filename):
        # Load the cached tokenized documents.
        combineddocuments = fo.LoadFile(filename)
    else:
        tokenizer = MWETokenizer(glossarylist)
        regtokenizer = RegexpTokenizer(r'\w+')
        for doc in tqdm(docs):
            sentences = sent_tokenize(doc)
            tmp = []
            for sentence in sentences:
                # Word-tokenize the lower-cased sentence, then merge known
                # multi-word glossary terms into single tokens.
                tokens = tokenizer.tokenize(regtokenizer.tokenize(sentence.lower()))
                token_lowercase = [x.lower() for x in tokens]
                tmp.append(token_lowercase)
            tokenizeddocs.append(tmp)

        # Flatten each document's sentence lists into one token list per doc.
        for doc in tqdm(tokenizeddocs):
            tokdoc = []
            for sent in doc:
                tokdoc.extend(sent)
            combineddocuments.append(tokdoc)

        # Save the file.
        fo.SaveFile(filename, combineddocuments, mode='wb')

    del fo
    return combineddocuments
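
# For reference: MWETokenizer is what keeps glossary phrases together above;
# it merges known multi-word expressions into single underscore-joined tokens
# after ordinary word tokenization. A minimal sketch, using made-up glossary
# entries rather than the project's real ones:
def _demo_mwe_tokenizer():
    glossary = [('blood', 'pressure'), ('heart', 'rate')]  # hypothetical terms
    mwe = MWETokenizer(glossary)
    words = RegexpTokenizer(r'\w+').tokenize('resting heart rate and blood pressure')
    return mwe.tokenize(words)
    # -> ['resting', 'heart_rate', 'and', 'blood_pressure']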

def InitializeGlossary():
    """Load the health glossary and synonym lists, building and caching them
    on the first run."""
    fo = FileOperations()
    # Both cache files are needed, so only load when both are present.
    if fo.exists(GV.healthGlossaryFilePath) and fo.exists(GV.synonymsFilePath):
        glossarylist = fo.LoadFile(GV.healthGlossaryFilePath)
        synonymlist = fo.LoadFile(GV.synonymsFilePath)
    else:
        # Get all the glossary terms.
        glossarylist, synonymlist = GetGlossaryTerms()
        # Save the glossary terms.
        fo.SaveFile(GV.healthGlossaryFilePath, glossarylist, mode='wb')
        # Save the synonyms.
        fo.SaveFile(GV.synonymsFilePath, synonymlist, mode='wb')
    del fo
    return glossarylist, synonymlist

def PreprocessData():
    """Parse the PubMed XML input, or load a previously parsed copy, and
    return the article dictionary."""
    pubmedarticlelists = None
    fo = FileOperations()
    p = Preprocessing()

    # If the parsed file is present then load it, else parse the XML file.
    if fo.exists(GV.parsedDataFile):
        pubmedarticlelists = p.LoadFile(GV.parsedDataFile)
    else:
        pubmedarticlelists, unsavedpmids = p.parse(GV.inputXmlFile)
        print(len(pubmedarticlelists))
        print(len(unsavedpmids))

        # Save the parsed data, then reload it so both branches return the
        # same on-disk representation.
        fo.SaveFile(GV.parsedDataFile, pubmedarticlelists, mode='wb')
        fo.SaveFile(GV.unsavedPmidFile, unsavedpmids, mode='w')
        pubmedarticlelists = p.LoadFile(GV.parsedDataFile)

    del fo
    return pubmedarticlelists

def TokenizeDocsNew(docs, glossarylist, filename=GV.tokenizedDocumentD2VFile):
    """Like TokenizeDocs, but additionally POS-tags, lemmatizes, stems, and
    removes stop words from every sentence."""
    tokenizeddocs = []
    combineddocuments = []
    fo = FileOperations()

    if fo.exists(filename):
        # Load the cached tokenized documents.
        combineddocuments = fo.LoadFile(filename)
    else:
        tokenizer = MWETokenizer(glossarylist)
        regtokenizer = RegexpTokenizer(r'\w+')
        lmtzr = WordNetLemmatizer()
        stemmer = SnowballStemmer("english", ignore_stopwords=True)
        stop_words = set(stopwords.words('english'))  # set for O(1) lookups

        for doc in tqdm(docs):
            sentences = sent_tokenize(doc)
            tmp = []
            for sentence in sentences:
                # Tokenize the sentence with the regex tokenizer, then merge
                # multi-word glossary terms with the MWETokenizer.
                tokens = tokenizer.tokenize(regtokenizer.tokenize(sentence.lower()))
                # Lower the case of all the tokens.
                token_lowercase = [x.lower() for x in tokens]
                # Lemmatize the sentence: find the POS tags, then lemmatize.
                tokens_lowercase_tagged = nltk.pos_tag(token_lowercase)
                lemmatized_sentence = [lmtzr.lemmatize(wrd, pos=get_wordnet_pos(tag))
                                       for wrd, tag in tokens_lowercase_tagged]
                # Stem the sentence.
                stemmed_sentence = [stemmer.stem(wrd) for wrd in lemmatized_sentence]
                # Remove the stop words.
                processed_sentence = [word for word in stemmed_sentence
                                      if word not in stop_words]
                tmp.append(processed_sentence)
            tokenizeddocs.append(tmp)

        # Flatten each document's sentence lists into one token list per doc.
        for doc in tqdm(tokenizeddocs):
            tokdoc = []
            for sent in doc:
                tokdoc.extend(sent)
            combineddocuments.append(tokdoc)

        # Save the file.
        fo.SaveFile(filename, combineddocuments, mode='wb')

    del fo
    return combineddocuments
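
# get_wordnet_pos is called above but not defined in this section. The usual
# implementation maps Penn Treebank tags to WordNet POS constants; this is an
# assumed stand-in for the project's helper, which may differ in detail.
from nltk.corpus import wordnet


def get_wordnet_pos(treebank_tag):
    # WordNetLemmatizer defaults to nouns for unknown tags, so fall back to
    # wordnet.NOUN here as well.
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    if treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN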

def SaveGlossary(glossarylist, synonymlist):
    """Merge the glossary terms with the deduplicated synonym terms and save
    the combined glossary. Note that the passed-in arguments are immediately
    overwritten by the cached copies on disk."""
    fo = FileOperations()
    if fo.exists(GV.glossaryFilePath):
        del fo
        return

    glossarylist = fo.LoadFile(GV.healthGlossaryFilePath)
    synonymlist = fo.LoadFile(GV.synonymsFilePath)

    # Deduplicate the synonym pairs by their second term. Lists are not
    # hashable, so round-trip each term through a tuple to use a set.
    synonymterm2 = set(tuple(term2) for term1, term2 in synonymlist)
    synonymterm2 = [list(term) for term in synonymterm2]

    glossarylist += synonymterm2
    fo.SaveFile(GV.glossaryFilePath, glossarylist, mode='wb')
    del fo
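
# A tiny illustration of the deduplication above, with made-up synonym pairs
# (each pair is assumed to be (original term, synonym term)):
def _demo_synonym_dedup():
    synonymlist = [(['cardiac', 'arrest'], ['heart', 'attack']),
                   (['myocardial', 'infarction'], ['heart', 'attack'])]
    unique = set(tuple(term2) for _, term2 in synonymlist)
    return [list(term) for term in unique]
    # -> [['heart', 'attack']]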

def CreateTaggedDocuments(self, tokenizeddocs, ids):
    """Wrap each tokenized document in a gensim TaggedDocument tagged with
    its corresponding id, and cache the result on disk."""
    taggeddocuments = None
    fo = FileOperations()
    if fo.exists(GV.taggedDocumentFile):
        taggeddocuments = fo.LoadFile(GV.taggedDocumentFile)
    else:
        taggeddocuments = [
            gensim.models.doc2vec.TaggedDocument(s, [ids[i]])
            for i, s in tqdm(enumerate(tokenizeddocs))
        ]
        fo.SaveFile(GV.taggedDocumentFile, taggeddocuments, mode='wb')
    del fo
    return taggeddocuments
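
# For context: the TaggedDocument list is the direct input to gensim's
# Doc2Vec. A minimal training sketch, with illustrative hyperparameter values
# rather than the project's actual settings:
def _train_doc2vec(taggeddocuments, vector_size=100, epochs=20):
    model = gensim.models.doc2vec.Doc2Vec(vector_size=vector_size,
                                          min_count=2, epochs=epochs)
    model.build_vocab(taggeddocuments)  # also sets model.corpus_count
    model.train(taggeddocuments,
                total_examples=model.corpus_count,
                epochs=model.epochs)
    return model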

# Note: "paths" below is assumed to be pathlib, imported elsewhere in this
# module (e.g. "import pathlib as paths").
def check_depend_then_ren_and_embed_original_metadata(
        self, append_faststart=True, artwork=False, copy_chapters=False):
    """Run the check_depend_then_ren method, then attempt to embed metadata
    (mainly artwork) from the original file into the output. Due to how
    ffmpeg works, the artwork can't always be copied in one command.

    If artwork is True, specifically try to embed artwork from the input
    into the output. This may be needed if ffmpeg tries to output artwork
    to the first stream of an audio-only file."""
    # Run the standard command to render the output.
    out_file_exists_result = self.check_depend_then_ren(
        append_faststart=append_faststart)

    if type(self.in_path) is list:
        in_meta_file = self.in_path[0]
    else:
        in_meta_file = self.in_path

    # If the output file exists, attempt to embed the metadata silently.
    if out_file_exists_result is True:
        # NOTE: This import is down here to avoid a circular import.
        from FileOperations import FileOperations

        # For each output: create a temporary copy with the original
        # metadata embedded, delete the output that lacks the metadata, and
        # rename the temporary copy to the desired output path.
        for out_path in self.out_paths_list:
            temp_directory_to_embed_metadata = paths.Path().joinpath(
                out_path.parent, '--temp_dir_to_embed_metadata_silently')
            paths.Path.mkdir(temp_directory_to_embed_metadata)
            temp_out_file = paths.Path().joinpath(
                temp_directory_to_embed_metadata,
                out_path.stem + out_path.suffix)

            FileOperations(
                out_path, temp_directory_to_embed_metadata, False,
                self.print_ren_info, False, False
            ).copy_over_metadata(in_meta_file, copy_chapters)

            if temp_out_file.exists() is False:
                if self.print_err is True:
                    print(f'Error, input file to extract metadata silently '
                          f'from "{out_path}" not found.')
                paths.Path(temp_directory_to_embed_metadata).rmdir()
            else:
                out_path.unlink()
                temp_out_file.rename(out_path)

                if artwork is True:
                    temp_art = FileOperations(
                        in_meta_file, temp_directory_to_embed_metadata,
                        False, self.print_ren_info, False, False
                    ).extract_artwork()
                    if temp_art is not False:
                        if temp_art.exists():
                            FileOperations(
                                out_path, temp_directory_to_embed_metadata,
                                False, self.print_ren_info, False, False
                            ).embed_artwork(temp_art)
                            temp_art.unlink()
                            out_path.unlink()
                            temp_out_file.rename(out_path)
                temp_directory_to_embed_metadata.rmdir()
        return True
    else:
        # A problem occurred while rendering and no output file was created,
        # so quit.
        return False