# Third-party imports assumed by the functions below; the project-specific helpers
# (FileOperations, GV, GetGlossaryTerms) are expected to be defined or imported
# elsewhere in this project.
import gensim
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import MWETokenizer, RegexpTokenizer, sent_tokenize
from tqdm import tqdm


def InitializeGlossary():
    # Create a FileOperations object
    fo = FileOperations()
    # Initialize the two lists to None
    glossarylist, synonymlist = [None] * 2
    if fo.exists(GV.healthGlossaryFilePath):
        # Load both lists from disk
        glossarylist = fo.LoadFile(GV.healthGlossaryFilePath)
        synonymlist = fo.LoadFile(GV.synonymsFilePath)
    else:
        # Get all the glossary terms
        glossarylist, synonymlist = GetGlossaryTerms()
        # Save the glossary terms
        fo.SaveFile(GV.healthGlossaryFilePath, glossarylist, mode='wb')
        # Save the synonyms
        fo.SaveFile(GV.synonymsFilePath, synonymlist, mode='wb')
    del fo
    return glossarylist, synonymlist
def SaveGlossary(glossarylist, synonymlist):
    fo = FileOperations()
    if fo.exists(GV.glossaryFilePath):
        # The combined glossary already exists on disk; nothing to do
        return
    else:
        # Reload the saved glossary and synonym lists from disk
        glossarylist, synonymlist = fo.LoadFile(GV.healthGlossaryFilePath), fo.LoadFile(GV.synonymsFilePath)
        # Collect the unique second terms of each synonym pair
        synonymterm2 = set(tuple(term2) for term1, term2 in synonymlist)
        synonymterm2 = [list(term) for term in synonymterm2]
        # Append the synonym terms to the glossary and save the combined list
        glossarylist += synonymterm2
        fo.SaveFile(GV.glossaryFilePath, glossarylist, mode='wb')
    del fo
def TokenizeDocs(docs, glossarylist, filename=GV.tokenizedDocumentD2VFile):
    tokenizeddocs = []
    combineddocuments = []
    fo = FileOperations()
    if fo.exists(filename):
        # Load the previously tokenized documents
        combineddocuments = fo.LoadFile(filename)
    else:
        # Merge multi-word glossary terms into single tokens after regex tokenization
        tokenizer = MWETokenizer(glossarylist)
        regtokenizer = RegexpTokenizer(r'\w+')
        for doc in tqdm(docs):
            sentences = sent_tokenize(doc)
            tmp = []
            for sentence in sentences:
                tokens = tokenizer.tokenize(regtokenizer.tokenize(sentence.lower()))
                token_lowercase = [x.lower() for x in tokens]
                tmp.append(token_lowercase)
            tokenizeddocs.append(tmp)
        # Flatten the per-sentence token lists into one token list per document
        for doc in tqdm(tokenizeddocs):
            tokdoc = []
            for sent in doc:
                tokdoc.extend(sent)
            combineddocuments.append(tokdoc)
        # Save the tokenized documents
        fo.SaveFile(filename, combineddocuments, mode='wb')
    del fo
    return combineddocuments
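# Minimal illustrative sketch (not part of the original module): shows the glossary
# shape TokenizeDocs expects -- each multi-word term as a sequence of tokens -- and
# how NLTK's MWETokenizer merges those terms with its default '_' separator.
# The sample glossary terms and sentence are made up.
def _demo_mwe_tokenization():
    sample_glossary = [['heart', 'attack'], ['blood', 'pressure']]  # hypothetical terms
    mwe = MWETokenizer(sample_glossary)
    regex = RegexpTokenizer(r'\w+')
    sentence = 'High blood pressure can precede a heart attack.'
    tokens = mwe.tokenize(regex.tokenize(sentence.lower()))
    # tokens -> ['high', 'blood_pressure', 'can', 'precede', 'a', 'heart_attack']
    return tokens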
def TokenizeDocsNew(docs, glossarylist, filename=GV.tokenizedDocumentD2VFile):
    tokenizeddocs = []
    combineddocuments = []
    fo = FileOperations()
    if fo.exists(filename):
        # Load the previously tokenized documents
        combineddocuments = fo.LoadFile(filename)
    else:
        tokenizer = MWETokenizer(glossarylist)
        regtokenizer = RegexpTokenizer(r'\w+')
        lmtzr = WordNetLemmatizer()
        stemmer = SnowballStemmer("english", ignore_stopwords=True)
        stop_words = stopwords.words('english')
        for doc in tqdm(docs):
            sentences = sent_tokenize(doc)
            tmp = []
            for sentence in sentences:
                # Tokenize the sentence with the regex tokenizer, then merge multi-word glossary terms
                tokens = tokenizer.tokenize(regtokenizer.tokenize(sentence.lower()))
                # Lower the case of all the tokens
                token_lowercase = [x.lower() for x in tokens]
                # Lemmatize the sentence: find the POS tags, then lemmatize
                tokens_lowercase_tagged = nltk.pos_tag(token_lowercase)
                lemmatized_sentence = [lmtzr.lemmatize(wrd, pos=get_wordnet_pos(tag))
                                       for wrd, tag in tokens_lowercase_tagged]
                # Stem the sentence
                stemmed_sentence = [stemmer.stem(wrd) for wrd in lemmatized_sentence]
                # Remove the stop words
                processed_sentence = [word for word in stemmed_sentence if word not in stop_words]
                tmp.append(processed_sentence)
            tokenizeddocs.append(tmp)
        # Flatten the per-sentence token lists into one token list per document
        for doc in tqdm(tokenizeddocs):
            tokdoc = []
            for sent in doc:
                tokdoc.extend(sent)
            combineddocuments.append(tokdoc)
        # Save the tokenized documents
        fo.SaveFile(filename, combineddocuments, mode='wb')
    del fo
    return combineddocuments
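# Minimal sketch of the get_wordnet_pos helper used above (an assumption: the
# project's own definition is not shown in this module). It maps the Penn Treebank
# tags returned by nltk.pos_tag to the WordNet POS constants that
# WordNetLemmatizer expects, defaulting to NOUN.
from nltk.corpus import wordnet

def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    return wordnet.NOUN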
def CreateTaggedDocuments(tokenizeddocs, ids):
    taggeddocuments = None
    fo = FileOperations()
    if fo.exists(GV.taggedDocumentFile):
        # Load the previously built tagged documents
        taggeddocuments = fo.LoadFile(GV.taggedDocumentFile)
    else:
        # Tag each tokenized document with its corresponding id for Doc2Vec training
        taggeddocuments = [gensim.models.doc2vec.TaggedDocument(s, [ids[i]])
                           for i, s in tqdm(enumerate(tokenizeddocs))]
        fo.SaveFile(GV.taggedDocumentFile, taggeddocuments, mode='wb')
    del fo
    return taggeddocuments
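# Illustrative end-to-end usage sketch (an assumption, not from the original code):
# one way these functions might chain together to train a gensim Doc2Vec model.
# The names `raw_docs` and `doc_ids` are hypothetical placeholders for the
# project's corpus and document identifiers.
if __name__ == "__main__":
    glossarylist, synonymlist = InitializeGlossary()
    SaveGlossary(glossarylist, synonymlist)

    raw_docs = ["..."]   # hypothetical list of document texts
    doc_ids = ["doc_0"]  # hypothetical matching document ids

    tokenized = TokenizeDocs(raw_docs, glossarylist)
    tagged = CreateTaggedDocuments(tokenized, doc_ids)

    # Standard gensim Doc2Vec training loop over the tagged documents
    model = gensim.models.doc2vec.Doc2Vec(vector_size=100, min_count=2, epochs=40)
    model.build_vocab(tagged)
    model.train(tagged, total_examples=model.corpus_count, epochs=model.epochs)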