def PreprocessData():
    """Return the parsed PubMed article collection, parsing the raw XML only on a cache miss.

    On the first run the XML input is parsed, the parsed articles are
    persisted (pickled) to ``GV.parsedDataFile`` and the PMIDs that could
    not be saved are written to ``GV.unsavedPmidFile``.  Subsequent runs
    load the cached parse directly.

    Returns:
        The parsed article collection produced by ``Preprocessing``.
    """
    fo = FileOperations()
    p = Preprocessing()
    if fo.exists(GV.parsedDataFile):
        # A previous run already parsed the XML - reuse the cached result.
        pubmedarticlelists = p.LoadFile(GV.parsedDataFile)
    else:
        # First run: parse the raw XML input.
        pubmedarticlelists, unsavedpmids = p.parse(GV.inputXmlFile)
        print(len(pubmedarticlelists))
        print(len(unsavedpmids))
        # Persist the parse so later runs can take the fast path above.
        fo.SaveFile(GV.parsedDataFile, pubmedarticlelists, mode='wb')
        fo.SaveFile(GV.unsavedPmidFile, unsavedpmids, mode='w')
        # Fix: the original re-loaded GV.parsedDataFile here right after
        # saving it; the in-memory object is the same data, so the
        # redundant disk round-trip is dropped.
    del fo
    return pubmedarticlelists
def InitializeGlossary():
    """Return ``(glossarylist, synonymlist)``, computing and caching them on first use.

    If the glossary cache file exists on disk both lists are read back
    from their files; otherwise they are built via ``GetGlossaryTerms``
    and persisted for subsequent runs.
    """
    fo = FileOperations()
    if fo.exists(GV.healthGlossaryFilePath):
        # Cached copies exist on disk - just read them back.
        glossarylist = fo.LoadFile(GV.healthGlossaryFilePath)
        synonymlist = fo.LoadFile(GV.synonymsFilePath)
    else:
        # First run: build the glossary, then persist both lists.
        glossarylist, synonymlist = GetGlossaryTerms()
        fo.SaveFile(GV.healthGlossaryFilePath, glossarylist, mode='wb')
        fo.SaveFile(GV.synonymsFilePath, synonymlist, mode='wb')
    del fo
    return glossarylist, synonymlist
def TokenizeDocs(docs, glossarylist, filename=GV.tokenizedDocumentD2VFile):
    """Tokenize each document into one flat token list, caching the result on disk.

    Each document is split into sentences; each sentence is lower-cased,
    regex-tokenized on word characters, and multi-word glossary terms are
    merged via ``MWETokenizer``.  The per-sentence token lists are then
    flattened so every document yields a single list of tokens.

    Returns:
        list[list[str]]: one flat token list per input document.
    """
    fo = FileOperations()
    if fo.exists(filename):
        # Cached tokenization from an earlier run.
        combineddocuments = fo.LoadFile(filename)
    else:
        mwe = MWETokenizer(glossarylist)
        word_tokenizer = RegexpTokenizer(r'\w+')
        tokenizeddocs = []
        for doc in tqdm(docs):
            per_sentence = []
            for sentence in sent_tokenize(doc):
                words = mwe.tokenize(word_tokenizer.tokenize(sentence.lower()))
                per_sentence.append([w.lower() for w in words])
            tokenizeddocs.append(per_sentence)
        # Flatten the sentence lists so each document is one token list.
        combineddocuments = []
        for per_sentence in tqdm(tokenizeddocs):
            flat = []
            for sent in per_sentence:
                flat.extend(sent)
            combineddocuments.append(flat)
        fo.SaveFile(filename, combineddocuments, mode='wb')
    del fo
    return combineddocuments
def TokenizeDocsNew(docs, glossarylist, filename=GV.tokenizedDocumentD2VFile):
    """Tokenize, POS-tag, lemmatize, stem and stop-word-filter each document, with disk caching.

    Pipeline per sentence: lower-case -> regex word tokenization ->
    multi-word glossary merge (``MWETokenizer``) -> POS tagging ->
    WordNet lemmatization -> Snowball stemming -> stop-word removal.
    The per-sentence results are then flattened into one token list per
    document.  The final list-of-lists is cached at ``filename``.

    Returns:
        list[list[str]]: one processed token list per input document.
    """
    tokenizeddocs = []
    combineddocuments = []
    fo = FileOperations()
    if fo.exists(filename):
        # Cached result from an earlier run.
        combineddocuments = fo.LoadFile(filename)
    else:
        tokenizer = MWETokenizer(glossarylist)
        regtokenizer = RegexpTokenizer(r'\w+')
        lmtzr = WordNetLemmatizer()
        stemmer = SnowballStemmer("english", ignore_stopwords=True)
        # Fix: use a set for O(1) membership tests.  The original kept
        # the stop words in a list, making the per-token filter below
        # O(len(stop_words)) inside the hot loop.
        stop_words = set(stopwords.words('english'))
        for doc in tqdm(docs):
            sentences = sent_tokenize(doc)
            tmp = []
            for sentence in sentences:
                # Regex-tokenize, then merge multi-word glossary terms.
                tokens = tokenizer.tokenize(regtokenizer.tokenize(sentence.lower()))
                token_lowercase = [x.lower() for x in tokens]
                # Lemmatize using POS tags mapped to WordNet categories.
                tokens_lowecase_tagged = nltk.pos_tag(token_lowercase)
                lammetized_sentence = [lmtzr.lemmatize(wrd, pos=get_wordnet_pos(tag))
                                       for wrd, tag in tokens_lowecase_tagged]
                # Stem the lemmatized tokens.
                stemmed_sentence = [stemmer.stem(wrd) for wrd in lammetized_sentence]
                # Remove stop words.  NOTE(review): filtering happens AFTER
                # stemming, so stemmed forms that no longer match the raw
                # stop-word list slip through - confirm this is intended.
                processed_sentence = [word for word in stemmed_sentence
                                      if word not in stop_words]
                tmp.append(processed_sentence)
            tokenizeddocs.append(tmp)
        # Flatten sentence lists so each document is one token list.
        for doc in tqdm(tokenizeddocs):
            tokdoc = []
            for sent in doc:
                tokdoc.extend(sent)
            combineddocuments.append(tokdoc)
        fo.SaveFile(filename, combineddocuments, mode='wb')
    del fo
    return combineddocuments
def SaveGlossary(glossarylist, synonymlist):
    """Build and persist the combined glossary file (glossary terms plus synonym terms).

    If ``GV.glossaryFilePath`` already exists, nothing is done.  Otherwise
    the glossary and synonym lists are re-read from disk (note: the
    function arguments are overwritten by these loads), the second term
    of each synonym pair is de-duplicated, appended to the glossary, and
    the combined list is pickled to ``GV.glossaryFilePath``.
    """
    fo = FileOperations()
    if not fo.exists(GV.glossaryFilePath):
        glossarylist = fo.LoadFile(GV.healthGlossaryFilePath)
        synonymlist = fo.LoadFile(GV.synonymsFilePath)
        # De-duplicate the second element of each synonym pair via a set
        # of tuples (lists are unhashable), then convert back to lists.
        unique_terms = set(tuple(second) for first, second in synonymlist)
        glossarylist += [list(term) for term in unique_terms]
        fo.SaveFile(GV.glossaryFilePath, glossarylist, mode='wb')
    del fo
def SaveSimilarDocuments(self, pubmedarticlelists, similardocfilename): pdocs = self.doc2vec_model.docvecs.doctag_syn0 # [:pts] # Get all the pmids pmids = self.doc2vec_model.docvecs.offset2doctag # [:pts] # Create the similar documents dictionary for each pmid similardocdict = {} import pickle for idx, pmid in tqdm(enumerate(pmids)): # output the top 20 similair documents similardocdict[pmid] = self.doc2vec_model.docvecs.most_similar( pmid, topn=23752) similardocdict[pmid].insert(0, (pmid, '1.0')) #TODO New code if idx % 1000 == 0 or idx == 23753: with open('./saveddata/simdictpmid.pkl', mode='a+b') as f: # appending, not writing pickle.dump(similardocdict, f) similardocdict = {} #TODO # { 'pmid1': {'Title':'Title', {Similar:[[id, 'title', score], [id, 'title', score], [id, 'title', score]]}, # 'pmid2': {'Title':'Title', {Similar:[[id, 'title', score], [id, 'title', score], [id, 'title', score]]}, # ... # } similararticlesdict = {} for idx, pmid in tqdm(enumerate(pmids)): # Find current pmid title doctitle = pubmedarticlelists[pmid].ArticleTitle # Find similar documents pmids similardocpmids = similardocdict[pmid] similartitlescorelist = [] # Iterate through all the pmids for id, score in similardocpmids: articletitle = pubmedarticlelists[id].ArticleTitle similartitlescorelist.append([id, articletitle, score]) similararticlesdict[pmid] = { 'Title': doctitle, 'Similar': similartitlescorelist } # Save the similar documents fo = FileOperations() fo.SaveFile(similardocfilename, similararticlesdict)
def CreateTaggedDocuments(self, tokenizeddocs, ids):
    """Return gensim ``TaggedDocument`` objects for the corpus, caching them on disk.

    Args:
        tokenizeddocs: iterable of token lists, one per document.
        ids: document tags, aligned by position with ``tokenizeddocs``.
    """
    fo = FileOperations()
    if fo.exists(GV.taggedDocumentFile):
        # Reuse the tagged documents produced by an earlier run.
        taggeddocuments = fo.LoadFile(GV.taggedDocumentFile)
    else:
        # Pair each token list with its id as a single-tag TaggedDocument.
        taggeddocuments = []
        for idx, tokens in tqdm(enumerate(tokenizeddocs)):
            taggeddocuments.append(
                gensim.models.doc2vec.TaggedDocument(tokens, [ids[idx]]))
        fo.SaveFile(GV.taggedDocumentFile, taggeddocuments, mode='wb')
    del fo
    return taggeddocuments