def process_input_phrase_query(self, phrase):
    """Normalize a phrase query into an index-comparable string.

    Strips special characters (via process_query_removing_special_characters),
    drops stop words, stems the remaining words, and returns them joined by
    single spaces.

    :param phrase: raw phrase string entered by the user
    :return: space-separated string of stemmed, non-stop words
    """
    phrasal_word = self.process_query_removing_special_characters(phrase)
    stemmed_words = []
    for word in phrasal_word:
        # Skip stop words; stem everything else so query terms match the
        # stemmed tokens stored in the index.
        if not indexutils.return_is_stop_word(word):
            stemmed_words.append(indexutils.return_stemmed_word(word))
    # join avoids the original quadratic "+=" concatenation and the
    # trailing strip() needed to remove the leading space it produced
    return " ".join(stemmed_words)
def populate_index_hash(self):
    """Build the inverted index and document-metadata index from the corpus.

    For every corpus path in self.docs, reads each document file, extracts
    the TEXT/TITLE/DOCNO/AUTHOR/BIBLIO fields from its DOM, tokenizes the
    text and title, and records per-document term frequencies and token
    positions.  Per token the entry is ``[count, pos1, pos2, ...]``.
    Results are pushed into the index via update_index_dictionaries and the
    per-document metadata into self.documentIndex.

    Side effects: increments self.total_documents, self.total_words,
    self.total_stop_words and self.number_of_stemmed_words.
    """

    def _tag_text(dom, tag):
        # Extract the (stripped) text content of the first element with the
        # given tag name.  Raises IndexError if the tag is absent —
        # presumably every corpus file carries all five tags; TODO confirm.
        return str(dom.getElementsByTagName(tag)[0].firstChild.data.strip())

    # For every path supplied as corpus path
    for documentPath in self.docs:
        # get list of files in a particular corpus path
        file_list = indexutils.get_files_in_path(documentPath)
        # Calculate the total number of documents in the corpus path
        self.total_documents = self.total_documents + len(file_list)
        # read every file in the corpus
        for file in file_list:
            documentPaths = indexutils.construct_path(documentPath, file)
            documentContent = self.get_document_content(documentPaths)
            # Parse the corpus file and extract text, title, id, author
            # and bibliography info
            documentText = _tag_text(documentContent, 'TEXT')
            documentTitle = _tag_text(documentContent, 'TITLE')
            documentId = _tag_text(documentContent, 'DOCNO')
            documentAuthor = _tag_text(documentContent, 'AUTHOR')
            documentBibilio = _tag_text(documentContent, 'BIBLIO')
            documentDetails = (documentId, documentTitle, documentAuthor)
            # Split the document (text + title) into non-punctuated words
            tokens = self.tokenize_document_text(documentText)
            title_token = self.tokenize_document_text(documentTitle)
            tokens = tokens + title_token
            # Per-document token -> [count, positions...] map
            tokenCountHash = {}
            positionCounter = 0
            # For every token got from the corpus file
            for token in tokens:
                self.total_words = self.total_words + 1
                # ignore if word is a stop word; stop words do not advance
                # the position counter
                if indexutils.return_is_stop_word(token):
                    self.total_stop_words = self.total_stop_words + 1
                else:
                    positionCounter = positionCounter + 1
                    token = token.strip()
                    # perform stemming; count it only if the word actually
                    # changed
                    new_word = indexutils.return_stemmed_word(token)
                    if new_word != token:
                        self.number_of_stemmed_words = self.number_of_stemmed_words + 1
                        token = new_word
                    # add the token's position: if the word already exists,
                    # bump its count and append the new position; otherwise
                    # start a fresh [count, position] entry.
                    # ("in" replaces dict.has_key, which Python 3 removed.)
                    if token in tokenCountHash:
                        positionList = tokenCountHash[token]
                        positionList[0] = int(positionList[0]) + 1
                        positionList.append(positionCounter)
                    else:
                        tokenCountHash[token] = [1, positionCounter]
            # updating the index dictionary
            self.update_index_dictionaries(documentDetails, tokenCountHash)
            # update the document meta data dictionary
            self.documentIndex.update({documentId: {"path": documentPaths, "title": documentTitle, "author": documentAuthor, "bibliography": documentBibilio}})