def process_input_phrase_query(self, phrase):
    """Normalize a phrase query for lookup against the index.

    Strips special characters, drops stop words, and stems each
    remaining word.

    Args:
        phrase: raw query string supplied by the user.

    Returns:
        A single space-separated string of stemmed, non-stop words
        (empty string if every word was a stop word).
    """
    phrasal_word = self.process_query_removing_special_characters(phrase)
    kept_words = []
    for word in phrasal_word:
        # Skip stop words; stem everything else.
        if not indexutils.return_is_stop_word(word):
            kept_words.append(indexutils.return_stemmed_word(word))
    # join() avoids the quadratic cost of repeated "s = s + word"
    # concatenation and needs no trailing .strip().
    return " ".join(kept_words)
# Example #2 (residue from the page this snippet was scraped from — not code)
  def populate_index_hash(self):
      """Build the inverted index over every document in the corpus.

      For each corpus path in ``self.docs``: list its files, and for every
      file extract the TEXT, TITLE, DOCNO, AUTHOR and BIBLIO fields from the
      parsed document, tokenize body + title, and record the 1-based position
      of every non-stop, stemmed token.  The per-document hash maps
      ``token -> [frequency, pos1, pos2, ...]`` and is merged into the main
      index via ``update_index_dictionaries``; per-document metadata is
      stored in ``self.documentIndex`` keyed by document id.

      Side effects: updates self.total_documents, self.total_words,
      self.total_stop_words, self.number_of_stemmed_words and
      self.documentIndex.
      """
      # For every path supplied as a corpus path
      for documentPath in self.docs:
          # Get the list of files in this corpus path
          file_list = indexutils.get_files_in_path(documentPath)
          # Running total of documents across all corpus paths
          self.total_documents = self.total_documents + len(file_list)
          # Read every file in the corpus
          for file in file_list:
              documentPaths = indexutils.construct_path(documentPath, file)
              documentContent = self.get_document_content(documentPaths)
              # TODO replace the hard-coded tag names
              # Parse the document and extract text, title, id, author and
              # bibliography info (first element of each tag is used).
              documentText = str(documentContent.getElementsByTagName('TEXT')[0].firstChild.data.strip())
              documentTitle = str(documentContent.getElementsByTagName('TITLE')[0].firstChild.data.strip())
              documentId = str(documentContent.getElementsByTagName('DOCNO')[0].firstChild.data.strip())
              documentAuthor = str(documentContent.getElementsByTagName('AUTHOR')[0].firstChild.data.strip())
              documentBibilio = str(documentContent.getElementsByTagName('BIBLIO')[0].firstChild.data.strip())

              documentDetails = (documentId, documentTitle, documentAuthor)
              # Split body and title into non-punctuated words; both are indexed.
              tokens = self.tokenize_document_text(documentText)
              tokens = tokens + self.tokenize_document_text(documentTitle)

              # token -> [frequency, pos1, pos2, ...] for this document only
              tokenCountHash = {}
              positionCounter = 0

              # For every token got from the corpus file
              for token in tokens:
                  self.total_words = self.total_words + 1
                  # Stop words are counted but not indexed, and do not
                  # advance the position counter (matches prior behavior).
                  if indexutils.return_is_stop_word(token):
                      self.total_stop_words = self.total_stop_words + 1
                      continue
                  positionCounter = positionCounter + 1
                  token = token.strip()
                  # Stem the token; track how many words were actually changed.
                  new_word = indexutils.return_stemmed_word(token)
                  if new_word != token:
                      self.number_of_stemmed_words = self.number_of_stemmed_words + 1
                  token = new_word

                  # Record this occurrence: bump the frequency slot and append
                  # the position within the document to the token's list.
                  # NOTE: 'in' replaces the Python-2-only dict.has_key().
                  if token in tokenCountHash:
                      positionList = tokenCountHash[token]
                      positionList[0] = int(positionList[0]) + 1
                      positionList.append(positionCounter)
                  else:
                      tokenCountHash[token] = [1, positionCounter]

              # Merge this document's token hash into the global index.
              self.update_index_dictionaries(documentDetails, tokenCountHash)
              # Record document metadata keyed by document id.
              self.documentIndex[documentId] = {"path": documentPaths,
                                                "title": documentTitle,
                                                "author": documentAuthor,
                                                "bibliography": documentBibilio}