def get_position_text(self):
    """
    Interactively choose which part of the page is used for generating queries.

    The user is asked whether the div ids they enter should be included or
    excluded, then for the ids themselves, and finally whether the resulting
    text should be limited to a number of words or to a percentage of the
    words. Integer prompts are repeated until ``self.is_integer`` accepts
    the answer.

    :return: the text left after the above reduction
    """
    include_answer = raw_input("Do you want to enter divs to be included or excluded? Enter i for include, "
                               "e for exclude")
    # include is the default; only an explicit 'e' switches to exclude
    include = include_answer != 'e'

    divs = raw_input("enter IDs of the divs separated by spaces \n")
    # fall back to the 'wrapper' div when nothing usable was entered; this
    # also covers whitespace-only input, which splits to an empty list
    ids = divs.split() or ['wrapper']

    if include:
        # extractor with no divs to ignore; after processing the page its
        # content is replaced by the text of the divs with the given ids
        pce = PositionContentExtractor()
        pce.process_html_page(self.page_html)
        pce.set_all_content(ids, "div")
    else:
        # the ids are handed to the extractor up front
        # (presumably as the divs to ignore - confirm against PositionContentExtractor)
        pce = PositionContentExtractor(div_ids=ids)
        pce.process_html_page(self.page_html)

    # any other answer, including just hitting enter, counts as "no"
    yes_vals = ("y", "Y", "Yes", "yes")

    limit_by_words = raw_input("enter y if you want to limit by a number of words \n")
    if limit_by_words in yes_vals:
        while True:
            # the adjacent literals are concatenated, so the first one needs
            # a trailing space to keep "use" and "in" apart
            words = raw_input("enter the number of words to use "
                              "in generating queries \n")
            if self.is_integer(words):
                return pce.get_subtext(num_words=int(words))

    # the percentage question is only asked when no word limit was requested
    limit_by_percent = raw_input("enter y if you want to limit by a percentage of words \n")
    if limit_by_percent in yes_vals:
        while True:
            percentage = raw_input("the percentage of words to use in generating queries \n")
            if self.is_integer(percentage):
                return pce.get_subtext(percentage=int(percentage))

    # no limit requested
    return pce.get_subtext()
def reduce_page(self, percentage):
    """
    Cut the content of the whole page down to a percentage of itself.

    :param percentage: the percentage of the page to be used for generating queries
    :return: the reduced page content as a string
    """
    extractor = PositionContentExtractor()
    extractor.process_html_page(self.page_html)
    reduced_content = extractor.get_subtext(percentage=percentage)
    return reduced_content
def get_position_text(self):
    """
    Select the part of the page used for generating queries, driven by
    attributes on the instance rather than by user prompts.

    The page in ``self.page_html`` is processed, the content is narrowed to
    the divs in ``self.divs`` when there are any, and the text is then
    limited by ``self.doc_portion_count`` (a number of words) or, when that
    is not set, by ``self.doc_portion_percent`` (a percentage).

    :return: the selected text; an empty string when the limit that applies
        is set but ``self.is_integer`` rejects it
    """
    extractor = PositionContentExtractor()
    extractor.process_html_page(self.page_html)

    # presumably refreshes self.divs - confirm against set_divs
    self.set_divs()
    if not self.divs:
        print("no divs, in default **")  # todo checking no divs works
    else:
        # narrow the content to the text of the divs with the given ids
        extractor.set_all_content(self.divs, "div")

    # a word count takes precedence over a percentage
    if self.doc_portion_count:
        if not self.is_integer(self.doc_portion_count):
            return ''
        return extractor.get_subtext(num_words=int(self.doc_portion_count))

    if self.doc_portion_percent:
        if not self.is_integer(self.doc_portion_percent):
            return ''
        return extractor.get_subtext(percentage=int(self.doc_portion_percent))

    # no limit configured
    return extractor.get_subtext()
def get_position_text(self):
    """
    Return the portion of the page text that queries are generated from,
    using the settings stored on the instance.

    After processing ``self.page_html``, the content is restricted to the
    divs listed in ``self.divs`` if that is non-empty. The text is then cut
    down to ``self.doc_portion_count`` words, or failing that to
    ``self.doc_portion_percent`` percent; with neither set it is returned
    without a limit.

    :return: the selected text; an empty string when the limit that applies
        is set but does not pass ``self.is_integer``
    """
    extractor = PositionContentExtractor()
    extractor.process_html_page(self.page_html)

    # presumably refreshes self.divs - confirm against set_divs
    self.set_divs()
    if not self.divs:
        print("no divs, in default **")  # todo checking no divs works
    else:
        # use only the text of the divs with the given ids
        extractor.set_all_content(self.divs, "div")

    # the word count is checked first, so it wins over the percentage
    if self.doc_portion_count:
        if not self.is_integer(self.doc_portion_count):
            return ""
        return extractor.get_subtext(num_words=int(self.doc_portion_count))

    if self.doc_portion_percent:
        if not self.is_integer(self.doc_portion_percent):
            return ""
        return extractor.get_subtext(percentage=int(self.doc_portion_percent))

    # neither limit is set
    return extractor.get_subtext()