def processBody(self):
    """Extract plain text from ``self.body`` and rank its keywords.

    The body is stripped of scripts, HTML tags and punctuation and the
    result is stored in ``self.text``.  Text of up to 5000 characters is
    ranked in-process and the result is appended to ``self.keyword_dicts``.
    Longer text is cut at every 500th space into chunks that are ranked in
    parallel by a process pool; ``self.keyword_dicts`` is then replaced by
    the list of per-chunk results.

    Unlike the previous implementation, every chunk is sliced with
    absolute offsets and the trailing partial chunk is ranked too, so long
    text with fewer than 500 spaces is ranked as a single chunk instead of
    being skipped.

    Returns:
        Whatever ``ready_queue(self.url, self.body)`` produced (presumably
        the links found in the body -- confirm against ``ready_queue``).

    Raises:
        SystemExit: if a KeyboardInterrupt arrives while the pool works.
    """
    queue = ready_queue(self.url, self.body)
    self.text = stripPunctuation(
        self.remove_html_tags(stripScript(self.body)))

    if len(self.text) <= 5000:
        self.keyword_dicts.append(rankKeywords(self.text))
        return queue

    chunks = []
    start = 0
    while True:
        # findnth is given only the unprocessed remainder, so the index it
        # returns is relative to ``start``.  It is assumed to return the
        # position of the 500th ' ' or -1 when there are fewer -- confirm
        # against findnth.
        rel_end = self.findnth(self.text[start:], ' ', 500)
        if rel_end == -1:
            # Fewer than 500 separators left: the remainder is the last
            # chunk.  Skip it only when it is empty.
            tail = self.text[start:]
            if tail:
                chunks.append(tail)
            break
        chunks.append(self.text[start:start + rel_end])
        start += rel_end + 1  # step over the separating space

    # ``chunks`` cannot be empty here: the text is longer than 5000
    # characters, so the first pass always yields a non-empty chunk.
    logger.debug("processing with %i threads", len(chunks))

    # NOTE(review): one worker process per chunk is unbounded for very
    # large pages; consider capping the pool size.
    pool = Pool(processes=len(chunks))
    try:
        self.keyword_dicts = pool.map(rankKeywords, chunks)
    except KeyboardInterrupt:
        pool.terminate()
        pool.join()
        sys.exit()
    except Exception:
        # Do not leave worker processes behind when ranking fails.
        pool.terminate()
        pool.join()
        raise
    else:
        pool.close()
        pool.join()

    logger.debug("processed, returned %i dicts", len(self.keyword_dicts))
    return queue
def processBody(self):
    """Extract plain text from ``self.body`` and rank its keywords.

    The body is stripped of scripts, HTML tags and punctuation and the
    result is stored in ``self.text``.  Text of up to 5000 characters is
    ranked in-process and the result is appended to ``self.keyword_dicts``.
    Longer text is cut at every 500th space into chunks that are ranked in
    parallel by a process pool; ``self.keyword_dicts`` is then replaced by
    the list of per-chunk results.

    Unlike the previous implementation, every chunk is sliced with
    absolute offsets and the trailing partial chunk is ranked too, so long
    text with fewer than 500 spaces is ranked as a single chunk instead of
    being skipped.

    Returns:
        Whatever ``ready_queue(self.url, self.body)`` produced (presumably
        the links found in the body -- confirm against ``ready_queue``).

    Raises:
        SystemExit: if a KeyboardInterrupt arrives while the pool works.
    """
    queue = ready_queue(self.url, self.body)
    self.text = stripPunctuation(
        self.remove_html_tags(stripScript(self.body)))

    if len(self.text) <= 5000:
        self.keyword_dicts.append(rankKeywords(self.text))
        return queue

    chunks = []
    start = 0
    while True:
        # findnth is given only the unprocessed remainder, so the index it
        # returns is relative to ``start``.  It is assumed to return the
        # position of the 500th ' ' or -1 when there are fewer -- confirm
        # against findnth.
        rel_end = self.findnth(self.text[start:], ' ', 500)
        if rel_end == -1:
            # Fewer than 500 separators left: the remainder is the last
            # chunk.  Skip it only when it is empty.
            tail = self.text[start:]
            if tail:
                chunks.append(tail)
            break
        chunks.append(self.text[start:start + rel_end])
        start += rel_end + 1  # step over the separating space

    # ``chunks`` cannot be empty here: the text is longer than 5000
    # characters, so the first pass always yields a non-empty chunk.
    logger.debug("processing with %i threads", len(chunks))

    # NOTE(review): one worker process per chunk is unbounded for very
    # large pages; consider capping the pool size.
    pool = Pool(processes=len(chunks))
    try:
        self.keyword_dicts = pool.map(rankKeywords, chunks)
    except KeyboardInterrupt:
        pool.terminate()
        pool.join()
        sys.exit()
    except Exception:
        # Do not leave worker processes behind when ranking fails.
        pool.terminate()
        pool.join()
        raise
    else:
        pool.close()
        pool.join()

    logger.debug("processed, returned %i dicts", len(self.keyword_dicts))
    return queue
def process(self):
    """Split the raw HTML in ``self.text`` into title, head and body.

    Sets ``self.title``, ``self.head`` and ``self.body`` from the markup,
    then overwrites ``self.text`` with the body after script, tag and
    punctuation stripping, and stores the result of
    ``ready_queue(self.url, self.body)`` in ``self.links``.

    Tag names are matched case-insensitively.  A section whose opening tag
    is missing becomes ``''``; a section whose closing tag is missing runs
    to the end of the document.  Previously the ``-1`` returned by
    ``find`` was used directly as a slice index, and the title kept the
    ``>`` that ends its opening tag.

    Because ``self.text`` is replaced by the stripped text, calling this a
    second time no longer sees the original markup.

    Returns:
        The value stored in ``self.links``.
    """
    html = self.text
    html_lower = html.lower()

    def section(open_tag, close_tag, keep_open_tag=False):
        # Return the slice of ``html`` between the two tags.
        start = html_lower.find(open_tag)
        if start == -1:
            return ''
        # Search for the closing tag only after the opening one.
        end = html_lower.find(close_tag, start)
        if end == -1:
            end = len(html)
        if not keep_open_tag:
            start += len(open_tag)
        return html[start:end]

    # The raw title still starts with the rest of the opening tag (any
    # attributes plus '>'); drop everything up to and including that '>'.
    # When no '>' is present find() gives -1 and the whole string is kept.
    raw_title = section('<title', '</title>')
    self.title = raw_title[raw_title.find('>') + 1:]

    # NOTE(review): as before, head keeps the remainder of its opening
    # tag (e.g. a leading '>'); left unchanged in case callers rely on it.
    self.head = section('<head', '</head>')

    # The body keeps its opening tag; remove_html_tags is applied to it
    # below when the plain text is derived.
    self.body = section('<body', '</body>', keep_open_tag=True)

    self.text = stripPunctuation(
        self.remove_html_tags(stripScript(self.body)))
    self.links = ready_queue(self.url, self.body)
    return self.links
def processBody(self):
    """Extract plain text from ``self.body`` and rank its keywords.

    The body is stripped of scripts, HTML tags and punctuation and the
    result is stored in ``self.text``.  Text of up to 5000 characters is
    ranked in-process and the result is appended to ``self.keyword_dicts``.
    Longer text is cut at every 500th space into chunks that are ranked in
    parallel by a process pool; ``self.keyword_dicts`` is then replaced by
    the list of per-chunk results.

    The final chunk (500 words or fewer) is now really included: the
    previous code sliced it as ``text[i:i - 1]`` once ``findnth`` returned
    -1, which is empty for ``i > 0`` and drops the last character for
    ``i == 0``.

    Returns:
        Whatever ``ready_queue(self.url, self.body)`` produced (presumably
        the links found in the body -- confirm against ``ready_queue``).

    Raises:
        SystemExit: if a KeyboardInterrupt arrives while the pool works.
    """
    queue = ready_queue(self.url, self.body)
    self.text = stripPunctuation(
        self.remove_html_tags(stripScript(self.body)))

    if len(self.text) <= 5000:
        self.keyword_dicts.append(rankKeywords(self.text))
        return queue

    # Divide the text into sets of 500 words.
    chunks = []
    start = 0
    while True:
        # findnth is given only the unprocessed remainder, so the index it
        # returns is relative to ``start``.  It is assumed to return the
        # position of the 500th ' ' or -1 when there are fewer -- confirm
        # against findnth.
        rel_end = self.findnth(self.text[start:], ' ', 500)
        if rel_end == -1:
            # 500 words or fewer are left: the remainder is the last
            # chunk.  Skip it only when it is empty.
            tail = self.text[start:]
            if tail:
                chunks.append(tail)
            break
        chunks.append(self.text[start:start + rel_end])
        start += rel_end + 1  # step over the separating space

    # ``chunks`` cannot be empty here: the text is longer than 5000
    # characters, so the first pass always yields a non-empty chunk.
    logger.debug("processing with %i threads", len(chunks))

    # NOTE(review): one worker process per chunk is unbounded for very
    # large pages; consider capping the pool size.
    pool = Pool(processes=len(chunks))
    try:
        self.keyword_dicts = pool.map(rankKeywords, chunks)
    except KeyboardInterrupt:
        pool.terminate()
        pool.join()
        sys.exit()
    except Exception:
        # Do not leave worker processes behind when ranking fails.
        pool.terminate()
        pool.join()
        raise
    else:
        pool.close()
        pool.join()

    logger.debug("processed, returned %i dicts", len(self.keyword_dicts))
    return queue
def processBody(self):
    """Extract plain text from ``self.body`` and rank its keywords.

    The body is stripped of scripts, HTML tags and punctuation and the
    result is stored in ``self.text``.  Text of up to 5000 characters is
    ranked in-process and the result is appended to ``self.keyword_dicts``.
    Longer text is cut at every 500th space into chunks that are ranked in
    parallel by a process pool; ``self.keyword_dicts`` is then replaced by
    the list of per-chunk results.

    The final chunk (500 words or fewer) is now really included: the
    previous code sliced it as ``text[i:i - 1]`` once ``findnth`` returned
    -1, which is empty for ``i > 0`` and drops the last character for
    ``i == 0``.

    Returns:
        Whatever ``ready_queue(self.url, self.body)`` produced (presumably
        the links found in the body -- confirm against ``ready_queue``).

    Raises:
        SystemExit: if a KeyboardInterrupt arrives while the pool works.
    """
    queue = ready_queue(self.url, self.body)
    self.text = stripPunctuation(
        self.remove_html_tags(stripScript(self.body)))

    if len(self.text) <= 5000:
        self.keyword_dicts.append(rankKeywords(self.text))
        return queue

    # Divide the text into sets of 500 words.
    chunks = []
    start = 0
    while True:
        # findnth is given only the unprocessed remainder, so the index it
        # returns is relative to ``start``.  It is assumed to return the
        # position of the 500th ' ' or -1 when there are fewer -- confirm
        # against findnth.
        rel_end = self.findnth(self.text[start:], ' ', 500)
        if rel_end == -1:
            # 500 words or fewer are left: the remainder is the last
            # chunk.  Skip it only when it is empty.
            tail = self.text[start:]
            if tail:
                chunks.append(tail)
            break
        chunks.append(self.text[start:start + rel_end])
        start += rel_end + 1  # step over the separating space

    # ``chunks`` cannot be empty here: the text is longer than 5000
    # characters, so the first pass always yields a non-empty chunk.
    logger.debug("processing with %i threads", len(chunks))

    # NOTE(review): one worker process per chunk is unbounded for very
    # large pages; consider capping the pool size.
    pool = Pool(processes=len(chunks))
    try:
        self.keyword_dicts = pool.map(rankKeywords, chunks)
    except KeyboardInterrupt:
        pool.terminate()
        pool.join()
        sys.exit()
    except Exception:
        # Do not leave worker processes behind when ranking fails.
        pool.terminate()
        pool.join()
        raise
    else:
        pool.close()
        pool.join()

    logger.debug("processed, returned %i dicts", len(self.keyword_dicts))
    return queue