class KeywordExtractor(object): ''' Extracts keywords from title, extracted_text, meta_description ''' def __init__(self): self.r = Datastore() self.URL2ID = "URL2ID" self.WORD_SET = "WORD_SET" self.WORD2ID = "WORD2ID" self.WORD_IN = "WORD_IN" self.WORD_CTR = "WORD_CTR" #self.r.set(self.WORD_CTR, -1) self.stemmer = nltk.stem.PorterStemmer() self.stopwords = set([self.clean(x) for x in nltk.corpus.stopwords.words('english')]) def process_item(self, item, spider): if item['shutdown']: return item print item['url'] text = item['title'] + " . " + item['extracted_text'] + " . " + item['meta_description'] words = [self.clean(x) for x in nltk.wordpunct_tokenize(text)] item['ordered_words'] = words cleaned_words = set(words) - self.stopwords cleaned_words = [self.clean(w) for w in cleaned_words if w.isalnum() and len(w) > 1 and not w.isdigit()] item['words'] = cleaned_words if not item['words']: raise DropItem self.buildWordIndex(item) return item def buildWordIndex(self, item): ''' Get current url id For each word in current url's text, add the url to the set of urls which contain that word ''' url_id = self.r.get("%s:%s" % (self.URL2ID, hashxx(item['url']))) word_id = "" for word in item['words']: if self.r.sadd(self.WORD_SET, word): word_id = str(self.r.incr(self.WORD_CTR, 1)) self.r.set("%s:%s" % (self.WORD2ID, word), word_id) else: word_id = self.r.get("%s:%s" % (self.WORD2ID, word)) self.r.sadd("%s:%s" % (self.WORD_IN, word_id), url_id) def clean(self, s): return self.stemmer.stem(s.lower())
class RequestsLimiter(object):
    '''
    Downloader middleware that caps the number of requests made per domain
    and filters out responses that are not HTML/plain text or that are not
    classified as English.
    '''

    def __init__(self):
        self.r = Datastore()
        #self.r.flushdb()
        self.DOMAIN = "DOMAIN"          # DOMAIN:<host> -> requests seen so far
        self.LIMIT = 200                # max requests allowed per domain
        self.DOMAIN_SET = "DOMAIN_SET"  # set of every domain crawled

    def process_request(self, request, spider):
        '''
        Let the request through (return None) while its domain is under the
        per-domain limit; otherwise raise IgnoreRequest.  Requests whose URL
        has no hostname are ignored as well.
        '''
        domain = urlparse(request.url).hostname
        if domain is None:
            # BUG FIX: the original relied on catching a broad TypeError
            # raised by string concatenation with None, which also masked
            # any unrelated TypeError.  Make the no-hostname case explicit.
            raise IgnoreRequest
        key = "%s:%s" % (self.DOMAIN, domain)
        if int(self.r.get(key) or 0) < self.LIMIT:
            self.r.sadd(self.DOMAIN_SET, domain)
            self.r.incr(key, 1)
            return None
        log.msg("DOMAIN limit Crossed:%s" % request.url, level=log.CRITICAL)
        raise IgnoreRequest

    def process_response(self, request, response, spider):
        '''
        Drop (via IgnoreRequest) responses with a missing or non-HTML/plain
        Content-Type, and responses whose body is not detected as English;
        pass everything else through unchanged.
        '''
        try:
            content_type = response.headers['Content-Type']
        except KeyError:
            log.msg("KeyError(Content-Type):%s" % request.url, level=log.CRITICAL)
            raise IgnoreRequest
        if 'text/html' not in content_type and 'text/plain' not in content_type:
            log.msg("Non-HTML/Plain:%s" % request.url, level=log.CRITICAL)
            raise IgnoreRequest
        if langid.classify(response.body)[0] != 'en':
            log.msg("Non-English:%s" % request.url, level=log.CRITICAL)
            raise IgnoreRequest
        # (removed the original's no-op `del request` on a local name)
        return response