import nltk
import langid

from urlparse import urlparse          # Python 2; use urllib.parse on Python 3

from pyhashxx import hashxx            # assumed source of the hashxx() xxHash helper
from scrapy import log
from scrapy.exceptions import DropItem, IgnoreRequest

from core.datastore import Datastore   # assumed project-local Redis wrapper; adjust to the real module path


class DataWriter(object):
    '''
    Writes crawled items to disk: urls, keywords, the web-graph matrix and
    (optionally) predicted classes. Counters and id mappings live in Redis.
    '''

    def __init__(self):
        self.f_url = open("data/url.txt", "a+")
        self.f_key = open("data/keywords.txt", "a+")
        self.f_mat = open("data/matrix.mtx", "a+")
        self.f_cla = open("data/classes.txt", "a+")
        self.r = Datastore()
        self.URL2ID = "URL2ID"
        self.ID2URL = "ID2URL"
        self.PROCESSED_CTR = "PROCESSED_CTR"
        '''l = enumerate(os.listdir("/home/nvdia/kernel_panic/core/config_data/classes_odp"))
        l = [(x[0] + 1, x[1]) for x in l]
        self.classes = dict(l)'''

    def process_item(self, item, spider):
        # A shutdown item closes all output files and flips the kill switch in Redis.
        if item['shutdown']:
            self.f_url.close()
            self.f_key.close()
            self.f_mat.close()
            self.f_cla.close()
            self.r.set("POWER_SWITCH", "KILL")
            return item
        self.writeURL(item)
        self.writeKeywords(item)
        self.writeWebMatrix(item)
        #self.writeClasses(item)
        self.r.incr(self.PROCESSED_CTR, 1)
        return item

    def writeURL(self, item):
        self.f_url.write(item['url'] + "\n")

    def writeKeywords(self, item):
        for k in item['words']:
            self.f_key.write("%s," % k)
        self.f_key.write("\n")

    def writeWebMatrix(self, item):
        '''
        Builds the web graph in matrix market format: one "src dst 1" row per
        outgoing link, using the URL2ID mapping stored in Redis.
        '''
        u = self.r.get("%s:%s" % (self.URL2ID, hashxx(item['url'])))
        for link in set(item['link_set']):
            v = self.r.get("%s:%s" % (self.URL2ID, hashxx(link)))
            self.f_mat.write("%s\t%s\t1\n" % (u, v))

    def writeClasses(self, item):
        self.f_cla.write("%s:%s\n" % (item['title'], self.classes[item['predict'][0]]))
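
# A minimal sketch (not part of the original pipeline) of the edge format that
# DataWriter.writeWebMatrix appends to data/matrix.mtx: one "<src>\t<dst>\t1"
# row per outgoing link, with ids taken from the URL2ID mapping in Redis.
# The helper name and the ids below are illustrative assumptions.
def _example_matrix_rows(src_id, dst_ids):
    '''Return the tab-separated rows writeWebMatrix would emit for one page.'''
    return ["%s\t%s\t1\n" % (src_id, dst_id) for dst_id in dst_ids]

# e.g. _example_matrix_rows("42", ["7", "13"]) == ["42\t7\t1\n", "42\t13\t1\n"]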

class KeywordExtractor(object):
    '''
    Extracts keywords from title, extracted_text, meta_description
    '''

    def __init__(self):
        self.r = Datastore()
        self.URL2ID = "URL2ID"
        self.WORD_SET = "WORD_SET"
        self.WORD2ID = "WORD2ID"
        self.WORD_IN = "WORD_IN"
        self.WORD_CTR = "WORD_CTR"
        #self.r.set(self.WORD_CTR, -1)
        self.stemmer = nltk.stem.PorterStemmer()
        self.stopwords = set(self.clean(x) for x in nltk.corpus.stopwords.words('english'))

    def process_item(self, item, spider):
        if item['shutdown']:
            return item
        print(item['url'])
        text = item['title'] + " . " + item['extracted_text'] + " . " + item['meta_description']
        words = [self.clean(x) for x in nltk.wordpunct_tokenize(text)]
        item['ordered_words'] = words
        # Deduplicate, drop stopwords, then keep only alphanumeric,
        # multi-character, non-numeric tokens.
        cleaned_words = set(words) - self.stopwords
        cleaned_words = [self.clean(w) for w in cleaned_words
                         if w.isalnum() and len(w) > 1 and not w.isdigit()]
        item['words'] = cleaned_words
        if not item['words']:
            raise DropItem("No keywords extracted: %s" % item['url'])
        self.buildWordIndex(item)
        return item

    def buildWordIndex(self, item):
        '''
        Get the current url id. For each word in the current url's text, add
        the url to the set of urls which contain that word (an inverted index
        kept in Redis).
        '''
        url_id = self.r.get("%s:%s" % (self.URL2ID, hashxx(item['url'])))
        for word in item['words']:
            # sadd returns 1 only for words never seen before: give those a fresh id.
            if self.r.sadd(self.WORD_SET, word):
                word_id = str(self.r.incr(self.WORD_CTR, 1))
                self.r.set("%s:%s" % (self.WORD2ID, word), word_id)
            else:
                word_id = self.r.get("%s:%s" % (self.WORD2ID, word))
            self.r.sadd("%s:%s" % (self.WORD_IN, word_id), url_id)

    def clean(self, s):
        return self.stemmer.stem(s.lower())
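
# A standalone sketch (not part of the original file) of the normalisation
# KeywordExtractor applies: lowercase + Porter-stem every token, drop stemmed
# stopwords, then keep only alphanumeric, multi-character, non-numeric tokens.
# It only needs NLTK's regex word tokenizer and the 'stopwords' corpus.
def _example_keywords(text):
    stemmer = nltk.stem.PorterStemmer()
    stop = set(stemmer.stem(w.lower()) for w in nltk.corpus.stopwords.words('english'))
    tokens = [stemmer.stem(t.lower()) for t in nltk.wordpunct_tokenize(text)]
    return [t for t in set(tokens) - stop
            if t.isalnum() and len(t) > 1 and not t.isdigit()]

# e.g. _example_keywords("Kernel panics and crash dumps") would yield something
# like ['kernel', 'panic', 'crash', 'dump'], in no particular order because a
# set is used, mirroring the pipeline.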

class RequestsLimiter(object):
    '''
    Downloader middleware that caps requests per domain and filters out
    non-HTML/plain-text and non-English responses.
    '''

    def __init__(self):
        self.r = Datastore()
        #self.r.flushdb()
        self.DOMAIN = "DOMAIN"
        self.LIMIT = 200
        self.DOMAIN_SET = "DOMAIN_SET"

    def process_request(self, request, spider):
        try:
            domain = urlparse(request.url).hostname
            if int(self.r.get(self.DOMAIN + ":" + domain) or 0) < self.LIMIT:
                self.r.sadd(self.DOMAIN_SET, domain)
                self.r.incr(self.DOMAIN + ":" + domain, 1)
                return None
            else:
                log.msg("DOMAIN limit crossed:%s" % request.url, level=log.CRITICAL)
                raise IgnoreRequest
        except TypeError:
            # hostname was None, so building the Redis key failed; drop the request.
            raise IgnoreRequest

    def process_response(self, request, response, spider):
        try:
            if ('text/html' not in response.headers['Content-Type']
                    and 'text/plain' not in response.headers['Content-Type']):
                log.msg("Non-HTML/Plain:%s" % request.url, level=log.CRITICAL)
                raise IgnoreRequest
            if langid.classify(response.body)[0] != 'en':
                log.msg("Non-English:%s" % request.url, level=log.CRITICAL)
                raise IgnoreRequest
        except KeyError:
            log.msg("KeyError(Content-Type):%s" % request.url, level=log.CRITICAL)
            raise IgnoreRequest
        return response
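
# A hedged configuration sketch: RequestsLimiter is a Scrapy downloader
# middleware and DataWriter/KeywordExtractor are item pipelines, so the
# project's settings.py needs entries along these lines. The module paths
# ("core.middlewares", "core.pipelines") and the priorities are assumptions
# about the repo layout, not taken from the source.
DOWNLOADER_MIDDLEWARES = {
    'core.middlewares.RequestsLimiter': 543,
}
ITEM_PIPELINES = {
    'core.pipelines.KeywordExtractor': 100,   # extract keywords before writing
    'core.pipelines.DataWriter': 200,
}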