def process_item(self, item, spider):
    if not item.get('body'):
        raise DropItem(f'Missing body property in {item}')
    return item

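# Usage sketch, not from any of the original sources: a process_item hook like
# the one above only runs once its pipeline class is registered in the
# project's settings.py. ITEM_PIPELINES is the standard Scrapy setting; the
# class path and priority below are hypothetical placeholders.
ITEM_PIPELINES = {
    'myproject.pipelines.RequiredFieldsPipeline': 300,  # lower number runs earlier
}
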
def item_completed(self, results, item, info):
    img_paths = [x['path'] for ok, x in results if ok]
    if not img_paths:
        raise DropItem("Images were not downloaded properly %s" % img_paths)
    return item  # item_completed must return the item for later pipelines

def item_completed(self, results, item, info):
    image_paths = [x['path'] for ok, x in results if ok]
    if not image_paths:
        raise DropItem('Image Download Failed')
    return item

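# Context sketch for the item_completed overrides above, assuming Scrapy's
# stock ImagesPipeline: get_media_requests schedules one download per URL, and
# their (ok, result) tuples arrive in item_completed. The class name is
# hypothetical; the hook signatures and the 'image_urls' field follow Scrapy
# defaults. settings.py must also define IMAGES_STORE, e.g. 'images/'.
from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline

class MyImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # one download request per collected image URL
        for image_url in item.get('image_urls', []):
            yield Request(image_url)
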
def process_item(self, item, spider):
    valid = True
    i = md5(item['status'].encode('utf-8')).hexdigest()  # md5 requires bytes on Python 3
    returndf = self.df.add(i)
    if item['url'].find('error') == -1:
        iserror = False
    else:
        iserror = True
    if returndf or iserror:
        valid = False
    else:
        for data in item:
            if not data:
                valid = False
                raise DropItem("Missing {0}!".format(data))
    if valid:
        self.fa.writelines(i + '\r\n')
        self.collection.insert(dict(item))
        # logging.log(msg="Car added to MongoDB database!", level=logging.INFO)
        self.mongocounts += 1
        # MySQL save
        if settings['MONGODB_COLLECTION'] in [
                "taoche", "youxin", "ttpai", "che168", "youxinpai", "guazi",
                "renrenche", "kaixin", "haoche51", "souche", "hx2car",
                "iautos", "souhu", "haoche99", "che273", "che101", "chewang",
                "xcar", "ganji", "zg2sc", "ygche", "che58", "youche",
                "cn2che", "baixing", "che273_test"
        ]:
            domtext = scrapy.selector.Selector(text=item["datasave"][1])
            parsed_item = car_parse.ILikeParse(self.caritemlist, item, domtext)
            self.items.append(parsed_item)
            self.items = self.savedata(self.items, self.table, self.mysqlconnection, 1)
        elif settings['MONGODB_COLLECTION'] in [
                "chemao", "aokangda", "auto51", 'aokangda_test', 'chezhibao'
        ]:
            domtext = scrapy.selector.Selector(text=item["datasave"][0])
            parsed_item = car_parse.ILikeParse(self.caritemlist, item, domtext)
            self.items.append(parsed_item)
            self.items = self.savedata(self.items, self.table, self.mysqlconnection, 1)
    elif iserror:
        logging.log(msg="Car Error!", level=logging.INFO)
        # log the failing URL
        urlog = {'url': item['url'], 'grabtime': item['grabtime']}
        self.collectionwrong.insert(urlog)
    else:
        # logging.log(msg="Car duplicated!", level=logging.INFO)
        urlog = {'url': item['url'], 'grabtime': item['grabtime']}
        self.collectionurllog.insert(urlog)
    return item

def __init__(self, original_url="", *args):
    self.original_url = original_url
    self.style = color.color_style()
    DropItem.__init__(self, *args)

def item_completed(self, results, item, info):
    image_paths = [x['path'] for ok, x in results if ok]
    if not image_paths:
        raise DropItem('Images failed to download %s' % image_paths)
    return item  # item_completed must return the item for later pipelines

def process_item(self, item, spider):
    if not scan(item):
        raise DropItem('item is incomplete')
    return item

def __str__(self):  # for usage: print(e)
    print(self.style.ERROR("DROP(CrawledUrlDrop):" + self.url))
    return DropItem.__str__(self)

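# Self-contained sketch of the subclassing pattern behind these
# __init__/__str__ pairs: carry extra context (here, the offending URL) on a
# custom DropItem so logs show why the item was dropped. The color helper from
# the originals is replaced with plain text; the class name is illustrative.
from scrapy.exceptions import DropItem

class CrawledUrlDrop(DropItem):
    def __init__(self, url="", *args):
        self.url = url
        DropItem.__init__(self, *args)

    def __str__(self):
        return "DROP(CrawledUrlDrop):" + self.url
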
def process_item(self, item, spider):
    titulo = item['titulo']
    if 'capsula' not in titulo:
        raise DropItem('Title does not contain "capsula"')
    return item

def process_item(self, item, spider):
    if item['zid'] in self.zud_seen:
        raise DropItem('Duplicate listing found %s' % item['zid'])
    elif re.search('AuthRequired', item['link']):
        raise DropItem('Unauthorized listing found %s' % item['zid'])
    # record the id so later duplicates are dropped, and pass the item on
    self.zud_seen.add(item['zid'])
    return item

def process_item(self, item, spider):
    if self.filter is None or re.search(self.filter, item["url"]):
        return item
    else:
        raise DropItem("Pattern [%s] not in url [%s]" % (self.filter, item["url"]))

def process_item(self, item, spider):
    exist_url = self.session.query(Urls).filter_by(url=item["url"]).first()
    if exist_url is not None:  # the current quote exists
        raise DropItem("Duplicate url found: %s" % item["url"])
    else:
        return item

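# Sketch of the SQLAlchemy plumbing the dedup pipeline above assumes; the Urls
# model, engine URL, and table layout are illustrative, not taken from the
# original source.
from sqlalchemy import Column, Integer, String, create_engine
from sqlalchemy.orm import declarative_base, sessionmaker

Base = declarative_base()

class Urls(Base):
    __tablename__ = 'urls'
    id = Column(Integer, primary_key=True)
    url = Column(String, unique=True)

engine = create_engine('sqlite:///scraped.db')  # hypothetical connection URL
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()
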
def __str__(self):  # for usage: print(e)
    print(self.style.ERROR("DROP(NofilesDrop):" + self.original_url))
    return DropItem.__str__(self)

def __str__(self):  # for usage: print(e)
    print(self.style.ERROR("DROP(NofilesDrop):" + self.original_url))
    return DropItem.__str__(self)

def _title(self, item: Any) -> None:
    if len(item['title']) < 3:
        raise DropItem("Drop item as title '{}' is bad".format(item['title']))

def __str__(self):  # for usage: print(e)
    print("DROP(NofilesDrop):" + self.original_url)
    return DropItem.__str__(self)

def _content(self, item: Any) -> None:
    if len(html2text(item['content'])) < 200:
        raise DropItem("Drop item as content is too short")

def __init__(self, info="", url="", *args):
    self.info = info
    self.url = url
    self.style = color.color_style()
    DropItem.__init__(self, *args)

def process_item(self, item, spider):
    if redis_db.hexists(redis_data_dict, item['id']):
        raise DropItem("Duplicate book found: %s" % item)
    return item

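# Sketch of the Redis plumbing assumed above: a hash keyed by book id lets
# hexists answer "seen before?" in O(1) and persists across runs and
# processes. Connection details and key names are hypothetical.
import redis

redis_db = redis.StrictRedis(host='localhost', port=6379, db=0)
redis_data_dict = 'books_seen'  # hash consulted by hexists in process_item

def mark_seen(book_id):
    # record an id after a successful save so future runs drop it as duplicate
    redis_db.hset(redis_data_dict, book_id, 1)
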
def process_item(self, item, spider):
    match = re.match(r"item\?id=[0-9]+", item["url"])
    if match:
        print("Excluded self-post: " + str(item["url"]))
        raise DropItem("Excluded self-post: " + str(item["url"]))
    return item

def process_item(self, item, spider):
    if item['title']:
        item["title"] = clean_spaces(item["title"])
        return item
    else:
        raise DropItem("Missing title in %s" % item)

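# The clean_spaces helper used above is not shown in these snippets; a
# plausible minimal implementation that collapses whitespace runs would be:
import re

def clean_spaces(text):
    # collapse internal whitespace to single spaces and trim the ends
    return re.sub(r'\s+', ' ', text).strip()
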
def process_item(self, item, spider):
    score_diff = int(item['score_diff'])
    if 0 < score_diff < 250:
        return item
    else:
        raise DropItem("score_diff not matched!")

def drop_item_by_product_name(self, item):
    filter_types = [
        'boots', 'tote', 'crevasse', 'watch', 'watches', 'glove', 'mittens',
        'backpack', 'kabyte', 'kabig', 'kaban', 'itinerant', 'gnomad',
        'toter', 'duffel', 'access', 'pack', 'bag', 'scrunchie', 'lanyard',
        'sunglasses', 'sackpack', 'shoe', 'belt', 'gaiter', 'kneepad',
        'earphone', 'skateboard', 'slides', 'cleats', 'spikes', 'wader',
        'lacrosse', 'pads', 'mule', 'mitts', 'mitt', 'goggle', 'booties',
        'bootie', 'shawl', 'blanket', 'pouch', 'torque', 'mat', 'fastpack',
        'puddle', 'phone', 'strap', 'boot', 'soccer', 'benassi', 'sack',
        'sandals', 'sneaker', 'ball', 'cleat', 'slide', 'moc', 'sandal',
        'waistbag', 'canteen', 'hair ties', 'water bottle', 'spray',
        'lotion', 'co-wash', 'shampoo', 'sunscreen', 'conditioner', 'balm',
    ]
    for t in filter_types:
        # comparisons are against lowercased fields, so the filter terms
        # themselves must be lowercase
        if t in item['Name'].lower():
            raise DropItem('found type in Name: %s' % t)
        if t in item['Url'].lower():
            raise DropItem('found type in Url: %s' % t)
        # 'Clothing' is not present on every item; a bare try/except here
        # would also swallow the DropItem itself
        if t in (item.get('Clothing') or '').lower():
            raise DropItem('found type in Clothing: %s' % t)

def process_item(self, item, spider):
    if item['id'] in self.ids_seen:
        raise DropItem("Duplicate item found: %s" % item)
    else:
        self.ids_seen.add(item['id'])
        return item

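# For context, the ids_seen set above is conventionally created in the
# pipeline's constructor; this mirrors the DuplicatesPipeline example in the
# Scrapy documentation (in-memory, so it resets on every crawl):
from scrapy.exceptions import DropItem

class DuplicatesPipeline:
    def __init__(self):
        self.ids_seen = set()

    def process_item(self, item, spider):
        if item['id'] in self.ids_seen:
            raise DropItem("Duplicate item found: %s" % item)
        self.ids_seen.add(item['id'])
        return item
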
def item_completed(self, results, item, info):
    thumbnail_url = [x['path'] for ok, x in results if ok]
    if not thumbnail_url:
        raise DropItem("Item contains no images")
    item['thumbnail_url'] = thumbnail_url
    return item

def process_item(self, item, spider):
    if item['url'] in self.url_seen:
        raise DropItem(f'Item already visited {item}')
    else:
        self.url_seen.add(item['url'])
        return item

def process_item(self, item, spider):
    # START CLEANUP
    fields = [
        'title', 'geek_rating', 'min_age', 'votes', 'min_players',
        'max_players', 'weight', 'avg_rating'
    ]
    try:
        for field in fields:
            if field not in item or not item[field]:
                item[field] = 0
            else:
                item[field] = item[field].strip()
        if item['time'] == DOUBLE_EN_DASH or item['time'] is None:
            item['time'] = 0
        else:
            item['time'] = item['time'].strip()
        if 'mechanisms' in item:
            item['mechanisms'] = {k.strip() for k in item['mechanisms']}
        else:
            item['mechanisms'] = 0
    except AttributeError as err:
        raise DropItem(f'INFO: Dropping {item["title"]}, unhandled field.')
    # END CLEANUP

    # START PROCESSING
    if item['txt_cnt']:
        item['txt_cnt'] = int(re.findall(r'\d+', item['txt_cnt'])[0])
    else:
        item['txt_cnt'] = 0
    if item['vid_cnt']:
        item['vid_cnt'] = int(re.findall(r'\d+', item['vid_cnt'])[0])
    else:
        item['vid_cnt'] = 0
    item['review_count'] = item['txt_cnt'] + item['vid_cnt']
    if item['min_age'] == DOUBLE_EN_DASH:
        item['min_age'] = 0
    else:
        item['min_age'] = item['min_age'][:-1]
    for field in fields:
        if item[field] == 'N/A':
            item[field] = 0
    # END PROCESSING

    # START FILTERING
    if item['avg_rating'] != 0:
        int_fields = ['min_age', 'time', 'votes', 'min_players', 'max_players']
        float_fields = ['weight', 'avg_rating', 'geek_rating']
        for field in int_fields:
            item[field] = int(item[field])
        for field in float_fields:
            item[field] = float(item[field])
        # remove unwanted fields
        item.pop('txt_cnt', None)
        item.pop('vid_cnt', None)
        print(f'PROCESSED: {item["bg_id"]}, {item["title"]}')
        return item
    else:
        raise DropItem(
            f'BG: {item["title"]}, doesn\'t have enough info. '
            f'"avg_rating: {item["avg_rating"]}"'
        )

def item_completed(self, results, item, info):
    image_paths = [x['path'] for ok, x in results if ok]
    if not image_paths:
        raise DropItem("Item contains no images")
    item['image_paths'] = image_paths
    return item

def process_item(self, item, spider):
    if item[self.key] not in self.records:
        return item
    else:
        raise DropItem('Duplicate %s: %s' % (self.key, item[self.key]))

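# One possible way (hypothetical, not from the original) for the records
# collection above to survive between runs: load known keys in open_spider
# and write them back in close_spider.
import json

class PersistentDedupPipeline:
    def open_spider(self, spider):
        try:
            with open('seen_keys.json') as f:
                self.records = set(json.load(f))
        except FileNotFoundError:
            self.records = set()

    def close_spider(self, spider):
        with open('seen_keys.json', 'w') as f:
            json.dump(sorted(self.records), f)
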
def process_item(self, item, spider):
    response = item['resp']
    item = vuln()
    xss_type = response.meta['type']
    orig_url = response.meta['orig_url']
    injections = response.meta['injections']
    quote_enclosure = response.meta['quote']
    inj_point = response.meta['inj_point']
    resp_url = response.url
    body = response.body
    # Regex: (...) means group 1 is within the parens, . means any char,
    # {0,50}? means match any char 0 to 50 times, non-greedily
    #chars_between_delims = '%s(.{1,50}?)%s' % (self.test_str, self.test_str)
    chars_between_delims = '%s(.{0,50}?)%s' % (self.test_str, self.test_str)
    inj_num = len(injections)
    mismatch = False
    if xss_type == 'form':
        POST_to = response.meta['POST_to']
    else:
        POST_to = None

    orig_payload = response.meta['payload'].strip(self.test_str)  # xss char payload
    escaped_payload = self.unescape_payload(orig_payload)

    break_tag_chars = set(['>', '<', '(', ')'])
    break_attr_chars = set([quote_enclosure, '(', ')'])
    break_js_chars = set(['"', "'", '(', ')'])

    matches = re.findall(chars_between_delims, body)
    if matches:
        xss_num = len(matches)
        if xss_num != inj_num:
            err = ('Mismatch between harmless injection count and payloaded '
                   'injection count: %d vs %d, increased chance of false '
                   'positive' % (inj_num, xss_num))
            item['error'] = err
        for idx, match in enumerate(matches):
            unfiltered_chars = self.get_unfiltered_chars(match, escaped_payload)
            if unfiltered_chars:
                try:
                    line, tag, attr, attr_val = spider.parse_injections(injections[idx])
                except IndexError:
                    # Mismatch in num of test injections and num of payloads found
                    mismatch = True
                    line, tag, attr, attr_val = 'Unknown', 'Unknown', None, None
                joined_chars = ''.join(unfiltered_chars)
                chars = set(joined_chars)
                line_html = self.get_inj_line(body, match)

                ###### XSS RULES ########
                # If there's more XSS matches than harmless injections, we
                # still want to check for the most dangerous characters.
                # May see some false positives here, but better than false
                # negatives.
                if mismatch:
                    if '>' in escaped_payload and '<' in escaped_payload:
                        if '<' in joined_chars and '>' in joined_chars:
                            item = self.make_item(joined_chars, xss_type,
                                                  orig_payload, tag, orig_url,
                                                  inj_point, line_html,
                                                  POST_to, item)
                            item = self.url_item_filtering(item, spider)
                            return item
                # Redirect
                if 'javascript:prompt(99)' == joined_chars.lower():
                    item = self.make_item(joined_chars, xss_type, orig_payload,
                                          tag, orig_url, inj_point, line_html,
                                          POST_to, item)
                    item = self.url_item_filtering(item, spider)
                    return item
                # JS breakout
                if self.js_pld == escaped_payload:  # js chars
                    if break_js_chars.issubset(chars):
                        item = self.make_item(joined_chars, xss_type,
                                              orig_payload, tag, orig_url,
                                              inj_point, line_html, POST_to,
                                              item)
                        item = self.url_item_filtering(item, spider)
                        return item
                # Attribute breakout
                if attr:
                    if quote_enclosure in escaped_payload:
                        if break_attr_chars.issubset(chars):
                            item = self.make_item(joined_chars, xss_type,
                                                  orig_payload, tag, orig_url,
                                                  inj_point, line_html,
                                                  POST_to, item)
                            item = self.url_item_filtering(item, spider)
                            return item
                # Tag breakout
                else:
                    if '<' in escaped_payload and '>' in escaped_payload:
                        if break_tag_chars.issubset(chars):
                            item = self.make_item(joined_chars, xss_type,
                                                  orig_payload, tag, orig_url,
                                                  inj_point, line_html,
                                                  POST_to, item)
                            item = self.url_item_filtering(item, spider)
                            return item

    # Check the entire body for an exact match. Escape out all the special
    # regex characters to search for the payload in the html body.
    re_payload = escaped_payload.replace('(', r'\(').replace(')', r'\)') \
                                .replace('"', r'\"').replace("'", r"\'")
    re_payload = re_payload.replace('{', r'\{').replace('}', r'\}') \
                           .replace(']', r'\]').replace('[', r'\[')
    re_payload = '.{1}?' + re_payload
    full_matches = re.findall(re_payload, body)
    for f in full_matches:
        unescaped_match = ''.join(self.get_unfiltered_chars(f, escaped_payload))
        if unescaped_match == escaped_payload:
            #if '\\' == unescaped_match[0]:
            #    continue
            item['error'] = ('Response passed injection point specific search '
                             'without success, checked for exact payload match '
                             'in body (higher chance of false positive here)')
            item['line'] = self.get_inj_line(body, f)
            item['xss_payload'] = orig_payload
            item['unfiltered'] = escaped_payload
            item['inj_point'] = inj_point
            item['xss_type'] = xss_type
            item['url'] = orig_url
            if POST_to:
                item['POST_to'] = POST_to
            return item

    # In case it slips by all of the filters, then we move on
    raise DropItem('No XSS vulns in %s. Tested: type = %s, injection point = %s'
                   % (resp_url, xss_type, inj_point))

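# Aside: the manual .replace() escaping chain above can be collapsed with the
# standard library's re.escape, which escapes every regex metacharacter in one
# call (note the behavioral difference: it also escapes characters such as '.'
# that the chain leaves as regex wildcards):
re_payload = '.{1}?' + re.escape(escaped_payload)
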
def __init__(self, original_url="", *args):
    self.original_url = original_url
    DropItem.__init__(self, *args)

def update(self, collection, item):
    try:
        collection.insert(dict(item))
        return item
    except Exception:
        raise DropItem('Item already exists.')

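# The insert above only signals "already exists" reliably if the collection
# enforces uniqueness. A hedged sketch with modern pymongo (insert_one
# supersedes the deprecated insert; connection and field names are
# illustrative):
from pymongo import MongoClient, errors

client = MongoClient('localhost', 27017)
collection = client['scrapy_db']['items']
collection.create_index('url', unique=True)  # duplicate urls now raise

try:
    collection.insert_one({'url': 'https://example.com'})
except errors.DuplicateKeyError:
    pass  # the case the pipeline above converts into a DropItem
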
def __str__(self):  # for usage: print(e)
    print(self.style.ERROR("DROP(NoTitleDrop):" + self.url))
    return DropItem.__str__(self)

def from_crawler(cls, crawler):
    if not crawler.settings.get('MYSQL_SETTINGS'):
        raise DropItem("Missing MySQL settings")
    return cls(mysql_settings=settings.DATABASES.get('default'))

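# Design note: DropItem is intended for per-item rejection inside
# process_item. For a missing setting detected in from_crawler, Scrapy's own
# convention is NotConfigured, which disables the component cleanly; a sketch
# (class and setting names are illustrative):
from scrapy.exceptions import NotConfigured

class MySQLPipeline:
    def __init__(self, mysql_settings):
        self.mysql_settings = mysql_settings

    @classmethod
    def from_crawler(cls, crawler):
        if not crawler.settings.get('MYSQL_SETTINGS'):
            raise NotConfigured('MYSQL_SETTINGS is required')
        return cls(mysql_settings=crawler.settings.get('MYSQL_SETTINGS'))
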
def __str__(self):  # for usage: print(e)
    print(self.style.ERROR("DROP(NotContentPageDrop):" + self.info + '|' + self.url))
    return DropItem.__str__(self)

def process_item(self, item, spider):
    collection_name = self.stats.get_value("collection_name")
    where = {}
    if item.get("list_id"):
        item["list_id"] = ObjectId(item["list_id"])
    fileid = item.get("fileid")
    biz_id = item.get("biz_id")
    content_url = item.get("content_url")
    item = dict(item)
    item.update({"updateAt": datetime.utcnow()})
    # Handle the article list and article detail collections
    if collection_name in collections:
        if collection_name == "article.detail":
            where = {
                "$or": [
                    {"content_url": content_url},
                ]
            }
            if item.get("content"):
                content = item.get("content")
                item["content"] = content.replace("data-src", "src")
                img_paths = item.get("image_paths")
                # Rewrite content image URLs to the downloaded addresses
                if img_paths is not None and item.get("biz_id") is not None:
                    try:
                        img_replace_urls = Selector(text=item.get("content")).css("img::attr(src)").extract()
                    except Exception as e:
                        logger.error("No images found")
                    else:
                        for url in img_replace_urls:
                            temp_url = handle_img_urls(url)
                            sha1_url = img_uuid(temp_url)
                            path_url = item.get("biz_id") + "/" + sha1_url + ".jpg"
                            if path_url in img_paths:
                                path_url = BASE_URL + "upload/" + path_url
                                item["content"] = item["content"].replace(url, path_url)
        elif collection_name == "article.list":
            where = {
                "$and": [
                    {"fileid": fileid},
                    {"biz_id": biz_id},
                ]
            }
            if item.get("image_paths") and item.get("cover") and len(item.get("image_paths")) > 0:
                item["cover"] = BASE_URL + "upload/" + item.get("image_paths")[0]
    else:
        raise DropItem("Unknown collection")  # such items are not processed
    self.db[collection_name].update_one(where, {"$set": item}, upsert=True)
    return item

def __str__(self):  # for usage: print(e)
    print(self.style.ERROR("DROP(KeywordNotFitDrop):" + self.info + '|' + self.url))
    return DropItem.__str__(self)

def process_item(self, item, spider):
    # UNIQUENESS CHECK: URL
    # =====================================================================
    if item['url'] in self.ids_seen:
        raise DropItem("Duplicate item found: %s" % item)
    else:
        self.ids_seen.add(item['url'])

    # CLEAN THE LOCATION
    # =====================================================================
    # The variants are: location w/ Reuters, no location w/ Reuters,
    # Breakingviews w/ location, or location without Reuters
    if 'Breakingviews' not in item['location'][0] and '(Reuters)' in item['location'][0]:
        item['location'] = item['location'][0][:item['location'][0].find('(Reuters)')]
    elif 'Breakingviews' in item['location'][0]:
        item['location'] = item['location'][0][:item['location'][0].find('(Reuters Breakingviews)')]
    elif '(Reuters)' not in item['location'][0]:
        item['location'] = item['location'][0][:item['location'][0].find('-')]

    # CLEAN THE DATE
    # =====================================================================
    # I believe this is the posting timezone, but haven't verified
    local = pytz.timezone('America/New_York')
    date = item['published_date'][0][:item['published_date'][0].rfind('/')]
    date = str(date.replace('/', ''))
    date = parse(date)
    local_dt = local.localize(date, is_dst=None)
    utc_dt = local_dt.astimezone(pytz.utc)
    item['published_date'] = utc_dt

    # CLEAN THE PARAGRAPHS
    # =====================================================================
    item['paragraphs'][0] = item['paragraphs'][0][item['paragraphs'][0].find('(Reuters) -') + 12:]
    replace_paragraph = []
    for value in item['paragraphs']:
        # Strip the stray block element '▒'
        value = value.encode('utf-8').decode('unicode_escape').encode('ascii', 'ignore').decode('utf-8')
        replace_paragraph.append(value)
    item['paragraphs'] = replace_paragraph
    # Not all articles have authors on Reuters
    if len(item['author']) == 0:
        item['author'] = ['None']

    # DATABASE INSERT
    # =====================================================================
    self.cur.execute(
        "INSERT INTO reuters_daily (author, paragraphs, published_date, subject, "
        "location, additional_authors, url, title) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)",
        (item['author'], item['paragraphs'], item['published_date'],
         item['subject'], item['location'], item['additional_authors'],
         item['url'], item['title']))
    self.connection.commit()
    return item