def __init__(self):
    """Connect to the database, make sure the drawing tables exist, and
    register one database handler per item type.

    All handlers share the same session factory created here.
    """
    db_model = DataBaseModel()
    db_engine = db_model.db_connect()
    db_model.create_drawing_table(db_engine)
    self._session = sessionmaker(bind=db_engine)
    # One handler per persisted item kind, in fixed order.
    handler_classes = (
        DrawingItemDataBaseHanlder,
        DrawingTagItemDataBaseHanlder,
        DrawingIntroItemDataBaseHanlder,
    )
    self._handlers = [cls(self._session) for cls in handler_classes]
class JdBaseSpider(scrapy.Spider):
    """Base spider for JD book pages.

    Turns a product page into a DrawingItem, the associated tag items and,
    when introductions are still missing, a follow-up request for the
    d.3.cn description page.

    NOTE(review): subclasses are expected to define
    ``_create_new_drawing_if_not_exist``, ``_tag_type`` and ``_tag_value``
    before the parse callbacks run — they are read but never set here.
    """

    allowed_domains = ["http://www.jd.com/", "item.jd.com", "list.jd.com",
                       "d.3.cn", "search.jd.com"]

    def __init__(self):
        self._model = DataBaseModel()
        self._engine = self._model.db_connect()
        # One long-lived session shared by all database lookups of this spider.
        self._session = sessionmaker(bind=self._engine)()

    def _get_drawing_by_unique_key(self, key):
        """Return the stored drawing whose ISBN equals ``key``, or None.

        ``Query.first()`` already returns None on an empty result, so the
        extra COUNT round-trip the original issued is unnecessary.
        """
        return self._session.query(DrawingTable).filter(
            DrawingTable.isbn == key).first()

    def _is_empty(self, intro):
        # An introduction counts as missing when blank or the literal 'n/a'.
        return intro in ('', 'n/a')

    def _is_content_or_author_intro_empty(self, drawing):
        """True when the drawing is unknown or lacks either introduction."""
        if not drawing:
            return True
        return (self._is_empty(drawing.content_intro)
                or self._is_empty(drawing.author_intro))

    def _handle_drawing_page(self, response):
        """Parse one product page, yielding drawing/tag items and requests."""
        item = DrawingItemBuilder(response).build()
        # Without the unique key (ISBN) nothing can be stored or linked.
        if not item[DRAWING_UNIQUE_KEY]:
            # logger.warning with lazy %-args; .warn() is a deprecated alias.
            logger.warning("drop the item found in %s since no %s found",
                           response.url, DRAWING_UNIQUE_KEY)
            return
        unique_key = item[DRAWING_UNIQUE_KEY]
        drawing = self._get_drawing_by_unique_key(unique_key)
        # Emit the drawing itself only when it is not yet in the database.
        if self._create_new_drawing_if_not_exist and not drawing:
            yield item
        # Unique tags for a single drawing: the publisher, then the people
        # involved (author / drawer / translator share one tag type).
        if item['press']:
            yield DrawingTagItemBuilder(
                unique_key, u"出版社", item['press']).build()
        for person_field in ('author', 'drawer', 'translator'):
            if item[person_field]:
                yield DrawingTagItemBuilder(
                    unique_key, u"作者/绘者/译者", item[person_field]).build()
        # Optional extra tag configured by the subclass.
        if self._tag_type and self._tag_value:
            yield DrawingTagItemBuilder(
                unique_key, self._tag_type, self._tag_value).build()
        # Fetch content/author introductions only when they are still missing
        # and the page exposed a product code to build the detail URL from.
        if self._is_content_or_author_intro_empty(drawing) and item['product_code']:
            drawing_detail_page = (u"http://d.3.cn/desc/" + item['product_code']
                                   + u"?cdn=1&callback=showdesc")
            # Bind the key as a lambda default to avoid late-binding surprises.
            yield scrapy.Request(
                drawing_detail_page,
                callback=lambda rsp, key=unique_key:
                    self._handle_drawing_detail_page(rsp, key))

    def _handle_drawing_detail_page(self, response, key):
        """Turn the d.3.cn description response into a DrawingIntroItem."""
        yield DrawingIntroItemBuilder(response, key).build()
def mark_crawling_task_to_done(crawl_task):
    """Flag the crawl-list rows matching ``crawl_task.tag_id`` as done.

    Opens its own session, sets ``done = True`` on each matching
    ``CrawlListTable`` row (committing per row), and always closes the
    session — the original leaked it when the query or commit raised.
    """
    model = DataBaseModel()
    engine = model.db_connect()
    session = sessionmaker(bind=engine)()
    try:
        for row in session.query(CrawlListTable).filter(
                CrawlListTable.id == crawl_task.tag_id):
            # Lazy %-args: the message is only formatted if actually emitted.
            logger.info("crawl done %d %s %s %s",
                        row.id, row.tag_type, row.tag_value, row.start_url)
            row.done = True
            session.commit()
    finally:
        # Release the connection back to the pool even on failure.
        session.close()
def get_jd_drawings_to_crawl(by_search=False):
    """Return the pending crawl tasks as a list of ``DrawingsToCrawl``.

    Selects ``CrawlListTable`` rows that are not yet done and whose
    ``by_search`` flag matches the argument. The session is now closed
    even when the query raises (the original leaked it on error).
    """
    model = DataBaseModel()
    engine = model.db_connect()
    session = sessionmaker(bind=engine)()
    try:
        # NOTE: '== False' is intentional — SQLAlchemy column comparison,
        # not a Python truthiness test.
        rows = (session.query(CrawlListTable)
                .filter(CrawlListTable.done == False)
                .filter(CrawlListTable.by_search == by_search)
                .all())
        return [DrawingsToCrawl(row.id, row.tag_type, row.tag_value, row.start_url)
                for row in rows]
    finally:
        session.close()
class DuplicateInDataBasePipeline(object):
    """Scrapy pipeline that drops DrawingItems already stored in the DB."""

    def __init__(self):
        self._model = DataBaseModel()
        self._engine = self._model.db_connect()
        # One long-lived session instead of opening a brand-new, never-closed
        # session per item — the original leaked a connection per lookup.
        self._session = sessionmaker(bind=self._engine)()

    def _exist_in_db(self, item):
        # True when a drawing with the same ISBN is already persisted.
        return self._session.query(DrawingTable).filter(
            DrawingTable.isbn == item[DRAWING_UNIQUE_KEY]).count() > 0

    def process_item(self, item, spider):
        """Drop duplicate DrawingItems; pass every other item through."""
        # isinstance (not `type(x) ==`) so DrawingItem subclasses are
        # deduplicated as well.
        if isinstance(item, DrawingItem) and self._exist_in_db(item):
            raise DropItem("Duplicate item found in DB: %s" % item)
        return item
# -*- coding: utf-8 -*-
import logging
import time
import requests
import string

from sqlalchemy.orm import sessionmaker

from drawing.util.log import setup_logger
from drawing.models.tables.drawing import DrawingTable
from drawing.models.model import DataBaseModel

logger = setup_logger(loggername=__name__, console=logging.DEBUG)

# Module-level connection shared by all generator instances.
DATABASE_MODEL = DataBaseModel()
DATABASE_ENGINE = DATABASE_MODEL.db_connect()


class CrawlListGenerator(object):
    """Builds the list of ISBNs worth crawling from the drawing table."""

    def _valid_isbn(self, isbn):
        # ISBNs come in the legacy 10-digit and the current 13-digit form.
        return len(isbn) in (10, 13)

    def _book_to_crawl(self, row):
        # Crawl only single books with a plausible ISBN that have not
        # already been tried against douban.
        return self._valid_isbn(row.isbn) and (not row.tried_in_douban) and row.single_book

    def generate(self):
        """Return the ISBNs of all drawing rows that still need crawling."""
        session = sessionmaker(bind=DATABASE_ENGINE)()
        try:
            rows = session.query(DrawingTable.isbn,
                                 DrawingTable.tried_in_douban,
                                 DrawingTable.single_book).all()
            return [row.isbn for row in rows if self._book_to_crawl(row)]
        finally:
            # The original never closed the session; always release it.
            session.close()