import os

from scrapy.utils.project import inside_project, project_data_dir


def get_scrapy_data_path(createdir=True):
    """Return a path to a folder where Scrapy is storing data.

    Usually that's a .scrapy folder inside the project.
    """
    # This code is extracted from the scrapy.utils.project.data_path
    # function, which does too many things.
    path = project_data_dir() if inside_project() else ".scrapy"
    if createdir:
        os.makedirs(path, exist_ok=True)
    return path
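# A minimal usage sketch for the helper above; the "my_cache" subfolder is
# an illustrative assumption, not part of the original snippet:
cache_dir = os.path.join(get_scrapy_data_path(), 'my_cache')
os.makedirs(cache_dir, exist_ok=True)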
import logging
import os

from scrapy.utils.project import project_data_dir


def process_item(self, item, spider):
    # Keep one SQLite database per spider in a "data" directory that sits
    # next to the project's .scrapy folder.
    datadir = os.path.join(os.path.dirname(project_data_dir()), 'data')
    os.makedirs(datadir, exist_ok=True)
    dbfile = '{}/{}.sqlite'.format(datadir, spider.name)
    # SqliteHandle is a project-local helper around the sqlite3 module.
    handle = SqliteHandle(dbfile)
    handle.create_table(spider.name, self.table_model())
    try:
        handle.insert(spider.name, item)
        logging.info('Item stored successfully')
    except Exception as e:
        logging.warning(e)
    handle.close()
    return item
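# A pipeline like the one above only runs if it is enabled in the project
# settings; the dotted path below is a hypothetical placeholder:
ITEM_PIPELINES = {
    'myproject.pipelines.SqlitePipeline': 300,
}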
import os
from io import StringIO

from scrapy.utils.project import project_data_dir
from scrapyd.config import Config


def _get_config():
    datadir = os.path.join(project_data_dir(), '.scrapy', 'scrapyd')
    conf = {
        'eggs_dir': os.path.join(datadir, 'eggs'),
        'logs_dir': os.path.join(datadir, 'logs'),
        'dbs_dir': os.path.join(datadir, 'dbs'),
    }
    for k in ['eggs_dir', 'logs_dir', 'dbs_dir']:  # create dirs
        d = conf[k]
        if not os.path.exists(d):
            os.makedirs(d)
    scrapyd_conf = """
[scrapyd]
eggs_dir = %(eggs_dir)s
logs_dir = %(logs_dir)s
dbs_dir = %(dbs_dir)s
""" % conf
    return Config(extra_sources=[StringIO(scrapyd_conf)])
import os
from io import StringIO

from scrapy.utils.project import project_data_dir
# Config here is ScrapydArtX's equivalent of scrapyd.config.Config; the
# exact import path depends on the fork.


def _get_config():
    datadir = os.path.join(project_data_dir(), 'scrapydartx')
    conf = {
        'eggs_dir': os.path.join(datadir, 'eggs'),
        'logs_dir': os.path.join(datadir, 'logs'),
        'items_dir': os.path.join(datadir, 'items'),
        'dbs_dir': os.path.join(datadir, 'dbs'),
    }
    for k in ['eggs_dir', 'logs_dir', 'items_dir', 'dbs_dir']:  # create dirs
        d = conf[k]
        if not os.path.exists(d):
            os.makedirs(d)
    scrapyd_conf = """
[scrapydartx]
eggs_dir = %(eggs_dir)s
logs_dir = %(logs_dir)s
items_dir = %(items_dir)s
dbs_dir = %(dbs_dir)s
""" % conf
    return Config(extra_sources=[StringIO(scrapyd_conf)])
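# Both _get_config variants build a scrapyd-style Config from an in-memory
# INI source; only the data directory layout and section name differ.
# A minimal, assumed usage (scrapyd's Config.get reads options from its
# configured section):
config = _get_config()
print(config.get('eggs_dir'))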
import os
from os.path import exists

from scrapy.utils.project import project_data_dir


def __init__(self, name):
    self.name = name
    self.cachedir = os.path.join(project_data_dir(), name)
    if not exists(self.cachedir):
        os.makedirs(self.cachedir)
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

import os

from scrapy.utils.project import project_data_dir

# If True, the spider saves images to IMAGES_STORE.
STORE_IMAGES = False

# If True, requests are routed through a proxy.
USE_PROXY = False

# Link to generate requests from.
ROOT_LINK = 'https://boston.craigslist.org/search/hhh'

WALKSCORE_API_KEY = '28e22315d2e853f1afe5db5423a4d6a8'

PROJECT_DIR = os.path.dirname(project_data_dir('scraper_craig'))

# CSV file to save scraped items to.
CSV_FILEPATH = os.path.join(PROJECT_DIR, 'csv/Craiglist_Export.csv')

# Directory where images are stored.
IMAGES_STORE = os.path.join(PROJECT_DIR, 'images')

# The spider checks whether each of these static phrases appears in the
# listing text; a matching phrase is added to the corresponding item field.
HOUSE_TYPES = [
    'apartment', 'condo', 'cottage/cabin', 'duplex', 'flat', 'house',
    'in-law', 'loft', 'townhouse', 'manufactured', 'assisted living', 'land'
]

PARKING_TYPES = [
    'carport', 'attached garage', 'detached garage', 'off-street parking',
    'street parking', 'valet parking', 'no parking'
]
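# A minimal sketch of how the *_TYPES lists above might be matched against
# listing text; the function name and usage are assumptions for illustration:
def match_types(text, types):
    """Return each phrase from `types` that occurs in `text` (case-insensitive)."""
    lowered = text.lower()
    return [t for t in types if t in lowered]

# match_types('Sunny loft with street parking', PARKING_TYPES)
# -> ['street parking']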
import string
from contextlib import suppress
from datetime import datetime
from pathlib import Path

from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.utils.project import project_data_dir


def safe_filename(filename):
    # Replace path separators, then drop anything outside the whitelist.
    _filename = str(filename).replace('/', '|')
    whitelist_characters = string.ascii_letters + string.digits + '|'
    return ''.join([c for c in _filename if c in whitelist_characters])


REQUESTS_DIR = Path(project_data_dir()).parent.joinpath('_requests')
RESPONSES_DIR = Path(project_data_dir()).parent.joinpath('_responses')


def dump(message, directory, filename, extension, body, headers, url, method, status):
    filepath = '{}/{}.{}'.format(directory, filename, extension)
    with open(filepath, 'w') as file:
        if extension in ('html', 'xml', 'text'):
            headers = [
                '{}: {}'.format(key.decode('utf8'), value[0].decode('utf8'))
                for key, value in headers.items()
            ]
            data = [
                '<!--',
                message,
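# safe_filename keeps only letters, digits and '|'; e.g.
# safe_filename('api/v1/items?page=2') -> 'api|v1|itemspage2'.
# dump() assumes its target directory already exists; an assumed setup step
# (not part of the original snippet) would be:
# REQUESTS_DIR.mkdir(parents=True, exist_ok=True)
# RESPONSES_DIR.mkdir(parents=True, exist_ok=True)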