Example #1
import os

from scrapy.utils.project import inside_project, project_data_dir


def get_scrapy_data_path(createdir=True):
    """ Return a path to a folder where Scrapy is storing data.
    Usually that's a .scrapy folder inside the project.
    """
    # This code is extracted from scrapy.utils.project.data_path function,
    # which does too many things.
    path = project_data_dir() if inside_project() else ".scrapy"
    if createdir:
        os.makedirs(path, exist_ok=True)
    return path
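A minimal sketch of how this helper might be called from a pipeline or extension; the file name below is an assumption for illustration, not part of the original snippet:

import os

# Hypothetical caller: keep a per-project SQLite file inside the .scrapy data folder.
db_path = os.path.join(get_scrapy_data_path(createdir=True), 'requests_seen.sqlite3')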
Example #2
    def process_item(self, item, spider):
        datadir = '/'.join(project_data_dir().split('/')[:-1]) + '/data'
        try:
            os.makedirs(datadir)
        except OSError:
            pass

        dbfile = '{}/{}.sqlite'.format(datadir, spider.name)
        SH = SqliteHandle(dbfile)
        SH.create_table(spider.name, self.table_model())
        try:
            SH.insert(spider.name, item)
            logging.info('Current records and storage success')
        except Exception as e:
            logging.warning(e)
        SH.close()
        return item
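A pipeline like this only runs if it is enabled in the project settings; a minimal sketch, where the module path myproject.pipelines.SqlitePipeline is hypothetical:

# settings.py
ITEM_PIPELINES = {
    'myproject.pipelines.SqlitePipeline': 300,  # lower numbers run earlier
}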
Example #3
def _get_config():
    datadir = os.path.join(project_data_dir(), '.scrapy', 'scrapyd')
    conf = {
        'eggs_dir': os.path.join(datadir, 'eggs'),
        'logs_dir': os.path.join(datadir, 'logs'),
        'dbs_dir': os.path.join(datadir, 'dbs'),
    }
    for k in ['eggs_dir', 'logs_dir', 'dbs_dir']: # create dirs
        d = conf[k]
        if not os.path.exists(d):
            os.makedirs(d)
    scrapyd_conf = """
[scrapyd]
eggs_dir = %(eggs_dir)s
logs_dir = %(logs_dir)s
dbs_dir  = %(dbs_dir)s
    """ % conf
    return Config(extra_sources=[StringIO(scrapyd_conf)])
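For context, the Config object returned here can be queried for the directories that were just created; a short usage sketch, assuming scrapyd's standard Config.get accessor:

config = _get_config()
eggs_dir = config.get('eggs_dir')   # <project_data_dir>/.scrapy/scrapyd/eggs
dbs_dir = config.get('dbs_dir')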
Example #4
def _get_config():
    datadir = os.path.join(project_data_dir(), 'scrapydartx')
    conf = {
        'eggs_dir': os.path.join(datadir, 'eggs'),
        'logs_dir': os.path.join(datadir, 'logs'),
        'items_dir': os.path.join(datadir, 'items'),
        'dbs_dir': os.path.join(datadir, 'dbs'),
    }
    for k in ['eggs_dir', 'logs_dir', 'items_dir', 'dbs_dir']:  # create dirs
        d = conf[k]
        if not os.path.exists(d):
            os.makedirs(d)
    scrapyd_conf = """
[scrapydartx]
eggs_dir = %(eggs_dir)s
logs_dir = %(logs_dir)s
items_dir = %(items_dir)s
dbs_dir  = %(dbs_dir)s
    """ % conf
    return Config(extra_sources=[StringIO(scrapyd_conf)])
Example #5
    def __init__(self, name):
        self.name = name
        self.cachedir = os.path.join(project_data_dir(), name)
        if not exists(self.cachedir):
            os.makedirs(self.cachedir)
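The surrounding class is not shown in this excerpt; a minimal usage sketch, assuming it is a small cache helper named SpiderCache (the class name is hypothetical):

cache = SpiderCache('quotes')   # creates <project_data_dir>/quotes if it is missing
print(cache.cachedir)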
Example #6
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html
import os
from scrapy.utils.project import project_data_dir

STORE_IMAGES = False
# if True, will use proxy
USE_PROXY = False
# LINK TO GENERATE REQUESTS FROM
ROOT_LINK = 'https://boston.craigslist.org/search/hhh'

WALKSCORE_API_KEY = '28e22315d2e853f1afe5db5423a4d6a8'

PROJECT_DIR = os.path.dirname(project_data_dir('scraper_craig'))
# csv file to save scraped items
CSV_FILEPATH = os.path.join(PROJECT_DIR, 'csv/Craiglist_Export.csv')
# Store images there
IMAGES_STORE = os.path.join(PROJECT_DIR, 'images')
# if True, the scraper saves images

# The spider checks whether each of these static phrases appears in the listing text;
# matched phrases are added to the corresponding item field.
HOUSE_TYPES = [
    'apartment', 'condo', 'cottage/cabin', 'duplex', 'flat', 'house', 'in-law',
    'loft', 'townhouse', 'manufactured', 'assisted living', 'land'
]

PARKING_TYPES = [
    'carport', 'attached garage', 'detached garage', 'off-street parking',
    'street parking', 'valet parking', 'no parking'
]
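Custom keys declared in a settings module like this are exposed to spiders through the crawler's settings; a minimal sketch, where the spider name and callback are assumptions for illustration:

import scrapy


class CraigSpider(scrapy.Spider):
    name = 'craig'

    def start_requests(self):
        # ROOT_LINK and CSV_FILEPATH come from the settings module above.
        yield scrapy.Request(self.settings.get('ROOT_LINK'), callback=self.parse)

    def parse(self, response):
        self.logger.info('Saving items to %s', self.settings.get('CSV_FILEPATH'))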
Example #7
import string
from datetime import datetime
from contextlib import suppress
from pathlib import Path

from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.utils.project import project_data_dir


def safe_filename(filename):
    _filename = str(filename).replace('/', '|')
    whitelist_characters = string.ascii_letters + string.digits + '|'
    return ''.join([c for c in _filename if c in whitelist_characters])


REQUESTS_DIR = Path(project_data_dir()).parent.joinpath('_requests')
RESPONSES_DIR = Path(project_data_dir()).parent.joinpath('_responses')


def dump(message, directory, filename, extension, body, headers, url, method,
         status):
    filepath = '{}/{}.{}'.format(directory, filename, extension)
    with open(filepath, 'w') as file:
        if extension in ('html', 'xml', 'text'):
            headers = [
                '{}: {}'.format(key.decode('utf8'), value[0].decode('utf8'))
                for key, value in headers.items()
            ]
            data = [
                '<!--',
                message,