Example #1
import csv
import time
import json

import scrapy
from amazon.utils.util import escape
from amazon.utils import genlog
from amazon.item.findnsave import FindnsaveBrandItem
from amazon.utils.rediscli import get_cli, RedisLock, RedisLockError
from amazon.utils.util import first_item, safe, \
                              xpath, f_xpath, first_item_xpath, \
                              xpath_extract, fx_extract, first_item_xpath_extract

logger = genlog.createlogger('findnsave_brands')
genlog.logger = logger


class FindnsaveBrandsSpider(scrapy.Spider):

    logger = logger

    name = 'findnsavebrands'
    allowed_domains = ("findnsave.com", )
    location = 'newyork'
    rooturl = "http://%s.findnsave.com" % location

    start_urls = [rooturl + "/brands/"]

    #csv_fd = open( '/tmp/brands.csv', 'w' )
    #csv.writer( csv_fd ).writerow( [ 'id', 'cid', 'name', 'href' ] )
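    # The snippet is truncated before the parse callback. Below is a minimal,
    # hedged sketch of one -- not the original implementation. The link
    # selector and the FindnsaveBrandItem fields ('name', 'uri') are
    # assumptions.
    def parse(self, response):
        logger.info('fetch : ' + response.url)
        # walk the brand anchors on the index page (selector is a guess)
        for a in response.xpath('//a[contains(@href, "/brand/")]'):
            item = FindnsaveBrandItem()
            item['name'] = fx_extract(a, './text()')  # hypothetical field
            item['uri'] = fx_extract(a, './@href')    # hypothetical field
            yield item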
Example #2
import csv
import time
import json

import scrapy
from amazon.utils import genlog
from amazon.utils.util import escape
from amazon.item.findnsave import FindnsaveSaleItem
from amazon.utils.util import first_item, safe, \
                              xpath, f_xpath, first_item_xpath, \
                              xpath_extract, fx_extract, first_item_xpath_extract

logger = genlog.createlogger( 'findnsave_sales' )
genlog.logger = logger

class FindnsaveSalesSpider(scrapy.Spider):

    logger = logger

    name = 'findnsavesales'
    allowed_domains = ( "findnsave.com", )
    location = 'newyork'
    rooturl = "http://%s.findnsave.com" % location

    start_urls = [ rooturl + "/store/Walmart/10175/" ]
    #start_urls = [ rooturl + "/store/Target/10002/" ]
    #start_urls = [ rooturl + "/store/ToysRUs/10011/" ]

    #csv_fd = open( '/tmp/newyork_sales.csv', 'w' )
    #writer = csv.writer( csv_fd, delimiter = '\\' )
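    # The snippet ends at the commented-out csv writer. A hedged sketch of a
    # parse callback follows -- not the original code. The listing selector
    # and the FindnsaveSaleItem fields ('name', 'price', 'uri') are
    # assumptions.
    def parse(self, response):
        logger.info('fetch : ' + response.url)
        for sale in response.xpath('//div[contains(@class, "listing")]'):
            item = FindnsaveSaleItem()
            item['name'] = fx_extract(sale, './/a/text()')                                 # assumption
            item['price'] = fx_extract(sale, './/span[contains(@class, "price")]/text()')  # assumption
            item['uri'] = fx_extract(sale, './/a/@href')                                   # assumption
            yield item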
Example #3
import csv
import time
import json

import scrapy
from amazon.utils import genlog
from amazon.item.findnsave import FindnsaveCategoryItem
from amazon.utils.rediscli import get_cli, RedisLock, RedisLockError
from amazon.utils.util import first_item, safe, \
                              xpath, f_xpath, first_item_xpath, \
                              xpath_extract, fx_extract, first_item_xpath_extract

logger = genlog.createlogger('findnsave_categories')
genlog.logger = logger


class FindnsaveCategoriesSpider(scrapy.Spider):

    logger = logger

    name = 'findnsavecategories'
    allowed_domains = ("findnsave.com", )
    location = 'newyork'
    rooturl = "http://%s.findnsave.com" % location

    start_urls = [rooturl + "/categories/"]

    #csv_fd = open( '/tmp/categories.csv', 'w' )
    #csv.writer( csv_fd ).writerow( [ 'id', 'cid', 'name', 'href' ] )

    def parse(self, response):
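        # Truncated here in the snippet; the body below is a hedged sketch,
        # not the original code. The anchor selector and the
        # FindnsaveCategoryItem fields ('name', 'uri') are assumptions.
        logger.info('fetch : ' + response.url)
        for a in response.xpath('//a[contains(@href, "/category/")]'):
            item = FindnsaveCategoryItem()
            item['name'] = fx_extract(a, './text()')  # hypothetical field
            item['uri'] = fx_extract(a, './@href')    # hypothetical field
            yield item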
Example #4
import json

import scrapy
from amazon.utils import genlog
from amazon.utils.s3clientutil import authedclient, put_file_from_url
from amazon.utils.util import first_item, safe, \
                              xpath, f_xpath, first_item_xpath, \
                              xpath_extract, fx_extract, first_item_xpath_extract

logger = genlog.createlogger('earthpics')


class EarthPicsSpider(scrapy.Spider):

    name = 'earthpics'
    allowed_domains = ("earthpics.me", )
    start_urls = ["http://earthpics.me/"]

    prefix_len = len('http://earthpics.me/')

    @safe
    def parse_one_top(self, response):

        logger.info('fetch : ' + response.url)

        img = f_xpath(response,
                      '//div[contains(@class, "inner-main-content")]')

        meta = {}
        meta['name'] = fx_extract(img, './div/h3/text()').strip().strip('#')
        meta['img'] = fx_extract(img, './/div[@class="inner-image"]/img/@src')
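        # Hedged continuation: the 'key' line below also appears verbatim in
        # Example #9; the trailing yield is an assumption about how the
        # method ends, not the original code.
        meta['key'] = meta['img'][self.prefix_len:]
        yield meta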
Example #5
import csv
import time
import json

import scrapy
from amazon.utils import genlog
from amazon.item.findnsave import FindnsaveCategoryItem
from amazon.utils.rediscli import get_cli, RedisLock, RedisLockError
from amazon.utils.util import first_item, safe, \
                              xpath, f_xpath, first_item_xpath, \
                              xpath_extract, fx_extract, first_item_xpath_extract

logger = genlog.createlogger( 'findnsave_categories' )
genlog.logger = logger

class FindnsaveCategoriesSpider(scrapy.Spider):

    logger = logger

    name = 'findnsavecategories'
    allowed_domains = ( "findnsave.com", )
    location = 'newyork'
    rooturl = "http://%s.findnsave.com" % location

    start_urls = [ rooturl + "/categories/" ]

    #csv_fd = open( '/tmp/categories.csv', 'w' )
    #csv.writer( csv_fd ).writerow( [ 'id', 'cid', 'name', 'href' ] )

    def parse(self, response):
Example #6
import csv
import time
import json

import scrapy
from amazon.utils.util import escape
from amazon.utils import genlog
from amazon.item.findnsave import FindnsaveBrandItem
from amazon.utils.rediscli import get_cli, RedisLock, RedisLockError
from amazon.utils.util import first_item, safe, \
                              xpath, f_xpath, first_item_xpath, \
                              xpath_extract, fx_extract, first_item_xpath_extract

logger = genlog.createlogger( 'findnsave_brands' )
genlog.logger = logger

class FindnsaveBrandsSpider(scrapy.Spider):

    logger = logger

    name = 'findnsavebrands'
    allowed_domains = ( "findnsave.com", )
    location = 'newyork'
    rooturl = "http://%s.findnsave.com" % location

    start_urls = [ rooturl + "/brands/" ]

    #csv_fd = open( '/tmp/brands.csv', 'w' )
    #csv.writer( csv_fd ).writerow( [ 'id', 'cid', 'name', 'href' ] )
Example #7
import csv
import time
import json

import scrapy
from amazon.utils import genlog
from amazon.item.findnsave import FindnsaveStoreItem
from amazon.utils.rediscli import get_cli, RedisLock, RedisLockError
from amazon.utils.util import first_item, safe, \
                              xpath, f_xpath, first_item_xpath, \
                              xpath_extract, fx_extract, first_item_xpath_extract

logger = genlog.createlogger( 'findnsave_stores' )
genlog.logger = logger

class FindnsaveStoresSpider(scrapy.Spider):

    logger = logger

    name = 'findnsavestores'
    allowed_domains = ( "findnsave.com", )
    location = 'newyork'
    rooturl = "http://%s.findnsave.com" % location

    start_urls = [ rooturl + "/stores/?sort=top" ]

    #csv_fd = open( '/tmp/stores.csv', 'w' )
    #csv.writer( csv_fd ).writerow( [ 'id', 'sid', 'name', 'href' ] )

    def parse(self, response):
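        # Truncated in the snippet; a hedged body sketch follows, not the
        # original code. The anchor selector and the FindnsaveStoreItem
        # fields ('name', 'uri') are assumptions.
        logger.info( 'fetch : ' + response.url )
        for a in response.xpath('//a[contains(@href, "/store/")]'):
            item = FindnsaveStoreItem()
            item['name'] = fx_extract(a, './text()')  # hypothetical field
            item['uri'] = fx_extract(a, './@href')    # hypothetical field
            yield item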
Example #8
import csv
import time
import json

import scrapy
from amazon.utils import genlog
from amazon.item.findnsave import FindnsaveAreaItem
from amazon.utils.util import first_item, safe, \
                              xpath, f_xpath, first_item_xpath, \
                              xpath_extract, fx_extract, first_item_xpath_extract

logger = genlog.createlogger( 'findnsave_location' )
genlog.logger = logger

class FindnsaveLocationSpider(scrapy.Spider):

    logger = logger

    name = 'findnsavelocation'
    allowed_domains = ( "findnsave.com", )
    rooturl = "http://findnsave.com"

    start_urls = [ rooturl + "/?markets=1" ]

    def parse(self, response):

        logger.info( 'fetch : ' + response.url )
        states = f_xpath( response, '//select[@id="states-dropdown"]' ).xpath( './option' )

        sts = {}
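        # Hedged continuation, not the original code: it assumes each <option>
        # carries the state abbreviation in @value and the state name as text.
        for opt in states:
            abbr = fx_extract( opt, './@value' )  # assumption
            name = fx_extract( opt, './text()' )  # assumption
            if abbr:
                sts[ abbr ] = name
        logger.info( 'got %d states' % len( sts ) )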
Example #9
import json

import scrapy
from amazon.utils import genlog
from amazon.utils.s3clientutil import authedclient, put_file_from_url
from amazon.utils.util import first_item, safe, \
                              xpath, f_xpath, first_item_xpath, \
                              xpath_extract, fx_extract, first_item_xpath_extract

logger = genlog.createlogger( 'earthpics' )

class EarthPicsSpider(scrapy.Spider):

    name = 'earthpics'
    allowed_domains = ( "earthpics.me", )
    start_urls = [ "http://earthpics.me/" ]

    prefix_len = len( 'http://earthpics.me/' )

    @safe
    def parse_one_top( self, response ):

        logger.info( 'fetch : ' + response.url )

        img = f_xpath( response, '//div[contains(@class, "inner-main-content")]' )

        meta = {}
        meta[ 'name' ] = fx_extract( img, './div/h3/text()' ).strip().strip('#')
        meta[ 'img'  ] = fx_extract( img, './/div[@class="inner-image"]/img/@src' )
        meta[ 'key'  ] = meta[ 'img' ][ self.prefix_len: ]
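        # Hedged continuation: authedclient and put_file_from_url are imported
        # above, but their real signatures are unknown -- the two calls below
        # are hypothetical sketches of pushing the image to s3, not the
        # original code.
        cli = authedclient()  # hypothetical signature
        put_file_from_url( cli, meta[ 'img' ], meta[ 'key' ] )  # hypothetical signature
        yield meta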