예제 #1
0
파일: iwjw.py 프로젝트: eastdog/crawler
def get_houses(dir, type='sale'):
    """Yield house detail URLs found in the saved listing pages under ``dir``.

    For every file in ``dir`` the district id is taken from the file name
    (first hit of the module-level ``district.findall``), the district name
    and the house ids are extracted from the file content, the district is
    recorded through ``dbutil.update_district`` and one URL per distinct
    house id is yielded.

    Args:
        dir: Directory holding the saved listing pages (read as UTF-8).
        type: Listing kind used in the URLs; ``'sale'`` maps to 1 and
            ``'chuzu'`` to 0 when passed to ``dbutil.update_district``.
            Any other value is passed on as ``None``.

    Yields:
        Strings of the form ``http://www.iwjw.com/<type>/<house id>/``.
    """
    house_re = re.compile(r'http://www.iwjw.com/%s/([^/]+)/\?from' % type)
    district_re = re.compile(r'searchName="(.+)"')
    type_mapping = {
        'sale': 1,
        'chuzu': 0,
    }
    type_code = type_mapping.get(type)
    for filename in os.listdir(dir):
        try:
            district_id = district.findall(filename)[0]
        except Exception:
            # ``district`` is defined elsewhere, so stay best-effort here,
            # but no longer swallow KeyboardInterrupt/SystemExit.
            logger.error('district id failure#%s' % filename)
            continue
        # Read the page and release the file handle before touching the
        # database or suspending at a yield.
        page_path = os.path.join(dir, filename)
        with codecs.open(page_path, encoding='utf-8') as page:
            content = page.read()
        try:
            district_name = district_re.findall(content)[0].strip()
        except IndexError:
            # No searchName attribute in the page: record the district
            # without a name.
            district_name = None
            logger.error('district name failure#%s' % district_id)
        house_ids = set(house_re.findall(content))
        database = torndb.Connection(**dbutil.get_mysql_config())
        try:
            dbutil.update_district(
                database, district_id, district_name, type_code, house_ids)
        finally:
            # Close the connection even when the update fails.
            database.close()
        for house_id in house_ids:
            yield 'http://www.iwjw.com/%s/%s/' % (type, house_id)
예제 #2
0
파일: iwjw.py 프로젝트: eastdog/crawler
def process_estates():
    """Process every distinct estate referenced by the ``house`` table.

    Each ``communityId`` is handed to ``process1estate`` together with the
    path template of the saved estate pages. A failure on one estate is
    logged as ``<id>#<error>`` and does not stop the remaining ones. The
    database connection is closed when the run ends, including on error.
    """
    path_template = '../iwjw/estate/%s.html'
    database = torndb.Connection(**dbutil.get_mysql_config())
    try:
        rows = database.query('select distinct communityId from house;')
        for eid in [row.communityId for row in rows]:
            try:
                process1estate(database, eid, path_template)
            except Exception as e:
                logger.error('%s#%s' % (eid, e))
    finally:
        database.close()
예제 #3
0
파일: iwjw.py 프로젝트: eastdog/crawler
def fetch_estate():
    """Download the estate page of every distinct ``communityId`` in ``house``.

    The estate URLs are built from the database, then handed to a ``Master``
    with a single ``Fetcher`` that stores the results under
    ``../iwjw/estate``.
    """
    database = torndb.Connection(**dbutil.get_mysql_config())
    try:
        urls = [
            'http://www.iwjw.com/estate/%s/' % result.communityId
            for result in database.query(
                'select distinct communityId from house')
        ]
    finally:
        # The connection is only needed to build the URL list; release it
        # before the crawl starts.
        database.close()
    master = Master(rest_period=5, result_model='html', result_dir='../iwjw/estate')
    fetcher = Fetcher(processor=ps.Processor_hn())
    master.add_fetchers(fetcher)
    master.start(urls)
예제 #4
0
파일: iwjw.py 프로젝트: eastdog/crawler
def fetch_house_from_db():
    """Download the sale pages of houses that have not been saved yet.

    House ids with ``type=1`` are read from the ``house`` table; ids that
    already have a ``<id>.html`` file in ``../iwjw/sale`` are skipped and
    the remaining ones are fetched into that directory.
    """
    # Single parenthesised argument: prints the same text on Python 2 and 3.
    print('sales')
    # NOTE(review): file-name stems are strings; this filter only works if
    # houseId is stored as a string too - confirm against the schema.
    existed = set([f.replace('.html', '') for f in os.listdir('../iwjw/sale')])
    master = Master(rest_period=5, result_model='html', result_dir='../iwjw/sale')
    fetcher = Fetcher(processor=ps.Processor_hn())
    master.add_fetchers(fetcher)
    database = torndb.Connection(**dbutil.get_mysql_config())
    try:
        sale_list = [
            'http://www.iwjw.com/sale/%s/' % result.houseId
            for result in database.query(
                'select houseId from house where type=1;')
            if result.houseId not in existed
        ]
    finally:
        # The crawl does not use the connection, so release it before the
        # crawl starts, and also when the query fails.
        database.close()
    master.start(sale_list)
예제 #5
0
파일: iwjw.py 프로젝트: eastdog/crawler
def process_houses():
    """Process the saved page of every house listed in the ``house`` table.

    The numeric ``type`` column selects the path template of the saved page
    (0 -> ``../iwjw/rent``, 1 -> ``../iwjw/sale``); any other value yields
    ``None`` as the template. Each house is handed to ``process1house``.
    A failure on one house is logged as ``<id>#<error>`` and does not stop
    the remaining ones. The database connection is closed when the run
    ends, including on error.
    """
    template = {
        0: '../iwjw/rent/%s.html',
        1: '../iwjw/sale/%s.html',
    }
    database = torndb.Connection(**dbutil.get_mysql_config())
    try:
        for result in database.query('select type, houseId from house;'):
            house_type, house_id = int(result.type), result.houseId
            try:
                process1house(database, house_id, template.get(house_type))
                logger.info('processed#%s' % house_id)
            except Exception as e:
                # Log instead of dropping the error silently, mirroring
                # process_estates.
                logger.error('%s#%s' % (house_id, e))
    finally:
        database.close()