示例#1
0
class FetchCsvSample(BaseTask, CsvWriteBase):
    """Task that copies sampled CV pages and their 'measure' records to disk.

    For every job the page content is written to an ``.html`` file under the
    result directory, and the job plus its measure record are handed to
    ``self.save`` (not defined in this class; presumably provided by
    ``CsvWriteBase`` -- confirm).
    """

    def __init__(self, channel, thread_cnt, need_cnt):
        """Wire up the dispatcher, the output directory and the measure store.

        Args:
            channel: channel name; used in output paths and passed to the
                dispatcher and stores.
            thread_cnt: forwarded to ``BaseTask.__init__`` as ``thread_cnt``.
            need_cnt: forwarded to ``RandomDispatcher`` (presumably the number
                of samples wanted -- confirm).
        """
        BaseTask.__init__(self, "fetch_csv_sample", thread_cnt=thread_cnt)
        self.channel = channel
        # NOTE(review): self._queue is not set here; presumably created by
        # BaseTask.__init__ -- confirm.
        self.rand = RandomDispatcher(channel, self._queue, need_cnt)
        # The queue argument passed to the dispatcher hook is ignored; the
        # RandomDispatcher already received the queue at construction time.
        self.dispatcher = lambda q: self.rand.dispatcher()

        self.dir_path = self.get_save_dir_path()
        CsvWriteBase.__init__(self, self.dir_path, self.channel)

        self.cv_measure_store = CVRawStore(channel, 'measure')

    @staticmethod
    def _make_dir_if_missing(path):
        """Create ``path`` with ``os.mkdir``, tolerating an existing entry.

        Attempting the mkdir and ignoring EEXIST avoids the window between an
        ``os.path.exists`` check and the ``os.mkdir`` call in which another
        thread or process could create the same directory.

        Raises:
            OSError: for any failure other than the path already existing.
        """
        import errno

        try:
            os.mkdir(path)
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise

    def _load_data(self):
        """Delegate data loading to the random dispatcher."""
        self.rand.load_data()

    def get_save_dir_path(self):
        """Return the result directory path, creating the directory if needed.

        The directory is ``<channel>_<task name>_result`` next to this source
        file.
        """
        save_dir = os.path.join(
            os.path.dirname(__file__),
            '%s_%s_result' % (self.channel, self._name))
        self._make_dir_if_missing(save_dir)
        return save_dir

    def get_pagecontent(self, cvId):
        """Return the page content for ``cvId``.

        Looks the record up in ``self.rand.cv_page_store``, reads its
        ``'pageContentPath'`` field and loads that path through
        ``self.rand.getPageContent(..., 'remote')``.
        """
        doc = self.rand.cv_page_store.get_one(cvId)
        filepath = doc['pageContentPath']
        return self.rand.getPageContent(filepath, 'remote')

    def save_html(self, cvId, pagecontent):
        """Write ``pagecontent`` to ``<dir_path>/<channel>_html_result/<id>.html``.

        The file name is the part of ``cvId`` after the first ``"://"``.

        Raises:
            IndexError: if ``cvId`` contains no ``"://"`` separator.
        """
        path = os.path.join(self.dir_path, '%s_html_result' % self.channel)
        self._make_dir_if_missing(path)

        with open('%s/%s.html' % (path, cvId.split("://")[1]), 'wb') as f:
            f.write(pagecontent)

    def run_job(self, job):
        """Process one job: save its HTML page, then save its measure data.

        Args:
            job: mapping with at least a ``'cvId'`` entry.
        """
        cvId = job.get('cvId')
        pagecontent = self.get_pagecontent(cvId)
        self.save_html(cvId, pagecontent)

        measure_data = self.cv_measure_store.get_one(cvId)

        self.save(job, measure_data)
        print("SUCCESS COPIED %s" % cvId)

    def end_operation(self, *args, **kwargs):
        """Print a separator line; all arguments are ignored."""
        print("***********************************" * 2)
示例#2
0
class ETLDispatcher(ETLDispatcherBase):
    """Dispatcher that loads page records for a channel and queues their content.

    Uses several attributes that are not defined in this class
    (``channel``, ``process_item``, ``queue``, ``cv_page_store``,
    ``bin_file_location``, ``getPageContent``); presumably they come from
    ``ETLDispatcherBase`` -- confirm.
    """

    def __init__(self, channel, q):
        """Initialise the base dispatcher and open the 'raw' stage store.

        Args:
            channel: channel name, forwarded to ``ETLDispatcherBase``.
            q: forwarded to ``ETLDispatcherBase`` (presumably the output
                queue -- confirm).
        """
        ETLDispatcherBase.__init__(self, channel, q)
        self.cv_raw_store = CVRawStore(self.channel, stage='raw')

    def fill_data_with_flag(self, indexUrl, realUrl, contentSign, updateTime, filePath, flag):
        """Record one item in ``self.process_item`` keyed by ``indexUrl``.

        An existing entry for the same ``indexUrl`` is overwritten. A progress
        line is printed whenever the number of stored items is a multiple of
        10000.
        """
        self.process_item[indexUrl] = {
            'updateTime': updateTime,
            'contentSign': contentSign,
            'realUrl': realUrl,
            'filePath': filePath,
            'flag': flag,
        }

        loaded = len(self.process_item)
        if loaded % 10000 == 0:
            print("load items: %d" % loaded)

    def check_and_put(self, item):
        """Extract the tracked fields from ``item`` and store them with flag 0.

        Args:
            item: mapping read with ``.get``; missing keys yield ``None``.
        """
        self.fill_data_with_flag(
            item.get('indexUrl'),
            item.get('realUrl'),
            item.get('contentSign'),
            item.get('updateTime'),
            item.get('pageContentPath'),
            0,
        )

    def real_dispatcher(self, from_which):
        """Load and dispatch items from the given source.

        Args:
            from_which: source selector; only ``'db'`` is handled.

        Raises:
            Exception: if ``from_which`` is anything other than ``'db'``.
        """
        # NOTE: a 'file' source was sketched here previously but is disabled.
        if from_which != 'db':
            raise Exception("unknown from_which")

        self.load_data()
        self.dispatcher_from_db()

    def exist_in_raw(self, indexUrl):
        """Return True if the raw store yields a truthy record for ``indexUrl``."""
        return bool(self.cv_raw_store.get_one(indexUrl))

    def load_data(self):
        """Load every record from ``self.cv_page_store`` into ``process_item``."""
        # NOTE: filtering out items already present in the raw store (via
        # exist_in_raw) is currently disabled; every item is loaded.
        for item in self.cv_page_store.get_all():
            self.check_and_put(item)

        print("============= totally load %d items ===============" % len(self.process_item))

    def dispatcher_from_db(self):
        """Put each loaded item, with its page content, onto ``self.queue``.

        After all items, ``None`` is put on the queue (presumably an
        end-of-stream marker for consumers -- confirm). Progress is printed
        every 10000 items.
        """
        processed = 0
        total_cnt = len(self.process_item)
        for index_url in self.process_item:
            info = self.process_item[index_url]
            pagecontent = self.getPageContent(info.get("filePath"), self.bin_file_location)
            self.queue.put({
                'indexUrl': index_url,
                'pagecontent': pagecontent,
                'updateTime': info['updateTime'],
                'contentSign': info['contentSign'],
            })

            processed += 1
            if processed % 10000 == 0:
                # Use a float literal so the division is not truncated to an
                # integer before formatting (integer "/" floors on Python 2).
                print("processed %f%%" % (processed * 100.0 / total_cnt))

        self.queue.put(None)