Пример #1
0
def run():
    """Run the pipeline for the site configured in ``time``.

    Builds the structure set, opens the spider with it, then processes and
    uploads every item produced by ``Time().first_requests()``. Closing the
    spider is best-effort: a failure is logged instead of being raised.
    """
    # NOTE(review): a new Pipeline is constructed for every call, exactly as
    # before; confirm whether a single shared instance is intended.
    sets = Pipeline(time.site_id, time.site_name).structure_set()
    Pipeline(time.site_id, time.site_name).open_spider(sets)

    for item in Time().first_requests():
        Pipeline(time.site_id, time.site_name).process_item(item)
        Pipeline(time.site_id, time.site_name).upload_item(item, sets)

    try:
        Pipeline(time.site_id, time.site_name).close_spider()
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate.
        Logger().setLogger(
            time.log_path, 4,
            "Failed to close spider,db_session may failed")
def run():
    """Run the pipeline for the site configured in ``bbc``.

    Builds the structure set, opens the spider with it, then feeds the result
    of ``BBC().first_requests()`` into ``BBC().second_requests()`` and
    processes and uploads every item that comes back. Closing the spider is
    best-effort: a failure is logged instead of being raised.
    """
    # NOTE(review): a new Pipeline is constructed for every call, exactly as
    # before; confirm whether a single shared instance is intended.
    sets = Pipeline(bbc.site_id, bbc.site_name).structure_set()
    Pipeline(bbc.site_id, bbc.site_name).open_spider(sets)

    urls = BBC().first_requests()
    for item in BBC().second_requests(urls):
        Pipeline(bbc.site_id, bbc.site_name).process_item(item)
        Pipeline(bbc.site_id, bbc.site_name).upload_item(item, sets)

    try:
        Pipeline(bbc.site_id, bbc.site_name).close_spider()
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate.
        Logger().setLogger(bbc.log_path, 4,
                           "Failed to close spider,db_session may failed")
Пример #3
0
    def __init__(s, dtype, stages, pipeq, bypassq):
        """Build the val/rdy interfaces, their queues and the pipeline.

        :param dtype: passed to both bundles and both queues
        :param stages: passed to ``Pipeline``
        :param pipeq: forwarded as ``pipe=`` to the input queue
        :param bypassq: forwarded as ``bypass=`` to the output queue
        """
        # Interface bundles
        s.in_ = InValRdyBundle(dtype)
        s.out = OutValRdyBundle(dtype)

        # Queues sitting behind the bundles
        s.in_q = InValRdyQueue(dtype, pipe=pipeq)
        s.out_q = OutValRdyQueue(dtype, bypass=bypassq)

        s.pipe = Pipeline(stages)
        s.connect(s.in_, s.in_q.in_)
        s.connect(s.out, s.out_q.out)

        @s.tick
        def logic():
            """Move at most one item in and one item out, then advance."""
            # Tick both queues before touching the pipeline.
            s.in_q.xtick()
            s.out_q.xtick()

            # Stall: with a full output queue nothing moves this tick.
            if s.out_q.is_full():
                return

            # Input queue -> pipeline
            if not s.in_q.is_empty():
                s.pipe.insert(s.in_q.deq())

            # Pipeline -> output queue
            if s.pipe.ready():
                s.out_q.enq(s.pipe.remove())

            s.pipe.advance()
Пример #4
0
def test_Pipeline(dump_vcd, stages):
    """Fill a ``Pipeline(stages)`` completely, then drain it in order.

    Checks that ``ready()`` stays false until the last insert, that the items
    come out as 0, 1, ..., stages - 1, and that the pipeline ends up not
    ready.
    """
    pipeline = Pipeline(stages)
    pipeline.vcd_file = dump_vcd

    # All inserts but the last one: nothing may be ready yet.
    warmup = range(stages - 1)
    for value in warmup:
        pipeline.advance()
        pipeline.insert(value)
        assert not pipeline.ready()

    # The final insert carries the next value in sequence (0 when the
    # warm-up range was empty).
    pipeline.advance()
    pipeline.insert(len(warmup))

    # Now something must be ready.
    assert pipeline.ready()

    # Drain: one item per advance, in insertion order.
    for expected in range(stages):
        assert pipeline.ready()
        assert pipeline.remove() == expected
        pipeline.advance()

    assert not pipeline.ready()
Пример #5
0
def run():
    """Run the pipeline for the site configured in ``hq``.

    Builds the structure set, opens the spider with it, then feeds the result
    of ``Huanqiu().first_requests()`` into ``Huanqiu().second_requests()``.
    Every item is passed to ``Huanqiu().process_item`` and then processed and
    uploaded through the pipeline. Closing the spider is best-effort: a
    failure is logged instead of being raised.
    """
    # NOTE(review): a new Pipeline / Huanqiu is constructed for every call,
    # exactly as before; confirm whether shared instances are intended.
    sets = Pipeline(hq.site_id, hq.site_name).structure_set()
    Pipeline(hq.site_id, hq.site_name).open_spider(sets)

    detail_url = Huanqiu().first_requests()

    for item in Huanqiu().second_requests(detail_url):
        Huanqiu().process_item(item)

        Pipeline(hq.site_id, hq.site_name).process_item(item)
        Pipeline(hq.site_id, hq.site_name).upload_item(item, sets)

    try:
        Pipeline(hq.site_id, hq.site_name).close_spider()
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate.
        Logger().setLogger(
            hq.log_path, 4,
            "Failed to close spider,db_session may failed")
  def __init__( s, out, nstages=1 ):
    """Wrap ``out`` in a queue adapter and, if requested, add a pipeline.

    :param out: passed to ``OutValRdyQueueAdapter``
    :param nstages: stored as ``s.nstages``; ``s.pipe`` is only created
                    when it is greater than zero
    """
    s.nstages = nstages

    # Output side: queue adapter around ``out``
    s.out_q = OutValRdyQueueAdapter( out )

    # Cycle-level pipeline, only for a positive stage count
    if s.nstages > 0:
      s.pipe = Pipeline( s.nstages )
Пример #7
0
 def __init__(self, browser, settings, *a, **kw):
     """Store the browser and settings and set the spider's initial state.

     :param browser: stored as ``self.browser``; when truthy,
                     ``_init_from_db()`` is called
     :param settings: object with a ``get`` method; read for the
                      'MYSERVER_URI' and "SPLASH_URL" entries
     :param a: extra positional arguments, accepted but not used here
     :param kw: extra keyword arguments, accepted but not used here
     """
     self.browser = browser
     self.settings = settings
     # Runs before the defaults below are assigned, so any of those
     # attributes it might set would be overwritten afterwards.
     if self.browser:
         self._init_from_db()
     # Initial flags and bookkeeping containers
     self.internal_err = True
     self.deny = True
     self.clawed_urls = []
     self.site_urls = []
     self.currentUrl = ""
     self.started = False
     # Pipeline is built from the 'MYSERVER_URI' setting
     self.pipeline = Pipeline(self.settings.get('MYSERVER_URI'))
     self.proxyUrl = self.settings.get("SPLASH_URL")
     logger.info('spider init is finished!')
Пример #8
0
    def assess(model,
               df,
               columns,
               metrics,
               n_splits=5,
               early_stopping_rounds=20,
               verbose=0):
        """
        Run k-fold cross-validation and keep only the best fold's checkpoint.

        For every split a new ``Pipeline`` is fitted on the training rows,
        ``model`` is fitted with the validation rows as ``eval_set``, both
        parts are scored with ``metrics`` and the resulting step is handed to
        ``save_model``. Afterwards the steps are ranked by ``valid_score``
        (highest first): the top one is flagged ``best`` and the saved files
        of all the others are removed.

        With ``n_splits == 1`` a single hold-out split is used instead: the
        first 95% of the rows for training, the rest for validation.

        :param model: sklearn-like object
        :param df: DataFrame with X and y
        :param columns: column names split by types like
            utils.split_columns_by_types; unpacked into ``Pipeline`` and
            read for its ``'target'`` entry
        :param metrics: sklearn.metrics like function, called as
            ``metrics(y_true, y_pred)``
        :param n_splits: the number of folds
        :param early_stopping_rounds: LightGBM param
        :param verbose: 0 - no logs, 1 - info, 2 - debug
        :return: iterations log (one dict per split)
        """
        import gc
        import os

        if n_splits == 1:
            total_rows = df.shape[0]
            train_size = int(0.95 * total_rows)
            splits = [(df.index[:train_size], df.index[train_size:])]
        else:
            splits = kfold_with_respect_to_groups(df, n_splits=n_splits)

        target = columns['target']
        log = []
        for train_index, valid_index in splits:
            print('\n---------------------------')
            with Timer('Data Preparation:', verbose):
                pipeline = Pipeline(**columns, verbose=verbose)
                x_train = pipeline.fit_transform(df.loc[train_index, :])
                y_train = df.loc[train_index, target]
                x_valid = pipeline.transform(df.loc[valid_index, :])
                y_valid = df.loc[valid_index, target]

            with Timer('Fitting:', verbose):
                model.fit(
                    x_train,
                    y_train,
                    eval_set=[(x_valid, y_valid)],
                    early_stopping_rounds=early_stopping_rounds,
                    verbose=-1 if verbose != 2 else 1,
                )

            # No score post-processing is applied, so the "adjusted" and
            # "not adjusted" scores below come from the same predictions.
            pred_train = model.predict(x_train)
            pred_valid = model.predict(x_valid)

            with Timer('Saving:', verbose):
                train_score = metrics(y_train, pred_train)
                valid_score = metrics(y_valid, pred_valid)
                # NOTE(review): every step stores a reference to the same
                # ``model`` object, which is refitted on each split.
                step = dict(
                    model=model,
                    pipeline=pipeline,
                    train_score=train_score,
                    valid_score=valid_score,
                    not_adj_train_score=train_score,
                    not_adj_valid_score=valid_score,
                    train_index=train_index,
                    valid_index=valid_index,
                    path=None,
                    cached=False,
                )
                # Saving is best-effort: on failure the unsaved step
                # (path=None) is still logged.
                try:
                    step = save_model(step)
                except Exception:
                    if verbose == 1:
                        print("Warning: Couldn't save the model")
                log.append(step)
                gc.collect()

            if verbose == 1:
                print(step['train_score'], step['valid_score'])
            print('---------------------------\n')

        if verbose == 1:
            print('Erasing cache ...')
        ranked = sorted(log, key=lambda dct: dct['valid_score'], reverse=True)
        for idx, step in enumerate(ranked):
            step['best'] = idx == 0
            if step['best']:
                # The best step keeps its file.
                continue

            path = step['path']
            if path is None:
                # The step was never saved, so there is nothing to erase.
                continue
            # 'abspath' is not set anywhere in this function; fall back to
            # 'path' so that logging can never raise KeyError.
            shown = step.get('abspath', path)

            try:
                os.remove(path)
            except (OSError, TypeError, ValueError):
                if verbose == 2:
                    print("Warning: Couldn't remove file:", shown)
            else:
                if verbose == 2:
                    print('Removed:', shown)
        return log