def main():
    sd = input("Start Date(yyyy,m,d): ")
    ed = input("End Date(yyyy,m,d): ")
    print(datetime.datetime.now())
    multiParsedTagList = hp.get_fullParsedTagList(sd, ed)
    tagSelect = sc.get_singlePageInfo(multiParsedTagList)
    pageInfos = sc.get_pageInfos(tagSelect)
    # pp.print_mergedList(pageInfos)
    pp.save_csv(pageInfos, sd, ed)
    print(datetime.datetime.now())
def run():
    sets = Pipeline(time.site_id, time.site_name).structure_set()
    Pipeline(time.site_id, time.site_name).open_spider(sets)
    for item in Time().first_requests():
        Pipeline(time.site_id, time.site_name).process_item(item)
        Pipeline(time.site_id, time.site_name).upload_item(item, sets)
    try:
        Pipeline(time.site_id, time.site_name).close_spider()
    except Exception:
        Logger().setLogger(time.log_path, 4,
                           "Failed to close spider, db_session may have failed")
def run():
    sets = Pipeline(bbc.site_id, bbc.site_name).structure_set()
    Pipeline(bbc.site_id, bbc.site_name).open_spider(sets)
    urls = BBC().first_requests()
    for item in BBC().second_requests(urls):
        Pipeline(bbc.site_id, bbc.site_name).process_item(item)
        Pipeline(bbc.site_id, bbc.site_name).upload_item(item, sets)
    try:
        Pipeline(bbc.site_id, bbc.site_name).close_spider()
    except Exception:
        Logger().setLogger(bbc.log_path, 4,
                           "Failed to close spider, db_session may have failed")
def __init__(s, dtype, stages, pipeq, bypassq):

    s.in_ = InValRdyBundle(dtype)
    s.out = OutValRdyBundle(dtype)

    s.in_q = InValRdyQueue(dtype, pipe=pipeq)
    s.out_q = OutValRdyQueue(dtype, bypass=bypassq)
    s.pipe = Pipeline(stages)

    s.connect(s.in_, s.in_q.in_)
    s.connect(s.out, s.out_q.out)

    @s.tick
    def logic():

        # Automatically enq from input / deq from output
        s.in_q.xtick()
        s.out_q.xtick()

        # No stall
        if not s.out_q.is_full():

            # Insert item into pipeline from input queue
            if not s.in_q.is_empty():
                s.pipe.insert(s.in_q.deq())

            # Items graduating from pipeline, add to output queue
            if s.pipe.ready():
                s.out_q.enq(s.pipe.remove())

            # Advance the pipeline
            s.pipe.advance()
def run():
    sets = Pipeline(hq.site_id, hq.site_name).structure_set()
    Pipeline(hq.site_id, hq.site_name).open_spider(sets)
    detail_url = Huanqiu().first_requests()
    for item in Huanqiu().second_requests(detail_url):
        Huanqiu().process_item(item)
        Pipeline(hq.site_id, hq.site_name).process_item(item)
        Pipeline(hq.site_id, hq.site_name).upload_item(item, sets)
    try:
        Pipeline(hq.site_id, hq.site_name).close_spider()
    except Exception:
        Logger().setLogger(hq.log_path, 4,
                           "Failed to close spider, db_session may have failed")
def __init__( s, out, nstages=1 ):

    s.nstages = nstages

    # instantiate a single-entry bypass queue adapter
    s.out_q = OutValRdyQueueAdapter( out )

    # instantiate a cycle-level pipeline
    if s.nstages > 0:
        s.pipe = Pipeline( s.nstages )
def assess(model, df, columns, metrics, n_splits=5, early_stopping_rounds=20,
           verbose=0):
    """
    k-fold cross-validation

    Checkpoints saving strategy ...

    :param model: sklearn-like object
    :param df: DataFrame with X and y
    :param columns: column names split by types, as in utils.split_columns_by_types
    :param metrics: sklearn.metrics-like function
    :param n_splits: the number of folds
    :param early_stopping_rounds: LightGBM param
    :param verbose: 0 - no logs, 1 - info, 2 - debug
    :return: iterations log
    """
    if n_splits == 1:
        # Single split: hold out the last 5% of rows for validation
        total_rows = df.shape[0]
        train_size = int(0.95 * total_rows)
        splits = [(df.index[:train_size], df.index[train_size:])]
    else:
        splits = kfold_with_respect_to_groups(df, n_splits=n_splits)

    log = []
    for train_index, valid_index in splits:
        print('\n---------------------------')
        with Timer('Data Preparation:', verbose):
            pipeline = Pipeline(**columns, verbose=verbose)
            x_train = pipeline.fit_transform(df.loc[train_index, :])
            y_train = df.loc[train_index, columns['target']]
            x_valid = pipeline.transform(df.loc[valid_index, :])
            y_valid = df.loc[valid_index, columns['target']]

        with Timer('Fitting:', verbose):
            model.fit(
                x_train, y_train,
                eval_set=[(x_valid, y_valid)],
                early_stopping_rounds=early_stopping_rounds,
                verbose=-1 if verbose != 2 else 1,
            )

        # with Timer('Postprocessing:', verbose):
        #     pred_train = scores_postprocessing(
        #         df=df.loc[train_index, :],
        #         predicted=model.predict(x_train),
        #         columns=columns,
        #         is_test=False,
        #     )[columns['target']]
        #     pred_valid = scores_postprocessing(
        #         df=df.loc[valid_index, :],
        #         predicted=model.predict(x_valid),
        #         columns=columns,
        #         is_test=False,
        #     )[columns['target']]
        pred_train, pred_valid = model.predict(x_train), model.predict(x_valid)

        with Timer('Saving:', verbose):
            train_score = metrics(y_train, pred_train)
            valid_score = metrics(y_valid, pred_valid)
            step = dict(
                model=model,
                pipeline=pipeline,
                train_score=train_score,
                valid_score=valid_score,
                not_adj_train_score=metrics(y_train, model.predict(x_train)),
                not_adj_valid_score=metrics(y_valid, model.predict(x_valid)),
                train_index=train_index,
                valid_index=valid_index,
                path=None,
                cached=False,
            )
            try:
                step = save_model(step)
            except Exception:
                if verbose == 1:
                    print("Warning: Couldn't save the model")

        log.append(step)
        gc.collect()

        if verbose == 1:
            print(step['train_score'], step['valid_score'])
            print('---------------------------\n')

    if verbose == 1:
        print('Erasing cache ...')
    # Keep only the checkpoint with the best validation score
    for idx, step in enumerate(
            sorted(log, key=lambda dct: dct['valid_score'], reverse=True)):
        if idx == 0:
            step['best'] = True
            continue
        step['best'] = False
        try:
            os.remove(step['path'])
            if verbose == 2:
                print('Removed:', step['abspath'])
        except Exception:
            if verbose == 2:
                print("Warning: Couldn't remove file:", step['abspath'])

    return log
logging.info(f"Command executed: {' '.join(sys.argv)}") logging.info("Starting outside variant pipeline analysis") file1 = args.case_gen file2 = args.control_gen pairing = args.SNP_pairs init_file = args.init_file p_file = args.output_folder override_folder = args.override odds_file = "" logging.info("Initializing pipeline. This might take a few seconds.") args.exec_dir = os.getcwd() with cd(args.input_folder_path): pipe = Pipeline.init_from_file( init_file, file1, file2, pairing, p_file, odds_file, args) logging.info("Making output directory") working_dir = make_working_dir(p_file, override_folder) pipe.working_dir = working_dir pipe.p_value_filename = p_file.split("/")[-1] pipe.hash = make_hash(args.input_folder_path, init_file, file1, file2, pairing, args.unique_identifier) with cd(args.input_folder_path): pipe.read_input_files() logging.info("Running pipeline...") with cd(pipe.working_dir): pipe.run()
class BaseSpider(object):
    """The top-most base class for all spiders."""

    start_urls = []
    start_host = "http://www.bjpc.gov.cn/"
    filter_urls = []
    init_db = False
    depart = ""

    def __init__(self, browser, settings, *a, **kw):
        self.browser = browser
        self.settings = settings
        if self.browser:
            self._init_from_db()
        self.internal_err = True
        self.deny = True
        self.clawed_urls = []
        self.site_urls = []
        self.currentUrl = ""
        self.started = False
        self.pipeline = Pipeline(self.settings.get('MYSERVER_URI'))
        self.proxyUrl = self.settings.get("SPLASH_URL")
        logger.info('spider init is finished!')

    def _init_from_db(self):
        self.client = pymongo.MongoClient(self.settings.get('MONGO_URI'))
        self.db = self.client[self.settings.get('MONGO_DATABASE', 'test')]

    def start(self, site, startUrl=None):
        res = self.db.GovDepartment.find_one({"key": site})
        self._init_gov_data(res)
        logger.info('start crawl %s!' % startUrl)
        # self.start_requests()
        self.request(startUrl)
        self.destroy_init_data()

    def _is_filter_url(self, url):
        if not url.startswith(self.start_host):
            return True  # only crawl within this site
        if self.deny:
            for u in self.filter_urls:
                if url.startswith(u):
                    return True
            return False
        else:
            for u in self.filter_urls:
                if url.startswith(u):
                    return False
            return True

    def destroy_init_data(self):
        self.condition = ""
        self.start_urls = []
        del self.fields
        self.init_db = False
        del self.link_extractor
        self.f.close()
        # self.start_host = res["link"]

    def _init_gov_data(self, gov):
        try:
            self.condition = gov["condition"]
            self.start_urls.append(gov["link"])
            # self.browser = webdriver.Firefox()
            self.start_host = gov["link"]
            self.f = open(gov["key"] + ".txt", 'a+')
            self.fields = {}
            self.init_db = True
            self.internal_err = False
            self.link_extract()
            for field in gov["fields"]:
                self.fields[field["name"]] = field["xpath"]
        except Exception as e:
            print("_init_gov_data error: %s" % e)

    def link_extract(self):
        self.link_extractor = None

    def start_requests(self):
        for url in self.start_urls:
            self.request(url)

    def close(self):
        logger.warning('self mongo db closed')
        self.client.close()

    def getResponse(self, url, browser):
        res = TextResponse(url, body=browser.page_source.encode("utf-8"))
        return res

    def _getPage(self, url, browser):
        newUrl = "%s%s%s%s" % (self.proxyUrl, "?url=", url, "&timeout=10&wait=0.5")
        print("get newUrl %s" % newUrl)
        self.currentUrl = url
        browser.get(newUrl)

    def request(self, url):
        pass

    def get_item(self, response):
        il = PageItemLoader(item=PageContentItem(depart=self.depart),
                            response=response)
        il.add_value('link', response.url)
        for (k, v) in self.fields.items():
            il.add_xpath(k, v)  # only simple selectors are supported currently
        return il.load_item()

    def save_item(self, item):
        self.pipeline.send_item(item)

    def satisfy_craw(self, response):
        for condition in self.condition:
            data = response.xpath(condition).extract()
            if data != []:
                return True
        return False

    def crawedAppend(self, url):
        if not self.hasCrawedUrl(url):
            self.clawed_urls.append(url)
            self.f.write(url.encode("utf-8") + "\n")

    def hasCrawedUrl(self, url):
        if url not in self.clawed_urls:
            return False
        return True

    def add_urls_noduplicate(self, site_urls):
        res = []
        if isinstance(site_urls, list):
            for link in site_urls:
                self._add_url_nodup(link.url, res)
        else:
            self._add_url_nodup(site_urls, res)
        return res

    def _add_url_nodup(self, url, dstArr):
        if url not in dstArr:
            dstArr.append(url)
def test_Pipeline(dump_vcd, stages):

    # Create the pipeline
    pipeline = Pipeline(stages)
    pipeline.vcd_file = dump_vcd

    # Fill up the pipeline
    i = -1
    for i in range(stages - 1):
        pipeline.advance()
        pipeline.insert(i)
        assert not pipeline.ready()

    # Insert one last item
    pipeline.advance()
    pipeline.insert(i + 1)

    # Make sure there is something at the tail of the pipeline
    assert pipeline.ready()

    # Start removing items from the pipeline
    for i in range(stages):
        assert pipeline.ready()
        assert pipeline.remove() == i
        pipeline.advance()

    assert not pipeline.ready()
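# A minimal sketch (an assumption, not the project's implementation) of the
# cycle-level Pipeline interface exercised by the test above and by the
# queue-based models earlier: insert() places an item in the head slot,
# advance() shifts every item one stage toward the tail, ready() reports
# whether an item has reached the tail, and remove() pops that item.
# The class name SketchPipeline is hypothetical.
class SketchPipeline:

    def __init__(self, stages):
        # slot 0 is the head, slot -1 is the tail
        self._slots = [None] * stages

    def insert(self, item):
        # place a newly issued item at the head of the pipeline
        self._slots[0] = item

    def advance(self):
        # shift every item one stage toward the tail; whatever occupied the
        # tail slot is dropped, so callers are expected to remove() it first
        self._slots = [None] + self._slots[:-1]

    def ready(self):
        # True when an item has graduated and is waiting at the tail
        return self._slots[-1] is not None

    def remove(self):
        # pop the graduated item from the tail slot
        item, self._slots[-1] = self._slots[-1], None
        return item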