def test_shutdown_reason():
    data = parse(SHUTDOWN)
    assert data['shutdown_reason'] == 'Received SIGTERM twice'
    assert data['finish_reason'] == 'shutdown'

    data = parse(SHUTDOWN.replace('twice', ''))
    assert data['shutdown_reason'] == 'Received SIGTERM'
    assert data['finish_reason'] == 'shutdown'
def test_only_stats_dumped():
    replaces = [
        ("'downloader/response_status_count/302': 1,",
         "'downloader/response_status_count/302': 7,\n 'downloader/response_status_count/301': 8,"),
        ("'response_received_count': 3,", "'response_received_count': 30,"),
        ("'item_scraped_count': 2,", "'item_scraped_count': 20,"),
        ("'log_count/ERROR': 5,", "'log_count/ERROR': 4,"),
        ("'finish_reason': 'finished',", "'finish_reason': 'forceshutdown',"),
    ]
    dict_count = dict(critical_logs=5, error_logs=4, warning_logs=3,
                      redirect_logs=15, retry_logs=2, ignore_logs=1)
    text = END
    for replace in replaces:
        text = text.replace(*replace)
    data = parse(text, headlines=50, taillines=50)
    # cst.json_dumps(data)
    assert data['first_log_time'] == '2018-10-23 18:29:41'
    assert data['latest_log_time'] == '2018-10-23 18:29:42'
    assert data['runtime'] == '0:00:01'
    assert data['datas'] == []
    assert data['pages'] == 30
    assert data['items'] == 20
    for k, v in data['latest_matches'].items():
        assert v == ''
    for k, v in dict_count.items():
        assert data['log_categories'][k]['count'] == v
        assert data['log_categories'][k]['details'] == []
    assert data['finish_reason'] == 'forceshutdown'
def test_invalid_log():
    for text in ["", ERROR_404]:
        data = parse(text)
        cst.json_dumps(data)
        if not text:
            assert not (data['head'] or data['tail'])
        else:
            assert '404 - No Such Resource' in data['head'] and '404 - No Such Resource' in data['tail']
        assert set(data.keys()) == set(cst.PARSE_KEYS)
        for k in ['first_log_time', 'latest_log_time', 'runtime', 'shutdown_reason', 'finish_reason']:
            assert data[k] == cst.NA
        for k in ['first_log_timestamp', 'latest_log_timestamp',
                  'latest_crawl_timestamp', 'latest_scrape_timestamp']:
            assert data[k] == 0
        for k in ['pages', 'items']:
            assert data[k] is None
        # assert data['last_update_timestamp'] > 0  # 1546272001
        # assert len(data['last_update_time']) == 19  # "2019-01-01 00:00:01"
        assert cst.string_to_timestamp(data['last_update_time']) == data['last_update_timestamp']
        assert data['datas'] == []
        for v in data['latest_matches'].values():
            assert v == ''
        assert set(data['latest_matches'].keys()) == set(cst.LATEST_MATCHES_RESULT_DICT.keys())
        for v in data['log_categories'].values():
            assert v == dict(count=0, details=[])
        assert set(data['log_categories'].keys()) == set(cst.LOG_CATEGORIES_RESULT_DICT.keys())
def dispatch_request(self, **kwargs):
    try:
        # Use io.open for compatibility with Python 2
        with io.open(os.path.join(self.PARSE_PATH, self.filename),
                     encoding='utf-8', errors='ignore') as f:
            self.text = f.read()
    except Exception as err:
        return render_template(
            self.template_fail, node=self.node,
            alert="An error occurred when reading the uploaded logfile",
            text='%s\n%s' % (err.__class__.__name__, err))

    self.get_job_info()
    kwargs = dict(
        project=self.project,
        spider=self.spider,
        job=self.job,
        url_source=url_for('.source', filename=self.filename),
        # url_utf8=url_utf8,  # Commented out to hide the url_utf8 link on the page
        # http://127.0.0.1:5000/log/uploaded/ScrapydWeb_demo.log
    )
    kwargs.update(parse(self.text))
    # self.logger.debug("Parsed result: %s" % self.json_dumps(kwargs))
    return render_template(self.template, node=self.node, **kwargs)
def dispatch_request(self, **kwargs):
    if self.report_logparser:
        self.read_stats_for_report()
    # Try to request the stats collected by LogParser to avoid reading/requesting the whole log
    if not self.logparser_valid and (self.stats_logparser or self.report_logparser):
        if self.IS_LOCAL_SCRAPYD_SERVER and self.LOCAL_SCRAPYD_LOGS_DIR:
            self.read_local_stats_by_logparser()
        if not self.logparser_valid:
            self.request_stats_by_logparser()

    if not self.logparser_valid and not self.text:
        # Try to read the local logfile
        if self.IS_LOCAL_SCRAPYD_SERVER and self.LOCAL_SCRAPYD_LOGS_DIR:
            self.read_local_scrapy_log()
        # Have to request the scrapy logfile
        if not self.text:
            self.request_scrapy_log()
            if self.status_code != 200:
                if self.stats_logparser or self.report_logparser:
                    self.load_backup_stats()
                if not self.backup_stats_valid:
                    if not self.report_logparser:
                        kwargs = dict(node=self.node, url=self.url,
                                      status_code=self.status_code, text=self.text)
                        return render_template(self.template_fail, **kwargs)
            else:
                self.url += self.SCRAPYD_LOG_EXTENSIONS[0]
        else:
            self.url += self.SCRAPYD_LOG_EXTENSIONS[0]

    if (not self.utf8_realtime and not self.logparser_valid
            and self.text and self.status_code in [0, 200]):
        self.logger.warning('Parse the whole log')
        self.stats = parse(self.text)
        # Note that crawler_engine is not available when using parse()
        self.stats.setdefault('crawler_engine', {})
        self.stats.setdefault('status', self.OK)

    if self.report_logparser:
        if self.stats and not self.stats.setdefault('from_memory', False):
            self.simplify_stats_for_report()
            self.keep_stats_for_report()
        get_flashed_messages()
        # A status_code of 0, -1 or 404 means the backup stats may have been loaded
        if self.status_code < 100 or self.stats:
            status_code = 200
        else:
            status_code = self.status_code
        return self.json_dumps(self.stats or dict(status='error'),
                               as_response=True), status_code
    else:
        self.update_kwargs()
        if self.ENABLE_MONITOR and self.POST:  # Only poll.py makes POST requests
            self.monitor_alert()
        return render_template(self.template, **self.kwargs)
def update_kwargs(self):
    if self.utf8_realtime:
        self.kwargs['text'] = self.text
        self.kwargs['last_update_timestamp'] = time.time()
        if self.job_finished or self.job_key in self.job_finished_set:
            self.kwargs['url_refresh'] = ''
        else:
            self.kwargs['url_refresh'] = 'javascript:location.reload(true);'
    else:
        # Parsed data comes from json.loads; for compatibility with Python 2,
        # use str(time_) to avoid [u'2019-01-01 00:00:01', 0, 0, 0, 0] in JavaScript.
        if self.logparser_valid:
            for d in self.stats['datas']:
                d[0] = str(d[0])
        else:
            self.logger.warning('Parse the whole log')
            self.stats = parse(self.text)
            # Note that crawler_engine is not available when using parse()
            self.stats['crawler_engine'] = {}
        # For sorted orders in stats.html with Python 2
        for k in ['crawler_stats', 'crawler_engine']:
            if self.stats[k]:
                self.stats[k] = self.get_ordered_dict(self.stats[k])
        if self.BACKUP_STATS_JSON_FILE:
            self.backup_stats()
        self.kwargs.update(self.stats)
        if (self.kwargs['finish_reason'] == self.NA
                and not self.job_finished
                and self.job_key not in self.job_finished_set):
            # http://flask.pocoo.org/docs/1.0/api/#flask.Request.url_root
            # _query_string = '?ui=mobile'
            # self.url_refresh = request.script_root + request.path + _query_string
            self.kwargs['url_refresh'] = 'javascript:location.reload(true);'

    if self.kwargs['url_refresh']:
        if self.stats_logparser and not self.logparser_valid:
            self.kwargs['url_jump'] = ''
        else:
            self.kwargs['url_jump'] = url_for('log', node=self.node, opt='stats',
                                              project=self.project, spider=self.spider,
                                              job=self.job, with_ext=self.with_ext, ui=self.UI,
                                              realtime='True' if self.stats_logparser else None)
    # The Stats page of a '.json' item from the Logs page should hide these links
    if self.with_ext and self.job.endswith('.json'):
        self.kwargs['url_source'] = ''
        self.kwargs['url_opt_opposite'] = ''
        self.kwargs['url_refresh'] = ''
        self.kwargs['url_jump'] = ''
    else:
        self.kwargs['url_source'] = self.url
        self.kwargs['url_opt_opposite'] = url_for('log', node=self.node,
                                                  opt='utf8' if self.opt == 'stats' else 'stats',
                                                  project=self.project, spider=self.spider,
                                                  job=self.job, job_finished=self.job_finished,
                                                  with_ext=self.with_ext, ui=self.UI)
def test_scrapy_fieldstats():
    data = parse(SCRAPY_FIELDSTATS)
    d = data['crawler_stats']
    assert d['fields_coverage'] == {
        u'Chinese 汉字': '50%',
        'author': {'a': 1, 'b': 2},
    }
def test_latest_scrape_item():
    data = parse(LATEST_SCRAPE_ITEM_ONE_LINE)
    d = data['latest_matches']
    latest_scrape = '2019-01-01 00:00:01 [scrapy.core.scraper] DEBUG: Scraped from <200 http://httpbin.org/get>'
    assert d['latest_scrape'] == latest_scrape
    assert d['latest_item'] == "{'item': 1}"

    data = parse(LATEST_SCRAPE_ITEM_MULTIPLE_LINES)
    d = data['latest_matches']
    latest_scrape = '2019-01-01 00:00:02 [scrapy.core.scraper] DEBUG: Scraped from <200 http://httpbin.org/get>'
    assert d['latest_scrape'] == latest_scrape
    assert json.loads(d['latest_item'].replace("'", '"')) == dict(item=2)

    data = parse(LATEST_SCRAPE_ITEM_MIXED)
    d = data['latest_matches']
    latest_scrape = '2019-01-01 00:00:03 [scrapy.core.scraper] DEBUG: Scraped from <200 http://httpbin.org/get>'
    assert d['latest_scrape'] == latest_scrape
    assert json.loads(d['latest_item'].replace("u'", "'").replace("'", '"')) == dict(item={u'Chinese 汉字': 3})
def test_telnet_info():
    data = parse(TELNET_160_DEFAULT)
    d = data['latest_matches']
    assert d['scrapy_version'] == '1.6.0'
    assert d['telnet_console'] == '127.0.0.1:6024'
    assert d['telnet_username'] == ''
    assert d['telnet_password'] == '9d3a29f17ee1bf9a'

    data = parse(TELNET_160_USERNAME)
    d = data['latest_matches']
    assert d['telnet_username'] == 'usr123'
    assert d['telnet_password'] == 'd24ad6be287d69b3'

    data = parse(TELNET_160_PASSWORD)
    d = data['latest_matches']
    assert d['telnet_username'] == ''
    assert d['telnet_password'] == '456psw'

    data = parse(TELNET_160_USERNAME_PASSWORD)
    d = data['latest_matches']
    assert d['telnet_username'] == 'usr123'
    assert d['telnet_password'] == '456psw'
def test_demo_log():
    modified_logstats = FRONT.replace(
        "Crawled 3 pages (at 0 pages/min), scraped 2 items (at 0 items/min)",
        "Crawled 1 pages (at 2 pages/min), scraped 3 items (at 4 items/min)")
    for case, text in zip(['without_stats_dumped', 'whole_log', 'modified_logstats'],
                          [FRONT, FRONT + END, modified_logstats + END]):
        data = parse(text, headlines=50, taillines=100)  # 180 lines in total
        # cst.json_dumps(data)
        if case == 'without_stats_dumped':
            cst.check_demo_data(data, without_stats_dumped=True)
        elif case == 'modified_logstats':  # To test update_data_with_crawler_stats()
            cst.check_demo_data(data, without_stats_dumped=False, modified_logstats=True)
        else:
            cst.check_demo_data(data, without_stats_dumped=False)
def communicate(log_root):
    log_file = os.path.join(log_root, 'files.log')
    if not os.path.isfile(log_file):
        return

    LOG_FILE = parse(log_file)
    for line in LOG_FILE.context.itertuples():
        if is_nan(getattr(line, 'extracted', None)):
            continue
        local_name = line.extracted
        dump_path = os.path.join(DUMP_PATH, local_name)
        if not os.path.exists(dump_path):
            # The extracted file is missing from the dump directory
            warnings.warn(f'No such file or directory: {local_name!r}', ExtractWarning)
    return
def main():
    if len(sys.argv) < 2:
        sys.exit("Usage: %s [SOURCE_DIR|LOG_FILE]" % sys.argv[0])
    sourcedir = os.path.relpath(sys.argv[1])
    logs = list(parse(sourcedir))
    remove_redundant_authors(logs)
    network = build_network(sourcedir, logs)
    authors = [n for n in network if network.node[n]['type'] == 'author']
    projection = project_graph(network, authors)
    results = {
        'Random':       decomp_by_random(network, projection, authors),
        'Commit Count': decomp_by_commit_count(network, projection, logs),
        'Degree':       decomp_by_degree(network, projection, authors),
        'Closeness':    decomp_by_closeness(network, projection, authors),
        'Betweenness':  decomp_by_betweenness(network, projection, authors),
        'Eigenvector':  decomp_by_eigenvector(network, projection, authors),
    }
    print_results(results)
def generate_log(log_name):
    global DATE
    date = time.strftime('%Y-%m-%d')
    if date != DATE:  # the date has rolled over; archive the previous day's logs
        archive(DATE)
        DATE = date
    INFO = os.path.join(LOGS_PATH, 'info', f'{DATE}.log')

    log_stem = log_name
    log_root = os.path.join(LOGS_PATH, log_name)
    log_uuid = re.match(r'.*?-(?P<uuid>[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12})',
                        log_stem, re.IGNORECASE).group('uuid')

    log_file = os.path.join(log_root, 'files.log')
    if not os.path.isfile(log_file):
        return

    LOG_FILE = parse(log_file)
    LOG_CONN = parse(os.path.join(log_root, 'conn.log'))
    for line in LOG_FILE.context.itertuples():
        if is_nan(getattr(line, 'extracted', None)):
            continue

        hosts = [dict(tx=ipaddress.ip_address(tx), rx=ipaddress.ip_address(rx))
                 for (tx, rx) in zip(line.tx_hosts, line.rx_hosts)]

        conns = list()
        is_orig = line.is_orig
        for conn_uid in line.conn_uids:
            record = next(LOG_CONN.context[lambda df: df.uid == conn_uid].iterrows())[1]  # pylint: disable=cell-var-from-loop
            # Orient the connection so that src is the endpoint that sent the file
            if is_orig:
                conn = dict(src_h=ipaddress.ip_address(record['id.orig_h']),
                            src_p=int(record['id.orig_p']),
                            dst_h=ipaddress.ip_address(record['id.resp_h']),
                            dst_p=int(record['id.resp_p']))
            else:
                conn = dict(src_h=ipaddress.ip_address(record['id.resp_h']),
                            src_p=int(record['id.resp_p']),
                            dst_h=ipaddress.ip_address(record['id.orig_h']),
                            dst_p=int(record['id.orig_p']))
            conns.append(conn)

        local_name = line.extracted
        mime_type = None
        dump_path = os.path.join(DUMP_PATH, local_name)
        if os.path.exists(dump_path):
            with contextlib.suppress(Exception):
                mime_type = magic.detect_from_filename(dump_path).mime_type
            # if mime_type is None or MIME_REGEX.match(mime_type) is None:
            #     if MIME_MODE:
            #         local_name = rename_dump(local_name, line.mime_type)
            # else:
            #     if MIME_MODE or (mime_type != line.mime_type):  # pylint: disable=else-if-used
            #         local_name = rename_dump(local_name, mime_type)
        else:
            dump_path = None

        info = dict(timestamp=line.ts if LOG_FILE.format == 'json' else line.ts.timestamp(),
                    log_uuid=log_uuid,
                    log_path=log_root,
                    log_name=log_stem,
                    dump_path=dump_path,
                    local_name=local_name,
                    source_name=getattr(line, 'filename', None),
                    hosts=hosts,
                    conns=conns,
                    bro_mime_type=line.mime_type,
                    real_mime_type=mime_type,
                    hash=dict(md5=getattr(line, 'md5', None),
                              sha1=getattr(line, 'sha1', None),
                              sha256=getattr(line, 'sha256', None)))
        print_file(json.dumps(info, cls=IPAddressJSONEncoder), file=INFO)
def test_latest_item_unicode_escape():
    text = (FRONT + END).replace("{'item': 2}", u"{u'Chinese \\u6c49\\u5b57': 2}")
    data = parse(text)
    assert data['latest_matches']['latest_item'] == u"{u'Chinese 汉字': 2}"
tree = create_tree(scen=scene, domain_union=False, subset=sub, optimize_agents=True)
save_tree(tree, tree_path(path_tmp, scene, sub))

# for agent in ['AFB', 'SynchBB']:
# iagt = 0
for agent in agt_name.keys():
    # iagt += 1
    # if iagt >= agt_start:
    print('\t- agent : ' + agent)

    ''' running the problem '''
    # print('\t\t- running DCOP')
    cmd(path_tmp, scene, agent, sub)

    ''' parsing the result '''
    # print('\t\t- parsing the result')
    df = parse(log_path(path_tmp, scene, sub, agent), debug=False)
    time = df['time'][0]
    nagts = df['agts'][0]
    nmes = df['mes'][0]
    nbytes = df['bytes'][0]
    nrow = [sub, agent, time, nagts, nmes, int(nmes / nagts), nbytes, int(nbytes / nagts)]
    big_df.loc[big_df.shape[0]] = nrow

''' writing the overall results '''
print('\n- writing all the results...')
big_df.to_csv('all-sub-incremental-3-15.csv')
print('\nall done.')