def consume(item):
    package = {}
    root = etree.fromstring(item, parser=etree.XMLParser(recover=True))
    log.debug("Processing " + str(root))
    # Initial process
    package['class'] = root.tag
    package['pubkey'] = root.get('key')
    package['authors'] = []
    for child in root:
        if child.tag == 'author':
            package['authors'] += [child.text]
        else:
            package[child.tag] = get_text_from_node(child)
    package['authors'] = list(set(package['authors']))
    # Post process
    package['type'] = package['pubkey'].split('/')[0]
    if 'year' in package and package['year'] is not None and package['year'].isdigit():
        package['year'] = int(package['year'])
    else:
        package['year'] = 0
    package['month'] = int(root.get('mdate').split('-')[1])
    if 'pages' in package and package['pages'] is not None:
        package['total_page'] = get_total_page(package['pages'])
    else:
        package['total_page'] = get_total_page('')
    # if 'volume' in package:
    #     package['volume'] = int(package['volume'])
    # if 'number' in package:
    #     package['number'] = int(package['number'])
    # Collect
    collector.collect(package)
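# A minimal sketch of the record consume() appears to expect -- a DBLP-style
# publication element (hypothetical values; get_text_from_node, get_total_page,
# log, and collector are assumed to be defined elsewhere in the module):
sample = (
    '<article key="journals/example/Doe20" mdate="2020-05-01">'
    '<author>Jane Doe</author>'
    '<title>An Example Title.</title>'
    '<year>2020</year>'
    '<pages>1-10</pages>'
    '</article>'
)
consume(sample)  # yields type='journals', year=2020, month=5, then collects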
def scan(self):
    collect(self.building_comboBox.currentText(), self.RP_comboBox.currentText(), 1)
    label = QLabel("Scan is successfully done", self.dialog)
    label.setGeometry(120, 70, 250, 24)
    label.move(100, 100)
    label.setObjectName("label")
    # btnDialog = QPushButton("Check collections by location", self.dialog)
    # btnDialog.setGeometry(1700, 1000, 250, 50)
    # btnDialog.move(130, 150)
    # btnDialog.clicked.connect(self.scan_check)
    btnDialog2 = QPushButton("Send data to server", self.dialog)
    btnDialog2.setGeometry(50, 50, 250, 50)
    btnDialog2.move(50, 250)
    btnDialog2.clicked.connect(self.dialog_close)
    btnDialog3 = QPushButton("Don't send data", self.dialog)
    btnDialog3.setGeometry(50, 50, 250, 50)
    btnDialog3.move(300, 250)
    btnDialog3.clicked.connect(self.dialog_close)
    self.dialog.setWindowTitle('scan complete')
    self.dialog.setWindowModality(Qt.ApplicationModal)
    self.dialog.resize(600, 400)
    self.dialog.show()
def test_collect(self):
    virt_platform = collector.collect(_CONFIG_PATH)
    self.assertTrue("hosts" in virt_platform)
    self.assertEqual(len(virt_platform["hosts"]), 4)
    for host in ['node1', 'node2', 'node3']:
        self.assertTrue("disks" in virt_platform["hosts"][host])
        self.assertEqual(virt_platform["hosts"][host]["nics"][0]['mac'], 'dddd')
def run(self):
    item = self.queue.get()
    while item is not None:
        try:
            url = item['url']
            key = item['key']
            constraint = item['constraint']
            data = fetch(url)
            if data is None:
                self.logger.info('Not fetched: %s because type != text/html', url)
            else:
                links = get_all_links(data, base=url)
                feeds = get_all_feeds(data, base=url)
                interesting = collect(links)
                if interesting:
                    self.collection_mutex.acquire()
                    if key not in self.collection:
                        self.collection[key] = {'feeds': {}}
                    if feeds:
                        for feed in feeds:
                            self.collection[key]['feeds'][feed['href']] = feed['type']
                    for service, accounts in interesting.items():
                        if service not in self.collection[key]:
                            self.collection[key][service] = {}
                        for a, u in accounts.items():
                            self.collection[key][service][a] = {'url': u, 'depth': constraint.depth}
                    self.collection_mutex.release()
                for l in links:
                    new_constraint = constraint.inherit(url, l)
                    if new_constraint is None:
                        continue
                    self.mutex.acquire()
                    if l not in self.visited_urls:
                        self.queue.put({'url': l, 'key': key, 'constraint': new_constraint})
                        self.visited_urls.add(l)
                    self.mutex.release()
        except HTTPError:
            self.logger.info('HTTPError exception on url: %s', url)
        self.queue.task_done()
        item = self.queue.get()
    self.queue.task_done()  # task_done on None
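# Hedged sketch of the shutdown handshake the sentinel loop above implies:
# one None per worker thread unblocks queue.get(), and the trailing
# task_done() call keeps queue.join() from hanging ('workers' is assumed
# to be the list of started worker threads, not taken from the source):
for _ in workers:
    queue.put(None)
queue.join()  # returns once every item, including the sentinels, is done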
def login():
    if request.method == 'POST':
        if request.form.get('submitBtn') == 'submitBtn':
            username = request.form.get('accountOptions')
            userinfo = get_userinfo(username)
            status = collect(userinfo['userid'], userinfo['token'])
            return redirect(url_for('user', username=username))
        elif request.form.get('removeBtn') == 'removeBtn':
            username = request.form.get('accountOptions')
            delete_user(username)
    usernames = get_usernames()
    return render_template('login.html', input_error=False, usernames=usernames)
def __collect_links(start_url, route_table, visit_limit):
    '''
    Test how link collecting works in a small sandbox
    @types: str, dict[str, tuple[int, str, str]], int -> dict[str, UrlInfo]
    @param route_table: routing table that defines the small subset of the WEB
        used for crawler testing; each record in the table is composed of:
        - URL to visit
        - status code and mime type for the HEAD request
        - list of outgoing links included in the visited page
    '''
    def get_from_route(url):
        print "GET from route: ", url
        record = route_table.get(url)
        code, content = 404, None
        if record:
            code, _, content = record
        response = mock.Mock()
        response.text = content
        response.status_code = code
        return response

    def head_from_route(url):
        r = route_table.get(url, (404, None, None))
        status_code, mimetype, _ = r
        response = mock.Mock()
        response.status_code = status_code
        response.headers.get.return_value = mimetype
        return response

    with contextlib.nested(
            mock.patch("requests.get", get_from_route),
            mock.patch("requests.head", head_from_route),
            mock.patch("grequests.get", get_from_route),
            mock.patch("grequests.head", head_from_route),
            mock.patch("grequests.map", list)):
        return collector.collect(start_url, visit_limit)
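# Hypothetical route_table matching the tuple layout documented above
# (URLs are made up; the third element is the page body served on GET):
route_table = {
    "http://test.local/": (200, "text/html",
                           '<a href="http://test.local/about">about</a>'),
    "http://test.local/about": (200, "text/html", "leaf page, no links"),
}
url_infos = __collect_links("http://test.local/", route_table, 10)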
if __name__ == "__main__":
    falgs, flags = sys.argv[1], sys.argv[2:]
    progress_bar_size = 10
    start = time.time()
    n = 2
    if "-m" in flags:  # model
        ftrained = flags[flags.index("-m") + 1]
    else:
        ftrained = "trained.txt"
    if "-c" in flags:  # collect
        fcollection = flags[flags.index("-c") + 1]
        collect(4800, fcollection)
    if "-t" in flags:  # train
        fcollection = flags[flags.index("-t") + 1]
        out("Processing " + fcollection + " - ")
        c = Analyzer.fromcollection(n, fcollection)
        c.save(ftrained)
        out("saved to " + ftrained + ".\n")
    # collect list of raw algorithms
    out("Processing " + falgs + " - extracted ")
    alg_re = re.compile(
        "([URFLDBMESurfldbxyz][2']?[2']? )+([URFLDBMESurfldbxyz][2']?[2']?)"
    )  # CE algs
    raw = []
    prevlen = str(len(raw))
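# Assumed command-line shape implied by the flag handling above
# (script and file names are placeholders, not from the source):
#
#   python script.py algs.txt -m trained.txt -c collection.txt -t collection.txt
#
#   argv[1]   file of raw algorithms to extract with alg_re
#   -m FILE   model file to load/save (default: trained.txt)
#   -c FILE   collect 4800 samples into FILE
#   -t FILE   train an Analyzer from FILE and save it to the model file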
def do_collect():
    print("Data updating...")
    return collect()
def collect_game(href):
    print(f'Collecting https://www.basketball-reference.com{href}')
    html = collect('https://www.basketball-reference.com' + href)
    if html is None:
        return []
    game_data = {}
    team_data = []
    player_data = []
    game_data['id'] = href.split('/')[2].split('.')[0]
    game_scoreboard = html.select('div.scorebox > div')[2]
    game_data['date'] = game_scoreboard.contents[1].contents[0]
    if len(game_scoreboard.contents[2].contents) >= 1:
        game_data['venue'] = game_scoreboard.contents[2].contents[0].split(',')[0]
    else:
        game_data['venue'] = ''
    is_home = 0
    for team_html in html.select('div.scorebox > div')[:2]:
        data = {}
        team_data.append(data)
        team_link = team_html.select('strong a')[0]
        data['id'] = team_link['href'].split('/')[2]
        data['name'] = team_link.contents[0]
        data['points'] = int(team_html.select('div.score')[0].contents[0])
        score_parts = list(map(int, team_html.contents[4].contents[0].split('-')))
        data['season_wins'] = score_parts[0]
        data['season_losses'] = score_parts[1]
        data['is_home'] = is_home
        is_home = 1
    # Roll the post-game records back to the teams' pre-game records
    if team_data[0]['points'] > team_data[1]['points']:
        team_data[0]['season_wins'] -= 1
        team_data[1]['season_losses'] -= 1
    else:
        team_data[1]['season_wins'] -= 1
        team_data[0]['season_losses'] -= 1
    for team in team_data:
        starting = 1
        for stat_row in html.select(f'#box-{team["id"]}-game-basic tbody tr'):
            data = {}
            if stat_row.has_attr('class'):
                # The "Reserves" separator row carries a class attribute
                starting = 0
            elif len(stat_row.select('td')) > 1:
                player_data.append(data)
                player_link = stat_row.select('th a')[0]
                data['id'] = player_link['href'].split('/')[3].split('.')[0]
                data['name'] = player_link.contents[0]
                data['starting'] = starting
                for stat_cell in stat_row.select('td'):
                    stat = stat_cell['data-stat']
                    if not stat.endswith('pct') and stat != 'plus_minus':
                        data[stat] = stat_cell.contents[0]
                for curr_team in team_data:
                    prefix = 'team_'
                    if curr_team['id'] != team['id']:
                        prefix = 'opp_team_'
                    for key in curr_team:
                        data[prefix + key] = str(curr_team[key])
                for key in game_data:
                    data[f'game_{key}'] = str(game_data[key])
    return player_data
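# Sketch of one player_data record produced above: stat keys mirror the box
# score's data-stat attributes (exact names vary), and team/game context is
# flattened in under team_, opp_team_, and game_ prefixes as strings, e.g.:
# {'id': 'doejo01', 'name': 'John Doe', 'starting': 1, 'pts': '23', ...,
#  'team_id': 'BOS', 'opp_team_points': '98', ..., 'game_date': '...'}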
year = start_year
while year <= end_year:
    for month in months:
        html = collect(
            f'https://www.basketball-reference.com/leagues/NBA_{year}_games-{month}.html'
        )
        if html is not None:
            for row in html.select('#schedule tr'):
                th = row.select('th')[0]
                if th.has_attr('csk'):
                    links = row.select('a')
                    if len(links) >= 4:
                        href = row.select('a')[3]['href']
                        player_data.extend(collect_game(href))
    keys = player_data[0].keys()
    with open(f'../data/nba_{year}.csv', 'w', encoding='utf-8', newline='') as file:
        dict_writer = csv.DictWriter(file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(player_data)  # assumed continuation of the truncated source
    year += 1  # assumed: advance to the next season so the loop terminates
def collect(symbol):
    """Get a stock from AV."""
    return collector.collect(symbol)