示例#1
0
def consume(item):
    """Parse one XML record and hand the resulting dict to the collector.

    The root element's tag is stored under ``'class'`` and its ``key``
    attribute under ``'pubkey'``. Each ``author`` child contributes its text
    to ``'authors'`` (duplicates removed, order not preserved); every other
    child is stored under its own tag using ``get_text_from_node``.

    Derived fields:
      * ``'type'``       -- part of the pubkey before the first ``/``.
      * ``'year'``       -- int when the parsed value is all digits, else 0.
      * ``'month'``      -- second ``-``-separated field of the root's
                            ``mdate`` attribute, as int.
      * ``'total_page'`` -- ``get_total_page`` of the ``pages`` value, or of
                            ``''`` when pages is missing or None.

    Args:
        item: XML text of a single record; parsed with a recovering parser.
    """
    xml_parser = etree.XMLParser(recover=True)
    root = etree.fromstring(item, parser=xml_parser)
    log.debug("Processing " + str(root))

    # Initial process: copy the raw element data into the package.
    package = {
        'class': root.tag,
        'pubkey': root.get('key'),
        'authors': [],
    }
    for child in root:
        if child.tag != 'author':
            package[child.tag] = get_text_from_node(child)
            continue
        package['authors'] += [child.text]
    package['authors'] = list(set(package['authors']))

    # Post process: derive normalized fields from the raw values.
    package['type'] = package['pubkey'].split('/')[0]

    raw_year = package.get('year')
    if raw_year is not None and raw_year.isdigit():
        package['year'] = int(raw_year)
    else:
        package['year'] = 0

    mdate_fields = root.get('mdate').split('-')
    package['month'] = int(mdate_fields[1])

    raw_pages = package.get('pages')
    package['total_page'] = get_total_page(
        raw_pages if raw_pages is not None else '')

    # 'volume' and 'number' are not converted to int here; they keep whatever
    # value get_text_from_node produced.

    # Collect
    collector.collect(package)
示例#2
0
    def scan(self):
        """Run one collection pass and show the "scan complete" dialog.

        Calls ``collect`` with the currently selected building and RP combo
        box texts (third argument fixed to 1), then populates ``self.dialog``
        with a status label and two buttons and shows it application-modal.
        """
        collect(self.building_comboBox.currentText(),
                self.RP_comboBox.currentText(), 1)

        label = QLabel("Scan is successfully done", self.dialog)
        label.setGeometry(120, 70, 250, 24)
        label.move(100, 100)
        label.setObjectName("label")

        # A "check collected data per location" button wired to
        # self.scan_check used to be created here; it is currently disabled.

        # NOTE(review): both buttons are connected to self.dialog_close only,
        # so "Send data to server" does not trigger any upload from here --
        # confirm whether dialog_close is expected to handle sending.
        send_button = QPushButton("Send data to server", self.dialog)
        send_button.setGeometry(50, 50, 250, 50)
        send_button.move(50, 250)
        send_button.clicked.connect(self.dialog_close)

        discard_button = QPushButton("Don't send data", self.dialog)
        discard_button.setGeometry(50, 50, 250, 50)
        discard_button.move(300, 250)
        discard_button.clicked.connect(self.dialog_close)

        self.dialog.setWindowTitle('scan complete')
        self.dialog.setWindowModality(Qt.ApplicationModal)
        self.dialog.resize(600, 400)
        self.dialog.show()
示例#3
0
    def test_collect(self):
        """collect() on the test config reports four hosts with disks and NICs.

        Checks that the result has a ``"hosts"`` entry of length 4 and that
        node1..node3 each expose ``"disks"`` and a first NIC whose MAC is
        ``'dddd'``.
        """
        virt_platform = collector.collect(_CONFIG_PATH)
        # assertIn reports the container on failure, unlike assertTrue(x in y).
        self.assertIn("hosts", virt_platform)
        hosts = virt_platform["hosts"]
        self.assertEqual(len(hosts), 4)

        # NOTE(review): four hosts are expected but only three are inspected;
        # confirm the fourth is intentionally left out.
        for host in ('node1', 'node2', 'node3'):
            self.assertIn("disks", hosts[host])
            self.assertEqual(hosts[host]["nics"][0]['mac'], 'dddd')
示例#4
0
	def run(self):
		"""Worker loop: process queue items until a ``None`` sentinel arrives.

		Each item is a dict with ``'url'``, ``'key'`` and ``'constraint'``.
		The URL is fetched; when data comes back, its links and feeds are
		extracted, results of ``collect(links)`` are merged into
		``self.collection[key]`` under ``self.collection_mutex``, and every
		link accepted by ``constraint.inherit`` that is not yet in
		``self.visited_urls`` is queued (guarded by ``self.mutex``).

		``task_done()`` is called once per item, including the sentinel.
		``HTTPError`` is logged and the item is skipped.
		"""
		item = self.queue.get()

		while item is not None:
			try:
				url = item['url']
				key = item['key']
				constraint = item['constraint']
				data = fetch(url)

				if data is None:
					self.logger.info('Not fetched: %s because type != text/html', url)
				else:
					links = get_all_links(data, base = url)
					feeds = get_all_feeds(data, base = url)
					interesting = collect(links)

					if interesting:
						self.collection_mutex.acquire()
						# try/finally so an exception while merging cannot
						# leave the mutex held and block the other workers.
						try:
							if key not in self.collection:
								self.collection[key] = {'feeds':{}}

							if feeds:
								for feed in feeds:
									self.collection[key]['feeds'][feed['href']] = feed['type']

							for service, accounts in interesting.items():
								if service not in self.collection[key]:
									self.collection[key][service]  = {}

								for a,u in accounts.items():
									self.collection[key][service][a] = {'url': u, 'depth':constraint.depth}
						finally:
							self.collection_mutex.release()

					for l in links:
						new_constraint = constraint.inherit(url, l)
						if new_constraint is None:
							continue

						self.mutex.acquire()
						try:
							if l not in self.visited_urls:
								self.queue.put({'url':l, 'key':key, 'constraint': new_constraint})
								self.visited_urls.add(l)
						finally:
							self.mutex.release()

			except HTTPError:
				self.logger.info('HTTPError exception on url: %s', url)

			# NOTE(review): any exception other than HTTPError propagates out
			# of the loop before task_done() is reached for this item.
			self.queue.task_done()

			item = self.queue.get()

		self.queue.task_done() # task_done on None
def login():
    """Handle the login page.

    On POST, the account chosen in ``accountOptions`` is either used to run
    ``collect`` with that user's id and token, followed by a redirect to the
    ``user`` view (``submitBtn``), or deleted (``removeBtn``). Every other
    case, including after a removal, renders ``login.html`` with the current
    list of usernames.
    """
    if request.method == 'POST':
        username = request.form.get('accountOptions')
        if request.form.get('submitBtn') == 'submitBtn':
            userinfo = get_userinfo(username)
            # The return value of collect() is not used here.
            collect(userinfo['userid'], userinfo['token'])
            return redirect(url_for('user', username=username))
        if request.form.get('removeBtn') == 'removeBtn':
            delete_user(username)
    return render_template('login.html',
                           input_error=False,
                           usernames=get_usernames())
示例#6
0
def __collect_links(start_url, route_table, visit_limit):
    ''' Run collector.collect against a small in-memory routing table.

    @types: str, dict[str, tuple[int, str, str]], int -> dict[str, UrlInfo]

    @param start_url: URL passed to collector.collect as the starting point
    @param route_table:
        routing table defining the small subset of the WEB used for crawler
        testing; maps a URL to a 3-tuple of:
        - status code
        - mime type reported for the HEAD request
        - content served as the response text for the GET request
        URLs missing from the table answer with status 404.
    @param visit_limit: forwarded unchanged to collector.collect
    '''
    def get_from_route(url):
        # Stand-in for requests.get / grequests.get.
        # The two spaces after the colon keep the output identical to the
        # former Python 2 statement `print "GET from route: ", url`.
        print("GET from route:  %s" % (url,))
        record = route_table.get(url)
        code, content = 404, None
        if record:
            code, _, content = record
        response = mock.Mock()
        response.text = content
        response.status_code = code
        return response

    def head_from_route(url):
        # Stand-in for requests.head / grequests.head.
        status_code, mimetype, _ = route_table.get(url, (404, None, None))
        response = mock.Mock()
        response.status_code = status_code
        # Any header lookup on the mocked response yields the mime type.
        response.headers.get.return_value = mimetype
        return response

    # Multi-manager `with` replaces contextlib.nested, which is deprecated in
    # Python 2.7 and absent from Python 3.
    with mock.patch("requests.get", get_from_route), \
            mock.patch("requests.head", head_from_route), \
            mock.patch("grequests.get", get_from_route), \
            mock.patch("grequests.head", head_from_route), \
            mock.patch("grequests.map", list):
        return collector.collect(start_url, visit_limit)
示例#7
0

if __name__ == "__main__":
    # Command line: the first argument is the algorithms file; the remaining
    # arguments are flags:
    #   -m FILE   trained-model file (defaults to "trained.txt")
    #   -c        collect into the collection file
    #   -t FILE   train from the collection file and save the model
    falgs, flags = sys.argv[1], sys.argv[2:]
    progress_bar_size = 10
    start = time.time()
    n = 2

    # model
    ftrained = flags[flags.index("-m") + 1] if "-m" in flags else "trained.txt"

    if "-c" in flags:  # collect
        # The collection file is the value following "-t" when that flag is
        # given (unchanged behavior). Without "-t" the lookup used to raise
        # ValueError; fall back to the value following "-c" instead.
        path_flag = "-t" if "-t" in flags else "-c"
        fcollection = flags[flags.index(path_flag) + 1]
        collect(4800, fcollection)

    if "-t" in flags:  # train
        fcollection = flags[flags.index("-t") + 1]
        out("Processing " + fcollection + " - ")
        c = Analyzer.fromcollection(n, fcollection)
        c.save(ftrained)
        out("saved to " + ftrained + ".\n")

    # collect list of raw algorithms
    out("Processing " + falgs + " - extracted ")
    # One or more space-terminated moves followed by a final move; each move
    # is a face/slice/rotation letter with up to two "2" or "'" modifiers.
    alg_re = re.compile(
        "([URFLDBMESurfldbxyz][2']?[2']? )+([URFLDBMESurfldbxyz][2']?[2']?)"
    )  # CE algs
    raw = []
    prevlen = str(len(raw))
示例#8
0
def do_collect():
    """Print a progress message, then return whatever ``collect()`` returns."""
    print("Data updating...")
    outcome = collect()
    return outcome
示例#9
0
def collect_game(href):
    """Scrape one box-score page into a list of per-player row dicts.

    Args:
        href: site-relative path of the game page; the game id is taken from
            its third ``/``-separated segment, up to the first ``.``.

    Returns:
        A list with one dict per player row found in each team's
        ``#box-<team id>-game-basic`` table. Each dict holds the player's id,
        name and ``starting`` flag, the row's stats (cells whose ``data-stat``
        ends in ``pct`` or equals ``plus_minus`` are left out), the
        stringified fields of both teams prefixed ``team_`` / ``opp_team_``,
        and the stringified game fields prefixed ``game_``. An empty list is
        returned when ``collect`` yields ``None`` for the page.
    """
    print(f'Collecting https://www.basketball-reference.com{href}')
    html = collect('https://www.basketball-reference.com' + href)
    if html is None:
        return []

    # --- game-level fields -------------------------------------------------
    game_data = {'id': href.split('/')[2].split('.')[0]}
    game_scoreboard = html.select('div.scorebox > div')[2]
    game_data['date'] = game_scoreboard.contents[1].contents[0]
    venue_node = game_scoreboard.contents[2]
    if len(venue_node.contents) >= 1:
        game_data['venue'] = venue_node.contents[0].split(',')[0]
    else:
        game_data['venue'] = ''

    # --- team-level fields: first scorebox entry gets is_home=0, second 1 ---
    team_data = []
    for is_home, team_html in enumerate(
            html.select('div.scorebox > div')[:2]):
        team_link = team_html.select('strong a')[0]
        entry = {
            'id': team_link['href'].split('/')[2],
            'name': team_link.contents[0],
            'points': int(team_html.select('div.score')[0].contents[0]),
        }
        record = [
            int(part)
            for part in team_html.contents[4].contents[0].split('-')
        ]
        entry['season_wins'] = record[0]
        entry['season_losses'] = record[1]
        entry['is_home'] = is_home
        team_data.append(entry)

    # Take this game's result back out of the win-loss records; the second
    # team is treated as the winner unless the first scored strictly more.
    if team_data[0]['points'] > team_data[1]['points']:
        winner, loser = team_data[0], team_data[1]
    else:
        winner, loser = team_data[1], team_data[0]
    winner['season_wins'] -= 1
    loser['season_losses'] -= 1

    # --- player rows -------------------------------------------------------
    player_data = []
    for team in team_data:
        starting = 1
        row_selector = '#box-' + team['id'] + '-game-basic tbody tr'
        for stat_row in html.select(row_selector):
            if stat_row.has_attr('class'):
                # Every player row after a row carrying a class attribute is
                # recorded with starting = 0.
                starting = 0
                continue
            if len(stat_row.select('td')) <= 1:
                continue

            player_link = stat_row.select('th a')[0]
            data = {
                'id': player_link['href'].split('/')[3].split('.')[0],
                'name': player_link.contents[0],
                'starting': starting,
            }
            player_data.append(data)

            for stat_cell in stat_row.select('td'):
                stat = stat_cell['data-stat']
                if stat.endswith('pct') or stat == 'plus_minus':
                    continue
                data[stat] = stat_cell.contents[0]

            for curr_team in team_data:
                if curr_team['id'] == team['id']:
                    prefix = 'team_'
                else:
                    prefix = 'opp_team_'
                for key, value in curr_team.items():
                    data[prefix + key] = str(value)

            for key, value in game_data.items():
                data[f'game_{key}'] = str(value)
    return player_data
示例#10
0
                for curr_team in team_data:
                    prefix = 'team_'
                    if curr_team['id'] != team['id']:
                        prefix = 'opp_team_'
                    for key in curr_team:
                        data[prefix + key] = str(curr_team[key])
                for key in game_data:
                    data[f'game_{key}'] = str(game_data[key])
    return player_data


# For each season from start_year up to end_year: fetch every monthly
# schedule page, collect the player rows of each linked game, then write a
# CSV for that season.
year = start_year
while year <= end_year:
    for month in months:
        html = collect(
            f'https://www.basketball-reference.com/leagues/NBA_{year}_games-{month}.html'
        )
        # A None result from collect() means there is nothing to parse for
        # this month; move on to the next one.
        if html is not None:
            for row in html.select('#schedule tr'):
                th = row.select('th')[0]
                # Only rows whose header cell carries a `csk` attribute are
                # processed.
                if th.has_attr('csk'):
                    links = row.select('a')
                    # The fourth link of the row is handed to collect_game --
                    # presumably the box-score link; TODO confirm.
                    if len(links) >= 4:
                        href = row.select('a')[3]['href']
                        player_data.extend(collect_game(href))

    # Column names come from the first collected row.
    # NOTE(review): raises IndexError when player_data is empty.
    keys = player_data[0].keys()
    with open(f'../data/nba_{year}.csv', 'w', encoding='utf-8',
              newline='') as file:
        dict_writer = csv.DictWriter(file, keys)
        dict_writer.writeheader()
示例#11
0
def collect(symbol):
    """Return the result of ``collector.collect`` for the given stock symbol.

    Presumably backed by Alpha Vantage ("AV" in the original note) -- confirm
    against the collector module.
    """
    stock = collector.collect(symbol)
    return stock