Python get_html示例，html.get_html Python示例

示例#1

0

显示文件

def get_match_lineups(matchID):
    """Return the first ten player IDs linked from a match page, plus the match ID.

    The match page is fetched through ``get_html``. An empty list is returned
    when the page cannot be fetched, when no player links are present, or when
    15 or fewer player links are found (treated as a layout change).
    """
    page = get_html(f"https://www.hltv.org/matches/{matchID}")
    if page is None:
        print(f"Failed for {matchID}")
        return []

    # Every anchor that points at a player profile
    anchors = re.findall('<a href="/player/.*/', page)
    if not anchors:
        print(f"{matchID} failed, no players detected")
        return []

    # The ID is the path segment that follows "/player/"
    ids = [anchor.split("/")[2] for anchor in anchors]

    # The page is only trusted when it carries more than 15 player links
    if len(ids) <= 15:
        print(f"HLTV altered lineup layout for {matchID}")
        return []

    # Row shape expected by pool.map: ten player IDs followed by the match ID
    return ids[:10] + [matchID]

示例#2

0

显示文件

def get_players(playerID):
    """Return ``[name, country, playerID]`` scraped from a player page.

    An empty list is returned when the page cannot be fetched or when either
    the name or the country marker is missing from the HTML.
    """
    page = get_html(f"https://www.hltv.org/player/{playerID}/a")
    if page is None:
        print(f"Failed for {playerID}")
        return []

    # Both markers must be present, otherwise the page is unusable
    name_hits = re.findall('Complete statistics for.*</a>', page)
    if not name_hits:
        return []
    country_hits = re.findall('class="flag" title=".*"> ', page)
    if not country_hits:
        return []

    # Strip the surrounding markup from the first hit of each kind
    name = name_hits[0].replace("Complete statistics for ", "")
    name = name.replace("</a>", "")
    country = country_hits[0].replace('class="flag" title="', "")
    country = country.replace('"> ', "")

    # Row shape expected by pool.map
    return [name, country, playerID]

示例#3

0

显示文件

def find_match_ids_at_url(url):
    """Return the match path fragments found on the results page at *url*.

    Each returned item is the part of a ``/matches/...`` link that follows the
    second ``/`` of the captured text. An empty list is returned when
    ``get_html`` yields ``None`` (the failure value every other scraper in
    this file checks for) instead of letting ``re.findall`` raise TypeError.
    """
    html = get_html(url)
    if html is None:
        print(f"Failed for {url}")
        return []

    # Capture every result entry that links to a match page
    raw_links = re.findall('"(.*?000"><a href="/matches/.*?)"', html)

    # Keep only what follows the "/matches/" prefix
    return [link.split('/', 2)[-1] for link in raw_links]

示例#4

0

显示文件

def find_match_ids_at_url(url):
    """Return up to 50 event IDs found on the archive page at *url*.

    The first ``events/<id>/`` hit on the page is skipped and the following
    fifty are returned. An empty list is returned when ``get_html`` yields
    ``None`` (the failure value every other scraper in this file checks for)
    instead of letting ``re.findall`` raise TypeError.
    """
    html = get_html(url)
    if html is None:
        print(f"Failed for {url}")
        return []

    # Every "events/<id>/..." fragment on the page
    fragments = re.findall('events/.*/', html)

    # The ID is the path segment right after "events/"
    event_ids = [fragment.split('/')[1] for fragment in fragments]

    # Skip the first hit and cap the result at 50 entries
    return event_ids[1:51]

示例#5

0

显示文件

def get_event_names(eventID):
    """Return ``[type, name, end_date, eventID, prize]`` for an event.

    The data is scraped from the event's results page. Any field whose marker
    is not found in the HTML is reported as ``0``. An empty list is returned
    when the page cannot be fetched.
    """
    page = get_html("https://www.hltv.org/results?offset=0&event=%s" %
                    (eventID))
    if page is None:
        print(f"Failed for {eventID}")
        return []

    # Collect the raw markup for every field up front
    type_hits = re.findall('title=".*">.*</span></td>', page)
    name_hits = re.findall('<div class="eventname">.*</div>', page)
    date_hits = re.findall('data-unix=".*">', page)
    prize_hits = re.findall(r'\$.*</td>', page)

    # Event type: the text between the first '>' and the closing span
    if type_hits:
        event_type = type_hits[0].split('>')[1].replace("</span", "")
    else:
        event_type = 0

    # Event name: the content of the eventname div
    if name_hits:
        event_name = name_hits[0].replace('<div class="eventname">', "")
        event_name = event_name.replace("</div>", "")
    else:
        event_name = 0

    # End date: quoted data-unix value with its last three characters dropped,
    # formatted as YYYY-MM-DD (UTC)
    if date_hits:
        stamp = date_hits[0].split('"')[1][:-3]
        end_date = datetime.utcfromtimestamp(int(stamp)).strftime('%Y-%m-%d')
    else:
        end_date = 0

    # Prize: the matched text without the dollar sign and closing tag
    if prize_hits:
        prize = prize_hits[0].replace("$", "").replace("</td>", "")
    else:
        prize = 0

    # Row shape expected by pool.map
    return [event_type, event_name, end_date, eventID, prize]

示例#6

0

显示文件

def get_new_iterable_items(page, startID):
    """Probe consecutive IDs after *startID* and return those that resolve.

    IDs are tried one by one, starting at ``startID + 1``, until ``get_html``
    returns ``None`` for one of them. Progress is written to stdout on a
    single, continuously overwritten line.
    """
    print(f"Checking for new {page}s. This may take awhile.")
    found = []
    candidate = startID

    # Keep stepping forward until a page fails to load
    while True:
        candidate += 1
        if get_html(f"https://www.hltv.org/{page}/{candidate}/a") is None:
            break
        sys.stdout.write('\r' + f"New {page} found: {candidate}")
        sys.stdout.flush()
        found.append(candidate)

    print(f"\nFound {len(found)} new {page}s.")
    return found

示例#7

0

显示文件

def get_teams(teamID):
    """Return ``[name, country, teamID, rank]`` scraped from a team page.

    ``rank`` is ``None`` when the page shows no ranking. An empty list is
    returned when the page cannot be fetched or when the team name or the
    country marker is missing.
    """
    page = get_html(f"https://www.hltv.org/team/{teamID}/a")
    if page is None:
        print(f"Failed for {teamID}")
        return []

    # Team name is mandatory
    name_hits = re.findall('<div><span class="subjectname">.*</span><br><i',
                           page)
    if not name_hits:
        return []

    # Ranking is optional
    rank_hits = re.findall(r'<a href="\/ranking\/teams">Ranked #(.*)<\/a>',
                           page)
    rank = rank_hits[0] if rank_hits else None

    # Country is mandatory; a second pattern is tried before giving up
    country_hits = re.findall('fa fa-map-marker" aria-hidden="true"></i>.*<',
                              page)
    if not country_hits:
        country_hits = re.findall(
            'fa fa-map-marker" aria-hidden="true"></i>.*</div>', page)
    if not country_hits:
        return []

    # Strip the markup around the first hit of each kind
    name = name_hits[0].replace('<div><span class="subjectname">', "")
    name = name.replace("</span><br><i", "")
    country = country_hits[0].replace(
        'fa fa-map-marker" aria-hidden="true"></i> ', "")
    country = country.split("<", 1)[0]

    # Row shape expected by pool.map
    return [name, country, teamID, rank]

示例#8

0

显示文件

def get_event_winners(eventID):
    """Return ``[eventID, *team_ids]`` for the team logos on an event page.

    When no team logo is found the result is ``[eventID, 0]``. An empty list
    is returned when the page cannot be fetched.
    """
    page = get_html("https://www.hltv.org/events/%s/a" % (eventID))
    if page is None:
        print(f"Failed for {eventID}")
        return []

    # Each hit looks like /team<c>logo/<id>..." class ; the ID is the 4th segment
    logo_hits = re.findall('/team.logo/.*" class', page)
    if logo_hits:
        winners = [
            hit.split('/')[3].replace('" class', "") for hit in logo_hits
        ]
    else:
        winners = [0]

    # Row shape expected by pool.map
    return [eventID] + winners

示例#9

0

显示文件

def get_event_rewards(eventID):
    """Return ``[eventID, *prizes]`` for the prize amounts on an event page.

    Each prize is the text that follows the dollar sign in a ``prizeMoney``
    element. When none is found the result is ``[eventID, 0]``. An empty list
    is returned when the page cannot be fetched.
    """
    page = get_html("https://www.hltv.org/events/%s/a" % (eventID))
    if page is None:
        print(f"Failed for {eventID}")
        return []

    # Every prizeMoney element that starts with a dollar amount
    prize_hits = re.findall(r'class="prizeMoney">\$.*<', page)
    if prize_hits:
        prizes = [hit.split('$')[1].replace("<", "") for hit in prize_hits]
    else:
        prizes = [0]

    # Row shape expected by pool.map
    return [eventID] + prizes

示例#10

0

显示文件

def get_finished_events(stop=0):
    """Collect ``[event_id, 0]`` pairs from the paginated events archive.

    Pages are read 50 entries at a time. Reading stops early once
    ``end_check`` reports that *stop* has been reached; everything from that
    entry onward is discarded. The result is returned in reverse page order,
    so the entry that appeared first on the first page comes last.
    """
    print("Looking for new completed events.")

    # The archive landing page tells us how many pages there are
    archive_html = get_html('https://www.hltv.org/events/archive')
    num_pages = int(find_num_pages(archive_html))

    collected = []
    page = 1
    for page_index in range(num_pages - 1):
        # Each archive page is addressed by an offset that grows in steps of 50
        offset = 50 * page_index
        batch = find_match_ids_at_url(
            f"https://www.hltv.org/events/archive?offset={offset}")

        # Add the new IDs to the master list
        collected.extend([event, 0] for event in batch)

        # Stop at the most recent known ID and drop it and everything after it
        if not end_check(collected, stop):
            cut = collected.index([stop, 0])
            collected = collected[:cut]
            break

        # Keep paginating and tell the user how far we are
        page += 1
        length = len(collected)
        print(f"Parsed page {page}. {length} events found so far.")

    print(f"Parsed {page} page(s).")
    # Reversed copy: the entry seen first ends up last
    return collected[::-1]

示例#11

0

显示文件

def get_match_events(matchID):
    """Return ``[matchID, event_id]`` for the event a match belongs to.

    The event ID is the path segment that follows ``/events/`` in the first
    event link on the match page. An empty list is returned when the page
    cannot be fetched or contains no event link.
    """
    html = get_html(f"https://www.hltv.org/matches/{matchID}")
    if html is None:
        print(f"Failed for {matchID}")
        return []

    # Every quoted link that points at an event page
    event_links = re.findall('"/events/.*/', html)
    if not event_links:
        print(f"Failed for {matchID}")
        return []

    # Always reduce the first hit to the bare ID. The previous `> 1` check
    # skipped this step when exactly one link was found and returned the raw
    # '"/events/<id>/' text instead.
    event_id = event_links[0].replace('"/events/', "").split("/", 1)[0]

    # Row shape expected by pool.map
    return [matchID, event_id]

示例#12

0

显示文件

def get_match_info(matchID):
    """Scrape date, teams, maps, sides and scores from a match page.

    Returns an empty list when the page cannot be fetched, when no team names
    or no score blocks are found, or when a score block yields fewer than six
    numbers.

    When more than one map was matched, the result is a list with one row per
    score block; otherwise it is a single flat row. A row is laid out as::

        [date, map, teamIDs[0], side, s[0], s[2], s[4], s[6],
         teamIDs[1], side, s[1], s[3], s[5], s[7], matchID]

    where ``s`` is the list of numbers extracted from the score block.
    NOTE(review): the even/odd indices presumably alternate between the two
    teams (total, halves, overtime) — confirm against the page markup.
    """
    html = get_html(f"https://www.hltv.org/matches/{matchID}")
    if html is None:
        print(f"Failed for {matchID}")
        return []

    # Find match date, team IDs, team names, map, and scores
    date = re.findall('data-unix=\".*\"', html)
    teamIDs = re.findall(
        'src=\"https://static.hltv.org/images/team/logo/.*\" class', html)
    teamNames = re.findall('class=\"logo\" title=\".*\">', html)
    map = re.findall('<div class=\"mapname\">.*</div>', html)
    scores = re.findall('<div class=\"results\"><span class=\".*</span><span>',
                        html)

    # Give up if no team names found
    if len(teamNames) < 1:
        return []

    # Find the match date: the second data-unix hit, with its last three
    # characters dropped, formatted as MM/DD/YY (UTC)
    # NOTE(review): when fewer than three hits exist, `date` stays a list
    # (with 0 appended) and is emitted as-is in the result rows.
    if len(date) > 2:
        date = date[1]
        date = (date.replace("data-unix=\"", "")).replace("\"", "")[:-3]
        date = datetime.utcfromtimestamp(int(date)).strftime('%m/%d/%y')
    else:
        date.append(0)

    # Find the Teams respective IDs
    # NOTE(review): teamIDs[1] is read whenever at least one logo matched, so
    # a page with exactly one match raises IndexError here; with no match at
    # all, teamIDs becomes [0] and the later teamIDs[1] reads fail instead.
    if len(teamIDs) > 0:
        teamIDs[0] = (teamIDs[0].replace(
            "src=\"https://static.hltv.org/images/team/logo/",
            "")).replace("\" class", "")
        teamIDs[1] = (teamIDs[1].replace(
            "src=\"https://static.hltv.org/images/team/logo/",
            "")).replace("\" class", "")
    else:
        teamIDs.append(0)

    # Find the map(s) that the match was played on
    # After this step `map` always holds at least one element (0 if nothing
    # was matched).
    if len(map) == 1:
        map[0] = (map[0].replace("<div class=\"mapname\">",
                                 "")).replace("</div>", "")
    elif len(map) > 1:
        for i in range(0, len(map)):
            map[i] = (map[i].replace("<div class=\"mapname\">",
                                     "")).replace("</div>", "")
    else:
        map.append(0)

    # Find the team starting and half sides
    # Two entries are appended per score block: the first team's side, then
    # the second team's. An IndexError (no "t"/"ct" marker in a block) is
    # swallowed, which leaves `sides` shorter than expected.
    sides = []
    try:
        if len(scores) == 1:
            if len(scores[0]) > 0:
                # If team 1 is T, team 2 is CT
                if re.findall('\"t\"|\"ct\"', scores[0])[0] == '\"t\"':
                    sides.append("T")
                    sides.append("CT")
                else:
                    sides.append("CT")
                    sides.append("T")
        elif len(scores) > 1:

            # Same as above, but looped for multiple matches
            for i in range(0, len(scores)):
                if len(scores[i]) > 0:
                    if re.findall('\"t\"|\"ct\"', scores[i])[0] == "\"t\"":
                        sides.append("T")
                        sides.append("CT")
                    else:
                        sides.append("CT")
                        sides.append("T")
        else:
            # No score block at all: nothing to report
            return []
    except IndexError:
        pass

    # Find the scores if there is only one map
    # Each score block is replaced by the list of digit runs it contains.
    if len(map) == 1:
        scores[0] = re.findall('\d+', scores[0])

    # Find the scores if there are multiple maps
    elif len(map) > 1:
        for i in range(0, len(scores)):
            scores[i] = re.findall('\d+', scores[i])
    else:
        # NOTE(review): looks unreachable, since `map` is never empty here
        scores.append(0)

    for i in range(0, len(scores)):
        # If there was no overtime, make the OT value 0
        if len(scores[i]) == 6:
            scores[i].append(0)
            scores[i].append(0)
        elif len(scores[i]) > 6:
            # Do nothing, because OT scores are already calculated
            pass
        else:
            print(f"HLTV altered score layout for {matchID}")
            return []

    # Make an array for pool.map to process
    result = []

    # Create counter variable to access the proper item in the sides array
    sideCount = 0
    if len(map) > 1:
        for i in range(0, len(scores)):
            # Create a temp array so that each map's stats are each contained in their own array
            tempArray = []
            tempArray.append(date)
            tempArray.append(map[i])
            tempArray.append(teamIDs[0])
            tempArray.append(sides[sideCount])
            tempArray.append(scores[i][0])
            tempArray.append(scores[i][2])
            tempArray.append(scores[i][4])
            tempArray.append(scores[i][6])
            tempArray.append(teamIDs[1])
            tempArray.append(sides[sideCount + 1])
            tempArray.append(scores[i][1])
            tempArray.append(scores[i][3])
            tempArray.append(scores[i][5])
            tempArray.append(scores[i][7])
            tempArray.append(matchID)
            result.append(tempArray)
            # Each score block consumed two entries of `sides`
            sideCount += 2
    else:
        # Single map: the row itself is the result (no nesting)
        result.append(date)
        result.append(map[0])
        result.append(teamIDs[0])
        result.append(sides[0])
        result.append(scores[0][0])
        result.append(scores[0][2])
        result.append(scores[0][4])
        result.append(scores[0][6])
        result.append(teamIDs[1])
        result.append(sides[1])
        result.append(scores[0][1])
        result.append(scores[0][3])
        result.append(scores[0][5])
        result.append(scores[0][7])
        result.append(matchID)
    return result

示例#13

0

显示文件

def get_player_stats(matchID):
    """Return one stats row per player and map for a match.

    Each row is ``[map, player, kills, deaths, adr, kast, rating, matchID]``.
    An empty list is returned when the page cannot be fetched or when map
    tabs, player links, K/D cells or rating cells are missing. Missing ADR or
    KAST cells are replaced by 70 empty strings. Rows for a map are cut short
    (with a message) as soon as any column runs out of values.
    """
    page = get_html(f"https://www.hltv.org/matches/{matchID}")
    if page is None:
        print(f"Failed for {matchID}")
        return []

    # Map names come from the stats tab IDs, with digits stripped; the first
    # tab is dropped
    map_divs = re.findall('<div class="stats-content" id=".*-content">', page)
    if not map_divs:
        print(f"No player stats for {matchID}")
        return []
    strip_digits = {ord(ch): None for ch in digits}
    map_names = [
        div.replace('<div class="stats-content" id="', "")
        .replace('-content">', "")
        .translate(strip_digits)
        for div in map_divs
    ][1:]

    # Player IDs: the link text after the prefix, with all slashes removed
    player_links = re.findall('href="/player/.*/', page)
    if not player_links:
        print(f"No player IDs for {matchID}")
        return []
    player_ids = [
        link.replace('href="/player/', "").replace("/", "")
        for link in player_links
    ]

    # Kills and deaths share one hyphenated cell
    kd_cells = re.findall('<td class="kd text-center">.*</td>', page)
    if not kd_cells:
        print(f"No player K/D for {matchID}")
        return []
    kill_counts = []
    death_counts = []
    for cell in kd_cells:
        text = cell.replace('<td class="kd text-center">', "")
        text = text.replace("</td>", "")
        dash = text.find('-')
        kill_counts.append(text[:dash])
        death_counts.append(text[dash + 1:])
    # Header cells read "K-D"; drop their halves
    death_counts = [value for value in death_counts if value != 'D']
    kill_counts = [value for value in kill_counts if value != 'K']

    # ADR is optional
    adr_cells = re.findall('<td class="adr text-center ">.*</td>', page)
    if adr_cells:
        adr_values = [
            cell.replace('<td class="adr text-center ">', "")
            .replace("</td>", "")
            for cell in adr_cells
        ]
    else:
        print(f"No player ADR for {matchID}")
        # Blank placeholders; the count may need adjusting for longer series
        adr_values = [""] * 70

    # KAST% is optional
    kast_cells = re.findall('<td class="kast text-center">.*</td>', page)
    if kast_cells:
        kast_values = [
            cell.replace('<td class="kast text-center">', "")
            .replace("%</td>", "")
            for cell in kast_cells
        ]
    else:
        print(f"No player KAST ratio for {matchID}")
        # Blank placeholders; the count may need adjusting for longer series
        kast_values = [""] * 70

    # Ratings: keep only the cells whose text parses as a float (this also
    # removes the "Rating" header cells)
    rating_cells = re.findall('<td class="rating text-center">.*</td>', page)
    if not rating_cells:
        print(f"No player Rating for {matchID}")
        return []
    ratings = []
    for cell in rating_cells:
        text = cell.replace('<td class="rating text-center">', "")
        text = text.replace("</td>", "")
        try:
            float(text)
        except ValueError:
            continue
        ratings.append(text)

    # Every map owns a block of ten consecutive entries in each column; the
    # first block (indices 0-9) is skipped
    rows = []
    for map_index, map_name in enumerate(map_names):
        first = 10 * (map_index + 1)
        try:
            for slot in range(first, first + 10):
                rows.append([
                    map_name,
                    player_ids[slot],
                    kill_counts[slot],
                    death_counts[slot],
                    adr_values[slot],
                    kast_values[slot],
                    ratings[slot],
                    matchID,
                ])
        except IndexError:
            print(f"Player stats error with {matchID}")
    return rows

示例#14

0

显示文件

文件： main.py 项目： themperek/piano

def message(html):
    """Build a minimal HTTP/1.0 200 response around ``html.get_html()``.

    Args:
        html: any object exposing ``get_html()`` that returns the page body
            as a string.

    Returns:
        The status line, a ``Content-Type: text/html`` header, the blank
        separator line and the body, joined with CRLF line endings.

    The header block is assembled explicitly instead of from an indented
    triple-quoted literal: the literal leaked the source indentation into the
    response, putting whitespace in front of ``Content-Type`` (which makes the
    header line invalid) and in front of the body.
    """
    head = "\r\n".join((
        "HTTP/1.0 200 OK",
        "Content-Type: text/html",
        "",  # blank line that ends the header section
        "",
    ))
    return head + html.get_html()

示例#15

0

显示文件

def get_match_map_bans(matchID):
    """Return one row per map veto step found on a match page.

    Rows are ``[matchID, team, step, action, map]`` for picks and removals,
    and ``[matchID, '', step, 'random', first_item]`` for everything else
    (left-over or random maps, and lines no keyword matched). Steps are
    numbered from 1 in page order. An empty list is returned when the page
    cannot be fetched or contains no veto lines.
    """
    page = get_html(f"https://www.hltv.org/matches/{matchID}")
    if page is None:
        print(f"Failed for {matchID}")
        return []

    # Veto lines look like "<div>N. ...<"
    veto_lines = re.findall(r'<div>.\..*<', page)

    markers = [' picked ', ' removed ', ' was left over', "random"]
    parsed = []
    for raw_line in veto_lines:
        # Drop the "<div>" prefix plus the three characters after it, and any
        # remaining "<"
        cleaned = re.sub('<div>...', '', raw_line).replace('<', '')

        # Split around the keyword; when several keywords occur the last one
        # in `markers` wins, and when none occurs the raw line is kept as is
        entry = raw_line
        for marker in markers:
            if marker in cleaned:
                entry = list(cleaned.partition(marker))
                # The keyword is stored without any spaces
                entry[1] = entry[1].replace(' ', '')
        parsed.append(entry)

    # Rows in the shape expected by pool.map
    rows = []
    for step, choice in enumerate(parsed, start=1):
        is_team_action = (len(choice) == 3 and 'wasleftover' not in choice
                          and 'random' not in choice)
        if is_team_action:
            # [Team, action, Map]
            rows.append([matchID, choice[0], step, choice[1], choice[2]])
        else:
            # No team involved: recorded as a random selection
            rows.append([matchID, '', step, 'random', choice[0]])
    return rows

示例#16

0

显示文件

文件： make_updated_files.py 项目： nsc-norway/publications

import json

import html

# Script: writes index.html (a JSON "structured-article" document whose
# content is a fixed HTML preamble followed by html.get_html()), then imports
# `graph` for its side effects and prints upload instructions.
# NOTE(review): `html` must resolve to the project-local module here — the
# standard-library `html` package has no get_html().

print("Generating files...")

constant_html = '<p class=\"MsoNormal\" style=\"margin-left:36.0pt;text-indent:-36.0pt\">&nbsp;</p>\n\n<p class=\"MsoNormal\">&nbsp;&nbsp;<img alt=\"\" height=\"333\" src=\"/publications/papers_per_year.png\" style=\"width: 726px; height: 333px;\" width=\"726\" /><br />\n&nbsp;</p>\n\n<p class=\"MsoNormal\" style=\"margin-left: 36pt; text-indent: -36pt;\"><span style=\"mso-ascii-font-family:Cambria;mso-hansi-font-family:Cambria;mso-no-proof:yes\">In case we had missed your publication in the list below, please let us know through our&nbsp;<a href=\"https://nettskjema.uio.no/answer/61221.html\">publication registration form</a>.</span></p>\n\n<p>&nbsp;</p>'

content = constant_html + html.get_html()

# json.dumps produces the quoted, fully escaped JSON string. The previous
# hand-rolled escaping covered only newlines and double quotes, so a
# backslash, tab or other control character in the content produced invalid
# JSON. ensure_ascii=False keeps non-ASCII text as UTF-8, as before.
with open("index.html", "w", encoding='utf-8') as f:
    f.write("""{
   "resourcetype": "structured-article",
   "properties": {
      "showAdditionalContent": "false",
      "title": "Publications",
      "content": %s,
      "hidePicture": "false"
   }
}""" % json.dumps(content, ensure_ascii=False))

import graph  #execute it

print("Upload index.html and papers_per_year.png to the WebDAV at:")
print("https://www-dav.sequencing.uio.no/publications/")

示例#17

0

显示文件

# logFile = "E:/python/log/log.log"
# sc = SparkContext("local","Simple App")
# logData = sc.textFile(logFile).cache()
#
# numAs = logData.filter(lambda s: 'a' in s).count()
# numBs = logData.filter(lambda s: 'b' in s).count()
#
# print("Lines with a: %i, lines with b: %i"%(numAs, numBs))
# output = open('E:/python/log/log.log', 'w+')
# Script entry point: walks game pages 1..39791 on stat-nba.com and builds one
# GameStats object per distinct index found in each page's data rows.
# NOTE(review): `html` and `game_stats` are not defined in the visible code;
# presumably project-local modules — confirm.
if __name__ == '__main__':
    # NOTE(review): data_dicts is not used in the visible part of the script
    data_dicts = []
    for i in range(1, 39792):
        url = "http://www.stat-nba.com/game/" + str(i) + ".html"

        # Fetch the page, then split it into data rows and a list of names
        data = html.get_html(url)
        data_list, name_list = html.get_data(data)

        # Progress output: the game number being processed
        print(i)
        data_entitys = []
        for data_itr in data_list:
            # The third field of a row is the index of the entity it belongs to
            idx = int(data_itr[2])

            # First row for this index: create its stats object and name it;
            # otherwise reuse the object created earlier
            if (idx >= len(data_entitys)):
                stats = game_stats.GameStats()
                stats.name = name_list[idx]
                data_entitys.append(stats)
            else:
                stats = data_entitys[idx]
            # Feed the row into the stats object
            stats.build(data_itr)