# Shared imports for these scraping snippets; Preprocess is assumed to be a
# project-local helper exposing a static preprocess() text-cleaning method.
from urllib.request import urlopen

from bs4 import BeautifulSoup

import Preprocess


def getBowlerInfo(url):
    webpage = urlopen(url).read()
    soup = BeautifulSoup(webpage, "html5lib")
    data = {}
    data['name'] = Preprocess.preprocess(soup.find(class_ = 'ciPlayernametxt').find('h1').text)\
        .strip()
    for p in soup.find_all(class_='ciPlayerinformationtxt'):
        if 'Bowling style' in p.text:
            data['type'] = Preprocess.preprocess(p.find('span').text)
            break
    return data
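
# A minimal usage sketch (the player URL is a hypothetical example):
#   bowler = getBowlerInfo('http://www.espncricinfo.com/ci/content/player/49764.html')
#   # -> {'name': ..., 'type': ...}; 'type' is only set when the profile
#   #    lists a "Bowling style".
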
def scrapeByYear(player, country, year):
    webpage = urlopen(
        "http://www.espncricinfo.com/ci/engine"
        "/series/index.html?season=" + year + ";view=season") \
        .read()

    soup = BeautifulSoup(webpage, "html5lib")
    match_types = soup.find_all(class_='match-section-head')
    all_serieses = soup.find_all(class_='series-summary-wrap')

    # select ODI serieses
    k = 0
    for i in range(len(match_types)):
        if Preprocess.preprocess(
                match_types[i].find('h2').text) == 'One-Day Internationals':
            k = i
            break
    odi_serieses = all_serieses[k]

    for series in odi_serieses.find_all(class_='series-summary-block'):
        series_url = "http://www.espncricinfo.com" + series.find(
            class_='teams').find('a').get('href')
        if country in series.find(class_='teams').text \
                or country in series.find(class_='date-location').text:
            scrapeSeries(player, country, series_url)
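
# Example call (hypothetical arguments); `year` must be a string because it
# is concatenated into the URL:
#   scrapeByYear('KC Sangakkara', 'Sri Lanka', '2015')
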
def scrapeSeries(player, country, url):
    print("series : " + url)
    webpage = urlopen(url).read()
    soup = BeautifulSoup(webpage, "html5lib")
    match_summaries = soup.find_all('span', class_='potMatchLink')
    i = 0
    for match in soup.find_all(class_='potMatchMenuLink'):
        if "Scorecard" in match.text:
            match_url = match.get('href')
            country_name = '-'.join(country.lower().strip().split(" "))
            if country_name in match_url:
                scrapeMatch(
                    player, country, match_url,
                    Preprocess.preprocess(match_summaries[i].parent.text))
            i += 1
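
# Note: the counter `i` above pairs each "Scorecard" menu link with the
# matching entry in `match_summaries`; this relies on the series page listing
# both in the same order.
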
def scrapeMatch(player, country, url, heading):
    print('match : ' + url)
    # url = url.replace('scorecard', 'commentary').strip('/') + '?innings=2&filter=wickets'
    webpage = urlopen(url).read()
    soup = BeautifulSoup(webpage, "html5lib")
    scorecards = soup.find_all(class_='scorecard-section batsmen')

    dismissal = {}
    player_dismissal = None
    isBreak = False

    wayOut = ""
    player_name = player
    numbers = None
    i = 0
    for scorecard in scorecards:
        for row in scorecard.find_all(class_='flex-row'):
            if row.find(class_='wrap batsmen'):
                player_name = Preprocess.preprocess(
                    row.find(class_='wrap batsmen').find(
                        class_='cell batsmen').find('a').text)
                if player in player_name:
                    wayOut = row.find(class_='wrap batsmen').find(
                        class_='cell commentary').text
                    if "not out" not in wayOut:
                        wayOut = Preprocess.preprocess(
                            row.find(class_='wrap batsmen').find(class_='cell commentary') \
                                .find('a').text)
                        number_headings_temp = scorecard.find(
                            class_='wrap header').find_all(class_='cell runs')
                        number_headings = [
                            Preprocess.preprocess(x.text)
                            for x in number_headings_temp
                        ]
                        numbers = row.find(class_='wrap batsmen').find_all(
                            class_='cell runs')
                        player_dismissal = row.find(class_='content')
                    isBreak = True
                    break
        if isBreak:
            break
        i += 1
    if player_dismissal is not None:
        dismissal['player'] = player_name
        dismissal['player_innings'] = {}
        dismissal['player_innings']['runs'] = Preprocess.preprocess(
            numbers[number_headings.index("R")].text)
        dismissal['player_innings']['balls'] = Preprocess.preprocess(
            numbers[number_headings.index("B")].text)
        dismissal['player_innings']['4s'] = Preprocess.preprocess(
            numbers[number_headings.index("4s")].text)
        dismissal['player_innings']['6s'] = Preprocess.preprocess(
            numbers[number_headings.index("6s")].text)

        # dismissal['venue'] = heading.split("-")[0].split('at')[1].strip()
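        # The heading is assumed to look like
        # "1st ODI: Australia v India at Sydney - Jan 12, 2016": the date is
        # the piece after the first "-", and the teams sit between ":" and
        # "at" (see the split further down).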
        dismissal['date'] = heading.split("-")[1].strip()
        dismissal['stadium'] = Preprocess.preprocess(
            soup.find(class_='stadium-details').find('span').text)
        dismissal['innings'] = i + 1
        dismissal['bowler'] = {}
        dismissal['team'] = {}
        dismissal['opposition'] = {}
        dismissal['team']['country'] = country
        dismissal['wayOut'] = wayOut
        dismissal['scoreAt'] = Preprocess.preprocess(
            player_dismissal.find_all('span')[1].text).strip()
        dismissal['ball'] = Preprocess.preprocess(
            player_dismissal.find_all('span')[0].text).strip()
        dismissal['description'] = Preprocess.preprocess(
            player_dismissal.text).strip()

        countries = heading.split("-")[0].split(":")[1].split("at")[0].split(
            "v")
        for c in countries:
            if country not in c:
                dismissal['opposition']['country'] = c.strip()
                break

        if len(scorecards) > 1:
            dismissal['opposition']['total'] = Preprocess.preprocess(scorecards[1 - i] \
                                                                     .find(class_='wrap total') \
                                                                     .find_all('div')[1].text)
        else:
            dismissal['opposition']['total'] = 'DNB'

        dismissal['team']['total'] = Preprocess.preprocess(scorecards[i] \
                                                           .find(class_='wrap total') \
                                                           .find_all('div')[1].text)
        dismissal['bowler'] = {}
        if 'run out' not in wayOut and 'retired hurt' not in wayOut:
            bowler = None
            temp = wayOut.split(" ")
            for t in temp:
                if t.strip() == 'b':
                    bowler = temp[temp.index(t) + 1]
            isBreak = False
            if bowler is not None:
                for bowlerSection in soup.find_all(
                        class_='scorecard-section bowling'):
                    for link in bowlerSection.find_all('a'):
                        # print(link.text)
                        if bowler in link.text:
                            dismissal['bowler'] = getBowlerInfo(link.get('href'))
                            isBreak = True
                            break
                    if isBreak:
                        break
        print(dismissal)
        dismissals.append(dismissal)
def getPlayerInfo(url):
    # Sending the http request
    webpage = urlopen(url).read()
    # making the soup! yummy ;)
    soup = BeautifulSoup(webpage, "html5lib")

    data = {}
    data['bio'] = {}
    data['statistics'] = {}

    data['bio']['name'] = Preprocess.preprocess(
        soup.find(class_='ciPlayernametxt').find('h1').text).strip()

    data['bio']['country'] = soup.find(
        class_='PlayersSearchLink').find('b').text

    bio_details = soup.find_all(class_='ciPlayerinformationtxt')
    for det in bio_details:
        data['bio'][det.find('b').text] = Preprocess.preprocess(
            det.find('span').text).strip()

    stat_tables = soup.find_all(class_='engineTable')
    # keep only tables that have a header row; removing items while iterating
    # over the same list would skip elements
    stat_tables = [table for table in stat_tables if table.find('thead')]

    stat_headings = []
    for head in soup.find_all('span', class_='ciPhotoWidgetLink'):
        stat_headings.append(Preprocess.preprocess(head.text))

    # keep only the headings that have a matching stat table; filtering into a
    # new list avoids mutating stat_headings while iterating over it
    stat_headings = [
        head for head in stat_headings
        if head in ("Batting and fielding averages", "Bowling averages",
                    "Recent matches")
    ]
    for head in stat_headings:
        data['statistics'][head] = {}

    for k in range(len(stat_tables)):
        keys = []
        if stat_headings[k] != 'Recent matches':
            # Batting and bowling
            for col_name in stat_tables[k].find_all('th'):
                key = Preprocess.preprocess(col_name.text)
                if key == '10':
                    key += 'w'
                keys.append(key)
            for row in stat_tables[k].find('tbody').find_all('tr'):
                tds = row.find_all('td')

                if tds[0].find('a'):
                    head = Preprocess.preprocess(
                        tds[0].find('a').find('span').find('b').text)
                else:
                    head = Preprocess.preprocess(tds[0].find('b').text)

                data['statistics'][stat_headings[k]][head] = {}

                for j in range(1, len(tds)):
                    data['statistics'][stat_headings[k]][head][
                        keys[j]] = Preprocess.preprocess(tds[j].text)
        else:
            # recent scores
            for col_name in stat_tables[k].find_all('th'):
                keys.append(Preprocess.preprocess(col_name.text))
            matches = []
            for row in stat_tables[k].find('tbody').find_all('tr'):
                tds = row.find_all('td')
                match = {}
                for i in range(len(tds)):
                    if tds[i].find('a'):
                        d = Preprocess.preprocess(
                            tds[i].find('a').text).replace(' ', '')
                    else:
                        d = Preprocess.preprocess(
                            tds[i].text).replace(' ', '')
                    if keys[i] == 'Opposition':
                        d = d.strip('v')
                    match[keys[i]] = d
                matches.append(match)
            data['statistics'][stat_headings[k]] = matches
    return data
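
# A minimal usage sketch (hypothetical URL):
#   profile = getPlayerInfo('http://www.espncricinfo.com/ci/content/player/49764.html')
#   print(profile['bio']['name'], profile['bio']['country'])
#   print(profile['statistics']['Batting and fielding averages'])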
Example #6
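# Assumed context for this snippet: it is a method of a CLI driver class, with
# load_model imported from keras.models, pandas imported as pnd, and the
# project-local Preprocess, RespRatePredictor and data-reader (self.dr)
# helpers available.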
    def run(self):
        model = None
        scaler = None
        while True:
            self.find_data_files()

            print(
                "Enter:\nt - to train the model, \ne - to test a trained model, \nl - to load a pre-trained model\nq - to quit"
            )
            print(
                "IMPORTANT: Always train or load a model before testing it!\n")
            choice = input("Your choice: ")
            if choice is "l":
                model = load_model("model.h5")
            elif choice is "t" or choice is "e":
                print("Data files:\n")
                for i, file in enumerate(self.__files):
                    print("{} - {}".format(i + 1, file))

                if choice is "t":
                    number = input(
                        "\nPlease select the file to train the model on: ")
                else:
                    number = input(
                        "\nPlease select the file to test the model on: ")

                index = int(number)
                index -= 1

                if 0 <= index < len(self.__files):
                    data = self.dr.read_set(self.__data_indices[index])

                    pp = Preprocess()
                    data, scaler = pp.clean_up(data)
                    data = pp.convert_to_supervised(data, sample_shift=0)
                    if choice is "t":
                        train, test = pp.prepare_sets(data, 0.2)
                        train_X, train_y = pp.make_input_output(
                            train, remove_resp_from_input=True)
                        test_X, test_y = pp.make_input_output(
                            test, remove_resp_from_input=True)
                        trainer = RespRatePredictor()
                        self.dr.plot(data)
                        model = trainer.make_network(
                            input_shape=(train_X.shape[1], train_X.shape[2]))
                        model = trainer.fit_network(model, train_X, train_y,
                                                    test_X, test_y)
                        model.save("model_{0:0>2}.h5".format(
                            self.__data_indices[index - 1]))
                    else:
                        all_X, all_y = pp.make_input_output(
                            data.drop("Time [s]", axis=1),
                            remove_resp_from_input=True)
                        predict_y = model.predict(all_X, batch_size=640)
                        # min_ = scaler.min_[1]
                        # scale_ = scaler.scale_[1]

                        # predict_y = (predict_y - min_) / scale_
                        predicted = pnd.DataFrame(
                            {"RESP_PREDICTED": predict_y.flatten()})

                        fused = pnd.concat([data, predicted], axis=1)
                        self.dr.plot(fused)
                        self.dr.plot_detail(fused)
                else:
                    continue
            else:
                break
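
# Workflow sketch, inferred from the calls above: clean_up() normalises the
# raw signals and returns the fitted scaler, convert_to_supervised() reshapes
# them into supervised samples, prepare_sets(data, 0.2) holds out 20% for
# validation, and fit_network() trains the model on the rest.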
Example #7
def scrapeMatch(dismissal, url):
    print('match : ' + url)
    webpage = urlopen(url).read()
    soup = BeautifulSoup(webpage, "html5lib")
    scorecards = soup.find_all(class_='scorecard-section batsmen')
    if len(scorecards) == 0:
        return False

    player_dismissal = None
    isBreak = False

    wayOut = ""
    player_name = dismissal['batsman']['name']

    i = 0
    for scorecard in scorecards:
        for row in scorecard.find_all(class_='flex-row'):
            if row.find(class_='wrap batsmen'):
                player_name = Preprocess.preprocess(
                    row.find(class_='wrap batsmen').find(
                        class_='cell batsmen').find('a').text)
                nameMatch = False

                # use the surname when available; a one-word name falls back
                # to the name itself ([-1] of a single-element split)
                n = dismissal['batsman']['name'].split(" ")[-1]
                for n2 in player_name.split(" "):
                    if n in n2:
                        nameMatch = True
                        break

                if nameMatch:
                    if row.find(class_='wrap batsmen').find(
                            class_='cell commentary').find('a'):
                        wayOut = Preprocess.preprocess(
                            row.find(class_='wrap batsmen').find(class_='cell commentary') \
                                .find('a').text)
                    else:
                        wayOut = Preprocess.preprocess(
                            row.find(class_='wrap batsmen').find(
                                class_='cell commentary').text)
                    player_dismissal = row.find(class_='content')
                    isBreak = True
                    break
        if isBreak:
            break
        i += 1
    if player_dismissal is None:
        # the batsman was not found on any scorecard; indexing the totals
        # below would fail, so bail out like the other error paths
        return False

    dismissal['bowler'] = {}
    dismissal['scoreAt'] = Preprocess.preprocess(
        player_dismissal.find_all('span')[1].text).strip()
    dismissal['ball'] = Preprocess.preprocess(
        player_dismissal.find_all('span')[0].text).strip()

    for s in player_dismissal.find_all('span'):
        s.decompose()
    dismissal['description'] = Preprocess.preprocess(
        player_dismissal.text).strip()

    if len(scorecards) > 1:
        dismissal['opposition']['total'] = Preprocess.preprocess(scorecards[1 - i] \
                                                                 .find(class_='wrap total') \
                                                                 .find_all('div')[1].text)
    else:
        dismissal['opposition']['total'] = 'DNB'

    dismissal['team']['total'] = Preprocess.preprocess(scorecards[i] \
                                                       .find(class_='wrap total') \
                                                       .find_all('div')[1].text)
    dismissal['bowler'] = {}
    if 'run out' not in wayOut and 'retired hurt' not in wayOut:
        bowler = None
        temp = wayOut.split(" ")
        for t in temp:
            if t.strip() == 'b':
                bowler = temp[temp.index(t) + 1]
        isBreak = False
        if bowler is not None:
            for bowlerSection in soup.find_all(
                    class_='scorecard-section bowling'):
                for link in bowlerSection.find_all('a'):
                    if bowler in link.text:
                        dismissal['bowler'] = getBowlerInfo(link.get('href'))
                        isBreak = True
                        break
                if isBreak:
                    break
    else:
        return False
    return dismissal
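
# This version returns the enriched dismissal dict on success and False when
# the scorecard is missing, the batsman cannot be found, or the dismissal was
# a run out / retired hurt, so callers can filter results with a truth test.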
Example #8
import json

def scrapePlayerDismissals(player_name):
    # avoid shadowing the built-in id()
    dismissal_id = 0
    dismissals = []
    webpage = urlopen('http://stats.espncricinfo.com'
                      '/ci/engine/stats/analysis.html?'
                      'search=' + ('+'.join(player_name.split(' '))) +
                      ';template=analysis').read()
    soup = BeautifulSoup(webpage, "html5lib")

    player_link = None
    for link in soup.find_all('a'):
        if 'One-Day Internationals player' in link.text:
            player_link = link
            break
    if player_link is not None:
        player_country = Preprocess.preprocess(
            player_link.parent.parent.find_all('td')[1].text)

        soup_batsman = BeautifulSoup(
            urlopen("http://stats.espncricinfo.com" + player_link.get('href')),
            "html5lib").find(class_='ciPhotoContainer')

        batsman = {}
        batsman['name'] = player_name
        for p in soup_batsman.find_all('p'):
            if 'right-hand bat' in p.text:
                batsman['batting-hand'] = 'right'
                break
            elif 'left-hand bat' in p.text:
                batsman['batting-hand'] = 'left'
                break

        player_url = player_link.get('href').split(';')[0]
        innings_url = 'http://stats.espncricinfo.com' + player_url + \
                      ';filter=advanced;orderby=start;outs=1;' \
                      'template=results;type=batting;view=innings'

        innings_webpage = urlopen(innings_url)
        innings_html = BeautifulSoup(innings_webpage, "html5lib")

        innings_table = None
        for table in innings_html.find_all(class_='engineTable'):
            if table.find('caption') and \
                            'Innings by innings list' in table.find('caption').text:
                innings_table = table
                break

        if innings_table is None:
            return

        for row in innings_table.find('tbody').find_all('tr'):
            dismissal = {}
            dismissal['batsman'] = batsman
            dismissal['player_innings'] = {}
            dismissal['dismissal'] = {}
            dismissal['opposition'] = {}
            dismissal['team'] = {}
            dismissal['team']['country'] = player_country
            i = 1
            for data in row.find_all('td'):
                if i == 1:
                    dismissal['player_innings'][
                        'runs'] = Preprocess.preprocess(data.text).strip()
                elif i == 3:
                    dismissal['player_innings'][
                        'balls'] = Preprocess.preprocess(data.text).strip()
                elif i == 4:
                    dismissal['player_innings']['4s'] = Preprocess.preprocess(
                        data.text).strip()
                elif i == 5:
                    dismissal['player_innings']['6s'] = Preprocess.preprocess(
                        data.text).strip()
                elif i == 7:
                    dismissal['player_innings'][
                        'batting_position'] = Preprocess.preprocess(
                            data.text).strip()
                elif i == 8:
                    dismissal['dismissal']['wayOut'] = Preprocess.preprocess(
                        data.text).strip()
                elif i == 9:
                    dismissal['team_innings'] = Preprocess.preprocess(
                        data.text).strip()
                elif i == 11:
                    dismissal['opposition']['country'] = Preprocess.preprocess(
                        data.find('a').text).strip()
                elif i == 12:
                    dismissal['Stadium'] = Preprocess.preprocess(
                        data.text).strip()
                elif i == 13:
                    dismissal['date'] = Preprocess.preprocess(
                        data.text).strip()
                elif i == 14:
                    scorecard_url = "http://www.espncricinfo.com" + data.find(
                        'a').get('href')
                    dismissal = scrapeMatch(dismissal, scorecard_url)
                i += 1
            if dismissal:
                dismissal['id'] = dismissal_id
                dismissals.append(dismissal)
                dismissal_id += 1
            print(dismissal)

        with open(
                'Samples/Dismissals/' + '-'.join(player_name.split(" ")) +
                '-odi-dismissals.json', 'w') as outfile:
            json.dump(dismissals, outfile)
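
# A minimal entry-point sketch; the player name is a hypothetical example and
# must match what Cricinfo's search expects:
if __name__ == '__main__':
    scrapePlayerDismissals('Kumar Sangakkara')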
Example #9

import nltk
from nltk.tokenize import PunktSentenceTokenizer

custom_Sent_tokenizer = PunktSentenceTokenizer()
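
# `dismissals` is assumed to be the list produced by the scraper above; a
# minimal loading sketch (the file name is a hypothetical example):
#   import json
#   with open('Samples/Dismissals/Kumar-Sangakkara-odi-dismissals.json') as f:
#       dismissals = json.load(f)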

for dismissal in dismissals:
    if 'description' in dismissal:
        desc = Preprocess.preprocess(dismissal['description'])
        # desc = desc.replace(',', '')
        tokenized = custom_Sent_tokenizer.tokenize(desc)
        for sentence in tokenized:
            words = nltk.word_tokenize(sentence)
            tagged = nltk.pos_tag(words)
            print(desc)
            print(tagged)
        break
        # detectedLength = False
        # if dismissal['dismissal']['wayOut'] != 'run out':
        #     for ball_length_list in ball_length:
        #         if not detectedLength:
        #             for k in ball_length_list[1]:
        #                 if all(w in desc for w in k):
        #                     text += (ball_length_list[0] + ", " + desc + "\n")