import fr

if __name__ == "__main__":
    # Script entry point: walk the module_choixcompet pages to collect the
    # per-country iframe links, then build (and, in debug mode, print) the
    # paginated competition URL for every country.
    debug = True
    # NOTE(review): `opener` is not used anywhere in the visible code; the
    # call is kept in case fr.getOpener() has side effects -- confirm.
    opener = fr.getOpener()

    #collect all the countries using the module_choixcompet
    countries = []
    for i in range(0, 10):
        for q in range(0, 57, 7):
            url = ("http://www.footballdatabase.eu/module_choixcompet.php?cont="
                   + str(i) + "&paysaff=" + str(q))
            if debug:
                print(url)
            s = fr.read(url)
            a = s.find_all("iframe")
            # .get() instead of ['src'] so an <iframe> without a src attribute
            # is skipped rather than aborting the crawl with a KeyError;
            # empty src values are still dropped, as before.
            countries += [ab.get('src') for ab in a if ab.get('src')]

    #for each country, collect all the competitions
    competitions = []
    for c in countries:
        #theyve got french chars
        # Transliterate once per country instead of on every page offset.
        # NOTE(review): `unidecode` is not imported in the visible part of
        # this file -- confirm it is brought into scope elsewhere.
        c = unidecode(c)
        for r in range(0, 42, 7):
            url = "http://www.footballdatabase.eu/" + c + "&compaff=" + str(r)
            url = url.replace(" ", "%20")
            if debug:
                print(url)
# Year strings that force the placeholder date 1900-01-01 (see _parseRowDate).
_BAD_YEARS = tuple(str(y) for y in range(1860, 1900))


def _stripPrefix(text, prefix):
    """Return text without the exact leading prefix, if it is present.

    Unlike str.strip(chars), which removes any of the given characters from
    both ends, this removes the literal prefix only.
    """
    if prefix and text.startswith(prefix):
        return text[len(prefix):]
    return text


def _stripSuffix(text, suffix):
    """Return text without the exact trailing suffix, if it is present."""
    if suffix and text.endswith(suffix):
        return text[:-len(suffix)]
    return text


def _parseRowDate(dfTxt):
    """Convert the text of a date row into a datetime.

    Handles, in this order:
      * " ovember" is rewritten to "November" (presumably a typo on the
        site -- confirm);
      * "In <year>"          -> parsed with "%Y";
      * "In <month> <year>"  -> first day of that month;
      * any text containing a year from 1860 to 1899 -> datetime(1900, 1, 1)
        (presumably because strftime() on Python 2 rejects years before
        1900 -- confirm);
      * otherwise the full "%A %d %B %Y" form.

    Raises ValueError (from strptime) when the text matches none of these.
    """
    if " ovember" in dfTxt:
        dfTxt = dfTxt.replace(" ovember", "November")
    if "In " in dfTxt:
        #sometimes date just says "in june 2015", just make it first of month/year
        rest = _stripPrefix(dfTxt.strip(), "In").strip()
        if len(rest) == 4:
            #just a year "in 1995"
            return datetime.strptime(rest, "%Y")
        return datetime.strptime("01 " + rest, "%d %B %Y")
    if any(y in dfTxt for y in _BAD_YEARS):
        return datetime(1900, 1, 1)
    return datetime.strptime(dfTxt, "%A %d %B %Y")


def _readExtraDetails(nextRow):
    """Return (ot, pks, homePks, awayPks) read from the row after a game.

    The following row may carry a <span class="detailsr"> describing extra
    time or a penalty shoot-out; when it does not, (False, False, None, None)
    is returned.
    """
    det = nextRow.find("span", class_="detailsr")
    if det is None:
        return False, False, None, None
    detTxt = det.getText()
    if "on penalties" in detTxt:
        tds2 = nextRow.find_all("td")
        # A shoot-out is also flagged as overtime.
        return True, True, int(tds2[2].getText()), int(tds2[3].getText())
    if "After Extra Time" in detTxt:
        return True, False, None, None
    return False, False, None, None


def getGamesForRound(s):
    """Scrape one round page and append its games to the global `games`.

    `s` is the page path relative to http://www.footballdatabase.eu/; its
    dot-separated components 2 to 5 are read as competition, country, year
    and round.

    Module globals used: `fr` (page fetcher), `games` (rows are assigned via
    games.loc[gameId] -- presumably a pandas DataFrame, confirm) and `debug`
    are read; `gameId` is incremented per stored game and the ASCII form of
    `s` is appended to `missedRounds` when no game was found.

    Returns the global `games` object.
    Raises ValueError when `s` has fewer than six dot-separated components
    or a date/score cell cannot be parsed.
    """
    global gameId
    global missedRounds
    comp, country, year, rnd = s.split(".")[2:6]
    soup = fr.read("http://www.footballdatabase.eu/" + s)
    gamesThisRound = 0
    # Drop non-ASCII characters but keep a text string, so the value stored
    # in missedRounds is not a bytes object on Python 3.
    s = s.encode('ascii', 'ignore').decode('ascii')

    gameTable = soup.find("table", class_="fondsoustitrembleu488")
    rows = gameTable.find_all("tr") if gameTable is not None else []

    d = None  # date of the most recent date row seen while walking the table
    for i, g in enumerate(rows):
        #while looping through rows keep track of current date
        dateFind = g.find("td", class_="styledatebleu")
        if dateFind is not None:
            d = _parseRowDate(dateFind.text)
            continue

        #this isnt a daterow, its a game
        tds = g.find_all("td")
        if len(tds) <= 10:
            # Too few cells to be a game row (e.g. a details row); skip it.
            continue
        refT, homeT, homeS, awayS, awayT = tds[0], tds[5], tds[6], tds[7], tds[8]
        home = homeT.a['href'].split(".")[2]
        homeScore = int(homeS.getText())
        awayScore = int(awayS.getText())
        away = awayT.a['href'].split(".")[2]

        # Referee slug: remove the literal prefix/suffix. The previous
        # str.strip() calls removed *characters*, which also ate letters of
        # the referee's name. A cell without a link yields an empty referee.
        ref = ""
        refHref = refT.a.get("href", "") if refT.a is not None else ""
        if refHref != "":
            ref = _stripSuffix(_stripPrefix(refHref, "football.arbitres."),
                               ".en.html")

        #check the next row, if its another stlemneutre there might be game details (pks or et)
        ot = pks = False
        homePks = awayPks = None
        if i < len(rows) - 1:
            ot, pks, homePks, awayPks = _readExtraDetails(rows[i + 1])

        #TODO this check could be much more thourough
        neutral = rnd.startswith('fina')

        #get url and footballdatabase mid
        url = _stripPrefix(homeS["onclick"], "window.location=")
        mid = int(url.split(".")[-3])

        # A game listed before any date row gets an empty date instead of
        # failing on an unbound variable.
        dateTxt = d.strftime("%d/%m/%Y") if d is not None else ""
        # The two literal zeros fill columns whose meaning is not visible
        # from this function.
        row = [mid, dateTxt, ref, home, homeScore, awayScore, away, comp,
               country, rnd, ot, pks, homePks, awayPks, 0, 0, neutral, url]
        games.loc[gameId] = row
        gameId += 1
        gamesThisRound += 1

    if gamesThisRound == 0:
        missedRounds += [s]
    if debug:
        print("number of games ", gamesThisRound)
    return games