예제 #1
0
           "Ice Hockey, Men's" : constants.MENS_ICE_HOCKEY, "Ice Hockey, Women's" : constants.WOMENS_ICE_HOCKEY,
           "Lacrosse, Men's" : constants.MENS_LACROSSE,
           "Rifle, Men's/Women's" : [constants.MENS_RIFLE, constants.WOMENS_RIFLE],
           "Rowing, Women's" : constants.WOMENS_ROWING, "Soccer, Men's" : constants.MENS_SOCCER,
           "Soccer, Women's" : constants.WOMENS_SOCCER, "Softball, Women's" : constants.SOFTBALL,
           "Swimming, Men's" : constants.MENS_SWIMMING_DIVING, "Swimming, Women's" : constants.WOMENS_SWIMMING_DIVING,
           "Tennis, Men's" : constants.MENS_TENNIS, "Tennis, Women's" : constants.WOMENS_TENNIS,
           "Track & Field/Cross Country, Men's" : [constants.MENS_CROSS_COUNTRY, constants.MENS_TRACK_FIELD],
           "Track & Field/Cross Country, Women's" : [constants.WOMENS_CROSS_COUNTRY, constants.WOMENS_TRACK_FIELD],
           "Volleyball, Women's" : constants.WOMENS_VOLLEYBALL, "Wrestling" : constants.WRESTLING }

# Scrape the Ohio State staff page: for every entry in `sports`, find the <b>
# element containing the sport label and hand each row of its third ancestor
# (except the first row) to scraper.parse_row.
print("Scraping Ohio State")
cxn = scraper.get_connection()
college = scraper.get_college(cxn, "Ohio State University")
page_url = college[1]
d = pq(url=page_url)
for key, sport in sports.items():
    print(sport)
    heading = d('b:contains("' + key + '")')
    # Three levels above the bold label is the element whose <tr> rows we read.
    table = heading.parent().parent().parent()
    for position, row in enumerate(table("tr")):
        if position == 0:
            # The first row is never parsed (presumably the heading row — confirm).
            continue
        scraper.parse_row(cxn, college[0], page_url, sport, row.getchildren(),
                          ["name", "title", "phone", "email"],
                          {'phone_prefix' : '(614) '})

scraper.close_connection(cxn)


예제 #2
0
from pyquery import PyQuery as pq
import scraper
import constants


# Link text looked up on the staff page -> sport constant handed to
# scraper.parse_row.
sports = {
    "BASEBALL" : constants.BASEBALL,
    "MEN'S BASKETBALL" : constants.MENS_BASKETBALL,
    "WOMEN'S BASKETBALL" : constants.WOMENS_BASKETBALL,
    "WOMEN'S ROWING" : constants.WOMENS_ROWING,
    "MEN'S CROSS COUNTRY" : constants.MENS_CROSS_COUNTRY,
    "WOMEN'S CROSS COUNTRY" : constants.WOMENS_CROSS_COUNTRY,
    "FIELD HOCKEY" : constants.FIELD_HOCKEY,
    "FOOTBALL" : constants.FOOTBALL,
    "ICE HOCKEY" : constants.MENS_ICE_HOCKEY,
    "MEN'S LACROSSE" : constants.MENS_LACROSSE,
    "WOMEN'S LACROSSE" : constants.WOMENS_LACROSSE,
    "MEN'S SOCCER" : constants.MENS_SOCCER,
    "WOMEN'S SOCCER" : constants.WOMENS_SOCCER,
    "SOFTBALL" : constants.SOFTBALL,
    "MEN'S SWIMMING & DIVING" : constants.MENS_SWIMMING_DIVING,
    "WOMEN'S SWIMMING & DIVING" : constants.WOMENS_SWIMMING_DIVING,
    "WOMEN'S TENNIS" : constants.WOMENS_TENNIS,
    "MEN'S TRACK & FIELD" : constants.MENS_TRACK_FIELD,
    "WOMEN'S TRACK & FIELD" : constants.WOMENS_TRACK_FIELD,
}

# Scrape the UMass staff page: for each sport, find its heading link, move to
# the first following sibling row that carries a bgcolor attribute, then parse
# consecutive rows while they have between 2 and 6 children.
print("Scraping Umass")
cxn = scraper.get_connection()
college = scraper.get_college(cxn, "Massachusetts")
d = pq(url=college[1])
for key, sport in sports.items():
    print(sport)
    # :contains() is a substring match, so "MEN'S SOCCER" also hits the
    # "WOMEN'S SOCCER" link; the startswith filter keeps only the exact sport.
    # A link whose text is None is treated as empty instead of raising.
    header = d('a:contains("' + key + '")').filter(
        lambda i, this: (this.text or "").startswith(key))
    info_row = header.parent().next()
    # Stop once the selection is empty: an empty selection has no bgcolor and
    # next() of it is empty again, which previously looped forever.
    while info_row and not info_row.attr("bgcolor"):
        info_row = info_row.next()
    cells = info_row.children()
    while 1 < len(cells) < 7:
        scraper.parse_row(cxn, college[0], college[1], sport, cells,
                          ["name", "phone", None, None, "title", "email"],
                          {'phone_prefix' : "(413) "})
        info_row = info_row.next()
        cells = info_row.children()

scraper.close_connection(cxn)
예제 #3
0
           "Women's Crew" : constants.WOMENS_ROWING, "Men's Cross Country" : constants.MENS_CROSS_COUNTRY,
           "Women's Cross Country" : constants.WOMENS_CROSS_COUNTRY,
           "Fencing" : [constants.MENS_FENCING, constants.WOMENS_FENCING],
           "Field Hockey" : constants.FIELD_HOCKEY, "Football" : constants.FOOTBALL,
           "Men's Golf" : constants.MENS_GOLF, "Women's Golf" : constants.WOMENS_GOLF,
           "Gymnastics" : constants.WOMENS_GYMNASTICS, "Men's Ice Hockey" : constants.MENS_ICE_HOCKEY,
           "Women's Ice Hockey" : constants.WOMENS_ICE_HOCKEY, "Men's Lacrosse" : constants.MENS_LACROSSE,
           "Women's Lacrosse" : constants.WOMENS_LACROSSE, "Skiing" : constants.WOMENS_SKIING,
           "Men's Soccer" : constants.MENS_SOCCER, "Women's Soccer" : constants.WOMENS_SOCCER,
           "Softball" : constants.SOFTBALL,
           "Men's Swimming & Diving" : constants.MENS_SWIMMING_DIVING, "Women's Swimming & Diving" : constants.WOMENS_SWIMMING_DIVING,
           "Men's Tennis" : constants.MENS_TENNIS, "Women's Tennis" : constants.WOMENS_TENNIS,
           "Men's Track & Field" : constants.MENS_TRACK_FIELD, "Women's Track & Field" : constants.WOMENS_TRACK_FIELD,
           "Volleyball" : constants.WOMENS_VOLLEYBALL, "Men's Water Polo" : constants.MENS_WATER_POLO,
           "Women's Water Polo" : constants.WOMENS_WATER_POLO, "Wrestling" : constants.WRESTLING}
           

# Scrape Brown: for each entry in `sports`, take the element that directly
# follows the matching <h2> and parse every <tr> in it whose class starts with
# "roster-row".
print("Scraping Brown")
cxn = scraper.get_connection()
college = scraper.get_college(cxn, "Brown University")
d = pq(url=college[1])
for key, sport in sports.items():
    roster = d('h2:contains("{0}")'.format(key)).next()
    for coach in roster("tr[class^='roster-row']"):
        # The row element itself (not its children) is passed to parse_row here.
        scraper.parse_row(cxn, college[0], college[1], sport, coach,
                          ["name", "title", "phone", "email"])

scraper.close_connection(cxn)

예제 #4
0
          "footbl" : constants.FOOTBALL, "hockey" : constants.MENS_ICE_HOCKEY,
          "m-bball" : constants.MENS_BASKETBALL, "m-golf" : constants.MENS_GOLF,
          "m-soccer" : constants.MENS_SOCCER,
          "c-swim" : [constants.MENS_SWIMMING_DIVING, constants.WOMENS_SWIMMING_DIVING],
          "m-tennis" : constants.MENS_TENNIS,
          "softbl" : constants.SOFTBALL, "spirit" : constants.CHEERLEADING,
          "mtrack" : [constants.MENS_TRACK_FIELD, constants.MENS_CROSS_COUNTRY,
                       constants.WOMENS_TRACK_FIELD, constants.WOMENS_CROSS_COUNTRY],
          "w-bball" : constants.WOMENS_BASKETBALL, "w-golf" : constants.WOMENS_GOLF,
          "w-gym" : constants.WOMENS_GYMNASTICS, "rowing" : constants.WOMENS_ROWING,
          "w-soccer" : constants.WOMENS_SOCCER, "w-tennis" : constants.WOMENS_TENNIS,
          "volley" : constants.WOMENS_VOLLEYBALL, "wrestle" : constants.WRESTLING }
          
# Scrape Michigan State: each sport is located through a named anchor
# (<a name="...">); the rows to parse start three siblings after the anchor's
# grandparent and continue while a row has more than one child.
print("Scraping Michigan State")
cxn = scraper.get_connection()
college = scraper.get_college(cxn, "Michigan State University")
d = pq(url=college[1])
for key, sport in sports.items():
    print(sport)
    anchor = d('a[name="' + key + '"]')
    info_row = anchor.parent().parent().next().next().next()
    if len(info_row.children()) < 4:
        # Fallback for badly formatted sections: start from the anchor's
        # parent instead of its grandparent.
        info_row = anchor.parent().next().next().next()
    cells = info_row.children()
    while len(cells) > 1:
        scraper.parse_row(cxn, college[0], college[1], sport, cells,
                          ["title", "name", "phone", "email"],
                          {'phone_prefix' : '(517) '})
        info_row = info_row.next()
        cells = info_row.children()

scraper.close_connection(cxn)

예제 #5
0
from pyquery import PyQuery as pq
import scraper
import constants

# Anchor name (<a name="...">) looked up on the staff page -> sport
# constant(s) handed to scraper.parse_row.
sports = {
    "m-basebl" : constants.BASEBALL,
    "m-baskbl" : constants.MENS_BASKETBALL,
    "w-baskbl" : constants.WOMENS_BASKETBALL,
    "cheer" : constants.CHEERLEADING,
    "xc" : constants.MENS_CROSS_COUNTRY,
    "w-xc" : constants.WOMENS_CROSS_COUNTRY,
    "fence" : constants.MENS_FENCING,
    "f-hockey" : constants.FIELD_HOCKEY,
    "m-footbl" : constants.FOOTBALL,
    "m-golf" : constants.MENS_GOLF,
    "w-golf" : constants.WOMENS_GOLF,
    "m-ihockey" : constants.MENS_ICE_HOCKEY,
    "w-ihockey" : constants.WOMENS_ICE_HOCKEY,
    "w-lax" : constants.WOMENS_LACROSSE,
    "w-row" : constants.WOMENS_ROWING,
    "sail" : constants.SAILING,
    "skiing": [constants.MENS_SKIING, constants.WOMENS_SKIING],
    "m-soccer" : constants.MENS_SOCCER,
    "w-soccer" : constants.WOMENS_SOCCER,
    "softbl" : constants.SOFTBALL,
    "swim" : [constants.MENS_SWIMMING_DIVING, constants.WOMENS_SWIMMING_DIVING],
    "m-tennis" : constants.MENS_TENNIS,
    "w-tennis" : constants.WOMENS_TENNIS,
    "track" : constants.MENS_TRACK_FIELD,
    "w-tf" : constants.WOMENS_TRACK_FIELD,
    "w-volley" : constants.WOMENS_VOLLEYBALL,
}

# Scrape Boston College: for each sport, start at the sibling following the
# named anchor's grandparent and parse consecutive rows while they have more
# than one child.
print("Scraping BC")
cxn = scraper.get_connection()
college = scraper.get_college(cxn, "Boston College")
d = pq(url=college[1])
for key, sport in sports.items():
    print(sport)
    anchor = d('a[name="' + key + '"]')
    info_row = anchor.parent().parent().next()
    cells = info_row.children()
    # An empty selection has no children, so a missing anchor or the end of
    # the sibling chain both end the loop.
    while len(cells) > 1:
        scraper.parse_row(cxn, college[0], college[1], sport, cells,
                          ["name", "title", "phone", "email"])
        info_row = info_row.next()
        cells = info_row.children()

scraper.close_connection(cxn)
예제 #6
0
          "Cheerleading" : constants.CHEERLEADING,
          "Cross Country" : [constants.MENS_CROSS_COUNTRY, constants.WOMENS_CROSS_COUNTRY],
          "Fencing" : [constants.MENS_FENCING, constants.WOMENS_FENCING],
          "Football" : constants.FOOTBALL,
          "Golf" : constants.MENS_GOLF,
          "Gymnastics (M)" : constants.MENS_GYMNASTICS, "Gymnastics (W)" : constants.WOMENS_GYMNASTICS,
          "Ice Hockey" : constants.MENS_ICE_HOCKEY, "Lacrosse" : constants.MENS_LACROSSE,
          "Rifle" : constants.MENS_RIFLE, "Soccer (M)" : constants.MENS_SOCCER,
          "Soccer (W)" : constants.WOMENS_SOCCER, "Swimming & Diving (M)" : constants.MENS_SWIMMING_DIVING,
          "Swimming & Diving (W)" : constants.WOMENS_SWIMMING_DIVING, "Tennis (M)" : constants.MENS_TENNIS,
          "Tennis (W)" : constants.WOMENS_TENNIS, "Track & Field" : [constants.MENS_TRACK_FIELD, constants.WOMENS_TRACK_FIELD],
          "Volleyball" : constants.WOMENS_VOLLEYBALL, "Water Polo" : constants.MENS_WATER_POLO,
          "Wresting" : constants.WRESTLING }
          

# Scrape Air Force: each sport label is a link inside a <strong>; the rows to
# parse live in the element that follows the link's sixth ancestor.
# NOTE(review): the "Wresting" key in the sports map above only matches if the
# page itself uses that spelling ("Wresting" is not a substring of
# "Wrestling") — confirm against the live page.
print("Scraping Air Force")
cxn = scraper.get_connection()
college = scraper.get_college(cxn, "Air Force Academy")
d = pq(url=college[1])
for key, sport in sports.items():
    print(sport)
    node = d("strong")('a:contains("' + key + '")')
    # Climb six ancestor levels from the link.
    for _ in range(6):
        node = node.parent()
    for coach in node.next()("tr"):
        # First column supplies both name and email (nested field list).
        scraper.parse_row(cxn, college[0], college[1], sport, coach,
                          [["name", "email"], "title", "phone"],
                          {'phone_prefix' : "(719) "})

scraper.close_connection(cxn)
예제 #7
0
from pyquery import PyQuery as pq
import scraper
import constants

# Label text looked up inside <font> elements on the staff page -> sport
# constant(s) handed to scraper.parse_row.
sports = {
    "Baseball" : constants.BASEBALL,
    "Men's Basketball" : constants.MENS_BASKETBALL,
    "Women's Basketball" : constants.WOMENS_BASKETBALL,
    "Cross Country" : [constants.MENS_CROSS_COUNTRY, constants.WOMENS_CROSS_COUNTRY],
    "Golf" : constants.MENS_GOLF,
    "Hockey" : constants.MENS_ICE_HOCKEY,
    "Men's Lacrosse" : constants.MENS_LACROSSE,
    "Women's Lacrosse" : constants.WOMENS_LACROSSE,
    "Rowing" : constants.WOMENS_ROWING,
    "Men's Soccer" : constants.MENS_SOCCER,
    "Women's Soccer" : constants.WOMENS_SOCCER,
    "Softball" : constants.SOFTBALL,
    "Swimming and Diving" : [constants.MENS_SWIMMING_DIVING, constants.WOMENS_SWIMMING_DIVING],
    "Volleyball" : constants.WOMENS_VOLLEYBALL,
}


# Scrape Canisius: for each sport, take the element following the <font>
# label's fourth ancestor and parse every <tr> in it except the first.
print("Scraping Canisius")
cxn = scraper.get_connection()
college = scraper.get_college(cxn, "Canisius College")
d = pq(url=college[1])
for key, sport in sports.items():
    header = d('font:contains("' + key + '")')
    table = header.parent().parent().parent().parent().next()
    for position, row in enumerate(table("tr")):
        if position == 0:
            # Skip the first row explicitly. The previous one-liner stored
            # parse_row's return value in the skip flag, so a truthy return
            # would have caused the following row to be skipped as well.
            continue
        scraper.parse_row(cxn, college[0], college[1], sport, row.getchildren(),
                          ["name", "title", "phone", "email"])

scraper.close_connection(cxn)
예제 #8
0
from pyquery import PyQuery as pq
import scraper
import constants

# Scrape Wisconsin: walk every <tr class="even"> and then every
# <tr class="odd">; when the text of a row's second <td> is a key of
# scraper.sports, parse the whole row for the mapped sport.
print("Scraping Wisconsin")
cxn = scraper.get_connection()
college = scraper.get_college(cxn, "Wisconsin")
d = pq(url=college[1])
for class_name in ["even", "odd"]:
    for row in d.items("tr." + class_name):
        for position, cell in enumerate(row.items("td")):
            # Only the second cell (index 1) is compared against the sport names.
            if position != 1:
                continue
            if cell.text() in scraper.sports:
                scraper.parse_row(cxn, college[0], college[1], scraper.sports[cell.text()],
                                  row.children(), [["name", "email"], "", "title", "phone"])

scraper.close_connection(cxn)
예제 #9
0
# Scrape Michigan: each entry of `sports` maps a sport to a (link text, href)
# pair; the directory row is the grandparent of the link matching both.
cxn = scraper.get_connection()
college = scraper.get_college(cxn, "Michigan")
d = pq(url=college[1])
# Raw string so the regex escapes (\/ \( \)) are not treated as (invalid)
# Python string escapes. Groups: profile href, coach name, parenthesised label.
coach_name_pattern = re.compile(r'<a href="(.*)">(.*)<\/a> \((.*)\)')
for sport, keys in sports.items():
    print (sport)
    info_row = d('a:contains("' + keys[0] + '")').filter(lambda i, this: this.get("href") == keys[1]).parent().parent()
    print (info_row)
    cells = info_row.children()
    # Some rows contain 2 coaches: the second cell then has several child
    # elements and the cells are split on "<br />" into parallel lists.
    if len(cells[1].getchildren()) > 1:
        coaches_names = str(etree.tostring(cells[1]), encoding='utf8').split("<br />")
        phone_numbers = str(etree.tostring(cells[2]), encoding='utf8').replace("<td>", "").replace("</td>","").split("<br />")
        emails = str(etree.tostring(cells[3]), encoding='utf8').replace("<td>", "").replace("</td>","").split("<br />")
        for i, coach in enumerate(coaches_names):
            m = coach_name_pattern.search(coach)
            if m is None:
                # A fragment without a "<a ...>name</a> (label)" shape used to
                # raise AttributeError on m.group; report and skip it instead.
                print ("Skipping unrecognised coach entry: " + coach)
                continue
            profile_url = scraper.strip_string(m.group(1))
            name = scraper.strip_string(m.group(2))
            title = scraper.strip_string(m.group(3) + " Head Coach")
            phone = scraper.strip_string(phone_numbers[i])
            # The page lists only the part before "@"; the domain is appended here.
            email = scraper.strip_string(emails[i]) + "@umich.edu"
            scraper.save_coach(cxn, college[0], scraper.get_sport_id(cxn, sport), name, title, phone, email, profile_url)
    else:
        scraper.parse_row(cxn, college[0], college[1], sport, cells, [None, "name", "phone", "email"],
                          {'email_suffix' : "@umich.edu", 'title' : 'Head Coach'})

scraper.close_connection(cxn)
예제 #10
0
           "Rifle" : [constants.MENS_RIFLE, constants.WOMENS_RIFLE],
           "Men's Soccer" : constants.MENS_SOCCER, "Women's Soccer" : constants.WOMENS_SOCCER,
           "Softball" : constants.SOFTBALL,
           "Swimming and Diving" : [constants.MENS_SWIMMING_DIVING, constants.WOMENS_SWIMMING_DIVING],
           "Men's Tennis" : constants.MENS_TENNIS, "Women's Tennis" : constants.WOMENS_TENNIS,
           "Track & Field/Cross Country" : [constants.MENS_TRACK_FIELD, constants.WOMENS_TRACK_FIELD,
                                            constants.MENS_CROSS_COUNTRY, constants.WOMENS_CROSS_COUNTRY],
           "Volleyball" : constants.WOMENS_VOLLEYBALL, "Wrestling" : constants.WRESTLING}

# Scrape Army: for each sport, find its label (<strong>, falling back to a
# <span> whose text does not contain "Sprint"), climb to the enclosing <tr>,
# then parse the <tr> rows that follow until a row without children is reached.
print("Scraping Army")
cxn = scraper.get_connection()
college = scraper.get_college(cxn, "Army")
d = pq(url=college[1])
for key, sport in sports.items():
    print(sport)
    finder = d('strong:contains("' + key + '")')
    if not finder:
        # A span whose text is None is treated as empty instead of raising.
        finder = d('span:contains("' + key + '")').filter(
            lambda i, this: 'Sprint' not in (this.text or ""))
    # Climb until a <tr> is reached. An empty selection is never a <tr> and
    # its parent is empty again, so stop on empty to avoid looping forever.
    while finder and not finder.is_("tr"):
        finder = finder.parent()
    if not finder:
        print("No directory row found for " + key)
        continue
    info_row = finder.next().next()
    # Same guard while skipping forward to the next <tr> sibling.
    while info_row and not info_row.is_("tr"):
        info_row = info_row.next()
    while info_row.children():
        print(info_row)
        scraper.parse_row(cxn, college[0], college[1], sport, info_row.children(), ["name", "title", "phone", "email"],
                          {'email_suffix' : '@usma.edu', 'phone_prefix' : '(845) 938-', 'truncate_name' : "- @"})
        info_row = info_row.next()

scraper.close_connection(cxn)