def read_rider_info(rider_soup, riders, teams):
    # riders who did not finish have no numeric position
    position = convert_to_int(rider_soup.find('span').find('span').text)
    if position is None:
        return (None, None, None)
    time_lag = read_time_result(rider_soup)

    # read the rider's info
    info = rider_soup.find_all('a')
    team_id = read_team_info(info, teams)
    name = process_string(info[0].contents[1])
    surname = process_string(info[0].contents[0].contents[0])
    rider_link = info[0].get('href')

    # register the rider the first time we see them
    key = FullName(name=name, surname=surname)
    if key not in riders:
        riders[key] = {
            'name': name,
            'surname': surname,
            'link': rider_link,
            'team_id': team_id,
            'id': len(riders) + 1
        }
    return (time_lag, position, riders[key]['id'])
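# `FullName` is not defined in this snippet; it is assumed to be a hashable
# key type declared near the top of the module, e.g. a namedtuple:
from collections import namedtuple

FullName = namedtuple('FullName', ['name', 'surname'])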
def main():
    tree = ET.parse('test-data2.data')
    root = tree.getroot()
    contexts = root.findall("./lexelt/instance/context")
    for context in contexts:
        context.text = utils.process_string(context.text, unwantedTags)
        for head in context:
            head.tail = utils.process_string(head.tail, unwantedTags)
    tree.write('processed_test2.xml')
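# `unwantedTags` is a module-level global not shown in main(); presumably it
# is built the same way as in get_word_definition_overlap_count below:
unwantedTags = utils.construct_unwanted_tags()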
def scrape_manager():
    #site = 'https://www.holdet.dk/da/tour-de-france-2017/'
    ht = 'https://www.holdet.dk/handlers/tradedata.ashx?game=tour-de-france-2017&page=page_Id&r=r_Id'
    # per-page request tokens captured from the site
    payloads = [
        1498992552420, 1498992692724, 1498992724104, 1498992735593,
        1498992747598, 1498992760018, 1498992781743, 1498992795986
    ]
    riders = []
    riders_dict = {}

    # get all info from the race page
    session = requests.session()
    for i in range(0, 8):
        site = ht.replace('&page=page_Id&r=r_Id',
                          '&page={}&r={}'.format(i, payloads[i]))
        req = session.get(site)
        data = json.loads(req.content)
        for item in data['Dataset']['Items']:
            name = process_string(item['Values'][2])
            value = item['Values'][16]
            if name not in riders_dict:
                riders.append({'rider_name': name, 'cost': value})
                riders_dict[name] = value

    riders_df = pd.DataFrame(riders)
    riders_df.to_csv('riders_cost.csv', index=False)
def get_word_definition_overlap_count(a, b):
    unwantedTags = utils.construct_unwanted_tags()
    defs_a = wn.synsets(a)
    defs_b = wn.synsets(b)

    # Unigram and bigram overlap counts
    unigramOverlapCount = 0
    bigramOverlapCount = 0
    aUnigramSet = set()
    bUnigramSet = set()
    aBigramSet = set()
    bBigramSet = set()

    # Construct a's ngram sets
    for d in defs_a:
        aUnigramArray = utils.process_string(
            d.definition().lower(), unwantedTags).split()
        # Unigram
        add_list_to_set(aUnigramSet, aUnigramArray)
        # Bigram
        for i in range(len(aUnigramArray) - 1):
            aBigramSet.add(aUnigramArray[i] + aUnigramArray[i + 1])

    # Construct b's ngram sets
    for d in defs_b:
        bUnigramArray = utils.process_string(
            d.definition().lower(), unwantedTags).split()
        # Unigram
        add_list_to_set(bUnigramSet, bUnigramArray)
        # Bigram
        for i in range(len(bUnigramArray) - 1):
            bBigramSet.add(bUnigramArray[i] + bUnigramArray[i + 1])

    # Count how many of b's ngrams also appear in a's
    for word in bUnigramSet:
        if word in aUnigramSet:
            unigramOverlapCount += 1
    for word in bBigramSet:
        if word in aBigramSet:
            bigramOverlapCount += 1

    # weighted mix of unigram and bigram overlap (alpha is module-level)
    return unigramOverlapCount * alpha + bigramOverlapCount * (1 - alpha)
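# A minimal usage sketch for the overlap score. It assumes the module-level
# names the function relies on: nltk's WordNet corpus as `wn`, a weighting
# constant `alpha`, and an `add_list_to_set` helper. The definitions below
# are hypothetical stand-ins, not taken from the source.
from nltk.corpus import wordnet as wn

alpha = 0.7  # assumed unigram/bigram weighting


def add_list_to_set(target_set, items):
    # assumed helper: add every element of `items` to `target_set`
    for item in items:
        target_set.add(item)


print(get_word_definition_overlap_count('bank', 'money'))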
def read_team_info(info, teams):
    try:
        team_name = process_string(info[1].contents[0])
        team_link = info[1].get('href')
        if team_name not in teams:
            teams[team_name] = {
                'name': team_name,
                'link': team_link,
                'id': len(teams) + 1
            }
        return teams[team_name]['id']
    except Exception:
        # rows without a team link end up here; 0 marks "no team"
        return 0
def find_or_create(model, name):
    name = process_string(name).capitalize()
    identifier = slugify(name)
    try:
        # reuse the first existing row with this identifier
        instance = model.objects.filter(identifier=identifier)[0]
    except IndexError:
        instance = model.objects.create(identifier=identifier, name=name)
    return instance
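# A hypothetical usage sketch: `Team` stands in for any Django model with
# `identifier` and `name` fields.
team = find_or_create(Team, 'team sky')  # returns the existing or a new row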
def update_in_store(WMLtypeIn, XMLin, OptionsIn={}, CapabilitiesIn=""):
    """
    Execute the WMLS_UpdateInStore API and save the server response

    Parameters: See the WITSML STORE Application Program Interface (API)
        WMLtypeIn:      A string containing the object name
                        Example: 'well'
                        WMLtypeIn values for all objects are defined in
                        witsml.py, e.g. WMLTYPEIN_WELL
        XMLin:          A string containing the XML document. This string can
                        contain variable substitutions ($...$), file
                        substitutions (#...#) and conditional substitutions
                        (^...?...:...^)
        OptionsIn:      A dictionary in the form 'option':'value'.
                        The OptionsIn string is encoded by this function.
                        Example: {'compressionMethod':'gzip'}
        CapabilitiesIn: A string containing the XML document

    Returns:
        True if a response was received, False otherwise
    """
    WITSMLServer.log_store_action("sending WMLS_UpdateInStore request")
    WITSMLServer.clear_response()
    try:
        queryString = utils.process_string(XMLin)
        optionsInString = utils.encode_options_in(OptionsIn)
        testlog.wtl_log_server_query("WMLtypeIn", WMLtypeIn,
                                     "XMLin", queryString,
                                     "OptionsIn", optionsInString,
                                     "CapabilitiesIn", CapabilitiesIn)
        before = datetime.now()
        reply = WITSMLServer.client.service.WMLS_UpdateInStore(
            WMLtypeIn, queryString, optionsInString, CapabilitiesIn)
        after = datetime.now()
        duration_in_seconds = (after - before).total_seconds()
        WITSMLServer.elapse_time_in_seconds.set(
            duration_in_seconds, log='Elapse Time in Seconds')
    except Exception as exception_instance:
        WITSMLServer.log_store_result(
            'Failed - ' + str(formatUserFriendlyHTTPError(exception_instance)))
        return False
    # a reply was received; per the docstring contract, report success
    return True
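# A minimal calling sketch. The object type and options come from the
# docstring above; 'well_update.xml' is a hypothetical substitution file
# resolved by the #...# mechanism in utils.process_string.
ok = update_in_store('well', '#well_update.xml#',
                     OptionsIn={'compressionMethod': 'gzip'})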
def read_stage_info(stage_soup):
    try:
        stage_length = read_stage_length(stage_soup)
        info = stage_soup.find('div', class_='subDiv info show')
        stage_name = process_string(
            info.find(string='Start/finish:').parent.next_sibling)
        # the stage type is encoded in the class of an empty <span>
        pt_tag = info.find("span", text=lambda text: not text)
        stage_type = pt_tag['class'][0]
        stage = {
            'name': stage_name,
            'length': stage_length,
            'type': stage_type
        }
        return stage
    except Exception:
        print('error reading stage info')
        return None
def scrape_all_data():
    """
    Scrape the 'http://www.procyclingstats.com/' website for all info
    about stages and riders
    """
    race_filter = 'races.php?year=0&circuit=1&ApplyFilter=Filter'
    session = requests.session()
    seasons = ['2017', '2016', '2015', '2014']
    races = []
    riders = {}
    stages = []
    teams = {}
    results = []

    for season in seasons:
        print(season)
        link = pro_site + race_filter.replace('year=0',
                                              'year={}'.format(season))

        # get all races for the season
        req = session.get(link)
        soup = BeautifulSoup(req.content, 'lxml')
        table = soup.find('table', class_='basic')
        for row in table.find_all('tr')[1:]:
            tds = row.find_all('td')
            winner_tag = tds[2].find('a')
            winner_name = winner_tag.text
            # skip races that have no winner listed yet
            if not winner_name:
                continue
            a = tds[1].find('a')
            race_link = a.get('href')
            race_name = process_string(a.contents[1])
            race_class = tds[3].contents[0]
            print('\nrace: ' + race_name)

            race = {
                'id': len(races) + 1,
                'name': race_name,
                'link': race_link,
                'class': race_class.replace('.UWT', ''),
                'season': int(season)
            }
            races.append(race)
            race_id = race['id']

            req = session.get(pro_site + race_link)
            soup = BeautifulSoup(req.content, 'lxml')
            if is_one_day_race(soup):
                race['no_stages'] = 1
                result = soup.find(
                    'ul', class_='entryNav').contents[0].find('a').get('href')
                req = session.get(pro_site + result)
                soup = BeautifulSoup(req.content, 'lxml')
                read_stage_results(soup, race_id, stages, riders, teams,
                                   results, 1)
            else:
                # stage race: switch to the stage overview page
                req = session.get(pro_site + race_link.replace('&c=2', '&c=4'))
                soup = BeautifulSoup(req.content, 'lxml')
                table = soup.find('table', class_='basic')
                table_rows = table.find_all('tr')[1:]
                race['no_stages'] = len(table_rows)
                for i in range(0, len(table_rows)):
                    tr = table_rows[i]
                    tds = tr.find_all('td')
                    winner = tds[2].a
                    if len(winner.contents) == 0:
                        print(winner)
                        continue
                    td = tds[1].a
                    link = td.get('href')
                    # read the information about the stage
                    req = session.get(pro_site + link)
                    soup = BeautifulSoup(req.content, 'lxml')
                    read_stage_results(soup, race_id, stages, riders, teams,
                                       results, i + 1)

    # convert lists to dataframes (list() is needed for dict views on Python 3)
    races_df = pd.DataFrame(races)
    riders_df = pd.DataFrame(list(riders.values()))
    teams_df = pd.DataFrame(list(teams.values()))
    stages_df = pd.DataFrame(stages)
    results_df = pd.DataFrame(results)

    races_df.to_csv('races.csv', index=False, encoding='utf-8')
    stages_df.to_csv('stages.csv', index=False, encoding='utf-8')
    riders_df.to_csv('riders.csv', index=False, encoding='utf-8')
    teams_df.to_csv('teams.csv', index=False, encoding='utf-8')
    results_df.to_csv('results.csv', index=False, encoding='utf-8')
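# Entry-point sketch: run the full scrape and write the five CSV files.
if __name__ == '__main__':
    scrape_all_data()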