Example No. 1
def read_rider_info(rider_soup, riders, teams):
    # non-finishers have no numeric position in the results table
    position = convert_to_int(rider_soup.find('span').find('span').text)
    if position is None:
        return (None, None, None)

    time_lag = read_time_result(rider_soup)

    # read the rider's name, link and team
    info = rider_soup.find_all('a')

    team_id = read_team_info(info, teams)

    name = process_string(info[0].contents[1])
    surname = process_string(info[0].contents[0].contents[0])
    rider_link = info[0].get('href')
    key = FullName(name=name, surname=surname)
    if key not in riders:
        riders[key] = {
            'name': name,
            'surname': surname,
            'link': rider_link,
            'team_id': team_id,
            'id': len(riders) + 1
        }

    return (time_lag, position, riders[key]['id'])
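
This example leans on helpers the gallery does not show. A minimal sketch of what FullName and convert_to_int could look like, purely as an assumption for context:

from collections import namedtuple

# Hypothetical: a hashable key so riders can be deduplicated in a dict.
FullName = namedtuple('FullName', ['name', 'surname'])

def convert_to_int(text):
    # Hypothetical: return None for non-numeric cells (e.g. 'DNF', 'DNS')
    # instead of raising, which is what the position check above relies on.
    try:
        return int(text)
    except (TypeError, ValueError):
        return None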
Example No. 2
def main():
    tree = ET.parse('test-data2.data')
    root = tree.getroot()
    contexts = root.findall("./lexelt/instance/context")

    for context in contexts:
        # clean the text before the first child element ...
        context.text = utils.process_string(context.text, unwantedTags)
        for head in context:
            # ... and the tail text that follows each child element
            head.tail = utils.process_string(head.tail, unwantedTags)
    tree.write('processed_test2.xml')
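
The loop touches both .text and .tail because of how ElementTree models mixed content: .text is the text before an element's first child, while each child's .tail is the text between its closing tag and the next node. A self-contained illustration:

import xml.etree.ElementTree as ET

ctx = ET.fromstring('<context>before <head>word</head> after</context>')
print(repr(ctx.text))     # 'before '  -- text up to the first child
print(repr(ctx[0].tail))  # ' after'   -- text following </head>

Cleaning only context.text would therefore miss everything after the first <head> element.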
Example No. 3
def scrape_manager():
    #site = 'https://www.holdet.dk/da/tour-de-france-2017/'

    ht = 'https://www.holdet.dk/handlers/tradedata.ashx?game=tour-de-france-2017&page=page_Id&r=r_Id'
    payloads = [
        1498992552420, 1498992692724, 1498992724104, 1498992735593,
        1498992747598, 1498992760018, 1498992781743, 1498992795986
    ]

    riders = []
    riders_dict = {}
    # fetch each page of trade data from the game endpoint
    session = requests.session()
    for i, payload in enumerate(payloads):
        site = ht.replace('&page=page_Id&r=r_Id',
                          '&page={}&r={}'.format(i, payload))
        req = session.get(site)
        data = json.loads(req.content)
        for item in data['Dataset']['Items']:
            name = process_string(item['Values'][2])
            value = item['Values'][16]
            if name not in riders_dict:
                riders.append({'rider_name': name, 'cost': value})
                riders_dict[name] = value

    riders_df = pd.DataFrame(riders)
    riders_df.to_csv('riders_cost.csv', index=False)
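
The hard-coded r values look like millisecond Unix timestamps from early July 2017, presumably recorded from the site's own requests as a cache-busting parameter. If that reading is right, fresh values could be generated instead of replayed; a sketch under that assumption:

import time

def cache_buster():
    # Assumption: the endpoint only needs a unique-ish value per request,
    # not the exact recorded ones.
    return int(time.time() * 1000)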
Example No. 4
def get_word_definition_overlap_count(a, b):
    unwantedTags = utils.construct_unwanted_tags()
    defs_a = wn.synsets(a)
    defs_b = wn.synsets(b)

    # Unigram overlap count
    unigramOverlapCount = 0
    bigramOverlapCount = 0
    aUnigramSet = set()
    bUnigramSet = set()
    aBigramSet = set()
    bBigramSet = set()
    # Construct n-gram sets for a
    for d in defs_a:
        aUnigramArray = utils.process_string(
            d.definition().lower(), unwantedTags).split()

        # Unigram
        add_list_to_set(aUnigramSet, aUnigramArray)
        # Bigram
        for i in range(len(aUnigramArray) - 1):
            # join with a space so ('ab', 'c') and ('a', 'bc') stay distinct
            aBigramSet.add(aUnigramArray[i] + ' ' + aUnigramArray[i + 1])

    # Construct n-gram sets for b
    for d in defs_b:
        bUnigramArray = utils.process_string(
            d.definition().lower(), unwantedTags).split()
        # Unigram
        add_list_to_set(bUnigramSet, bUnigramArray)

        # Bigram
        for i in range(len(bUnigramArray) - 1):
            bBigramSet.add(bUnigramArray[i] + ' ' + bUnigramArray[i + 1])

    for word in bUnigramSet:
        if word in aUnigramSet:
            unigramOverlapCount += 1
    for word in bBigramSet:
        if word in aBigramSet:
            bigramOverlapCount += 1
    return unigramOverlapCount * alpha + bigramOverlapCount * (1 - alpha)
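
A toy run of the score above, with alpha assumed to be 0.7 since the example reads it from module scope without showing it:

alpha = 0.7  # assumed weight; not shown in the original example

a_unigrams = {'a', 'fast', 'animal'}
b_unigrams = {'a', 'slow', 'animal'}
a_bigrams = {'a fast', 'fast animal'}
b_bigrams = {'a slow', 'slow animal'}

unigram_overlap = len(a_unigrams & b_unigrams)  # 2 ('a', 'animal')
bigram_overlap = len(a_bigrams & b_bigrams)     # 0
print(unigram_overlap * alpha + bigram_overlap * (1 - alpha))  # 1.4

The set intersections compute the same counts as the membership loops in the example.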
Example No. 5
def read_team_info(info, teams):
    try:
        team_name = process_string(info[1].contents[0])
        team_link = info[1].get('href')
        if team_name not in teams:
            teams[team_name] = {
                'name': team_name,
                'link': team_link,
                'id': len(teams) + 1
            }
        return teams[team_name]['id']
    except (IndexError, AttributeError):
        # rows without a team link fall back to the sentinel id 0
        return 0
Example No. 6
    def find_or_create(model, name):
        name = process_string(name).capitalize()
        identifier = slugify(name)

        try:
            instance = model.objects.filter(
                identifier=identifier)[0]
        except IndexError:
            instance = model.objects.create(
                identifier=identifier,
                name=name
            )

        return instance
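
Django ships this pattern as get_or_create; an equivalent sketch, assuming identifier is unique on the model:

def find_or_create(model, name):
    name = process_string(name).capitalize()
    instance, _created = model.objects.get_or_create(
        identifier=slugify(name),
        defaults={'name': name},  # applied only when a new row is created
    )
    return instance

Unlike the bare indexing above, get_or_create wraps the lookup and the insert together, so a missing row cannot raise IndexError.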
Example No. 7
    def update_in_store(WMLtypeIn, XMLin, OptionsIn={}, CapabilitiesIn=""):
        """
        Execute the WMLS_UpdateInStore API and save the server response
        
        Parameters:
          See the WITSML STORE Application Program Interface (API)

          WMLtypeIn: A string containing the object name
                     Example: 'well'
                     WMLtypeIn values for all objects are defined in witsml.py
                     Example: WMLTYPEIN_WELL
        
          XMLin:     A string containing the XML document.
                     This string can contain variable substitutions ($...$),
                     file substitutions (#...#) and conditional substitutions
                     (^...?...:...^)
        
          OptionsIn: A dictionary in the form 'option':'value'
                     The OptionsIn string is encoded by this function
                     Example: {'compressionMethod':'gzip'}

          CapabilitiesIn:  A string containing the XML document

        Returns:
          True if response received, or False otherwise
        """

        WITSMLServer.log_store_action("sending WMLS_UpdateInStore request")
        WITSMLServer.clear_response()
        try:
            queryString = utils.process_string(XMLin)
            optionsInString = utils.encode_options_in(OptionsIn)
            testlog.wtl_log_server_query("WMLtypeIn", WMLtypeIn, "XMLin",
                                         queryString, "OptionsIn",
                                         optionsInString, "CapabilitiesIn",
                                         CapabilitiesIn)
            before = datetime.now()
            reply = WITSMLServer.client.service.WMLS_UpdateInStore(
                WMLtypeIn, queryString, optionsInString, CapabilitiesIn)
            after = datetime.now()
            duration_in_seconds = (after - before).total_seconds()
            WITSMLServer.elapse_time_in_seconds.set(
                duration_in_seconds, log='Elapse Time in Seconds')
        except Exception as exception_instance:
            WITSMLServer.log_store_result(
                'Failed - ' +
                str(formatUserFriendlyHTTPError(exception_instance)))
            return False
        # per the docstring: True once a response has been received
        return True
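
The WITSML STORE API passes OptionsIn as semicolon-separated name=value pairs, so encode_options_in plausibly looks like the following sketch (its real definition is not shown here):

def encode_options_in(options):
    # Assumption: {'compressionMethod': 'gzip'} -> 'compressionMethod=gzip',
    # with multiple options joined by ';' as the WITSML spec describes.
    return ';'.join('{}={}'.format(k, v) for k, v in options.items())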
Example No. 8
def read_stage_info(stage_soup):
    try:
        stage_length = read_stage_length(stage_soup)
        info = stage_soup.find('div', class_='subDiv info show')

        #stage_avg_speed = convert_to_float(info.find(string='Avg. speed winner:').parent.next_sibling.replace('km/h', ''))
        stage_name = process_string(
            info.find(string='Start/finish:').parent.next_sibling)
        # the stage-type icon is an empty <span>; match a tag with no text
        pt_tag = info.find("span", text=lambda text: not text)
        stage_type = pt_tag['class'][0]

        stage = {
            'name': stage_name,
            'length': stage_length,
            'type': stage_type
        }

        return stage
    except Exception as e:
        print('error reading stage info: {}'.format(e))
        return None
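
The navigation trick used for stage_name (find the label string, then step from its parent to the next sibling) can be seen on a minimal snippet:

from bs4 import BeautifulSoup

html = '<div><b>Start/finish:</b> Dusseldorf - Dusseldorf</div>'
soup = BeautifulSoup(html, 'lxml')
label = soup.find(string='Start/finish:')
print(label.parent.next_sibling)  # ' Dusseldorf - Dusseldorf'

find(string=...) returns the matching text node, so .parent is the enclosing <b> tag and .next_sibling is the raw text that follows it.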
Example No. 9
def scrape_all_data():
    """
        Scrape the 'http://www.procyclingstats.com/' website for all info about stages and riders
    """
    race_filter = 'races.php?year=0&circuit=1&ApplyFilter=Filter'
    session = requests.session()

    seasons = ['2017', '2016', '2015', '2014']

    races = []
    riders = {}
    stages = []
    teams = {}
    results = []

    for season in seasons:
        print(season)

        link = pro_site + race_filter.replace('year=0',
                                              'year={}'.format(season))
        # get all races for the season
        req = session.get(link)
        soup = BeautifulSoup(req.content, 'lxml')

        table = soup.find('table', class_='basic')
        for row in table.find_all('tr')[1:]:
            tds = row.find_all('td')
            winner_tag = tds[2].find('a')
            winner_name = winner_tag.text
            if not winner_name:
                continue

            a = tds[1].find('a')
            race_link = a.get('href')
            race_name = process_string(a.contents[1])
            race_class = tds[3].contents[0]

            print('\nrace: ' + race_name)

            race = {
                'id': len(races) + 1,
                'name': race_name,
                'link': race_link,
                'class': race_class.replace('.UWT', ''),
                'season': int(season)
            }
            races.append(race)
            race_id = race['id']

            req = session.get(pro_site + race_link)
            soup = BeautifulSoup(req.content, 'lxml')

            if is_one_day_race(soup):
                race['no_stages'] = 1
                result = soup.find(
                    'ul', class_='entryNav').contents[0].find('a').get('href')
                req = session.get(pro_site + result)
                soup = BeautifulSoup(req.content, 'lxml')
                read_stage_results(soup, race_id, stages, riders, teams,
                                   results, 1)
            else:
                req = session.get(pro_site + race_link.replace('&c=2', '&c=4'))
                soup = BeautifulSoup(req.content, 'lxml')
                table = soup.find('table', class_='basic')
                table_rows = table.find_all('tr')[1:]
                race['no_stages'] = len(table_rows)
                for i, tr in enumerate(table_rows):
                    tds = tr.find_all('td')
                    winner = tds[2].a
                    if not winner.contents:
                        print(winner)
                        continue

                    td = tds[1].a
                    link = td.get('href')
                    # read the information about the stage
                    req = session.get(pro_site + link)
                    soup = BeautifulSoup(req.content, 'lxml')
                    read_stage_results(soup, race_id, stages, riders, teams,
                                       results, i + 1)

    # convert lists to dataframes
    races_df = pd.DataFrame(races)
    riders_df = pd.DataFrame(riders.values())
    teams_df = pd.DataFrame(teams.values())
    stages_df = pd.DataFrame(stages)
    results_df = pd.DataFrame(results)
    #print(riders_df.head())
    #print(stages_df.head())
    #print(results_df.head())
    races_df.to_csv('races.csv', index=False, encoding='utf-8')
    stages_df.to_csv('stages.csv', index=False, encoding='utf-8')
    riders_df.to_csv('riders.csv', index=False, encoding='utf-8')
    teams_df.to_csv('teams.csv', index=False, encoding='utf-8')
    results_df.to_csv('results.csv', index=False, encoding='utf-8')