Пример #1
0
    def parse_team_stats(self, homeaway='home'):
        """Parse one team's stats for a single game and queue a TeamStats row.

        :param homeaway: 'home' or 'away' — which side of the boxscore /
            linescore to read.
        """
        team_stats = {}
        batting = self.boxscore.find('batting', team_flag=homeaway)
        pitching = self.boxscore.find('pitching', team_flag=homeaway)
        team_stats['game_id'] = self.game_id
        team_stats['team_id'] = try_int(self.boxscore.get(homeaway + '_id'))
        team_stats['at_home'] = (homeaway == 'home')
        games_back_text = self.linescore.get(homeaway + '_games_back')
        # Bug fix: this previously re-read '<side>_games_back', so the
        # wildcard figure always duplicated the division games-back value.
        games_back_wildcard_text = self.linescore.get(
            homeaway + '_games_back_wildcard')

        # If a team is 0 games back, they'll be listed as '-'. The
        # games_back_wildcard is sometimes '-', sometimes missing in this case.
        # If they're 0 games back, set both to 0
        if games_back_text == '-':
            team_stats['games_back'] = 0
            team_stats['games_back_wildcard'] = 0
        elif games_back_wildcard_text == '-':
            team_stats['games_back_wildcard'] = 0
            team_stats['games_back'] = try_float(games_back_text)
        else:
            team_stats['games_back'] = try_float(games_back_text)
            team_stats['games_back_wildcard'] = try_float(
                games_back_wildcard_text)

        wins = try_int(self.boxscore.get(homeaway + '_wins', 0))
        losses = try_int(self.boxscore.get(homeaway + '_loss', 0))
        team_stats['wins'] = wins
        team_stats['losses'] = losses
        # Guard against division by zero before any games have been played
        team_stats['winrate'] = 0 if (wins +
                                      losses) == 0 else wins / (wins + losses)
        team_stats['avg'] = try_float(batting.get('avg'))
        team_stats['at_bats'] = try_int(batting.get('ab'))
        team_stats['runs'] = try_int(batting.get('r'))
        team_stats['hits'] = try_int(batting.get('h'))
        team_stats['doubles'] = try_int(batting.get('d'))
        team_stats['triples'] = try_int(batting.get('t'))
        team_stats['home_runs'] = try_int(batting.get('hr'))
        team_stats['rbis'] = try_int(batting.get('rbi'))
        team_stats['walks'] = try_int(batting.get('bb'))
        team_stats['putouts'] = try_int(batting.get('po'))
        team_stats['da'] = try_int(batting.get('da'))
        team_stats['strikeouts'] = try_int(batting.get('so'))
        team_stats['left_on_base'] = try_int(batting.get('lob'))
        team_stats['era'] = try_float(pitching.get('era'))
        # Drop None values and add TeamStats object to the to_load list
        team_stats = dict(
            (k, v) for k, v in team_stats.items() if v is not None)
        self.to_load.append(TeamStats(**team_stats))
Пример #2
0
    def parse_team_stats(self, homeaway='home'):
        """Parse one team's stats for a single game and queue a TeamStats row.

        :param homeaway: 'home' or 'away' — which side of the boxscore /
            linescore to read.
        """
        team_stats = {}
        batting = self.boxscore.find('batting', team_flag=homeaway)
        pitching = self.boxscore.find('pitching', team_flag=homeaway)
        team_stats['game_id'] = self.game_id
        team_stats['team_id'] = try_int(self.boxscore.get(homeaway + '_id'))
        team_stats['at_home'] = (homeaway == 'home')
        games_back_text = self.linescore.get(homeaway + '_games_back')
        # Bug fix: this previously re-read '<side>_games_back', so the
        # wildcard figure always duplicated the division games-back value.
        games_back_wildcard_text = self.linescore.get(homeaway + '_games_back_wildcard')

        # If a team is 0 games back, they'll be listed as '-'. The
        # games_back_wildcard is sometimes '-', sometimes missing in this case.
        # If they're 0 games back, set both to 0
        if games_back_text == '-':
            team_stats['games_back'] = 0
            team_stats['games_back_wildcard'] = 0
        elif games_back_wildcard_text == '-':
            team_stats['games_back_wildcard'] = 0
            team_stats['games_back'] = try_float(games_back_text)
        else:
            team_stats['games_back'] = try_float(games_back_text)
            team_stats['games_back_wildcard'] = try_float(games_back_wildcard_text)

        wins = try_int(self.boxscore.get(homeaway + '_wins', 0))
        losses = try_int(self.boxscore.get(homeaway + '_loss', 0))
        team_stats['wins'] = wins
        team_stats['losses'] = losses
        # Guard against division by zero before any games have been played
        team_stats['winrate'] = 0 if (wins + losses) == 0 else wins / (wins + losses)
        team_stats['avg'] = try_float(batting.get('avg'))
        team_stats['at_bats'] = try_int(batting.get('ab'))
        team_stats['runs'] = try_int(batting.get('r'))
        team_stats['hits'] = try_int(batting.get('h'))
        team_stats['doubles'] = try_int(batting.get('d'))
        team_stats['triples'] = try_int(batting.get('t'))
        team_stats['home_runs'] = try_int(batting.get('hr'))
        team_stats['rbis'] = try_int(batting.get('rbi'))
        team_stats['walks'] = try_int(batting.get('bb'))
        team_stats['putouts'] = try_int(batting.get('po'))
        team_stats['da'] = try_int(batting.get('da'))
        team_stats['strikeouts'] = try_int(batting.get('so'))
        team_stats['left_on_base'] = try_int(batting.get('lob'))
        team_stats['era'] = try_float(pitching.get('era'))
        # Drop None values and add TeamStats object to the to_load list
        team_stats = dict((k, v) for k, v in team_stats.items() if v is not None)
        self.to_load.append(TeamStats(**team_stats))
Пример #3
0
 def parse_batters(self):
     """Parse batter stats for every <batter> element and queue Batter rows."""
     # Batter column -> boxscore attribute, all converted with try_int
     int_fields = {
         'batter_id': 'id', 'batting_order': 'bo', 'at_bats': 'ab',
         'strikeouts': 'so', 'flyouts': 'ao', 'hits': 'h', 'doubles': 'd',
         'triples': 't', 'home_runs': 'hr', 'walks': 'bb',
         'hit_by_pitch': 'hbp', 'sac_bunts': 'sac', 'sac_flys': 'fs',
         'rbi': 'rbi', 'assists': 'a', 'runs': 'r', 'left_on_base': 'lob',
         'caught_stealing': 'cs', 'stolen_bases': 'sb',
         'season_walks': 's_bb', 'season_hits': 's_h',
         'season_home_runs': 's_hr', 'season_runs': 's_r',
         'season_rbi': 's_rbi', 'season_strikeouts': 's_so',
         'putouts': 'po', 'errors': 'e',
     }
     for batter in self.boxscore.find_all('batter'):
         homeaway = batter.parent.get('team_flag')
         row = {
             'game_id': self.game_id,
             'team_id': try_int(self.boxscore.get(homeaway + '_id')),
             'name': batter.get('name'),
             'full_name': batter.get('name_display_first_last'),
             'avg': try_float(batter.get('avg')),
             'position': batter.get('pos'),
             'fielding': try_float(batter.get('fldg')),
         }
         for column, attr in int_fields.items():
             row[column] = try_int(batter.get(attr))
         # Drop None values and queue the Batter row
         row = {k: v for k, v in row.items() if v is not None}
         self.to_load.append(Batter(**row))
Пример #4
0
 def parse_batters(self):
     """Parse each <batter> element's statistics and queue Batter rows."""
     # (Batter column, source attribute, converter or None for raw string)
     spec = [
         ('batter_id', 'id', try_int),
         ('name', 'name', None),
         ('full_name', 'name_display_first_last', None),
         ('avg', 'avg', try_float),
         ('batting_order', 'bo', try_int),
         ('at_bats', 'ab', try_int),
         ('strikeouts', 'so', try_int),
         ('flyouts', 'ao', try_int),
         ('hits', 'h', try_int),
         ('doubles', 'd', try_int),
         ('triples', 't', try_int),
         ('home_runs', 'hr', try_int),
         ('walks', 'bb', try_int),
         ('hit_by_pitch', 'hbp', try_int),
         ('sac_bunts', 'sac', try_int),
         ('sac_flys', 'fs', try_int),
         ('rbi', 'rbi', try_int),
         ('assists', 'a', try_int),
         ('runs', 'r', try_int),
         ('left_on_base', 'lob', try_int),
         ('caught_stealing', 'cs', try_int),
         ('stolen_bases', 'sb', try_int),
         ('season_walks', 's_bb', try_int),
         ('season_hits', 's_h', try_int),
         ('season_home_runs', 's_hr', try_int),
         ('season_runs', 's_r', try_int),
         ('season_rbi', 's_rbi', try_int),
         ('season_strikeouts', 's_so', try_int),
         ('position', 'pos', None),
         ('putouts', 'po', try_int),
         ('errors', 'e', try_int),
         ('fielding', 'fldg', try_float),
     ]
     for batter in self.boxscore.find_all('batter'):
         homeaway = batter.parent.get('team_flag')
         b = {
             'game_id': self.game_id,
             'team_id': try_int(self.boxscore.get(homeaway + '_id')),
         }
         for column, attr, convert in spec:
             value = batter.get(attr)
             b[column] = convert(value) if convert else value
         # Drop None values and queue the Batter row
         b = {k: v for k, v in b.items() if v is not None}
         self.to_load.append(Batter(**b))
    def _count_vocab(self, raw_documents, fixed_vocab):
        """Create sparse feature matrix, and vocabulary where fixed_vocab=False

        Modified CountVectorizer-style counting: a numeric token in the range
        (0, 200] is treated as the *value* for the next in-vocabulary token
        (stored as number / 200) rather than counting term occurrences.
        Returns ``(vocabulary, X)`` where X is a float32 CSR matrix of shape
        (n_documents, n_vocabulary_terms).
        """
        if fixed_vocab:
            vocabulary = self.vocabulary_
        else:
            # Add a new value when a new vocabulary item is seen
            vocabulary = defaultdict()
            vocabulary.default_factory = vocabulary.__len__

        analyze = self.build_analyzer()
        # CSR building blocks: column indices, row pointers, data values
        j_indices = []
        indptr = _make_int_array()
        values = _make_float_array()
        indptr.append(0)
        for doc in raw_documents:
            feature_counter = {}
            # Pending numeric value awaiting its feature token; 0 = none seen
            current_num = 0
            for feature in analyze(doc):
                maybe_float = try_float(feature)
                # Numeric tokens in (0, 200] become the value of the next
                # recognized feature instead of being features themselves.
                # NOTE(review): assumes try_float returns a comparable number
                # (e.g. 0) rather than None on failure — confirm in utils.
                if maybe_float > 0 and maybe_float <= 200:
                    current_num = maybe_float
                    continue
                try:
                    # Features without a preceding numeric token are skipped
                    if current_num == 0:
                        continue
                    feature_idx = vocabulary[feature]
                    # Keep only the first value seen for a feature per doc;
                    # consume the pending number once used
                    if feature_idx not in feature_counter:
                        feature_counter[feature_idx] = current_num / 200
                        current_num = 0
                except KeyError:
                    # Ignore out-of-vocabulary items for fixed_vocab=True
                    continue

            j_indices.extend(feature_counter.keys())
            values.extend(feature_counter.values())
            indptr.append(len(j_indices))

        if not fixed_vocab:
            # disable defaultdict behaviour
            vocabulary = dict(vocabulary)
            if not vocabulary:
                raise ValueError("empty vocabulary; perhaps the documents only"
                                 " contain stop words")

        # Reinterpret the array.array buffers as numpy arrays (zero-copy)
        j_indices = np.asarray(j_indices, dtype=np.intc)
        indptr = np.frombuffer(indptr, dtype=np.intc)
        values = np.frombuffer(values, dtype=np.float32)

        X = sp.csr_matrix((values, j_indices, indptr),
                          shape=(len(indptr) - 1, len(vocabulary)),
                          dtype=np.float32)
        X.sort_indices()
        return vocabulary, X
Пример #6
0
 def parse_pitchers(self):
     """Parse pitcher stats for every <pitcher> element and queue Pitcher rows."""
     # boxscore attribute -> Pitcher column, all converted with try_int
     int_attrs = {
         'id': 'pitcher_id', 'out': 'outs', 'bf': 'batters_faced',
         'hr': 'home_runs', 'bb': 'walks', 'so': 'strikeouts',
         'er': 'earned_runs', 'r': 'runs', 'h': 'hits', 'w': 'wins',
         'l': 'losses', 'sv': 'saves', 'np': 'pitches_thrown',
         's': 'strikes', 'bs': 'blown_saves', 'hld': 'holds',
         's_h': 'season_hits', 's_r': 'season_runs',
         's_er': 'season_earned_runs', 's_bb': 'season_walks',
         's_so': 'season_strikeouts', 'game_score': 'game_score',
     }
     # Attributes copied through unchanged
     str_attrs = {
         'name': 'name', 'name_display_first_last': 'full_name',
         'pos': 'position', 'blown_save': 'blown_save', 'save': 'save',
         'loss': 'loss', 'win': 'win',
     }
     for pitcher in self.boxscore.find_all('pitcher'):
         homeaway = pitcher.parent.get('team_flag')
         row = {
             'game_id': self.game_id,
             'team_id': try_int(self.boxscore.get(homeaway + '_id')),
             'era': try_float(pitcher.get('era')),
             'season_innings_pitched': try_float(pitcher.get('s_ip')),
         }
         for attr, column in int_attrs.items():
             row[column] = try_int(pitcher.get(attr))
         for attr, column in str_attrs.items():
             row[column] = pitcher.get(attr)
         # Drop None values and queue the Pitcher row
         row = {k: v for k, v in row.items() if v is not None}
         self.to_load.append(Pitcher(**row))
Пример #7
0
 def parse_pitchers(self):
     """Parse each <pitcher> element's statistics and queue Pitcher rows."""
     # (Pitcher column, source attribute, converter or None for raw string)
     spec = [
         ('pitcher_id', 'id', try_int),
         ('name', 'name', None),
         ('full_name', 'name_display_first_last', None),
         ('position', 'pos', None),
         ('outs', 'out', try_int),
         ('batters_faced', 'bf', try_int),
         ('home_runs', 'hr', try_int),
         ('walks', 'bb', try_int),
         ('strikeouts', 'so', try_int),
         ('earned_runs', 'er', try_int),
         ('runs', 'r', try_int),
         ('hits', 'h', try_int),
         ('wins', 'w', try_int),
         ('losses', 'l', try_int),
         ('saves', 'sv', try_int),
         ('era', 'era', try_float),
         ('pitches_thrown', 'np', try_int),
         ('strikes', 's', try_int),
         ('blown_saves', 'bs', try_int),
         ('holds', 'hld', try_int),
         ('season_innings_pitched', 's_ip', try_float),
         ('season_hits', 's_h', try_int),
         ('season_runs', 's_r', try_int),
         ('season_earned_runs', 's_er', try_int),
         ('season_walks', 's_bb', try_int),
         ('season_strikeouts', 's_so', try_int),
         ('game_score', 'game_score', try_int),
         ('blown_save', 'blown_save', None),
         ('save', 'save', None),
         ('loss', 'loss', None),
         ('win', 'win', None),
     ]
     for pitcher in self.boxscore.find_all('pitcher'):
         homeaway = pitcher.parent.get('team_flag')
         p = {
             'game_id': self.game_id,
             'team_id': try_int(self.boxscore.get(homeaway + '_id')),
         }
         for column, attr, cast in spec:
             raw = pitcher.get(attr)
             p[column] = cast(raw) if cast else raw
         # Drop None values and queue the Pitcher row
         p = {k: v for k, v in p.items() if v is not None}
         self.to_load.append(Pitcher(**p))
Пример #8
0
def _parse_ppm(page):
    """Extract the output PDB URL and delta-G info from a PPM result page.

    Raises Exception (after writing ppm_error.txt) when the page reports an
    error; otherwise returns ``(pdb_url, info_dict)``.
    """
    # Known error messages the PPM server embeds in the page
    for pattern in (r'returned an error: (.*)$', r'(Too many residues)'):
        matches = re.findall(pattern, page)
        if matches:
            with open("ppm_error.txt", "w") as fp:
                fp.write(matches[0])
            raise Exception(matches[0])
    rel_path = re.findall(r'href="\./(pdb_upload/.*out\.pdb)"', page)[0]
    delta_g = re.findall(r'([-+]?[0-9]*\.?[0-9]+) kcal/mol', page)[0]
    return PPM_URL + rel_path, {"delta_g": try_float(delta_g)}
    def load_page(self, url):
        """
        Loads URL to json
        :param url: GitHub API URL to fetch
        :return: (json, headers, response) tuple; (None, None, None) on 404
            or after all attempts fail
        """
        # Use basic auth when GitHub credentials are present in state
        auth = None
        if 'github_token' in self.state and 'github_user' in self.state:
            auth = HTTPBasicAuth(self.state['github_user'], self.state['github_token'])

        # Best-effort retry loop: any exception logs a warning and retries
        for attempt in range(self.attempts):
            if self.terminate:
                raise Exception('Terminating')

            try:
                res = requests.get(url, timeout=10, auth=auth)
                headers = res.headers

                if res.status_code == 404:
                    logger.warning('URL not found: %s' % url)
                    return None, None, None

                # NOTE(review): if X-RateLimit-Reset is absent, try_float may
                # return None and '+ 10' raises TypeError, which is swallowed
                # by the except below as a failed attempt — confirm intended.
                self.rate_limit_reset = utils.try_float(headers.get('X-RateLimit-Reset')) + 10
                self.rate_limit_remaining = utils.try_int(headers.get('X-RateLimit-Remaining'))
                if self.rate_limit_remaining is not None and self.rate_limit_remaining <= 1:
                    sleep_sec = self.rate_limit_reset - time.time()

                    logger.info('Rate limit exceeded, sleeping till: %d, it is %d seconds, %d minutes'
                                % (self.rate_limit_reset, sleep_sec, sleep_sec / 60.0))
                    self.sleep_interruptible(self.rate_limit_reset)
                    # Raise to fall through to the retry path after sleeping
                    raise Exception('Rate limit exceeded')

                if res.status_code // 100 != 2:
                    res.raise_for_status()

                data = res.content
                if data is None:
                    raise Exception('Empty response')

                # Preserve key order from the API response
                js = json.loads(data, object_pairs_hook=OrderedDict)
                return js, headers, res

            except Exception as e:
                logger.warning('Exception in loading page: %s, page: %s' % (e, url))

        logger.warning('Skipping url: %s' % url)
        return None, None, None
Пример #10
0
    def load_page_local(self):
        """
        Loads page stored in thread local
        :return: (json, headers, response) tuple; (None, None, None) on 404
        """

        # Authenticate with the thread-local resource's credentials, if any
        auth = None
        resource = self.local_data.resource
        if resource.usr is not None:
            auth = HTTPBasicAuth(resource.usr, resource.token)

        job = self.local_data.job

        res = requests.get(job.url, timeout=10, auth=auth)
        headers = res.headers

        # Update per-resource rate-limit bookkeeping from response headers
        resource.reset_time = utils.try_float(headers.get('X-RateLimit-Reset'))
        resource.remaining = utils.try_int(
            headers.get('X-RateLimit-Remaining'))
        resource.last_used = time.time()
        resource.used_cnt += 1

        # 403 with a (nearly) exhausted quota is treated as a rate-limit hit
        if res.status_code == 403 and resource.remaining is not None and resource.remaining < 10:
            resource.fail_cnt += 1
            raise RateLimitHit

        if res.status_code == 404:
            resource.fail_cnt += 1
            logger.warning('URL not found: %s' % job.url)
            return None, None, None

        # Any other non-2xx status is raised to the caller
        if res.status_code // 100 != 2:
            resource.fail_cnt += 1
            res.raise_for_status()

        data = res.content
        if data is None:
            resource.fail_cnt += 1
            raise Exception('Empty response')

        # Preserve key order from the API response
        js = json.loads(data, object_pairs_hook=OrderedDict)
        return js, headers, res
Пример #11
0
def _parse_opm_info(page):
    """Parse an OPM protein page into a metadata dict.

    Returns None when the page reports no matches; a dict containing only
    'representative' when the page just points at a representative structure;
    otherwise a dict with type/class/superfamily/family/species/localization,
    the sorted related PDB ids, and delta_g.

    NOTE(review): the final dict indexes [0] on several regex results — a
    page missing any of those fields would raise IndexError; confirm OPM
    pages reaching this point always contain them.
    """

    # check if there were no matches
    no_matches = re.findall(r'<h2>Search Results for ".*"</h2>No matches',
                            page)
    if no_matches:
        return None

    # check if this page only points to a representative structure
    rep = re.findall(
        r'Representative structure\(s\) of this protein: <br /> '
        r'<a href="protein\.php\?pdbid=([0-9a-zA-Z]{4})">', page)
    if rep:
        return {"representative": rep[0].upper()}

    # Scrape the classification fields from the page's <li> entries
    opm_type = re.findall(r'<li><i>Type:</i> <a.*>(.*)</a>', page)
    opm_class = re.findall(r'<li><i>Class:</i> <a.*>(.*)</a>', page)
    opm_superfamily = re.findall(
        r'<li><i>Superfamily:</i> <a[^<]*>([^<]*)</a>', page)
    opm_family = re.findall(r'<li><i>Family:</i> <a[^<]*>([^<]*)</a>', page)
    opm_species = re.findall(r'<li><i>Species:</i> <i><a.*>(.*)</a></i>', page)
    opm_localization = re.findall(r'<li><i>Localization:</i> <a.*>(.*)</a>',
                                  page)

    related_ids = re.findall(r'"\?extrapdb=([0-9a-zA-Z]{4})"', page)
    related_ids = [x.upper() for x in related_ids]
    related_ids.sort()

    delta_g = re.findall(r'([-+]?[0-9]*\.?[0-9]+) kcal/mol', page)

    return {
        # split(" ", 1)[1] strips the leading numeric prefix from the label
        "type": opm_type[0].split(" ", 1)[1],
        "class": opm_class[0].split(" ", 1)[1],
        "superfamily": opm_superfamily[0].split(" ", 1)[1],
        "family": opm_family[0].split(" ", 1)[1],
        "species": opm_species[0].strip(),
        "localization": opm_localization[0],
        "related_ids": related_ids,
        "delta_g": try_float(get_index(delta_g, 0))
    }
Пример #12
0
    def transform(self, X):
        """Extract "<number> <unit>" item-specific features from each text.

        For every element of X (str()-ified), find number/word pairs such as
        "3 pairs" or "14k"; when the word is a known unit keyword and the
        number is a plausible magnitude (<= 3000), record the numeric value
        under the keyword plus a one-hot style '<key>_<val>' indicator.

        :param X: iterable of items to featurize
        :return: list of feature dicts, one per input item
        """
        keys = {
            "1", "2", "3", "4", "5", "7", "8", "9", "a", "as", "at", "b",
            "bars", "beautiful", "boots", "bottles", "bowls", "box", "boxes",
            "brand", "bras", "bucks", "cans", "card", "cards", "case", "cm",
            "comes", "compartments", "controllers", "cream", "credit", "crop",
            "dd", "dollar", "dollars", "dolls", "dress", "dvds", "each",
            "edition", "euc", "fashion", "feet", "fits", "fl", "ft", "g",
            "games", "gb", "gms", "gold", "gram", "grams", "hr", "hrs", "in",
            "inch", "inches", "k", "karat", "layers", "up", "meter", "mil",
            "mini", "mint", "ml", "mm", "month", "mugs", "no", "not", "nwt",
            "off", "onesies", "opi", "ounce", "ounces", "outfits", "oz",
            "packages", "packets", "packs", "pair", "panels", "pants",
            "patches", "pc", "pics", "piece", "pieces", "pokémon", "pokemon",
            "pounds", "price", "protection", "random", "retro", "ring",
            "rings", "rolls", "samples", "sandals", "series", "sets", "sheets",
            "shirts", "shoe", "shoes", "shows", "slots", "small", "so", "some",
            "stamped", "sterling", "stickers", "still", "stretch", "strips",
            "summer", "t", "tags", "tiny", "tone", "tubes", "victoria",
            "vinyl", "w", "waist", "waistband", "waterproof", "watt", "white",
            "wireless", "x10", "x13", "x15", "x3", "x4", "x5", "x6", "x7",
            "x8", "x9", "yrs", "½", "lipsticks", "bar", "apple", "access",
            "wax", "monster", "spell", "spinners", "lunch", "ac", "jamberry",
            "medal", "gerard"
        }
        # Bug fix: IGNORECASE was also being passed as findall()'s second
        # positional argument, where it means the start *position* (2), which
        # silently skipped the first two characters of every string. The flag
        # belongs (only) in re.compile. Pattern is now a raw string too.
        regex = re.compile(r"(\d+)[ ]?(\w+)", re.IGNORECASE)

        specifics = []
        for x in X:
            spec = {}
            for val, key in regex.findall(str(x)):
                if key in keys:
                    val = try_float(val)
                    # Discard implausibly large magnitudes
                    if val > 3000:
                        continue
                    spec[key] = val
                    spec['{}_{}'.format(key, val)] = 1
            specifics.append(spec)

        return specifics
Пример #13
0
 def parse_pitches(self):
     """Parse every <pitch> in all innings and queue Pitch rows."""
     # (Pitch column, source attribute, converter or None for raw string),
     # kept in the original insertion order so logging.info output matches.
     fields = [
         ('x', 'x', try_float),
         ('y', 'y', try_float),
         ('event_num', 'event_num', try_int),
         ('sv_id', 'sv_id', None),
         ('play_guid', 'play_guid', None),
         ('start_speed', 'start_speed', try_float),
         ('end_speed', 'end_speed', try_float),
         ('sz_top', 'sz_top', try_float),
         ('sz_bottom', 'sz_bot', try_float),
         ('pfx_x', 'pfx_x', try_float),
         ('pfx_z', 'pfx_z', try_float),
         ('x0', 'x0', try_float),
         ('y0', 'y0', try_float),
         ('z0', 'z0', try_float),
         ('vx0', 'vx0', try_float),
         ('vy0', 'vy0', try_float),
         ('vz0', 'vz0', try_float),
         ('ax', 'ax', try_float),
         ('ay', 'ay', try_float),
         ('az', 'az', try_float),
         ('break_y', 'break_y', try_float),
         ('break_angle', 'break_angle', try_float),
         ('break_length', 'break_length', try_float),
         ('pitch_type', 'pitch_type', None),
         ('type_confidence', 'type_confidence', try_float),
         ('zone', 'zone', try_int),
         ('nasty', 'nasty', try_int),
         ('spin_dir', 'spin_dir', try_float),
         ('spin_rate', 'spin_rate', try_float),
     ]
     pitch_counter = count()
     for pitch in self.innings.find_all('pitch'):
         # Some years are missing pitch_ids; since pitch_id is used as a
         # key, fall back to an incrementing counter.
         p = {
             'game_id': self.game_id,
             'pitch_id': int(pitch.get('id', next(pitch_counter))),
             'at_bat_number': try_int(pitch.parent.get('num')),
             'description': pitch.get('des'),
             'type': pitch.get('type'),
         }
         try:
             t = dateutil.parser.parse(pitch.get('tfs_zulu', ''))
             p['timestamp'] = t.astimezone(timezone('America/New_York'))
         except ValueError:
             logging.warning('Could not parse timestamp: Game {}; pitch {}'.format(
                 self.game_id, p['pitch_id']))
         for column, attr, convert in fields:
             raw = pitch.get(attr)
             p[column] = convert(raw) if convert else raw
         # Drop None items and add Pitch to to_load
         p = {k: v for k, v in p.items() if v is not None}
         logging.info(p)
         self.to_load.append(Pitch(**p))
Пример #14
0
 def parse_pitches(self):
     """Parse all <pitch> elements across every inning and queue Pitch rows."""
     # (Pitch column, attribute name, cast or None to copy the raw string);
     # order matches the original assignments so the logged dict is identical.
     attr_spec = [
         ('x', 'x', try_float),
         ('y', 'y', try_float),
         ('event_num', 'event_num', try_int),
         ('sv_id', 'sv_id', None),
         ('play_guid', 'play_guid', None),
         ('start_speed', 'start_speed', try_float),
         ('end_speed', 'end_speed', try_float),
         ('sz_top', 'sz_top', try_float),
         ('sz_bottom', 'sz_bot', try_float),
         ('pfx_x', 'pfx_x', try_float),
         ('pfx_z', 'pfx_z', try_float),
         ('x0', 'x0', try_float),
         ('y0', 'y0', try_float),
         ('z0', 'z0', try_float),
         ('vx0', 'vx0', try_float),
         ('vy0', 'vy0', try_float),
         ('vz0', 'vz0', try_float),
         ('ax', 'ax', try_float),
         ('ay', 'ay', try_float),
         ('az', 'az', try_float),
         ('break_y', 'break_y', try_float),
         ('break_angle', 'break_angle', try_float),
         ('break_length', 'break_length', try_float),
         ('pitch_type', 'pitch_type', None),
         ('type_confidence', 'type_confidence', try_float),
         ('zone', 'zone', try_int),
         ('nasty', 'nasty', try_int),
         ('spin_dir', 'spin_dir', try_float),
         ('spin_rate', 'spin_rate', try_float),
     ]
     fallback_ids = count()
     for pitch in self.innings.find_all('pitch'):
         # pitch_id is used as a key but is missing in some years; use a
         # running counter as the fallback.
         row = {
             'game_id': self.game_id,
             'pitch_id': int(pitch.get('id', next(fallback_ids))),
             'at_bat_number': try_int(pitch.parent.get('num')),
             'description': pitch.get('des'),
             'type': pitch.get('type'),
         }
         try:
             parsed = dateutil.parser.parse(pitch.get('tfs_zulu', ''))
             row['timestamp'] = parsed.astimezone(timezone('America/New_York'))
         except ValueError:
             logging.warning(
                 'Could not parse timestamp: Game {}; pitch {}'.format(
                     self.game_id, row['pitch_id']))
         for column, attr, cast in attr_spec:
             value = pitch.get(attr)
             row[column] = cast(value) if cast else value
         # Drop None items and add Pitch to to_load
         row = {k: v for k, v in row.items() if v is not None}
         logging.info(row)
         self.to_load.append(Pitch(**row))
Пример #15
0
 def compileInstanceResults(self, sInst):
     for mLog, lN in self.lResLogs:  ## Select method and its name list
         lNames = self.getMethodName(lN)
         aDetThis = self.aDetThisInst[lNames]  ## Result table line section
         if sInst in mLog:
             self.nReported += 1
             aResultThisInst = OrderedDict({"n_Reported": 1})
             aResultThisInst["n_CheckFailed"] = 0
             mRes = mLog[sInst]  ## The method's entry for this instance
             aDetThis["chk"] = "ok"
             if "SOLUTION_CHECKS_FAILED" in mRes and \
               0<mRes["SOLUTION_CHECKS_FAILED"]:
                 aResultThisInst["n_CheckFailed"] = 1
                 aDetThis["chk"] = "BAD"
                 utils.addMapValues(self.mCmpVecVals[lNames],
                                    aResultThisInst)
                 print(
                     "WARNING: SOLUTION CHECK(S) FAILED for the instance ",
                     sInst,
                     ",  method '",
                     lNames,
                     "'.",
                     sep='',
                     file=self.ioBadChecks)
                 continue  ## TODO. Param?
             aResultThisInst["n_ErrorsBackend"] = 0
             aResultThisInst["n_ErrorsLogical"] = 0
             aDetThis["errH"] = 0
             aDetThis["errL"] = 0
             mSlv = mRes["__SOLVE__"]
             dObj_MZN = utils.try_float(mSlv.get("ObjVal_MZN"))
             aDetThis["objMZN"] = dObj_MZN
             dObj_SLV = utils.try_float(mSlv.get("ObjVal_Solver"))
             aDetThis["objSLV"] = dObj_SLV
             dBnd_SLV = utils.try_float(mSlv.get("DualBnd_Solver"))
             aDetThis["bnd"] = dBnd_SLV
             dTime_All = utils.try_float(mSlv.get("TimeReal_All"))
             aDetThis["tAll"] = dTime_All
             dTime_Flt = utils.try_float(mSlv.get("Time_Flt"))
             aResultThisInst[
                 "t_Flatten"] = dTime_Flt if dTime_Flt is not None else dTime_All  ##??
             aDetThis["tFlt"] = dTime_Flt
             dTime_Last = utils.try_float(mSlv.get("TimeReal_LastStatus"))
             aDetThis["tBest"] = dTime_Last
             ## Compare obj vals
             dObj, bObj_MZN = (dObj_MZN, True) if \
                   None!=dObj_MZN and abs( dObj_MZN ) < 1e45 else (mSlv.get("ObjVal_MZN"), False)
             ## Assuming solver value is better if different. WHY? Well it' happened both ways
             dObj, bObj_SLV = (dObj_SLV, True) if \
                   None!=dObj_SLV and abs( dObj_SLV ) < 1e45 else (dObj, False)
             if bObj_MZN and bObj_SLV:
                 if abs(dObj_MZN - dObj_SLV) > 1e-6 * max(
                         abs(dObj_MZN), abs(dObj_SLV)):
                     aResultThisInst["n_ErrorsLogical"] += 1
                     aDetThis["errL"] += 1
                     print(
                         "  WARNING: DIFFERENT MZN / SOLVER OBJ VALUES for the instance ",
                         sInst,
                         ", method '",
                         lNames,
                         "' : ",
                         dObj_MZN,
                         " / ",
                         dObj_SLV,
                         sep='',
                         file=self.ioContrObjValMZN)
             ## Retrieve solution status
             if "Sol_Status" in mSlv:
                 n_SolStatus = mSlv["Sol_Status"][0]
             else:
                 n_SolStatus = 0
             ## Retrieve dual bound
             dBnd = None
             if None != dBnd_SLV and abs(dBnd_SLV) < 1e45:
                 dBnd = dBnd_SLV
                 self.lDualBnd.append((
                     dBnd_SLV,
                     lNames))  ## Even infeas instances can have dual bound?
             ## Trying to deduce opt sense if not given:
             if 1 == len(self.sSenses):
                 nSense = next(iter(self.sSenses.keys()))
             else:
                 nSense = -2  ## ??
             aDetThis["sns"] = self.mapProblemSense[nSense]
             self.bOptProblem = True if 0 != nSense else False  ## or (None!=dBnd or None!=dObj)
             ### ... here assumed it's an opt problem by default... why... need to check bounds first??
             ## Handle optimality / SAT completed
             if 2 == n_SolStatus:
                 if not self.bOptProblem:
                     self.lSatAll.append(lNames)
                     aResultThisInst["n_SATALL"] = 1
                     aDetThis["stt"] = self.mapStatShort[4]
                 else:  ## Assume it's an optimization problem????? TODO
                     self.lOpt.append(
                         lNames)  ## Append the optimal method list
                     aResultThisInst["n_OPT"] = 1
                     aDetThis["stt"] = self.mapStatShort[2]
                     if None == dObj or abs(dObj) >= 1e45:
                         aResultThisInst["n_ErrorsLogical"] += 1
                         aDetThis["errL"] += 1
                         print(
                             "  WARNING: OPTIMAL STATUS BUT BAD OBJ VALUE, instance ",
                             sInst,
                             ", method '",
                             lNames,
                             "': '",
                             ("" if None == dObj else str(dObj)),
                             "', result record: ",  # mRes,
                             ",, dObj_MZN: ",
                             dObj_MZN,
                             sep='',
                             file=self.ioBadObjValueStatusOpt)
                     else:
                         self.mOptVal[
                             dObj] = lNames  ## Could have used OrderedDict
                         self.lOptVal.append(
                             (dObj,
                              lNames))  ## To have both a map and the order
                         self.lPrimBnd.append((dObj, lNames))
             ## Handle feasibility / SAT
             elif 1 == n_SolStatus:
                 if not self.bOptProblem:
                     self.lSat.append(lNames)
                     aResultThisInst["n_SAT"] = 1
                     aDetThis["stt"] = self.mapStatShort[3]
                 else:  ## Assume it's an optimization problem????? TODO
                     self.lFeas.append(
                         lNames)  ## Append the optimal method list
                     aResultThisInst["n_FEAS"] = 1
                     aDetThis["stt"] = self.mapStatShort[1]
                     if None == dObj or abs(dObj) >= 1e45:
                         aResultThisInst["n_ErrorsLogical"] += 1
                         aDetThis["errL"] += 1
                         print(
                             "  WARNING: feasible status but bad obj value, instance ",
                             sInst,
                             ", method '",
                             lNames,
                             "' :'",
                             ("" if None == dObj else str(dObj)),
                             "', result record: ",  #  mRes,
                             sep='',
                             file=self.ioBadObjValueStatusFeas)
                     else:
                         self.lPrimBnd.append((dObj, lNames))
             ## Handle infeasibility
             elif -1 >= n_SolStatus and -3 <= n_SolStatus:
                 self.lInfeas.append(lNames)
                 aResultThisInst["n_INFEAS"] = 1
                 aDetThis["stt"] = self.mapStatShort[n_SolStatus]
                 self.mInfeas.setdefault(sInst, [])
                 self.mInfeas[sInst].append(lNames)
             ## Handle ERROR?
             elif -4 == n_SolStatus:
                 aResultThisInst["n_ErrorsBackend"] = 1
                 aDetThis["errH"] += 1
                 aDetThis["stt"] = self.mapStatShort[
                     n_SolStatus]  ## Should not happen TODO
                 self.mError.setdefault(sInst, []).append(lNames)
                 print(
                     "ERROR REPORTED for the instance ",
                     sInst,
                     ", method '",
                     lNames,
                     "',  result record: ",  ## mRes,
                     sep='',
                     file=self.ioErrors)
             else:
                 aResultThisInst["n_UNKNOWN"] = 1
                 aDetThis["stt"] = self.mapStatShort[0]
             ## Handle NOFZN
             if None == dTime_Flt:
                 aResultThisInst["n_NOFZN"] = 1
                 self.mNoFZN.setdefault(sInst, []).append(lNames)
             ## Handle FAIL???
             # LAST:
             utils.addMapValues(self.mCmpVecVals[lNames], aResultThisInst)
Пример #16
0
    def _post_exec(self):
        """Analyse finished tool runs after a parallel check-only pass.

        Tags every tool run with a single classification, maps problem
        entries to a representative among the "Ok" set, and then — depending
        on ``self.database`` / ``self.extract`` / ``self.figures`` — writes
        an SQLite database, copies result files, and plots summary figures.

        Only runs when both ``self.parallel`` and ``self.check_only`` are
        set; otherwise it is a no-op.

        NOTE(review): this is Python 2 code (print statements, ``iteritems``).
        """
        if self.parallel and self.check_only:
            dct = collections.defaultdict(list)
            status_dct = {}
            tag_dct = {}
            db_records = []
            # Pass 1: derive one status tag per tool from its check() output,
            # OPM results, PDB metadata and curated overrides. Later
            # assignments override earlier ones, so order matters here.
            for t in self.tool_list:
                check_info = t.check(full=True)
                cur_info = CURATED_INFO.get(t.pdb_id, {})
                tag = re.split("/|\.", check_info)[0]

                if tag == "mppd":
                    tag = "provi"

                if tag != "opm":
                    # check if opm found two mplanes
                    try:
                        if len(t.opm.get_planes()) != 2:
                            tag = "mplane"
                    except IOError:
                        # print tag, check_info, t.id
                        pass
                else:
                    if os.path.isfile(t.opm.outpath("ppm_error.txt")):
                        tag = "ppm"
                        # print open( t.opm.outpath( "ppm_error.txt" ) ).read()

                if tag != "pdb_info" and t.pdb_info.check():
                    info = t.pdb_info.get_info()
                    if "CA ATOMS ONLY" in info.get("model_type", {}):
                        tag = "calpha_only"
                    if cur_info.get("backbone_only"):
                        tag = "backbone_only"
                    if "THEORETICAL MODEL" == info.get("experiment", ""):
                        tag = "theoretical_model"
                    res = info.get("resolution")
                    # NOTE(review): "NOT" (non-numeric resolution) is tested
                    # with >= 4.0 BEFORE the != "NOT" guard; this relies on
                    # Py2's permissive str/float ordering — confirm intent.
                    if res and res >= 4.0 and res != "NOT":
                        tag = "resolution"
                    if info.get("obsolete"):
                        tag = "obsolete"

                try:
                    opm_info = t.opm_info.get_info()
                    if opm_info and opm_info.get("type") != "Transmembrane":
                        tag = "no_transmembrane"
                except:
                    pass

                # Curated overrides win over everything derived above.
                if cur_info.get("no_pdb_entry"):
                    tag = "no_pdb_entry"
                if cur_info.get("no_transmembrane"):
                    tag = "no_transmembrane"

                tag_dct[t.pdb_id] = tag
                dct[tag].append(t)

            # representative id search
            # Pass 2: for entries that failed in these stages, look for a
            # representative / related entry among the "Ok" set.
            test_rep_list = flatten([
                zip([x] * len(dct[x]), dct[x])
                for x in ["opm", "ppm", "msms0", "msms_vdw_fin", "dowser"]
            ])
            print test_rep_list
            for tag, t in test_rep_list:
                try:
                    opm_info = t.opm_info.get_info()
                    mpstruc_info = t.mpstruc_info.get_info()
                except:
                    continue
                # Collect candidate related/representative PDB ids from both
                # OPM and MPstruc metadata.
                rid_list = []
                if opm_info:
                    rep_id = opm_info.get("representative")
                    if rep_id:
                        rid_list.append(rep_id.upper())
                    rid_list += opm_info.get("related_ids", [])
                if mpstruc_info:
                    master_id = mpstruc_info.get("master")
                    if master_id:
                        rid_list.append(master_id.upper())
                    rid_list += mpstruc_info.get("related", [])

                rep = None
                # for/else idiom: break out of BOTH loops on the first id
                # that matches an "Ok" entry.
                for rid in rid_list:
                    for x in dct["Ok"]:
                        if x.pdb_id == rid:
                            rep = x
                            break
                    else:
                        continue
                    break
                if rep:
                    dct["representative"].append(t)
                else:
                    dct["no_representative"].append(t)

            # Tags excluded from DB export and status classification.
            ignore_tags = [
                "no_pdb_entry",
                "mplane",
                "representative",
                "no_representative",
                "no_transmembrane",
                "theoretical_model",
                "obsolete",
            ]

            # status types
            #   included:   all good
            #   linked:     only a representative available
            #   pending:    to be included
            #   obsolete:   superseeded
            #   defect:     low resolution; missing atoms
            #   model:      theoretical model
            for pdb_id, tag in tag_dct.iteritems():
                if tag == "Ok":
                    status = "included"
                elif tag == "obsolete":
                    status = "obsolete"
                elif tag == "theoretical_model":
                    status = "model"
                elif tag in ["calpha_only", "backbone_only"]:
                    status = "backbone_only"
                elif tag == "resolution":
                    status = "low_resolution"
                elif tag in ["opm", "ppm", "msms0", "msms_vdw_fin"]:
                    # NOTE(review): dct["representative"] holds tool objects,
                    # while pdb_id is a string — this membership test may
                    # never match; verify against the tool type's __eq__.
                    if pdb_id in dct["representative"]:
                        status = "linked"
                    else:
                        status = "pending"
                elif tag in ignore_tags:
                    continue
                else:
                    status = "unknown"
                    print tag
                status_dct[pdb_id] = status

            # Debug summary: ids per non-Ok tag, then counts per tag.
            for tag, t_list in dct.iteritems():
                if tag != "Ok":
                    print tag, " ".join(map(lambda x: x.id, t_list))
            for tag, t_list in dct.iteritems():
                print tag, len(t_list)

            # Optionally persist one MppdDbRecord per non-ignored entry.
            if self.database:
                for tag, t_list in dct.iteritems():
                    if tag in ignore_tags:
                        continue
                    for t in t_list:
                        t_info = t.get_info()
                        if tag == "Ok":
                            # Require the fin/TM stats record; for/else
                            # raises when none is found.
                            for s in t.get_stats():
                                if s['source'] == 'fin' and s[
                                        'segment'] == 'TM':
                                    t_stats = s
                                    break
                            else:
                                raise Exception('no stats found %s' % t.pdb_id)
                        else:
                            t_stats = {}
                        cur_info = CURATED_INFO.get(t.pdb_id, {})
                        db_records.append(
                            MppdDbRecord(
                                t_info['pdb_id'],
                                t_info['pdb_title'],
                                ",".join(t_info['pdb_keywords']),
                                t_info['pdb_experiment'],
                                t_info['pdb_resolution'],
                                t_info['opm_superfamily'],
                                t_info['opm_family'],
                                t_info['opm_representative'],
                                t_info['opm_species'],
                                ",".join(t_info['opm_related']),
                                t_info['mpstruc_group'],
                                t_info['mpstruc_subgroup'],
                                t_info['mpstruc_name'],
                                t_info['mpstruc_species'],
                                t_info['mpstruc_master'],
                                ",".join(t_info['mpstruc_related']),
                                cur_info.get('representative', ""),
                                ",".join(cur_info.get('related', [])),
                                status_dct[t.pdb_id],
                                t_stats.get('packdens_protein_buried'),
                                t_stats.get('water_count'),
                                t_stats.get('residue_count'),
                                t_stats.get('msms'),
                            ))
                db = SqliteBackend("mppd.db", MppdDbRecord)
                db.write(db_records)
            # Optionally copy the DB and each "Ok" entry's result files into
            # the self.extract directory, preserving relative paths.
            if self.extract:
                fdir = self.extract
                if not os.path.exists(fdir):
                    os.makedirs(fdir)
                shutil.copyfile(self.outpath("mppd.db"),
                                os.path.join(fdir, "mppd.db"))
                for t in dct.get('Ok', []):
                    flist = [
                        t.original_dry_pdb, t.final_pdb, t.opm.mplane_file,
                        t.hbexplore_fin.hbx_file + ".bonds",
                        t.voronoia_fin.vol_file + ".atmprop",
                        t.outpath("mppd.provi")
                    ]
                    flist += t.msms_vdw_fin.component_files()
                    for fsrc in flist:
                        fdst = os.path.join(fdir, t.id, t.relpath(fsrc))
                        if not os.path.exists(os.path.dirname(fdst)):
                            os.makedirs(os.path.dirname(fdst))
                        shutil.copyfile(fsrc, fdst)
            # Optionally plot per-(source, segment) histograms / scatter
            # plots of the TM-segment statistics.
            if self.figures:
                alpha = 0.3
                size = 7.0
                nres = collections.defaultdict(list)
                nwater = collections.defaultdict(list)
                resolution = collections.defaultdict(list)
                ncav = collections.defaultdict(list)
                sesvol = collections.defaultdict(list)
                packdens = collections.defaultdict(list)
                packdens_buried = collections.defaultdict(list)
                for t in dct.get('Ok', []):
                    stats = t.get_stats()
                    info = t.get_info()
                    for s in stats:
                        if s["segment"] != "TM":
                            continue
                        key = (s["source"], s["segment"])
                        nres[key].append(s["residue_count"])
                        nwater[key].append(s["water_count"])
                        resolution[key].append(
                            try_float(info["pdb_resolution"], 0.0))
                        ncav[key].append(s["msms"])
                        sesvol[key].append(s["msms_ses"])
                        packdens[key].append(s["packdens_protein"])
                        packdens_buried[key].append(
                            s["packdens_protein_buried"])
                print nres.keys()
                for key in nres.keys():
                    print key
                    x = np.array(nwater[key])
                    y = np.array(nres[key])
                    x_y = x / y
                    r = np.array(resolution[key])
                    cav = np.array(ncav[key])
                    cav_y = cav / y
                    # msms_ses volumes are negative; flip sign for plotting.
                    vol = np.array(sesvol[key]) * -1
                    vol_y = vol / y
                    pd = np.array(packdens[key])
                    pd_buried = np.array(packdens_buried[key])

                    from mpl_toolkits.axes_grid.anchored_artists import (
                        AnchoredText)

                    def hist(axis, x, label, loc=1, nzero=False):
                        """Histogram of x with a summary-stats text box."""
                        if nzero:
                            x = x[x != 0]
                        if len(x) == 0:
                            x = np.array([0])
                        axis.hist(x, normed=True, bins=25)
                        axis.set_xlabel(label)
                        summary = ("Var: %.4f\nStd: %.4f\nMean: %.4f\n"
                                   "Median: %.4f\nMin: %.4f\nMax: %.4f\n") % (
                                       np.var(x), x.std(), x.mean(),
                                       np.median(x), x.min(), x.max())
                        at = AnchoredText(summary,
                                          loc=loc or 1,
                                          prop={"size": 10},
                                          frameon=True,
                                          pad=0.5,
                                          borderpad=1.0)
                        axis.add_artist(at)

                    def scatter(axis, x, y, xlabel, ylabel, loc=1, nzero=True):
                        """Scatter x vs y with a Pearson-r text box; zero
                        entries are dropped pairwise when nzero is set."""
                        if nzero:
                            xnzero = x != 0
                            ynzero = y != 0
                            x = x[xnzero & ynzero]
                            y = y[xnzero & ynzero]
                        try:
                            r = pearsonr(x, y)
                        except Exception:
                            r = (np.nan, np.nan)
                        axis.scatter(x, y, alpha=alpha, s=size)
                        axis.set_xlabel(xlabel)
                        axis.set_ylabel(ylabel)
                        axis.set_ylim((0, axis.get_ylim()[1]))
                        summary = "r: %.4f\np: %.4f\n" % r
                        at = AnchoredText(summary,
                                          loc=loc or 1,
                                          prop={"size": 10},
                                          frameon=True,
                                          pad=0.5,
                                          borderpad=1.0)
                        axis.add_artist(at)

                    # One 3x4 figure per (source, segment) key.
                    fig, (ax) = plt.subplots(3, 4, figsize=[20, 12])

                    scatter(ax[0, 0], x, y, "#h20", "#res")
                    hist(ax[0, 1], x_y, "#h2o / #res")

                    scatter(ax[1, 0], r, x_y, "resolution [A]", "#h2o / #res")
                    hist(ax[1, 1], cav_y, "#cav / #res")

                    hist(ax[2, 0], vol_y, "ses_vol [A^3] / #res")

                    hist(ax[0, 2], pd, "packing density")

                    scatter(ax[1, 2], r, pd, "resolution [A]",
                            "packing density")

                    hist(ax[0, 3], pd_buried, "packing density buried")

                    scatter(ax[1, 3], r, pd_buried, "resolution [A]",
                            "packing density buried")

                    fig.savefig("_".join(key) + ".png")

                def bar(ax, ydata, labels):
                    """Bar chart of per-series means with std-dev error bars."""
                    y = [np.array(yd).mean() for yd in ydata]
                    x = np.arange(len(y))
                    e = [np.array(yd).std() for yd in ydata]
                    ax.bar(x,
                           y,
                           align='center',
                           yerr=e,
                           ecolor='black',
                           facecolor='#777777')
                    ax.set_xticks(x)
                    ax.set_xticklabels(labels)
                    xlim = (x.min() - 1, x.max() + 1)
                    ax.set_xlim(xlim)

                # ...
                # Waters per 100 residues, compared across org/fin/dow TM
                # stats (only when all three sources are present).
                tm_keys = [("org", "TM"), ("fin", "TM"), ("dow", "TM")]
                if all(map(lambda k: k in nres, tm_keys)):
                    ydata = []
                    labels = []
                    for key in tm_keys:
                        ydata.append(
                            (np.array(nwater[key]) / np.array(nres[key])) *
                            100)
                        labels.append(key[0])
                    fig, (ax) = plt.subplots(1, 1, figsize=[6, 4.5])
                    bar(ax, ydata, labels)
                    fig.savefig("h2o_per_100_res.png")

                # ...
                # Fraction of waters within each distance cutoff, averaged
                # per segment; one bar chart per segment key.
                nwater_cutoff = collections.defaultdict(list)
                cutoff_list = np.arange(1.4, 3.9, step=0.1)
                for t in dct.get('Ok', []):
                    stats2 = t.get_stats2()
                    for s2 in stats2:
                        key = s2["segment"]
                        if not len(nwater_cutoff[key]):
                            for cutoff in cutoff_list:
                                nwater_cutoff[key].append([])
                        water_count = s2["water_count"]
                        # residue_count = s2["residue_count"]
                        count_list = s2["exp_water_cutoff_count"]
                        for i, cutoff in enumerate(cutoff_list):
                            frac = try_div(count_list[i], water_count)
                            nwater_cutoff[key][i].append(frac)
                for key in nwater_cutoff.keys():
                    fig, (ax) = plt.subplots(1, 1, figsize=[8, 4])
                    bar(ax, nwater_cutoff[key], map(str, cutoff_list))
                    fig.savefig(str(key) + ".png")