def test_isfloat_returns_True_if_given_float_string_padded_or_not(x, y, z):
    """The repr of a finite float is a float string, bare or space-padded.

    :param x: float whose repr is tested
    :param y: number of spaces placed before the repr
    :param z: number of spaces placed after the repr
    """
    # NaN and infinity strings are rejected by default, so skip those inputs.
    assume(not (math.isnan(x) or math.isinf(x)))
    text = repr(x)
    padded = ' ' * y + text + ' ' * z
    assert fastnumbers.isfloat(text)
    assert fastnumbers.isfloat(text, str_only=True)
    assert fastnumbers.isfloat(padded)
# Example #2
0
def load_data_for_nn():
    """Load the channel-classification data as padded sequences.

    Reads ``train_set.csv`` from ``DIR_TRAIN``, keeps the rows of the twelve
    channels listed below that are flagged in the ``main_msg`` column, and
    splits them by timestamp: rows on or before 2017-04-01 form the training
    part, later rows the validation part. Channel names are replaced through
    ``MAPPINGS``, the text is cleaned, and rows whose text is numeric or
    shorter than 20 characters are dropped. The remaining text is lower-cased,
    converted with ``text2sequence`` and padded to ``MAX_SEQUENCE_LENGTH``.

    :return: ``(X_train, train_labels, X_val, val_labels)``; the labels are
        the result of ``to_categorical(..., num_classes=12)``
    """
    # Raw strings keep the regex escapes (\S, \s, \w) out of Python's own
    # string-escape processing; compiling once avoids repeating the patterns.
    noise = re.compile(r'(<\S+>:?)|(\s?:\S+:\s?)|(&gt;)|([\w\.]*@[\w\.]*)')
    spaces = re.compile(r'\s+')

    def prepare(frame):
        """Map channels to labels, clean the text and drop unusable rows."""
        frame = frame[['channel', 'text']].reset_index(drop=True)
        frame['channel'] = frame.channel.map(MAPPINGS)
        frame = frame.sort_values('channel').reset_index(drop=True)
        frame['text'] = frame.text.astype(str).apply(
            lambda x: spaces.sub(' ', noise.sub(' ', x)))
        unusable = frame.text.apply(
            lambda x: isfloat(x) or isint(x) or len(x) < 20)
        return frame[~unusable]

    data = pd.read_csv(os.path.join(DIR_TRAIN, 'train_set.csv'),
                       usecols=range(1, 11),
                       parse_dates=['timestamp', 'thread_timestamp'])
    channels = [
        'career', 'big_data', 'deep_learning', 'kaggle_crackers',
        'lang_python', 'lang_r', 'nlp', 'theory_and_practice', 'welcome',
        'bayesian', '_meetings', 'datasets'
    ]
    data = data[data.channel.isin(channels) & data.main_msg]

    # Compare against a Timestamp rather than a datetime.date: current pandas
    # refuses ordering comparisons between datetime64 data and plain dates.
    date_before = pd.Timestamp(date(2017, 4, 1))
    train_data = prepare(data[data['timestamp'] <= date_before])
    val_data = prepare(data[data['timestamp'] > date_before])

    train_text = train_data['text'].astype(str).apply(lambda x: x.lower())
    train_labels = np.asarray(train_data['channel'], dtype='int8')

    val_text = val_data['text'].astype(str).apply(lambda x: x.lower())
    val_labels = np.asarray(val_data['channel'], dtype='int8')

    vocab, vocab_size = create_vocab_set()

    X_train = text2sequence(train_text, vocab)
    X_val = text2sequence(val_text, vocab)

    X_train = pad_sequences(X_train, maxlen=MAX_SEQUENCE_LENGTH, value=0)
    X_val = pad_sequences(X_val, maxlen=MAX_SEQUENCE_LENGTH, value=0)

    train_labels = to_categorical(train_labels, num_classes=12)
    val_labels = to_categorical(val_labels, num_classes=12)

    return X_train, train_labels, X_val, val_labels
def readfile(filedir):
    """Read ``(price, hp)`` pairs from a comma-separated file.

    For every non-blank line, field 21 is parsed as ``hp`` and field 25 as
    ``price``. An ``hp`` is valid when it lies strictly between 20 and 300,
    a ``price`` when it lies strictly between 1 and 60. A field that is
    absent, unparsable or out of range is replaced by the mean of the valid
    values of the same column, so every line yields exactly one pair and the
    two returned lists stay aligned.

    :param filedir: path of the file to read
    :return: ``(prices, hps)`` - two lists of equal length, ordered by
        ascending price (ties broken by hp); two empty lists when the file
        contains no non-blank line
    :raises ZeroDivisionError: if the file has lines but no valid hp, or no
        valid price, to compute a mean from
    """

    def _valid_or_none(fields, index, low, high):
        # Value of fields[index] when it parses and low < value < high,
        # otherwise None.
        if index >= len(fields):
            return None
        value = fn.fast_float(fields[index], default=0)
        # The integer default 0 marks a failed parse: isfloat() is False for it.
        if fn.isfloat(value) and low < value < high:
            return value
        return None

    rows = []
    with open(filedir, "r") as f:
        for line in f:
            if not line.strip():
                continue
            fields = line.split(',')
            rows.append((_valid_or_none(fields, 25, 1, 60),
                         _valid_or_none(fields, 21, 20, 300)))

    if not rows:
        return [], []

    valid_prices = [price for price, _ in rows if price is not None]
    valid_hps = [hp for _, hp in rows if hp is not None]
    avgprice = sum(valid_prices) / len(valid_prices)
    avghp = sum(valid_hps) / len(valid_hps)

    # sorted() works on both Python 2 and 3, unlike calling .sort() on zip().
    pairs = sorted((avgprice if price is None else price,
                    avghp if hp is None else hp)
                   for price, hp in rows)
    prices = [price for price, _ in pairs]
    hps = [hp for _, hp in pairs]

    return prices, hps
# Example #4
0
def load_data():
    """Load message texts labelled by author for the 100 most frequent users.

    Reads ``../data/train_set.csv``, keeps the rows of the twelve channels
    listed below that are flagged in the ``main_msg`` column, and restricts
    them to the 100 most frequent ``user_id`` values, which are numbered from
    0 in the order returned by ``value_counts()``. Rows on or before
    2017-04-01 form the training part, later rows the validation part. The
    text is cleaned, rows whose text is numeric or shorter than 20 characters
    are dropped, and the remaining text is lower-cased.

    :return: ``(train_text, train_labels, val_text, val_labels)``; the labels
        are ``int8`` NumPy arrays
    """
    # Raw strings keep the regex escapes (\S, \s, \w) out of Python's own
    # string-escape processing; compiling once avoids repeating the patterns.
    noise = re.compile(r'(<\S+>:?)|(&gt;)|([\w\.]*@[\w\.]*)')
    spaces = re.compile(r'\s+')

    data = pd.read_csv('../data/train_set.csv',
                       usecols=range(1, 11),
                       parse_dates=['timestamp', 'thread_timestamp'])
    channels = [
        'career', 'big_data', 'deep_learning', 'kaggle_crackers',
        'lang_python', 'lang_r', 'nlp', 'theory_and_practice', 'welcome',
        'bayesian', '_meetings', 'datasets'
    ]
    data = data[data.channel.isin(channels) & data.main_msg]

    users_100 = list(data.user_id.value_counts()[:100].index)
    data = data[data["user_id"].isin(users_100)]
    mappings = {user: label for label, user in enumerate(users_100)}

    def prepare(frame):
        """Map users to labels, clean the text and drop unusable rows."""
        frame = frame[['user_id', 'text']].reset_index(drop=True)
        frame['user_id'] = frame.user_id.map(mappings)
        frame = frame.sort_values('user_id').reset_index(drop=True)
        frame['text'] = frame.text.astype(str).apply(
            lambda x: spaces.sub(' ', noise.sub(' ', x)))
        unusable = frame.text.apply(
            lambda x: isfloat(x) or isint(x) or len(x) < 20)
        return frame[~unusable]

    # Compare against a Timestamp rather than a datetime.date: current pandas
    # refuses ordering comparisons between datetime64 data and plain dates.
    date_before = pd.Timestamp(date(2017, 4, 1))
    train_data = prepare(data[data['timestamp'] <= date_before])
    val_data = prepare(data[data['timestamp'] > date_before])

    train_text = train_data['text'].astype(str).apply(lambda x: x.lower())
    train_labels = np.asarray(train_data['user_id'], dtype='int8')

    val_text = val_data['text'].astype(str).apply(lambda x: x.lower())
    val_labels = np.asarray(val_data['user_id'], dtype='int8')
    return train_text, train_labels, val_text, val_labels
# Example #5
0
        def func(value):
            """
            Classify a value and, unless the type itself is requested,
            report whether it matches the expected data type
            :param value: value to be checked
            :return: the detected type name when ``get_type`` is not False,
                otherwise True/False for a match with ``data_type``
            """
            if isinstance(value, bool):
                detected = "bool"
            elif fastnumbers.isint(value):
                detected = "int"
            elif fastnumbers.isfloat(value):
                detected = "float"
            elif not isinstance(value, str):
                # Neither a number nor a string
                detected = "null"
            # Strings are probed for a boolean, a date or an array
            elif str_to_boolean(value):
                detected = "bool"
            elif str_to_date(value):
                detected = "date"
            elif str_to_array(value):
                detected = "array"
            else:
                detected = "string"

            if get_type is False:
                return bool(detected == data_type)
            return detected
# Example #6
0
 def __repr__(self):
     """Render the question as readable text: ``Is <header> <op> <value>?``."""
     # Float-like values are compared with ">=", everything else with "==".
     condition = ">=" if fn.isfloat(self.value) else "=="
     header = DATASET_HEADERS[self.column]
     return "Is %s %s %s?" % (header, condition, str(self.value))
# Example #7
0
def _infer_type(value):
    """Return the one-byte type code of *value*.

    :param value: value to classify
    :return: ``b"i"`` for an integer, ``b"f"`` for a float, ``b"s"`` otherwise
    """
    # The integer test runs first, so a value that passes both is coded "i".
    if fastnumbers.isint(value):
        code = b"i"
    elif fastnumbers.isfloat(value):
        code = b"f"
    else:
        code = b"s"
    return code
# Example #8
0
def load_data_gbm():
    """Load cleaned message texts and channel labels.

    Reads ``train_set.csv`` from ``dir_train``, keeps the rows of the twelve
    channels listed below that are flagged in the ``main_msg`` column, and
    splits them by timestamp: rows on or before 2017-04-01 form the training
    part, later rows the validation part. Channel names are replaced through
    ``mappings``, the text is cleaned, rows whose text is numeric or shorter
    than 20 characters are dropped, and the remaining text is lower-cased.

    :return: ``(train_text, val_text, train_labels, val_labels)``; the labels
        are ``int8`` NumPy arrays
    """
    # Raw strings keep the regex escapes (\S, \s, \w) out of Python's own
    # string-escape processing; compiling once avoids repeating the patterns.
    noise = re.compile(r'(<\S+>:?)|(\s?:\S+:\s?)|(&gt;)|([\w\.]*@[\w\.]*)')
    spaces = re.compile(r'\s+')

    def prepare(frame):
        """Map channels to labels, clean the text and drop unusable rows."""
        frame = frame[['channel', 'text']].reset_index(drop=True)
        frame['channel'] = frame.channel.map(mappings)
        frame = frame.sort_values('channel').reset_index(drop=True)
        frame['text'] = frame.text.astype(str).apply(
            lambda x: spaces.sub(' ', noise.sub(' ', x)))
        unusable = frame.text.apply(
            lambda x: isfloat(x) or isint(x) or len(x) < 20)
        return frame[~unusable]

    data = pd.read_csv(os.path.join(dir_train, 'train_set.csv'),
                       usecols=range(1, 11),
                       parse_dates=['timestamp', 'thread_timestamp'])
    channels = [
        'career', 'big_data', 'deep_learning', 'kaggle_crackers',
        'lang_python', 'lang_r', 'nlp', 'theory_and_practice', 'welcome',
        'bayesian', '_meetings', 'datasets'
    ]
    data = data[data.channel.isin(channels) & data.main_msg]

    # Compare against a Timestamp rather than a datetime.date: current pandas
    # refuses ordering comparisons between datetime64 data and plain dates.
    date_before = pd.Timestamp(date(2017, 4, 1))
    train_data = prepare(data[data['timestamp'] <= date_before])
    val_data = prepare(data[data['timestamp'] > date_before])

    train_text = train_data['text'].astype(str).apply(lambda x: x.lower())
    train_labels = np.asarray(train_data['channel'], dtype='int8')

    val_text = val_data['text'].astype(str).apply(lambda x: x.lower())
    val_labels = np.asarray(val_data['channel'], dtype='int8')

    return train_text, val_text, train_labels, val_labels
# Example #9
0
def parse_argval(val: str):
    """Convert a textual argument value into a Python object.

    The conversions are tried in this order: decimal digits become an
    ``int``, a float string becomes a ``float``, the literal ``"set()"``
    becomes an empty set, and anything else is parsed as JSON. A value that
    is not valid JSON either is returned unchanged.

    :param val: the raw argument text
    :return: the converted value, or ``val`` itself when nothing applies
    """
    # isdecimal() matches exactly the characters int() can convert;
    # isdigit() also accepts characters such as superscript digits, for
    # which int() raises ValueError.
    if val.isdecimal():
        return int(val)
    if isfloat(val):
        try:
            return float(val)
        except ValueError:
            # isfloat may be more permissive than the builtin float();
            # let the remaining conversions handle the value.
            pass
    if val == "set()":
        return set()

    try:
        return orjson.loads(val)
    except ValueError:
        return val
# Example #10
0
def guess_type(val, empty_as_null: bool) -> ColumnType:
    """Guess the column type of a single value.

    :param val: ``None``, a number or a string
    :param empty_as_null: when true, a string that is empty after stripping
        whitespace is reported as ``ColumnType.NULL``
    :return: ``ColumnType.NULL``, ``ColumnType.NUMBER`` or
        ``ColumnType.STRING``
    """
    if val is None:
        return ColumnType.NULL

    assert isinstance(val, (int, float, str)), "Invalid column data"
    # fastnumbers.isfloat() is False for int objects, so numbers are
    # recognised by type first; otherwise an int would reach val.strip()
    # below and raise AttributeError.
    if isinstance(val, (int, float)) or fastnumbers.isfloat(val):
        return ColumnType.NUMBER
    if empty_as_null and len(val.strip()) == 0:
        return ColumnType.NULL
    return ColumnType.STRING
# Example #11
0
def get_quantity(s):
    """Parse a quantity string into a measurement.

    Accepted forms, as listed by the original author: ``5``, ``5 eV``,
    ``5+/-1 eV`` and ``5(1) eV``. When the first token is a plain float, its
    uncertainty is set to nan before it is parsed.

    :param s: the text to parse
    :return: the result of ``ureg.Measurement(...)``, or ``None`` when ``s``
        is blank or parsing raises ``ValueError``
    """
    parts = s.split()
    if not parts:
        # Blank input: without this guard the padding below would hand None
        # to ufloat_fromstr instead of a string.
        return None
    # Pad to two entries so a missing second token is passed on as None.
    parts += [None] * (2 - len(parts))
    if isfloat(parts[0]):
        parts[0] += "+/-nan"

    try:
        parts[0] = ufloat_fromstr(parts[0])
        return ureg.Measurement(*parts)
    except ValueError:
        return None
# Example #12
0
def norm_val(val, empty_as_null: bool) -> Union[bytes, int, float, None]:
    """Normalize a value.

    :param val: ``None``, a number or a string
    :param empty_as_null: when true, a string that is empty after stripping
        whitespace is returned as ``None``
    :return: ``None`` for ``None``; the result of ``fastnumbers.float`` for
        numeric input; otherwise the stripped text encoded as UTF-8
    """
    if val is None:
        return None

    is_number = fastnumbers.isfloat(val) or fastnumbers.isint(val)
    if is_number:
        return fastnumbers.float(val)

    stripped = val.strip()
    if empty_as_null and len(stripped) == 0:
        return None
    # Characters that cannot be encoded are dropped rather than raising.
    return stripped.encode("utf-8", "ignore")
# Example #13
0
def infer(value):
    """
    Infer a Spark data type from a value
    :param value: value to be inferred
    :return: the result of ``get_spark_dtypes_object`` for the detected type
    """
    # Stays None when no test below matches.
    result = None
    if value is None:
        result = "null"

    elif is_bool(value):
        result = "bool"

    elif isint(value):
        result = "int"

    elif isfloat(value):
        result = "float"

    elif is_list(value):
        # The element type comes from the first item. An empty list has no
        # first item, so its element type is inferred from None instead of
        # raising IndexError.
        first = value[0] if len(value) else None
        result = ArrayType(infer(first))

    elif is_datetime(value):
        result = "datetime"

    elif is_date(value):
        result = "date"

    elif is_binary(value):
        result = "binary"

    elif is_str(value):
        # Only boolean-looking strings get their own type; strings that look
        # like dates or arrays are reported as plain strings.
        if str_to_boolean(value):
            result = "bool"
        elif str_to_date(value):
            result = "string"  # date
        elif str_to_array(value):
            result = "string"  # array
        else:
            result = "string"

    return get_spark_dtypes_object(result)
# Example #14
0
def to_spark(value):
    """
    Infer a Spark data type from a value
    :param value: value to be inferred
    :return: the result of ``parse_spark_class_dtypes`` for the detected type
    """
    # Stays None when no test below matches.
    result = None
    if value is None:
        result = "null"

    elif is_bool_value(value):
        result = "bool"

    elif fastnumbers.isint(value):
        result = "int"

    elif fastnumbers.isfloat(value):
        result = "float"

    elif is_list_value(value):
        # The element type comes from the first item. An empty list has no
        # first item, so its element type is inferred from None instead of
        # raising IndexError.
        first = value[0] if len(value) else None
        result = ArrayType(to_spark(first))

    elif is_datetime(value):
        result = "datetime"

    elif is_date(value):
        result = "date"

    elif is_binary(value):
        result = "binary"

    elif is_str(value):
        # Only boolean-looking strings get their own type; every other
        # string, whatever it looks like, is reported as a plain string.
        if is_bool_str(value):
            result = "bool"
        elif is_datetime(value):
            result = "string"  # date
        elif is_list_str(value):
            result = "string"  # array
        else:
            result = "string"

    return parse_spark_class_dtypes(result)
# Example #15
0
def parse_column_type(name, values):
    """Classify a column by its name and values.

    :param name: column name as bytes
    :param values: the column's values; ``b""`` and ``b"NA"`` count as missing
    :return: ``b"i"`` (ID) for the ``Sample`` column or a non-numeric column
        whose present values are all distinct, ``b"d"`` (discrete) for a
        non-numeric column with repeated values, ``b"n"`` (numeric) otherwise
    """
    if name == b"Sample":
        return b"i"

    present = [v for v in values if v != b"" and v != b"NA"]
    distinct = set(present)

    # Numeric when every distinct present value is a float (vacuously true
    # for a column without any present value).
    if all(fastnumbers.isfloat(v) for v in distinct):
        return b"n"

    # No duplicates among the present values: treat the column as an ID.
    if len(distinct) == len(present):
        return b"i"
    return b"d"
def test_isfloat_returns_False_if_given_non_number_string(x):
    """A string that does not spell a number is not a float."""
    # Discard generated inputs that happen to be numeric.
    assume(not a_number(x))
    verdict = fastnumbers.isfloat(x)
    assert not verdict
def test_isfloat_returns_False_for_nan_string_unless_allow_nan_is_True():
    """NaN strings count as floats only when allow_nan is requested."""
    assert not fastnumbers.isfloat('nan')
    for text in ('nan', '-NaN'):
        assert fastnumbers.isfloat(text, allow_nan=True)
def test_isfloat_given_unicode_non_numeral_returns_False(x):
    """isfloat rejects the supplied non-numeral unicode input."""
    verdict = fastnumbers.isfloat(x)
    assert not verdict
def test_isfloat_given_unicode_of_more_than_one_char_returns_False(x):
    """isfloat rejects the supplied multi-character unicode input."""
    # Discard generated inputs that happen to be numeric.
    assume(not a_number(x))
    verdict = fastnumbers.isfloat(x)
    assert not verdict
def test_isfloat_returns_False_if_given_string_and_num_only_is_True(x):
    """With num_only=True even the repr of a finite float is rejected."""
    assume(not (math.isnan(x) or math.isinf(x)))
    text = repr(x)
    assert not fastnumbers.isfloat(text, num_only=True)
def test_isfloat_given_unicode_numeral_returns_True(x):
    """A unicode numeral is accepted both bare and surrounded by spaces."""
    assert fastnumbers.isfloat(x)
    # Same numeral with three spaces on either side.
    pad = u'   '
    assert fastnumbers.isfloat(pad + x + pad)
def test_isfloat_returns_True_if_given_int_string_padded_or_not(x, y, z):
    """The repr of an integer is accepted as a float string, padded or not.

    :param x: integer whose repr is tested
    :param y: number of spaces placed before the repr
    :param z: number of spaces placed after the repr
    """
    text = repr(x)
    padded = ' ' * y + text + ' ' * z
    assert fastnumbers.isfloat(text)
    assert fastnumbers.isfloat(text, str_only=True)
    assert fastnumbers.isfloat(padded)
# Example #23
0
# File: infer.py  Project: a-domingu/tbcnn
def str_to_decimal(_value):
    """Return True when ``fastnumbers.isfloat`` accepts *_value*, else False."""
    return bool(fastnumbers.isfloat(_value))
def test_isfloat_returns_True_if_given_float(x):
    """A float object is accepted, also when only numbers are allowed."""
    for options in ({}, {'num_only': True}):
        assert fastnumbers.isfloat(x, **options)
def test_isfloat_returns_False_if_given_float_and_str_only_is_True(x):
    """With str_only=True a float object is rejected."""
    verdict = fastnumbers.isfloat(x, str_only=True)
    assert not verdict
def test_isfloat_returns_False_if_given_int(x):
    """An int object is not a float."""
    verdict = fastnumbers.isfloat(x)
    assert not verdict
def test_isfloat():
    """Walk through fastnumbers.isfloat with numbers, strings and edge cases."""
    isfloat = fastnumbers.isfloat

    # 1. A float object
    assert isfloat(-367.3268)
    assert not isfloat(-367.3268, str_only=True)
    assert isfloat(-367.3268, num_only=True)
    # 2. A signed float string
    assert isfloat("+367.3268")
    assert isfloat("+367.3268", True)
    assert not isfloat("+367.3268", num_only=True)
    # 3. A float string with an exponent
    assert isfloat("-367.3268e207")
    # 4. A float string padded with whitespace
    assert isfloat("   -367.04   ")
    # 5. An int object
    assert not isfloat(499)
    # 6. A signed int string
    assert isfloat("-499")
    # 7. An int string padded with whitespace
    assert isfloat("   +3001   ")
    # 8. A long integer object
    assert not isfloat(35892482945872302493)
    # 9. A long integer string
    assert isfloat("35892482945872302493")
    # 10. The result is a real bool
    assert isfloat(4029) is False
    assert isfloat(4029.0) is True
    assert isfloat(4029.0, str_only=True) is False
    assert isfloat("4029") is True
    assert isfloat("4029", True) is True
    # 11. Input of an unsupported type is rejected with a falsy result
    assert not isfloat(["hey"])
    # 12. A string that is not a number
    assert not isfloat("not_a_number")
    # 13. A string with a number followed by text
    assert not isfloat("26.8 lb")
    # 14. Infinity is accepted only on request
    assert not isfloat("inf")
    for text in ("inf", "-infinity", "-INFINITY"):
        assert isfloat(text, allow_inf=True)
    # 15. NaN is accepted only on request
    assert not isfloat("nAn")
    for text in ("nan", "-NaN"):
        assert isfloat(text, allow_nan=True)
    # 16. A lone sign, 'e' or '.'
    for text in ("+", "-", "e", "."):
        assert not isfloat(text)
    # 17. Unicode numerals
    for text in (u"⑦", u"⁸", u"⅔", u"Ⅴ"):
        assert isfloat(text)
# Example #28
0
# Split by timestamp: rows on or before 2017-04-01 go to `train`, later rows
# to `val`. `data` is defined outside this fragment.
date_before = date(2017, 4, 1)
train = data[data['timestamp'] <= date_before]
val = data[data['timestamp'] > date_before]

# Keep only channel and text, replace the channel names through `mappings`
# and order the rows by the mapped channel value.
train_data = train[['channel', 'text']].reset_index()[['channel', 'text']]
train_data['channel'] = train_data.channel.map(mappings)
train_data = train_data.sort_values('channel').reset_index()[[
    'channel', 'text'
]]

val_data = val[['channel', 'text']].reset_index()[['channel', 'text']]
val_data['channel'] = val_data.channel.map(mappings)
val_data = val_data.sort_values('channel').reset_index()[['channel', 'text']]

# Drop rows whose text is numeric or shorter than 20 characters.
# NOTE(review): len() is called on the raw value, so this assumes every text
# value is already a string - confirm against the loader.
train_data = train_data[~train_data.text.
                        apply(lambda x: isfloat(x) or isint(x) or len(x) < 20)]
val_data = val_data[~val_data.text.
                    apply(lambda x: isfloat(x) or isint(x) or len(x) < 20)]

# Lower-cased text and int8 label arrays for both parts.
train_text = train_data['text'].astype(str).apply(lambda x: x.lower())
train_labels = np.asarray(train_data['channel'], dtype='int8')

val_text = val_data['text'].astype(str).apply(lambda x: x.lower())
val_labels = np.asarray(val_data['channel'], dtype='int8')

# Replace every match of the first pattern (<...> tokens, :...: tokens,
# "&gt;" and @-containing tokens) with a space, then collapse whitespace runs.
train_text = train_text \
    .apply(lambda x: re.sub('(<\S+>:?)|(\s?:\S+:\s?)|(&gt;)|([\w\.]*@[\w\.]*)', ' ', x)) \
    .apply(lambda x: re.sub('\s+', ' ', x))

val_text = val_text \
    .apply(lambda x: re.sub('(<\S+>:?)|(\s?:\S+:\s?)|(&gt;)|([\w\.]*@[\w\.]*)', ' ', x)) \
def test_isfloat_returns_False_for_inf_string_unless_allow_inf_is_True():
    """Infinity strings count as floats only when allow_inf is requested."""
    assert not fastnumbers.isfloat('inf')
    for text in ('inf', '-INFINITY'):
        assert fastnumbers.isfloat(text, allow_inf=True)
# Example #30
0
def crawl_nbastats_by_year(year, champion_team_name='dal', num_player_of_interest=10):
    champion_team_name = team_abbr_full[champion_team_name]
    url_root = 'http://espn.go.com/nba/team/stats/_/name/'
    best_stats = []
    champion_teamstats = pd.DataFrame()
    non_champion_teamstats = []
    non_champion_teamlist = []
    for team_abbr, team_name in zip(teams_abbr, teams_full_name):
        catogory = '/cat/avgMinutes/'   # ordering player with their avg. minutes
        URL = url_root + team_abbr + '/year/' + str(year) + category + team_name
        print 'parsing ' + URL + ' ...'
        request = urllib2.Request(URL)
        response = urllib2.urlopen(request)
        if response.url != URL:
            print 'no response on this address, redirect to: ', response.url
            continue
        response = response.read()
        soup = BeautifulSoup(response, 'html.parser')

        players = soup.findAll('tr', {'class': re.compile('^player-')})
        stat_labels = soup.findAll('tr', {'class': ['colhead']})
        total_labels = soup.findAll('tr', {'class': ['total']})
        print soup.title.string

        #print '1: ', total_labels[0].select('td')
        #print '2: ', total_labels[1].select('td')

        player_list = []
        player_dict = {}
        team_stats = OrderedDict()  # avoid dict sorting the keys when adding them

        # Initialise 30 statistics for the team
        stats = ['', '']
        stats[0] = stat_labels[0].select('td')   # Table 1: game statistics
        stats[1] = stat_labels[1].select('td')   # Table 2: shooting statistics
        stat_labels = stats
        for stat in stats[0]:
            team_stats[stat.get_text()] = 0.0
        for stat in stats[1]:
            team_stats[stat.get_text()] = 0.0

        numOfPlayer = len(players) / 2

        # teams with players fewer than 10 are not included in the study
        if numOfPlayer < num_player_of_interest:
            print 'warning: players less than ' + str(num_player_of_interest) + ' !'
            continue

        player_namelist = []
        for i, player in enumerate(players, 0):
            if i == numOfPlayer:
                break
            player_stats = player.findAll('td')
            player_namelist.append(player_stats[0].get_text().encode('ascii', 'ignore'))
        team_stats = pd.DataFrame(np.zeros([numOfPlayer, len(team_stats.keys())]), \
                        index=player_namelist, columns=team_stats.keys())
        team_stats = team_stats.drop('PLAYER', 1)

        for i, player in enumerate(players, 0):
            player_idx = i % numOfPlayer
            j = i / numOfPlayer
            player_stats = player.findAll('td') # iterate over players within a team

            stat = np.zeros(len(player_stats))
            for stat_label, player_stat in zip(stat_labels[j], player_stats):
                x = player_stat.get_text().encode('ascii', 'ignore')
                if isfloat(x) == True:
                    x = float(x)
                    team_stats.set_value(player_namelist[player_idx], stat_label.get_text(), x)

        '''filename = team_name + '_' + str(year) + '.csv'
        print 'saving ' + filename, ' ...'
        team_stats.to_csv(filename)'''

        # keep track of champion team with specified year
        team_stats.index.name = 'Players'
        team_stats.columns.name = 'Statistics'
        if team_name == champion_team_name:
            champion_teamstats = team_stats
        else:
            non_champion_teamstats.append(team_stats)
            non_champion_teamlist.append(team_name)
        # keep track of the best of each statistics
        if len(best_stats) == 0:
            best_stats = team_stats.max(axis=0, numeric_only=True).as_matrix()
        else:
            team_stats = team_stats.max(axis=0, numeric_only=True).as_matrix()
            # only take max if all the entries in 'team_stats' are non-nan
            if not np.isnan(team_stats).any():
                best_stats = np.maximum(best_stats, team_stats) # element-wise max

    # normalise the stats by dividing the champion team's stats by the best stats among all teams
    if (not champion_teamstats.empty) and (len(best_stats) != 0):
        champion_teamstats = champion_teamstats.loc[:, 'GP'::].divide(best_stats, axis='columns')

    for team_stat, team_name in zip(non_champion_teamstats, non_champion_teamlist):
        team_stat = team_stat.loc[:, 'GP'::].divide(best_stats, axis='columns')
        team_stat = team_stat.iloc[0:num_player_of_interest]
        filename = 'non_champions/' + str(year) + '_' + team_name + '.csv'
        team_stat.to_csv(filename)

    return champion_teamstats
def test_isfloat_with_no_arguments_fails():
    """Calling isfloat with an unknown keyword argument raises TypeError."""
    unknown_option = {'invalid': 'dummy'}
    with raises(TypeError):
        fastnumbers.isfloat(5, **unknown_option)
# Example #32
0
# File: infer.py  Project: a-domingu/tbcnn
def is_decimal(value):
    return fastnumbers.isfloat(value, allow_nan=True)