def _sanitize_dates(start: Union[int, date, datetime], end: Union[int, date, datetime]) -> Sanitize_Type:
    """
    Return a ``(datetime_start, datetime_end)`` tuple.

    A bare number is interpreted as a year (January 1st of that year);
    anything else is passed through :func:`pandas.to_datetime`.

    Raises
    ------
    ValueError
        If either bound is missing/falsy, or if ``start`` is after ``end``.
    """
    if not (start and end):
        raise ValueError("start and end must contain valid int, date or datetime object")

    # Normalise BEFORE comparing: the previous version compared the raw
    # inputs, which raises TypeError for mixed int/datetime arguments
    # (e.g. start=2010, end=datetime(2020, 1, 1)).
    start = datetime(start, 1, 1) if _types.is_number(start) else pd.to_datetime(start)
    end = datetime(end, 1, 1) if _types.is_number(end) else pd.to_datetime(end)

    if start > end:
        raise ValueError("end must be after start")

    return start, end
예제 #2
0
파일: utils.py 프로젝트: dxcv/pyetf
def sanitize_dates(start, end):
    """
    Return a ``(datetime_start, datetime_end)`` tuple.

    A bare number is interpreted as a year (January 1st of that year);
    anything else is passed through :func:`pandas.to_datetime`.

    Raises
    ------
    ValueError
        If both bounds are given and ``start`` is after ``end``.
    """
    if is_number(start):
        # regard int as year
        start = datetime.datetime(start, 1, 1)
    start = pd.to_datetime(start)
    if is_number(end):
        # regard int as year
        end = datetime.datetime(end, 1, 1)
    end = pd.to_datetime(end)
    if start is not None and end is not None:
        if start > end:
            # ValueError (not a bare Exception) so callers can catch precisely.
            raise ValueError("end must be after start")
    return start, end
예제 #3
0
def build_address(row: pd.Series) -> str:
    """Combines multiple columns into a single mailing address value.

    Columns required are "StreetAddress", "City", "State" and "Zip".
    Numeric ZIP codes are left-padded with zeros to five characters.
    """
    zip_code = row["Zip"]  # renamed local: don't shadow the builtin `zip`
    if pd_types.is_number(zip_code):
        # NOTE(review): a float ZIP (e.g. 1234.0, common for nullable pandas
        # columns) formats as "1234.0", not "01234" — confirm upstream
        # guarantees integer ZIP values.
        zip_code = f'{zip_code:0>5}'
    return escape_value(
        f'{row["StreetAddress"]}, {row["City"]}, {row["State"]} {zip_code}')
예제 #4
0
def _sanitize_dates(start, end):
    """
    Return (datetime_start, datetime_end) tuple
    if start is None - default is 2010/01/01
    if end is None - default is today
    """
    if is_number(start):
        # regard int as year
        start = dt.datetime(start, 1, 1)
    start = to_datetime(start)

    if is_number(end):
        end = dt.datetime(end, 1, 1)
    end = to_datetime(end)

    if start is None:
        start = dt.datetime(2010, 1, 1)
    if end is None:
        end = dt.datetime.today()
    return start, end
예제 #5
0
def _sanitize_dates(start: typing.Union[None, int],
                    end: typing.Union[None, int]) -> tuple:
    """
        Return (datetime_start, datetime_end) tuple
    """
    if is_number(start):
        # regard int as year
        start: datetime.datetime = datetime.datetime(start, 1, 1)
    start = pd.to_datetime(start)

    if is_number(end):
        # regard int as year
        end: datetime.datetime = datetime.datetime(end, 1, 1)
    end = pd.to_datetime(end)

    if start and end:
        if start > end:
            raise Exception("end must be after start")

    return start, end
예제 #6
0
def _sanitize_dates(start, end):
    """
    Return (timestamp_start, timestamp_end) tuple
    if start is None - default is 15 years before the current date
    if end is None - default is today
    Parameters
    ----------
    start : str, int, date, datetime, Timestamp
        Desired start date
    end : str, int, date, datetime, Timestamp
        Desired end date
    """
    today = dt.date.today()
    today = to_datetime(today)

    if is_number(start):
        # regard int as year
        start = dt.datetime(start, 1, 1)
    start = to_datetime(start)

    if is_number(end):
        end = dt.datetime(end, 1, 1)
    end = to_datetime(end)

    if start is None:
        # default to 5 years before today
        start = today - dt.timedelta(days=365 * 15)
    if end is None:
        # default to today
        end = today
    try:
        start = to_datetime(start)
        end = to_datetime(end)
    except (TypeError, ValueError):
        raise ValueError("Invalid date format.")
    if start > end:
        raise ValueError("start must be an earlier date than end")
    if start > today or end > today:
        raise ValueError("Start and end dates must be before current date")
    return start, end
예제 #7
0
 def partition_data(self, data, inquiry):
     """Split *data* into rows that satisfy *inquiry* and rows that do not.

     Numeric inquiry values are matched with ``>=``; any other value is
     matched with equality. Returns a ``(yes_data, no_data)`` pair.
     """
     label = inquiry.label
     threshold = inquiry.value
     if is_number(threshold):
         matched = data[data[label] >= threshold]
     else:
         matched = data[data[label] == threshold]
     return matched, data.drop(matched.index)
예제 #8
0
파일: utils.py 프로젝트: jingmouren/cnswd
def _sanitize_date(obj, default):
    """转换为日期对象,如果为None则使用默认值。输出datetime.date对象"""
    if isinstance(obj, pd.Timestamp):
        return obj.date()
    # 务必排在dt.date之前
    if isinstance(obj, dt.datetime):
        return obj.date()
    if isinstance(obj, dt.date):
        return obj
    if is_number(obj):
        return dt.date(obj, 1, 1)
    if isinstance(obj, str):
        return pd.to_datetime(obj).date()
    if obj is None:
        return default
    raise ValueError('不能识别的输入日期')
예제 #9
0
    def from_values(cls, initial_value, values=None, closed="left"):
        """Build a :class:`Stairs` instance from a :class:`pandas.Series`.

        Parameters
        ----------
        initial_value : float
            The step-function value at negative infinity.
        values : :class:`pandas.Series`
            The values approached from the right of each change point.
        closed : {"left", "right"}
            Whether the half-open intervals of the step function are
            left-closed or right-closed.

        Returns
        -------
        :class:`Stairs`
        """
        if not isinstance(values, pd.Series) or values.empty:
            raise ValueError("values must be a not empty Series")

        index_is_usable = (
            is_numeric_dtype(values.index)
            or is_datetime64_dtype(values.index)
            or is_timedelta64_dtype(values.index)
        )
        if not index_is_usable:
            warnings.warn("The index of data is not numeric, or time based")

        if np.isinf(values.index).any():
            raise ValueError("Invalid value for Series index")

        if not (is_numeric_dtype(values) and is_number(initial_value)):
            raise ValueError("Invalid dtype for from_values()")

        if not values.index.is_monotonic_increasing:
            raise ValueError("Series index must be monotonic")

        # Infinite values cannot be represented; swap them for NaN and warn.
        if np.isinf(values).any():
            values = values.replace([np.inf], np.nan)
            warnings.warn(
                "Infinity values detected and have been converted to NaN")

        result = cls(closed=closed)
        result.initial_value = initial_value
        result._data = values.to_frame("value")
        result._valid_deltas = False
        result._valid_values = True
        return result
예제 #10
0
    def _fit(self, X: pd.Series, y):
        """Fit a decision tree on a single feature column and return the
        parsed split points together with the fitted tree.
        """
        is_categorical = X.name in self.categorical_cols
        if not is_numeric_dtype(X) and not is_categorical:
            raise ValueError(
                'Column {} is not numeric and not in categorical_cols.'.format(
                    X.name))

        # Categorical columns are first encoded against the label.
        if is_categorical:
            X = self.encode_with_label(X, y)

        if not self.encode:
            self.min_[X.name] = X.min()
            self.max_[X.name] = X.max()

        X, y = self._drop_na(X, y)
        # min_frac may be a scalar shared by all columns or a per-column map.
        if is_number(self.min_frac):
            min_frac = self.min_frac
        else:
            min_frac = self.min_frac[X.name]
        tree = DecisionTreeClassifier(max_leaf_nodes=self.bins,
                                      min_samples_leaf=min_frac,
                                      random_state=self.random_state)
        tree.fit(X.to_frame(), y)
        return parse_tree(tree.tree_), tree
예제 #11
0
    def get_ans(self, query_data):
        """Answer this inquiry for one row of data.

        ``query_data`` is the row of values for which the answer is required.
        Numeric inquiry values are compared with ``>=``; any other value is
        compared with ``==``. For example, with ``label='R', value=255`` the
        answer is ``query_data['R'] >= 255``, while with ``label='color',
        value='red'`` it is ``query_data['color'] == 'red'``.
        """
        observed = query_data[self.label]
        return (observed >= self.value
                if is_number(self.value)
                else observed == self.value)
예제 #12
0
def censor(x, range=(0, 1), only_finite=True):
    """
    Convert any values outside of range to a **NULL** type object.

    Parameters
    ----------
    x : array_like
        Values to manipulate
    range : tuple
        (min, max) giving desired output range
    only_finite : bool
        If True (the default), will only modify
        finite values.

    Returns
    -------
    x : array_like
        Censored array

    Examples
    --------
    >>> a = [1, 2, np.inf, 3, 4, -np.inf, 5]
    >>> censor(a, (0, 10))
    [1, 2, inf, 3, 4, -inf, 5]
    >>> censor(a, (0, 10), False)
    [1, 2, nan, 3, 4, nan, 5]
    >>> censor(a, (2, 4))
    [nan, 2, inf, 3, 4, -inf, nan]

    Notes
    -----
    All values in ``x`` should be of the same type. ``only_finite`` parameter
    is not considered for Datetime and Timedelta types.

    The **NULL** type object depends on the type of values in **x**.

    - :class:`float` - :py:`float('nan')`
    - :class:`int` - :py:`float('nan')`
    - :class:`datetime.datetime` : :py:`np.datetime64(NaT)`
    - :class:`datetime.timedelta` : :py:`np.timedelta64(NaT)`

    """
    # Empty input: nothing to censor.
    if not len(x):
        return x

    py_time_types = (datetime.datetime, datetime.timedelta)
    np_pd_time_types = (pd.Timestamp, pd.Timedelta,
                        np.datetime64, np.timedelta64)
    # NOTE(review): first_element presumably returns the first item of x;
    # it is used only to sniff the element type — confirm against its
    # definition elsewhere in the project.
    x0 = first_element(x)

    # Yes, we want type not isinstance
    if type(x0) in py_time_types:
        return _censor_with(x, range, 'NaT')

    if not hasattr(x, 'dtype') and isinstance(x0, np_pd_time_types):
        return _censor_with(x, range, type(x0)('NaT'))

    x_array = np.asarray(x)
    # Select the NULL sentinel that matches the element type
    # (see the table in the docstring).
    if pdtypes.is_number(x0) and not isinstance(x0, np.timedelta64):
        null = float('nan')
    elif com.is_datetime_arraylike(x_array):
        null = pd.Timestamp('NaT')
    elif pdtypes.is_datetime64_dtype(x_array):
        null = np.datetime64('NaT')
    elif isinstance(x0, pd.Timedelta):
        null = pd.Timedelta('NaT')
    elif pdtypes.is_timedelta64_dtype(x_array):
        null = np.timedelta64('NaT')
    else:
        raise ValueError(
            "Do not know how to censor values of type "
            "{}".format(type(x0)))

    if only_finite:
        try:
            finite = np.isfinite(x)
        except TypeError:
            # Dtypes with no notion of finiteness (e.g. datetimes):
            # treat every element as finite.
            finite = np.repeat(True, len(x))
    else:
        finite = np.repeat(True, len(x))

    if hasattr(x, 'dtype'):
        # Array-like input: vectorised masking on a copy so the caller's
        # array is not mutated.
        outside = (x < range[0]) | (x > range[1])
        bool_idx = finite & outside
        x = x.copy()
        x[bool_idx] = null
    else:
        # Plain sequence: rebuild as a list, nulling finite out-of-range values.
        x = [null if not range[0] <= val <= range[1] and f else val
             for val, f in zip(x, finite)]

    return x
예제 #13
0
 def __str__(self):
     condition = '>=' if is_number(self.value) else '=='
     return 'Is %s %s %s?' % (str(self.label), condition, str(self.value))
예제 #14
0
    def _fit(self, X, y, **fit_parmas):
        """Fit a single feature and return the cutoff points.

        For categorical columns that already have few levels, a
        category-to-code mapping (dict) is returned instead; otherwise the
        feature is (optionally pre-)binned and bins are merged by chi-square
        until the constraints (max_bin, mixed labels, monotonicity, minimum
        interval size) are satisfied. ``fit_parmas`` is accepted for
        interface compatibility and not used here.
        """
        self.categorical_cols = self.categorical_cols or []

        if not is_numeric_dtype(X) and X.name not in self.categorical_cols:
            raise ValueError('Column {} is not numeric and not in categorical_cols.'.format(X.name))

        y = force_zero_one(y)
        X, y = make_series(X), make_series(y)

        # if X is discrete, encode with positive ratio in y
        if X.name in self.categorical_cols:
            # the categorical columns will remain unchanged if
            # we turn off  bin_cat_cols
            if not self.bin_cat_cols:
                return None
            X = self.encode_with_label(X, y)

        # the number of bins is the number of cutoff points minus 1
        n_bins = X.nunique() - 1

        # if the number of bins is less than `max_bin` for categorical columns then
        # set the column as a mapping
        if n_bins < self.max_bin and X.name in self.categorical_cols:
            # mapping bad rate to encoding
            # NOTE(review): iterating a set makes the 1-based codes
            # order-dependent across runs — confirm determinism is not required.
            group_mapping = {v: i+1 for i, v in enumerate(set(X[X.notnull()]))}
            return self.discrete_encoding[X.name].map(group_mapping).to_dict()

        # speed up the process with prebinning
        if self.prebin and n_bins > self.prebin:
            if self.prebin_method.lower() == 'tree':
                # min_frac may be a scalar or a per-column mapping
                min_frac = self.min_frac if is_number(self.min_frac) else self.min_frac[X.name]
                X, _ = tree_binning(X, y, n=self.prebin, min_frac=min_frac, 
                                    encode=False, random_state=1024)
            elif self.prebin_method.lower() == 'equal_freq':
                X, _ = equal_frequency_binning(X, n=self.prebin, encode=False)
            else:
                raise ValueError('Only `tree` and `equal_freq` is supported for prebin_method.')

        # convert to mapping
        mapping = y.groupby(X).apply(list).to_dict()

        # set the overall expected ratio
        if len(mapping) == 0:
            return [-np.inf]

        self.expected_ratio = sum(sum(v) for v in mapping.values()) / sum(len(v) for v in mapping.values())
        # if the expected_ratio is 0 or 1 there should be only 1 group and
        # any not-null value will be encoded into 1
        if self.expected_ratio == 0 or self.expected_ratio == 1:
            return [-np.inf]

        n_bins = len(mapping)
        # merge bins based on chi square
        while n_bins > self.max_bin:
            mapping = self.merge_chisquare(mapping)
            n_bins = len(mapping)

        # merge bins to create mixed label in every bin
        if self.force_mix_label and n_bins > 1:
            is_pure = False
            while not is_pure:
                mapping, is_pure = self.merge_purity(mapping)

        # merge bins to keep bins to be monotonic
        if self.force_monotonic:
            while len(mapping) > 2 and not self.is_monotonic_post_bin(mapping):
                # mapping = self.merge_chisquare(mapping)
                mapping = self.merge_monotonic(mapping)

        # merge bins to meet the minimum sample size for each interval
        if self.min_interval_size > 0:
            if self.min_interval_size <= 1:
                # a fraction <= 1 is interpreted relative to the non-null count
                min_interval_size = self.min_interval_size * X.notnull().sum()
            else:
                min_interval_size = self.min_interval_size

            meet_interval_size = False
            while not meet_interval_size and len(mapping) > 2:
                mapping, meet_interval_size = self.merge_interval_size(mapping, min_interval_size)

        # clean up the cache
        self._chisquare_cache = dict()
        # NOTE(review): this returns a dict_keys view, not a list —
        # confirm callers do not index into it.
        return mapping.keys()
def stringify(a):
		if is_number(a):
			b = str(int(round(a,0)))
		elif type(a) == type([]):
def fraglist_clearner(fraglist):
	for frag in fraglist:
				n=0
			
			
				if is_number(frag) == True:
예제 #17
0
def _to_kv_quantity(value: object) -> str:
    if pd_types.is_number(value):
        return f'[{value} KiloVolt]'
    return ''
예제 #18
0
def zip_to_dcid(zip: object) -> str:
    """Return the Data Commons dcid for a numeric ZIP code.

    The code is left-padded with zeros to five characters; non-numeric
    inputs yield the empty string.
    """
    return f'dcid:zip/{zip:0>5}' if pd_types.is_number(zip) else ''
def generate_transitions(df):
    """Build MRM (CE/DP optimisation) transition method strings for *df*.

    For each compound row, the low/high fragment lists are parsed from the
    'Fragments (low)'/'Fragments (High)' columns, then transition lines
    (Q1, Q3, time, ID, DP, EP, CE, CXP) are accumulated into a per-compound
    method string keyed by method name in the returned dict.

    Remember to initialize a dictionary for the transitions.
    """
    # Reuse an existing CEDP_methods dict if one is already defined.
    try:
        CEDP_methods
    except NameError:
        CEDP_methods = {}

    # Fixed instrument parameters.
    EP = 10
    CXP = 4
    Time = 5

    for idx in range(len(df)):
        ln = df.loc[idx]
        inhouse = ln['inhouse'].strip()
        chemname = ln['name'].strip().capitalize()
        adduct = ln['Adduct'].strip()
        Q1 = ln['Q1'].round(5)
        Q1s = str(int(ln['Q1'].round()))

        if is_number(ln['Fragments (low)']) == True:
            fl = [ln['Fragments (low)']]
        else:
            fl = [i.strip("?!. ,") for i in ln['Fragments (low)'].split(',')]

        if is_number(ln['Fragments (High)']) == True:
            # BUG FIX: this branch previously assigned to `fl`, clobbering
            # the low-fragment list and leaving `fh` undefined, which raised
            # NameError below whenever the high-fragment cell was numeric.
            fh = [ln['Fragments (High)']]
        else:
            fh = [i.strip("?.! ,") for i in ln['Fragments (High)'].split(',')]

        best_list = []

        ## CE optimizations
        DP = 50
        method_string = ''
        method_name = f'200706_CEDP_%s_%s_MZ%s_Pos' %(inhouse, adduct, Q1s)

        ################
        for frag in fl[:6]:
            # NOTE(review): fl is mutated (remove) while iterating a slice
            # copy — items after a removed one can be skipped; confirm intent.
            if is_number(frag) == True:
                if pd.isna(frag) == True:
                    fl.remove(frag)
                if is_string_dtype(frag) == True:
                    try:
                        float(frag)
                    except:
                        fl.remove(frag)
                else:
                    try:
                        Q3 = float(frag.strip("?. ,!"))
                    except:
                        continue
            else:
                fl.remove(frag)

            if len(fl) >= 1:
                for Q3, CE in make_CE_MRMs(frag, range(10, 50, 5)):
                    try:
                        Q3s = str(int(Q3))
                    except:
                        continue

                    ID = f'%s_%s_%s_%s_DP%s_CE%s' %(inhouse, chemname, Q1s, Q3s, DP, CE)
                    lowfstr = '\t'.join([str(x) for x in (round(Q1,2),round(Q3,3),Time,ID,DP,EP,CE,CXP)])
                    method_string = method_string + lowfstr + '\n'
            else: pass
            CEDP_methods[method_name] = method_string

        ###################
        for frag in fh[:6]:
            if is_number(frag) == True:
                if pd.isna(frag) == True:
                    fh.remove(frag)
                if is_string_dtype(frag) == True:
                    try:
                        float(frag)
                    except:
                        fh.remove(frag)
                else:
                    try:
                        Q3 = float(frag.strip("?. ,!"))
                    except:
                        continue
            else:
                fh.remove(frag)

            if len(fh) >= 1:
                for Q3, CE in make_CE_MRMs(frag, range(40, 90, 5)):
                    try:
                        Q3s = str(int(Q3))
                    except:
                        continue

                    ID = f'%s_%s_%s_%s_DP%s_CE%s' %(inhouse, chemname, Q1s, Q3s, DP, CE)
                    lowfstr = '\t'.join([str(x) for x in (round(Q1,2),round(Q3,3),Time,ID,DP,EP,CE,CXP)])
                    method_string = method_string + lowfstr + '\n'
            else: pass
            CEDP_methods[method_name] = method_string

        ########
        # Take the best (first) low and high fragments for DP optimisation.
        if len(fl) >= 1:
            best_list.append(fl[0])
        if len(fh) >= 1:
            best_list.append(fh[0])
        if len(best_list) >= 1:

            # NOTE(review): hlswitch is never incremented, so CE is always
            # 25 here — confirm whether the high fragment should get CE=50.
            hlswitch = 0
            for frag in best_list:
                if hlswitch >= 1:
                    CE = 50
                else:
                    CE = 25
                if is_number(frag) == True:
                    if pd.isna(frag) == True:
                        best_list.remove(frag)
                if is_string_dtype(frag) == True:
                    try:
                        float(frag)
                    except:
                        best_list.remove(frag)
                else:
                    try:
                        Q3 = float(frag.strip("?. ,!"))
                    except:
                        continue

                if len(best_list) >= 1:
                    for Q3, DP in make_CE_MRMs(frag, range(10, 130, 10)):
                        try:
                            Q3s = str(int(Q3))
                        except:
                            continue
                    # NOTE(review): these three lines sit after the inner
                    # loop (unlike the CE blocks above), so only the last
                    # (Q3, DP) pair is emitted — confirm intent.
                    ID = f'%s_%s_%s_%s_DP%s_CE%s' %(inhouse, chemname, Q1s, Q3s, DP, CE)
                    lowfstr = '\t'.join([str(x) for x in (round(Q1,2),round(Q3,3),Time,ID,DP,EP,CE,CXP)])
                    method_string = method_string + lowfstr + '\n'
                else: pass
                CEDP_methods[method_name] = method_string
    return(CEDP_methods)