Example no. 1
def p005(lower_bound, upper_bound):
	"""Return the smallest number evenly divisible by every integer in [lower_bound, upper_bound] (the LCM of the range)."""
	factors = _Counter()
	for i in range(lower_bound, upper_bound + 1):
		for multiple, count in _Counter(factor(i)).items():
			if count > factors.get(multiple, 0):
				factors[multiple] = count
	return _reduce(_mul, (k ** v for k, v in factors.items()))
Example no. 2
def get_score(cards):
	"""Score a poker hand: return [hand rank] + card values ordered for tie-breaking."""
	main = None
	values = list(get_values(cards))
	while 1 in values:
		values.remove(1)
		values.append(14)
	if is_royal_flush(cards):
		main = 9
		values = [14, 13, 12, 11, 10]
	elif is_straight_flush(cards):
		main = 8
		if 1 in cards and 2 in cards:
			values.remove(14)
			values.append(1)
		values = sorted(values, reverse=True)
	elif is_four_of_a_kind(cards):
		main = 7
		values = list(_chain.from_iterable(map(lambda item: [item[0]] * item[1], sorted(_Counter(values).items(), key=_itemgetter(1), reverse=True))))
	elif is_full_house(cards):
		main = 6
		values = list(_chain.from_iterable(map(lambda item: [item[0]] * item[1], sorted(_Counter(values).items(), key=_itemgetter(1), reverse=True))))
	elif is_flush(cards):
		main = 5
		values = sorted(values, reverse=True)
	elif is_straight(cards):
		main = 4
		if 1 in cards and 2 in cards:
			values.remove(14)
			values.append(1)
		values = sorted(values, reverse=True)
	elif is_three_of_a_kind(cards):
		main = 3
		values = list(_chain.from_iterable(map(lambda item: [item[0]] * item[1], sorted(_Counter(values).items(), key=_itemgetter(1), reverse=True))))
		if values[3] < values[4]:
			values[3], values[4] = values[4], values[3]
	elif is_two_pair(cards):
		main = 2
		values = list(_chain.from_iterable(map(lambda item: [item[0]] * item[1], sorted(_Counter(values).items(), key=_itemgetter(1), reverse=True))))
		if values[0] < values[2]:
			values[0:2], values[2:4] = values[2:4], values[0:2]
	elif is_one_pair(cards):
		main = 1
		values = list(_chain.from_iterable(map(lambda item: [item[0]] * item[1], sorted(_Counter(values).items(), key=_itemgetter(1), reverse=True))))
		high = max(tuple(enumerate(values))[2:], key=_itemgetter(1))[0]
		values[2], values[high] = values[high], values[2]
		if values[3] < values[4]:
			values[3], values[4] = values[4], values[3]
	else:
		main = 0
		values = sorted(values, reverse=True)
	return [main] + values
Example no. 3
def broadcast_arrays(*args, **kwargs):
    """Broadcast the given tensors against each other (NumPy-style) using TensorFlow ops."""
    tensors = [*args]
    shapes = [t.get_shape().as_list() for t in tensors]
    max_rank = max(len(s) for s in shapes)

    for index, value in enumerate(shapes):
        if len(value) == max_rank:
            continue

        tensor = tensors[index]
        for _ in range(max_rank - len(value)):
            value.insert(0, 1)
            tensor = _tf.expand_dims(tensor, axis=0)
        tensors[index] = tensor

    broadcast_shape = []
    for index in range(max_rank):
        dimensions = [s[index] for s in shapes]
        repeats = _Counter(dimensions)
        if len(repeats) > 2 or (len(repeats) == 2
                                and 1 not in list(repeats.keys())):
            raise ValueError(
                "operands could not be broadcast together with shapes", shapes)
        broadcast_shape.append(max(repeats.keys()))

    for axis, dimension in enumerate(broadcast_shape):
        tensors = [
            _tf.concat([t] *
                       dimension, axis=axis) if t.get_shape()[axis] == 1 else t
            for t in tensors
        ]

    return tensors
Example no. 4
    def _get_qualifier(self, qualifier: str = _const.CHAR_EMPTY) -> str:
        # Makes sure the qualifier (when specified) ends with a separator and returns it.

        if not self._is_remote or qualifier is None:
            # If it's an SDE/remote workspace (regardless of qualifier),
            # or when the qualifier is None, set it to an empty string.
            qualifier = _const.CHAR_EMPTY
        if not qualifier and self._is_remote:
            try:
                # For Oracle databases, the user name is the qualifier. We could derive that from the connection
                # properties by doing a Describe() on the workspace. However, for other databases (e.g. MSSQL), this is
                # not so straight-forward. Moreover, Describe() tends to be quite slow on a whole workspace.
                # For this reason, we will iterate over a bunch of object names in the workspace (starting with
                # Feature Datasets - and when not found, Feature Classes and Tables) to try and fetch the most common
                # qualifier prefix.
                with _arcpy.EnvManager(workspace=self._path):
                    items = _arcpy.ListDatasets() or _arcpy.ListFeatureClasses(
                    ) or _arcpy.ListTables()
                    qkeys = (self._sep.join(item.split(self._sep)[:-1])
                             for item in items)
                    qualifier, _ = _Counter(qkeys).most_common()[0]
            except (AttributeError, IOError, RuntimeError):
                raise ValueError(
                    f'{Workspace.__name__} could not determine qualifier from SDE connection file'
                )

        return self._fix_qualifier(qualifier, self._sep)
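The prefix-counting trick above does not depend on arcpy; here is a minimal sketch of the same idea on plain strings (the object names and separator below are purely illustrative):

from collections import Counter

# Hypothetical SDE-style object names; everything before the last separator is the qualifier.
names = ['GISADMIN.Roads', 'GISADMIN.Parcels', 'GISADMIN.Rivers', 'SDE.Settings']
sep = '.'
prefixes = (sep.join(name.split(sep)[:-1]) for name in names)
qualifier, _count = Counter(prefixes).most_common(1)[0]
print(qualifier)  # -> 'GISADMIN'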
Example no. 5
def __dt_heuristic(direction, function, derivative_order, approximation_order,
                   *args):
    """
	informative stackexchange answers:
		https://math.stackexchange.com/a/2488893/68036
		https://math.stackexchange.com/a/819015/68036
	"""
    max_float_spacing_in_codomain = max(
        (_np.spacing(fi)) for fi in _np.ravel(function(*args)))

    recommended_dt = abs(max_float_spacing_in_codomain)**(
        1.0 / (derivative_order + approximation_order))
    trial_dts = [
        recommended_dt * (10.0**exponent) for exponent in range(0, 10)
    ]
    trial_dfs = [
        _finite_forward_difference(direction, dt)(function)(*args)
        for dt in trial_dts
    ]
    trial_dfs_rounded = [
        tuple(float(format(val, ".3e")) for val in _np.ravel(trial_df))
        for trial_df in trial_dfs
    ]
    counts = _Counter(trial_dfs_rounded)
    for rounded_value, _ in counts.most_common(1):
        first_common_dt = trial_dts[trial_dfs_rounded.index(rounded_value)]
        return first_common_dt
Example no. 6
def fontsize_get(iax):
    r"""
    Scan text elements and return a dictionary with their fontsizes.
    Parameters
    ----------
    iax : :obj:`~matplotlib.axes.Axes`

    Returns
    -------
    sizes : dict
        A dictionary with two lists,
        keyed with "n_polygons" and "other"
        * "n_polygons" contains a list in ascending
        order with the fontsizes of text elements
        of which there are as many as CirclePolygons
        This fact is a strong hint that those text
        elements correspond 1st to the residue-label texts
        and 2nd to the residue-SS-label texts
        * "other" contains all other fontsizes in
        :obj:`iax`, most likely belonging to the
        fragment-label texts
    """
    n_polygons = len([obj for obj in iax.artists if isinstance(obj, _CP)])
    sizes = {"n_polygons": [],
             "other": []}

    c = _Counter([_np.round(txt.get_fontsize(), 2) for txt in iax.texts])

    for key, val in c.items():
        sizes[{True: "n_polygons",
               False: "other"}[val == n_polygons]].append(key)

    return {key: sorted(val) for key, val in sizes.items()}
Example no. 7
def create_bag_of_words(corpus, senses):
    unique_word_counts = _Counter()
    senses_counter = _Counter()
    bow_dict = {}
    for sense in senses:
        bow_dict[sense] = _Counter()
        #make a Counter obj for every word sense
    for instance in corpus:
        #for every sample
        for word in instance[2]:
            #for every word in sample
            bow_dict[instance[1]][word] += 1
			# increment the count of this word for this sense
            unique_word_counts[word] += 1
			# increment the total number of times we've seen this word
        senses_counter[instance[1]] += 1
		# increment the number of times we've seen this sense
    return bow_dict, senses_counter, unique_word_counts
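A quick usage sketch, assuming each corpus instance is a tuple whose second element is the sense label and whose third element is the token list (as the indexing above implies); the toy data is made up:

corpus = [
    ('i1', 'bank.money', ['deposit', 'the', 'check', 'at', 'the', 'bank']),
    ('i2', 'bank.river', ['the', 'river', 'bank', 'was', 'muddy']),
]
senses = ['bank.money', 'bank.river']
bow, sense_counts, word_counts = create_bag_of_words(corpus, senses)
print(bow['bank.money']['the'])   # -> 2
print(sense_counts['bank.river']) # -> 1
print(word_counts['bank'])        # -> 2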
Example no. 8
def summarize(backup):
    ''' prints a brief summary on the given backup '''

    print()
    print_manifest(backup)
    print()
    print_status(backup)
    print()
    print("File stats")
    print("==========")
    c = _Counter(r.filetype for r in backup.filerecords())
    for k, v in c.items():
        print("{:25s} {:25s}".format(k, str(v)))
Example no. 9
def nmi(partition1, partition2):
    """
    Compute NMI between two partitions. If the input partitions are multilayer, this function computes the multilayer
    NMI.

    :param partition1: first input partition as mapping of node to mesoset
    :param partition2: second input partition as mapping of node to mesoset

    :return: NMI value (normalised by joint entropy)
    """
    n = len(partition1)
    if len(partition2) != n:
        raise ValueError("partitions need to have the same number of elements")
    p12 = _Counter((partition1[key], partition2[key]) for key in partition1)
    h12 = sum((p / n) * log2(p / n) for p in p12.values())

    p1 = _Counter(partition1.values())
    h1 = sum((p / n) * log2(p / n) for p in p1.values())

    p2 = _Counter(partition2.values())
    h2 = sum((p / n) * log2(p / n) for p in p2.values())

    return (h1 + h2 - h12) / h12
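A quick sanity check, assuming log2 comes from the math module and _Counter from collections: partitions that agree up to relabelling give an NMI of exactly 1.

part_a = {1: 'x', 2: 'x', 3: 'y', 4: 'y'}
part_b = {1: 'left', 2: 'left', 3: 'right', 4: 'right'}
print(nmi(part_a, part_b))  # -> 1.0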
Example no. 10
def count_sort(array_, reverse=False):
    need_sort, base = array_, 10
    if not isinstance(array_[0], int):
        base_dict = {v: k for k, v in enumerate(__CHAR)}
        base = base_dict[max(set(''.join([str(s[1:] if '-' in s else s) for s in array_])))] + 1
    need_sort = [_str2dec(base)(s) for s in array_]
    min_, max_ = min(need_sort), max(need_sort)
    count_map = _Counter(ele for ele in need_sort)
    sorted_array = [0] * len(array_)
    if not reverse:
        for i in range(min_ + 1, max_ + 1):
            count_map[i] += count_map[i - 1]
    else:
        for i in reversed(range(min_, max_)):
            count_map[i] += count_map[i + 1]
    for index, val in enumerate(reversed(need_sort)):
        sorted_array[count_map[val] - 1] = array_[-1 - index]
        count_map[val] -= 1
    return sorted_array
Example no. 11
def even_folds(corpus, senses, k):
    sense_list_in_order = []
    for sample in corpus:
        sense_list_in_order.append(sample[1])
    num_per_sense = _Counter(sense_list_in_order)
    count_per_fold = {}
    for sense in senses:
        total = num_per_sense[sense]
        not_last_fold = _ceil(total / k)
        total_not_last = (k-1)*not_last_fold
        last_fold = total - total_not_last
        count_per_fold[sense] = (not_last_fold, last_fold)
    index_by_sense = {}
    sense_index_counter = {}
    for sense in senses:
        index_by_sense[sense] = []
        sense_index_counter[sense] = 0
    for idx,sample in enumerate(corpus):
        index_by_sense[sample[1]].append(idx)
    indexes = []
    last_fold_counter = 0
    for fold in range(k):
        fold_indexes = []
        for sense in senses:
            # we have n lists of indexes for n senses;
            # we need to subset each of the lists into appropriate fold sizes
            if last_fold_counter < (k - 1):
                current_idx = 0
                while(current_idx < count_per_fold[sense][0]):
                    fold_indexes.append(index_by_sense[sense][sense_index_counter[sense]])
                    current_idx += 1
                    sense_index_counter[sense] += 1
            else:
                current_idx = 0
                while(current_idx < count_per_fold[sense][1]):
                    fold_indexes.append(index_by_sense[sense][sense_index_counter[sense]])
                    current_idx += 1
                    sense_index_counter[sense] += 1
        last_fold_counter += 1
        indexes.append(fold_indexes)
    return indexes
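A small usage sketch under the same assumption as above (sample[1] holds the sense label); with k=2, each sense contributes ceil(total/k) samples to every fold except the last:

corpus = [('a', 's1', []), ('b', 's1', []), ('c', 's1', []),
          ('d', 's2', []), ('e', 's2', [])]
print(even_folds(corpus, ['s1', 's2'], 2))  # -> [[0, 1, 3], [2, 4]]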
Example no. 12
def divideSpikes(spikes, blockStartT, blockEndT, blockSeq, flag):
    '''
    From spikes, generate a dictionary where keys are elements from blockSeq and values are ndarrays with all the spikes between blockStartT/EndT for that condition.
    
    input:
    ------
        spikes:         ndarray like with spike times
        
        blockStartT:    ndarray like with the start time of each block
        
        blockEndT:      ndarray like with the end time of each block
        
        blockSeq:       ndarray like with 'keys' identifying each block. Blocks with the same identifier will end up together.
                        keys can be integers, strings or any other immutable object

        Flag:           Decides between different types of computations on the spikes
                        0:      Spike times are not changed at all
                        1:      Spike times are changed as if all block sequences for a given condition were continuous
                                (the time for the first instance of each block seq is 0, the second instance starts from where the 1st left off and so on)
                        2:      Spike times are changed such that EVERY block seq starts from 0

    output:
    -------
        spikesOut:      a dictionary in which spikesOut[blockSeq[i]] is a ndarray with all the spikes associated with blockSeq[i]
                        Depending on 'flag' spike times might be modified.
                        
    Usage:
    ------
        Possible use of Flag 0
            Set random seed at the beginning and have a random stimuli alternating between conditions. Both conditions draw numbers from the same random stream.
        Possible use of Flag 1
            Set the seed for as many random streams as experimental conditions and alternate the conditions many times without resetting the seed
        Possible use of Flag 2
            Reset the seed when a stimulus is repeated
    '''

    # Make a dictionary where keys are blockSeq IDs and the values are the accumulated time under that condition. This is used when flag==1
    accumulatedTime = _Counter()

    # start a dictionary of empty arrays to which spikes will be added
    spikesOut = _defaultdict(lambda : _np.array([]))

    # add two spikes to 'spikes', one prior to the first blockStartT and one after the last blockEndT, to avoid special cases below. By adding these spikes, startIndex and lastIndex are always found
    preSpk = _np.array([blockStartT[0]-1])
    postSpk = _np.array([blockEndT[-1]+1])
    spks = _np.concatenate((preSpk, spikes, postSpk))

    #_pdb.set_trace()
    for i, startT in enumerate(blockStartT):
        # only assign spikes with meaningful blockSeq. Sometimes I want to exclude spikes from the analysis for example during adapting sequences.
        if blockSeq[i] is None:
            continue
        
        # find 1st spike in spikes that is greater than startT
        startIndex = _np.where(_np.diff(_np.sign(spks-startT)))[0][0]+1
        
        # find last spike in spikes that is smaller than BlockEndT[i]
        lastIndex = _np.where(_np.diff(_np.sign(spks-blockEndT[i])))[0][0]

        # grab only the spikes corresponding to this block
        blockSpks = spks[startIndex:lastIndex+1]

        # Modify spike times in this block according to flag
        if flag==0:
            pass
        elif flag==1:
            blockSpks -= sum(accumulatedTime.values()) - accumulatedTime[blockSeq[i]]
        elif flag==2:
            blockSpks -= startT

        #_pdb.set_trace()
        # Add spike times to spikesOut
        spikesOut[blockSeq[i]] = _np.concatenate((spikesOut[blockSeq[i]], blockSpks))

        # Keep track of accumulatedTime
        accumulatedTime[blockSeq[i]] += blockEndT[i] - blockStartT[i]

    return spikesOut
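A minimal sketch of the flag=2 behaviour described in the docstring (spike times re-referenced to the start of every block), assuming the module's _np, _Counter and _defaultdict imports; the toy times are made up:

spikes = _np.array([0.5, 1.5, 2.5, 4.5, 5.5])
blockStartT = _np.array([0.0, 2.0, 4.0])
blockEndT = _np.array([2.0, 4.0, 6.0])
blockSeq = ['A', 'B', 'A']
out = divideSpikes(spikes, blockStartT, blockEndT, blockSeq, 2)
print(out['A'])  # -> [0.5 1.5 0.5 1.5]
print(out['B'])  # -> [0.5]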
Example no. 13
def tex_table(*args,
              caption: str = '',
              numerate: bool = True,
              colors=(
                  'C0C0C0',
                  'EFEFEF',
                  'C0C0C0',
              ),
              color_frequency: int = 2,
              accuracy: float = 0.2,
              lab_fmt: bool = True):
    """
    Проще объяснить на примере
    Есть серии измерений теплоёмкости и температуры.
        T=Var(range(273,280), [1]*7)
        C=Var([9]*7, [0.1]*7)
    T измеряется в К, а С в Дж/К
    :param args: ('T, К', T), ('С, Дж/К', C)
    или (('T, К', T), ('С, Дж/К', C),)
    Сгенерируется таблица со значениями и погрешностями. Будут столбцы: 'T, К', '\Delta T, К', 'С, Дж/К', '\Delta С, Дж/К'
    Но если вам, например, не захотелось отображать погрешность T, то args должно иметь вид:
    ('T, К', T, False), ('С, Дж/К', C)
    :param caption: guess yourself
    :param numerate: Вводит нумерацию строчек
    :param colors: Таблица имеет разные цвета: цвет заголовков, цвет чередования, цвет столбца нумепрации.
    В формате цветов HTML можно менять их.
    :param color_frequency: Частота чередования
    :param accuracy: Точность, с которой отображается ошибка
    :param lab_fmt: Выносит степени 10 в заголовок
    """

    # Should this argument's uncertainty (error) column be included in the table?
    def ch_err(arg):
        return (len(arg) == 2) or arg[2] is True

    if hasattr(args[0], '__getitem__') and not hasattr(args[0][0], 'split'):
        args = args[0]
    try:
        height = args[0][1].__len__()
    except Exception:
        raise _ArgumentError
    table = list()
    if numerate:
        table.append([''] + list(map(str, range(1, height + 1))))
        NCol = 0  # index of the current column
    else:
        NCol = -1  # index of the current column

    for arg in args:
        if len(arg[1]) != height:
            raise TypeError('Wrong length of arguments')
        NCol += 1

        if lab_fmt is True:
            if ch_err(arg):
                # table.append(['$' + rus_tex_formula(arg[0]) + '$'])
                val = tuple(map(lambda x: _dc.Decimal(str(x)), arg[1].val()))
                err = tuple(map(lambda x: _dc.Decimal(str(x)), arg[1].err()))
                most_common_exp = _Counter(_get_eng_exp(v)
                                           for v in val).most_common(1)[0][0]
                if most_common_exp != 0:
                    # A ',' in the label means the user supplied a unit; otherwise the quantity is dimensionless
                    if ',' in arg[0]:
                        table.append([
                            '$' + rus_tex_formula(arg[0]) + '\\cdot 10^{' +
                            str(most_common_exp) + '} $'
                        ])
                    else:
                        table.append([
                            '$' + rus_tex_formula(arg[0]) + '\\cdot 10^{' +
                            str(-most_common_exp) + '} $'
                        ])
                else:
                    table.append(['$' + rus_tex_formula(arg[0]) + '$'])
                Arr_of_val = list()
                Arr_of_err = list()
                for i in range(height):
                    Rval, Rerr = _lab_decimal_style(val[i],
                                                    err[i],
                                                    accuracy=accuracy)
                    Arr_of_val += [_dec_normal(Rval.scaleb(-most_common_exp))]
                    Arr_of_err += [_dec_normal(Rerr.scaleb(-most_common_exp))]
                table[NCol] += Arr_of_val
                if most_common_exp != 0:
                    # A ',' in the label means the user supplied a unit; otherwise the quantity is dimensionless
                    if ',' in arg[0]:
                        table.append([
                            '$' + '\\Delta ' + rus_tex_formula(arg[0]) +
                            '\\cdot 10^{' + str(most_common_exp) + '} $'
                        ])
                    else:
                        table.append([
                            '$' + '\\Delta ' + rus_tex_formula(arg[0]) +
                            '\\cdot 10^{' + str(-most_common_exp) + '} $'
                        ])
                else:
                    table.append(
                        ['$' + '\\Delta ' + rus_tex_formula(arg[0]) + '$'])
                NCol += 1
                table[NCol] += Arr_of_err
            else:
                # table.append(['$' + rus_tex_formula(arg[0]) + '$'])
                val = tuple(map(lambda x: _dc.Decimal(str(x)), arg[1].val()))
                most_common_exp = _Counter(_get_eng_exp(v)
                                           for v in val).most_common(1)[0][0]
                if most_common_exp != 0:
                    # A ',' in the label means the user supplied a unit; otherwise the quantity is dimensionless
                    if ',' in arg[0]:
                        table.append([
                            '$' + rus_tex_formula(arg[0]) + '\\cdot 10^{' +
                            str(most_common_exp) + '} $'
                        ])
                    else:
                        table.append([
                            '$' + rus_tex_formula(arg[0]) + '\\cdot 10^{' +
                            str(-most_common_exp) + '} $'
                        ])
                else:
                    table.append(['$' + rus_tex_formula(arg[0]) + '$'])
                Arr_of_val = list()
                for i in range(height):
                    Rval, Rerr = _lab_decimal_style(val[i],
                                                    0,
                                                    accuracy=accuracy)
                    Arr_of_val += [_dec_normal(Rval.scaleb(-most_common_exp))]
                table[NCol] += Arr_of_val
        else:
            table.append(['$' + rus_tex_formula(arg[0]) + '$'])
            if ch_err(arg):
                table[NCol] += tuple(map(str, arg[1].val()))
                NCol += 1
                table[NCol] += tuple(map(str, arg[1].err()))
            else:
                table[NCol] += list(map(str, arg[1].val()))
    NCol += 1  # from now on this is the number of columns, not counting the numbering column (the leftmost one)

    ret = '\\begin{table}[ht]' + '\n' + '\\center' + '\n'
    if caption is None or caption is False:
        pass
    else:
        ret += '\\caption{' + caption + '}' + '\n'
    ret += '\\begin{tabular}{' + len(table) * '|c' + '|}' + '\n' + '\\hline' + '\n' + \
           '\\rowcolor[HTML]{' + colors[0] + '}' + '\n'

    for str_num in range(height + 1):
        if str_num % color_frequency == 0 and str_num != 0:
            ret += '\\rowcolor[HTML]{' + colors[1] + '} \n'
        if numerate and str_num != 0:
            ret += '\\cellcolor[HTML]{' + colors[2] + '} '
        for col_num in range(NCol):
            ret += table[col_num][str_num] + ' & '
        ret = ret[:-2] + '\\\\ \\hline' + '\n'
    ret += '\\end{tabular}' + '\n' + '\\end{table}' + '\n'
    return ret
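The docstring example spelled out; Var is assumed to be the project's value-with-uncertainty container exposing __len__(), .val() and .err(), as used above:

T = Var(range(273, 280), [1] * 7)
C = Var([9] * 7, [0.1] * 7)
latex = tex_table(('T, К', T), ('С, Дж/К', C), caption='Heat capacity of the sample')
print(latex)  # a {table}/{tabular} environment with numbered, colour-alternated rows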
Example no. 14
def is_two_pair(cards):
	return _Counter(_Counter(get_values(cards)).values()).get(2, 0) == 2
Example no. 15
def __init__(self):
    self._counter: CounterType[CounterKey] = _Counter()
Example no. 16
def is_three_of_a_kind(cards):
	return 3 in _Counter(get_values(cards)).values()
Example no. 17
def is_full_house(cards):
	values = _Counter(get_values(cards)).values()
	return 2 in values and 3 in values
Example no. 18
def is_four_of_a_kind(cards):
	return 4 in _Counter(get_values(cards)).values()
Example no. 19
def tfpred(gene_ids, species = 'hsa'):
    '''
    Predicts transcription factors driving differential gene expression.
    Parameters:
    ----------
    gene_ids : array_like, int
        1-D array of Entrez Gene IDs of differentially expressed genes.

    species : str
        Three-letter species ID. Currently, the function works only with human genes.
        Therefore, 'hsa' is the default value and should not be changed.

    Returns : DataFrame
        Returns a list of transcription factors ranked according to their probability 
        scores.
    '''
    # Check the input
    if type(gene_ids) is str:
        raise ValueError("'gene_ids' should be a list of IDs")

    try:
        gene_ids = list(gene_ids)
    except TypeError:
        print("'gene_ids' argument is not iterable.")
        raise

    if len(_np.array(gene_ids).shape) > 1:
        raise ValueError('The list of gene IDs should be flat')

    if len(gene_ids)<10:
        raise ValueError('The list of genes is too short')

    for i in range(len(gene_ids)):
        if type(gene_ids[i]) is not int:
            try:
                gene_ids[i] = int(gene_ids[i])
            except ValueError:
                print("An element of 'gene_ids' cannot be converted to an integer.")
                raise

    if species!='hsa':
        raise ValueError("Only 'hsa' option is currently supported.")
    
    # Load data
    file_list = ['tfdrive_data/pathways_hsa_full_hg38.parquet',
                 'tfdrive_data/go_terms_hsa_processes.parquet',
                 'tfdrive_data/tf_names_hsa.parquet',
                 'tfdrive_data/tf_families_hsa.parquet',
                 'tfdrive_data/tf_matrix_transfac.parquet',
                 'tfdrive_data/tf_matrix_chea.parquet',
                 'tfdrive_data/rf_mod.joblib',
                 'tfdrive_data/log_mod.joblib']
                 
    if not all([_os.path.isfile(f) for f in file_list]):
        raise OSError("Some required data files are missing.")

    # Table of gene ID to pathway associations
    pathways_db = _pd.read_parquet('tfdrive_data/pathways_hsa_full_hg38.parquet')
    pathways_db.drop_duplicates(['gene_id','pathway'],keep='first', inplace=True)

    gene_ids = list(set(gene_ids))
    if len([g for g in gene_ids if g in pathways_db['gene_id'].values]) < 10:
        raise ValueError("Too few or none of provided gene IDs were found in the database.")

    # Table of gene ID to GO terms (from the 'processes' category) associations
    go_terms_db = _pd.read_parquet('tfdrive_data/go_terms_hsa_processes.parquet')
    go_terms_db.drop_duplicates(['gene_id','GO_term'],keep='first', inplace=True)

    # Table of TF to TF ID associations
    tf_names_db = _pd.read_parquet('tfdrive_data/tf_names_hsa.parquet')

    # Load TF classification table
    tf_families_df = _pd.read_parquet('tfdrive_data/tf_families_hsa.parquet')

    # Tables of TF-gene interactions
    transfac_df = _pd.read_parquet('tfdrive_data/tf_matrix_transfac.parquet')
    chea_df = _pd.read_parquet('tfdrive_data/tf_matrix_chea.parquet')

    # Arbitrary distribution for calculating 'pathway importance'
    dist = _st.beta(2,2)

    # Prediction models
    rf_model = _load('tfdrive_data/rf_mod.joblib')
    log_model = _load('tfdrive_data/log_mod.joblib') 

    full_data_df = _pd.DataFrame(columns=['TF', 'TF_id', 'p_share', 
                                         'all_p', 'p_score', 'go_share', 
                                         'all_go', 'go_score'])

    for tf_id in tf_names_db.index:
        tf_name = tf_names_db.at[tf_id, 'gene']  

        # Get unique pathways and terms related to the TF and their counts
        tf_ptws = list(pathways_db[pathways_db['gene_id'].isin([tf_id])]['pathway'])
        tf_terms = list(go_terms_db[go_terms_db['gene_id'].isin([tf_id])]['GO_term'])
        tf_pw_count = _Counter(tf_ptws)
        tf_term_count = _Counter(tf_terms)
        all_p = len(tf_pw_count)
        all_go = len(tf_term_count)

        # Get unique pathways and terms related to differentially expressed genes
        # and their counts
        case_ptws = list(pathways_db[pathways_db['gene_id'].isin(gene_ids)]['pathway'])
        case_terms = list(go_terms_db[go_terms_db['gene_id'].isin(gene_ids)]['GO_term'])
        case_ptw_count = _Counter(case_ptws)
        case_term_count = _Counter(case_terms)

        # Calculate relative pathway/GO term importance
        max_pw_count = max(_Counter(pathways_db['pathway'].values).values())
        max_go_count = max(_Counter(go_terms_db['GO_term'].values).values())

        pw_df = _pd.DataFrame.from_dict(case_ptw_count, orient='index',columns=['count'])
        pw_df['importance'] = pw_df['count'].apply(lambda x: x/max_pw_count)
        pw_df['importance']  = dist.pdf(pw_df['importance'])

        go_df = _pd.DataFrame.from_dict(case_term_count, orient='index',columns=['count'])
        go_df['importance'] = go_df['count'].apply(lambda x: x/max_go_count)
        go_df['importance']  = dist.pdf(go_df['importance'])

        # Calculate relative pathway/term overlap and scores based on 'importance'
        common_p = 0
        common_go = 0
        p_score = 0.0
        go_score = 0.0
        coef = 100
        
        if all_p > 0:
            for i in tf_pw_count:
                if i in case_ptw_count:
                    common_p += 1
                    p_score += coef * pw_df.at[i, 'importance'] * pw_df.at[i, 'count'] / len(gene_ids)

        if all_go > 0:
            for t in tf_term_count:
                if t in case_term_count:
                    common_go += 1
                    go_score += coef * go_df.at[t, 'importance'] * go_df.at[t, 'count'] / len(gene_ids)
  
        p_share = common_p/all_p if all_p > 0 else 0.0
        go_share = common_go/all_go if all_go > 0 else 0.0
        
        # Add to DataFrame
        full_data_df.loc[len(full_data_df)] = [tf_name, tf_id, p_share, all_p,  
                                               p_score, go_share, all_go, go_score]

    # Drop TFs that have fewer than 3 associated pathways or GO terms
    full_data_df = full_data_df[full_data_df['all_p']>2].copy(deep=True)
    full_data_df = full_data_df[full_data_df['all_go']>2].copy(deep=True)

    # Calculate relative interaction frequencies for each family for each group
    tf_group_scores_df = _pd.DataFrame(columns=['family', 'transfac', 'chea'])
    gene_ids_str = [str(g) for g in gene_ids]
    for f in tf_families_df.index:
        tf_ids = tf_families_df.at[f, 'ids']
        # Transfac
        t_value = 0.0
        t_tf_ids = [t for t in tf_ids if t in transfac_df.columns]
        t_g_ids = [h for h in gene_ids_str if h in transfac_df.index]    
        if (len(t_tf_ids) > 0) & (len(t_g_ids) > 0):
            t_value = (transfac_df.loc[t_g_ids, t_tf_ids].sum().sum() / len(t_g_ids)) / \
                      (transfac_df[t_tf_ids].sum().sum() / len(transfac_df))
        # ChEA
        c_value = 0.0
        c_tf_ids = [c for c in tf_ids if c in chea_df.columns]
        c_g_ids = [h for h in gene_ids_str if h in chea_df.index]    
        if (len(c_tf_ids) > 0) & (len(c_g_ids) > 0):
            c_value = (chea_df.loc[c_g_ids, c_tf_ids].sum().sum() / len(c_g_ids)) / \
                      (chea_df[c_tf_ids].sum().sum() / len(chea_df))
        tf_group_scores_df.loc[len(tf_group_scores_df)] = [f, t_value, c_value]
            
    # Create TF to TF family association table
    tf_family_pairs = _pd.DataFrame(columns=['family'])
    for i in tf_families_df.index:
        for tf in tf_families_df.at[i, 'ids']:
            tf_family_pairs.loc[tf] = [i]
            
    # Assign values to full_data_df
    full_data_df['transfac'] = 0.0
    full_data_df['chea'] = 0.0
    for i in full_data_df.index:
        tf_id = str(full_data_df.at[i, 'TF_id'])
        if tf_id in tf_family_pairs.index:
            family = tf_family_pairs.at[tf_id, 'family']
            full_data_df.at[i, 'transfac'] = tf_group_scores_df[tf_group_scores_df['family'] == family]['transfac'].values[0]
            full_data_df.at[i, 'chea'] = tf_group_scores_df[tf_group_scores_df['family'] == family]['chea'].values[0]
        
        # If TF is not in the TF families, calculate individual values
        else:
            t_value = 0.0
            t_g_ids = [h for h in gene_ids_str if h in transfac_df.index]    
            if (tf_id in transfac_df.columns) & (len(t_g_ids) > 0):
                t_value = (transfac_df.loc[t_g_ids, tf_id].sum().sum() / len(t_g_ids)) / \
                    (transfac_df[tf_id].sum().sum() / len(transfac_df))
                full_data_df.at[i, 'transfac'] = t_value 

            c_value = 0.0
            c_g_ids = [h for h in gene_ids_str if h in chea_df.index]    
            if (tf_id in chea_df.columns) & (len(c_g_ids) > 0):
                c_value = (chea_df.loc[c_g_ids, tf_id].sum().sum() / len(c_g_ids)) / \
                    (chea_df[tf_id].sum().sum() / len(chea_df))
                full_data_df.at[i, 'chea'] = c_value 

    # Fill zeroes if only one ('chea' or 'transfac') value is available
    for i in full_data_df.index:
        if full_data_df.at[i, 'chea'] == 0.0:
            full_data_df.at[i, 'chea'] = full_data_df.at[i, 'transfac']
        if full_data_df.at[i, 'transfac'] == 0.0:
            full_data_df.at[i, 'transfac'] = full_data_df.at[i, 'chea'] 

    # Predictions of Random Forest model
    full_data_df['RF_prob'] = rf_model.predict_proba(full_data_df[['TF_id',
                                                                    'p_share','all_p',
                                                                    'p_score', 'go_share',
                                                                    'all_go', 'go_score',
                                                                    'transfac','chea']].values)[:,[1]]

    # Predictions of Logistic Regression model
    full_data_df['LR_prob'] = log_model.predict_proba(full_data_df[['p_share','p_score', 
                                                'go_share', 'go_score', 'RF_prob']].values)[:,[1]]


    full_data_df.drop(columns=['RF_prob', 'p_share', 'all_p', 'p_score', 'go_share', 'all_go', 
                               'go_score', 'transfac', 'chea'], inplace=True)
    intersection = [g for g in tf_names_db.index if g in gene_ids]
    full_data_df = full_data_df[~full_data_df['TF_id'].isin(intersection)].copy(deep=True)
    full_data_df.sort_values('LR_prob', ascending=False, inplace=True)
    full_data_df.reset_index(drop=True, inplace=True)

    return full_data_df
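A hedged usage sketch: the IDs below are merely illustrative Entrez gene IDs (the function needs at least 10 that exist in its pathway database), and the tfdrive_data/ files must be present in the working directory:

degs = [7157, 1026, 4193, 581, 596, 472, 675, 5925, 2064, 673]  # illustrative Entrez IDs
ranked = tfpred(degs, species='hsa')
print(ranked.head())  # transcription factors sorted by LR_prob, highest first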
Example no. 20
def count(x):
    return _Counter(x)
Example no. 21
def is_one_pair(cards):
	values = _Counter(get_values(cards)).values()
	return 2 in values and len(values) == 4
Example no. 22
def mode(lst):
    """Find statistical mode of iterable of hashable objects."""
    counter = _Counter(lst)
    return counter.most_common(1)[0]
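Note that most_common(1)[0] is a (value, count) pair, not the bare value:

print(mode(['a', 'b', 'b', 'c']))     # -> ('b', 2)
print(mode(['a', 'b', 'b', 'c'])[0])  # -> 'b'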
Example no. 23
def residues_from_descriptors(
    residue_descriptors,
    fragments,
    top,
    pick_this_fragment_by_default=None,
    fragment_names=None,
    additional_resnaming_dicts=None,
    extra_string_info='',
    just_inform=False,
):
    r"""
    Returns residue idxs based on a list of residue descriptors.

    Fragments are needed to better identify residues. If a residue
    is present in multiple fragments, the user can disambiguate
    or pick all residue idxs matching the :obj:`residue_descriptor`

    Because of this (one descriptor can match more than one residue)
    the return values are not necessarily of the same length
    as :obj:`residue_descriptors`

    Parameters
    ----------
    residue_descriptors: string or list of of strings
        AAs of the form of "GLU30" or "E30" or 30, can be mixed
    fragments: iterable of iterables of integers
        The integers in the iterables of 'fragments'
        represent residue indices of that fragment
    top: :obj:`~mdtraj.Topology`
    pick_this_fragment_by_default: None or integer.
        Pick this fragment without asking in case of ambiguity.
        If None, the user will be prompted.
    fragment_names:
        list of strings providing informative names for the input :obj:`fragments`
    additional_resnaming_dicts : dict of dicts, default is None
        Dictionary of dictionaries. Lower-level dicts are keyed
        with residue indices and valued with additional residue names.
        Higher-level keys can be whatever. Use case is e.g. if "R131"
        needs to be disambiguated because it pops up in many fragments.
        You can pass {"BW": {895: "3.50", ...}} here and that label
        will be displayed next to the residue. :obj:`mdciao.cli`
        methods use this.
    just_inform : bool, default is False
        Just inform about the AAs, don't ask for a selection
    extra_string_info: string with any additional info to be printed in case of ambiguity

    Returns
    -------
    residxs : list
        lists of integers that have been selected
    fragidxs : list
        The list of fragments where the residues are
    """
    residxs = []
    fragidxs = []
    last_answer = '0'

    if isinstance(residue_descriptors, (str, int)):
        residue_descriptors = [residue_descriptors]

    for key in residue_descriptors:
        cands = _np.array(
            find_AA(str(key), top, extra_columns=additional_resnaming_dicts))
        cand_fragments = _force_iterable(
            _np.squeeze(_in_what_N_fragments(cands, fragments)))
        # TODO refactor into smaller methods
        if len(cands) == 0:
            print("No residue found with descriptor %s" % key)
            residxs.append(None)
            fragidxs.append(None)
        elif len(cands) == 1:
            if len(cand_fragments) > 1:
                raise ValueError("Your fragment definitions overlap, "
                                 "res_idx %u (%s) is found in fragments  %s" %
                                 (cands[0], top.residue(cands[0]), ', '.join(
                                     [str(fr) for fr in cand_fragments])))
            elif len(cand_fragments) == 0:

                raise ValueError("Your fragment definitions do not contain "
                                 "the residue of interest %u (%s)." %
                                 (cands[0], top.residue(cands[0])))
            residxs.append(cands[0])
            fragidxs.append(cand_fragments[0])
            if just_inform:
                istr = residue_line("0.0",
                                    top.residue(residxs[-1]),
                                    fragidxs[-1],
                                    additional_resnaming_dicts,
                                    fragment_names=fragment_names)
                print(istr)
        else:
            istr = "ambiguous definition for AA %s" % key
            istr += extra_string_info
            if not just_inform:
                print(istr)
            cand_chars = _np.hstack(
                [['%s.%u' % (key, ii) for ii in range(n)]
                 for key, n in _Counter(cand_fragments).items()]).tolist()
            for cc, ss, char in zip(cands, cand_fragments, cand_chars):
                istr = residue_line(char, top.residue(cc), ss,
                                    additional_resnaming_dicts)
                print(istr)
            if just_inform:
                print()
                residxs.extend([ii for ii in cands if ii not in residxs])
                fragidxs.extend(
                    [ii for ii in cand_fragments if ii not in fragidxs])
                continue
            if pick_this_fragment_by_default is None:
                prompt = "Input one fragment idx out of %s and press enter (selects all matching residues in that fragment).\n" \
                         "Use one x.y descriptor in case of repeated fragment index.\n" \
                         "Leave empty and hit enter to repeat last option [%s]" % (
                         [int(ii) for ii in _np.unique(cand_fragments)], last_answer)

                answer = input(prompt)
            else:
                answer = str(pick_this_fragment_by_default)
                print("Automatically picked fragment %u" %
                      pick_this_fragment_by_default)

            if len(answer) == 0:
                answer = last_answer

            if str(answer).isdigit():
                answer = int(answer)
                assert answer in cand_fragments
                idxs_w_answer = _np.argwhere(
                    [answer == ii for ii in cand_fragments]).squeeze()
                cands = cands[idxs_w_answer]
            elif '.' in answer and answer in cand_chars:
                idx_w_answer = _np.argwhere(
                    [answer == ii for ii in cand_chars]).squeeze()
                answer = cand_fragments[idx_w_answer]
                cands = cands[idx_w_answer]
            else:
                raise ValueError("%s is not a possible answer" % answer)
                # TODO implement 'k' for keeping this answer from now on

            assert answer in cand_fragments, (
                "Your answer has to be an integer in the of the fragment list %s"
                % cand_fragments)
            last_answer = answer

            residxs.extend([int(ii) for ii in _force_iterable(cands)])
            fragidxs.extend([int(answer) for __ in _force_iterable(cands)])

    return residxs, fragidxs
Example no. 24
                        help='Sort by letter frequency, highest first')
    return parser


if __name__ == '__main__':
    args = parse_command_line().parse_args()

    # Get input text from chosen source
    if args.input_text is not None:
        text = args.input_text
    elif args.input_filename is not None:
        with args.input_filename as filename:
            text = filename.read()

    # Analyze input text
    frequency = _Counter({c: 0 for c in _alphabet})
    frequency.update(c for c in text.lower() if c in frequency)
    total_letters = sum(frequency.values())
    total_chars = len(text.replace('\n', ''))  # do not count newline chars

    # Output results
    if not args.sortfreq:
        output = sorted(frequency.items(), key=lambda x: x[0], reverse=False)
    else:
        output = sorted(frequency.items(), key=lambda x: x[1], reverse=True)
    for c, n in output:
        if not (args.nonulls and n == 0):
            print('{}:{:>8}{:10.2f}%'.format(c, n,
                                             100 * n / (total_letters or 1)))
    if args.total:
        print('Total letters:   {:>8}\nTotal characters:{:>8}'.format(