Example #1
from collections import Counter as cntr

def get_common_words(doc, num):
    # Collect cleaned tokens (cleanup is assumed to be defined elsewhere),
    # skipping stop words, punctuation, and tokens containing mojibake
    # characters, newlines, or spaces.
    words = [
        cleanup(token.text) for token in doc
        if not token.is_stop and not token.is_punct
        and not any(c in token.text for c in ['â', '—', '¦', '|', 'ï', '\n', ' '])
    ]
    # Build the counter once and reuse it for the most-common summary.
    counts = cntr(words)
    return counts, ' '.join(word for word, _ in counts.most_common(num))
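
A minimal usage sketch, assuming `doc` is a spaCy `Doc`; the `cleanup` stub below is a hypothetical stand-in for the project's real normalizer:

import spacy

def cleanup(text):
    # hypothetical stand-in for the real cleanup helper
    return text.strip().lower()

nlp = spacy.load("en_core_web_sm")
doc = nlp("The quick brown fox jumps over the lazy dog. The fox runs.")
counts, top = get_common_words(doc, 3)
print(top)  # e.g. "fox quick brown"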
Example #2
import string

import nltk
import pymorphy2
from bs4 import BeautifulSoup
from collections import Counter as cntr
from nltk.util import ngrams

morph = pymorphy2.MorphAnalyzer()
# stop_words is assumed to be a module-level set of stop words.

def tokenize_me(input_text):
    # Strip HTML and tokenize the lower-cased plain text.
    soup = BeautifulSoup(input_text, features="html.parser")
    text = soup.get_text()
    tokens = nltk.word_tokenize(text.lower())
    tokens = [i for i in tokens if i not in string.punctuation]

    # Lemmatize with pymorphy2, keeping nouns, verbs, infinitives, and
    # adjectives, plus unknown and Latin tokens.
    normalized_tokens = []
    for t in tokens:
        t = t.strip(string.punctuation)
        if len(t) > 1:
            new_token = morph.parse(t)[0]
            if new_token.tag.POS is None:
                if str(new_token.tag) in ['UNKN', 'LATN']:
                    normalized_tokens.append(new_token.normal_form)
            elif new_token.tag.POS in ['NOUN', 'VERB', 'INFN', 'ADJF', 'ADJS']:
                normalized_tokens.append(new_token.normal_form)

    # Drop stop words, then build bigrams from the ordered token list
    # (before the set conversion, so adjacency still means something).
    tokens = [t for t in normalized_tokens if t not in stop_words]
    bigrams = ngrams(tokens, 2)
    tokens = set(tokens)
    # Store each bigram with its words in alphabetical order.
    for k1, k2 in cntr(bigrams):
        if k1 < k2:
            tokens.add(k1 + " " + k2)
        else:
            tokens.add(k2 + " " + k1)

    return tokens
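
A usage sketch, assuming the pymorphy2 analyzer and a Russian `stop_words` set are set up as above; the HTML input is hypothetical:

html = "<p>Коты любят молоко. Коты спят.</p>"  # hypothetical input
print(tokenize_me(html))
# e.g. {'кот', 'любить', 'молоко', 'спать', 'кот любить', ...}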
Example #3
from collections import Counter as cntr

def get_common_nouns(doc, num):
    # Collect cleaned noun tokens (cleanup is assumed to be defined
    # elsewhere), skipping stop words, punctuation, and tokens containing
    # mojibake characters, newlines, or spaces.
    nouns = [
        cleanup(token.text) for token in doc
        if not token.is_stop and not token.is_punct and token.pos_ == "NOUN"
        and not any(c in token.text for c in ['â', '—', '¦', '|', 'ï', '\n', ' '])
    ]
    return ' '.join(noun for noun, _ in cntr(nouns).most_common(num))
Example #4
def problem_64():
    from collections import Counter as cntr
    _ = input()  # element count, not needed for the computation
    c = cntr(map(int, input().split()))
    ans = 0
    # A value k may keep at most k occurrences; every extra copy is removed,
    # and values with fewer than k copies are removed entirely.
    for k, v in c.items():
        if k <= v:
            ans += v - k  # keep exactly k copies, drop the surplus
        else:
            ans += v      # too few copies to keep any, drop them all
    print(ans)
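
A worked check of the counting rule with a hypothetical test case: in the multiset {1, 2, 2, 3, 3}, value 1 keeps its single copy, value 2 keeps both copies, but value 3 appears only twice and would need three copies, so both 3s are removed:

import io
import sys

sys.stdin = io.StringIO("5\n1 2 2 3 3\n")  # hypothetical input
problem_64()  # prints 2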
Example #5
import timeit
from collections import Counter as cntr

def allRepeatedMotifs(dna, minimum, maximum):
    # Ensure the DNA string is in upper case.
    dna = dna.upper()

    # Count every substring with length from minimum up to (but not
    # including) maximum; the +1 lets the window reach the last base.
    start = timeit.default_timer()
    substrs = cntr()
    for n in range(minimum, maximum):
        for i in range(len(dna) - n + 1):
            substrs[dna[i:i + n]] += 1
    stop = timeit.default_timer()
    print('> Get all repeated substrings:', round(stop - start, 3), 's')

    # Keep only motifs that repeat at least twice, sorted by motif length,
    # longest first.
    start = timeit.default_timer()
    d = {k: v for k, v in sorted(substrs.items(),
                                 key=lambda item: len(item[0]),
                                 reverse=True) if v >= 2}
    stop = timeit.default_timer()
    print('> Filter and sort motifs with at least two repeats:',
          round(stop - start, 3), 's')

    # Reformat the dictionary into a list of [motif, length in bp] pairs.
    start = timeit.default_timer()
    ls = [[key, len(key)] for key in d]
    stop = timeit.default_timer()
    print('> Reformat dictionary to list, including motif length (bp):',
          round(stop - start, 3), 's')
    print('> In total there are', len(ls), 'motifs with at least two repeats.')

    return ls
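
A usage sketch with a hypothetical toy sequence (a real run would use genome-scale input and wider length bounds):

dna = "ATGATGATGCCC"  # hypothetical toy sequence
# motifs of length 3 to 5 that occur at least twice, longest first
for motif, length in allRepeatedMotifs(dna, 3, 6):
    print(motif, length)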
Example #6
from collections import Counter as cntr

def calculateBasins(grid):
    n = len(grid)
    basins = list(range(n * n))  # every cell starts as its own basin

    # disjoint-set-union helper methods
    def find(u):
        return u if basins[u] == u else find(basins[u])

    def union(u, v):
        basins[find(v)] = find(u)

    # Union each cell with the neighbouring cell it drains into;
    # get_min (defined elsewhere) is expected to return those coordinates.
    for i in range(n):
        for j in range(n):
            mini, minj = get_min(grid, i, j, n)
            union(i * n + j, mini * n + minj)

    # Count the cells in each basin and sort the sizes, largest first.
    return sorted(cntr(find(u) for u in range(n * n)).values(), reverse=True)
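
A runnable sketch with a hypothetical `get_min` that returns the coordinates of the lowest cell among (i, j) and its four neighbours, so every cell is unioned with the cell it drains into:

def get_min(grid, i, j, n):
    # hypothetical helper: lowest of (i, j) and its 4-neighbours
    best = (i, j)
    for di, dj in ((-1, 0), (1, 0), (0, -1), (0, 1)):
        ni, nj = i + di, j + dj
        if 0 <= ni < n and 0 <= nj < n and grid[ni][nj] < grid[best[0]][best[1]]:
            best = (ni, nj)
    return best

grid = [[2, 1, 2],
        [3, 4, 3],
        [2, 1, 2]]  # hypothetical heightmap
print(calculateBasins(grid))  # [6, 3]: basin sizes, largest first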
Example #7
    def split_on_best_feature(self, dataX, dataY):
        # Resource: Primary Resource -> No. 2
        # Assumes module-level: import numpy as np;
        #                       from collections import Counter as cntr;
        #                       from scipy.stats import pearsonr as pr
        #
        # Main goal: choose the best feature to split on, i.e. the feature
        # with the highest absolute correlation with dataY.
        # Rules:
        #   1. The splitting value is the median of the splitting feature's
        #      values.
        #      1a. If several features tie, choose the one that comes first.
        #   2. If the selected best feature cannot split the data, fall back
        #      to the next best feature.
        #      2a. If no feature can split the data, return a leaf.
        #
        # Params:
        #   dataX : numpy ndarray -> x values at this node
        #   dataY : numpy 1d array -> y values at this node
        #
        # Returns:
        #   Tree: a numpy ndarray; each row is a node holding
        #         [feature index (-1 for a leaf), splitting value,
        #          left-child offset, right-child offset].

        # Too few samples left: return a leaf labelled with the majority value.
        if dataX.shape[0] <= self.leaf_size:
            return np.array([-1, cntr(dataY).most_common(1)[0][0], np.nan, np.nan])

        # The available list of features.
        available_features = list(range(dataX.shape[1]))

        # Tuples: (<feature index>, <absolute correlation with dataY>).
        feature_correlations = []
        for ftr_itr in range(dataX.shape[1]):
            absolute_correlation_value = abs(pr(dataX[:, ftr_itr], dataY)[0])
            # A NaN correlation (e.g. a constant column) is treated as 0.0.
            if np.isnan(absolute_correlation_value):
                absolute_correlation_value = 0.0
            feature_correlations.append((ftr_itr, absolute_correlation_value))

        # Sort so the most strongly correlated feature comes first.
        # Reference for sorting: https://docs.python.org/2.7/howto/sorting.html
        feature_correlations.sort(key=lambda fc: fc[1], reverse=True)

        # If the total number of available features is 0, return a leaf.
        if not available_features:
            return np.array([-1, cntr(dataY).most_common(1)[0][0], np.nan, np.nan])

        # Try features from best to worst until one actually splits the data.
        feature_rank = 0
        while available_features:
            y = best_feature_itr = feature_correlations[feature_rank][0]

            # Split on the median of the chosen feature (Primary Resource No. 2).
            split_val = np.median(dataX[:, y])
            left_index = dataX[:, y] <= split_val
            right_index = dataX[:, y] > split_val

            # If both sides are non-empty, the split worked: stop searching.
            if len(np.unique(left_index)) != 1:
                break
            # Otherwise retire this feature and try the next best one.
            available_features.remove(y)
            feature_rank += 1

        # If we ran out of features without finding a usable split, return a leaf.
        if not available_features:
            return np.array([-1, cntr(dataY).most_common(1)[0][0], np.nan, np.nan])

        # Build the left branch, then the root.
        lefttree = self.split_on_best_feature(dataX[left_index], dataY[left_index])

        # Starting row of the right subtree relative to the current root.
        # https://docs.scipy.org/doc/numpy-1.14.0/reference/generated/numpy.ndarray.ndim.html
        if lefttree.ndim == 1:
            righttree_start = 2
        else:
            righttree_start = lefttree.shape[0] + 1
        root = np.array([best_feature_itr, split_val, 1, righttree_start])

        return np.vstack((root, lefttree,
                          self.split_on_best_feature(dataX[right_index],
                                                     dataY[right_index])))
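
For intuition, a sketch of the flattened array the method returns, with hypothetical values; offsets in the last two columns count rows below the current node:

import numpy as np

# Hypothetical tree for: if x[0] <= 0.5 predict 1.0, else predict 2.0
tree = np.array([
    [0,  0.5, 1,      2     ],  # root: split feature 0 at 0.5
    [-1, 1.0, np.nan, np.nan],  # left leaf, one row below the root
    [-1, 2.0, np.nan, np.nan],  # right leaf, two rows below the root
])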
Example #8
import glob
import re
from collections import Counter as cntr

# gatelog_ids, outdir_ids, gatelogs, outdirs, and the search pattern w
# are assumed to be defined earlier in the script.
errlist = list(gatelog_ids - outdir_ids)

# Collect the log files whose job id has no matching output directory.
gatelogs_werr = []
for x in errlist:
    gatelogs_werr.extend(glob.glob("./**/gatejob*" + str(x)))

print("--------------Joblogs with errors----------------")
print("joblogs without matching outputdir:")
for logf in gatelogs_werr:
    print(logf)
    with open(logf) as f:
        # Print each matching error line once, skipping immediate repeats.
        preverr = ""
        for line in f:
            if re.search(w, line, re.IGNORECASE) and preverr != line:
                preverr = line
                print(line, end=' ')

print("-------------------------------------------------")

# If the logs live in per-run subdirectories, report relaunch counts per run.
run_ids = []
if len(gatelogs_werr[0].split('/')) > 2:
    for logf in gatelogs_werr:
        run_ids.append(logf.split('/')[-2])

    for key, val in cntr(run_ids).items():
        print("Relaunch", val, "jobs in", key)
else:
    print("Relaunch", len(gatelogs) - len(outdirs), "jobs.")
Example #9
# Python 2 variant of the previous snippet.
import glob
import re
from collections import Counter as cntr

# gatelog_ids, gatelogs, outdirs, and the search pattern w are assumed
# to be defined earlier in the script.
outdir_ids = set([int(x.split('/')[-1].split('.')[-1]) for x in outdirs])

errlist = list(gatelog_ids - outdir_ids)

# Collect the log files whose job id has no matching output directory.
gatelogs_werr = []
for x in errlist:
    gatelogs_werr.extend(glob.glob("./**/gatejob*" + str(x)))

print "--------------Joblogs with errors----------------"
print "joblogs without matching outputdir:"
for logf in gatelogs_werr:
    print logf
    with open(logf) as f:
        # Print each matching error line once, skipping immediate repeats.
        preverr = ""
        for line in f:
            if re.search(w, line, re.IGNORECASE) and preverr != line:
                preverr = line
                print line,

print "-------------------------------------------------"

# If the logs live in per-run subdirectories, report relaunch counts per run.
run_ids = []
if len(gatelogs_werr[0].split('/')) > 2:
    for logf in gatelogs_werr:
        run_ids.append(logf.split('/')[-2])

    for key, val in cntr(run_ids).items():
        print "Relaunch", val, "jobs in", key
else:
    print "Relaunch", len(gatelogs) - len(outdirs), "jobs."