def update(self, *args, **kwargs):
    """D.update([E, ]**F) -> None.  Add counts from E and F to D.

    A single positional argument that offers a ``replace`` method (for
    example a string) has its spaces removed and is parsed with
    ``Exponents.fromstr`` before being handed to ``Counter.update``.
    Anything else, including no positional argument at all, is passed
    to ``Counter.update`` unchanged.  Keyword arguments are always
    forwarded.

    Raises:
        TypeError: if a string-like first argument is followed by
            further positional arguments.

    **Example:**

    >>> e = Exponents()
    >>> e.update(a=1)
    >>> e.update(dict(b=2), c=3)
    >>> e.update('1/d')
    >>> e # doctest: +SKIP
    Exponents({'a': 1, 'b': 2, 'c': 3, 'd': -1})

    .. testcleanup::

        >>> assert e == dict(a=1, b=2, c=3, d=-1)
    """
    try:
        # Duck-typed probe: only string-like objects get past this line.
        compact = args[0].replace(' ', '')
    except (IndexError, AttributeError):
        # No positional argument, or not string-like: plain Counter update.
        Counter.update(self, *args, **kwargs)
        return

    if len(args) > 1:
        raise TypeError("update expected at most 1 arguments, got %i"
                        % len(args))
    parsed = Exponents.fromstr(compact)
    Counter.update(self, parsed, **kwargs)
def __setitem__(self, item, val):
    """Store ``val`` under ``item``, purging low-ranked keys first if needed.

    When the counter holds more than ``2 * self._max`` keys, every key
    ranked below the ``self._max`` most common ones is deleted before
    the new value is stored.

    Args:
        item: key to set.
        val: value to store for ``item``.
    """
    if len(self) > self._max * 2:
        print("Purging")
        # most_common() yields (key, count) pairs.  The key must be
        # unpacked: deleting the pair itself is a silent no-op because
        # Counter.__delitem__ ignores missing keys.
        for key, _count in self.most_common()[self._max:]:
            del self[key]
    Counter.__setitem__(self, item, val)
def __init__(self, _dict=None, **kwargs):
    """Build the counter from a mapping of powers or from keyword arguments.

    With no ``_dict`` the constructor re-enters itself using the keyword
    arguments as the mapping.  When ``_dict`` is given, the keyword
    arguments are not used.  After the values are loaded,
    ``self.clean()`` is called.

    Raises:
        TypeError: if any value of the mapping is not a ``Rational``.
    """
    if _dict is None:
        self.__init__(kwargs)
        return

    for power in _dict.values():
        if not isinstance(power, Rational):
            raise TypeError("powers of dimensions must be rational")

    Counter.__init__(self, _dict)
    self.clean()
def __init__(self, n, *args, **kwargs):
    """Create a counter that keeps at most ``n`` entries.

    Remaining arguments are forwarded to ``Counter.__init__``.  If that
    leaves ``n`` or more entries, ``self._worst`` is set to the
    ``(key, value)`` pair ranked ``n``-th by ``most_common`` and every
    lower-ranked key is deleted.  Otherwise ``self._worst`` stays
    ``None``.

    Args:
        n: capacity; asserted to be at least 1.
    """
    assert n >= 1
    self._size = n
    self._worst = None
    Counter.__init__(self, *args, **kwargs)

    if len(self) < n:
        return

    self._worst = self.most_common(n)[-1]
    overflow = [key for key, _value in self.most_common()[n:]]
    for key in overflow:
        del self[key]
def __setitem__(self, k, v):
    """Insert ``k -> v`` while keeping at most ``self._size`` entries.

    * A key that is already present is left untouched; its stored value
      is asserted to equal ``v``.
    * While the container is full (``self._worst`` is set), a value not
      strictly greater than the current worst value is dropped;
      otherwise the worst entry is evicted to make room.
    * Whenever the container is exactly full after the insert,
      ``self._worst`` is recomputed as the ``(key, value)`` pair with
      the smallest value.
    """
    if k in self:
        assert self[k] == v
        return
    if self._worst is not None:
        # the beam is full
        if v <= self._worst[1]:
            return
        del self[self._worst[0]]
    Counter.__setitem__(self, k, v)
    assert self._size >= len(self)
    if self._size == len(self):
        # Index into the pair rather than unpacking it in the lambda
        # signature: tuple parameters are a syntax error on Python 3.
        self._worst = min(self.items(), key=lambda item: item[1])
def wanabeknn(k=15):
    """Label every test row by voting among its ``k`` most similar training rows.

    Reads ``minidata/trainingData.csv`` and ``minidata/testData.csv``
    (tab-separated integers) and ``minidata/trainingLabels.csv``
    (comma-separated integers).  Similarity between two rows is the sum
    of element-wise minima; larger means closer.  For each test row the
    labels of the ``k`` closest training rows are pooled, the closest
    ``k // 3`` rows are counted a second time, and the ``n // k`` most
    frequent labels are kept, where ``n`` is the number of labels pooled
    from the first ``k`` rows.

    The result list is printed and pickled to
    ``rezPickled/wnbknn<k>.pickled``.  ``cPickle`` must be provided by
    the enclosing module.

    Args:
        k: number of neighbours that vote.
    """
    from collections import Counter

    def read_rows(path, sep):
        # Parse one integer row per line; the file is closed on return.
        with open(path) as handle:
            return [[int(cell) for cell in line.strip().split(sep)]
                    for line in handle]

    trd = read_rows("minidata/trainingData.csv", "\t")
    ted = read_rows("minidata/testData.csv", "\t")
    lab = read_rows("minidata/trainingLabels.csv", ",")

    def dist(a, b):
        return sum(min(a[i], b[i]) for i in range(len(a)))

    rez = []
    for v in ted:
        # Progress line: rows done so far, and whether any result exists yet.
        print("hurej %4d %3d" % (len(rez), len(rez[-1:])))
        t = [(dist(train, v), trindex) for trindex, train in enumerate(trd)]
        tt = sorted(t, reverse=True)
        ll = []
        for i in range(k):
            ll += lab[tt[i][1]]
        n = len(ll)
        # The closest third of the neighbours votes twice.
        for i in range(k // 3):
            ll += lab[tt[i][1]]
        rez.append([x[0] for x in Counter(ll).most_common(n // k)])
    print(rez)
    # Protocol -1 is a binary pickle format, so the file must be binary.
    with open("rezPickled/wnbknn%d.pickled" % k, "wb") as out:
        cPickle.dump(rez, out, -1)
def addFakeData(oData, oLabels, count=100, low=10):
    """Oversample rows that carry rare labels, then shuffle rows and labels.

    In each of ``count`` rounds every row is weighted by the sum of the
    current global frequencies of its labels.  Among the lowest-weighted
    ``max(low * 10, 200)`` rows, ``low`` are picked at random and copies
    of them (data and labels) are appended.  Finally the rows are
    shuffled, with each data row kept paired with its labels.

    Args:
        oData: list of data rows (each row sliceable); not modified.
        oLabels: list of label lists, parallel to ``oData``; not modified.
        count: number of oversampling rounds.
        low: number of rows duplicated per round.

    Returns:
        Tuple ``(data, labels)`` of new, shuffled lists.
    """
    data = oData[:]
    labels = oLabels[:]
    for _round in range(count):
        freq = Counter(chain(*labels))
        # teze = [sum([ dlc[y]**2 for y in x]) for x in labels]
        weights = [sum(freq[y] for y in row) for row in labels]
        ranked = sorted((weight, idx) for idx, weight in enumerate(weights))
        tt = ranked[:max(low * 10, 200)]
        shuffle(tt)
        duplicate = [idx for _weight, idx in tt[:low]]
        # Copy before extending so the new rows are independent objects.
        new_labels = [labels[i][:] for i in duplicate]
        new_data = [data[i][:] for i in duplicate]
        labels.extend(new_labels)
        data.extend(new_data)

    # Shuffle the rows so they are no longer in order, making sure each
    # label list stays attached to its own example.
    sd = list(zip(data, labels))
    shuffle(sd)
    dd = [row for row, _row_labels in sd]
    ll = [row_labels for _row, row_labels in sd]
    return (dd, ll)
def shortestCompletingWord(licensePlate, words):
    """
    Return the shortest word accepted by the helper ``f`` for this plate.

    The plate is lower-cased and its alphabetic characters are counted.
    ``f(len(word), word, counts)`` returns two flags per word.  The
    earliest shortest word with a truthy first flag wins; failing that,
    the earliest shortest word with a truthy second flag.  Only words
    shorter than 20 characters can be selected.

    :type licensePlate: str
    :type words: List[str]
    :rtype: str
    """
    letter_counts = {}
    for ch in licensePlate.lower():
        if not ch.isalpha():
            continue
        letter_counts[ch] = letter_counts.get(ch, 0) + 1

    primary, primary_len = '', 20
    fallback, fallback_len = '', 20
    for word in words:
        size = len(word)
        all_in, flag = f(size, word, letter_counts)
        if all_in and size < primary_len:
            primary, primary_len = word, size
        if flag and size < fallback_len:
            fallback, fallback_len = word, size

    return primary or fallback
def __init__(self, max_len, N_max = 5):
    '''
    @summary: NGramSet constructor
    @param max_len: optimal sequence length
    @param N_max: maximum n-gram size, as used in the CCS 2012 reference
    '''
    Counter.__init__(self)
    self.max_len = max_len
    self.N_max = N_max
    # Bookkeeping fields, all starting at zero.
    self.alphabet_size = 0   # total number of distinct items
    self.all_record_num = 0  # total number of records in the data set
    self.TERM = 0            # sequence terminator symbol
def _most_preferred(self, alternatives):
    """Ask every trait for its preference and return the most frequent one.

    On each object in ``self.traits`` the method whose name is held in
    the free variable ``funcnamei`` is called with ``alternatives``.
    ``None`` answers are discarded.

    Returns:
        ``None`` when no trait expressed a preference, the single answer
        when there is exactly one, otherwise the answer that occurs most
        often.
    """
    votes = []
    for trait in self.traits:
        choice = getattr(trait, funcnamei)(alternatives)
        if choice is not None:
            votes.append(choice)

    if not votes:
        return None
    if len(votes) == 1:
        return votes[0]

    tally = Counter(votes)
    return tally.most_common(1)[0][0]
def items(self, relative=False):
    """ Returns a list of (key, value)-tuples sorted by value, highest-first.
        With relative=True, the sum of values is 1.0.

        An empty or all-zero counter is divided by 1.0 instead of 0, so
        relative values are then returned unchanged as floats.
    """
    # NOTE(review): if Counter is collections.Counter on Python 3, its
    # most_common() calls self.items(), which would re-enter this
    # override when installed on a subclass -- confirm against the class.
    ranked = Counter.most_common(self)
    if relative:
        # float() keeps the division exact on Python 2 as well.
        total = float(sum(v for _k, v in ranked)) or 1.0
        # Each pair is (key, count): divide the count, keep the key.
        ranked = [(k, v / total) for k, v in ranked]
    return ranked
def __len__(self):
    '''Return the number of distinct members; repeats are not counted.

    Delegates to ``Counter.__len__``.

    >>> m = multiset('abb')
    >>> len(m)
    2
    '''
    distinct = Counter.__len__(self)
    return distinct
def most_common_viz(output_dir: str, ints: collections.Counter) -> None:
    """Write the frequency table of ``ints`` as HTML and as TSV.

    The entries of ``ints.most_common()`` become a two-column table
    (``Integer``, ``Frequency``).  It is written to ``index.html``,
    wrapped in a minimal page with a heading, and to ``index.tsv``,
    both inside ``output_dir``.

    Args:
        output_dir: existing directory that receives both files.
        ints: counter whose entries are tabulated.
    """
    ranked = ints.most_common()
    df = pd.DataFrame(ranked, columns=["Integer", "Frequency"])

    html_path = os.path.join(output_dir, 'index.html')
    with open(html_path, 'w') as fh:
        fh.writelines([
            '<html><body>\n',
            '<h3>Most common integers:</h3>\n',
            df.to_html(index=False),
            '</body></html>',
        ])

    tsv_path = os.path.join(output_dir, 'index.tsv')
    with open(tsv_path, 'w') as fh:
        table_text = df.to_csv(sep='\t', index=False)
        fh.write(table_text)
def __add__(self, other):
    """Add ``other`` to this Mdict and return the sum as a new Mdict.

    The sum is computed by ``Counter.__add__``.  If building the result
    raises ``TypeError``, ``other`` is treated as a bare count, wrapped
    as ``Counter({None: other})`` and added instead.  Values under
    shared keys are combined with their own ``+``, so nested mappings
    add according to their own ``__add__``.

    >>> m, m2 = Mdict('a'), Mdict('abb')
    >>> m + m2 == {'a':2, 'b':2}
    True
    >>> m + 1 == {None:1, 'a':1}
    True
    >>> m['a'] = m2  #now fractal
    >>> m == {'a': {'a':1, 'b':2}}
    True
    >>> m + m + 1 == {'a': {'a':2, 'b':4}, None: 1}
    True
    """
    try:
        # Union of the keys; values of shared keys are added.
        merged = Counter.__add__(self, other)
        return Mdict(merged)
    except TypeError:
        # must've been an integer for other
        as_counter = Counter({None: other})
        return Mdict(Counter.__add__(self, as_counter))
def removeLeastCommonData(oData, oLabels, least=5):
    """Drop every row that carries at least one rare label.

    The distinct label frequencies are sorted in ascending order; a
    label is rare when its frequency is below the value at position
    ``least`` of that list, i.e. when it has one of the ``least``
    smallest distinct frequencies.

    Args:
        oData: list of data rows; not modified.
        oLabels: list of label lists, parallel to ``oData``; not modified.
        least: how many of the smallest distinct frequencies count as
            rare.

    Returns:
        Tuple ``(data, labels)`` of new lists without the dropped rows.

    Raises:
        IndexError: if there are not more than ``least`` distinct label
            frequencies.
    """
    counts = Counter(chain(*oLabels))
    distinct_freqs = sorted(set(counts.values()))
    # Honour the `least` parameter instead of a fixed position.
    threshold = distinct_freqs[least]
    rare = set(label for label, freq in counts.items() if freq < threshold)

    doomed = set(idx for idx, row in enumerate(oLabels)
                 if not rare.isdisjoint(row))
    data = [row for idx, row in enumerate(oData) if idx not in doomed]
    labels = [row for idx, row in enumerate(oLabels) if idx not in doomed]
    return (data, labels)
def removeMostCommonData(oData, oLabels, count=20):
    """Repeatedly drop the rows whose most frequent label is most common.

    In each of ``count`` rounds every row is weighted by the highest
    current global frequency among its labels, and the ten
    highest-weighted rows are removed from both lists.

    Args:
        oData: list of data rows; not modified.
        oLabels: list of label lists, parallel to ``oData``; not modified.
        count: number of removal rounds.

    Returns:
        Tuple ``(data, labels)`` of the thinned-out copies.
    """
    data = oData[:]
    labels = oLabels[:]
    for _round in range(count):
        freq = dict(Counter(chain(*labels)))
        ranked = sorted(
            (max([freq[y] for y in row]), idx)
            for idx, row in enumerate(labels)
        )
        # Pop from the back so earlier indices stay valid.
        doomed = sorted((idx for _weight, idx in ranked[-10:]), reverse=True)
        for idx in doomed:
            labels.pop(idx)
        for idx in doomed:
            data.pop(idx)
    return (data, labels)
def __init__(self, iterable):
    """Populate the counter from ``iterable`` and flag when that is done.

    The private ``__finishedinit`` attribute is ``False`` while
    ``Counter.__init__`` runs and ``True`` afterwards.
    """
    # NOTE(review): presumably other methods of the class read this flag
    # to tell construction-time writes from later ones -- confirm.  When
    # this def sits in a class body the attribute name is mangled with
    # the class name.
    self.__finishedinit = False
    Counter.__init__(self, iterable)
    self.__finishedinit = True
def __len__(self):
    """Return the number of keys, as reported by ``Counter.__len__``."""
    size = Counter.__len__(self)
    return size
def testSearcherSaveRestore(self):
    """Check that a search algorithm survives a checkpoint/resume cycle.

    A first ``TrialRunner`` is stepped six times, checkpointed and
    discarded.  A second runner is created with ``resume="LOCAL"`` on
    the same directory; the test asserts that it sees the same six
    trials, that trials keep running while it is stepped, and that no
    ``test_variable`` value was evaluated more than three times.
    """
    ray.init(num_cpus=8, local_mode=True)

    def create_searcher():
        # Build a fresh search algorithm: a counting searcher wrapped in
        # ConcurrencyLimiter, Repeater and SearchGenerator.
        class TestSuggestion(Searcher):
            def __init__(self, index):
                self.index = index
                self.returned_result = []
                super().__init__(metric="episode_reward_mean", mode="max")

            def suggest(self, trial_id):
                # Every suggestion carries the next integer.
                self.index += 1
                return {"test_variable": self.index}

            def on_trial_complete(self, trial_id, result=None, **kwargs):
                self.returned_result.append(result)

            def save(self, checkpoint_path):
                # The whole instance state is pickled.
                with open(checkpoint_path, "wb") as f:
                    pickle.dump(self.__dict__, f)

            def restore(self, checkpoint_path):
                with open(checkpoint_path, "rb") as f:
                    self.__dict__.update(pickle.load(f))

        searcher = TestSuggestion(0)
        searcher = ConcurrencyLimiter(searcher, max_concurrent=2)
        searcher = Repeater(searcher, repeat=3, set_index=False)
        search_alg = SearchGenerator(searcher)
        experiment_spec = {
            "run": "__fake",
            "num_samples": 20,
            "stop": {
                "training_iteration": 2
            }
        }
        experiments = [Experiment.from_json("test", experiment_spec)]
        search_alg.add_configurations(experiments)
        return search_alg

    # Phase 1: run a few steps, checkpoint, then stop the trials.
    searcher = create_searcher()
    runner = TrialRunner(search_alg=searcher,
                         local_checkpoint_dir=self.tmpdir,
                         checkpoint_period=-1)
    for i in range(6):
        runner.step()

    assert len(
        runner.get_trials()) == 6, [t.config for t in runner.get_trials()]
    runner.checkpoint()
    trials = runner.get_trials()
    # Stop every trial that is not in the ERROR state.
    [
        runner.trial_executor.stop_trial(t) for t in trials
        if t.status is not Trial.ERROR
    ]
    del runner
    # stop_all(runner.get_trials())

    # Phase 2: a brand-new searcher and runner resume from the checkpoint.
    searcher = create_searcher()
    runner2 = TrialRunner(search_alg=searcher,
                          local_checkpoint_dir=self.tmpdir,
                          resume="LOCAL")
    assert len(runner2.get_trials()) == 6, [
        t.config for t in runner2.get_trials()
    ]

    def trial_statuses():
        return [t.status for t in runner2.get_trials()]

    def num_running_trials():
        return sum(t.status == Trial.RUNNING for t in runner2.get_trials())

    for i in range(6):
        runner2.step()
    # After six steps all trials share one status, and it is RUNNING.
    assert len(set(trial_statuses())) == 1
    assert Trial.RUNNING in trial_statuses()
    for i in range(20):
        runner2.step()
        assert 1 <= num_running_trials() <= 6
    evaluated = [
        t.evaluated_params["test_variable"] for t in runner2.get_trials()
    ]
    # No suggested value may occur more than three times; presumably
    # this mirrors repeat=3 passed to Repeater above -- confirm.
    count = Counter(evaluated)
    assert all(v <= 3 for v in count.values())
def build_vocab(train_dataset_files, fields, data_type, share_vocab,
                src_vocab_path, src_vocab_size, src_words_min_frequency,
                tgt_vocab_path, tgt_vocab_size, tgt_words_min_frequency):
    """
    Count tokens over all training shards and build each field's vocab.

    Args:
        train_dataset_files: a list of train dataset pt file.
        fields (dict): fields to build vocab for.
        data_type: "text", "img" or "audio"?
        share_vocab(bool): share source and target vocabulary?
        src_vocab_path(string): Path to src vocabulary file.
        src_vocab_size(int): size of the source vocabulary.
        src_words_min_frequency(int): the minimum frequency needed to
                include a source word in the vocabulary.
        tgt_vocab_path(string): Path to tgt vocabulary file.
        tgt_vocab_size(int): size of the target vocabulary.
        tgt_words_min_frequency(int): the minimum frequency needed to
                include a target word in the vocabulary.

    Returns:
        Dict of Fields
    """
    def _read_whitelist(path, loading_msg, missing_msg):
        # First token of every line of `path`; None when no path is given.
        if not len(path) > 0:
            return None
        print(loading_msg % path)
        assert os.path.exists(path), missing_msg % path
        words = set()
        with open(path) as handle:
            for line in handle:
                words.add(line.strip().split()[0])
        return words

    counter = {name: Counter() for name in fields}

    # Load vocabulary
    src_vocab = _read_whitelist(src_vocab_path,
                                'Loading source vocab from %s',
                                'src vocab %s not found!')
    tgt_vocab = _read_whitelist(tgt_vocab_path,
                                'Loading target vocab from %s',
                                'tgt vocab %s not found!')
    whitelists = {'src': src_vocab, 'tgt': tgt_vocab}

    for path in train_dataset_files:
        dataset = torch.load(path)
        print(" * reloading %s." % path)
        for ex in dataset.examples:
            for name in fields:
                val = getattr(ex, name, None)
                allowed = whitelists.get(name)
                if val is not None and not fields[name].sequential:
                    # A non-sequential field holds one value; wrap it.
                    val = [val]
                elif allowed:
                    # Keep only tokens listed in the vocabulary file.
                    val = [item for item in val if item in allowed]
                counter[name].update(val)

    tgt_field = fields["tgt"]
    _build_field_vocab(tgt_field, counter["tgt"],
                       max_size=tgt_vocab_size,
                       min_freq=tgt_words_min_frequency)
    print(" * tgt vocab size: %d." % len(fields["tgt"].vocab))

    # All datasets have same num of n_tgt_features,
    # getting the last one is OK.
    for j in range(dataset.n_tgt_feats):
        key = "tgt_feat_" + str(j)
        _build_field_vocab(fields[key], counter[key])
        print(" * %s vocab size: %d." % (key, len(fields[key].vocab)))

    if data_type == 'text':
        src_field = fields["src"]
        _build_field_vocab(src_field, counter["src"],
                           max_size=src_vocab_size,
                           min_freq=src_words_min_frequency)
        print(" * src vocab size: %d." % len(fields["src"].vocab))

        # All datasets have same num of n_src_features,
        # getting the last one is OK.
        for j in range(dataset.n_src_feats):
            key = "src_feat_" + str(j)
            _build_field_vocab(fields[key], counter[key])
            print(" * %s vocab size: %d." % (key, len(fields[key].vocab)))

        # Merge the input and output vocabularies.
        if share_vocab:
            # `tgt_vocab_size` is ignored when sharing vocabularies
            print(" * merging src and tgt vocab...")
            shared = merge_vocabs(
                [fields["src"].vocab, fields["tgt"].vocab],
                vocab_size=src_vocab_size)
            fields["src"].vocab = shared
            fields["tgt"].vocab = shared

    return fields
def process_questions(args):
    '''Encode question tokens and answers and pickle the result.

    Reads the annotation JSON named by ``args.annotation_file``.  In
    ``train`` mode the answer and question vocabularies are built and
    written to ``args.vocab_json``; in any other mode they are loaded
    from that file.  Every question is lower-cased, stripped of its
    last character, tokenised with ``nltk.word_tokenize``, encoded and
    padded with ``<NULL>`` to the longest question.  In ``train`` mode a
    GloVe matrix aligned with the question vocabulary is added.  The
    final dictionary is pickled to ``args.output_pt``.

    Args:
        args: namespace providing ``annotation_file``, ``mode``,
            ``answer_top``, ``vocab_json``, ``dataset``, ``glove_pt``
            and ``output_pt``.
    '''
    print('Loading data')
    with open(args.annotation_file, 'r') as dataset_file:
        instances = json.load(dataset_file)

    vocab_path = args.vocab_json.format(args.dataset, args.dataset)

    # Either create the vocab or load it from disk
    if args.mode in ['train']:
        print('Building vocab')
        answer_counter = Counter(instance['answer'] for instance in instances)
        frequent_answers = answer_counter.most_common(args.answer_top)
        total_ans = sum(answer_counter.values())
        total_freq_ans = sum(cnt for _token, cnt in frequent_answers)
        print("Number of unique answers:", len(answer_counter))
        print("Total number of answers:", total_ans)
        print("Top %i answers account for %f%%" %
              (len(frequent_answers), total_freq_ans * 100.0 / total_ans))

        # Indices 0 and 1 are reserved for unknown answers.
        answer_token_to_idx = {'<UNK0>': 0, '<UNK1>': 1}
        for token, _cnt in frequent_answers:
            answer_token_to_idx[token] = len(answer_token_to_idx)
        print('Get answer_token_to_idx, num: %d' % len(answer_token_to_idx))

        question_token_to_idx = {'<NULL>': 0, '<UNK>': 1}
        for instance in instances:
            question = instance['question'].lower()[:-1]
            for token in nltk.word_tokenize(question):
                if token not in question_token_to_idx:
                    question_token_to_idx[token] = len(question_token_to_idx)
        print('Get question_token_to_idx')
        print(len(question_token_to_idx))

        vocab = {
            'question_token_to_idx': question_token_to_idx,
            'answer_token_to_idx': answer_token_to_idx,
            'question_answer_token_to_idx': {'<NULL>': 0, '<UNK>': 1}
        }

        print('Write into %s' % vocab_path)
        with open(vocab_path, 'w') as f:
            json.dump(vocab, f, indent=4)
    else:
        print('Loading vocab')
        with open(vocab_path, 'r') as f:
            vocab = json.load(f)

    # Encode all questions
    print('Encoding data')
    questions_encoded = []
    questions_len = []
    question_ids = []
    video_ids_tbw = []
    video_names_tbw = []
    all_answers = []
    for idx, instance in enumerate(instances):
        question = instance['question'].lower()[:-1]
        question_tokens = nltk.word_tokenize(question)
        question_encoded = utils.encode(question_tokens,
                                        vocab['question_token_to_idx'],
                                        allow_unk=True)
        questions_encoded.append(question_encoded)
        questions_len.append(len(question_encoded))
        question_ids.append(idx)
        im_name = instance['video_id']
        video_ids_tbw.append(im_name)
        video_names_tbw.append(im_name)

        # Unknown answers map to <UNK0> in training, <UNK1> in val/test.
        # NOTE(review): for any other mode an unknown answer falls
        # through and `answer` keeps its previous value -- confirm the
        # set of supported modes.
        if instance['answer'] in vocab['answer_token_to_idx']:
            answer = vocab['answer_token_to_idx'][instance['answer']]
        elif args.mode in ['train']:
            answer = 0
        elif args.mode in ['val', 'test']:
            answer = 1

        all_answers.append(answer)

    # Right-pad every question to the length of the longest one.
    max_question_length = max(len(x) for x in questions_encoded)
    for qe in questions_encoded:
        missing = max_question_length - len(qe)
        if missing > 0:
            qe.extend([vocab['question_token_to_idx']['<NULL>']] * missing)

    questions_encoded = np.asarray(questions_encoded, dtype=np.int32)
    questions_len = np.asarray(questions_len, dtype=np.int32)
    print(questions_encoded.shape)

    glove_matrix = None
    if args.mode == 'train':
        token_itow = {i: w for w, i in vocab['question_token_to_idx'].items()}
        print("Load glove from %s" % args.glove_pt)
        # NOTE: pickle.load executes code from the file; only point
        # glove_pt at a trusted file.
        with open(args.glove_pt, 'rb') as glove_file:
            glove = pickle.load(glove_file)
        dim_word = glove['the'].shape[0]
        # Tokens without a GloVe vector get an all-zero row.
        glove_matrix = [
            glove.get(token_itow[i], np.zeros((dim_word,)))
            for i in range(len(token_itow))
        ]
        glove_matrix = np.asarray(glove_matrix, dtype=np.float32)
        print(glove_matrix.shape)

    output_path = args.output_pt.format(args.dataset, args.dataset, args.mode)
    print('Writing', output_path)
    obj = {
        'questions': questions_encoded,
        'questions_len': questions_len,
        'question_id': question_ids,
        'video_ids': np.asarray(video_ids_tbw),
        'video_names': np.array(video_names_tbw),
        'answers': all_answers,
        'glove': glove_matrix,
    }
    with open(output_path, 'wb') as f:
        pickle.dump(obj, f)
def __init__(self, *args, **kwargs):
    """Initialise the instance by forwarding all arguments to ``Counter.__init__``."""
    Counter.__init__(self, *args, **kwargs)
# print Counter(foundEmosAndSmilies)
# print("found smilies top50")
# print Counter.most_common(Counter(foundEmosAndSmilies), 50)
# print("total number of sentences with smilies:")
# print numOfTotalSentences

# Classify: score every message, collect the emoticons/smilies found in
# it, and count the messages that contain at least one.
for message in messages:
    messagecounter += 1
    # for every sentence
    score, words = emoCount.score(message)
    foundEmosAndSmilies = foundEmosAndSmilies + words
    if len(words) != 0:
        numOfTotalSentences += 1
        print(words)

# in the end: summary statistics.  The tally is built once and reused.
# print() with one argument gives the same output on Python 2 and 3.
smiley_counts = Counter(foundEmosAndSmilies)
print("Messages total:")
print(messagecounter)
print("found smilies total:")
print(sum(smiley_counts.values()))
print("found smilies examples ordered:")
print(smiley_counts)
print("found smilies top50")
print(smiley_counts.most_common(50))
print("total number of sentences with smilies:")
print(numOfTotalSentences)
def func(s1: str, s2: str): if s1 == s2: return 0 if s1.startswith(s2): return 1 if s2.startswith(s1): return -1 for i in range(0, min(len(s1), len(s2))): if alphabet.index(s1[i]) < alphabet.index(s2[i]): return -1 elif alphabet.index(s1[i]) > alphabet.index(s2[i]): return 1 ls = [input().strip() for x in range(0, int(input()))] temp1 = [list(Counter(x).keys()) for x in ls] alphabet = [] for x in temp1: alphabet = list(set(alphabet) | set(x)) #print(alphabet) first = [] for x in range(0, len(alphabet)): ls = sorted(ls, key=cmp_to_key(func)) first.append(ls[0]) alphabet.append(alphabet[0]) alphabet.remove(alphabet[0]) #print(first) result = list(set(first)) print(len(result)) for x in ls: if x in result:
def find(self, minhash, threshold, containment=False, ignore_scaled=False):
    """
    Do a Jaccard similarity or containment search.

    The query is downsampled to the database's ``scaled`` value when the
    database is coarser.  Per-index signatures and the reverse
    ``idx_to_ident`` map are built on first use and stored on ``self``.
    For every index sharing at least one hash with the query, the score
    is ``shared / len(query)`` with ``containment=True``, otherwise
    ``shared / (len(query) + len(match) - shared)``.

    Yields:
        ``(score, match_sig, match_sig.md5sum(), self.filename, name)``
        for every match whose score is at least ``threshold``.

    Raises:
        ValueError: if the query is coarser than the database and
            ``ignore_scaled`` is false.
    """
    # make sure we're looking at the same scaled value as database
    if self.scaled > minhash.scaled:
        minhash = minhash.downsample_scaled(self.scaled)
    elif self.scaled < minhash.scaled and not ignore_scaled:
        raise ValueError(
            "lca db scaled is {} vs query {}; must downsample".format(
                self.scaled, minhash.scaled))

    # Lazily rebuild one signature per index from the hash -> indices map.
    if not hasattr(self, 'signatures'):
        debug('creating signatures for LCA DB...')
        rebuilt = defaultdict(minhash.copy_and_clear)
        for hashval, indices in self.hashval_to_idx.items():
            for index in indices:
                rebuilt[index].add_hash(hashval)

        self.signatures = rebuilt

    debug('=> {} signatures!', len(self.signatures))

    # build idx_to_ident from ident_to_idx
    if not hasattr(self, 'idx_to_ident'):
        self.idx_to_ident = {
            index: ident for ident, index in self.ident_to_idx.items()
        }

    query_mins = set(minhash.get_mins())
    query_size = len(query_mins)

    # collect matching hashes: how many query hashes each index shares.
    shared = Counter(
        index
        for hashval in query_mins
        for index in self.hashval_to_idx.get(hashval, [])
    )

    debug('number of matching signatures for hashes: {}', len(shared))

    for index, count in shared.items():
        ident = self.idx_to_ident[index]
        name = self.ident_to_name[ident]
        debug('looking at {} ({})', ident, name)

        match_mh = self.signatures[index]
        match_size = len(match_mh)

        debug('count: {}; query_mins: {}; match size: {}',
              count, query_size, match_size)

        if containment:
            denominator = query_size
        else:
            denominator = query_size + match_size - count
        score = count / denominator

        debug('score: {} (containment? {})', score, containment)

        if score < threshold:
            continue

        # reconstruct signature... ugh.
        from .. import SourmashSignature
        match_sig = SourmashSignature(match_mh, name=name)

        yield score, match_sig, match_sig.md5sum(), self.filename, name
def create_data_analysis_report(data_frame): real_news, fake_news = [news for _, news in data_frame.groupby(data_frame['is_sarcastic'] == 1)] logging.info("---------------------------------Shape of Real and Fake news in training data----------------------------------------------------------------------") logging.info(real_news.shape) logging.info(fake_news.shape) print("\n-------------------------------------------Shape of Real and Fake News in training data------------------------------------------------------------------------") print("\n Real News:",real_news.shape) print("\n Fake News:",fake_news.shape) #<-------------------------------------------------EXPLORING ARTICLE HEADLINE TEXT----------------------------------------------------------------------------------------------> words_per_headline_plot_t = real_news["headline"].apply(lambda x: len(x.split())) stdev_t_head = statistics.stdev(words_per_headline_plot_t) words_per_headline_t = words_per_headline_plot_t.sum() / len(real_news["headline"]) words_per_headline_plot_f = fake_news["headline"].apply(lambda x: len(x.split())) stdev_f_head = statistics.stdev(words_per_headline_plot_f) words_per_headline_f = words_per_headline_plot_f.sum() / len(fake_news["headline"]) logging.info("\n--------------------------------------------------------Exploring Article Headline Text-----------------------------------------------------------") logging.info("\n-------------------------------------------------Average Number and Standard Deviation----------------------------------------") logging.info("\nThe average number of words in a real news Headline is :") logging.info(words_per_headline_t) logging.info("\nThe average number of words in a fake news Headline is :") logging.info(words_per_headline_f) logging.info("\nThe standard deviation in real news article lengths is:") logging.info(stdev_t_head) logging.info("\nThe standard deviation in fake news article lengths is :") logging.info(stdev_f_head) 
print("-------------------------------------------Averge number and Standard Deviation------------------------------------------------") print("The average number of words in a real news headline is ", words_per_headline_t) print("The average number of words in a fake news headline is ", words_per_headline_f) print("The standard deviation in real news articles' headline lengths is ", stdev_t_head) print("The standard deviation in fake news articles' headline lengths is ", stdev_f_head) #Plotting the average and standard deviation diagram fig, ax = mpl.subplots(1, 2, figsize=(10, 6)) words_per_headline_plot = sns.distplot(words_per_headline_plot_t, ax=ax[0], color="darkblue", rug=True).set_title( "Number of Words in Real News Headline") words_per_headline_plot = sns.distplot(words_per_headline_plot_f, ax=ax[1], color="red", rug=True).set_title( "Number of Words in Fake News Headline") mpl.show() words_per_headline_plot.figure.savefig("Data_Analysis_Plots_Directory\words_per_headline_plot.png") #--------------------------------------------ARTICLE HEADLINE SENTIMENT ANALYSIS----------------------------------------------------- headline_polarity_true = pd.DataFrame(columns=["Headline", "sentiment"]) for headline in real_news["headline"]: headline = TextBlob(headline) headline_polarity_true = headline_polarity_true.append( pd.Series([headline, headline.sentiment.polarity], index=headline_polarity_true.columns), ignore_index=True) headline_polarity_fake = pd.DataFrame(columns=["Headline", "sentiment"]) for headline in fake_news["headline"]: headline = TextBlob(headline) headline_polarity_fake = headline_polarity_fake.append( pd.Series([headline, headline.sentiment.polarity], index=headline_polarity_fake.columns), ignore_index=True) headline_polarity_true_sm = statistics.mean(headline_polarity_true["sentiment"]) headline_polarity_fake_sm = statistics.mean(headline_polarity_fake["sentiment"]) logging.info( "\n-------------------------------------------Sentiment Analysis of 
Article Headline Text-----------------------------------------------------------") logging.info("\nThe headline sentiment analysis result for real_news :") logging.info(headline_polarity_true_sm) logging.info("\nThe headline sentiment analysis result for real_news : :") logging.info(headline_polarity_fake_sm) logging.info( "\nPlotting headline_sentiment_plot and saved at Data_Analysis_Plots_Directory\headline_sentiment_analysis_plot:") fig, ax = mpl.subplots(1, 2, figsize=(10, 6)) headline_sentiment_plot = sns.distplot(headline_polarity_true["sentiment"], ax=ax[0], color="darkblue", rug=True).set_title("Real News Headline Sentiments") headline_sentiment_plot = sns.distplot(headline_polarity_fake["sentiment"], ax=ax[1], color="red", rug=True).set_title( "Fake News Headline Sentiments") mpl.show() headline_sentiment_plot.figure.savefig("Data_Analysis_Plots_Directory\headline_sentiment_analysis_plot.png") # #---------------------------------------Computing bigrams in Real News headline------------------------------------------------------------------- lemmatizer = WordNetLemmatizer() words_in_real_news_headline = [] # all tokens in true articles words_in_fake_news_headline = [] words_in_real_news_headline_with_no_stopwords = [] # all tokens in true articles words_in_fake_news_headline_with_no_stopwords = [] # all tokens in fake articles #--------------------------------------------------Processinng ngram-------------------------------------------------------------------------------- process(real_news, words_in_real_news_headline) process(fake_news, words_in_fake_news_headline) bigrams_real_news_headline = zip(words_in_real_news_headline, words_in_real_news_headline[1:]) bigram_counts_real_news_headline = Counter(bigrams_real_news_headline) df = pd.DataFrame(bigram_counts_real_news_headline.most_common(20), columns=["Bigram_Real_News", "Frequency"]) bigrams_real_news_headline = df logging.info(bigrams_real_news_headline) df.sort_values(by='Frequency', ascending=False) 
df.plot.barh(x='Bigram_Real_News', y='Frequency', title="Top Bigrams in Real News Headline").invert_yaxis() mpl.savefig("Data_Analysis_Plots_Directory\ real_news_top_Bigrams_plot.png", bbox_inches = "tight") mpl.show() #---------------------------------------Computing bigrams in Fake News headline------------------------------------------------------------------- bigrams_fake_news_headline = zip(words_in_fake_news_headline, words_in_fake_news_headline[1:]) bigram_counts_fake_news_headline = Counter(bigrams_fake_news_headline) df = pd.DataFrame(bigram_counts_fake_news_headline.most_common(20), columns=["Bigram_Fake_News", "Frequency"]) bigrams_fake_news_headline = df logging.info(bigrams_fake_news_headline) df.sort_values(by='Frequency', ascending=False) df.plot.barh(x='Bigram_Fake_News', y='Frequency', title="Top Bigram in Fake News Headline").invert_yaxis() mpl.savefig("Data_Analysis_Plots_Directory\ fake_news_top_Bigrams_plot.png", bbox_inches="tight") mpl.show() #---------------------------------------Computing trigrams in real news headline------------------------------------------------------------------- trigrams_real_news_headline = zip(words_in_real_news_headline, words_in_real_news_headline[1:], words_in_real_news_headline[2:]) trigram_counts_real_news_headline = Counter(trigrams_real_news_headline) df = pd.DataFrame(trigram_counts_real_news_headline.most_common(20), columns=["Trigram_Real_News", "Frequency"]) trigrams_real_news_headline = df logging.info(trigrams_real_news_headline) df.sort_values(by='Frequency', ascending=False) df.plot.barh(x='Trigram_Real_News', y='Frequency', title="Top Tigrams in Real News Headline").invert_yaxis() mpl.savefig("Data_Analysis_Plots_Directory\ real_news_top_Trigrams_plot.png", bbox_inches="tight") mpl.show() #---------------------------------------Computing trigrams in fake news headline------------------------------------------------------------------- trigrams_fake_news_headline = zip(words_in_fake_news_headline, 
words_in_fake_news_headline[1:], words_in_fake_news_headline[2:]) trigram_counts_fake_news_headline = Counter(trigrams_fake_news_headline) df = pd.DataFrame(trigram_counts_fake_news_headline.most_common(20), columns=["Trigram_Fake_News", "Frequency"]) trigrams_fake_news_headline = df logging.info(trigrams_fake_news_headline) df.sort_values(by='Frequency', ascending=False) df.plot.barh(x='Trigram_Fake_News', y='Frequency', title="Top Trigrams in Fake News Headline").invert_yaxis() mpl.savefig("Data_Analysis_Plots_Directory\ fake_news_top_Trigrams_plot.png", bbox_inches="tight") mpl.show() #---------------------------------------Computing unigram in real news headline------------------------------------------------------------------- wordcounts_r = Counter(words_in_real_news_headline) mostcommon_r = Counter(wordcounts_r).most_common(20) df = pd.DataFrame(mostcommon_r, columns=["Unigram_Real_News", "Frequency"]) logging.info(df) df.sort_values(by='Frequency', ascending=False) df.plot.barh(x='Unigram_Real_News', y='Frequency', title="Top Unigram in Real News Headline").invert_yaxis() mpl.savefig("Data_Analysis_Plots_Directory\ real_news_top_unigrams_plot.png", bbox_inches="tight") mpl.show() r_plot = dict(mostcommon_r) mostcommon_r = df.reset_index(drop=True) mostcommon_r = df['Unigram_Real_News'].tolist() r_wc = WordCloud(max_words=25,relative_scaling=1,background_color ='white', normalize_plurals=False).generate_from_frequencies(r_plot) mpl.imshow(r_wc) mpl.title("Plot of Most Frequent Words in Real News") mpl.savefig("Data_Analysis_Plots_Directory\ real_news_top_unigrams_wc_plot.png", bbox_inches="tight") mpl.show() #---------------------------------------Computing unigram in fake news headline------------------------------------------------------------------- wordcounts_f = Counter(words_in_fake_news_headline) mostcommon_f = Counter(wordcounts_f).most_common(20) df = pd.DataFrame(mostcommon_f, columns=["Unigram_Fake_News", "Frequency"]) logging.info(df) 
df.sort_values(by='Frequency', ascending=False) df.plot.barh(x='Unigram_Fake_News', y='Frequency', title="Top Unigram in Fake News Headline").invert_yaxis() mpl.savefig("Data_Analysis_Plots_Directory\ fake_news_top_unigrams_plot.png", bbox_inches="tight") mpl.show() f_plot = dict(mostcommon_f) mostcommon_f = df.reset_index(drop=True) mostcommon_f = df['Unigram_Fake_News'].tolist() f_wc = WordCloud(max_words=25,relative_scaling=1,background_color ='white', normalize_plurals=False).generate_from_frequencies(f_plot) mpl.imshow(f_wc) mpl.title("Plot of Most Frequent Words in Fake News") mpl.savefig("Data_Analysis_Plots_Directory\ fake_news_top_unigrams_wc_plot.png", bbox_inches="tight") mpl.show() #Of the top 20 words in each class, 9 words are common logging.info("-------------------------Of the top 20 words in each class, Number of words that are common------------------------------------------") logging.info(len(set(mostcommon_r) & set(mostcommon_f))) #--------------------------------------------------Processinng ngram with no stop words-------------------------------------------------------------------------------- process_no_stopwords(real_news, words_in_real_news_headline_with_no_stopwords) process_no_stopwords(fake_news, words_in_fake_news_headline_with_no_stopwords) #---------------------------------------Computing bigrams in Real News headline with no stop word------------------------------------------------------------------- bigrams_real_news_headline = zip(words_in_real_news_headline_with_no_stopwords, words_in_real_news_headline_with_no_stopwords[1:]) bigram_counts_real_news_headline = Counter(bigrams_real_news_headline) df = pd.DataFrame(bigram_counts_real_news_headline.most_common(20), columns=["Bigram_Real_News_with_no_stopwords", "Frequency"]) bigrams_real_news_headline = df logging.info(bigrams_real_news_headline) df.sort_values(by='Frequency', ascending=False) df.plot.barh(x='Bigram_Real_News_with_no_stopwords', y='Frequency', title="Top Bigrams in 
Real News Headline with no stop words").invert_yaxis() mpl.savefig("Data_Analysis_Plots_Directory\ real_news_top_Bigrams_with_no_stop_words_plot.png", bbox_inches = "tight") mpl.show() #---------------------------------------Computing bigrams in Fake News headline with no stop word------------------------------------------------------------------- bigrams_fake_news_headline = zip(words_in_fake_news_headline_with_no_stopwords, words_in_fake_news_headline_with_no_stopwords[1:]) bigram_counts_fake_news_headline = Counter(bigrams_fake_news_headline) df = pd.DataFrame(bigram_counts_fake_news_headline.most_common(20), columns=["Bigram_Fake_News_no_stopwords", "Frequency"]) bigrams_fake_news_headline = df logging.info(bigrams_fake_news_headline) df.sort_values(by='Frequency', ascending=False) df.plot.barh(x='Bigram_Fake_News_no_stopwords', y='Frequency', title="Top Bigram in Fake News Headline with no stop words").invert_yaxis() mpl.savefig("Data_Analysis_Plots_Directory\ fake_news_top_Bigrams_no_stopwords_plot.png", bbox_inches="tight") mpl.show() #---------------------------------------Computing trigrams in real news headline with no stop words------------------------------------------------------------------- trigrams_real_news_headline = zip(words_in_real_news_headline_with_no_stopwords, words_in_real_news_headline_with_no_stopwords[1:],words_in_real_news_headline_with_no_stopwords[2:]) trigram_counts_real_news_headline = Counter(trigrams_real_news_headline) df = pd.DataFrame(trigram_counts_real_news_headline.most_common(20), columns=["Trigram_Real_News_no_stopwords", "Frequency"]) trigrams_real_news_headline = df logging.info(trigrams_real_news_headline) df.sort_values(by='Frequency', ascending=False) df.plot.barh(x='Trigram_Real_News_no_stopwords', y='Frequency', title="Top Tigrams in Real News Headline with no stop words").invert_yaxis() mpl.savefig("Data_Analysis_Plots_Directory\ real_news_top_Trigrams_no_stopwords_plot.png", bbox_inches="tight") mpl.show() 
#---------------------------------------Computing trigrams in fake news headline with no stop words------------------------------------------------------------------- trigrams_fake_news_headline = zip(words_in_fake_news_headline_with_no_stopwords, words_in_fake_news_headline_with_no_stopwords[1:], words_in_fake_news_headline_with_no_stopwords[2:]) trigram_counts_fake_news_headline = Counter(trigrams_fake_news_headline) df = pd.DataFrame(trigram_counts_fake_news_headline.most_common(20), columns=["Trigram_Fake_News_no_stopwords", "Frequency"]) trigrams_fake_news_headline = df logging.info(trigrams_fake_news_headline) df.sort_values(by='Frequency', ascending=False) df.plot.barh(x='Trigram_Fake_News_no_stopwords', y='Frequency', title="Top Trigrams in Fake News Headline with no stop words").invert_yaxis() mpl.savefig("Data_Analysis_Plots_Directory\ fake_news_top_Trigrams_no_stopwords_plot.png", bbox_inches="tight") mpl.show() #---------------------------------------Computing unigram in real news headline with no stop words------------------------------------------------------------------- wordcounts_r = Counter(words_in_real_news_headline_with_no_stopwords) mostcommon_r = Counter(wordcounts_r).most_common(20) df = pd.DataFrame(mostcommon_r, columns=["Unigram_Real_News_no_stopwords", "Frequency"]) logging.info(df) df.sort_values(by='Frequency', ascending=False) df.plot.barh(x='Unigram_Real_News_no_stopwords', y='Frequency', title="Top Unigram in Real News Headline with no stop words").invert_yaxis() mpl.savefig("Data_Analysis_Plots_Directory\ real_news_top_unigrams_no_stopwords_plot.png", bbox_inches="tight") mpl.show() r_plot = dict(mostcommon_r) mostcommon_r = df.reset_index(drop=True) mostcommon_r = df['Unigram_Real_News_no_stopwords'].tolist() r_wc = WordCloud(max_words=25, relative_scaling=1, background_color='white', normalize_plurals=False).generate_from_frequencies(r_plot) mpl.imshow(r_wc) mpl.title("Plot of Most Frequent Words with no stop words in Real News") 
mpl.savefig("Data_Analysis_Plots_Directory\ real_news_top_unigrams_with_o_stopwords_wc_plot.png", bbox_inches="tight") mpl.show() #---------------------------------------Computing unigram in fake news headline with stop words------------------------------------------------------------------- wordcounts_f = Counter(words_in_fake_news_headline_with_no_stopwords) mostcommon_f = Counter(wordcounts_f).most_common(20) df = pd.DataFrame(mostcommon_f, columns=["Unigram_Fake_News_no_stopwords", "Frequency"]) logging.info(df) df.sort_values(by='Frequency', ascending=False) df.plot.barh(x='Unigram_Fake_News_no_stopwords', y='Frequency', title="Top Unigram in Fake News Headline no_stopwords").invert_yaxis() mpl.savefig("Data_Analysis_Plots_Directory\ fake_news_top_unigrams_no_stopwords_plot.png", bbox_inches="tight") mpl.show() f_plot = dict(mostcommon_f) mostcommon_f = df.reset_index(drop=True) mostcommon_f = df['Unigram_Fake_News_no_stopwords'].tolist() f_wc = WordCloud(max_words=25, relative_scaling=1, background_color='white', normalize_plurals=False).generate_from_frequencies(f_plot) mpl.imshow(f_wc) mpl.title("Plot of Most Frequent Words wit no stop words in Fake News") mpl.savefig("Data_Analysis_Plots_Directory\ fake_news_top_unigrams_no_stopwords_wc_plot.png", bbox_inches="tight") mpl.show()
import re
from collections import Counter

# Sample sentences kept around for quick experiments with ``str`` methods.
text = 'Everything is awesome, everything is cool when you are part of the team. Everything is awesome, when you are living the dream'
# print('Everything is awesome' in text)
# print(text.replace('dream', 'scream'))

# Sample shell output; handy for trying out digit-matching patterns.
text2 = ''' $ python module_index.py |grep ^re re | stdlib | 005, 007, 009, 015, 021, 022, 068, 080, 081, 086, 095 '''
# print(re.findall(r'\d+', text2))

text3 = """Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum"""
# print(text3.split())
# print(re.findall(r'[A-Z][a-z0-9]+', text3))

# An upper-case letter followed by one or more lower-case letters or digits.
capitalised_run = re.compile(r'[A-Z][a-z0-9]+')
matches = capitalised_run.findall(text3)

# Tally how often each capitalised run occurs in the sample paragraph.
cnt = Counter(matches)
print(cnt)
def getMostUsedDefiFormats(self, count=None):
    """Rank the definition formats used by the entries of this object.

    Iterates over ``self``, asks every entry for ``getDefiFormat()`` and
    returns ``(format, occurrences)`` pairs ordered from most to least
    frequent.  ``count`` limits the number of pairs; ``None`` returns all.
    """
    tally = Counter()
    for entry in self:
        tally[entry.getDefiFormat()] += 1
    return tally.most_common(count)
def main(args):
    """Print summary statistics for a file of collected tweets.

    The input file (located with ``get_file_name(args.data_path,
    args.data_file)``) holds one JSON document per line.  Each document has a
    ``status`` object (the tweet) and a ``collected at`` timestamp; the same
    tweet may appear on several lines.

    For every tweet id the static fields (creation time, text, source) are
    stored once and asserted to be identical on later sightings, while the
    retweet count is recorded per collection timestamp.

    The report printed to stdout contains: the 25 most used normalised
    words, the tweet frequency per part of the day, the share of tweets
    flagged by ``is_negative`` and the tweet count per source.

    The ``print(...)`` calls take a single pre-formatted string so the
    function produces the same output under Python 2 and Python 3.
    """
    data_file_name = get_file_name(args.data_path, args.data_file)
    tweet_data = dict()
    metrics = {
        'words': list(),
        'unfair': list(),
        'times': list(),
        'source': list()
    }

    # (key used in tweet_data, key used in the raw status object)
    static_fields = (
        ('created at', 'created_at'),
        ('text', 'text'),
        ('source', 'source'),
    )

    with open(data_file_name, 'r') as data_file:
        for line in data_file:
            tweet = json.loads(line)
            status = tweet['status']
            record = tweet_data.setdefault(status['id'], dict())

            # static fields: must not change between collections
            for target_field, source_field in static_fields:
                if target_field in record:
                    assert record[target_field] == status[source_field]
                else:
                    record[target_field] = status[source_field]

            # dynamic fields: one value per collection timestamp
            retweets = record.setdefault('rt', dict())
            retweets[tweet['collected at']] = status['retweet_count']

    ads_source = ('<a href="https://ads.twitter.com" rel="nofollow">'
                  'Twitter Ads</a>')
    for tweet in tweet_data.values():
        norm_words = normalized(tweet['text'])
        metrics['words'].extend(norm_words)
        metrics['unfair'].append(is_negative(norm_words))
        metrics['times'].append(tweet['created at'])
        metrics['source'].append(tweet['source'])
        if tweet['source'] == ads_source:
            print('>>>>>>> %s %s' % (tweet['created at'], tweet['text']))

    n_words = 25
    print('Top ' + str(n_words) +
          ' most used words (excludes some common words like "the").')
    for word, count in Counter(metrics['words']).most_common(n_words):
        print('%s (%s)' % (word, count))
    print('')

    day_parts = {
        'you should really get some sleep': (datetime.time(0, 0, 0),
                                             datetime.time(4, 0, 0)),
        'early morning': (datetime.time(4, 0, 1), datetime.time(8, 0, 0)),
        'morning': (datetime.time(8, 0, 1), datetime.time(12, 0, 0)),
        'afternoon': (datetime.time(12, 0, 1), datetime.time(16, 0, 0)),
        'evening': (datetime.time(16, 0, 1), datetime.time(20, 0, 0)),
        'late night': (datetime.time(20, 0, 1), datetime.time(23, 59, 59))
    }
    tweet_day_parts = list()
    for t in metrics['times']:
        # Parse once per tweet instead of once per candidate day part.
        created = datetime.datetime.strptime(
            t, '%a %b %d %H:%M:%S +0000 %Y').time()
        for day_part, (start, end) in day_parts.items():
            if start <= created <= end:
                tweet_day_parts.append(day_part)
                break

    print('Tweet frequency by day part')
    for day_part, count in Counter(tweet_day_parts).most_common():
        print('%s (%s)' % (day_part, count))
    print('')

    print('Unfair-o-meter: # unfair tweets, # total tweets, % unfair tweets')
    n_unfair = sum(metrics['unfair'])
    n_tweets = len(metrics['unfair'])
    # Scale to percent *before* rounding; rounding the ratio first limited
    # the result to multiples of 10%.  An empty data set reports 0.0%.
    if n_tweets:
        pct_unfair = round(100.0 * n_unfair / n_tweets, 1)
    else:
        pct_unfair = 0.0
    print('%s %s %s%%' % (n_unfair, n_tweets, pct_unfair))
    print('')

    total = 0
    for source, count in Counter(metrics['source']).most_common():
        total += count
        print('%s %s' % (source, count))
    print('total tweets: %s' % total)
def __iter__(self): return Counter.__iter__(self)
@author: sanyuktabaluni """ from matplotlib import pyplot as plt import numpy as np import pandas as pd from collections import Counter plt.style.use('seaborn') df=pd.read_csv("data.csv") df["LanguagesWorkedWith"]=df["LanguagesWorkedWith"].apply(lambda x: x.split(";")) print(Counter(df["LanguagesWorkedWith"].iloc[1])) c=Counter() for row in df["LanguagesWorkedWith"]: c.update(row) print(c) languages=[] popularity=[] for s in c.most_common(15): languages.append(s[0]) popularity.append(s[1])
from collections import Counter

# Offsets of the four orthogonal neighbours (row delta, column delta).
NEIGHBOURS = ((0, -1), (0, 1), (-1, 0), (1, 0))

with open("../input/9.txt") as f:
    # strip() rather than line[:-1]: a final line without a trailing newline
    # must keep its last digit.  Blank lines are ignored.
    data = [list(map(int, line.strip())) for line in f if line.strip()]

n_rows = len(data)
n_cols = len(data[0])

part_1 = 0   # sum of (1 + height) over all low points
basin = 0    # id handed to the next basin discovered
seen = {}    # (row, col) -> basin id, for every non-9 cell visited
stack = []   # cells whose neighbours still have to be explored

for r in range(n_rows):
    for c in range(n_cols):
        # A low point is strictly lower than each neighbour inside the grid.
        if all(not (0 <= r + dr < n_rows and 0 <= c + dc < n_cols)
               or data[r][c] < data[r + dr][c + dc]
               for dr, dc in NEIGHBOURS):
            part_1 += 1 + data[r][c]

        # Flood-fill a new basin from every unvisited cell that is not a 9.
        if (r, c) in seen or data[r][c] == 9:
            continue
        # Record the seed itself so that single-cell basins are counted and
        # the fill does not depend on a neighbour reaching back to the seed.
        seen[(r, c)] = basin
        stack.append((r, c))
        while stack:
            # Separate names so the fill never clobbers the scan position.
            row, col = stack.pop()
            for dr, dc in NEIGHBOURS:
                r_ = row + dr
                c_ = col + dc
                if 0 <= r_ < n_rows and 0 <= c_ < n_cols:
                    if (r_, c_) not in seen and data[r_][c_] != 9:
                        seen[(r_, c_)] = basin
                        stack.append((r_, c_))
        basin += 1

print(part_1)

# Multiply the sizes of the three largest basins.
a, b, c = Counter(seen.values()).most_common(3)
print(a[1] * b[1] * c[1])
print books print len(books) def get_year(book): """book["date"] looks like 'November 2014' so we need to split on the space and then take the second piece""" return int(book["date"].split()[1]) # 2014 is the last complete year of data (when I ran this) year_counts = Counter( get_year(book) for book in books if get_year(book) <= 2017) years = sorted(year_counts) book_counts = [year_counts[year] for year in years] plt.plot(years, book_counts) plt.ylabel("# of data books") plt.title("Data is Big!") plt.show() serialized = """{ "title" : "Data Science Book",
def collect_comments_top(comments, company_id):
    """Rank commenters across all entries of *comments*.

    Every value of the ``comments`` mapping is merged into one tally with
    ``Counter.update``.  The entry for ``company_id`` is dropped (no error
    if it is absent) and the rest is returned as a plain dict whose
    insertion order runs from the most to the least frequent commenter.
    """
    tally = Counter()
    for commenters in comments.values():
        tally.update(commenters)

    # The company's own comments are not part of the ranking.
    tally.pop(company_id, None)

    return {commenter: hits for commenter, hits in tally.most_common()}
def _counter(self): # Non-public method return Counter(self.text)
def simulate(pocket, step, count):
    """Advance *pocket* by *count* cycles and return the final state.

    Each cycle first grows the pocket with ``expand`` and then applies the
    supplied ``step`` function to the grown pocket.
    """
    for _ in range(count):
        pocket = step(expand(pocket))
    return pocket


puzzle_input = """ ##.##### #.##..#. .##...## ###.#... .####### ##....## ###.###. .#.#.#.. """

# One list of single-character cells per whitespace-separated row.
data = [list(line) for line in puzzle_input.strip().split()]

# pocket[z][y][x]
pocket = defaultdict(lambda: defaultdict(dict))
for y, row in enumerate(data):
    for x, value in enumerate(row):
        pocket[0][y][x] = value
print(pocket)

bootup_pocket = simulate(pocket, step_part1, count=6)

# Tally the cell values over every (z, y, x) position of the final pocket.
print(Counter(
    cell
    for plane in bootup_pocket.values()
    for line in plane.values()
    for cell in line.values()
))
def dist_stats(H):
    """
    Computes many basic hypergraph stats and puts them all into a single
    dictionary object

        * nrows = number of nodes (rows in the incidence matrix)
        * ncols = number of edges (columns in the incidence matrix)
        * aspect ratio = nrows/ncols
        * ncells = number of filled cells in incidence matrix
        * density = ncells/(nrows*ncols)
        * node degree list = degree_dist(H), sorted in decreasing order
        * node degree centrality stats = centrality_stats(node degree list)
        * node degree hist = Counter(node degree list)
        * max node degree = max(node degree list)
        * edge size list = H.edge_size_dist(), sorted in decreasing order
        * edge size centrality stats = centrality_stats(edge size list)
        * edge size hist = Counter(edge size list)
        * max edge size = max(edge size list)
        * comp nodes list = s_comp_dist(H, edges=False), sorted in
          decreasing order
        * comp nodes hist = Counter(comp nodes list)
        * comp nodes centrality stats = centrality_stats(comp nodes list)
        * comp edges list = s_comp_dist(H, edges=True), sorted in
          decreasing order
        * comp edges hist = Counter(comp edges list)
        * comp edges centrality stats = centrality_stats(comp edges list)
        * num comps = len(comp nodes list)

    Every ``... centrality stats`` entry is itself a dictionary that pairs
    the names ``min``, ``max``, ``mean``, ``median`` and ``std`` with the
    values returned by ``centrality_stats``, in that order.

    If ``H.state_dict`` already holds a ``"dist_stats"`` entry it is
    returned unchanged.  Freshly computed stats are stored with
    ``H.set_state`` only when ``H.isstatic`` is true.

    Parameters
    ----------
    H : Hypergraph

    Returns
    -------
    dist_stats : dict
        Dictionary which keeps track of each of the above items
        (e.g., basic['nrows'] = the number of nodes in H)
    """
    cached = H.state_dict.get("dist_stats", None)
    if cached is not None:
        return cached

    cstats = ["min", "max", "mean", "median", "std"]
    basic = dict()

    # Number of rows (nodes), columns (edges), and aspect ratio
    basic["nrows"] = len(H.nodes)
    basic["ncols"] = len(H.edges)
    basic["aspect ratio"] = basic["nrows"] / basic["ncols"]

    # Number of cells and density
    M = H.incidence_matrix(index=False)
    basic["ncells"] = M.nnz
    basic["density"] = basic["ncells"] / (basic["nrows"] * basic["ncols"])

    # Node degree distribution
    basic["node degree list"] = sorted(degree_dist(H), reverse=True)
    basic["node degree centrality stats"] = dict(
        zip(cstats, centrality_stats(basic["node degree list"]))
    )
    basic["node degree hist"] = Counter(basic["node degree list"])
    basic["max node degree"] = max(basic["node degree list"])

    # Edge size distribution
    basic["edge size list"] = sorted(H.edge_size_dist(), reverse=True)
    basic["edge size centrality stats"] = dict(
        zip(cstats, centrality_stats(basic["edge size list"]))
    )
    basic["edge size hist"] = Counter(basic["edge size list"])
    # Taken from the list, like "max node degree"; the histogram's keys are
    # the same sizes, so the value is unchanged.
    basic["max edge size"] = max(basic["edge size list"])

    # Component size distribution (nodes)
    basic["comp nodes list"] = sorted(s_comp_dist(H, edges=False), reverse=True)
    basic["comp nodes hist"] = Counter(basic["comp nodes list"])
    basic["comp nodes centrality stats"] = dict(
        zip(cstats, centrality_stats(basic["comp nodes list"]))
    )

    # Component size distribution (edges)
    basic["comp edges list"] = sorted(s_comp_dist(H, edges=True), reverse=True)
    basic["comp edges hist"] = Counter(basic["comp edges list"])
    basic["comp edges centrality stats"] = dict(
        zip(cstats, centrality_stats(basic["comp edges list"]))
    )

    # Number of components
    basic["num comps"] = len(basic["comp nodes list"])

    # # Diameters
    # basic['s edge diam list'] = s_edge_diameter_dist(H)
    # basic['s node diam list'] = s_node_diameter_dist(H)

    # Only cache on hypergraphs that report themselves as static.
    if H.isstatic:
        H.set_state(dist_stats=basic)
    return basic
def getFrequencies(text):
    """Tally the elements of *text* and return the resulting Counter."""
    frequencies = Counter()
    frequencies.update(text)
    return frequencies
def _count_cards(cards: Iterable[Card]) -> Tuple[int, ...]:
    """Return how often each ``Card`` member occurs in *cards*.

    The result has one entry per member of ``Card``, in the iteration order
    of ``Card``; members that do not occur contribute ``0``.
    """
    tally = Counter(cards)
    occurrences = []
    for card in Card:
        occurrences.append(tally[card])
    return tuple(occurrences)
def frequencySort(self, s):
    """Rearrange *s* so that its characters appear by decreasing frequency.

    Each distinct character is emitted as one run of all its occurrences.
    Characters with equal counts keep the order of their first appearance.
    """
    ranked = Counter(s).most_common()
    return "".join(char * times for char, times in ranked)
def __init__(self, duration, _): # last argument is resource (or None), but it is unused. self._hits = [] self._delta = duration if isinstance(duration, timedelta) \ else timedelta(seconds=duration) self._thread_lock = Lock() self._counter = PyCounter()
def mutDifferentReplacementVerbose(individual, pset, personal_map=False):
    """
    Replace repeated terminals of *individual* with different terminals.

    Terminals whose ``repr`` occurs at least twice in *individual* are
    located; for each of them one occurrence is chosen at random and
    overwritten with a terminal/constant of *pset* that is not among the
    repeated ones. This decreases the probability of identical terminals
    in one tree.

    Individuals shorter than 4 nodes are returned as they are (no copy).
    Otherwise the function works on a shallow copy made with ``copy.copy``.

    :param individual: The normal or typed tree to be mutated.
    :param pset: SymbolSet
    :param personal_map: bool. If true, the sampling probabilities are
        requested from ``pset.premap.get_nodes_value`` and used whenever
        that call does not return None.
    :returns: A tuple of one tree.
    """
    if len(individual) < 4:
        return individual,
    individual = copy.copy(individual)
    # Terminals are compared through their repr strings throughout.
    ters = [repr(i) for i in individual.terminals()]
    pset_ters = [repr(i) for i in pset.terminals_and_constants]
    cou = Counter(ters)
    # Terminals that occur two or more times in the individual.
    cou_mutil = {i: j for i, j in cou.items() if j >= 2}
    ks = list(cou_mutil.keys())
    # Candidate replacements: everything in pset that is not repeated.
    nks = list(set(pset_ters) - (set(ks)))
    if len(nks) <= 1:
        return individual,
    # Sets have no defined order; sorting gives the sampler a stable
    # candidate order (important for reproducible random draws).
    nks.sort()
    # Normalise the per-candidate weights into a probability vector.
    p_nks = np.array([pset.prob_ter_con[i] for i in nks])
    p_nks = p_nks.astype(float)
    p_nks /= np.sum(p_nks)
    if cou_mutil:
        indexs = []
        for k, v in cou_mutil.items():
            indi = []
            # Only the odd positions 1, 3, 5, ... are inspected.
            # NOTE(review): presumably terminals live at odd indices in
            # this tree layout - confirm against the tree class.
            for i in np.arange(1, len(individual), 2):
                if repr(individual[i]) == k:
                    indi.append(i)
            if indi:
                # One occurrence per repeated terminal is picked for replacement.
                indexs.append(random.choice(indi))
        if personal_map:
            p_nks_new = pset.premap.get_nodes_value(ind=individual, pset=pset, node=None, site=indexs)
            if p_nks_new is not None:
                # The map replaces both the candidate list and its weights.
                nks = list(pset.prob_ter_con.keys())
                p_nks = p_nks_new
        # NOTE(review): the ``replace=``/``p=`` keywords match
        # numpy.random.choice, so ``random`` is presumably numpy.random
        # here - confirm against the file's imports.
        # Sample without replacement whenever there are enough candidates.
        if len(indexs) <= len(nks):
            term = random.choice(nks, len(indexs), replace=False, p=p_nks)
        else:
            term = random.choice(nks, len(indexs), replace=True, p=p_nks)
        # Map the sampled repr strings back to the pset objects.
        term_ters = []
        for name in term:
            for i in pset.terminals_and_constants:
                if repr(i) == name:
                    term_ters.append(i)
        # Overwrite the chosen positions with the sampled terminals.
        for o, n in zip(indexs, term_ters):
            individual[o] = n
    return individual,
def __delitem__(self, *args, **kwargs): if self.__finishedinit: raise AttributeError("Can't change a frozen counter!") Counter.__delitem__(self, *args, **kwargs)
from collections import Counter

# Adjacent string literals are joined with nothing in between, so every
# piece except the last ends with an explicit space.  Without it the words
# at the line breaks fuse together ("atBallistiq", "anywhere.I", ...) and
# the word counts come out wrong.
text = "In February 2014, I made a recommendation to my co - founders at " \
       "Ballistiq that I wanted to cancel development of ArtStation. " \
       "The project was in development hell. It wasn’t going anywhere. " \
       "I was unhappy with it and just couldn’t see a path for it to be a " \
       "successful product. Two months later we managed to launch it, " \
       "and two years later it is the leading network for professional games."

# Whitespace-separated tokens; punctuation stays attached to its word.
words = text.split()

# Named ``word_counts`` so the imported ``Counter`` class is not overwritten
# by an instance of itself.
word_counts = Counter(words)
top_three = word_counts.most_common(3)
print(top_three)
def __getitem__(self, key): return Counter.__getitem__(self, key)
def __init__(self, *args, **kwargs):
    """Initialise both base classes and exclude ``most_common`` from the keys.

    All positional and keyword arguments go to ``AttrDict.__init__``;
    ``Counter.__init__`` is then called without arguments.
    """
    AttrDict.__init__(self, *args, **kwargs)
    Counter.__init__(self)
    # Add 'most_common' to the excluded key names.
    # NOTE(review): presumably this keeps AttrDict's attribute access from
    # shadowing the Counter.most_common method - confirm against AttrDict.
    self.__exclude_keys__ |= {'most_common'}
from sys import stdin as Si, maxsize as m from math import floor as F from collections import defaultdict as dt, Counter as Co from operator import itemgetter as ig from math import pi if __name__ == "__main__": L = tuple(map(int, Si.readline().split())) H, Max = Co(L), 0 for k, v in H.items(): if v > 1: Max = max(Max, k * min(v, 3)) print(sum(L) - Max) """ A. Bear and Five Cards time limit per test 2 seconds memory limit per test 256 megabytes input standard input output standard output A little bear Limak plays a game. He has five cards. There is one number written on each card. Each number is a positive integer. Limak can discard (throw out) some cards. His goal is to minimize the sum of numbers written on remaining (not discarded) cards. He is allowed to at most once discard two or three cards with the same number. Of course, he won't discard cards if it's impossible to choose two or three cards with the same number.
def __init__(self, Config):
    """Create the preprocessor, the configuration and empty word stores.

    ``Config`` is called without arguments and its result is kept as
    ``self.cfg``.
    """
    preprocessor = Preprocessor()
    configuration = Config()
    self.preprocessor = preprocessor
    self.cfg = configuration
    # Word bookkeeping starts out empty.
    self.word_counter, self.words_dict = Counter(), {}
def __init__(self, word_list = []): self.word_list = word_list Counter.__init__(self, word_list)
return Article(web_link=web_link, blog_link=blog_link, tags=tags, time_added=time_added, year=year, month=month, date=date, hour=hour, title=title) lists = map(parse_article, article_list) article_date_list = [] # 2018-03 article_time_list = [] # 12:24 article_blog_link = [] for l in lists: time_string = l.time_added article_date_list.append(str(time_string)[0:7]) article_time_list.append(int(str(time_string)[11:13])) article_blog_link.append(l.blog_link) article_date_list = Counter(article_date_list) article_time_list = Counter(article_time_list) article_blog_link_list = Counter(article_blog_link) article_blog_link_other_list = [] count = 0 article_blog_link_final_list = dict() for k in article_blog_link_list.keys(): v = article_blog_link_list.get(k) if v <= 10: count += v else: article_blog_link_final_list[k] = v def dict2list(dic: dict):
def most_common(self, n, conts):
    """Return the *n* most frequent entries of *conts*.

    :param n: maximum number of ``(item, count)`` pairs to return;
        ``None`` returns every entry.  This argument used to be ignored,
        so the full ranking was returned whatever value was passed.
    :param conts: a ``Counter`` (or any mapping of item -> count), or a
        plain iterable of items, which is counted first.
    :returns: list of ``(item, count)`` pairs, most frequent first.
    """
    # Mappings already carry the counts; anything else is tallied here.
    tally = conts if hasattr(conts, "items") else Counter(conts)
    return Counter.most_common(tally, n)
def main():
    """
    Helper script to encode raw text with the GPT-2 BPE using multiple processes.

    The encoder.json and vocab.bpe files can be obtained here:
    - https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json
    - https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe

    The input files are read in lockstep (one line from each per step) and
    every tuple of lines is handed to ``MultiprocessingEncoder.encode_lines``
    in a worker pool.  Tuples reported as ``"PASS"`` are written to the
    matching output files; all others are counted per filter reason and the
    counts are reported on stderr at the end.  ``-`` selects stdin/stdout.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--encoder-json",
        help="path to encoder.json",
    )
    parser.add_argument(
        "--vocab-bpe",
        type=str,
        help="path to vocab.bpe",
    )
    parser.add_argument(
        "--inputs",
        nargs="+",
        default=["-"],
        help="input files to filter/encode",
    )
    parser.add_argument(
        "--outputs",
        nargs="+",
        default=["-"],
        help="path to save encoded outputs",
    )
    parser.add_argument(
        "--keep-empty",
        action="store_true",
        help="keep empty lines",
    )
    parser.add_argument("--workers", type=int, default=20)
    args = parser.parse_args()

    assert len(args.inputs) == len(
        args.outputs
    ), "number of input and output paths should match"

    with contextlib.ExitStack() as stack:
        # ``path`` rather than ``input`` so the builtin is not shadowed.
        inputs = [
            stack.enter_context(open(path, "r", encoding="utf-8"))
            if path != "-"
            else sys.stdin
            for path in args.inputs
        ]
        outputs = [
            stack.enter_context(open(path, "w", encoding="utf-8"))
            if path != "-"
            else sys.stdout
            for path in args.outputs
        ]

        encoder = MultiprocessingEncoder(args)
        stats = Counter()
        # The pool is a context manager so the workers are shut down even
        # if encoding fails.  Every result is consumed inside the block,
        # so no work is pending when the pool is terminated on exit.
        with Pool(args.workers, initializer=encoder.initializer) as pool:
            encoded_lines = pool.imap(encoder.encode_lines, zip(*inputs), 100)

            for i, (filt, enc_lines) in enumerate(encoded_lines, start=1):
                if filt == "PASS":
                    for enc_line, output_h in zip(enc_lines, outputs):
                        print(enc_line, file=output_h)
                else:
                    stats["num_filtered_" + filt] += 1
                if i % 10000 == 0:
                    print("processed {} lines".format(i), file=sys.stderr)

        for k, v in stats.most_common():
            print("[{}] filtered {} lines".format(k, v), file=sys.stderr)
def filter_picks(catalog, stations=None, channels=None, networks=None,
                 locations=None, top_n_picks=None, evaluation_mode='all'):
    """
    Filter events in the catalog based on a number of parameters.

    :param catalog: Catalog to filter.
    :type catalog: obspy.core.event.Catalog
    :param stations: List for stations to keep picks from.
    :type stations: list
    :param channels: List of channels to keep picks from.
    :type channels: list
    :param networks: List of networks to keep picks from.
    :type networks: list
    :param locations: List of location codes to use
    :type locations: list
    :param top_n_picks: Filter only the top N most used station-channel pairs.
    :type top_n_picks: int
    :param evaluation_mode:
        To select only manual or automatic picks, or use all (default).
    :type evaluation_mode: str

    :return:
        Filtered Catalog - if events are left with no picks, they are removed
        from the catalog.
    :rtype: obspy.core.event.Catalog

    .. note::
        Will filter first by station, then by channel, then by network, if
        using top_n_picks, this will be done last, after the other filters
        have been applied.

    .. note::
        Doesn't work in place on the catalog, your input catalog will be safe
        unless you overwrite it.

    .. note:: Doesn't expand wildcard characters.

    .. note::
        Station-channel pairs are ranked for ``top_n_picks`` by decreasing
        pick count, with ties ordered alphabetically by station.  If no
        picks survive the other filters an empty catalog is returned.

    .. rubric:: Example

    >>> from obspy.clients.fdsn import Client
    >>> from eqcorrscan.utils.catalog_utils import filter_picks
    >>> from obspy import UTCDateTime
    >>> client = Client('NCEDC')
    >>> t1 = UTCDateTime(2004, 9, 28)
    >>> t2 = t1 + 86400
    >>> catalog = client.get_events(starttime=t1, endtime=t2, minmagnitude=3,
    ...                             minlatitude=35.7, maxlatitude=36.1,
    ...                             minlongitude=-120.6, maxlongitude=-120.2,
    ...                             includearrivals=True)
    >>> print(len(catalog))
    12
    >>> filtered_catalog = filter_picks(catalog, stations=['BMS', 'BAP',
    ...                                                    'PAG', 'PAN',
    ...                                                    'PBI', 'PKY',
    ...                                                    'YEG', 'WOF'])
    >>> print(len(filtered_catalog))
    12
    >>> stations = []
    >>> for event in filtered_catalog:
    ...     for pick in event.picks:
    ...         stations.append(pick.waveform_id.station_code)
    >>> print(sorted(list(set(stations))))
    ['BAP', 'BMS', 'PAG', 'PAN', 'PBI', 'PKY', 'WOF', 'YEG']
    """
    # Don't work in place on the catalog
    filtered_catalog = catalog.copy()

    # (allowed codes, waveform_id attribute); applied in this order.
    code_filters = (
        (stations, 'station_code'),
        (channels, 'channel_code'),
        (networks, 'network_code'),
        (locations, 'location_code'),
    )
    for allowed, attribute in code_filters:
        if not allowed:
            continue
        for event in filtered_catalog:
            if len(event.picks) == 0:
                continue
            event.picks = [pick for pick in event.picks
                           if getattr(pick.waveform_id, attribute) in allowed]

    if evaluation_mode in ('manual', 'automatic'):
        for event in filtered_catalog:
            event.picks = [pick for pick in event.picks
                           if pick.evaluation_mode == evaluation_mode]
    elif evaluation_mode != 'all':
        warnings.warn('Unrecognised evaluation_mode: %s, using all picks' %
                      evaluation_mode)

    if top_n_picks:
        counted = Counter(
            (pick.waveform_id.station_code, pick.waveform_id.channel_code)
            for event in filtered_catalog
            for pick in event.picks).most_common()
        # Rank by decreasing count, ties alphabetically by station.  The
        # sort is stable, so equal (count, station) entries keep the order
        # most_common() gave them.  An empty ``counted`` gives an empty
        # selection instead of the IndexError raised by ``counted[0][1]``.
        ranked = sorted(counted, key=lambda item: (-item[1], item[0][0]))
        selected = {stachan for stachan, _ in ranked[:top_n_picks]}
        for event in filtered_catalog:
            if len(event.picks) == 0:
                continue
            event.picks = [pick for pick in event.picks
                           if (pick.waveform_id.station_code,
                               pick.waveform_id.channel_code) in selected]

    # Remove events without picks
    tmp_catalog = Catalog()
    for event in filtered_catalog:
        if len(event.picks) > 0:
            tmp_catalog.append(event)

    return tmp_catalog
df2[1] df2[np.where(df2=='顶')] df2[[1,2,3]] df2.ix[[1,2,3]] [1,2,3,4].__dir__() '的' in ['的大丰','的'] test = [1,2,3,4,2,2,3,1,4,4,4] ################set key用法 print(max(set(test),key=test.count)) a = [1,4,2,3,2,3,4,2] from collections import Counter Counter(a).most_common(2) import functools product = functools.reduce((lambda x, y: x * y), [1, 2, 3, 4]) import operator operator.xor(60,13) functools.reduce(operator.xor, [1,2,5,2,1,5,9,2]) t1 = [1,2,3] t2 =[10,20,30] dict(zip(t1,t2))[3] la = [1,2] lb = [4,5,6] lc = [7,8,9,10]
def prepare():
    """Prepare the training data set, vocab files and projector metadata.

    Steps, in order:

    1. Copy train/dev/test files for both languages from
       ``preprocessing['source_folder']`` into ``preprocessing['train_folder']``,
       tokenizing every line with ``tokenize`` in a process pool and collecting
       per-file vocab through ``sentence_split`` / ``append_vocab``.
    2. Optionally merge source and target vocab (``preprocessing['joined_vocab']``).
    3. Optionally (``preprocessing['use_bpe']``) learn BPE joins until the vocab
       reaches ``preprocessing['vocab_size']`` tokens, dump the joins to
       ``bpe_joins.*.json``, apply them to every prepared file and delete the
       non-BPE intermediate files.
    4. Write ``vocab`` / ``vocab_unused`` files and ``projector_config.pbtxt``.

    Relies on module-level ``hparams`` and ``preprocessing`` mappings and on the
    helpers ``read_lines``, ``tokenize``, ``sentence_split``, ``append_vocab``,
    ``write_lines``, ``apply_bpe`` and ``apply_bpe_init`` defined elsewhere.
    The globals ``vocab`` and ``written_lines`` are rebound here; presumably the
    worker threads (``append_vocab`` / ``write_lines``) update them -- TODO confirm.

    Returns nothing; all results are written to disk.
    """
    global vocab, written_lines

    # Files to be prepared: output file name (relative to the train folder)
    # -> share of 'samples' to copy ('amount') and hard line limit ('up_to')
    files = {
        '{}.{}'.format(hparams['train_prefix'].replace('.bpe', ''), hparams['src']).replace(
            preprocessing['train_folder'], '').lstrip('\\/'): {'amount': 1, 'up_to': -1},  # copy all of data (up to "samples")
        '{}.{}'.format(hparams['dev_prefix'].replace('.bpe', ''), hparams['src']).replace(preprocessing['train_folder'], '').lstrip('\\/'): {
            'amount': .1, 'up_to': preprocessing['test_size']},  # copy 1/10th but up to 'test_size'
        '{}.{}'.format(hparams['test_prefix'].replace('.bpe', ''), hparams['src']).replace(
            preprocessing['train_folder'], '').lstrip('\\/'): {'amount': .1, 'up_to': preprocessing['test_size']},
        '{}.{}'.format(hparams['train_prefix'].replace('.bpe', ''), hparams['tgt']).replace(
            preprocessing['train_folder'], '').lstrip('\\/'): {'amount': 1, 'up_to': -1},
        '{}.{}'.format(hparams['dev_prefix'].replace('.bpe', ''), hparams['tgt']).replace(preprocessing['train_folder'], '').lstrip('\\/'): {
            'amount': .1, 'up_to': preprocessing['test_size']},
        '{}.{}'.format(hparams['test_prefix'].replace('.bpe', ''), hparams['tgt']).replace(
            preprocessing['train_folder'], '').lstrip('\\/'): {'amount': .1, 'up_to': preprocessing['test_size']},
    }

    # pprint.pformat(files, indent=4)

    print(colorama.Fore.GREEN + "\nPreparing training set from raw set" + colorama.Fore.RESET)

    # Ensure that train folder exists (an already existing folder is not an error)
    try:
        os.makedirs(preprocessing['train_folder'])
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    # Ensure that model/log folder exists
    train_log_dir = os.path.join(hparams['out_dir'], 'train_log')
    try:
        os.makedirs(train_log_dir)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    # Maps language code -> vocab counter of the corresponding train file
    data_vocab = Counter()

    # Iterate thru files and prepare them
    for file_name, amounts in files.items():

        # Fresh (global) vocab counter for the current file
        vocab = Counter()

        print("File: {}{}{}".format(colorama.Fore.GREEN, file_name, colorama.Fore.RESET))

        # Output file handler
        # NOTE(review): no out_file.close() is visible in this function; unless
        # write_lines closes it, the handle stays open until garbage collection.
        out_file = open('{}/{}'.format(preprocessing['train_folder'], file_name), 'w', encoding='utf-8',
                        buffering=131072)

        # Maximum number of lines: the smaller of the two limits, where a
        # non-positive limit means "unlimited" (10 ** 20)
        read = 0
        amount = int(min(amounts['amount'] * preprocessing['samples'] if preprocessing['samples'] > 0 else 10 ** 20,
                         amounts['up_to'] if amounts['up_to'] > 0 else 10 ** 20))

        # Prepare thread variables
        write_thread = None
        vocab_thread = None
        written_lines = 0

        # We are going to use multiprocessing for tokenization, as it's cpu intensive
        with Pool(processes=preprocessing['cpu_count']) as pool:

            # Count number of lines in file (requires one extra pass over the input;
            # NOTE(review): the handle opened for counting is never closed explicitly)
            progress = tqdm(ascii=True, unit=' lines', total=min(amount, sum(1 for _ in open(
                '{}/{}'.format(preprocessing['source_folder'], file_name), 'r', encoding='utf-8', buffering=131072))))

            # Open input file
            with open('{}/{}'.format(preprocessing['source_folder'], file_name), 'r', encoding='utf-8',
                      buffering=131072) as in_file:

                last_batch = False

                # Iterate every 10k lines
                for rows in read_lines(in_file, 10000, ''):

                    # If number of lines is greater than limit - trim the batch and stop after it
                    read += len(rows)
                    if read >= amount:
                        rows = rows[:amount - read + len(rows)]
                        last_batch = True

                    # Process using multiprocessing
                    rows = pool.map(tokenize, rows, 100)

                    # Process vocab using multiprocessing
                    vocab_part = pool.map(sentence_split, rows, 100)

                    # Join running threads from previous loop, so that at most one
                    # writer and one vocab thread are alive at any time
                    if write_thread is not None:
                        write_thread.join()
                        vocab_thread.join()
                        progress.update(written_lines)

                    # Thread for vocab update
                    vocab_thread = Thread(target=append_vocab, args=(vocab_part,))
                    vocab_thread.start()

                    # And thread for saving tokenized data to output file
                    write_thread = Thread(target=write_lines, args=(out_file, rows, written_lines == 0))
                    write_thread.start()

                    # Last batch - break / exit loop
                    if last_batch:
                        break

                # Join running threads and update progress bar
                write_thread.join()
                vocab_thread.join()
                progress.update(written_lines)
                progress.close()

        # If it's train file, save vocab
        if file_name == '{}.{}'.format(hparams['train_prefix'].replace('.bpe', ''), hparams['src']).replace(
                preprocessing['train_folder'], '').lstrip('\\/'):
            data_vocab[hparams['src']] = vocab
        elif file_name == '{}.{}'.format(hparams['train_prefix'].replace('.bpe', ''), hparams['tgt']).replace(
                preprocessing['train_folder'], '').lstrip('\\/'):
            data_vocab[hparams['tgt']] = vocab

    # If joined vocab - add counters
    if preprocessing['joined_vocab']:
        data_vocab[hparams['src']] += data_vocab[hparams['tgt']]
        del data_vocab[hparams['tgt']]

    # BPE/WPM-like tokenization
    # inspired by and based on https://github.com/rsennrich/subword-nmt
    if preprocessing['use_bpe']:

        print(colorama.Fore.GREEN + "\nLearning BPE" + colorama.Fore.RESET)

        # List of subword joins to be applied to training data
        joins = {}

        # Final train vocab for NMT
        train_vocab = {}

        # Learn BPE for both vocabs (or common vocab)
        for source, raw_vocab in data_vocab.items():

            # Pair stats: (left, right) -> summed frequency
            stats = Counter()

            # Pair indexes: (left, right) -> {vocab index: number of occurrences}
            indices = defaultdict(lambda: defaultdict(int))

            # Build 'new' vocab used for BPE learning (train_vocab will be a final vocab for NMT)
            vocab = []
            train_vocab[source] = Counter()

            # Build vocab for BPE learning purpose
            print("Building temporary vocab ({})".format(hparams['src'] if preprocessing['joined_vocab'] else source))
            for i, (entity, freq) in tqdm(enumerate(raw_vocab.most_common()), ascii=True, unit=' tokens'):

                # Split vocab token
                entity = tuple(entity.split())

                # Make pairs ("ABCD" -> (A, B), (B, C), (C, D)), stats, indexes and train vocab
                prev_char = entity[0]
                train_vocab[source][prev_char] += freq
                for char in entity[1:]:
                    stats[prev_char, char] += freq
                    indices[prev_char, char][i] += 1
                    train_vocab[source][char] += freq
                    prev_char = char
                vocab.append((entity, freq))

            print("Learning BPE for vocab of {} tokens".format(preprocessing['vocab_size']))

            # List of joins per vocab
            joins[source] = []

            # Partial stats speeds up learning process - optimization for 'max' above.
            # Seeded with placeholder entries; update_partial_stats = True forces a
            # rebuild from the full stats on the first loop iteration.
            partial_stats = Counter(['', -1])
            partial_stats_min = 0
            update_partial_stats = True

            # Current number of vocab tokens
            train_vocab_len = prev_train_vocab_len = len(train_vocab[source])

            # Progress bar
            progress = tqdm(ascii=True, unit=' tokens', total=preprocessing['vocab_size'], maxinterval=0.1,
                            miniters=10)
            progress.monitor_interval = 1
            progress.update(prev_train_vocab_len)

            # Learn until vocab will contain desired number of tokens
            while train_vocab_len < preprocessing['vocab_size']:

                clean_train_vocab = False

                # Get most frequent pair
                most_frequent, freq = partial_stats.most_common(1)[0]

                # Update partial stats or frequency of most frequent pair is less than saved minimum for partial stats
                if update_partial_stats or freq <= partial_stats_min:
                    # New threshold: count of the 500th most common partial entry
                    # (or of the last entry when fewer than 500 exist)
                    partial_stats_min = partial_stats.most_common(500)[-1][1]
                    partial_stats = Counter()
                    for k, v in stats.most_common():
                        if v < partial_stats_min:
                            break
                        partial_stats[k] = v
                    update_partial_stats = False

                    # Get most frequent pair (again, proper one this time)
                    most_frequent, _ = partial_stats.most_common(1)[0]

                # If frequency is lower than 2 - exit
                if stats[most_frequent] < 2:
                    print(
                        'No pair has frequency greater than 1. Stopping earlier, your vocab file will include less tokens.\n')
                    break

                # Replace pair "A B" with new entity "AB"

                # Changes made
                changes = []

                # Replace regex: the pair must be delimited by whitespace or string boundaries
                pattern = re.compile(r'(?<!\S)' + re.escape(' '.join(most_frequent)) + r'(?!\S)')

                # Loop through indices
                for j, freq in indices[most_frequent].items():

                    # Do not touch not existent pairs
                    if freq < 1:
                        continue

                    # Get entity and frequency
                    entity, freq = vocab[j]

                    # Replace "A B" with "AB" in entity
                    # NOTE(review): the replacement is passed to re.sub unescaped, so a
                    # backslash inside a token would be read as an escape -- confirm
                    # tokens can never contain one.
                    new_entity = pattern.sub(''.join(most_frequent), ' '.join(entity))
                    new_entity = tuple(new_entity.split())

                    # Update entity
                    vocab[j] = (new_entity, freq)

                    changes.append((j, new_entity, entity, freq))

                # Update indices and pair stats
                # Merged pair doesn't exist anymore
                stats[most_frequent] = 0
                partial_stats[most_frequent] = 0
                indices[most_frequent] = defaultdict(int)

                # Get entities and a new pair
                first, second = most_frequent
                new_pair = first + second

                # Iterate through all changes
                for j, entity, old_entity, freq in changes:

                    # Find all occurrences of first pair entity
                    # (prev holds the index of the last merged occurrence)
                    prev = -2
                    for i in iter([i for i, entity in enumerate(old_entity) if entity == first]):

                        # Do not touch second "B B" if "B B B"
                        if i == prev + 1:
                            continue

                        # Check if second pair entity follows first one
                        if i < len(old_entity) - 1 and old_entity[i + 1] == second:

                            # Reduce frequency of "A B" in "A B C D" where "B C" is a merged pair
                            if i:
                                prev = old_entity[i - 1:i + 1]
                                stats[prev] -= freq
                                partial_stats[prev] = stats[prev]
                                indices[prev][j] -= 1

                            # Reduce frequency of "C D" in "A B C D" where "B C" is a merged pair
                            if i < len(old_entity) - 2:

                                # But do not touch "C B" if "A B C B C" as values will be adjusted with next occurrence of "B C" pair
                                if old_entity[i + 2] != first or i >= len(old_entity) - 3 or old_entity[
                                        i + 3] != second:
                                    next = old_entity[i + 1:i + 3]
                                    stats[next] -= freq
                                    partial_stats[next] = stats[next]
                                    indices[next][j] -= 1

                            # prev was reused above as a pair tuple; restore it to the
                            # index of this merged occurrence for the adjacency check
                            prev = i

                            # A token whose count would drop to zero (or below) must be
                            # purged from the train vocab afterwards
                            if train_vocab[source][first] <= freq or train_vocab[source][second] <= freq:
                                clean_train_vocab = True
                            train_vocab[source][first] -= freq
                            train_vocab[source][second] -= freq

                    # Find all occurrences of the new (merged) pair entity
                    for i in [i for i, entity in enumerate(entity) if entity == new_pair]:

                        # Increase frequency of (new pair) "A BC" in "A BC D"
                        if i:
                            prev = entity[i - 1:i + 1]
                            stats[prev] += freq
                            if stats[prev] > partial_stats_min:
                                update_partial_stats = True
                            indices[prev][j] += 1

                        # Increase frequency of (new pair) "BC D" in "A BC D", but do not touch if "A BC BC" as stats for "BC BC" will be adjusted with next occurrence of "BC" pair
                        if i < len(entity) - 1 and entity[i + 1] != new_pair:
                            next = entity[i:i + 2]
                            stats[next] += freq
                            if stats[next] > partial_stats_min:
                                update_partial_stats = True
                            indices[next][j] += 1

                        # Set frequency of a new pair
                        train_vocab[source][new_pair] += freq

                # Current pair is merged - is not a pair anymore, so has frequency of 0
                stats[most_frequent] = 0
                partial_stats[most_frequent] = 0

                # Remove (from training vocab) tokens with frequency of 0
                # (unary plus on a Counter drops zero and negative counts)
                if clean_train_vocab:
                    train_vocab[source] = +train_vocab[source]

                # Calculate current number of train vocab entities
                prev_train_vocab_len = train_vocab_len
                train_vocab_len = len(train_vocab[source])
                train_vocab_len_diff = train_vocab_len - prev_train_vocab_len

                # Update progress bar
                if train_vocab_len_diff >= 0:
                    progress.update(train_vocab_len_diff)

                # For a negative number set new value directly - tqdm doesn't support negative updates
                else:
                    progress.n += train_vocab_len_diff
                    progress.refresh()

                # Add new join pair
                joins[source].append(most_frequent)

            # Save list of joins for train vocab: pair -> position in learning order
            joins[source] = dict(reversed([(v, i) for i, v in enumerate(joins[source])]))

            # Done
            progress.close()

        # Save list of joins to a file (joined vocab) and replace main vocabs
        if preprocessing['joined_vocab']:
            with open('{}/{}'.format(preprocessing['train_folder'], 'bpe_joins.common.json'), 'w', encoding='utf-8',
                      buffering=131072) as bpe_file:
                # Pair tuples are not valid JSON keys, so they are serialized to strings
                json.dump({json.dumps(k): v for k, v in joins[hparams['src']].items()}, bpe_file)
            data_vocab[hparams['src']] = train_vocab[hparams['src']]

        # Save list of joins to files (separated vocab)
        else:
            with open('{}/{}'.format(preprocessing['train_folder'], 'bpe_joins.{}.json'.format(hparams['src'])), 'w',
                      encoding='utf-8', buffering=131072) as bpe_file:
                json.dump({json.dumps(k): v for k, v in joins[hparams['src']].items()}, bpe_file)
            with open('{}/{}'.format(preprocessing['train_folder'], 'bpe_joins.{}.json'.format(hparams['tgt'])), 'w',
                      encoding='utf-8', buffering=131072) as bpe_file:
                json.dump({json.dumps(k): v for k, v in joins[hparams['tgt']].items()}, bpe_file)
            data_vocab[hparams['src']] = train_vocab[hparams['src']]
            data_vocab[hparams['tgt']] = train_vocab[hparams['tgt']]

        print(colorama.Fore.GREEN + "\nApplying BPE" + colorama.Fore.RESET)

        # BPE files to be prepared
        bpe_files = [
            '{}.{}'.format(hparams['train_prefix'], hparams['src']).replace(preprocessing['train_folder'], '').lstrip(
                '\\/'),
            '{}.{}'.format(hparams['dev_prefix'], hparams['src']).replace(preprocessing['train_folder'], '').lstrip(
                '\\/'),
            '{}.{}'.format(hparams['test_prefix'], hparams['src']).replace(preprocessing['train_folder'], '').lstrip(
                '\\/'),
            '{}.{}'.format(hparams['train_prefix'], hparams['tgt']).replace(preprocessing['train_folder'], '').lstrip(
                '\\/'),
            '{}.{}'.format(hparams['dev_prefix'], hparams['tgt']).replace(preprocessing['train_folder'], '').lstrip(
                '\\/'),
            '{}.{}'.format(hparams['test_prefix'], hparams['tgt']).replace(preprocessing['train_folder'], '').lstrip(
                '\\/'),
        ]

        # Iterate thru files and apply BPE
        for i, file_name in enumerate(bpe_files):

            # Current train vocab (the file extension is the language code)
            source = hparams['src'] if preprocessing['joined_vocab'] else file_name.split('.')[-1]

            print("File: {}{}{}".format(colorama.Fore.GREEN, file_name, colorama.Fore.RESET))

            # Output file handler
            # NOTE(review): as above, no explicit close() is visible for this handle.
            out_file = open('{}/{}'.format(preprocessing['train_folder'], file_name), 'w', encoding='utf-8',
                            buffering=131072)

            # Prepare thread variables
            write_thread = None
            written_lines = 0

            # We are going to use multiprocessing for joins, as it's cpu intensive
            with Pool(processes=preprocessing['cpu_count'], initializer=apply_bpe_init,
                      initargs=(joins[source],)) as pool:

                # Progress bar (total = number of lines in the non-BPE input file)
                progress = tqdm(ascii=True, unit=' lines', total=sum(1 for _ in open(
                    '{}/{}'.format(preprocessing['train_folder'], file_name.replace('.bpe.', '.')), 'r',
                    encoding='utf-8', buffering=131072)))

                # Open input file
                with open('{}/{}'.format(preprocessing['train_folder'], file_name.replace('.bpe.', '.')), 'r',
                          encoding='utf-8', buffering=131072) as in_file:

                    # Iterate every 10k lines
                    for rows in read_lines(in_file, 10000, ''):

                        # Process using multiprocessing
                        rows = pool.map(apply_bpe, rows, 100)

                        # Join running threads from previous loop
                        if write_thread is not None:
                            write_thread.join()
                            # vocab_thread.join()
                            # print('+')
                            progress.update(written_lines)
                            # vocab_thread2.join()

                        # Thread for saving tokenized data to output BPE file
                        write_thread = Thread(target=write_lines, args=(out_file, rows, written_lines == 0))
                        write_thread.start()

                    # Join running threads and update progress bar
                    write_thread.join()
                    progress.update(written_lines)
                    progress.close()

            # Remove unnecessary train file (BPE one will be used by NMT)
            os.remove('{}/{}'.format(preprocessing['train_folder'], file_name.replace('.bpe.', '.')))

    print(colorama.Fore.GREEN + "\nPostprocessing and saving vocabs" + colorama.Fore.RESET)

    # Vocab files to be prepared
    # Joined vocab
    if preprocessing['joined_vocab']:
        vocab_files = [
            '{}.{}'.format(hparams['train_prefix'].replace('train', 'vocab'), hparams['src']).replace(
                preprocessing['train_folder'], '').lstrip('\\/'),
        ]

    # Separated vocabs
    else:
        vocab_files = [
            '{}.{}'.format(hparams['train_prefix'].replace('train', 'vocab'), hparams['src']).replace(
                preprocessing['train_folder'], '').lstrip('\\/'),
            '{}.{}'.format(hparams['train_prefix'].replace('train', 'vocab'), hparams['tgt']).replace(
                preprocessing['train_folder'], '').lstrip('\\/'),
        ]

    for vocab_file_name in vocab_files:

        print("File: {}{}{}".format(colorama.Fore.GREEN, vocab_file_name, colorama.Fore.RESET))

        # Get most common entities (the counter is replaced by a frequency-ordered list)
        source = vocab_file_name.split('.')[-1]
        data_vocab[source] = [entity for entity, _ in data_vocab[source].most_common()]

        # Write entities to a file: special tokens first, then the top 'vocab_size' entities
        with open('{}/{}'.format(preprocessing['train_folder'], vocab_file_name), 'w', encoding='utf-8',
                  buffering=131072) as vocab_file:
            vocab_file.write("<unk>\n<s>\n</s>\n" + "\n".join(data_vocab[source][:preprocessing['vocab_size']]))
        # Everything beyond 'vocab_size' goes to the matching 'vocab_unused' file
        with open('{}/{}'.format(preprocessing['train_folder'], vocab_file_name.replace('vocab', 'vocab_unused')),
                  'w', encoding='utf-8', buffering=131072) as vocab_file:
            vocab_file.write("\n".join(data_vocab[source][preprocessing['vocab_size']:]))

    print(colorama.Fore.GREEN + "\nWriting pbtxt file" + colorama.Fore.RESET)

    # Write pbtxt file for metadata for embeddings: the decoder entry gets the
    # first vocab file, the encoder entry the first (joined vocab) or second one
    with open('{}/{}'.format(os.path.join(train_log_dir), 'projector_config.pbtxt'), 'w', encoding='utf-8',
              buffering=131072) as pbtxt_file:
        pbtxt_file.write(('''embeddings {{\n tensor_name: 'embeddings/decoder/embedding_decoder'\n ''' + '''metadata_path: '{}'\n}}\nembeddings {{\n ''' + '''tensor_name: 'embeddings/encoder/embedding_encoder'\n metadata_path: '{}'\n}}''').format(
            '{}/{}'.format(preprocessing['train_folder'], vocab_files[0].replace('train', 'vocab')),
            '{}/{}'.format(preprocessing['train_folder'],
                           vocab_files[0 if preprocessing['joined_vocab'] else 1].replace('train', 'vocab'))
        ))

    print(colorama.Fore.GREEN + "\nAll done" + colorama.Fore.RESET)
from collections import ChainMap
import argparse
import os

# Demo: layered option lookup with ChainMap, then manual counting with Counter.

# Default parameters (lowest priority):
defaults = dict(color='red', user='******')

# Command-line parameters (highest priority):
parser = argparse.ArgumentParser()
for short_flag, long_flag in (('-u', '--user'), ('-c', '--color')):
    parser.add_argument(short_flag, long_flag)
namespace = parser.parse_args()

# Keep only the options that were actually supplied (truthy values)
command_line_args = {}
for option, value in vars(namespace).items():
    if value:
        command_line_args[option] = value

# Combine into a ChainMap: command line first, then environment, then defaults
combined = ChainMap(command_line_args, os.environ, defaults)

# Print the resolved parameters:
for option in ('color', 'user'):
    print((option + '=%s') % combined[option])

# Counter
from collections import Counter

# Count every character of the word one at a time
c = Counter()
for ch in 'programming':
    c[ch] += 1
print(c)
from collections import Counter

if __name__ == "__main__":
    # Read one line and print its three highest-ranked characters with counts.
    text = input().strip()
    tally = Counter(text)

    # Rank by count (descending); ties are broken by code point (descending)
    ranked = sorted(tally.items(), key=lambda pair: (-int(pair[1]), -ord(pair[0])))

    for symbol, count in ranked[:3]:
        print(symbol, count)
from collections import Counter
import pandas as pd
import time

# Build a per-article word-frequency index:
#     index[article_id] -> {word: number of occurrences}
# Stop words and single-character tokens are ignored.
# `stopwords` and `word_tokenize` are not imported in this snippet; presumably
# they come from nltk and are imported elsewhere in the file (TODO confirm).

articles = pd.read_csv("articles1.csv", usecols=[1, 2, 9])
ids = articles['id'].values
titles = articles['title'].values
content = articles['content'].values
stopWords = set(stopwords.words('english'))

index = {}
sub_index = {}
start = time.time()
for r, article_id in enumerate(ids):
    # Separate title and body with a space so the last title word and the first
    # body word are not fused into a single token.
    data = titles[r].lower() + " " + content[r].lower()
    words = word_tokenize(data)
    wordsFiltered = [w for w in words if w not in stopWords and len(w) != 1]

    # The original inner loop reused `r` for the word count, so the article was
    # stored under ids[<last count>] instead of its own id. The counts are now
    # copied without touching the row index.
    sub_index = dict(Counter(wordsFiltered))
    index[article_id] = sub_index
    sub_index = {}
    data = ""
end = time.time()

print(index[17284])
print(end - start)
def entropy(self,word):
    """Return the Shannon entropy (natural log) of the cluster labels of texts containing *word*.

    Every entry of ``self.Snap['TEXT']`` for which ``word in text`` holds
    contributes its label ``self.labels[i]``; the entropy is computed over the
    resulting label distribution. The distribution is also printed.

    Parameters:
        word: item searched for with ``in`` inside each text.

    Returns:
        The entropy as a float, or 0 when no text contains *word*.

    Raises:
        ValueError: if ``self.labels`` is None (clusters not built yet).
    """
    # `is None` instead of `== None`: an array-like `labels` would make the
    # equality test element-wise and ambiguous in a boolean context.
    if self.labels is None:
        raise ValueError('Please execute self.build_clusters() before calculating entropy(word)')
    word_labels = [self.labels[i] for i, text in enumerate(self.Snap['TEXT']) if word in text]
    WordLabelDistribution = Counter(word_labels)
    # Parenthesised form prints the same on Python 2 and Python 3
    print(WordLabelDistribution)
    # float() forces true division on Python 2, where int / int would truncate
    # every probability to 0 and make log() fail
    NoOfText = float(sum(WordLabelDistribution.values()))
    # Iterate the counts of this distribution (the original called
    # Counter.values() on the class itself, which raises TypeError)
    WordEntropy = sum(-(nlabelText / NoOfText) * log(nlabelText / NoOfText)
                      for nlabelText in WordLabelDistribution.values())
    return WordEntropy
def viterbi_segment(text): probs, lasts = [1.0], [0] for i in range(1, len(text) + 1): prob_k, k = max((probs[j] * word_prob(text[j:i]), j) for j in range(max(0, i - max_word_length), i)) probs.append(prob_k) lasts.append(k) words = [] i = len(text) while 0 < i: words.append(text[lasts[i]:i]) i = lasts[i] words.reverse() return words, probs[-1] dictionary = Counter(words(open(r'C:\Users\Comete\big.txt').read())) max_word_length = max(map(len, dictionary)) total = float(sum(dictionary.values())) # 1. data processing minutes.pop(minutes.columns[0]) from nltk.corpus import stopwords import spacy nlp = spacy.load('en_core_web_sm',disable=['parser', 'ner']) stop_words = stopwords.words('english') import datetime Month = [datetime.date(2008, i, 1).strftime('%B').lower() for i in range(1,13)] stop_words.extend(['year','month','day','mr','meeting','committee','ms','federal','page']