def get_ref_pixels(ref_wvl, wvlsol0, x=None):
    """
    Given a list of wavelength tuples, return the expected pixel positions
    from the initial wavelength solution wvlsol0.
    """
    if x is None:
        x = np.arange(len(wvlsol0))

    um2pixel = interp1d(wvlsol0, x, bounds_error=False)

    ref_pixel = [um2pixel(w) for w in ref_wvl]

    # there could be cases when the ref lines fall out of bounds,
    # resulting in NaNs.
    nan_filter = [np.all(np.isfinite(p)) for p in ref_pixel]
    valid_list = [[np.all(np.isfinite(p))] * len(p) for p in ref_pixel]

    group_flags = get_group_flags(ref_wvl)
    df = pd.DataFrame(dict(wavelength=flatten(ref_wvl),
                           valid=flatten(valid_list),
                           group_flag=group_flags,
                           group_id=np.add.accumulate(group_flags)))

    ref_pixel_filtered = [r for r, m in zip(ref_pixel, nan_filter) if m]
    df2 = df.join(pd.DataFrame(dict(pixel=flatten(ref_pixel_filtered)),
                               index=df.index[flatten(valid_list)]))

    return df2
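# Note: the snippets in this collection all lean on a small `flatten` helper from
# their own project's utils module. The implementations vary from project to
# project; a minimal sketch of the one-level flattening most of these calls rely
# on could look like the following (an illustrative assumption, not any project's
# actual code):
def flatten(list_of_lists):
    # Collapse one level of nesting: [[1, 2], [3]] -> [1, 2, 3].
    return [item for sublist in list_of_lists for item in sublist]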
def p_program(self, p):
    """program : declarations fundefs instructions"""
    p[0] = AST.Program(AST.Declarations.mapTypedDeclarations(p[1]),
                       utils.flatten(p[2]),
                       AST.Instructions(utils.flatten(p[3])))
    p[0].set_parents()
    p[0].set_scope()
    p[0].set_position(p.lexer.lexer.lineno, p.lexer.lexer.lexpos)
def mergedict(dicts):
    result = {}
    keys = set(flatten([d.keys() for d in dicts]))
    for k in keys:
        vals = [v for v in [d.get(k) for d in dicts] if v]
        if len(vals) == 0:
            continue
        if k in ("_id", "timestamp"):
            continue
        if isinstance(vals[0], dict):
            result[k] = mergedict(vals)
        elif isinstance(vals[0], (str, unicode, int, bool, long)):
            v = set(flatten(vals))
            if len(v) == 1:
                result[k] = v.pop()
            else:
                result[k] = list(v)
        elif isinstance(vals[0], (list, tuple)) and isinstance(vals[0][0], (str, unicode, int, bool)):
            result[k] = list(set(flatten(vals)))
        elif k == "interfaces":
            result[k] = mergedict_bykeys(vals, "ifindex")
        elif k == "arp":
            result[k] = mergedict_bykeys(vals, "mac", "v4addr")
        elif k == "neighbours":
            result[k] = mergedict_bykeys(vals, "v4addr")
        elif k == "v4routes":
            result[k] = mergedict_bykeys(vals, "network")
        elif k == "bridges":
            result[k] = mergedict_bykeys(vals, "name")
        elif k == "addresses":
            result[k] = mergedict_bykeys(vals, "mac")
        else:
            raise MergeError("unhandled key: %s" % k)
    return result
def semantics(doc):
    prep = preprocess(doc)
    return (
        flatten(prep.pos_tags()),
        prep.noun_phrases(),
        flatten(prep.get_entities())
    )
def count_intervals_in_all_songs(songs):
    intervals_direction = utils.flatten([song.intervals_with_direction for song in songs])
    intervals = utils.flatten([song.intervals for song in songs])
    print "All Songs\n"
    print "  With direction"
    utils.print_dic_perc(Counter(intervals_direction))
    print "  Without direction"
    utils.print_dic_perc(Counter(intervals))
def fit(self, X, y, copy=False):
    IRSystem.fit(self, X, y, copy)
    self._target = list(self._target)
    self._labels = tuple(set(flatten(self._target)))
    self._label_dist = Counter(flatten(self._target))
    self._target = np.array(self._target)
    self.compute_prior()
    self.compute_conditional()
def global_region_weights(self):
    '''returns default weights for all similarities'''
    #return utils.norm_list(utils.flatten(self._feature_compare_helper(lambda fs1, not_used: [1] + utils.flatten(fs1.region_weights()), self, 'all')))
    #return utils.norm_list(utils.flatten(map(lambda fs: [1] + utils.flatten(fs.region_weights()), self.feature_sets)))
    self._load_check()
    weights = list()
    for fs in self.feature_sets:
        region_weights = fs.region_weights()
        weights.append([1] + utils.flatten(region_weights))
    return utils.norm_list(utils.flatten(weights))
def PLS_Models(model_dict, validation_dict, target, **args):
    '''This section of code will create and test the prospective models.'''

    # Pick the model-building parameters out of args.
    try:
        break_flag = args['break_flag']  # Decide whether or not we are going to test models that include a midseason split.
    except KeyError:
        break_flag = 2

    try:
        limits = utils.flatten([args['specificity']])
    except KeyError:
        limits = np.arange(11.) / 100 + 0.85  # Default: test specificity limits from 0.85 to 0.95.

    try:
        threshold = utils.flatten([args['threshold']])
    except KeyError:
        threshold = [0, 1]  # Default: threshold by counts.

    if break_flag != 1:
        model = pls.Model(model_dict, model_target=target.lower())
    if break_flag != 0:
        mw = pls.Model_Wrapper(data=model_dict, model_target=target.lower())

    results = list()

    # Test models w/ midseason split.
    for spec_lim in limits:
        for threshold_method in threshold:
            if threshold_method == 0:
                balance_method = 1
            else:
                balance_method = 0

            if break_flag != 0:
                '''mw.Generate_Models(breaks=1, specificity=spec_lim, wedge='julian',
                                      threshold_method=threshold_method, balance_method=balance_method)
                imbalance = mw.imbalance
                split_index = mlab.find(mw.imbalance[:,1] == np.min(mw.imbalance[:,1]))'''
                imbalance = pls_parallel.Tune_Split(mw, specificity=spec_lim, wedge='julian',
                                                    threshold_method=threshold_method,
                                                    balance_method=balance_method)
                split_index = mlab.find(imbalance[:, 1] == np.min(imbalance[:, 1]))

                for split in imbalance[split_index, 0]:
                    mw.Split(wedge='julian', breakpoint=split)
                    mw.Assign_Thresholds(threshold_method=threshold_method, specificity=spec_lim)
                    summary = Summarize(mw, validation_dict, **args)
                    summary.insert(1, balance_method)
                    summary.insert(1, threshold_method)
                    results.append(summary)

    # Test models w/o midseason split.
    if break_flag != 1:
        for spec_lim in limits:
            model.Threshold(specificity=spec_lim)
            summary = Summarize(model, validation_dict, **args)
            summary.insert(1, np.nan)
            summary.insert(1, np.nan)
            results.append(summary)

    return results
def add_pattern(self, production, options):
    self.progress(production, 'add_pattern: %s' % options)
    self.progress(production, '[\'pattern\', %s, %s, %s' % (options.get('subject'),
                                                            options.get('predicate'),
                                                            options.get('object')))
    triple = {}
    for r, v in options.items():
        if isinstance(v, list) and len(flatten(v)) == 1:
            v = flatten(v)[0]
        if self.validate and not isinstance(v, Term):
            self.error("add_pattern",
                       "Expected %s to be a resource, but it was %s" % (r, v),
                       {'production': production})
        triple[r] = v
    self.add_prod_datum('pattern', Pattern(triple))
def extract_links(br):
    """Extract FP related links from the current page."""
    links_to_visit_text = list(ut.flatten([br.find_elements_by_partial_link_text(linktext)
                                           for linktext in LINK_LABELS]))
    links_to_visit_url = list(ut.flatten([br.find_elements_by_xpath('//a[contains(@href,"%s")]' % linkurl)
                                          for linkurl in LINK_URLS]))
    links_to_visit = [link for link in links_to_visit_text + links_to_visit_url if link]
    if len(links_to_visit) < NO_OF_LINKS_TO_CLICK:  # if we cannot find links by href and link texts
        links_to_visit += extract_onclick_elements(br)  # we search for all elements with onclick event handler
    wl_log.info('%s links were found on %s' % (len(links_to_visit), br.current_url))
    return links_to_visit
def train(self, authors):
    self.stopwords = {ln: self.get_stop_words(ln)
                      for ln in self.db.get_languages()}
    lang = self.db.get_author_language(authors[0])

    self.words = [self.db.get_author(a)["corpus"] for a in authors]
    self.words = utils.flatten(self.words)

    tokenizer = self.get_tokenizer()
    self.words = map(lambda x: tokenizer.tokenize(x), self.words)
    self.words = utils.flatten(self.words)
    self.words = list(set([x.lower() for x in self.words]))
    self.words = filter(lambda x: x in self.stopwords[lang], self.words)
    self.words.sort()
def tokenize_insight(insight, twitter=False):
    """ return subject, property and context tokens """
    url = twagUrl if twitter else stagUrl
    insight = dict(insight.items())
    context = tokenize_doc({'content': insight['content']}, twitter=twitter)
    subj = tokenize_doc({'content': insight['subject']}, twitter=twitter)
    prop = tokenize_doc({'content': insight['property']}, twitter=twitter)
    insight['context_toks'] = flatten(context['toks'])
    insight['subj_toks'] = flatten(subj['toks'])
    insight['prop_toks'] = flatten(prop['toks'])
    return insight
def listspecies(reactions):
    # print "listspecies:"
    species = []
    for r in reactions:
        lhs = list(r.LHS())
        rhs = list(r.RHS())
        mhs = list(r.MHS())
        s = list(set(utils.flatten([lhs, rhs, mhs])))
        # print "s=", s
        species.append(s)
    species = list(set(utils.flatten(species)))
    if "Nil" in species:
        species.remove("Nil")
    # print "species=", species
    return species
def func(*args, **kwargs):
    from scrapper.models import Criterion
    tweets = fn(*args, **kwargs)
    if tweets:
        users, hash_tags = zip(*map(
            lambda tweet: (tweet['mentions'], tweet['hash_tags']),
            tweets
        ))
        users = flatten(users)
        hash_tags = flatten(hash_tags)
        criteria_obj = [Criterion(type='hash_tag', value=hashtag)
                        for hashtag in hash_tags if is_valid_hashtag(hashtag)]
        criteria_obj += [Criterion(type='user_name', value=username)
                         for username in users if is_valid_username(username)]
        Criterion.objects.bulk_create_or_skip(hash_tags=hash_tags, user_name=users)
    return tweets
def cseg_similarity(cseg1, cseg2):
    """Returns Marvin and Laprade (1987) CSIM(A, B) for a single cseg.

    It's a contour similarity function that measures similarity between
    two csegs of the same cardinality. The maximum similarity is 1, and
    the minimum is 0.

    >>> cseg_similarity(Contour([0, 2, 3, 1]), Contour([3, 1, 0, 2]))
    0
    """
    cseg1_triangle = utils.flatten(cseg1.comparison_matrix().superior_triangle())
    cseg2_triangle = utils.flatten(cseg2.comparison_matrix().superior_triangle())
    return auxiliary.position_comparison(cseg1_triangle, cseg2_triangle)
def possible_cseg(base_3):
    """Returns a cseg from a base 3 sequence, if the cseg is possible
    (Polansky and Bassein 1992).

    >>> possible_cseg([2, 2, 2])
    < 0 1 2 >
    """
    seq = utils.flatten(base_3)
    size = len(seq)
    for x in itertools.product(range(size), repeat=3):
        cseg = Contour(x)
        if utils.flatten(cseg.base_three_representation()) == seq:
            return Contour(x)
    return "Impossible cseg"
def viewfinder_corners(corners, position, extensions):
    # [(1, 2), (10, 20)] -> [-1, 10, -2, 20]
    corners = [-corners[0][0], corners[1][0], -corners[0][1], corners[1][1]]
    # [5, 4] -> [5, 5, 4, 4]
    extensions = flatten([(x, x) for x in extensions])
    # [2, 3] -> [-2, 2, -3, 3]
    position = flatten([(-x, x) for x in position])
    visible = []
    for corner, extension, coordinate in zip(corners, extensions, position):
        extended_coordinate = coordinate + extension
        if extended_coordinate < corner:
            visible.append(abs(extended_coordinate))
        else:
            visible.append(abs(corner))
    return visible
def get_files(pattern):
    '''Returns a list of files matching the glob pattern provided.
    Pattern can be an iterable with multiple patterns.'''
    if not hasattr(pattern, '__iter__'):
        pattern = [pattern]
    return list(flatten([glob.glob(x) for x in pattern]))
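# Usage sketch for get_files above; the patterns and matched files are hypothetical,
# and glob/flatten are assumed to come from the snippet's own imports.
csv_files = get_files('*.csv')              # single pattern -> e.g. ['a.csv', 'b.csv']
mixed = get_files(['*.csv', 'logs/*.txt'])  # any iterable of patterns also works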
def crawl(self):
    # Constituency (district) representatives
    jobs = []
    constant_candidates = self.constant_candidates
    candidate_type = self.candidate_type
    target = self.target
    target_eng = self.target_eng
    target_kor = self.target_kor
    nth = self.nth
    req_url = self.urlPath_result_list
    townCode_JSON = get_json(self.urlPath_city_codes, self.urlParam_city_codes)[0]['results']

    print("\x1b[1;36mWaiting to connect http://info.nec.go.kr server (%s, %d-th)...\x1b[1;m"
          % (target_eng, nth))
    for city_index, (city_code, city_name) in list(enumerate(self.city_codes(townCode_JSON))):
        req_param = self.XHTML_url_param(city_code)
        job = gevent.spawn(self.parse, req_url, req_param, constant_candidates, target,
                           target_kor, city_name, city_code, city_index, townCode_JSON)
        jobs.append(job)
    gevent.joinall(jobs)
    every_result = [{'election_type': target_eng, 'nth': nth, 'candidate_type': candidate_type,
                     'results': flatten(job.get() for job in jobs)}]

    # Proportional representatives
    if hasattr(self, 'next_crawler'):
        prop_result = self.next_crawler.crawl()
        every_result.extend(prop_result)
    return every_result
def crawl(self):
    jobs = []
    target = self.target
    target_eng = self.target_eng
    target_kor = self.target_kor
    nth = self.nth
    city_code_list = self.city_codes()
    req_url = dict(town=self.urlPath_town_list, sgg=self.urlPath_sgg_list)
    param_dict = dict(town=self.urlParam_town_list, sgg=self.urlParam_sgg_list)

    # Basic flow for crawling data from each metropolitan/provincial-level page.
    print("\x1b[1;36mWaiting to connect http://info.nec.go.kr server (%s, %d-th)...\x1b[1;m"
          % (target_eng, nth))
    for city_code, city_name in city_code_list:
        # Run the steps below for each metropolitan/provincial government.
        req_param = self.JSON_url_param(city_code, copy.deepcopy(param_dict))
        job = gevent.spawn(self.parse_city, req_url, req_param, target, target_kor,
                           nth, city_code, city_name)
        jobs.append(job)
    gevent.joinall(jobs)
    every_result = [{'election_type': target, 'nth': nth,
                     'results': flatten(job.get() for job in jobs)}]

    # Use next_crawler to crawl any additional data that may be added.
    if hasattr(self, 'next_crawler'):
        next_result = self.next_crawler.crawl()
        every_result.extend(next_result)
    return every_result
def save(video, format='txt'):
    for case in switch(format):
        if case('xls'):
            import xlwt
            if len(video['comments']) > 0:
                wbk = xlwt.Workbook()
                sheet = wbk.add_sheet(tech.validate(video['title']))
                bigs = wbk.add_sheet('Bigrams')
                tris = wbk.add_sheet('Trigrams')
                context = wbk.add_sheet('Context')
                for_nlp = tech.flatten(video['comments'][0])
                # indexing is zero-based, row then column
                for idx, comment in enumerate(video['comments'][0]):
                    sheet.write(idx, 0, ' '.join(comment))
                for idx, bigram in enumerate(tech.bigrams(for_nlp, self.term)):
                    bigs.write(idx, 0, ' '.join(bigram))
                for idx, trigram in enumerate(tech.trigrams(for_nlp, self.term)):
                    tris.write(idx, 0, ' '.join(trigram))
                for idx, con in enumerate(tech.context(for_nlp, self.term)):
                    context.write(idx, 0, ' '.join(con))
                wbk.save(tech.validate(video['title']) + '.xls')
                print 'Videos, trigrams, bigrams, and contexts saved to XLS files.'
            break
        if case('txt'):
            if len(video['comments']) > 0:
                with open(path.join(dir_path, tech.validate(video['title'])), 'a') as f:
                    f.write(video['comments'])
                print 'Saved %s as text' % video['title']
            break
def from_file(filename):
    try:
        j = json.load(open(filename, 'r'))
    except IOError as e:
        print "Error opening show:", e
        return None

    name = j['name']
    start = numbering_from_str(j['start'])
    airdates = dict((start.from_str(e), datetime.strptime(d, "%Y/%m/%d").date())
                    for e, d in j.get('airdates', {}).iteritems())
    name_matcher = NameMatcher(j.get('match', {'name': name}))

    sources = []
    for source_name, source_params in j['sources'].iteritems():
        params = config.source(source_name)
        params.update(source_params)
        source = get_source(source_name, params)
        if source:
            sources.append(source)

    condition = j.get('condition', config.condition)
    rules = list(flatten(config.bind_rules(j.get('rules'))))
    if not rules:
        rules = [config.rule('default')]

    return Show(name, filename, start, airdates, name_matcher, sources, condition, rules)
def expand_commas(self, root):
    '''
    Transform

        foo, bar, baz
          color: blue

    into

        foo
          color: blue
        bar
          color: blue
        baz
          color: blue
    '''
    children = []
    for child in root.children:
        if not isinstance(child, tree.RuleNode) and not child.parsed_rules.members.size > 1:
            if isinstance(child, tree.DirectiveNode):
                child = self.expand_commas(child)
            children.append(child)
            continue
        members = []
        for seq in child.parsed_rules.members:
            node = tree.RuleNode([])
            node.parsed_rules = self.make_cseq(seq)
            node.children = child.children
            members.append(node)
        children.append(members)
    root.children = utils.flatten(children)
    return root
def __init__(self, confirm):
    self.confirm = confirm
    self.clusters = confirm.get_clusters()
    self.preprocess_clusters()
    self.docs = utils.flatten(map(lambda cluster: cluster.members, self.clusters))
    self.all_labels = self.get_all_labels()
    self.num_docs = len(self.docs)
    self.label_pr_mats = self.calc_label_pr_mats()
    self.label_cluster_mat = self.calc_label_cluster_counts()

    self.total_counts = {count: 0 for count in _counts}
    for label in self.label_pr_mats:
        for count in _counts:
            n = self.label_pr_mats[label][count]
            self.total_counts[count] += n
    #print json.dumps(self.label_pr_mats, indent=4)

    labels = list(self.all_labels)
    mapping = {label: labels.index(label) for label in labels}
    self.true_labels = map(lambda _doc: mapping[_doc.label], self.docs)

    self.predicted_labels = list()
    for _doc in self.docs:
        for x, _cluster in enumerate(self.clusters):
            if _doc in _cluster.members:
                self.predicted_labels.append(x)
                break
def test_features_syn():
    docs = doc.get_docs_nested(get_data_dir(sys.argv[2]))
    max_size = int(sys.argv[3])
    num_combine = int(sys.argv[4])
    min_size = int(sys.argv[5])

    d = collections.defaultdict(list)
    for _doc in docs:
        d[_doc.label].append(_doc)
    pure_clusters = d.values()

    broken_clusters = list()
    for x in xrange(10):
        for _cluster in pure_clusters:
            broken_clusters += [_cluster[i:i + max_size] for i in range(0, len(_cluster), max_size)]

    combined_clusters = list()
    while broken_clusters:
        if len(broken_clusters) < num_combine:
            clusters = list(broken_clusters)
        else:
            clusters = random.sample(broken_clusters, num_combine)
        for _cluster in clusters:
            broken_clusters.remove(_cluster)
        combined_clusters.append(utils.flatten(clusters))

    clusters = map(lambda combined_cluster: cluster.Cluster(combined_cluster), combined_clusters)
    ncluster.test_features(clusters, min_size)
def generate_model(self):
    print("Gathering and processing tweets...")

    # Shuffle list of username-label tuples
    tuple_list = usermapping.data_tuples.items()

    # Split and grab tweets for users
    results = utils.flatten([self.fetch_data(t) for t in tuple_list])

    # TODO: Cross-validation generation
    trn_ratio = int(len(results) * 0.85)
    shuffle(results)
    print(len(results))
    print(trn_ratio)
    train = results[:trn_ratio]
    test = results[trn_ratio:]

    # Instantiate and train classifier
    print("Training...")
    cl = NaiveBayesClassifier(train)
    cl.train()

    # Save model
    print("Saving model...")
    utils.save_model(cl)

    # Classify test
    print("Testing...")
    print("Accuracy: {0}".format(cl.accuracy(test)))
    return cl
def p_expr_list_or_empty(self, p):
    """expr_list_or_empty : expr_list
                          | """
    if len(p) > 1:
        p[0] = utils.flatten(p[1])
    else:
        p[0] = []
def process_individual_frames(self, PATH_TO_DATA, annotations, list_of_layers, sampling_rate, LCD):
    i = 0
    label_map = {}
    frm_map = {}
    X = {}
    map_index_data = pickle.load(open(annotations, "rb"))
    for index in map_index_data:
        segments = map_index_data[index]
        print "Processing images for label " + str(index)
        for seg in segments:
            print str(seg)
            frm_num = seg[0]
            while frm_num <= seg[1]:
                print frm_num
                frm_map[i] = frm_num
                label_map[i] = index
                full_image_path = utils.get_full_image_path(PATH_TO_DATA, frm_num)
                im = caffe.io.load_image(full_image_path)
                self.net.blobs['data'].data[...] = self.transformer.preprocess('data', im)
                out = self.net.forward()
                for layer in list_of_layers:
                    if layer == 'input':
                        data = cv2.imread(full_image_path)
                    else:
                        data = self.net.blobs[layer].data[0]
                    data = utils.flatten(data)
                    utils.dict_insert(layer, data, X)
                frm_num += sampling_rate
                i += 1
    return X, label_map, frm_map
def ValidateLogistic(model_dict, validation_dict, target, **args):
    '''Creates and tests prospective models using logistic regression.'''

    # Pick the model-building parameters out of args.
    try:
        weights = list(args['weights'])  # Logistic regression, weighted away from the threshold.
    except KeyError:
        weights = ['discrete']

    try:
        limits = utils.flatten([args['specificity']])
    except KeyError:
        limits = np.arange(11.) / 100 + 0.85  # Default: test specificity limits from 0.85 to 0.95.

    results = list()

    # Test models w/ midseason split.
    for weight in weights:
        for limit in limits:
            l = logistic.Model(model_dict, target, specificity=limit, weights=weight)
            summary = Summarize(l, validation_dict, **args)
            summary.insert(1, weight)
            summary.insert(1, np.nan)
            results.append(summary)

    return results
def make_similarity_matrix(matrix, size=MIN_ALIGN):
    singles = matrix.tolist()
    points = [flatten(t) for t in tuples(singles, size)]
    numPoints = len(points)
    # euclidean distance
    distMat = np.sqrt(np.sum((repmat(points, numPoints, 1) - repeat(points, numPoints, axis=0))**2,
                             axis=1, dtype=np.float32))
    return distMat.reshape((numPoints, numPoints))
def greedy_search(abs_list, doc_list, budget=3):
    def _rouge_clean(s):
        return re.sub(r'[^a-zA-Z0-9 ]', '', s)

    max_rouge = 0.0
    abs_tokens = _rouge_clean(' '.join(flatten(abs_list))).split()
    sents = [_rouge_clean(' '.join(sent)).split() for sent in doc_list]
    hyp_unigrams = [ngrams(sent, 1) for sent in sents]
    hyp_bigrams = [ngrams(sent, 2) for sent in sents]
    ref_unigrams = ngrams(abs_tokens, 1)
    ref_bigrams = ngrams(abs_tokens, 2)

    selected_idxs = []
    for _ in range(budget):
        curr_max_rouge = max_rouge
        curr_id = -1
        for (i, sent) in enumerate(sents):
            if i in selected_idxs:
                continue
            candidate_idxs = selected_idxs + [i]
            candidate_unigrams = set.union(*[set(hyp_unigrams[idx]) for idx in candidate_idxs])
            candidate_bigrams = set.union(*[set(hyp_bigrams[idx]) for idx in candidate_idxs])
            rouge1 = approx_rouge(candidate_unigrams, ref_unigrams)
            rouge2 = approx_rouge(candidate_bigrams, ref_bigrams)
            rouge_score = rouge1 + rouge2
            if rouge_score > curr_max_rouge:
                curr_max_rouge = rouge_score
                curr_id = i
        if curr_id == -1:
            return (list(sorted(selected_idxs)), max_rouge)
        selected_idxs.append(curr_id)
        max_rouge = curr_max_rouge
    return (list(sorted(selected_idxs)), max_rouge)
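# The ngrams and approx_rouge helpers used by greedy_search above are not shown.
# A minimal sketch consistent with how they are called (assuming ngrams returns a
# set of n-gram tuples and approx_rouge returns an overlap-based F1 score) might be:
def ngrams(tokens, n):
    # All contiguous n-gram tuples of a token list.
    return {tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)}

def approx_rouge(hyp_ngrams, ref_ngrams):
    # Overlap F1 between hypothesis and reference n-gram sets,
    # a common approximation of ROUGE-N.
    hyp_ngrams, ref_ngrams = set(hyp_ngrams), set(ref_ngrams)
    if not hyp_ngrams or not ref_ngrams:
        return 0.0
    overlap = len(hyp_ngrams & ref_ngrams)
    precision = overlap / len(hyp_ngrams)
    recall = overlap / len(ref_ngrams)
    if precision + recall == 0:
        return 0.0
    return 2 * precision * recall / (precision + recall)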
def create_clusters(documents):
    for document in documents:
        document.text = []
        for sentence in document.original:
            words = [word.lower() for word in sentence]
            words = [word for word in words if word not in forbidden_symbols]
            document.text.append(words)

    num_docs = len(documents)
    term_doc_frequency = {}

    # calculate term frequency per document
    for document in documents:
        words = flatten(document.text)
        words = words[:DECAY_THRESHOLD]
        doc_corpus = {}
        for word in words:
            if word in doc_corpus:
                doc_corpus[word] += 1
            else:
                doc_corpus[word] = 1
        document.corpus = doc_corpus

    # calculate number of documents where each term appears
    for document in documents:
        for word in document.corpus:
            if word in term_doc_frequency:
                term_doc_frequency[word] += 1
            else:
                term_doc_frequency[word] = 1

    # calculate the idf for each term, and ignore it if it is below the threshold
    term_idf = {}
    for term in term_doc_frequency:
        idf = math.log(num_docs / (1 + term_doc_frequency[term]))
        if idf >= IDF_THRESHOLD:
            term_idf[term] = idf

    # generate the tf_idf vector for each document
    for document in documents:
        document.tf_idf_vector = generate_tf_idf_vector(document, term_idf)

    clusters = [Cluster(documents[0], term_idf)]
    for document in documents[1:]:
        clusters = calc_similarity(clusters, document, term_idf)
    return clusters
def calculate_word_frequency(topic_headers):
    """Calculates the word frequencies in topic headers"""
    tokens = flatten([word_tokenize(topic_header) for topic_header in topic_headers])
    tokens = [word.lower() for word in tokens
              if not word.lower() in stopwords.words("english")]
    word_frequencies = FreqDist(word.lower() for word in tokens)
    top_word_frequencies = word_frequencies.most_common(500)
    for x in top_word_frequencies:
        print(x)
    return top_word_frequencies
def scan_P__(P__):
    """Detect forks and roots per P."""

    for _P_, P_ in pairwise(P__):  # Iterate through pairs of lines.
        _itP_, itP_ = iter(_P_), iter(P_)  # Convert to iterators.
        try:
            _P, P = next(_itP_), next(itP_)  # First pair to check.
        except StopIteration:  # No more fork-root pair.
            continue  # To next pair of _P_, P_.
        while True:
            isleft, olp = comp_edge(_P, P)  # Check for 4 different cases.
            if olp and _P['sign'] == P['sign']:
                _P['root_'].append(P)
                P['fork_'].append(_P)
            try:  # Check for stopping:
                _P, P = (next(_itP_), P) if isleft else (_P, next(itP_))
            except StopIteration:  # No more fork-root pair.
                break  # To next pair of _P_, P_.

    return [*flatten(P__)]  # Flatten P__ before return.
def add_all_imports_idx_and_depth_to_row(row, df, join_all, max_depth):
    idx_set = set([row['ID']])
    depth_list = [0]
    depth = 0
    # print row
    if join_all or row.loc['is_imported'] == False:
        imports_idx_all, imports_depth = get_all_imports_idx_and_depth(
            df=df,
            imports_idx=row.loc['imports_idx'],
            idx_set=idx_set,
            depth_list=depth_list,
            depth=depth,
            max_depth=max_depth,
        )
        row['imports_idx_all'] = flatten(list(imports_idx_all))
        row['imports_depth'] = imports_depth
    else:
        row['imports_idx_all'] = []
        row['imports_depth'] = -1
    return row
def complex_volume(model, ind):
    cdist = gpytorch.kernels.Kernel().covar_dist
    n_vert = len(model.simplicial_complex[ind])
    total_vert = model.n_vert
    mat = torch.ones(n_vert + 1, n_vert + 1) - torch.eye(n_vert + 1)

    ## compute distance between parameters ##
    temp_pars = [p for p in model.net.parameters()][0::total_vert]
    n_par = int(sum([p.numel() for p in temp_pars]))
    par_vecs = torch.zeros(n_vert, n_par).to(temp_pars[0].device)
    for ii, vv in enumerate(model.simplicial_complex[ind]):
        par_vecs[ii, :] = utils.flatten([p for p in model.net.parameters()][vv::total_vert])

    dist_mat = cdist(par_vecs, par_vecs).pow(2)
    mat[:n_vert, :n_vert] = dist_mat

    norm = (math.factorial(n_vert - 1)**2) * (2.**(n_vert - 1))
    return torch.abs(torch.det(mat)).div(norm)
def make_dictionary(word2index, sentences_list, vocab_size=50000):
    # Drop the trailing <eos> token from each sentence.
    sentences_list = [sentence[:-1] for sentence in sentences_list]
    # Flatten the nested list of sentences into a single word list.
    words_list = utils.flatten(sentences_list)

    # Sort words by frequency and assign ids.
    counter = collections.Counter()
    counter.update(words_list)
    cnt = 2
    for word, count in counter.most_common():
        # Only add words that occur at least 3 times to the dictionary.
        if cnt >= vocab_size:
            break
        if count >= 3:
            word2index[word] = cnt
            cnt += 1
    word2index[u'<sos>'] = 0
    word2index[u'<eos>'] = 1
    word2index[u'<unk>'] = len(word2index)
    return word2index
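# Usage sketch for make_dictionary above; the toy sentences are hypothetical, and
# utils.flatten is assumed to collapse the nested sentence list into one word list.
word2index = {}
sentences = [[u'the', u'cat', u'sat', u'<eos>'],
             [u'the', u'dog', u'ran', u'<eos>'],
             [u'the', u'cat', u'ran', u'<eos>']]
word2index = make_dictionary(word2index, sentences, vocab_size=50000)
# Only 'the' occurs 3 or more times, so it gets id 2; <sos>=0, <eos>=1, and <unk>
# takes the next free id for out-of-vocabulary words.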
def get_data(filename, syll_mgr, num_symbols):
    num_syllables = syll_mgr.get_size()
    lines = open(filename, 'r').read().splitlines()
    num_lines = len(lines)
    num_lines = 30000
    text_lines = []
    text_sylls = []
    for i in range(0, num_lines):
        parts = lines[i].split("\t")
        label = utils.flatten(literal_eval(parts[1]))
        if len(label) == num_symbols:
            text_lines.append(str(parts[0]))
            text_sylls.append(label)
    num_lines = len(text_lines)
    label_array = np.zeros((num_symbols, num_lines, num_syllables), dtype=np.int8)
    for i in range(0, num_lines):
        for j in range(num_symbols):
            label_array[j][i][syll_mgr.get_encoding(text_sylls[i][j])] = 1
    return (text_lines, label_array)
def update_position(self):
    Frame.update_position(self)
    w, h = self.width, self.height
    ax, ay = self.abs_coords()
    self._gcp_framevlist.vertices[:] = flatten([
        rectv2f(2, 2, w-4, 54-2, ax, ay),      # equip box
        rrectv2f(2.5, 2.5, 4*36, 52, ax, ay),  # equip box border
        rectv2f(w-2-32, 66, 32, 22, ax, ay),   # cardnum box
        rrectv2f(w-2-32, 66, 32, 22, ax, ay),  # cardnum box border
    ])

    full = rectv2f(0, 0, w, h, ax, ay)
    self._highlight_disabled.vertices[:] = full
    self._highlight.vertices[:] = full

    if self.actor_frame:
        self.actor_frame.set_position(self.x - 6, self.y - 4)

    if self.turn_frame:
        self.turn_frame.set_position(self.x - 6, self.y - 4)
def train(self, authors):
    self.stopwords = {ln: self.get_stop_words(ln)
                      for ln in self.db.get_languages()}
    lang = self.db.get_author_language(authors[0])

    # transform corpus into a list of preprocessed documents
    documents = [self.db.get_author(a)["corpus"] for a in authors]
    documents = utils.flatten(documents)
    tokenizer = self.get_tokenizer()
    documents = map(lambda x: tokenizer.tokenize(x), documents)
    documents = [map(lambda x: x.lower(), d) for d in documents]
    documents = [filter(lambda x: x not in self.stopwords[lang], d)
                 for d in documents]

    # build topic model
    self.dictionary = corpora.Dictionary(documents)
    self.dictionary.filter_extremes(no_below=5, no_above=0.5)
    documents = map(lambda x: self.dictionary.doc2bow(x), documents)
    self.model = models.LdaModel(documents, num_topics=self.k,
                                 id2word=self.dictionary, iterations=1000)
def linear(x, output_size, scope, add_tanh=False, wd=None, bn=False, bias=False, is_train=None, ln=False):
    # bn -> batch norm
    # ln -> layer norm
    with tf.variable_scope(scope):
        # since the input here is not rank two, we flatten it while keeping the last dim
        keep = 1
        #print x.get_shape().as_list()
        flat_x = flatten(x, keep)  # keep the last dim: [N,M,JX,JQ,2d] => [N*M*JX*JQ,2d]
        #print flat_x.get_shape()  # (?, 200)
        # wd+cwd
        bias_start = 0.0
        if not (type(output_size) == type(1)):  # need to be get_shape()[k].value
            output_size = output_size.value

        # add batch_norm
        if bn:
            assert is_train is not None
            flat_x = batch_norm(flat_x, scope="bn", is_train=is_train)
        if ln:
            flat_x = layer_norm(flat_x, scope="ln")

        #print [flat_x.get_shape()[-1], output_size]
        W = tf.get_variable("W", dtype="float",
                            initializer=tf.truncated_normal([flat_x.get_shape()[-1].value, output_size],
                                                            stddev=0.1))
        flat_out = tf.matmul(flat_x, W)
        if bias:
            b = tf.get_variable("b", dtype="float",
                                initializer=tf.constant(bias_start, shape=[output_size]))
            flat_out += b
        if add_tanh:
            flat_out = tf.tanh(flat_out, name="tanh")
        #flat_out = tf.nn.dropout(flat_out, keep_prob)
        if wd is not None:
            add_wd(wd)

        out = reconstruct(flat_out, x, keep)
    return out
def __init__(
    self,
    mdl,
    X,
    feature_names=[],
    feature_types=[],
    feature_categories=[],
    feature_constraints=[],
    max_candidates=100,
    tol=1e-6,
    target_name='Output',
    target_labels=['Good', 'Bad'],
    interaction_matrix=[],
):
    self.mdl_ = mdl
    self.coef_ = mdl.coef_[0]
    self.intercept_ = mdl.intercept_[0]
    self.X_ = X
    self.N_, self.D_ = X.shape
    self.feature_names_ = feature_names if len(feature_names) == self.D_ \
        else ['x_{}'.format(d) for d in range(self.D_)]
    self.feature_types_ = feature_types if len(feature_types) == self.D_ \
        else ['C' for d in range(self.D_)]
    self.feature_categories_ = feature_categories
    self.feature_categories_flatten_ = flatten(feature_categories)
    self.feature_constraints_ = feature_constraints if len(feature_constraints) == self.D_ \
        else ['' for d in range(self.D_)]
    self.target_name_ = target_name
    self.target_labels_ = target_labels
    self.AC_ = ActionCandidates(X,
                                feature_names=feature_names,
                                feature_types=feature_types,
                                feature_categories=feature_categories,
                                feature_constraints=feature_constraints,
                                max_candidates=max_candidates,
                                tol=tol)
    self.tol_ = tol
    self.M_ = interaction_matrix if len(interaction_matrix) == self.D_ \
        else np.zeros([self.D_, self.D_])
def to_conll(self, val_corpus, eval_script):
    """ Write the predictions to out_file, return CoNLL metrics results """

    # Make predictions directory if there isn't one already
    golds_file, preds_file = '../preds/golds.txt', '../preds/predictions.txt'
    if not os.path.exists('../preds/'):
        os.makedirs('../preds/')

    # Combine all gold files into a single file (Perl script requires this)
    golds_file_content = flatten([doc.raw_text for doc in val_corpus])
    with io.open(golds_file, 'w', encoding='utf-8', errors='strict') as f:
        for line in golds_file_content:
            f.write(line)

    # Dump predictions
    with io.open(preds_file, 'w', encoding='utf-8', errors='strict') as f:
        for doc in val_corpus:
            current_idx = 0
            for line in doc.raw_text:
                # Indicates start / end of document or line break
                if line.startswith('#begin') or line.startswith('#end') or line == '\n':
                    f.write(line)
                    continue
                else:
                    # Replace the coref column entry with the predicted tag
                    tokens = line.split()
                    tokens[-1] = doc.tags[current_idx]
                    # Increment by 1 so tags are still aligned
                    current_idx += 1
                    # Rewrite it back out
                    f.write('\t'.join(tokens))
                    f.write('\n')

    return golds_file, preds_file
def predictAll(self, sess, save=False):
    if self.predictions is not None:
        return self.predictions

    predictions = None
    for i in range(self.num_test_batches):
        testbatch = self.loader.get_testbatch()
        if self.include_coverage and self.include_entropy:
            preds = self.predict(sess, testbatch[0], testbatch[1], testbatch[2])
        elif self.include_coverage:
            preds = self.predict(sess, testbatch[0], testbatch[1])
        elif self.include_entropy:
            preds = self.predict(sess, testbatch[0], E=testbatch[1])
        else:
            preds = self.predict(sess, testbatch[0])
        if not self.multiclass:
            preds = utils.flatten(preds)
        if predictions is None:
            predictions = preds
        else:
            predictions = np.concatenate((predictions, preds))

    if save:
        self.predictions = predictions
    return predictions
def predict_one_by_one_internal(combined, test_hour_ordinal, largest_lag=10):
    test = extract_test_func(combined)
    # here we have the rows for which we want to calculate aggregates and make predictions
    test_part = test[test['ordinal'] == test_hour_ordinal]

    # should work as long as largest_lag is really the largest lag
    idxs_list = [list(range(idx - largest_lag, idx + 1)) for idx in test_part.index]
    idxs = flatten(idxs_list)

    df = agg_function(combined.iloc[idxs], test_size=1)
    test_for_predictions = df.loc[test_part.index][feats]
    test_for_predictions = convert_to_float_or_factorize_objects(test_for_predictions, feats)

    pred = model.predict(test_for_predictions)
    test.loc[test_part.index, target] = pred
    return test
def generate_visualizations(sample_outputs, output_path='visualization.html'):
    with open(sample_outputs) as json_file:
        data = json.load(json_file)

    with open(output_path, 'w+') as output_file:
        for doc_id in data.keys():
            doc = data[doc_id]
            doc_words = doc['words']
            clusters = doc['predicted_clusters']
            event_mentions = flatten(clusters)

            output_file.write('<b>Document {}</b><br>'.format(doc_id))
            output_file.write('{}<br><br><br>'.format(doc_to_html(doc, event_mentions)))

            for ix, cluster in enumerate(doc['predicted_clusters']):
                if len(cluster) == 1:
                    continue
                output_file.write('<b>Cluster {}</b></br>'.format(ix + 1))
                for em in cluster:
                    output_file.write('{}<br>'.format(event_mentions_to_html(doc_words, em)))
                output_file.write('<br><br>')
            output_file.write('<br><hr>')
def test(self, sess):
    if self.predictions is not None:
        # Evaluate the accuracy of the saved predictions, based on the true test labels
        if self.multiclass:
            return metrics.accuracy_score(
                np.argmax(self.loader.test_data[-1], axis=-1),
                np.argmax(self.predictions.round(), axis=-1))
        else:
            return metrics.accuracy_score(
                utils.flatten(self.loader.test_data[-1]),
                self.predictions.round())

    test_acc = 0
    num_test_ex = 0
    for i in range(self.num_test_batches):
        testbatch = self.loader.get_testbatch()
        cur_size = len(testbatch[1])
        batch_acc = cur_size * self.eval_accuracy_on_batch(testbatch)  # Accuracy on the batch
        test_acc += batch_acc
        num_test_ex += cur_size
    # Final accuracy needs to be a weighted average of batch accuracies, weighted by
    # the size of each batch (since the last batch may be smaller)
    return test_acc / num_test_ex
def make_heatmap(heatmap, graph_terms):
    try:
        set_status('getting document list', model=heatmap)
        with ManagedSession() as session:
            heatmap_query = create_query(session, author=heatmap.author, institution=heatmap.institution)
            filtered_query = filter_query(heatmap_query,
                                          dirty=False,
                                          starting_year=heatmap.starting_year,
                                          ending_year=heatmap.ending_year,
                                          sample_size=heatmap.sample_size,
                                          model=heatmap)
            extracted_terms = extract_terms(filtered_query, heatmap.term_type)
            heatmap_terms = flatten(extracted_terms)
            heatmap_vals = calculate_heatmap_values(heatmap_terms, graph_terms)
            heatmap.terms = json.dumps(jsonize_phrase_dict(heatmap_vals, 'intensity'))
            set_status('heatmap complete', model=heatmap)
            heatmap.finished = True
            heatmap.save()
            return heatmap_vals
    except Exception as e:
        set_status('Error: %s' % e, model=heatmap)
        raise e
def write_matchup(t1, t2, num, week_sheet, region):
    roster1 = get_active_roster(t1)
    roster2 = get_active_roster(t2)
    num_cols = 6
    num_rows = 10
    t1_col = 1
    t2_col = 4
    data_view = [[""] * num_cols for i in range(num_rows)]
    data_view[0][0] = "Match #{}".format(num + 1)
    for t, r, col in [(t1, roster1, t1_col), (t2, roster2, t2_col)]:
        data_view[1][col] = t
        for i, p in enumerate(r):
            both = region == "both"
            in_region = p[1] == region
            if both or in_region:
                data_view[3 + i][col] = p[0]
    top_row = num * num_rows + 1
    cells = week_sheet.range(top_row, 1, top_row + num_rows, num_cols)
    for cell, val in zip(cells, u.flatten(data_view)):
        cell.value = val
    week_sheet.update_cells(cells)
def print_top_k_accuracies(train_labels: List[Dict[int, int]],
                           test_labels: List[Dict[int, int]],
                           top_k_accuracy: List[int],
                           label: str) -> None:
    pred_freq_counter = Counter[int](utils.flatten(l.values() for l in train_labels))
    pred_vec = [lab for (lab, count) in pred_freq_counter.most_common()]
    total_labs = sum(map(len, test_labels))
    unk_labs = sum(1 for labs in test_labels for lab in labs.values() if lab == 0)
    for k in top_k_accuracy:
        k_preds = set(pred_vec[:k])
        corr = sum(1 for labs in test_labels for lab in labs.values() if lab in k_preds)
        utils.log('{}: top {} accuracy: {:.2f} ({:.2f} w/out UNK)'.format(
            label, k, corr / total_labs, corr / (total_labs - unk_labs),
        ))
def brute_force(wanted_parts, price_guide, k):
    """Enumerate all possible combinations of k stores"""
    by_store = utils.groupby(price_guide, lambda x: x['store_id'])
    results = []
    for selected_stores in itertools.combinations(by_store.keys(), k):
        # get items sold by these stores only
        inventory = utils.flatten(by_store[s] for s in selected_stores)
        if covers(wanted_parts, inventory):
            # calculate minimum cost to buy everything using these stores
            cost, allocation = min_cost(wanted_parts, inventory)
            results.append({
                'cost': cost,
                'allocation': allocation,
                'store_ids': selected_stores
            })
            #print 'Solution: k=%d, cost=%8.2f, store_ids=%40s' % (k, cost, selected_stores)
        else:
            #print 'Unable to fill quote using store_ids=%40s' % (selected_stores,)
            pass
    return results
def handle_underscores(suffix, text_encoder, prefix=False):
    encoder = text_encoder.encoder
    if prefix:
        tok = "___"
    else:
        tok = find_underscore_length(suffix)

    suffix_parts = [i.strip() for i in suffix.split("{}".format(tok))]
    to_flatten = []
    for i, part in enumerate(suffix_parts):
        if part:
            to_flatten.append(text_encoder.encode([part], verbose=False)[0])
            if i != len(suffix_parts) - 1 and suffix_parts[i + 1]:
                to_flatten.append([encoder["<blank>"]])
        else:
            to_flatten.append([encoder["<blank>"]])

    final_suffix = utils.flatten(to_flatten)
    return final_suffix
def _get_cooccurrences_context(self, keywords, corpus, context_window):
    """ given some context window, get co-occurrences for keyword network """
    cv = CountVectorizer(vocabulary=np.sort(keywords), ngram_range=(1, 2))
    matrix = cv.fit_transform(flatten(self._shift_corpus(corpus, context_window)))
    vocab = cv.vocabulary

    occurrence_dict, values_dict = defaultdict(list), defaultdict(list)
    row_entry, keyword_id, value = find(matrix)
    occurrences = zip(keyword_id, row_entry, value)  # keyword_id, row entry, value
    for entry in occurrences:
        occurrence_dict[vocab[entry[0]]].append(entry[1])
        values_dict[vocab[entry[0]]].append(entry[2])

    combos = defaultdict()
    for idx, key1 in enumerate(vocab):
        for key2 in vocab[idx + 1:]:
            # get sentence ids where keys occur
            key1_matrix = np.array(occurrence_dict[key1])
            key2_matrix = np.array(occurrence_dict[key2])
            # compute exhaustive distances (in sentence count) between mentions
            dist = np.abs(key1_matrix[:, np.newaxis] - key2_matrix)
            # find where distance is less than the context_window size
            args = np.argwhere(dist < context_window)
            if len(args) > 0:
                # retrieve their co-occurrence values
                cooccur_vals = [val1 * val2 for val1, val2 in
                                zip([values_dict[key1][x] for x in args[:, 0]],
                                    [values_dict[key2][y] for y in args[:, 1]])]
                # update occurrences
                combos['_'.join([key1, key2])] = combos.get('_'.join([key1, key2]), 0) + sum(cooccur_vals)
    return combos
def main():
    url = "data/data.csv"
    df = pd.read_csv(url, index_col='Date', parse_dates=True)
    df = df[['Close']]

    ps.test_stationary(df['Close'])

    train, test = train_test_split(df, test_size=0.1, shuffle=False)

    scaler = StandardScaler()
    scaler = scaler.fit(train[['Close']])
    train['NClose'] = scaler.transform(train[['Close']])
    test['NClose'] = scaler.transform(test[['Close']])

    sequence_length = 100
    X_train, y_train = temporalize(train[['NClose']], train.NClose, False, sequence_length)
    X_test, y_test = temporalize(test[['NClose']], test.NClose, False, sequence_length)

    input_shape = (X_train.shape[1], X_train.shape[2])
    intermediate_cfg = [64, 'latent', 64]
    latent_dim = 10

    model = V_AE_LSTM(input_shape, intermediate_cfg, latent_dim, 'VAE-LSTM')
    model.fit(X_train, y_train, epochs=2, batch_size=124,
              validation_split=None, verbose=1)

    reconstruction, prediction = model.predict(X_test)
    reconstruction = flatten(reconstruction).reshape(-1)

    res = anomaly_detector(y_test.reshape(-1, 1), prediction.reshape(-1, 1))
def add_vert(self, to_simplexes=[0]):
    self.fix_points = [True] * self.n_vert + [False]
    new_model = self.architecture(self.n_output,
                                  fix_points=self.fix_points,
                                  **self.architecture_kwargs)

    ## assign old pars to new model ##
    for index in range(self.n_vert):
        old_parameters = list(self.net.parameters())[index::self.n_vert]
        new_parameters = list(new_model.parameters())[index::(self.n_vert + 1)]
        for old_par, new_par in zip(old_parameters, new_parameters):
            new_par.data.copy_(old_par.data)

    new_parameters = list(new_model.parameters())
    new_parameters = new_parameters[self.n_vert::(self.n_vert + 1)]
    n_par = sum([p.numel() for p in new_parameters])

    ## assign mean of old pars to new vertex ##
    par_vecs = torch.zeros(self.n_vert, n_par).to(new_parameters[0].device)
    for ii in range(self.n_vert):
        temp = [p for p in self.net.parameters()][ii::self.n_vert]
        par_vecs[ii, :] = utils.flatten(temp)

    center_pars = torch.mean(par_vecs, 0).unsqueeze(0)
    center_pars = utils.unflatten_like(center_pars, new_parameters)
    for cntr, par in zip(center_pars, new_parameters):
        par.data = cntr.to(par.device)

    ## update self values ##
    self.n_vert += 1
    self.net = new_model

    self.simplex_modules = []
    for module in self.net.modules():
        if issubclass(module.__class__, SimplexModule):
            self.simplex_modules.append(module)

    for cc in to_simplexes:
        self.simplicial_complex[cc].append(self.n_vert - 1)
    return
def handle(self, evt_type, act):
    if evt_type == 'action_before' and isinstance(act, SpellCardAction):
        if act.cancelled:
            return act  # something else has already done the job

        if isinstance(act, NonResponsiveInstantSpellCardAction):
            return act

        g = Game.getgame()

        has_reject = False
        while g.SERVER_SIDE:
            from ..characters.reimu import Reimu
            for p in g.players:
                if isinstance(p, Reimu):
                    has_reject = True
                    break
            if has_reject:
                break

            from .definition import RejectCard
            for c in flatten([[p.cards, p.showncards] for p in g.players]):
                if isinstance(c, RejectCard):
                    has_reject = True
                    break
            break

        has_reject = sync_primitive(has_reject, g.players)
        if not has_reject:
            return act

        self.target_act = act  # for ui

        pl = BatchList(p for p in g.players if not p.dead)
        p, rst = ask_for_action(self, pl, ['cards', 'showncards'], [])
        if not p:
            return act

        cards, _ = rst
        assert cards and self.cond(cards)
        g.process_action(LaunchReject(p, act, cards[0]))

    return act
def test(self, sess):
    if self.predictions is not None:
        if self.multiclass:
            return metrics.accuracy_score(np.argmax(self.loader.test_data[-1], axis=-1),
                                          np.argmax(self.predictions.round(), axis=-1))
        else:
            return metrics.accuracy_score(utils.flatten(self.loader.test_data[-1]),
                                          self.predictions.round())

    test_acc = 0
    num_test_ex = 0
    for i in range(self.num_test_batches):
        testbatch = self.loader.get_testbatch()
        cur_size = len(testbatch[1])
        if self.include_coverage and self.include_entropy:
            batch_acc = cur_size * self.eval_accuracy_on_batch(testbatch[0], testbatch[1],
                                                               testbatch[2], testbatch[3])
        elif self.include_coverage:
            batch_acc = cur_size * self.eval_accuracy_on_batch(testbatch[0], testbatch[1], testbatch[2])
        elif self.include_entropy:
            batch_acc = cur_size * self.eval_accuracy_on_batch(testbatch[0], testbatch[1], testbatch[2])
        else:
            batch_acc = cur_size * self.eval_accuracy_on_batch(testbatch[0], testbatch[1])
        test_acc += batch_acc
        num_test_ex += cur_size
    return test_acc / num_test_ex
def apply(self, X):
    if not self.trained:
        raise ReadoutException('readout is not trained!')
    X = numpy.array(X)
    if self.usefull_dims is not None:
        X = X[:, self.usefull_dims]
    if self.addBias:
        X = numpy.concatenate((X, numpy.ones((X.shape[0], 1))), 1)
    if self.addNegBias:
        X = numpy.concatenate((X, -1 * numpy.ones((X.shape[0], 1))), 1)

    if self.nClasses == 2:
        if self.addNegBias:
            maxI = (numpy.dot(X, self.W) >= 0).astype(numpy.int32)    # maxI \in {1,2}
        else:
            maxI = (numpy.dot(X, self.W) >= 0.5).astype(numpy.int32)  # maxI \in {1,2}
    else:
        S = numpy.zeros((X.shape[0], self.nClasses))
        for i in range(self.nClasses):
            S[:, i] = numpy.dot(self.W[i, :], X.T)
        maxV = S.max(1)
        maxI = S.argmax(1)

    if maxI.ndim < 1:
        maxI = numpy.asarray([maxI])
    Y = numpy.array(utils.flatten(numpy.asarray(self.uniqueY).take(maxI).tolist()))
    if self.swapLabels:
        Y = (numpy.array(Y * 2 - 1) * (-1) + 1) / 2
    return Y.tolist()
def apply(self, X):
    if not self.trained:
        raise ReadoutException('readout is not trained!')
    X = numpy.asarray(X)
    if self.addBias:
        X = numpy.concatenate((X, numpy.ones((X.shape[0], 1))), 1)

    if self.nClasses == 2:
        maxI = (numpy.dot(X, self.W) >= 0).astype(numpy.int32)  # maxI \in {1,2}
    else:
        S = numpy.zeros((X.shape[0], self.nClasses))
        for i in range(self.nClasses):
            S[:, i] = numpy.dot(X, self.W[:, i])
        maxV = S.max(1)
        maxI = S.argmax(1)

    if maxI.ndim < 1:
        maxI = numpy.asarray([maxI])
    return utils.flatten(numpy.asarray(self.uniqueY).take(maxI).tolist())
def predict(self, debate_data, proba=False, y=None):
    test_x = self.preprocess(debate_data, is_training=False)
    test_y = np.array([LABELS_ORDER[label] for label in flatten(y)])

    history = self.model.fit(
        x=self.train_x,
        y=self.train_y,
        class_weight=self.class_weights,
        validation_data=(test_x, test_y),
        batch_size=len(self.train_y),
        epochs=self.params['nn_params']['epochs'],
        shuffle=True,
        callbacks=[
            MetricsCallback(num_inputs=len(self.params['nn_params']['inputs']),
                            train_x=self.train_x,
                            train_y=self.train_y,
                            labels=LABELS),
            keras.callbacks.EarlyStopping(
                monitor=self.params['nn_params']['early_stopping']['monitor'],
                min_delta=self.params['nn_params']['early_stopping']['min_delta'],
                patience=self.params['nn_params']['early_stopping']['patience'],
                mode=self.params['nn_params']['early_stopping']['mode'])
        ])

    prediction_probs = list(self.model.predict(test_x))
    predictions_indices = [example_pred_probs.tolist().index(max(example_pred_probs))
                           for example_pred_probs in prediction_probs]
    predictions = [LABELS[prediction] for prediction in predictions_indices]
    return prediction_probs if proba else predictions