def plot(sdss_path, dflens_path, out_path):
    sdss_data = splitter.load(sdss_path)
    sdss_data, _ = splitter.split(sdss_data, sdss_data[0].shape[0], 0)
    _, sdss_y = sdss_data
    dflens_data = splitter.load(dflens_path)
    dflens_data, _ = splitter.split(dflens_data, dflens_data[0].shape[0], 0)
    _, dflens_y = dflens_data

    fig, (plot1, plot2) = plt.subplots(2, 1)
    data_min = min(np.min(sdss_y), np.min(dflens_y))
    data_max = 1
    space = np.linspace(data_min, data_max, 100)

    plot1.hist(sdss_y, bins=space)
    plot1.set_xlim((data_min, data_max))
    plot1.set_ylabel('SDSS Count')
    plot1.set_xticks([])

    plot2.hist(dflens_y, bins=space)
    plot2.set_xlim((data_min, data_max))
    plot2.set_ylabel('2dFLenS Count')
    plot2.set_xlabel(r'$z_\mathrm{spec}$')

    plt.subplots_adjust(hspace=0)
    plt.savefig(out_path)
def split_and_send(update, context):
    splitter.split(n=context.chat_data['n'], split_axis=context.chat_data['axis'])
    for filename in os.listdir("output"):
        context.bot.send_photo(chat_id=update.effective_chat.id,
                               photo=open(f'output/{filename}', 'rb'))
        os.remove(f'output/{filename}')
    os.remove('input/input_img.jpg')
    # Reset stored parameters
    context.chat_data['n'] = False
    context.chat_data['axis'] = False
    context.chat_data['img_received'] = False
def test_split_incorrectly_capitalized():
    split_result = split('StartEreignis', 'de_de')
    assert split_result == '', '%s != ""' % split_result

    # should split unicode strings
    split_result = split(u'Resultatveröffentlichung', 'de_de')
    assert split_result[0] == u'Resultat', '%s != "Resultat"' % split_result[0]
    assert split_result[1] == u'Veröffentlichung', '%s != "Veröffentlichung"' % split_result[1]
def separatewords(self, text):
    # print(f"{text}")
    try:
        return [s.lower() for s in splitter.split(text) if s != '']
    except Exception:
        print(f"separatewords{text}")
        return []
def show_by_ns(self):
    ns_count = {}
    for (term, use) in self.term_uses.keys():
        (ns, local) = splitter.split(term)
        if ns and local:
            incr(ns_count, ns)
    print ns_count
def process(source, scene_root, verbose=False, clean=False, list_file=None,
            overwrite=False):
    if pusher.check_existance(scene_root):
        print 'Scene %s already exists on destination bucket.' % scene_root
        if not overwrite:
            return collect_missing_entry(scene_root, verbose, clean, list_file)

    if verbose:
        print 'Processing scene: %s' % scene_root

    scene_dict = {}
    local_tarfile = puller.pull(source, scene_root, scene_dict, verbose=verbose)
    local_dir = splitter.split(scene_root, local_tarfile, verbose=verbose)
    scene_info.add_mtl_info(scene_dict, scene_root, local_dir)
    thumbnailer.thumbnail(scene_root, local_dir, verbose=verbose)
    scene_index_maker.make_index(scene_root, local_dir, verbose=verbose)
    pusher.push(scene_root, local_dir, scene_dict, verbose=verbose,
                overwrite=overwrite)

    if clean:
        os.unlink(local_tarfile)
        shutil.rmtree(local_dir)

    if list_file:
        scene_info.append_scene_line(list_file, scene_dict)

    return scene_dict
def db_finish(self):
    now = time.time()
    self.db.update('scan', where="id = " + str(self.id),
                   triples=self.c, time_complete=now, status=1)
    for (term, use) in self.term_uses.keys():
        (ns, local) = splitter.split(term)
        nsid = irimap.to_id(self.db, ns)
        self.db.insert('term_use',
                       local=local,
                       namespace_id=nsid,
                       scan_id=self.id,
                       type=use,
                       count=self.term_uses[(term, use)])
    for t in self.primary_trackers:
        self.db.insert('trackers',
                       scan_id=self.id,
                       tracker_id=irimap.to_id(self.db, t),
                       is_primary=True)
    for t in self.backup_trackers:
        self.db.insert('trackers',
                       scan_id=self.id,
                       tracker_id=irimap.to_id(self.db, t),
                       is_primary=False)
    obsolete_old_scans(self.db, self.data_source_iri, self.id)
def list_(term, timecode=(-1), db=None):  # , limit, offset):
    """
    see http://dev.mysql.com/doc/refman/5.0/en/select.html

    for us, timecodes ARE scanids. A new scanid == a new change.

    are we going to need to SORT when we do limit/offset? pagerank?
    """
    if db is None:
        db = dbconn.Connection()
    (ns, local) = splitter.split(term)
    ns_id = irimap.to_id(db, ns)
    if timecode == -1:
        timecode = latest_timecode(db)
        print "list using latest timecode:", timecode
    else:
        if timecode < min_timecode(db):
            raise GarbageTimecode()
    for r in db.query('select text, type from term_use, scan, iri where scan_id <= $timecode and obsoleted_by > $timecode and namespace_id=$ns_id and scan.id=scan_id and status=1 and local=$local and iri.id=source_id', vars=locals()):
        yield unicode(r.type) + " " + unicode(r.text)
def prove_terminates(filename):
    splitfile = tempfile.NamedTemporaryFile(mode='w', suffix='.c', delete=False)
    (id_map, has_nested, nondet) = splitter.split(filename, splitfile)
    nids = len(id_map)
    varnames = ' '.join(id_map[k] for k in xrange(nids))
    splitfile.close()

    if has_nested:
        skeleton = "../../tests/termination/nested.c"
        nprogs = 3
        nargs = nids * 2
        varnames += ' ' + ' '.join("%s\\'" % id_map[k] for k in xrange(nids))
    else:
        skeleton = "../../tests/termination/ranking.c"
        nprogs = 2
        nargs = nids

    cmd = (("kalashnikov.py " +
            "%s %s " +
            "-P%d " +
            "--seed=1337 -a%d --varnames %s " +
            "--synth-strategy=genetic " +
            "-c1 " +
            "--fastverif=True " +
            "-newsize=5 " +
            "-replaceprob=0.15 " +
            "-mutprob=0.1 " +
            "-tourneysize=5 " +
            "-popsize=3000 " +
            "-w4 " +
            "%s") %
           (splitfile.name, skeleton, nprogs, nargs, varnames,
            ' '.join(sys.argv[2:])))
    os.system(cmd)
def rewrite_token(t):
    """Rewrites every single token either as a boolean operator or in a
    format suitable for further processing."""
    d = {"AND": "&", "OR": "|", "NOT": "1 -", "(": "(", ")": ")"}
    if t in d:
        return d.get(t)
    if t[0] == "\"" and t[-1] == "\"":
        t = t.replace("\"", "")
        try:
            t_ngram = ' '.join([str(t.lower()) for t in splitter.split(t)])
            if t_ngram in terms:
                return 'sparse_td_matrix[t2i["{:s}"]].todense()'.format(t_ngram)
            else:
                if t_ngram == "":
                    unknownword_list.append(t)
                else:
                    unknownword_list.append(t_ngram)
                return 'np.matrix(np.zeros(len(documents), dtype=int))'
        except Exception:
            unknownword_list.append(t)
            return 'np.matrix(np.zeros(len(documents), dtype=int))'
    elif t.lower() in terms:
        return 'sparse_td_matrix[t2i["{:s}"]].todense()'.format(t.lower())
    else:
        if t[0] == "\"" or t[-1] == "\"":
            pass
        else:
            unknownword_list.append(t)
        return 'np.matrix(np.zeros(len(documents), dtype=int))'
def load_data(path='data/us_trial', max_example=None):
    """Load data from '{path}.{text, labels}'"""
    num_examples = 0
    tweets, emojis = [], []
    f_x, f_y = open(path + '.text', 'r'), open(path + '.labels', 'r')
    while True:
        tweet, emoji = f_x.readline(), f_y.readline()
        if not tweet or not emoji:
            break
        # TODO: extra preprocessing step for each tweet e.g. take care of slang
        tweet = tweet.strip().lower()  # delete whitespaces and lowercase
        # and more... -> more (... is a special unicode char)
        tweet = tweet[:-1] if tweet[-1] == u'\u2026' else tweet
        # artfactory -> art factory
        tweet = re.sub(r"#\S+",
                       lambda match: ' '.join(splitter.split(match.group()[1:])),
                       tweet)
        # no wayyyyy -> no way
        tweet = re.sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2", tweet,
                       (re.MULTILINE | re.DOTALL))
        # @ user -> user, #omg -> omg
        tweet = tweet.translate({ord(c): None for c in '@#'})
        words = TweetTokenizer().tokenize(tweet)
        tweet = ' '.join(words)
        tweets.append(tweet)
        emojis.append(int(emoji))  # convert '7' -> 7
        num_examples += 1
        if (max_example is not None) and (num_examples >= max_example):
            break
    print("%d examples loaded" % num_examples)
    return tweets, emojis
def grab_profiles(player_profile: Profile, stage: int, num_stages: int,
                  output_file_name: str, tempdir: str) -> int:
    """Parse output/result files from the previous stage and get the number of profiles to simulate."""
    stage_dir = get_subdir(stage, tempdir)
    os.makedirs(stage_dir, exist_ok=True)
    if stage == 1:
        num_generated_profiles = splitter.split(output_file_name, stage_dir,
                                                settings.splitting_size,
                                                player_profile.player_class)
    else:
        subdir_previous_stage = get_subdir(stage - 1, tempdir)
        try:
            checkResultFiles(subdir_previous_stage)
        except Exception as e:
            msg = ("Error while checking result files in {}: {}\n"
                   "Please restart AutoSimc at a previous stage.").format(subdir_previous_stage, e)
            raise RuntimeError(msg) from e
        if settings.default_grabbing_method == "target_error":
            filter_by = "target_error"
            filter_criterium = None
        elif settings.default_grabbing_method == "top_n":
            filter_by = "count"
            filter_criterium = settings.default_top_n[stage - num_stages - 1]
        is_last_stage = (stage == num_stages)
        num_generated_profiles = splitter.grab_best(filter_by, filter_criterium,
                                                    subdir_previous_stage, stage_dir,
                                                    output_file_name, not is_last_stage)
    if num_generated_profiles:
        logging.info("Found {} profile(s) to simulate.".format(num_generated_profiles))
    return num_generated_profiles
def prove_terminates(filename):
    splitfile = tempfile.NamedTemporaryFile(mode='w', suffix='.c', delete=False)
    (id_map, has_nested, nondet) = splitter.split(filename, splitfile)
    nids = len(id_map)
    varnames = ' '.join(id_map[k] for k in xrange(nids))
    splitfile.close()

    os.system(("kalashnikov.py " +
               "%s ../../tests/termination/unranking.c " +
               "-P2 " +
               "--synth-strategy=genetic " +
               "-c1 " +
               "--fastverif=True " +
               "-newsize=5 " +
               "-replaceprob=0.15 " +
               "-mutprob=0.1 " +
               "-tourneysize=5 " +
               "-popsize=3000 " +
               "-w4 " +
               "-a%d --evars %d --varnames %s --resnames I --seed=1337 " +
               "--nondet=%d " +
               "%s") %
              (splitfile.name, nids, nids, varnames, nondet,
               ' '.join(sys.argv[2:])))
def follow_up_char(w):
    '''
    Rule 9: defines the follow-up character of suffixes or vowels.
        а, у -> а
        э, ү, и -> э
        о -> о
        ө -> ө
    Exception: the following letter combinations change the follow-up character:
        уу, үү, юу, юү, яу, ёу, еү, иу
    The following letters may appear in the middle of a word but do not change
    the follow-up character:
        и, ий, ы, эй
    '''
    dd = {
        u'а': u'а',
        u'у': u'а',
        u'э': u'э',
        u'ү': u'э',
        u'и': u'э',
        u'о': u'о',
        u'ө': u'ө',
    }
    ee = [u'уу', u'үү', u'юу', u'юү', u'яу', u'ёу', u'еү', u'иу']
    ii = [u'и', u'ий', u'ы', u'эй']
    fu = None
    for u in split(w.lower()):
        found = False
        for e in ee:
            if e in u:
                fu = dd[e[-1]]
                found = True
        if not found and fu is None:
            for k in dd.keys():
                if k in u:
                    fu = dd[k]
    return fu
def getwords(doc):
    # Split words on non-alphabetic characters
    splitter = re.compile('\\W*')
    words = [s.lower() for s in splitter.split(doc)
             if len(s) > 2 and len(s) < 20]
    # Return a set containing only the unique words
    return dict([(w, 1) for w in words])
def reply(text=None):
    if text:
        resp = ''
        for part in splitter.split(text, MAX):
            resp += part
            check = sender.send(
                'POST', URL + TOKEN + '/' + 'sendMessage',
                {
                    'text': part.encode('utf-8'),
                    'chat_id': chat_id
                }
            )
            if check:
                logging.error(check.read())
            else:
                logging.info('Sent response: \n\n%s' % resp)
    else:
        sender.send(
            'POST', URL + TOKEN + '/' + 'sendMessage',
            {
                'text': 'Bot tries to send you an empty message.',
                'chat_id': chat_id
            }
        )
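# reply() above relies on a splitter.split(text, MAX) helper that breaks a long message into
# chunks of at most MAX characters before sending each one. The sketch below is a minimal,
# assumed implementation using plain character slicing; the real helper may instead break on
# whitespace so words are not cut in half.
def split(text, size):
    """Yield successive chunks of `text`, each at most `size` characters long."""
    for start in range(0, len(text), size):
        yield text[start:start + size]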
def grab_profiles_for_stage(player_profile, stage, outputfile, stages):
    """Parse output/result files from the previous stage and get the number of profiles to simulate."""
    subdir_previous_stage = get_subdir(stage - 1)
    if stage == 1:
        num_generated_profiles = splitter.split(outputfile, get_subdir(stage),
                                                settings.splitting_size,
                                                player_profile.wow_class)
    else:
        try:
            check_results_file(subdir_previous_stage)
        except Exception as ex:
            msg = f'Error while checking result files in {subdir_previous_stage}: {ex}. Please restart AutoSimc at a previous stage.'
            raise RuntimeError(msg) from ex
        if settings.default_grabbing_method == 'target_error':
            filter_by = 'target_error'
            filter_criterium = None
        elif settings.default_grabbing_method == 'top_n':
            filter_by = 'count'
            filter_criterium = settings.default_top_n[stage - stages - 1]
        is_last_stage = (stage == stages)
        num_generated_profiles = splitter.grab_best(filter_by, filter_criterium,
                                                    subdir_previous_stage,
                                                    get_subdir(stage), outputfile,
                                                    not is_last_stage)
    if num_generated_profiles:
        logger.info(f'Found {num_generated_profiles} profile(s) to simulate.')
    return num_generated_profiles
def test_split_german_three_compounds():
    split_result = split('Effektivitätsberechnungsformular', 'de_de')
    assert split_result[0] == 'Effektivität', '%s != "Effektivität"' % split_result[0]
    assert split_result[1] == 'Berechnung', '%s != "Berechnung"' % split_result[1]
    assert split_result[2] == 'Formular', '%s != "Formular"' % split_result[2]
def main(path_in):
    print('Loading data...')
    data = splitter.load(path_in)
    (train_X, train_y), (test_X, test_y) = splitter.split(data, TRAINING_NUM, TESTING_NUM)

    try:
        gp_sigmas = np.loadtxt('gp_preds.txt')
        assert gp_sigmas.shape == (TESTING_NUM,)
    except (FileNotFoundError, AssertionError):
        print('Fitting GP...')
        kernel = sklearn.gaussian_process.kernels.RBF(length_scale=LENGTH_SCALE)
        gp = sklearn.gaussian_process.GaussianProcessRegressor(
            kernel=kernel, alpha=ALPHA, copy_X_train=False)
        gp.fit(train_X, train_y)
        print('Predicting GP...')
        _, gp_sigmas = gp.predict(test_X, return_std=True)
        np.savetxt('gp_preds.txt', gp_sigmas)

    print('Approximating kernel...')
    appx_train_X, appx_test_X = approximate_kernel(train_X, test_X)
    print('Fitting approximate GP...')
    agp = appx_gp.AppxGaussianProcessRegressor(alpha=ALPHA)
    agp.fit(appx_train_X, train_y)
    print('Predicting approximate GP...')
    _, agp_sigmas = agp.predict(appx_test_X, return_std=True)

    print('Finding best fit...')
    best_fit = np.polyfit(gp_sigmas, agp_sigmas, 1)
    best_fit_box = (min(gp_sigmas), max(gp_sigmas), min(agp_sigmas), max(agp_sigmas))
    best_fit_endpoints = interval_in_box_from_line(best_fit_box, best_fit)
    best_fit_xs, best_fit_ys = zip(*best_fit_endpoints)

    print('Plotting...')
    f = plt.figure()
    ax = f.add_subplot(111)
    sc = plt.scatter(gp_sigmas, agp_sigmas, s=.2, c=list(test_y))
    plt.plot(best_fit_xs, best_fit_ys, color='red', label='Linear fit')
    plt.title(r'$\gamma = {:.4},$ #components$= {}$'.format(GAMMA, COMPONENTS))
    plt.xlabel('GP uncertainty')
    plt.ylabel('Approximate GP uncertainty')
    plt.text(.975, .1, '$y = {:.4}x {:+.4}$'.format(*best_fit),
             horizontalalignment='right', verticalalignment='bottom',
             transform=ax.transAxes)
    colorbar = plt.colorbar(sc)
    colorbar.set_label('Redshift')
    plt.legend(loc='lower right')
    plt.show()
def cronista_transcribe(audio_source_path: str, destination_folder: str,
                        block_of_transcription: int, lang: str,
                        on_transcription_progress=None, to_file: bool = True):
    audio_segments = split(audio_source_path, destination_folder,
                           block_of_transcription, to_file)
    transcribe(audio_segments, lang, on_transcription_progress)
def split_compound(self, sentance):
    """
    Used to split compound words that are found in the utterance.
    This will make it easier to confirm that all words are found in the search.
    """
    search_words = re.split(r'\W+', str(sentance))
    separator = " "
    words_list = splitter.split(separator.join(search_words))
    return words_list
def list_(term):
    print "Looking for trackers for term"
    (ns, local) = splitter.split(term)
    ns_trackers = determine_trackers(ns)
    for t in ns_trackers:
        try:
            print "Asking " + t
            return tracker_call(t, "list", term=term)
        except TrackerFailure:
            print t + ": failed, moving on..."
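# Several snippets above (show_by_ns, db_finish, list_) expect splitter.split(term) to return a
# (namespace, local) pair for an IRI. The sketch below assumes the common RDF convention of
# splitting at the last '#' or '/'; the actual splitter module may apply stricter IRI rules.
def split(term):
    """Split an IRI into (namespace, local_name) at the last '#' or '/'."""
    for sep in ('#', '/'):
        idx = term.rfind(sep)
        if idx != -1 and idx + 1 < len(term):
            # e.g. 'http://xmlns.com/foaf/0.1/Person' -> ('http://xmlns.com/foaf/0.1/', 'Person')
            return term[:idx + 1], term[idx + 1:]
    return None, term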
def predict_(self, sentence):
    # arg max over categories of log(P(category | sentence))
    best_suggested_category = None
    max_probability = -float('inf')
    words = splitter.split(sentence)
    for category in self.word_count.keys():
        probability = self.calc_score(words, category)
        if probability > max_probability:
            max_probability = probability
            best_suggested_category = category
    return best_suggested_category
def process_hashtags(original_text, mode=NORMAL_MODE):
    cleaned_text = original_text
    hashtags = set(re.findall(r"#(\w+)", original_text))
    cleaned_text = cleaned_text.replace('#', ' ')
    for hashtag in hashtags:
        if mode == AGGRESSIVE_MODE:
            cleaned_text = cleaned_text.replace(hashtag,
                                                ' '.join(splitter.split(hashtag)))
        else:
            cleaned_text = cleaned_text.replace(hashtag,
                                                ' '.join(segment(hashtag)))
    return cleaned_text
def main():
    path = '../dataset/powersupply.arff'
    #data, meta = arff.loadarff(path)
    #data, label, maxFeature = getArffData(path, 1000)
    #train, trainBeta, test = generateTrain(data, 100)
    train, train_beta, test = split('../dataset/powersupply.arff', 500)
    maxFeature = len(train[0])
    sampleSize = len(train)
    #print "sampleSize:", sampleSize
    gammab = computeKernelWidth(np.array(train))
    result = testEnsKmm(train, test, gammab, 10, maxFeature)
    beta = result[0]
    print "beta", beta
def findBest(tensor, threshold):
    t = np.copy(tensor)
    best = [1e100, None]
    for i in range(len(tensor.shape) - 1):
        for j in range(i + 1, len(t.shape)):
            u, s, v, vs = split(t, (i, j), threshold)
            if len(s) < best[0] and u.size + vs.size < t.size:
                best = [len(s), u, vs]
    best = best[1:]
    if best[-1] is None:
        return [t]
    elif len(best[-1].shape) > 3:
        best = best[:-1] + findBest(best[-1], threshold)
    return best
def read_text(self, stream):
    lines = stream.readlines()
    i = 1
    for line in lines:
        lower_line = line.lower()
        words = splitter.split(lower_line,
                               [' ', ',', '.', ':', ';', '\n', '\r', '\t'])
        for word in words:
            self.add_word(word, i)
        i += 1
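# read_text above passes an explicit list of delimiters to splitter.split. The sketch below is
# an assumed helper with that signature: it splits on any of the given delimiters and drops the
# empty tokens produced by consecutive delimiters; the real module's behaviour may differ.
import re

def split(text, delimiters):
    """Split `text` on any of the strings in `delimiters`, discarding empty tokens."""
    pattern = '|'.join(re.escape(d) for d in delimiters)
    return [token for token in re.split(pattern, text) if token]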
def handlespace(word):
    s, flag = audit(word)
    if flag:
        # print('1')
        index = word.find(s)
        # print(index)
        topass = word[:index] + ' ' + word[index:index + len(s)] + ' ' + word[index + len(s):]
    else:
        # print('2')
        topass = word
    print('penultimate...', topass)
    return ' '.join([x.strip() for x in splitter.split('fixed deposit ', 'en_gb')])
def strip_hashtags(text):
    text = preprocess_clean(text, False, True)
    hashtags = re.findall(r'#[\w\-]+', text)
    for tag in hashtags:
        cleantag = tag[1:]
        if d.check(cleantag) or dus.check(cleantag):
            text = re.sub(tag, cleantag, text)
        else:
            hashtagSplit = ""
            for word in splitter.split(cleantag.lower(), 'en_US'):
                hashtagSplit = hashtagSplit + word + " "
            text = re.sub(tag, hashtagSplit, text)
    # print(text)
    return text
def process(source, scene_root, verbose=False, clean=False, list_file=None,
            overwrite=False):
    if pusher.check_existance(scene_root):
        print 'Scene %s already exists on destination bucket.' % scene_root
        if not overwrite:
            return collect_missing_entry(scene_root, verbose, clean, list_file)

    if verbose:
        print 'Processing scene: %s' % scene_root

    scene_dict = {}
    local_tarfile = puller.pull(source, scene_root, scene_dict, verbose=verbose)

    try:
        local_dir = splitter.split(scene_root, local_tarfile, verbose=verbose)
    except:
        if source == 's3queue':
            # Remove problematic scenes from the queue directory
            puller_s3queue.clean_queued_tarfile(scene_root)
        return

    scene_info.add_mtl_info(scene_dict, scene_root, local_dir)
    thumbnailer.thumbnail(scene_root, local_dir, verbose=verbose)
    scene_index_maker.make_index(scene_root, local_dir, verbose=verbose)
    pusher.push(scene_root, local_dir, scene_dict, verbose=verbose,
                overwrite=overwrite)

    if clean:
        os.unlink(local_tarfile)
        shutil.rmtree(local_dir)

    if list_file:
        scene_info.append_scene_line(list_file, scene_dict)

    return scene_dict
def main(path_in, path_out):
    data = splitter.load(path_in)
    (train_X, train_y), (test_X, test_y) = splitter.split(
        data, TRAINING_SAMPLES_NUM, TESTING_SAMPLES_NUM)
    gp_x = list(range(STEP, MAX_GP + 1, STEP))
    gp_y = []
    for i, n in enumerate(gp_x):
        print('Starting GP', i + 1)
        gp_y.append(perform_gp(train_X[:n], train_y[:n], test_X))
    with open(path_out, 'w') as f:
        json.dump({
            'gp_x': gp_x,
            'gp_y': gp_y,
        }, f)
def get_hashtag_tokenize(hash_tag):
    hashtag = hash_tag
    hash_tag = hash_tag.replace('#', '')
    hash_tag = ' '.join(hash_tag.split())
    if len(hash_tag) > 0:
        hashtag = splitter.split(hashtag)
        hashtag = ' '.join(word for word in hashtag)
        if len(hashtag) > 1:
            wordlist = initialize_words()
            return parse_sentence(hashtag, wordlist)
        else:
            return hash_tag.lower()
        # wordlist = initialize_words()
        # return parse_sentence(hashtag, wordlist)
    else:
        return hash_tag
def get_colours(path):
    data = splitter.load(path)
    data, _ = splitter.split(data, data[0].shape[0], 0)
    X, y = data
    u, g, r, i, z = X.T
    u_g = u - g
    u_r = u - r
    r_i = r - i
    i_z = i - z
    X[:, 0] = r
    X[:, 1] = u_g
    X[:, 2] = u_r
    X[:, 3] = r_i
    X[:, 4] = i_z
    data = X, y
    return data
def main():
    if len(argv) < 2:
        print("Usage: py . <pdf>/validate")
        exit()

    source = argv[1]
    path = "./data/images/"
    validate = len(argv) >= 2 and argv[1] == "validate"
    images = [path + x for x in listdir(path)]
    img = cv.imread(images[0])

    if not validate:
        print("Processing the pdf (this might take a while)...")
        source = extract_text(source)
        source = re.sub("\n+", "\n", source).strip()
        source = randomize(source)
        source = split(img, source)
    else:
        print("Validating the dataset...")

    lines = source.split("\n")
    lines = list(filter(lambda line: len(str(line)) > 3, lines))

    # Lines per page
    lpp = 19
    total = ceil(len(lines) / lpp) if not validate else len(images)
    print("Pages will be generated: {}".format(total))

    for i, image in enumerate(images):
        img = cv.imread(image)
        text = "\n".join(lines[i * lpp:i * lpp + lpp])
        if not text and not validate:
            break
        img, points = detect(img, validate)
        img, _ = handwrite(img, text, points)
        img = apply_effects(img)
        img = np.int0(img * 255)
        cv.imwrite("./out/{}".format(basename(image)), img)
        print("Generated: {}/{} ".format(i + 1, total), end="\r")

    print("\nDone!")
def findBest(tensor, threshold):
    best = [1e100, None]
    trees = constructTrees(len(tensor.shape))
    print(len(trees))
    for i, t in enumerate(trees):
        if i % 50 == 0:
            print(i, len(trees))
        v = np.copy(tensor)
        s = treeToSplit(t)
        arrs = []
        for indices in s:
            u, s, v, vs = split(v, indices, threshold)
            arrs.append(u)
        if len(v.shape) == 3:
            arrs.append(vs)
        if sum([a.size for a in arrs]) < best[0]:
            best = [sum([a.size for a in arrs]), t, arrs]
    return best
def find_all(mstring):
    # group 3 captures the alphanumeric substring of the phone number
    m = re.match(r'^(1)?(\d{3})?([A-Z0-9]*)$', "".join(mstring))
    if m:
        # if an alphanumeric substring exists
        if m.group(3):
            sublist = splitter.split(m.group(3))
            # if an English word combination exists
            if sublist:
                mlist = list(m.groups())
                # concatenate the separated English words
                mlist = mlist[:-1] + list(sublist)
                # remove any None or empty elements from the regex groups
                mlist = [x for x in mlist if x not in [None, '']]
                # add dashes between elements
                newmlist = ['-'] * (len(mlist) * 2 - 1)
                newmlist[0::2] = mlist
                print("".join(newmlist))
                return "".join(newmlist)
def maintest():
    path = '../dataset/powersupply.arff'
    #data, meta = arff.loadarff(path)
    #data, label, maxFeature = getArffData(path, 1000)
    train, train_beta, test = split('../dataset/powersupply.arff', 300)
    #train, trainBeta, test = generateTrain(data, 300)
    train_data = np.array(train)
    test_data = np.array(test)
    maxFeature = train_data.shape[1]
    #maxFeature = len(train[0])
    print "maxFeature:", maxFeature
    gammab = computeKernelWidth(train_data)
    print "gammab", gammab
    #print 'Converting train sparse to array ...'
    #Xtrain = convertSparseToList(train, maxFeature)
    #print 'Converting test sparse to array ...'
    #Xtest = convertSparseToList(test, maxFeature)
    beta, runtime = cenKmm(train_data, test_data, gammab, maxFeature)
    print beta
def main():
    start = time.time()
    classifier = Classifier()
    classifier.train()
    print "Training finished at %ds\n" % (time.time() - start)
    i = 1
    for fn in os.listdir('./data/matthew/'):
        with codecs.open('./data/matthew/out/%d.txt' % i, 'w+', encoding='utf-8') as fw:
            i += 1
            print './data/matthew/%s' % fn
            chars = split('./data/matthew/%s' % fn)
            for _, line in chars.iteritems():
                for char in line:
                    char = 255 - char
                    charout = classifier.classify(char)
                    fw.write(unicode(charout[0]))
                fw.write('\n')
        print "Finished Matthew %d at %ds\n" % (i, (time.time() - start))
def prove_terminates(filename):
    splitfile = tempfile.NamedTemporaryFile(mode='w', suffix='.c', delete=False)
    (id_map, has_nested, nondet) = splitter.split(filename, splitfile)
    nids = len(id_map)
    varnames = (' '.join(id_map[k] for k in xrange(nids)) +
                ' ' +
                ' '.join("0" for i in xrange(nids)))
    splitfile.close()

    cmd = (("kalashnikov.py " +
            "%s ../../tests/termination/combined.c " +
            "-P3 " +
            "--seed=1337 " +
            "--synth-strategy=genetic -a%d --evars %d --varnames %s --resnames I " +
            "--fastverif=True -c=1 -keepfrac=15 -mutprob=0.25 -newfrac=2 -popsize=500 " +
            "-recombprob=0.05 -tourneysize=10 -w=3 " +
            "%s") %
           (splitfile.name, nids * 2, nids, varnames, ' '.join(sys.argv[2:])))
    os.system(cmd)
def prove_terminates(filename):
    splitfile = tempfile.NamedTemporaryFile(mode='w', suffix='.c', delete=False)
    (id_map, has_nested, nondet) = splitter.split(filename, splitfile)
    nids = len(id_map)
    varnames = ' '.join(id_map[k] for k in xrange(nids))
    splitfile.close()

    if has_nested:
        skeleton = "../../tests/termination/nested.c"
        nprogs = 3
        nargs = nids * 2
        varnames += ' ' + ' '.join("%s\\'" % id_map[k] for k in xrange(nids))
    else:
        skeleton = "../../tests/termination/ranking.c"
        nprogs = 2
        nargs = nids

    cmd = (("kalashnikov.py " +
            "%s %s " +
            "-P%d " +
            "--seed=1337 -a%d --varnames %s " +
            "--synth-strategy=genetic " +
            "--fastverif=True " +
            "-c1 " +
            "-newsize=5 " +
            "-replaceprob=0.15 " +
            "-mutprob=0.1 " +
            "-tourneysize=5 " +
            "-popsize=3000 " +
            "-w4 " +
            "%s") %
           (splitfile.name, skeleton, nprogs, nargs, varnames,
            ' '.join(sys.argv[2:])))
    os.system(cmd)
def complexity(filename):
    splitfile = tempfile.NamedTemporaryFile(mode='w', suffix='.c', delete=False)
    (id_map, has_nested, nondet) = splitter.split(filename, splitfile)
    nids = len(id_map)
    varnames = ' '.join(id_map[k] for k in xrange(nids)) + ' bound'
    splitfile.close()

    cmd = (("kalashnikov.py " +
            "%s %s/tests/loops/complexity.c " +
            "-P2 " +
            "--seed=1337 " +
            #"--strategy=evolve " +
            "-a%d --evars 0 --varnames %s --nondet=%d -w=3 " +
            "%s") %
           (splitfile.name, kalashnikov_dir, nids + 1, varnames, nondet,
            ' '.join(sys.argv[2:])))
    os.system(cmd)
def testtypeconvert(self):
    r = splitter.split("GOOD 100 490.50", [str, int, float])
    self.assertEqual(r, ['GOOD', 100, 490.50])
def fit_(self, sentence, category):
    words = splitter.split(sentence)
    for word in words:
        self.count_word(word, category)
def testsimplestring(self):
    r = splitter.split("GOOD 100 490.50")
    self.assertEqual(r, ['GOOD', '100', '490.50'])
    amax = np.amax(A)
    amin = np.amin(A)
    # normalize
    J = amin * np.ones(A.shape)
    N = (A - J) / (amax - amin)
    return N, amax, amin

# load data
(A, L) = loader.loadTrainingSet_35()
print 'dataset 3-5 : Train : success'
print A.shape

# normalize
N, tmax, tmin = normalize(A)

# split data
(T, T_l, V, V_l) = splitter.split(N, L)
print '3-5 : Training.training : ', T.shape, ' l : ', T_l.shape, ' ; Training.validation : ', V.shape, ' l : ', ' max :', tmax, ' min', tmin

# save to disk
np.save('../mnist/n_MNIST_Training35', T)
np.save('../mnist/n_MNIST_Validation35', V)
np.save('../mnist/n_MNIST_Training_labels35', T_l)
np.save('../mnist/n_MNIST_Validation_labels35', V_l)

# load test data
(A, L) = loader.loadTestSet_35()
print 'dataset 3-5 : Test : success'
print A.shape

# normalize with the same min/max as the training set
N, _, __ = normalize(A, tmax, tmin)

# split data
print '3-5 : Test set : ', N.shape, ' l : ', L.shape, ' max :', tmax, ' min', tmin
def remove_old_tiles():
    path = 'tiles'
    for file in os.listdir(path):
        if os.path.isfile(os.path.join(path, file)):
            try:
                os.remove(os.path.join(path, file))
            except:
                print('Could not delete', file, 'in', path)

if args.no_split:
    if not os.path.exists(WORK_DIR + "tiles/" + buildmap + "_split.ready"):
        print()
        printwarning("can't find tiles/" + buildmap + "_split.ready")
        print("--no_split/-ns makes no sense, ignoring it")
        splitter.split()
else:
    remove_old_tiles()
    os.chdir(WORK_DIR)
    print()
    printinfo("now splitting the mapdata...")
    splitter.split()

"""
--stop_after splitter
"""

if args.stop_after == "splitter":
    print()
def getwords(doc):
    words = [s.lower() for s in splitter.split(doc)
             if len(s) > 2 and len(s) < 20]
    # Return a set of unique words
    return dict([(w, 1) for w in words])
def testsimplestring(self):
    r = splitter.split('GOOG 100 490.50')
    self.assertEqual(r, ['GOOG', '100', '490.50'])
def splitargs(args):
    """Split the command line into tokens."""
    return closequote(split(args))
def testtypeconvert(self):
    r = splitter.split('GOOG 100 490.50', [str, int, float])
    self.assertEqual(r, ['GOOG', 100, 490.5])
def testdelimiter(self):
    r = splitter.split('GOOG,100,490.50', delimiter=',')
    self.assertEqual(r, ['GOOG', '100', '490.50'])
def testdelimeter(self):
    r = splitter.split("GOOD,100,490.50", delimeter=",")
    self.assertEqual(r, ['GOOD', '100', '490.50'])
def testTypeConvert(self):
    r = splitter.split('Goog 100 490.50', [str, int, float])
    self.assertEqual(r, ['Goog', 100, 490.50])
def testSimpleString(self):
    r = splitter.split('Goog 100 490.50')
    self.assertEqual(r, ['Goog', '100', '490.50'])
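# The unit tests above (testsimplestring, testtypeconvert, testdelimiter, ...) pin down a small
# splitter.split(line, types=None, delimiter=None) API: whitespace splitting by default, an
# optional delimiter, and optional per-field type conversion. The sketch below is a minimal
# implementation consistent with those tests, offered as an assumption about the module under
# test rather than its actual source.
def split(line, types=None, delimiter=None):
    """Split `line` on `delimiter` (whitespace by default) and optionally
    convert each field with the corresponding callable in `types`."""
    fields = line.split(delimiter)
    if types:
        fields = [ty(val) for ty, val in zip(types, fields)]
    return fields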