def plot_overlap_water(res, start_days, end_days, figure_path):
    ax_args_copy = overlap_ax_args.copy()
    res = copy.copy(res)
    mice = np.unique(res['mouse'])
    res = filter.filter_days_per_mouse(res, days_per_mouse=end_days)
    add_naive_learned(res, start_days, end_days)
    ax_args_copy.update({'xlim': [-1, 2]})

    y_keys = ['US/CS+', 'CS+/US']
    summary_res = defaultdict(list)
    for arg in y_keys:
        _get_overlap_water(res, arg=arg)
        new_res = reduce.new_filter_reduce(res, filter_keys=['mouse', 'day', 'odor_valence'],
                                           reduce_key='Overlap')
        new_res['Type'] = np.array([arg] * len(new_res['training_day']))
        reduce.chain_defaultdicts(summary_res, new_res)
    summary_res.pop('Overlap_sem')
    summary_res.pop('Overlap_std')
    summary_res = filter.filter(summary_res, {'odor_valence': 'CS+'})
    mean_std_res = reduce.new_filter_reduce(summary_res, filter_keys='Type', reduce_key='Overlap')

    types = np.unique(summary_res['Type'])
    scatter_args_copy = scatter_args.copy()
    scatter_args_copy.update({'s': 2, 'alpha': .6})
    for i, type in enumerate(types):
        reuse_arg = True
        if i == 0:
            reuse_arg = False
        temp = filter.filter(summary_res, {'Type': type})
        plot.plot_results(temp, x_key='Type', y_key='Overlap', loop_keys='mouse',
                          colors=['Black'] * len(mice), plot_function=plt.scatter,
                          path=figure_path, plot_args=scatter_args_copy, ax_args=ax_args_copy,
                          save=False, reuse=reuse_arg, fig_size=(1.5, 1.5),
                          rect=(.25, .25, .6, .6), legend=False)
    plot.plot_results(mean_std_res, x_key='Type', y_key='Overlap', error_key='Overlap_sem',
                      path=figure_path, plot_function=plt.errorbar, plot_args=error_args,
                      ax_args=ax_args, save=True, reuse=True, fig_size=(1.5, 1.5), legend=False)
    print(mean_std_res['Overlap'])
def process():
    conn_old = mysql.connect('bsppr', '192.168.241.7')
    mysql.insert(conn_old, 'set names utf8')
    conn_new = mysql.connect('bsppr', '192.168.241.32')
    mysql.insert(conn_new, 'set names utf8')
    cinfos_old = get_cinfos_moa(conn_old)
    cinfos_new = get_cinfos_moa(conn_new)
    mongo_conn = get_mongo_conn()
    tablename = 'weixin'
    while True:
        tmpdatas = mongo.find(mongo_conn, tablename, {}, 1000)
        rawdatas = []
        for raw in tmpdatas:
            url = raw['url']
            mongo.delete(mongo_conn, tablename, {'url': url})
            date = raw['pubtime']
            now = datetime.datetime.now()
            diff = now - date
            if diff.days > 2:
                continue
            rawdatas.append(raw)
        if len(rawdatas) == 0:
            print 'wait datas...'
            time.sleep(300)
        raw_old_qualified = filter(cinfos_old, rawdatas)
        if raw_old_qualified:
            old_insert_num = feed_xpost.feed_data_to_xpost(conn_old, raw_old_qualified, 'old')
            print 'old_insert_num: ', old_insert_num
        raw_new_qualified = filter(cinfos_new, rawdatas)
        if raw_new_qualified:
            new_insert_num = feed_xpost.feed_data_to_xpost(conn_new, raw_new_qualified, 'new')
            print 'new_insert_num: ', new_insert_num
def process():
    conn_old = mysql.connect('bsppr', '192.168.241.7')
    mysql.insert(conn_old, 'set names utf8')
    conn_new = mysql.connect('bsppr', '192.168.241.32')
    mysql.insert(conn_new, 'set names utf8')
    cinfos_old = get_cinfos(conn_old)
    #print cinfos_old
    cinfos_new = get_cinfos(conn_new)
    mongo_conn = get_mongo_conn()
    tablename = 'weixin'
    tmpdatas = mongo.find(mongo_conn, tablename, {}, 50)
    rawdatas = []
    for raw in tmpdatas:
        date = raw['pubtime']
        now = datetime.datetime.now()
        diff = now - date
        print diff.days
        rawdatas.append(raw)
    if len(rawdatas) == 0:
        time.sleep(10)
    raw_old_qualified = filter(cinfos_old, rawdatas)
    old_insert_num = feed_xpost.feed_data_to_xpost(conn_old, raw_old_qualified, 'old')
    raw_new_qualified = filter(cinfos_new, rawdatas)
    new_insert_num = feed_xpost.feed_data_to_xpost(conn_new, raw_new_qualified, 'new')
def _get_overlap_water(res, arg):
    def _helper(list_of_name_ix_tuple, desired_tuple):
        for tuple in list_of_name_ix_tuple:
            if tuple[0] == desired_tuple:
                ix = tuple[1]
                assert len(ix) == 1, 'more than 1 unique entry'
                return ix[0]

    res['Overlap'] = np.zeros(res['day'].shape)
    names, ixs = filter.retrieve_unique_entries(res, ['mouse', 'day', 'odor_standard'])
    list_of_name_ix_tuples = list(zip(names, ixs))
    mice = np.unique(res['mouse'])
    for mouse in mice:
        mouse_res = filter.filter(res, filter_dict={'mouse': mouse})
        days = np.unique(mouse_res['day'])
        for day in days:
            mouse_day_res = filter.filter(mouse_res, filter_dict={'day': day})
            odors = np.unique(mouse_day_res['odor_standard'])
            if 'US' in odors:
                us_ix = _helper(list_of_name_ix_tuples, (mouse, day, 'US'))
                us_cells = np.where(res['sig'][us_ix])[0]
                for odor in odors:
                    odor_ix = _helper(list_of_name_ix_tuples, (mouse, day, odor))
                    odor_cells = np.where(res['sig'][odor_ix])[0]
                    if arg == 'US/CS+':
                        overlap = _overlap(us_cells, odor_cells, arg='over')
                    elif arg == 'CS+/US':
                        overlap = _overlap(odor_cells, us_cells, arg='over')
                    else:
                        raise ValueError('overlap arg not recognized')
                    res['Overlap'][odor_ix] = overlap
def write_data(path):
    import filter
    from pyspark.mllib.feature import Word2Vec, Word2VecModel

    # load data
    loc = '/user/rmusters/text/2015/01/*'
    text_file = sc.textFile(loc)
    data = text_file.map(lambda line: filter.filter(line).split(" "))

    # load model
    word2vec = Word2Vec()
    model = Word2VecModel.load(sc, '/user/rmusters/2015model99')

    # get a tweet vector pair.
    from pyspark.sql import SQLContext
    sqlContext = SQLContext(sc)
    lookup = sqlContext.read.parquet('/user/rmusters/2015model99/data').alias("lookup")
    lookup_bd = sc.broadcast(lookup.rdd.collectAsMap())
    vectors = data.map(lambda ws: [lookup_bd.value.get(w) for w in ws])
    logger.info(vectors.count())

    data = text_file.map(lambda line: (line, filter.filter(line).split(" ")))\
        .map(lambda (text, filtered): (text, filtered, [lookup_bd.value.get(w) for w in filtered][0]))

    from pyspark.sql.functions import monotonicallyIncreasingId
    df = data.toDF(["text", "filtered_text", "vectors"])
    # This will return a new DF with all the columns + id
    res = df.withColumn("id", monotonicallyIncreasingId())
    res.write.parquet(path, mode="overwrite")
def graph(infile, graph_type, router_filtering, strength_filtering,
          missing_strength_filtering, end_time, coincidence, binsize, name, use_labels):
    jason = json.load(infile)
    infile.close()
    jason["packets"] = pd.DataFrame(jason["packets"])
    jason["packets"]["time"] /= 1000000
    filter(jason, router_filtering, strength_filtering, missing_strength_filtering, end_time)
    jason["last"] = jason["packets"]["time"].iget(-1)
    if(graph_type == "unique"):
        plotUnique(jason, binsize=binsize, labels=use_labels)
    elif(graph_type == "packets"):
        plotPackets(jason, binsize=binsize, labels=use_labels)
    elif(graph_type == "grid"):
        plotGrid(jason, coincidence=coincidence, name=name, labels=use_labels)
    elif(graph_type == "packethist"):
        plotPacketHistogram(jason)
    elif(graph_type == "strhist"):
        plotStrengthHistogram(jason)
    elif(graph_type == "segments"):
        plotSegments(jason, name=name, labels=use_labels)
    elif(graph_type == "vectors"):
        plotVectors(jason, name=name, labels=use_labels)
def pack(packname, filtexpr, setexpr, packall):
    # tarfile 'filter' requires v2.7
    if sys.version_info < (2, 7):
        raise Exception('Python 2.7 or later required..')

    # Get the root directory of cblib
    scriptdir = os.path.split(inspect.getfile(inspect.currentframe()))[0]
    rootdir = os.path.join(scriptdir, '..', '..')
    if not packall and setexpr != None:
        if os.path.isfile(setexpr):
            rootdir = os.path.dirname(setexpr)
        else:
            rootdir = setexpr

    # Find all instances
    files = list()
    cbfset = CBFset()
    cbfset.read(setexpr)
    filter(filtexpr, None, cbfset, lambda x: files.append(x))

    if packall:
        # Find all instance information
        files = files + glob.glob(os.path.join(rootdir, 'instances', '*.csv'))
        files = files + glob.glob(os.path.join(rootdir, 'instances', '*.bib'))

        # Find all source files from 'tools'
        files = files + glob.glob(os.path.join(rootdir, 'tools', '*.c'))
        files = files + glob.glob(os.path.join(rootdir, 'tools', '*.h'))
        files = files + glob.glob(os.path.join(rootdir, 'tools', 'Makefile.*'))

        # Find all documents from 'docs'
        files = files + glob.glob(os.path.join(rootdir, 'docs', '*.pdf'))

        # Find all python files from 'scripts'
        files = files + glob.glob(os.path.join(rootdir, 'scripts', '*.py'))
        files = files + glob.glob(os.path.join(rootdir, 'scripts', 'admin', '*.py'))
        files = files + glob.glob(os.path.join(rootdir, 'scripts', 'data', '*.py'))
        files = files + glob.glob(os.path.join(rootdir, 'scripts', 'dist', '*.py'))
        files = files + glob.glob(os.path.join(rootdir, 'scripts', 'filters', '*.py'))
        files = files + glob.glob(os.path.join(rootdir, 'scripts', 'solvers', '*.py'))

        # Find all other important files
        files.append(os.path.join(rootdir, 'README'))
        files.append(os.path.join(rootdir, 'instances', 'cbf', 'README'))

    # Create compressed tar file
    print('Writing ' + packname + '.tar.gz')
    tar = tarfile.open(os.path.join(scriptdir, packname + '.tar.gz'), 'w:gz')
    for f in files:
        extractname = os.path.join(packname, os.path.relpath(f, rootdir))
        print(extractname)
        tar.add(f, arcname=extractname, filter=addwritepermission)
    tar.close()
def tearDownClass(self):
    # Must be decorated with @classmethod; runs once after all tests have finished
    test_content = {}
    test_content['message'] = '.drop 天子'
    test_content['sender'] = sender
    filter(test_content)
    test_content['message'] = ''
    print('测试结束')
def _hist(res, save_path):
    ## distribution
    def histogram(real, label, bin, range, ax):
        density, bins = np.histogram(real, bins=bin, density=True, range=range)
        unity_density = density / density.sum()
        widths = bins[:-1] - bins[1:]
        ax.bar(bins[1:], unity_density, width=widths, alpha=.5, label=label)

    for mouse in np.unique(res['mouse']):
        pt_csp = filter.filter(res, {'mouse': mouse, 'odor_valence': 'PT CS+'})
        csp = filter.filter(res, {'mouse': mouse})
        csm = filter.filter(res, {'mouse': mouse})
        data = pt_csp['velocity']
        start = pt_csp['on'][0]
        end = pt_csp['end'][0]
        data_before = data[:, :start].flatten()
        data_during = data[:, start:end].flatten()
        data_after = data[:, end:].flatten()

        bins = 50
        range = [-70, 70]
        fig = plt.figure(figsize=(2, 1.5))
        ax = fig.add_axes([0.2, 0.2, 0.7, 0.7])
        histogram(data_before, 'before', bin=bins, range=range, ax=ax)
        histogram(data_during, 'during', bin=bins, range=range, ax=ax)
        plt.xlim([range[0] - 0.5, range[1] + .5])
        ax.spines["right"].set_visible(False)
        ax.spines["top"].set_visible(False)
        ax.xaxis.set_ticks_position('bottom')
        ax.yaxis.set_ticks_position('left')
        rs = ranksums(data_before, data_during)[-1]
        xlim = plt.xlim()
        ylim = plt.ylim()
        x = xlim[0] + .7 * (xlim[1] - xlim[0])
        y = ylim[0] + .7 * (ylim[1] - ylim[0])
        plot.significance_str(x, y, rs)
        name = 'before_during_mouse_{}'.format(mouse)
        plot._easy_save(save_path, name=name)

        fig = plt.figure(figsize=(2, 1.5))
        ax = fig.add_axes([0.2, 0.2, 0.7, 0.7])
        histogram(data_during, 'before', bin=bins, range=range, ax=ax)
        histogram(data_after, 'during', bin=bins, range=range, ax=ax)
        plt.xlim([range[0] - 0.5, range[1] + .5])
        ax.spines["right"].set_visible(False)
        ax.spines["top"].set_visible(False)
        ax.xaxis.set_ticks_position('bottom')
        ax.yaxis.set_ticks_position('left')
        rs = ranksums(data_during, data_after)[-1]
        xlim = plt.xlim()
        ylim = plt.ylim()
        x = xlim[0] + .7 * (xlim[1] - xlim[0])
        y = ylim[0] + .7 * (ylim[1] - ylim[0])
        plot.significance_str(x, y, rs)
        name = 'during_after_mouse_{}'.format(mouse)
        plot._easy_save(save_path, name=name)
def process_commits(features):
    commits_dict = gitable.dumpCommits()
    weeks = dict()
    all = dict()
    for author, commits in commits_dict.iteritems():
        all[author] = len(commits[1:])
        for week in commits[0].keys():
            if not weeks.get(week):
                weeks[week] = dict()
            week_count = weeks.get(week)
            num = commits[0].get(week)
            week_count[author] = num
            weeks[week] = week_count

    weekly = weeks.keys()
    weekly.sort()
    weekly_count = dict()
    sorted_week_count = []
    for week in weekly:
        commit_week = filter.filter(weeks.get(week))
        large = commit_week.large(0.25, percent=True)
        small = commit_week.small(0.1, percent=True)
        weekly_count[week] = commit_week.sum()
        sorted_week_count.append(commit_week.sum())
        print('%s, %d' % (week, commit_week.sum()))
        #logger.info(large)
        #logger.info(small)

    # Uneven work of weeks
    draw_bar(sorted_week_count, "commits per week", "commits",
             range(len(sorted_week_count)), "week", 0.35)
    week_filter = filter.filter(weekly_count)
    features['commits_week'] = week_filter
    small_weeks = week_filter.small()
    if len(small_weeks) > 0:
        features['low commits during the gap time'] = small_weeks
        logger.info(small_weeks)
    large_weeks = week_filter.large()
    if len(large_weeks) > 0:
        features['extra large work during the week'] = large_weeks
        #logger.info(large_weeks)

    # Uneven contribution of workers
    draw_bar(all.values(), "commits number posted by person", "issues",
             range(len(all.keys())), "person", 0.35)
    contribution_filter = filter.filter(all)
    features['commits_person'] = contribution_filter
    leader = contribution_filter.large(delta=1)
    if len(leader) > 0:
        features['large commits by single user'] = leader
        #logger.info("Project has leader %s" % (leader))
    passenger = contribution_filter.small(delta=1)
    if len(passenger) > 0:
        features['small commits by single user'] = passenger
def get_revenue():
    # Load the revenue data
    revenues = daten.daten_laden("umsatz.json")
    # Create the list holding the filtered revenues
    revenues_filtered = revenues
    # Select "Alle" in the filter dropdowns by default
    selected_jahr = selected_kunde = selected_lieferant = "Alle"

    # If a filter was submitted...
    if request.method == 'POST':
        # drop all revenues that do not match the requested filter
        revenues_filtered = filter.filter(revenues_filtered, 'jahr', request.form['jahr'])
        revenues_filtered = filter.filter(revenues_filtered, 'lieferant', request.form['lieferant'])
        revenues_filtered = filter.filter(revenues_filtered, 'kunde', request.form['kunde'])
        # select the matching dropdown item for each filter
        selected_jahr = request.form['jahr']
        selected_kunde = request.form['kunde']
        selected_lieferant = request.form['lieferant']

    # Build the lists for the dropdown filters
    filter_list_jahr = filter.getFilterList(revenues, 'jahr', selected_jahr)
    filter_list_lieferant = filter.getFilterList(revenues, 'lieferant', selected_lieferant)
    filter_list_kunde = filter.getFilterList(revenues, 'kunde', selected_kunde)

    # List for the filtered revenue amounts
    sumlist = []
    # Add the filtered revenues to sumlist
    for k, v in revenues_filtered.items():
        sumlist.append(v['umsatz'])
    # Sum of the filtered revenues
    summe_umsatz = sum(sumlist)

    # List for the filtered years
    yearlist = []
    # Add the filtered years to yearlist
    for k, v in revenues_filtered.items():
        yearlist.append(v['jahr'])

    # Visualization of the revenues (source: https://plotly.com/python/bar-charts/)
    if sumlist:
        fig = px.bar(x=yearlist, y=sumlist, title="Grafische Abbildung der Umsätze",
                     labels=dict(x="Jahre", y="Umsätze in CHF"), barmode='group')
    else:
        fig = px.bar(x=None, y=None, title="Grafische Abbildung der Umsätze",
                     labels=dict(x="Jahre", y="Umsätze in CHF"))
    div = plotly.io.to_html(fig, include_plotlyjs=True, full_html=False)

    # Render the HTML output page
    return render_template('datenausgabe.html', revenues=revenues_filtered,
                           filter_list_jahr=filter_list_jahr,
                           filter_list_lieferant=filter_list_lieferant,
                           filter_list_kunde=filter_list_kunde,
                           summe_umsatz=summe_umsatz, viz_div=div)
def check(filename, viscosity, fillvol):
    ########### Check the calibration using the water run
    # Reads the data
    # Geometry of the couette cell
    roo = 0.044151 / 2.0  # outer cell outer radius in m
    ro = 0.039111 / 2.0   # outer cell radius in m
    ri = 0.01525          # inner cell radius in m
    icxsa = np.pi * (ri**2)
    ocxsa = np.pi * (ro**2)
    dxsa = ocxsa - icxsa  # vol per height in m3/m
    dxsa = dxsa * 1000    # l / m
    dxsa = dxsa * 1000    # ml / m
    fill_height = fillvol / dxsa

    datf = pd.read_csv(filename)
    stw = datf['t']
    stw = stw - stw[0]
    dr = datf['dr']
    cr = datf['cr']
    cr2a = datf['cr2a']
    cr2b = datf['cr2b']
    pv = datf['pv']

    # Filter noise from data
    dr = filter(stw, dr, method="butter", A=2, B=0.001)
    cr = filter(stw, cr, method="butter", A=2, B=0.001)
    cr2a = filter(stw, cr2a, method="butter", A=2, B=0.001)
    cr2b = filter(stw, cr2b, method="butter", A=2, B=0.001)

    # Calculate viscosity
    musw = [viscosity] * len(cr)
    cu = 16.573 * cr - 29.778
    cu2a = 11.307 * cr2a - 29.066
    cu2b = 11.307 * cr2b - 29.066
    cu = np.array((cu + cu2a + cu2b) / 3)
    cub = 0.00229473 * pv + 0.48960784
    sp_rpms = dr * 316.451 - 163.091
    sp_rads = (sp_rpms * 2 * np.pi) / 60
    sn_rpms = 5.13 * pv + 15.275
    vo = 0.0636 * pv + 2.423

    # T calibration
    gam_dotw = (sp_rads * ri) / (ro - ri)
    #Tw_fc = eff[0] * (cu - cub) + eff[1]
    Tw_fc = eff[0] * ((cu - cub) * vo) + eff[1]
    tauw_fc = Tw_fc / (2 * np.pi * ri * ri * fill_height)
    muw_fc = tauw_fc / gam_dotw
    return stw, muw_fc, musw
def get_compare_responsive_sig(res):
    key = 'ssig'

    def _helper(res):
        assert res['odor_valence'][0] == 'CS+', 'wrong odor'
        assert res['odor_valence'][1] == 'CS-', 'wrong odor'
        on = res['DAQ_O_ON_F'][0]
        off = res['DAQ_W_ON_F'][0]
        sig_p = res[key][0]
        sig_m = res[key][1]
        dff_p = res['dff'][0]
        dff_m = res['dff'][1]
        sig_p_mask = sig_p == 1
        sig_m_mask = sig_m == 1
        dff_mask = dff_p - dff_m
        dff_mask = np.mean(dff_mask[:, on:off], axis=1)
        p = [a and b for a, b in zip(sig_p_mask, dff_mask > 0)]
        m = [a and b for a, b in zip(sig_m_mask, dff_mask < 0)]
        return np.array(p), np.array(m)

    mice = np.unique(res['mouse'])
    res = filter.filter(res, filter_dict={'odor_valence': ['CS+', 'CS-']})
    sig_res = reduce.new_filter_reduce(res, reduce_key=key,
                                       filter_keys=['mouse', 'day', 'odor_valence'])
    dff_res = reduce.new_filter_reduce(res, reduce_key='dff',
                                       filter_keys=['mouse', 'day', 'odor_valence'])
    sig_res['dff'] = dff_res['dff']

    new_res = defaultdict(list)
    for mouse in mice:
        mouse_res = filter.filter(sig_res, filter_dict={'mouse': mouse})
        days = np.unique(mouse_res['day'])
        p_list = []
        m_list = []
        for i, day in enumerate(days):
            mouse_day_res = filter.filter(mouse_res, filter_dict={'day': day})
            p, m = _helper(mouse_day_res)
            new_res['mouse'].append(mouse)
            new_res['mouse'].append(mouse)
            new_res['day'].append(day)
            new_res['day'].append(day)
            new_res['odor_valence'].append('CS+')
            new_res['odor_valence'].append('CS-')
            new_res[key].append(p)
            new_res[key].append(m)
            new_res['Fraction'].append(np.mean(p))
            new_res['Fraction'].append(np.mean(m))
            p_list.append(p)
            m_list.append(m)

    for key, val in new_res.items():
        new_res[key] = np.array(val)
    return new_res
def estimate_bandpass(data):
    """ Estimate bandpass by rolling median over time

    data (np.ma.array): data array with axes (freq, time)
    window (int): size of moving window over which to compute
                  bandpass estimated by median.

    TODO: Fit a polynomial instead?
    """
    est = filter(data, params.st_bp_window_f, axis=0)
    est = filter(est, params.st_bp_window_t, axis=1)
    return est
def main():
    parser = optparse.OptionParser("%prog - I diff repo manifests")
    parser.add_option("--filter", dest="filters", action="append")
    parser.add_option("--diff", "-d", dest="diff", action="store_false", default=True)
    parser.add_option("--format", dest="out_format", default="report")
    parser.add_option("--output", dest="output", default=None)
    parser.add_option("--root", dest="root", default=os.getcwd())
    options, args = parser.parse_args()

    if not options.output:
        output = sys.stdout
    else:
        output = options.output
        if os.path.exists(output):
            print >> sys.stderr, "ERROR: Output file already exists"
            exit(1)

    if len(args) == 0:
        print "Choose a command: diff, cleanup, filter"
        exit(1)
    elif len(args) > 1:
        cmd_args = args[1:]
    else:
        cmd_args = None
    cmd = args[0]

    if cmd == 'diff':
        if len(cmd_args) != 2:
            print >> sys.stderr, "ERROR: must specify exactly two arguments (left and right)"
            exit(1)
        diff(cmd_args[0], cmd_args[1], output=output,
             output_format=options.out_format, filters=options.filters)
    elif cmd == 'freeze':
        freeze(cmd_args[0], output, options.root, gaia_branch='v1-train',
               gecko_branch='gecko-18', moz_remotes=['b2g'], moz_branch='v1-train')
    elif cmd == 'cleanup':
        if len(cmd_args) != 1:
            print >> sys.stderr, "ERROR: you can only filter one file at a time"
            exit(1)
        cleanup(cmd_args[0], output, options.filters)
    elif cmd == 'filter':
        if len(cmd_args) != 1:
            print >> sys.stderr, "ERROR: you can only filter one file at a time"
            exit(1)
        if options.filters == None:
            print >> sys.stderr, "ERROR: you must specify filters for the filter command"
            exit(1)
        filter(cmd_args[0], output, options.filters)
def get_qa(path):
    T = Tools()
    name_list = T.read_file_or_dir(path)
    for txt_file in name_list:
        f = filter(txt_file)
        zt_word, title_list, __1_dict = f.analayis()
        f.get_qa(T, zt_word, title_list, __1_dict)
def agglomerate_days(res, condition, first_day, last_day):
    mice = np.unique(res['mouse'])
    out = defaultdict(list)
    for i, mouse in enumerate(mice):
        if hasattr(condition, 'csp'):
            odors = condition.odors[mouse]
        else:
            odors = condition.dt_odors[mouse] + condition.pt_odors[mouse]

        for odor in odors:
            filter_dict = {
                'mouse': mouse,
                'day': np.arange(first_day[i], last_day[i] + 1),
                'odor': odor
            }
            filtered_res = filter.filter(res, filter_dict)

            keys = [
                'lick', 'lick_collection', 'lick_baseline', 'time_first_lick',
                'time_first_lick_collection', 'lick_5s', 'lick_com'
            ]
            temp_res = reduce_by_concat(filtered_res, 'lick', rank_keys=['day', 'ix'])
            for k in keys:
                _ = reduce_by_concat(filtered_res, k, rank_keys=['day', 'ix'])
                temp_res[k] = _[k]
            temp_res['day'] = np.array(sorted(filtered_res['day']))
            temp_res['trial'] = np.arange(len(temp_res['lick']))
            if len(temp_res['lick']):
                append_defaultdicts(out, temp_res)

    for key, val in out.items():
        out[key] = np.array(val)
    return out
def new_filter_reduce(res, filter_keys, reduce_key, regularize='min'):
    out = defaultdict(list)
    if isinstance(filter_keys, str):
        filter_keys = [filter_keys]
    unique_combinations, ixs = filter.retrieve_unique_entries(res, filter_keys)
    for v in unique_combinations:
        filter_dict = {filter_key: val for filter_key, val in zip(filter_keys, v)}
        cur_res = filter.filter(res, filter_dict)
        if len(cur_res[reduce_key]):
            try:
                if regularize == 'min':
                    _regularize_length(cur_res, reduce_key)
                elif regularize == 'max':
                    _regularize_length_cristian_data(cur_res, reduce_key)
                else:
                    raise ValueError('did not recognize regularize keyword')
            except:
                print('cannot regularize the length of {}'.format(reduce_key))
            temp_res = reduce_by_mean(cur_res, reduce_key)
            append_defaultdicts(out, temp_res)

    bad = []
    for key, val in out.items():
        try:
            out[key] = np.array(val)
        except:
            bad.append(key)
            print('{} could not be reduced'.format(key))
    for badkey in bad:
        out.pop(badkey)
    return out
def plot_compare_responsive(res, figure_path):
    ax_args_copy = ax_args.copy()
    ax_args_copy.update({
        'ylim': [0, .65],
        'yticks': [0, .2, .4, .6],
        'xticks': list(range(20))
    })
    res = copy.copy(res)
    res = filter.filter(res, {'odor_valence': ['CS+', 'CS-']})
    res_ = get_compare_responsive_sig(res)

    line_args_copy = line_args.copy()
    line_args_copy.update({
        'marker': '.',
        'linestyle': '--',
        'linewidth': .5,
        'alpha': .75
    })
    plot.plot_results(res_, x_key='day', y_key='Fraction',
                      loop_keys=['mouse', 'odor_valence'],
                      colors=['green', 'red'] * 10,
                      path=figure_path, plot_args=line_args_copy,
                      ax_args=ax_args_copy, fig_size=(2, 1.5), legend=False)
def test_gen(self):
    test_content['message'] = '.gen 桃毒'
    filter(test_content)
    test_content['message'] = '.guid'
    filter(test_content)
    test_content['message'] = '.choose 3'
    filter(test_content)
    test_content['message'] = '.guid'
    filter(test_content)
    test_content['message'] = '.drop 桃毒'
    filter(test_content)
    # test_content['message'] = ''
    # filter(test_content)
    # test_content['message'] = ''
    # filter(test_content)
    pass
def main():
    from pyspark import SparkContext, SparkConf
    import filter

    #spark-submit --py-files master/hadoop/stemmer.py,master/hadoop/filter.py --master yarn --executor-memory 12g --deploy-mode cluster --num-executors 400 master/hadoop/word_count.py
    loc = '/user/rmusters/text/2015/01/*'
    #spark-submit --py-files master/hadoop/stemmer.py,master/hadoop/filter.py --master yarn --executor-memory 32g --deploy-mode cluster --num-executors 1000 master/hadoop/word_count.py
    loc = '/user/rmusters/text/2015/*/*'

    conf = (SparkConf().set("spark.driver.maxResultSize", "0"))
    sc = SparkContext(appName='word_count_filtered', conf=conf)
    text_file = sc.textFile(loc)
    threshold = 10
    counts = text_file.map(lambda line: filter.filter(line)) \
        .flatMap(lambda line: line.split(" ")) \
        .map(lambda word: (word, 1)) \
        .reduceByKey(lambda a, b: a + b) \
        .filter(lambda pair: pair[1] >= threshold) \
        .sortBy(lambda x: x[1], ascending=True)
    counts.saveAsTextFile('/user/rmusters/counts_taggedUrl_Mention_Stopwords_Punctuation_ignoreNonAscii_StemmedThreshold10_haha_hashtag2015all')
    print counts.count()
def plot_max_dff_valence(res, start_days, end_days, figure_path):
    res = copy.copy(res)
    # list_of_days = list(zip(start_days, end_days))
    list_of_days = end_days
    start_end_day_res = filter.filter_days_per_mouse(res, days_per_mouse=list_of_days)
    start_end_day_res = filter.filter(start_end_day_res, {'odor_valence': ['CS+', 'CS-']})
    _max_dff(start_end_day_res)
    start_end_day_res = reduce.new_filter_reduce(start_end_day_res,
                                                 filter_keys=['odor_valence', 'mouse'],
                                                 reduce_key='max_dff')
    add_naive_learned(start_end_day_res, start_days, end_days)

    ax_args_copy = ax_args.copy()
    # ax_args_copy.update({'xticks':[res['DAQ_O_ON_F'][-1], res['DAQ_W_ON_F'][-1]], 'xticklabels':['ON', 'US'],
    #                      'ylim':[0, .2]})
    nMice = len(np.unique(res['mouse']))
    # colors = ['Green'] * nMice + ['Red'] * nMice
    # trace_args_copy = trace_args.copy()
    # trace_args_copy.update({'linestyle':'--','alpha':.5, 'linewidth':.75})
    plot.plot_results(start_end_day_res, loop_keys='mouse', x_key='odor_valence',
                      y_key='max_dff', path=figure_path, colors=['gray'] * 10,
                      legend=False, fig_size=(2, 1.5))
def plot(dataset_filename):
    filtered_tweets = filter.filter(dataset_filename)
    relevant_tweets = classify.classify(filtered_tweets)

    counts_per_time_unit = dict()
    for tweet in relevant_tweets:
        # get python date from the tweet time
        tweet_date = dateparser.parse(tweet.time)
        # get key to uniquely identify the date and hour
        key = tweet_date.strftime("%Y-%m-%d: %H")
        if key in counts_per_time_unit:
            counts_per_time_unit[key] += 1
        else:
            counts_per_time_unit[key] = 1

    dates = sorted(counts_per_time_unit.keys())
    if not os.path.exists('distributions'):
        os.makedirs('distributions')

    # get distribution filename from the data source's filename
    distribution_filename = dataset_filename.split('/')[len(dataset_filename.split('/')) - 1]
    # remove the previous file extension
    distribution_filename = distribution_filename.split('.')[0]

    f = open('distributions/' + distribution_filename + '.csv', 'w')
    for key in dates:
        f.write(str(key) + "," + str(counts_per_time_unit[key]) + "\n")
    print "Successfully generated file", 'distributions/' + distribution_filename + '.csv'
def filterff(path_file, filter_method="butter", a=1, b=1):
    # load up some noisy data
    logf = open(path_file, "r")
    dat = logf.readlines()
    logf.close()

    # sort the loaded data into lists
    t = [0] * 0   # x, time
    s = [0] * 0   # y, speed
    start = 0.0   # start time (since epoch)
    st = [0] * 0  # specific time (time since run begun, seconds)

    splt = dat[1].split(",", 5)
    t.append(float(splt[0]))
    s.append(float(splt[2]))
    st.append(0.0)
    for i in range(2, len(dat)):
        splt = dat[i].split(",", 5)
        t.append(float(splt[0]))
        s.append(float(splt[2]))
        st.append(t[i - 1] - t[0])

    # Apply filter
    c = filter(t, s, method=filter_method, A=a)
    return st, s, c
def _example_velocity(res, save_path):
    xkey = 'trial'
    ykey = 'velocity'
    line_args = {'alpha': .5, 'linewidth': .25, 'marker': 'o', 'markersize': 0}
    mouse = 0
    odor = 'PT CS+'
    temp = filter.filter(res, {'odor_valence': odor, 'mouse': mouse})
    start = temp['on'][0]
    off = temp['off'][0]
    end = temp['end'][0]
    ax_args = {
        'xticks': [start, off, end],
        'xticklabels': ['ON', 'OFF', 'US'],
        'ylim': [-5, 100]
    }
    for i, v in enumerate(temp[ykey]):
        v_ = savgol_filter(v, window_length=41, polyorder=0)
        temp[ykey][i] = v_
    plot.plot_results(temp, x_key=xkey, y_key=ykey, loop_keys=['day', 'ix'],
                      select_dict={'odor_valence': odor, 'mouse': mouse},
                      colors=['black'] * 200, plot_args=line_args,
                      ax_args=ax_args, legend=False, path=save_path)
def test_ul(self):
    test_content['message'] = '.gen 桃毒'
    filter(test_content)
    test_content['message'] = '.ul'
    filter(test_content)
    test_content['message'] = '.switch 天子'
    filter(test_content)
    test_content['message'] = '.drop 桃毒'
    filter(test_content)
def reftable(out, filtexpr, setexpr):
    # Find the directory of this script
    scriptdir = os.path.split(inspect.getfile(inspect.currentframe()))[0]
    rootdir = os.path.join(scriptdir, '..', '..')

    # Default value
    if setexpr == None:
        setexpr = os.path.realpath(os.path.abspath(os.path.join(rootdir, 'instances', 'cbf')))

    # Define files
    filemap = dict()
    cbfset = CBFset()
    cbfset.read(setexpr)
    filter.filter(filtexpr, None, cbfset,
                  lambda x: files_add(cbfset.getpack(x, cbfset.rootdir),
                                      cbfset.getinstance(x), filemap))

    # Define sorting
    convert = lambda text: int(text) if text.isdigit() else text
    alphanum_key = lambda key: [convert(c) for c in re.split('([0-9]+)', key)]

    out.opentable()
    csvpath = os.path.join(rootdir, 'instances', 'ref.csv')
    csvfile = open(csvpath, 'rt')
    try:
        csvdialect = csv.Sniffer().sniff(csvfile.read(), ';\t')
        csvfile.seek(0)
        csvreader = csv.reader(csvfile, csvdialect, quotechar='"')
        next(csvreader)
        for row in csvreader:
            if row[0] in filemap:
                mylst = list(set(row[1].split(', ')) & filemap[row[0]])
                if len(mylst) >= 1:
                    mylst.sort(key=alphanum_key)
                    out.addrow(row[0], mylst, row[2], row[3], row[4])
    except Exception as e:
        print(str(e))
    finally:
        csvfile.close()
    out.closetable()
def calc_T(filename, fill_volume=5, visc=0.001):
    datf = pd.read_csv(filename)

    # Cell geometry
    roo = 0.044151 / 2.0  # outer cell outer radius in m
    ro = 0.039111 / 2.0   # outer cell radius in m
    ri = 0.01525          # inner cell radius in m
    #L = 0.039753 - (roo - ro)  # height of couette cell
    icxsa = np.pi * (ri**2)
    ocxsa = np.pi * (ro**2)
    dxsa = ocxsa - icxsa  # vol per height in m3/m
    dxsa = dxsa * 1000    # l / m
    dxsa = dxsa * 1000    # ml / m
    fill_height = fill_volume / dxsa

    # Split up csv columns
    t = datf['t']
    st = t - t[0]
    dr = datf['dr']
    cr = datf['cr']
    pv = datf['pv']

    # Filtering: aye or naw?
    if True:
        dr = np.array(filter(st, dr, method="butter", A=2, B=0.001))
        cr = np.array(filter(st, cr, method="butter", A=2, B=0.001))
        cr = filter(st, cr, method="gaussian", A=100, B=100)
        cr = filter(st, cr, method="butter", A=2, B=0.0001)

    # Calculate torque
    mus = [visc] * len(cr)
    sp_rpms = dr * 316.451 - 163.091
    sp_rads = (sp_rpms * 2 * np.pi) / 60
    sn_rpms = 5.13 * pv + 15.275
    gam_dot = (sp_rads * ri) / (ro - ri)
    tau = mus * gam_dot
    T = tau * (2 * np.pi * ri * ri * fill_height)
    Ts = T / (1.0 - (sp_rpms / sn_rpms))
    # cu = (-956.06 * (cr ** 3)) + (6543.97 * (cr ** 2)) + (-14924.369 * cr) + 11341.612
    cu = (25.177 * cr) - 45.264
    vo = 0.0636 * pv + 2.423
    pe = cu * vo
    return st, mus, sp_rpms, sp_rads, gam_dot, tau, T, cu, vo, pe, Ts, sn_rpms, pv
def call_filter(filename):
    # data = filter.filter(sys.argv[1])
    data = filter.filter(filename)
    # Write to output.csv
    with open(os.path.join(os.path.expanduser('~'), 'Documents/Design-Project/analysis/OM',
                           os.path.splitext(filename)[0] + '.csv'), "w") as f:
        writer = csv.writer(f)
        writer.writerows(data)
def process_issues(features):
    issues = gitable.launchDump()
    weekly_issues = issues['week']
    del issues['week']

    author_issues = dict()
    events_issues = dict()
    comments_issues = dict()
    for issue, events in issues.iteritems():
        #print("ISSUE " + str(issue))
        dict_add(author_issues, events[0].user)
        dict_add(comments_issues, events[0].comments)
        dict_add(events_issues, len(events[1:]))

    draw_bar(author_issues.values(), "issues number posted by person", "issues",
             range(len(author_issues.keys())), "person", 0.35)
    author_filter = filter.filter(author_issues)
    large_author = author_filter.large()
    features['issues_person'] = author_filter
    if(len(large_author) > 0):
        features['large issues post by single user'] = large_author
    small_author = author_filter.small(delta=2)
    if(len(small_author) > 0):
        features['small issues post by single user'] = small_author

    draw_bar(comments_issues.values(), "issues number with same comments number", "issues",
             comments_issues.keys(), "comments number", 0.35)
    events_filter = filter.filter(events_issues)
    features['events_issues'] = events_filter
    if len(events_filter.large()) > 0:
        features['large issues with same events'] = events_filter.large()
    comments_filter = filter.filter(comments_issues)
    features['comments_issues'] = comments_filter
    large_comments = comments_filter.large()
    if(len(large_comments) > 0):
        features['large issues with same comments'] = large_comments

    draw_bar(events_issues.values(), "issues number with same events number", "issues",
             events_issues.keys(), "events number", 0.35)
    single_user = filter.filter(author_issues)
    single_user.large(5)
def _get_overlap_odor(res, delete_non_selective):
    def _subsets(S, m):
        return set(itertools.combinations(S, m))

    new = defaultdict(list)
    mice = np.unique(res['mouse'])
    for mouse in mice:
        mouse_res = filter.filter(res, filter_dict={'mouse': mouse})
        days = np.unique(mouse_res['day'])
        for day in days:
            mouse_day_res = filter.filter(mouse_res,
                                          filter_dict={'day': day, 'odor_valence': ['CS+', 'CS-']})
            odors, odor_ix = np.unique(mouse_day_res['odor_standard'], return_index=True)
            assert len(odor_ix) == 4, 'Number of odors does not equal 4'

            all_comparisons = _subsets(odor_ix, 2)
            for comparison in all_comparisons:
                mask1 = mouse_day_res['sig'][comparison[0]]
                mask2 = mouse_day_res['sig'][comparison[1]]
                if delete_non_selective:
                    non_selective_mask = _respond_to_all(mouse_day_res['sig'])
                    mask1 = np.all([mask1, np.invert(non_selective_mask)], axis=0).astype(int)
                    mask2 = np.all([mask2, np.invert(non_selective_mask)], axis=0).astype(int)
                overlap = _overlap(np.where(mask1)[0], np.where(mask2)[0])
                new['Overlap'].append(overlap)
                new['mouse'].append(mouse)
                new['day'].append(day)
                if comparison == (0, 1):
                    new['condition'].append('+:+')
                elif comparison == (2, 3):
                    new['condition'].append('-:-')
                else:
                    new['condition'].append('+:-')

    for key, val in new.items():
        new[key] = np.array(val)
    return new
def display_data():
    income = request.form.get('income', 'any')
    prof = request.form.get('profession', 'any')
    church = request.form.get('church', 'Total_churches')
    safety = request.form.get('safety', 'any')
    data = filter.filter(income, prof, church, safety)
    return render_template('table.html', data=data, logged_in=('logged_in' in session))
def _filter(res):
    out = defaultdict(list)
    for mouse in np.unique(res['mouse']):
        temp = filter.filter(res, {'mouse': mouse})
        data = temp['ball_data'].flatten()
        max, min = np.max(data), np.min(data)
        if (max - min) > 4:
            reduce.chain_defaultdicts(out, temp)
    return out
def plot_summary_water(res, start_days, end_days, figure_path):
    ax_args_copy = ax_args.copy()
    res = copy.copy(res)
    get_responsive_cells(res)
    list_of_days = list(zip(start_days, end_days))
    mice = np.unique(res['mouse'])
    start_end_day_res = filter.filter_days_per_mouse(res, days_per_mouse=list_of_days)
    add_naive_learned(start_end_day_res, start_days, end_days, 'a', 'b')

    odor_list = ['US']
    colors = ['Turquoise']
    ax_args_copy.update({'xlim': [-1, 2]})
    for i, odor in enumerate(odor_list):
        plot.plot_results(start_end_day_res, select_dict={'odor_standard': odor},
                          x_key='training_day', y_key='Fraction Responsive',
                          loop_keys='mouse', colors=[colors[i]] * len(mice),
                          path=figure_path, plot_args=line_args, ax_args=ax_args_copy,
                          fig_size=(1.6, 1.5), legend=False)

    before_csm = filter.filter(start_end_day_res,
                               filter_dict={'training_day': 'a', 'odor_standard': 'US'})
    after_csm = filter.filter(start_end_day_res,
                              filter_dict={'training_day': 'b', 'odor_standard': 'US'})
    from scipy.stats import ranksums, wilcoxon, kruskal
    print('Before PT CS+: {}'.format(np.mean(before_csm['Fraction Responsive'])))
    print('After PT CS+: {}'.format(np.mean(after_csm['Fraction Responsive'])))
    print('Wilcoxon: {}'.format(wilcoxon(before_csm['Fraction Responsive'],
                                         after_csm['Fraction Responsive'])))
def enricher(id):
    query = FlickrQuery(gconfig.flickrAPI, gconfig.flickrSecret)
    event = eventinfo(id)
    logger = logfile.logger(gconfig.logdir + '/%s.txt' % event.id)

    logger.info('query event information')
    if not event.succ:
        logger.info("can not find such event")
        return

    logger.info('query photos with machine tag')
    idlist = query.searchbyid(event.id)
    db = Download(gconfig.tmpdir + '/%s' % event.id)
    db.download(idlist)
    #query.outputlist(idlist, event.id, 'list/idlist_%s.txt' % event.id)

    logger.info('query photos with text info')
    titlelist = query.searchbytitle(event.title, event.stime, event.id)
    db.download(titlelist)
    #query.outputlist(titlelist, event.id, 'list/titlelist_%s.txt' % event.id)

    logger.info('query photos with geo info')
    geolist = query.searchbygeo(event.lat, event.lng, event.stime, event.id)
    db.download(geolist)
    #query.outputlist(geolist, event.id, 'list/geolist_%s.txt' % event.id)

    logger.info('parsing features')
    feature = getfeature()
    feature.run(gconfig.tmpdir + '/%s' % event.id)

    #trainfile = 'list/idlist_%s.txt' % event.id
    trainlist = []
    for url in idlist:
        fname = url.split('/')[-1]
        fname = gconfig.tmpdir + '/%s' % event.id + '/' + fname
        trainlist.append(fname.replace('.jpg', '_ch.txt'))

    testlist = []
    for url in titlelist:
        fname = url.split('/')[-1]
        fname = gconfig.tmpdir + '/%s' % event.id + '/' + fname
        testlist.append(fname.replace('.jpg', '_ch.txt'))

    logger.info('visual pruning')
    myfilter = filter(trainlist, testlist)
    r = myfilter.filter()
    lst = []
    for idx in r:
        lst.append(testlist[idx])

    logger.info('refining')
    myrefine = refine(event.id, lst)
    results = myrefine.refine()
    newresults = query.geturlbyid(results, titlelist)

    logger.info('output result')
    query.OutputXML(event.id, idlist, titlelist, geolist, newresults)
    query.OutputHtml(event.id, idlist, titlelist, geolist, newresults)
    logger.info('event-finished')
def filter(self, chanis=None, f0=0, f1=7, fr=0.5, gpass=0.01, gstop=30, ftype='ellip'):
    """Bandpass filter data on row indices chanis, between f0 and f1 (Hz),
    with filter rolloff (?) fr (Hz). Done in-place.

    ftype: 'ellip', 'butter', 'cheby1', 'cheby2', 'bessel'
    """
    data = self.get_data()
    if chanis == None:
        chanis = np.arange(len(data))
    data = data[chanis]
    data, b, a = filter.filter(data, self.sampfreq, f0, f1, fr, gpass, gstop, ftype)
    self.data[chanis] = data
    return b, a
def get_msg_info(root_path):
    user_array = []
    for user in os.listdir(root_path):
        if user == "farmer-d":
            user_path = os.path.join(root_path, user)
            msg_array = get_msg_info_user(user_path)
            sorted_msg_array = sorted(msg_array, cmp=compare)
            for msg in sorted_msg_array:
                msg.dict_info = filter.filter(msg.header_info + msg.body_info)
            user_array.append(msg_array)
    return user_array
def load(self, name):
    if os.path.exists(name) == False:
        return
    fobj = open(name, 'rU')
    index = 0
    key = ''
    value = ''
    for eachLine in fobj:
        if index == 0:
            key = filter(name, eachLine[:-1])
            index = 1
        else:
            #value = eachLine
            self.allLink[key] = eachLine[:-1]
            index = 0
    fobj.close()
def get(self, name, time=None, filter=None, normalize=False, keys=None):
    if name not in self.cache:
        data = []
        done_header = False
        index = self.header.index(name)
        with open(os.path.join(self.dir, self.filename)) as f:
            for row in f:
                row = row.strip()
                if len(row) == 0 or row.startswith('#'):
                    continue
                if not done_header:
                    done_header = True
                    continue
                data.append(self.parse(row.split(',')[index]))
        data = np.array(data)
        self.cache[name] = data
    else:
        data = self.cache[name]

    if keys is not None:
        data2 = np.zeros((len(data), len(keys)), dtype=float)
        for i, d in enumerate(data):
            scale = 1.0
            if normalize:
                length = np.sqrt(sum([v * v for v in d.values()]))
                if length > 0.01:
                    scale = 1.0 / length
            for j, key in enumerate(keys):
                data2[i][j] = d.get(key, 0) * scale
        data = data2
        normalize = False

    if filter is not None:
        data = filter.filter(data, self.time[1] - self.time[0], tau=filter)

    if time is not None:
        if isinstance(time, (float, int)):
            data = data[self.get_index_for_time(time)]
        else:
            data = data[self.get_index_for_time(time[0]):self.get_index_for_time(time[1])]

    if normalize:
        for i, v in enumerate(data):
            length = np.linalg.norm(v)
            if length > 0.1:
                data[i] /= length
    return data
def bulk_store_clip_vector(vecs, start_id=0):
    """ """
    if isinstance(vecs, list):
        # clips = []
        #id = get_highest_pk(ClipVector)
        id = start_id
        for vec in vecs:
            if isinstance(vec, dict) and filter(vec, 'clip'):
                if settings.VECTOR_ON_MONGO:
                    clip1 = vec.pop('clip1st').clipID
                    clip2 = vec.pop('clip2nd').clipID
                    clip_vec = ClipVector2(clip1st=clip1, clip2nd=clip2, **vec)
                else:
                    id += 1
                    clip_vec = ClipVector(pkey=id, **vec)
                # clips.append(clip_vec)
                if CLIPS_QUEUE.full():
                    print 'queue full, freeze'
                    time.sleep(0.5)
                CLIPS_QUEUE.put(clip_vec)
        return True
        # if clips:
        #     n_cluster = len(clips) / int(BULK_INSERT_ITEMS) + 1
        #     for i in range(n_cluster):
        #         offset = i*BULK_INSERT_ITEMS
        #         end = offset + BULK_INSERT_ITEMS
        #         cluster = clips[offset:end]
        #         try:
        #             if cluster:
        #                 ClipVector.objects.bulk_create(cluster)
        #         except DatabaseError as dbe:
        #             print 'debug [index=%d, n_clusters=%d, n_records=%d, cluster_size=%d]' % \
        #                   (i, n_cluster, len(clips), BULK_INSERT_ITEMS)
        #             raise dbe
        #     return True
    return False
def bulk_store_movie_vector(vecs, start_id=0):
    """ """
    if isinstance(vecs, list):
        # movies = []
        #id = get_highest_pk(MovieVector)
        id = start_id
        for vec in vecs:
            if isinstance(vec, dict) and filter(vec, 'movie'):
                if settings.VECTOR_ON_MONGO:
                    movie1 = vec.pop('movie1st').movieID
                    movie2 = vec.pop('movie2nd').movieID
                    movie_vec = MovieVector2(movie1st=movie1, movie2nd=movie2, **vec)
                else:
                    id += 1
                    movie_vec = MovieVector(pkey=id, **vec)
                # movies.append(movie_vec)
                if MOVIES_QUEUE.full():
                    print 'queue full, freeze'
                    time.sleep(0.5)
                MOVIES_QUEUE.put(movie_vec)
        return True
        # if movies:
        #     n_cluster = len(movies) / int(BULK_INSERT_ITEMS) + 1
        #     for i in range(n_cluster):
        #         offset = i*BULK_INSERT_ITEMS
        #         end = offset + BULK_INSERT_ITEMS
        #         cluster = movies[offset:end]
        #         try:
        #             if cluster:
        #                 MovieVector.objects.bulk_create(cluster)
        #         except DatabaseError as dbe:
        #             print 'debug [index=%d, n_clusters=%d, n_records=%d, cluster_size=%d]' % \
        #                   (i, n_cluster, len(movies), BULK_INSERT_ITEMS)
        #             raise dbe
        #     return True
    return False
def __init__(self):
    self.filter = filter()
    self.db_helper = DBHelper()
    self.cmp_table = 'refined_list_info'
    self.table = 'extracted_info'
    self.cmp_clms = [COMPANY_NAME, MEETING_TIME, MEETING_LOCATION, ORIGIN_URL,
                     RELEASE_DATE, RECRUIT_TITLE]
def get(self, name, time=None, filter=None, normalize=False, keys=None):
    """
    Return a column of data from the csv

    Parameters: WRITEME
    """
    if name not in self.cache:
        data = []
        done_header = False
        index = self.header.index(name)
        with open(os.path.join(self.dir, self.filename)) as f:
            for row in f:
                row = row.strip()
                if len(row) == 0 or row.startswith('#'):
                    continue
                if not done_header:
                    done_header = True
                    continue
                data.append(self.parse(row.split(',')[index]))
        data = np.array(data)
        self.cache[name] = data
    else:
        data = self.cache[name]

    # one of the types of data in the csv file is a *string* of the form
    #   "8a;9b;<...>"
    # This string represents a vector (semantic pointer) in terms of
    # projections onto named [non-orthogonalized] basis elements.
    # The numeric prefixes are the inner products, and the character suffixes
    # name the basis elements.
    #
    # if `keys` is specified, then it means to only pay attention to the
    # explicitly named suffix *keys*. Otherwise all of them are returned.
    if keys is not None:
        data2 = np.zeros((len(data), len(keys)), dtype=float)
        for i, d in enumerate(data):
            scale = 1.0
            if normalize:
                length = np.sqrt(sum([v * v for v in d.values()]))
                if length > 0.01:
                    scale = 1.0 / length
            for j, key in enumerate(keys):
                data2[i][j] = d.get(key, 0) * scale
        data = data2
        # -- normalize has already been done in the previous loop
        normalize = False

    if filter is not None:
        data = filter.filter(data, self.time[1] - self.time[0], tau=filter)

    if time is not None:
        if isinstance(time, (float, int)):
            data = data[self.get_index_for_time(time)]
        else:
            data = data[self.get_index_for_time(time[0]):self.get_index_for_time(time[1])]

    if normalize:
        for i, v in enumerate(data):
            length = np.linalg.norm(v)
            if length > 0.1:
                data[i] /= length
    return data
fstop = open(stopwords_file)
totalStop = fstop.readlines()
fstop.close()
stops = []
for s in totalStop:
    s = s.strip()
    stops.append(s)

threshold = configs["threshold"]
classify_model = configs["classify_model"]
if not os.path.exists(classify_model):
    print "ERROR: you should have a filter model"
    exit(-1)
with open(classify_model, "rb") as file:
    f = filter(Algorithm)
    t = pickle.load(file)
    f.Algorithm.loadmodel(t)
    file.close()


class SpamFilter(Resource):
    """ Spam message filtering service """

    def get(self):
        if "query" not in request.args:
            abort(404, message="parameter `query` doesn't exist")
def handle_data(self, data):
    if self.flg == 1:
        self.result.insertLink(filter(self.convertedUrl, data.strip()), self.link)
        self.flg = 0  # reset the flag for the next iteration
import observations
import filter

r = filter.filter(observations.horses)
print(r)
def __init__(self): self.filter = filter()
    strlist = re.split('\"', content)
    urlset = set([])
    for strstr in strlist:
        # when matching a literal backslash in a Python regex, it must be written as \\\\
        #if re.match('http://.*com(/|\w)+', str):
        # this regex is a bit simplistic; it only matches the current site
        #if re.match('http://'+domain, str):
        rules = "http://" + domain + "[^,^ ^ ^']*"
        # strstr is a unicode object
        result = re.compile(rules).findall(strstr.encode("utf-8"))
        # result is a list
        if len(result) == 0:
            pass
        else:
            for i in result:
                urlset.add(i)
    return list(urlset)


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print "usage:" + sys.argv[0] + " http://test.com/" + " depth"
        print "example:" + sys.argv[0] + " http://127.0.0.1/a.php?c=1" + " 3"
    else:
        domain = sys.argv[1].split('/')[2]
        # save the initial url
        tmp = []
        tmp.insert(0, sys.argv[1])
        saveurl(tmp)
        # start crawling
        main(sys.argv[1], 0)
        filter.filter()
def render_filter():
    userTag = request.args.get('ID')
    filter_snippet = filter(userTag)
    return render_template("output.html", snippet=filter_snippet)
def QueryFinalPhotos(id):
    ffname = gconfig.metadatadir + '/%s/final.json' % id
    if os.path.exists(ffname):
        return ReadData(ffname)
    else:
        WaitAll(id)
        downloadAll(id)

        fname = gconfig.metadatadir + '/%s/id.json' % id
        if not os.path.exists(fname):
            QueryPhotobyId(id)
        tmp = ReadData(fname)
        idlist = [t['photo'] for t in tmp['photos']]

        event = LoadEventInfo('http://data.linkedevents.org/event/' + id)
        tmp = event['stime'].split('T')[0]
        stime = datetime.strptime(tmp, '%Y-%m-%d')

        fname = gconfig.metadatadir + '/%s/title.json' % id
        if not os.path.exists(fname):
            QueryPhotobyTitle(id, event['title'], stime)
        tmp = ReadData(fname)
        titlelist = [t['photo'] for t in tmp['photos']]
        print "titlelist", len(titlelist)

        fname = gconfig.metadatadir + '/%s/geo.json' % id
        if not os.path.exists(fname):
            QueryPhotobyGeo(event['id'], (event['lat'], event['lng']), stime)
        tmp = ReadData(fname)
        geolist = [t['photo'] for t in tmp['photos']]

        trainlist = []
        for url in idlist:
            fname = url.split('/')[-1]
            fname = gconfig.tmpdir + '/%s' % id + '/' + fname
            trainlist.append(fname.replace('.jpg', '_ch.txt'))

        testlist = []
        alldata = titlelist + geolist
        for url in alldata:
            fname = url.split('/')[-1]
            fname = gconfig.tmpdir + '/%s' % id + '/' + fname
            testlist.append(fname.replace('.jpg', '_ch.txt'))

        feature = getfeature()
        feature.run(gconfig.tmpdir + '/%s' % id)

        myfilter = filter(trainlist, testlist)
        r = myfilter.filter()  # return the index in testing data
        lst = []
        for idx in r:
            lst.append(testlist[idx])
        print "the number of pruned is %d" % len(lst)

        myrefine = refine(id, lst)
        results = myrefine.refine()
        newresults = query.geturlbyid(results, alldata)

        ftable = query.OutputJson(idlist + newresults)
        strdata = json.dumps(ftable)
        jsonfile = open(ffname, 'w')
        jsonfile.write(strdata)
        jsonfile.close()
        return ftable