def register(username, password):
    """Register the username and password.

    Returns:
        0 if registration succeeded
        1 if the user already exists
        2 if some other problem occurred
    """
    username = utils.clean(username)
    password = utils.clean(password)
    # Check whether the user already exists (a bit ugly, but it works for now:
    # we rely on get_personal raising an exception when the user does not exist).
    p = None
    try:
        p = get_personal(username)
    except:
        pass
    if p is not None:
        return 1
    p = Personal()
    p.username = username
    p.password = password
    try:
        p.save()
    except:
        return 2
    return 0
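# A minimal usage sketch for register() above. The username/password values and
# the reporting are assumptions; only the 0/1/2 return codes come from the code.
status = register("alice", "s3cret")
if status == 0:
    print("registered")
elif status == 1:
    print("user already exists")
else:
    print("registration failed")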
def get_data():
    path = '/Users/zyzdiana/Dropbox/vNav_Test_Data/Apr_17_test_data/'
    dict_10mm = {}
    dict_6_4mm = {}
    dict_8mm = {}
    for root, dirs, files in os.walk(path):
        if len(dirs) == 0:
            if '10mm' in root:
                dict_10mm[root] = clean(files)
            if '6_4mm' in root:
                dict_6_4mm[root] = clean(files)
            if '8mm' in root:
                dict_8mm[root] = clean(files)

    list_10mm = []
    for item in dict_10mm.iteritems():
        list_10mm.append(os.path.join(item[0], item[1][0]))
    list_10mm.sort()
    vols_10mm = get_volume(list_10mm, 26)

    list_6_4mm = []
    for item in dict_6_4mm.iteritems():
        list_6_4mm.append(os.path.join(item[0], item[1][0]))
    list_6_4mm.sort()
    vols_6_4mm = get_volume(list_6_4mm, 40)

    list_8mm = []
    for item in dict_8mm.iteritems():
        list_8mm.append(os.path.join(item[0], item[1][0]))
    list_8mm.sort()
    vols_8mm = get_volume(list_8mm, 32)

    return vols_6_4mm, vols_8mm, vols_10mm
def get_data_all():
    path = '/Users/zyzdiana/Dropbox/vNav_Test_Data/Apr_17_test_data/'
    dict_10mm = {}
    dict_6_4mm = {}
    dict_8mm = {}
    for root, dirs, files in os.walk(path):
        if len(dirs) == 0:
            if '10mm' in root:
                dict_10mm[root] = clean(files)
            if '6_4mm' in root:
                dict_6_4mm[root] = clean(files)
            if '8mm' in root:
                dict_8mm[root] = clean(files)

    keys_10 = sorted(dict_10mm.keys())
    keys_8 = sorted(dict_8mm.keys())
    keys_6_4 = sorted(dict_6_4mm.keys())

    all_10mm = {}
    all_8mm = {}
    all_6_4mm = {}
    for i in xrange(5):
        all_10mm[idx_to_key(i, keys_10)] = []
        all_8mm[idx_to_key(i, keys_8)] = []
        all_6_4mm[idx_to_key(i, keys_6_4)] = []
        for j in xrange(5):
            # 10mm
            path = os.path.join(idx_to_key(i, keys_10), dict_10mm[idx_to_key(i, keys_10)][j])
            all_10mm[idx_to_key(i, keys_10)].append(get_volume_1(path, 26))
            # 8mm
            path = os.path.join(idx_to_key(i, keys_8), dict_8mm[idx_to_key(i, keys_8)][j])
            all_8mm[idx_to_key(i, keys_8)].append(get_volume_1(path, 32))
            # 6.4mm
            path = os.path.join(idx_to_key(i, keys_6_4), dict_6_4mm[idx_to_key(i, keys_6_4)][j])
            all_6_4mm[idx_to_key(i, keys_6_4)].append(get_volume_1(path, 40))
    return all_10mm, all_8mm, all_6_4mm
def main(_):
    if FLAGS.train:
        if FLAGS.clean:
            clean(FLAGS)
        train()
    else:
        evaluate_line()
def install():
    utils.clean()
    peg.install()
    fcgi.install()
    json_glib.install()
    discount.install()
    balde.install()
    bluster.install()
    utils.clean()
def register(username, password):
    """Register the username and password.

    Returns:
        0 if registration succeeded
        1 if the user already exists
        2 if some other problem occurred
    """
    # TODO: if anyone abuses this, put together a mechanism to prevent DoS
    # (it would not be very complicated either).
    username = utils.clean(username)
    password = utils.clean(password)
    return personal.register(username, password)
def load_words(self, folderpath):
    filenames = glob.glob(folderpath)
    for f in filenames:
        with open(f, 'r') as infile:
            text = infile.read()
            text = utils.clean(text)
            self.words.append([text])
def new(self):
    self.name = self.parse('company.name')
    self.suffix = self.fetch('company.suffix')
    self.website = "http://www.%s.%s" % (
        clean(self.name),
        self.fetch('internet.domain_suffix')
    )
def process_element(country):
    currency_dict = {}
    for currency_tag in country.iterchildren():
        # ignore newly added additional info field
        if currency_tag_map[currency_tag.tag] == \
                "ISO4217-currency_additional_info":
            break
        # skip 'same day', 'next day', etc variations
        elif (currency_tag_map[currency_tag.tag] == "ISO4217-currency_name") \
                and (len(currency_tag.items()) > 0):
            if currency_tag.items()[0][0] == 'IsFund':
                break
        else:
            currency_dict.update({
                currency_tag_map[currency_tag.tag]: currency_tag.text})

    currency_numeric = None
    # remove random line breaks, etc
    currency_name = utils.clean(currency_dict['ISO4217-currency_country_name'])
    if currency_name is not None:
        # replace name with line breaks, etc removed
        currency_dict['ISO4217-currency_country_name'] = currency_name
        try:
            currency_numeric = en_names[currency_name]
        except KeyError:
            mapped_name = currency_country_name_map.get(currency_name)
            if mapped_name is not None:
                currency_numeric = en_names.get(mapped_name.upper())

    if currency_numeric:
        country_info[currency_numeric].update(currency_dict)
    else:
        print('Failed to match currency data for country: "%s"' % currency_name)
    return
def __init__(self, schema, output_dir=None):
    '''
    Bind array of cleaned schema file lines to validator object.
    '''
    self.schema = utils.clean(schema)
    self.output_dir = output_dir
    self.indent_size = self._find_first_indent()['indent_size']
    self.error = {'msg': None}
def process_text(self):
    text = utils.clean(self.get_tweet_text())
    self.set_tweet_text(text)
    self.set_tweet_source(utils.parse_alink(self.get_tweet_source()))
    if self.translation:
        self.detect_language_or_translate()
    self.filter_text()
def listar_radios(name, url):
    link = clean(abrir_url(url))
    radios = re.compile('<td><a href="/portalradio/conteudos/ficha/.+?radio_id=(.+?)">(.+?)</a></td><td>(.+?)</td>.+?<td align="center">').findall(link)
    for idradio, nomeradio, concelho in radios:
        addDir('[B]' + nomeradio + '[/B] (' + concelho + ')',
               RadiosURL + 'Sintonizador/?radio_id=' + idradio + '&scope=0',
               21,
               'http://www.radio.com.pt/APR.ROLI.WEB/Images/Logos/' + idradio + '.gif',
               len(radios), '', False)
    xbmc.executebuiltin("Container.SetViewMode(501)")
    paginasradios(url, link)
def rate(classifier, filename, medi, mad, medi_ld, mad_ld):
    # For now, just assuming the text is csv
    results = classifier
    with open(filename, 'rU') as f:
        reader = csv.reader(f, delimiter=',')
        data = [(get_features(tech.clean(row[2]), medi, mad, medi_ld, mad_ld), row[0])
                for row in reader]
        print nltk.classify.accuracy(classifier, data)
def eligibility(self):
    try:
        table = utils.Table(self.soup.select("table[summary~=Land]")[0])
    except IndexError:
        return None
    return utils.clean(table["Eligibility"].text.strip())
def post(self):
    db = self.application.database
    content = self.request.arguments.get("content", [""])[0]
    parent = self.request.arguments.get("parent", [None])[0]
    super_parent = self.request.arguments.get("super_parent", [None])[0]
    content = clean(content)
    new_comment = {
        "content": content,
        "time": datetime.utcnow(),
        "author": self.get_current_user(),
        "plusvote": list(),
        "minusvote": list()
    }
    if parent != None:
        new_comment["parent"] = parent
    if super_parent != None:
        new_comment["super_parent"] = super_parent
    new_id = db.comments.insert(new_comment)
    print new_id
    print parent
    print db.comments.update({"_id": ObjectId(parent)},
                             {"$push": {"children": str(new_id)}})
    self.redirect("/")
def listasextras():
    iptvurl = 'http://01.gen.tr/HasBahCa_IPTV/'
    link = clean(abrir_url(iptvurl))
    streams = re.compile('<a class="autoindex_a" href="./(.+?)">.+?<td class="autoindex_td_right">.+?</td.+?td class="autoindex_td_right">(.+?)</td>').findall(link)
    for nomepasta, act in streams:
        if re.search('.m3u', nomepasta):
            titulo = nomepasta.replace('.m3u', '').replace('_', ' ').title()
            addDir("[B]%s[/B] (act.%s)" % (titulo, act[2:-2]),
                   iptvurl + nomepasta, 5,
                   tvporpath + art + 'listas-ver2.png', 1, '', True)
def findName(self, code):
    s = code.find("def") + len("def")
    e = code.find("(")
    name = code[s:e]
    return clean(name)
def radiosobterurlstream(name,url): #GA("None","Radio - " + name) mensagemprogresso.create('TV Portuguesa','A carregar...') mensagemprogresso.update(0) if re.search('www.radios.pt',url): link=abrir_url(url) try: endereco=re.compile('<param name="url" value="(.+?)"').findall(link)[0] except: xbmc.executebuiltin("XBMC.Notification(Fightnight Music,Não é possível ouvir esta rádio.,'500000',)") return idradio=url.replace('http://www.radios.pt/portalradio/Sintonizador/?radio_id=','').replace('&scope=0','') thumbnail='http://www.radio.com.pt/APR.ROLI.WEB/Images/Logos/'+ idradio +'.gif' else: urlfinal='http://www.radioonline.com.pt/ajax/player.php?clear_s_name=' + url link= clean(abrir_url(urlfinal)) try: player=re.compile('soundManager.createSound\({(.+?)autoLoad').findall(link)[0] except: player=False try: endereco=re.compile('url: "(.+?)"').findall(player)[0].replace(';','') if re.search('serverURL',player): rtmp=re.compile('serverURL: "(.+?)"').findall(player)[0] #rtmp=rtmp.replace('rtmp://195.23.102.206','rtmp://195.23.102.209') #tempfix rtmp=rtmp.replace(':1936','') #tempfix endereco=rtmp + ' playPath=' + endereco except:endereco=False if not endereco: try:endereco=re.compile('<param name="URL" value="(.+?)"').findall(link)[0] except: try: endereco=re.compile('<object data="(.+?)"').findall(link)[0] except: endereco=False if not endereco: xbmc.executebuiltin("XBMC.Notification(TV Portuguesa,Não é possível ouvir esta rádio.,'500000',)") mensagemprogresso.close() return try:thumbnail=re.compile('<img id="station-logo-player" src="(.+?)"').findall(link)[0] except: thumbnail='' if re.search('.asx',endereco): nomeasx='stream.asx' path = xbmc.translatePath(os.path.join(pastaperfil)) lib=os.path.join(path, nomeasx) downloader(endereco,lib) texto= openfile(nomeasx) endereco = xbmc.PlayList(1) endereco.clear() streams=re.compile('<ref.+?"(.+?)"/>').findall(texto) for musica in streams: listitem = xbmcgui.ListItem(name, iconImage="DefaultVideo.png", thumbnailImage=thumbnail) listitem.setInfo("music", {"Title":name}) endereco.add(musica,listitem) else: pass mensagemprogresso.close() listitem = xbmcgui.ListItem(name, iconImage="DefaultVideo.png", thumbnailImage=thumbnail) listitem.setInfo("music", {"Title":name}) xbmc.Player().play(endereco,listitem)
def radioslocais():
    link = clean(abrir_url(RadiosURL))
    #addDir('Pesquisar (exclui nacionais)',RadiosURL + '?distrito=0&concelho=0&tipo=0&text=',16,'',1,'',True)
    distritos = re.compile('id="DirectorioPesquisa1_ddlDistritos">(.+?)</select>').findall(link)[0]
    distritos = distritos.replace('<option value="0"></option>', '<option value="0">Todos as rádios locais</option>')
    lista = re.compile('<option value="(.+?)">(.+?)</option>').findall(distritos)
    for iddistrito, nomedistrito in lista:
        addDir(nomedistrito, RadiosURL + '?distrito=' + iddistrito + '&concelho=0&tipo=0', 24, '', len(lista), '', True)
    xbmc.executebuiltin("Container.SetViewMode(501)")
def manage_addPingback(self, sourceTitle, sourceURI, sourceExcerpt):
    """ Add a pingback """
    from utils import isPingbackSpam
    if isPingbackSpam(sourceTitle, sourceURI, sourceExcerpt, self.blogurl(), self.REQUEST):
        try:
            return self.REQUEST.RESPONSE.redirect('http://www.google.com')
        except:
            return 0
    id = self.createReferenceId()
    newTitle = clean(sourceTitle)
    newURI = clean(sourceURI)
    newExcerpt = clean(sourceExcerpt)
    pingback = Reference(id, newTitle, newURI, newExcerpt, self.getId())
    self._setObject(id, pingback)
    return 1
def main(args):
    content_generator = load_file(args.transcript, encoding=args.encoding)
    rules = load_rules(args.rules, encoding=args.encoding)
    mapped = do_mapping(content_generator, rules)
    cleaned = clean(mapped)
    formatted = mlf_format_data(cleaned)
    save_file(args.output, formatted, encoding=args.encoding)
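# A possible CLI wrapper for main() above. The attribute names (transcript,
# rules, output, encoding) mirror what main() reads from args; the flag layout
# and the default encoding are assumptions.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Map a transcript into MLF format.')
    parser.add_argument('transcript')
    parser.add_argument('rules')
    parser.add_argument('output')
    parser.add_argument('--encoding', default='utf-8')
    main(parser.parse_args())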
def radios():
    addDir('[COLOR blue][B]Radios Locais[/B][/COLOR]', 'nada', 20, tvporpath + art + 'radios-v1.png', 1, '', True)
    addLink("", '', '')
    link = clean(abrir_url(RadiosNacionaisURL))
    nacionais = re.compile('<div class="radiostation boxgrid">(.+?)</div>').findall(link)
    for radioindividual in nacionais:
        radiosnacionais = re.compile('<a href="http://www.radioonline.com.pt/#(.+?)".+?<img.+?src="(.+?)".+?alt="(.+?)"').findall(radioindividual)
        for idradio, imagemradio, nomeradio in radiosnacionais:
            nomeradio = nomeradio.replace('Radio ', '')
            addDir(nomeradio, idradio, 21, imagemradio, len(radiosnacionais), '', False)
def test_utils_clean(self):
    test_file = '/tmp/' + utils.test_name()
    self.assertFalse(os.path.exists(test_file))
    utils.run(['touch', test_file])
    self.assertTrue(os.path.exists(test_file))
    with utils.clean(['rm', test_file]):
        self.assertFalse(os.path.exists(test_file))
        utils.run(['touch', test_file])
        self.assertTrue(os.path.exists(test_file))
    self.assertFalse(os.path.exists(test_file))
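# The test above treats utils.clean(cmd) as a context manager that runs the
# given command on entry and again on exit. A minimal sketch of such a helper,
# assuming utils.run(cmd) executes a command list (an illustration, not the
# project's actual implementation):
import contextlib

@contextlib.contextmanager
def clean(cmd):
    utils.run(cmd)      # remove leftovers before the block runs
    try:
        yield
    finally:
        utils.run(cmd)  # remove whatever the block created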
def fetch(args): print 'Welcome to Twitter Spell Checking : Fetching !' CONFIG = ConfigParser.ConfigParser() CONFIG.read(args.config) settings = items_to_dict(CONFIG.items('twitter')) config = items_to_dict(CONFIG.items('namespace:%s' % args.namespace)) api = twitter.Api(consumer_key=settings['consumer_key'], consumer_secret=settings['consumer_secret'], access_token_key=settings['access_token'], access_token_secret=settings['access_token_secret']) accounts = [account.replace(' ', '') for account in config['accounts'].split(',')] max_tweets_file = os.path.join(os.path.dirname(config['files']), 'max_tweets_%s.txt' % args.namespace) def save_max_tweets(): open(max_tweets_file, 'w').write(json.dumps(max_tweets)) if os.path.exists(max_tweets_file): max_tweets = json.loads(open(max_tweets_file).read()) else: max_tweets = dict() print max_tweets_file f = open(os.path.join(config['files'], 'tweets_%s.txt' % args.namespace), 'a') for account in accounts: if account in max_tweets and max_tweets[account] > 0: retrieving = "new" else: retrieving = "old" page = 0 while True: if retrieving == "new": print 'process %s since id %s' % (account, max_tweets[account]) try: tweets = api.GetUserTimeline(account, count=200, include_rts=False, since_id=max_tweets[account]) except twitter.TwitterError, e: print 'error : %s' % str(e) tweets = [] else: print 'process %s from zero, page %s' % (account, page) try: tweets = api.GetUserTimeline(account, count=200, include_rts=False, page=page) except twitter.TwitterError, e: print 'error : %s' % str(e) tweets = [] if tweets: for s in tweets: if is_valid(s, account): f.write(clean(s.text).lower().encode('UTF-8') + '\n') if account not in max_tweets or s.id > max_tweets[account]: max_tweets[account] = s.id if retrieving == "old": page += 1 save_max_tweets() else: print 'no more tweets for %s' % account break
def manage_addPost(self, title, author, body, tags=[], date=DateTime.DateTime(),
                   publish=1, comment_allowed=1, not_clean=0, sendping=1, REQUEST=None):
    """ Called from ZMI when creating new posts """
    if not title and REQUEST is not None:
        return REQUEST.RESPONSE.redirect('%s/post?msg=%s' % (self.blogurl(), 'You must provide at least the title of the post'))
    newid = self.createId(title)
    newtitle = clean(title)
    newauthor = clean(author)
    if not_clean:
        newbody = body
    else:
        newbody = cleanBody(self, body)
    newtags = prepareTags(tags)
    newdate = DateTime.DateTime(date)
    while hasattr(self, newid):
        newid = self.createNewId(newid)
    post = Post(newid, newtitle, newauthor, newbody, newtags, newdate, publish, comment_allowed)
    self._setObject(str(newid), post)
    post = self.get(newid)
    if self.inCommunity():
        # We are in a Bitakora Community, so catalog the post there
        cat = self.getParentNode().get('Catalog', 'None')
        if cat is not None:
            cat.catalog_object(post, '/'.join(post.getPhysicalPath()))
    self.postcount = self.postcount + 1
    if sendping:
        tech_pings = Future(sendPing, self.absolute_url(), self.blog_title())
        pingbacks = Future(postPingBacks, newbody, post.absolute_url())
    if REQUEST is not None:
        return REQUEST.RESPONSE.redirect('%s/admin?msg=%s' % (self.absolute_url(), 'Post added succesfully'))
    return newid
def todosact(parametro): LOLI=['<item>\n<title>Actualizado: ' + horaportuguesa(True).replace('%20',' ') + '</title>\n<link>nada</link>\n<thumbnail>nada</thumbnail>\n</item>'] dialog = xbmcgui.Dialog() mensagemprogresso.create('TV Portuguesa', 'A criar lista.','Por favor aguarde...') if re.search('Lista Completa',parametro): canaison= openfile(('canaison')) canaison=canaison.replace('[','') lista=re.compile('B](.+?)/B]').findall(canaison) tamanhototal=int(len(lista)) tamanho=int(-1) for nomes in lista: tamanho=tamanho+1 tamanhoenviado=(tamanho*100)/tamanhototal print "Lista completa: Canal " + nomes global activadoextra activadoextra=[] SIM= request_servidores('ignore','[B]' + nomes + '[/B]',tamanho=tamanhoenviado) LOLI.append(SIM) AGORA='\n\n'.join(LOLI) else: SIM= request_servidores('ignore',parametro) LOLI.append(SIM) AGORA='\n\n'.join(LOLI) mensagemprogresso.close() debugfinal='\n'.join(debug) savefile('problema',debugfinal) keyb = xbmc.Keyboard('', 'Nome do ficheiro da lista') keyb.doModal() if (keyb.isConfirmed()): nomelista = keyb.getText() if nomelista=='': nomelista='lista' else: nomelista='lista' pastafinal = dialog.browse(int(0), "Local para guardar xml/m3u", 'myprograms') if not pastafinal: sys.exit(0) savefile(nomelista + '.xml',AGORA,pastafinal=pastafinal) m3uprep=['#EXTM3U#EXTM3U'] openedfile= clean(AGORA) ya=re.compile('<item>(.+?)</item>').findall(openedfile) for lol in ya: chname=re.compile('<title>(.+?)</title>').findall(lol)[0] allstreams=False if allstreams==True: streams=re.compile('<link>(.+?)</link>').findall(lol) for umporum in streams: m3uprep.append('\n#EXTINF:-1,%s\n%s' % (chname,umporum)) else: streams=re.compile('<link>(.+?)</link>').findall(lol)[0] m3uprep.append('\n#EXTINF:-1,%s\n%s' % (chname,streams)) m3uprep='\n'.join(m3uprep) savefile(nomelista + '.m3u',m3uprep,pastafinal=pastafinal) xbmc.executebuiltin("XBMC.Notification(TV Portuguesa, Lista xml/m3u gravada,'100000'," + tvporpath + art + "icon32-ver1.png)")
def post(self):
    db = self.application.database
    content = self.request.arguments.get("content", [""])[0]
    title = self.request.arguments.get("title", [""])[0]
    content = clean(content)
    title = clean(title)
    new_document = {
        "content": content,
        "type": "status",
        "title": title,
        "author": self.get_current_user(),
        "time": tuple(datetime.now().utctimetuple()),
        "plusvote": list(),
        "minusvote": list()
    }
    db.documents.insert(new_document)
    self.redirect("/")
def get_tweets_for_user(user_id):
    path = tweetsd + "/tweets-user-" + str(user_id) + ".txt"
    tweets = []
    with open(path, 'r') as f:
        for index, line in enumerate(f):
            if index >= limit:
                return clean(" ".join(tweets), False)
            info = line.split("\t")
            if len(info) != 5:
                print info[-2]
            tweets.append(info[-1].strip())
def classify(self, testfile):
    text = utils.clean(testfile.read())
    neg_count = 0
    pos_count = 0
    for snip in self.neg_snip:
        if snip in text:
            neg_count += 1
    for snip in self.pos_snip:
        if snip in text:
            pos_count += 1
    return pos_count - neg_count
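# One way a caller might turn the positive/negative snippet count difference
# above into a label (thresholding at zero); `model` and the file name are
# assumptions for illustration only.
with open('review.txt') as testfile:
    label = 'positive' if model.classify(testfile) > 0 else 'negative'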
def findArgs(self, code):
    s = code.rfind("(") + 1
    e = code.find(")")
    temp = code[s:e]
    args = []
    for a in temp.split(","):
        args.append(clean(a).split("="))
    return args
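# A quick illustration of findName()/findArgs() above on a small code string.
# `extractor` stands in for whichever object defines these methods, and clean()
# is assumed to strip surrounding whitespace (both are assumptions here).
src = "def add(a, b=1):\n    return a + b"
print(extractor.findName(src))   # -> "add"
print(extractor.findArgs(src))   # -> [['a'], ['b', '1']]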
url = "https://www.sec.gov/edgar/searchedgar/edgarstatecodes.htm"
content = urllib.urlopen(url).read()
doc = html.fromstring(content)
rows = doc.xpath('//table')[3].getchildren()
seen_other_countries = False
header = ['EDGAR', 'name']
data = []
for row in rows:
    if seen_other_countries is not True:
        if utils.clean(row.text_content()) != 'Other Countries':
            print('SKIPPING', row.text_content())
            continue
        else:
            seen_other_countries = True
            print('SEEN OTHER COUNTRIES', row.text_content())
            continue
    cells = row.getchildren()
    if len(cells) != 2:
        print('ERROR IN CELL COUNT')
        for cell in cells:
            print(cell)
            print(cell.text_content())
        continue
    code = utils.clean(cells[0].text_content())
if __name__ == '__main__':
    obj = loadConfig()
    dl = Downloader()
    for company in obj['company']:
        print('...', end=' ')
        payload = {
            'function': 'TIME_SERIES_INTRADAY',
            'interval': obj['interval'],
            'symbol': company[0],
            'apikey': 'M1PAJKCE6DZUZAUS',
            'datatype': obj['datatype']
        }
        pipe = Pipe()
        new_text = appendCol(
            clean(pipe.read_from_downloader(text=dl.addParams(payload).bulk())),
            colname=company[1])
        old_text = pipe.read_from_file(filename=company[1], ext=obj['datatype'])
        pipe.read_from_text(new_text + old_text)
        pipe.write_to_file(filename=company[1], ext=obj['datatype'])
        pipe.clear()
        print('Finish pulling stock data: %s' % company[1])
    dl.close()
import csv
import numpy as np
from pprint import pprint
from utils import clean
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

stadium_data = {}
with open('final_data/stadium_record.csv') as f:
    reader = csv.reader(f)
    next(reader)
    for row in reader:
        stadium_data[clean(row[0])] = [float(row[1]), float(row[9]), float(row[11])]

alltime_data = {}
with open('final_data/player_batting_record_backup.csv') as f:
    reader = csv.reader(f)
    header = next(reader)
    for row in reader:
        if clean(row[0]) in alltime_data:
            alltime_data[clean(row[0])][row[3]] = row
        else:
            alltime_data[clean(row[0])] = {row[3]: row}
alltime_header = [clean(h) for h in header]

ipl_data = {}
with open('final_data/data-ipl-fantasy.csv') as f:
def process_one(one):
    ids = sp.encode_as_ids(clean(one))
    return ids[0:args.input_max_length]
def split_data(net, dat_in, dat_out, dat_out_raw, dat_out_oracle, num_flws, bch_trn, bch_tst, use_val=False): """ Divides the input and output data into training, validation, and testing sets and constructs data loaders. """ print("Creating train/val/test data...") #assert len(dat_out.shape) == 1 #assert len(dat_out_raw.shape) == 1 #assert len(dat_out_oracle.shape) == 1 #assert len(num_flws.shape) == 1 fets = dat_in.dtype.names # Destroy columns names to make merging the matrices easier. I.e., # convert from structured to regular numpy arrays. dat_in = utils.clean(dat_in) dat_out = utils.clean(dat_out) dat_out_raw = utils.clean(dat_out_raw) dat_out_oracle = utils.clean(dat_out_oracle) num_flws = utils.clean(num_flws) # Shuffle the data to ensure that the training, validation, and # test sets are uniformly sampled. To shuffle all of the arrays # together, we must first merge them into a combined matrix. num_cols_in = dat_in.shape[1] merged = np.concatenate( (dat_in, dat_out, dat_out_raw, dat_out_oracle, num_flws), axis=1) np.random.shuffle(merged) dat_in = merged[:, :num_cols_in] dat_out = merged[:, num_cols_in] dat_out_raw = merged[:, num_cols_in + 1] dat_out_oracle = merged[:, num_cols_in + 2] num_flws = merged[:, num_cols_in + 3] # 50% for training, 20% for validation, 30% for testing. num_exps = dat_in.shape[0] num_val = int(round(num_exps * 0.2)) if use_val else 0 num_tst = int(round(num_exps * 0.3)) print( (f" Data - train: {num_exps - num_val - num_tst}, val: {num_val}, " f"test: {num_tst}")) # Validation. dat_val_in = dat_in[:num_val] dat_val_out = dat_out[:num_val] # Testing. dat_tst_in = dat_in[num_val:num_val + num_tst] dat_tst_out = dat_out[num_val:num_val + num_tst] dat_tst_out_raw = dat_out_raw[num_val:num_val + num_tst] dat_tst_out_oracle = dat_out_oracle[num_val:num_val + num_tst] num_flws_tst = num_flws[num_val:num_val + num_tst] # Training. dat_trn_in = dat_in[num_val + num_tst:] dat_trn_out = dat_out[num_val + num_tst:] # Create the dataloaders. dataset_trn = utils.Dataset(fets, dat_trn_in, dat_trn_out) ldr_trn = (torch.utils.data.DataLoader( dataset_trn, batch_size=bch_tst, shuffle=True, drop_last=False) if isinstance(net, models.SvmSklearnWrapper) else torch.utils.data.DataLoader( dataset_trn, batch_sampler=utils.BalancedSampler( dataset_trn, bch_trn, drop_last=False))) ldr_val = (torch.utils.data.DataLoader( utils.Dataset(fets, dat_val_in, dat_val_out), batch_size=bch_tst, shuffle=False, drop_last=False) if use_val else None) ldr_tst = torch.utils.data.DataLoader(utils.Dataset( fets, dat_tst_in, dat_tst_out, dat_tst_out_raw, dat_tst_out_oracle, num_flws_tst), batch_size=bch_tst, shuffle=False, drop_last=False) return ldr_trn, ldr_val, ldr_tst
    df = preprocessing.load_data()
    df = feature_engineering.FeatEngin(df)
    print('LGBM training is starting....')
    LGBMtrain.Train(df, BO)

if x == '2':
    with open('train//ctdict1.pkl', 'rb') as handle:
        cvsdict = pickle.load(handle)
    print('loading preprocessed data is starting....')
    df = pd.read_csv('train//df_application_train_new1.csv', dtype=cvsdict)
    for c in df:
        print(c)
    print(df.shape)
    print('feature engineering is starting....')
    df = feature_engineering.FeatEngin(df)
    df = utils.clean(df)
    print('LGBM training is starting....')
    LGBMtrain.Train(df, BO)

if x == '3':
    with open('train//ctdict2.pkl', 'rb') as handle:
        cvsdict = pickle.load(handle)
    print('loading preprocessed data is starting....')
    df = pd.read_csv('train//df_application_train_new2.csv', dtype=cvsdict)
    for c in df:
        print(c)
    print(df.shape)
    df = utils.clean(df)
    print('LGBM training is starting....')
    LGBMtrain.Train(df, BO)
        clean_sentence = clean(sentence)
        for word in clean_sentence:
            freq[word] = freq.get(word, 0) + 1
        X.append(clean_sentence)
        Y_cog.append(mapping_cog[label_cog])
        Y_know.append(mapping_know[label_know])
'''

with codecs.open('datasets/BCLs_Question_Dataset.csv', 'r', encoding="utf-8") as csvfile:
    all_rows = csvfile.read().splitlines()[1:]
    csvreader = csv.reader(all_rows)  #csvreader = csv.reader(all_rows[:len(all_rows)*7//10])
    for row in csvreader:
        sentence, label_cog = row
        clean_sentence = clean(sentence)
        if (PREPARE_VOCAB):
            for word in clean_sentence:
                freq[word] = freq.get(word, 0) + 1
        X.append(clean_sentence)
        Y_cog.append(mapping_cog[label_cog])
        Y_know.append(1)  # TODO: Label

domain_keywords = pickle.load(open('resources/domain.pkl', 'rb'))
for key in domain_keywords:
    for word in domain_keywords[key]:
        freq[word] = freq.get(word, 0) + 1
        X.append([word])
        Y_cog.append(mapping_cog[key])
def run(arguments): mode = arguments.mode chainix = arguments.chain_index tend = arguments.npoints start = 0 move_every = arguments.save_interval outdir = arguments.output inpath = arguments.input devnull = '>& /dev/null' lastaccepted = {} #select mode, use argparse for this move = 1 #run the thing #mode 1: start from new point if mode == "new": outname = "pMSSM_MCMC_" + str(chainix) + "_" + str(start) + "to" + str( min(start + move_every, start + tend)) + ".root" outroot = TFile(outname, "recreate") outtree = TTree("mcmc", "mcmc") setup_tree(outtree) tree_branches["chain_index"][0] = chainix finite_lh = False signchoice = random.randint(0, 7) while not finite_lh: utils.clean() spnerr = False while not spnerr: #find a viable point utils.clean() candidate = generate_point( signchoice=signchoice) #generate a point from flat prior spnin = utils.write_spheno_input( candidate) #write the input for spheno spnerr = run_spheno( spnin, devnull) #run spheno, check if viable point if not run_feynhiggs( '>& /dev/null'): #run feynhiggs, replace higgs sector continue os.system("cp SPheno.spc mmgsin.slha") mmgs_obs = run_micromegas( slhapath="mmgsin.slha" ) #micromegas seems to consume the input file?!?! os.system("mv mmgsin.slha SPheno.spc") print "getting the stuff from the slha file" observables = get_observables( slhapath="SPheno.spc") #get observables for the likelihood siso_obs = run_superiso("SPheno.spc") if siso_obs == -1: continue for obs in siso_obs: observables[obs] = siso_obs[obs] siso_chi2_obs = run_superiso_chi2("SPheno.spc") for obs in siso_chi2_obs: observables[obs] = siso_chi2_obs[obs] for obs in mmgs_obs: observables[obs] = mmgs_obs[obs] _l = likelihood.get_likelihood(observables) #get likelihood finite_lh = _l != 0 lastaccepted["likelihood"] = _l lastaccepted["iteration_index"] = 1 lastaccepted["accepted_index"] = 1 lastaccepted["chain_index"] = chainix lastaccepted["superiso_chi2_stdout"] = observables[ "superiso_chi2_stdout"]["value"] lastaccepted["superiso_stdout"] = observables["superiso_stdout"][ "value"] lastaccepted["chi2"] = observables["chi2"]["value"] lastaccepted["chi2_ndf"] = observables["chi2_ndf"]["value"] lastaccepted["micromegas_stdout"] = observables["micromegas_stdout"][ "value"] lastaccepted["ztoinv_excluded"] = observables["ztoinv_excluded"][ "value"] lastaccepted["lep_excluded"] = observables["lep_excluded"]["value"] lastaccepted["masslim"] = observables["masslim"]["value"] lastaccepted["omegah2"] = observables["omegah2"]["value"] lastaccepted["omegaxf"] = observables["omegaxf"]["value"] #write point to root, start loop for obs in observables.keys(): lastaccepted[obs] = observables[obs]["value"] lastaccepted = prepare_fill( lastaccepted, outtree) #add the rest of the point info, fill the tree branches outtree.Fill() #mode 2: continue from previous point/root file? 
elif mode == "resume": lastaccepted = utils.get_point_from_rootfile(inpath, chainix) start = lastaccepted["iteration_index"] + 1 if mode == "resume": outname = "pMSSM_MCMC_" + str(chainix) + "_" + str(start) + "to" + str( min(start + move_every, start + tend)) + ".root" print "Creating file " + outname outroot = TFile(outname, "recreate") outtree = TTree("mcmc", "mcmc") setup_tree(outtree) tree_branches["chain_index"][0] = chainix #run print "reached run loop" for iter_ix in range(start, start + tend + 1): print iter_ix if move == move_every - 1 and iter_ix < start + tend - 1: outtree.BuildIndex("chain_index", "iteration_index") outtree.Write() outroot.Close() print "Made " + str( move_every) + " iterations, moving " + outname + " to storage" os.system(" ".join(["mv", outname, outdir])) outname = "pMSSM_MCMC_" + str(chainix) + "_" + str( iter_ix) + "to" + str( min(iter_ix + move_every, start + tend + 1)) + ".root" print "Creating file " + outname outroot = TFile(outname, "recreate") outtree = TTree("mcmc", "mcmc") setup_tree(outtree) move = -1 finite_lh = False while not finite_lh: utils.clean() spnerr = False while not spnerr: #find a viable point utils.clean() candidate = generate_point( lastaccepted) #generate a point from the last point spnin = utils.write_spheno_input( candidate) #write the input for spheno spnerr = run_spheno( spnin, devnull) #run spheno, check if viable point if not run_feynhiggs( '>& /dev/null'): #run feynhiggs, replace higgs sector continue os.system("cp SPheno.spc mmgsin.slha") mmgs_obs = run_micromegas(slhapath="mmgsin.slha") os.system("mv mmgsin.slha SPheno.spc") observables = get_observables( slhapath="SPheno.spc") #get observables for the likelihood siso_obs = run_superiso("SPheno.spc") if siso_obs == -1: continue for obs in siso_obs: observables[obs] = siso_obs[obs] siso_chi2_obs = run_superiso_chi2("SPheno.spc") for obs in siso_chi2_obs: observables[obs] = siso_chi2_obs[obs] for obs in mmgs_obs: observables[obs] = mmgs_obs[obs] _l = likelihood.make_decision( observables, lastaccepted["likelihood"]) #get likelihood finite_lh = _l != 0 if _l < 0: move += 1 if iter_ix == start + tend: print "Made all " + str( tend) + " iterations, moving " + outname + " to storage" outtree.BuildIndex("chain_index", "iteration_index") outtree.Write() outroot.Close() os.system(" ".join(["mv", outname, outdir])) continue #point was not accepted lastaccepted["likelihood"] = _l lastaccepted["iteration_index"] = iter_ix lastaccepted["accepted_index"] = lastaccepted["accepted_index"] + 1 lastaccepted["chain_index"] = chainix lastaccepted["superiso_chi2_stdout"] = observables[ "superiso_chi2_stdout"]["value"] lastaccepted["superiso_stdout"] = observables["superiso_stdout"][ "value"] lastaccepted["chi2"] = observables["chi2"]["value"] lastaccepted["chi2_ndf"] = observables["chi2_ndf"]["value"] lastaccepted["micromegas_stdout"] = observables["micromegas_stdout"][ "value"] lastaccepted["ztoinv_excluded"] = observables["ztoinv_excluded"][ "value"] lastaccepted["lep_excluded"] = observables["lep_excluded"]["value"] lastaccepted["masslim"] = observables["masslim"]["value"] lastaccepted["omegah2"] = observables["omegah2"]["value"] lastaccepted["omegaxf"] = observables["omegaxf"]["value"] #write point to root, start loop lastaccepted = prepare_fill( lastaccepted, outtree) #add the rest of the point info, fill the tree branches outtree.Fill() if iter_ix == start + tend: print "Made all " + str( tend) + " iterations, moving " + outname + " to storage" outtree.BuildIndex("chain_index", 
"iteration_index") outtree.Write() outroot.Close() os.system(" ".join(["mv", outname, outdir])) move += 1
    }
}

rows_en = soup_en.find_all('tr')
for row in rows_en:
    cells = row.find_all('td')
    if len(cells) != 3:
        print('ERROR IN CELL COUNT')
        print(cells)
        continue
    if cells[0] is not None:
        if cells[0].text.startswith('South') or cells[0].text.startswith('Sudan'):
            continue
        numerical = utils.clean(cells[0].text)
        name = utils.clean(cells[1].text)
        alpha3 = utils.clean(cells[2].text)
        if alpha3.startswith('ISO ALPHA-3'):
            # skip first row of column headers
            print('SKIPPING', numerical, name, alpha3)
            continue
        iso3166.update({
            numerical: {
                'ISO3166-1-numeric': numerical,
                'official_name_en': name,
                'ISO3166-1-Alpha-3': alpha3
            }
        })

# fetch French
model = Sequential()
# dense hidden layers on 64 input features, single sigmoid output
model.add(Dense(12, input_dim=64, activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(6, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# clean data and prepare to be passed through the model
df = clean('data/1year.arff')
trainX, trainY, testX, testY = prepare_dataset(df)
model.fit(trainX, trainY, epochs=150, batch_size=10)

predictions = model.predict(testX)
print(predictions[0:5])
for i in range(len(predictions)):
    if predictions[i][0] > 0.5:
        predictions[i] = 1
    else:
        predictions[i] = 0
correct = 0
def scale_fets(dat, scl_grps, standardize=False):
    """ Returns a copy of dat with the columns normalized.

    If standardize is True, then the scaling groups are normalized to a mean of
    0 and a variance of 1. If standardize is False, then the scaling groups are
    normalized to the range [0, 1].

    Also returns an array of shape (dat_all[0].shape[1], 2) where row i contains
    the scaling parameters of column i in dat. If standardize is True, then the
    scaling parameters are the mean and standard deviation of that column's
    scaling group. If standardize is False, then the scaling parameters are the
    min and max of that column's scaling group.
    """
    fets = dat.dtype.names
    assert fets is not None, \
        f"The provided array is not structured. dtype: {dat.dtype.descr}"
    assert len(scl_grps) == len(fets), \
        f"Invalid scaling groups ({scl_grps}) for dtype ({dat.dtype.descr})!"

    # Determine the unique scaling groups.
    scl_grps_unique = set(scl_grps)
    # Create an empty array to hold the min and max values (i.e.,
    # scaling parameters) for each scaling group.
    scl_grps_prms = np.empty((len(scl_grps_unique), 2), dtype="float64")
    # Function to reduce a structured array.
    rdc = (lambda fnc, arr: fnc(
        np.array([fnc(arr[fet]) for fet in arr.dtype.names if fet != ""])))

    # Determine the min and the max of each scaling group.
    for scl_grp in scl_grps_unique:
        # Determine the features in this scaling group.
        scl_grp_fets = [
            fet for fet_idx, fet in enumerate(fets) if scl_grps[fet_idx] == scl_grp
        ]
        # Extract the columns corresponding to this scaling group.
        fet_values = dat[scl_grp_fets]
        # Record the min and max of these columns.
        scl_grps_prms[scl_grp] = [
            np.mean(utils.clean(fet_values)) if standardize else rdc(np.min, fet_values),
            np.std(utils.clean(fet_values)) if standardize else rdc(np.max, fet_values)
        ]

    # Create an empty array to hold the min and max values (i.e.,
    # scaling parameters) for each column (i.e., feature).
    scl_prms = np.empty((len(fets), 2), dtype="float64")
    # Create an empty array to hold the rescaled features.
    new = np.empty(dat.shape, dtype=dat.dtype)
    # Rescale each feature based on its scaling group's min and max.
    for fet_idx, fet in enumerate(fets):
        # Look up the parameters for this feature's scaling group.
        prm_1, prm_2 = scl_grps_prms[scl_grps[fet_idx]]
        # Store this min and max in the list of per-column scaling parameters.
        scl_prms[fet_idx] = np.array([prm_1, prm_2])
        fet_values = dat[fet]
        if standardize:
            # prm_1 is the mean and prm_2 is the standard deviation.
            scaled = (
                # Handle the rare case where the standard deviation is
                # 0 (meaning that all of the feature values are the
                # same), in which case return an array of zeros.
                np.zeros(fet_values.shape, dtype=fet_values.dtype)
                if prm_2 == 0 else (fet_values - prm_1) / prm_2)
        else:
            # prm_1 is the min and prm_2 is the max.
            scaled = (
                # Handle the rare case where the min and the max are
                # the same (meaning that all of the feature values are
                # the same).
                np.zeros(fet_values.shape, dtype=fet_values.dtype)
                if prm_1 == prm_2 else utils.scale(
                    fet_values, prm_1, prm_2, min_out=0, max_out=1))
        new[fet] = scaled
    return new, scl_prms
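# A small usage sketch for scale_fets() above, assuming a structured array with
# one scaling group per column and that utils.clean()/utils.scale() behave as in
# the surrounding module (flatten to a plain array / linear rescale):
dat = np.array([(1.0, 10.0), (2.0, 20.0), (3.0, 30.0)],
               dtype=[("a", "float64"), ("b", "float64")])
minmax, prms = scale_fets(dat, scl_grps=[0, 1])                      # each column to [0, 1]
standardized, mean_std = scale_fets(dat, scl_grps=[0, 1], standardize=True)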
def main(_):
    FLAGS.train = True
    FLAGS.clean = True
    clean(FLAGS)
    train()
def text_analytics_cmds(project, projectLang, grepStack, searchKey): """ :param project: project name :param projectLang: primary language of project :param searchKey: build or test :param grepStack: list of file contents each as a string to analyze :return: A build command :rtype str """ proximity_threshold = 10 # We insist that the command that we extract be within # x lines after encountering the word 'build' or 'text'. # Need to factor in HTML syntax. max_lines = 5 # Limit the number of lines (i.e. individual commands) max_words = 16 # Limit the number of words we allow per individual command max_chars = 128 # Limit the number of characters per individual command logger.debug("In text_analytics_cmds, project=%s, searchKey=%s, cnt grepStack[]=%s" % (project, searchKey, str(len(grepStack)))) # List most common build and test commands first commands = ['mvn ', 'ant ', 'npm ', 'grunt ', 'python ', 'py.test ', 'cd ', 'gem ', 'rake ', 'build.sh ', 'bootstrap.sh ', 'autogen.sh ', 'autoreconf ', 'automake ', 'aclocal ', 'scons ', 'sbt ', 'cmake ', 'gradle ', 'bundle ', 'perl ', 'php '] # List words that, if appearing in front of a command, indicate that it's # descriptive text, not a build command english = ['the', 'a', 'an', 'is', 'are', 'can', 'you', 'of', 'in', 'from', 'this', 'to', 'that', 'when', 'should', 'might'] # Symbols used as prompts promptsStr = '$#>%' # Symbols in commands we don't allow. Not a proper list. We have limitations such # as $VAR to prevent expanded environment variables which we don't support yet noContainsStr = '$' # Commands don't end with these characters. noEndsWithStr = ':[]().,' retval = [] for fstr in grepStack: build_found = False build_line_number = 0 lines = 0 for idx, line in enumerate(fstr.splitlines()): line = line.lower() if searchKey in line: build_found = True build_line_number = idx if build_found and\ idx - build_line_number < proximity_threshold: if len(line): logger.debug("text_analytics_cmds, idx=%s scanning line=%s" % (str(idx), line)) isText = False for word in english: if word in line.split(' '): isText = True break if isText or any(x in line for x in noContainsStr): continue lastChar = line[len(line) - 1] # check for command prompt symbols at beginning of line if line.lstrip()[0] in promptsStr: line = line.lstrip()[1:].lstrip() for command in commands: # TODO: validate start of command line if len(line.split(' ')) <= max_words and\ lines <= max_lines and\ not lastChar in noEndsWithStr: # Commands don't end with ':[]()' if command in line: retval.append(utils.clean(line)) lines = lines + 1 break elif (projectLang == 'C' or projectLang == 'C++' or projectLang == 'Perl') and\ 'make' in line: retval.append(utils.clean(line)) lines = lines + 1 break else: build_found = False if len(retval): break # If you find a build command sequence in one file, don't search other files return ';'.join(retval)
def main(): # ----------------------------------------------------------------------------------- # Adjustable Parameters parser = argparse.ArgumentParser() parser.add_argument('--train', action='store_true', help='training or scoring') parser.add_argument('--inputfile', type=str, help='input data file name') parser.add_argument('--outputfile', type=str, help='output prediction file name') args = parser.parse_args() # directory for the input data and output prediction: DATA_DIR = 'data' OUTPUT_DIR = 'output' # columns used: CAT_COLS = [ 'Auction', 'Transmission', 'WheelType', 'Nationality', 'Size', 'TopThreeAmericanName', 'IsOnlineSale' ] NUM_COLS = [ 'VehicleAge', 'VehOdo', 'VehBCost', 'WarrantyCost', 'MMRCurrentAuctionAveragePrice', 'MMRAcquisitionAuctionAveragePrice', 'MMRCurrentAuctionCleanPrice', 'MMRAcquisitionAuctionCleanPrice', 'MMRCurrentRetailAveragePrice', 'MMRAcquisitionRetailAveragePrice', 'MMRCurrentRetailCleanPrice', 'MMRAcquisitonRetailCleanPrice' ] DATE_COLS = ['PurchDate'] LABEL_COL = 'IsBadBuy' IDS_COL = 'RefId' # current time for computing recency feature NOW = '2010-12-31' # modeling step: # model checkpoint for future scoring MODEL_DIR = 'model' CHECKPOINT_XGB = 'xgb.pkl' CHECKPOINT_PREPROCESS = 'preprocess.pkl' # parameter that only relevant for training stage and not scoring if args.train: # number of cross validation and hyperparameter settings to try CV = 10 N_ITER = 5 MODEL_RANDOM_STATE = 4321 # train/validation stratified split VAL_SIZE = 0.1 TEST_SIZE = 0.1 SPLIT_RANDOM_STATE = 1234 # ----------------------------------------------------------------------------------- logger.info('preprocessing') checkpoint_preprocess = os.path.join(MODEL_DIR, CHECKPOINT_PREPROCESS) checkpoint_xgb = os.path.join(MODEL_DIR, CHECKPOINT_XGB) input_path = os.path.join(DATA_DIR, args.inputfile) if args.train: data = clean(input_path, NOW, CAT_COLS, NUM_COLS, DATE_COLS, IDS_COL, LABEL_COL) ids = data[IDS_COL].values label = data[LABEL_COL].values data = data.drop([IDS_COL, LABEL_COL], axis=1) # train/test split twice to achieve train/validation/test three way split df_train, df_test, y_train, y_test, ids_train, ids_test = train_test_split( data, label, ids, test_size=TEST_SIZE, random_state=SPLIT_RANDOM_STATE, stratify=label) df_train, df_val, y_train, y_val, ids_train, ids_val = train_test_split( df_train, y_train, ids_train, test_size=VAL_SIZE, random_state=SPLIT_RANDOM_STATE, stratify=y_train) # obtain finalized columns num_cols_cleaned = list( SortedSet(df_train.columns) - SortedSet(CAT_COLS)) preprocess = Preprocesser(num_cols=num_cols_cleaned, cat_cols=CAT_COLS) X_train = preprocess.fit_transform(df_train) X_val = preprocess.transform(df_val) X_test = preprocess.transform(df_test) logger.info('modeling') eval_set = [(X_train, y_train), (X_val, y_val)] xgb_tuned = build_xgb(N_ITER, CV, MODEL_RANDOM_STATE, eval_set) xgb_tuned.fit(X_train, y_train) if not os.path.isdir(MODEL_DIR): os.mkdir(MODEL_DIR) dump(preprocess, checkpoint_preprocess) dump(xgb_tuned, checkpoint_xgb) # model evaluation metric reporting y_pred = [] xgb_best = xgb_tuned.best_estimator_ zipped = zip(('train', 'validation', 'test'), (X_train, X_val, X_test), (y_train, y_val, y_test)) for name, X, y in zipped: xgb_pred = xgb_best.predict_proba( X, ntree_limit=xgb_best.best_ntree_limit)[:, 1] score = round(roc_auc_score(y, xgb_pred), 3) logger.info('{} AUC: {}'.format(name, score)) y_pred.append(xgb_pred) ids = np.hstack((ids_train, ids_val, ids_test)) y_pred = np.hstack(y_pred) else: data = clean(input_path, NOW, 
CAT_COLS, NUM_COLS, DATE_COLS, IDS_COL) ids = data[IDS_COL].values data = data.drop(IDS_COL, axis=1) logger.info('scoring') preprocess = load(checkpoint_preprocess) xgb_tuned = load(checkpoint_xgb) X = preprocess.transform(data) xgb_best = xgb_tuned.best_estimator_ y_pred = xgb_best.predict_proba( X, ntree_limit=xgb_best.best_ntree_limit)[:, 1] if not os.path.isdir(OUTPUT_DIR): os.mkdir(OUTPUT_DIR) output_path = os.path.join(OUTPUT_DIR, args.outputfile) write_output(ids, IDS_COL, y_pred, LABEL_COL, output_path)
def _load_data(self) -> None:
    print('Loading and preprocessing data...')
    self._train_data = pd.read_csv('data/train.csv')
    self._test_data = pd.read_csv('data/test.csv')
    self._train_data[TEXT] = [clean(s) for s in self._train_data[TEXT]]
    self._test_data[TEXT] = [clean(s) for s in self._test_data[TEXT]]
import dictionaries
import beatDetection
import chordPrediction
import midiConversion
import midiFileCreation
from flask import Flask, render_template, request, url_for, flash, send_from_directory, redirect
from werkzeug.utils import secure_filename

# Initialize the Flask application
UPLOAD_FOLDER = cwd + '/static/wav'
ALLOWED_EXTENSIONS = set(['wav', 'WAV'])
app = Flask(__name__, template_folder='www')
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
utils.clean(UPLOAD_FOLDER + "/")
webbrowser.open_new("http://*****:*****")


@app.route('/')
def form():
    return render_template('index.html', filename='Browse for file...', disabled_radio="true")


def allowed_file(filename):
    return '.' in filename and \
        filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS


@app.route('/', methods=['POST'])
def upload_file():
    if request.method == 'POST':
        file = request.files['file']
            if row[i] == None:
                row[i] = float(region_data[region][i - 2])

    header_ = header
    if type(header[0]) == list:
        header_ = [
            header[0][i] + " -- " + header[1][i] for i in range(len(header[0]))
        ]
    for row in data:
        if check_transform_condition(category, path):
            for i, field in enumerate(row[2:]):
                col = category + " | " + header_[i + 2] + " -- " + row[1 - sti]
                polity_data[row[sti]][col] = clean(field)
        else:
            for i, field in enumerate(row[2:]):
                col = category + " | " + header_[i + 2]
                polity_data[row[sti]][col] = clean(field)

    if category == "Economy":
        data = header + data
        data = map(None, *data)
        header = data[0]
        data = [list(row) for row in data[2:]]

    with open("data/datagov-clean/" + path, "wb") as csvfile:
        writer = csv.writer(csvfile, delimiter=",", quotechar="\"",
def clean(self):
    if self.clear_title is None and self.clear_text is None:
        self.clear_title = clean(self.title)
        self.clear_text = clean(self.text)
    return self.clear_title + ' ' + self.clear_text
    #continue
    for section in re.split("\n\-{5,}\n", plain_text):
        section = "\n" + section
        for disease_name_match in DISEASE_STATUS_RE.finditer(section):
            if disease_name_match:
                #print(re.match(r"\s*High Priority Event Updates", section))
                [disease_name, location, status] = disease_name_match.groups()
                disease_name = re.sub(r"\*", "", disease_name).strip()
                for possible_status in "Worsening|No Change|Improving|Undetermined".split('|'):
                    if possible_status.lower() in status.lower():
                        status = possible_status
                        break
                disease_to_metadata[disease_name] = {
                    'location': clean(location),
                    'status': status
                }
    db.nbic.insert({
        'file': file_path,
        'date': date,
        'diseases': [{
            'nameUsed': disease,
            'metadata': metadata,
            'resolved': lookup_disease(disease)
        } for disease, metadata in disease_to_metadata.items()],
    })
    total_disease_topics = 0
import utils, re, time, code

pats = [
    (re.compile('ate (?P<hour>\d\d):(?P<minute>\d\d):(?P<second>\d\d)'), True),
    (re.compile('http://maps.google.com/maps.q=(?P<latitude>[-.0-9]+),(?P<longitude>[-.0-9]+)'), True),
    (re.compile('^Um (?P<pokemon_name>.+) selvagem'), False),
    (re.compile('^Detectada raid de (?P<raid_pokemon_name>[^!]+)!'), False)]

while True:
    for e in utils.read_raw_events():
        print(e)
        if 'text' in e['evt']:
            t = e['evt']['text']
            evt = {'btg': e['evt']}
            for p, c in pats:
                m = p.search(t)
                if m:
                    evt.update(m.groupdict())
                elif c:
                    break
            else:
                if ('pokemon_name' in evt) or ('raid_pokemon_name' in evt):
                    utils.save_event(evt)
                else:
                    print('failed msg' + t)
        utils.close_raw_event(e)
    utils.clean()
    time.sleep(2)
def main(_):
    # evaluate_line()
    clean(FLAGS)
    train()
def star_qsub(job):
    """
    Submit a STAR job to the cluster through qsub (via star_sub.sh).

    input: job - string, "indexing" or "mapping"; choose the job you want to run.
    outputs:
        indexing - genome index files in the genome folder
        mapping - sam and log files in the star_outputs folder
    """
    paths = RnaSeqPath()

    # STAR genome indexing job
    if job == "indexing":
        # create the shell file
        shell_file = generate_bash_file(
            job_name="star_indexing",
            threads=4,
            out_log="star_indexing.out",
            err_log="star_indexing.err",
            commands=[
                "module load star",
                "module load python/3.6.4",
                "python3 {} indexing".format(os.path.join(paths.scripts, 'star.py'))
            ])
        # submit the shell file to hpc
        qsub(shell_file)

    # STAR RNA-seq alignment job
    if job == "mapping":
        try:
            os.mkdir(paths.star_outputs)
        except IOError:
            pass
        # shell commands
        commands = [
            "module load star",
            "module load python/3.6.4",
            "python3 {} mapping $SGE_TASK_ID".format(os.path.join(paths.scripts, 'star.py'))
        ]
        # calculate job number based on the trimmomatic outputs
        n_jobs = int(
            len(glob(os.path.join(paths.trimmomatic_outputs, "*.cleaned.fastq"))) / 2)
        # create shell file
        shell_file = generate_bash_file(job_name="star_mapping",
                                        mem_free="35G",
                                        threads=8,
                                        job_arr=n_jobs,
                                        commands=commands)
        # submit shell file to hpc
        qsub(shell_file)
        # clean temp files
        qsub(clean(after="star_mapping"))