Example #1
0
def register(username, password):
	"""Registra el usuario y clave, devuelve:
		0 si se registro exitosamente
		1 si el usuario ya existe
		2 si hubo otro problema
	"""
	username = utils.clean(username)
	password = utils.clean(password)

	# check whether the user exists (a bit ugly, but it works for now; we rely
	# on get_personal raising an exception if the user does not exist)
	p = None
	try:
		p = get_personal(username)
	except:
		pass
	if p != None:
		return 1

	p = Personal()
	p.username = username
	p.password = password

	try:
		p.save()
	except:
		return 2
	return 0
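
The docstring above documents register()'s return codes; a minimal, hypothetical call site (assuming the function above is importable as shown) might handle them like this:

result = register("alice", "s3cret")
if result == 0:
    print("registered successfully")
elif result == 1:
    print("user already exists")
else:
    print("registration failed for another reason")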
Example #2
def get_data():
    path = '/Users/zyzdiana/Dropbox/vNav_Test_Data/Apr_17_test_data/'
    dict_10mm = {}
    dict_6_4mm = {}
    dict_8mm = {}
    for root, dirs, files in os.walk(path):
        if len(dirs)==0:
            if('10mm' in root):
                dict_10mm[root] = clean(files)
            if('6_4mm' in root):
                dict_6_4mm[root] = clean(files)
            if('8mm' in root):
                dict_8mm[root] = clean(files)

    list_10mm = []
    for item in dict_10mm.iteritems():
        list_10mm.append(os.path.join(item[0],item[1][0]))
    list_10mm.sort()
    vols_10mm = get_volume(list_10mm, 26)
    list_6_4mm = []
    for item in dict_6_4mm.iteritems():
        list_6_4mm.append(os.path.join(item[0],item[1][0]))
    list_6_4mm.sort()
    vols_6_4mm = get_volume(list_6_4mm, 40)
    list_8mm = []
    for item in dict_8mm.iteritems():
        list_8mm.append(os.path.join(item[0],item[1][0]))
    list_8mm.sort()
    vols_8mm = get_volume(list_8mm, 32)
    return vols_6_4mm, vols_8mm, vols_10mm
Example #3
def get_data_all():
    path = '/Users/zyzdiana/Dropbox/vNav_Test_Data/Apr_17_test_data/'
    dict_10mm = {}
    dict_6_4mm = {}
    dict_8mm = {}
    for root, dirs, files in os.walk(path):
        if len(dirs)==0:
            if('10mm' in root):
                dict_10mm[root] = clean(files)
            if('6_4mm' in root):
                dict_6_4mm[root] = clean(files)
            if('8mm' in root):
                dict_8mm[root] = clean(files)
    keys_10 = sorted(dict_10mm.keys())
    keys_8 = sorted(dict_8mm.keys())
    keys_6_4 = sorted(dict_6_4mm.keys())
    all_10mm = {}
    all_8mm = {}
    all_6_4mm = {}
    for i in xrange(5):
        all_10mm[idx_to_key(i, keys_10)] = []
        all_8mm[idx_to_key(i, keys_8)] = []
        all_6_4mm[idx_to_key(i, keys_6_4)] = []
        for j in xrange(5):
            # 10mm
            path = os.path.join(idx_to_key(i, keys_10),dict_10mm[idx_to_key(i, keys_10)][j])
            all_10mm[idx_to_key(i, keys_10)].append(get_volume_1(path, 26))
            # 8mm
            path = os.path.join(idx_to_key(i, keys_8),dict_8mm[idx_to_key(i, keys_8)][j])
            all_8mm[idx_to_key(i, keys_8)].append(get_volume_1(path, 32))
            # 6.4mm
            path = os.path.join(idx_to_key(i, keys_6_4),dict_6_4mm[idx_to_key(i, keys_6_4)][j])
            all_6_4mm[idx_to_key(i, keys_6_4)].append(get_volume_1(path, 40))
    return all_10mm, all_8mm, all_6_4mm
Example #4
0
def main(_):

    if FLAGS.train:
        if FLAGS.clean:
            clean(FLAGS)
        train()
    else:
        evaluate_line()
Example #5
0
def install():
    utils.clean()
    peg.install()
    fcgi.install()
    json_glib.install()
    discount.install()
    balde.install()
    bluster.install()
    utils.clean()
Example #6
0
def register(username, password):
	"""Registra el usuario y clave, devuelve:
		0 si se registro exitosamente
		1 si el usuario ya existe
		2 si hubo otro problema
	"""
	# TODO: if someone abuses this, we need to build a mechanism to
	# prevent DoS (it is not very complicated either).

	username = utils.clean(username)
	password = utils.clean(password)
	return personal.register(username, password)
Example #7
0
	def load_words(self, folderpath):
		filenames = glob.glob(folderpath)
		for f in filenames:
			with open(f, 'r') as infile:
				text = infile.read()
				text = utils.clean(text)
				self.words.append([text])
Example #8
0
 def new(self):
     self.name = self.parse('company.name')
     self.suffix = self.fetch('company.suffix')
     self.website = "http://www.%s.%s" % (
         clean(self.name),
         self.fetch('internet.domain_suffix')
         )
Example #9
0
def process_element(country):
    currency_dict = {}
    for currency_tag in country.iterchildren():
        # ignore newly added additional info field
        if currency_tag_map[currency_tag.tag] ==\
                "ISO4217-currency_additional_info":
            break
        # skip 'same day', 'next day', etc variations
        elif (currency_tag_map[currency_tag.tag] == "ISO4217-currency_name")\
                and (len(currency_tag.items()) > 0):
            if currency_tag.items()[0][0] == 'IsFund':
                break
        else:
            currency_dict.update({
                currency_tag_map[currency_tag.tag]: currency_tag.text})
            currency_numeric = None
            # remove random line breaks, etc
            currency_name = utils.clean(currency_dict['ISO4217-currency_country_name'])
            if currency_name is not None:
                # replace name with line breaks, etc removed
                currency_dict['ISO4217-currency_country_name'] = currency_name
            try:
                currency_numeric = en_names[currency_name]
            except KeyError:
                mapped_name = currency_country_name_map.get(currency_name)
                if mapped_name is not None:
                    currency_numeric = en_names.get(mapped_name.upper())

            if currency_numeric:
                country_info[currency_numeric].update(currency_dict)
            else:
                print('Failed to match currency data for country: "%s"'
                        % currency_name)

    return
Example #10
0
    def __init__(self, schema, output_dir=None):
        ''' Bind array of cleaned schema file lines to validator object. ''' 

        self.schema = utils.clean(schema)
        self.output_dir = output_dir
        self.indent_size = self._find_first_indent()['indent_size']
        self.error = {'msg': None}
Example #11
0
 def process_text(self):
     text = utils.clean(self.get_tweet_text())
     self.set_tweet_text(text)
     self.set_tweet_source(utils.parse_alink(self.get_tweet_source()))
     if self.translation:
         self.detect_language_or_translate()
     self.filter_text()
Example #12
0
def listar_radios(name,url):
    link= clean(abrir_url(url))
    radios=re.compile('<td><a href="/portalradio/conteudos/ficha/.+?radio_id=(.+?)">(.+?)</a></td><td>(.+?)</td>.+?<td align="center">').findall(link)
    for idradio,nomeradio,concelho in radios:
        addDir('[B]'+nomeradio+'[/B] ('+concelho+')',RadiosURL + 'Sintonizador/?radio_id=' + idradio + '&scope=0',21,'http://www.radio.com.pt/APR.ROLI.WEB/Images/Logos/'+ idradio +'.gif',len(radios),'',False)
    xbmc.executebuiltin("Container.SetViewMode(501)")
    paginasradios(url,link)
Example #13
0
def rate(classifier,filename,medi,mad,medi_ld,mad_ld):
	#For now, just assuming the text is csv
	results = classifier
	with open(filename,'rU') as f:
		reader = csv.reader(f,delimiter=',')
		data = [(get_features(tech.clean(row[2]),medi,mad,medi_ld,mad_ld),row[0]) for row in reader]
	print nltk.classify.accuracy(classifier,data)
Example #14
0
    def eligibility(self):
        try:
            table = utils.Table(self.soup.select("table[summary~=Land]")[0])
        except IndexError:
            return None

        return utils.clean(table["Eligibility"].text.strip())
Example #15
0
	def post(self):
		db=self.application.database
		content = self.request.arguments.get("content", [""])[0]
		parent = self.request.arguments.get("parent", [None])[0]
		super_parent = self.request.arguments.get("super_parent", [None])[0]

		content = clean(content)
		
		new_comment = {
			"content" : content,
			"time" : datetime.utcnow(),
			"author" : self.get_current_user(),
			"plusvote": list(),
			"minusvote": list()
		}

		if parent != None:
			new_comment["parent"] = parent
		if super_parent != None:
			new_comment["super_parent"] = super_parent
		new_id = db.comments.insert(new_comment)
		print new_id
		print parent
		print db.comments.update({"_id": ObjectId(parent)}, {"$push": {"children": str(new_id)}})
		self.redirect("/")
Example #16
0
def listasextras():
    iptvurl='http://01.gen.tr/HasBahCa_IPTV/'
    link= clean(abrir_url(iptvurl))
    streams=re.compile('<a class="autoindex_a" href="./(.+?)">.+?<td class="autoindex_td_right">.+?</td.+?td class="autoindex_td_right">(.+?)</td>').findall(link)
    for nomepasta,act in streams:
        if re.search('.m3u',nomepasta):
            titulo=nomepasta.replace('.m3u','').replace('_',' ').title()
            addDir("[B]%s[/B] (act.%s)" % (titulo,act[2:-2]),iptvurl + nomepasta,5,tvporpath + art + 'listas-ver2.png',1,'',True)
Example #17
0
    def findName(self, code):

        s = code.find("def")+len("def")
        e = code.find("(")

        name = code[s:e]

        return clean(name)
Example #18
0
def radiosobterurlstream(name,url):
    #GA("None","Radio - " + name)
    mensagemprogresso.create('TV Portuguesa','A carregar...')
    mensagemprogresso.update(0)
    if re.search('www.radios.pt',url):
        link=abrir_url(url)
        try:
            endereco=re.compile('<param name="url" value="(.+?)"').findall(link)[0]
        except:
            xbmc.executebuiltin("XBMC.Notification(Fightnight Music,Não é possível ouvir esta rádio.,'500000',)")
            return
        idradio=url.replace('http://www.radios.pt/portalradio/Sintonizador/?radio_id=','').replace('&scope=0','')
        thumbnail='http://www.radio.com.pt/APR.ROLI.WEB/Images/Logos/'+ idradio +'.gif'
    else:
        urlfinal='http://www.radioonline.com.pt/ajax/player.php?clear_s_name=' + url
        link= clean(abrir_url(urlfinal))
        try: player=re.compile('soundManager.createSound\({(.+?)autoLoad').findall(link)[0]
        except: player=False
        try:
            endereco=re.compile('url: "(.+?)"').findall(player)[0].replace(';','')
            if re.search('serverURL',player):
                rtmp=re.compile('serverURL: "(.+?)"').findall(player)[0]
                #rtmp=rtmp.replace('rtmp://195.23.102.206','rtmp://195.23.102.209') #tempfix
                rtmp=rtmp.replace(':1936','') #tempfix
                endereco=rtmp + ' playPath=' + endereco

        except:endereco=False
        if not endereco:
            try:endereco=re.compile('<param name="URL" value="(.+?)"').findall(link)[0]
            except:
                try: endereco=re.compile('<object data="(.+?)"').findall(link)[0]
                except: endereco=False

        if not endereco:
            xbmc.executebuiltin("XBMC.Notification(TV Portuguesa,Não é possível ouvir esta rádio.,'500000',)")
            mensagemprogresso.close()
            return

        try:thumbnail=re.compile('<img id="station-logo-player" src="(.+?)"').findall(link)[0]
        except: thumbnail=''
        if re.search('.asx',endereco):
            nomeasx='stream.asx'
            path = xbmc.translatePath(os.path.join(pastaperfil))
            lib=os.path.join(path, nomeasx)
            downloader(endereco,lib)
            texto= openfile(nomeasx)
            endereco = xbmc.PlayList(1)
            endereco.clear()
            streams=re.compile('<ref.+?"(.+?)"/>').findall(texto)
            for musica in streams:
                listitem = xbmcgui.ListItem(name, iconImage="DefaultVideo.png", thumbnailImage=thumbnail)
                listitem.setInfo("music", {"Title":name})
                endereco.add(musica,listitem)
        else: pass
    mensagemprogresso.close()
    listitem = xbmcgui.ListItem(name, iconImage="DefaultVideo.png", thumbnailImage=thumbnail)
    listitem.setInfo("music", {"Title":name})
    xbmc.Player().play(endereco,listitem)
Example #19
0
def radioslocais():
    link= clean(abrir_url(RadiosURL))
    #addDir('Pesquisar (exclui nacionais)',RadiosURL + '?distrito=0&concelho=0&tipo=0&text=',16,'',1,'',True)
    distritos=re.compile('id="DirectorioPesquisa1_ddlDistritos">(.+?)</select>').findall(link)[0]
    distritos=distritos.replace('<option value="0"></option>','<option value="0">Todos as rádios locais</option>')
    lista=re.compile('<option value="(.+?)">(.+?)</option>').findall(distritos)
    for iddistrito,nomedistrito in lista:
        addDir(nomedistrito,RadiosURL + '?distrito=' + iddistrito + '&concelho=0&tipo=0',24,'',len(lista),'',True)
    xbmc.executebuiltin("Container.SetViewMode(501)")
Example #20
0
def manage_addPingback(self, sourceTitle, sourceURI, sourceExcerpt):
    """ Add a pingback """
    from utils import isPingbackSpam

    if isPingbackSpam(sourceTitle, sourceURI, sourceExcerpt,
                      self.blogurl(), self.REQUEST):
        try:
            return self.REQUEST.RESPONSE.redirect('http://www.google.com')
        except:
            return 0

    id = self.createReferenceId()
    newTitle = clean(sourceTitle)
    newURI = clean(sourceURI)
    newExcerpt = clean(sourceExcerpt)
    pingback = Reference(id, newTitle, newURI, newExcerpt, self.getId())
    self._setObject(id, pingback)
    return 1
Example #21
0
def main(args):
    content_generator = load_file(args.transcript, encoding=args.encoding)
    rules = load_rules(args.rules, encoding=args.encoding)

    mapped = do_mapping(content_generator, rules)
    cleaned = clean(mapped)
    formatted = mlf_format_data(cleaned)

    save_file(args.output, formatted, encoding=args.encoding)
Example #22
0
def radios():
    addDir('[COLOR blue][B]Radios Locais[/B][/COLOR]','nada',20,tvporpath + art + 'radios-v1.png',1,'',True)
    addLink("",'','')
    link= clean(abrir_url(RadiosNacionaisURL))
    nacionais=re.compile('<div class="radiostation boxgrid">(.+?)</div>').findall(link)
    for radioindividual in nacionais:
        radiosnacionais=re.compile('<a href="http://www.radioonline.com.pt/#(.+?)".+?<img.+?src="(.+?)".+?alt="(.+?)"').findall(radioindividual)
        for idradio,imagemradio,nomeradio in radiosnacionais:
            nomeradio=nomeradio.replace('Radio ','')
            addDir(nomeradio,idradio,21,imagemradio,len(radiosnacionais),'',False)
Example #23
0
 def test_utils_clean(self):
     test_file = '/tmp/' + utils.test_name()
     self.assertFalse(os.path.exists(test_file))
     utils.run(['touch', test_file])
     self.assertTrue(os.path.exists(test_file))
     with utils.clean(['rm', test_file]):
         self.assertFalse(os.path.exists(test_file))
         utils.run(['touch', test_file])
         self.assertTrue(os.path.exists(test_file))
     self.assertFalse(os.path.exists(test_file))
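
The test above exercises utils.clean as a context manager that runs the given command on entry and again on exit. A minimal sketch of that behaviour (hypothetical; the real utils module may differ) could look like:

import contextlib
import subprocess

@contextlib.contextmanager
def clean(cmd):
    # Run the cleanup command when the block is entered...
    subprocess.call(cmd)
    try:
        yield
    finally:
        # ...and run it again when the block exits.
        subprocess.call(cmd)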
Example #24
0
def fetch(args):
    print 'Welcome to Twitter Spell Checking : Fetching !'
    CONFIG = ConfigParser.ConfigParser()
    CONFIG.read(args.config)

    settings = items_to_dict(CONFIG.items('twitter'))
    config = items_to_dict(CONFIG.items('namespace:%s' % args.namespace))
    api = twitter.Api(consumer_key=settings['consumer_key'], consumer_secret=settings['consumer_secret'], access_token_key=settings['access_token'], access_token_secret=settings['access_token_secret'])

    accounts = [account.replace(' ', '') for account in config['accounts'].split(',')]
    max_tweets_file = os.path.join(os.path.dirname(config['files']), 'max_tweets_%s.txt' % args.namespace)

    def save_max_tweets():
        open(max_tweets_file, 'w').write(json.dumps(max_tweets))

    if os.path.exists(max_tweets_file):
        max_tweets = json.loads(open(max_tweets_file).read())
    else:
        max_tweets = dict()

    print max_tweets_file
    f = open(os.path.join(config['files'], 'tweets_%s.txt' % args.namespace), 'a')
    for account in accounts:
        if account in max_tweets and max_tweets[account] > 0:
            retrieving = "new"
        else:
            retrieving = "old"
            page = 0
        while True:
            if retrieving == "new":
                print 'process %s since id %s' % (account, max_tweets[account])
                try:
                    tweets = api.GetUserTimeline(account, count=200, include_rts=False, since_id=max_tweets[account])
                except twitter.TwitterError, e:
                    print 'error : %s' % str(e)
                    tweets = []
            else:
                print 'process %s from zero, page %s' % (account, page)
                try:
                    tweets = api.GetUserTimeline(account, count=200, include_rts=False, page=page)
                except twitter.TwitterError, e:
                    print 'error : %s' % str(e)
                    tweets = []
            if tweets:
                for s in tweets:
                    if is_valid(s, account):
                        f.write(clean(s.text).lower().encode('UTF-8') + '\n')
                        if  account not in max_tweets or s.id > max_tweets[account]:
                            max_tweets[account] = s.id
                if retrieving == "old":
                    page += 1
                save_max_tweets()
            else:
                print 'no more tweets for %s' % account
                break
Example #25
0
def manage_addPost(self, title, author, body, tags=[],
                  date=DateTime.DateTime(), publish=1,
                  comment_allowed=1, not_clean=0, sendping=1, REQUEST=None):
    """ Called from ZMI when creating new posts """
    if not title and REQUEST is not None:
        return REQUEST.RESPONSE.redirect('%s/post?msg=%s' % (self.blogurl(), 'You must provide at least the title of the post'))

    newid = self.createId(title)
    newtitle = clean(title)
    newauthor = clean(author)
    if not_clean:
        newbody = body
    else:
        newbody = cleanBody(self, body)
    newtags = prepareTags(tags)
    newdate = DateTime.DateTime(date)

    while hasattr(self, newid):
        newid = self.createNewId(newid)

    post = Post(newid, newtitle, newauthor, newbody,
                newtags, newdate, publish, comment_allowed)

    self._setObject(str(newid), post)
    post = self.get(newid)

    if self.inCommunity():
        # We are in a Bitakora Community, so catalog the post there
        cat = self.getParentNode().get('Catalog', 'None')
        if cat is not None:
            cat.catalog_object(post, '/'.join(post.getPhysicalPath()))

    self.postcount = self.postcount + 1

    if sendping:
        tech_pings = Future(sendPing, self.absolute_url(), self.blog_title())
        pingbacks = Future(postPingBacks, newbody, post.absolute_url())

    if REQUEST is not None:
        return REQUEST.RESPONSE.redirect('%s/admin?msg=%s' % (self.absolute_url(), 'Post added succesfully'))

    return newid
Example #26
0
def todosact(parametro):
    LOLI=['<item>\n<title>Actualizado: ' + horaportuguesa(True).replace('%20',' ') + '</title>\n<link>nada</link>\n<thumbnail>nada</thumbnail>\n</item>']
    dialog = xbmcgui.Dialog()
    mensagemprogresso.create('TV Portuguesa', 'A criar lista.','Por favor aguarde...')
    if re.search('Lista Completa',parametro):
        canaison= openfile(('canaison'))
        canaison=canaison.replace('[','')
        lista=re.compile('B](.+?)/B]').findall(canaison)
        tamanhototal=int(len(lista))
        tamanho=int(-1)
        for nomes in lista:
            tamanho=tamanho+1
            tamanhoenviado=(tamanho*100)/tamanhototal
            print "Lista completa: Canal " + nomes
            global activadoextra
            activadoextra=[]
            SIM= request_servidores('ignore','[B]' + nomes + '[/B]',tamanho=tamanhoenviado)
            LOLI.append(SIM)
            AGORA='\n\n'.join(LOLI)
    else:
        SIM= request_servidores('ignore',parametro)
        LOLI.append(SIM)
        AGORA='\n\n'.join(LOLI)

    mensagemprogresso.close()

    debugfinal='\n'.join(debug)
    savefile('problema',debugfinal)

    keyb = xbmc.Keyboard('', 'Nome do ficheiro da lista')
    keyb.doModal()
    if (keyb.isConfirmed()):
        nomelista = keyb.getText()
        if nomelista=='': nomelista='lista'
    else: nomelista='lista'
    pastafinal = dialog.browse(int(0), "Local para guardar xml/m3u", 'myprograms')
    if not pastafinal: sys.exit(0)
    savefile(nomelista + '.xml',AGORA,pastafinal=pastafinal)
    m3uprep=['#EXTM3U#EXTM3U']
    openedfile= clean(AGORA)
    ya=re.compile('<item>(.+?)</item>').findall(openedfile)
    for lol in ya:
        chname=re.compile('<title>(.+?)</title>').findall(lol)[0]
        allstreams=False
        if allstreams==True:
            streams=re.compile('<link>(.+?)</link>').findall(lol)
            for umporum in streams:
                m3uprep.append('\n#EXTINF:-1,%s\n%s' % (chname,umporum))
        else:
            streams=re.compile('<link>(.+?)</link>').findall(lol)[0]
            m3uprep.append('\n#EXTINF:-1,%s\n%s' % (chname,streams))
    m3uprep='\n'.join(m3uprep)
    savefile(nomelista + '.m3u',m3uprep,pastafinal=pastafinal)
    xbmc.executebuiltin("XBMC.Notification(TV Portuguesa, Lista xml/m3u gravada,'100000'," + tvporpath + art + "icon32-ver1.png)")
Example #27
0
	def post(self):
		db=self.application.database
		content = self.request.arguments.get("content", [""])[0]
		title = self.request.arguments.get("title", [""])[0]

		content = clean(content)
		title = clean(title)
		
		new_document = {
			"content" : content,
			"type" : "status",
			"title" : title,
			"author" : self.get_current_user(),
			"time": tuple(datetime.now().utctimetuple()),
			"plusvote": list(),
			"minusvote": list()
		}

		db.documents.insert(new_document)
		self.redirect("/")
Example #28
def get_tweets_for_user(user_id):
    path = tweetsd + "/tweets-user-" + str(user_id) + ".txt"
    tweets = []
    with open(path, 'r') as f:
        for index, line in enumerate(f):
            if index >= limit:
                return clean(" ".join(tweets), False)
            info = line.split("\t")
            if len(info) != 5:
                print info[-2]
            tweets.append(info[-1].strip())
Example #29
0
	def classify(self, testfile):
		text = utils.clean(testfile.read())
		neg_count = 0
		pos_count = 0
		for snip in self.neg_snip:
			if snip in text:
				neg_count += 1

		for snip in self.pos_snip:
			if snip in text:
				pos_count += 1

		return pos_count - neg_count
Example #30
0
    def findArgs(self, code):

        s = code.rfind("(")+1
        e = code.find(")")

        temp = code[s:e]

        args = []

        for a in temp.split(","):
            args.append(clean(a).split("="))

        return args
Example #31
0
url = "https://www.sec.gov/edgar/searchedgar/edgarstatecodes.htm"

content = urllib.urlopen(url).read()
doc = html.fromstring(content)

rows = doc.xpath('//table')[3].getchildren()

seen_other_countries = False
header = ['EDGAR', 'name']

data = []

for row in rows:
    if seen_other_countries is not True:
        if utils.clean(row.text_content()) != 'Other Countries':
            print('SKIPPING', row.text_content())
            continue
        else:
            seen_other_countries = True
            print('SEEN OTHER COUNTRIES', row.text_content())
            continue

    cells = row.getchildren()
    if len(cells) != 2:
        print('ERROR IN CELL COUNT')
        for cell in cells:
            print(cell)
            print(cell.text_content())
        continue
    code = utils.clean(cells[0].text_content())
Example #32
0

if __name__ == '__main__':
    obj = loadConfig()
    dl = Downloader()

    for company in obj['company']:
        print('...', end=' ')
        payload = {
            'function': 'TIME_SERIES_INTRADAY',
            'interval': obj['interval'],
            'symbol': company[0],
            'apikey': 'M1PAJKCE6DZUZAUS',
            'datatype': obj['datatype']
        }
        pipe = Pipe()

        new_text = appendCol(clean(
            pipe.read_from_downloader(text=dl.addParams(payload).bulk())),
                             colname=company[1])

        old_text = pipe.read_from_file(filename=company[1],
                                       ext=obj['datatype'])

        pipe.read_from_text(new_text + old_text)
        pipe.write_to_file(filename=company[1], ext=obj['datatype'])

        pipe.clear()
        print('Finish pulling stock data: %s' % company[1])

    dl.close()
Example #33
import csv
import numpy as np
from pprint import pprint
from utils import clean
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

stadium_data = {}
with open('final_data/stadium_record.csv') as f:
    reader = csv.reader(f)
    next(reader)
    for row in reader:
        stadium_data[clean(
            row[0])] = [float(row[1]),
                        float(row[9]),
                        float(row[11])]

alltime_data = {}
with open('final_data/player_batting_record_backup.csv') as f:
    reader = csv.reader(f)
    header = next(reader)
    for row in reader:
        if clean(row[0]) in alltime_data:
            alltime_data[clean(row[0])][row[3]] = row
        else:
            alltime_data[clean(row[0])] = {row[3]: row}
alltime_header = [clean(h) for h in header]

ipl_data = {}
with open('final_data/data-ipl-fantasy.csv') as f:
Example #34
0
def process_one(one):
    ids = sp.encode_as_ids(clean(one))
    return ids[0:args.input_max_length]
Example #35
0
def split_data(net,
               dat_in,
               dat_out,
               dat_out_raw,
               dat_out_oracle,
               num_flws,
               bch_trn,
               bch_tst,
               use_val=False):
    """
    Divides the input and output data into training, validation, and
    testing sets and constructs data loaders.
    """
    print("Creating train/val/test data...")
    #assert len(dat_out.shape) == 1
    #assert len(dat_out_raw.shape) == 1
    #assert len(dat_out_oracle.shape) == 1
    #assert len(num_flws.shape) == 1

    fets = dat_in.dtype.names
    # Destroy columns names to make merging the matrices easier. I.e.,
    # convert from structured to regular numpy arrays.
    dat_in = utils.clean(dat_in)
    dat_out = utils.clean(dat_out)
    dat_out_raw = utils.clean(dat_out_raw)
    dat_out_oracle = utils.clean(dat_out_oracle)
    num_flws = utils.clean(num_flws)
    # Shuffle the data to ensure that the training, validation, and
    # test sets are uniformly sampled. To shuffle all of the arrays
    # together, we must first merge them into a combined matrix.
    num_cols_in = dat_in.shape[1]
    merged = np.concatenate(
        (dat_in, dat_out, dat_out_raw, dat_out_oracle, num_flws), axis=1)
    np.random.shuffle(merged)
    dat_in = merged[:, :num_cols_in]
    dat_out = merged[:, num_cols_in]
    dat_out_raw = merged[:, num_cols_in + 1]
    dat_out_oracle = merged[:, num_cols_in + 2]
    num_flws = merged[:, num_cols_in + 3]

    # 50% for training, 20% for validation, 30% for testing.
    num_exps = dat_in.shape[0]
    num_val = int(round(num_exps * 0.2)) if use_val else 0
    num_tst = int(round(num_exps * 0.3))
    print(
        (f"    Data - train: {num_exps - num_val - num_tst}, val: {num_val}, "
         f"test: {num_tst}"))
    # Validation.
    dat_val_in = dat_in[:num_val]
    dat_val_out = dat_out[:num_val]
    # Testing.
    dat_tst_in = dat_in[num_val:num_val + num_tst]
    dat_tst_out = dat_out[num_val:num_val + num_tst]
    dat_tst_out_raw = dat_out_raw[num_val:num_val + num_tst]
    dat_tst_out_oracle = dat_out_oracle[num_val:num_val + num_tst]
    num_flws_tst = num_flws[num_val:num_val + num_tst]
    # Training.
    dat_trn_in = dat_in[num_val + num_tst:]
    dat_trn_out = dat_out[num_val + num_tst:]

    # Create the dataloaders.
    dataset_trn = utils.Dataset(fets, dat_trn_in, dat_trn_out)
    ldr_trn = (torch.utils.data.DataLoader(
        dataset_trn, batch_size=bch_tst, shuffle=True, drop_last=False)
               if isinstance(net, models.SvmSklearnWrapper) else
               torch.utils.data.DataLoader(
                   dataset_trn,
                   batch_sampler=utils.BalancedSampler(
                       dataset_trn, bch_trn, drop_last=False)))
    ldr_val = (torch.utils.data.DataLoader(
        utils.Dataset(fets, dat_val_in, dat_val_out),
        batch_size=bch_tst,
        shuffle=False,
        drop_last=False) if use_val else None)
    ldr_tst = torch.utils.data.DataLoader(utils.Dataset(
        fets, dat_tst_in, dat_tst_out, dat_tst_out_raw, dat_tst_out_oracle,
        num_flws_tst),
                                          batch_size=bch_tst,
                                          shuffle=False,
                                          drop_last=False)
    return ldr_trn, ldr_val, ldr_tst
Example #36
0
    df = preprocessing.load_data()

    df = feature_engineering.FeatEngin(df)
    print('LGMB training is stating....')
    LGBMtrain.Train(df, BO)

if x == '2':
    with open('train//ctdict1.pkl', 'rb') as handle:
        cvsdict = pickle.load(handle)
    print('loading preprocess data is stating....')
    df = pd.read_csv('train//df_application_train_new1.csv', dtype=cvsdict)
    for c in df:
        print(c)
    print(df.shape)
    print('feature engineering is stating....')
    df = feature_engineering.FeatEngin(df)
    df = utils.clean(df)
    print('LGMB training is stating....')
    LGBMtrain.Train(df, BO)

if x == '3':
    with open('train//ctdict2.pkl', 'rb') as handle:
        cvsdict = pickle.load(handle)
    print('loading preprocess data is stating....')
    df = pd.read_csv('train//df_application_train_new2.csv', dtype=cvsdict)
    for c in df:
        print(c)
    print(df.shape)
    df = utils.clean(df)
    print('LGMB training is stating....')
    LGBMtrain.Train(df, BO)
Example #37
0
            clean_sentence = clean(sentence)
            for word in clean_sentence:
                freq[word] = freq.get(word, 0) + 1
            X.append(clean_sentence)
            Y_cog.append(mapping_cog[label_cog])
            Y_know.append(mapping_know[label_know])
    '''
    with codecs.open('datasets/BCLs_Question_Dataset.csv',
                     'r',
                     encoding="utf-8") as csvfile:
        all_rows = csvfile.read().splitlines()[1:]
        csvreader = csv.reader(
            all_rows)  #csvreader = csv.reader(all_rows[:len(all_rows)*7//10])
        for row in csvreader:
            sentence, label_cog = row
            clean_sentence = clean(sentence)
            if (PREPARE_VOCAB):
                for word in clean_sentence:
                    freq[word] = freq.get(word, 0) + 1
            X.append(clean_sentence)
            Y_cog.append(mapping_cog[label_cog])
            # TODO: Label
            Y_know.append(1)

    domain_keywords = pickle.load(open('resources/domain.pkl', 'rb'))
    for key in domain_keywords:
        for word in domain_keywords[key]:
            freq[word] = freq.get(word, 0) + 1
            X.append([word])
            Y_cog.append(mapping_cog[key])
Example #38
0
def run(arguments):
    mode = arguments.mode
    chainix = arguments.chain_index
    tend = arguments.npoints
    start = 0
    move_every = arguments.save_interval
    outdir = arguments.output
    inpath = arguments.input
    devnull = '>& /dev/null'

    lastaccepted = {}
    #select mode, use argparse for this
    move = 1
    #run the thing
    #mode 1: start from new point
    if mode == "new":
        outname = "pMSSM_MCMC_" + str(chainix) + "_" + str(start) + "to" + str(
            min(start + move_every, start + tend)) + ".root"
        outroot = TFile(outname, "recreate")
        outtree = TTree("mcmc", "mcmc")
        setup_tree(outtree)
        tree_branches["chain_index"][0] = chainix
        finite_lh = False
        signchoice = random.randint(0, 7)
        while not finite_lh:
            utils.clean()
            spnerr = False
            while not spnerr:  #find a viable point
                utils.clean()
                candidate = generate_point(
                    signchoice=signchoice)  #generate a point from flat prior
                spnin = utils.write_spheno_input(
                    candidate)  #write the input for spheno
                spnerr = run_spheno(
                    spnin, devnull)  #run spheno, check if viable point

            if not run_feynhiggs(
                    '>& /dev/null'):  #run feynhiggs, replace higgs sector
                continue
            os.system("cp SPheno.spc mmgsin.slha")
            mmgs_obs = run_micromegas(
                slhapath="mmgsin.slha"
            )  #micromegas seems to consume the input file?!?!
            os.system("mv mmgsin.slha SPheno.spc")
            print "getting the stuff from the slha file"
            observables = get_observables(
                slhapath="SPheno.spc")  #get observables for the likelihood
            siso_obs = run_superiso("SPheno.spc")
            if siso_obs == -1:
                continue
            for obs in siso_obs:
                observables[obs] = siso_obs[obs]
            siso_chi2_obs = run_superiso_chi2("SPheno.spc")
            for obs in siso_chi2_obs:
                observables[obs] = siso_chi2_obs[obs]
            for obs in mmgs_obs:
                observables[obs] = mmgs_obs[obs]
            _l = likelihood.get_likelihood(observables)  #get likelihood
            finite_lh = _l != 0
        lastaccepted["likelihood"] = _l
        lastaccepted["iteration_index"] = 1
        lastaccepted["accepted_index"] = 1
        lastaccepted["chain_index"] = chainix
        lastaccepted["superiso_chi2_stdout"] = observables[
            "superiso_chi2_stdout"]["value"]
        lastaccepted["superiso_stdout"] = observables["superiso_stdout"][
            "value"]
        lastaccepted["chi2"] = observables["chi2"]["value"]
        lastaccepted["chi2_ndf"] = observables["chi2_ndf"]["value"]
        lastaccepted["micromegas_stdout"] = observables["micromegas_stdout"][
            "value"]
        lastaccepted["ztoinv_excluded"] = observables["ztoinv_excluded"][
            "value"]
        lastaccepted["lep_excluded"] = observables["lep_excluded"]["value"]
        lastaccepted["masslim"] = observables["masslim"]["value"]
        lastaccepted["omegah2"] = observables["omegah2"]["value"]
        lastaccepted["omegaxf"] = observables["omegaxf"]["value"]

        #write point to root, start loop
        for obs in observables.keys():
            lastaccepted[obs] = observables[obs]["value"]
        lastaccepted = prepare_fill(
            lastaccepted,
            outtree)  #add the rest of the point info, fill the tree branches
        outtree.Fill()
    #mode 2: continue from previous point/root file?
    elif mode == "resume":
        lastaccepted = utils.get_point_from_rootfile(inpath, chainix)
    start = lastaccepted["iteration_index"] + 1
    if mode == "resume":
        outname = "pMSSM_MCMC_" + str(chainix) + "_" + str(start) + "to" + str(
            min(start + move_every, start + tend)) + ".root"
        print "Creating file " + outname
        outroot = TFile(outname, "recreate")
        outtree = TTree("mcmc", "mcmc")
        setup_tree(outtree)
        tree_branches["chain_index"][0] = chainix

    #run
    print "reached run loop"
    for iter_ix in range(start, start + tend + 1):
        print iter_ix
        if move == move_every - 1 and iter_ix < start + tend - 1:
            outtree.BuildIndex("chain_index", "iteration_index")
            outtree.Write()
            outroot.Close()
            print "Made " + str(
                move_every) + " iterations, moving " + outname + " to storage"
            os.system(" ".join(["mv", outname, outdir]))
            outname = "pMSSM_MCMC_" + str(chainix) + "_" + str(
                iter_ix) + "to" + str(
                    min(iter_ix + move_every, start + tend + 1)) + ".root"
            print "Creating file " + outname
            outroot = TFile(outname, "recreate")
            outtree = TTree("mcmc", "mcmc")
            setup_tree(outtree)
            move = -1
        finite_lh = False
        while not finite_lh:
            utils.clean()
            spnerr = False
            while not spnerr:  #find a viable point
                utils.clean()
                candidate = generate_point(
                    lastaccepted)  #generate a point from the last point
                spnin = utils.write_spheno_input(
                    candidate)  #write the input for spheno
                spnerr = run_spheno(
                    spnin, devnull)  #run spheno, check if viable point
            if not run_feynhiggs(
                    '>& /dev/null'):  #run feynhiggs, replace higgs sector
                continue
            os.system("cp SPheno.spc mmgsin.slha")
            mmgs_obs = run_micromegas(slhapath="mmgsin.slha")
            os.system("mv mmgsin.slha SPheno.spc")

            observables = get_observables(
                slhapath="SPheno.spc")  #get observables for the likelihood
            siso_obs = run_superiso("SPheno.spc")
            if siso_obs == -1:
                continue
            for obs in siso_obs:
                observables[obs] = siso_obs[obs]
            siso_chi2_obs = run_superiso_chi2("SPheno.spc")
            for obs in siso_chi2_obs:
                observables[obs] = siso_chi2_obs[obs]
            for obs in mmgs_obs:
                observables[obs] = mmgs_obs[obs]

            _l = likelihood.make_decision(
                observables, lastaccepted["likelihood"])  #get likelihood
            finite_lh = _l != 0
        if _l < 0:
            move += 1
            if iter_ix == start + tend:
                print "Made all " + str(
                    tend) + " iterations, moving " + outname + " to storage"
                outtree.BuildIndex("chain_index", "iteration_index")
                outtree.Write()
                outroot.Close()
                os.system(" ".join(["mv", outname, outdir]))
            continue  #point was not accepted
        lastaccepted["likelihood"] = _l
        lastaccepted["iteration_index"] = iter_ix
        lastaccepted["accepted_index"] = lastaccepted["accepted_index"] + 1
        lastaccepted["chain_index"] = chainix
        lastaccepted["superiso_chi2_stdout"] = observables[
            "superiso_chi2_stdout"]["value"]
        lastaccepted["superiso_stdout"] = observables["superiso_stdout"][
            "value"]
        lastaccepted["chi2"] = observables["chi2"]["value"]
        lastaccepted["chi2_ndf"] = observables["chi2_ndf"]["value"]
        lastaccepted["micromegas_stdout"] = observables["micromegas_stdout"][
            "value"]
        lastaccepted["ztoinv_excluded"] = observables["ztoinv_excluded"][
            "value"]
        lastaccepted["lep_excluded"] = observables["lep_excluded"]["value"]
        lastaccepted["masslim"] = observables["masslim"]["value"]
        lastaccepted["omegah2"] = observables["omegah2"]["value"]
        lastaccepted["omegaxf"] = observables["omegaxf"]["value"]
        #write point to root, start loop
        lastaccepted = prepare_fill(
            lastaccepted,
            outtree)  #add the rest of the point info, fill the tree branches
        outtree.Fill()
        if iter_ix == start + tend:
            print "Made all " + str(
                tend) + " iterations, moving " + outname + " to storage"
            outtree.BuildIndex("chain_index", "iteration_index")
            outtree.Write()
            outroot.Close()
            os.system(" ".join(["mv", outname, outdir]))
        move += 1
Example #39
0
    }
}

rows_en = soup_en.find_all('tr')

for row in rows_en:
    cells = row.find_all('td')
    if len(cells) != 3:
        print('ERROR IN CELL COUNT')
        print(cells)
        continue
    if cells[0] is not None:
        if cells[0].text.startswith('South') or cells[0].text.startswith(
                'Sudan'):
            continue
    numerical = utils.clean(cells[0].text)
    name = utils.clean(cells[1].text)
    alpha3 = utils.clean(cells[2].text)
    if alpha3.startswith('ISO ALPHA-3'):
        # skip first row of column headers
        print('SKIPPING', numerical, name, alpha3)
        continue
    iso3166.update({
        numerical: {
            'ISO3166-1-numeric': numerical,
            'official_name_en': name,
            'ISO3166-1-Alpha-3': alpha3
        }
    })

# fetch French
Example #40
0
model = Sequential()

# 1 hidden layer, input_dim of 64, output_dim of 1
model.add(Dense(12, input_dim=64, activation='relu'))
model.add(Dropout(0.25))
model.add(Dense(6, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# compile the model
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# clean data and prepare to be passed through the model
df = clean('data/1year.arff')
trainX, trainY, testX, testY = prepare_dataset(df)

model.fit(trainX, trainY, epochs=150, batch_size=10)

predictions = model.predict(testX)

print(predictions[0:5])

for i in range(len(predictions)):
    if predictions[i][0] > 0.5:
        predictions[i] = 1
    else:
        predictions[i] = 0

correct = 0
Example #41
0
def scale_fets(dat, scl_grps, standardize=False):
    """
    Returns a copy of dat with the columns normalized. If standardize
    is True, then the scaling groups are normalized to a mean of 0 and
    a variance of 1. If standardize is False, then the scaling groups
    are normalized to the range [0, 1]. Also returns an array of shape
    (dat_all[0].shape[1], 2) where row i contains the scaling
    parameters of column i in dat. If standardize is True, then the
    scaling parameters are the mean and standard deviation of that
    column's scaling group. If standardize is False, then the scaling
    parameters are the min and max of that column's scaling group.
    """
    fets = dat.dtype.names
    assert fets is not None, \
        f"The provided array is not structured. dtype: {dat.dtype.descr}"
    assert len(scl_grps) == len(fets), \
        f"Invalid scaling groups ({scl_grps}) for dtype ({dat.dtype.descr})!"

    # Determine the unique scaling groups.
    scl_grps_unique = set(scl_grps)
    # Create an empty array to hold the min and max values (i.e.,
    # scaling parameters) for each scaling group.
    scl_grps_prms = np.empty((len(scl_grps_unique), 2), dtype="float64")
    # Function to reduce a structured array.
    rdc = (lambda fnc, arr: fnc(
        np.array([fnc(arr[fet]) for fet in arr.dtype.names if fet != ""])))
    # Determine the min and the max of each scaling group.
    for scl_grp in scl_grps_unique:
        # Determine the features in this scaling group.
        scl_grp_fets = [
            fet for fet_idx, fet in enumerate(fets)
            if scl_grps[fet_idx] == scl_grp
        ]
        # Extract the columns corresponding to this scaling group.
        fet_values = dat[scl_grp_fets]
        # Record the min and max of these columns.
        scl_grps_prms[scl_grp] = [
            np.mean(utils.clean(fet_values)) if standardize else rdc(
                np.min, fet_values),
            np.std(utils.clean(fet_values)) if standardize else rdc(
                np.max, fet_values)
        ]

    # Create an empty array to hold the min and max values (i.e.,
    # scaling parameters) for each column (i.e., feature).
    scl_prms = np.empty((len(fets), 2), dtype="float64")
    # Create an empty array to hold the rescaled features.
    new = np.empty(dat.shape, dtype=dat.dtype)
    # Rescale each feature based on its scaling group's min and max.
    for fet_idx, fet in enumerate(fets):
        # Look up the parameters for this feature's scaling group.
        prm_1, prm_2 = scl_grps_prms[scl_grps[fet_idx]]
        # Store this min and max in the list of per-column scaling parameters.
        scl_prms[fet_idx] = np.array([prm_1, prm_2])
        fet_values = dat[fet]
        if standardize:
            # prm_1 is the mean and prm_2 is the standard deviation.
            scaled = (
                # Handle the rare case where the standard deviation is
                # 0 (meaning that all of the feature values are the
                # same), in which case return an array of zeros.
                np.zeros(fet_values.shape, dtype=fet_values.dtype)
                if prm_2 == 0 else (fet_values - prm_1) / prm_2)
        else:
            # prm_1 is the min and prm_2 is the max.
            scaled = (
                # Handle the rare case where the min and the max are
                # the same (meaning that all of the feature values are
                # the same.
                np.zeros(fet_values.shape, dtype=fet_values.dtype)
                if prm_1 == prm_2 else utils.scale(
                    fet_values, prm_1, prm_2, min_out=0, max_out=1))
        new[fet] = scaled

    return new, scl_prms
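
A hypothetical usage sketch for scale_fets (illustration only, assuming the surrounding module's utils helpers are importable): each field of a structured array is assigned to a scaling group, and the return values follow the docstring above.

import numpy as np

dat = np.array([(1.0, 10.0), (2.0, 20.0), (3.0, 30.0)],
               dtype=[("rtt", "float64"), ("tput", "float64")])
# Give each feature its own scaling group; min-max scale to [0, 1].
new, scl_prms = scale_fets(dat, scl_grps=[0, 1], standardize=False)
# new["rtt"] and new["tput"] now lie in [0, 1]; scl_prms[i] holds the
# (min, max) pair used to rescale column i.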
Example #42
0
def main(_):
    FLAGS.train = True
    FLAGS.clean = True
    clean(FLAGS)
    train()
Example #43
0
def text_analytics_cmds(project, projectLang, grepStack, searchKey):
    """
    :param project: project name
    :param projectLang: primary language of project
    :param searchKey: build or test
    :param grepStack: list of file contents each as a string to analyze
    :return: A build command :rtype str
    """
    proximity_threshold = 10 # We insist that the command that we extract be within
                             # x lines after encountering the word 'build' or 'text'.
                             # Need to factor in HTML syntax.
    max_lines = 5   # Limit the number of lines (i.e. individual commands)
    max_words = 16  # Limit the number of words we allow per individual command
    max_chars = 128 # Limit the number of characters per individual command

    logger.debug("In text_analytics_cmds, project=%s, searchKey=%s, cnt grepStack[]=%s" % (project, searchKey, str(len(grepStack))))

    # List most common build and test commands first
    commands = ['mvn ', 'ant ', 'npm ', 'grunt ', 'python ', 'py.test ', 'cd ',
                'gem ', 'rake ', 'build.sh ', 'bootstrap.sh ', 'autogen.sh ',
                'autoreconf ', 'automake ', 'aclocal ', 'scons ', 'sbt ',
                'cmake ', 'gradle ', 'bundle ', 'perl ', 'php ']

    # List words that, if appearing in front of a command, indicate that it's
    # descriptive text, not a build command
    english = ['the', 'a', 'an', 'is', 'are', 'can', 'you', 'of', 'in', 'from',
               'this', 'to', 'that', 'when', 'should', 'might']

    # Symbols used as prompts
    promptsStr = '$#>%'
    
    # Symbols in commands we don't allow.  Not a proper list.  We have limitations such
    # as $VAR to prevent expanded environment variables which we don't support yet
    noContainsStr = '$'

    # Commands don't end with these characters.
    noEndsWithStr = ':[]().,'

    retval = []
    for fstr in grepStack:
        build_found = False
        build_line_number = 0
        lines = 0

        for idx, line in enumerate(fstr.splitlines()):
            line = line.lower()
            if searchKey in line:
                build_found = True
                build_line_number = idx
            if build_found and\
               idx - build_line_number < proximity_threshold:
                if len(line):
                    logger.debug("text_analytics_cmds, idx=%s scanning line=%s" % (str(idx), line))

                    isText = False
                    for word in english:
                        if word in line.split(' '):
                            isText = True
                            break

                    if isText or any(x in line for x in noContainsStr):
                        continue

                    lastChar = line[len(line) - 1]

                    # check for command prompt symbols at beginning of line
                    if line.lstrip()[0] in promptsStr:
                        line = line.lstrip()[1:].lstrip()
                    
                    for command in commands:
                                                                    # TODO: validate start of command line
                        if len(line.split(' ')) <= max_words and\
                           lines <= max_lines and\
                           not lastChar in noEndsWithStr:           # Commands don't end with ':[]()'
                            if command in line:
                                retval.append(utils.clean(line))
                                lines = lines + 1
                                break
                            elif (projectLang == 'C' or projectLang == 'C++' or projectLang == 'Perl') and\
                                 'make' in line:
                                retval.append(utils.clean(line))
                                lines = lines + 1
                                break
            else:
                build_found = False

        if len(retval):
            break          # If you find a build command sequence in one file, don't search other files

    return ';'.join(retval)
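
A hypothetical invocation sketch (file names are illustrative): README-style contents are passed in as strings, and the function returns a ';'-joined string of candidate commands found near the search key.

grep_stack = [open(path).read() for path in ("README.md", "INSTALL.md")]
build_cmds = text_analytics_cmds("myproject", "C++", grep_stack, "build")
print(build_cmds)  # e.g. "cmake .;make" if such lines follow the word 'build'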
Example #44
0
def main():
    # -----------------------------------------------------------------------------------
    # Adjustable Parameters
    parser = argparse.ArgumentParser()
    parser.add_argument('--train',
                        action='store_true',
                        help='training or scoring')
    parser.add_argument('--inputfile', type=str, help='input data file name')
    parser.add_argument('--outputfile',
                        type=str,
                        help='output prediction file name')
    args = parser.parse_args()

    # directory for the input data and output prediction:
    DATA_DIR = 'data'
    OUTPUT_DIR = 'output'

    # columns used:
    CAT_COLS = [
        'Auction', 'Transmission', 'WheelType', 'Nationality', 'Size',
        'TopThreeAmericanName', 'IsOnlineSale'
    ]
    NUM_COLS = [
        'VehicleAge', 'VehOdo', 'VehBCost', 'WarrantyCost',
        'MMRCurrentAuctionAveragePrice', 'MMRAcquisitionAuctionAveragePrice',
        'MMRCurrentAuctionCleanPrice', 'MMRAcquisitionAuctionCleanPrice',
        'MMRCurrentRetailAveragePrice', 'MMRAcquisitionRetailAveragePrice',
        'MMRCurrentRetailCleanPrice', 'MMRAcquisitonRetailCleanPrice'
    ]
    DATE_COLS = ['PurchDate']
    LABEL_COL = 'IsBadBuy'
    IDS_COL = 'RefId'

    # current time for computing recency feature
    NOW = '2010-12-31'

    # modeling step:
    # model checkpoint for future scoring
    MODEL_DIR = 'model'
    CHECKPOINT_XGB = 'xgb.pkl'
    CHECKPOINT_PREPROCESS = 'preprocess.pkl'

    # parameter that only relevant for training stage and not scoring
    if args.train:
        # number of cross validation and hyperparameter settings to try
        CV = 10
        N_ITER = 5
        MODEL_RANDOM_STATE = 4321

        # train/validation stratified split
        VAL_SIZE = 0.1
        TEST_SIZE = 0.1
        SPLIT_RANDOM_STATE = 1234

    # -----------------------------------------------------------------------------------
    logger.info('preprocessing')
    checkpoint_preprocess = os.path.join(MODEL_DIR, CHECKPOINT_PREPROCESS)
    checkpoint_xgb = os.path.join(MODEL_DIR, CHECKPOINT_XGB)
    input_path = os.path.join(DATA_DIR, args.inputfile)

    if args.train:
        data = clean(input_path, NOW, CAT_COLS, NUM_COLS, DATE_COLS, IDS_COL,
                     LABEL_COL)
        ids = data[IDS_COL].values
        label = data[LABEL_COL].values
        data = data.drop([IDS_COL, LABEL_COL], axis=1)

        # train/test split twice to achieve train/validation/test three way split
        df_train, df_test, y_train, y_test, ids_train, ids_test = train_test_split(
            data,
            label,
            ids,
            test_size=TEST_SIZE,
            random_state=SPLIT_RANDOM_STATE,
            stratify=label)

        df_train, df_val, y_train, y_val, ids_train, ids_val = train_test_split(
            df_train,
            y_train,
            ids_train,
            test_size=VAL_SIZE,
            random_state=SPLIT_RANDOM_STATE,
            stratify=y_train)

        # obtain finalized columns
        num_cols_cleaned = list(
            SortedSet(df_train.columns) - SortedSet(CAT_COLS))
        preprocess = Preprocesser(num_cols=num_cols_cleaned, cat_cols=CAT_COLS)
        X_train = preprocess.fit_transform(df_train)
        X_val = preprocess.transform(df_val)
        X_test = preprocess.transform(df_test)

        logger.info('modeling')
        eval_set = [(X_train, y_train), (X_val, y_val)]
        xgb_tuned = build_xgb(N_ITER, CV, MODEL_RANDOM_STATE, eval_set)
        xgb_tuned.fit(X_train, y_train)

        if not os.path.isdir(MODEL_DIR):
            os.mkdir(MODEL_DIR)

        dump(preprocess, checkpoint_preprocess)
        dump(xgb_tuned, checkpoint_xgb)

        # model evaluation metric reporting
        y_pred = []
        xgb_best = xgb_tuned.best_estimator_
        zipped = zip(('train', 'validation', 'test'), (X_train, X_val, X_test),
                     (y_train, y_val, y_test))
        for name, X, y in zipped:
            xgb_pred = xgb_best.predict_proba(
                X, ntree_limit=xgb_best.best_ntree_limit)[:, 1]
            score = round(roc_auc_score(y, xgb_pred), 3)
            logger.info('{} AUC: {}'.format(name, score))
            y_pred.append(xgb_pred)

        ids = np.hstack((ids_train, ids_val, ids_test))
        y_pred = np.hstack(y_pred)
    else:
        data = clean(input_path, NOW, CAT_COLS, NUM_COLS, DATE_COLS, IDS_COL)
        ids = data[IDS_COL].values
        data = data.drop(IDS_COL, axis=1)

        logger.info('scoring')
        preprocess = load(checkpoint_preprocess)
        xgb_tuned = load(checkpoint_xgb)
        X = preprocess.transform(data)
        xgb_best = xgb_tuned.best_estimator_
        y_pred = xgb_best.predict_proba(
            X, ntree_limit=xgb_best.best_ntree_limit)[:, 1]

    if not os.path.isdir(OUTPUT_DIR):
        os.mkdir(OUTPUT_DIR)

    output_path = os.path.join(OUTPUT_DIR, args.outputfile)
    write_output(ids, IDS_COL, y_pred, LABEL_COL, output_path)
Example #45
0
 def _load_data(self) -> None:
     print('Loading and preprocessing data...')
     self._train_data = pd.read_csv('data/train.csv')
     self._test_data = pd.read_csv('data/test.csv')
     self._train_data[TEXT] = [clean(s) for s in self._train_data[TEXT]]
     self._test_data[TEXT] = [clean(s) for s in self._test_data[TEXT]]
Example #46
0
import dictionaries
import beatDetection
import chordPrediction
import midiConversion
import midiFileCreation

from flask import Flask, render_template, request, url_for, flash, send_from_directory, redirect
from werkzeug.utils import secure_filename


# Initialize the Flask application
UPLOAD_FOLDER = cwd+'/static/wav'
ALLOWED_EXTENSIONS = set(['wav', 'WAV'])
app = Flask(__name__, template_folder='www')
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
utils.clean(UPLOAD_FOLDER+"/")
webbrowser.open_new("http://*****:*****")

@app.route('/')
def form():		
	return render_template('index.html', filename='Browse for file...', disabled_radio="true")

def allowed_file(filename):
    return '.' in filename and \
           filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS

@app.route('/', methods = ['POST'])
def upload_file():
    if request.method == 'POST':     
        file = request.files['file']
Example #47
0
                if row[i] == None:
                    row[i] = float(region_data[region][i - 2])

        header_ = header
        if type(header[0]) == list:
            header_ = [
                header[0][i] + " -- " + header[1][i]
                for i in range(len(header[0]))
            ]

        for row in data:
            if check_transform_condition(category, path):
                for i, field in enumerate(row[2:]):
                    col = category + " | " + header_[i + 2] + " -- " + row[1 -
                                                                           sti]
                    polity_data[row[sti]][col] = clean(field)
            else:
                for i, field in enumerate(row[2:]):
                    col = category + " | " + header_[i + 2]
                    polity_data[row[sti]][col] = clean(field)

        if category == "Economy":
            data = header + data
            data = map(None, *data)
            header = data[0]
            data = [list(row) for row in data[2:]]

        with open("data/datagov-clean/" + path, "wb") as csvfile:
            writer = csv.writer(csvfile,
                                delimiter=",",
                                quotechar="\"",
Example #48
0
 def clean(self):
     if self.clear_title is None and self.clear_text is None:
         self.clear_title = clean(self.title)
         self.clear_text = clean(self.text)
     return self.clear_title + ' ' + self.clear_text
Example #49
0
            #continue
        for section in re.split("\n\-{5,}\n", plain_text):
            section = "\n" + section
            for disease_name_match in DISEASE_STATUS_RE.finditer(section):
                if disease_name_match:
                    #print(re.match(r"\s*High Priority Event Updates", section))
                    [disease_name, location,
                     status] = disease_name_match.groups()
                    disease_name = re.sub(r"\*", "", disease_name).strip()
                    for possible_status in "Worsening|No Change|Improving|Undetermined".split(
                            '|'):
                        if possible_status.lower() in status.lower():
                            status = possible_status
                            break
                    disease_to_metadata[disease_name] = {
                        'location': clean(location),
                        'status': status
                    }
        db.nbic.insert({
            'file':
            file_path,
            'date':
            date,
            'diseases': [{
                'nameUsed': disease,
                'metadata': metadata,
                'resolved': lookup_disease(disease)
            } for disease, metadata in disease_to_metadata.items()],
        })

total_disease_topics = 0
Example #50
0
import utils, re, time, code
pats = [
	(re.compile('ate (?P<hour>\d\d):(?P<minute>\d\d):(?P<second>\d\d)'),True),
	(re.compile('http://maps.google.com/maps.q=(?P<latitude>[-.0-9]+),(?P<longitude>[-.0-9]+)'),True),
	(re.compile('^Um (?P<pokemon_name>.+) selvagem'),False),
	(re.compile('^Detectada raid de (?P<raid_pokemon_name>[^!]+)!'),False)]

while True:
	for e in utils.read_raw_events():
		print(e)
		if 'text' in e['evt']:
			t = e['evt']['text']
			evt = {'btg':e['evt']}
			for p, c in pats:
				m = p.search(t)
				if m:
					evt.update(m.groupdict())
				elif c:
					break
			else:
				if ('pokemon_name' in evt) or ('raid_pokemon_name' in evt):
					utils.save_event(evt)
				else:
					print('failed msg'+t)
		utils.close_raw_event(e)
	utils.clean()
	time.sleep(2)
Example #51
0
def main(_):
    # evaluate_line()
    clean(FLAGS)
    train()
Example #52
0
def star_qsub(job):
    """
    qsub star mission through star_sub.sh

    input:
    job - string, "indexing" or "mapping". choose the job you want to run.

    outputs:
    indexing - genome index files in the genome folder
    mapping - sam and log files in star_outputs folder
    """

    paths = RnaSeqPath()
    # STAR genome indexing job
    if job == "indexing":
        # create the shell file
        shell_file = generate_bash_file(
            job_name="star_indexing",
            threads=4,
            out_log="star_indexing.out",
            err_log="star_indexing.err",
            commands=[
                "module load star", "module load python/3.6.4",
                "python3 {} indexing".format(
                    os.path.join(paths.scripts, 'star.py'))
            ])
        # submit the shell file to hpc
        qsub(shell_file)

    # STAR RNA-seq alignment job
    if job == "mapping":

        try:
            os.mkdir(paths.star_outputs)
        except IOError:
            pass

        # shell commands
        commands = [
            "module load star", "module load python/3.6.4",
            "python3 {} mapping $SGE_TASK_ID".format(
                os.path.join(paths.scripts, 'star.py'))
        ]

        # calculate job number based on the trimmomatic outputs
        n_jobs = int(
            len(
                glob(os.path.join(paths.trimmomatic_outputs,
                                  "*.cleaned.fastq"))) / 2)

        # create shell file
        shell_file = generate_bash_file(job_name="star_mapping",
                                        mem_free="35G",
                                        threads=8,
                                        job_arr=n_jobs,
                                        commands=commands)
        # submit shell file to hpc
        qsub(shell_file)

        # clean temp files
        qsub(clean(after="star_mapping"))
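
Per the docstring above, a hypothetical driver sketch would submit the indexing job before the mapping job (assuming the surrounding module's helpers are available):

star_qsub("indexing")   # build the STAR genome index first
star_qsub("mapping")    # then submit the RNA-seq alignment array job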