import os
import re
import time

import md5  # legacy Python 2 module; on Python 3 use hashlib.md5 instead

import pandas as pd
import tensorflow as tf
from PIL import Image
from sklearn.metrics import roc_auc_score

# Project-local helpers assumed from the calls below but not defined in this
# file: fetchdata, mapper, get_icao_prefix, chop_up, getareas, and the
# two-argument getdata used by extract_segel.
import fetchdata
import mapper


def parse_doc(path, icao, country, title, category):
    print "Parsing AIP doc"
    icao = icao.upper()
    assert len(icao) == 4
    url = fetchdata.getrawurl(path, country=country)
    ret = dict()
    ret['icao'] = icao
    ret['url'] = url
    ret['title'] = title
    ret['name'] = icao + " - " + title
    ret['category'] = category
    #data,nowdate=fetchdata.getdata(path,country=country,maxcacheage=7200)
    blobname = icao + "_" + category
    # Requires the SWFP_DATADIR environment variable to be set.
    tmppath = os.path.join(os.getenv("SWFP_DATADIR"), "aiptext", icao)
    if not os.path.exists(tmppath):
        os.makedirs(tmppath)
    if path.lower().endswith("pdf"):
        outpath_inter = os.path.join(tmppath, blobname + ".tmp.html")

        def render(inputfile, outputfile):
            r = "pdftohtml -c -s -i -zoom 2 -noframes -nodrm %s %s" % (
                inputfile, outputfile)
            # -s is not supported on older pdftohtml, and doesn't appear
            # necessary either.
            print "running", r
            assert 0 == os.system(r)

        fetchdata.getcreate_derived_data_raw(
            path, outpath_inter, render, "html", country=country)
        whole = open(outpath_inter).read()
        # Normalize the pdftohtml output: white background, and strip the
        # "Microsoft Word - " prefix that Word leaves in PDF titles.
        fixed = (whole
                 .replace("<BODY bgcolor=\"#A0A0A0\"",
                          "<BODY bgcolor=\"#FFFFFF\"")
                 .replace("<TITLE>Microsoft Word - ", "<TITLE>"))
    else:
        assert path.endswith("html")
        fixed, date = fetchdata.getdata(path, country=country)
    cksum = md5.md5(fixed).hexdigest()
    outpath = os.path.join(tmppath, blobname + "." + cksum + ".html")
    f = open(outpath, "w")
    f.write(fixed)
    f.close()
    #print "Wrote raw:",out,outpath
    ret['checksum'] = cksum
    ret['date'] = fetchdata.get_filedate(outpath)
    ret['blobname'] = blobname
    return ret

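# A self-contained sketch of the checksum-stamped naming scheme parse_doc
# uses for its cleaned HTML blobs ("<ICAO>_<category>.<md5>.html" under
# $SWFP_DATADIR/aiptext/<ICAO>/).  blob_path is a hypothetical helper, not
# part of the project; hashlib stands in for the legacy md5 module (pass
# bytes on Python 3).
def blob_path(datadir, icao, category, content):
    import hashlib
    cksum = hashlib.md5(content).hexdigest()
    blobname = "%s_%s" % (icao.upper(), category)
    return os.path.join(datadir, "aiptext", icao.upper(),
                        "%s.%s.html" % (blobname, cksum))
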
def lr_test(epo):
    x = tf.placeholder(tf.float32, shape=[None, 108])
    y = tf.placeholder(tf.float32, shape=[None])
    m = 1
    learning_rate = 0.3
    w = tf.Variable(tf.random_normal([108, m], 0.0, 0.5), name='u')
    W = tf.matmul(x, w)
    # With m == 1 this is plain logistic regression, so the linear term is
    # the logit.  (The flattened original applied tf.nn.sigmoid here and then
    # fed the result to sigmoid_cross_entropy_with_logits, which squashes
    # twice; the loss expects raw logits.)
    logits = tf.reduce_sum(W, 1)
    pred = tf.nn.sigmoid(logits)
    cost1 = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=y))
    cost = tf.add_n([cost1])
    train_op = tf.train.FtrlOptimizer(learning_rate).minimize(cost)
    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    sess = tf.Session()
    sess.run(init_op)
    train_x, train_y, test_x, test_y = getdata()
    result = []
    time_s = time.time()
    for epoch in range(0, epo):
        f_dict = {x: train_x, y: train_y}
        _, cost_, predict_ = sess.run([train_op, cost, pred],
                                      feed_dict=f_dict)
        auc = roc_auc_score(train_y, predict_)
        time_t = time.time()
        if epoch % 100 == 0:
            # Evaluate only; the flattened original also ran train_op here,
            # which would have trained on the test set.
            f_dict = {x: test_x, y: test_y}
            cost_, predict_test = sess.run([cost, pred], feed_dict=f_dict)
            test_auc = roc_auc_score(test_y, predict_test)
            print("%d %ld cost:%f,train_auc:%f,test_auc:%f"
                  % (epoch, (time_t - time_s), cost_, auc, test_auc))
            result.append([epoch, (time_t - time_s), auc, test_auc])
    pd.DataFrame(result, columns=['epoch', 'time', 'train_auc', 'test_auc']
                 ).to_csv("data/lr.csv")

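# lr_test above and mlr_test below rely on a no-argument getdata() helper
# that is not defined in this file (distinct from the two-argument
# getdata(path, country) that extract_segel calls).  A minimal stand-in with
# the shapes implied by the placeholders -- 108 float32 features, 0/1 float32
# labels -- might look like this; the real loader and its feature encoding
# are assumptions:
def getdata():
    import numpy as np
    rng = np.random.RandomState(0)
    train_x = rng.rand(1000, 108).astype('float32')
    train_y = (rng.rand(1000) > 0.5).astype('float32')
    test_x = rng.rand(200, 108).astype('float32')
    test_y = (rng.rand(200) > 0.5).astype('float32')
    return train_x, train_y, test_x, test_y
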
def extract_segel():
    segeldata, stamp = getdata(
        "/ImageVaultFiles/id_21795/cf_78/Sektorer-2013-CU-rev1.TXT", "segel")
    return list(getareas(segeldata, stamp))

def parse_landing_chart(path, arppos, icao, country='se', variant=''):
    icao = icao.upper()
    if variant and not variant.startswith("."):
        variant = "." + variant
    print "Running parse_landing_chart"
    print "country:", country
    #p=parse.Parser(path,country=country)
    arppos = mapper.from_str(arppos)
    res = []
    #assert p.get_num_pages()<=2
    url = fetchdata.getrawurl(path, country=country)
    ret = dict()
    ret['url'] = url
    data, nowdate = fetchdata.getdata(path, country=country, maxcacheage=7200)
    cksum = md5.md5(data).hexdigest()
    ret['checksum'] = cksum
    #page=p.parse_page_to_items(0, donormalize=False)
    #ret['width']=page.width
    #ret['height']=page.height
    #width=page.width
    #height=page.height
    #scale=2048.0/min(width,height)
    #width*=scale
    #height*=scale
    #width=int(width+0.5)
    #height=int(height+0.5)
    blobname = icao + variant
    tmppath = os.path.join(os.getenv("SWFP_DATADIR"), "adcharts", icao)
    if not os.path.exists(tmppath):
        os.makedirs(tmppath)
    assert len(icao) == 4
    outpath = os.path.join(tmppath, blobname + "." + cksum + ".png")

    def render(inputfile, outputfile):
        ext = inputfile.split(".")[-1].lower()
        if ext == 'jpg' or ext == 'png':
            assert 0 == os.system(
                "convert -adaptive-resize 2500x2500 %s %s" % (
                    inputfile, outputfile))
        else:
            ext = 'pdf'
            r = ("pdftoppm -f 0 -l 0 -scale-to 2500 -png -freetype yes "
                 "-aa yes -aaVector yes %s >%s" % (inputfile, outputfile))
            print "rendering", r
            assert 0 == os.system(r)

    ret['image'] = blobname + "." + cksum + ".png"
    fetchdata.getcreate_derived_data_raw(
        path, outpath, render, "png", country=country)
    fspath = fetchdata.getdatafilename(path, country=country)
    sizepts = None
    # Look for the "Page size: W x H pts" line in pdfinfo output (".age"
    # matches both "Page" and "page").
    for line in os.popen("pdfinfo " + fspath):
        m = re.match(
            r"\s*.age\s+size:\s*(\d+\.?\d*)\s*x\s*(\d+\.?\d*)\s*pts.*", line)
        if m:
            sizepts = (float(m.groups()[0]), float(m.groups()[1]))
    if sizepts:
        # 1 pt = 25.4/72 mm ~= 0.3527 mm
        sizemm = (0.3527 * sizepts[0], 0.3527 * sizepts[1])
        ret['mapsize'] = sizemm
        print "Mapsize:", sizemm
    else:
        raise Exception("No size of this PDF!")
    outpath2 = os.path.join(tmppath, blobname + "." + cksum + ".2.png")

    def greyscale(input, output):
        assert 0 == os.system(
            "convert -define png:color-type=3 -depth 8 -type Palette "
            "-define \"png:compression-level=9\" %s %s" % (input, output))

    fetchdata.getcreate_local_data_raw(outpath, outpath2, greyscale)
    i = Image.open(outpath2)
    width, height = i.size
    #ret['width']=page.width
    #ret['height']=page.height
    ret['render_width'] = width
    ret['render_height'] = height
    if country != 'raw':
        icao_prefix = get_icao_prefix(country)
        assert icao.startswith(icao_prefix)
    for level in xrange(5):
        hashpath = os.path.join(
            tmppath, "%s.%s-%d.bin" % (blobname, cksum, level))
        fetchdata.getcreate_local_data_raw(
            outpath2, hashpath,
            # Bind level at definition time, in case the callback is stored
            # and invoked after the loop has moved on.
            lambda input, output, level=level: chop_up(input, output, level))
    ret['blobname'] = blobname
    ret['variant'] = variant
    return ret

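# parse_landing_chart converts the pdfinfo page size from points to
# millimetres using 1 pt = 25.4/72 mm ~= 0.3527 mm.  The same parsing in
# isolation, for reference (pdf_page_size_mm is a hypothetical helper; the
# "Page size: W x H pts" line format of pdfinfo output is assumed):
def pdf_page_size_mm(pdfinfo_output):
    for line in pdfinfo_output.splitlines():
        m = re.match(
            r"\s*.age\s+size:\s*(\d+\.?\d*)\s*x\s*(\d+\.?\d*)\s*pts.*", line)
        if m:
            return (0.3527 * float(m.group(1)), 0.3527 * float(m.group(2)))
    return None
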
def mlr_test(m, epo):
    """A bare-bones implementation of the MLR (mixed logistic regression)
    model.  The specialized parameter update under L1 / L2,1 regularization
    from the paper is not implemented; plain L1 and L2 penalty terms are
    simply added to the loss.  The dataset is rather rough -- and how much
    difference is there between L2,1 and plain L2 regularization anyway?

    @param m: number of mixture components (the structural prior)
    @param epo: number of training epochs
    @return
    """
    learning_rate = 0.3
    x = tf.placeholder(tf.float32, shape=[None, 108])  # feature input
    y = tf.placeholder(tf.float32, shape=[None])       # label input
    u = tf.Variable(tf.random_normal([108, m], 0.0, 0.5), name='u')  # gating weights
    w = tf.Variable(tf.random_normal([108, m], 0.0, 0.5), name='w')  # per-component LR weights
    U = tf.matmul(x, u)
    p1 = tf.nn.softmax(U)  # structural prior: soft assignment to components
    W = tf.matmul(x, w)
    p2 = tf.nn.sigmoid(W)  # per-component prediction
    # Mixture prediction: component predictions weighted by the prior and
    # summed; this is already a probability.
    pred = tf.reduce_sum(tf.multiply(p1, p2), 1)
    paras = tf.concat([w, u], 0)
    l1_loss = tf.contrib.layers.l1_regularizer(0.1)(paras)  # L1 penalty
    l2_loss = tf.contrib.layers.l2_regularizer(0.1)(paras)  # L2 penalty
    # Since pred is a probability, use log loss directly.  (The flattened
    # original passed it to sigmoid_cross_entropy_with_logits, which would
    # squash it through a sigmoid a second time.)
    cost1 = tf.losses.log_loss(labels=y, predictions=pred) + l1_loss + l2_loss
    cost = tf.add_n([cost1])
    train_op = tf.train.FtrlOptimizer(learning_rate).minimize(cost)
    train_x, train_y, test_x, test_y = getdata()
    time_s = time.time()
    result = []
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(0, epo):
            f_dict = {x: train_x, y: train_y}
            _, cost_, predict_ = sess.run([train_op, cost, pred],
                                          feed_dict=f_dict)
            auc = roc_auc_score(train_y, predict_)
            time_t = time.time()
            if epoch % 100 == 0:
                # Evaluate only; the flattened original also ran train_op
                # here, which would have trained on the test set.
                f_dict = {x: test_x, y: test_y}
                cost_, predict_test = sess.run([cost, pred], feed_dict=f_dict)
                test_auc = roc_auc_score(test_y, predict_test)
                print("%d %ld cost:%f, train_auc:%f, test_auc:%f"
                      % (epoch, (time_t - time_s), cost_, auc, test_auc))
                result.append([epoch, (time_t - time_s), auc, test_auc])
    pd.DataFrame(result, columns=['epoch', 'time', 'train_auc', 'test_auc']
                 ).to_csv("data/mlr_" + str(m) + '.csv')

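# A hypothetical way to run the two experiments above; the epoch count and
# the sweep over m are made up, since the original file does not show how
# they are invoked.  Both functions write their CSV results under data/.
if __name__ == "__main__":
    if not os.path.exists("data"):
        os.makedirs("data")
    tf.reset_default_graph()
    lr_test(1000)                 # plain LR baseline -> data/lr.csv
    for m in (4, 8, 12):          # sweep the number of MLR components
        tf.reset_default_graph()  # start each run from a fresh graph
        mlr_test(m, 1000)         # -> data/mlr_<m>.csv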