def get_reads_per_group(df, prefix, taxlevel='species', min_reads=10,
                        names=None):
    """
    Report read counts per taxonomic group and unique-taxa counts per read.

    Writes three output files (all prefixed with ``prefix``):
      * ``<prefix>_number_of_reads_in_<taxlevel>.tsv`` — unique reads (and,
        when a ``size`` column exists, total size) per taxonomic group;
      * ``<prefix>_Number_unique_<taxlevel>_per_read.tsv`` — number/list of
        unique taxa observed for each read;
      * ``<prefix>_List_unique_<taxlevel>.txt`` — groups whose unique-read
        count exceeds ``min_reads``.

    :param df: filtered blast dataframe
    :param prefix: prefix for the output file names
    :param taxlevel: taxonomic level (column name) to group by
    :param min_reads: minimum number of unique reads to retain a group
    :param names: optional list forwarded to ``report_any``
    :return: the dataframe after ``report_any`` has been applied row-wise
    """
    # Fix: avoid the shared mutable default argument (was ``names=[]``).
    if names is None:
        names = []
    # Fill in empty tax levels; report_any is defined elsewhere in this file.
    df = df.apply(report_any, args=(taxlevel, names,), axis=1)
    # Unique reads per taxonomic group; keep this Series aside so the
    # min_reads filter below always operates on counts, even after the
    # optional concat turns ``cou`` into a DataFrame.
    cou = df.groupby([taxlevel])['qseqid'].nunique()
    uniq = cou
    if 'size' in df.columns:
        size = df.groupby([taxlevel])['size'].sum()
        size.name = 'Total'
        cou.name = 'Unique'
        cou = pd.concat((cou, size), axis=1)
    cou.to_csv('%s_number_of_reads_in_%s.tsv' % (prefix, taxlevel), sep='\t')
    # Number (and list) of unique taxa per read, most ambiguous reads first.
    # Renamed from ``re`` to avoid shadowing the stdlib ``re`` module.
    per_read = pd.concat(
        [df.groupby('qseqid')[taxlevel].nunique().rename('No. unique taxa'),
         df.groupby('qseqid')[taxlevel].unique().rename('Unique taxa')],
        axis=1).sort_values(by='No. unique taxa', ascending=False)
    per_read.to_csv('%s_Number_unique_%s_per_read.tsv' % (prefix, taxlevel),
                    sep='\t')
    # Fix: filter on the unique-read counts Series. The original compared
    # the whole (possibly two-column) frame, which NaN-masks cells instead
    # of dropping rows, so no group was ever excluded when 'size' existed.
    sps = uniq[uniq > min_reads].index.unique().to_series()
    sps.to_csv('%s_List_unique_%s.txt' % (prefix, taxlevel), header=False,
               index=False)
    return df
def save_pattern_abnormal(self, path1, path2, path3):
    """
    Concatenate two header-less CSV files column-wise and write the result
    to ``path3`` as a space-separated file (no header, no index).

    :param path1: path of the first header-less CSV (e.g. pattern data)
    :param path2: path of the second header-less CSV (e.g. abnormal labels)
    :param path3: destination path for the combined file
    :return: None
    """
    left = pd.read_csv(path1, header=None)
    right = pd.read_csv(path2, header=None)
    combined = pd.concat([left, right], axis=1)
    combined.to_csv(path3, sep=' ', index=False, header=False)
# NOTE(review): this fragment begins mid-function — the enclosing ``def`` and
# the loop that builds ``dic_tmp`` are outside the visible chunk, so the
# indentation below is reconstructed and should be confirmed against the file.
        # Collect the per-stock price/growth metrics computed above
        # into one record.
        dic_tmp['count'] = count
        dic_tmp['price'] = meam_tmp
        dic_tmp['month_six_rs'] = month_six_rs
        dic_tmp['year_rs'] = year_rs
        dic_tmp['year'] = year
        dic_tmp['grow_6'] = grow_6
        dic_tmp['grow_3'] = grow_3
        dic_tmp['grow_3_1'] = grow_3_1
        dic_tmp['grow_3_2'] = grow_3_2
        dic_tmp['grow_3_3'] = grow_3_3
        dic_tmp['volume_start'] = vol_start
        dic_tmp['volume_end'] = vol_end
        dic_tmp['vol_radio'] = vol_radio
        li.append(dic_tmp)
        # Progress counter — presumably one increment per processed code.
        nu_nu = nu_nu + 1
    # One row per stock code.
    datatable = pd.DataFrame(li)
    return datatable


# --- module-level driver ---
# Fetch growth metrics for every code, outer-join onto the code table,
# drop duplicate rows and dump to a GBK-encoded, date-stamped CSV.
pan = get_laohu_price(url, li_code)
# NOTE(review): ``re`` shadows the stdlib ``re`` module if it is imported
# at the top of this file — consider renaming.
re = pd.merge(code, pan, how='outer', on='code')
re = re.drop_duplicates()
re.to_csv(date + '_Laohu_us_grow_rs.csv', encoding='gbk', index=False)
# cn_us=get_laohu_code(url_us, [i for i in range(14)])
# cn_us.to_csv(date+'us_us_code.csv', index=False, encoding ='gbk')
# frames=[cn_nsdq,cn_ny,cn_us]
# sum=pd.concat(frames,ignore_index=True)
# sum.to_csv(date+'us_all_code.csv', index=False, encoding ='gbk')
# NOTE(review): this fragment begins mid-function — the enclosing ``def`` and
# the loop that builds ``dic_tmp`` are outside the visible chunk, so the
# indentation below is reconstructed and should be confirmed against the file.
        # Collect the per-stock weekly growth/strength metrics computed
        # above into one record.
        dic_tmp['price_week_3'] = price_week_3
        dic_tmp['week_6_rs'] = week_6_rs
        dic_tmp['week_3_rs'] = week_3_rs
        dic_tmp['year'] = year
        dic_tmp['grow_self_6'] = grow_self_6
        dic_tmp['grow_self_3'] = grow_self_3
        dic_tmp['grow_radio_6'] = grow_radio_6
        dic_tmp['grow_radio_3'] = grow_radio_3
        dic_tmp['volume_6'] = vol_6
        dic_tmp['volume_3'] = vol_3
        dic_tmp['vol_radio'] = vol_radio
        li.append(dic_tmp)
        # Progress counter — presumably one increment per processed code.
        nu_nu = nu_nu + 1
    # One row per stock code.
    datatable = pd.DataFrame(li)
    return datatable


# --- module-level driver ---
# Fetch weekly metrics for every code, outer-join onto the code table,
# drop duplicate rows and dump to a GBK-encoded, date-stamped CSV.
pan = get_laohu_price(url, li_code)
# NOTE(review): ``re`` shadows the stdlib ``re`` module if it is imported
# at the top of this file — consider renaming.
re = pd.merge(code, pan, how='outer', on='code')
re = re.drop_duplicates()
re.to_csv(date + '_Laohu_us_week_grow_radio_rs_2.csv', encoding='gbk',
          index=False)
# cn_us=get_laohu_code(url_us, [i for i in range(14)])
# cn_us.to_csv(date+'us_us_code.csv', index=False, encoding ='gbk')
# frames=[cn_nsdq,cn_ny,cn_us]
# sum=pd.concat(frames,ignore_index=True)
# sum.to_csv(date+'us_all_code.csv', index=False, encoding ='gbk')
# NOTE(review): this fragment begins mid-function — the enclosing ``def`` and
# the loop that builds ``dic_tmp`` are outside the visible chunk, so the
# indentation below is reconstructed and should be confirmed against the file.
        # Collect the per-stock price statistics computed above into one
        # record.  NOTE(review): ``max`` and ``min`` on the right-hand side
        # appear to be locals shadowing the builtins — confirm upstream.
        dic_tmp['count'] = count
        dic_tmp['mean'] = meam_tmp
        dic_tmp['std'] = std_tmp
        dic_tmp['max'] = max
        dic_tmp['min'] = min
        dic_tmp['start'] = startprice
        dic_tmp['end'] = endd
        dic_tmp['year'] = year
        dic_tmp['price_start'] = price_start
        dic_tmp['price_middle'] = price_middle
        dic_tmp['price_end'] = price_end
        dic_tmp['volume_start'] = vol_start
        dic_tmp['volume_end'] = vol_end
        li.append(dic_tmp)
        # Progress counter — presumably one increment per processed code.
        nu_nu = nu_nu + 1
    # One row per stock code.
    datatable = pd.DataFrame(li)
    return datatable


# --- module-level driver ---
# Fetch price statistics for every code, outer-join onto the code table,
# drop duplicate rows and dump to a GBK-encoded, date-stamped CSV.
pan = get_laohu_price(url, li_code)
# NOTE(review): ``re`` shadows the stdlib ``re`` module if it is imported
# at the top of this file — consider renaming.
re = pd.merge(code, pan, how='outer', on='code')
re = re.drop_duplicates()
re.to_csv(date + '_Laohu_us_price.csv', encoding='gbk', index=False)
# cn_us=get_laohu_code(url_us, [i for i in range(14)])
# cn_us.to_csv(date+'us_us_code.csv', index=False, encoding ='gbk')
# frames=[cn_nsdq,cn_ny,cn_us]
# sum=pd.concat(frames,ignore_index=True)
# sum.to_csv(date+'us_all_code.csv', index=False, encoding ='gbk')
# NOTE(review): this fragment starts mid-function; ``html``, ``li``, ``hee``,
# ``n``, ``code_year`` etc. come from an enclosing scope that is not visible
# here, so the indentation below is reconstructed — confirm against the file.
    coun=len(code_year)
    # Extract the ROE table section from the page HTML; the pattern is
    # anchored on the page's labels for ROE and return-on-capital-employed.
    s = r'股東權益回報率(.*?)資本運用回報率'
    pat = re.compile(s)
    codd = pat.findall(html)
    if codd:
        # NOTE(review): redundant guard — findall never returns None and
        # ``codd`` is already truthy here; also ``not codd is None`` would be
        # better spelt ``codd is not None``.
        if not codd is None:
            # Pull the individual cell values out of the matched fragment.
            s = r'">(.*?)</td>'
            pat = re.compile(s)
            code_value = pat.findall(codd[0])
            print(code_value[:coun])
            # Pad every row with a default '0' before filling real values.
            for i in li:
                i.append('0')
            # Map each year's value into the row whose header year matches.
            for i in range(coun):
                for j in range(len(hee)):
                    if code_year[i][:4]==hee[j]:
                        li[j][n]=code_value[i]
    # Column index for the next stock code — placement relative to the
    # surrounding loop is assumed; verify in the original file.
    n=n+1


# Build the ROE table (years as rows), transpose to one row per code,
# join onto the code table and dump to a GBK-encoded, date-stamped CSV.
pdd=pd.DataFrame(li, columns=li_code, index=[
    'roe2013','roe2014','roe2015','roe2016','roe2017','roe2018'])
pan=pdd.T
pan['code'] = li_code
# NOTE(review): ``re`` was used as the regex module above and is rebound to
# a DataFrame here — consider renaming this variable.
re=pd.merge(code,pan,how='outer',on='code')
re.to_csv(date+'_aastocks_uk_roe.csv', encoding = 'gbk',index=False)
t = zong.loc[:,~zong.columns.duplicated()] # t=zong.drop(0,axis=0) date=time.strftime('%Y-%m-%d',time.localtime(time.time())) t.index.rename('code', inplace=True) t.reset_index(inplace=True) # t.code=t.code.str.replace('sz', '').replace('sh', '') return t # 获取股票代码列表 code= pd.read_excel('Data20190311.xls',encoding='gbk') # code = list(set(code)) listcode=code.code.tolist() Code_List=[] for item in listcode: if len(str(item)) == 6 and str(item)[0] == '6': Code_List.append('sh'+str(item)) if len(str(item)) < 6: Code_List.append('sz'+(6-len(str(item)))*'0'+str(item)) if len(str(item)) == 6 and str(item)[0] != '6': Code_List.append('sz'+str(item)) code.code=pd.Series(Code_List) # code = code[:10] t=get_income(url,Code_List) re=pd.merge(code, t, how='outer',on='code') re.to_csv(date+'_year_code_eastmoney.csv',encoding = 'gbk',index=False) # print(t)
# NOTE(review): this fragment begins mid-function — ``jo``, ``count``,
# ``dic_tmp``, ``li`` etc. come from the enclosing (unseen) scope, so the
# indentation below is reconstructed and should be confirmed against the file.
        # Mean close over the middle and last thirds of the window, and mean
        # volume over each half; vol_radio is the half-to-half volume change.
        price_middle=jo[int(count/3):int((count*2)/3)]['close'].mean()
        price_end=jo[int((count*2)/3):]['close'].mean()
        vol_start=jo[:int(count/2)]['volume'].mean()
        vol_end=jo[int(count/2):]['volume'].mean()
        # NOTE(review): divides by vol_start — raises/returns inf when the
        # first-half mean volume is 0; confirm inputs guarantee otherwise.
        vol_radio=(vol_end-vol_start)/vol_start
        # Collect the per-stock strength metrics into one record.
        dic_tmp['count']=count
        dic_tmp['price']=meam_tmp
        dic_tmp['month_six_rs']=month_six_rs
        dic_tmp['year_rs']=year_rs
        dic_tmp['year']=year
        dic_tmp['volume_start']=vol_start
        dic_tmp['volume_end']=vol_end
        dic_tmp['vol_radio']=vol_radio
        li.append(dic_tmp)
        # Progress counter — presumably one increment per processed code.
        nu_nu=nu_nu+1
    # One row per stock code.
    datatable=pd.DataFrame(li)
    return datatable


# --- module-level driver ---
# Fetch strength metrics for every code, outer-join onto the code table,
# drop duplicate rows and dump to a GBK-encoded, date-stamped CSV.
pan=get_laohu_price(url, li_code)
# NOTE(review): ``re`` shadows the stdlib ``re`` module if it is imported
# at the top of this file — consider renaming.
re=pd.merge(code, pan, how='outer',on='code')
re=re.drop_duplicates()
re.to_csv(date+'_Laohu_us_strength.csv', encoding = 'gbk',index=False)
# cn_us=get_laohu_code(url_us, [i for i in range(14)])
# cn_us.to_csv(date+'us_us_code.csv', index=False, encoding ='gbk')
# frames=[cn_nsdq,cn_ny,cn_us]
# sum=pd.concat(frames,ignore_index=True)
# sum.to_csv(date+'us_all_code.csv', index=False, encoding ='gbk')