import os
import time
import logging
import pandas as pd

# data_dict, save_file_name, url_opener, make_dir and save_df_date are the
# project's own helpers (not defined in this section)
def download_volume_price_distr(stock_index, now_dt):
    save_dir = data_dict.get("volume_price_distr")
    dir_name1 = os.path.join(save_dir, stock_index)
    out_file_name = save_file_name(dir_name1, stock_index, now_dt)
    if stock_index[0] == '6':
        start_string = 'sh'
    else:
        start_string = 'sz'
    html1 = "https://vip.stock.finance.sina.com.cn/quotes_service/view/cn_price.php?symbol=" + start_string + stock_index
    soup2 = url_opener(html1)
    soup_out = soup2.findAll('td')  # each <td> holds either a price or a volume
    data1 = []
    for i in soup_out:
        try:
            a1 = float(i.get_text())
            data1.append(a1)
        except ValueError as e:
            # non-numeric cells (headers etc.) are skipped
            logging.error(e)
    # cells alternate price / volume; pair them into rows
    out_list = []
    for i in range(0, len(data1), 2):
        b = data1[i:i + 2]
        out_list.append(b)
    df2 = pd.DataFrame(out_list)
    df2.columns = ["price", "vol"]
    df2["stock_index"] = stock_index
    df2["dt"] = now_dt
    make_dir(dir_name1)
    save_df_date(dir_name1, stock_index, df2, now_dt)
    time.sleep(3.5)
    return df2
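Every example in this section leans on a project helper, url_opener, whose definition is not shown. A minimal sketch of what it plausibly does, assuming requests and BeautifulSoup4; the real helper may differ in headers, retries, or encoding handling:

import requests
from bs4 import BeautifulSoup

def url_opener(url):
    # hypothetical stand-in for the project's url_opener helper
    headers = {"User-Agent": "Mozilla/5.0"}  # assumption: browser UA to avoid being blocked
    resp = requests.get(url, headers=headers, timeout=10)
    resp.encoding = resp.apparent_encoding  # Sina/163 pages are often GBK-encoded
    return BeautifulSoup(resp.text, "html.parser")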
Example #2
def download_volume_price_distr(stock_index):
    if stock_index[0] == '6':
        start_string = 'sh'
    else:
        start_string = 'sz'
    html1 = "https://vip.stock.finance.sina.com.cn/quotes_service/view/cn_price.php?symbol=" + start_string + stock_index
    soup2 = url_opener(html1)

    soup_out = soup2.findAll('td')  # each <td> holds either a price or a volume

    data1 = []
    for i in soup_out:
        try:
            a1 = float(i.get_text())
            data1.append(a1)
        except ValueError:
            pass  # skip non-numeric cells

    out_list = []
    for i in range(0, len(data1), 2):
        b = data1[i:i + 2]
        out_list.append(b)
    df2 = pd.DataFrame(out_list)
    df2.columns = ["price", "vol"]
    df2.to_csv(stock_index + ".csv", index=False)
    return df2
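A usage sketch for this standalone variant; the tickers are illustrative ('6…' codes map to Shanghai, everything else to Shenzhen):

df_sh = download_volume_price_distr("600000")  # prefixed 'sh' by the branch above
df_sz = download_volume_price_distr("000001")  # prefixed 'sz'
print(df_sh.head())  # columns: price, vol; also saved to 600000.csv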
Example #3
import re

def get_page_num(html1):
    soup2 = url_opener(html1)
    a1 = soup2.get_text()
    reg = r'pages(.{3})'  ## capture the 3 characters after 'pages'
    wordreg = re.compile(reg)
    a2 = re.findall(wordreg, a1)
    page_num = a2[0][2]  # third character of the capture, i.e. the digit after 'pages":'
    return page_num
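The indexing at the end is easy to misread: re.findall returns the captured groups, so a2[0] is the three characters after 'pages' and a2[0][2] is the third of them. A worked example on a made-up line of page text (the exact embedded string is an assumption, and only a single-digit page count survives this slicing):

import re

a1 = 'var data = {"num":87,"pages":5};'  # hypothetical inline JavaScript from the page
a2 = re.findall(r'pages(.{3})', a1)      # -> ['":5']
page_num = a2[0][2]                      # -> '5'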
Example #4
def get_html_table(url1):
    '''
    return: the html table, and the row indices of the table
    '''
    soup2 = url_opener(url1)
    table = soup2.find_all('tbody')[1]  # the second <tbody> on the page
    new_table_index = [x for x in range(0, len(table.find_all('tr')))]
    return table, new_table_index
Example #5
def url_to_df(html1):
    soup2 = url_opener(html1)
    a2 = soup2.find_all('table')
    a3 = a2[0].find_all('tr')
    data_list_all = []
    for i in range(2, len(a3)):  # skip the two header rows
        a5 = a3[i].get_text()
        data_list = [x.strip() for x in a5.split("\r\n")]
        data_list = [x for x in data_list if x != '']
        data_list_all.append(data_list)
    df1 = pd.DataFrame(data_list_all)
    df1.dropna(axis=0, how='any', inplace=True)
    return df1
Example #6
def get_html_table(html1):
    '''
    get the table from an html page
    input:  an html url
    output: table:           the first table on the page
            new_table_index: the row indices of the table
    '''
    #    html1 = "http://app.finance.ifeng.com/hq/all_stock_bill.php"
    soup2 = url_opener(html1)
    table = soup2.find_all('table')[0]  # Grab the first table
    new_table_index = [x for x in range(0, len(table.find_all('tr')))]
    return table, new_table_index
Example #7
import pandas as pd

def download_html_to_df(html1):
    '''
    download an html page into a DataFrame
    @input:  an html url
    @output: a pandas DataFrame
    '''
    soup2 = url_opener(html1)
    soup_out = soup2.findAll('a', href=True)
    ## collect dates like '2018-10-30' from anchors whose href starts with '#2'
    dates_in = [
        dates_str['href'][1:11] for dates_str in soup_out
        if dates_str['href'].startswith("#2")
    ]
    tr_str = soup2.findAll('tr')
    #------------------------------------------------------------#
    #--------- parse rows into data ------------------------------#
    data = []
    publish_date = '2099-01-01'  # default until a 公告日期 (announcement date) row is seen
    for a1 in tr_str:
        cols = a1.findAll('td')
        cols = [ele.text.strip() for ele in cols]
        if len(cols) == 2 and cols[0] == '公告日期':
            # a two-cell row carries the announcement date for the owner rows below it
            publish_date = cols[1]
        if len(cols) == 5:
            data.append([ele for ele in cols if ele] + [publish_date])
    data1 = pd.DataFrame(data)
    data1.columns = [
        'index_in', 'owner_name', 'amount', 'ratio', 'character',
        'publish_date'
    ]
    #--------------------------------------------------------------------#
    #------- align the announcement dates with the data rows ------------#
    date_all = []
    k = -1
    for index1 in data1.index_in:
        if index1 == '1':  # row numbering restarts at '1' for each disclosure block
            k += 1
        dates_in1 = dates_in[k]
        date_all.append(dates_in1)
    data1['date'] = date_all
    return data1
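A usage sketch; some_owner_page_url stands in for the kind of shareholder-disclosure page this parser expects (two-cell 公告日期 rows followed by five-cell owner rows) and is not a real URL from the source:

df = download_html_to_df(some_owner_page_url)  # some_owner_page_url: hypothetical stand-in
print(df[['owner_name', 'ratio', 'publish_date', 'date']].head())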
Example #8
def download_data(stock_index, year, season):
    url = 'http://quotes.money.163.com/trade/lsjysj_%s.html?year=%s&season=%s'%(stock_index,str(year),str(season))
    soup1 = url_opener(url)
    table = soup1.find_all('table',
            attrs={'class': 'table_bg001 border_box limit_sale'})
    all_tr = table[0].findAll('tr')
    str_in = []
    for i in all_tr:
        a1 = i.find_all('td')
        a2 = [x.get_text() for x in a1]
        str_in.append(a2)
    df1 = pd.DataFrame(str_in)
    df1 = df1.iloc[1:]  # drop the header row
    df1['stock_index'] = stock_index
    return df1
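A usage sketch; 163.com serves trading history by quarter, so season runs 1-4 (the ticker and year here are illustrative):

df_q1 = download_data('600000', 2020, 1)  # Q1 2020 daily history for 600000
print(df_q1.head())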
Example #9
def download_owner(url1):
    ## get the table in web
    soup1 = url_opener(url1)
    t1 = soup1.find_all("table", attrs={'id': 'Table1'})
    tab1 = t1[0]
    t3 = tab1.find_all('tr')
    # rows 0-2 are header rows; the data rows start at index 3
    data = []
    for t4 in t3[3:]:
        t5 = t4.find_all('td')
        cols = [ele.text.split(' ')[0] for ele in t5]
        cols[0] = cols[0].replace("\t", "")
        data.append([ele for ele in cols if ele])
    return data
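download_owner returns a plain list of row lists rather than a DataFrame; a minimal sketch of wrapping it, where owner_url is a hypothetical page containing a table with id 'Table1':

import pandas as pd

rows = download_owner(owner_url)  # owner_url: hypothetical, not from the source
df_owner = pd.DataFrame(rows)     # column names depend on the target page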
Example #10
def find_url(stk_num):
    html1 = "http://vip.stock.finance.sina.com.cn/q/go.php/vReport_List/kind/search/index.phtml?symbol=%s&t1=all" % str(stk_num)
    soup2 = url_opener(html1)
    a1 = soup2.find_all('td', attrs={'class': 'tal f14'})
    return a1
Example #11
def get_html_table(html1):
    #    html1 = "http://app.finance.ifeng.com/hq/all_stock_bill.php"
    soup2 = url_opener(html1)
    table = soup2.find_all('table')[0]  # Grab the first table
    new_table_index = [x for x in range(0, len(table.find_all('tr')))]
    return table, new_table_index
Example #12
from davidyu_cfg import *
from functions.connect_url import url_opener

h1 = url_opener("http://snap.stanford.edu/class/cs224w-2012/handouts.html")

a1 = h1.findAll('a', href=True)  # only anchors that actually carry an href
for a2 in a1:
    a3 = a2.get('href')
    if '.pdf' in a3:
        print(a3)
Example #13
# (fragment: the tail of a content_get() helper that returns (date, textOut),
#  decoding the page text from gb2312)
    else:
        textOut = text1[0].get_text().encode('latin1', "ignore").decode('gb2312', "ignore")
    return date, textOut
from dir_control.data_dir_v1 import data_dict, stk_index_list
import time

stk_index_list = [x for x in stk_index_list if str(x).zfill(6)[0] != '3']  # drop codes starting with '3' (ChiNext)
k = 0
for stk in stk_index_list[0:2]:
    stk = str(stk).zfill(6)
    conts = []
    content1 = find_url(stk)
    k = 0
    for ss in content1:
        k += 1
        con_url = ss.find('a').get('href')
        soup2 = url_opener(con_url)
        date, textout = content_get(soup2)
        file1 = str(stk) + '_' + str(date) + '_' + str(k) + '.txt'
        with open(file1, 'w') as f:  # one article per file
            print(textout, file=f)
        print(k)
        time.sleep(2.5)

Example #14
def find_url(url):
    soup2 = url_opener(url)
    a1 = soup2.find_all(id='BalanceSheetNewTable0')
    soup1 = a1[0]
    return soup1
Example #15
def news_in_html_url(url):
    con_url = 'http:' + url.find('a').get('href')
    soup2 = url_opener(con_url)
    date, textout = content_get(soup2)
    return date, textout
Example #16
def find_url_content(self):
    # method lifted from a class (note the self parameter); self.html holds the url
    soup2 = url_opener(self.html)
    a1 = soup2.find_all('td', attrs={'class': 'tal f14'})
    return a1
Example #17
def get_html_table(url1):
    soup2 = url_opener(url1)
    table = soup2.find_all('tbody')[1]
    new_table_index = [x for x in range(0, len(table.find_all('tr')))]
    return table, new_table_index
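As an aside, the several get_html_table / url_to_df variants above walk <tr>/<td> nodes by hand; pandas.read_html can often do the same in one call. A minimal sketch (it needs lxml or html5lib installed, and whether it copes with each target page's encoding is untested here):

import pandas as pd

def url_to_df_via_read_html(url1):
    # parse every <table> on the page and keep the first,
    # mirroring soup2.find_all('table')[0] in the examples above
    tables = pd.read_html(url1)
    return tables[0]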