Example No. 1
def crawler_pages(district, url):
    # print district, url
    try:
        html = httptool.getResponseHtml(url)

        soup = BeautifulSoup(html)

        # pagination text looks like 'current/total'; take the total page count after the slash
        totalPages = int(
            soup.find(name="span", attrs={
                'class': 'fy_text'
            }).getText().split('/')[1])
        # alternative: derive the page count from the total record count
        #   totalNumber = int(soup.find(name="span", attrs={'class':'number orange'}).getText())
        #   totalPages = (totalNumber / 20) if totalNumber % 20 == 0 else (totalNumber / 20 + 1)

        # loop over the pages to collect the community list

        shequlist = []
        for pn in range(totalPages):
            parts = url.split('_')
            parts[-3] = str(pn + 1)
            url = '_'.join(parts)
            print url
            shequlist = shequlist + crawl_page(district, url)
            time.sleep(random.randint(1, 3))

        return shequlist

    except Exception, e:
        print "Exception in crawler_pages ", e
        return []
Example No. 2
def fetch_detail(dt, tik, counter):

    url = 'http://data.eastmoney.com/stock/lhb,#dt#,#tik#.html'.replace(
        '#dt#', dt).replace('#tik#', tik)

    html = httptool.getResponseHtml(url)

    sleeptime = random.randint(1, 3)
    time.sleep(sleeptime)
    print dt, tik, sleeptime, counter

    if html is None:
        print 'bad response :', dt, tik
        return [], []

    soup = BeautifulSoup(html)

    # tab-2 is the buy-side ranking table, tab-4 the sell-side one
    buy_tab = soup.find(name="table", attrs={'id': 'tab-2'})
    sell_tab = soup.find(name="table", attrs={'id': 'tab-4'})

    buy_rank = parse_table(buy_tab, dt, tik, 'buy')
    sell_rank = parse_table(sell_tab, dt, tik, 'sell')

    return buy_rank, sell_rank
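
The parse_table helper that turns each ranking table into rows is not included in this listing. A rough sketch of what it might do, assuming each table row is one trading seat and every result row is tagged with the date, ticker, and direction (the exact column layout is an assumption):

def parse_table(tab, dt, tik, direction):
    # Hypothetical sketch: flatten a ranking table into rows of cell text.
    rows = []
    if tab is None:
        return rows
    for tr in tab.findAll(name="tr"):
        cols = [td.getText().strip() for td in tr.findAll(name="td")]
        if cols:
            rows.append([dt, tik, direction] + cols)
    return rows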
Example No. 3
def fetch_winner_list2(dt='2016-12-30'):

    print 'start to get ', dt

    url = "http://datainterface3.eastmoney.com//EM_DataCenter_V3/api/LHBGGDRTJ/GetLHBGGDRTJ?tkn=eastmoney&mkt=0&dateNum=&startDateTime=#{dt}&endDateTime=#{dt}&sortRule=1&sortColumn=&pageNum=1&pageSize=200&cfg=lhbggdrtj".replace(
        '#{dt}', dt)

    result = httptool.getResponseHtml(url)
    '''
    {"Message":"","Status":0,"Data":[{"TableName":"RptLhbXQMap","TotalPage":1,"ConsumeMSecond":0,"SplitSymbol":"|",
    "FieldName":"SCode,SName,ClosePrice,Chgradio,Dchratio,JmMoney,Turnover,Ntransac,Ctypedes,Oldid,Smoney,BMoney,ZeMoney,Tdate,JmRate,ZeRate,Ltsz,Rchange1dc,Rchange1do,
    Rchange2dc,Rchange2do,Rchange3dc,Rchange3do,Rchange5dc,Rchange5do,Rchange10dc,Rchange10do,Rchange15dc,Rchange15do,Rchange20dc,Rchange20do,Rchange30dc,Rchange30do,
    Rchange1m,Rchange3m,Rchange6m,Rchange1y,SumCount,JGBSumCount,JGSSumCount,JGBMoney,JGSMoney,JGJMMoney,DP",
    "Data":["000538|云南白药|76.15|9.9957||9084847.3|161382715|2119274|日涨幅偏离值达到7%的前五只证券|2445780|96344827.7|105429675|201774502.7|2016-12-30|5.63|125.03|79302033544.5|||||||||||||||||||||||||||实力游资买入,成功率72.18%",
    "000612|焦作万方|11.55|8.0449||210253710.31|850814366|75774853|日涨幅偏离值达到7%的前五只证券|2445781|165166791.68|375420501.99|540587293.67|2016-12-30|24.71|63.54|11448045389.55|||||||||||||||||||||||||||买一主买,成功率42.69%",
    "000635|英力特|30.72|-9.9912||-64527768.79|996865377|31485498|日跌幅偏离值达到7%的前五只证券|2445782|110720856.95|46193088.16|156913945.11|2016-12-30|-6.47|15.74|9310851133.44|||||||||||||||||||||||||||实力游资卖出,成功率11.67%"
    '''

    result = json.loads(result)

    seculist = result['Data'][0]['Data']

    lhb = []
    related_securities = []

    counter = 0
    for secuinfo in seculist:
        sf = secuinfo.split('|')
        secu = str(sf[0])
        name = sf[1]
        close = sf[2]
        chg = sf[3]  # price change percentage
        dp = sf[-1]  # commentary on the move
        jm = sf[5]  # net buy amount on the winners list (龙虎榜); needs 1000
        mr = sf[11]  # winners-list buy amount
        mc = sf[10]  # winners-list sell amount
        ze = sf[12]  # winners-list turnover
        turn = sf[6]  # total market turnover
        jmrate = sf[14]  # net buy amount as a share of total turnover
        zerate = sf[15]  # winners-list turnover as a share of total turnover
        turn_rate = sf[4]  # turnover rate
        ltsz = sf[17]  # free-float market value
        list_reason = sf[8]  # reason for making the list

        lhb.append([
            dt, secu,
            name.encode('GBK'), close, chg,
            dp.encode('GBK'), jm, mr, mc, ze, turn, jmrate, zerate, turn_rate,
            ltsz,
            list_reason.encode('GBK')
        ])

        counter = counter + 1
        buy, sell = fetch_detail(dt, secu, counter)
        related_securities.extend(buy)
        related_securities.extend(sell)

    return lhb, related_securities
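
A minimal usage sketch, assuming the caller just wants to persist both returned lists as CSV files (the file names and csv-based persistence are assumptions, not part of the original):

import csv

def save_winner_list(dt='2016-12-30'):
    # Hypothetical caller of fetch_winner_list2; file names are made up.
    lhb, related = fetch_winner_list2(dt)
    with open('lhb_%s.csv' % dt, 'wb') as f:
        csv.writer(f).writerows(lhb)
    with open('lhb_detail_%s.csv' % dt, 'wb') as f:
        csv.writer(f).writerows(related)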
Example No. 4
def crawlerjob(url):

    try:
        html = httptool.getResponseHtml(url)

        soup = BeautifulSoup(html)
        list_dist = soup.find(name="li", attrs={'id': 'hlist_21'})
        dist = list_dist.findAll(name="a", attrs={'class': ''})
        # loop to get district
        for e in dist:
            crawler_shangquan(e.getText(), url, e.get('href'))
            time.sleep(random.randint(1, 10))

    except Exception, e:
        print "Exception in crawlerjob ", e
Example No. 5
def crawl_page(district, url):
    try:
        html = httptool.getResponseHtml(url)

        soup = BeautifulSoup(html)

        houselistdom = soup.find(name="ul", attrs={'id': 'houselist'})
        houselist = houselistdom.findAll('dt')
        shequlist = []
        for h in houselist:
            shequlist.append(
                h.getText().replace('[', ',').replace(']', ',').split(','))
        return shequlist

    except Exception, e:
        print "Exception in crawl_page ", e
        return []
Example No. 6
def fetch_winner_list():

    dt = '2016-12-30'
    url = 'http://data.eastmoney.com/stock/tradedetail/#dt#.html'.replace(
        '#dt#', dt)

    html = httptool.getResponseHtml(url)

    data_re = re.compile('var data_tab_1=(.*?);')

    # default_tab = soup.findAll(data_re)
    default_tab = data_re.findall(html)

    result = json.loads(unicode(default_tab[0], 'GBK'))

    # {u'Rchange1do': u'', u'Chgradio': u'9.9957', u'Rchange3m': u'9.99566662', u'Rchange10do': u'', u'Rchange1dc': u'',
    # u'JD': u'\u5b9e\u529b\u6e38\u8d44\u4e70\u5165\uff0c\u6210\u529f\u738772.18%', u'Rchange20dc': u'', u'Rchange10dc': u'', u'Rchange5do': u'',
    # u'ZeRate': u'125.03', u'Rchange20do': u'', u'Rchange1y': u'3.41536031', u'Rchange5dc': u'', u'JGSMoney': u'', u'JmMoney': u'9084847.3',
    # u'Ctypedes': u'\u65e5\u6da8\u5e45\u504f\u79bb\u503c\u8fbe\u52307%\u7684\u524d\u4e94\u53ea\u8bc1\u5238', u'Rchange2do': u'',
    # u'JGBMoney': u'', u'Rchange3do': u'', u'Rchange30do': u'', u'Ntransac': u'2119274', u'Oldid': u'2445780', u'Rchange15dc': u'', u'Rchange15do': u'',
    # u'Turnover': u'161382715', u'Rchange3dc': u'', u'Rchange2dc': u'', u'JmRate': u'5.63', u'ClosePrice': u'76.15', u'SName': u'\u4e91\u5357\u767d\u836f',
    # u'Rchange6m': u'18.68765586', u'Tdate': u'2016-12-30', u'Rchange1m': u'9.99566662', u'SCode': u'000538', u'Smoney': u'96344827.7', u'Bmoney': u'105429675',
    # u'ZeMoney': u'201774502.7', u'Dchratio': u'0.204', u'JGSSumCount': u'', u'DP': u'\u5b9e\u529b\u6e38\u8d44\u4e70\u5165\uff0c\u6210\u529f\u738772.18%',
    # u'JGBSumCount': u'', u'SumCount': u'', u'Ltsz': u'79302033544.5', u'Rchange30dc': u'', u'JGJMMoney': u''}
    for tik in result['data']:
        secu = tik['SCode']
        name = tik['SName']
        close = tik['ClosePrice']
        chg = tik['Chgradio']
        dp = tik['DP']  # commentary on the move
        jm = tik['JmMoney']  # net buy amount on the winners list (龙虎榜); needs 1000
        mr = tik['Bmoney']  # winners-list buy amount
        mc = tik['Smoney']  # winners-list sell amount
        ze = tik['ZeMoney']  # winners-list turnover
        turn = tik['Turnover']  # total market turnover
        jmrate = tik['JmRate']  # net buy amount as a share of total turnover
        zerate = tik['ZeRate']  # winners-list turnover as a share of total turnover
        turn_rate = tik['Dchratio']  # turnover rate
        ltsz = tik['Ltsz']  # free-float market value
        list_reason = tik['Ctypedes']  # reason for making the list

        fetch_detail(dt, secu, 1)  # fetch_detail expects a counter as its third argument
        print secu, name, close, chg, dp, jm, mr, mc, ze, turn, jmrate, zerate, turn_rate, ltsz, list_reason
        break  # only the first record is processed here
Example No. 7
def crawler_shangquan(district, baseurl, href):

    try:
        html = httptool.getResponseHtml(baseurl.replace('/housing/', href))

        soup = BeautifulSoup(html)
        list_shangquan = soup.find(name="li", attrs={'class': 'shangquan'})
        shq = list_shangquan.findAll(name="a", attrs={'class': ''})
        # loop over the business districts (shangquan)
        shequlist = []
        for e in shq:
            shequlist = shequlist + crawler_pages(
                e.getText(), baseurl.replace('/housing/', e.get('href')))
            time.sleep(random.randint(1, 5))

        write_to_excel(district, shequlist)

    except Exception, e:
        print "Exception in crawler_shangquan ", e
Example No. 8
def craw_investor_info(investorid, url):

    invests = []
    try:
        html = httptool.getResponseHtml(baseurl + url)  # baseurl: module-level constant, not shown in this listing

        soup = BeautifulSoup(html)

        investinfo_html = soup.find(name="table",
                                    attrs={'class': 'detailsList mTop'})
        investinfo_rows = investinfo_html.findAll(name="tr", recursive=False)

        size = len(investinfo_rows)

        if size > 1:
            investinfo_rows = investinfo_rows[1:size]
            for row in investinfo_rows:
                cols = row.findAll(name="td", recursive=False)
                t2 = cols[2].find(name="table",
                                  attrs={'class': 'detailsList mTop'})
                group_rows = t2.findAll(name="tr", recursive=False)
                for grouprow in group_rows:
                    group_cols = grouprow.findAll(name="td", recursive=False)
                    t3 = group_cols[3].find(
                        name="table", attrs={'class': 'detailsList mTop'})
                    invest_times = t3.findAll(name="tr", recursive=False)
                    for inv in invest_times:
                        inv_cols = inv.findAll(name="td", recursive=False)
                        investinfo = [investorid]
                        investinfo.append(group_cols[0].getText())
                        investinfo.append(group_cols[1].getText())
                        investinfo.append(group_cols[2].getText())
                        investinfo.append(inv_cols[0].getText())
                        investinfo.append(inv_cols[1].getText())
                        investinfo.append(inv_cols[2].getText())
                        invests.append(list(investinfo))

    except Exception, e:
        print "Exception in craw_investor_info ", e

    return invests
Example No. 9
def crawlcompany(companyid, company, url):

    baseinfo = []
    investors = []
    investorinfo = []
    changeitems = []
    members = []
    punishs = []
    try:
        html = httptool.getResponseHtml(url)

        soup = BeautifulSoup(html)

        if company == '_':
            name = soup.find(name="h2")
            company = name.getText().replace(' ', '').strip()

        #=======================================================================
        # baseinfo
        #=======================================================================
        profile_html = soup.find(name="table",
                                 attrs={'class': 'detailsList mTop'})
        profile_info = profile_html.findAll(name="td", attrs={'class': 'left'})

        baseprofile = [companyid, company]
        for e in profile_info:
            baseprofile.append(e.getText())
        baseinfo.append(list(baseprofile))

        print 'start to crawl investors'
        #=======================================================================
        # investors
        #=======================================================================
        investor_html = soup.find(name="table",
                                  attrs={
                                      'class': 'detailsList mTop',
                                      'id': 'investor'
                                  })
        investor_rows = investor_html.findAll(name="tr")

        size = len(investor_rows)
        if size > 3:
            investor_rows = investor_rows[2:size - 1]
            rowid = 1
            for row in investor_rows:
                cols = row.findAll(name="td")
                investor = [companyid]
                investorid = companyid + '_' + str(rowid)
                investor.append(investorid)
                for i in range(4):
                    investor.append(cols[i].getText())
                investors.append(list(investor))

                alink = cols[4].find(name="a", attrs={'target': '_blank'})
                investorinfo = investorinfo + craw_investor_info(
                    investorid, alink.get('href'))

                rowid = rowid + 1

        print 'start to crawl changeitems'
        #=======================================================================
        # changeitems
        #=======================================================================
        changeitems_html = soup.find(name="table",
                                     attrs={
                                         'class': 'detailsList mTop',
                                         'id': 'changeItem'
                                     })
        changeitems_rows = changeitems_html.findAll(name="tr")

        size = len(changeitems_rows)
        if size > 3:
            changeitems_rows = changeitems_rows[2:size - 1]
            for row in changeitems_rows:
                cols = row.findAll(name="td")
                changeinfo = [companyid]
                for i in range(4):
                    changeinfo.append(cols[i].getText())
                changeitems.append(list(changeinfo))

        print 'start to crawl members'
        #=======================================================================
        # members
        #=======================================================================
        members_html = soup.find(name="table",
                                 attrs={
                                     'class': 'detailsList mTop',
                                     'id': 'member'
                                 })
        members_rows = members_html.findAll(name="tr")

        size = len(members_rows)
        if size > 3:
            members_rows = members_rows[2:size - 1]
            for row in members_rows:
                cols = row.findAll(name="td")
                memberinfo = [companyid]
                for i in range(3):
                    memberinfo.append(cols[i].getText())
                members.append(list(memberinfo))
                if len(cols) == 6:
                    memberinfo = [companyid]
                    for i in range(3):
                        memberinfo.append(cols[i + 3].getText())
                    members.append(list(memberinfo))

        print 'start to crawl punish records'
        #=======================================================================
        # punish
        #=======================================================================
        punish_div = soup.find(name="div", attrs={'id': 'punishDiv'})
        punish_html = punish_div.find(name="table",
                                      attrs={'class': 'detailsList mTop'})
        punish_rows = punish_html.findAll(name="tr")

        size = len(punish_rows)
        if size > 2:
            punish_rows = punish_rows[2:size]
            for row in punish_rows:
                cols = row.findAll(name="td")
                punishinfo = [companyid]
                for i in range(7):
                    punishinfo.append(cols[i].getText())
                punishs.append(punishinfo)

    except Exception, e:
        print "Exception in crawlcompany ", e

    return baseinfo, investors, investorinfo, changeitems, members, punishs