def start(self):
        print 'start JRJ_Report'

        stockList = []
        file_name = 'annual.txt'

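        # annual.txt is assumed to hold one "<stock code>#<display name>" entry per line (inferred from the split('#') below)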
        with open(file_name, 'r') as read_file:
            for line in read_file.readlines():
                splits = line.split('#')
                code = str(splits[0])
                fieName = str(splits[1]).strip()
                print fieName
                stockList.append({'URL': code, 'NAME': fieName})

        for xx in stockList:

            for raw_front_page_index in range(1, 8):

                fileN = str(xx['NAME']).strip()
                uux = xx['URL']

                sdPath = '/ink/work/62/ink/{}'.format(fileN)

                Path.mkdir(sdPath)

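                # each istock.jrj.com.cn listing page (yanbao_<code>_p<page>.html) appears to hold the research-note links for one stock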
                url = u"http://istock.jrj.com.cn/yanbao_{}_p{}.html"

                request_url = url.format(uux, raw_front_page_index)
                content = Http.get_content(request_url)

                soup = BeautifulSoup(content, 'html.parser')

                list_p_list = soup.find_all('td', class_="left")

                for p in list_p_list:
                    # print p

                    list_pcyc_li = p.find_all('a')
                    for li in list_pcyc_li:
                        xxurl = li.get('href')
                        # print xxurl

                        if xxurl != 'http://istock.jrj.com.cn/list,yanbao.html':

                            time.sleep(1)
                            result = Http.get_content(xxurl)
                            result = unicode(str(result),
                                             'GBK').encode('UTF-8')

                            xxsoup = BeautifulSoup(result, 'html.parser')

                            # title_tationl = xxsoup.find_all('h1')
                            # tt = str(title_tationl[0].text).strip()

                            xxlist_p_list = xxsoup.find_all('p',
                                                            class_='title')[0]
                            xxlist_ds = xxsoup.find_all('span', class_='fr')[0]

                            realu = str(xxlist_p_list).replace(
                                str(xxlist_ds), '', 1)
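                            # drop the floating-right span from the title paragraph; the remaining poster/date text is split on spaces below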

                            realuxsoup = BeautifulSoup(realu, 'html.parser')

                            sp = str(realuxsoup.text).split(' ')

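                            # sp[1] is normally the post date; when an extra '发表于' token appears, the date sits one token later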
                            ttime = sp[1]

                            if '发表于' in ttime:
                                ttime = sp[2]

                            # print (sp[2]).text
                            # print (sp[3]).text

                            # print ttime

                            all_main = xxsoup.find_all('div', class_='main')[0]

                            realuxsoup = BeautifulSoup(str(all_main),
                                                       'html.parser')

                            reaupp = realuxsoup.find_all('p')

                            for pp in reaupp:
                                list_pcyc_li = pp.find_all('a')

                                for li in list_pcyc_li:
                                    print li.text
                                    ttt = li.get('href')

                                    print ttt

                                    fileName = u"{}_{}.pdf".format(
                                        ttime,
                                        str(li.text).replace('/', ""))

                                    print fileName

                                    basePath = '/ink/work/62/ink/{}/{}'.format(
                                        fileN, fileName)

                                    Path.mkdirAndPath(basePath)

                                    Debug.print_in_single_line(
                                        u'开始下载   {}'.format(ttt))
                                    if ttt:
                                        content = Http.get_content(url=ttt,
                                                                   timeout=180)
                                        if not content:
                                            Debug.logger.debug(
                                                u'pdf『{}』下载失败'.format(ttt))
                                            content = ''
                                        else:
                                            Debug.print_in_single_line(
                                                u'pdf {} 下载完成'.format(ttt))
                                    else:
                                        #   when the download link is empty, there is no need to download
                                        content = ''
                                    if len(content) > 10:
                                        with open(basePath, "wb") as pdf:
                                            pdf.write(content)
    def start(self):
        print 'start 东财研报'

        stockList = []
        file_name = 'annual.txt'

        with open(file_name, 'r') as read_file:
            for line in read_file.readlines():
                splits = line.split('#')
                code = str(splits[0])
                fieName = str(splits[1]).strip()
                print fieName
                stockList.append({'URL': code, 'NAME': fieName})

        for xx in stockList:
            for raw_front_page_index in range(1, 5):
                fileN = str(xx['NAME']).strip()
                uux = xx['URL']

                sdPath = '/ink/work/62/ink/{}'.format(fileN)
                Path.mkdir(sdPath)
                # url = u"http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=GGSR&js=var%20LhAYbcgn={%22data%22:[(x)],%22pages%22:%22(pc)%22,%22update%22:%22(ud)%22,%22count%22:%22(count)%22}&ps=25&p=1&code=000333&rt=51734025"

                burl = u"http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=GGSR&js=var%20LhAYbcgn={%22data%22:[(x)],%22pages%22:%22(pc)%22,%22update%22:%22(ud)%22,%22count%22:%22(count)%22}&ps=25&"
                uu = u"p={0}&code={1}&rt="

                url = '%s%s' % (burl, uu.format(raw_front_page_index, uux))

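                # the EM_DataCenter endpoint answers with JSONP-style "var LhAYbcgn={...}", so the JSON payload is taken from after the '=' below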
                content = Http.get_content(url)

                if content:
                    jsonD = str(content).split('=')[-1]

                    jdata = json.loads(jsonD)
                    articles = jdata['data']
                    for article in articles:
                        rticlet = article['datetime']

                        date_time = datetime.datetime.strptime(rticlet, '%Y-%m-%dT%H:%M:%S')
                        destU = u"http://data.eastmoney.com/report/{}/{}.html ".format(date_time.strftime('%Y%m%d'),
                                                                                       article['infoCode'])

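                        # the report detail page is served as GBK, hence the re-encode to UTF-8 before parsing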
                        result = Http.get_content(destU)
                        result = unicode(result, 'GBK').encode('UTF-8')

                        xxsoup = BeautifulSoup(result, 'html.parser')

                        title_tationl = xxsoup.find_all('h1')
                        tt = str(title_tationl[0].text).strip()

                        xxlist_p_list = xxsoup.find_all('div', class_='report-infos')[0]

                        sp = xxlist_p_list.find_all('span')

                        ttime = str((sp[1]).text)

                        date_time = datetime.datetime.strptime(ttime, '%Y年%m月%d日 %H:%M')

                        # print date_time.strftime('%Y-%m-%d')

                        ttime = date_time.strftime('%Y-%m-%d')

                        # print (sp[2]).text
                        # print (sp[3]).text

                        title = Match.replace_specile_chars(tt)
                        title = title.replace('/', '', 100)

                        fileName = u"{}_{}_{}_{}.pdf".format(ttime, (sp[2]).text, title, (sp[3]).text)
                        # date _ brokerage _ title _ author

                        print fileName

                        urlsp = sp[-1]

                        basePath = '{}/{}'.format(sdPath, fileName)

                        # print basePath

                        # create the folder

                        list_pcyc_li = urlsp.find_all('a')
                        for li in list_pcyc_li:
                            ttt = li.get('href')
                            Path.mkdirAndPath(basePath)
                            print ttt

                            Debug.print_in_single_line(u'开始下载   {}'.format(ttt))
                            if ttt:
                                content = Http.get_content(url=ttt, timeout=180)
                                if not content:
                                    Debug.logger.debug(u'pdf『{}』下载失败'.format(ttt))
                                    content = ''
                                else:
                                    Debug.print_in_single_line(u'pdf {} 下载完成'.format(ttt))
                            else:
                                #   when the download link is empty, there is no need to download
                                content = ''
                            if len(content) > 10:
                                with open(basePath, "wb") as pdf:
                                    pdf.write(content)
    def start(self):
        print 'start 东财股吧 研报'

        stockList = []
        file_name = 'annual.txt'

        with open(file_name, 'r') as read_file:
            for line in read_file.readlines():
                splits = line.split('#')
                code = str(splits[0])
                fieName = str(splits[1]).strip()
                print fieName
                stockList.append({'URL': code, 'NAME': fieName})

        for xx in stockList:
            for raw_front_page_index in range(1, 3):
                fileN = str(xx['NAME']).strip()
                uux = xx['URL']

                sdPath = '/ink/work/62/ink/{}'.format(fileN)
                Path.mkdir(sdPath)

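                # guba list URL: list,<stock code>,<tab>,f_<page>.html; tab 2 presumably selects the research-report posts (assumption)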
                burl = u"http://guba.eastmoney.com/list,{},2,f_{}.html"

                content = Http.get_content(burl.format(uux, raw_front_page_index))

                xxsoup = BeautifulSoup(content, 'html.parser')

                tagrt = xxsoup.find_all('div', id='articlelistnew')[0]

                ols = tagrt.find_all('div', class_='articleh normal_post')
                olss = tagrt.find_all('div', class_='articleh normal_post odd')

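                # collect both the plain and the "odd" (striped) post rows into one list before visiting each post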
                splicy = []

                for xxo in ols:
                    splicy.append(xxo)
                for xxo in olss:
                    splicy.append(xxo)

                for inkl in splicy:

                    try:

                        inklinkl = BeautifulSoup(str(inkl), 'html.parser')

                        spp = inklinkl.find_all('span', class_='l3')[0]

                        list_pcyc_li = spp.find_all('a')
                        for li in list_pcyc_li:
                            ttt = li.get('href')

                            print ttt

                            destU = u'http://guba.eastmoney.com{}'.format(ttt)

                            result = Http.get_content(destU)
                            # result = unicode(result, 'GBK').encode('UTF-8')

                            xxsoup = BeautifulSoup(result, 'html.parser')

                            title_tationl = xxsoup.find_all('div', id='zwconttbt')
                            tt = str(title_tationl[0].text).strip()
                            print tt

                            title = Match.replace_specile_chars(tt)
                            title = title.replace('/', '', 100)

                            title = title.replace('查看原文', '')

                            ttime = xxsoup.find_all('p', class_='publishdate')[0]

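                            # the last ten characters of the publish line are expected to be the YYYY-MM-DD date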
                            tttttime = str(ttime.text)[-10:]

                            print tttttime

                            date_time = datetime.datetime.strptime(tttttime, '%Y-%m-%d')

                            # print date_time.strftime('%Y-%m-%d')

                            ttime = date_time.strftime('%Y-%m-%d')

                            fileName = u"{}_{}.pdf".format(ttime, title)
                            # date _ title

                            print fileName

                            basePath = '{}/{}'.format(sdPath, fileName)

                            # print basePath

                            # create the folder

                            spx = xxsoup.find_all('span', class_='zwtitlepdf')[0]

                            pdfu = spx.find_all('a')
                            for li in pdfu:
                                ttt = li.get('href')

                                print ttt
                                Path.mkdirAndPath(basePath)

                                Debug.print_in_single_line(u'开始下载   {}'.format(ttt))
                                if ttt:
                                    content = Http.get_content(url=ttt, timeout=180)
                                    if not content:
                                        Debug.logger.debug(u'pdf『{}』下载失败'.format(ttt))
                                        content = ''
                                    else:
                                        Debug.print_in_single_line(u'pdf {} 下载完成'.format(ttt))
                                else:
                                    #   when the download link is empty, there is no need to download
                                    content = ''
                                if len(content) > 10:
                                    with open(basePath, "wb") as pdf:
                                        pdf.write(content)
                    except Exception as e:
                        print('next')
    def start(self):
        print ' 中文研报 '

        stockList = []


        for raw_front_page_index in range(1, 251):
            fileN = '策略'

            sdPath = '/ink/work/62/ink/{}'.format(fileN)
            Path.mkdir(sdPath)

       #    http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=HGYJ&cmd=4&code=&ps=50&p=2&js=var%20UxmjGoYW={%22data%22:[(x)],%22pages%22:%22(pc)%22,%22update%22:%22(ud)%22,%22count%22:%22(count)%22}&
            burl = u"http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=CLBG&cmd=4&code=&ps=50&p="
            # burl = u"http://datainterface.eastmoney.com//EM_DataCenter/js.aspx?type=SR&sty=HGYJ&cmd=4&code=&ps=50&p="
            uu = u"&js=var%20GdYXcAjX={%22data%22:[(x)],%22pages%22:%22(pc)%22,%22update%22:%22(ud)%22,%22count%22:%22(count)%22}&"
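            # sty=CLBG appears to be the strategy-report feed (assumption); each page returns up to 50 rows and the outer loop walks pages 1..250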

            url = '%s%s%s' % (burl, str(raw_front_page_index), uu)

            # print url

            content = Http.get_content(url)

            if content:
                try:
                    jsonD = str(content).split('=')[-1]

                    jdata = json.loads(jsonD)
                    articles = jdata['data']
                    for article in articles:

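                        # each row in this feed is a comma-joined string: field 0 holds the datetime, 1 the infoCode, 5 the title (inferred from the indexing below)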
                        xxxs = str(article).split(',')
                        rticlet = xxxs[0]

                        preTitle = xxxs[5]

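                        # crude keyword filters: skip titles mentioning 川财 or 或, then only fetch reports whose title mentions 日本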
                        if '川财' in str(preTitle) or '或' in str(preTitle):
                            continue

                        # if str(preTitle).__contains__('历史') or str(preTitle).__contains__('周期')or str(preTitle).__contains__('成长'):
                        # if str(preTitle).__contains__('政治') or str(preTitle).__contains__('中央经济')or str(preTitle).__contains__('贸易战'):
                        if '日本' in str(preTitle):
                            print preTitle
                            date_time = datetime.datetime.strptime(rticlet, '%Y/%m/%d %H:%M:%S')

                            infoCode = xxxs[1]
                            destU = u"http://data.eastmoney.com/report/{}/cl,{}.html ".format(
                                date_time.strftime('%Y%m%d'), infoCode)

                            print destU

                            result = Http.get_content(destU)
                            result = unicode(result, 'GBK').encode('UTF-8')

                            xxsoup = BeautifulSoup(result, 'html.parser')

                            title_tationl = xxsoup.find_all('h1')
                            tt = str(title_tationl[0].text).strip()

                            xxlist_p_list = xxsoup.find_all('div', class_='report-infos')[0]

                            sp = xxlist_p_list.find_all('span')

                            ttime = str((sp[1]).text)

                            date_time = datetime.datetime.strptime(ttime, '%Y年%m月%d日 %H:%M')

                            # print date_time.strftime('%Y-%m-%d')

                            ttime = date_time.strftime('%Y-%m-%d')

                            # print (sp[2]).text
                            # print (sp[3]).text

                            title = Match.replace_specile_chars(tt)
                            title = title.replace('/', '', 100)

                            fileName = u"{}_{}_{}_{}.pdf".format(ttime, (sp[2]).text, title, (sp[3]).text)
                            # date _ brokerage _ title _ author

                            print fileName

                            urlsp = sp[-1]

                            basePath = '{}/{}'.format(sdPath, fileName)

                            # print basePath

                            # create the folder

                            list_pcyc_li = urlsp.find_all('a')
                            for li in list_pcyc_li:
                                ttt = li.get('href')
                                Path.mkdirAndPath(basePath)
                                print ttt

                                Debug.print_in_single_line(u'开始下载   {}'.format(ttt))
                                if ttt:
                                    content = Http.get_content(url=ttt, timeout=180)
                                    if not content:
                                        Debug.logger.debug(u'pdf『{}』下载失败'.format(ttt))
                                        content = ''
                                    else:
                                        Debug.print_in_single_line(u'pdf {} 下载完成'.format(ttt))
                                else:
                                    #   when the download link is empty, there is no need to download
                                    content = ''
                                if len(content) > 10:
                                    with open(basePath, "wb") as pdf:
                                        pdf.write(content)



                except Exception as e:
                    print('next')
    def start(self):
        print 'start JRJ_Report'

        stockList = []

        stockList.append({'URL': '1', 'NAME': '宏观研究'})
        # stockList.append({'URL': '8', 'NAME': '策略趋势'})

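        # only the 宏观研究 channel (URL id 1) is scraped here; the 策略趋势 entry is left commented out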
        for xx in stockList:

            for raw_front_page_index in range(5, 50):

                print '开始第' + str(raw_front_page_index) + '页面 下载'

                fileN = str(xx['NAME']).strip()
                uux = xx['URL']

                sdPath = '/Volumes/MacintoshHD/File/{}'.format(fileN)

                Path.mkdir(sdPath)

                url = u"http://istock.jrj.com.cn/yanbao_{}_p{}.html"

                request_url = url.format(uux, raw_front_page_index)
                content = Http.get_content(request_url)

                soup = BeautifulSoup(content, 'html.parser')

                list_p_list = soup.find_all('div', class_="yb_con")

                for p in list_p_list:
                    # print p

                    list_pcyc_li = p.find_all('a')
                    for li in list_pcyc_li:
                        xxurl = li.get('href')
                        # print xxurl

                        if xxurl != 'http://istock.jrj.com.cn/list,yanbao.html':

                            try:

                                result = Http.get_content(xxurl)
                                result = unicode(str(result), 'GBK').encode('UTF-8')

                                xxsoup = BeautifulSoup(result, 'html.parser')

                                # title_tationl = xxsoup.find_all('h1')
                                # tt = str(title_tationl[0].text).strip()

                                xxlist_p_list = xxsoup.find_all('p', class_='title')[0]
                                xxlist_ds = xxsoup.find_all('span', class_='fr')[0]

                                realu = str(xxlist_p_list).replace(str(xxlist_ds), '', 1)

                                realuxsoup = BeautifulSoup(realu, 'html.parser')

                                sp = str(realuxsoup.text).split(' ')

                                ttime = sp[1]

                                if '发表于' in ttime:
                                    ttime = sp[2]

                                # print (sp[2]).text
                                # print (sp[3]).text

                                # print ttime

                                all_main = xxsoup.find_all('div', class_='main')[0]

                                realuxsoup = BeautifulSoup(str(all_main), 'html.parser')

                                reaupp = realuxsoup.find_all('p')

                                for pp in reaupp:
                                    list_pcyc_li = pp.find_all('a')

                                    for li in list_pcyc_li:
                                        print li.text
                                        ttt = li.get('href')

                                        # print ttt

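                                        # most attachments are PDFs, but some links end in .xlsx, so the extension is chosen from the href suffix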
                                        ftype = 'pdf'

                                        if str(ttt).endswith('.xlsx'):
                                            ftype = 'xlsx'

                                        fileName = u"{}_{}.{}".format(ttime, str(li.text).replace('/', ""), ftype)

                                        print fileName

                                        basePath = '/ink/work/62/ink/{}/{}'.format(fileN, fileName)

                                        Path.mkdirAndPath(basePath)

                                        Debug.print_in_single_line(u'开始下载   {}  '.format(ttt))
                                        if ttt:
                                            content = Http.get_content(url=ttt, timeout=180)
                                            if not content:
                                                # Debug.logger.debug(u'文件『{}』下载失败'.format(ttt))
                                                content = ''
                                            else:
                                                Debug.print_in_single_line(u'文件 {} 下载完成'.format(ttt))
                                        else:
                                            #   when the download link is empty, there is no need to download
                                            content = ''
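                                        # skip files that are already on disk and only write responses that look non-empty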
                                        if not os.path.exists(basePath):
                                            if len(content) > 10:
                                                with open(basePath, "wb") as pdf:
                                                    pdf.write(content)
                            except Exception as e:
                                print 'Exception ' + e.message