예제 #1
0
    def parse_page(self, response):
        """Turn one JSON response into ``QaItem`` objects, one per result.

        The response body is stripped and decoded as JSON; every entry of
        its ``results`` list is copied field by field into a fresh item.

        :param response: object whose ``body`` attribute holds the JSON text.
        :return: generator yielding one populated ``QaItem`` per entry.
        """
        payload = json.loads(response.body.strip())
        for entry in payload['results']:
            qa_item = QaItem()
            qa_item['nick_name'] = entry['authorName']
            qa_item['source'] = 'sz'
            qa_item['stock'] = entry['companyShortName']
            qa_item['code'] = entry['stockCode']
            qa_item['content'] = entry['attachedContent']
            qa_item['qa'] = 1

            # The publication stamp is UTF-8 encoded and parsed as an integer
            # before being handed to the project's ``now_date.timeStamp``.
            raw_stamp = entry['attachedPubDate'].encode("utf-8")
            qa_item['pub_date'] = now_date.timeStamp(int(raw_stamp))
            qa_item['create_date'] = now_date.get_now_time()

            # The id is a name-based UUID (DNS namespace) computed over the
            # GBK-encoded bytes of the content stored on the item.
            gbk_content = qa_item['content'].decode('utf-8').encode('gbk')
            qa_item['uuid'] = uuid.uuid5(uuid.NAMESPACE_DNS, gbk_content)
            yield qa_item
예제 #2
0
 def parse(self, response):
     """Yield a ``QaItem`` for every answer box found in ``response``.

     NOTE(review): most fields are filled with fixed placeholder values
     (``'kk'``, ``'1'``, ``'2'``, ``'haha'``) and ``pub_date`` is always
     ``None``; only ``uuid`` depends on text read from the page.  This
     looks like unfinished scaffolding -- confirm the intended extraction
     before relying on the emitted items.

     :param response: response object exposing ``xpath()``.
     :return: generator of populated ``QaItem`` objects.
     """
     answer_boxes = response.xpath(
         '//div[@class="Tl talkList2"]/div[@class="answerBoxOuter clear"]')
     for box in answer_boxes:
         item = QaItem()
         item['nick_name'] = 'kk'
         item['source'] = 'sz'
         # Fixed placeholder values; nothing is read from the page for
         # these two fields.
         item['stock'] = '1'
         item['code'] = '2'

         link = box.xpath(
             './div[@class="answerBox"]/div[@class="msgCnt gray666"]/a[@class="cntcolor"]'
         )
         content_str = link.xpath('string(.)').extract_first()
         content = removetnr(str_to_strip(content_str))
         # TODO(review): the extracted ``content`` only feeds the uuid
         # below; the stored content is a constant.  Confirm whether
         # ``item['content']`` should be ``content`` instead.
         item['content'] = 'haha'
         item['qa'] = 1
         # TODO: the publication time still has to be parsed here
         # (relative forms in minutes / hours / days); it is left empty
         # for now.
         item['pub_date'] = None
         item['create_date'] = now_date.get_now_time()
         # Name-based UUID (DNS namespace) over the GBK bytes of the
         # extracted text.
         item['uuid'] = uuid.uuid5(uuid.NAMESPACE_DNS,
                                   content.decode('utf-8').encode('gbk'))
         yield item
예제 #3
0
    def _resolve_pub_date(self, raw, now=None):
        """Convert the page's display timestamp into an absolute date string.

        Forms handled, checked in this order:

        * text containing ``前`` ("ago"): the first number in the text is
          subtracted from ``now`` as hours (``小时``), days (``天``) or,
          failing both, minutes; the result is ``YYYY-MM-DD HH:MM:SS``.
        * text containing ``昨天`` ("yesterday"): yesterday's date followed
          by whatever remains after removing ``昨天 ``.
        * text containing ``年``: the ``年``/``月``/``日`` markers are turned
          into ``-`` separators and no year is added.
        * anything else is treated as a month/day stamp and gets a year
          prefixed.

        :param raw: timestamp text taken from the page, or ``None``.
        :param now: reference time; defaults to ``datetime.datetime.now()``.
        :return: the normalised string, or ``None`` when ``raw`` is ``None``,
            blank, or a relative stamp that carries no number.
        """
        if raw is None:
            return None
        # Work on text regardless of whether the caller handed over bytes.
        text = raw.decode('utf8') if isinstance(raw, bytes) else raw
        if not text.strip():
            return None
        if now is None:
            now = datetime.datetime.now()

        if u'前' in text:
            amount = re.search(r"\d+\.?\d*", text)
            if amount is None:
                # Relative stamp without a usable number: report it as
                # unknown instead of aborting the whole page.
                return None
            if u'小时' in text:
                unit = 'hours'
            elif u'天' in text:
                unit = 'days'
            else:
                unit = 'minutes'
            # float() accepts every string the pattern can match, including
            # decimals, which int() would reject.
            delta = datetime.timedelta(**{unit: float(amount.group())})
            return (now - delta).strftime("%Y-%m-%d %H:%M:%S")

        if u'昨天' in text:
            clock = text.replace(u'昨天 ', u'')
            yesterday = (now - datetime.timedelta(days=1)).strftime("%Y-%m-%d")
            return yesterday + ' ' + clock

        if u'年' in text:
            # The stamp already carries its own year; only normalise the
            # separators.
            return (text.replace(u'年', '-').replace(u'月', '-')
                    .replace(u'日', ''))

        year = now.year
        month_day = re.search(u"(\\d{1,2})月(\\d{1,2})日", text)
        if month_day is not None:
            stamp = (int(month_day.group(1)), int(month_day.group(2)))
            # A month/day later than today cannot belong to the current
            # year, so it is attributed to the previous one.
            if stamp > (now.month, now.day):
                year -= 1
        return str(year) + '-' + text.replace(u'月', '-').replace(u'日', '')

    def parse(self, response):
        """Yield a ``QaItem`` for every feed entry found in ``response``.

        For each ``div`` with class ``m_feed_item m_question`` the author
        title, the stock link text (split into name and code), the entry
        text and the display timestamp are read and stored on a new item.

        :param response: response object exposing ``xpath()``.
        :return: generator of populated ``QaItem`` objects.
        """
        for feed in response.xpath('//div[@class="m_feed_item m_question"]'):
            item = QaItem()
            item['nick_name'] = feed.xpath(
                './div[@class="m_feed_detail"]/div[@class="m_feed_face"]/a/@title'
            ).extract_first()
            item['source'] = 'sh'

            stock = feed.xpath(
                './div[@class="m_feed_detail"]/div[@class="m_feed_cnt "]/div[@class="m_feed_txt"]/a/text()'
            ).extract_first()
            if stock is not None:
                # Link text is split at the first "(": name before it, code
                # inside the parentheses.
                name_and_code = stock.replace(':', '').split('(')
                item['stock'] = name_and_code[0]
                # No parenthesised part means no code rather than a crash.
                if len(name_and_code) > 1:
                    item['code'] = name_and_code[1].replace(')', '')
                else:
                    item['code'] = None
            else:
                item['stock'] = None
                item['code'] = None

            text_node = feed.xpath(
                './div[@class="m_feed_detail"]/div[@class="m_feed_cnt "]/div[@class="m_feed_txt"]'
            )
            content_str = text_node.xpath('string(.)').extract_first()
            content = removetnr(str_to_strip(content_str)).replace(':', '')
            item['content'] = content
            item['qa'] = 0

            pub_date = feed.xpath(
                './div[@class="m_feed_detail"]/div[@class="m_feed_cnt "]/div[@class="m_feed_func clearfix"]/div[@class="m_feed_from"]/span/text()'
            ).extract_first()
            # The page shows relative or partial times (minutes/hours/days
            # ago, yesterday, month-day); turn them into absolute strings.
            item['pub_date'] = self._resolve_pub_date(pub_date)

            item['create_date'] = now_date.get_now_time()
            # Name-based UUID (DNS namespace) over the GBK bytes of the
            # content.  NOTE(review): strict GBK encoding raises on
            # characters outside GBK -- confirm that is acceptable.
            item['uuid'] = uuid.uuid5(uuid.NAMESPACE_DNS,
                                      content.decode('utf-8').encode('gbk'))
            yield item