def parse_page(self, response):
    """Yield one ``QaItem`` for every entry of the JSON payload's ``results``.

    The response body is parsed as JSON; each element of ``results`` is
    copied into a fresh item (``source`` fixed to ``'sz'``, ``qa`` fixed to
    ``1``) together with a creation time and a name-based UUID computed
    from the GBK-encoded content.

    Args:
        response: response object whose ``body`` holds the JSON document.

    Yields:
        QaItem: one populated item per element of ``results``.

    Raises:
        KeyError: if the payload or an entry lacks one of the expected keys.
        UnicodeEncodeError: if the content cannot be represented in GBK.
    """
    payload = json.loads(response.body.strip())
    for result in payload['results']:
        item = QaItem()
        item['nick_name'] = result['authorName']
        item['source'] = 'sz'
        item['stock'] = result['companyShortName']
        item['code'] = result['stockCode']
        item['content'] = result['attachedContent']
        item['qa'] = 1
        # int() accepts text, byte strings and numbers alike, so the value
        # needs no re-encoding first (which would fail on a JSON number).
        # NOTE(review): now_date.timeStamp presumably formats an epoch
        # value -- confirm the expected unit against that helper.
        item['pub_date'] = now_date.timeStamp(int(result['attachedPubDate']))
        item['create_date'] = now_date.get_now_time()

        # Only genuine byte strings are decoded.  Calling .decode() on text
        # that is already decoded forces an implicit round trip through the
        # interpreter's default encoding, which breaks on non-ASCII content
        # unless that default has been changed to UTF-8.
        content = item['content']
        if isinstance(content, bytes):
            content = content.decode('utf-8')
        # GBK bytes are kept as the UUID name so identifiers stay identical
        # to the ones produced for previously stored items.
        item['uuid'] = uuid.uuid5(uuid.NAMESPACE_DNS, content.encode('gbk'))
        yield item
def parse(self, response):
    """Yield one ``QaItem`` per answer box found in *response*.

    Each ``div`` with class ``answerBoxOuter clear`` under the
    ``Tl talkList2`` container produces one item.  Only the UUID depends on
    the page: it is derived from the cleaned text of the box's
    ``a.cntcolor`` link.  All other fields are constants.

    NOTE(review): ``nick_name``, ``stock``, ``code`` and ``content`` are
    hard-coded placeholder values and ``pub_date`` is always ``None`` --
    confirm whether this callback is still a stub.

    Args:
        response: page response exposing ``xpath()``.

    Yields:
        QaItem: one item per matched answer box.
    """
    answer_boxes = response.xpath(
        '//div[@class="Tl talkList2"]/div[@class="answerBoxOuter clear"]')
    for box in answer_boxes:
        item = QaItem()
        item['nick_name'] = 'kk'
        item['source'] = 'sz'
        # The stock name and code are fixed literals, so they are assigned
        # directly; a "missing stock" branch could never be taken.
        item['stock'] = '1'
        item['code'] = '2'

        link = box.xpath(
            './div[@class="answerBox"]/div[@class="msgCnt gray666"]'
            '/a[@class="cntcolor"]')
        raw_text = link.xpath('string(.)').extract_first()
        content = removetnr(str_to_strip(raw_text))

        item['content'] = 'haha'
        item['qa'] = 1
        # The publication time still needs handling here: minutes, hours
        # (subtract that many hours from the current time) and days.
        item['pub_date'] = None
        item['create_date'] = now_date.get_now_time()
        # The UUID is computed from the scraped text, not from the
        # placeholder stored in item['content'].
        gbk_name = content.decode('utf-8').encode('gbk')
        item['uuid'] = uuid.uuid5(uuid.NAMESPACE_DNS, gbk_name)
        yield item
def parse(self, response):
    """Yield one ``QaItem`` per question block found in *response*.

    Every ``div`` with class ``m_feed_item m_question`` becomes an item
    carrying the author's nick name, the stock name and code split out of
    the stock link text, the cleaned question text, a normalised
    publication time and a name-based UUID computed from the GBK-encoded
    text.  ``source`` is fixed to ``'sh'`` and ``qa`` to ``0``.

    Args:
        response: page response exposing ``xpath()``.

    Yields:
        QaItem: one populated item per question block.
    """
    # Shared prefix of the three selectors that look inside the feed body.
    # The trailing space inside the class value is part of the page markup.
    feed_body = './div[@class="m_feed_detail"]/div[@class="m_feed_cnt "]'

    for feed in response.xpath('//div[@class="m_feed_item m_question"]'):
        item = QaItem()
        item['nick_name'] = feed.xpath(
            './div[@class="m_feed_detail"]/div[@class="m_feed_face"]'
            '/a/@title').extract_first()
        item['source'] = 'sh'

        stock = feed.xpath(
            feed_body + '/div[@class="m_feed_txt"]/a/text()').extract_first()
        item['stock'] = None
        item['code'] = None
        if stock is not None:
            # The label is split at "(" into name and code; a label
            # without a parenthesised code keeps code = None instead of
            # raising IndexError.
            parts = stock.replace(':', '').split('(')
            item['stock'] = parts[0]
            if len(parts) > 1:
                item['code'] = parts[1].replace(')', '')

        raw_text = feed.xpath(
            feed_body + '/div[@class="m_feed_txt"]'
        ).xpath('string(.)').extract_first()
        content = removetnr(str_to_strip(raw_text)).replace(':', '')
        item['content'] = content
        item['qa'] = 0

        raw_pub_date = feed.xpath(
            feed_body + '/div[@class="m_feed_func clearfix"]'
            '/div[@class="m_feed_from"]/span/text()').extract_first()
        item['pub_date'] = self._parse_pub_date(
            raw_pub_date, datetime.datetime.now())

        item['create_date'] = now_date.get_now_time()
        # Decode only genuine byte strings; decoding text that is already
        # decoded would depend on the interpreter's default encoding.
        if isinstance(content, bytes):
            content = content.decode('utf-8')
        # GBK bytes are kept as the UUID name so identifiers stay identical
        # to the ones produced for previously stored items.
        item['uuid'] = uuid.uuid5(uuid.NAMESPACE_DNS, content.encode('gbk'))
        yield item

def _parse_pub_date(self, raw, now):
    """Convert the page's publication-time text into a timestamp string.

    Three shapes are recognised, checked in this order:

    * text containing "前" (ago): the first number in the text is
      subtracted from *now* as hours when the text also contains "小时",
      otherwise as minutes; result is ``YYYY-MM-DD HH:MM:SS``;
    * text containing "昨天" (yesterday): yesterday's date followed by the
      remaining clock text;
    * anything else: treated as a month/day stamp, "月" becomes "-" and
      "日" is dropped, prefixed with a year taken from *now*.

    Args:
        raw: scraped text (byte string or text), or ``None``.
        now: ``datetime.datetime`` used as the reference point.

    Returns:
        The normalised timestamp text, or ``None`` when *raw* is ``None``
        or an "ago" text carries no number.
    """
    if raw is None:
        return None
    text = raw.decode('utf-8') if isinstance(raw, bytes) else raw

    if u'前' in text:
        # NOTE(review): every "ago" text that is not in hours is treated
        # as minutes, including day-based wording -- confirm which forms
        # the site actually emits.
        amounts = re.findall(r"\d+\.?\d*", text)
        if not amounts:
            return None
        unit = 'hours' if u'小时' in text else 'minutes'
        # float() tolerates a decimal amount such as "1.5", which the
        # number pattern can match and int() would reject.
        elapsed = datetime.timedelta(**{unit: float(amounts[0])})
        return (now - elapsed).strftime("%Y-%m-%d %H:%M:%S")

    if u'昨天' in text:
        clock = text.replace(u'昨天', u'').strip()
        yesterday = (now - datetime.timedelta(days=1)).strftime("%Y-%m-%d")
        return yesterday + ' ' + clock

    # The page gives only month and day here, so the year comes from the
    # crawl time.  A month/day later than today cannot belong to the
    # current year, so it is attributed to the previous one.
    year = now.year
    month_day = re.search(u"(\\d+)月(\\d+)日", text)
    if month_day is not None:
        stamp = (int(month_day.group(1)), int(month_day.group(2)))
        if stamp > (now.month, now.day):
            year -= 1
    return '%d-' % year + text.replace(u'月', u'-').replace(u'日', u'')