def nextPageNegativeOneAllPage(self, response, url):
    """Build the list of follow-up paging URLs (page 2 .. max) for a listing.

    Looks for a '下一页' ("next page") anchor via two alternative xpaths; when
    found, reads the max-page URL from the 'allPage' xpath and derives the
    page count from its last non-empty path segment.

    Fixes over the original:
    - ``maxURL is ''`` compared identity with a string literal, which is
      unreliable and a SyntaxWarning on modern CPython; use equality.
    - The local list was named ``np``, shadowing the numpy alias.

    :param response: scrapy Response of the first listing page.
    :param url: base listing URL the page suffix is appended to.
    :returns: list of URLs like ``url + 'g<i>'`` (reversed order when
              ``self.reversed`` is set), or ``[]`` when no next page exists.
    """
    pages = []
    maxURL = ''
    nextPageText = ''.join(
        response.xpath(self.xpath['nextPageText']).extract()).strip()
    if nextPageText != '下一页':
        # primary anchor missing — try the alternative layout
        nextPageText = ''.join(
            response.xpath(self.xpath['nextPageText2']).extract()).strip()
    if nextPageText == '下一页':
        allPage = response.xpath(self.xpath['allPage']).extract()
        if allPage:
            # only the first match is needed
            maxURL = allPage[0]
    if not maxURL:
        return []
    # URL shaped like '.../N/' or '.../N' — page count is the last
    # non-empty segment
    parts = maxURL.split('/')
    maxNumber = util.String2Number(
        parts[-2]) if parts[-1] == '' else util.String2Number(parts[-1])
    if self.reversed:
        for i in range(int(maxNumber), 1, -1):
            pages.append(url + 'g' + str(i))
    else:
        for i in range(2, int(maxNumber) + 1):
            pages.append(url + 'g' + str(i))
    return pages
def processSong(self, value):
    """Extract the bonus/converted share count from an allocation-plan
    string, e.g. '10送3.00派4.00元(含税,扣税后3.30元)' -> 3.0.

    '送' (bonus shares) takes precedence over '转' (capital-reserve
    conversion), matching the original branch order; returns 0 when
    neither keyword appears.
    """
    for marker in ('送', '转'):
        pos = value.find(marker)
        if pos != -1:
            return util.String2Number(value[pos + 1:])
    return 0
def CalcDividend(self, year: int, position: str):
    """Parse the allocation plan for (year, position) into per-share dividend
    figures and record a DividendPoint.

    Example plan string: '10送3.00派4.00元(含税,扣税后3.30元)' — "per 10 shares,
    3.00 bonus shares and 4.00 CNY dividend (3.30 CNY after tax)".

    Writes 'dividend', 'dividend_aftertax' and 'gift' (all divided by the
    share base, e.g. 10) into self.checkPoint[year][position], then, when an
    ex-dividend date ('CQCXR') is present, appends a DividendPoint.
    """
    if const.GPFH_KEYWORD.KEY_NAME['AllocationPlan'] in self.checkPoint[
            year][position]:
        value = self.checkPoint[year][position][
            const.GPFH_KEYWORD.KEY_NAME['AllocationPlan']]
        # share base of the plan (e.g. the leading '10')
        number = util.String2Number(value)
        profit = self.processPai(value)            # pre-tax cash dividend
        self.checkPoint[year][position]['dividend'] = profit / number
        profit2 = self.processPai2(value)          # after-tax cash dividend
        self.checkPoint[year][position][
            'dividend_aftertax'] = profit2 / number
        gift = self.processSong(value)             # bonus/converted shares
        self.checkPoint[year][position]['gift'] = gift / number
        if const.GPFH_KEYWORD.KEY_NAME['CQCXR'] in self.checkPoint[year][
                position]:
            tmpDate = self.checkPoint[year][position][
                const.GPFH_KEYWORD.KEY_NAME['CQCXR']]
            # '-' means no ex-dividend date published (seen for Agricultural
            # Bank of China, 2010) — fall back to a conventional date.
            if tmpDate == '-':
                if position == 'midYear':
                    # mid-year report: assume ex-dividend at year end
                    tmpDate = pd.Timestamp(datetime(year, 12, 1))
                else:
                    tmpDate = pd.Timestamp(datetime(year + 1, 6, 30))
            else:
                tmpDate = pd.to_datetime(np.datetime64(tmpDate))
            self.dividendPoint.append(
                DividendPoint(
                    tmpDate, self.checkPoint[year][position]['dividend'],
                    self.checkPoint[year][position]['dividend_aftertax'],
                    self.checkPoint[year][position]['gift'], year, position))
def Run(codes):
    """For each stock code, measure how often quarterly net-profit growth
    ('sjltz') fell below -10%, over the full history and since 2010.

    Results (per-code counters, ratios and first/last quarter dates) are
    saved to the 'stock_statistics2.dangerousQuarterRatio' collection.

    Fix over the original: the loop variable ``id`` shadowed the builtin;
    renamed to ``quarterDate``. Nesting flattened with a guard ``continue``.
    """
    out = []
    for one in codes:
        try:
            baseCounter = 0        # quarters with a valid sjltz value
            hitCounter = 0         # quarters with sjltz < -10
            firstQuarter = None
            lastQuarter = None
            baseCounter2010 = 0    # same counters restricted to year >= 2010
            hitCounter2010 = 0
            firstQuarter2010 = None
            lastQuarter2010 = None
            df = util.LoadData('stock', 'yjbg2-' + one['_id'],
                               condition={}, sort=[('_id', 1)])
            for quarter, row in df.iterrows():
                quarterDate = datetime.strptime(quarter, '%Y-%m-%d')
                value = util.String2Number(row['sjltz'])
                if np.isnan(value):
                    continue
                if firstQuarter is None:
                    firstQuarter = quarterDate
                lastQuarter = quarterDate
                baseCounter += 1
                if quarterDate.year >= 2010:
                    if firstQuarter2010 is None:
                        firstQuarter2010 = quarterDate
                    lastQuarter2010 = quarterDate
                    baseCounter2010 += 1
                if value < -10:
                    hitCounter += 1
                    if quarterDate.year >= 2010:
                        hitCounter2010 += 1
            percent = hitCounter / baseCounter if baseCounter > 0 else 0
            percent2010 = (hitCounter2010 / baseCounter2010
                           if baseCounter2010 > 0 else 0)
            out.append({
                '_id': one['_id'],
                'begin': firstQuarter,
                'end': lastQuarter,
                'base': baseCounter,
                'hit': hitCounter,
                'percent': percent,
                'begin2010': firstQuarter2010,
                'end2010': lastQuarter2010,
                'base2010': baseCounter2010,
                'hit2010': hitCounter2010,
                'percent2010': percent2010
            })
        except Exception as e:
            # best-effort per code: log and continue with the next one
            util.PrintException(e)
    dfOut = pd.DataFrame(out)
    util.SaveMongoDB_DF(dfOut, 'stock_statistics2', 'dangerousQuarterRatio')
def processPai2(self, value):
    """Extract the after-tax cash dividend from an allocation-plan string.

    Example: '10送3.00派4.00元(含税,扣税后3.30元)' -> 3.30.
    Returns 0 when the '扣税后' ("after tax") keyword is absent.
    """
    keyword = '扣税后'
    index = value.find(keyword)
    if index == -1:
        return 0
    # Skip the whole 3-character keyword. The original sliced at
    # index + 1, leaving '税后' in front of the number; that only worked
    # because String2Number scans past non-numeric characters.
    return util.String2Number(value[index + len(keyword):])
def parseOne(self, one):
    """Parse one listing node into a HouseItem.

    Extracts district, title, id (from the href), prices, community,
    house type, square and level. Extraction failures are logged and the
    partially filled item is returned.
    """
    def _text(xp):
        # joined text of all matches, surrounding whitespace stripped
        return ''.join(one.xpath(xp).extract()).strip()

    oneOut = items.HouseItem()
    oneOut['src'] = self.src
    oneOut['district'] = _text('./div[1]/p[3]/a[1]/text()')
    oneOut['subDistrict'] = _text('./div[1]/p[3]/a[2]/text()')
    oneOut['title'] = _text('./div[1]/h4/a/text()')
    href = _text('./div[1]/h4/a/@href')
    if href:
        houseId = '-1'
        try:
            # '.../<id>.html' -> '<id>'
            houseId = href.split('/')[-1][:-5]
        except Exception as e:
            logging.warning("parseOne Exception %s" % (str(e)))
        oneOut['_id'] = houseId
    try:
        oneOut['totalPrice'] = util.String2Number(
            _text('./div[2]/p[1]/span/text()'))
        oneOut['unitPrice'] = util.String2Number(
            _text('./div[2]/p[2]/text()'))
        # community keeps original whitespace (no strip)
        oneOut['community'] = ''.join(
            one.xpath('./div[1]/p[1]/a/text()').extract())
        oneOut['houseType'] = _text('./div[1]/p[1]/span[2]/text()')
        if oneOut['houseType'] == '|':
            # separator landed in span[2] — real value is one span later
            oneOut['houseType'] = _text('./div[1]/p[1]/span[3]/text()')
        oneOut['square'] = util.String2Number(
            _text('./div[1]/p[1]/span[4]/text()'))
        if np.isnan(oneOut['square']):
            oneOut['square'] = util.String2Number(
                _text('./div[1]/p[1]/span[5]/text()'))
        oneOut['level'] = _text('./div[1]/p[2]/span[1]/text()')
        oneOut['crawlDate'] = util.today()
    except Exception as e:
        print(e)
        logging.warning("parseOne Exception %s" % (str(e)))
    return oneOut
def nextPageNegativeOneAllPage(self, response, url):
    """Build the list of follow-up paging URLs (page 2 .. max) for a listing.

    Reads the max-page URL from the 'allPage' xpath and derives the page
    count from its last non-empty path segment.

    Fixes over the original:
    - ``maxURL is ''`` compared identity with a string literal, which is
      unreliable and a SyntaxWarning on modern CPython; use truthiness.
    - Dead ``maxURL = None`` assignment and commented-out code removed.
    - Local list renamed so it no longer shadows the numpy alias ``np``.

    :returns: list of URLs like ``url + 'g<i>'`` (reversed order when
              ``self.reversed`` is set), or ``[]`` when nothing matched.
    """
    pages = []
    maxURL = ''.join(response.xpath(self.xpath['allPage']).extract()).strip()
    if not maxURL:
        return []
    # URL shaped like '.../N/' or '.../N' — page count is the last
    # non-empty segment
    parts = maxURL.split('/')
    maxNumber = util.String2Number(
        parts[-2]) if parts[-1] == '' else util.String2Number(parts[-1])
    if self.reversed:
        for i in range(int(maxNumber), 1, -1):
            pages.append(url + 'g' + str(i))
    else:
        for i in range(2, int(maxNumber) + 1):
            pages.append(url + 'g' + str(i))
    return pages
def nextPageNegativeOne(self, response, url):
    """Build the list of follow-up paging URLs (page 2 .. max) for a listing.

    When the '下一页' ("next page") anchor is present, the max-page URL is
    taken from the 'allPage2' xpath, falling back to the 'nextPage' xpath.
    Returns URLs of the form ``url + 'pg<i>'``, or [] when no next page.
    """
    np = []  # NOTE(review): local list shadows the numpy alias used elsewhere
    maxURL = None
    nextPageText = ''.join(
        response.xpath(self.xpath['nextPageText']).extract()).strip()
    if nextPageText == '下一页':
        tmp = response.xpath(self.xpath['allPage2']).extract()
        if len(tmp):
            maxURL = tmp[0].strip()
        else:
            # NOTE(review): nesting reconstructed — this fallback is read as
            # "allPage2 empty -> try nextPage"; confirm against the original
            # file's indentation.
            tmp = response.xpath(self.xpath['nextPage']).extract()
            if len(tmp):
                maxURL = tmp[0].strip()
    if maxURL is not None:
        # URL shaped like '.../pgN/' or '.../pgN' — the page count is the
        # last non-empty path segment
        tmp = maxURL.split('/')
        maxNumber = util.String2Number(
            tmp[-2]) if tmp[-1] == '' else util.String2Number(tmp[-1])
        for i in range(2, int(maxNumber) + 1):
            np.append(url + 'pg' + str(i))
    return np
def parseOne(self, one, district, subDistrict):
    """Parse one listing node into a HouseItem, tagging it with the
    caller-supplied district/subDistrict.

    The house-info text is '|'-separated: type | square | ... | level parts.
    Extraction failures are logged and the partially filled item returned.
    """
    def _text(xp):
        # joined text of all matches, surrounding whitespace stripped
        return ''.join(one.xpath(xp).extract()).strip()

    oneOut = items.HouseItem()
    oneOut['src'] = self.src
    oneOut['district'] = district
    oneOut['subDistrict'] = subDistrict
    oneOut['title'] = _text('./div/div[2]/div[1]/a/text()')
    href = _text('./div/div[2]/div[1]/a/@href')
    if href:
        houseId = '-1'
        try:
            # '.../<id>.html' -> '<id>'
            houseId = href.split('/')[-1][:-5]
        except Exception as e:
            logging.warning("parseOne Exception %s" % (str(e)))
        oneOut['_id'] = houseId
    try:
        oneOut['totalPrice'] = util.String2Number(
            _text('./div/div[3]/h3/span/text()'))
        oneOut['unitPrice'] = util.String2Number(
            _text('./div/div[3]/p/text()'))
        # community keeps original whitespace (no strip)
        oneOut['community'] = ''.join(
            one.xpath('./div/div[2]/div[2]/a/text()').extract())
        fields = _text('./div/div[2]/div[2]/text()').split('|')
        if len(fields) > 0:
            oneOut['houseType'] = fields[0]
        if len(fields) > 1:
            oneOut['square'] = util.String2Number(fields[1])
        if len(fields) > 4:
            oneOut['level'] = fields[3] + '-' + fields[4]
        oneOut['crawlDate'] = util.today()
    except Exception as e:
        print(e)
        logging.warning("parseOne Exception %s" % (str(e)))
    return oneOut
def Run(codes):
    """Compute a per-quarter profit-momentum trend for each stock code, over
    the full report history and (cumulatively) from 2010 onward.

    Per valid quarter the trend is -1 (negative growth), -0.5 (growth slower
    than the previous quarter) or +1 (otherwise); a running sum is kept as
    'continuityPMT' ('continuityPMTFrom2010' for quarters since 2010).
    Results are saved per code to the 'stock_statistics' collection.

    Fix over the original: the loop variable ``id`` shadowed the builtin;
    renamed to ``quarterDate``. Commented-out code removed.
    """
    for one in codes:
        try:
            out = []
            beforeSJLTZ = 0            # previous quarter's growth value
            beforeContinuityTrend = 0
            beforeContinuityTrend2010 = 0
            df = util.LoadData('stock', 'yjbg-' + one,
                               condition={}, sort=[('_id', 1)])
            for quarter, row in df.iterrows():
                quarterDate = datetime.strptime(quarter, '%Y-%m-%d')
                value = util.String2Number(row['sjltz'])
                if np.isnan(value):
                    continue
                # Not a harsh bar: even with zero real growth, inflation
                # alone should keep the nominal figure positive.
                if value < 0:
                    nowTrend = -1
                elif value - beforeSJLTZ < 0:
                    nowTrend = -0.5
                else:
                    nowTrend = 1
                beforeSJLTZ = value
                nowContinuityTrend = beforeContinuityTrend + nowTrend
                beforeContinuityTrend = nowContinuityTrend
                out.append({
                    '_id': quarterDate,
                    'nowPMT': nowTrend,
                    'continuityPMT': nowContinuityTrend
                })
                if quarterDate.year >= 2010:
                    nowContinuityTrend2010 = (beforeContinuityTrend2010 +
                                              nowTrend)
                    beforeContinuityTrend2010 = nowContinuityTrend2010
                    out[-1].update({
                        'continuityPMTFrom2010': nowContinuityTrend2010
                    })
            dfOut = pd.DataFrame(out)
            util.SaveMongoDB_DF(dfOut, 'stock_statistics', one)
        except Exception as e:
            # best-effort per code: report and continue with the next one
            print(e)
def parseUpDown(self, response):
    """Find the first up/down percentage in the page's portrait layer.

    Returns ('up', percent) or ('down', percent) for the first span whose
    text parses to a number; implicitly returns None when nothing matches.
    """
    containerPaths = [
        '/html/body/div[@class="portraitLayer "]/div/p',
    ]
    spanByDirection = {
        'up': './span[@class="up"]/text()',
        'down': './span[@class="down"]/text()',
    }
    for containerPath in containerPaths:
        node = response.xpath(containerPath)
        for direction, spanPath in spanByDirection.items():
            percent = util.String2Number(
                ''.join(node.xpath(spanPath).extract()).strip())
            if not np.isnan(percent):
                return (direction, percent)
def parse(self, response):
    """Scrapy entry point: a three-step crawl driven by response.meta['step'].

    step 0 (or no meta): fan out to district pages.
    step 1: fan out to sub-district pages, emit a HouseDetailDigest with the
            listing count, and schedule the listing pages (step 2).
    step 2: parse the individual listings on the page.
    self.received de-duplicates already requested/seen URLs.
    """
    self.received.add(response.url)
    if 'step' not in response.meta or response.meta['step'] == 0:
        districts = self.parseDistricts(response)
        realOut = set(districts) - self.received
        for one in realOut:
            yield Request(one, meta={'step': 0})
    if 'step' in response.meta and response.meta['step'] <= 1:
        subDistricts = self.parseSubDistricts(response)
        realOut = set(subDistricts) - self.received
        for one in realOut:
            # 'url' is carried so step-1 handling can build page URLs from it
            yield Request(one, meta={'step': 1, 'url': one})
    district = ''
    subDistrict = ''
    if 'step' in response.meta:
        if response.meta['step'] == 1:
            d = response.xpath(self.xpath['districtName']).extract()
            if len(d):
                district = d[0]
            d = response.xpath(self.xpath['subDistrictName']).extract()
            if len(d):
                subDistrict = d[0]
            # total number of listings in this sub-district
            number = util.String2Number(''.join(response.xpath(self.xpath['districtNumber']).extract()).strip())
            n = items.HouseDetailDigest()
            n['city'] = self.city
            n['src'] = self.src
            n['district'] = district
            n['subDistrict'] = subDistrict
            n['number'] = number
            today = util.todayString()
            try:
                # guards against non-string district/subDistrict values
                n['_id'] = today + '_' + self.city + '_' + district + '_' + subDistrict
            except Exception as e:
                print(e)
            yield n
            nextPage = self.nextPage(response, self.head, response.meta['url'], number)
            realOut = set(nextPage) - self.received
            for one in realOut:
                print('next url: %s %s %s'%(district, subDistrict, one))
                yield Request(one, meta={'step': 2, 'district': district, 'subDistrict': subDistrict})
        if response.meta['step'] == 2:
            # listing pages carry their district info in the request meta
            district = response.meta['district']
            subDistrict = response.meta['subDistrict']
        if response.meta['step'] >= 1:
            ones = response.xpath(self.xpath['lists'])
            for one in ones:
                oneOut = self.parseOne(one, district, subDistrict)
                yield oneOut
def parseOne(self, one, district, subDistrict):
    """Parse one listing node into a HouseItem, tagging it with the
    caller-supplied district/subDistrict.

    Prices are read from div[6]; if the unit price there is not numeric
    the layout variant with div[7] is used instead. House info is
    '|'-separated, position info is split on the full-width '）', and
    follow info on '/'. Failures are logged and the partial item returned.
    """
    def _text(xp):
        # joined text of all matches, surrounding whitespace stripped
        return ''.join(one.xpath(xp).extract()).strip()

    oneOut = items.HouseItem()
    oneOut['src'] = self.src
    oneOut['district'] = district
    oneOut['subDistrict'] = subDistrict
    oneOut['title'] = _text('.//div[1]/div[1]/a/text()')
    oneOut['_id'] = _text('.//div[1]/div[1]/a/@data-housecode')
    try:
        unitPrice = util.String2Number(
            _text('.//div[1]/div[6]/div[2]/span/text()'))
        if not np.isnan(unitPrice):
            oneOut['unitPrice'] = unitPrice
            oneOut['totalPrice'] = util.String2Number(
                _text('.//div[1]/div[6]/div[1]/span/text()'))
        else:
            # alternate layout, e.g.
            # https://sh.lianjia.com/ershoufang/changning/pg96/
            oneOut['unitPrice'] = util.String2Number(
                _text('.//div[1]/div[7]/div[2]/span/text()'))
            oneOut['totalPrice'] = util.String2Number(
                _text('.//div[1]/div[7]/div[1]/span/text()'))
        # community keeps original whitespace (no strip)
        oneOut['community'] = ''.join(
            one.xpath('.//div[1]/div[2]/div/a/text()').extract())
        houseInfo = ''.join(
            one.xpath('.//div[1]/div[2]/div/text()').extract()).split('|')
        if len(houseInfo) > 1:
            oneOut['houseType'] = houseInfo[1].strip()
        if len(houseInfo) > 2:
            oneOut['square'] = util.String2Number(houseInfo[2].strip())
        oneOut['area'] = util.ExtractString(
            one, './/div[1]/div[3]/div/a/text()')
        positionInfo = ''.join(
            one.xpath('.//div[1]/div[3]/div/text()').extract()).split('）')
        if len(positionInfo) > 0:
            # re-append the full-width ')' consumed by the split
            oneOut['level'] = positionInfo[0].strip() + '）'
        if len(positionInfo) > 1:
            oneOut['structure'] = positionInfo[1].strip()
        followInfo = ''.join(
            one.xpath('.//div[1]/div[4]/text()').extract()).split('/')
        if len(followInfo) > 0:
            oneOut['attention'] = followInfo[0].strip()
        if len(followInfo) > 1:
            oneOut['follow'] = followInfo[1].strip()
        if len(followInfo) > 2:
            oneOut['release'] = followInfo[2].strip()
        oneOut['crawlDate'] = util.today()
    except Exception as e:
        print(e)
        logging.warning("parseOne Exception %s" % (str(e)))
    return oneOut
def parse(self, response):
    """Scrapy entry point: step-driven crawl via response.meta['step'].

    Unconditionally fans out to district (step 0) and sub-district (step 1)
    pages, then, depending on the current step, emits a HouseDetailDigest
    (step 1, plus scheduling listing pages as step 2) and parses listings
    (step >= 1). self.received de-duplicates already requested/seen URLs.
    """
    self.received.add(response.url)
    districts = self.parseDistricts(response)
    realOut = set(districts) - self.received
    for one in realOut:
        yield Request(one, meta={'step': 0})
    subDistricts = self.parseSubDistricts(response)
    realOut = set(subDistricts) - self.received
    for one in realOut:
        # 'url' is carried so step-1 handling can build page URLs from it
        yield Request(one, meta={'step': 1, 'url': one})
    district = np.nan
    subDistrict = np.nan
    if 'step' in response.meta:
        if response.meta['step'] == 1:
            d = response.xpath(self.xpath['districtName']).extract()
            if len(d):
                district = d[0]
            d = response.xpath(self.xpath['subDistrictName']).extract()
            if len(d):
                subDistrict = d[0]
            # total number of listings in this sub-district
            number = util.String2Number(''.join(
                response.xpath(
                    self.xpath['districtNumber']).extract()).strip())
            n = items.HouseDetailDigest()
            n['city'] = self.city
            n['src'] = self.src
            n['district'] = district
            n['subDistrict'] = subDistrict
            n['number'] = number
            # NOTE(review): if districtName/subDistrictName did not match,
            # district stays np.nan and this concatenation raises TypeError
            # (uncaught here, unlike the sibling spider) — confirm intended.
            n['_id'] = util.todayString() + '_' + n['city'] + '_' + n[
                'district'] + '_' + n['subDistrict']
            yield n
            nextPage = self.nextPage(response, self.head,
                                     response.meta['url'])
            realOut = set(nextPage) - self.received
            for one in realOut:
                print('next url: %s %s %s' % (district, subDistrict, one))
                yield Request(one,
                              meta={
                                  'step': 2,
                                  'district': district,
                                  'subDistrict': subDistrict
                              })
        if response.meta['step'] == 2:
            # listing pages carry their district info in the request meta
            district = response.meta['district']
            subDistrict = response.meta['subDistrict']
        if response.meta['step'] >= 1:
            ones = response.xpath(self.xpath['lists'])
            for one in ones:
                oneOut = self.parseOne(one, district, subDistrict)
                # skip listings without a usable id
                if len(oneOut['_id']):
                    yield oneOut
                else:
                    continue