def parseHYDetailsMeta(self, text, id):
    """Parse the HTML of a cargo-source ("HuoYuan") detail page into a dict.

    Parameters:
        text: HTML markup of the detail page.
        id:   identifier of the record on the web site; stored under
              "webSiteId" and passed to self.parseHYContact(). (The name
              shadows the builtin but is kept so keyword callers still work.)

    Returns:
        dict with the keys "remark", "sourceName", "sourceAddress",
        "webSiteId", "source" and "dest", plus one entry for every page
        label that GateHuoYuanSource.get_k_v_dic() maps to a field name.

    Raises:
        AttributeError / IndexError when the expected page structure
        (the "detail_top_content_1", "detail_left_content_1_4" and the two
        "detail_left_content_1_1" containers) is missing.
    """
    # Page label -> field name. Snapshot left in the code by the original
    # author; it has not been checked against get_k_v_dic() -- treat it as
    # a hint only:
    #   u"货源名称": "sourceName",       u"货源地址": "sourceAddress",
    #   u"货物名称": "startPlace",       u"货物类型": "goodsType",
    #   u"重量": "weight",               u"数量": "number",
    #   u"外形": "appearance",           u"体积": "volume",
    #   u"所需车型": "vehicleDependent", u"运送方式": "shippingMethod",
    #   u"运费": "freight",              u"包装方式": "packingManer",
    #   u"注意事项": "attention",        u"货物情况": "cargoCondition",
    #   u"最早起运": "earliestShipment", u"最晚起运": "latestShipment",
    #   u"是否长期货源": "isLongTerm",   u"出发地": "departure",
    #   u"目的地": "destPlace",          u"发布时间": "postTime",
    #   u"webSiteId": "webSiteId",       u"出发地联系人": "sourceContact",
    #   u"目的地联系人": "destContact",  u"补充说明": "remark",
    key_value_dic = GateHuoYuanSource.get_k_v_dic()

    # The old soup.prettify() call was dropped: its return value was
    # discarded, so it only cost time.
    soup = BeautifulSoup(text)

    # Header: first <li> holds the name (inside a <span>), second the address.
    header_items = soup.find(attrs={"class": "detail_top_content_1"}).findAll("li")
    raw_name = header_items[0].find("span").fetchText(text=True)[0]
    # split()/join removes every whitespace character, tabs included.
    source_name = "".join(raw_name.split())
    source_address = header_items[1].fetchText(text=True)[1].strip()

    # Remark: the first text node is skipped; everything after it is the
    # remark. A single (or no) text node means there is no remark.
    remark_item = soup.find(attrs={"class": "detail_left_content_1_4"}).find("li")
    remark_parts = remark_item.fetchText(text=True)
    remark = "".join(remark_parts[1:]) if len(remark_parts) > 1 else None

    result = {
        "remark": remark,
        "sourceName": source_name,
        "sourceAddress": source_address,
        "webSiteId": id,
    }

    # Detail rows live in the first two "detail_left_content_1_1" containers.
    sections = soup.findAll(attrs={"class": "detail_left_content_1_1"})
    rows = list(sections[0].findAll("li"))
    rows.extend(sections[1].findAll("li"))

    for row in rows:
        texts = row.fetchText(text=True)
        if not texts:
            # An <li> without text carries no data; skip it instead of
            # aborting the whole parse with an IndexError.
            continue
        # NOTE(review): only the ASCII colon is removed and the label is not
        # stripped -- confirm the page never uses a full-width colon.
        label = texts[0].replace(":", "")
        field = key_value_dic.get(label)
        if field is None:
            # Unknown label: previously stored under the key None, which
            # polluted the result. Ignore it instead.
            continue
        # The stored value is the row's full text, label included, exactly
        # as before (the earlier date/non-date branches were identical).
        result[field] = "".join(texts)

    contact = self.parseHYContact(id)
    result["source"] = contact[0]
    result["dest"] = contact[1]
    return result
try: GateHuoYuanSource.objects.get(webSiteId=id) logger.info("HuoYuan %s already exsists" % id) return except: pass text = self.httpClient.geturlcon(url) result = {} try: result = self.parseHYDetailsMeta(text, id) except Exception, e: traceback.print_exc() logger.error(e) if True: source = GateHuoYuanSource() try: for k, v in result.iteritems(): logger.info("gateee " + str(k) + " " + str(v)) source.save_from_a_source(result) except Exception, e: traceback.print_exc() logger.error(e) else: logger.info("%s saved" % (id)) else: logger.error(url + " is a null page") def getAndSaveCYFromId(self, id): url = "http://56gate.com/html/cy/cydetail_%s.html" % id try: