示例#1
0
 def _listparse(self,element,cfg):
     """Build the result list for a list-type config node.

     Selects nodes under ``element`` with ``self.tran.select``, parses each
     of them with ``self._parse`` using the first ``dict``/``list``/``val``
     child of ``cfg`` (or ``None`` when ``cfg`` has no such child), then
     extends the result with the output of every ``incl`` child, and
     finally passes the result to ``self.tran.spec`` for every ``spec``
     child.

     :param element: node the selection starts from.
     :param cfg: config element describing the list; it must provide
         ``xpath``, ``findall`` and ``getroottree`` (presumably an lxml
         element -- confirm against the caller).
     :returns: the list of parsed items.  The same list object is stored in
         ``self.result`` when that attribute is still ``None`` on entry.
     """
     rlist = []
     if self.result is None:
         self.result = rlist

     # Deliberately not called ``list``: a local of that name hides the
     # builtin, which made the type check on ``incl`` results always fail.
     nodes = self.tran.select(element, cfg)

     children = cfg.xpath("dict | list | val")
     ncfg = children[0] if len(children) else None

     for node in nodes:
         rlist.append(self._parse(node, ncfg))

     for item in cfg.findall("incl"):
         for node in nodes:
             ilist = self.tran.select(node, item)
             rlt = self._inclparse(item, ilist)
             if isinstance(rlt, list):
                 rlist.extend(rlt)
             else:
                 log.error("%s 's result is not a list (%s)" % (cfg.getroottree().getpath(cfg),type(rlt)))

     for item in cfg.findall("spec"):
         self.tran.spec(rlist, item, nodes)

     return rlist
示例#2
0
文件: fund.py 项目: henry42/html2json
def update_all_nav(fundcode=[],start=None,end=None):
    """Crawl the NAV history of the given funds and store it in ``fund_data``.

    For every code in ``fundcode`` the history page is fetched through a
    ``crawler`` configured with ``crawler_allfund_of_sina.xml``; every row of
    the parsed result is inserted into ``fund_data`` (or updated when the
    key already exists).  Each fund is committed separately.

    :param fundcode: fund codes to update.  ``None`` selects every code
        found in ``fund_info``.
    :param start: optional ``datetime``; when given it replaces the fund's
        ``birthday`` as the start date of the query.
    :param end: optional ``datetime`` used as the end date of the query.
        Defaults to the time of the call.
    """
    # The default used to be ``datetime.now()`` in the signature, which is
    # evaluated only once when the function is defined.
    if end is None:
        end = datetime.now()

    # NOTE(review): the default for ``fundcode`` is an empty list, so a call
    # without arguments updates nothing; only an explicit ``None`` selects
    # all funds.  Confirm whether the default was meant to be ``None``.

    log.info('update_all_nav start')

    req = 'http://biz.finance.sina.com.cn/fundinfo/open/lsjz.php?fund_code='
    endtime = end.strftime('%Y-%m-%d')

    sql4data = '''insert into fund_data (
                code , 
                date , 
                nav , 
                tnav) values (%s,%s,%s,%s) on duplicate key update 
                nav=%s,
                tnav=%s '''

    conn = pydb.connect(**SQL_CONN)
    try:
        # Query window per fund: from its birthday up to ``end``.
        fundstart = {}
        cur = conn.cursor()
        cur.execute("select code,birthday from fund_info")
        for row in cur.fetchall():
            fundstart[row[0]] = {'startdate1' : row[1].strftime('%Y-%m-%d'),'enddate1':endtime}
        if fundcode is None:
            fundcode = list(fundstart)

        count = len(fundcode)
        cur = conn.cursor()
        for ind, fc in enumerate(fundcode, 1):
            log.info("start %s %s/%s" % (fc,ind,count))
            crawl = crawler()
            crawl.settranslator("xml")
            postdata = fundstart[fc]
            if start is not None:
                postdata['startdate1'] = start.strftime('%Y-%m-%d')
            # NOTE(review): the header name is 'Refer'; the HTTP header is
            # spelled 'Referer' -- confirm which one the site expects.
            header = {'Refer':req + fc}
            crawl.seturi(req + fc,postdata,header)
            crawl.setcfgfile("crawler_allfund_of_sina.xml")
            result = crawl.parse()
            log.debug("data %s %s" % (fc,result))

            # Report an empty result explicitly; the check used to sit inside
            # the row loop, where it could never be reached.
            if not result:
                log.error("no information about %s" % fc)
            for data in result or []:
                # nav/tnav are passed twice: once for the insert, once for
                # the "on duplicate key update" part.
                cur.execute(sql4data,(fc,data['date'])+(data['nav'],data['tnav'])*2)

            warnings = cur.fetchwarnings()
            if warnings:
                # Format instead of concatenating: the warnings need not be
                # a string.
                log.warn("db:%s" % (warnings,))
            conn.commit()
            log.info("done %s %s/%s" % (fc,ind,count))
    finally:
        # Release the connection even when a fund fails half way through.
        conn.close()
    log.info('update_all_nav done')
示例#3
0
 def seturi(self,uri,params=None,headers=None):
     """Fetch ``uri`` and store the response body in ``self.content``.

     Up to ``self.MAX_URL_OPEN`` attempts are made, pausing five seconds
     between failed attempts.  When every attempt fails ``self.content`` is
     left as ``None``; failures are logged and not raised.

     :param uri: address to open; also stored in ``self.uri``.
     :param params: optional request data.  A ``dict`` is url-encoded
         first; any other value is passed to the opener unchanged.
     :param headers: optional mapping of extra header names to values.
     """
     log.info("loading %s %s" % (uri,params))
     self.uri = uri
     # Reset first so a failed fetch cannot leave the body of an earlier
     # request behind; ``None`` is the "no content" marker.
     self.content = None
     if isinstance(params, dict):
         params = urllib.urlencode(params)
     # Attempts are numbered 1..MAX_URL_OPEN inclusive.
     for i in range(1, self.MAX_URL_OPEN + 1):
         try:
             opener = urllib.URLopener()
             opener.addheader("User-Agent","Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0;  Embedded Web Browser from: http://bsalsa.com/; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.2; .NET CLR 1.1.4322; Tablet PC 2.0)")
             opener.addheader("Cache-Control","no-cache")
             for k, v in (headers or {}).items():
                 opener.addheader(k, v)
             self.content = opener.open(uri, params).read()
             break
         except Exception as e:
             # ``Exception`` rather than ``BaseException`` so that Ctrl-C and
             # interpreter exit are not swallowed by the retry loop.
             log.error("%s got error %s" % (i,e))
             # No point in waiting after the last attempt.
             if i < self.MAX_URL_OPEN:
                 time.sleep(5)
示例#4
0
    def parse(self):
        """Parse the loaded content with the configured translator.

        ``self.result`` is cleared first.  ``self.content``, ``self.cfg`` and
        ``self.tran`` are then checked in that order; the first one that is
        ``None`` is reported through ``log.error`` and the method returns
        ``None``.  Otherwise the translator's root is parsed against the
        config, the outcome is written as indented JSON via
        ``_writetologfile`` under ``self.uuid``, and the outcome is returned.
        """
        self.result = None

        # Looked up lazily and in order, so only the first missing piece is
        # reported.
        required = (
            ("content", "no content for parsing"),
            ("cfg", "no config file"),
            ("tran", "no translator"),
        )
        for attr, message in required:
            if getattr(self, attr) is None:
                log.error(message)
                return None

        log.info("parsing")
        parsed = self._parse(self.tran.getroot(), self.cfg)
        log.info("done!")

        log_id = self.uuid
        dumped = json.dumps(parsed, ensure_ascii=False, indent=4)
        _writetologfile(log_id, dumped)
        return parsed
示例#5
0
 def showerrorpath(self,text,cfg):
     """Log an error message followed by the path of ``cfg`` in its tree.

     :param text: message to log; a falsy value is replaced by
         "parse cfg error".
     :param cfg: config element whose path, obtained with
         ``cfg.getroottree().getpath(cfg)``, is appended to the message.
     """
     prefix = (text or "parse cfg error") + " "
     tree = cfg.getroottree()
     log.error(prefix + tree.getpath(cfg))