Example #1
File: etl.py  Project: sunxy1988/etlpy
    def transform(self, datas):
        # Run the selected sub-ETL module over each record and merge its output
        # back into the original row.
        subetl = self.__proj__.modules[self.ETLSelector]
        if self.IsMultiYield:
            for data in datas:
                doc = data.copy()
                for r in subetl.__generate__(subetl.AllETLTools, [doc]):
                    yield extends.MergeQuery(r, data, self.NewColumn)
        else:
            yield None  # TODO
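
All of these examples funnel their output through extends.MergeQuery. Judging only from the call sites (a target dict, a source dict, and a space-separated column string such as self.NewColumn + " " + self.Column in Example #3 below), it appears to copy the named keys from the source row into the target and return the target. A minimal sketch of that assumed behavior, not etlpy's actual implementation:

def merge_query_sketch(target, source, columns):
    # Assumption: `columns` is a space-separated list of keys to copy from
    # `source` into `target`; the real extends.MergeQuery may differ in details.
    if not columns:
        return target
    for key in columns.split():
        if key in source:
            target[key] = source[key]
    return target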
Example #2
def XPathTF(etl, data):
    from lxml import etree
    if etl.IsManyData:
        tree = spider.GetHtmlTree(data[etl.Column])
        nodes = tree.xpath(etl.XPath)
        for node in nodes:
            ext = {'Text': spider.getnodetext(node), 'HTML': etree.tostring(node).decode('utf-8')}
            ext['OHTML'] = ext['HTML']
            yield extends.MergeQuery(ext, data, etl.NewColumn)
    else:
        tree = spider.GetHtmlTree(data[etl.Column])
        nodes = tree.xpath(etl.XPath)
        data[etl.NewColumn] = nodes[0].text
        yield data
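
XPathTF relies on the project's spider and extends helpers. For reference, a self-contained sketch of the many-nodes branch using lxml directly; the dict keys mirror the example, everything else is illustrative:

from lxml import etree

def xpath_rows(html, xpath):
    # Yield a dict of text and serialized HTML for every node matching `xpath`.
    tree = etree.HTML(html)
    for node in tree.xpath(xpath):
        yield {
            'Text': ''.join(node.itertext()),
            'HTML': etree.tostring(node).decode('utf-8'),
        }

# Usage:
# for row in xpath_rows('<ul><li>a</li><li>b</li></ul>', '//li'):
#     print(row)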
Example #3
File: etl.py  Project: sunxy1988/etlpy
    def execute(self, datas):
        subetl = self.__proj__.modules[self.ETLSelector]
        for data in datas:
            if spider.IsNone(self.NewColumn):
                doc = data.copy()
            else:
                # Seed the sub-document with only the named columns copied from the row.
                doc = {}
                extends.MergeQuery(doc, data, self.NewColumn + " " + self.Column)
            result = (r for r in generate(subetl.AllETLTools, [doc]))
            count = 0
            for r in result:
                count += 1
                print(r)
            print(count)
            yield data
Example #4
def CrawlerTF(etl, data):
    crawler = etl.crawler
    url = data[etl.Column]
    buff = etl.buff
    if url in buff:
        datas = buff[url]
    else:
        datas = crawler.CrawData(url)
        if len(buff) < 100:
            buff[url] = datas
    if etl.crawler.IsMultiData == 'List':
        for d in datas:
            res = extends.MergeQuery(d, data, etl.NewColumn)
            yield res
    else:
        data = extends.Merge(data, datas)
        yield data
Example #5
File: etl.py  Project: sunxy1988/etlpy
    def transform(self, data):
        crawler = self.crawler
        url = data[self.Column]
        buff = self.buff
        # Cache crawl results per URL; stop adding entries once the cache holds 100 URLs.
        if url in buff:
            datas = buff[url]
        else:
            datas = crawler.CrawData(url)
            if len(buff) < 100:
                buff[url] = datas
        if self.crawler.IsMultiData == 'List':
            for d in datas:
                res = extends.MergeQuery(d, data, self.NewColumn)
                yield res
        else:
            data = extends.Merge(data, datas)
            yield data
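
Examples #4 and #5 memoize crawl results by hand in buff and simply stop caching once the dict holds 100 URLs. A sketch of the same idea with functools.lru_cache, which instead evicts the least recently used URL when full; the crawl callable stands in for crawler.CrawData:

from functools import lru_cache

def make_cached_crawler(crawl, maxsize=100):
    # Wrap a crawl(url) callable with an LRU cache of at most `maxsize` URLs.
    @lru_cache(maxsize=maxsize)
    def cached(url):
        return crawl(url)
    return cached

# Usage sketch:
# cached_crawl = make_cached_crawler(crawler.CrawData)
# datas = cached_crawl(url)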
Example #6
File: etl.py  Project: sunxy1988/etlpy
    def process(self, data):
        if self.IsMultiYield:  # one to many
            for r in data:
                for p in self.transform(r):
                    yield extends.MergeQuery(p, r, self.NewColumn)
            return
        for d in data:  # one to one
            if self.OneOutput:
                if self.Column not in d:
                    # Nothing to transform on this row; pass it through unchanged.
                    yield d
                    continue
                item = d[self.Column] if self.OneInput else d
                res = self.transform(item)
                key = self.NewColumn if self.NewColumn != '' else self.Column
                d[key] = res
            else:
                self.transform(d)
            yield d
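
process turns one stream of dicts into another, so modules of this kind can be chained by feeding each generator into the next. A hypothetical driver illustrating that pattern; the modules list and its ordering are assumptions, not part of etlpy's API:

def run_pipeline(modules, seed):
    # Each module's process() lazily consumes the previous module's stream.
    stream = iter(seed)
    for module in modules:
        stream = module.process(stream)
    return stream

# Consuming the final generator drives the whole chain:
# for row in run_pipeline([tool_a, tool_b], [{'url': 'http://example.com'}]):
#     print(row)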
Example #7
File: etl.py  Project: sunxy1988/etlpy
    def transform(self, data):
        from lxml import etree
        if self.IsManyData:
            tree = spider.GetHtmlTree(data[self.Column])
            nodes = tree.xpath(self.XPath)
            for node in nodes:
                ext = {'Text': spider.getnodetext(node), 'HTML': etree.tostring(node).decode('utf-8')}
                ext['OHTML'] = ext['HTML']
                yield extends.MergeQuery(ext, data, self.NewColumn)
        else:
            # XPath can return elements or plain strings/attribute values, so fall
            # back to str() when the first match has no .text attribute.
            tree = spider.GetHtmlTree(data[self.Column])
            nodes = tree.xpath(self.XPath)
            node = nodes[0]
            if hasattr(node, 'text'):
                setValue(data, self, node.text)
            else:
                setValue(data, self, str(node))
            yield data