def transform(self, datas):
    """Run the selected sub-ETL on each record and yield the merged output.

    The sub-ETL module is looked up in ``self.__proj__.modules`` under the
    key ``self.ETLSelector``.  When ``self.IsMultiYield`` is true, each
    incoming record is copied, the copy is fed (as a one-element list) to
    the sub-ETL's ``__generate__`` together with its ``AllETLTools``, and
    every produced item is combined with the original record through
    ``extends.MergeQuery(item, record, self.NewColumn)``.

    When ``self.IsMultiYield`` is false the generator yields a single
    ``None``; that branch is still marked as unfinished.
    """
    sub_module = self.__proj__.modules[self.ETLSelector]
    if not self.IsMultiYield:
        yield None  # TODO
        return
    for record in datas:
        # The sub-ETL receives a copy, so the caller's record is not the
        # object handed to the nested pipeline.
        snapshot = record.copy()
        produced = sub_module.__generate__(sub_module.AllETLTools, [snapshot])
        for item in produced:
            yield extends.MergeQuery(item, record, self.NewColumn)
def XPathTF(etl, data):
    """Evaluate ``etl.XPath`` against the HTML stored in ``data[etl.Column]``.

    The HTML is parsed with ``spider.GetHtmlTree`` and queried once.

    * If ``etl.IsManyData`` is true, one record is yielded per matched node.
      Each is built from a dict with the keys ``'Text'``
      (``spider.getnodetext(node)``), ``'HTML'`` (the node serialised with
      ``etree.tostring`` and decoded as UTF-8) and ``'OHTML'`` (a copy of
      ``'HTML'``), combined with *data* via ``extends.MergeQuery``.
    * Otherwise only the first match is used: its value is stored in
      ``data[etl.NewColumn]`` and *data* is yielded.

    In the single-result mode a match that has no ``text`` attribute (for
    example the string results lxml returns for attribute or ``text()``
    selections) is stored as ``str(match)`` instead of raising
    ``AttributeError``.  If the expression matches nothing, *data* is
    yielded unchanged instead of raising ``IndexError``.
    """
    from lxml import etree

    tree = spider.GetHtmlTree(data[etl.Column])
    nodes = tree.xpath(etl.XPath)

    if etl.IsManyData:
        for node in nodes:
            ext = {
                'Text': spider.getnodetext(node),
                'HTML': etree.tostring(node).decode('utf-8'),
            }
            ext['OHTML'] = ext['HTML']
            yield extends.MergeQuery(ext, data, etl.NewColumn)
        return

    if not nodes:
        # No match: pass the record through untouched rather than crash
        # the whole pipeline on one page that lacks the element.
        yield data
        return

    node = nodes[0]
    data[etl.NewColumn] = node.text if hasattr(node, 'text') else str(node)
    yield data
def execute(self, datas):
    """Drive the selected sub-ETL for every record, then yield the record.

    The sub-ETL module is ``self.__proj__.modules[self.ETLSelector]``.
    For each incoming record a seed document is prepared:

    * when ``spider.IsNone(self.NewColumn)`` is true, the seed is a copy of
      the record;
    * otherwise the seed starts as an empty dict and is passed to
      ``extends.MergeQuery(seed, record, self.NewColumn + " " + self.Column)``.

    ``generate(sub_module.AllETLTools, [seed])`` is then consumed to the
    end, printing every produced item and finally the number of items,
    after which the incoming record is yielded.
    """
    sub_module = self.__proj__.modules[self.ETLSelector]
    for record in datas:
        if not spider.IsNone(self.NewColumn):
            seed = {}
            extends.MergeQuery(seed, record, self.NewColumn + " " + self.Column)
        else:
            seed = record.copy()

        # NOTE(review): the original line breaks were lost; this assumes the
        # count is printed once, after the sub-ETL output has been drained.
        total = 0
        for total, item in enumerate(generate(sub_module.AllETLTools, [seed]), 1):
            print(item)
        print(total)
        yield record
def CrawlerTF(etl, data):
    """Crawl the URL found in ``data[etl.Column]`` and merge the result.

    Results are looked up in ``etl.buff`` first.  On a miss the URL is
    fetched with ``etl.crawler.CrawData(url)`` and the result is stored in
    ``etl.buff`` only while it holds fewer than 100 entries.

    If ``etl.crawler.IsMultiData`` equals ``'List'``, every element of the
    crawl result is combined with *data* through
    ``extends.MergeQuery(element, data, etl.NewColumn)`` and yielded in
    turn.  Otherwise a single ``extends.Merge(data, result)`` is yielded.
    """
    fetcher = etl.crawler
    address = data[etl.Column]
    cache = etl.buff

    if address not in cache:
        fetched = fetcher.CrawData(address)
        # The cache stops accepting new URLs once it reaches 100 entries.
        if len(cache) < 100:
            cache[address] = fetched
    else:
        fetched = cache[address]

    list_mode = etl.crawler.IsMultiData == 'List'
    if not list_mode:
        yield extends.Merge(data, fetched)
        return
    for entry in fetched:
        yield extends.MergeQuery(entry, data, etl.NewColumn)
def transform(self, data):
    """Crawl the URL stored in ``data[self.Column]`` and merge what comes back.

    ``self.buff`` acts as a URL-keyed cache: a hit reuses the stored
    result, a miss calls ``self.crawler.CrawData(url)`` and remembers the
    result as long as the cache currently has fewer than 100 entries.

    When ``self.crawler.IsMultiData`` equals ``'List'`` one record is
    yielded per element of the crawl result, each produced by
    ``extends.MergeQuery(element, data, self.NewColumn)``.  In every other
    case exactly one record, ``extends.Merge(data, result)``, is yielded.
    """
    crawler = self.crawler
    url = data[self.Column]
    cache = self.buff

    cached = url in cache
    if cached:
        payload = cache[url]
    else:
        payload = crawler.CrawData(url)
        if len(cache) < 100:  # cache is capped at 100 URLs
            cache[url] = payload

    if self.crawler.IsMultiData == 'List':
        for element in payload:
            merged = extends.MergeQuery(element, data, self.NewColumn)
            yield merged
        return

    yield extends.Merge(data, payload)
def process(self, data):
    """Apply ``self.transform`` to every record in *data* and yield the results.

    Two modes are supported:

    * **One to many** (``self.IsMultiYield`` true): ``self.transform(record)``
      is iterated and each produced item is combined with its source record
      through ``extends.MergeQuery(item, record, self.NewColumn)``.
    * **One to one** (otherwise): each record is yielded exactly once.

      - With ``self.OneOutput`` true, records lacking ``self.Column`` are
        passed through untouched.  For the rest, ``self.transform`` receives
        either the column value (``self.OneInput`` true) or the whole
        record, and its return value is stored under ``self.NewColumn``,
        falling back to ``self.Column`` when ``self.NewColumn`` is empty or
        ``None``.
      - With ``self.OneOutput`` false, ``self.transform(record)`` is called
        and its return value is ignored.
    """
    if self.IsMultiYield:
        # One to many.
        for record in data:
            for produced in self.transform(record):
                yield extends.MergeQuery(produced, record, self.NewColumn)
        return

    # One to one.
    for record in data:
        if not self.OneOutput:
            self.transform(record)
            yield record
            continue

        # The original tested this same condition twice, joined by ``or``.
        if self.Column not in record:
            yield record
            continue

        item = record[self.Column] if self.OneInput else record
        result = self.transform(item)
        # ``or`` also covers NewColumn being None, which would otherwise
        # write the result under the key None.
        key = self.NewColumn or self.Column
        record[key] = result
        yield record
def transform(self, data):
    """Evaluate ``self.XPath`` against the HTML stored in ``data[self.Column]``.

    The HTML is parsed with ``spider.GetHtmlTree`` and queried once.

    * If ``self.IsManyData`` is true, one record is yielded per matched
      node.  Each is built from a dict with the keys ``'Text'``
      (``spider.getnodetext(node)``), ``'HTML'`` (the node serialised with
      ``etree.tostring`` and decoded as UTF-8) and ``'OHTML'`` (a copy of
      ``'HTML'``), combined with *data* via ``extends.MergeQuery``.
    * Otherwise only the first match is used: ``setValue(data, self, value)``
      is called with the match's ``text`` attribute when it has one, or
      with ``str(match)`` when it does not, and *data* is yielded.

    If the expression matches nothing in the single-result mode, *data* is
    yielded unchanged instead of raising ``IndexError``.
    """
    from lxml import etree

    tree = spider.GetHtmlTree(data[self.Column])
    nodes = tree.xpath(self.XPath)

    if self.IsManyData:
        for node in nodes:
            ext = {
                'Text': spider.getnodetext(node),
                'HTML': etree.tostring(node).decode('utf-8'),
            }
            ext['OHTML'] = ext['HTML']
            yield extends.MergeQuery(ext, data, self.NewColumn)
        return

    if not nodes:
        # No match: pass the record through untouched rather than crash
        # the whole pipeline on one page that lacks the element.
        yield data
        return

    node = nodes[0]
    value = node.text if hasattr(node, 'text') else str(node)
    setValue(data, self, value)
    yield data