def createXmlandWrite(self, dataToStore, filename):
    '''Build a <results> XML document from dataToStore and write it to disk.

    dataToStore -- list of dicts; each dict becomes one <item> element whose
                   keys are child tag names and whose values are text content.
    filename    -- base name of the output file (written as "scf_<filename>",
                   timestamped, with a ".xml" extension).
    '''
    if not dataToStore:
        # nothing to serialize -- avoid writing an empty document
        return
    doc = Document()
    results = doc.createElement("results")
    doc.appendChild(results)
    for record in dataToStore:
        item = doc.createElement("item")
        for key, value in record.items():
            tag = doc.createElement(key)
            if key == "author":
                # authors are wrapped in an extra <name> element
                name_tag = doc.createElement("name")
                # FIX: run the author value through toUnicode() as well --
                # the original passed the raw value here while converting
                # every other field, so a non-unicode author string could
                # break createTextNode. toUnicode on already-unicode input
                # is assumed to be a no-op (matches its use elsewhere).
                name_tag.appendChild(doc.createTextNode(toUnicode(value)))
                tag.appendChild(name_tag)
            else:
                tag.appendChild(doc.createTextNode(toUnicode(value)))
            item.appendChild(tag)
        results.appendChild(item)
    pretty_xml = doc.toprettyxml(indent=" ", encoding="UTF-8")
    writeToFile(pretty_xml,
                "scf_" + filename,
                "/mnt/minerva1/nlp/projects/spinn3r/solr_data/xjerab13",
                extension=".xml",
                timestamp=True)
def createXmlandWrite(self, dataToStore, filename):
    '''Serialize a list of record dicts into a pretty-printed <results>
    XML file ("scf_<filename>.xml", timestamped) in the project folder.'''
    if not dataToStore:
        return
    doc = Document()
    root = doc.createElement("results")
    doc.appendChild(root)
    # one <item> element per record; each dict key becomes a child tag
    for record in dataToStore:
        item_node = doc.createElement("item")
        for field, value in record.items():
            field_node = doc.createElement(field)
            if field == "author":
                # author text lives inside a nested <name> element
                inner = doc.createElement("name")
                inner.appendChild(doc.createTextNode(value))
                field_node.appendChild(inner)
            else:
                field_node.appendChild(doc.createTextNode(toUnicode(value)))
            item_node.appendChild(field_node)
        root.appendChild(item_node)
    writeToFile(
        doc.toprettyxml(indent=" ", encoding="UTF-8"),
        "scf_" + filename,
        "/mnt/minerva1/nlp/projects/spinn3r/solr_data/xjerab13",
        extension=".xml",
        timestamp=True)
def text_creator(self, link):
    '''Fetch the word-level analysis XML for *link*, archive a copy, and
    return the whitespace-normalized concatenation of all <word> texts.

    link -- URL whose path is resolved against self.handle_url to locate
            the XML; its last path segment names the backup file.
    '''
    xml_file = getDataFrom(urljoin(self.handle_url, urlparse(link).path),
                           self.username, self.password)
    # keep a timestamped copy of the raw XML for later inspection
    writeToFile(xml_file, link.split("/")[-1],
                os.path.join("rss_backup", "sail", "xmls"),
                ".xml", timestamp=True)
    tree = ElementTree.fromstring(xml_file)
    # FIX: collect the words into a list and join once, instead of the
    # original quadratic `text += ...` loop followed by a redundant
    # split/join normalization pass. Output is identical: words separated
    # by single spaces, no trailing space.
    words = []
    for node in tree.getiterator('word'):
        if node.text is not None:
            words.extend(node.text.split())
    text = " ".join(words)
    # glue free-standing sentence-final periods back onto the previous word
    return text.replace(" .", ".")
def text_creator(self, link):
    '''Download the word-level XML for a link, back it up, and rebuild the
    plain text from its <word> elements (whitespace-normalized).'''
    target = urljoin(self.handle_url, urlparse(link).path)
    xml_file = getDataFrom(target, self.username, self.password)
    backup_dir = os.path.join("rss_backup", "sail", "xmls")
    # archive the raw XML before processing
    writeToFile(xml_file, link.split("/")[-1], backup_dir, ".xml",
                timestamp=True)
    tree = ElementTree.fromstring(xml_file)
    text = ""
    for node in tree.getiterator('word'):
        if node.text is None:
            continue
        text += "".join(word + " " for word in node.text.split())
    # reattach free-standing periods, then collapse whitespace runs
    text = text.replace(" .", ".")
    return " ".join(text.split())
def writeWrapper(self, name):
    '''Persist self.doc to self.folder, picking the extension by error state.'''
    # failed runs are written with a ".err" suffix so they stand out
    if self.was_error:
        extension = ".err"
    else:
        extension = ".xml"
    pretty = self.doc.toprettyxml(indent=" ", encoding="UTF-8")
    message = writeToFile(content=pretty,
                          filename=name,
                          folder=self.folder,
                          extension=extension,
                          timestamp=True)
    log.info(message)
def writeWrapper(self, name):
    '''Write the current document to disk and log the result message;
    runs flagged as errors get a ".err" extension instead of ".xml".'''
    log.info(writeToFile(
        content=self.doc.toprettyxml(indent=" ", encoding="UTF-8"),
        filename=name,
        folder=self.folder,
        extension=(".err" if self.was_error else ".xml"),
        timestamp=True))
def createXmlandWrite(self, name):
    '''From parsed data in self.dataToStore, create the final XML document,
    write it to self.outputFolder (timestamped, ".xml"), and clear the buffer.

    name -- base name of the output file.
    '''
    if not self.dataToStore:
        return
    # Translation table deleting ASCII control characters except \n (10)
    # and \r (13): mapping a code point to None removes it.
    # FIX: hoisted out of the loops -- the table is loop-invariant but the
    # original rebuilt it for every "text" field of every record. list(...)
    # also keeps the concatenation working on both Python 2 and Python 3.
    control_chars = dict.fromkeys(
        list(range(10)) + [11, 12] + list(range(14, 32)))
    doc = Document()
    results = doc.createElement("results")
    doc.appendChild(results)
    # walk over the buffered records, one <item> per record
    for record in self.dataToStore:
        item = doc.createElement("item")
        for key, value in record.items():
            tag = doc.createElement(key)
            if key == "author":
                # author text is nested inside an extra <name> element
                name_tag = doc.createElement("name")
                name_tag.appendChild(doc.createTextNode(value))
                tag.appendChild(name_tag)
            else:
                if key == "text":
                    # strip control characters before normalization
                    value = value.translate(control_chars)
                # collapse all whitespace runs to single spaces
                tag.appendChild(doc.createTextNode(" ".join(value.split())))
            item.appendChild(tag)
        results.appendChild(item)
    writeToFile(doc.toprettyxml(indent=" ", encoding="UTF-8"),
                name, self.outputFolder,
                extension=".xml", timestamp=True)
    self.dataToStore = []
def createXmlandWrite(self, name):
    '''From parsed data, create final xml document and flush the buffer.'''
    if not self.dataToStore:
        return
    doc = Document()
    root = doc.createElement("results")
    doc.appendChild(root)
    # one <item> element per buffered record
    for record in self.dataToStore:
        item_node = doc.createElement("item")
        for field, value in record.items():
            field_node = doc.createElement(field)
            if field == "author":
                # author text goes inside a nested <name> element
                inner = doc.createElement("name")
                inner.appendChild(doc.createTextNode(value))
                field_node.appendChild(inner)
            elif field == "text":
                # delete ASCII control chars (except \n and \r) by mapping
                # them to None, then collapse whitespace runs
                mpa = dict.fromkeys(range(10) + [11, 12] + range(14, 32))
                cleaned = value.translate(mpa)
                field_node.appendChild(
                    doc.createTextNode(" ".join(cleaned.split())))
            else:
                field_node.appendChild(
                    doc.createTextNode(" ".join(value.split())))
            item_node.appendChild(field_node)
        root.appendChild(item_node)
    writeToFile(doc.toprettyxml(indent=" ", encoding="UTF-8"),
                name,
                self.outputFolder,
                extension=".xml",
                timestamp=True)
    self.dataToStore = []
def process(self, url):
    '''Download feed, process and create output.

    Fetches *url* (with self.username/self.password), parses it via
    feedparser, and builds an XML <results> document from the entries
    according to self.item_list, which maps an output tag name to a
    (path, func, req) triple:
      path -- key sequence into the parsed feed; the sentinel "_C_" is
              replaced by the current entry index
      func -- optional post-processing callable applied to the raw value
      req  -- if True, a missing value is logged and flags the run as error
    Entries not newer than the cached timestamp for this url are skipped;
    the cache is updated with the newest timestamp seen. Returns None.
    '''
    content = getDataFrom(url, self.username, self.password)
    self.was_error = False
    self.doc = Document()
    # parsing feed
    feed = feedparser.parse(content)
    # if zero entries, no job appeared -- nothing to process
    if len(feed.entries) <= 0:
        log.error("No Entries found in %s, you sure it is RSS or atom feed?", url)
        return None;
    # add static data to feed
    self.update_feed(feed, url)
    # create root tag
    t_results = self.doc.createElement("results")
    self.doc.appendChild(t_results)
    item_cnt = 0
    timestamp = None
    if self.cache:
        # timestamp of the newest entry processed in a previous run
        timestamp = self.cache.loadFromCache(url)
    new_stamp = timestamp
    #print "timestamp ",
    #print new_stamp
    # walk over parsed feed, mining data, and create final xml
    for _c_ in range(len(feed.entries)):
        t_item = self.doc.createElement("item")
        try:
            # check feed item publication date, save only newer items;
            # NOTE(review): entries lacking date_parsed (or an unorderable
            # comparison) land in the bare except below and are processed
            # unconditionally -- deliberate best-effort filtering
            if new_stamp < feed.entries[_c_].date_parsed:
                new_stamp = feed.entries[_c_].date_parsed
            if timestamp >= feed.entries[_c_].date_parsed:
                continue
        except:
            pass
        # walking over xml tags and their value specs
        for tag, value in self.item_list.items():
            path, func, req = value
            data = None
            if path is not None:
                try:
                    # follow the key path into the parsed feed structure
                    f = feed
                    for key in path:
                        # _C_ as counter, means the entry index
                        if key == "_C_":
                            f = f[_c_]
                        else:
                            f = f[key]
                    # data stays None if any lookup above raised
                    data = f
                except:
                    pass
            # data postprocessing
            if func is not None:
                data = func(data)
            if req and data is None:
                log.error("Nenalezena data pro tag %s", tag)
                self.was_error = True
            if data is None:
                data = ""
            child = self.doc.createElement(tag)
            # strings become text nodes; DOM nodes are attached directly;
            # dicts become attributes of the child element
            if isinstance(data, basestring):
                data = self.doc.createTextNode(toUnicode(data))
            if isinstance(data, Node):
                child.appendChild(data)
            elif isinstance(data, dict):
                for id, data in data.items():
                    child.setAttribute(id, data)
            t_item.appendChild(child)
        t_results.appendChild(t_item)
        item_cnt += 1
    if item_cnt > 0:
        #print "new stamp "
        #print new_stamp
        # file name derived from the second-level domain of the feed url
        self.writeWrapper(urlparse(url).netloc.split(".")[-2])
        if self.cache:
            self.cache.storeToCache(url, new_stamp)
        if self.makeBackup:
            # keep a timestamped copy of the raw downloaded feed
            writeToFile(content,
                        "RSS_Sail_", os.path.join("rss_backup", "sail"),
                        ".xml", timestamp=True)
    else:
        log.debug("Nothing new in feed - %s ", url)
    return
def process(self, url):
    '''Download feed, process and create output.

    Fetches *url* (with self.username/self.password), parses it via
    feedparser, and builds an XML <results> document from the entries
    according to self.item_list, which maps an output tag name to a
    (path, func, req) triple:
      path -- key sequence into the parsed feed; the sentinel "_C_" is
              replaced by the current entry index
      func -- optional post-processing callable applied to the raw value
      req  -- if True, a missing value is logged and flags the run as error
    Entries not newer than the cached timestamp for this url are skipped;
    the cache is updated with the newest timestamp seen. Returns None.
    '''
    content = getDataFrom(url, self.username, self.password)
    self.was_error = False
    self.doc = Document()
    # parsing feed
    feed = feedparser.parse(content)
    # if zero entries, no job appeared -- nothing to process
    if len(feed.entries) <= 0:
        log.error(
            "No Entries found in %s, you sure it is RSS or atom feed?", url)
        return None
    # add static data to feed
    self.update_feed(feed, url)
    # create root tag
    t_results = self.doc.createElement("results")
    self.doc.appendChild(t_results)
    item_cnt = 0
    timestamp = None
    if self.cache:
        # timestamp of the newest entry processed in a previous run
        timestamp = self.cache.loadFromCache(url)
    new_stamp = timestamp
    #print "timestamp ",
    #print new_stamp
    # walk over parsed feed, mining data, and create final xml
    for _c_ in range(len(feed.entries)):
        t_item = self.doc.createElement("item")
        try:
            # check feed item publication date, save only newer items;
            # NOTE(review): entries lacking date_parsed (or an unorderable
            # comparison) land in the bare except below and are processed
            # unconditionally -- deliberate best-effort filtering
            if new_stamp < feed.entries[_c_].date_parsed:
                new_stamp = feed.entries[_c_].date_parsed
            if timestamp >= feed.entries[_c_].date_parsed:
                continue
        except:
            pass
        # walking over xml tags and their value specs
        for tag, value in self.item_list.items():
            path, func, req = value
            data = None
            if path is not None:
                try:
                    # follow the key path into the parsed feed structure
                    f = feed
                    for key in path:
                        # _C_ as counter, means the entry index
                        if key == "_C_":
                            f = f[_c_]
                        else:
                            f = f[key]
                    # data stays None if any lookup above raised
                    data = f
                except:
                    pass
            # data postprocessing
            if func is not None:
                data = func(data)
            if req and data is None:
                log.error("Nenalezena data pro tag %s", tag)
                self.was_error = True
            if data is None:
                data = ""
            child = self.doc.createElement(tag)
            # strings become text nodes; DOM nodes are attached directly;
            # dicts become attributes of the child element
            if isinstance(data, basestring):
                data = self.doc.createTextNode(toUnicode(data))
            if isinstance(data, Node):
                child.appendChild(data)
            elif isinstance(data, dict):
                for id, data in data.items():
                    child.setAttribute(id, data)
            t_item.appendChild(child)
        t_results.appendChild(t_item)
        item_cnt += 1
    if item_cnt > 0:
        #print "new stamp "
        #print new_stamp
        # file name derived from the second-level domain of the feed url
        self.writeWrapper(urlparse(url).netloc.split(".")[-2])
        if self.cache:
            self.cache.storeToCache(url, new_stamp)
        if self.makeBackup:
            # keep a timestamped copy of the raw downloaded feed
            writeToFile(content,
                        "RSS_Sail_", os.path.join("rss_backup", "sail"),
                        ".xml", timestamp=True)
    else:
        log.debug("Nothing new in feed - %s ", url)
    return