def extract_to_map(speaker_file):
    """Build a mapping from speaker to that speaker's list of tokens.

    The file is parsed as XML. For every child of the root, the text of the
    child's first sub-element is used as the speaker and the text of its
    second sub-element as the speech. Each line of the speech is wrapped in
    ``SENTENCE_START_TOKEN`` / ``SENTENCE_END_TOKEN`` and split on whitespace.

    :param speaker_file: path of the XML file to read.
    :return: dict mapping speaker text to a flat list of tokens. A later
        entry for the same speaker replaces the earlier one.
    """
    speaker_to_speeches = {}
    root = et.parse(os.path.abspath(speaker_file)).getroot()
    for doc in root:
        text = doc[1].text
        speaker = doc[0].text
        if text is None:
            logging.warning('Speaker "%s" had nothing to say', speaker)
            continue
        # Add start/end of sentence tokens to every line of the speech.
        marked_sentences = [
            "{} {} {}".format(SENTENCE_START_TOKEN, line, SENTENCE_END_TOKEN)
            for line in text.split("\n")
        ]
        # Raw string: "\s" in a plain literal is an invalid escape sequence.
        tokens = [
            token
            for sentence in marked_sentences
            for token in re.split(r"\s+", sentence)
        ]
        if not tokens:
            continue
        speaker_to_speeches[speaker] = tokens
    return speaker_to_speeches
def test_definition_transience_simple(self):
    """Check the definition counts produced for ``tests/transient_defs.xml``."""
    with app.test_request_context():
        document = etree.parse('tests/transient_defs.xml', parser=self.parser)
        document, definitions = populate_definitions(document, document_id=0)
        document = process_definitions(document, definitions)
        active_count = len(definitions.active)
        self.assertEqual(active_count, 1)  # one global
        total_count = len(definitions.items())
        self.assertEqual(total_count, 4)
def validate(self):
    """Validate ``self.Xml`` against the schema loaded from ``XSD``.

    The compiled schema is created on first use and kept on the instance as
    ``self.xsd``.

    :return: the result of ``self.xsd.validate`` for the parsed document.
    """
    if hasattr(self, 'xsd'):
        pass
    else:
        schema_document = etree.parse(XSD)
        self.xsd = etree.XMLSchema(schema_document)
    document = etree.fromstring(self.Xml)
    return self.xsd.validate(document)
def recentlyAdded(servercfg):
    """Log the entries of the server's "recently added" listing.

    The listing is fetched with ``getURL`` and parsed with the library
    selected by ``pArgs.xmlparser`` ("lxml" or "xml"). One line is logged
    for every ``Directory`` element and for every ``Video`` element whose
    ``type`` is ``episode``.

    :param servercfg: not read by this function; the URL is hard-coded.
    :raises ValueError: if ``pArgs.xmlparser`` is neither "lxml" nor "xml".
    """
    logger = logging.getLogger(__name__)
    url = "http://apollo.ayercraft.net:32400/library/sections/7/recentlyAdded"
    response = getURL(url)
    payload = response.text.encode('utf8')

    # Parse the body exactly once. The previous code fed the already parsed
    # element back through StringIO into a second parse() call, which fails
    # and whose result was never used.
    if pArgs.xmlparser == "lxml":
        xmlContent = letree.fromstring(payload)
    elif pArgs.xmlparser == "xml":
        xmlContent = xetree.ElementTree.fromstring(payload)
    else:
        raise ValueError("Unsupported XML parser: %r" % (pArgs.xmlparser,))

    for element in xmlContent.iter():
        if element.tag == "Directory":
            mptitle = element.get('parentTitle')
            mstitle = element.get('title')
            mtype = element.get('type')
            # Argument order follows the labels: Title, Type, SubTitle.
            logger.info("Title: %s\tType: %s\tSubTitle: %s",
                        mptitle, mtype, mstitle)
        elif element.tag == "Video":
            vtype = element.get('type')
            if vtype == "episode":
                vptitle = str(
                    element.get('grandparentTitle')).encode('utf-8').strip()
                vseason = str(
                    element.get('parentIndex')).encode('utf-8').strip()
                vepisode = str(element.get('index')).encode('utf-8').strip()
                vetitle = unicode(element.get('title')).encode(
                    'ascii', 'replace')
                logger.info("Title: %s\tSeason: %s\tEpisode: %s\tTitle: %s",
                            vptitle, vseason, vepisode, vetitle)
def transformXML(repository, XSL):
    """Convert a repository's harvested OAI files into Solr files.

    Every file in the repository's ``oai-temp`` folder is parsed, passed
    through ``XSL`` and written under the same name into the ``solr-temp``
    folder. A converted source file is then moved to the ``oai`` folder.
    A file that fails is reported with ``printError`` and processing
    continues with the next one.

    :param repository: repository description handed to ``repositoryPath``
        and ``printError``.
    :param XSL: callable invoked as ``XSL(tree, collections=...)``.
    """
    printHeading('Transforming OAI records to Solr records')

    # Run through XML files in oai-temp folder
    OAITempPath = repositoryPath('oai-temp', repository)
    OAIPath = repositoryPath('oai', repository)
    solrTempPath = repositoryPath('solr-temp', repository, None, True)

    if not os.path.exists(OAITempPath):
        return

    for fileName in sorted(os.listdir(OAITempPath)):
        solrFilePath = solrTempPath + '/' + fileName
        OAIFilePath = OAITempPath + '/' + fileName
        try:
            fileXML = etree.parse(OAIFilePath)
            solrXML = XSL(fileXML, collections="'geoleo-oai'")
            # ``with`` closes the output file even when serialising fails.
            with open(solrFilePath, 'w') as solrFile:
                solrFile.write(
                    etree.tostring(solrXML, encoding='utf-8', method='xml'))
            print(u'Created Solr file »' + solrFilePath + u'«')
            moveFile(fileName, OAITempPath, OAIPath)
        except Exception:
            # Not a bare ``except``: Ctrl-C and SystemExit must still stop
            # the run instead of being reported as a conversion failure.
            printError(u'Could not convert file »' + fileName + u'«',
                       repository)
def load(fp):
    """Parse the XML in an open file through an ``RpmParser`` target.

    :param fp: open file object to read the XML from.
    :return Rpm: whatever ``etree.parse`` returns for a parser created with
        ``target=RpmParser()``.
    """
    parser = etree.XMLParser(target=RpmParser())
    # Pass the file object itself: a string handed to parse() is taken as a
    # file name or URL, not as XML content.
    return etree.parse(fp, parser)
def process(path):
    """Return the XML at ``path`` with ``clustered="true"`` set on a database element.

    The first ``database`` element found with ``.//database`` is modified and
    the whole document is serialised pretty-printed.

    :param path: source handed to ``etree.parse``.
    :return: the serialised document.
    """
    document = etree.parse(path)
    database = document.find(".//database")
    database.set('clustered', 'true')
    return etree.tostring(document, pretty_print=True)
def main(argv):
    """Run ``extract``, ``transform`` and ``load`` on a ``word/document.xml`` file.

    :param argv: argument list; ``argv[1]`` is the input directory and
        ``argv[2]`` the output file name passed to ``load``.
    """
    dirname_input, filename_output = argv[1], argv[2]
    document_path = os.path.abspath(
        os.path.join(dirname_input, 'word', 'document.xml'))
    tree = et.parse(document_path)
    load(filename_output, transform(extract(tree)))
def test_definition_extraction(self):
    """Check which definitions are found in ``tests/3_definitions.xml``."""
    expected_keys = (
        ('accounting period', 'accounting periods'),
        ('address for service', 'address for services',
         'addresses for service', 'addresses for services'),
        ('annual meeting', 'annual meetings'),
    )
    with app.test_request_context():
        document = etree.parse('tests/3_definitions.xml', parser=self.parser)
        definitions = Definitions()
        find_all_definitions(document, definitions, document_id=0,
                             expire=False)
        self.assertEqual(len(definitions.items()), 3)
        for key in expected_keys:
            self.assertTrue(key in definitions.active)
def test_complex(self):
    """Check the expiry tags of the shareholder definitions in the gutted act."""
    tracked_words = ['shareholder', 'holder of the shares']
    expected_tags = ('DLM320498', 'DLM1624955')
    with app.test_request_context():
        document = etree.parse('tests/companiesact_gutted.xml',
                               parser=self.parser)
        document, definitions = populate_definitions(document, document_id=0)
        document, _ = process_definitions(document, definitions)
        for definition in definitions.pool:
            if definition.full_word not in tracked_words:
                continue
            for tag in expected_tags:
                self.assertIn(tag, definition.expiry_tags)
def test_equations(self):
    """Check the path string generated for four elements of the fixture."""
    cases = (
        ('.//*[@id="zzz"]', 'Test Act 666 s 2(1)(a)(i)'),
        ('.//*[@id="yyy"]', 'Test Act 666 s 2(1)(a)'),
        ('.//*[@id="xxx"]', 'Test Act 666 s 2(1)'),
        ('.//*[@id="aaa"]', 'Test Act 666 sch 1 cl 1(1)'),
    )
    tree = etree.parse('tests/path_extraction.xml', parser=self.parser)
    for query, expected in cases:
        target = tree.xpath(query)[0]
        self.assertEqual(generate_path_string(target)[0], expected)
def test_definition_redefinitions(self):
    """Check which definition id each ``catalex-def`` marker points at."""
    expected_ids = ('0-xxx', '0-yyy', '0-xxx', '0-zzz')
    with app.test_request_context():
        document = etree.parse('tests/redefinitions.xml', parser=self.parser)
        document, definitions = populate_definitions(document, document_id=0)
        document, _ = process_definitions(document, definitions)
        self.assertEqual(len(document.xpath('.//catalex-def')), 4)
        for position, def_id in enumerate(expected_ids):
            marker = document.xpath('.//catalex-def')[position]
            self.assertEqual(marker.attrib['def-ids'], def_id)
def test_equations(self):
    """Check the path string generated for four elements of the fixture."""
    expectations = (
        ('.//*[@id="zzz"]', 'Test Act 666 s 2(1)(a)(i)'),
        ('.//*[@id="yyy"]', 'Test Act 666 s 2(1)(a)'),
        ('.//*[@id="xxx"]', 'Test Act 666 s 2(1)'),
        ('.//*[@id="aaa"]', 'Test Act 666 sch 1 cl 1(1)'),
    )
    tree = etree.parse('tests/path_extraction.xml', parser=self.parser)
    for xpath_query, path_string in expectations:
        node = tree.xpath(xpath_query)[0]
        self.assertEqual(generate_path_string(node)[0], path_string)
def test_definition_redefinitions(self):
    """Check which definition id each ``catalex-def`` marker points at."""
    ids_in_document_order = ('0-xxx', '0-yyy', '0-xxx', '0-zzz')
    with app.test_request_context():
        tree = etree.parse('tests/redefinitions.xml', parser=self.parser)
        tree, definitions = populate_definitions(tree, document_id=0)
        tree, _ = process_definitions(tree, definitions)
        marker_count = len(tree.xpath('.//catalex-def'))
        self.assertEqual(marker_count, 4)
        for index, wanted in enumerate(ids_in_document_order):
            found = tree.xpath('.//catalex-def')[index].attrib['def-ids']
            self.assertEqual(found, wanted)
def main(entries=(), output_path='/home/alxant/PathonStady/Laba1/out.xml'):
    """Write a playlist XML file, then re-save it pretty-printed.

    The previous code looped over the characters of the output path string
    and indexed each one as if it were a record, so it always failed. The
    records are now an explicit argument.

    :param entries: iterable of sequences whose items 0, 1 and 2 become the
        text of the ``artist``, ``title`` and ``genre`` elements.
    :param output_path: file to write; defaults to the path that used to be
        hard-coded.
    """
    playlist = ET.Element("playlist")
    for list_entry in entries:
        product = ET.SubElement(playlist, "entry")
        print(product)
        ET.SubElement(product, "artist").text = list_entry[0]
        ET.SubElement(product, "title").text = list_entry[1]
        ET.SubElement(product, "genre").text = list_entry[2]
    tree = ET.ElementTree(playlist)
    print(tree)
    tree.write(output_path)

    # ``etree.MLParser`` does not exist; the parser class is ``XMLParser``.
    parser = etree.XMLParser(resolve_entities=False, strip_cdata=False)
    print(parser)
    document = etree.parse(output_path, parser)
    print(document)
    document.write(output_path, pretty_print=True, encoding='utf-8')
def test_case_and_plurals(self):
    """Check how many definition markers each case/plural fixture element gets."""
    # NOTE(review): the first query ends in "catalex-def-def", unlike the
    # others - confirm that this is intended.
    expected_matches = (
        ('.//*[@cid="case_wrong_start"]/catalex-def-def', 0),
        ('.//*[@cid="case_wrong_end"]/catalex-def', 0),
        ('.//*[@cid="case_correct"]/catalex-def', 1),
        ('.//*[@cid="case_plural_correct"]/catalex-def', 1),
        ('.//*[@cid="plural_correct"]/catalex-def', 1),
        ('.//*[@cid="plural_wrong"]/catalex-def', 0),
        ('.//*[@cid="complex_plural_correct"]/catalex-def', 1),
        ('.//*[@cid="complex_plural_possessive_correct"]/catalex-def', 1),
        ('.//*[@cid="complex_plural_possessive_correct_2"]/catalex-def', 2),
        ('.//*[@cid="complex_plural_possessive_correct_3"]/catalex-def', 4),
        ('.//catalex-def', 12),
    )
    with app.test_request_context():
        document = etree.parse('tests/plural_charcase_defs.xml',
                               parser=self.parser)
        document, definitions = populate_definitions(document, document_id=0)
        document, _ = process_definitions(document, definitions)
        self.assertEqual(len(definitions.items()), 6)
        for query, count in expected_matches:
            self.assertEqual(len(document.xpath(query)), count)
def load_xml_keyfile(filename):
    """Return the base64-decoded key stored in an XML key file.

    Sample XML file::

        <?xml version="1.0" encoding="utf-8"?>
        <KeyFile>
            <Meta>
                <Version>1.00</Version>
            </Meta>
            <Key>
                <Data>ySFoKuCcJblw8ie6RkMBdVCnAf4EedSch7ItujK6bmI=</Data>
            </Key>
        </KeyFile>

    :param filename: path of the key file.
    :return: the decoded content of ``Key/Data``.
    """
    with open(filename, 'r') as handle:
        # The Meta element is ignored; only version "1.00" exists so far.
        root = etree.parse(handle).getroot()
    encoded_key = root.find('Key/Data').text
    return base64.b64decode(encoded_key)
def initConfig():
    """Build the root ``MetamergeConfig`` element used as the merge target.

    The children of the "Log & Settings" file under ``args['p']`` are copied
    into a new ``MetamergeConfig`` element. If that file cannot be read,
    ``defaultConfig()`` supplies the element instead. Version attributes are
    then stamped on it, and an element is created for every name in
    ``folders`` that has none yet.

    :return: the prepared configuration element.
    """
    settings_path = os.path.join(args['p'], "Log & Settings")
    try:
        base_tree = ET.parse(settings_path)
        print("** found Log & Settings")
        cfg = Element("MetamergeConfig")
        for node in base_tree.getroot():
            cfg.append(node)
    except Exception as ex:
        print("** Exception reading Log & Settings: %s" % str(ex))
        print("** creating default basic config **")
        cfg = defaultConfig()

    cfg.set('IDIversion',
            'Compiled by ' + prog + " v" + vrs + " - "
            + today.__format__('%Y-%m-%d'))
    cfg.set('created', now)
    cfg.set('createdBy', prog + " v" + vrs)
    cfg.set('version', "7.1.1")

    for folder in folders:
        # "Properties" has its own tag; "References" is stored as the
        # "Includes" folder; everything else is a Folder named after itself.
        if folder == "Properties":
            tag, name = "Properties", folder
        elif folder == "References":
            tag, name = "Folder", "Includes"
        else:
            tag, name = "Folder", folder
        existing = cfg.find(".//%s[@name='%s']" % (tag, name))
        if existing is None:
            created = SubElement(cfg, tag)
            created.attrib['name'] = name
    return cfg
def load_xml_keyfile(filename):
    """Return the base64-decoded key stored in an XML key file.

    Sample XML file::

        <?xml version="1.0" encoding="utf-8"?>
        <KeyFile>
            <Meta>
                <Version>1.00</Version>
            </Meta>
            <Key>
                <Data>ySFoKuCcJblw8ie6RkMBdVCnAf4EedSch7ItujK6bmI=</Data>
            </Key>
        </KeyFile>

    :param filename: path of the key file.
    :return: the decoded content of ``Key/Data``.
    :raises IOError: if the file has no ``Key/Data`` element or that element
        is empty.
    """
    with open(filename, 'r') as f:
        # ignore meta, currently there is only version "1.00"
        tree = etree.parse(f).getroot()
    # The IOError used to sit after an unconditional ``return`` and could
    # never fire; raise it where the key data is actually missing.
    data = tree.find('Key/Data')
    if data is None or data.text is None:
        raise IOError('Could not parse XML keyfile.')
    # read text from key, data and convert from base64
    return base64.b64decode(data.text)
def get_latest_version(url):
    """Return the highest build number linked from the table cells of a page.

    For every ``td`` element that contains a link, the three characters
    before the last character of the first ``href`` are taken as a candidate.
    The candidates ``'way'`` and ``'0.0'`` are skipped; any other candidate
    is converted with ``int`` and kept when it is not smaller than the best
    one so far.

    :param url: address of the HTML page to download.
    :return: the winning candidate as a string, or ``'0'`` if there is none.
    :raises ValueError: if a candidate other than the two skipped ones is not
        an integer.
    """
    version = '0'
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # Release the connection even when reading fails.
        response.close()

    tree = etree.parse(BytesIO(html), etree.HTMLParser())
    for cell in tree.iter('td'):
        hrefs = cell.xpath('a/@href')
        if not hrefs:
            continue
        candidate = hrefs[0][-4:-1]
        if (candidate != 'way' and candidate != "0.0"
                and int(candidate) >= int(version)):
            version = candidate
    return version
def test_case_and_plurals(self):
    """Check how many definition markers each case/plural fixture element gets."""
    # NOTE(review): the first query ends in "catalex-def-def", unlike the
    # others - confirm that this is intended.
    marker_counts = [
        ('.//*[@cid="case_wrong_start"]/catalex-def-def', 0),
        ('.//*[@cid="case_wrong_end"]/catalex-def', 0),
        ('.//*[@cid="case_correct"]/catalex-def', 1),
        ('.//*[@cid="case_plural_correct"]/catalex-def', 1),
        ('.//*[@cid="plural_correct"]/catalex-def', 1),
        ('.//*[@cid="plural_wrong"]/catalex-def', 0),
        ('.//*[@cid="complex_plural_correct"]/catalex-def', 1),
        ('.//*[@cid="complex_plural_possessive_correct"]/catalex-def', 1),
        ('.//*[@cid="complex_plural_possessive_correct_2"]/catalex-def', 2),
        ('.//*[@cid="complex_plural_possessive_correct_3"]/catalex-def', 4),
        ('.//catalex-def', 12),
    ]
    with app.test_request_context():
        tree = etree.parse('tests/plural_charcase_defs.xml',
                           parser=self.parser)
        tree, definitions = populate_definitions(tree, document_id=0)
        tree, _ = process_definitions(tree, definitions)
        self.assertEqual(len(definitions.items()), 6)
        for xpath_query, wanted in marker_counts:
            found = len(tree.xpath(xpath_query))
            self.assertEqual(found, wanted)
class GSASearchProvider(BaseSearchProvider):
    """Search provider that queries a Google Search Appliance over HTTP."""

    SCHEME_HOST_RE = re.compile(r'[a-z\d]+:\/\/[a-z.\-\d]+\/')

    def __init__(self, search_url, domain, params=None, title_clean_re=None):
        """Store the connection settings.

        :param search_url: URL that the query string is appended to.
        :param domain: value sent as ``as_sitesearch`` (optionally extended
            with an application path in ``perform_search``).
        :param params: extra query parameters sent with every search.
        :param title_clean_re: optional regular expression source, compiled
            and kept as ``self.title_clean_re``.
        """
        self.search_url = search_url
        self.domain = domain
        # A literal ``{}`` default would be one dict shared by every
        # instance; create a fresh one per instance instead.
        self.params = {} if params is None else params
        self.title_clean_re = (
            re.compile(title_clean_re) if title_clean_re else None)

    def perform_search(self, request, query, application=None):
        """Send ``query`` to the appliance and parse the XML answer.

        :param request: not read in the code visible here.
        :param query: passed through ``self._perform_query_expansion``; the
            result is treated as an iterable of term groups.
        :param application: optional name; when given, the path of its
            ``<application>:index`` URL (minus the last character) is
            appended to the domain.
        :return: ``[]`` if fetching or parsing fails.
        """
        if application:
            domain = self.domain + reverse('%s:index' % application)[:-1]
        else:
            domain = self.domain

        query = self._perform_query_expansion(query)
        # Each group becomes "(a OR b OR "c d")"; groups are joined by spaces.
        groups = []
        for terms in query:
            quoted = [('"%s"' % t) if ' ' in t else t for t in terms]
            groups.append('(%s)' % ' OR '.join(quoted))
        query = ' '.join(groups)

        params = dict(self.params)
        params.update({
            'q': query.encode('utf-8'),
            'output': 'xml',
            'ie': 'utf8',
            'oe': 'utf8',
            'as_sitesearch': domain,
        })

        try:
            response = urllib2.urlopen(
                '?'.join((self.search_url, urllib.urlencode(params))))
        except urllib2.HTTPError:
            logger.exception(
                "Couldn't fetch results from Google Search Appliance")
            return []

        try:
            # NOTE(review): nothing in the visible code uses xml_root after
            # this point - confirm whether result handling is missing here.
            xml_root = etree.parse(response)
        except xml.parsers.expat.ExpatError:
            logger.exception(
                "Couldn't parse results from Google Search Appliance")
            return []
def transformXML(repository, XSL):
    """Convert a repository's harvested OAI files into Solr files.

    Every file in the repository's ``oai-temp`` folder is parsed, passed
    through ``XSL`` and written under the same name into the ``solr-temp``
    folder. A converted source file is then moved to the ``oai`` folder.
    A file that fails is reported with ``printError`` and processing
    continues with the next one.

    :param repository: repository description handed to ``repositoryPath``
        and ``printError``.
    :param XSL: callable invoked as ``XSL(tree, collections=...)``.
    """
    printHeading('Transforming OAI records to Solr records')

    # Run through XML files in oai-temp folder
    OAITempPath = repositoryPath('oai-temp', repository)
    OAIPath = repositoryPath('oai', repository)
    solrTempPath = repositoryPath('solr-temp', repository, None, True)

    if not os.path.exists(OAITempPath):
        return

    for fileName in sorted(os.listdir(OAITempPath)):
        solrFilePath = solrTempPath + '/' + fileName
        OAIFilePath = OAITempPath + '/' + fileName
        try:
            fileXML = etree.parse(OAIFilePath)
            solrXML = XSL(fileXML, collections="'geoleo-oai'")
            # ``with`` closes the output file even when serialising fails.
            with open(solrFilePath, 'w') as solrFile:
                solrFile.write(
                    etree.tostring(solrXML, encoding='utf-8', method='xml'))
            print(u'Created Solr file »' + solrFilePath + u'«')
            moveFile(fileName, OAITempPath, OAIPath)
        except Exception:
            # Not a bare ``except``: Ctrl-C and SystemExit must still stop
            # the run instead of being reported as a conversion failure.
            printError(u'Could not convert file »' + fileName + u'«',
                       repository)
def indent(elem, level=0):
    """Add line breaks and indentation to the whitespace of an element tree.

    Works in place and recursively. Only ``text`` and ``tail`` values that
    are empty or whitespace-only are replaced.

    :param elem: element to process.
    :param level: nesting depth of ``elem``; the root is 0.
    """
    def is_blank(value):
        return not value or not value.strip()

    pad = "\n" + level*" "
    if not len(elem):
        # Leaf: only the tail needs adjusting, and never for the root.
        if level and is_blank(elem.tail):
            elem.tail = pad
        return

    if is_blank(elem.text):
        elem.text = pad + " "
    for child in elem:
        indent(child, level+1)
        if is_blank(child.tail):
            child.tail = pad + " "
    # The last child closes the parent, so it goes back to the parent's depth.
    if is_blank(child.tail):
        child.tail = pad


# Script part: read the file named on the command line, or stdin when no
# argument is given, and write the re-indented document to stdout.
src = sys.argv[1] if len(sys.argv) > 1 else sys.stdin
tree = etree.parse(src)
indent(tree.getroot())
tree.write(sys.stdout, "utf-8")
def main():
    """Command-line entry point: harvest, transform and index repositories.

    Options (see the ``getopt`` call): ``-c/--config`` configuration file,
    ``-d/--datapath``, ``-D/--delete``, ``-f/--formats``, ``-h/--harvest``
    (comma-separated list), ``-t/--transform`` and ``-s/--solr`` (Solr URL).
    The repositories are read from the ``servers`` key of the JSON
    configuration; entries with a ``broken`` key are skipped.

    Exits with status 2 when the options cannot be parsed.
    """
    configurationPath = u'config/config.js'
    formats = False
    delete = None
    harvest = None
    transform = False
    solrXSL = None
    solrURL = None

    try:
        # evaluate command line parameters
        options, arguments = getopt.getopt(sys.argv[1:], 'c:d:fD:h:ts:', [
            'config=', 'datapath=', 'formats', 'delete=', 'harvest=',
            'transform', 'solr='
        ])
        for option, value in options:
            if option in ('-c', '--config'):
                configurationPath = value
            elif option in ('-d', '--datapath'):
                # NOTE(review): dataPath is only a local here and is not read
                # again in this function - confirm whether a module-level
                # variable was intended.
                dataPath = value
            elif option in ('-D', '--delete'):
                delete = value
            elif option in ('-f', '--formats'):
                formats = True
            elif option in ('-h', '--harvest'):
                harvest = value
            elif option in ('-t', '--transform'):
                transform = True
            elif option in ('-s', '--solr'):
                solrURL = value
                print(value)
            else:
                assert False, 'unhandled option'

        # read configuration
        with open(configurationPath) as f:
            configuration = json.load(f)
        repositories = configuration['servers']

        if delete is not None:
            deleteFiles(delete)

        # Read transformation XSL if needed. ``transform`` is a boolean, so
        # the former ``!= None`` test was always true.
        if transform:
            try:
                solrXSLXML = etree.parse('OAI-to-Solr.xsl')
                solrXSL = etree.XSLT(solrXSLXML)
            except Exception:
                printError('Failed to read XSL for Solr transformation.')

        # loop through repositories and run actions determined by the
        # command line parameters
        for repositoryID in sorted(repositories):
            repository = repositories[repositoryID]
            repository['ID'] = repositoryID
            if 'broken' not in repository:
                print('')
                printHeading(u'==== ' + repositoryID + u' ====', repository)
                if formats:
                    determineFormats(repository)
                if harvest is not None:
                    updateOAI(repository, configuration, harvest.split(','))
                if transform:
                    transformXML(repository, solrXSL)
                if solrURL is not None:
                    updateSolr(repository, solrURL)

        if solrURL is not None:
            print('')
            print(u'Committing Solr Index …')
            try:
                # NOTE(review): updateURL is not defined in this function;
                # presumably a module-level name - verify.
                solrCommit = urllib2.urlopen(updateURL, "commit=true")
                solrCommit.close()
            except urllib2.URLError as err:
                printError('Failed to commit the Solr index: ' + str(err))

    except getopt.GetoptError as err:
        printError('Could not parse the options: ' + str(err))
        sys.exit(2)
def setUp(self):
    """Create the shared parser and load the root of the companies fixture."""
    self.parser = etree.XMLParser(remove_blank_text=True)
    document = etree.parse('tests/instruments/companies.xml',
                           parser=self.parser)
    self.tree = document.getroot()
def prettyPrintXml(self, ):
    """Rewrite ``self.xmlFile`` in place, pretty-printed and UTF-8 encoded.

    Entities are not resolved and CDATA sections are kept while parsing.
    """
    assert self.xmlFile is not None
    parser_options = dict(resolve_entities=False, strip_cdata=False)
    parsed = etree.parse(self.xmlFile, etree.XMLParser(**parser_options))
    parsed.write(self.xmlFile, pretty_print=True, encoding='utf-8')
#!/usr/bin/python
#-*-coding:utf-8-*-
# Print the text of every <word> found under token 5 of sentence 2 in the
# XML file named by the first command-line argument.
import sys

# "1xml" (digit one) is not a valid module name; the package is "lxml".
from lxml import etree

# Close the input file as soon as it has been parsed.
with open(sys.argv[1]) as htm1:
    root = etree.parse(htm1)

for item in root.xpath('//sentence[@id= "2"]/tokens/token[@id= "5"]/word'):
    print(item.text)
def scan(path):
    """Append the XML elements found in the files under ``path`` to ``config``.

    For every directory returned by ``dirs(path)``: "Resources" is scanned
    recursively; a directory listed in ``folders`` has each of its files
    parsed, and every element whose tag matches the directory's element name
    is renamed after its file and appended to the matching folder element of
    the module-level ``config`` tree.

    :param path: directory to scan.
    :raises Exception: if ``config`` has no folder element for a directory.
    """
    for dir in dirs(path):
        if dir == "Resources":
            scan(os.path.join(path, dir))
        elif dir in folders:
            print "checking %s" % dir
            # Locate the element of ``config`` that receives this directory.
            if dir == "Properties":
                folder = config.find(".//Properties[@name='%s']" % dir)
            elif dir == "References":
                folder = config.find(".//Folder[@name='Includes']")
            else:
                folder = config.find(".//Folder[@name='%s']" % dir)
            if folder is None:
                raise Exception("Missing <Folder> with name = %s" % dir)
            # Tag expected in the files; by default the directory name
            # without its last character.
            if dir == "Properties":
                elemName = "PropertyStore"
            elif dir == "References":
                elemName = "Include"
            elif dir == "Schedules":
                elemName = "Scheduler"
            else:
                elemName = dir[:len(dir) - 1]
            #print "**** Looking for %s in %s" % (elemName, path + "/" + dir)
            for file in files(path + "/" + dir):
                #subfolder = SubElement(folder, type)
                #name = os.path.splitext(file)[0]
                #subfolder.attrib['name'] = name
                if file.find('Java-Properties') >= 0:
                    continue
                filepath = os.path.join(os.path.join(path, dir), file)
                xml = ET.parse(filepath)
                #print "Parsed file %s" % filepath
                if dir == "Properties":
                    parent = xml.findall(".//PropertyStore")
                    useFolder = folder.find("./Stores")
                    if useFolder is None:
                        useFolder = ET.SubElement(folder, "Stores")
                    # NOTE(review): ``folder`` is rebound to the Stores
                    # element, so the lookup above starts from Stores for the
                    # next file - confirm that this is intended.
                    folder = useFolder
                    #print "Folder set to %s" % ET.tostring(folder)
                elif dir == "References":
                    parent = xml.findall(".//Include")
                else:
                    parent = xml.getroot()
                for child in parent:
                    #print "Checking %s == %s" % (child.tag, elemName)
                    if child.tag == elemName:
                        try:
                            # The element is named after its file: extension,
                            # a leading dot and a ".script"/".assemblyline"
                            # suffix are removed.
                            childName = file
                            p = childName.rfind(".")
                            if p > 0:
                                childName = childName[:p]
                            #print "--> reduced childName to %s from %s" % (childName, file)
                            #childName = child.attrib['name']
                            if childName.startswith("."):
                                childName = childName[1:]
                            if childName.endswith(".script"):
                                childName = childName[:len(childName) - 7]
                            if childName.endswith(".assemblyline"):
                                childName = childName[:len(childName) - 13]
                            child.attrib['name'] = childName
                            print " ..adding %s" % child.attrib['name']
                            folder.append(child)
                        except Exception as ex:
                            print "!! Skipping due to %s - %s" % (ex, child)
                            i = 42 # Skip those tag without the 'name' attribute
                #print ' <%s name="%s" />' % (type, name)
        i = 42 # do nothing
def write_XML(self, data_list): print data_list #data_list contains id, question_number and received_answer id_list=[] print data_list #data_list = [item.received_answer.replace('#', ' ') for item in data_list] #data_list = [item.received_answer.replace('|', 'Ì') for item in data_list] #data_list = [item.received_answer.replace('$', 'Î') for item in data_list] for item in data_list: #add all the IDs from the data_list into id_list id_list.append(item.id) #a skeleton of answer XML is already created containg tags and attribute names #the XML skeleton is copied into another file with name "XML'id_list[0]" in which the answers will be filled current_path = os.path.dirname(__file__) parent_folder = os.path.abspath(os.path.join(current_path, os.pardir)) xml_folder = os.path.normpath(os.path.join(parent_folder, "XML_files")) shutil.copy2(os.path.join(xml_folder, "Answer_XML_skeleton.xml"), os.path.join(xml_folder,"XML%s.xml"%id_list[0])) path = os.path.join(xml_folder,"XML%s.xml"%id_list[0]) print "11231239123123981209381092380192830912830812938102938" print path print "11231239123123981209381092380192830912830812938102938" parser = etree.XMLParser(remove_blank_text=True, encoding='iso-8859-1') tree = etree.parse(path,parser) root = tree.getroot() #first filling the value of tag attributes(eg <data id="value">) from the received answer SMS for element in root.iter(): attribute = element.attrib for item in attribute: for item1 in data_list: #print "item ="+item, "question_number="+str(item1.question_number) #to avoid conflict, the attribute "id" of "data" tag is stored as "dataID" #check if "dataID" is encountered in the data list if item == "id" and item1.question_number == "dataID": #if encountered, add the value of received answer to the value of "id" attribute element.attrib[item] = item1.received_answer if item == str(item1.question_number): text = item1.received_answer if '#' in item1.received_answer: text = item1.received_answer.replace('#', ' ') #check if the 
attribute name from the answer XML skeleton matches the item from the data_list if item1.received_answer == "X": #if received answer contains "X" then nothing is stored as the value of attribute continue #otherwise the the received answer is stored as the value of the attribute element.attrib[item] = text #filling the data items(eg <QN_470_0>data</QN_470-0> for element in root: root1 = element #counter to make sure that data is inserted one after another rather than one before another count = 0 for item in data_list: if item.question_number.startswith("QN"): #check if the received_answer is "-", if true it confirms the presence of tags of group or matrix if item.received_answer == "-": #no need to add the data to the XML, so SKIP continue #if received answer contains data, then make the corresponding question_number a tag elem = etree.Element(str(item.question_number)) #and store the received_answer as the text of the same tag text = item.received_answer if '#' in item.received_answer: text = item.received_answer.replace('#', ' ') print u"\u00CE" if '$' in item.received_answer: text = item.received_answer.replace('$', u"\u00CE") print text if '|' in item.received_answer: text = item.received_answer.replace('|', u"\u00CC") print text print str(text) elem.text = unicode(text) #insert the tag into the XML file one after another #count defines where to insert in the XML file root1.insert(count,elem) count=count+1 #finally write the data into the XML file tree.write(path,pretty_print=True) #old code #return id_list[0] #old code #-------------------- #new test code fo = open(path,"r") print fo """ soup = BeautifulStoneSoup(fo.read()) html = fo.read().encode('ascii', 'ignore') html = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES) """ print "2" h = HTMLParser.HTMLParser() xml = h.unescape(fo.read()) print "xml=", xml fo.close() fo = open(path, 'w+') fo.write(xml) fo.close fo = open(path,"r") return fo.read()
def extract_to_map(speaker_file, speaker_to_speeches):
    """Append every speech in ``speaker_file`` to its speaker's list.

    For each child of the XML root, the text of the first sub-element is the
    key and the text of the second sub-element is the speech. A missing
    speech text is stored as ``''``.

    :param speaker_file: path of the XML file to read.
    :param speaker_to_speeches: mapping whose values support ``append``;
        it is modified in place.
    """
    absolute_path = os.path.abspath(speaker_file)
    for entry in et.parse(absolute_path).getroot():
        speeches = speaker_to_speeches[entry[0].text]
        speech = entry[1].text
        speeches.append('' if speech is None else speech)
def __listManifestItems(contentOPFPath):
    """Return the ``href`` of every manifest item in an OPF file.

    :param contentOPFPath: source handed to ``etree.parse``.
    :return: the result of the XPath query for the ``href`` attributes.
    """
    document = etree.parse(contentOPFPath)
    opf_namespaces = {'opf': 'http://www.idpf.org/2007/opf'}
    return document.xpath("//opf:manifest/opf:item/@href",
                          namespaces=opf_namespaces)
def slaZ(inp, State=State, newline='\n'):
    """Parse ``inp`` as XML and fetch its root element.

    :param inp: file name or file object handed to ``ElementTree.parse``.
    :param State: not read in the code visible here.
    :param newline: not read in the code visible here.
    """
    # ``parse`` lives in the ElementTree module; the ``xml.etree`` package
    # itself has no such function.
    import xml.etree.ElementTree as et
    e = et.parse(inp)
    # NOTE(review): the root is fetched but nothing visible uses it -
    # confirm whether the rest of this function is missing.
    q = e.getroot()
# NOTE(review): this fragment has no ``def`` line in view; it reads ``elem``
# and ``level`` and calls ``indent`` recursively, so it looks like the body of
# ``indent(elem, level=0)`` followed by module-level script code - confirm and
# restore the header.

# Line break plus one space per nesting level.
i = "\n" + level * " "
if len(elem):
    # Element with children: put the first child on its own line.
    if not elem.text or not elem.text.strip():
        elem.text = i + " "
        pass
    for e in elem:
        indent(e, level + 1)
        if not e.tail or not e.tail.strip():
            e.tail = i + " "
            pass
    # The last child is followed by the parent's closing tag, so its tail
    # goes back to the parent's indentation.
    if not e.tail or not e.tail.strip():
        e.tail = i
        pass
else:
    # Leaf element: only the tail is adjusted, and not at level 0.
    if level and (not elem.tail or not elem.tail.strip()):
        elem.tail = i
        pass
    pass

# Read from the file named on the command line, or from stdin without one.
if len(sys.argv) > 1:
    src = sys.argv[1]
    pass
else:
    src = sys.stdin
    pass

tree = etree.parse(src)
indent(tree.getroot())
tree.write(sys.stdout, "utf-8")
def main():
    """Command-line entry point: harvest, transform and index repositories.

    Options (see the ``getopt`` call): ``-c/--config`` configuration file,
    ``-d/--datapath``, ``-D/--delete``, ``-f/--formats``, ``-h/--harvest``
    (comma-separated list), ``-t/--transform`` and ``-s/--solr`` (Solr URL).
    The repositories are read from the ``servers`` key of the JSON
    configuration; entries with a ``broken`` key are skipped.

    Exits with status 2 when the options cannot be parsed.
    """
    configurationPath = u'config/config.js'
    formats = False
    delete = None
    harvest = None
    transform = False
    solrXSL = None
    solrURL = None

    try:
        # evaluate command line parameters
        options, arguments = getopt.getopt(
            sys.argv[1:], 'c:d:fD:h:ts:',
            ['config=', 'datapath=', 'formats', 'delete=', 'harvest=',
             'transform', 'solr='])
        for option, value in options:
            if option in ('-c', '--config'):
                configurationPath = value
            elif option in ('-d', '--datapath'):
                # NOTE(review): dataPath is only a local here and is not read
                # again in this function - confirm whether a module-level
                # variable was intended.
                dataPath = value
            elif option in ('-D', '--delete'):
                delete = value
            elif option in ('-f', '--formats'):
                formats = True
            elif option in ('-h', '--harvest'):
                harvest = value
            elif option in ('-t', '--transform'):
                transform = True
            elif option in ('-s', '--solr'):
                solrURL = value
                print(value)
            else:
                assert False, 'unhandled option'

        # read configuration
        with open(configurationPath) as f:
            configuration = json.load(f)
        repositories = configuration['servers']

        if delete is not None:
            deleteFiles(delete)

        # Read transformation XSL if needed. ``transform`` is a boolean, so
        # the former ``!= None`` test was always true.
        if transform:
            try:
                solrXSLXML = etree.parse('OAI-to-Solr.xsl')
                solrXSL = etree.XSLT(solrXSLXML)
            except Exception:
                printError('Failed to read XSL for Solr transformation.')

        # loop through repositories and run actions determined by the
        # command line parameters
        for repositoryID in sorted(repositories):
            repository = repositories[repositoryID]
            repository['ID'] = repositoryID
            if 'broken' in repository:
                continue
            print('')
            printHeading(u'==== ' + repositoryID + u' ====', repository)
            if formats:
                determineFormats(repository)
            if harvest is not None:
                updateOAI(repository, configuration, harvest.split(','))
            if transform:
                transformXML(repository, solrXSL)
            if solrURL is not None:
                updateSolr(repository, solrURL)

        if solrURL is not None:
            print('')
            print(u'Committing Solr Index …')
            try:
                # NOTE(review): updateURL is not defined in this function;
                # presumably a module-level name - verify.
                solrCommit = urllib2.urlopen(updateURL, "commit=true")
                solrCommit.close()
            except urllib2.URLError as err:
                printError('Failed to commit the Solr index: ' + str(err))

    except getopt.GetoptError as err:
        printError('Could not parse the options: ' + str(err))
        sys.exit(2)