示例#1
0
文件: hw2.py 项目: bedoron/nlp_hw_1
def extract_to_map(speaker_file):
    """Build a mapping from speaker name to that speaker's token list.

    Every child of the XML root is read as a document whose first
    sub-element holds the speaker name and whose second sub-element holds
    the speech text. Each line of the text is wrapped in
    SENTENCE_START_TOKEN / SENTENCE_END_TOKEN and split on whitespace; the
    tokens of all lines are concatenated in order.

    :param speaker_file: path of the XML file to read.
    :return: dict mapping speaker text to a list of tokens. A speaker that
        occurs in several documents keeps only the tokens of the last one.
    """
    speaker_to_speeches = {}
    root = et.parse(os.path.abspath(speaker_file)).getroot()
    for doc in root:
        text = doc[1].text
        speaker = doc[0].text
        if text is None:
            logging.warning('Speaker "%s" had nothing to say', speaker)
            continue

        # Add start/end of sentence tokens, then tokenize on whitespace.
        tokens = []
        for line in text.split("\n"):
            marked = "{} {} {}".format(
                SENTENCE_START_TOKEN, line, SENTENCE_END_TOKEN)
            # Raw string: "\s" in a plain literal is an invalid escape.
            tokens.extend(re.split(r"\s+", marked))

        if not tokens:
            continue

        speaker_to_speeches[speaker] = tokens

    return speaker_to_speeches
示例#2
0
 def test_definition_transience_simple(self):
     """Populate and process transient_defs.xml, then check the definition counts."""
     with app.test_request_context():
         parsed = etree.parse('tests/transient_defs.xml', parser=self.parser)
         parsed, found = populate_definitions(parsed, document_id=0)
         parsed = process_definitions(parsed, found)
         active_count = len(found.active)
         self.assertEqual(active_count, 1)  # one global
         total_count = len(found.items())
         self.assertEqual(total_count, 4)
示例#3
0
    def validate(self):
        """Validate ``self.Xml`` against the schema loaded from ``XSD``.

        The compiled schema is stored on the instance as ``self.xsd`` the
        first time it is needed and reused afterwards.

        :return: the result of ``self.xsd.validate`` for the parsed document.
        """
        if hasattr(self, 'xsd'):
            schema = self.xsd
        else:
            schema = etree.XMLSchema(etree.parse(XSD))
            self.xsd = schema

        document = etree.fromstring(self.Xml)
        return schema.validate(document)
示例#4
0
def recentlyAdded(servercfg):
    """Log the entries of the "recently added" listing.

    The listing is fetched with ``getURL`` from a fixed URL, parsed with the
    XML library selected by ``pArgs.xmlparser`` and walked element by
    element. ``Directory`` elements and ``Video`` elements of type
    ``episode`` are reported through the module logger.

    :param servercfg: not used by this function.
    :raises ValueError: if ``pArgs.xmlparser`` is neither ``"lxml"`` nor
        ``"xml"``.
    """
    logger = logging.getLogger(__name__)

    url = "http://apollo.ayercraft.net:32400/library/sections/7/recentlyAdded"
    response = getURL(url)
    payload = response.text.encode('utf8')

    # fromstring() already yields the parsed root element, which is all the
    # loop below needs.
    if pArgs.xmlparser == "lxml":
        xmlContent = letree.fromstring(payload)
    elif pArgs.xmlparser == "xml":
        xmlContent = xetree.ElementTree.fromstring(payload)
    else:
        raise ValueError("Unsupported XML parser: %r" % (pArgs.xmlparser,))

    for element in xmlContent.iter():
        if element.tag == "Directory":
            mptitle = element.get('parentTitle')
            mstitle = element.get('title')
            mtype = element.get('type')
            # Argument order follows the labels: title, type, sub title.
            logger.info("Title: %s\tType: %s\tSubTitle: %s" %
                        (mptitle, mtype, mstitle))
        if element.tag == "Video":
            vtype = element.get('type')
            if vtype == "episode":
                vptitle = str(
                    element.get('grandparentTitle')).encode('utf-8').strip()
                vseason = str(
                    element.get('parentIndex')).encode('utf-8').strip()
                vepisode = str(element.get('index')).encode('utf-8').strip()
                vetitle = unicode(element.get('title')).encode(
                    'ascii', 'replace')
                logger.info("Title: %s\tSeason: %s\tEpisode: %s\tTitle: %s" %
                            (vptitle, vseason, vepisode, vetitle))
示例#5
0
 def test_definition_transience_simple(self):
     """Check active and total definition counts for transient_defs.xml."""
     source = 'tests/transient_defs.xml'
     with app.test_request_context():
         document = etree.parse(source, parser=self.parser)
         document, defs = populate_definitions(document, document_id=0)
         document = process_definitions(document, defs)
         # one global
         self.assertEqual(len(defs.active), 1)
         self.assertEqual(len(defs.items()), 4)
示例#6
0
def transformXML(repository, XSL):
    """Transform the harvested OAI files of a repository into Solr files.

    Every file in the repository's 'oai-temp' folder is parsed, run through
    ``XSL`` and the result is written under the same file name into the
    'solr-temp' folder. A converted source file is then moved from
    'oai-temp' to the 'oai' folder. A file that cannot be converted is
    reported with ``printError`` and processing continues with the next one.

    :param repository: repository description handed to the path and
        reporting helpers.
    :param XSL: callable applied to the parsed document together with the
        ``collections`` parameter.
    """
    printHeading('Transforming OAI records to Solr records')
    # Run through XML files in oai-temp folder
    OAITempPath = repositoryPath('oai-temp', repository)
    OAIPath = repositoryPath('oai', repository)
    solrTempPath = repositoryPath('solr-temp', repository, None, True)
    if not os.path.exists(OAITempPath):
        return

    for fileName in sorted(os.listdir(OAITempPath)):
        solrFilePath = solrTempPath + '/' + fileName
        OAIFilePath = OAITempPath + '/' + fileName
        try:
            fileXML = etree.parse(OAIFilePath)
            solrXML = XSL(fileXML, collections="'geoleo-oai'")
            # Serialize before opening the target so that a failed
            # transformation does not leave an empty Solr file behind.
            serialized = etree.tostring(solrXML, encoding='utf-8',
                                        method='xml')
            # The serialization is encoded bytes, hence binary mode.
            with open(solrFilePath, 'wb') as solrFile:
                solrFile.write(serialized)
            print(u'Created Solr file »' + solrFilePath + u'«')
            moveFile(fileName, OAITempPath, OAIPath)
        except Exception:
            # Not a bare except: KeyboardInterrupt and SystemExit must
            # still be able to stop the run.
            printError(u'Could not convert file »' + fileName + u'«',
                       repository)
示例#7
0
文件: rpm.py 项目: drewsonne/pyum
    def load(fp):
        """
        Parse the XML available from ``fp`` with an ``RpmParser`` target.

        :param fp: open file object to read the XML from.
        :return Rpm: whatever ``etree.parse`` returns for the target parser.
        """
        parser = etree.XMLParser(target=RpmParser())
        # etree.parse() takes a file name or file object; given the file's
        # contents it would try to open a file of that name.
        return etree.parse(fp, parser)
def process(path):
    """Set ``clustered="true"`` on the first ``database`` element of a document.

    :param path: source handed to ``etree.parse``.
    :return: the pretty-printed serialization of the modified document.
    """
    document = etree.parse(path)
    database = document.find(".//database")
    database.attrib['clustered'] = 'true'
    return etree.tostring(document, pretty_print=True)
示例#9
0
文件: rpm.py 项目: pombredanne/pyum
    def load(fp):
        """
        Parse the XML available from ``fp`` with an ``RpmParser`` target.

        :param fp: open file object to read the XML from.
        :return Rpm: whatever ``etree.parse`` returns for the target parser.
        """
        target_parser = etree.XMLParser(target=RpmParser())
        # The file object itself is passed on: etree.parse() would treat a
        # string of file contents as a file name.
        return etree.parse(fp, target_parser)
示例#10
0
def main(argv):
    """Run the extract, transform and load steps on ``word/document.xml``.

    :param argv: argument list; ``argv[1]`` is the input directory and
        ``argv[2]`` the output file name.
    """
    dirname_input, filename_output = argv[1], argv[2]

    document_path = os.path.abspath(
        os.path.join(dirname_input, 'word', 'document.xml'))
    extracted = extract(et.parse(document_path))
    transformed = transform(extracted)
    load(filename_output, transformed)
示例#11
0
 def test_definition_extraction(self):
     """Extract definitions from 3_definitions.xml and check the three expected keys."""
     expected_keys = (
         ('accounting period', 'accounting periods'),
         ('address for service', 'address for services',
          'addresses for service', 'addresses for services'),
         ('annual meeting', 'annual meetings'),
     )
     with app.test_request_context():
         document = etree.parse('tests/3_definitions.xml', parser=self.parser)
         definitions = Definitions()
         find_all_definitions(document, definitions, document_id=0, expire=False)
         self.assertEqual(len(definitions.items()), 3)
         for key in expected_keys:
             self.assertTrue(key in definitions.active)
示例#12
0
 def test_complex(self):
     """Check the expiry tags of the shareholder definitions in companiesact_gutted.xml."""
     watched_words = ['shareholder', 'holder of the shares']
     with app.test_request_context():
         document = etree.parse('tests/companiesact_gutted.xml', parser=self.parser)
         document, definitions = populate_definitions(document, document_id=0)
         document, _ = process_definitions(document, definitions)
         for definition in definitions.pool:
             if definition.full_word not in watched_words:
                 continue
             self.assertIn('DLM320498', definition.expiry_tags)
             self.assertIn('DLM1624955', definition.expiry_tags)
示例#13
0
 def test_complex(self):
     """Shareholder definitions must carry both expected expiry tags."""
     with app.test_request_context():
         parsed = etree.parse('tests/companiesact_gutted.xml',
                              parser=self.parser)
         parsed, found = populate_definitions(parsed, document_id=0)
         parsed, _ = process_definitions(parsed, found)
         relevant = ['shareholder', 'holder of the shares']
         for entry in found.pool:
             if entry.full_word in relevant:
                 tags = entry.expiry_tags
                 self.assertIn('DLM320498', tags)
                 self.assertIn('DLM1624955', entry.expiry_tags)
示例#14
0
 def test_equations(self):
     """Check the generated path string for four elements of path_extraction.xml."""
     expected_paths = (
         ('.//*[@id="zzz"]', 'Test Act 666 s 2(1)(a)(i)'),
         ('.//*[@id="yyy"]', 'Test Act 666 s 2(1)(a)'),
         ('.//*[@id="xxx"]', 'Test Act 666 s 2(1)'),
         ('.//*[@id="aaa"]', 'Test Act 666 sch 1 cl 1(1)'),
     )
     tree = etree.parse('tests/path_extraction.xml', parser=self.parser)
     for query, expected in expected_paths:
         el = tree.xpath(query)[0]
         self.assertEqual(generate_path_string(el)[0], expected)
示例#15
0
 def test_definition_redefinitions(self):
     """Check count and ``def-ids`` of the catalex-def elements in redefinitions.xml."""
     expected_ids = ('0-xxx', '0-yyy', '0-xxx', '0-zzz')
     with app.test_request_context():
         document = etree.parse('tests/redefinitions.xml', parser=self.parser)
         document, definitions = populate_definitions(document, document_id=0)
         document, _ = process_definitions(document, definitions)
         self.assertEqual(len(document.xpath('.//catalex-def')), 4)
         for position, expected in enumerate(expected_ids):
             tagged = document.xpath('.//catalex-def')[position]
             self.assertEqual(tagged.attrib['def-ids'], expected)
示例#16
0
 def test_equations(self):
     """Path strings for the elements with ids zzz, yyy, xxx and aaa."""
     document = etree.parse('tests/path_extraction.xml', parser=self.parser)
     cases = [
         ('zzz', 'Test Act 666 s 2(1)(a)(i)'),
         ('yyy', 'Test Act 666 s 2(1)(a)'),
         ('xxx', 'Test Act 666 s 2(1)'),
         ('aaa', 'Test Act 666 sch 1 cl 1(1)'),
     ]
     for element_id, expected in cases:
         matches = document.xpath('.//*[@id="%s"]' % element_id)
         path_string = generate_path_string(matches[0])[0]
         self.assertEqual(path_string, expected)
示例#17
0
 def test_definition_redefinitions(self):
     """Four catalex-def elements with the expected ``def-ids`` sequence."""
     with app.test_request_context():
         parsed = etree.parse('tests/redefinitions.xml', parser=self.parser)
         parsed, found = populate_definitions(parsed, document_id=0)
         parsed, _ = process_definitions(parsed, found)
         self.assertEqual(len(parsed.xpath('.//catalex-def')), 4)
         index = 0
         for expected in ['0-xxx', '0-yyy', '0-xxx', '0-zzz']:
             actual = parsed.xpath('.//catalex-def')[index].attrib['def-ids']
             self.assertEqual(actual, expected)
             index += 1
示例#18
0
 def test_definition_extraction(self):
     """Three definitions are found in 3_definitions.xml and all are active."""
     with app.test_request_context():
         parsed = etree.parse('tests/3_definitions.xml', parser=self.parser)
         found = Definitions()
         find_all_definitions(parsed, found, document_id=0, expire=False)
         self.assertEqual(len(found.items()), 3)

         accounting = ('accounting period', 'accounting periods')
         self.assertTrue(accounting in found.active)

         address = ('address for service', 'address for services',
                    'addresses for service', 'addresses for services')
         self.assertTrue(address in found.active)

         meeting = ('annual meeting', 'annual meetings')
         self.assertTrue(meeting in found.active)
示例#19
0
文件: App03.py 项目: Alxant1394/Test
def main(entries=(), output_path='/home/alxant/PathonStady/Laba1/out.xml'):
    """Write a playlist XML file and re-save it pretty-printed.

    A ``playlist`` element with one ``entry`` child per item of *entries* is
    written to *output_path*; the file is then parsed again and written back
    pretty-printed and UTF-8 encoded.

    :param entries: iterable of sequences whose items 0, 1 and 2 are the
        artist, title and genre of one playlist entry. Defaults to none.
    :param output_path: path of the XML file to write.
    """
    playlist = ET.Element("playlist")
    # Iterate the entries, not the characters of the output path.
    for list_entry in entries:
        product = ET.SubElement(playlist, "entry")
        print(product)
        ET.SubElement(product, "artist").text = list_entry[0]
        ET.SubElement(product, "title").text = list_entry[1]
        ET.SubElement(product, "genre").text = list_entry[2]
    tree = ET.ElementTree(playlist)
    print(tree)
    tree.write(output_path)
    parser = etree.XMLParser(resolve_entities=False, strip_cdata=False)
    print(parser)
    document = etree.parse(output_path, parser)
    print(document)
    document.write(output_path, pretty_print=True, encoding='utf-8')
示例#20
0
 def test_case_and_plurals(self):
     """Check how many definition tags each case/plural fixture element receives."""
     expected_counts = (
         ('.//*[@cid="case_wrong_start"]/catalex-def-def', 0),
         ('.//*[@cid="case_wrong_end"]/catalex-def', 0),
         ('.//*[@cid="case_correct"]/catalex-def', 1),
         ('.//*[@cid="case_plural_correct"]/catalex-def', 1),
         ('.//*[@cid="plural_correct"]/catalex-def', 1),
         ('.//*[@cid="plural_wrong"]/catalex-def', 0),
         ('.//*[@cid="complex_plural_correct"]/catalex-def', 1),
         ('.//*[@cid="complex_plural_possessive_correct"]/catalex-def', 1),
         ('.//*[@cid="complex_plural_possessive_correct_2"]/catalex-def', 2),
         ('.//*[@cid="complex_plural_possessive_correct_3"]/catalex-def', 4),
         ('.//catalex-def', 12),
     )
     with app.test_request_context():
         tree = etree.parse('tests/plural_charcase_defs.xml', parser=self.parser)
         tree, definitions = populate_definitions(tree, document_id=0)
         tree, _ = process_definitions(tree, definitions)
         self.assertEqual(len(definitions.items()), 6)
         for query, expected in expected_counts:
             self.assertEqual(len(tree.xpath(query)), expected)
示例#21
0
def load_xml_keyfile(filename):
    """
    Return the base64-decoded text of the ``Key/Data`` element of a key file.

    // Sample XML file:
    // <?xml version="1.0" encoding="utf-8"?>
    // <KeyFile>
    //     <Meta>
    //         <Version>1.00</Version>
    //     </Meta>
    //     <Key>
    //         <Data>ySFoKuCcJblw8ie6RkMBdVCnAf4EedSch7ItujK6bmI=</Data>
    //     </Key>
    // </KeyFile>
    """
    with open(filename, 'r') as handle:
        # ignore meta, currently there is only version "1.00"
        root = etree.parse(handle).getroot()
        data_node = root.find('Key/Data')
        encoded_key = data_node.text
        # convert the key text from base64
        decoded_key = base64.b64decode(encoded_key)
    return decoded_key
示例#22
0
def initConfig():
    """Create the ``MetamergeConfig`` root element for the compiled config.

    The children of the "Log & Settings" file found under ``args['p']`` are
    copied into a new root element; if that file cannot be read, the result
    of ``defaultConfig()`` is used instead. The root is then stamped with
    version and creation attributes, and a container element is added for
    every entry of ``folders`` that is not present yet.

    :return: the root configuration element.
    """
    settings_path = os.path.join(args['p'], "Log & Settings")
    try:
        base_tree = ET.parse(settings_path)
        print("** found Log & Settings")
        cfg = Element("MetamergeConfig")
        for node in base_tree.getroot():
            cfg.append(node)
    except Exception as ex:
        print("** Exception reading Log & Settings: %s" % str(ex))
        print("** creating default basic config **")
        cfg = defaultConfig()

    tool_label = prog + " v" + vrs
    stamp = today.__format__('%Y-%m-%d')
    cfg.attrib['IDIversion'] = 'Compiled by ' + tool_label + " - " + stamp
    cfg.attrib['created'] = now
    cfg.attrib['createdBy'] = tool_label
    cfg.attrib['version'] = "7.1.1"

    for folder in folders:
        # "Properties" has its own tag; "References" is stored as the
        # "Includes" folder; everything else is a folder of the same name.
        if folder == "Properties":
            tag, name = "Properties", folder
        elif folder == "References":
            tag, name = "Folder", "Includes"
        else:
            tag, name = "Folder", folder

        existing = cfg.find(".//%s[@name='%s']" % (tag, name))
        if existing is None:
            created = SubElement(cfg, tag)
            created.attrib['name'] = name

    return cfg
示例#23
0
def load_xml_keyfile(filename):
    """
    Return the base64-decoded text of the ``Key/Data`` element of a key file.

    // Sample XML file:
    // <?xml version="1.0" encoding="utf-8"?>
    // <KeyFile>
    //     <Meta>
    //         <Version>1.00</Version>
    //     </Meta>
    //     <Key>
    //         <Data>ySFoKuCcJblw8ie6RkMBdVCnAf4EedSch7ItujK6bmI=</Data>
    //     </Key>
    // </KeyFile>

    :param filename: path of the XML key file.
    :return: the decoded key.
    :raises IOError: if the file has no ``Key/Data`` element or that element
        is empty.
    """
    # Binary mode leaves decoding to the XML parser, which honours the
    # encoding named in the XML declaration.
    with open(filename, 'rb') as f:
        # ignore meta, currently there is only version "1.00"
        tree = etree.parse(f).getroot()

    data = tree.find('Key/Data')
    if data is None or not data.text:
        raise IOError('Could not parse XML keyfile.')
    # read text from key, data and convert from base64
    return base64.b64decode(data.text)
示例#24
0
def get_latest_version(url):
    """Return the highest build number linked from an HTML page.

    The page at *url* is downloaded and parsed as HTML. For every ``td``
    element that directly contains a link, the three characters before the
    last character of the link target are taken as a candidate version;
    candidates that are not integers are ignored.

    :param url: address of the page to scan.
    :return: the candidate string with the greatest integer value (the last
        one on ties), or ``'0'`` if no candidate was found.
    """
    version = '0'
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        response.close()
    tree = etree.parse(BytesIO(html), etree.HTMLParser())

    for cell in tree.iter('td'):
        hrefs = cell.xpath('a/@href')
        if not hrefs:
            continue
        candidate = hrefs[0][-4:-1]
        try:
            candidate_number = int(candidate)
        except ValueError:
            # Link that does not end in a build number (for example 'way'
            # or '0.0'): not a version, skip it.
            continue
        if candidate_number >= int(version):
            version = candidate

    return version
示例#25
0
 def test_case_and_plurals(self):
     """Definition tag counts per fixture element of plural_charcase_defs.xml."""
     with app.test_request_context():
         parsed = etree.parse('tests/plural_charcase_defs.xml',
                              parser=self.parser)
         parsed, found = populate_definitions(parsed, document_id=0)
         parsed, _ = process_definitions(parsed, found)
         self.assertEqual(len(found.items()), 6)

         def count(query):
             return len(parsed.xpath(query))

         self.assertEqual(
             count('.//*[@cid="case_wrong_start"]/catalex-def-def'), 0)
         for cid, expected in [
                 ('case_wrong_end', 0),
                 ('case_correct', 1),
                 ('case_plural_correct', 1),
                 ('plural_correct', 1),
                 ('plural_wrong', 0),
                 ('complex_plural_correct', 1),
                 ('complex_plural_possessive_correct', 1),
                 ('complex_plural_possessive_correct_2', 2),
                 ('complex_plural_possessive_correct_3', 4),
         ]:
             self.assertEqual(
                 count('.//*[@cid="%s"]/catalex-def' % cid), expected)
         self.assertEqual(count('.//catalex-def'), 12)
class GSASearchProvider(BaseSearchProvider):
    """Search provider that sends queries to a Google Search Appliance."""

    SCHEME_HOST_RE = re.compile(r'[a-z\d]+:\/\/[a-z.\-\d]+\/')

    def __init__(self, search_url, domain, params=None, title_clean_re=None):
        """Store the connection settings.

        :param search_url: URL the query string is appended to.
        :param domain: value sent as ``as_sitesearch``.
        :param params: extra request parameters; defaults to none.
        :param title_clean_re: optional pattern, compiled and stored as
            ``self.title_clean_re``.
        """
        self.search_url, self.domain = search_url, domain
        # None instead of a ``{}`` default, so instances created without
        # params never share one dict object.
        self.params = {} if params is None else params
        self.title_clean_re = re.compile(title_clean_re) if title_clean_re else None

    def perform_search(self, request, query, application=None):
        """Send *query* to the appliance and parse the XML response.

        :param request: not used in the visible part of this method.
        :param query: query handed to ``self._perform_query_expansion``.
        :param application: optional application name; when given, the
            search is restricted to that application's index URL below the
            domain.
        :return: ``[]`` when the request or the parsing fails.
        """
        # NOTE(review): the visible code does nothing with ``xml_root`` after
        # a successful parse -- confirm nothing was lost from this method.

        if application:
            domain = self.domain + reverse('%s:index' % application)[:-1]
        else:
            domain = self.domain

        # Each expanded term group becomes "(a OR b ...)"; terms containing
        # a space are quoted.
        groups = []
        for terms in self._perform_query_expansion(query)[:]:
            alternatives = ((('"%s"' % t) if ' ' in t else t) for t in terms)
            groups.append('(%s)' % (' OR '.join(alternatives)))
        query = ' '.join(groups)

        params = dict(self.params)
        params.update({
            'q': query.encode('utf-8'),
            'output': 'xml',
            'ie': 'utf8',
            'oe': 'utf8',
            'as_sitesearch': domain,
        })

        try:
            response = urllib2.urlopen('?'.join((self.search_url, urllib.urlencode(params))))
        except urllib2.HTTPError:
            logger.exception("Couldn't fetch results from Google Search Appliance")
            return []

        try:
            xml_root = etree.parse(response)
        except xml.parsers.expat.ExpatError:
            logger.exception("Couldn't parse results from Google Search Appliance")
            return []
示例#27
0
def transformXML (repository, XSL):
	"""Transform the harvested OAI files of a repository into Solr files.

	Every file in the repository's 'oai-temp' folder is parsed, run through
	``XSL`` and the result is written under the same file name into the
	'solr-temp' folder. A converted source file is then moved from
	'oai-temp' to the 'oai' folder. A file that cannot be converted is
	reported with ``printError`` and processing continues with the next one.

	:param repository: repository description handed to the path and
		reporting helpers.
	:param XSL: callable applied to the parsed document together with the
		``collections`` parameter.
	"""
	printHeading('Transforming OAI records to Solr records')
	# Run through XML files in oai-temp folder
	OAITempPath = repositoryPath('oai-temp', repository)
	OAIPath = repositoryPath('oai', repository)
	solrTempPath = repositoryPath('solr-temp', repository, None, True)
	if not os.path.exists(OAITempPath):
		return

	for fileName in sorted(os.listdir(OAITempPath)):
		solrFilePath = solrTempPath + '/' + fileName
		OAIFilePath = OAITempPath + '/' + fileName
		try:
			fileXML = etree.parse(OAIFilePath)
			solrXML = XSL(fileXML, collections="'geoleo-oai'")
			# Serialize before opening the target so that a failed
			# transformation does not leave an empty Solr file behind.
			serialized = etree.tostring(solrXML, encoding='utf-8', method='xml')
			# The serialization is encoded bytes, hence binary mode.
			with open(solrFilePath, 'wb') as solrFile:
				solrFile.write(serialized)
			print(u'Created Solr file »' + solrFilePath + u'«')
			moveFile(fileName, OAITempPath, OAIPath)
		except Exception:
			# Not a bare except: KeyboardInterrupt and SystemExit must
			# still be able to stop the run.
			printError(u'Could not convert file »' + fileName + u'«', repository)
示例#28
0
def indent(elem, level=0):
    """Insert whitespace into *elem*'s subtree so it serializes indented.

    Blank ``text`` and ``tail`` values are replaced by a newline followed by
    two spaces per nesting level; values containing non-whitespace are left
    alone. The tree is modified in place.

    :param elem: element whose subtree is re-indented.
    :param level: nesting depth of *elem*; 0 for the root.
    """
    def is_blank(value):
        return not value or not value.strip()

    closing_pad = "\n" + level * "  "
    child_pad = closing_pad + "  "

    if not len(elem):
        # Leaf: only its own tail is set, and never for the root.
        if level and is_blank(elem.tail):
            elem.tail = closing_pad
        return

    if is_blank(elem.text):
        elem.text = child_pad
    for child in elem:
        indent(child, level + 1)
        if is_blank(child.tail):
            child.tail = child_pad
    # The last child is followed by the parent's closing tag, which sits
    # one level further out.
    if is_blank(child.tail):
        child.tail = closing_pad

# Read from the file named by the first command line argument, or from
# standard input when no argument is given.
src = sys.argv[1] if len(sys.argv) > 1 else sys.stdin

# Parse, re-indent in place and write the document to standard output.
tree = etree.parse(src)
document_root = tree.getroot()
indent(document_root)
tree.write(sys.stdout, "utf-8")
示例#29
0
def main():
    """Command line entry point.

    Parses the command line options, loads the JSON configuration and then,
    for every configured repository that is not marked 'broken', runs the
    requested actions: format detection, OAI harvesting, XSL transformation
    and Solr update. When a Solr URL was given, a commit request is sent
    once all repositories have been processed.

    Exits with status 2 when the command line cannot be parsed.
    """
    configurationPath = u'config/config.js'
    formats = False
    delete = None
    harvest = None
    transform = False
    solrXSL = None
    solrURL = None

    # evaluate command line parameters
    try:
        options, arguments = getopt.getopt(sys.argv[1:], 'c:d:fD:h:ts:', [
            'config=', 'datapath=', 'formats', 'delete=', 'harvest=',
            'transform', 'solr='
        ])
    except getopt.GetoptError as err:
        printError('Could not parse the options: ' + str(err))
        sys.exit(2)

    for option, value in options:
        if option in ('-c', '--config'):
            configurationPath = value
        elif option in ('-d', '--datapath'):
            # NOTE(review): this only binds a local that is never read
            # here -- presumably a module-level dataPath was meant; confirm.
            dataPath = value
        elif option in ('-D', '--delete'):
            delete = value
        elif option in ('-f', '--formats'):
            formats = True
        elif option in ('-h', '--harvest'):
            harvest = value
        elif option in ('-t', '--transform'):
            transform = True
        elif option in ('-s', '--solr'):
            solrURL = value
            print(value)
        else:
            assert False, 'unhandled option'

    # read configuration
    with open(configurationPath) as f:
        configuration = json.load(f)
    repositories = configuration['servers']

    if delete is not None:
        deleteFiles(delete)

    # Read transformation XSL if needed. ``transform`` is a boolean, so it
    # is tested for truth rather than compared with None.
    if transform:
        try:
            solrXSLXML = etree.parse('OAI-to-Solr.xsl')
            solrXSL = etree.XSLT(solrXSLXML)
        except Exception:
            printError('Failed to read XSL for Solr transformation.')

    # loop through repositories and run actions determined by the command line parameters
    for repositoryID in sorted(repositories):
        repository = repositories[repositoryID]
        repository['ID'] = repositoryID

        if 'broken' in repository:
            continue

        print('')
        printHeading(u'==== ' + repositoryID + u' ====', repository)

        if formats:
            determineFormats(repository)

        if harvest is not None:
            updateOAI(repository, configuration, harvest.split(','))

        if transform:
            transformXML(repository, solrXSL)

        if solrURL is not None:
            updateSolr(repository, solrURL)

    if solrURL is not None:
        print('')
        print(u'Committing Solr Index …')
        try:
            # NOTE(review): updateURL is not defined in this function; it
            # must exist at module level for the commit to work -- confirm.
            solrCommit = urllib2.urlopen(updateURL, "commit=true")
            solrCommit.close()
        except urllib2.URLError as err:
            printError('Failed to commit the Solr index: ' + str(err))
示例#30
0
 def setUp(self):
     """Create a parser that drops blank text and load the companies fixture root."""
     self.parser = etree.XMLParser(remove_blank_text=True)
     document = etree.parse('tests/instruments/companies.xml', parser=self.parser)
     self.tree = document.getroot()
 def prettyPrintXml(self, ):
     """Re-write ``self.xmlFile`` in place, pretty-printed and UTF-8 encoded."""
     assert self.xmlFile is not None
     xml_parser = etree.XMLParser(strip_cdata=False, resolve_entities=False)
     parsed = etree.parse(self.xmlFile, xml_parser)
     parsed.write(self.xmlFile, encoding='utf-8', pretty_print=True)
示例#32
0
#!/usr/bin/python
#-*-coding:utf-8-*-
"""Print the text of selected <word> elements of the XML file given as first argument."""
import sys
from lxml import etree

# The with-block closes the file once it is parsed; binary mode leaves the
# decoding to the XML parser.
with open(sys.argv[1], 'rb') as source:
    root = etree.parse(source)

for item in root.xpath('//sentence[@id= "2"]/tokens/token[@id= "5"]/word'):
    print(item.text)
示例#33
0
def scan(path):
    """Add the component files found under *path* to the global ``config``.

    Sub-directories named "Resources" are scanned recursively. For every
    sub-directory listed in ``folders`` the matching container element is
    looked up in ``config``; each file in the directory is parsed and its
    elements of the expected tag are renamed after the file and appended to
    that container. Property stores are collected in a ``Stores`` child of
    the Properties container, which is created on first use.

    :param path: directory to scan.
    :raises Exception: if ``config`` has no container for a directory.
    """
    for entry in dirs(path):
        if entry == "Resources":
            scan(os.path.join(path, entry))
            continue
        if entry not in folders:
            continue

        print("checking %s" % entry)

        if entry == "Properties":
            folder = config.find(".//Properties[@name='%s']" % entry)
        elif entry == "References":
            folder = config.find(".//Folder[@name='Includes']")
        else:
            folder = config.find(".//Folder[@name='%s']" % entry)

        if folder is None:
            raise Exception("Missing <Folder> with name = %s" % entry)

        if entry == "Properties":
            elemName = "PropertyStore"
        elif entry == "References":
            elemName = "Include"
        elif entry == "Schedules":
            elemName = "Scheduler"
        else:
            # Directory names are plurals of the element tag.
            elemName = entry[:-1]

        for fileName in files(path + "/" + entry):
            if 'Java-Properties' in fileName:
                continue

            filepath = os.path.join(os.path.join(path, entry), fileName)
            xml = ET.parse(filepath)

            # ``folder`` itself is never rebound here, so every file of the
            # directory resolves its target from the same container.
            target = folder
            if entry == "Properties":
                candidates = xml.findall(".//PropertyStore")
                target = folder.find("./Stores")
                if target is None:
                    target = ET.SubElement(folder, "Stores")
            elif entry == "References":
                candidates = xml.findall(".//Include")
            else:
                candidates = xml.getroot()

            # The element name is the file name without its last extension,
            # a leading dot and a trailing component-type suffix.
            childName = fileName
            p = childName.rfind(".")
            if p > 0:
                childName = childName[:p]
            if childName.startswith("."):
                childName = childName[1:]
            if childName.endswith(".script"):
                childName = childName[:-7]
            if childName.endswith(".assemblyline"):
                childName = childName[:-13]

            # Iterate over a snapshot so that appending a child elsewhere
            # cannot disturb the iteration.
            for child in list(candidates):
                if child.tag != elemName:
                    continue
                try:
                    child.attrib['name'] = childName
                    print(" ..adding %s" % child.attrib['name'])
                    target.append(child)
                except Exception as ex:
                    print("!! Skipping due to %s - %s" % (ex, child))
 def write_XML(self, data_list):
     """Fill a copy of the answer XML skeleton with the received answers.

     The skeleton "Answer_XML_skeleton.xml" in the "XML_files" folder next
     to this module's parent directory is copied to "XML<id>.xml", where
     <id> is the id of the first item of *data_list*. Attribute values and
     QN elements are filled from the answers, the file is written, its
     character references are unescaped and the final content is returned.

     :param data_list: items exposing ``id``, ``question_number`` and
         ``received_answer``; must not be empty.
     :return: the content of the written XML file.
     """
     #data_list contains id, question_number and received_answer
     id_list = [item.id for item in data_list]

     #a skeleton of answer XML is already created containing tags and attribute names
     #the XML skeleton is copied into another file with name "XML'id_list[0]" in which the answers will be filled
     current_path = os.path.dirname(__file__)
     parent_folder = os.path.abspath(os.path.join(current_path, os.pardir))
     xml_folder = os.path.normpath(os.path.join(parent_folder, "XML_files"))
     path = os.path.join(xml_folder, "XML%s.xml" % id_list[0])
     shutil.copy2(os.path.join(xml_folder, "Answer_XML_skeleton.xml"), path)

     parser = etree.XMLParser(remove_blank_text=True, encoding='iso-8859-1')
     tree = etree.parse(path, parser)
     root = tree.getroot()

     #first filling the value of tag attributes(eg <data id="value">) from the received answer SMS
     for element in root.iter():
         for attr_name in element.attrib:
             for answer in data_list:
                 #to avoid conflict, the attribute "id" of "data" tag is stored as "dataID"
                 if attr_name == "id" and answer.question_number == "dataID":
                     element.attrib[attr_name] = answer.received_answer
                 if attr_name == str(answer.question_number):
                     #an answer of "X" means nothing is stored for this attribute
                     if answer.received_answer == "X":
                         continue
                     text = answer.received_answer
                     if '#' in text:
                         text = text.replace('#', ' ')
                     element.attrib[attr_name] = text

     #placeholder characters of an answer and the text they stand for
     replacements = (('#', ' '), ('$', u"\u00CE"), ('|', u"\u00CC"))

     #filling the data items(eg <QN_470_0>data</QN_470-0>
     for element in root:
         #counter to make sure that data is inserted one after another rather than one before another
         count = 0
         for answer in data_list:
             if not answer.question_number.startswith("QN"):
                 continue
             #an answer of "-" marks a group or matrix tag that carries no data
             if answer.received_answer == "-":
                 continue
             #the question_number becomes the tag, the answer its text
             elem = etree.Element(str(answer.question_number))
             text = answer.received_answer
             #replacements are applied one after another so that an answer
             #containing several placeholders has all of them substituted
             for marker, replacement in replacements:
                 if marker in text:
                     text = text.replace(marker, replacement)
             elem.text = unicode(text)
             #count defines where to insert in the XML file
             element.insert(count, elem)
             count = count + 1

     #finally write the data into the XML file
     tree.write(path, pretty_print=True)

     #replace the character references in the written file by the characters themselves
     with open(path, "r") as fo:
         unescaped = HTMLParser.HTMLParser().unescape(fo.read())

     #the with-block closes (and thereby flushes) the file before it is read back
     with open(path, 'w+') as fo:
         fo.write(unescaped)

     with open(path, "r") as fo:
         return fo.read()
     
                     
示例#35
0
def extract_to_map(speaker_file, speaker_to_speeches):
    """Collect every speech in an XML speaker file into ``speaker_to_speeches``.

    Each child of the document root is read as a record whose first
    sub-element holds the speaker name and whose second holds the speech.

    speaker_file -- path of the XML file; it is made absolute before parsing.
    speaker_to_speeches -- mapping from speaker name to a list of speeches,
        updated in place. A speaker's entry is looked up before appending, so
        the mapping has to supply a list for unseen speakers
        (e.g. ``collections.defaultdict(list)``).

    A missing speech text is stored as the empty string. Returns None.
    """
    document = et.parse(os.path.abspath(speaker_file))
    for record in document.getroot():
        speeches = speaker_to_speeches[record[0].text]
        speech = record[1].text
        if speech is None:
            speech = ''
        speeches.append(speech)
示例#36
0
 def __listManifestItems(contentOPFPath):
     """Return the ``href`` of every ``item`` inside the OPF ``manifest``.

     contentOPFPath -- source handed to ``etree.parse`` (the OPF package file).

     The XPath lookup is made with the ``opf`` prefix bound to the
     ``http://www.idpf.org/2007/opf`` namespace.
     """
     opf_namespaces = {'opf': 'http://www.idpf.org/2007/opf'}
     document = etree.parse(contentOPFPath)
     return document.xpath("//opf:manifest/opf:item/@href",
                           namespaces=opf_namespaces)
示例#37
0
def slaZ( inp, State =State, newline ='\n'):
    """Parse the XML source ``inp`` and obtain its root element.

    inp -- file name or file object handed to ``ElementTree.parse``.
    State, newline -- accepted for the caller's benefit; neither is read in
        this body.

    Nothing is returned: the parsed tree and its root are only bound to
    local names.
    """
    # ``xml.etree`` is just a package; ``parse`` lives in its ``ElementTree``
    # submodule. Importing the bare package made ``et.parse`` raise
    # AttributeError.
    import xml.etree.ElementTree as et
    e = et.parse( inp)
    q = e.getroot()
示例#38
0
 def setUp(self):
     """Load the companies fixture and keep its root element for the tests.

     Sets ``self.parser`` to an XML parser created with
     ``remove_blank_text=True`` and ``self.tree`` to the root element of
     ``tests/instruments/companies.xml`` parsed with that parser.
     """
     blank_stripping_parser = etree.XMLParser(remove_blank_text=True)
     self.parser = blank_stripping_parser
     document = etree.parse('tests/instruments/companies.xml',
                            parser=blank_stripping_parser)
     self.tree = document.getroot()
示例#39
0
    i = "\n" + level * "  "
    if len(elem):
        if not elem.text or not elem.text.strip():
            elem.text = i + "  "
            pass
        for e in elem:
            indent(e, level + 1)
            if not e.tail or not e.tail.strip():
                e.tail = i + "  "
                pass
        if not e.tail or not e.tail.strip():
            e.tail = i
            pass
    else:
        if level and (not elem.tail or not elem.tail.strip()):
            elem.tail = i
            pass
        pass


# Take the XML source from the first command-line argument when one is given,
# otherwise read it from standard input.
src = sys.argv[1] if len(sys.argv) > 1 else sys.stdin

# Parse the document, rewrite its whitespace through indent(), and emit the
# result on standard output encoded as UTF-8.
tree = etree.parse(src)
indent(tree.getroot())
tree.write(sys.stdout, "utf-8")
示例#40
0
def main():
	configurationPath = u'config/config.js'
	formats = False
	delete = None
	harvest = None
	transform = False
	solrXSL = None
	solrURL = None
	try:
		# evaluate command line parameters
		options, arguments = getopt.getopt(sys.argv[1:], 'c:d:fD:h:ts:', ['config=', 'datapath=', 'formats', 'delete=', 'harvest=', 'transform', 'solr='])
		for option, value in options:
			if option in ('-c', '--config'):
				configurationPath = value
			elif option in ('-d', '--datapath'):
				dataPath = value
			elif option in ('-D', '--delete'):
				delete = value
			elif option in ('-f', '--formats'):
				formats = True
			elif option in ('-h', '--harvest'):
				harvest = value
			elif option in ('-t', '--transform'):
				transform = True
			elif option in ('-s', '--solr'):
				solrURL = value
				print value
			else:
				assert False, 'unhandled option'

		# read configuration
		f = open(configurationPath)
		configuration = json.load(f)
		f.close()
		repositories = configuration['servers']
		
		if delete != None:
			deleteFiles(delete)

		# Read transformation XSL if needed.
		if transform != None:
			try:
				solrXSLXML = etree.parse('OAI-to-Solr.xsl')
				solrXSL = etree.XSLT(solrXSLXML)
			except:
				printError('Failed to read XSL for Solr transformation.')

		# loop through repositories and run actions determined by the command line parameters
		for repositoryID in sorted(repositories.iterkeys()):
			repository = repositories[repositoryID]
			repository['ID'] = repositoryID
			
			if not repository.has_key('broken'):
				print ''
				printHeading(u'==== ' + repositoryID + u' ====', repository)
				
				if formats:
					determineFormats(repository)
				
				if harvest != None:
					updateOAI(repository, configuration, harvest.split(','))

				if transform:
					transformXML(repository, solrXSL)
			
				if solrURL != None:
					updateSolr(repository, solrURL)
	
		if solrURL != None:
			print ''
			print u'Committing Solr Index …'
			try:
				solrCommit = urllib2.urlopen(updateURL, "commit=true")
				solrCommit.close()
			except urllib2.URLError as err:
				printError('Failed to commit the Solr index: ' + str(err))
		
	except getopt.GetoptError, err:
		printError('Could not parse the options: ' + str(err))
		sys.exit(2)