Exemplo n.º 1
0
def run_xml2sql():
	"""Command-line entry point: parse the options and run one conversion.

	Reads the options from ``sys.argv``, builds an ``xml2sql`` converter for
	the requested input and output files, runs ``convert()`` and prints the
	record / INSERT counts found in the mapping it returns.
	"""
	# A single-argument print(...) produces the same output whether print is
	# a statement or a function, so the banner text is unchanged.
	print("xml2sql by Kailash Nadh (http://nadh.in)\n\t--help for help\n\n\t")

	# parse arguments
	parser = argparse.ArgumentParser(description='Convert an xml file to sql.')
	# FileType('r') hands over an open, readable file object just like the
	# former ``type=file`` did, but an unreadable path is reported as a normal
	# usage error instead of an uncaught IOError traceback.
	parser.add_argument('--input', type=argparse.FileType('r'), dest='input_file', required=True,
						help='input xml filename')
	parser.add_argument('--output', dest='output_file', required=True, help='output sql filename')
	parser.add_argument('--tag', dest='tag', required=True, help='the record tag. eg: item')
	parser.add_argument('--table', dest='table', required=True, help='table name')
	parser.add_argument('--ignore', dest='ignore', default='', nargs='+', help='list of tags to ignore')
	parser.add_argument('--encoding', dest='encoding', default='utf-8', help='character encoding (default=utf-8)')
	parser.add_argument('--limit', type=int, dest='limit', default=-1, help='maximum number of records to process')
	# The help text used to be a raw string containing a backslash-newline and
	# an escaped quote, so literal backslashes leaked into the --help output.
	parser.add_argument('--packet', type=float, dest='packet', default=8.0,
						help="maximum size of an insert query in MB. "
							 "see MySQL's max_allowed_packet (default=8)")

	args = parser.parse_args()

	converter = xml2sql(args.input_file, args.output_file, args.encoding)
	num = converter.convert(tag=args.tag, table=args.table, ignore=args.ignore, limit=args.limit, packet=args.packet)

	# Same text and spacing as the original comma-separated print statement.
	print("\n\nWrote %s records to %s  (INSERT queries = %s )"
		  % (num['num'], args.output_file, num['num_insert']))
Exemplo n.º 2
0
def run_xml2sql():
    """Command-line entry point: parse the options and run one conversion.

    Reads the options from ``sys.argv``, builds an ``xml2sql`` converter for
    the requested input and output files, runs ``convert()`` and prints the
    record / INSERT counts found in the mapping it returns.
    """
    # A single-argument print(...) produces the same output whether print is
    # a statement or a function, so the banner text is unchanged.
    print("xml2sql by Kailash Nadh (http://nadh.in)\n\t--help for help\n\n\t")

    # parse arguments
    parser = argparse.ArgumentParser(description="Convert an xml file to sql.")
    # FileType("r") hands over an open, readable file object just like the
    # former ``type=file`` did, but an unreadable path is reported as a normal
    # usage error instead of an uncaught IOError traceback.
    parser.add_argument(
        "--input",
        type=argparse.FileType("r"),
        dest="input_file",
        required=True,
        help="input xml filename",
    )
    parser.add_argument("--output", dest="output_file", required=True, help="output sql filename")
    parser.add_argument("--tag", dest="tag", required=True, help="the record tag. eg: item")
    parser.add_argument("--table", dest="table", required=True, help="table name")
    parser.add_argument("--ignore", dest="ignore", default="", nargs="+", help="list of tags to ignore")
    parser.add_argument("--encoding", dest="encoding", default="utf-8", help="character encoding (default=utf-8)")
    parser.add_argument("--limit", type=int, dest="limit", default=-1, help="maximum number of records to process")
    # The help text used to be a raw string containing a backslash-newline and
    # an escaped quote, so literal backslashes leaked into the --help output.
    parser.add_argument(
        "--packet",
        type=float,
        dest="packet",
        default=8.0,
        help="maximum size of an insert query in MB. "
        "see MySQL's max_allowed_packet (default=8)",
    )

    args = parser.parse_args()

    converter = xml2sql(args.input_file, args.output_file, args.encoding)
    num = converter.convert(tag=args.tag, table=args.table, ignore=args.ignore, limit=args.limit, packet=args.packet)

    # Same text and spacing as the original comma-separated print statement.
    print(
        "\n\nWrote %s records to %s  (INSERT queries = %s )"
        % (num["num"], args.output_file, num["num_insert"])
    )
def run_xml2sql():
    """Command-line entry point: parse the options and run one conversion.

    Reads the options from ``sys.argv``, builds an ``xml2sql`` converter for
    the requested input and output files, runs ``convert()`` and prints the
    record / INSERT counts found in the mapping it returns.
    """
    # A single-argument print(...) produces the same output whether print is
    # a statement or a function, so the banner text is unchanged.
    print('xml2sql by Kailash Nadh (http://nadh.in)\n\t--help for help\n\n\t')

    # parse arguments
    parser = argparse.ArgumentParser(description='Convert an xml file to sql.')
    # FileType('r') hands over an open, readable file object just like the
    # former ``type=file`` did, but an unreadable path is reported as a normal
    # usage error instead of an uncaught IOError traceback.
    parser.add_argument('--input',
                        type=argparse.FileType('r'),
                        dest='input_file',
                        required=True,
                        help='input xml filename')
    parser.add_argument('--output',
                        dest='output_file',
                        required=True,
                        help='output sql filename')
    parser.add_argument('--tag',
                        dest='tag',
                        required=True,
                        help='the record tag. eg: item')
    parser.add_argument('--table',
                        dest='table',
                        required=True,
                        help='table name')
    parser.add_argument('--ignore',
                        dest='ignore',
                        default='',
                        nargs='+',
                        help='list of tags to ignore')
    parser.add_argument('--encoding',
                        dest='encoding',
                        default='utf-8',
                        help='character encoding (default=utf-8)')
    parser.add_argument('--limit',
                        type=int,
                        dest='limit',
                        default=-1,
                        help='maximum number of records to process')
    # The help text used to be a raw string containing a backslash-newline and
    # an escaped quote, so literal backslashes leaked into the --help output.
    parser.add_argument('--packet',
                        type=float,
                        dest='packet',
                        default=8.0,
                        help="maximum size of an insert query in MB. "
                             "see MySQL's max_allowed_packet (default=8)")

    args = parser.parse_args()

    converter = xml2sql(args.input_file, args.output_file, args.encoding)
    num = converter.convert(tag=args.tag,
                            table=args.table,
                            ignore=args.ignore,
                            limit=args.limit,
                            packet=args.packet)

    # Same text and spacing as the original comma-separated print statement.
    print("\n\nWrote %s records to %s  (INSERT queries = %s )"
          % (num['num'], args.output_file, num['num_insert']))
#  Result: We get 16446 rows, including some duplicates.
# TIWIS: Per the line below, I am running new tests in draftone-altxml.py to see if I can just make a nice manual importer of some sort.
#NEXT TEST is to see whether the FederalSiteIdentifier ATTRIBUTE is a true "site" uid. This will require a different XML module.
#  Result: _______ 
#Next test is to keep cycling in fields to see whether solely non-numeric fields are ignored. 
#(I have tried fields up through and including Classification_Name so far.)
#(If we can even get a basic numeric site identifier field, we may be able to scrape other data manually from the XML using other libraries.)
# Build the space-separated string of tags to pass as ``ignore``: the four
# tags listed here are always ignored, plus every tag from fulltaglist that
# does not appear in tagstokeep.
ignoredtags = [
	"ReportingOrganization",
	"PlannedCompletionDateStep7",
	"PlannedCompletionDateStep8",
	"PlannedCompletionDateStep9",
]
for tag in fulltaglist:
	# Plain membership test instead of list.index() plus a caught ValueError.
	if tag in tagstokeep:
		print("using " + tag)
	else:
		print("discarding " + tag)
		ignoredtags.append(tag)
# Joining once yields the same string as the former repeated "tag + ' '"
# concatenation followed by strip().
ignorestring = " ".join(ignoredtags).strip()
# Run the converter on fcsi-rscf.xml with "Site" as the record tag, targeting
# the "sites" table; the output file is sites.sql.
siteconverter = xml2sql("fcsi-rscf.xml", "sites.sql")
siteconverter.convert(table="sites", tag="Site", ignore=ignorestring)
#In converting sites to a .sql dump, we'll want to ignore most tags, since they create all sorts of problems.
#I'll then need to break up that dump into individual transactions.
#(30,000+ inserts in a single transaction is a bad idea in SQLite. The "lite" is there for a reason.)



#Now let's create a very crude sqlite database (mainly using TEXT types for now) and import the smaller table. 
#conn = sqlite3.connect('crude.db')
#c = conn.cursor()
#c.execute('''CREATE TABLE IF NOT EXISTS organizations 
#		(Code TEXT, EN TEXT, FR TEXT)''')
#c.execute('DELETE FROM organizations')