def convert_files2():
    # Two-pass conversion of an XML dump of <row> elements to TSV:
    # pass 1 collects the attribute names (columns), pass 2 writes one tab-separated line per row.
    #files = glob.glob("/lfs/local/0/mraison/*")
    files = ["/lfs/local/0/mraison/a4_posthistory.xml"]
    #files = ["/lfs/local/0/mraison/a1_posts.xml"]
    for fname in files:
        bname, ext = os.path.splitext(fname)
        if ext == ".xml":
            print "First pass for " + fname + "..."
            numcol = 0
            coldict = {}
            columns = []
            source = iter(ET.iterparse(fname, events=('start', 'end')))
            while 1:
                try:
                    event, elem = source.next()
                    if event == 'end' and elem.tag == 'row':
                        for name, _ in elem.attrib.items():
                            name = name.encode('unicode-escape')
                            if not name in columns:
                                coldict[name] = numcol
                                columns.append(name)
                                numcol += 1
                except xml.parsers.expat.ExpatError:
                    print "one warning generated"
                except StopIteration:
                    break
            print "Second pass for " + fname + "..."
            with open(bname + ".tsv", "wb") as dest:
                string = '\t'.join(columns) + '\n'
                dest.write(string)
                source = iter(ET.iterparse(fname, events=('start', 'end')))
                while 1:
                    try:
                        event, elem = source.next()
                        if event == 'end' and elem.tag == 'row':
                            row = [""] * numcol
                            for name, val in elem.attrib.items():
                                if '\t' in val:
                                    print "Found guilty string"
                                row[coldict[name]] = val.replace("\t", "").encode('unicode-escape')
                            string = '\t'.join(row) + '\n'
                            dest.write(string)
                    except xml.parsers.expat.ExpatError:
                        print "one warning generated"
                    except StopIteration:
                        break
def convert_file(fname, size):
    numrows = size * 1000
    bname, ext = os.path.splitext(fname)
    print "First pass for " + fname + " with size " + str(size) + "..."
    numcol = 0
    coldict = {}
    columns = []
    source = iter(et.iterparse(fname, events=('start', 'end')))
    counter = 0
    while counter < numrows:
        try:
            event, elem = source.next()
            if event == 'end' and elem.tag == 'row':
                for name, _ in elem.attrib.items():
                    name = name.encode('ascii', 'ignore')
                    if not name in columns:
                        coldict[name] = numcol
                        columns.append(name)
                        numcol += 1
                counter += 1
        except xml.parsers.expat.ExpatError:
            print "one warning generated"
        except StopIteration:
            break
    print "Second pass for " + fname + " with size " + str(size) + "..."
    with open(bname + "_" + str(size) + ".tsv", "wb") as dest:
        string = '\t'.join(columns) + '\n'
        dest.write(string)
        source = iter(et.iterparse(fname, events=('start', 'end')))
        counter = 0
        while counter < numrows:
            try:
                event, elem = source.next()
                if event == 'end' and elem.tag == 'row':
                    row = [""] * numcol
                    for name, val in elem.attrib.items():
                        if '\t' in val:
                            print "Found guilty string"
                        row[coldict[name]] = val.replace("\t", "").encode('unicode-escape')
                    string = '\t'.join(row) + '\n'
                    dest.write(string)
                    counter += 1
            except xml.parsers.expat.ExpatError:
                print "one warning generated"
            except StopIteration:
                break
def __init__(self, file, output):
    self.file = file
    self.output = output
    self.context = iter(ET.iterparse(file))
    self.database = Redis()
    self.par_re = re.compile(ur'\[\[([\w, \(\)"]+)\]\]')  # For parsing for article links
    self.redirect_re = re.compile(ur'#REDIRECT', re.UNICODE)
def load(self, filename):
    parser = ET.iterparse(filename)
    for action, elem in parser:
        unmarshal = self._unmarshallers.get(elem.tag)
        if unmarshal:
            data = unmarshal(elem)
            elem.clear()
            elem.text = data
        elif elem.tag != "plist":
            raise IOError("unknown plist type: %r" % elem.tag)
    return parser.root[0].text
def xml2json_file(input, output=None, pretty=True, encoding='utf-8'):
    context = et.iterparse(input, events=("start", "end"))
    context = iter(context)
    event, root = context.next()
    json = xml2json(root, pretty)
    # if an output filename is given, write to it, otherwise, return json
    if output != None:
        output = codecs.open(output, "w", encoding)
        output.write(json)
    else:
        return json
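# Hedged usage sketch for xml2json_file above (the file names are illustrative
# assumptions; xml2json, codecs, and et must be importable where it is defined):
#
#   json_string = xml2json_file("example.xml")               # returns the JSON string
#   xml2json_file("example.xml", output="example.json")      # or writes it to disk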
def get_facts(article_file):
    facts = []
    source = open(article_file)
    context = ElementTree.iterparse(source, events=("start", "end"))
    context = iter(context)
    event, root = context.next()
    idx = 0
    for event, elem in context:
        if event == "end" and elem.tag == "Name":
            if "val" in elem.attrib:
                facts.append(elem.attrib["val"])
            root.clear()
            if idx % 10 == 0:
                print idx
            idx += 1
    return facts
def get_articles(category, in_dir):
    articles = []
    categories = set()
    idx = 0
    files = os.listdir(in_dir)
    for idx_f, file_name in enumerate(files):
        print idx_f + 1, "/", len(files)
        source = open(in_dir + '/' + file_name)
        context = ElementTree.iterparse(source, events=("start", "end"))
        context = iter(context)
        event, root = context.next()
        for event, elem in context:
            tag = elem.tag.split('}')[1]
            if event == "end" and tag == "text":
                if not elem.text:
                    continue
                start = elem.text.find("{{")
                end = elem.text.find("}}")
                if start != -1 and end != -1:
                    parts = elem.text[start + 2:end].lower().split('|')
                    text_c = parts[0].encode('utf-8')
                    if text_c.startswith(category):
                        articles.append(elem.text.encode('utf-8'))
                        categories.add(text_c)
                root.clear()
                if idx % 1000 == 0:
                    print idx
                idx += 1
    f = open("categories", "w")
    for c in categories:
        f.write(c + "\n")
    f.close()
    return articles
def load_xml(self, filename):
    """
    Load data from XML file.

    Erases all existing data in the table
    """
    self.initvars()
    source = iter(ET.iterparse(filename, events=('start', 'end')))
    self.name = source.next()[1].tag
    for event, elem in source:
        if event == 'end' and elem.tag == 'row':
            row = [None] * self.numcols()
            for name, val in elem.attrib.items():
                try:
                    idx = self.getColIndex(name)
                except ColumnNotFoundError:
                    idx = len(self.cols)
                    row.append(None)
                    # Add new column to the table
                    self.cols.append(set([name]))
                    for oldrow in self.data:
                        oldrow.append(None)
                row[idx] = val
            self.data.append(row)
    self.initTypes()
def convert_file_hash(fname, size):
    # Two-pass conversion to a "hashed" TSV: pass 1 infers a type (int, float,
    # or unicode) for each attribute column, pass 2 writes ints/floats as-is
    # and replaces unicode values with a 16-bit CRC32 hash.
    numrows = size * 1000
    bname, ext = os.path.splitext(fname)
    print "First pass for " + fname + " with size " + str(size) + "..."
    numcol = 0
    coldict = {}
    columns = []
    coltypes = []
    source = iter(et.iterparse(fname, events=('start', 'end')))
    counter = 0
    while counter < numrows:
        try:
            event, elem = source.next()
            if event == 'end' and elem.tag == 'row':
                for name, val in elem.attrib.items():
                    if not name in columns:
                        coldict[name] = numcol
                        columns.append(name)
                        coltypes.append(types.IntType)
                        numcol += 1
                    idx = coldict[name]
                    if val != "":
                        if coltypes[idx] == types.IntType:
                            try:
                                int(val)
                            except ValueError:
                                coltypes[idx] = types.FloatType
                        if coltypes[idx] == types.FloatType:
                            try:
                                float(val)
                            except ValueError:
                                coltypes[idx] = types.UnicodeType
                counter += 1
        except xml.parsers.expat.ExpatError:
            print "one warning generated"
        except StopIteration:
            break
    pdb.set_trace()
    print "Second pass for " + fname + " with size " + str(size) + "..."
    with open(bname + "_" + str(size) + ".hashed.tsv", "wb") as dest:
        string = '\t'.join(columns) + '\n'
        dest.write(string)
        source = iter(et.iterparse(fname, events=('start', 'end')))
        counter = 0
        while counter < numrows:
            try:
                event, elem = source.next()
                if event == 'end' and elem.tag == 'row':
                    row = [-1] * numcol
                    for name, val in elem.attrib.items():
                        idx = coldict[name]
                        if coltypes[idx] == types.IntType:
                            try:
                                row[idx] = int(val)
                            except ValueError:
                                row[idx] = 0
                        elif coltypes[idx] == types.FloatType:
                            try:
                                row[idx] = float(val)
                            except:
                                row[idx] = 0
                        else:
                            # UnicodeType
                            val = val.encode('unicode-escape')
                            row[idx] = binascii.crc32(val) & 65535
                    string = '\t'.join(map(str, row)) + '\n'
                    dest.write(string)
                    counter += 1
            except xml.parsers.expat.ExpatError:
                print "one warning generated"
            except StopIteration:
                break
parser.add_argument('--input', type=file, dest='input_file', required=True,
                    help='input xml filename')
parser.add_argument('--output', dest='output_file', required=True,
                    help='output sql filename')
parser.add_argument('--tag', dest='tag', required=True,
                    help='the record tag. eg: item')
parser.add_argument('--table', dest='table', required=True,
                    help='table name')
parser.add_argument('--ignore', dest='ignore', default='', nargs='+',
                    help='list of tags to ignore')
parser.add_argument('--encoding', dest='encoding', default='utf-8',
                    help='character encoding (default=utf-8)')
parser.add_argument('--limit', type=int, dest='limit', default=-1,
                    help='maximum number of records to process')
parser.add_argument('--packet', type=float, dest='packet', default='8',
                    help='maximum size of an insert query in MB. see MySQL\'s max_allowed_packet (default=8)')

args = parser.parse_args()

# output file handle
output = codecs.open(args.output_file, "w", encoding=args.encoding)

# open the xml file for iteration
context = et.iterparse(args.input_file, events=("start", "end"))
context = iter(context)

# get to the root
event, root = context.next()

max_packet = 1048576 * args.packet

items = []
tags = []
output_buffer = []

tagged = False
started = False
sql_len = 0
sql_insert = None
num_insert = 0
n = 0
packet_size = 0
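# Hypothetical invocation of the converter this argument parser belongs to
# (the script name and file names below are illustrative assumptions, not
# taken from the snippet itself):
#
#   python xml2sql.py --input dump.xml --output dump.sql --tag item --table items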
#!/usr/bin/python
from elementtree import ElementTree
import string

config = []
data = open('speedtest.xml')
eliterator = ElementTree.iterparse(data, events=('start', 'end'))
counter = 0
commits = 0
for event, element in eliterator:
    ns_tag = None
    tag = element.tag
    if tag[0] == '{':
        # cut ns
        ns_tag = tag
        tag = tag.split('}')[1]
    # print "=== LOAD", event, tag, ns_tag
    if event == 'end':
        country = element.attrib.get('countrycode')
        city = element.attrib.get('name')
        isp = element.attrib.get('sponsor')
        url = element.attrib.get('url')
        counter += 1
        if (country == 'RU') or \
           (country == 'DE') or \
           (country == 'BG') or \
           (country == 'PL') or \
           (country == 'FR') or \
    # (tail of the preceding save-to-database function; its definition is
    # truncated in this excerpt)
    print cur.lastrowid
    conn.commit()
    to_insert['article_id'] = cur.lastrowid
    save_author_data(to_insert)


def save_author_data(to_insert):
    # print to_insert
    try:
        if 'author' in to_insert:
            cur.execute("INSERT INTO authors(name, article_id) VALUES (%s, %s)",
                        (to_insert['name'], to_insert['article_id']))
            print cur.lastrowid
            conn.commit()
    except pymysql.MySQLError:
        print "error, but continuing"


to_insert = initialize_row()
for event, elem in ET.iterparse('dblp.xml'):
    if elem.tag in delimiter_keys:
        save_data(to_insert)
        to_insert = initialize_row()
    if elem.tag in keys:
        # print elem.tag, elem.text
        to_insert[elem.tag] = elem.text
    if elem.tag in author_keys:
        # print elem.tag, elem.text
        to_insert[elem.tag] = elem.text
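# Minimal, self-contained sketch (an illustration under stated assumptions, not
# taken from any snippet above) of the incremental-parsing pattern these scripts
# rely on: iterate with iterparse, grab the root at the first event, and clear it
# after each processed record so memory stays bounded on large files. The path
# "large.xml" and the 'row' tag are placeholders.
import xml.etree.cElementTree as cET


def iter_rows(path, tag='row'):
    context = iter(cET.iterparse(path, events=('start', 'end')))
    event, root = context.next()  # first event is the start of the root element
    for event, elem in context:
        if event == 'end' and elem.tag == tag:
            yield dict(elem.attrib)  # hand back a plain dict of the row's attributes
            root.clear()  # drop already-processed children to keep memory flat


if __name__ == '__main__':
    for row in iter_rows('large.xml'):
        print row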