示例#1
0
def convert_files2(files=None):
    """Convert row-oriented XML dumps into tab-separated ``.tsv`` files.

    Every ``.xml`` path in *files* is parsed twice.  The first pass collects
    the attribute names found on ``<row>`` elements, in order of first
    appearance.  The second pass writes ``<basename>.tsv`` next to the input:
    a header line followed by one line per ``<row>``.  Attributes missing
    from a row become empty fields.  Names and values are written with the
    ``unicode-escape`` codec, and tab characters are removed from values so
    they cannot break the column layout.

    Args:
        files: Iterable of paths to convert.  Paths whose extension is not
            ``.xml`` are skipped.  Defaults to the single dump path that was
            previously hard-coded in this function.
    """
    if files is None:
        files = ["/lfs/local/0/mraison/a4_posthistory.xml"]

    def iter_rows(path):
        # Yield each completed <row> element of *path*, one at a time.
        source = iter(ET.iterparse(path, events=('end',)))
        while True:
            try:
                _, elem = next(source)
            except xml.parsers.expat.ExpatError:
                print("one warning generated")
                continue
            except StopIteration:
                return
            if elem.tag == 'row':
                yield elem
                # The consumer is done with this row: drop its contents so
                # the in-memory tree does not grow with the size of the dump.
                elem.clear()

    for fname in files:
        bname, ext = os.path.splitext(fname)
        if ext != ".xml":
            continue

        print("First pass for " + fname + "...")
        # Both structures are keyed by the *raw* attribute name.  Keying by
        # the escaped name (as before) made the second-pass lookup fail with
        # KeyError for any non-ASCII attribute name.
        coldict = {}
        columns = []
        for elem in iter_rows(fname):
            for name in elem.attrib:
                if name not in coldict:
                    coldict[name] = len(columns)
                    columns.append(name)

        print("Second pass for " + fname + "...")
        numcol = len(columns)
        # The file is opened in binary mode, so everything written is bytes.
        with open(bname + ".tsv", "wb") as dest:
            header = [name.encode('unicode-escape') for name in columns]
            dest.write(b'\t'.join(header) + b'\n')
            for elem in iter_rows(fname):
                row = [b""] * numcol
                for name, val in elem.attrib.items():
                    if '\t' in val:
                        print("Found guilty string")
                    row[coldict[name]] = val.replace(
                        "\t", "").encode('unicode-escape')
                dest.write(b'\t'.join(row) + b'\n')
示例#2
0
def convert_file(fname, size):
    """Convert the first ``size * 1000`` rows of an XML dump to a TSV file.

    The input is parsed twice.  The first pass collects the attribute names
    found on ``<row>`` elements, in order of first appearance.  The second
    pass writes ``<basename>_<size>.tsv`` next to the input: a header line
    followed by one line per ``<row>``.  Attributes missing from a row
    become empty fields.

    Header names are reduced to ASCII (other characters are dropped).
    Values are written with the ``unicode-escape`` codec after removing tab
    characters, so they cannot break the column layout.

    Args:
        fname: Path of the XML file to read.
        size: Row budget in thousands; at most ``size * 1000`` ``<row>``
            elements are read in each pass.
    """
    numrows = size * 1000
    bname = os.path.splitext(fname)[0]

    def iter_rows():
        # Yield at most `numrows` completed <row> elements of `fname`.
        # Only rows count towards the limit.  Counting every parser event
        # (as before) cut the output far short of the requested row count.
        source = iter(et.iterparse(fname, events=('end',)))
        seen = 0
        while seen < numrows:
            try:
                _, elem = next(source)
            except xml.parsers.expat.ExpatError:
                print("one warning generated")
                continue
            except StopIteration:
                return
            if elem.tag == 'row':
                seen += 1
                yield elem
                # Release the row once consumed to keep memory bounded.
                elem.clear()

    print("First pass for " + fname + " with size " + str(size) + "...")
    # Keyed by the raw attribute name so the second-pass lookup always
    # succeeds, even when the name contains non-ASCII characters.
    coldict = {}
    columns = []
    for elem in iter_rows():
        for name in elem.attrib:
            if name not in coldict:
                coldict[name] = len(columns)
                columns.append(name)

    print("Second pass for " + fname + " with size " + str(size) + "...")
    numcol = len(columns)
    # The file is opened in binary mode, so everything written is bytes.
    with open(bname + "_" + str(size) + ".tsv", "wb") as dest:
        header = [name.encode('ascii', 'ignore') for name in columns]
        dest.write(b'\t'.join(header) + b'\n')
        for elem in iter_rows():
            row = [b""] * numcol
            for name, val in elem.attrib.items():
                if '\t' in val:
                    print("Found guilty string")
                row[coldict[name]] = val.replace(
                    "\t", "").encode('unicode-escape')
            dest.write(b'\t'.join(row) + b'\n')
示例#3
0
文件: test.py 项目: Peratham/ringo
def convert_files2(files=None):
  """Convert row-oriented XML dumps into tab-separated ``.tsv`` files.

  Every ``.xml`` path in *files* is parsed twice.  The first pass collects
  the attribute names found on ``<row>`` elements, in order of first
  appearance.  The second pass writes ``<basename>.tsv`` next to the input:
  a header line followed by one line per ``<row>``.  Attributes missing from
  a row become empty fields.  Names and values are written with the
  ``unicode-escape`` codec, and tab characters are removed from values so
  they cannot break the column layout.

  Args:
    files: Iterable of paths to convert.  Paths whose extension is not
      ``.xml`` are skipped.  Defaults to the single dump path that was
      previously hard-coded in this function.
  """
  if files is None:
    files = ["/lfs/local/0/mraison/a4_posthistory.xml"]

  def iter_rows(path):
    # Yield each completed <row> element of *path*, one at a time.
    source = iter(ET.iterparse(path, events = ('end',)))
    while True:
      try:
        _, elem = next(source)
      except xml.parsers.expat.ExpatError:
        print("one warning generated")
        continue
      except StopIteration:
        return
      if elem.tag == 'row':
        yield elem
        # The consumer is done with this row: drop its contents so the
        # in-memory tree does not grow with the size of the dump.
        elem.clear()

  for fname in files:
    bname, ext = os.path.splitext(fname)
    if ext != ".xml":
      continue

    print("First pass for " + fname + "...")
    # Both structures are keyed by the *raw* attribute name.  Keying by the
    # escaped name (as before) made the second-pass lookup fail with
    # KeyError for any non-ASCII attribute name.
    coldict = {}
    columns = []
    for elem in iter_rows(fname):
      for name in elem.attrib:
        if name not in coldict:
          coldict[name] = len(columns)
          columns.append(name)

    print("Second pass for " + fname + "...")
    numcol = len(columns)
    # The file is opened in binary mode, so everything written is bytes.
    with open(bname + ".tsv","wb") as dest:
      header = [name.encode('unicode-escape') for name in columns]
      dest.write(b'\t'.join(header) + b'\n')
      for elem in iter_rows(fname):
        row = [b""]*numcol
        for name,val in elem.attrib.items():
          if '\t' in val:
            print("Found guilty string")
          row[coldict[name]] = val.replace("\t","").encode('unicode-escape')
        dest.write(b'\t'.join(row) + b'\n')
示例#4
0
 def __init__(self, file, output):
     # Keep the caller-supplied input and output targets exactly as given.
     self.file = file
     self.output = output
     # Incremental XML parser over `file`.  No `events` argument is passed,
     # so iterparse reports only "end" events.
     self.context = iter(ET.iterparse(file))
     # NOTE(review): Redis() is called without arguments -- presumably the
     # redis-py client with its default connection settings; confirm
     # against the import at the top of the file.
     self.database = Redis()
     # Matches "[[...]]" whose contents consist of word characters, commas,
     # spaces, parentheses or double quotes; group 1 is the text between
     # the brackets.
     self.par_re = re.compile(
         ur'\[\[([\w, \(\)"]+)\]\]')  #For parsing for article links
     # Matches the literal marker "#REDIRECT".
     self.redirect_re = re.compile(ur'#REDIRECT', re.UNICODE)
     '''
示例#5
0
    def __init__(self, file, output):
        # Keep the caller-supplied input and output targets exactly as given.
        self.file = file
        self.output = output
        # Incremental XML parser over `file`.  No `events` argument is
        # passed, so iterparse reports only "end" events.
        self.context = iter(ET.iterparse(file))
        # NOTE(review): Redis() is called without arguments -- presumably the
        # redis-py client with its default connection settings; confirm
        # against the import at the top of the file.
        self.database = Redis()
        # Matches "[[...]]" whose contents consist of word characters,
        # commas, spaces, parentheses or double quotes; group 1 is the text
        # between the brackets.
        self.par_re = re.compile(ur'\[\[([\w, \(\)"]+)\]\]')  # For parsing for article links
        # Matches the literal marker "#REDIRECT".
        self.redirect_re = re.compile(ur"#REDIRECT", re.UNICODE)

        """
 def load(self, filename):
     """Parse the plist XML file *filename* and return its top-level value.

     Each element is decoded as soon as its end tag is seen, using the
     callable registered for its tag in ``self._unmarshallers``.  The
     decoded value replaces the element's contents and is stored in its
     ``text`` attribute.  The value of the first child of the document
     root is returned.

     Raises:
         IOError: if an element other than ``plist`` has no unmarshaller.
     """
     events = ET.iterparse(filename)
     for _, node in events:
         decode = self._unmarshallers.get(node.tag)
         if not decode:
             # Only the <plist> wrapper may go without an unmarshaller.
             if node.tag != "plist":
                 raise IOError("unknown plist type: %r" % node.tag)
             continue
         value = decode(node)
         # Drop the raw children/text and keep the decoded value instead.
         node.clear()
         node.text = value
     return events.root[0].text
示例#7
0
def xml2json_file(input, output = None, pretty = True, encoding='utf-8'):
	"""Convert the XML document *input* to JSON text.

	Args:
		input: File name or file object accepted by ``iterparse``.
		output: Optional path.  When given, the JSON text is written there
			using *encoding* and nothing is returned.
		pretty: Passed through unchanged to ``xml2json``.
		encoding: Text encoding of the output file (ignored without *output*).

	Returns:
		The JSON text when *output* is None, otherwise None.
	"""
	context = iter(et.iterparse(input, events=("start", "end")))
	# The first event is the "start" of the document element.
	event, root = next(context)
	# At a "start" event the element's children are not guaranteed to be
	# parsed yet.  Consume the remaining events so the tree under `root`
	# is complete before it is converted.
	for event, elem in context:
		pass

	json = xml2json(root, pretty)

	# if an output filename is given, write to it, otherwise, return json
	if output is None:
		return json
	# The with-block closes (and thereby flushes) the output file.
	with codecs.open(output, "w", encoding) as dest:
		dest.write(json)
示例#8
0
def get_facts(article_file):
    """Return the ``val`` attribute of every ``<Name>`` element in a file.

    The file is parsed incrementally.  After each ``<Name>`` element the
    tree collected under the document root is cleared to keep memory use
    flat.  A progress counter is printed for every tenth ``<Name>`` seen.

    Args:
        article_file: Path of the XML file to read.

    Returns:
        List of ``val`` attribute values, in document order.  ``<Name>``
        elements without a ``val`` attribute are skipped.
    """
    facts = []
    # Binary mode leaves decoding to the XML parser.  The with-block makes
    # sure the handle is closed, which the previous code never did.
    with open(article_file, "rb") as source:
        context = iter(ElementTree.iterparse(source, events=("start", "end")))
        # The first event is the "start" of the document root.
        event, root = next(context)
        idx = 0
        for event, elem in context:
            if event == "end" and elem.tag == "Name":
                if "val" in elem.attrib:
                    facts.append(elem.attrib["val"])
                root.clear()
                if idx % 10 == 0:
                    print(idx)
                idx += 1
    return facts
示例#9
0
def get_facts(article_file):
	"""Return the ``val`` attribute of every ``<Name>`` element in a file.

	The file is parsed incrementally.  After each ``<Name>`` element the
	tree collected under the document root is cleared to keep memory use
	flat.  A progress counter is printed for every tenth ``<Name>`` seen.

	Args:
		article_file: Path of the XML file to read.

	Returns:
		List of ``val`` attribute values, in document order.  ``<Name>``
		elements without a ``val`` attribute are skipped.
	"""
	facts = []
	# Binary mode leaves decoding to the XML parser.  The with-block makes
	# sure the handle is closed, which the previous code never did.
	with open(article_file, "rb") as source:
		context = iter(ElementTree.iterparse(source, events=("start", "end")))
		# The first event is the "start" of the document root.
		event, root = next(context)
		idx = 0
		for event, elem in context:
			if event == "end" and elem.tag == "Name":
				if "val" in elem.attrib:
					facts.append(elem.attrib["val"])
				root.clear()
				if idx % 10 == 0:
					print(idx)
				idx += 1
	return facts
示例#10
0
def get_articles(category, in_dir):
    """Collect article texts whose first ``{{...}}`` template matches.

    Every file in *in_dir* is parsed incrementally as XML.  For each
    non-empty ``text`` element, the name of the first ``{{...}}`` template
    (the part before the first ``|``, lower-cased and UTF-8 encoded) is
    recorded.  The element's text is kept when that name starts with
    *category*.  All template names seen are written, one per line, to a
    file called ``categories`` in the current working directory.

    Args:
        category: Prefix compared against the UTF-8 encoded template name,
            so it should be a byte string (a plain ``str`` on Python 2).
        in_dir: Directory whose files are all read as XML.

    Returns:
        List of matching article texts, UTF-8 encoded.
    """
    articles = []
    categories = set()
    idx = 0
    files = os.listdir(in_dir)
    for idx_f, file_name in enumerate(files):
        print("%d / %d" % (idx_f + 1, len(files)))
        # Binary mode leaves decoding to the XML parser.  The with-block
        # closes each input; previously one handle leaked per file.
        with open(os.path.join(in_dir, file_name), "rb") as source:
            context = iter(
                ElementTree.iterparse(source, events=("start", "end")))
            # The first event is the "start" of the document root.
            event, root = next(context)

            for event, elem in context:
                # Strip a "{namespace}" prefix when there is one.  A tag
                # without a namespace is used as is (indexing [1] on the
                # split result raised IndexError for such tags).
                tag = elem.tag.rsplit('}', 1)[-1]
                if event != "end" or tag != "text":
                    continue
                text = elem.text
                if not text:
                    continue

                start = text.find("{{")
                # Look for the closing braces only after the opening ones,
                # so a stray "}}" earlier in the text is not mistaken for
                # the end of the template.
                end = text.find("}}", start + 2) if start != -1 else -1
                if start != -1 and end != -1:
                    parts = text[start + 2:end].lower().split('|')
                    text_c = parts[0].encode('utf-8')

                    if text_c.startswith(category):
                        articles.append(text.encode('utf-8'))
                    categories.add(text_c)

                # Free the tree parsed so far to keep memory use flat.
                root.clear()
                if idx % 1000 == 0:
                    print(idx)
                idx += 1

    with open("categories", "w") as f:
        for c in categories:
            f.write(c + "\n")
    return articles
示例#11
0
def get_articles(category, in_dir):
    """Collect article texts whose first ``{{...}}`` template matches.

    Every file in *in_dir* is parsed incrementally as XML.  For each
    non-empty ``text`` element, the name of the first ``{{...}}`` template
    (the part before the first ``|``, lower-cased and UTF-8 encoded) is
    recorded.  The element's text is kept when that name starts with
    *category*.  All template names seen are written, one per line, to a
    file called ``categories`` in the current working directory.

    Args:
        category: Prefix compared against the UTF-8 encoded template name,
            so it should be a byte string (a plain ``str`` on Python 2).
        in_dir: Directory whose files are all read as XML.

    Returns:
        List of matching article texts, UTF-8 encoded.
    """
    articles = []
    categories = set()
    idx = 0
    files = os.listdir(in_dir)
    for idx_f, file_name in enumerate(files):
        print("%d / %d" % (idx_f + 1, len(files)))
        # Binary mode leaves decoding to the XML parser.  The with-block
        # closes each input; previously one handle leaked per file.
        with open(os.path.join(in_dir, file_name), "rb") as source:
            context = iter(
                ElementTree.iterparse(source, events=("start", "end")))
            # The first event is the "start" of the document root.
            event, root = next(context)

            for event, elem in context:
                # Strip a "{namespace}" prefix when there is one.  A tag
                # without a namespace is used as is (indexing [1] on the
                # split result raised IndexError for such tags).
                tag = elem.tag.rsplit('}', 1)[-1]
                if event != "end" or tag != "text":
                    continue
                text = elem.text
                if not text:
                    continue

                start = text.find("{{")
                # Look for the closing braces only after the opening ones,
                # so a stray "}}" earlier in the text is not mistaken for
                # the end of the template.
                end = text.find("}}", start + 2) if start != -1 else -1
                if start != -1 and end != -1:
                    parts = text[start + 2:end].lower().split('|')
                    text_c = parts[0].encode('utf-8')

                    if text_c.startswith(category):
                        articles.append(text.encode('utf-8'))
                    categories.add(text_c)

                # Free the tree parsed so far to keep memory use flat.
                root.clear()
                if idx % 1000 == 0:
                    print(idx)
                idx += 1

    with open("categories", "w") as f:
        for c in categories:
            f.write(c + "\n")
    return articles
示例#12
0
 def load_xml(self,filename):
   """
   Replace the table's contents with the rows of an XML file.

   The table is reset first and named after the document's root tag.
   Each <row> element becomes one data row built from its attributes.
   An attribute that matches no existing column adds a new column, and
   every row already loaded is padded with None for it.
   """
   self.initvars()
   events = iter(ET.iterparse(filename, events = ('start','end')))
   # The first event is the "start" of the root element.
   _, root = next(events)
   self.name = root.tag
   for kind, node in events:
     if kind != 'end' or node.tag != 'row':
       continue
     record = [None] * self.numcols()
     for attr, value in node.attrib.items():
       try:
         pos = self.getColIndex(attr)
       except ColumnNotFoundError:
         # Unknown attribute: open a new column at the end of the table
         # and pad the rows that were stored before it existed.
         pos = len(self.cols)
         self.cols.append(set([attr]))
         for earlier in self.data:
           earlier.append(None)
         record.append(None)
       record[pos] = value
     self.data.append(record)
   self.initTypes()
示例#13
0
def convert_file_hash(fname, size):
    """Convert the first ``size * 1000`` rows of an XML dump to numeric TSV.

    The input is parsed twice.  The first pass collects the attribute names
    found on ``<row>`` elements and infers a type per column from the
    non-empty values.  A column starts as integer, is demoted to float when
    a value does not parse as an integer, and to text when a value does not
    parse as a float.

    The second pass writes ``<basename>_<size>.hashed.tsv``: a header line
    followed by one line per row.  In each row:

    * integer/float columns hold the parsed value, or 0 when it is empty
      or unparsable;
    * text columns hold the low 16 bits of the CRC-32 of the
      ``unicode-escape`` encoded value;
    * attributes missing from the row are written as -1.

    Args:
        fname: Path of the XML file to read.
        size: Row budget in thousands; at most ``size * 1000`` ``<row>``
            elements are read in each pass.
    """
    numrows = size * 1000
    bname = os.path.splitext(fname)[0]

    def iter_rows():
        # Yield at most `numrows` completed <row> elements of `fname`.
        source = iter(et.iterparse(fname, events=('end',)))
        seen = 0
        while seen < numrows:
            try:
                _, elem = next(source)
            except xml.parsers.expat.ExpatError:
                print("one warning generated")
                continue
            except StopIteration:
                return
            if elem.tag == 'row':
                seen += 1
                yield elem
                # Release the row once consumed to keep memory bounded.
                elem.clear()

    print("First pass for " + fname + " with size " + str(size) + "...")
    coldict = {}    # attribute name -> column index
    columns = []    # attribute names in column order
    # Per column: the converter to apply (int or float), or None for text.
    coltypes = []
    for elem in iter_rows():
        for name, val in elem.attrib.items():
            if name not in coldict:
                coldict[name] = len(columns)
                columns.append(name)
                coltypes.append(int)
            idx = coldict[name]
            if val != "":
                if coltypes[idx] is int:
                    try:
                        int(val)
                    except ValueError:
                        coltypes[idx] = float
                # Deliberately not `elif`: a column just demoted to float
                # is checked against the same value straight away.
                if coltypes[idx] is float:
                    try:
                        float(val)
                    except ValueError:
                        coltypes[idx] = None

    # A leftover pdb.set_trace() used to halt every run at this point.
    print("Second pass for " + fname + " with size " + str(size) + "...")
    numcol = len(columns)
    # The file is opened in binary mode, so everything written is bytes.
    with open(bname + "_" + str(size) + ".hashed.tsv", "wb") as dest:
        header = [name.encode('unicode-escape') for name in columns]
        dest.write(b'\t'.join(header) + b'\n')
        for elem in iter_rows():
            row = [-1] * numcol
            for name, val in elem.attrib.items():
                idx = coldict[name]
                convert = coltypes[idx]
                if convert is None:
                    # Text column: store a 16-bit hash instead of the text.
                    val = val.encode('unicode-escape')
                    row[idx] = binascii.crc32(val) & 65535
                else:
                    try:
                        row[idx] = convert(val)
                    except ValueError:
                        row[idx] = 0
            line = '\t'.join(map(str, row)) + '\n'
            dest.write(line.encode('ascii'))
示例#14
0
# Command-line options.  `parser` is created above this excerpt.
# NOTE: type=file uses the Python 2 built-in `file`, so argparse converts the
# --input value by calling file(path); args.input_file is then an open file
# object rather than a path string.
parser.add_argument('--input', type=file, dest='input_file', required=True, help='input xml filename')
parser.add_argument('--output', dest='output_file', required=True, help='output sql filename')
parser.add_argument('--tag', dest='tag', required=True, help='the record tag. eg: item')
parser.add_argument('--table', dest='table', required=True, help='table name')
# With nargs='+' a supplied --ignore is a list; when omitted it is the empty
# string default.
parser.add_argument('--ignore', dest='ignore', default='', nargs='+', help='list of tags to ignore')
parser.add_argument('--encoding', dest='encoding', default='utf-8', help='character encoding (default=utf-8)')
parser.add_argument('--limit', type=int, dest='limit', default=-1, help='maximum number of records to process')
# The string default '8' is passed through type=float by argparse, so
# args.packet is a float either way.
parser.add_argument('--packet', type=float, dest='packet', default='8', help='maximum size of an insert query in MB. see MySQL\'s max_allowed_packet (default=8)')
args = parser.parse_args()


# output file handle
output = codecs.open(args.output_file, "w", encoding=args.encoding)

# open the xml file for iteration
context = et.iterparse(args.input_file, events=("start", "end"))
context = iter(context)
# get to the root
# (the first event produced is the "start" of the document element)
event, root = context.next()


# --packet is given in MB; 1048576 is the number of bytes per MB.
max_packet = 1048576 * args.packet

# Working state initialised for the code that follows this excerpt.
# NOTE(review): presumably consumed by the record-processing loop below the
# visible part of the script -- confirm there.
items = []; tags = []; output_buffer = []
tagged = False
started = False
sql_len = 0
sql_insert = None
num_insert = 0
n = 0
packet_size = 0
示例#15
0
#!/usr/bin/python


from elementtree import ElementTree
import string
config = []
# Input is read from 'speedtest.xml' in the current working directory.
data = open('speedtest.xml')

# Incremental parse that reports both the opening and closing of elements.
eliterator = ElementTree.iterparse(data,events=('start','end'))
counter = 0
commits = 0
for event,element in eliterator:
    ns_tag = None
    tag = element.tag
    # A tag starting with '{' carries a "{namespace}" prefix: keep the full
    # name in ns_tag and reduce `tag` to the local name.
    if tag[0] == '{':
        # cut ns
        ns_tag = tag
        tag = tag.split('}')[1]
#    print "=== LOAD",event,tag,ns_tag
    # Attributes are read once the element is complete ("end" event).  An
    # attribute that is absent yields None from .get().
    if event == 'end':
        country = element.attrib.get('countrycode')
        city = element.attrib.get('name')
        isp = element.attrib.get('sponsor') 
        url = element.attrib.get('url')
        # Counts every "end" event, whatever the element's tag.
        counter += 1
        # NOTE(review): 'DE' is tested twice in the condition below; the
        # rest of the condition lies beyond this excerpt.
        if (country == 'RU') or \
           (country == 'DE') or \
           (country == 'BG') or \
           (country == 'DE') or \
           (country == 'PL') or \
           (country == 'FR') or \
示例#16
0
  print cur.lastrowid
  conn.commit()
  to_insert['article_id'] = cur.lastrowid
  save_author_data(to_insert)

def save_author_data(to_insert):
  """Insert an ``authors`` row for *to_insert* when it has an 'author' key.

  Nothing happens when the key is absent.  Otherwise the row is inserted
  through the module-level cursor, the new row id is printed and the
  transaction is committed.  A MySQL error is reported on stdout and
  otherwise ignored, so the caller keeps going.

  NOTE(review): the guard tests the 'author' key, but the inserted value is
  to_insert['name'] -- confirm that both keys are populated together.
  """
  if 'author' not in to_insert:
    return
  query = "INSERT INTO authors(name, article_id) VALUES (%s, %s)"
  try:
    values = (to_insert['name'], to_insert['article_id'])
    cur.execute(query, values)
    print(cur.lastrowid)
    conn.commit()
  except pymysql.MySQLError:
    print("error, but continuing")

# Accumulator for the record currently being assembled.
to_insert = initialize_row()

# Stream 'dblp.xml'.  No `events` argument is passed, so each element is
# delivered once, after its end tag has been parsed.
for event, elem in ET.iterparse('dblp.xml'):
  # A tag listed in delimiter_keys marks a record boundary: store what has
  # been gathered so far and start a fresh row.
  if elem.tag in delimiter_keys:
    save_data(to_insert)
    to_insert = initialize_row()
  
  # Copy the text of a recognised field into the row, keyed by its tag.
  if elem.tag in keys:
    #print elem.tag, elem.text
    to_insert[elem.tag] = elem.text

  # Tags in author_keys are stored the same way.  A repeated tag overwrites
  # the value stored earlier for the same row.
  if elem.tag in author_keys:
    #print elem.tag, elem.text
    to_insert[elem.tag] = elem.text