예제 #1
0
def make_xml(filedig,filehw,fileout):
 # Purpose: read a text file into memory, obtain headword records from
 # filehw, and write one XML string per record to fileout after an XML header.
 # NOTE(review): this snippet is cut off right after the bare `except:` at
 # the end, so the error handling, the per-record write and the closing
 # of fout are not visible here.
 #
 # slurp txt file into list of lines
 # NOTE(review): `filein` is not a parameter of this function and the
 # `filedig` parameter is never referenced in the visible code; presumably
 # `filein` is a module-level global holding the same path -- TODO confirm.
 with codecs.open(filein,encoding='utf-8',mode='r') as f:
    inlines = [line.rstrip('\r\n') for line in f]
 # parse xxxhw.txt into headword records
 hwrecs = init_hwrecs(filehw)
 # open output xml file (plain open: nothing in the visible code closes it)
 fout = codecs.open(fileout,'w','utf-8')
 nout = 0  # count of lines written to fout
 # generate xml header lines
 # NOTE(review): `xmlroot` is not defined in this function; presumably a
 # module-level name -- verify.
 lines = xml_header(xmlroot)
 for line in lines:
  fout.write(line + '\n')
  nout = nout + 1
 # process hwrecs records one at a time and generate output
 nerr = 0
 for ihwrec,hwrec in enumerate(hwrecs):
  # Debugging cap: stop once the record index exceeds 1000000
  # (the trailing "12" looks like a smaller cap used while debugging).
  if ihwrec > 1000000: # 12
   print("debug stopping")
   break
  # collect the lines of `inlines` that belong to this record
  datalines = get_datalines(hwrec,inlines)
  # construct output
  xmlstring = construct_xmlstring(datalines,hwrec)
  # data is a string, which should be well-formed xml
  # try parsing this string to verify well-formed.
  # The string is encoded to UTF-8 bytes before being handed to ET; the
  # parse result `root` is not used in the visible code.
  try:
   root = ET.fromstring(xmlstring.encode('utf-8'))
  # NOTE(review): bare except -- the handler body is missing from this snippet.
  except:
예제 #2
0
def make_xml(filedig, filehw, fileout, debug=False):
    """Write one XML line per headword record to ``fileout``.

    The digitization text file is read fully into memory, the headword
    records are obtained from ``filehw``, and for every record an XML string
    is built and written after the XML header lines.  Each string is also
    test-parsed with ElementTree; failures are counted and summarised at the
    end but never prevent the record from being written.

    Args:
        filedig: path of the UTF-8 text file whose lines feed the records.
        filehw: path of the headword file (xxxhw.txt) given to
            ``init_hwrecs``.
        fileout: path of the UTF-8 XML file to create.
        debug: when true, print a detailed dump for every record that fails
            the well-formedness check (previously a hard-coded ``if False``).

    Note:
        ``xmlroot`` is not a parameter; it is expected to be defined at
        module level (not visible in this block -- confirm).
    """
    # slurp txt file into list of lines
    # (the original opened the undeclared name `filein` here and ignored
    # the `filedig` parameter)
    with codecs.open(filedig, encoding='utf-8', mode='r') as f:
        inlines = [line.rstrip('\r\n') for line in f]
    # parse xxxhw.txt
    hwrecs = init_hwrecs(filehw)
    nerr = 0  # number of records ET could not parse
    # The context manager closes the output file even when a helper raises
    # part-way through the run.
    with codecs.open(fileout, 'w', 'utf-8') as fout:
        # generate xml header lines
        for line in xml_header(xmlroot):
            fout.write(line + '\n')
        # process hwrecs records one at a time and generate output
        for ihwrec, hwrec in enumerate(hwrecs):
            if ihwrec > 1000000:  # debugging cap on the number of records
                print("debug stopping")
                break
            datalines = get_datalines(hwrec, inlines)
            xmlstring = construct_xmlstring(datalines, hwrec)
            # xmlstring should be well-formed xml; try parsing it to verify.
            try:
                ET.fromstring(xmlstring)
            except Exception:
                # Deliberately broad (but no longer a bare except, so
                # KeyboardInterrupt/SystemExit propagate): failures are only
                # counted, because some Python versions (e.g. 2.7.5) report
                # false failures here.
                nerr = nerr + 1
                if debug:
                    report = [
                        "<!-- xml error #%s: L = %s, hw = %s-->"
                        % (nerr, hwrec.L, hwrec.k1),
                        "datalines = ",
                    ]
                    report.extend(datalines)
                    report.extend(["xmlstring=", xmlstring, ''])
                    for out in report:
                        print(out)
            # write output, whether or not the check succeeded
            fout.write(xmlstring + '\n')
        # write closing line for xml file.
        fout.write("</%s>\n" % xmlroot)
    if nerr == 0:
        print("All records parsed by ET")
    else:
        print("WARNING: make_xml.py:", nerr, "records not parsed by ET")
예제 #3
0
def make_xml(filedig, filehw, fileout):
    """Write one XML line per headword record to ``fileout``.

    The digitization text file is read fully into memory, the headword
    records are obtained from ``filehw``, and for every record an XML string
    is built and written after the XML header lines.  Each string is
    test-parsed with ElementTree; when that fails, a diagnostic dump is
    printed, but the record is still written.

    Args:
        filedig: path of the UTF-8 text file whose lines feed the records.
        filehw: path of the headword file (xxxhw.txt) given to
            ``init_hwrecs``.
        fileout: path of the UTF-8 XML file to create.

    Note:
        ``xmlroot`` is not a parameter; it is expected to be defined at
        module level (not visible in this block -- confirm).
    """
    # slurp txt file into list of lines
    # (the original opened the undeclared name `filein` here and ignored
    # the `filedig` parameter)
    with codecs.open(filedig, encoding='utf-8', mode='r') as f:
        inlines = [line.rstrip('\r\n') for line in f]
    # parse xxxhw.txt
    hwrecs = init_hwrecs(filehw)
    nerr = 0  # number of records ET could not parse
    # The context manager closes the output file even when a helper raises
    # part-way through the run.
    with codecs.open(fileout, 'w', 'utf-8') as fout:
        # generate xml header lines
        for line in xml_header(xmlroot):
            fout.write(line + '\n')
        # process hwrecs records one at a time and generate output
        for ihwrec, hwrec in enumerate(hwrecs):
            if ihwrec > 1000000:  # debugging cap on the number of records
                print("debug stopping")
                break
            datalines = get_datalines(hwrec, inlines)
            xmlstring = construct_xmlstring(datalines, hwrec)
            # xmlstring should be well-formed xml; try parsing its UTF-8
            # encoding to verify.
            try:
                ET.fromstring(xmlstring.encode('utf-8'))
            except Exception:
                # Broad on purpose (any failure of the check is reported),
                # but no longer a bare except, so KeyboardInterrupt and
                # SystemExit propagate.
                nerr = nerr + 1
                report = [
                    "<!-- xml error #%s: L = %s, hw = %s-->"
                    % (nerr, hwrec.L, hwrec.k1),
                    "datalines = ",
                ]
                report.extend(datalines)
                report.extend(["xmlstring=", xmlstring, ''])
                for out in report:
                    # NOTE(review): encoding before print is a Python 2
                    # idiom; under Python 3 this prints the bytes repr.
                    print(out.encode('utf-8'))
            # write output, whether or not the check succeeded
            fout.write(xmlstring + '\n')
        # write closing line for xml file.
        fout.write("</%s>\n" % xmlroot)
예제 #4
0
def extract_keys(filein, fileout):
    """Write one ``key,category,L`` line per headword record.

    Records come from ``hwparse.init_hwrecs(filein)``.  For each record the
    output line is built from its ``k1`` attribute, the letter ``H``
    followed by its ``e`` attribute, and its ``L`` attribute.  A summary of
    the record counts is printed at the end.

    Args:
        filein: path handed to ``hwparse.init_hwrecs``.
        fileout: path of the UTF-8 file to create.
    """
    nrec = 0  # records read; every record read is also written
    # The output is opened before parsing, as before, but the context manager
    # now closes it even if parsing or writing raises.
    with codecs.open(fileout, "w", 'utf-8') as fout:
        for nrec, rec in enumerate(hwparse.init_hwrecs(filein), 1):
            fout.write('%s,%s,%s\n' % (rec.k1, 'H' + rec.e, rec.L))
    print(nrec, "records in,", nrec, "records written")
예제 #5
0
파일: hw2.py 프로젝트: zhongys/csl-pywork
    pc = rec.pc
    k1 = rec.k1
    L = rec.L
    out1 = '%s:%s:%s,%s:%s' % (pc, k1, ln1, ln2, L)
    if rec.type == None:
        out2 = ''
    else:
        # For alternate headwords, one more field  type,LP
        out2 = ':%s,%s' % (rec.type, rec.LP)
    out = out1 + out2
    return out


def extract_hw2(hwrecs):
    """Return a list with ``extract_hw2_helper(rec)`` for each record.

    ``hwrecs`` is an iterable of HW objects; the result keeps their order.
    Per the helper's format, each entry is a string that leaves out key2.
    """
    return [extract_hw2_helper(hw) for hw in hwrecs]


if __name__ == "__main__":
    # Script entry point.  Usage: <script> <input xxxhw.txt> <output file>
    # The names below are bound at module level, so they are visible as
    # globals to every function in this module.
    filein = sys.argv[1]  # xxxhw.txt
    fileout = sys.argv[2]  # output path handed to write()
    # Parse the input file into headword records.
    hwrecs = init_hwrecs(filein)
    # Convert each record with extract_hw2 (one string per record).
    hw2recs = extract_hw2(hwrecs)
    # write() is defined elsewhere in this file; presumably it writes one
    # entry per line to fileout -- verify.
    write(hw2recs, fileout)