def writeHeadings():
    """Collect the headings of every XML file and write them to a text file.

    Iterates over the paths returned by ``getFiles`` for the hard-coded
    ``ONLY_XML`` directory. For each path that contains "xml" somewhere
    after its first ".", a ``ParseText`` object is built and
    ``findHeadings`` is called on it. Every heading returned is written on
    its own line to ``all_headings_other_without.txt`` (opened in ``'w'``
    mode, so any previous content is replaced). Characters that cannot be
    encoded as ASCII are dropped from the headings.

    Paths that contain no "." at all are skipped.
    """
    # The context manager guarantees the output file is closed even when
    # listing or parsing one of the input files raises.
    with open("all_headings_other_without.txt", 'w') as f:
        fileset = getFiles("/home/shreya/Wharton/NEW/Other/ONLY_XML")
        for filepath in fileset:
            # str.find returns -1 where str.index would raise ValueError,
            # so a path without any "." is skipped instead of aborting
            # the whole run.
            index = filepath.find(".")
            if index == -1 or "xml" not in filepath[index:]:
                continue

            # Parenthesised single-argument form prints the same text
            # under Python 2 as the print statement did.
            print(filepath)
            pt = ParseText(filepath)
            content = pt.readXmlToString()
            content_list = pt.readTextToList()
            # Only the headings themselves are needed here; the first
            # element of the returned pair is ignored.
            # NOTE(review): the empty list is passed where other callers
            # pass a list of probable headings -- confirm that is intended.
            _, headings = pt.findHeadings(content, content_list, [])
            for heading in headings:
                # NOTE(review): relies on Python 2 string semantics (the
                # encoded value is concatenated with a str literal).
                f.write(heading.encode('ascii', 'ignore') + "\n")
Пример #2
0
    with open("work_exp.csv", "w") as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=",")
        csvwriter.writerow(["Filename", "Company, Position, Duration", "About Job", "EXPERIENCE"])
        for xml_filepath in fileset:
            index = xml_filepath.index(".")
            if "xml" in xml_filepath[index:]:
                # print "xml: ", xml_filepath
                row = []
                first_index = xml_filepath.rfind("/") + 1
                last_index = xml_filepath.rfind(".")
                filename = xml_filepath[first_index:last_index]
                # print "filename: "+filename
                text_filepath = "/home/shreya/Wharton/PDF_text/" + filename + ".txt"
                # print "text_filepath: ", text_filepath
                pt = ParseText(xml_filepath, text_filepath)
                content = pt.readXmlToString()
                content_list = pt.readTextToList()
                heading_indexes, headings = pt.findHeadings(content, content_list, PROBABLE_HEADINGS)
                # bio = pt.find_bio(content, content_list, headings, heading_indexes)
                # print "BIO: ", bio
                # print "HEADINGS: ",  headings
                # edu = pt.find_this(content, content_list, "education", headings, heading_indexes)
                exp = pt.find_this(content, content_list, "experience", headings, heading_indexes)
                # print "EDUCATION: ", edu
                # print "EXPERIENCE: ", exp
                if not exp:
                    exp = pt.find_this(content, content_list, "history", headings, heading_indexes)

                split_exp = splitExp(exp)

                for ex in split_exp: