def writeHeadings():
    """Extract headings from every XML resume under ONLY_XML and write
    them, one per line, to ``all_headings_other_without.txt``.

    Non-ASCII characters in a heading are silently dropped on output
    (same behaviour as the original ``encode('ascii', 'ignore')``).
    """
    fileset = getFiles("/home/shreya/Wharton/NEW/Other/ONLY_XML")
    # Context manager guarantees the output file is closed even if an
    # exception escapes the loop (the original leaked the handle).
    with open("all_headings_other_without.txt", 'w') as out:
        for filepath in fileset:
            # Guard against extension-less paths: str.index would raise
            # ValueError on a path with no dot.
            dot = filepath.find(".")
            if dot == -1 or "xml" not in filepath[dot:]:
                continue
            print(filepath)
            pt = ParseText(filepath)
            content = pt.readXmlToString()
            content_list = pt.readTextToList()
            heading_indexes, headings = pt.findHeadings(content, content_list, [])
            for heading in headings:
                # Round-trip through ASCII to drop non-ASCII chars,
                # then write as text (Python 3 files take str, not bytes).
                out.write(heading.encode('ascii', 'ignore').decode('ascii') + "\n")
# ----- Example #2 ("예제" = "example"): scraped-snippet separator -----
# TODO: should only merge_toplines on lines that had single-letter changes
def preprocess(soup):
    """Normalise *soup*: merge single-letter fragments, then merge top
    lines for the affected line set.

    Returns the (mutated) soup object.
    """
    soup, s_set = merge_sletters(soup)
    print(s_set)  # debug: lines touched by the single-letter merge
    merge_toplines(soup, s_set)
    return soup


# # Extract data from all resumes


# Top-level driver: walk the split/Other data directory and load each
# CSV file into a NumPy array of rows.
# NOTE(review): `out_filename` and `index` are computed but unused in the
# visible lines — presumably consumed further below in the original file.

#CODE START FROM HERE
fileset = getFiles("/home/shreya/RA_ML/Resume-Parsing/Data/split/Other")
for f_csv in fileset:
    index = f_csv.index(".") 
    first_index =f_csv.rfind("/")+1
    last_index = f_csv.rfind(".")
    # Bare file name: no directory prefix, no extension.
    out_filename = f_csv[first_index: last_index]
                

    # 'rb' is the Python 2 csv convention (binary mode avoids newline
    # translation); under Python 3 this would be open(f_csv, newline='').
    with open(f_csv, 'rb') as f:
        reader = csv.reader(f)
        data = [row for row in reader]
    
    # All rows as a 2-D array (rows must be equal length for a true 2-D
    # result; ragged rows yield an object array — TODO confirm upstream).
    datanp = np.array(data)

# ----- Example #3 ("예제" = "example"): scraped-snippet separator -----
                    # print line
                    value += line + "\n"

            """if k_flag:        
            	print key
            """
            if k_flag and v_flag:
                print "HERE"
                work_exp[key] = value

    return work_exp


if __name__ == "__main__":
    PROBABLE_HEADINGS = getAllHeadings("set_of_headings.txt")
    fileset = getFiles("/home/shreya/Wharton/XML")
    with open("work_exp.csv", "w") as csvfile:
        csvwriter = csv.writer(csvfile, delimiter=",")
        csvwriter.writerow(["Filename", "Company, Position, Duration", "About Job", "EXPERIENCE"])
        for xml_filepath in fileset:
            index = xml_filepath.index(".")
            if "xml" in xml_filepath[index:]:
                # print "xml: ", xml_filepath
                row = []
                first_index = xml_filepath.rfind("/") + 1
                last_index = xml_filepath.rfind(".")
                filename = xml_filepath[first_index:last_index]
                # print "filename: "+filename
                text_filepath = "/home/shreya/Wharton/PDF_text/" + filename + ".txt"
                # print "text_filepath: ", text_filepath
                pt = ParseText(xml_filepath, text_filepath)
# ----- Example #4 ("예제" = "example"): scraped-snippet separator -----
        #break    
    return file_year

if __name__ == "__main__":
    PROBABLE_HEADINGS = getAllHeadings("set_of_headings_1.txt")
    print "OLD LEN: ", len(PROBABLE_HEADINGS)
    PROBABLE_HEADINGS.extend(getAllHeadings("set_of_headings_boston.txt"))
    PROBABLE_HEADINGS.extend(getAllHeadings("set_of_headings_newyork.txt"))
    PROBABLE_HEADINGS.extend(getAllHeadings("set_of_headings_other.txt"))
    
    PROBABLE_HEADINGS = list(set(PROBABLE_HEADINGS))
    print "NEW LEN: ", len(PROBABLE_HEADINGS)
    if "education:" in PROBABLE_HEADINGS:
        print "AYE!" 

    fileset = getFiles("/home/shreya/Wharton/NEW/Other/ONLY_XML")
    

    fileSet = []
    for f in fileset:
        if ".xml" == f[-4:]:
            fileSet.append(f)

    #print len(fileSet)
    for i in range(0, len(fileSet), 50):
        if (i+50)< len(fileSet):
            file_set = fileSet[i: i+50]
        else:
            file_set = fileSet[i:]   
        
        try:
# ----- Example #5 ("예제" = "example"): scraped-snippet separator -----
def convertAll(dirpath="", outdir=""):
    """Convert every PDF found under *dirpath* to XML, writing results
    into *outdir*.

    Each visited path is echoed for progress; non-PDF files are skipped.
    """
    for filepath in getFiles(dirpath):
        print(filepath)
        # endswith is clearer than slicing [-4:] and states the intent
        # (match the ".pdf" extension) directly.
        if filepath.endswith(".pdf"):
            pdfToXML(filepath, outdir)