def writeHeadings():
    """Write the headings found in every XML file to a text file.

    Every path returned by getFiles() for the hard-coded XML directory that
    ends in ".xml" is printed, parsed with ParseText, and the headings
    reported by ParseText.findHeadings() are written one per line to
    "all_headings_other_without.txt" in the current working directory.

    Headings are encoded with ('ascii', 'ignore') before being written, so
    characters that cannot be represented in ASCII are dropped.

    Returns:
        None. The only result is the output file.
    """
    xml_dir = "/home/shreya/Wharton/NEW/Other/ONLY_XML"
    # The context manager closes (and flushes) the output file even when
    # getFiles() or the parsing of one input raises.
    with open("all_headings_other_without.txt", 'w') as out:
        for filepath in getFiles(xml_dir):
            # Test the suffix directly: filepath.index(".") raised ValueError
            # for paths without a dot, and looking for "xml" anywhere after
            # the first dot also matched dotted directory names.
            if not filepath.endswith(".xml"):
                continue
            print(filepath)
            pt = ParseText(filepath)
            content = pt.readXmlToString()
            content_list = pt.readTextToList()
            # findHeadings returns (heading_indexes, headings); only the
            # heading texts are needed here.
            _, headings = pt.findHeadings(content, content_list, [])
            for heading in headings:
                out.write(heading.encode('ascii', 'ignore') + "\n")
# Change this: should only merge_toplines on lines that had single-letter changes
def preprocess(soup):
    """Run merge_sletters and then merge_toplines on soup and return it.

    merge_sletters(soup) yields a (soup, s_set) pair. The second item is
    printed and passed, together with the new soup, to merge_toplines.
    The return value of merge_toplines is ignored here.
    NOTE(review): this presumably relies on merge_toplines mutating soup
    in place; confirm against its definition.
    """
    soup, changed = merge_sletters(soup)
    print(changed)
    merge_toplines(soup, changed)
    return soup


#
# Extract data from all resumes
# CODE START FROM HERE
fileset = getFiles("/home/shreya/RA_ML/Resume-Parsing/Data/split/Other")
for f_csv in fileset:
    # Raises ValueError for a path without any "."; the value itself is not
    # used in the statements visible here.
    index = f_csv.index(".")
    # Slice out the text between the last "/" and the last ".".
    first_index = f_csv.rfind("/") + 1
    last_index = f_csv.rfind(".")
    out_filename = f_csv[first_index: last_index]
    with open(f_csv, 'rb') as f:
        reader = csv.reader(f)
        # Materialise all rows before the file is closed.
        data = [row for row in reader]
        datanp = np.array(data)
# print line value += line + "\n" """if k_flag: print key """ if k_flag and v_flag: print "HERE" work_exp[key] = value return work_exp if __name__ == "__main__": PROBABLE_HEADINGS = getAllHeadings("set_of_headings.txt") fileset = getFiles("/home/shreya/Wharton/XML") with open("work_exp.csv", "w") as csvfile: csvwriter = csv.writer(csvfile, delimiter=",") csvwriter.writerow(["Filename", "Company, Position, Duration", "About Job", "EXPERIENCE"]) for xml_filepath in fileset: index = xml_filepath.index(".") if "xml" in xml_filepath[index:]: # print "xml: ", xml_filepath row = [] first_index = xml_filepath.rfind("/") + 1 last_index = xml_filepath.rfind(".") filename = xml_filepath[first_index:last_index] # print "filename: "+filename text_filepath = "/home/shreya/Wharton/PDF_text/" + filename + ".txt" # print "text_filepath: ", text_filepath pt = ParseText(xml_filepath, text_filepath)
#break return file_year if __name__ == "__main__": PROBABLE_HEADINGS = getAllHeadings("set_of_headings_1.txt") print "OLD LEN: ", len(PROBABLE_HEADINGS) PROBABLE_HEADINGS.extend(getAllHeadings("set_of_headings_boston.txt")) PROBABLE_HEADINGS.extend(getAllHeadings("set_of_headings_newyork.txt")) PROBABLE_HEADINGS.extend(getAllHeadings("set_of_headings_other.txt")) PROBABLE_HEADINGS = list(set(PROBABLE_HEADINGS)) print "NEW LEN: ", len(PROBABLE_HEADINGS) if "education:" in PROBABLE_HEADINGS: print "AYE!" fileset = getFiles("/home/shreya/Wharton/NEW/Other/ONLY_XML") fileSet = [] for f in fileset: if ".xml" == f[-4:]: fileSet.append(f) #print len(fileSet) for i in range(0, len(fileSet), 50): if (i+50)< len(fileSet): file_set = fileSet[i: i+50] else: file_set = fileSet[i:] try:
def convertAll(dirpath="", outdir=""):
    """Call pdfToXML on every path from getFiles(dirpath) ending in ".pdf".

    Args:
        dirpath: passed unchanged to getFiles().
        outdir: passed unchanged to pdfToXML() as its second argument.

    Every path returned by getFiles() is printed, whether or not it is
    converted.
    """
    for candidate in getFiles(dirpath):
        print(candidate)
        # Guard clause: only paths whose last four characters are ".pdf"
        # are converted.
        if candidate[-4:] != ".pdf":
            continue
        pdfToXML(candidate, outdir)