def generate_inlink_map():
    """Build the inlink map for all documents and write it out.

    Reads every document listed in the module-level ``docs_to_format`` via
    ``fileutils.readdocumentwithinlin`` and, for each of its outlinks,
    records the document's ``page_url`` under that outlink. The resulting
    ``{outlink: [linking page_url, ...]}`` dict is handed to
    ``fileutils.write_inlink_map``.

    A page that lists the same outlink more than once is recorded once per
    occurrence; no de-duplication happens here.
    """
    inlinkmap = {}
    for docno in docs_to_format:
        document = fileutils.readdocumentwithinlin(docno)
        current_page = document['page_url']
        # 'outlinks' is treated as optional elsewhere in this file
        # (convert_documents checks for it), so a document without it simply
        # contributes no inlinks instead of raising KeyError.
        for outlink in document.get('outlinks', ()):
            # setdefault creates the list on first sight of this outlink.
            inlinkmap.setdefault(outlink, []).append(current_page)
        # NOTE(review): progress message assumed to be per document (inside
        # the loop), since it reports the current docno -- confirm.
        print("COMPLETED " + docno)
    fileutils.write_inlink_map(inlinkmap)
def _link_section(tag, links):
    """Return a ``<tag>`` section holding the unique *links*, comma-separated."""
    # Sorted so the written file is reproducible; iterating a bare set gives
    # an arbitrary order.
    return "<" + tag + ">\n" + ",".join(sorted(set(links))) + "\n</" + tag + ">\n"


def _format_document(document):
    """Render one document dict as the tagged text written to the final file."""
    parts = [
        "<DOCNO>" + document['page_url'] + "</DOCNO>\n",
        "<HTTP_HEADER>\n" + ''.join(document['headerdata']) + "\n</HTTP_HEADER>\n",
        "<HEAD>" + document['page_title'] + "</HEAD>\n",
        "<HTML_SOURCE>\n" + document['raw_html'] + "\n</HTML_SOURCE>\n",
        "<TEXT>\n" + document['clean_text'] + "\n</TEXT>\n",
        # Both link lists are optional; a missing key yields an empty section.
        _link_section("OUTLINKS", document.get('outlinks', ())),
        # NOTE(review): the key read here is 'inlink' (singular), unlike
        # 'outlinks' -- confirm it matches what the document writer stores.
        _link_section("INLINKS", document.get('inlink', ())),
    ]
    return "".join(parts)


def convert_documents():
    """Convert every document in ``doc_files`` to the tagged final format.

    For each document number in the module-level ``doc_files``, the document
    is loaded with ``fileutils.readdocumentwithinlin``, rendered as a
    sequence of ``<DOCNO>``, ``<HTTP_HEADER>``, ``<HEAD>``, ``<HTML_SOURCE>``,
    ``<TEXT>``, ``<OUTLINKS>`` and ``<INLINKS>`` sections, and written with
    ``fileutils.write_final_document``. Progress is printed before and after
    each document.

    Raises:
        KeyError: if a document lacks ``page_url``, ``headerdata``,
            ``page_title``, ``raw_html`` or ``clean_text``.
    """
    for docno in doc_files:
        print(docno)
        document = fileutils.readdocumentwithinlin(docno)
        fileutils.write_final_document(docno, _format_document(document))
        print("COMPLETED " + docno)