if(j=='-'): j = ' ' bookclass = cls break except ValueError as ve: bookclass = cls break except ValueError as ve: continue except ValueError as ve: continue buff = [] xml += '<book class="'+str(bookclass.encode('utf8'))+'">' xml += '<title>'+str(title.encode('utf8'))+'</title>' xml += '<noticekoha>'+str(noticekoha.encode('utf8'))+'</noticekoha>' xml += '<category>'+str(Classification.classToCategory(bookclass).encode('utf8'))+'</category>' xml += '</book>' else: buff.append(ref); xml += '</Document>' session.close() # Write String xml into database (.xml file). session1.add("bookref.xml", xml) xml = xmldom.parseString(xml) pretty_xml_as_string = xml.toprettyxml() with open(outFile,"w") as f: f.write(pretty_xml_as_string.encode('utf8')); session1.close()
break except ValueError as ve: bookclass = cls break except ValueError as ve: continue except ValueError as ve: continue #print bookclass lang_offset = buff.index("##") lang = [] for i in range(0,lang_offset): lang.append(buff[i]) code = str(Classification.classToCategory(bookclass).encode('utf8')) if code not in books : books[code] = dict() #keyword for a book of 'code'&'ref', set() for non-duplicate ref = buff[lang_offset+1] if ref not in books[code] : books[code][ref] = set() print code,ref for i in range(lang_offset+2,len(buff)): tokens = wpt.tokenize(buff[i]) #remove stopwords before stem filtered_tokens = [w for w in tokens if not w in stopwords_list_encoded] for token in filtered_tokens: #use only token that doesn't have punctuation