def parse_reuters(directory, output_dir):
    """Convert a tree of Reuters article files into one NITF-style XML file.

    ``directory`` is listed once to obtain category folders, and every entry
    of each category folder is read as one article.  Within an article,
    whitespace-only lines are skipped; the first remaining line becomes the
    ``<title>``, the second has any ``<AUTHOR>``/``</AUTHOR>`` tags removed
    and is passed through ``clean``, and all later lines are kept verbatim.
    The collected text is normalised with ``clean_total``, keywords are
    derived from it with ``get_keywords``, and one record per article is
    written to ``<output_dir>/reuters.xml`` (the file is overwritten).

    Output is produced with ``fileout.write`` instead of the Python-2-only
    ``print >> fileout`` statement so the function runs on Python 2 and 3
    while emitting the same text.

    Args:
        directory: Root folder whose entries are treated as category folders.
        output_dir: Folder that receives ``reuters.xml``; created (including
            missing parents) when it does not exist.
    """
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # Fixed fragments of every record.  ``header`` is bound up front so an
    # article with no non-blank lines cannot hit an unbound name.
    header = '<head>\n'
    bodyhead = "<body>\n <body.content>\n <block class='lead_paragraph'>\n"
    bodyfooter = "</block>\n </body.content>\n </body>"
    footer = "\n</nitf>"
    keyhead = "<classifier>"
    keyfooter = "</classifier>"

    with open(os.path.join(output_dir, 'reuters.xml'), 'w') as fileout:

        def emit(line):
            # Same effect as ``print >> fileout, line``: the text plus "\n".
            fileout.write(line + "\n")

        index = 0
        categories = os.listdir(directory)
        emit("<?xml version='1.0' encoding='UTF-8'?>\n")
        emit("<nitf>")

        for cat in categories:
            category_path = os.path.join(directory, cat)
            for article in os.listdir(category_path):
                # The counter advances for every file, even an empty one, so
                # doc ids stay aligned with the directory listing order.
                index += 1
                titlestring = ""
                docdatastring = ""
                body_parts = []
                seen_title = False
                seen_second = False

                with open(os.path.join(category_path, article)) as contents:
                    for content in contents:
                        if content.isspace():
                            continue
                        if not seen_title:
                            seen_title = True
                            titlestring = '<title>' + clean(content) + '</title>\n'
                            docdatastring = '<doc-id id-string="%s"/>' % (10000000 + index)
                        elif not seen_second:
                            seen_second = True
                            if '<AUTHOR>' in content:
                                content = content.replace('<AUTHOR>', '').replace('</AUTHOR>', '')
                            # NOTE(review): the second non-blank line is kept
                            # whether or not it carried an AUTHOR tag --
                            # confirm this matches the intended behaviour.
                            body_parts.append(clean(content))
                        else:
                            body_parts.append(content)

                cleaned_text = clean_total("".join(body_parts))
                keys = get_keywords(cleaned_text)

                # NOTE(review): the title is written before the <head> opener,
                # so <title> ends up outside <head>.  Kept as-is to leave the
                # produced file unchanged; confirm what the consumer expects.
                record = (
                    titlestring,
                    header,
                    '<docdata>',
                    docdatastring,
                    '<identified-content>\n',
                    keyhead,
                    " ".join(keys + [cat]),  # the category is the last classifier term
                    keyfooter,
                    '</identified-content>\n',
                    '</docdata>\n',
                    '</head>\n',
                    bodyhead,
                    clean(cleaned_text),
                    bodyfooter,
                )
                for line in record:
                    emit(line)

        emit(footer)
def identify(self, data):
    """Resolve the device described by ``data`` to a brand/model mapping.

    The user-agent device string is obtained from ``utils.get_device_ua``.
    Windows and Macintosh are matched case-sensitively; iPhone, iPad and
    Android are matched case-insensitively.  For Android, the key returned
    by ``utils.get_keywords`` is upper-cased and looked up first in
    ``self.precise`` and then in ``self.pattern``.

    Returns:
        A dict with ``"brand"`` and ``"model"`` keys for the fixed platforms,
        the stored table entry for a known Android key, a dict with empty
        brand/model when no Android key could be extracted, or ``None`` when
        nothing matches.
    """
    ua = utils.get_device_ua(data)

    # Desktop platforms: checked against the string exactly as received.
    if "Windows" in ua:
        return {"brand": "Windows", "model": "Windows"}
    if "Macintosh" in ua:
        return {"brand": "Apple", "model": "MacBook"}

    ua_lower = ua.lower()

    # iOS devices, e.g.
    #   (iPhone; CPU iPhone OS 10_3_2 like Mac OS X)
    #   (iPad; CPU iPhone OS 10_3_2 like Mac OS X)
    looks_like_ios = "like Mac OS".lower() in ua_lower
    if looks_like_ios and "iphone" in ua_lower:
        return {"brand": "Apple", "model": "iPhone"}
    if looks_like_ios and "iPad".lower() in ua_lower:
        return {"brand": "Apple", "model": "iPad"}

    if "android" not in ua_lower:
        return None

    model_key = utils.get_keywords(ua)
    if model_key is None:
        return {"brand": "", "model": ""}

    model_key = model_key.upper()
    # The precise table takes priority over the pattern table.
    if model_key in self.precise:
        return self.precise[model_key]
    if model_key in self.pattern:
        return self.pattern[model_key]
    return None