def pack_n_doc(VOX_VERSION_NUMBER):
    if len(str(VOX_VERSION_NUMBER)) == 0:
        print "No version indicated"
        return
    pack_nucleo.pack(VOX_VERSION_NUMBER)
    pack_plexo.pack(VOX_VERSION_NUMBER)
    document.document(VOX_VERSION_NUMBER)

def light(self, q, wildcards):
    # Map the spoken number word to a lamp id; -1 means no such lamp.
    lamp_ids = {"one": 1, "two": 2, "three": 3, "four": 4, "five": 5, "six": 6}
    id = lamp_ids.get(wildcards[1], -1)
    html = document(self.connection)
    html.title("Light Switch")
    html.outgoing(q)
    if (wildcards[0] == "on" or wildcards[0] == "off") and id > -1:
        html.incoming("Okay, let's turn " + wildcards[0] + " lamp " + wildcards[1])
        html.send()
        html.request("http://zimmer:2525/remote/" + wildcards[0] + "?id=" + str(id))  # Only works in my setup
    else:
        html.incoming("No such lamp available")
        html.send()

def test_get_resolved_inverted_index():
    test_crawler = crawler(None, "")
    # test values
    WORD_ID_A = "1"
    WORD_ID_B = "2"
    WORD_ID_C = "3"
    WORD_A = "I"
    WORD_B = "am"
    WORD_C = "Groot"
    FONT_A = 0
    FONT_B = 0
    FONT_C = 0
    DOC_ID_A = "1"
    URL_A = "http://www.test.com"
    # Initialize crawler state needed for the function call
    test_crawler._inverted_lexicon = {WORD_ID_A: WORD_A, WORD_ID_B: WORD_B, WORD_ID_C: WORD_C}
    test_crawler._document_index = {DOC_ID_A: document(URL_A)}
    curr_words = ((WORD_ID_A, FONT_A), (WORD_ID_B, FONT_B), (WORD_ID_C, FONT_C))
    curr_doc_id = DOC_ID_A
    test_crawler.add_words_to_inverted_index(curr_words, curr_doc_id)
    # Expected and actual result comparison
    expected_result = {WORD_A: {URL_A}, WORD_B: {URL_A}, WORD_C: {URL_A}}
    actual_result = test_crawler.get_resolved_inverted_index()
    # The test passes iff the two results compare equal
    return cmp(expected_result, actual_result) == 0

def OnOpen(self, e):
    """Open a file"""
    dlg = wx.FileDialog(self, "Choose files", self.dirname, ".", "*.pdf", wx.FD_MULTIPLE)
    if dlg.ShowModal() == wx.ID_OK:
        self.docList = []
        for address in dlg.GetPaths():
            d = document(self, address, len(self.docList))
            self.docList.append(d)
        self.RefreshTree()
        self.menuProcess.Enable(True)
    dlg.Destroy()

def run_ex(self, itemid, content, call_index=True):
    try:
        doc = document(content)
        self.insert(doc, key=itemid)
        if call_index:
            self.lsh.index()
    except Exception:
        # Skip items that fail to parse or insert.
        pass

def document_list_view(request, page='1'):
    title = "Document List"
    page = int(page)
    num = 20
    document_instance = documentModel()
    documents = document_instance.getDocumentList(page, num)
    docs = []
    for i in range(len(documents)):
        docs.append(document(documents[i]))
    doc_total = document_instance.getDocumentNum()
    page_total = (doc_total - 1) / num + 1
    if page == 1:
        previous_page = page
    else:
        previous_page = page - 1
    if page == page_total:
        next_page = page
    else:
        next_page = page + 1
    # Build the list of up to five page links around the current page.
    pages = []
    temp = page - page % 5
    if page_total <= 5:
        for i in range(page_total):
            pages.append(i + 1)
    elif page > page_total - page_total % 5:
        for i in range(page_total - page_total % 5, page_total):
            pages.append(i + 1)
    else:
        for i in range(5):
            pages.append(i + 1 + temp)
    return render_to_response("document.html", {
        "title": title,
        'docs': docs,
        'project_name': webConfig.PROJECTNAME,
        'toplabel0': webConfig.TOPLABEL0,
        'toplabel1': webConfig.TOPLABEL1,
        'toplabel2': webConfig.TOPLABEL2,
        'toplabel3': webConfig.TOPLABEL3,
        'toplabel4': webConfig.TOPLABEL4,
        'toplabel5': webConfig.TOPLABEL5,
        'toplabel6': webConfig.TOPLABEL6,
        'page_total': page_total,
        'page': page,
        'previous_page': previous_page,
        'next_page': next_page,
        'pages': pages,
    })

def query(self, doc, topn=1000):
    try:
        # normalize() returns a new string; the original discarded the result.
        doc = unicodedata.normalize('NFKC', doc)
        doc = document(doc)
        minhash = doc.get_minhash(doc.k_shingles, config.MINHASH_CONFIG['num_permutation'])
        return self.lsh.query(minhash, topn)
    except Exception:
        return []

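# Taken together, the query() method above and the run() indexer further
# down form an LSH near-duplicate pipeline. A minimal usage sketch; the
# Deduper class name and the sample corpus are illustrative assumptions,
# since the snippets only show the methods themselves.
corpus = {
    'a1': 'the quick brown fox jumps over the lazy dog',
    'a2': 'the quick brown fox jumped over a lazy dog',
}
dedup = Deduper()  # assumed wrapper exposing run() and query()
dedup.run(corpus)  # builds documents, inserts them, then calls lsh.index()
matches = dedup.query('the quick brown fox jumps over the lazy dog', topn=5)
print(matches)  # keys of near-duplicate items, best match first
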
def load_json_dataset(dataset_file):
    logging.info('loading dataset: ' + dataset_file + ' ...')
    dataset = []
    with open(dataset_file) as f:
        ds = json.load(f)
    for item in ds:
        article = document(item['html_text'], item['propaganda_label'], item['gdlt_id'], item['mbfc_url'])
        dataset.append(article)
    logging.info('dataset loaded!')
    return dataset

def __main__():
    # Process all the pdfs in the docList.
    e = document()
    e.process("a.pdf", 0)
    docList = [e]
    for d in docList:
        d.process()

def no_action(self, q, wildcards):
    spvoice_url = 'http://localhost:9000/command'
    params = urllib.parse.urlencode({'command': q}).encode('utf8')
    response = urllib.request.urlopen(spvoice_url, params).read().decode("utf-8")
    response = json.loads(response)
    html = document(self.connection)
    html.title("House")
    html.incoming(q)
    html.outgoing(response['response'].replace("\\n", "<br />\n"))
    html.send()

def read_documents(author_dir):
    # Construct an author object named after the directory, then build a
    # document for every .txt file in it and attach the list to the author.
    document_list = []
    auth = author(os.path.basename(author_dir))
    for doc_name in os.listdir(author_dir):
        if doc_name.endswith('.txt'):
            doc_text = open(author_dir + "/" + doc_name, 'r').read()
            document_list.append(document(auth.name + doc_name[:-4], auth.name, doc_text))
    auth.doc_list = document_list
    auth.doc_count = len(document_list)
    return auth

def test_sorted_resolved_inverted_index():
    test_crawler = crawler(None, "")
    # test values
    WORD_A = "test"
    DOC_ID_A = 1
    DOC_ID_B = 2
    DOC_ID_C = 3
    DOC_ID_D = 4
    DOC_A = "http://www.A.com"
    DOC_B = "http://www.B.com"
    DOC_C = "http://www.C.com"
    DOC_D = "http://www.D.com"
    # Initialize crawler state needed for the function call
    word_id = test_crawler.word_id(WORD_A)
    doc_id_list = [DOC_ID_A, DOC_ID_B, DOC_ID_D]
    test_crawler.document_id(DOC_A)
    test_crawler.document_id(DOC_B)
    test_crawler.document_id(DOC_C)
    test_crawler.document_id(DOC_D)
    test_crawler._inverted_index[word_id] = doc_id_list
    test_crawler.add_link(DOC_ID_A, DOC_ID_B)
    test_crawler.add_link(DOC_ID_B, DOC_ID_D)
    test_crawler.add_link(DOC_ID_D, DOC_ID_C)
    test_crawler.compute_page_rank()
    test_crawler.construct_sorted_resolved_inverted_index()
    # Expected and actual result comparison
    expected_result = [document(DOC_D), document(DOC_B), document(DOC_A)]
    actual_result = test_crawler._sorted_resolved_inverted_index[WORD_A]
    # The test passes iff the three URLs match in rank order
    bool_A = (cmp(expected_result[0].get_doc_url(), actual_result[0].get_doc_url()) == 0)
    bool_B = (cmp(expected_result[1].get_doc_url(), actual_result[1].get_doc_url()) == 0)
    bool_C = (cmp(expected_result[2].get_doc_url(), actual_result[2].get_doc_url()) == 0)
    return bool_A and bool_B and bool_C

def load_dataset(dataset_file, classification="binary"):
    logging.info('loading dataset: ' + dataset_file + ' ...')
    dataset = []
    with codecs.open(dataset_file, 'r') as f:
        for i, line in enumerate(f):
            fields = line.split('\t')
            if fields[0] == '3':
                prop_gold = '1'
            else:
                prop_gold = '-1'
            if classification == 'binary':
                article = document(fields[1], prop_gold, str(i), '')
            else:
                article = document(fields[1], fields[0], str(i), '')
            dataset.append(article)
    logging.info('dataset loaded!')
    return dataset

def run(self, line):
    parts = line.split(",")
    doc = document()
    if self.label_const is None:
        doc.setLabel(self.label_dict[self.clean(parts[self.label_col])])
    else:
        doc.setLabel(self.label_const)
    doc.setText(self.clean(parts[self.text_col]))
    return doc

def run(self, docs):
    count = 1
    for itemid, content in docs.items():
        try:
            doc = document(content)
            self.insert(doc, key=itemid)
            print('\rpushed %d items' % count),
            sys.stdout.flush()
            count += 1
        except Exception:
            pass
    self.lsh.index()
    print('')

def test_document_index():
    test_crawler = crawler(None, "")
    # test values
    URL_A = "http://www.testA.com"
    DOC_ID_A = 1
    URL_B = "http://www.testB.com"
    DOC_ID_B = 2
    # Initialize crawler state needed for the function call
    test_crawler.document_id(URL_A)
    test_crawler.document_id(URL_B)
    # Expected and actual result comparison
    expected_result = {DOC_ID_A: document(URL_A), DOC_ID_B: document(URL_B)}
    actual_result = test_crawler._document_index
    # The test passes iff both URLs match
    bool_A = (cmp(expected_result[DOC_ID_A].get_doc_url(), actual_result[DOC_ID_A].get_doc_url()) == 0)
    bool_B = (cmp(expected_result[DOC_ID_B].get_doc_url(), actual_result[DOC_ID_B].get_doc_url()) == 0)
    return bool_A and bool_B

def add_document(self, file_path):
    # Initializes this file as a new document in the corpus
    try:
        # load the data
        f = open(join(file_path), 'r')
        raw_data = f.read()
        f.close()
        # create the document object
        d = document(self, raw_data, file_path.split("/")[-1])
        self.documents[d.ID] = d
    except UnicodeDecodeError:
        self.log_file.write("UnicodeDecodeError on file " + file_path + "\n")

def do(self, addresses):
    # Process every pdf and return the resulting docList.
    docList = []
    for count, address in enumerate(addresses):
        e = document()
        e.process(address, count)
        docList.append(e)
    return docList

def document_id(self, url):
    """Get the document id for some url."""
    if url in self._doc_id_cache:
        return self._doc_id_cache[url]
    # TODO: just like the word id cache, but for documents. If the document
    # doesn't exist in the db then only insert the url and leave
    # the rest to their defaults.
    doc_id = self._mock_insert_document(url)
    self._doc_id_cache[url] = doc_id
    # add the newly created document object to the document index
    self._document_index[doc_id] = document(url)
    return doc_id

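# A small illustrative check of the caching behaviour above: the second
# lookup for the same URL must come from _doc_id_cache rather than a new
# insert. The crawler(None, "") construction mirrors the tests elsewhere
# in this file; the URL is made up.
c = crawler(None, "")
first = c.document_id("http://www.example.com")
second = c.document_id("http://www.example.com")
assert first == second
assert c._document_index[first].get_doc_url() == "http://www.example.com"
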
def load_myds(dataset_file):
    logging.info('loading dataset: ' + dataset_file + ' ...')
    dataset = []
    with codecs.open(dataset_file, 'r', encoding='utf8') as f:
        for line in f:
            fields = line.strip().split('\t')
            # html_text, prop_label, gdelt_id, gdelt_sourceURL
            article = document(fields[0], fields[-1], fields[4], fields[-2])
            dataset.append(article)
    logging.info('dataset loaded!')
    return dataset

def do(self, addresses):
    # Process every pdf and return the resulting docList.
    # document(parent, pdf_path, job_number)
    docList = []
    for count, address in enumerate(addresses):
        e = document(self, address, count)
        e.process()
        docList.append(e)
    return docList

def wrappedindocument(self, file=None, **kwargs):
    page_kwargs = {}
    write_kwargs = {}
    for name, value in kwargs.items():
        if name.startswith("page_"):
            page_kwargs[name[5:]] = value
        elif name.startswith("write_"):
            write_kwargs[name[6:]] = value
        else:
            warnings.warn("Keyword argument %s of %s method should be prefixed with 'page_'" %
                          (name, method.__name__), DeprecationWarning)
            page_kwargs[name] = value
    d = document.document([document.page(self, **page_kwargs)])
    self.__name__ = method.__name__
    self.__doc__ = method.__doc__
    return method(d, file, **write_kwargs)

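# Note that `method` is a free variable in wrappedindocument above (and in
# its variants below), so the function only makes sense inside an enclosing
# decorator factory that binds it. A minimal sketch of that context; the
# outer function's name is an assumption:
def _wrapindocument(method):
    def wrappedindocument(self, file=None, **kwargs):
        # ... kwargs splitting as above, closing over `method` ...
        d = document.document([document.page(self, **kwargs)])
        return method(d, file)
    return wrappedindocument
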
def __init__(self):
    """Constructor"""
    wx.Frame.__init__(self, None, wx.ID_ANY, "Notebook Tutorial", size=(600, 400))
    panel = wx.Panel(self)
    self.obj = document()
    self.obj.pageList = [page(1, 1)]
    lis = [self.obj, self.obj, self.obj]
    notebook = DocNoteBook(panel, lis)
    sizer = wx.BoxSizer(wx.VERTICAL)
    sizer.Add(notebook, 1, wx.ALL | wx.EXPAND, 5)
    panel.SetSizer(sizer)
    self.Layout()
    self.Show()

def scan(self, params):
    finish = params['document-finish']
    document_type = params['document-type']
    del params['document-type']
    del params['document-finish']
    params['output-file'] = self.name + '-scan-' + str(self.id) + '-%04d'
    params['d'] = self.get_sane_name()
    # Build the scanadf command line from the remaining params.
    command = ['/usr/bin/scanadf']
    for option, value in params.iteritems():
        if len(option) == 1:
            command.append('-' + option)
        else:
            command.append('--' + option)
        if len(value):
            command.append(value)
    command += ['--pagewidth', '210', '--pageheight', '297', '-x', '210', '-y', '297']
    print >> sys.stderr, command
    self.unclaim()
    self.scanadf = subprocess.Popen(command, cwd='/dev/shm', stderr=subprocess.PIPE, bufsize=1)
    if self.doc is None:
        self.doc = document.document(self.config, format=document_type)
    # Watch scanadf's stderr: each scanned page is handed to the document.
    error = False
    while True:
        line = self.scanadf.stderr.readline()
        print >> sys.stderr, line
        if not line:
            break
        msg = line.strip().split(' ')
        if msg[0] == 'scanadf:':
            if msg[1][0:7] == 'rounded':
                continue
            error = True
            break
        if len(msg) == 3:
            if msg[1] == 'document':
                self.doc.process_image('/dev/shm/' + msg[2])
    if (not error) and finish:
        self.doc.finish()
        self.doc = None
    else:
        self.timer = 0
    self.scanadf.wait()
    self.scanadf = None
    self.claim()
    self.id = self.id + 1

def determine_author(text, author_list, vocabulary, tot_doc_count, use_extrafuture):
    # Naive Bayes: log prior plus add-alpha smoothed token log-likelihoods,
    # optionally adjusted by stylometric distances (sentence length, punctuation).
    auth_res = {}
    doc = document("", "", text)
    doc.count_sentence()
    doc.compute_ave_words_in_sentence()
    doc.count_quatation_mark()
    doc.count_exclamation_mark()
    bow = doc.construct_bow()
    for auth in author_list:
        auth_res[auth.name] = log(1.0 * auth.doc_count / tot_doc_count)
        for token, count in bow.items():
            token_pos = log((auth.vocabulary.get(token, 0) + alpha) /
                            (auth.tot_token_count + alpha * len(vocabulary)))
            auth_res[auth.name] += token_pos * count
        if use_extrafuture:
            auth_res[auth.name] += abs(auth.ave_words_in_sentence - doc.ave_words_in_sentence) * word_coef
            auth_res[auth.name] += abs(auth.ave_sentence_count - doc.sentence_count) * sentence_coef
            auth_res[auth.name] += abs(auth.ave_quatation_mark - doc.quatation_mark_count) * quatation_coef
            auth_res[auth.name] += abs(auth.ave_exclamation_mark - doc.exclamation_mark_count) * exclamation_coef
    return sorted(auth_res, key=auth_res.get, reverse=True)[0]

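# A self-contained trace of the smoothed token term above, with made-up
# counts; alpha is module-level in the original, so the value here is an
# illustrative assumption.
from math import log
alpha = 0.1                 # assumed smoothing constant
vocab_size = 1000           # len(vocabulary)
author_token_count = 4      # auth.vocabulary.get(token, 0)
author_total_tokens = 5000  # auth.tot_token_count
token_pos = log((author_token_count + alpha) / (author_total_tokens + alpha * vocab_size))
print(token_pos)            # per-occurrence log-likelihood contribution, about -7.1
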
def __init__(self, parent, settings):
    # based on a frame, so set up the frame
    GenericFrameSimple.__init__(self, parent, wx.ID_ANY, settings)
    self.mainpanel = wx.Panel(self, -1, style=wx.EXPAND)
    self.doctree = MyDictTree(self.mainpanel, 'Document')
    self.ModelNoteBook = wx.aui.AuiNotebook(self.mainpanel, 1, size=(500, 500), style=wx.aui.AUI_NB_DEFAULT_STYLE)
    self.sizer = wx.BoxSizer(wx.HORIZONTAL)
    self.sizer.Add(self.doctree, 0, wx.EXPAND)
    self.sizer.Add(self.ModelNoteBook, 1, wx.EXPAND)
    self.mainpanel.SetSizer(self.sizer)
    self.doc = document()
    self.dirname = ''

def document_id(self, url):
    """Get the document id for some url."""
    # Acquire doc_id_lock before entering the critical section
    crawler.doc_id_lock.acquire()
    if url in self._doc_id_cache:
        # Release doc_id_lock before returning
        crawler.doc_id_lock.release()
        return self._doc_id_cache[url]
    # TODO: just like the word id cache, but for documents. If the document
    # doesn't exist in the db then only insert the url and leave
    # the rest to their defaults.
    doc_id = self._mock_insert_document(url)
    self._doc_id_cache[url] = doc_id
    # Release doc_id_lock
    crawler.doc_id_lock.release()
    # add the newly created document object to the document index
    self._document_index[doc_id] = document(url)
    return doc_id

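# The paired acquire/release calls above are easy to break when adding a
# new early return. A behaviour-equivalent sketch using the lock as a
# context manager, assuming doc_id_lock is a standard threading.Lock:
def document_id(self, url):
    """Get the document id for some url (context-manager variant)."""
    with crawler.doc_id_lock:
        if url in self._doc_id_cache:
            return self._doc_id_cache[url]
        doc_id = self._mock_insert_document(url)
        self._doc_id_cache[url] = doc_id
    # Created outside the lock, matching the original's release point.
    self._document_index[doc_id] = document(url)
    return doc_id
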
def run_example():
    # first let's get the auth code from the client
    request_token_url = huddleAuthServer + "request?response_type=code" + "&client_id=" + consumer_key + "&redirect_uri=" + redirect_uri
    print "Get Your Authorization Code and paste it back into python\n" + request_token_url
    code = raw_input('--> ')
    auth = oAuth.oAuth(huddleAuthServer, code, consumer_key, redirect_uri)
    # store our access token
    tokenStore = auth.handleAccessToken()
    # now we can make calls to the api. We only have the uri of the folder we
    # want to create the file in, so first of all let's find its upload uri.
    api = huddleApi.huddleApi(huddleApiServer, tokenStore)
    getFolder = folder.folder(api.getFolder("http://api.huddle.dev/files/folders/1237980/"))
    print getFolder.getLinksWithRel("create-document")
    getDocument = document.document(api.createFile("foo", "bar", getFolder.getLinksWithRel("create-document")))
    # time to upload the contents
    api.uploadToFile("C:\\Users\\adam.flax\\Documents\\foo.txt", getDocument.getLinkWithRel("upload"))
    os.system("pause")

def hans(self, q, wildcards):
    html = document(self.connection)
    html.title("Welcome")
    html.outgoing(q)
    html.incoming("Hi Robin!")
    html.send()

def no_action(self, q, wildcards):
    # Called if no matching action is found
    html = document(self.connection)
    html.title("Error")
    html.outgoing(q)
    html.incoming("Sorry, I don't know how to do that")
    html.send()

def parse(self, xml_string, input_file_name, curs):
    parser_dtd = etree.XMLParser(encoding='ISO-8859-1', dtd_validation=True, load_dtd=True,
                                 remove_comments=True, recover=True)
    root = etree.fromstring(xml_string.encode('ISO-8859-1'), parser_dtd)
    for REC in root:
        # Parse the publication and create a publication object holding all its attributes
        new_pub = pub.publication()
        author_names = []
        new_pub.source_type = REC.tag
        new_pub.source_id = REC.attrib.get('key')
        if 'mdate' in REC.attrib:
            new_pub.modified_date = REC.attrib.get('mdate')
        if 'publtype' in REC.attrib:
            new_pub.document_type = REC.attrib.get('publtype')
        # There can be more than one author
        author_fields = REC.findall('author')
        if author_fields is not None:
            for auth in author_fields:
                if 'orcid' in auth.attrib:
                    author_names.append((auth.text, auth.attrib.get('orcid')))
                else:
                    author_names.append((auth.text, None))
        pages = REC.find('pages')
        if pages is not None:
            if len(pages.text.split('-')) == 2:
                new_pub.begin_page = pages.text.split('-')[0]
                new_pub.end_page = pages.text.split('-')[1]
            else:
                new_pub.begin_page = pages.text.split('-')[0]
        title = REC.find('title')
        if title is not None:
            new_pub.document_title = title.text
        issue_no = REC.find('number')
        if issue_no is not None:
            new_pub.issue = issue_no.text
        year = REC.find('year')
        if year is not None:
            new_pub.publication_year = year.text
        address = REC.find('address')
        if address is not None:
            new_pub.publisher_address = address.text
        publisher = REC.find('publisher')
        if publisher is not None:
            new_pub.publisher_name = publisher.text
        vol = REC.find('volume')
        if vol is not None:
            new_pub.volume = vol.text
        s_title = REC.find('journal')
        if s_title is not None:
            new_pub.source_title = s_title.text
        else:
            s_title = REC.find('booktitle')
            if s_title is not None:
                new_pub.source_title = s_title.text
        # Insert the publication record into the publications table in the database
        curs.execute(
            "INSERT INTO dblp_publications (begin_page,modified_date,document_title,document_type,end_page,issue,"
            "publication_year,publisher_address,publisher_name,source_id,source_title,source_type,volume) "
            "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s) "
            "ON CONFLICT (source_id) DO UPDATE SET begin_page=excluded.begin_page,modified_date=excluded.modified_date,"
            "document_title=excluded.document_title,document_type=excluded.document_type,end_page=excluded.end_page,"
            "issue=excluded.issue,publication_year=excluded.publication_year,publisher_address=excluded.publisher_address,"
            "publisher_name=excluded.publisher_name,source_id=excluded.source_id,source_title=excluded.source_title,"
            "source_type=excluded.source_type,volume=excluded.volume,last_updated_time=current_timestamp;",
            (str(new_pub.begin_page), new_pub.modified_date, str(new_pub.document_title), str(new_pub.document_type),
             str(new_pub.end_page), str(new_pub.issue), str(new_pub.publication_year), str(new_pub.publisher_address),
             str(new_pub.publisher_name), str(new_pub.source_id), str(new_pub.source_title), str(new_pub.source_type),
             str(new_pub.volume)))
        # Parse the document identifier fields for each publication
        new_doc = doc.document()
        # A dictionary which stores all the document ids and types
        docs = dict()
        new_doc.source_id = new_pub.source_id
        ee = REC.findall('ee')
        if ee is not None:
            for i in ee:
                docs[i.text] = i.tag
        url = REC.findall('url')
        if url is not None:
            for i in url:
                docs[i.text] = i.tag
        crossref = REC.findall('crossref')
        if crossref is not None:
            for i in crossref:
                docs[i.text] = i.tag
        isbn = REC.find('isbn')
        if isbn is not None:
            docs[isbn.text] = isbn.tag
        series = REC.find('series')
        if series is not None:
            docs[series.text] = series.tag
        cdrom = REC.find('cdrom')
        if cdrom is not None:
            docs[cdrom.text] = cdrom.tag
        school = REC.find('school')
        if school is not None:
            docs[school.text] = school.tag
        notes = REC.find('notes')
        if notes is not None:
            docs[notes.text] = notes.tag
        # Insert records into dblp_document_identifiers
        for text, tag in docs.items():
            new_doc.document_id = text
            new_doc.document_id_type = tag
            curs.execute(
                "INSERT INTO dblp_document_identifiers (source_id,document_id,document_id_type) VALUES (%s,%s,%s) "
                "ON CONFLICT (source_id,document_id,document_id_type) DO UPDATE SET source_id=excluded.source_id,"
                "document_id=excluded.document_id,document_id_type=excluded.document_id_type,last_updated_time=current_timestamp;",
                (str(new_doc.source_id), str(new_doc.document_id), str(new_doc.document_id_type)))
        # Parse the author fields for dblp_authors
        new_auth = author.author()
        editor = REC.find('editor')
        if editor is not None:
            new_auth.editor_name = editor.text
        seq_no = 0
        for name in author_names:
            new_auth.first_name = ' '.join(name[0].split()[:-1])
            new_auth.last_name = name[0].split()[-1]
            new_auth.full_name = name[0]
            new_auth.source_id = new_pub.source_id
            new_auth.seq_no = seq_no
            if name[1] is not None:
                new_auth.orc_id = name[1]
            curs.execute(
                "INSERT INTO dblp_authors (source_id,full_name,last_name,first_name,seq_no,orc_id,editor_name) "
                "VALUES (%s,%s,%s,%s,%s,%s,%s) ON CONFLICT (source_id,seq_no) DO UPDATE SET source_id=excluded.source_id,"
                "full_name=excluded.full_name,last_name=excluded.last_name,first_name=excluded.first_name,seq_no=excluded.seq_no,"
                "orc_id=excluded.orc_id,editor_name=excluded.editor_name,last_updated_time=current_timestamp;",
                (str(new_auth.source_id), str(new_auth.full_name), str(new_auth.last_name), str(new_auth.first_name),
                 str(new_auth.seq_no), str(new_auth.orc_id), str(new_auth.editor_name)))
            seq_no += 1
        # Parse the citation fields for dblp_references
        new_ref = reference.reference()
        new_ref.source_id = new_pub.source_id
        citations = REC.findall('cite')
        if citations is not None:
            for cite in citations:
                if cite.text != '...':  # skip placeholder citations
                    new_ref.cited_source_id = cite.text
                    curs.execute(
                        "INSERT INTO dblp_references (source_id,cited_source_id) VALUES (%s,%s) ON CONFLICT ON CONSTRAINT "
                        "dblp_references_pk DO UPDATE SET source_id=excluded.source_id,cited_source_id=excluded.cited_source_id,"
                        "last_updated_time=current_timestamp;",
                        (str(new_ref.source_id), str(new_ref.cited_source_id)))

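# For orientation, a record in the shape the parse() method above consumes
# (dblp-style XML); the concrete key, names, and values are made up.
sample_record = '''<article mdate="2020-01-01" key="journals/example/Doe20">
  <author orcid="0000-0000-0000-0000">Jane Doe</author>
  <title>An Example Title</title>
  <pages>1-10</pages>
  <year>2020</year>
  <volume>7</volume>
  <number>2</number>
  <journal>Example Journal</journal>
  <ee>https://doi.org/10.0000/example</ee>
  <cite>journals/example/Smith19</cite>
</article>'''
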
def preprocess(sent):
    doc = document()
    doc.setText(sent)
    process.processDoc(doc)
    return doc.toVector()[0]

def parse_review_text(self, review_file, review_text):
    review_ndx = 1
    # we copy this to a variable because it's going to be modified
    text = review_text
    # This variable is to accommodate anyone who put more than one review
    # in a file
    parsed_docs = []
    paragraph_ndx = 0
    (reviewer, review_ratings, review_paragraphs) = self.clear_doc_params()
    if re.search('|'.join(ALDRIN_TELL_TALES), text, re.I):
        text = re.sub('<br />', ' ', text)
    elif re.search('|'.join(JOSEPH_TELL_TALES), text, re.I):
        text = re.sub(r'(?:[^>])<br />(?:[^<])', ' ', text)
    review_sections = re.findall('(?:<p.*?>)?(.*?)' + EOL_REGEX, text, re.S)
    is_review_section = 0
    for section in review_sections:
        section = re.sub('<.*?>', '', section)
        if re.sub(r'\s+', '', section) == '':
            continue
        if 'DEBUG' in os.environ:
            print("%s:\n'%s'\n\n" % (review_file, section))
        if re.match(r'\w+\s*:\s*\w+', section):
            if section.count(':') > 1:
                reviewer = 'X'
                continue
            (uppercase_label, value_with_stuff) = section.split(':')
            label = uppercase_label.lower()
            value = value_with_stuff.strip()
            if label in review_ratings and review_ratings[label] is None:
                review_ratings[label] = value
            elif label == 'overall':
                review_ratings['rating'] = value
            elif re.match('reviewer', label, re.I):
                if reviewer is not None:
                    parsed_docs.append(document(author=reviewer,
                                                ratings=deepcopy(review_ratings),
                                                paragraphs=deepcopy(review_paragraphs),
                                                filename="%s-%s" % (review_file, review_ndx)))
                    paragraph_ndx = 0
                    is_review_section = 0
                    review_ndx += 1
                    (reviewer, review_ratings, review_paragraphs) = self.clear_doc_params()
                # we are still on a reviewer text line so this has to be
                # done after everything has been reset from the
                # previously observed review
                reviewer = value
        elif re.match('written review', section, re.I):
            if re.match(r'written review:(?:\w+\s*)+', section, re.I):
                odd_section = re.sub('WRITTEN REVIEW:', '', section, flags=re.I)
                review_paragraphs[paragraph_ndx] = section
                paragraph_ndx += 1
            is_review_section = 1
        elif is_review_section:
            section = re.sub('<br />', '', section)
            review_paragraphs[paragraph_ndx] = section
            paragraph_ndx += 1
    if reviewer is not None:
        parsed_docs.append(document(author=reviewer, ratings=review_ratings,
                                    paragraphs=review_paragraphs,
                                    filename="%s-%s" % (review_file, review_ndx)))
    return parsed_docs

def wrappedindocument(self, file=None, **kwargs):
    d = document.document([document.page(self, **kwargs)])
    self.__name__ = method.__name__
    self.__doc__ = method.__doc__
    return method(d, file)

def timetable(self, q, wildcards):
    html = document(self.connection)
    html.redirect("http://zimmer:5000/index.php?component=timetable&resolution=desktop")  # Only works in my setup
    html.send()

def no_action(self, q, wildcards):
    # Called if no matching action is found
    html = document(self.connection)
    html.title("Error")
    html.incoming(q)
    html.outgoing("Sorry, I don't know how to do that")
    html.send()

def load_document(path, label, header_seperator='\n\n'):
    '''Load a document from the given file path and return a document instance.'''
    with open(path, 'r') as file:
        return document(path, label).parser(file, header_seperator)

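# A usage sketch for load_document; the path and label are hypothetical.
doc = load_document('corpus/sample.txt', 'spam', header_seperator='\n\n')
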
def wrappedindocument(self, file, *args, **kwargs):
    d = document.document([document.page(self, *args, **kwargs)])
    self.__name__ = method.__name__
    self.__doc__ = method.__doc__
    return method(d, file)