def new_commit(self, content, pdfbits):
    """ Handle new commit based on [content] and [pdfbits].

    inputs:
        content: Contents of the paper.
        pdfbits: The pdf file in bits.
    """
    # Check that all required fields exist
    title = content['title']
    keywords = content['keywords']
    timestamp = content['timestamp']
    descriptions = content['descriptions']

    # Write pdf file to [fname]
    # Get legal file name
    fname = self._title2fname_(title)
    with open(os.path.join(self.papers_dir, fname), 'wb') as f:
        f.write(pdfbits)

    # Update keywords
    for kw in keywords:
        print(kw)
        self.insert_keyword(kw, title, timestamp)

    # Update descriptions
    for desc in descriptions:
        print(desc)
        self.insert_description(desc, descriptions[desc], title)

    # Done
    logger.info(f'PAPERS_SERVER received new commit of {title}')
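# The _title2fname_ helper used above is not shown in this listing. A minimal
# sketch, assuming its only job is to turn a title into a legal pdf file name
# (the real implementation may differ):
import re

def _title2fname_(self, title):
    # Replace characters that are illegal in file names with underscores
    safe = re.sub(r'[\\/:*?"<>|]+', '_', title).strip()
    return safe + '.pdf'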
def update(self):
    """ Update pdfs from buffer folder.

    yields:
        self.pdfs
    """
    self.read_ignores()
    pdfs = pd.DataFrame()

    # Walk through the buffer folder, keeping pdf files that are not ignored
    names = [n for n in os.listdir(self.buffer_dir)
             if n.endswith('.pdf') and n not in self.ignores.name.values]
    paths = [os.path.join(self.buffer_dir, n) for n in names]
    pdfs['name'] = names
    pdfs['path'] = paths

    # Parse file time information
    for entry, method in [('atime', os.path.getatime),
                          ('ctime', os.path.getctime),
                          ('mtime', os.path.getmtime)]:
        pdfs[entry] = [
            time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(method(p)))
            for p in paths
        ]

    # Set index
    self.pdfs = pdfs.set_index('name', drop=True)

    # Done
    logger.info('Buffered file names updated.')
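# read_ignores() is referenced above but not included in this listing. A
# hypothetical sketch, assuming ignores.json is the file written by
# new_ignore() below via DataFrame.to_json:
import os
import pandas as pd

def read_ignores(self):
    # Load the ignore table if it exists, otherwise start with an empty one
    if os.path.exists(self.ignores_path):
        self.ignores = pd.read_json(self.ignores_path)
    else:
        self.ignores = pd.DataFrame(columns=['name'])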
def __init__(self):
    self.papers_dir = profiles.papers_dir
    self.keywords_path = os.path.join(self.papers_dir, 'keywords.json')
    self.descriptions_path = os.path.join(
        self.papers_dir, 'descriptions.xlsx')
    self.read_keywords()
    self.read_descriptions()
    logger.info('PAPERS_SERVER started.')
def new_ignore(self, name):
    """ Add a new ignore entry by [name] and update this buffer.

    inputs:
        name: File name to be ignored.

    yields:
        Updates self.ignores and writes it to disk.
    """
    if name not in self.ignores.name.values:
        # DataFrame.append is removed in recent pandas; use pd.concat instead
        self.ignores = pd.concat(
            [self.ignores, pd.DataFrame([{'name': name}])],
            ignore_index=True)
        self.ignores.to_json(self.ignores_path)
        logger.info(f'Ignore name: {name}')
    self.update()
def get_by_name(self, name):
    """ Get a paper, its pdf info and its path based on [name].

    outputs:
        fpath: Path to the pdf file
        info: Info of the pdf file
        bits: Bitstream of the pdf file
    """
    # Assert we have the paper
    assert (name in self.pdfs.index)

    # Get fpath and pdf info
    fpath = self.pdfs['path'][name]
    info = PdfReader(fpath).Info

    # Get bits
    with open(fpath, 'rb') as f:
        bits = f.read()

    logger.info(f'BUFFER_SERVER get_by_name success on {name}')
    return fpath, info, bits
def buffer_get(self, name, method='open'):
    """ Get file by [name] in buffer_server using [method='open' or 'start'].

    method:
        'open' means return the bits stream
        'start' means start the file using the default app
    """
    try:
        assert (method in ['start', 'open'])
        fpath, info, bits = self.buffer_server.get_by_name(name)
        if method == 'open':
            return bits
        if method == 'start':
            logger.info(f'WORKER buffer_get starts {name}')
            # Relies on the OS shell opening a bare path with its default app
            os.system(fpath)
            return None
    except Exception as e:
        logger.error(f'WORKER buffer_get failed: {e}')
        return None
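# os.system(fpath) above only launches the file where the shell treats a bare
# path as "open this document" (typically Windows). A platform-aware sketch of
# the same idea (an optional alternative, not part of the original code):
import os
import subprocess
import sys

def start_with_default_app(fpath):
    if sys.platform.startswith('win'):
        os.startfile(fpath)                    # Windows only
    elif sys.platform == 'darwin':
        subprocess.run(['open', fpath])        # macOS
    else:
        subprocess.run(['xdg-open', fpath])    # most Linux desktops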
def buffer_commit(self, name, content):
    """ Handle new commit based on [name] and [content].

    Return 0 on success, non-zero on failure.
    """
    # Parse [content]
    try:
        new_content = dict(
            # Commit timestamp
            timestamp=float(content['date']) / 1000,
            # Title of the paper
            title=content['title'],
            # Keywords of the paper, list
            keywords=[e.strip().title()
                      for e in content['keywords'].split(',')
                      if e.strip()],
            # Descriptions of the paper, dict
            descriptions=self._description2dict_(content['descriptions'])
        )
        logger.info('WORKER buffer_commit parsed content')
        print(new_content)
    except Exception as e:
        logger.error(
            f'WORKER buffer_commit failed on parsing content: {content}, error: {e}')
        return 1

    # Get pdfbits
    pdfbits = self.buffer_get(name)
    if pdfbits is None:
        logger.error(
            f'WORKER buffer_commit failed on getting pdf file {name}')
        return 1

    try:
        # Commit to papers_server
        self.papers_server.new_commit(new_content, pdfbits)
        # Ignore the committed name in buffer_server
        self.buffer_server.new_ignore(name)
        logger.info(f'WORKER buffer_commit committed {new_content}.')
        return 0
    except Exception as e:
        logger.error(
            f'WORKER buffer_commit failed on committing content: {new_content}, error: {e}')
        return 1
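# _description2dict_ is referenced above but not part of this listing. A
# purely illustrative sketch, assuming the descriptions field arrives as
# "key: value" lines (the actual format may differ):
def _description2dict_(self, descriptions):
    out = dict()
    for line in descriptions.splitlines():
        if ':' not in line:
            continue
        key, value = line.split(':', 1)
        out[key.strip()] = value.strip()
    return out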
def get_by_title(self, title, fields):
    """ Get a paper and its content by [title] according to [fields].

    inputs:
        title: Title of the paper
        fields: Fields to be returned

    outputs:
        Outputs follow [fields] and may contain the following entries:
            fpath: Path to the pdf file
            bits: Bits stream of the pdf file
            keywords: Keywords
            descriptions: Descriptions
    """
    contents = dict()

    # Get fname and fpath
    fname = self._title2fname_(title)
    fpath = os.path.join(self.papers_dir, fname)
    if 'fpath' in fields:
        contents['fpath'] = fpath

    # Assert descriptions, keywords and the pdf file exist
    assert(title in self.descriptions.index)
    assert(title in self.keywords.index)
    assert(os.path.exists(fpath))

    # Get contents
    # Descriptions
    if 'descriptions' in fields:
        descriptions = self.descriptions.loc[title]
        contents['descriptions'] = descriptions[descriptions.notna()]

    # Keywords
    if 'keywords' in fields:
        keywords = self.keywords.loc[title]
        contents['keywords'] = keywords[keywords.notna()]

    # Get bits of the pdf file
    if 'bits' in fields:
        with open(fpath, 'rb') as f:
            contents['bits'] = f.read()

    logger.info(f'PAPERS_SERVER get_by_title success on {title}')
    return contents
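# A hedged usage sketch of get_by_title, assuming a PAPERS_SERVER instance
# named papers_server and a paper titled 'Some Paper' already committed:
contents = papers_server.get_by_title('Some Paper',
                                      fields=['fpath', 'keywords', 'bits'])
print(contents['fpath'])           # path of the stored pdf
print(list(contents['keywords']))  # non-empty keyword cells for this title
with open('copy.pdf', 'wb') as f:  # write the pdf bits back out
    f.write(contents['bits'])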
def run(self, ip='localhost', port=8612):
    """ Run socket listening on [ip]:[port]. """
    # Setup socket listener
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.bind((ip, port))
    sock.listen(1)
    logger.info(f'WEBSERVER listen on {ip}:{port}')

    # Serving
    idx = 0
    while self.running:
        # Accept new connection
        connection, client_address = sock.accept()
        logger.info(
            f'WEBSERVER is connected {connection} from {client_address}.')

        # Start a new thread to serve the connection
        t = threading.Thread(target=self.serve_connection,
                             args=(connection, client_address, idx))
        t.start()

        # idx increases
        idx = (idx + 1) % 65536

    logger.info("WEBSERVER stopped.")
def serve_connection(self, connection, address, idx=None):
    """ Method to serve [connection] of [idx] from [address]. """
    try:
        # Fetch request
        request = connection.recv(65536).decode()
        length = len(request)
        logger.info(f'WEBSERVER-{idx} receives {length} bytes')

        # Respond
        content = self.respond(request)
        if not isinstance(content, bytes):
            content = content.encode()
        length = len(content)
        logger.info(f'WEBSERVER-{idx} responds {length} bytes')
        connection.sendall(content)
    except Exception as e:
        logger.error(
            f'WEBSERVER runtime error. connection={connection}, client_address={address}, error={e}')
    finally:
        connection.close()
        logger.info(f'WEBSERVER-{idx} connection closed')
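# self.respond(request) is called above but not shown in this listing. A
# minimal hypothetical sketch that returns a valid HTTP response for any
# request (the real dispatch logic is certainly richer):
def respond(self, request):
    # Take the request line (first line of the raw HTTP request)
    first_line = request.splitlines()[0] if request else ''
    body = f'Received: {first_line}'.encode()
    header = (b'HTTP/1.1 200 OK\r\n'
              b'Content-Type: text/plain; charset=utf-8\r\n'
              + f'Content-Length: {len(body)}\r\n\r\n'.encode())
    return header + body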
def __init__(self, buffer_server=None, papers_server=None):
    # Avoid building the default servers at import time (default-argument
    # pitfall); construct them here when no server is passed in.
    self.buffer_server = buffer_server if buffer_server is not None else BUFFER_SERVER()
    self.papers_server = papers_server if papers_server is not None else PAPERS_SERVER()
    logger.info('WORKER initialized.')
def __init__(self):
    self.buffer_dir = profiles.buffer_dir
    self.ignores_path = os.path.join(self.buffer_dir, 'ignores.json')
    self.update()
    logger.info('BUFFER_SERVER started.')
def __init__(self, ip='localhost', port=8612):
    # Remember the default address; run() can still override it
    self.ip = ip
    self.port = port
    self.running = True
    logger.info("WEBSERVER initialized.")
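# A hedged wiring sketch showing how these pieces might be started together,
# assuming the classes are named BUFFER_SERVER, PAPERS_SERVER, WORKER and
# WEBSERVER as in the log messages, and that the webserver's respond() uses a
# WORKER attached to it (not shown in this listing):
if __name__ == '__main__':
    worker = WORKER(buffer_server=BUFFER_SERVER(),
                    papers_server=PAPERS_SERVER())
    server = WEBSERVER()
    server.worker = worker  # hypothetical attribute used by respond()
    server.run(ip='localhost', port=8612)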