def get_hocr(lang, title):
    """Return the hOCR text for one page of a book as a ret_val() result."""
    # FIXME: delete all OCR results stored under the 'no' language code
    # and redo them with the 'nb' code.
    if lang == 'nb':
        lang = 'no'

    if isinstance(title, unicode):
        title = title.encode('utf-8')

    title = title.replace(' ', '_')

    try:
        if lang == 'bn':
            # Bengali page numbers use the digits ০-৯: extract them and
            # convert to an int by offset from u'০'.
            title = unicode(title, 'utf-8')
            page_nr = re.sub(u'^.*/([০-৯]+)$', '\\1', title)
            book_name = re.sub(u'^(.*?)(/[০-৯]+)?$', '\\1', title)
            book_name = book_name.encode('utf-8')
            result = ord(page_nr[0]) - ord(u'০')
            for ch in page_nr[1:]:
                result *= 10
                result += ord(ch) - ord(u'০')
            page_nr = result
        else:
            page_nr = re.sub('^.*/([0-9]+)$', '\\1', title)
            book_name = re.sub('^(.*?)(/[0-9]+)?$', '\\1', title)
            page_nr = int(page_nr)
    except:
        return ret_val(1, "unable to extract page number from page: " + title)

    path = cache_path(book_name, lang)
    filename = path + 'page_%04d.hocr' % page_nr

    # We support data built with a different compression scheme than the
    # one actually generated by the server.
    text = utils.uncompress_file(filename, ['bzip2', 'gzip', ''])
    if text is None:
        # Not available: add a request to do this hOCR so the data is
        # built lazily, but filter out unsupported file types here.
        if book_name.endswith('.djvu') or book_name.endswith('.pdf'):
            import hocr_request
            hocr_request.add_hocr_request(lang, book_name, True)
        return ret_val(1, "unable to locate file %s for page %s lang %s"
                       % (filename, book_name, lang))

    # Work around https://code.google.com/p/tesseract-ocr/issues/detail?id=690&can=1&q=utf-8
    # A simple patch exists:
    # https://code.google.com/p/tesseract-ocr/source/detail?r=736#
    # but it's easier to do a double conversion to remove invalid utf-8
    # than to maintain a patched version of tesseract.
    text = unicode(text, 'utf-8', 'ignore')
    text = text.encode('utf-8', 'ignore')

    return ret_val(0, text)
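# ret_val() and cache_path() are helpers defined elsewhere in this module.
# The sketch below only illustrates the return contract that get_hocr()
# appears to rely on (an error flag plus a text payload); it is an assumed
# shape, not the module's actual implementation.
def ret_val(error, text):
    # Assumed: a small dict suitable for returning as a JSON API response.
    return {'error': error, 'text': text}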
def __set_archives_to_analyze(self, mailing_list, archives):
    archives_to_analyze = []

    for archive in archives:
        # Gmane archives are always analyzed; only the others are checked
        # against the list of already-visited URLs.
        if archive.url.find(GMANE_DOMAIN) == -1:
            # Check if already analyzed
            status = self.db.check_compressed_file(archive.url)

            this_month = find_current_month(archive.url)

            # If the file is for the current month, re-import it to pick
            # up updates. If it has already been visited, ignore it.
            if status == self.db.VISITED and not this_month:
                self.__print_output('Already analyzed %s' % archive.url)
                continue

        # If not, mark it as visited
        # (before uncompressing, otherwise the db will point towards
        # the uncompressed temporary file)
        today = datetime.datetime.today().strftime(datetimefmt)
        self.db.set_visited_url(archive.url, mailing_list.location,
                                today, self.db.NEW)

        if archive.is_compressed():
            try:
                # Uncompress and get the raw filepaths
                filepaths = uncompress_file(archive.filepath,
                                            archive.compressed_type,
                                            mailing_list.mbox_dir)
                uncompressed_mboxes = [MBoxArchive(fp, archive.url)
                                       for fp in filepaths]
                archives_to_analyze.extend(uncompressed_mboxes)
            except IOError, e:
                # It could be a plain file, so let's give it a chance
                self.__print_output(" ***WARNING: Uncompressing file %s - %s"
                                    % (archive.filepath, str(e)))
                archives_to_analyze.append(archive)
        else:
            archives_to_analyze.append(archive)

    return archives_to_analyze
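# find_current_month() is imported from a utility module that is not shown
# here. Below is a minimal sketch of what it is assumed to do, based on the
# MailMan archive naming convention (YYYY-Month.txt.gz) and the
# yesterday's-date reasoning used by the sibling functions further down;
# the strftime format and the return value are assumptions made only for
# illustration.
def find_current_month(url):
    import datetime
    import re
    # Use yesterday's date so last month's archive is still treated as
    # "current" right after the month rolls over.
    yesterday = datetime.datetime.today() - datetime.timedelta(days=1)
    this_month = yesterday.strftime('%Y-%B')
    return re.search(this_month, url) is not None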
def _update_log(outputDir, subjectDir, err):
    log_file = get_log_file(outputDir)
    with open(log_file, "w") as output:
        for i in os.listdir(subjectDir):
            if i == DS_STORE:
                continue
            # CPC2018
            if not utils.is_fasta(i):
                continue
            fullpath = subjectDir + "/" + i
            if not os.path.isfile(fullpath):
                continue
            fafile = os.path.abspath(fullpath)
            name = os.path.splitext(i)[0]
            extension = os.path.splitext(i)[1]
            # Check for gz files from NCBI
            # TODO: rename "file" vars
            if extension == ".gz":
                fafile = utils.uncompress_file(fafile, subjectDir + "/" + name)
            # Read the first line of the file and check that it is a
            # FASTA header
            fafileheader = ""
            with open(fafile, "r") as fafileopen:
                fafileheader = fafileopen.readline().strip()
            if not fafileheader.startswith(">"):
                raise Exception("ERROR: " + name + " is not a fasta file")
            else:
                target_name = _get_target_name(fafileheader)
                fasta_record = "\t".join([target_name, fafile, name])
                # if VERBOSE: err.write(fasta_record + "\n")
                output.write(fasta_record + "\n")
    return
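# utils.uncompress_file() is called above with (gzipped path, destination
# path) and its return value is used as the new FASTA path. A minimal
# sketch of such a helper, assuming a single-member .gz file as distributed
# by NCBI; this is an illustration, not the project's actual utils module.
import gzip
import shutil

def uncompress_file(gz_path, dest_path):
    # Stream-decompress gz_path into dest_path and return the new path.
    with gzip.open(gz_path, 'rb') as src:
        with open(dest_path, 'wb') as dst:
            shutil.copyfileobj(src, dst)
    return dest_path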
def __analyze_non_remote(self, dirname):
    """Walk the directory recursively looking for files and uncompress
    them. Then __analyze_list_of_files is called."""

    # Check if the directory to store uncompressed files already exists
    mbox_dir = os.path.join(self.MBOX_DIR, dirname.lstrip('/'))
    if not os.path.exists(mbox_dir):
        os.makedirs(mbox_dir)

    # Compressed files are left in their original location,
    # because they can be uncompressed from that location
    filepaths = []
    for root, dirs, files in os.walk(dirname):
        filepaths += [os.path.join(root, filename) for filename in files]

    # If a file is for the current month (MailMan filename
    # YYYY-MMM.txt.gz), don't mark it as visited, so it is analyzed again.
    # Assuming this is run daily, it's better to take yesterday's date,
    # to ensure we get all of last month's email when the month rolls over.
    yesterday = datetime.datetime.today() + datetime.timedelta(days=-1)
    this_month = yesterday.strftime(mailmanfmt)

    files_to_analyze = {}
    url_list = []
    for filepath in filepaths:
        # Check if already analyzed
        status = self.db.check_compressed_file(filepath)

        # If the file is for the current month, reimport it
        current_month = filepath.find(this_month) != -1
        if current_month:
            self.__print_output('Found substring %s in URL %s...' %
                                (this_month, filepath))

        # If already visited, ignore it, unless it is for the current month
        if status == self.db.VISITED and not current_month:
            self.__print_output('Already analyzed %s' % filepath)
            continue

        # If not, mark it as visited
        # (before uncompressing, otherwise the db will point towards
        # the uncompressed temporary file)
        today = datetime.datetime.today().strftime(datetimefmt)
        self.db.set_visited_url(filepath, dirname, today, self.db.NEW)

        # Check if compressed
        extension = check_compressed_file(filepath)
        if extension:
            # If compressed, uncompress it and get the raw filepaths.
            # uncompress_file returns a list containing the paths to all
            # the uncompressed files (for instance, a tar file may
            # contain more than one file).
            uncompressed_files = uncompress_file(filepath, extension,
                                                 mbox_dir)
            files_to_analyze.setdefault(filepath, []).extend(uncompressed_files)
        else:
            # The file was not compressed, so there is only
            # one file to append
            files_to_analyze.setdefault(filepath, []).append(filepath)

        url_list.append(filepath)

    # The archives are usually retrieved in descending chronological
    # order (because the newest archives are always shown at the top of
    # the archive index), so analyze the list of files in the reverse of
    # the order in which they were retrieved.
    url_list.reverse()

    return self.__analyze_list_of_files(dirname, url_list,
                                        files_to_analyze)
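# check_compressed_file() is defined in another module; it is assumed to
# return the recognized compressed-file extension, or None when the file
# should be treated as a plain mbox. A minimal sketch under that assumption
# (the real helper may recognize more formats):
def check_compressed_file(filepath):
    # Longest suffixes first, so '.tar.gz' is not reported as plain '.gz'.
    for extension in ('.tar.gz', '.tar.bz2', '.tar', '.gz', '.bz2', '.zip'):
        if filepath.endswith(extension):
            return extension
    return None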
def __analyze_remote(self, url):
    """Download the archives from the remote url, then store and parse
    them."""

    # Create the directories where the archives will be stored
    target = re.sub('^(http|ftp)[s]{0,1}://', '', url)
    compressed_dir = os.path.join(self.COMPRESSED_DIR, target)
    mbox_dir = os.path.join(self.MBOX_DIR, target)
    if not os.path.exists(compressed_dir):
        os.makedirs(compressed_dir)
    if not os.path.exists(mbox_dir):
        os.makedirs(mbox_dir)

    # If a file is for the current month (MailMan filename
    # YYYY-MMM.txt.gz), don't mark it as visited, and download it again.
    # Assuming this is run daily, it's better to take yesterday's date,
    # to ensure we get all of last month's email when the month rolls over.
    yesterday = datetime.datetime.today() + datetime.timedelta(days=-1)
    this_month = yesterday.strftime(mailmanfmt)

    # Get all the links listed in the URL
    htmlparser = MyHTMLParser(url, self.web_user, self.web_password)
    links = htmlparser.get_mboxes_links()

    filepaths = []
    for link in links:
        basename = os.path.basename(link)
        destfilename = os.path.join(compressed_dir, basename)

        # If the URL is for the current month, always retrieve it.
        # Otherwise, check the visited status and local files first.
        if link.find(this_month) >= 0:
            self.__print_output('Found substring %s in URL %s...' %
                                (this_month, link))
            self.__print_output('Retrieving %s...' % link)
            retrieve_remote_file(link, destfilename,
                                 self.web_user, self.web_password)
        elif os.path.exists(destfilename):
            # Already downloaded
            self.__print_output('Already downloaded %s' % link)
        else:
            self.__print_output('Retrieving %s...' % link)
            retrieve_remote_file(link, destfilename,
                                 self.web_user, self.web_password)

        filepaths.append((link, destfilename))

    files_to_analyze = {}
    url_list = []
    for link, filepath in filepaths:
        # Check if already analyzed
        status = self.db.check_compressed_file(filepath)

        # If the file is for the current month, reimport it
        current_month = filepath.find(this_month) != -1
        if current_month:
            self.__print_output('Found substring %s in URL %s...' %
                                (this_month, filepath))

        # If already visited, ignore it, unless it is for the current month
        if status == self.db.VISITED and not current_month:
            self.__print_output('Already analyzed %s' % filepath)
            continue

        # If not, mark it as visited
        # (before uncompressing, otherwise the db will point towards
        # the uncompressed temporary file)
        today = datetime.datetime.today().strftime(datetimefmt)
        self.db.set_visited_url(link, url, today, self.db.NEW)

        # Check if compressed
        extension = check_compressed_file(filepath)
        if extension:
            # If compressed, uncompress it and get the raw filepaths.
            # uncompress_file returns a list containing the paths to all
            # the uncompressed files (for instance, a tar file may
            # contain more than one file).
            uncompressed_files = uncompress_file(filepath, extension,
                                                 mbox_dir)
            files_to_analyze.setdefault(link, []).extend(uncompressed_files)
        else:
            # The file was not compressed, so there is only
            # one file to append
            files_to_analyze.setdefault(link, []).append(filepath)

        url_list.append(link)

    # The archives are usually retrieved in descending chronological
    # order (because the newest archives are always shown at the top of
    # the archive index), so analyze the list of files in the reverse of
    # the order in which they were retrieved.
    url_list.reverse()

    return self.__analyze_list_of_files(url, url_list, files_to_analyze)
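# retrieve_remote_file() is assumed to download a single archive, optionally
# using HTTP basic auth (web_user/web_password are threaded through above).
# A minimal Python 2 sketch with urllib2, shown only to make the download
# step concrete; the project's real helper may differ.
import urllib2

def retrieve_remote_file(url, destfilename, web_user=None, web_password=None):
    if web_user:
        password_mgr = urllib2.HTTPPasswordMgrWithDefaultRealm()
        password_mgr.add_password(None, url, web_user, web_password)
        opener = urllib2.build_opener(urllib2.HTTPBasicAuthHandler(password_mgr))
    else:
        opener = urllib2.build_opener()
    response = opener.open(url)
    with open(destfilename, 'wb') as fd:
        fd.write(response.read())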