import os
from pathlib import Path


def get_file_with_current_path(list_path_to_file):
    # Resolve each given path against the project root (two levels up
    # from this file).
    path = Path(__file__).parent.parent
    res = []
    for path_to_file in list_path_to_file:
        res.append(os.path.join(path, path_to_file))
    return File(list_name=res)
def to_file(date, mars_class):
    filename = date.strftime('%Y%m')
    file_path = "downloads/cams.{}.{}.grib".format(filename, mars_class)
    return File(filename=filename, path=file_path, priority=1, modify_date=date)
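# Usage sketch for the converter above; the date and the MARS class "fc"
# are illustrative values, not taken from the original module.
from datetime import datetime

example = to_file(datetime(2020, 1, 1), "fc")
# example.path == "downloads/cams.202001.fc.grib"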
def get_database(config, setting):
    if config == "mongo":
        return Mongo(setting)
    elif config == "file":
        return File(setting)
    else:
        # Unknown backends fall back to Mongo.
        return Mongo(setting)
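# Minimal usage sketch for the factory above, assuming Mongo and File both
# accept a single settings argument (their constructors are not shown in
# this snippet).
db = get_database("mongo", {"host": "localhost", "port": 27017})
fallback = get_database("unknown-backend", {})  # falls back to Mongo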
def test_delete_album(app, json_albums):
    # Skip empty or whitespace-only album names.
    if json_albums.name.isspace() or not json_albums.name:
        return
    albums = [json_albums]
    # Create any albums that do not exist yet so they can be deleted.
    list_missing_items = app.album.missing_albums(albums)
    for item in list_missing_items:
        app.album.create(
            Album(name=item.name, privacy="private_but_link",
                  description="album for delete"))
    list_albums_for_delete = app.album.info_about_albums_by_name(albums)
    # Move any files out of the albums before deleting them.
    list_files_in_random_albums = app.file.get_images_in_albums(
        list_albums_for_delete)
    if list_files_in_random_albums:
        files_for_move = File(
            list_name=[x.name for x in list_files_in_random_albums])
        app.album.move_to_album([Album(name="")], files_for_move)
    app.album.delete(list_albums_for_delete)
    new_list_albums = app.album.get_album_list()
    diff_items = app.album.difference_in_lists_album(albums, new_list_albums)
    assert diff_items == albums
    assert len(new_list_albums) == app.album.count_ui_albums()
import os


def to_file(file_tuple, prefix_path):
    path, meta_information = file_tuple
    full_path = '{0}{1}'.format(prefix_path, path)
    # The station-description file gets a higher download priority.
    priority = 1 if path.endswith('Beschreibung_Stationen.txt') else 0
    # path is a plain string here, so take the basename; the original
    # path.name only works on a pathlib.Path.
    return File(filename=os.path.basename(path), path=full_path,
                meta_information=meta_information, priority=priority)
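# Hypothetical call with a (name, facts) tuple of the kind ftplib's mlsd()
# yields; the values below are made up for illustration.
f = to_file(("Beschreibung_Stationen.txt", {"size": "2048"}),
            "ftp://opendata.example/climate/")
# f.priority == 1 because the name ends with "Beschreibung_Stationen.txt"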
import os
from os import listdir
from os.path import isfile, join
from pathlib import Path


def get_all_files_with_current_path(folder):
    # Collect absolute paths of all regular files in <project root>/<folder>.
    path = os.path.join(Path(__file__).parent.parent, folder)
    only_files = [f for f in listdir(path) if isfile(join(path, f))]
    res = []
    for i in only_files:
        res.append(os.path.join(path, i))
    return File(list_name=res)
import os


def extract_file_name(files):
    # Keep only the bare file names, without directories or extensions.
    files_name = []
    for name in files.list_name:
        filename_w_ext = os.path.basename(name)
        filename, file_extension = os.path.splitext(filename_w_ext)
        files_name.append(filename)
    return File(list_name=files_name)
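# Example run, assuming File simply stores the list_name keyword it is
# given (consistent with how files.list_name is read above).
images = File(list_name=["/tmp/photos/atom.jpg", "/tmp/photos/ledy_01.png"])
extract_file_name(images).list_name  # -> ['atom', 'ledy_01']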
def __init__(self, url, mime, image):
    ''' Constructor. '''
    parsed = urllib.parse.urlparse(url)
    name = os.path.basename(parsed.path)
    self.upstream_url = url
    self.file = File(name, mime, image)
    now = datetime.now()
    self.start_date = now
    self.end_date = now
    # Build a JPEG thumbnail of the image in memory.
    thumb_file = io.BytesIO(image)
    thumb = Image.open(thumb_file)
    # Handle files that are in palette rather than RGB mode.
    if thumb.mode != 'RGB':
        thumb = thumb.convert('RGB')
    thumb.thumbnail(THUMB_SIZE)
    # Write the thumbnail to a fresh buffer; rewinding and reusing the
    # input buffer would leave trailing bytes of the original image
    # whenever the JPEG comes out smaller.
    out_file = io.BytesIO()
    thumb.save(out_file, format='JPEG')
    self.thumb_file = File('thumb-{}'.format(name), mime,
                           out_file.getvalue())
def test_move_random(app):
    files = app.random_existing_items(item="file", random_number=3)
    files_name = File(list_name=[x.name for x in files])
    album = app.random_existing_items(item="album", random_number=1)
    if not album:
        # No album exists yet, so create one to move the files into.
        new_album = Album(name=random_string(max_len_str=10),
                          privacy=random_existing_item(
                              ["public", "private_but_link"]))
        app.album.create(new_album)
        album = app.album.info_about_albums_by_name([new_album])
    app.album.move_to_album(album=album, file=files_name)
    new_info_about_files = app.file.get_info_about_file(files_name)
    for item in new_info_about_files:
        assert item.id_album == album[0].id_album
def get_file_list(self):
    wd = self.app.wd
    self.app.navigation.open_images()
    files = []
    list_files = wd.find_elements_by_css_selector(".list-item")
    for item in list_files:
        id_file = item.get_attribute("data-id")
        name = item.get_attribute("data-title")
        description = item.get_attribute("data-description")
        id_album = item.get_attribute("data-album-id")
        privacy = item.get_attribute("data-privacy")
        files.append(
            File(id_file=id_file, name=name, description=description,
                 id_album=id_album, privacy=privacy))
    return files
def fetch_all_meta_information(self, path=None):
    if path is None:
        return []
    ftp = FTP(self.server)
    ftp.login(self.username, self.password)
    ftp.cwd(path)
    ls = ftp.mlsd()
    files = []
    for entry in ls:
        folder, meta_information = entry
        if is_directory(entry):
            # Recurse into main directories; other directories are skipped.
            if is_main_directory(entry):
                files.extend(self.fetch_all_meta_information(
                    '{0}{1}/'.format(path, folder)))
        else:
            file = File('{0}{1}'.format(path, folder), meta_information)
            files.append(file)
    ftp.quit()
    return files
def test_delete_random_album(app):
    random_albums = app.random_existing_items(item="album", random_number=2)
    if not random_albums:
        # No albums exist yet, so create one and re-read its info; without
        # this re-read the test would go on operating on an empty list.
        new_album = Album(name=random_string(10),
                          description=random_string(50),
                          privacy=random_existing_item(
                              ["public", "private_but_link"]))
        app.album.create(new_album)
        random_albums = app.album.info_about_albums_by_name([new_album])
    # Move any files out of the albums before deleting them.
    list_files_in_random_albums = app.file.get_images_in_albums(random_albums)
    if list_files_in_random_albums:
        files_for_move = File(
            list_name=[x.name for x in list_files_in_random_albums])
        app.album.move_to_album([Album(name="")], files_for_move)
    app.album.delete(random_albums)
    new_list_albums = app.album.get_album_list()
    diff_items = app.album.difference_in_lists_album(random_albums,
                                                     new_list_albums)
    assert diff_items == random_albums
    assert len(new_list_albums) == app.album.count_ui_albums()
def replace_name(file):
    res = [name.replace("_", "-") for name in file.list_name]
    return File(list_name=res)
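# Example: underscores become hyphens; other characters are untouched.
replace_name(File(list_name=["tnc_69881045", "image_01"])).list_name
# -> ['tnc-69881045', 'image-01']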
def to_file(row):
    date, path = row
    return File(path=path, modify_date=date)
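# Usage sketch, assuming rows are (modify_date, path) pairs; the values
# below are illustrative.
import datetime

row_file = to_file((datetime.datetime(2021, 5, 1),
                    "downloads/cams.202105.fc.grib"))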
from model.file import File
from model.album import Album

testdata = [
    [File(list_name=["atom", "ae", "179px"]),
     Album(name="random albums name"), None],
    [File(list_name=["ledy", "639px", "atom"]), Album(name=""), None],
    [None, Album(name="2 album for move files"), "all"],
    [None, Album(name=""), "all"],
]
import os
import re


def read_file(path):
    # filename_regex and get_filenames are assumed to be defined elsewhere
    # in this module.
    filename = ''
    location = ''
    lines = []
    try:
        with open(path, 'r') as file:
            lines = file.read().split('\n')
        location = os.path.abspath(path)
        re_match = re.search(filename_regex, path)
        filename = re_match.group()
    except (OSError, IOError):
        # The defaults above keep this branch from returning undefined names.
        print(f'File: {os.path.abspath(path)} not found.')
    return (filename, location, lines)


filenames = get_filenames('sample-es6/src')
files = []
for filename in filenames:
    file_read = read_file(filename)
    files.append(File(file_read[0], file_read[1], file_read[2]))

for file in files:
    print(file)
from model.file import File

testdata = [
    File(dir='tnc_69881045.jpg'),
    File(dir='archive.rar'),
    File(dir='archive.zip'),
    File(dir='video.avi'),
    File(dir='video.mkv'),
    File(dir='video.mp4'),
    File(dir='video.mpg'),
    File(dir='image.bmp'),
    File(dir='image.jpg'),
    File(dir='image.tif'),
    File(dir='book.fb2'),
    File(dir='audio.mp3'),
    File(dir='table.csv'),
    File(dir='table.xlsx'),
    File(dir='text.docx'),
    File(dir='text.html'),
    File(dir='text.pdf'),
    File(dir='text.rtf'),
    File(dir='text.txt'),
]
def newFile(self):
    self._curFile = File(bpm=DEFAULT_BPM)
def get_paper(self, paper_url=None, paper_id=None):
    """
    Load paper details for the paper given by detail page URL or
    numeric ID.
    """
    # Build the URL from the numeric ID unless a detail page URL was
    # passed in (the docstring promises either one works).
    if paper_url is None:
        paper_url = ('%svo020.asp?VOLFDNR=%s'
                     % (self.config['scraper']['base_url'], paper_id))
    logging.info("Getting paper %s from %s", paper_id, paper_url)

    # Simple re-try concept, because AllRis sometimes misses start tags
    # at the first request.
    try_counter = 0
    while True:
        try:
            response = self.get_url(paper_url)
            if not response:
                return
            if "noauth" in response.url:
                logging.warning("Paper %s in %s seems to be private",
                                paper_id, paper_url)
                return
            text = response.text
            doc = html.fromstring(text)
            data = {}

            # Check the Beratungsfolge table; let's hope we always have
            # this table.
            table = self.table_css(doc)[0]
            self.consultation_list_start = False
            last_headline = ''
            # Dates collected from the consultation list; initialised here
            # so the published-date code below also works when no
            # consultation list is found.
            date_list = []
            for line in table:
                if line.tag == 'tr':
                    headline = line[0].text
                elif line.tag == 'td':
                    headline = line.text
                else:
                    logging.error("ERROR: Serious error in data table. "
                                  "Unable to parse.")
                if headline:
                    headline = headline.split(":")[0].lower()
                    if headline[-1] == ":":
                        headline = headline[:-1]
                    if headline == "betreff":
                        value = line[1].text_content().strip()
                        # There is an HTML comment with a script tag in
                        # front of the text which we remove.
                        value = value.split("-->")[1]
                        # Remove all multiple spaces from the string.
                        data[headline] = " ".join(value.split())
                    elif headline in ['verfasser', u'federführend',
                                      'drucksache-art']:
                        data[headline] = line[1].text.strip()
                    elif headline in ['status']:
                        data[headline] = line[1].text.strip()
                        # Related papers.
                        if len(line) > 2:
                            if len(line[3]):
                                # Gets the originalId. Is there something
                                # else at this position? (will break)
                                # A separate name is used here so the
                                # paper_id of the paper itself is not
                                # clobbered.
                                related_id = line[3][0][0][1][0].get(
                                    'href').split('=')[1].split('&')[0]
                                data['relatedPaper'] = [
                                    Paper(originalId=related_id)]
                    # Lots of scraping just because of the date (?).
                    elif headline == "beratungsfolge":
                        # The actual list will be in the next row inside a
                        # table, so we only set a marker.
                        self.consultation_list_start = True
                    elif self.consultation_list_start:
                        elem = line[0][0]
                        # The first line is pixel images, so skip it, then
                        # we need to jump in steps of two.
                        amount = (len(elem) - 1) // 2
                        consultations = []
                        i = 0
                        item = None
                        for elem_line in elem:
                            if i == 0:
                                i += 1
                                continue
                            # Here we need to parse the actual list, which
                            # can have different forms. A complex example
                            # can be found at
                            # http://ratsinfo.aachen.de/bi/vo020.asp?VOLFDNR=10822
                            # The first line is some sort of headline with
                            # the committee in question and the type of
                            # consultation. After that come 0-n lines of
                            # detailed information about meetings with a
                            # date, transcript and decision. The first
                            # line has 3 columns (thanks to colspan) and
                            # the others have 7. Here we make every
                            # meeting a separate entry; we can group them
                            # together again later if we want to.
                            new_consultation = Consultation()
                            new_consultation.status = \
                                elem_line[0].attrib['title'].lower()
                            if len(elem_line) == 3:
                                # The order is "color/status", name of
                                # committee / link to TOP, more info. We
                                # define a head entry here which can be
                                # shared for the other lines; once we find
                                # another head line we create a new one.
                                new_consultation.role = \
                                    elem_line[2].text.strip()
                                # Name of committee, e.g.
                                # "Finanzausschuss", unfortunately without
                                # an id:
                                # 'committee': elem_line[1].text.strip()
                            # For some obscure reason, action is sometimes
                            # missing.
                            elif len(elem_line) == 2:
                                # The order is "color/status", name of
                                # committee / link to TOP, more info.
                                status = \
                                    elem_line[0].attrib['title'].lower()
                            elif len(elem_line) == 7:
                                try:
                                    # This is about line 2, with lots more
                                    # stuff to process. The date can be
                                    # text or a link with that text.
                                    if len(elem_line[1]) == 1:
                                        # We have a link (and ignore it).
                                        date_text = elem_line[1][0].text
                                    else:
                                        date_text = elem_line[1].text
                                    date_list.append(
                                        datetime.datetime.strptime(
                                            date_text.strip(), "%d.%m.%Y"))
                                    if len(elem_line[2]):
                                        # Form with silfdnr and toplfdnr,
                                        # but only in the link (action=
                                        # "to010.asp?topSelected=57023").
                                        form = elem_line[2][0]
                                        meeting_id = \
                                            form[0].attrib['value']
                                        new_consultation.meeting = [
                                            Meeting(originalId=meeting_id)]
                                        # Full name of meeting, e.g.
                                        # "A/31/WP.16 öffentliche/
                                        # nichtöffentliche Sitzung des
                                        # Finanzausschusses":
                                        # item['meeting'] = \
                                        #     elem_line[3][0].text.strip()
                                    else:
                                        # No link to the TOP. Should not
                                        # be possible, but happens.
                                        # (TODO: bug report?)
                                        # Here we have no link but the
                                        # text is in the TD directly; it
                                        # would be scraped as the meeting:
                                        # item['meeting'] = \
                                        #     elem_line[3].text.strip()
                                        logging.warning(
                                            "AgendaItem in consultation "
                                            "list on the web page does "
                                            "not contain a link to the "
                                            "actual meeting at paper %s",
                                            paper_url)
                                    toplfdnr = None
                                    if len(elem_line[6]) > 0:
                                        form = elem_line[6][0]
                                        toplfdnr = form[0].attrib['value']
                                    if toplfdnr:
                                        new_consultation.originalId = \
                                            "%s-%s" % (toplfdnr, paper_id)
                                        # Actually the id of the
                                        # transcript.
                                        new_consultation.agendaItem = \
                                            AgendaItem(
                                                originalId=toplfdnr)
                                        # e.g. "ungeändert beschlossen".
                                        new_consultation.agendaItem.result \
                                            = elem_line[4].text.strip()
                                        consultations.append(
                                            new_consultation)
                                    else:
                                        logging.error(
                                            "missing agendaItem ID in "
                                            "consultation list at %s",
                                            paper_url)
                                except (IndexError, KeyError):
                                    logging.error(
                                        "Serious error in consultation "
                                        "list. Unable to parse.")
                                    return []
                            i += 1
                        # Theory: we don't need this at all, because it's
                        # scraped at the meeting:
                        # data['consultations'] = consultations
                        # Set the marker to False again as we have read
                        # the list.
                        self.consultation_list_start = False
                    last_headline = headline
            # We simply ignore the rest (there might not be much more
            # actually). The actual text comes after the table in a div,
            # but it's not valid XML or HTML, thus the regex.
            data['docs'] = self.body_re.findall(response.text)

            # The published date is the earliest consultation date, if
            # any were found.
            first_date = min(date_list) if date_list else False

            paper = Paper(originalId=paper_id)
            paper.originalUrl = paper_url
            paper.name = data['betreff']
            paper.description = data['docs']
            if 'drucksache-art' in data:
                paper.paperType = data['drucksache-art']
            if first_date:
                paper.publishedDate = first_date.strftime("%d.%m.%Y")
            # See the theory above:
            # if 'consultations' in data:
            #     paper.consultation = data['consultations']
            paper.auxiliaryFile = []

            # Get the attachments, step 1 (Drucksache).
            file_1 = self.attachment_1_css(doc)
            if len(file_1):
                if file_1[0].value:
                    href = ('%sdo027.asp'
                            % self.config['scraper']['base_url'])
                    original_id = file_1[0].value
                    name = 'Drucksache'
                    main_file = File(originalId=original_id, name=name)
                    main_file = self.get_file(main_file, href, True)
                    paper.mainFile = main_file

            # Get the attachments, step 2 (additional attachments).
            files = self.attachments_css(doc)
            if len(files) > 0:
                if len(files[0]) > 1:
                    if files[0][1][0].text.strip() == "Anlagen:":
                        for tr in files[0][2:]:
                            link = tr[0][0]
                            href = ("%s%s" % (
                                self.config['scraper']['base_url'],
                                link.attrib["href"]))
                            name = link.text
                            path_tokens = link.attrib["href"].split('/')
                            original_id = "%d-%d" % (
                                int(path_tokens[4]), int(path_tokens[6]))
                            aux_file = File(originalId=original_id,
                                            name=name)
                            aux_file = self.get_file(aux_file, href)
                            paper.auxiliaryFile.append(aux_file)
            print(paper.auxiliaryFile)
            if not len(paper.auxiliaryFile):
                del paper.auxiliaryFile
            oid = self.db.save_paper(paper)
            return
        except (KeyError, IndexError):
            if try_counter < 3:
                logging.info("Try again: getting paper %s from %s",
                             paper_id, paper_url)
                try_counter += 1
            else:
                logging.error("Failed getting paper %s from %s",
                              paper_id, paper_url)
                return
def setUp(self):
    config = ""
    self.client = File(config)
    self.client.insert_previous_html(
        "<html><body><h1>TEST</h1></body></html>")
    self.client.insert_previous_diff("html > body > h1")