Exemplo n.º 1
0
def get_file_with_current_path(list_path_to_file):
    """Resolve each relative path against the project root (two levels
    above this module) and wrap the absolute paths in a File record."""
    base_dir = Path(__file__).parent.parent
    absolute_paths = [os.path.join(base_dir, rel) for rel in list_path_to_file]
    return File(list_name=absolute_paths)
Exemplo n.º 2
0
def to_file(date, mars_class):
    """Build the File record for the CAMS GRIB download of *date*.

    The filename is the YYYYMM stamp of *date*; the path points into the
    local ``downloads`` directory.
    """
    stamp = date.strftime('%Y%m')
    grib_path = "downloads/cams.{}.{}.grib".format(stamp, mars_class)
    return File(filename=stamp,
                path=grib_path,
                priority=1,
                modify_date=date)
Exemplo n.º 3
0
 def get_database(config, setting):
     """Return the database backend selected by *config*.

     "file" yields a File backend; "mongo" and any unrecognised value
     both fall back to Mongo.
     """
     if config == "file":
         return File(setting)
     # "mongo" and every unknown config resolve to the Mongo backend.
     return Mongo(setting)
Exemplo n.º 4
0
def test_delete_album(app, json_albums):
    """End-to-end check that deleting an album removes exactly that album.

    Ensures the album exists (creating it if missing), empties it by
    moving its files away, deletes it, and verifies the resulting album
    list both by diffing and by the UI album count.
    """
    # if data_albums.name.isspace() or not data_albums.name:
    # Guard: an empty or whitespace-only name cannot be created/matched by name.
    if json_albums.name.isspace() or not json_albums.name:
        return
    albums = [json_albums]
    # albums = [data_albums]

    list_missing_items = app.album.missing_albums(albums)

    # Create any album from the fixture that does not exist on the site yet.
    if len(list_missing_items) > 0:
        for item in list_missing_items:
            app.album.create(
                Album(name=item.name,
                      privacy="private_but_link",
                      description="album for delete"))
    list_albums_for_delete = app.album.info_about_albums_by_name(albums)
    list_files_in_random_albums = app.file.get_images_in_albums(
        list_albums_for_delete)

    # Empty the albums before deleting them: files are moved to the album
    # with an empty name (presumably the default album — confirm).
    if list_files_in_random_albums:
        files_for_move = File(
            list_name=list([x.name for x in list_files_in_random_albums]))
        app.album.move_to_album([Album(name="")], files_for_move)

    app.album.delete(list_albums_for_delete)
    new_list_albums = app.album.get_album_list()
    diff_items = app.album.difference_in_lists_album(albums, new_list_albums)
    assert diff_items == albums
    assert len(new_list_albums) == app.album.count_ui_albums()
Exemplo n.º 5
0
def to_file(file_tuple, prefix_path):
    """Convert a ``(path, meta_information)`` tuple into a File record.

    Accepts *path* as either a plain string or a pathlib path.  The
    original code mixed ``path.name`` (pathlib-only) with
    ``path.endswith`` (str-only), so one of the two attribute accesses
    always failed depending on the input type.
    """
    from pathlib import PurePosixPath

    path, meta_information = file_tuple
    full_path = '{0}{1}'.format(prefix_path, path)
    path_str = str(path)
    # The station-description file gets priority over ordinary data files.
    priority = 1 if path_str.endswith('Beschreibung_Stationen.txt') else 0
    return File(filename=PurePosixPath(path_str).name,
                path=full_path,
                meta_information=meta_information,
                priority=priority)
Exemplo n.º 6
0
def get_all_files_with_current_path(folder):
    """Return a File listing the absolute path of every regular file in
    *folder*, resolved against the project root (two levels above this
    module).

    Consolidated on pathlib: the original mixed ``os.path.join``,
    ``listdir``/``isfile``/``join`` and ``Path`` for the same job.
    """
    base = Path(__file__).parent.parent / folder
    # str() keeps the original contract of returning plain path strings.
    res = [str(entry) for entry in base.iterdir() if entry.is_file()]
    return File(list_name=res)
Exemplo n.º 7
0
def extract_file_name(files):
    """Strip directory and extension from every entry of ``files.list_name``
    and return the bare stems wrapped in a new File."""
    stems = [
        os.path.splitext(os.path.basename(full_name))[0]
        for full_name in files.list_name
    ]
    return File(list_name=stems)
Exemplo n.º 8
0
    def __init__(self, url, mime, image):
        '''Store the original image as a File and build a JPEG thumbnail.

        :param url: upstream URL the image was fetched from
        :param mime: MIME type of the raw image
        :param image: raw image bytes
        '''

        # File name is the last path segment of the upstream URL.
        parsed = urllib.parse.urlparse(url)
        name = os.path.basename(parsed.path)

        self.upstream_url = url
        self.file = File(name, mime, image)
        now = datetime.now()
        self.start_date = now
        self.end_date = now

        thumb = Image.open(io.BytesIO(image))
        # Handle files that are in palette rather than RGB mode
        if thumb.mode != 'RGB':
            thumb = thumb.convert('RGB')
        thumb.thumbnail(THUMB_SIZE)
        # Write the thumbnail into a fresh buffer.  The original code reused
        # the input buffer without truncating it, so a JPEG smaller than the
        # source image left stale trailing bytes after the JPEG data.
        thumb_out = io.BytesIO()
        thumb.save(thumb_out, format='JPEG')
        self.thumb_file = File('thumb-{}'.format(name), mime,
                               thumb_out.getvalue())
Exemplo n.º 9
0
def test_move_random(app):
    """Move 3 random existing files into a (possibly freshly created)
    album and verify each moved file now reports that album's id."""
    files = app.random_existing_items(item="file", random_number=3)
    files_name = File(list_name=[x.name for x in files])
    album = app.random_existing_items(item="album", random_number=1)

    # No album available yet: create one with a random name/privacy and
    # re-read it to obtain the server-side info (e.g. its id_album).
    if not album:
        new_album = Album(name=random_string(max_len_str=10),
                          privacy=random_existing_item(
                              ["public", "private_but_link"]))
        app.album.create(new_album)
        album = app.album.info_about_albums_by_name([new_album])
    app.album.move_to_album(album=album, file=files_name)
    new_info_about_files = app.file.get_info_about_file(files_name)

    for item in new_info_about_files:
        assert item.id_album == album[0].id_album
Exemplo n.º 10
0
 def get_file_list(self):
     """Open the images page and return one File record per list item,
     populated from the element's data-* attributes."""
     wd = self.app.wd
     self.app.navigation.open_images()
     items = wd.find_elements_by_css_selector(".list-item")
     return [
         File(id_file=element.get_attribute("data-id"),
              name=element.get_attribute("data-title"),
              description=element.get_attribute("data-description"),
              id_album=element.get_attribute("data-album-id"),
              privacy=element.get_attribute("data-privacy"))
         for element in items
     ]
Exemplo n.º 11
0
    def fetch_all_meta_information(self, path=None):
        """Recursively walk *path* on the FTP server and collect a File
        for every non-directory entry.

        Returns an empty list when *path* is None.  Main directories are
        descended into; other directories are skipped (unchanged from the
        original behaviour).
        """
        if path is None:
            return []

        ftp = FTP(self.server)
        ftp.login(self.username, self.password)
        try:
            ftp.cwd(path)
            files = []
            for entry in ftp.mlsd():
                folder, meta_information = entry
                if is_directory(entry):
                    if is_main_directory(entry):
                        files.extend(
                            self.fetch_all_meta_information('{0}{1}/'.format(
                                path, folder)))
                else:
                    files.append(
                        File('{0}{1}'.format(path, folder), meta_information))
        finally:
            # Always close the control connection; the original leaked it
            # whenever cwd/mlsd or the recursive call raised.
            ftp.quit()
        return files
Exemplo n.º 12
0
def test_delete_random_album(app):
    """Delete up to 2 random albums and verify the album list shrank by
    exactly those albums (UI album count must agree as well)."""
    random_albums = app.random_existing_items(item="album", random_number=2)

    # NOTE(review): when no albums exist, one is created here but
    # random_albums stays empty, so the delete/diff below operates on an
    # empty list — confirm this is the intended behaviour.
    if not random_albums:
        app.album.create(
            Album(name=random_string(10),
                  description=random_string(50),
                  privacy=random_existing_item(["public",
                                                "private_but_link"])))

    list_files_in_random_albums = app.file.get_images_in_albums(random_albums)

    # Empty the albums before deleting them: contained files are moved to
    # the album with an empty name.
    if list_files_in_random_albums:
        files_for_move = File(
            list_name=list([x.name for x in list_files_in_random_albums]))
        app.album.move_to_album([Album(name="")], files_for_move)

    app.album.delete(random_albums)
    new_list_albums = app.album.get_album_list()
    diff_items = app.album.difference_in_lists_album(random_albums,
                                                     new_list_albums)
    # assert sorted(diff_items, key=lambda albums: albums.name) == sorted(random_albums, key=lambda albums: albums.name)
    assert diff_items == random_albums
    assert len(new_list_albums) == app.album.count_ui_albums()
Exemplo n.º 13
0
def replace_name(file):
    """Return a File whose names have every underscore swapped for a hyphen."""
    renamed = [item.replace("_", "-") for item in file.list_name]
    return File(list_name=renamed)
Exemplo n.º 14
0
 def to_file(row):
     """Map a (modify_date, path) row onto a File record."""
     modify_date, file_path = row
     return File(path=file_path, modify_date=modify_date)
Exemplo n.º 15
0
from model.file import File
from model.album import Album

# Parametrised test data: each entry is [File with names to move (or None),
# destination Album, selector].  NOTE(review): "all" presumably selects
# every file when no explicit File list is given — confirm against the
# test that consumes this fixture.
testdata = [[
    File(list_name=["atom", "ae", "179px"]),
    Album(name="random albums name"), None
], [File(list_name=["ledy", "639px", "atom"]),
    Album(name=""), None], [None,
                            Album(name="2 album for move files"), "all"],
            [None, Album(name=""), "all"]]
    try:
        file = open(path, 'r')
        lines = file.read().split('\n')
        file.close()
        location = os.path.join(os.path.abspath(path))
        re_match = re.search(filename_regex, path)
        filename = re_match.group()

    except (OSError, IOError):
        print(f'File: {os.path.join(os.path.abspath(path))} not found.')
        lines = []

    return (filename, location, lines)


filenames = get_filenames('sample-es6/src')

# Read every discovered source file and wrap its (name, location, lines)
# triple in a File record.
files = [File(*read_file(filename)) for filename in filenames]

for file in files:
    print(file)
Exemplo n.º 17
0
from model.file import File

# One fixture file per extension the application must recognise
# (images, archives, video, audio, documents, plain data).
_FIXTURE_NAMES = [
    'tnc_69881045.jpg',
    'archive.rar',
    'archive.zip',
    'video.avi',
    'video.mkv',
    'video.mp4',
    'video.mpg',
    'image.bmp',
    'image.jpg',
    'image.tif',
    'book.fb2',
    'audio.mp3',
    'table.csv',
    'table.xlsx',
    'text.docx',
    'text.html',
    'text.pdf',
    'text.rtf',
    'text.txt',
]

testdata = [File(dir=name) for name in _FIXTURE_NAMES]
Exemplo n.º 18
0
 def newFile(self):
     """Replace the current document with a fresh File at the default BPM."""
     self._curFile = File(bpm=DEFAULT_BPM)
Exemplo n.º 19
0
    def get_paper(self, paper_url=None, paper_id=None):
        """
        Load paper details for the paper given by detail page URL
        or numeric ID
        """
        paper_url = ('%svo020.asp?VOLFDNR=%s' %
                     (self.config['scraper']['base_url'], paper_id))
        logging.info("Getting paper %d from %s", paper_id, paper_url)

        # Stupid re-try concept because AllRis sometimes misses
        # start < at tags at first request.
        try_counter = 0
        while True:
            try:
                response = self.get_url(paper_url)
                if not response:
                    return
                if "noauth" in response.url:
                    logging.warning("Paper %s in %s seems to private",
                                    paper_id, paper_url)
                    return
                text = response.text
                doc = html.fromstring(text)
                data = {}

                # Beratungsfolge-Table checken
                # lets hope we always have this table
                table = self.table_css(doc)[0]
                self.consultation_list_start = False
                last_headline = ''
                for line in table:
                    if line.tag == 'tr':
                        headline = line[0].text
                    elif line.tag == 'td':
                        headline = line.text
                    else:
                        logging.error("ERROR: Serious error in data table. "
                                      "Unable to parse.")
                    if headline:
                        headline = headline.split(":")[0].lower()
                        if headline[-1] == ":":
                            headline = headline[:-1]
                        if headline == "betreff":
                            value = line[1].text_content().strip()
                            # There is some html comment with a script
                            # tag in front of the text which we remove.
                            value = value.split("-->")[1]
                            # remove all multiple spaces from the string
                            data[headline] = " ".join(value.split())
                        elif headline in [
                                'verfasser', u'federführend', 'drucksache-art'
                        ]:
                            data[headline] = line[1].text.strip()
                        elif headline in ['status']:
                            data[headline] = line[1].text.strip()
                            # related papers
                            if len(line) > 2:
                                if len(line[3]):
                                    # Gets originalId. is there something
                                    # else at this position? (will break)
                                    paper_id = line[3][0][0][1][0].get(
                                        'href').split('=')[1].split('&')[0]
                                    data['relatedPaper'] = [
                                        Paper(originalId=paper_id)
                                    ]

                        # Lot's of scraping just because of the date (?)
                        elif headline == "beratungsfolge":
                            # The actual list will be in the next row
                            # inside a table, so we only set a marker.
                            self.consultation_list_start = True
                        elif self.consultation_list_start:
                            elem = line[0][0]
                            # The first line is pixel images, so skip
                            # it, then we need to jump in steps of two.
                            amount = (len(elem) - 1) / 2
                            consultations = []
                            date_list = []
                            i = 0
                            item = None
                            for elem_line in elem:
                                if i == 0:
                                    i += 1
                                    continue
                                """
                                Here we need to parse the actual list which can have different forms. A complex example
                                can be found at http://ratsinfo.aachen.de/bi/vo020.asp?VOLFDNR=10822
                                The first line is some sort of headline with the committee in question and the type of consultation.
                                After that 0-n lines of detailed information of meetings with a date, transscript and decision.
                                The first line has 3 columns (thanks to colspan) and the others have 7.

                                Here we make every meeting a separate entry, we can group them together later again if we want to.
                                """

                                # now we need to parse the actual list
                                # those lists
                                new_consultation = Consultation()
                                new_consultation.status = \
                                        elem_line[0].attrib['title'].lower()
                                if len(elem_line) == 3:
                                    # The order is "color/status", name of
                                    # committee / link to TOP, more info we
                                    # define a head dict here which can be
                                    # shared for the other lines once we find
                                    # another head line we will create a new
                                    # one here.
                                    new_consultation.role = \
                                            elem_line[2].text.strip()

                                    # Name of committee, e.g.
                                    # "Finanzausschuss", unfort. without id
                                    #'committee' : elem_line[1].text.strip(),
                                # For some obscure reasons sometimes action
                                # is missing.
                                elif len(elem_line) == 2:
                                    # The order is "color/status", name of
                                    # committee / link to TOP, more info.
                                    status = \
                                            elem_line[0].attrib['title'].lower()
                                    # We define a head dict here which can be
                                    # shared for the other lines once we find
                                    # another head line we will create a new
                                    # one here.
                                    # name of committee, e.g.
                                    # "Finanzausschuss", unfort. without id
                                    #'committee' : elem_line[1].text.strip(),
                                elif len(elem_line) == 7:
                                    try:
                                        # This is about line 2 with lots of
                                        # more stuff to process.
                                        # Date can be text or a link with that
                                        # text.
                                        # We have a link (and ignore it).
                                        if len(elem_line[1]) == 1:
                                            date_text = elem_line[1][0].text
                                        else:
                                            date_text = elem_line[1].text
                                        date_list.append(
                                            datetime.datetime.strptime(
                                                date_text.strip(), "%d.%m.%Y"))
                                        if len(elem_line[2]):
                                            # Form with silfdnr and toplfdnr
                                            # but only in link (action=
                                            #   "to010.asp?topSelected=57023")
                                            form = elem_line[2][0]
                                            meeting_id = form[0].attrib[
                                                'value']
                                            new_consultation.meeting = [
                                                Meeting(originalId=meeting_id)
                                            ]
                                            # Full name of meeting, e.g.
                                            # "A/31/WP.16 öffentliche/
                                            #   nichtöffentliche Sitzung des
                                            # Finanzausschusses"
                                            #item['meeting'] = \
                                            #    elem_line[3][0].text.strip()
                                        else:
                                            # No link to TOP. Should not be
                                            # possible but happens.
                                            #   (TODO: Bugreport?)
                                            # Here we have no link but the text
                                            # is in the TD directly - will be
                                            # scaped as meeting.
                                            #item['meeting'] = \
                                            #    elem_line[3].text.strip()
                                            logging.warning(
                                                "AgendaItem in consultation "
                                                "list on the web page does not "
                                                "contain a link to the actual "
                                                "meeting at paper %s",
                                                paper_url)
                                        toplfdnr = None
                                        if len(elem_line[6]) > 0:
                                            form = elem_line[6][0]
                                            toplfdnr = form[0].attrib['value']
                                        if toplfdnr:
                                            new_consultation.originalId = \
                                                    "%s-%s" % (toplfdnr,
                                                               paper_id)
                                            # actually the id of the transcript
                                            new_consultation.agendaItem = \
                                                    AgendaItem(
                                                        originalId=toplfdnr)
                                            # e.g. "ungeändert beschlossen"
                                            new_consultation.agendaItem.result \
                                                    = elem_line[4].text.strip()
                                            consultations.append(
                                                new_consultation)
                                        else:
                                            logging.error(
                                                "missing agendaItem ID in "
                                                "consultation list at %s",
                                                paper_url)
                                    except (IndexError, KeyError):
                                        logging.error(
                                            "ERROR: Serious error in "
                                            "consultation list. Unable to "
                                            "parse.")
                                        logging.error(
                                            "Serious error in consultation "
                                            "list. Unable to parse.")
                                        return []
                                i += 1
                            # Theory: we don't need this at all, because it's
                            # scraped at meeting.
                            #data['consultations'] = consultations
                            # set the marker to False again as we have read it
                            self.consultation_list_start = False
                    last_headline = headline
                    # We simply ignore the rest (there might not be much more
                    # actually).
                # The actual text comes after the table in a div but it's not
                # valid XML or HTML this using regex.
                data['docs'] = self.body_re.findall(response.text)
                # Earliest consultation date becomes the published date.
                first_date = False
                for single_date in date_list:
                    if first_date:
                        if single_date < first_date:
                            first_date = single_date
                    else:
                        first_date = single_date
                paper = Paper(originalId=paper_id)
                paper.originalUrl = paper_url
                paper.name = data['betreff']
                paper.description = data['docs']
                if 'drucksache-art' in data:
                    paper.paperType = data['drucksache-art']
                if first_date:
                    paper.publishedDate = first_date.strftime("%d.%m.%Y")
                # see theory above
                #if 'consultations' in data:
                #    paper.consultation = data['consultations']
                paper.auxiliaryFile = []
                # get the attachments step 1 (Drucksache)
                file_1 = self.attachment_1_css(doc)
                if len(file_1):
                    if file_1[0].value:
                        href = ('%sdo027.asp' %
                                self.config['scraper']['base_url'])
                        original_id = file_1[0].value
                        name = 'Drucksache'
                        main_file = File(originalId=original_id, name=name)
                        main_file = self.get_file(main_file, href, True)
                        paper.mainFile = main_file
                # get the attachments step 2 (additional attachments)
                files = self.attachments_css(doc)
                if len(files) > 0:
                    if len(files[0]) > 1:
                        if files[0][1][0].text.strip() == "Anlagen:":
                            for tr in files[0][2:]:
                                link = tr[0][0]
                                href = ("%s%s" %
                                        (self.config['scraper']['base_url'],
                                         link.attrib["href"]))
                                name = link.text
                                path_tokens = link.attrib["href"].split('/')
                                original_id = "%d-%d" % (int(
                                    path_tokens[4]), int(path_tokens[6]))
                                aux_file = File(originalId=original_id,
                                                name=name)
                                aux_file = self.get_file(aux_file, href)
                                paper.auxiliaryFile.append(aux_file)
                # Was a bare py2-only `print` debug statement; log instead.
                logging.debug("auxiliary files: %s", paper.auxiliaryFile)
                if not len(paper.auxiliaryFile):
                    del paper.auxiliaryFile
                self.db.save_paper(paper)
                return
            except (KeyError, IndexError):
                if try_counter < 3:
                    logging.info("Try again: Getting paper %d from %s",
                                 paper_id, paper_url)
                    try_counter += 1
                else:
                    logging.error("Failed getting paper %d from %s", paper_id,
                                  paper_url)
                    return
Exemplo n.º 20
0
 def setUp(self):
     """Create a File client with an empty config and seed it with the
     baseline HTML document and diff selector used by the tests."""
     config = ""
     self.client = File(config)
     self.client.insert_previous_html(
         "<html><body><h1>TEST</h1></body></html>")
     self.client.insert_previous_diff("html > body > h1")