Exemplo n.º 1
0
def remove_duplicate_photo_uris_per_file():
    """
    If a file has the same photo URI multiple times, make a new photo entry
    with a union of the tags for each one, and the earlier commitdate.
    TODO: support media duplicates
    """
    max = int(get_max_entity_count())
    for file_path in [PANDA_PATH, ZOO_PATH]:
        section = None
        for section_name in ["zoos", "pandas"]:
            if section_name in file_path.split("/"):
                section = section_name.split("s")[0]  # HACK
        # Enter the pandas subdirectories
        for root, dirs, files in os.walk(file_path):
            for filename in files:
                path = root + os.sep + filename
                # print(path)
                photo_list = PhotoFile(section, path)
                photo_count = photo_list.photo_count()
                photo_index = 1
                seen = {}
                duplicates = {}
                while (photo_index <= photo_count):
                    current_option = "photo." + str(photo_index)
                    current_uri = photo_list.get_field(current_option)
                    current_author_option = current_option + ".author"
                    current_author = photo_list.get_field(
                        current_author_option)
                    current_date_option = current_option + ".commitdate"
                    current_date = photo_list.get_field(current_date_option)
                    current_date_value = datetime_to_unixtime(current_date)
                    current_link_option = current_option + ".link"
                    current_link = photo_list.get_field(current_link_option)
                    current_tags_option = current_option + ".tags"
                    current_tags = photo_list.get_field(current_tags_option)
                    if current_uri in seen:
                        # We have a duplicate
                        seen_date_value = datetime_to_unixtime(
                            seen[current_uri]["commitdate"])
                        seen_tags = seen[current_uri]["tags"]
                        # Resolve dates and tags
                        if (current_date_value < seen_date_value):
                            seen[current_uri][
                                "commitdate"] = current_date_value
                        # Handle when either of the duplicates have no tags
                        if seen_tags == None and current_tags != None:
                            seen[current_uri]["tags"] = current_tags
                        if seen_tags != None and current_tags != None:
                            tag_list = current_tags.split(
                                ", ") + seen_tags.split(", ")
                            tag_list = sorted(list(
                                dict.fromkeys(tag_list)))  # deduplicate tags
                            seen[current_uri]["tags"] = ", ".join(tag_list)
                        # Add to duplicates list in its current form
                        duplicates[current_uri] = seen[current_uri]
                        # Remove from the photo list
                        photo_list.delete_photo(photo_index)
                        photo_list.delete_photo(seen[current_uri]["old_index"])
                    elif current_uri in duplicates:
                        # We have something duplicated more than once
                        seen_date_value = datetime_to_unixtime(
                            duplicates[current_uri]["commitdate"])
                        seen_tags = duplicates[current_uri]["tags"]
                        # Resolve dates and tags
                        if (current_date_value < seen_date_value):
                            duplicates[current_uri][
                                "commitdate"] = current_date_value
                        # Handle when either of the duplicates have no tags
                        if seen_tags == None and current_tags != None:
                            seen[current_uri]["tags"] = current_tags
                        if seen_tags != None and current_tags != None:
                            tag_list = current_tags.split(
                                ", ") + seen_tags.split(", ")
                            tag_list = sorted(list(
                                dict.fromkeys(tag_list)))  # deduplicate tags
                            duplicates[current_uri]["tags"] = ", ".join(
                                tag_list)
                        # Remove from the photo list
                        photo_list.delete_photo(photo_index)
                    else:
                        seen[current_uri] = {}
                        seen[current_uri]["old_index"] = photo_index
                        seen[current_uri]["author"] = current_author
                        seen[current_uri]["commitdate"] = current_date
                        seen[current_uri]["link"] = current_link
                        seen[current_uri]["tags"] = current_tags
                    photo_index = photo_index + 1
                for photo_uri in duplicates.keys():
                    # Add duplicates back to photo file, starting at the newest index
                    photo_option = "photo." + str(photo_index)
                    author_option = photo_option + ".author"
                    author = duplicates[photo_uri]["author"]
                    date_option = photo_option + ".commitdate"
                    date = duplicates[photo_uri]["commitdate"]
                    link_option = photo_option + ".link"
                    link = duplicates[photo_uri]["link"]
                    tags_option = photo_option + ".tags"
                    tags = duplicates[photo_uri]["tags"]
                    photo_list.set_field(photo_option, photo_uri)
                    photo_list.set_field(author_option, author)
                    photo_list.set_field(date_option, date)
                    photo_list.set_field(link_option, link)
                    if (tags != None):
                        photo_list.set_field(tags_option, tags)
                    photo_index = photo_index + 1
                # Update the file if there were any changes, and re-sort the hashes
                duplicate_count = len(duplicates.keys())
                if duplicate_count > 0:
                    print("deduplicated: %s (%s duplicated)" %
                          (path, duplicate_count))
                    photo_list.renumber_photos(max)
                    photo_list.update_file()
                    sort_ig_hashes(path)
Exemplo n.º 2
0
def update_entity_commit_dates(starting_commit, force=False):
    """
    When moving pandas, the old redpandafinder updater logic considered "new" 
    animals as anything that was a new file in a location. So when an animal
    moved zoos, it became _new_ again. Rectify this by tracking when the
    commitdate for each new animal is. Track commit dates for other files too,
    just for the hell of it.
    """
    filename_to_commit_date = {}
    type_id_to_commit_date = {}
    repo = git.Repo(".")
    # List of sha1-name commits from the repo, oldest to newest
    commit_list = list(
        reversed(list(map(lambda x: x.hexsha, repo.iter_commits()))))
    if starting_commit != None:
        try:
            index = commit_list.index(starting_commit)
        except IndexError as e:
            raise CommitError("%s not a valid commit in this repo." %
                              starting_commit)
        commit_list = commit_list[
            index:]  # All after, and including the given commit
    for index, commitish in enumerate(commit_list):
        # End of the commit list? Call it a day
        if commitish == commit_list[len(commit_list) - 1]:
            break
        # Get the diff
        start = commitish
        end = commit_list[index + 1]
        diff_raw = repo.git.diff(start,
                                 end,
                                 ignore_blank_lines=True,
                                 ignore_space_at_eol=True)
        patch = PatchSet(diff_raw)
        for change in patch:
            filename = change.path
            if filename.find(".txt") == -1:
                # Don't care about non-data files
                continue
            elif change.is_added_file == True:
                compare = "./" + filename
                dt = repo.commit(end).committed_datetime
                date = str(dt.year) + "/" + str(dt.month) + "/" + str(dt.day)
                just_file = filename.split("/").pop()
                just_type = None
                just_id = None
                if compare.find(PANDA_PATH) == 0:
                    just_type = "panda"
                    just_id = just_file.split("_")[0]
                elif compare.find(ZOO_PATH) == 0:
                    just_type = "zoo"
                    just_id = just_file.split("_")[0]
                elif compare.find(MEDIA_PATH) == 0:
                    just_type = "media"
                    just_id = filename  # Need full path for media files
                else:
                    continue  # Not a file we're tracking commitdates for
                filename_to_commit_date[just_file] = date
                type_id_to_commit_date[just_type + "_" + just_id] = date
            else:
                continue
    # print(str(filename_to_commit_date))
    # print(str(type_id_to_commit_date))
    # Now walk the repo, find all panda files without commit dates,
    # and add commitdate to each photo that needs one
    for file_path in [MEDIA_PATH, PANDA_PATH, ZOO_PATH]:
        section = None
        for section_name in ["media", "zoos", "pandas"]:
            if section_name in file_path.split("/"):
                section = section_name.split("s")[0]  # HACK
        # Enter the pandas subdirectories
        for root, dirs, files in os.walk(file_path):
            for filename in files:
                path = root + os.sep + filename
                photo_list = PhotoFile(section, path)
                if photo_list.get_field("commitdate") == None:
                    if filename not in filename_to_commit_date:
                        # file's name was changed at some point
                        just_file = filename.split("/").pop()
                        just_type = None
                        just_id = None
                        if path.find(PANDA_PATH) == 0:
                            just_type = "panda"
                            just_id = just_file.split("_")[0]
                        elif path.find(ZOO_PATH) == 0:
                            just_type = "zoo"
                            just_id = just_file.split("_")[0]
                        elif path.find(MEDIA_PATH) == 0:
                            just_type = "media"
                            just_id = path  # Need full path for media files
                        else:
                            continue  # Not a file we're tracking commitdates for
                        just_key = just_type + "_" + just_id
                        if just_key not in type_id_to_commit_date:
                            print("warning: %s commitdate undetermined" %
                                  filename)
                            continue
                        else:
                            date = type_id_to_commit_date[just_key]
                            old_date = photo_list.get_field("commitdate")
                            if ((old_date == None) or (force == True)):
                                photo_list.set_field("commitdate", date)
                    else:
                        date = filename_to_commit_date[filename]
                        old_date = photo_list.get_field("commitdate")
                        if ((old_date == None) or (force == True)):
                            photo_list.set_field("commitdate", date)
                    photo_list.update_file()
Exemplo n.º 3
0
def update_photo_commit_dates(starting_commit, force=False):
    """
    The old redpandafinder update logic only worked on the basis of commits
    in the last week or so. When files are re-sorted, added, or removed for
    periods of time, it becomes meaningful to search the entire git repo,
    find when a photo URI first appeared, and then track it using its first
    commit-date into redpandafinder.
    """
    uri_to_commit_date = {}
    repo = git.Repo(".")
    # List of sha1-name commits from the repo, oldest to newest
    commit_list = list(
        reversed(list(map(lambda x: x.hexsha, repo.iter_commits()))))
    if starting_commit != None:
        try:
            index = commit_list.index(starting_commit)
        except IndexError as e:
            raise CommitError("%s not a valid commit in this repo." %
                              starting_commit)
        commit_list = commit_list[
            index:]  # All after, and including the given commit
    for index, commitish in enumerate(commit_list):
        # End of the commit list? Call it a day
        if commitish == commit_list[len(commit_list) - 1]:
            break
        # Get the diff
        start = commitish
        end = commit_list[index + 1]
        diff_raw = repo.git.diff(start,
                                 end,
                                 ignore_blank_lines=True,
                                 ignore_space_at_eol=True)
        patch = PatchSet(diff_raw)
        for change in patch:
            filename = change.path
            if filename.find(".txt") == -1:
                # Don't care about non-data files
                continue
            elif change.added <= 0:
                # No lines were added, so we don't care
                continue
            else:
                for hunk in change:
                    for line in hunk:
                        if line.is_added:
                            if re.match("photo.\d+:", line.value) == None:
                                # Not a photo line
                                continue
                            if line.value.find(": ") == -1:
                                # No correct delimiter, which we see in old commits
                                continue
                            if len(line.value.strip().split(": ")) != 2:
                                # Probably bad linebreaks
                                continue
                            [key, value] = line.value.strip().split(": ")
                            if (value in uri_to_commit_date):
                                # Photo we've already seen
                                continue
                            if (value.find("http") !=
                                    0) and (value.find("ig://") != 0):
                                # Not a URI, so not a photo reference
                                continue
                            dt = repo.commit(end).committed_datetime
                            date = str(dt.year) + "/" + str(
                                dt.month) + "/" + str(dt.day)
                            if value not in uri_to_commit_date:
                                # Only insert a comit date once
                                uri_to_commit_date[value] = date
    # print(str(uri_to_commit_date))
    # Now walk the repo, find all files with photo lines that have no commit dates,
    # and add commitdate to each photo that needs one
    for file_path in [PANDA_PATH, ZOO_PATH, MEDIA_PATH]:
        section = None
        for section_name in ["media", "zoos", "pandas"]:
            if section_name in file_path.split("/"):
                section = section_name.split("s")[0]  # HACK
        # Enter the pandas subdirectories
        for root, dirs, files in os.walk(file_path):
            for filename in files:
                path = root + os.sep + filename
                # print(path)
                photo_list = PhotoFile(section, path)
                photo_count = photo_list.photo_count()
                photo_index = 1
                while (photo_index <= photo_count):
                    photo_option = "photo." + str(photo_index)
                    photo_uri = photo_list.get_field(photo_option)
                    date_option = photo_option + ".commitdate"
                    if photo_uri not in uri_to_commit_date:
                        photo_index = photo_index + 1
                        continue
                    date_value = uri_to_commit_date[photo_uri]
                    old_date_value = photo_list.get_field(date_option)
                    if ((old_date_value == None) or (force == True)):
                        photo_list.set_field(date_option, date_value)
                    # print(photo_uri + " ==> " + date_value)
                    photo_index = photo_index + 1
                photo_list.update_file()
Exemplo n.º 4
0
def sort_ig_hashes(path):
    """
    Take a zoo/panda file, and sort all photos by their IG hashes. 
    This makes the photos appear in the order they were uploaded to IG,
    oldest to newest. 
    If a photo does not use an IG URI, keep its index unchanged.
    """
    # IG alphabet for hashes, time ordering oldest to newest
    # print(path)
    print("sorting: %s" % path)
    hash_order = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"
    section = None
    for section_name in ["wild", "zoos", "media", "pandas"]:
        if section_name in path.split("/"):
            section = section_name.split("s")[0]  # HACK
    photo_list = PhotoFile(section, path)
    photo_count = photo_list.photo_count()
    max = int(get_max_entity_count()) + 1
    if photo_count >= max:
        max = photo_count + 1
    non_ig_indices = []
    ig_photos = []
    # Build photo indices of IG photos and non-IG photos
    start_index = 1
    stop_point = max
    photo_index = start_index
    while photo_index <= stop_point:
        photo_option = "photo." + str(photo_index)
        photo = photo_list.get_field(photo_option)
        if photo == None:
            # Missing photo at this index, continue
            photo_index = photo_index + 1
            continue
        # Convert IG photo formats to use new event handler
        photo = update_ig_link(photo)
        photo_list.set_field(photo_option, photo)
        # If our updated photo link has an ig:// uri, do the moving
        if "ig://" in photo:
            # Track the photo and index as a tuple
            ig_photos.append([photo, photo_index])
            # Rename all photo fields as "old_photo_field"
            photo_list.move_field("old." + photo_option, photo_option)
            photo_list.move_field("old." + photo_option + ".author",
                                  photo_option + ".author")
            photo_list.move_field("old." + photo_option + ".commitdate",
                                  photo_option + ".commitdate")
            photo_list.move_field("old." + photo_option + ".link",
                                  photo_option + ".link")
            photo_list.move_field("old." + photo_option + ".tags",
                                  photo_option + ".tags")
            if section == "media":
                panda_tags = photo_list.get_field("panda.tags").split(", ")
                for panda_id in panda_tags:
                    photo_item = photo_option + ".tags." + panda_id + ".location"
                    photo_list.move_field("old." + photo_item, photo_item)
        else:
            # Track the non-ig index, so we can avoid it
            # Don't need to rename these photos
            non_ig_indices.append(photo_index)
        photo_index = photo_index + 1
    # Sort the list of ig photo tuples by photo URL
    # (the 0th item in each tuple is the url)
    # (the 4th item in each URL is the ig photo hash)
    ig_photos = sorted(
        ig_photos,
        key=lambda x: [hash_order.index(char) for char in x[0].split("/")[2]])
    ig_photos = sorted(ig_photos, key=lambda x: len(x[0].split("/")[2]))
    # Now, re-distribute the photos, iterating down the ig
    # photos, moving "old_photo_field" to "photo_field" but with
    # updated indices
    list_index = start_index
    photo_index = start_index
    used_indices = []
    while photo_index <= stop_point:
        if list_index - 1 == len(ig_photos):
            # No more photos, for certain
            break
        [photo, old_index] = ig_photos[list_index - 1]
        photo_index = list_index
        while photo_index in non_ig_indices:
            photo_index = photo_index + 1  # Avoid indices for non-IG photos
        while photo_index in used_indices:
            photo_index = photo_index + 1  # Avoid indices we already used
        used_indices.append(photo_index)
        current_option = "photo." + str(photo_index)
        old_option = "old.photo." + str(old_index)
        photo_list.move_field(current_option, old_option)
        photo_list.move_field(current_option + ".author",
                              old_option + ".author")
        photo_list.move_field(current_option + ".commitdate",
                              old_option + ".commitdate")
        photo_list.move_field(current_option + ".link", old_option + ".link")
        photo_list.move_field(current_option + ".tags", old_option + ".tags")
        if section == "media":
            panda_tags = photo_list.get_field("panda.tags").split(", ")
            for panda_id in panda_tags:
                current_loc_tag = current_option + ".tags." + panda_id + ".location"
                old_loc_tag = old_option + ".tags." + panda_id + ".location"
                photo_list.move_field(current_loc_tag, old_loc_tag)
        list_index = list_index + 1
    # We're done. Update the photo file
    photo_list.update_file()