import csv
import re  # not needed by the visible code any more; kept for the rest of the file

import utils

# Clean the raw writers export into 'cleaned/writes_cleaned.csv'.
#
# Each input row holds a writer name (column 0) followed by four packed
# columns (clip ids, work types, roles, additional info).  A packed column
# is a '|'-separated list whose first and last characters are dropped
# before splitting (presumably enclosing brackets -- confirm against the
# raw data).
#
# Both files are opened with newline='' as the csv module requires;
# without it the writer emits '\r\r\n' line endings on Windows and
# newlines embedded in quoted fields are not read back faithfully.
with open('initial/writers.csv', 'r', encoding='utf8', newline='') as data:
    reader = csv.reader(data)
    with open('cleaned/writes_cleaned.csv', 'w', encoding='utf8',
              newline='') as out:
        writer = csv.writer(out)
        staff_map = utils.get_staff_map()
        clips = utils.get_clip_set()
        next(reader)  # discard the first row (presumably the header)
        added = set()  # (staffid, clipId) pairs already handled
        for row in reader:
            name = utils.lettres(row[0]).lstrip()
            # A plain str.split replaces re.split("\|", ...): same result,
            # and no invalid "\|" escape sequence in a non-raw literal.
            clipIds = row[1][1:-1].split("|")
            workTypes = row[2][1:-1].split("|")
            roles = row[3][1:-1].split("|")
            addInfos = row[4][1:-1].split("|")
            size = len(clipIds)
            # Only rows whose packed columns all line up one-to-one with
            # the clip ids, and whose writer is a known staff member, are
            # processed.
            if (name in staff_map and len(workTypes) == size
                    and len(roles) == size and len(addInfos) == size):
                for i in range(size):
                    if clipIds[i] in clips:
                        staffid = staff_map[name]
                        clipId = clipIds[i]
                        pair = (staffid, clipId)
                        if pair not in added:
                            workType = workTypes[i]
                            role = roles[i]
                            # NOTE(review): the visible source ends here.
                            # The code that consumes workType/role (and
                            # presumably writes the row and records
                            # `pair` in `added`) is not visible -- confirm
                            # against the full file.
import csv  # csv.reader / csv.writer are used below but were never imported

import utils

# Country spellings found in the raw data, mapped to the spelling that is
# looked up in the country map.
COUNTRY_ALIASES = {
    'Democratic Republic of Congo': 'Democratic Republic of the Congo',
}

# Clean the raw release dates into 'cleaned/releasedates_cleaned.csv'.
#
# A row is kept when its clip id (column 0) is in the known clip set and its
# normalised country name (column 1) is in the country map.  At most one row
# is written per (clip id, country id) pair; later duplicates are dropped.
#
# Both files are opened with newline='' as the csv module requires;
# without it the writer emits '\r\r\n' line endings on Windows.
with open('initial/release_dates.csv', encoding="utf8",
          newline='') as csvfile:
    reader = csv.reader(csvfile)
    with open('cleaned/releasedates_cleaned.csv', 'w', encoding="utf8",
              newline='') as out:
        wr = csv.writer(out)
        country_map = utils.get_country_map()
        clips = utils.get_clip_set()
        next(reader)  # discard the first row (presumably the header)
        added = set()  # (clip id, country id) pairs already written
        for row in reader:
            clipid = row[0]
            if clipid not in clips:
                continue

            # Normalise the country: utils.acc, then utils.lettres, then
            # drop leading whitespace (presumably accent removal followed
            # by a letters-only filter -- confirm in utils).
            country = utils.lettres(utils.acc(row[1])).lstrip()
            country = COUNTRY_ALIASES.get(country, country)

            # Only keep the numbers and the letters in the "ReleaseDate"
            # column
            release_date = utils.alet(row[2])

            if country not in country_map:
                continue
            countryId = country_map[country]
            key = (clipid, countryId)
            if key in added:
                continue
            wr.writerow((clipid, countryId, release_date))
            added.add(key)
# Clean the raw genres into 'cleaned/genres_cleaned.csv' as (genre id, genre)
# rows, and build `genre_map` (genre name -> genre id) along the way.
#
# NOTE(review): `csv`, `utils`, `genres` and `genreId` are used here but are
# not defined in the visible part of the file; they are presumably set up
# just before this point -- confirm.
genre_map = {}
clips = utils.get_clip_set()
# Both files are opened with newline='' as the csv module requires; without
# it the writer emits '\r\r\n' line endings on Windows.
with open('initial/genres.csv', encoding="utf8", newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # discard the first row (presumably the header)
    with open('cleaned/genres_cleaned.csv', 'w', encoding="utf8",
              newline='') as out:
        wr = csv.writer(out)
        added = set()  # (genre id, genre) rows already written
        for row in reader:
            if row[0] not in clips:
                continue

            unaccented = utils.acc(row[1])
            cleaned = utils.lettres(unaccented)

            # Keep the genre only if utils.lettres changed it by fewer than
            # two characters (as measured by utils.diff_letters), it is not
            # empty, and it is not a textual null marker.
            if not (utils.diff_letters(unaccented, cleaned) < 2
                    and cleaned
                    and cleaned.lower() not in ('null', 'none')):
                continue

            # A genre seen for the first time gets the next free id; a known
            # genre reuses the id recorded in genre_map.
            is_new = cleaned not in genres
            new_row = (genreId if is_new else genre_map[cleaned], cleaned)
            if new_row in added:
                continue
            if is_new:
                genres.add(cleaned)
                genre_map[cleaned] = genreId
                genreId += 1
            wr.writerow(new_row)
            added.add(new_row)
import csv
import re  # not needed by the visible code any more; kept for the rest of the file

import utils

# Clean the raw actors export into 'cleaned/acts_cleaned.csv'.
#
# Each input row holds an actor name (column 0) followed by four packed
# columns (clip ids, characters, orders, additional info).  A packed column
# is a '|'-separated list whose first and last characters are dropped
# before splitting (presumably enclosing brackets -- confirm against the
# raw data).
#
# Both files are opened with newline='' as the csv module requires;
# without it the writer emits '\r\r\n' line endings on Windows and
# newlines embedded in quoted fields are not read back faithfully.
with open('initial/actors.csv', 'r', encoding='utf8', newline='') as data:
    reader = csv.reader(data)
    with open('cleaned/acts_cleaned.csv', 'w', encoding='utf8',
              newline='') as out:
        writer = csv.writer(out)
        staff_map = utils.get_staff_map()
        clips = utils.get_clip_set()
        next(reader)  # discard the first row (presumably the header)
        added = set()  # (staffid, clipId) pairs already handled
        for row in reader:
            name = utils.lettres(row[0]).lstrip()
            # A plain str.split replaces re.split("\|", ...): same result,
            # and no invalid "\|" escape sequence in a non-raw literal.
            clipIds = row[1][1:-1].split("|")
            chars = row[2][1:-1].split("|")
            orders = row[3][1:-1].split("|")
            addInfos = row[4][1:-1].split("|")
            size = len(clipIds)
            # Only rows whose packed columns all line up one-to-one with
            # the clip ids, and whose actor is a known staff member, are
            # processed.
            if (name in staff_map and len(chars) == size
                    and len(orders) == size and len(addInfos) == size):
                for i in range(size):
                    if clipIds[i] in clips:
                        staffid = staff_map[name]
                        clipId = clipIds[i]
                        pair = (staffid, clipId)
                        if pair not in added:
                            cha = utils.lettres(chars[i])
                            order = utils.numbers(orders[i])
                            # NOTE(review): the visible source ends here.
                            # The code that consumes cha/order (and
                            # presumably writes the row and records
                            # `pair` in `added`) is not visible -- confirm
                            # against the full file.
import csv
import re
import unicodedata

import utils

# Clean the raw clips export.
#
# Writes two files:
#   * 'cleaned/clips_cleaned.csv' -- (clip id, column 1 unchanged, cleaned
#     year, cleaned clip type) for every kept clip;
#   * 'cleaned/clips_set.csv'     -- one kept clip id per row.
#
# A row is kept when its clip id is non-empty, is not the text 'null' (in
# any letter case) and has not been seen before; only the first occurrence
# of an id is written.
#
# All files are opened with newline='' as the csv module requires; without
# it the writers emit '\r\r\n' line endings on Windows and newlines embedded
# in quoted fields are not read back faithfully.
with open('initial/clips.csv', encoding='utf8', newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader)  # discard the first row (presumably the header)
    with open('cleaned/clips_set.csv', 'w', encoding='utf8',
              newline='') as fclips_set:
        clip_set = csv.writer(fclips_set)
        with open('cleaned/clips_cleaned.csv', 'w', encoding='utf8',
                  newline='') as out:
            wr = csv.writer(out)
            added = set()  # clip ids already written
            for row in reader:
                clipid = row[0]
                if not clipid or clipid.lower() == 'null' or clipid in added:
                    continue

                # Clean the remaining columns only for rows that are kept.
                # Only keep the numbers in the "Year" column
                year = utils.numbers(row[2])
                # "ClipType" column passed through utils.lettres
                # (presumably a letters-only filter -- confirm in utils)
                clip_type = utils.lettres(row[3])

                wr.writerow((clipid, row[1], year, clip_type))
                added.add(clipid)
                clip_set.writerow([clipid])