def preProcess(column):
    """Normalize a text field for duplicate detection.

    Coerces the value to ASCII with
    [AsciiDammit](https://github.com/tnajdek/ASCII--Dammit), then
    removes newlines, collapses repeated spaces, strips surrounding
    quotes and lower-cases, so trivially different spellings compare
    equal.

    Args:
        column: raw string value from the input data.

    Returns:
        The cleaned, lower-cased string.
    """
    column = AsciiDammit.asciiDammit(column)
    # Replace newlines BEFORE collapsing space runs: the reverse order
    # could leave double spaces behind (e.g. "a \n b" -> "a  b").
    column = re.sub(r'\n', ' ', column)
    column = re.sub(r' +', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    return column
def preProcess(column):
    """Strip noise so that meaningful duplicates compare equal.

    Casing, runs of spaces, newlines, surrounding quotes and
    non-ASCII artifacts are all normalized away.
    """
    cleaned = AsciiDammit.asciiDammit(column)
    # Collapse repeated spaces, then turn newlines into spaces.
    cleaned = re.sub(' +', ' ', cleaned)
    cleaned = re.sub('\n', ' ', cleaned)
    # Trim whitespace and any surrounding quote characters, lower-case.
    cleaned = cleaned.strip().strip('"').strip("'").lower().strip()
    return cleaned
def fetch(page=1, page_size=DEFAULT_PAGE_SIZE):
    """Page through YouTube search results for the iTunes library.

    Loops through all purchased music tracks in the iTunes library,
    performs a YouTube search on each configured track field (e.g.
    song title, artist name), and returns the matching Entry objects
    indexed by video id.

    Args:
        page: 1-based page number to fetch.
        page_size: maximum number of entries to return.

    Returns:
        dict mapping YouTube video id -> feed Entry (empty if the
        library has no tracks).
    """
    tracks = get_itunes_tracks()
    num_tracks = len(tracks)
    if num_tracks == 0:
        # Guard: an empty library would otherwise raise
        # ZeroDivisionError in the modulo arithmetic below.
        return {}
    results_per_track = len(TRACK_SEARCH_FIELDS) * RESULTS_PER_QUERY
    # Translate the requested page into a starting track and a starting
    # result offset within that track's queries.
    abs_track_index = int((page_size * (page - 1)) / results_per_track)
    track_index = abs_track_index % num_tracks
    query_index = (int((abs_track_index + 1) / num_tracks) * RESULTS_PER_QUERY) + 1
    # Keep the start index inside the API's allowed window.
    query_index %= 1000 - RESULTS_PER_QUERY
    # cycle through the tracks endlessly, starting at the computed offset
    tracks = itertools.islice(itertools.cycle(tracks), track_index, None)
    results = {}
    search_terms_used = set()  # set: O(1) dedupe instead of O(n) list scans
    for track in tracks:
        for field in TRACK_SEARCH_FIELDS:
            search_term = track.get(field, None)
            if search_term is None:
                continue
            # asciify search term and put it in quotes and lower case
            search_term = (AsciiDammit.asciiDammit(search_term)
                           .encode("ascii", "ignore")
                           .lower())
            search_term = '"%s"' % search_term.replace("\"", "'")
            # don't search for same term twice
            if (field, search_term) in search_terms_used:
                continue
            # skim first n items off search
            max_results = min(RESULTS_PER_QUERY, page_size - len(results))
            if max_results == 0:
                break
            entries = youtube_search([search_term],
                                     max_results=max_results,
                                     start_index=query_index)
            search_terms_used.add((field, search_term))
            # index results by video id (last path component of entry.id)
            for entry in entries:
                video_id = entry.id.text.split('/')[-1]
                logging.info("%s | %s" % (video_id, entry.media.title.text))
                if video_id not in results:
                    results[video_id] = entry
        if len(results) >= page_size:
            break
    return results
def preProcess(column):
    """Clean one field value for duplicate detection.

    ``LatLong(<lat>,<lon>)`` values are parsed into a ``(float, float)``
    tuple, falling back to ``(0.0, 0.0)`` when the payload is malformed.
    Every other value is ASCII-fied with
    [AsciiDammit](https://github.com/tnajdek/ASCII--Dammit), whitespace-
    collapsed, unquoted and lower-cased.

    Returns:
        A ``(float, float)`` tuple for LatLong values, otherwise the
        cleaned string.
    """
    if column.startswith('LatLong'):
        # "LatLong(" is 8 characters; drop it and the closing paren.
        try:
            vals = column[8:-1].split(',')
            column = (float(vals[0]), float(vals[1]))
        except (ValueError, IndexError):
            # IndexError too: "LatLong(5)" has no second component and
            # previously crashed instead of degrading to (0.0, 0.0).
            column = (0.0, 0.0)
    else:
        column = AsciiDammit.asciiDammit(column)
        column = re.sub(' +', ' ', column)
        column = re.sub('\n', ' ', column)
        column = column.strip().strip('"').strip("'").lower().strip()
    return column
#speaker_events_dict[speakernosp] = [] tracks = [] # list of tracks tracksdict = {} # for use in creating a dictionary of tracks with no space in their name rooms = [] roomsdict = {} speakerdict = {} hourslist = {} speaker_events_dict = {} # go through the schedule row by row for index, x in enumerate(pconsched.schedule): # (re)initialize a session dictionary object session = {} # for each field grab the data and put it in the session dictionary for field in fields: # bring the field info into a variable to help keep the code clean fieldtext = AsciiDammit.asciiDammit(pconsched.schedule[index][pconsched.headerdict[field]]) # separate the time and day into different variables session['input'] = x session['index'] = index ##start day and time assignments if (field == "Start Date"): session['startday'] = fieldtext if (fieldtext == friday_date): session['dayheader'] = friday_header if (fieldtext == saturday_date): session['dayheader'] = saturday_header if (fieldtext == sunday_date): session['dayheader'] = sunday_header if (fieldtext == ''):
# Group session rows from `build` by track, using the 3-letter track
# prefix to match each row against the known tracks in `trackinfo`.
trackdict = {}
logging.debug("schedule in list form: \n%s", build)
for line in build:
    logging.debug("line: %s", line)
    logging.debug(len(line))
    # first comma-separated token of column 5 names the track
    atrac = line[5].split(',')[0]
    for currenttrac in trackinfo:
        # Each trackinfo entry appears to be a single-key dict; grab that
        # key once.  next(iter(...)) works on both Python 2 and 3, unlike
        # the original dict.keys()[0] which breaks on Python 3's views.
        trackname = next(iter(currenttrac))
        if trackname[0:3] == atrac[0:3].upper():
            logging.debug("track: %s", atrac[0:3].upper())
            logging.debug("current track: %s", trackname)
            logging.debug("line[6] %s", line[6])
            # title / time / description columns, ASCII-fied
            trackcontent = [
                AsciiDammit.asciiDammit(line[6]),
                AsciiDammit.asciiDammit(line[8]),
                AsciiDammit.asciiDammit(line[7]),
            ]
            # collect unique session rows per track
            if trackname not in trackdict:
                trackdict[trackname] = [trackcontent]
            elif trackcontent not in trackdict[trackname]:
                trackdict[trackname].append(trackcontent)
    logging.debug(line)
for track in trackdict:
    trackdict[track].sort()
logging.debug("sorted dict's list\n\n")
logging.debug("track dictionary: %s", trackdict)
# speaker_events_dict[speakernosp] = [] tracks = [] # list of tracks tracksdict = {} # for use in creating a dictionary of tracks with no space in their name rooms = [] roomsdict = {} speakerdict = {} hourslist = {} speaker_events_dict = {} # go through the schedule row by row for index, x in enumerate(pconsched.schedule): # (re)initialize a session dictionary object session = {} # for each field grab the data and put it in the session dictionary for field in fields: # bring the field info into a variable to help keep the code clean fieldtext = AsciiDammit.asciiDammit( pconsched.schedule[index][pconsched.headerdict[field]]) # separate the time and day into different variables session['input'] = x session['index'] = index session['All Day Event'] = '' session['private'] = "PUBLIC" session['avneeds'] = "" session['alldayorder'] = "3" # start day and time assignments # separate the time and day into different variables if (field == "event_start"): logging.debug(fieldtext[:len(fieldtext)-fieldtext.find(" ") + 1]) dashdate = fieldtext[:len(fieldtext)-fieldtext.find(" ") + 1] slashdate = "{}/{}/{}".format(dashdate[6:7],
# Materialize the CSV reader so it can be logged and iterated in full.
build = list(reader)
# Group session rows by track, matching on the 3-letter track prefix
# against the known tracks in `trackinfo`.
trackdict = {}
logging.debug("schedule in list form: \n%s", build)
for line in build:
    logging.debug("line: %s", line)
    logging.debug(len(line))
    # first comma-separated token of column 5 names the track
    atrac = line[5].split(',')[0]
    for currenttrac in trackinfo:
        # Each trackinfo entry appears to be a single-key dict; grab that
        # key once.  next(iter(...)) works on both Python 2 and 3, unlike
        # the original dict.keys()[0] which breaks on Python 3's views.
        trackname = next(iter(currenttrac))
        if trackname[0:3] == atrac[0:3].upper():
            logging.debug("track: %s", atrac[0:3].upper())
            logging.debug("current track: %s", trackname)
            logging.debug("line[6] %s", line[6])
            # title / time / description columns, ASCII-fied
            trackcontent = [
                AsciiDammit.asciiDammit(line[6]),
                AsciiDammit.asciiDammit(line[8]),
                AsciiDammit.asciiDammit(line[7]),
            ]
            # collect unique session rows per track
            if trackname not in trackdict:
                trackdict[trackname] = [trackcontent]
            elif trackcontent not in trackdict[trackname]:
                trackdict[trackname].append(trackcontent)
    logging.debug(line)
for track in trackdict:
    trackdict[track].sort()
logging.debug("sorted dict's list\n\n")
logging.debug("track dictionary: %s", trackdict)
# Jinja2 environment that loads templates from the local 'templates' dir.
env = jinja2.Environment(loader=jinja2.FileSystemLoader('templates'))