def main(input_filename_list=[]): # Make input filename list if input_filename_series and input_whole_folder: raise Exception("Series or folder. Can't have both. Sorry.") if input_filename_series: for g in os.listdir(input_path): if g.startswith(input_filename_list_range[0]): list_started = True if g.endswith('.wav') and list_started is True: input_filename_list.append(g) # If end of range reached if g.startswith(input_filename_list_range[1]): break print 'Input filenames (series) ({0}):'.format(len(input_filename_list)), input_filename_list elif input_whole_folder: for f in os.listdir(input_path): if f.endswith('.wav'): input_filename_list.append(f) print 'Input filenames (folder) ({0}):'.format(len(input_filename_list)), input_filename_list else: input_filename_list = [input_filename] # Grab spreadsheet from Google Drive utils.export_csv('Items') # RUN THE TRAP pool_array = [os.path.join(input_path, each) for each in input_filename_list] if pool_processing: pool = multiprocessing.Pool(multiprocessing.cpu_count() - 1) pool.map(track_file, pool_array) else: for item in pool_array: track_file(item)
def convert_metadata(f, t):
    """Flatten the film metadata file ``f`` into the CSV file ``t``.

    Every input record yields one output row per entry of the array parsed
    from its sixth field; the third field is cut to its first four
    characters. Rows are written with ';' as the delimiter.
    """
    rows = [
        [record[0], record[1], record[2][:4], record[3], record[4], film_type]
        for record in utils.read_csv(f, ' +++$+++ ')
        for film_type in utils.parse_json_array(record[5])
    ]
    utils.export_csv(rows, t, ';')
def main(): logging.basicConfig(level=log_level) input_list = [] metadata_errors = [] utils.export_csv('Items') # Grab spreadsheet from Google Drive if just_add_tags: item_list = [Item(each_file) for each_file in os.listdir(audio_folder) if os.path.splitext(each_file.lower())[1][1:] in digital_formats] elif whole_folder: item_list = [Item(each_file) for each_file in os.listdir(audio_folder) if each_file.lower().endswith('.wav')] elif some_orders: item_list = [Item(each_file) for each_file in os.listdir(audio_folder) if (each_file.lower().endswith('.wav') and each_file[:5] in orders_input)] elif some_items: item_list = [Item(each_file) for each_file in os.listdir(audio_folder) if (each_file.lower().endswith('.wav') and \ any(each_item_input in each_file for each_item_input in items_input))] # Halt program on metadata errors metadata_errors = [error for item in item_list for error in item.metadata_errors] if metadata_errors: for error in metadata_errors: print error print '{0} Metadata errors found! See above.'.format(len(metadata_errors)) sys.exit() logging.info('item_list ({0}): {1}'.format(len(item_list), [item.name for item in item_list])) item_list = format_list(item_list) #if cd_pdf_toggle: make_cd_pdf(item_list) if front_pdf_toggle: make_front_pdf(item_list) if back_pdf_toggle: make_back_pdf(item_list) if burn_cds_toggle: burn_cds(item_list) if digital_files_toggle: make_digital_files(item_list)
def sensitivity(years, kwLimit, min_edge_th):
    """Run community detection over a grid of thresholds and export the stats.

    Loads the pickled graph for the given year window, calls
    ``get_communities`` for every (dth, eth) pair of the grid and records,
    for each returned community structure, its number of communities, the
    vertex count of the filtered graph and its modularity.

    Args:
        years: sequence of year strings; the first and last name the window.
        kwLimit: keyword limit, used only to build the file names.
        min_edge_th: minimum edge threshold, used only to build the file names.
    """
    print('Sensitivity analysis for years ' + str(years))
    yearrange = years[0] + "-" + years[-1]
    run_tag = yearrange + '_' + str(kwLimit) + '_eth' + str(min_edge_th)

    # NOTE(review): pickle executes code from the file it loads; this path is
    # assumed to hold only locally generated data.
    # The context manager closes the handle that the bare open() used to leak.
    with open('pickled/graph_' + run_tag + '.pkl', 'rb') as graph_file:
        graph = pickle.load(graph_file)

    dthvals = numpy.arange(0.01, 0.125, 0.005)
    ethvals = numpy.arange(10, 205, 5)
    #mincomsizevals=[0,4,10] # remove min com size, additional filtering does not makes really sense

    res = []
    for dth in dthvals:
        for eth in ethvals:
            print('eth = ' + str(eth) + ' ; dth = ' + str(dth))
            fgraph, coms = get_communities(graph, dth, eth)
            # Same filtered graph for every community structure of this pair.
            vcount = fgraph.vcount()
            for i, com in enumerate(coms):
                comnum = len(com.sizes())
                modularity = com.modularity
                res.append([dth, eth, comnum, vcount, modularity])
                print(
                    str(i) + " ; " + str(dth) + " ; " + str(eth) + " : " +
                    str(comnum) + " ; " + str(vcount) + " ; " + str(modularity))

    # export res
    utils.export_csv(
        res,
        'sensitivity/sensitivity_' + run_tag + '.csv',
        ";",
        "dispth;eth;comnum;vcount;modularity")
def filter_events(data, filename="events.csv"):
    """Extract events from ticket descriptions and export them to a CSV file.

    Args:
        data: dict whose ``'tickets'`` entry maps a ticket key to a ticket dict.
        filename: name of the CSV file handed to ``export_csv``.

    Returns:
        dict mapping ticket key to the parsed event, restricted to events
        that have a non-empty ``'condition'``.
    """
    copied_fields = [
        'name', 'summary', 'type', 'subtask', 'project', 'project key',
        'status'
    ]

    # process the description text for information
    events = {}
    for k, v in data.get('tickets', {}).items():
        descr = v.get('description')
        if not descr:
            continue
        sentences = extract_event(descr)
        event = parse_clause(sentences)
        # Truthiness instead of len(): a missing 'condition' key used to
        # raise TypeError on len(None); it is now simply skipped.
        if not event.get('condition'):
            continue
        for f in copied_fields:
            event[f] = v.get(f)
        events[k] = event

    # export the events to a csv
    rows = []
    for k, v in events.items():
        row = {'key': k}
        for t, c in v.items():
            # list values are flattened to one multi-line cell
            row[t] = '\n'.join(c) if isinstance(c, list) else c
        rows.append(row)
    export_csv(data=rows, filename=filename)
    logger.info('Exported {} events'.format(len(rows)))
    return events
def print_order_notes(item_list=None, refresh_csv=True): if not item_list: # Input whole Tracked folder item_list = [Item(each_file) for each_file in os.listdir(config.tracked_folder) if os.path.splitext(each_file)[-1] == '.wav'] if refresh_csv: utils.export_csv('Items') # Grab spreadsheet from Google Drive # items.csv item notes notes_lines = [' - items.csv -'] for item in [item for item in item_list if not item.copy_counter and not item.side]: if item.customer_notes: notes_lines.append('{0} - Customer notes: {1}'.format(item.name, item.customer_notes)) if item.private_notes and len(item.private_notes) > 3: notes_lines.append('{0} - Private notes: {1}'.format(item.name, item.private_notes)) # WooCommerce order notes notes_lines.append(' - WC notes -') wc_client = WooCommerceClient(config.wc_ck, config.wc_cs, config.base_url, oauth_enabled=False) for order_id in set([item.order.lstrip('0') for item in item_list]): order = wc_client.get_order(order_id)['order'] html_parser = HTMLParser.HTMLParser() notes_lines.append('{0} - {1}'.format(order_id, html_parser.unescape(order['note']))) # Print notes; write to text file for notes_line in notes_lines: print notes_line order_notes_file = os.path.join(config.order_notes_folder, 'order_notes.txt') with open(order_notes_file, 'w') as notes: for notes_line in notes_lines: notes.write(notes_line + '\n') # Open notes in default text editor os.startfile(order_notes_file)
def main(input_filename_list=[]): # Make input filename list if config.input_orders and config.input_items: raise Exception("Orders or items. Can't have both. Sorry.") for f in os.listdir(config.clean_folder): if f.endswith('.wav'): input_filename_list.append(f) print 'Input filenames (folder) ({0}):'.format(len(input_filename_list)), input_filename_list # Grab spreadsheet from Google Drive utils.export_csv('Items') # RUN THE TRAP pool_array = [os.path.join(config.clean_folder, each) for each in input_filename_list] if config.pool_processing: pool = multiprocessing.Pool(multiprocessing.cpu_count() - 1) pool.map(track_file, pool_array) else: for item in pool_array: track_file(item) # Class-ify item input list item_list = [Item(each_file) for each_file in os.listdir(config.tracked_folder) if os.path.splitext(each_file)[-1] == '.wav'] # Print order notes order.print_order_notes(item_list)
def main(): logging.basicConfig(level=config.log_level) metadata_errors = [] utils.export_csv("Items") # Grab spreadsheet from Google Drive if config.just_add_tags: print "Just Add Tags selected." input_list = [ each_file for each_file in os.listdir(config.tracked_folder) if os.path.splitext(each_file.lower())[1][1:] in digital_formats ] elif config.input_whole_folder: input_list = [ each_file for each_file in os.listdir(config.tracked_folder) if each_file.lower().endswith(".wav") ] elif config.input_orders: print "Input orders:", config.input_orders input_list = [ each_file for each_file in os.listdir(config.tracked_folder) if (each_file.lower().endswith(".wav") and each_file[:5] in config.input_orders) ] elif config.input_items: print "Input items:", config.input_items input_list = [ each_file for each_file in os.listdir(config.tracked_folder) if ( each_file.lower().endswith(".wav") and any(each_item_input in each_file for each_item_input in config.input_items) ) ] # Class-ify item input list item_list = [Item(each_file, tracks_added=True) for each_file in input_list] # Halt program on metadata errors metadata_errors = [error for item in item_list for error in item.metadata_errors] if metadata_errors: for error in metadata_errors: print error print "{0} Metadata errors found! See above.".format(len(metadata_errors)) sys.exit() logging.info("item_list ({0}): {1}".format(len(item_list), [item.name for item in item_list])) item_list = format_list(item_list) # if config.cd_pdf_toggle: make_cd_pdf(item_list) if config.front_pdf_toggle: make_front_pdf(item_list) if config.back_pdf_toggle: make_back_pdf(item_list) if config.burn_cds_toggle: burn_cds(item_list) order.print_order_notes(item_list, refresh_csv=False) if config.digital_files_toggle: make_digital_files(item_list)
def convert_metadata(f, t):
    """Expand each film record of ``f`` into one row per film type in ``t``.

    The source is read with ' +++$+++ ' as the field separator; the array in
    the sixth field is parsed and each of its entries produces a row holding
    id, title, four-character year, rating, rating count and that entry.
    The rows are exported with ';' as the delimiter.
    """
    expanded = []
    for fields in utils.read_csv(f, ' +++$+++ '):
        film_id, film_title = fields[0], fields[1]
        film_year = fields[2][:4]
        film_rating, film_rating_num = fields[3], fields[4]
        expanded.extend(
            [film_id, film_title, film_year, film_rating, film_rating_num, kind]
            for kind in utils.parse_json_array(fields[5])
        )
    utils.export_csv(expanded, t, ';')
def main(input_filename_list=[], output_path=[]): needs_split_points = [] split_files_counter = 0 if not output_path or output_path.lower == 'none': output_path = input_path # Make input filename list if input_filename_series and input_whole_folder: print 'Series or folder. Can\'t have both. Sorry.' raise SystemError if input_filename_series: for g in os.listdir(input_path): if g.startswith(input_filename_list_range[0]): list_started = True if g.endswith('.wav') and list_started is True: input_filename_list.append(g) # If end of range reached if g.startswith(input_filename_list_range[1]): break print 'Input filenames (series) ({0}):'.format(len(input_filename_list)), input_filename_list elif input_whole_folder: for f in os.listdir(input_path): if f.endswith('.wav'): input_filename_list.append(f) print 'Input filenames (folder) ({0}):'.format(len(input_filename_list)), input_filename_list else: input_filename_list = [input_filename] # Grab spreadsheet from Google Drive utils.export_csv('Items') # RUN THE TRAP pool_array = [os.path.join(input_path, each) for each in input_filename_list] if pool_processing: pool = multiprocessing.Pool(multiprocessing.cpu_count() - 1) pool.map(split_cd, pool_array) else: for each in pool_array: results = split_cd(each) split_files_counter += results[0] if results[1]: needs_split_points.append(results[1]) if needs_split_points: print 'Needs split points:', needs_split_points else: print 'CD splitting: great success! {0} files split.'.format(split_files_counter)
def npatent_years():
    """Count patents per sliding window of application years and export them.

    For every window of ``window-size`` consecutive years ending between
    1976 + window - 1 and 2012, counts the documents of the
    ``patent.keywords`` collection whose ``app_year`` lies in the window and
    writes the ``yearrange;count`` rows to a CSV file.
    """
    mongo = pymongo.MongoClient(utils.get_parameter('mongopath', True, True))
    window = int(utils.get_parameter('window-size'))
    data = []
    try:
        for year in range(1976 + window - 1, 2013):
            print(year)
            # A real list: the former map() result is indexed and measured
            # below, which fails where map() returns an iterator.
            years = [str(y) for y in range(year - window + 1, year + 1)]
            patents = mongo['patent']['keywords'].find(
                {"app_year": {"$in": years}}, no_cursor_timeout=True)
            npatents = patents.count()
            yearrange = years[0] + "-" + years[-1]
            data.append([yearrange, npatents])
    finally:
        # The client connection was previously never released.
        mongo.close()
    utils.export_csv(data, 'data/patentcount_window' + str(window) + '.csv',
                     ";", "yearrange;count")
def main(): logging.basicConfig(level=log_level) input_list = [] metadata_errors = [] utils.export_csv('Items') # Grab spreadsheet from Google Drive if just_add_tags: item_list = [Item(each_file) for each_file in os.listdir(audio_folder) if os.path.splitext(each_file.lower())[1][1:] in digital_formats] elif whole_folder: item_list = [Item(each_file) for each_file in os.listdir(audio_folder) if each_file.lower().endswith('.wav')] elif some_orders: item_list = [Item(each_file) for each_file in os.listdir(audio_folder) if (each_file.lower().endswith('.wav') and each_file[:5] in orders_input)] elif some_items: item_list = [Item(each_file) for each_file in os.listdir(audio_folder) if (each_file.lower().endswith('.wav') and \ any(each_item_input in each_file for each_item_input in items_input))] if metadata_errors: for error in metadata_errors: print error msg = '{0} Metadata errors found! See above.'.format(len(metadata_errors)) raise Exception(msg) add_compilation_counters(item_list) add_image_counters(item_list) logging.info('Compilation items: {0}'.format([item.name for item in item_list if item.compilation_counter])) logging.info('Unique images: {0}'.format(len(set([item.image for item in item_list])))) logging.info('item_list ({0}): {1}'.format(len(item_list), [item.name for item in item_list])) ''' print print '{0} artist: {1}'.format(item_list[0].name, item_list[0].artist) item_list[0].print_tracks() print ''' if cd_pdf_toggle: make_cd_pdf(item_list) if front_pdf_toggle: make_front_pdf(item_list) if back_pdf_toggle: make_back_pdf(item_list) if digital_files_toggle: make_digital_files(item_list)
def main(input_resplit_list=False): def add_serial_metadata(input_filename): def discogs_auth(): authorize_token = None while not authorize_token: discogs = discogs_client.Client(CLIENT_NAME) discogs.set_consumer_key(CONSUMER_KEY, CONSUMER_SECRET) authorize_url = discogs.get_authorize_url() webbrowser.open(authorize_url[2]) authorize_token = raw_input('Enter authorize token (Or q to quit): ') if authorize_token.lower() == 'q': return 'q' access_token = discogs.get_access_token(authorize_token) logging.debug('access_token: {0}'.format(access_token)) return discogs def average(x): assert len(x) > 0 return float(sum(x)) / len(x) def pearson_def(x, y): """ Correlation between listed Discogs release durations and split FLAC file durations. Would divide by zero if all durations are equal for x or y; set to return 1 instead. """ assert len(x) == len(y) n = len(x) assert n > 0 if n == 2: # Pearson's is useless for n < 3 return (min(x[0], y[0]) / float(max(x[0], y[0]))) * (min(x[1], y[1]) / float(max(x[1], y[1]))) avg_x = average(x) avg_y = average(y) diffprod = 0 xdiff2 = 0 ydiff2 = 0 for idx in range(n): xdiff = x[idx] - avg_x ydiff = y[idx] - avg_y diffprod += xdiff * ydiff xdiff2 += xdiff * xdiff ydiff2 += ydiff * ydiff return 1 if xdiff2 == 0 or xdiff2 == 0 else diffprod / math.sqrt(xdiff2 * ydiff2) def to_seconds(time_str_input): to_seconds_output = [] if type(time_str_input) is list: for i in time_str_input: i_split = i.split(':') to_seconds_output.append(int(i_split[0] or 0)*60 + int(i_split[1])) return to_seconds_output elif type(time_str_input) is str or unicode: if ':' not in time_str_input: return int(time_str_input) i_split = time_str_input.split(':') return int(i_split[0] or 0)*60 + int(i_split[1]) elif type(time_str_input) is int: return time_str_input else: raise Exception('Invalid to_seconds input type: {0}'.format(type(time_str_input))) def boxes_pull(input_filename): # Pull artist, album info from boxes csv double_type = None input_filename = 
input_filename.lstrip('0') input_filename = input_filename.rsplit('-')[0] with open(config.boxes_path, 'r') as boxes: spamreader = csv.reader(boxes) rowdata = [] for row in spamreader: rowdata.append(row) for row in rowdata: row_serial = row[0].translate(None,' ').lower() if row_serial == input_filename: print 'Match!' print 'RealRow:', row artist = row[1] album = row[2] if album.lower() == 'self titled' or album.lower() == 'self-titled': album = artist if row[5].lower() == 'x': print 'Double trouble!' if row[6].startswith('1/2') or row[6] == '': print '1/2!' double_type = '1/2' elif row[6].startswith('1/4'): print 'Eeeek! 1/4!!' double_type = '1/4' elif row[5].lower() and row[5].lower() != 'x': double_type = 'other' if 'live' in row[4].lower(): print 'Eeeek! Live album!' return artist, album, double_type else: return None, None, None filename_matches = [] log_comment = '' artist = '' album = '' track_lengths_correlation = 0 discogs_match = False resplit_serial = False requested_serial = False print print '----------------------' print 'Query:', input_filename # Check for filename query matches for each_file in os.listdir(config.split_folder): if each_file.startswith(input_filename): # Skip 'a' matches for non-'a' files if len(input_filename) == 5 and each_file[5] != '_': continue filename_matches.append(each_file) if not filename_matches: print 'No split (_xx) audio file matches! Dork.' return input_filename, artist, album, track_lengths_correlation, log_comment, discogs_match # Check for artist tags, add from spreadsheet if enabled and necessary if config.use_boxes_csv: artist, album, double_type = boxes_pull(input_filename) if (artist, album) == (None, None): log_comment = 'No spreadsheet match found.' return input_filename, artist, album, track_lengths_correlation, log_comment, discogs_match elif double_type == 'other': log_comment = 'Freaky double! Engage manual tagging mode.' 
if not input_resplit_list: return input_filename, artist, album, track_lengths_correlation, log_comment, discogs_match # Add tracknumber, album, artist to FLAC files for each_file in filename_matches: audio = FLAC(os.path.join(config.split_folder, each_file)) audio['artist'] = artist audio['album'] = album audio.save() else: try: artist = FLAC(os.path.join(config.split_folder, filename_matches[0]))['artist'][0] album = FLAC(os.path.join(config.split_folder, filename_matches[0]))['album'][0] print 'Metadata found. Artist:', artist, 'Album:', album except: print 'No artist/album tags; boxes spreadsheet not searched:', input_filename return input_filename, artist, album, track_lengths_correlation, log_comment discogs_search = True if input_resplit_list: # Check re-split_list.csv for specified Discogs release ID with open(resplit_list_path, 'r') as resplit_list: rowdata = [row for row in csv.reader(resplit_list)] for row in rowdata: row_serial = row[0].translate(None,' ').lower() if row_serial.startswith(input_filename.lstrip('0')): if 'csv' in row[1].lower(): print 'Skipped: track titles in resplit-track-lists.csv' return input_filename, artist, album, track_lengths_correlation, log_comment, discogs_match elif row[1].isdigit(): print 'Requested serial from resplit list: {0}'.format(row[1]) resplit_serial = True output = [discogs.release(int(row[1]))] discogs_search = False break elif row[1]: print 'Re-split serial field non-numeric!' 
if discogs_search: query = ''.join(ch for ch in (artist+' '+album) if ch.isalnum() or ch in ' -/,\'') # Search Discogs for release ID/artist + album output = discogs.search(query, type='release') print 'Discogs results for "{0}":'.format(query) # Print track listing, tag FLAC files with titles accept_blank_tracklist_durations = False for search_loop in range(2): if accept_blank_tracklist_durations and requested_serial: output = discogs.search(query, type='release') time.sleep(config.discogs_request_interval) # Wait to comply with Discogs 60 requests/minute limit try: for i, result in enumerate(output): if i >= config.max_search_tries: accept_blank_tracklist_durations = True print 'No results with track times found! Let\'s get a bit fuzzier...' break time.sleep(config.discogs_request_interval) # Wait to comply with Discogs 60 requests/minute limit if hasattr(result, 'tracklist'): #and type(result) != discogs_client.Master: if resplit_serial or accept_blank_tracklist_durations: tracklist = [track for track in result.tracklist] else: tracklist = [track for track in result.tracklist if track.duration and to_seconds(track.duration) > 0] if not tracklist: print 'Result {0}: No tracks found. Continuing...'.format(str(i + 1)) continue elif len(tracklist) != len(result.tracklist): print 'Result {0}: Some tracks missing durations. Be careful!'.format(str(i + 1)) elif len(tracklist) == len(filename_matches): print '----------------------' print 'Result', str(i + 1) print 'Release ID:', result.data['id'] print 'Artist:', result.artists[0].name.encode('utf-8') print 'Album:', result.title.encode('utf-8') # Funky double handling if double_type in ['1/4', '1/3'] and filename_matches[0][-7] not in 'abcd': if not result.tracklist[0].position: print 'Result {0}: Nope! (No position info found in Discogs)'.format(str(i + 1)) continue if not result.tracklist[0].position[0].isalpha(): print 'Result {0}: Nope! 
(No side info in Discogs positions)'.format(str(i + 1)) continue if double_type == '1/4': sort_key = ['a', 'd', 'b', 'c'] if double_type == '1/3': sort_key = ['a', 'c', 'b', 'd'] # Reorder tracklist by alpha position key tracklist_sorted = [] for key in sort_key: for track in tracklist: if track.position.lower().startswith(key): tracklist_sorted.append(track) tracklist = tracklist_sorted if tracklist[0].duration and not resplit_serial: # Check correlation of Discogs track lengths with those of FLAC files discogs_lengths = [track.duration for track in tracklist if track.duration] flac_lengths = [] for match in filename_matches: audio = FLAC(os.path.join(config.split_folder, match)) flac_length = time.strftime('%M:%S', time.gmtime(audio.info.length)).lstrip('0') flac_lengths.append(flac_length) track_lengths_correlation = round(pearson_def(to_seconds(discogs_lengths), to_seconds(flac_lengths)),4) print 'Track lengths correlation:', track_lengths_correlation if track_lengths_correlation < config.min_correlation: print 'Result {0}: Low correlation. Best check yoself!'.format(str(i + 1)) continue # Write tags to FLAC files discogs_match = True for track, match in zip(tracklist, filename_matches): audio = FLAC(os.path.join(config.split_folder, match)) audio['tracknumber'] = track.position audio['title'] = track.title if 'artists' in track.data: # Add artist info to compilation album tracks audio['artist'] = track.data['artists'][0]['name'].split('(')[0].strip() flac_length = time.strftime('%M:%S', time.gmtime(audio.info.length)).lstrip('0') print track.position, track.title.encode('utf-8'), track.duration, '-->', match, flac_length audio.save() return input_filename, artist, album, track_lengths_correlation, log_comment, discogs_match else: print 'Result '+str(i + 1)+': Nope! 
('+str(len(tracklist))+' != '+str(len(filename_matches))+')' else: if len(output) == 0: # Don't retry fuzzy-style if there weren't any Discogs query matches log_comment = 'No matches for Discogs query!' return input_filename, artist, album, track_lengths_correlation, log_comment, discogs_match elif search_loop == 0: accept_blank_tracklist_durations = True print 'No results with track times found! Let\'s get a bit fuzzier...' except Exception as e: # httplib.BadStatusLine? print 'httplib.BadStatusLine (?) error:', e return None else: log_comment = 'No proper Discogs matches! Dork.' return input_filename, artist, album, track_lengths_correlation, log_comment, discogs_match input_filename_list = [] matches_count = 0 start_time = time.time() discogs_logged_in = False tries = 0 runtime_errors = [] # Logging setup if config.log_to_file: logging.basicConfig(filename=os.path.join(config.log_folder, 'log.csv'), format='%(levelname)s,%(message)s', level=logging.INFO) else: logging.basicConfig(format='%(levelname)s,%(message)s', level=logging.DEBUG) # Log into Discogs if not DISCOGS_MANUAL_AUTH: discogs = discogs_client.Client(CLIENT_NAME, CONSUMER_KEY, CONSUMER_SECRET, TOKEN, SECRET) else: while not discogs_logged_in: try: discogs = discogs_auth() if discogs == 'q': sys.exit() discogs_logged_in = True except: print 'Nope! Try again.' 
# Collect, print input filenames if config.serial_series: for f in config.serial_series: input_filename_list.append(str(f).zfill(5)) print 'Input filenames (series):', input_filename_list elif config.input_whole_folder: for f in os.listdir(config.split_folder): if f.endswith('.flac') or f.endswith('.mp3'): f = f.rsplit('_clean')[0] if f[-2] == '-': f = f[:-1] if f not in input_filename_list: input_filename_list.append(f) print 'Input filenames ('+str(len(input_filename_list))+'):', input_filename_list elif type(input_filenames) is not list: input_filename_list = input_filenames.split(',') else: input_filename_list = input_filenames files_count = len(input_filename_list) # Grab spreadsheet from Google Drive utils.export_csv('Boxes!') # Do the things while tries == 0 or runtime_errors and tries <= config.max_error_tries: if runtime_errors: print 'Runtime errors:', runtime_errors print '{0} runtime errors found! See above.'.format(len(runtime_errors)) input_filename_list = runtime_errors runtime_errors = [] for each in input_filename_list: result = add_serial_metadata(each) if result is None: log_comment = 'We tried. We httplib.BadStatusLined.' 
runtime_errors.append(each) else: input_filename, artist, album, track_lengths_correlation, log_comment, discogs_match = result if discogs_match: matches_count += 1 if result or tries > 3: # Log track data print log_comment artist = artist.translate(None, ',') if artist else '' album = album.translate(None, ',') if album else '' logging.info('%s,%s,%s,%s,%s', input_filename, artist, album, track_lengths_correlation, log_comment) tries += 1 # Now the items with manually-named track titles from resplit_track_lists.csv if input_resplit_list: with open(resplit_track_lists_path, 'r') as resplit_track_lists: config.split_folder_list = os.listdir(config.split_folder) server_directory_list = os.listdir(server_split_path) rowdata = [row for row in csv.reader(resplit_track_lists)] for row in rowdata: query = [''.join(x) for _, x in itertools.groupby(row[0], key=str.isdigit)] query = ''.join([query[0].zfill(5)] + query[1:]) # Check for filename query matches filename_matches = [os.path.join(config.split_folder, each) for each in config.split_folder_list if each.startswith(query)] if not filename_matches: filename_matches = [os.path.join(server_split_path, each) for each in server_directory_list if each.startswith(query)] if not filename_matches: print 'No split (_xx) audio file matches for resplit serial {0}! Dork.'.format(row[0]) continue # Write tags to FLAC files tracklist = [col for col in row[1:] if col] for position, (track, match) in enumerate(zip(tracklist, filename_matches), 1): audio = FLAC(os.path.join(config.split_folder, match)) audio['tracknumber'] = str(position) audio['title'] = track.decode('unicode-escape') flac_length = time.strftime('%M:%S', time.gmtime(audio.info.length)).lstrip('0') print position, flac_length, track, '-->', match audio.save() matches_count += 1 # Done? Great! accuracy = int(round(matches_count / float(files_count) * 100)) processing_time = int(round(time.time() - start_time)) print 'Great success! 
{0} files ({1} matches, {2}%) processed in {3}s.'.format(files_count, matches_count, accuracy, processing_time)
pass #print 'No cover found for item ', mp3_fname else: region = wave.open(region_name, 'w') region.setparams(ifile_params) region.writeframes(ifile.readframes(region_length)) region.close() ifile.close() if __name__ == '__main__': # Make input filename list logging.basicConfig(level=config.log_level) if config.input_resplit_list: utils.export_csv(['re-split_list', 'resplit-track-lists']) with open(config.resplit_list_path, 'r') as resplit_list: rows = csv.reader(resplit_list) for row in rows: ''' numeric_length = 50 for i, c in enumerate(row[0]): if not c.isdigit(): numeric_length = i break #print row[0][:numeric_length].zfill(5) + row[0][numeric_length:] + '_clean.wav' input_filename_list.append(row[0][:numeric_length].zfill(5) + row[0][numeric_length:] + '_clean.wav') ''' print row[0] input_filename_list.append(row[0] + '_clean.wav') # Remove bad split files from Julius
if not 'pool' in page_data: page_data['pool'] = pool # add the page to the file... if not page_wbs in relationships[file_wbs]['pages']: relationships[file_wbs]['pages'][page_wbs] = page_data else: print('no files found in saved data! \n\tPath: {}'.format(data['folders'])) #print('{}'.format(json.dumps(relationships, indent=4))) return object_types pickle_file = 'visio_data.pickle' data = get_pickle_data( pickle_file) object_types = summarize_data(data) #print(object_types) obj_model = { "fileGUID":None, "filename":None, "title":None, "creator":None, "pageGUID":None, "pagename":None, "objectype":None, "shapeGUID":None, "shapeID":None, "shapeName":None, "shapeType":None, "shapeText":None, "shapeCallouts":None, "shapeConnects":None, "shapeConnected":None, "shapeContain":None } #logger.info('Found {} object types'.format(len(object_types))) #print_items(obj=object_types) rows = [] for k,v in object_types.items(): rows.extend(v) export_csv(fields=obj_model, data=rows)
def main(input_resplit_list=False): def add_serial_metadata(input_filename): def discogs_auth(): authorize_token = None while not authorize_token: discogs = discogs_client.Client(CLIENT_NAME) discogs.set_consumer_key(CONSUMER_KEY, CONSUMER_SECRET) authorize_url = discogs.get_authorize_url() webbrowser.open(authorize_url[2]) authorize_token = raw_input("Enter authorize token (Or q to quit): ") if authorize_token.lower() == "q": return "q" access_token = discogs.get_access_token(authorize_token) logging.debug("access_token: {0}".format(access_token)) return discogs def average(x): assert len(x) > 0 return float(sum(x)) / len(x) def pearson_def(x, y): """ Correlation between listed Discogs release durations and split FLAC file durations. Would divide by zero if all durations are equal for x or y; set to return 1 instead. """ assert len(x) == len(y) n = len(x) assert n > 0 if n == 2: # Pearson's is useless for n < 3 return (min(x[0], y[0]) / float(max(x[0], y[0]))) * (min(x[1], y[1]) / float(max(x[1], y[1]))) avg_x = average(x) avg_y = average(y) diffprod = 0 xdiff2 = 0 ydiff2 = 0 for idx in range(n): xdiff = x[idx] - avg_x ydiff = y[idx] - avg_y diffprod += xdiff * ydiff xdiff2 += xdiff * xdiff ydiff2 += ydiff * ydiff return 1 if xdiff2 == 0 or xdiff2 == 0 else diffprod / math.sqrt(xdiff2 * ydiff2) def to_seconds(time_str_input): to_seconds_output = [] if type(time_str_input) is list: for i in time_str_input: i_split = i.split(":") to_seconds_output.append(int(i_split[0] or 0) * 60 + int(i_split[1])) return to_seconds_output elif type(time_str_input) is str or unicode: if ":" not in time_str_input: return int(time_str_input) i_split = time_str_input.split(":") return int(i_split[0] or 0) * 60 + int(i_split[1]) elif type(time_str_input) is int: return time_str_input else: raise Exception("Invalid to_seconds input type: {0}".format(type(time_str_input))) def boxes_pull(input_filename): # Pull artist, album info from boxes csv double_type = None input_filename = 
input_filename.lstrip("0") input_filename = input_filename.rsplit("-")[0] with open(boxes_path, "r") as boxes: spamreader = csv.reader(boxes) rowdata = [] for row in spamreader: rowdata.append(row) for row in rowdata: row_serial = row[0].translate(None, " ").lower() if row_serial == input_filename: print "Match!" print "RealRow:", row artist = row[1] album = row[2] if album.lower() == "self titled" or album.lower() == "self-titled": album = artist if row[5].lower() == "x": print "Double trouble!" if row[6].startswith("1/2") or row[6] == "": print "1/2!" double_type = "1/2" elif row[6].startswith("1/4"): print "Eeeek! 1/4!!" double_type = "1/4" elif row[5].lower() and row[5].lower() != "x": double_type = "other" if "live" in row[4].lower(): print "Eeeek! Live album!" return artist, album, double_type else: return None, None, None filename_matches = [] log_comment = "" artist = "" album = "" track_lengths_correlation = 0 discogs_match = False resplit_serial = False requested_serial = False print print "----------------------" print "Query:", input_filename # Check for filename query matches for each_file in os.listdir(flac_directory): if each_file.startswith(input_filename): # Skip 'a' matches for non-'a' files if len(input_filename) == 5 and each_file[5] != "_": continue filename_matches.append(each_file) if not filename_matches: print "No split (_xx) audio file matches! Dork." return input_filename, artist, album, track_lengths_correlation, log_comment, discogs_match # Check for artist tags, add from spreadsheet if enabled and necessary if use_boxes_csv: artist, album, double_type = boxes_pull(input_filename) if (artist, album) == (None, None): log_comment = "No spreadsheet match found." return input_filename, artist, album, track_lengths_correlation, log_comment, discogs_match elif double_type == "other": log_comment = "Freaky double! Engage manual tagging mode." 
if not input_resplit_list: return input_filename, artist, album, track_lengths_correlation, log_comment, discogs_match # Add tracknumber, album, artist to FLAC files for each_file in filename_matches: audio = FLAC(os.path.join(flac_directory, each_file)) audio["artist"] = artist audio["album"] = album audio.save() else: try: artist = FLAC(os.path.join(flac_directory, filename_matches[0]))["artist"][0] album = FLAC(os.path.join(flac_directory, filename_matches[0]))["album"][0] print "Metadata found. Artist:", artist, "Album:", album except: print "No artist/album tags; boxes spreadsheet not searched:", input_filename return input_filename, artist, album, track_lengths_correlation, log_comment discogs_search = True if input_resplit_list: # Check re-split_list.csv for specified Discogs release ID with open(resplit_list_path, "r") as resplit_list: rowdata = [row for row in csv.reader(resplit_list)] for row in rowdata: row_serial = row[0].translate(None, " ").lower() if row_serial.startswith(input_filename.lstrip("0")): if "csv" in row[1].lower(): print "Skipped: track titles in resplit-track-lists.csv" return input_filename, artist, album, track_lengths_correlation, log_comment, discogs_match elif row[1].isdigit(): print "Requested serial from resplit list: {0}".format(row[1]) resplit_serial = True output = [discogs.release(int(row[1]))] discogs_search = False break elif row[1]: print "Re-split serial field non-numeric!" 
if discogs_search: query = "".join(ch for ch in (artist + " " + album) if ch.isalnum() or ch in " -/,'") # Search Discogs for release ID/artist + album output = discogs.search(query, type="release") print 'Discogs results for "{0}":'.format(query) # Print track listing, tag FLAC files with titles accept_blank_tracklist_durations = False for search_loop in range(2): if accept_blank_tracklist_durations and requested_serial: output = discogs.search(query, type="release") time.sleep(discogs_request_interval) # Wait to comply with Discogs 60 requests/minute limit try: for i, result in enumerate(output): if i >= max_search_tries: accept_blank_tracklist_durations = True print "No results with track times found! Let's get a bit fuzzier..." break time.sleep(discogs_request_interval) # Wait to comply with Discogs 60 requests/minute limit if hasattr(result, "tracklist"): # and type(result) != discogs_client.Master: if resplit_serial or accept_blank_tracklist_durations: tracklist = [track for track in result.tracklist] else: tracklist = [ track for track in result.tracklist if track.duration and to_seconds(track.duration) > 0 ] if not tracklist: print "Result {0}: No tracks found. Continuing...".format(str(i + 1)) continue elif len(tracklist) != len(result.tracklist): print "Result {0}: Some tracks missing durations. Be careful!".format(str(i + 1)) elif len(tracklist) == len(filename_matches): print "----------------------" print "Result", str(i + 1) print "Release ID:", result.data["id"] print "Artist:", result.artists[0].name.encode("utf-8") print "Album:", result.title.encode("utf-8") # Funky double handling if double_type in ["1/4", "1/3"] and filename_matches[0][-7] not in "abcd": if not result.tracklist[0].position: print "Result {0}: Nope! (No position info found in Discogs)".format(str(i + 1)) continue if not result.tracklist[0].position[0].isalpha(): print "Result {0}: Nope! 
(No side info in Discogs positions)".format(str(i + 1)) continue if double_type == "1/4": sort_key = ["a", "d", "b", "c"] if double_type == "1/3": sort_key = ["a", "c", "b", "d"] # Reorder tracklist by alpha position key tracklist_sorted = [] for key in sort_key: for track in tracklist: if track.position.lower().startswith(key): tracklist_sorted.append(track) tracklist = tracklist_sorted if tracklist[0].duration and not resplit_serial: # Check correlation of Discogs track lengths with those of FLAC files discogs_lengths = [track.duration for track in tracklist if track.duration] flac_lengths = [] for match in filename_matches: audio = FLAC(os.path.join(flac_directory, match)) flac_length = time.strftime("%M:%S", time.gmtime(audio.info.length)).lstrip("0") flac_lengths.append(flac_length) track_lengths_correlation = round( pearson_def(to_seconds(discogs_lengths), to_seconds(flac_lengths)), 4 ) print "Track lengths correlation:", track_lengths_correlation if track_lengths_correlation < min_correlation: print "Result {0}: Low correlation. Best check yoself!".format(str(i + 1)) continue # Write tags to FLAC files discogs_match = True for track, match in zip(tracklist, filename_matches): audio = FLAC(os.path.join(flac_directory, match)) audio["tracknumber"] = track.position audio["title"] = track.title if "artists" in track.data: # Add artist info to compilation album tracks audio["artist"] = track.data["artists"][0]["name"].split("(")[0].strip() flac_length = time.strftime("%M:%S", time.gmtime(audio.info.length)).lstrip("0") print track.position, track.title.encode( "utf-8" ), track.duration, "-->", match, flac_length audio.save() return input_filename, artist, album, track_lengths_correlation, log_comment, discogs_match else: print "Result " + str(i + 1) + ": Nope! 
(" + str(len(tracklist)) + " != " + str( len(filename_matches) ) + ")" else: if len(output) == 0: # Don't retry fuzzy-style if there weren't any Discogs query matches log_comment = "No matches for Discogs query!" return input_filename, artist, album, track_lengths_correlation, log_comment, discogs_match elif search_loop == 0: accept_blank_tracklist_durations = True print "No results with track times found! Let's get a bit fuzzier..." except Exception as e: # httplib.BadStatusLine? print "httplib.BadStatusLine (?) error:", e return None else: log_comment = "No proper Discogs matches! Dork." return input_filename, artist, album, track_lengths_correlation, log_comment, discogs_match # Initial variables config = ConfigParser() config.read(CONFIG_LOCATION) input_filename_series = config.getboolean("discogs", "input_filename_series") input_whole_folder = config.getboolean("discogs", "input_whole_folder") use_boxes_csv = config.getboolean("discogs", "use_boxes_csv") input_filenames = config.get("discogs", "input_filenames") input_filename_list_start = config.getint("discogs", "input_filename_list_start") input_filename_list_end = config.getint("discogs", "input_filename_list_end") flac_directory = config.get("discogs", "flac_directory") boxes_path = config.get("general", "boxes_path") log_to_file = config.getboolean("discogs", "log_to_file") log_path = config.get("general", "log_path") discogs_request_interval = config.getfloat("discogs", "discogs_request_interval") # seconds between API requests resplit_list_path = config.get("general", "resplit_list_path") min_correlation = config.getfloat("discogs", "min_correlation") max_search_tries = config.getint("discogs", "max_search_tries") max_error_tries = config.getint("discogs", "max_error_tries") resplit_track_lists_path = config.get("general", "resplit_track_lists_path") input_resplit_list = config.getboolean("split", "input_resplit_list") server_split_path = config.get("general", "server_split_path") 
input_filename_list_range = range(input_filename_list_start, input_filename_list_end) input_filename_list = [] matches_count = 0 start_time = time.time() discogs_logged_in = False tries = 0 runtime_errors = [] # Logging setup if log_to_file: logging.basicConfig( filename=os.path.join(log_path, "log.csv"), format="%(levelname)s,%(message)s", level=logging.INFO ) else: logging.basicConfig(format="%(levelname)s,%(message)s", level=logging.DEBUG) # Log into Discogs if not DISCOGS_MANUAL_AUTH: discogs = discogs_client.Client(CLIENT_NAME, CONSUMER_KEY, CONSUMER_SECRET, TOKEN, SECRET) else: while not discogs_logged_in: try: discogs = discogs_auth() if discogs == "q": sys.exit() discogs_logged_in = True except: print "Nope! Try again." # Collect, print input filenames if input_filename_series: for f in input_filename_list_range: input_filename_list.append(str(f).zfill(5)) print "Input filenames (series):", input_filename_list elif input_whole_folder: for f in os.listdir(flac_directory): if f.endswith(".flac") or f.endswith(".mp3"): f = f.rsplit("_clean")[0] if f[-2] == "-": f = f[:-1] if f not in input_filename_list: input_filename_list.append(f) print "Input filenames (" + flac_directory + "):", input_filename_list elif type(input_filenames) is not list: input_filename_list = input_filenames.split(",") else: input_filename_list = input_filenames files_count = len(input_filename_list) # Grab spreadsheet from Google Drive utils.export_csv("Boxes!") # Do the things while tries == 0 or runtime_errors and tries <= max_error_tries: if runtime_errors: print "Runtime errors:", runtime_errors print "{0} runtime errors found! See above.".format(len(runtime_errors)) input_filename_list = runtime_errors runtime_errors = [] for each in input_filename_list: result = add_serial_metadata(each) if result is None: log_comment = "We tried. We httplib.BadStatusLined." 
runtime_errors.append(each) else: input_filename, artist, album, track_lengths_correlation, log_comment, discogs_match = result if discogs_match: matches_count += 1 if result or tries > 3: # Log track data print log_comment artist = artist.translate(None, ",") if artist else "" album = album.translate(None, ",") if album else "" logging.info("%s,%s,%s,%s,%s", input_filename, artist, album, track_lengths_correlation, log_comment) tries += 1 # Now the items with manually-named track titles from resplit_track_lists.csv if input_resplit_list: with open(resplit_track_lists_path, "r") as resplit_track_lists: flac_directory_list = os.listdir(flac_directory) server_directory_list = os.listdir(server_split_path) rowdata = [row for row in csv.reader(resplit_track_lists)] for row in rowdata: query = ["".join(x) for _, x in itertools.groupby(row[0], key=str.isdigit)] query = "".join([query[0].zfill(5)] + query[1:]) # Check for filename query matches filename_matches = [ os.path.join(flac_directory, each) for each in flac_directory_list if each.startswith(query) ] if not filename_matches: filename_matches = [ os.path.join(server_split_path, each) for each in server_directory_list if each.startswith(query) ] if not filename_matches: print "No split (_xx) audio file matches for resplit serial {0}! Dork.".format(row[0]) continue # Write tags to FLAC files tracklist = [col for col in row[1:] if col] for position, (track, match) in enumerate(zip(tracklist, filename_matches), 1): audio = FLAC(os.path.join(flac_directory, match)) audio["tracknumber"] = str(position) audio["title"] = track.decode("unicode-escape") flac_length = time.strftime("%M:%S", time.gmtime(audio.info.length)).lstrip("0") print position, flac_length, track, "-->", match audio.save() matches_count += 1 # Done? Great! accuracy = int(round(matches_count / float(files_count) * 100)) processing_time = int(round(time.time() - start_time)) print "Great success! 
{0} files ({1} matches, {2}%) processed in {3}s.".format( files_count, matches_count, accuracy, processing_time )
def export_product():
    """Hand the result of ``utils.export_csv()`` to ``send_file`` and return its response."""
    exported = utils.export_csv()
    response = send_file(exported)
    return response
def create_csv(self):
    """Refresh the data via ``get_data`` and export header and body to ``vultr.csv``."""
    output_name = 'vultr.csv'
    # get_data() runs first, before header/body are read for the export
    self.get_data()
    header, body = self.header, self.body
    utils.export_csv(header, body, output_name)
def export_classification(years, kwLimit, min_edge_th, dispth, ethunit):
    """Export patent/community probabilities, keyword attributes and patent measures as CSV.

    Loads the pickled keyword graph and its community clusterings for the year
    window, then writes three ';'-separated files into a result directory under
    ``classification/``:

    * ``probas_*.csv``   - per patent, the count of its keywords in each community
    * ``keywords_*.csv`` - per keyword, its community and graph attributes
    * ``patent_*.csv``   - per patent, the summed attributes of its classified keywords

    Args:
        years: ordered list of application years as strings; the first and last
            entries name the window.
        kwLimit, min_edge_th, dispth, ethunit: parameters that identify the
            pickled graph file and are embedded in the output file names.

    Raises:
        OSError: if the result directory cannot be created.
    """
    first_year, last_year = years[0], years[-1]
    yearrange = first_year + "-" + last_year
    # Shared tail of the three output file names
    suffix = '_kwLimit' + str(kwLimit) + '_dispth' + str(dispth) + '_ethunit' + str(ethunit)

    resdir = ('classification/classification_window' + str(int(last_year) - int(first_year) + 1)
              + '_kwLimit' + str(int(kwLimit)) + '_dispth' + str(dispth) + '_ethunit' + str(ethunit))
    try:
        os.makedirs(resdir)
    except OSError:
        # Only an already-existing directory is fine; anything else is a real failure
        if not os.path.isdir(resdir):
            raise
        print("res dir exists")

    print("Constructing patent probas for years " + str(years))
    mongo = pymongo.MongoClient(utils.get_parameter('mongopath', True, True))
    keywords_collection = mongo['patent']['keywords']
    year_filter = {"app_year": {"$in": years}}

    # load graph and construct communities
    # NOTE: pickle.load executes arbitrary code; only load files produced by this project.
    pickle_path = ('pickled/filteredgraphcoms_' + yearrange + '_' + str(kwLimit) + '_eth' + str(min_edge_th)
                   + '_dispth' + str(dispth) + '_ethunit' + str(ethunit) + '.pkl')
    with open(pickle_path, 'rb') as pickled:
        graph, coms = pickle.load(pickled)

    # best clustering is the last one
    clustering = coms[-1]

    # Fetch the vertex names once instead of on every loop iteration
    names = graph.vs['name']
    vertex_ids = range(graph.vcount())

    # construct dico kw -> community
    membership = clustering.membership
    dico = {}
    for n in vertex_ids:
        dico[names[n]] = membership[n]

    ncommunities = len(clustering.sizes())

    probas = []
    rownames = []
    counts = []

    patents = keywords_collection.find(year_filter, no_cursor_timeout=True)
    try:
        npatents = patents.count()
        for i, currentpatent in enumerate(patents):
            if i % 10000 == 0:
                print('probas : ' + str(100 * i / npatents))
            currentprobas = [0.0] * ncommunities
            for kw in currentpatent['keywords']:
                if kw in dico:
                    currentprobas[dico[kw]] += 1
            if sum(currentprobas) > 0:
                probas.append(currentprobas)
                rownames.append(currentpatent['id'])
                counts.append(len(currentpatent['keywords']))
    finally:
        # Cursors opened with no_cursor_timeout are never reaped by the server
        patents.close()

    # export the matrix proba as csv
    utils.export_matrix_sparse_csv(
        probas, [rownames, counts], resdir + '/probas_' + yearrange + suffix + '.csv', ";")

    # add attributes to keywords
    degree = graph.degree(vertex_ids)
    evcentrality = graph.eigenvector_centrality(weights='weight')
    bcentrality = graph.betweenness(weights='weight')
    ccentrality = graph.closeness(weights='weight')
    weighteddegree = graph.strength(vertex_ids, weights='weight')
    tidf = graph.vs['tidf']
    disp = graph.vs['disp']
    docfreq = graph.vs['docfreq']
    termhood = graph.vs['termhood']

    kwattrsdico = {}
    for n in vertex_ids:
        kwattrsdico[names[n]] = [
            tidf[n], disp[n], docfreq[n], termhood[n], degree[n],
            weighteddegree[n], bcentrality[n], ccentrality[n], evcentrality[n]
        ]

    kwdata = []
    for currentkw in dico:
        if currentkw in kwattrsdico:
            kwdata.append([currentkw, dico[currentkw]] + kwattrsdico[currentkw])

    # export keywords as csv
    utils.export_csv(
        kwdata, resdir + '/keywords_' + yearrange + suffix + '.csv', ';',
        'keyword;community;tidf;technodispersion;docfreq;termhood;degree;weighteddegree;betweennesscentrality;closenesscentrality;eigenvectorcentrality'
    )

    # Patent measures
    measures = []
    # Number of attributes per keyword; 0 for an empty graph instead of an IndexError
    nmeasures = len(next(iter(kwattrsdico.values()), []))

    patents = keywords_collection.find(year_filter, no_cursor_timeout=True)
    try:
        for i, currentpatent in enumerate(patents):
            #if i%10000==0 : print('patent measures : '+str(100*i/npatents))
            print('patent measures : ' + str(100 * i / npatents))
            currentmeasures = [0.0] * nmeasures
            kwnum = 0
            for kw in currentpatent['keywords']:
                if kw in kwattrsdico:
                    # zip avoids an index variable named `i`, which under
                    # Python 2 leaked out and clobbered the progress counter.
                    currentmeasures = [
                        total + value for total, value in zip(currentmeasures, kwattrsdico[kw])
                    ]
                    kwnum = kwnum + 1
            nk = len(currentpatent['keywords'])
            if sum(currentmeasures) != 0:
                measures.append([currentpatent['id'], nk, kwnum] + currentmeasures)
    finally:
        patents.close()

    # export measures
    utils.export_csv(
        measures, resdir + '/patent_' + yearrange + suffix + '.csv', ';',
        'patent;kws;classkws;tidf;technodispersion;docfreq;termhood;degree;weighteddegree;betweennesscentrality;closenesscentrality;eigenvectorcentrality'
    )
def create_csv(self):
    """Refresh the data via ``get_data`` and export header and body to ``digitalocean.csv``."""
    output_name = 'digitalocean.csv'
    # get_data() runs first, before header/body are read for the export
    self.get_data()
    header, body = self.header, self.body
    utils.export_csv(header, body, output_name)
# Each entry bundles one browser handle with the credentials used to log it in.
browser_arr = [browser, USER_ID, PASS_WORD]
browser_arr_2 = [browser_2, USER_ID_2, PASS_WORD_2]
browsers = [browser_arr, browser_arr_2]

if __name__ == '__main__':
    # Log every browser in, set its wait time and check its current URL.
    for browser_param in browsers:
        utils.login(browser_param[1], browser_param[2], browser_param[0])
        utils.set_wait_time(TIME_TO_WAIT, browser_param[0])
        utils.check_current_url(browser_param[0])

    # If the CSV does not exist yet, collect all URLs into a list and export them as a CSV
    if os.path.exists(URL_PATH) == False:
        utils.move_to_company_list(browsers[0][0])
        url_arr = utils.get_url(NUMBER_OF_COMPANY, browsers[0][0])
        utils.export_csv(url_arr, URL_PATH)
        utils.browser_close(browsers[0][0])
    else:
        # If the CSV already exists, read it and store its contents in url_arr
        url_arr = utils.import_csv(URL_PATH)

    # Connect to the database
    connector = MySQLdb.connect(
        unix_socket = DB_UNIX_SOCKET,
        host=DB_HOST,
        user=DB_USER,
        passwd=DB_PASS_WORD,
        db=DB_NAME
    )
    corsor = connector.cursor()

    # Split the URL list into one chunk per browser
    url_arrs = list(np.array_split(url_arr, NUMBER_OF_BROWSERS))
    print(len(url_arrs[0]))
import os
from getFiles import scan_folders
from utils import export_csv

# Path components of the project folder to scan; the last one is also passed
# to scan_folders as its second argument.
folders = ['H:\\', 'IT', 'Projects', '2018', 'IT2018051 HRIS']
folder = os.path.join(*folders)

print('Scanning...\n\t{} -> {}'.format(folder, folders[-1]))
data = scan_folders(folder, folders[-1])

# Write the file list only when the scan reported at least one file
if not len(data.get('files', [])) <= 0:
    found_files = data.get('files', [])
    export_csv(data=found_files, filename='filelist.csv')