def convert_to_libsvm(lines):
    '''
    Converts a sequence of lines (e.g., a file or list of strings) in MegaM
    format to LibSVM format.

    Blank lines, comment lines (starting with ``#``) and the ``TEST``/``DEV``
    separator lines produce no output. Every other line becomes one output
    string: the class number followed by ``number:value`` pairs sorted by
    feature number. Values of ``N/A`` are treated as 0.0, and features whose
    value is zero are left out of the output.

    :param lines: The sequence of lines to convert.
    :type lines: L{file} or L{list} of L{str}

    :return: A tuple of the newly formatted data, the mappings from class
             names to numbers, and the mappings from feature names to numbers.
    :rtype: 3-L{tuple} of (L{list} of L{unicode}, L{dict}, and L{dict})

    :raises AssertionError: if a feature that was already written out for a
                            line occurs again on that same line.
    :raises ValueError: if a feature value is neither ``N/A`` nor parseable
                        by ``float``.
    '''
    # Initialize variables
    field_num_dict = UniqueNumberDict()
    class_num_dict = UniqueNumberDict()
    result_list = []

    # Iterate through MegaM file
    for line in lines:
        # Process encoding
        line = UnicodeDammit(line,
                             ['utf-8', 'windows-1252']).unicode_markup.strip()

        # Ignore blank lines, comments and TEST/DEV lines. Blank lines used
        # to fall through and fail with an IndexError on split_line[0].
        if not line or line.startswith('#') or line in ('TEST', 'DEV'):
            continue

        split_line = line.split()
        output_fields = ['{0}'.format(class_num_dict[split_line[0]])]
        line_fields = set()

        # Tokens after the class label alternate name, value, name, value...
        # Both generators are consumed in lockstep by zip, so feature numbers
        # are handed out in order of first appearance.
        field_nums = (field_num_dict[field_name]
                      for field_name in islice(split_line, 1, None, 2))
        field_values = (0.0 if raw_value == 'N/A' else float(raw_value)
                        for raw_value in islice(split_line, 2, None, 2))

        # LibSVM output lists the features in increasing feature number
        for field_num, value in sorted(zip(field_nums, field_values)):
            # Check for duplicates
            if field_num in line_fields:
                # Reverse lookup of the name for the error message. The
                # builtin next() works on Python 2.6+ and 3, unlike the
                # Python-2-only generator.next() method.
                field_name = next(name
                                  for name, f_num in field_num_dict.items()
                                  if f_num == field_num)
                raise AssertionError(
                    "Field {} occurs on same line twice.".format(field_name))
            # Otherwise output non-empty features
            if value:
                output_fields.append('{}:{}'.format(field_num, value))
                line_fields.add(field_num)

        result_list.append(' '.join(output_fields))

    return result_list, class_num_dict, field_num_dict
def _sub_read(self, f):
    '''
    Generator over the examples found in ``f``.

    Lines starting with ``#`` set the ID used for the next example. Blank
    lines and the ``TRAIN``/``TEST``/``DEV`` separator lines are skipped.
    Every other line is split on whitespace: an odd number of columns means
    a class label followed by feature-value pairs, an even number means
    feature-value pairs only (the class is then ``None``).

    :param f: Iterable of lines; non-text lines are decoded first.

    :return: Yields ``(example ID, class, feature dict)`` tuples.

    :raises ValueError: if a feature name is repeated within one example.
    '''
    separators = ('TRAIN', 'TEST', 'DEV')
    next_default = 0
    example_id = 'EXAMPLE_0'

    for raw in f:
        # Process encoding
        if isinstance(raw, text_type):
            text = raw
        else:
            text = UnicodeDammit(raw, ['utf-8', 'windows-1252']).unicode_markup
        text = text.strip()

        # A comment line only names the upcoming example
        if text.startswith('#'):
            example_id = text[1:].strip()
            continue

        # Nothing to emit for blank or separator lines
        if not text or text in separators:
            continue

        tokens = text.split()

        if len(tokens) % 2 == 1:
            # Leading class label, optionally followed by pairs
            label = safe_float(tokens[0], replace_dict=self.class_map)
            pairs = tokens[1:]
        else:
            # Feature-value pairs only
            label = None
            pairs = tokens

        features = {}
        if pairs:
            names = islice(pairs, 0, None, 2)
            # Values go through safe_float; the original note says that
            # otherwise the features would be categorical
            values = (safe_float(token) for token in islice(pairs, 1, None, 2))
            features.update(zip(names, values))

            # A repeated name collapses into one key, so the dictionary ends
            # up smaller than the number of pairs
            expected = len(pairs) / 2
            if len(features) != expected:
                message = ('There are duplicate feature names in {} for '
                           'example {}.')
                raise ValueError(message.format(self.path_or_list,
                                                example_id))

        yield example_id, label, features

        # Default ID for the next example, in case no ID line precedes it
        next_default += 1
        example_id = 'EXAMPLE_{}'.format(next_default)
def _sub_read(self, f):
    '''
    Yield one ``(ID, class, features)`` tuple per example line in ``f``.

    A line beginning with ``#`` supplies the ID of the following example;
    when none is given, ``EXAMPLE_<n>`` is used, where ``n`` counts the
    examples yielded so far. Empty lines and lines consisting only of
    ``TRAIN``, ``TEST`` or ``DEV`` are ignored.

    :param f: Iterable of lines; anything that is not already text is
              decoded before parsing.

    :return: Generator of ``(ID, class, feature dict)`` tuples. The class is
             ``None`` for lines with an even number of columns.

    :raises ValueError: if one example lists the same feature name twice.
    '''
    ignored_lines = ['TRAIN', 'TEST', 'DEV']
    encodings = ['utf-8', 'windows-1252']
    counter = 0
    current_id = 'EXAMPLE_0'

    for entry in f:
        # Process encoding
        if not isinstance(entry, text_type):
            entry = UnicodeDammit(entry, encodings).unicode_markup
        entry = entry.strip()

        if entry.startswith('#'):
            # ID line for the next example
            current_id = entry[1:].strip()
        elif entry and entry not in ignored_lines:
            columns = entry.split()
            has_label = len(columns) % 2 == 1

            # An odd column count means the first column is the class label
            if has_label:
                class_label = safe_float(columns[0],
                                         replace_dict=self.class_map)
            else:
                class_label = None
            pair_tokens = columns[1:] if has_label else columns

            if pair_tokens:
                # Even positions hold names, odd positions hold values;
                # values are run through safe_float
                feature_map = dict(
                    zip(islice(pair_tokens, 0, None, 2),
                        (safe_float(item)
                         for item in islice(pair_tokens, 1, None, 2))))

                # Fewer keys than pairs means a name was repeated
                if len(feature_map) != len(pair_tokens) / 2:
                    raise ValueError(
                        'There are duplicate feature names in {} for '
                        'example {}.'.format(self.path_or_list, current_id))
            else:
                feature_map = {}

            yield current_id, class_label, feature_map

            # Fall back to a generated ID unless an ID line shows up first
            counter += 1
            current_id = 'EXAMPLE_{}'.format(counter)
def convert_to_libsvm(lines):
    '''
    Converts a sequence of lines (e.g., a file or list of strings) in MegaM
    format to LibSVM format.

    Blank lines, comment lines (starting with ``#``) and the ``TEST``/``DEV``
    separator lines produce no output. Every other line becomes one output
    string: the class number followed by ``number:value`` pairs sorted by
    feature number. Values of ``N/A`` are treated as 0.0, and features whose
    value is zero are left out of the output.

    :param lines: The sequence of lines to convert.
    :type lines: L{file} or L{list} of L{str}

    :return: A tuple of the newly formatted data, the mappings from class
             names to numbers, and the mappings from feature names to numbers.
    :rtype: 3-L{tuple} of (L{list} of L{unicode}, L{dict}, and L{dict})

    :raises AssertionError: if a feature that was already written out for a
                            line occurs again on that same line.
    :raises ValueError: if a feature value is neither ``N/A`` nor parseable
                        by ``float``.
    '''
    # Initialize variables
    field_num_dict = UniqueNumberDict()
    class_num_dict = UniqueNumberDict()
    result_list = []

    # Iterate through MegaM file
    for line in lines:
        # Process encoding
        line = UnicodeDammit(line,
                             ['utf-8', 'windows-1252']).unicode_markup.strip()

        # Ignore blank lines, comments and TEST/DEV lines. Blank lines used
        # to fall through and fail with an IndexError on split_line[0].
        if not line or line.startswith('#') or line in ('TEST', 'DEV'):
            continue

        split_line = line.split()
        output_fields = ['{0}'.format(class_num_dict[split_line[0]])]
        line_fields = set()

        # Tokens after the class label alternate name, value, name, value...
        # Both generators are consumed in lockstep by zip, so feature numbers
        # are handed out in order of first appearance.
        field_nums = (field_num_dict[field_name]
                      for field_name in islice(split_line, 1, None, 2))
        field_values = (0.0 if raw_value == 'N/A' else float(raw_value)
                        for raw_value in islice(split_line, 2, None, 2))

        # LibSVM output lists the features in increasing feature number
        for field_num, value in sorted(zip(field_nums, field_values)):
            # Check for duplicates
            if field_num in line_fields:
                # Reverse lookup of the name for the error message. The
                # builtin next() works on Python 2.6+ and 3, unlike the
                # Python-2-only generator.next() method.
                field_name = next(name
                                  for name, f_num in field_num_dict.items()
                                  if f_num == field_num)
                raise AssertionError(
                    "Field {} occurs on same line twice.".format(field_name))
            # Otherwise output non-empty features
            if value:
                output_fields.append('{}:{}'.format(field_num, value))
                line_fields.add(field_num)

        result_list.append(' '.join(output_fields))

    return result_list, class_num_dict, field_num_dict
def on_pubmsg(self, c, e):
    '''
    React to a single chat message.

    The message is checked against a fixed sequence of triggers (direct
    commands, table flips, chaos stars, wiki lookups, spelling corrections,
    link handling, ETA registration, ...). Most triggers end processing of
    the message once they have fired.

    :param c: Not used by this handler.
    :param e: Event object; ``e.source.nick``, ``e.target`` and
              ``e.arguments[0]`` are read.
    '''
    nick = e.source.nick
    target = e.target if is_channel(e.target) else nick

    def reply(msg):
        # Answer wherever the message came from
        self.send(target, msg)

    def dm(msg):
        # Answer the sender privately
        self.send(nick, msg)

    line = UnicodeDammit(e.arguments[0]).unicode_markup
    log(' \033[37m{}→{}\033[0m'.format(nick, line))

    # "<botnick>: command" is handed over to the command dispatcher
    addressed = line.split(":", 1)
    if len(addressed) > 1 and addressed[0].lower() == self.nick:
        self.do_command(e, addressed[1].strip().lower(), nick, target,
                        reply, dm)
        return

    # zeltofilter
    if 'zeltoph' in nick:
        return

    # Nicks listed in settings.VIPS get kicked with the configured probability
    kick_probability = settings.VIPS.get(nick, 0)
    if random() < kick_probability:
        self.kick(nick)

    # Put flipped tables back
    if re.match('.*┻━┻.*', line):
        reply('┬─┬ノ(ಠ_ಠノ)')
        return

    # The nick is escaped so that regex metacharacters in it are matched
    # literally instead of altering the pattern.
    chaos_pattern = '^({} *:)? *chaos-?([☆★☼☀*]|sternchen) *: ?(.*)$'.format(
        re.escape(self.nick))
    match = re.match(chaos_pattern, line)
    if match:
        newcs = match.group(3)
        self.chaossternchen.append(newcs)
        self.sendchan('Chaos-☆ Nr. {} notiert: {}'.format(
            len(self.chaossternchen), newcs))
        return

    if line.startswith('.wiki '):
        wikipage = line[len('.wiki '):].strip()
        # Only plain page names are turned into a URL
        if re.match(r'^[-_+\w]+$', wikipage):
            wikiurl = 'http://afra-berlin.de/dokuwiki/doku.php?id={}'.format(
                wikipage)
            # The wiki shows this notice for pages that do not exist
            if 'Dieses Thema existiert noch nicht' in requests.get(wikiurl).text:
                reply("I'm sorry, I can't find a wiki page with that name.")
            else:
                reply(wikiurl)
        else:
            reply('Try to troll somebot else.')
        return

    if line == 'wat?':
        reply("I don't have a clue.")
        return

    # NOTE(review): the pattern ends in a literal space, so the greeting only
    # matches when it is followed by one -- confirm this is intended.
    if re.match('^hail eris[.!]* ', line.lower()):
        reply("All Hail Discordia!")
        return

    # Kick on anything that looks like a misspelling of "Gandhi"
    candidates = re.findall(r'(^|\s)?(gh?ah?nh?dh?ih?)(\s|$)?', line,
                            re.IGNORECASE)
    for _before, spelling, _after in candidates:
        if not re.match(r'(^|\s)?gandhi(\s|$)?', spelling, re.IGNORECASE):
            self.kick(nick, "It's spelled Gandhi")
            return

    if re.search('https?://[-a-z0-9.]*facebook.com', line.lower()):
        reply('A facebook link? srsly? Get some self-respect!')
        return

    match = re.search(r'https?://pr0gramm.com/#(newest/\*/[0-9/]*)',
                      line.lower())
    if match:
        reply('Fixed that pr0gramm link for you: http://pr0gramm.com/static/'
              + match.group(1))
        return

    # The fifth consecutive "moin" is answered; any other message resets
    # the streak.
    if line == 'moin':
        self.moincount += 1
        if self.moincount == 5:
            reply('moin')
        return
    self.moincount = 0

    command = line.lstrip('.!#')
    if command.startswith('eta '):
        # Slice the prefix-stripped text: slicing the raw line cut at the
        # wrong offset whenever more than one prefix character was used.
        eta = command[len('eta '):].strip()
        with self.db as db:
            # An empty ETA just clears the previous entry
            db.execute("DELETE FROM etas WHERE nick=?", (nick,))
            if eta:
                db.execute("INSERT INTO etas VALUES (DATETIME('now'), ?, ?)",
                           (nick, eta))
        dm('ETA registered. Thanks!')
        return

    # Announce the page title of every URL in the message
    for url, *_ in re.findall(URL_REGEX, line.lower()):
        try:
            # URLs come from arbitrary users: bound the wait and treat any
            # network failure as "no title" instead of aborting the handler
            res = requests.get(url, timeout=10)
        except requests.RequestException:
            continue
        if res.status_code == requests.codes.ok:
            soup = BeautifulSoup(res.text)
            # Pages without a <title>, or with markup nested inside it,
            # have no title string to announce
            title = soup.title.string if soup.title is not None else None
            if title:
                # Collapse whitespace so the reply stays on a single line
                title = ' '.join(title.split())
                if title:
                    reply(title)

    # Occasionally nag about the capitalisation of "AfRA"
    mentions = re.findall(r'(^|\s)(afra)(\s|$)', line, re.IGNORECASE)
    for _before, mention, _after in mentions:
        if mention not in ('AfRA', 'afra') and random() < 0.1:
            reply("I'm sure you meant AfRA, not " + mention)
            return