def _sub_read(self, f):
    example_num = 0
    curr_id = 'EXAMPLE_0'
    for line in f:
        # Process encoding
        if not isinstance(line, text_type):
            line = UnicodeDammit(line,
                                 ['utf-8', 'windows-1252']).unicode_markup
        line = line.strip()
        # Handle instance lines
        if line.startswith('#'):
            curr_id = line[1:].strip()
        elif line and line not in ['TRAIN', 'TEST', 'DEV']:
            split_line = line.split()
            num_cols = len(split_line)
            del line
            # Line is just a class label
            if num_cols == 1:
                class_name = safe_float(split_line[0],
                                        replace_dict=self.class_map)
                field_pairs = []
            # Line has a class label and feature-value pairs
            elif num_cols % 2 == 1:
                class_name = safe_float(split_line[0],
                                        replace_dict=self.class_map)
                field_pairs = split_line[1:]
            # Line just has feature-value pairs
            elif num_cols % 2 == 0:
                class_name = None
                field_pairs = split_line

            curr_info_dict = {}
            if len(field_pairs) > 0:
                # Get the current instance's feature-value pairs
                field_names = islice(field_pairs, 0, None, 2)
                # Convert values to floats, because otherwise the features
                # would be treated as categorical
                field_values = (safe_float(val) for val in
                                islice(field_pairs, 1, None, 2))
                # Add the feature-value pairs to the dictionary
                curr_info_dict.update(zip(field_names, field_values))

                if len(curr_info_dict) != len(field_pairs) / 2:
                    raise ValueError('There are duplicate feature names in '
                                     '{} for example {}.'.format(
                                         self.path_or_list, curr_id))

            yield curr_id, class_name, curr_info_dict

            # Set default example ID for the next instance, in case we see
            # a line without an ID.
            example_num += 1
            curr_id = 'EXAMPLE_{}'.format(example_num)
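# A minimal sketch (not from the original source) of the MegaM-style input
# that _sub_read expects: '#' lines carry instance IDs, and data lines hold
# an optional label followed by alternating feature names and values. The
# feature names, values, and labels below are hypothetical.
megam_example = [
    '# EXAMPLE_A',                   # instance ID for the next data line
    'spam height 5.2 weight 120.0',  # label + two feature-value pairs
    'ham',                           # label only, no features
]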
def convert_to_libsvm(lines):
    '''
    Converts a sequence of lines (e.g., a file or list of strings) in MegaM
    format to LibSVM format.

    :param lines: The sequence of lines to convert.
    :type lines: L{file} or L{list} of L{str}

    :return: A tuple of the newly formatted data, the mappings from class
             names to numbers, and the mappings from feature names to
             numbers.
    :rtype: 3-L{tuple} of (L{list} of L{unicode}, L{dict}, and L{dict})
    '''
    # Initialize variables
    field_num_dict = UniqueNumberDict()
    class_num_dict = UniqueNumberDict()

    result_list = []
    # Iterate through MegaM file
    for line in lines:
        line_fields = set()
        # Process encoding
        line = UnicodeDammit(line,
                             ['utf-8', 'windows-1252']).unicode_markup.strip()

        # Ignore comments (and TEST/DEV lines)
        if not line.startswith('#') and not line == 'TEST' and not line == 'DEV':
            result_string = ''
            split_line = line.split()
            result_string += '{0}'.format(class_num_dict[split_line[0]])

            # Handle features if there are any
            if len(split_line) > 1:
                del split_line[0]
                # Loop through all feature-value pairs printing out pairs
                # separated by commas (and with feature names replaced with
                # numbers)
                for field_num, value in sorted(
                        zip((field_num_dict[field_name] for field_name in
                             islice(split_line, 0, None, 2)),
                            (float(value) if value != 'N/A' else 0.0
                             for value in islice(split_line, 1, None, 2)))):
                    # Check for duplicates
                    if field_num in line_fields:
                        field_name = next(field_name
                                          for field_name, f_num
                                          in field_num_dict.items()
                                          if f_num == field_num)
                        raise AssertionError('Field {} occurs on same line '
                                             'twice.'.format(field_name))
                    # Otherwise output non-empty features
                    elif value != 'N/A' and float(value):
                        result_string += ' {}:{}'.format(field_num, value)
                        line_fields.add(field_num)
            result_list.append(result_string)

    return result_list, class_num_dict, field_num_dict
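# A short usage sketch, not from the original source. It assumes the
# surrounding module imports (itertools.islice, bs4.UnicodeDammit) are in
# place. UniqueNumberDict is not defined in this snippet; the stand-in
# below is an assumption that hands out sequential IDs (starting at 1) the
# first time a key is seen, which matches how the converter appears to
# use it.
class UniqueNumberDict(dict):
    def __missing__(self, key):
        self[key] = len(self) + 1
        return self[key]

# Hypothetical MegaM input: a label followed by feature-value pairs.
megam_lines = ['spam height 5.2 weight 120',
               'ham height 4.8 weight 90']
rows, class_map, feat_map = convert_to_libsvm(megam_lines)
for row in rows:
    print(row)  # e.g. "1 1:5.2 2:120.0"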
def white_space_analysis(script_text, soup):
    spaces_regex = re.compile(r"^(\s*).*")
    space_vector = []
    character_presence = []
    for block in script_text.descendants:
        # If block is an instance of bs4.Tag, it is wrapped in HTML tags.
        # The next block will contain the same text without the tags,
        # so we skip this one without parsing it.
        if isinstance(block, Tag):
            continue

        # UnicodeDammit converts any string to UTF-8
        # does not work so well
        block = UnicodeDammit(block, soup.original_encoding).unicode_markup

        # remove leading and trailing newlines
        block = block.strip('\n').strip('\r\n')

        # if the block doesn't have any text, skip it
        if re.search(r'\w', block) is None:
            continue

        for line in block.split('\n'):
            stripped_line = line.strip(' \n\t\r')
            if re.search(r'\w', line) is None:
                continue

            # Count the number of spaces at the beginning of the line
            spmatch = spaces_regex.search(line)
            space_vector.append(len(spmatch.group(1)))

            if stripped_line.isupper() and len(stripped_line.split(' ')) <= 3:
                character_presence.append(len(spmatch.group(1)))
            else:
                character_presence.append(None)

    return space_vector, character_presence  # ,speech_presence
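# A hedged sketch of running the analysis on a small HTML fragment, not
# from the original source. It assumes re and bs4's Tag are imported as in
# the module above. The screenplay excerpt is hypothetical; indent counts
# come from the leading spaces on each non-empty line, and an indent is
# recorded in character_presence only when a line looks like a character
# cue (all-uppercase, at most three words).
from bs4 import BeautifulSoup

html = "<pre>          JOHN\n     Hello there.\n</pre>"
soup = BeautifulSoup(html, "html.parser")
spaces, cues = white_space_analysis(soup, soup)
print(spaces)  # [10, 5]
print(cues)    # [10, None]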
if isinstance(block, Tag):
    continue

# UnicodeDammit converts any string to UTF-8
# does not work so well
block = UnicodeDammit(block, soup.original_encoding).unicode_markup

# remove leading and trailing newlines
block = block.strip('\n')

# if the block doesn't have any text, skip it
if re.search(r'\w', block) is None:
    continue

# bs4 does not always split the blocks cleanly.
# Better to re-split by paragraph and handle them one at a time.
for line in block.split('\n'):
    stripped_line = line.strip(' \n\t\r')
    if re.search(r'\w', line) is None:
        continue

    line_type = get_line_type(line, stripped_line, usual_spaces)

    if (last_line_type == -1  # -1 = not initialized
            or last_line_type == line_type):
        text.append(stripped_line)
    else:
        if last_line_type == CHARACTER:
            last_character = ' '.join(text)
            if last_character not in characters:
                characters.append(last_character)
        elif last_line_type == SPEECH:
                      't_man']) + '\n'
fw = open(outputFile, 'w')
fw.write(header)

plt_lines = open(pltFile, 'rb').readlines()

skipped_lines = []

# skip the first two lines since they contain header information
for plt_line in plt_lines[2:]:
    print(plt_line)
    plt_line = UnicodeDammit(plt_line,
                             ['utf-8', 'windows-1252']).unicode_markup
    plt_line = unidecode(plt_line)
    plt_line = plt_line.rstrip()
    plt_F1 = plt_line.split(',')[0]
    # a line beginning with '1' is the first line of the vowel means; this
    # signals the end of the vowel token measurements, so we can stop
    # processing the file
    if plt_F1 == '1':
        break
    plt_w_raw = plt_line.split(',')[5].split(' ')[0]
    plt_w = plt_w_raw.upper()
    plt_w = plt_w.replace('(', '')
    plt_w = plt_w.replace(')', '')
    print(plt_w)
    if plt_w not in words:
        skipped_lines.append(plt_line)
        print("SKIPPING LINE -- WORD NOT FOUND")
        print(plt_line)
def main(argv=None): """ Handles command line arguments and gets things started. Parameters ---------- argv : list of str List of arguments, as if specified on the command-line. If None, ``sys.argv[1:]`` is used instead. """ # Get command line arguments parser = argparse.ArgumentParser( description="Takes an input feature file and converts it to another \ format. Formats are determined automatically from file \ extensions.", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('infile', help='input feature file (ends in .arff, .csv, \ .jsonlines, .libsvm, .megam, .ndj, or .tsv)') parser.add_argument('outfile', help='output feature file (ends in .arff, .csv, \ .jsonlines, .libsvm, .megam, .ndj, or .tsv)') parser.add_argument('-i', '--id_col', help='Name of the column which contains the instance \ IDs in ARFF, CSV, or TSV files.', default='id') parser.add_argument('-l', '--label_col', help='Name of the column which contains the class \ labels in ARFF, CSV, or TSV files. For ARFF \ files, this must be the final column to count as\ the label.', default='y') parser.add_argument('-q', '--quiet', help='Suppress printing of "Loading..." messages.', action='store_true') parser.add_argument('--arff_regression', help='Create ARFF files for regression, not \ classification.', action='store_true') parser.add_argument('--arff_relation', help='Relation name to use for ARFF file.', default='skll_relation') parser.add_argument('--reuse_libsvm_map', help='If you want to output multiple files that use \ the same mapping from labels and features to \ numbers when writing libsvm files, you can \ specify an existing .libsvm file to reuse the \ mapping from.', type=argparse.FileType('rb')) parser.add_argument('--version', action='version', version='%(prog)s {0}'.format(__version__)) args = parser.parse_args(argv) # Make warnings from built-in warnings module get formatted more nicely logging.captureWarnings(True) logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' '%(message)s')) logger = logging.getLogger(__name__) # make sure the input file extension is one we can process input_extension = os.path.splitext(args.infile)[1].lower() output_extension = os.path.splitext(args.outfile)[1].lower() if input_extension not in EXT_TO_READER: logger.error(('Input file must be in either .arff, .csv, .jsonlines, ' '.libsvm, .megam, .ndj, or .tsv format. 
You specified: ' '{}').format(input_extension)) sys.exit(1) # Build feature and label vectorizers from existing libsvm file if asked if args.reuse_libsvm_map and output_extension == '.libsvm': feat_map = {} label_map = {} for line in args.reuse_libsvm_map: line = UnicodeDammit(line, ['utf-8', 'windows-1252']).unicode_markup if '#' not in line: logger.error('The LibSVM file you want to reuse the map from ' 'was not created by SKLL and does not actually ' 'contain the necessary mapping info.') sys.exit(1) comments = line.split('#')[1] _, label_map_str, feat_map_str = comments.split('|') feat_map.update( _pair_to_dict_tuple(pair) for pair in feat_map_str.strip().split()) label_map.update( _pair_to_dict_tuple(pair) for pair in label_map_str.strip().split()) feat_vectorizer = DictVectorizer() feat_vectorizer.fit([{name: 1} for name in feat_map]) feat_vectorizer.vocabulary_ = feat_map else: feat_vectorizer = None label_map = None # Iterate through input file and collect the information we need reader = EXT_TO_READER[input_extension](args.infile, quiet=args.quiet, label_col=args.label_col, id_col=args.id_col) feature_set = reader.read() # write out the file in the requested output format writer_type = EXT_TO_WRITER[output_extension] writer_args = {'quiet': args.quiet} if writer_type is DelimitedFileWriter: writer_args['label_col'] = args.label_col writer_args['id_col'] = args.id_col elif writer_type is ARFFWriter: writer_args['label_col'] = args.label_col writer_args['id_col'] = args.id_col writer_args['regression'] = args.arff_regression writer_args['relation'] = args.arff_relation elif writer_type is LibSVMWriter: writer_args['label_map'] = label_map writer = writer_type(args.outfile, feature_set, **writer_args) writer.write()
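# A hedged usage sketch, not from the original source: main() takes an
# argv-style list, so the converter can be driven from Python as well as
# from the shell. The file names are hypothetical; EXT_TO_READER and
# EXT_TO_WRITER (defined elsewhere in the module) pick the reader and
# writer from the file extensions.
main(['examples.csv', 'examples.libsvm'])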
def parse(url, path, name):
    # init variables
    spaces_regex = re.compile(r"^(\s*).*")
    location_regex = re.compile(r"^\s*(INT\.|EXT\.)")
    BLOCK_TYPES = ['character', 'speech', 'stage direction', 'location',
                   'unknown']
    CHARACTER = 0
    SPEECH = 1
    DIRECTIONS = 2
    LOCATION = 3

    time_start = time.time()

    if url.endswith('.pdf'):
        print('The file @ %s is a PDF' % (url))
        return

    script_text, soup = get_script(url)

    # write raw file:
    if not os.path.exists(path + 'raw/'):
        os.makedirs(path + 'raw/')
    with open(path + 'raw/' + "%s.txt" % name, "w") as text_file:
        text_file.write(str(script_text))

    #####
    space_vector, character_presence = white_space_analysis(script_text, soup)
    usual_spaces, flag = identify_usual_spaces(space_vector,
                                               character_presence)

    # Here we define the variables we will fill with text
    is_intro = True
    movie_script = []
    intro = []
    last_line_type = -1
    last_character = 'unknown'
    text = []
    characters = []

    for block in script_text.descendants:
        # If block is an instance of bs4.Tag, it is wrapped in HTML tags.
        # The next block will contain the same text without the tags,
        # so we skip this one without parsing it.
        if isinstance(block, Tag):
            continue

        # UnicodeDammit converts any string to UTF-8
        # does not work so well
        block = UnicodeDammit(block, soup.original_encoding).unicode_markup

        # remove leading and trailing newlines
        block = block.strip('\n').strip('\n\r')

        # if the block doesn't have any text, skip it
        if re.search(r'\w', block) is None:
            continue

        for line in block.split('\n'):
            stripped_line = line.strip(' \n\t\r')
            if re.search(r'\w', line) is None:
                continue

            # Count the number of spaces at the beginning of the line
            spmatch = spaces_regex.search(line)
            space_vector.append(len(spmatch.group(1)))
            # print(block)
            # print(line)
            # print(len(spmatch.group(1)))

            line_type = get_line_type(line, stripped_line, usual_spaces)
            # print(line_type)
            # print(line)

            if (last_line_type == -1  # -1 = not initialized
                    or last_line_type == line_type):
                text.append(stripped_line)
            else:
                if last_line_type == CHARACTER:
                    # regex to suppress (parenthesis) & replicate speaker
                    last_character = '\n'.join(text)
                    if last_character not in characters:
                        characters.append(last_character)
                elif last_line_type == SPEECH:
                    movie_script.append({
                        'type': BLOCK_TYPES[last_line_type],
                        BLOCK_TYPES[CHARACTER]: last_character,
                        'text': '\n'.join(text)
                    })
                    # print('We just parsed this JSON block:')
                    # print(movie_script[-1])
                else:
                    movie_script.append({
                        'type': BLOCK_TYPES[last_line_type],
                        'text': '\n'.join(text)
                    })
                    # print('We just parsed this JSON block:')
                    # print(movie_script[-1])
                text = [stripped_line]
                last_line_type = line_type
                # print('----------------')

    result = json_normalize(movie_script)
    if flag:
        write_csv(result, name, path)
        print(' Done parsing script at %s in %s' % (url,
                                                    time.time() - time_start))
        print('-----------------')
        return result
    else:
        path = path + 'doubtful/'
        write_csv(result, name, path)
        print(' Done parsing script at %s in %s' % (url,
                                                    time.time() - time_start))
        print('-----------------')
        return result
def main(argv=None): """ Handles command line arguments and gets things started. Parameters ---------- argv : list of str List of arguments, as if specified on the command-line. If None, ``sys.argv[1:]`` is used instead. """ # Get command line arguments parser = argparse.ArgumentParser( description="Takes an input feature file and converts it to another \ format. Formats are determined automatically from file \ extensions.", formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('infile', help='input feature file (ends in .arff, .csv, \ .jsonlines, .libsvm, .megam, .ndj, or .tsv)') parser.add_argument('outfile', help='output feature file (ends in .arff, .csv, \ .jsonlines, .libsvm, .megam, .ndj, or .tsv)') parser.add_argument('-i', '--id_col', help='Name of the column which contains the instance \ IDs in ARFF, CSV, or TSV files.', default='id') label_group = parser.add_mutually_exclusive_group(required=False) label_group.add_argument('-l', '--label_col', help='Name of the column which contains the class \ labels in ARFF, CSV, or TSV files. For ARFF \ files, this must be the final column to count as\ the label.', default='y') label_group.add_argument('--no_labels', action='store_true', default=False, help='Used to indicate that the input data has no labels.') parser.add_argument('-q', '--quiet', help='Suppress printing of "Loading..." messages.', action='store_true') parser.add_argument('--arff_regression', help='Create ARFF files for regression, not \ classification.', action='store_true') parser.add_argument('--arff_relation', help='Relation name to use for ARFF file.', default='skll_relation') parser.add_argument('--reuse_libsvm_map', help='If you want to output multiple files that use \ the same mapping from labels and features to \ numbers when writing libsvm files, you can \ specify an existing .libsvm file to reuse the \ mapping from.', type=argparse.FileType('rb')) parser.add_argument('--version', action='version', version='%(prog)s {0}'.format(__version__)) args = parser.parse_args(argv) # Make warnings from built-in warnings module get formatted more nicely logging.captureWarnings(True) logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' '%(message)s')) logger = logging.getLogger(__name__) # make sure the input file extension is one we can process input_extension = os.path.splitext(args.infile)[1].lower() output_extension = os.path.splitext(args.outfile)[1].lower() if input_extension not in EXT_TO_READER: logger.error(('Input file must be in either .arff, .csv, .jsonlines, ' '.libsvm, .megam, .ndj, or .tsv format. 
You specified: ' '{}').format(input_extension)) sys.exit(1) # Build feature and label vectorizers from existing libsvm file if asked if args.reuse_libsvm_map and output_extension == '.libsvm': feat_map = {} label_map = {} for line in args.reuse_libsvm_map: line = UnicodeDammit(line, ['utf-8', 'windows-1252']).unicode_markup if '#' not in line: logger.error('The LibSVM file you want to reuse the map from ' 'was not created by SKLL and does not actually ' 'contain the necessary mapping info.') sys.exit(1) comments = line.split('#')[1] _, label_map_str, feat_map_str = comments.split('|') feat_map.update(_pair_to_dict_tuple(pair) for pair in feat_map_str.strip().split()) label_map.update(_pair_to_dict_tuple(pair) for pair in label_map_str .strip().split()) feat_vectorizer = DictVectorizer() feat_vectorizer.fit([{name: 1} for name in feat_map]) feat_vectorizer.vocabulary_ = feat_map else: feat_vectorizer = None label_map = None label_col = None if args.no_labels else args.label_col # Iterate through input file and collect the information we need reader = EXT_TO_READER[input_extension](args.infile, quiet=args.quiet, label_col=label_col, id_col=args.id_col) feature_set = reader.read() # write out the file in the requested output format writer_type = EXT_TO_WRITER[output_extension] writer_args = {'quiet': args.quiet} if writer_type is CSVWriter or writer_type is TSVWriter: writer_args['label_col'] = label_col writer_args['id_col'] = args.id_col elif writer_type is ARFFWriter: writer_args['label_col'] = label_col writer_args['id_col'] = args.id_col writer_args['regression'] = args.arff_regression writer_args['relation'] = args.arff_relation elif writer_type is LibSVMWriter: writer_args['label_map'] = label_map writer = writer_type(args.outfile, feature_set, **writer_args) writer.write()
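# Hedged sketch (not from the original source): the same entry point with
# the --no_labels flag, for feature files that carry no label column. The
# file names are hypothetical.
main(['unlabeled.csv', 'unlabeled.libsvm', '--no_labels'])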
def on_pubmsg(self, c, e):
    nick = e.source.nick
    target = e.target if is_channel(e.target) else nick

    def reply(msg):
        self.send(target, msg)

    def dm(msg):
        self.send(nick, msg)

    line = UnicodeDammit(e.arguments[0]).unicode_markup
    log(' \033[37m{}→{}\033[0m'.format(nick, line))

    a = line.split(":", 1)
    if len(a) > 1 and a[0].lower() == self.nick:
        self.do_command(e, a[1].strip().lower(), nick, target, reply, dm)
        return

    # zeltofilter
    if 'zeltoph' in nick:
        return

    foo = settings.VIPS.get(nick, 0)
    if random() < foo:
        self.kick(nick)

    match = re.match('.*┻━┻.*', line)
    if match:
        reply('┬─┬ノ(ಠ_ಠノ)')
        return

    match = re.match(
        '^({} *:)? *chaos-?([☆★☼☀*]|sternchen) *: ?(.*)$'.format(self.nick),
        line)
    if match:
        newcs = match.group(3)
        self.chaossternchen.append(newcs)
        self.sendchan('Chaos-☆ Nr. {} notiert: {}'.format(
            len(self.chaossternchen), newcs))
        return

    if line.startswith('.wiki '):
        wikipage = line[len('.wiki '):].strip()
        if re.match(r'^[-_+\w]+$', wikipage):
            wikiurl = 'http://afra-berlin.de/dokuwiki/doku.php?id={}'.format(
                wikipage)
            if 'Dieses Thema existiert noch nicht' in requests.get(wikiurl).text:
                reply("I'm sorry, I can't find a wiki page with that name.")
            else:
                reply(wikiurl)
        else:
            reply('Try to troll somebot else.')
        return

    if line == 'wat?':
        reply("I don't have a clue.")
        return
    if re.match('^hail eris[.!]* ', line.lower()):
        reply("All Hail Discordia!")
        return

    m = re.findall(r'(^|\s)?(gh?ah?nh?dh?ih?)(\s|$)?', line, re.IGNORECASE)
    for _1, match, _2 in m:
        if not re.match(r'(^|\s)?gandhi(\s|$)?', match, re.IGNORECASE):
            self.kick(nick, "It's spelled Gandhi")
            return

    if re.search('https?://[-a-z0-9.]*facebook.com', line.lower()):
        reply('A facebook link? srsly? Get some self-respect!')
        return

    match = re.search(r'https?://pr0gramm.com/#(newest/\*/[0-9/]*)',
                      line.lower())
    if match:
        reply('Fixed that pr0gramm link for you: http://pr0gramm.com/static/'
              + match.group(1))
        return

    if line == 'moin':
        self.moincount += 1
        if self.moincount == 5:
            reply('moin')
            return
    else:
        self.moincount = 0

    if line.lstrip('.!#').startswith('eta '):
        eta = line[4:].strip()
        with self.db as db:
            db.execute("DELETE FROM etas WHERE nick=?", (nick,))
            if eta:
                db.execute("INSERT INTO etas VALUES (DATETIME('now'), ?, ?)",
                           (nick, eta))
        dm('ETA registered. Thanks!')
        return

    m = re.findall(URL_REGEX, line.lower())
    for url, *_ in m:
        res = requests.get(url)
        if res.status_code == requests.codes.ok:
            soup = BeautifulSoup(res.text, 'html.parser')
            reply(soup.title.string)

    m = re.findall(r'(^|\s)(afra)(\s|$)', line, re.IGNORECASE)
    for _1, match, _2 in m:
        if match != 'AfRA' and match != 'afra' and random() < 0.1:
            reply("I'm sure you meant AfRA, not " + match)
            return
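# A hedged, self-contained sketch of the pr0gramm link rewrite above,
# runnable outside the bot (the chat line is hypothetical):
import re

line = 'look at this: https://pr0gramm.com/#newest/*/123/456'
match = re.search(r'https?://pr0gramm.com/#(newest/\*/[0-9/]*)', line.lower())
if match:
    print('Fixed that pr0gramm link for you: http://pr0gramm.com/static/'
          + match.group(1))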
def main(argv=None):
    '''
    Handles command line arguments and gets things started.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    '''
    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="Takes an input feature file and converts it to another \
                     format. Formats are determined automatically from file \
                     extensions.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('infile',
                        help='input feature file (ends in .jsonlines, .tsv, \
                              .csv, .arff, or .megam)')
    parser.add_argument('outfile',
                        help='output feature file (ends in .jsonlines, .tsv, \
                              .csv, .arff, or .megam)')
    parser.add_argument('-l', '--label_col',
                        help='Name of the column which contains the class \
                              labels in ARFF, CSV, or TSV files. For ARFF \
                              files, this must be the final column to count as\
                              the label.',
                        default='y')
    parser.add_argument('-q', '--quiet',
                        help='Suppress printing of "Loading..." messages.',
                        action='store_true')
    parser.add_argument('--arff_regression',
                        help='Create ARFF files for regression, not \
                              classification.',
                        action='store_true')
    parser.add_argument('--arff_relation',
                        help='Relation name to use for ARFF file.',
                        default='skll_relation')
    parser.add_argument('--reuse_libsvm_map',
                        help='If you want to output multiple files that use \
                              the same mapping from classes and features to \
                              numbers when writing libsvm files, you can \
                              specify an existing .libsvm file to reuse the \
                              mapping from.',
                        type=argparse.FileType('rb'))
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'))
    logger = logging.getLogger(__name__)

    # make sure the input file extension is one we can process
    input_extension = os.path.splitext(args.infile)[1].lower()
    output_extension = os.path.splitext(args.outfile)[1].lower()

    if input_extension == ".tsv":
        example_iter_type = _TSVDictIter
    elif input_extension == ".jsonlines" or input_extension == '.ndj':
        example_iter_type = _JSONDictIter
    elif input_extension == ".libsvm":
        example_iter_type = _LibSVMDictIter
    elif input_extension == ".megam":
        example_iter_type = _MegaMDictIter
    elif input_extension == ".csv":
        example_iter_type = _CSVDictIter
    elif input_extension == ".arff":
        example_iter_type = _ARFFDictIter
    else:
        logger.error(('Input file must be in either .arff, .csv, .jsonlines, '
                      '.libsvm, .megam, .ndj, or .tsv format. You specified: '
                      '{}').format(input_extension))
        sys.exit(1)

    # Build feature and label vectorizers from existing libsvm file if asked
    if args.reuse_libsvm_map and output_extension == '.libsvm':
        feat_map = {}
        label_map = {}
        for line in args.reuse_libsvm_map:
            line = UnicodeDammit(line,
                                 ['utf-8', 'windows-1252']).unicode_markup
            if '#' not in line:
                logger.error('The LibSVM file you want to reuse the map from '
                             'was not created by SKLL and does not actually '
                             'contain the necessary mapping info.')
                sys.exit(1)
            comments = line.split('#')[1]
            _, label_map_str, feat_map_str = comments.split('|')
            # Split on whitespace before updating the maps; iterating over
            # the bare string would walk it character by character.
            feat_map.update(_pair_to_dict_tuple(pair) for pair in
                            feat_map_str.strip().split())
            label_map.update(_pair_to_dict_tuple(pair) for pair in
                             label_map_str.strip().split())
        feat_vectorizer = DictVectorizer()
        feat_vectorizer.fit([{name: 1} for name in feat_map])
        feat_vectorizer.vocabulary_ = feat_map
    else:
        feat_vectorizer = None
        label_map = None

    # Iterate through input file and collect the information we need
    ids = []
    classes = []
    feature_dicts = []
    example_iter = example_iter_type(args.infile, quiet=args.quiet,
                                     label_col=args.label_col)
    for example_id, class_name, feature_dict in example_iter:
        feature_dicts.append(feature_dict)
        classes.append(class_name)
        ids.append(example_id)

    # write out the file in the requested output format
    write_feature_file(args.outfile, ids, classes, feature_dicts,
                       arff_regression=args.arff_regression,
                       arff_relation=args.arff_relation,
                       feat_vectorizer=feat_vectorizer,
                       label_map=label_map)
if isinstance(block, Tag):
    continue

# UnicodeDammit converts any string to UTF-8
# does not work so well
block = UnicodeDammit(block, soup.original_encoding).unicode_markup

# remove leading and trailing newlines
block = block.strip('\n')

# if the block doesn't have any text, skip it
if re.search(r'\w', block) is None:
    continue

# bs4 does not always split the blocks cleanly.
# Better to re-split by paragraph and handle them one at a time.
for line in block.split('\n'):
    stripped_line = line.strip(' \n\t\r')
    if re.search(r'\w', line) is None:
        continue

    print('------------------------------ Begin line ------------------------------')
    print(line)
    print(' ------- End line -------')

    if is_intro:
        print()
        answer = input("Is that still part of the intro? (Y/n) ")
        if answer == 'n' or answer == 'N':
            is_intro = False
            movie_script.append({
def analyze_content(script_text, encoding):
    print("\n\nStarting script parsing!\n\n")
    print("Start by telling me when the introduction will end.")

    is_intro = True
    movie_script = []
    intro = []
    last_line_type = -1
    last_character = ''
    line_type = None
    text = []
    characters = []
    usual_spaces = [[] for _ in range(len(BLOCK_TYPES))]

    for block in script_text.descendants:
        if isinstance(block, Tag):
            continue

        # UnicodeDammit converts any string to UTF-8
        # does not work so well
        block = UnicodeDammit(block, encoding).unicode_markup

        # remove leading and trailing newlines
        block = block.strip('\n')

        # if the block doesn't have any text, skip it
        if re.search(r'\w', block) is None:
            continue

        for line in block.split('\n'):
            stripped_line = line.strip(' \n\t\r')
            if re.search(r'\w', line) is None:
                continue

            print('------------------------------ Begin line ------------------------------')
            print(line)
            print('------------------------------- End line -------------------------------')

            if is_intro:
                print()
                answer = input("Is that still part of the intro? (Y/n) ")
                if answer == 'n' or answer == 'N':
                    is_intro = False
                    movie_script.append({
                        'type': 'introduction',
                        'text': '\n'.join(intro)
                    })
                    print(movie_script[-1])
                else:
                    print("OK")
                    print()
                    intro.append(stripped_line)
                    continue

            line_type = get_line_type(line, stripped_line, usual_spaces,
                                      characters)
            print("The last line was interpreted as '{}'".format(
                BLOCK_TYPES[line_type]))
            print()

            if last_line_type == -1 or last_line_type == line_type:
                # -1 = not initialized
                text.append(stripped_line)
            else:
                if last_line_type == CHARACTER:
                    last_character = '\n'.join(text)
                    if last_character not in characters:
                        characters.append(last_character)
                elif last_line_type == SPEECH:
                    movie_script.append({
                        'type': BLOCK_TYPES[last_line_type],
                        BLOCK_TYPES[CHARACTER]: last_character,
                        'text': '\n'.join(text)
                    })
                    print('We just parsed this JSON block:')
                    print(movie_script[-1])
                else:
                    movie_script.append({
                        'type': BLOCK_TYPES[last_line_type],
                        'text': '\n'.join(text)
                    })
                    print('We just parsed this JSON block:')
                    print(movie_script[-1])
                text = [stripped_line]
                last_line_type = line_type
            print()
            print()
            print()

    movie_script.append({
        'type': BLOCK_TYPES[line_type],
        'text': '\n'.join(text)
    })
    print('We just parsed this JSON block:')
    print(movie_script[-1])
    print()
    print()

    return movie_script