Example #1
0
def convert_to_libsvm(lines):
    '''
    Converts a sequence of lines (e.g., a file or list of strings) in MegaM
    format to LibSVM format.

    :param lines: The sequence of lines to convert.
    :type lines: L{file} or L{list} of L{str}

    :return: A tuple of the newly formatted data, the mappings from class names
             to numbers, and the mappings from feature names to numbers.
    :rtype: 3-L{tuple} of (L{list} of L{unicode}, L{dict}, and L{dict})
    '''

    # Initialize variables
    field_num_dict = UniqueNumberDict()
    class_num_dict = UniqueNumberDict()

    result_list = []
    # Iterate through MegaM file
    for line in lines:
        line_fields = set()
        # Process encoding
        line = UnicodeDammit(line,
                             ['utf-8', 'windows-1252']).unicode_markup.strip()

        # Ignore comments (and TEST/DEV lines)
        if not line.startswith(
                '#') and not line == 'TEST' and not line == 'DEV':
            result_string = ''
            split_line = line.split()
            result_string += '{0}'.format(class_num_dict[split_line[0]])
            # Handle features if there are any
            if len(split_line) > 1:
                del split_line[0]
                # Loop through all feature-value pairs, printing out pairs
                # separated by spaces (and with feature names replaced with
                # numbers)
                for field_num, value in sorted(
                        zip((field_num_dict[field_name]
                             for field_name in islice(split_line, 0, None, 2)),
                            (float(value) if value != 'N/A' else 0.0
                             for value in islice(split_line, 1, None, 2)))):
                    # Check for duplicates
                    if field_num in line_fields:
                        field_name = next(
                            field_name
                            for field_name, f_num in field_num_dict.items()
                            if f_num == field_num)
                        raise AssertionError(
                            "Field {} occurs on same line twice.".format(
                                field_name))
                    # Otherwise output non-empty features (values were already
                    # converted to floats above, with 'N/A' mapped to 0.0)
                    elif value:
                        result_string += ' {}:{}'.format(field_num, value)
                        line_fields.add(field_num)
            result_list.append(result_string)

    return result_list, class_num_dict, field_num_dict
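
The snippet relies on a UniqueNumberDict helper and two imports that are not
shown. A minimal sketch under stated assumptions (the numbering scheme is
inferred from usage: each previously unseen key gets the next sequential
number, starting at 1 as LibSVM feature indices conventionally do):

from itertools import islice

from bs4 import UnicodeDammit


class UniqueNumberDict(dict):
    """Hypothetical stand-in for the helper used above: assigns a new
    sequential number to each previously unseen key on first lookup."""

    def __missing__(self, key):
        self[key] = len(self) + 1
        return self[key]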
Example #2
0
File: readers.py Project: hefeix/skll
    def _sub_read(self, f):
        example_num = 0
        curr_id = 'EXAMPLE_0'
        for line in f:
            # Process encoding
            if not isinstance(line, text_type):
                line = UnicodeDammit(line, ['utf-8',
                                            'windows-1252']).unicode_markup
            line = line.strip()
            # Handle instance lines
            if line.startswith('#'):
                curr_id = line[1:].strip()
            elif line and line not in ['TRAIN', 'TEST', 'DEV']:
                split_line = line.split()
                num_cols = len(split_line)
                del line
                # Line is just a class label
                if num_cols == 1:
                    class_name = safe_float(split_line[0],
                                            replace_dict=self.class_map)
                    field_pairs = []
                # Line has a class label and feature-value pairs
                elif num_cols % 2 == 1:
                    class_name = safe_float(split_line[0],
                                            replace_dict=self.class_map)
                    field_pairs = split_line[1:]
                # Line just has feature-value pairs
                elif num_cols % 2 == 0:
                    class_name = None
                    field_pairs = split_line

                curr_info_dict = {}
                if len(field_pairs) > 0:
                    # Get the current instance's feature-value pairs
                    field_names = islice(field_pairs, 0, None, 2)
                    # Convert values to floats; otherwise the features
                    # would be treated as categorical
                    field_values = (safe_float(val) for val in
                                    islice(field_pairs, 1, None, 2))

                    # Add the feature-value pairs to dictionary
                    curr_info_dict.update(zip(field_names, field_values))

                    if len(curr_info_dict) != len(field_pairs) / 2:
                        raise ValueError(('There are duplicate feature ' +
                                          'names in {} for example ' +
                                          '{}.').format(self.path_or_list,
                                                        curr_id))

                yield curr_id, class_name, curr_info_dict

                # Set default example ID for next instance, in case we see a
                # line without an ID.
                example_num += 1
                curr_id = 'EXAMPLE_{}'.format(example_num)
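
_sub_read leans on a safe_float helper defined elsewhere in skll. A rough
sketch of its apparent contract, inferred only from the calls above (the
replace_dict handling in particular is an assumption):

def safe_float(text, replace_dict=None):
    """Convert text to an int or float if possible; otherwise return it
    unchanged. If replace_dict is given, map the raw label through it
    first. Sketch only; see skll for the real implementation."""
    if replace_dict is not None and text in replace_dict:
        text = replace_dict[text]
    try:
        return int(text)
    except ValueError:
        try:
            return float(text)
        except ValueError:
            return text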
Example #5
0
def white_space_analysis(script_text, soup):
    spaces_regex = re.compile(r"^(\s*).*")
    space_vector = []
    character_presence = []

    for block in script_text.descendants:
        # If block is an instance of bs4.Tag, it is wrapped in HTML tags.
        # The next block will contain the same text without the tags,
        # so we skip this block without parsing it.
        if isinstance(block, Tag):
            continue

        # UnicodeDammit converts any string to UTF-8
        # does not work so well
        block = UnicodeDammit(block, soup.original_encoding).unicode_markup
        # remove leading and trailing newlines
        block = block.strip('\r\n')

        # if the block doesn't have any text, skip it
        if re.search(r'\w', block) is None:
            continue

        for line in block.split('\n'):
            stripped_line = line.strip(' \n\t\r')
            if re.search(r'\w', line) is None:
                continue
            # Counting the number of spaces at the beginning of the line
            spmatch = spaces_regex.search(line)
            space_vector.append(len(spmatch.group(1)))
            if stripped_line.isupper() and len(stripped_line.split(' ')) <= 3:
                character_presence.append(len(spmatch.group(1)))
            else:
                character_presence.append(None)

    return space_vector, character_presence  #,speech_presence
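
A short usage sketch for the function above. The surrounding helpers
(get_script, identify_usual_spaces) are not shown on this page, so this only
inspects the raw indentation statistics that identify_usual_spaces presumably
consumes; script_text and soup are assumed to come from get_script() as in
Example #9:

from collections import Counter

space_vector, character_presence = white_space_analysis(script_text, soup)

# The most common indentation levels; screenplays typically indent character
# names, dialogue, and stage directions each at their own depth.
print(Counter(space_vector).most_common(5))
# Indentation of short ALL-CAPS lines (likely character names)
print(Counter(c for c in character_presence if c is not None).most_common(3))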
Example #6
0
    if isinstance(block, Tag):
        continue

    # UnicodeDammit converts any string to UTF-8
    # does not work so well
    block = UnicodeDammit(block, soup.original_encoding).unicode_markup
    # remove leading and trailing newlines
    block = block.strip('\n')

    # if the block doesn't have any text, skip it
    if re.search(r'\w', block) is None:
        continue

    # bs4 does not always split the blocks cleanly, so it is better to
    # re-split by paragraph and process the pieces one at a time
    for line in block.split('\n'):
        stripped_line = line.strip(' \n\t\r')
        if re.search(r'\w', line) is None:
            continue

        line_type = get_line_type(line, stripped_line, usual_spaces)

        if (last_line_type == -1  # -1 = not initialized
                or last_line_type == line_type):
            text.append(stripped_line)
        else:
            if last_line_type == CHARACTER:
                last_character = ' '.join(text)
                if last_character not in characters:
                    characters.append(last_character)
            elif last_line_type == SPEECH:
Example #7
0
        't_man'
    ]) + '\n'

    fw = open(outputFile, 'w')
    fw.write(header)

    plt_lines = open(pltFile, 'rb').readlines()
    skipped_lines = []
    # skip the first two lines since they contain header information
    for plt_line in plt_lines[2:]:
        print(plt_line)
        plt_line = UnicodeDammit(plt_line,
                                 ['utf-8', 'windows-1252']).unicode_markup
        plt_line = unidecode(plt_line)
        plt_line = plt_line.rstrip()
        plt_F1 = plt_line.split(',')[0]
        # a line beginning with '1' is the first line of the vowel means; this
        # signals the end of the vowel token measurements, so we can stop
        # processing the file
        if plt_F1 == '1':
            break

        plt_w_raw = plt_line.split(',')[5].split(' ')[0]
        plt_w = plt_w_raw.upper()
        plt_w = plt_w.replace('(', '')
        plt_w = plt_w.replace(')', '')
        print(plt_w)
        if plt_w not in words:
            skipped_lines.append(plt_line)
            print("SKIPPING LINE -- WORD NOT FOUND")
            print(plt_line)
Example #8
0
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    Parameters
    ----------
    argv : list of str
        List of arguments, as if specified on the command-line.
        If None, ``sys.argv[1:]`` is used instead.
    """

    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="Takes an input feature file and converts it to another \
                     format. Formats are determined automatically from file \
                     extensions.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('infile',
                        help='input feature file (ends in .arff, .csv, \
                              .jsonlines, .libsvm, .megam, .ndj, or .tsv)')
    parser.add_argument('outfile',
                        help='output feature file (ends in .arff, .csv, \
                              .jsonlines, .libsvm, .megam, .ndj, or .tsv)')
    parser.add_argument('-i',
                        '--id_col',
                        help='Name of the column which contains the instance \
                              IDs in ARFF, CSV, or TSV files.',
                        default='id')
    parser.add_argument('-l',
                        '--label_col',
                        help='Name of the column which contains the class \
                              labels in ARFF, CSV, or TSV files. For ARFF \
                              files, this must be the final column to count as\
                              the label.',
                        default='y')
    parser.add_argument('-q',
                        '--quiet',
                        help='Suppress printing of "Loading..." messages.',
                        action='store_true')
    parser.add_argument('--arff_regression',
                        help='Create ARFF files for regression, not \
                              classification.',
                        action='store_true')
    parser.add_argument('--arff_relation',
                        help='Relation name to use for ARFF file.',
                        default='skll_relation')
    parser.add_argument('--reuse_libsvm_map',
                        help='If you want to output multiple files that use \
                              the same mapping from labels and features to \
                              numbers when writing libsvm files, you can \
                              specify an existing .libsvm file to reuse the \
                              mapping from.',
                        type=argparse.FileType('rb'))
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'))
    logger = logging.getLogger(__name__)

    # make sure the input file extension is one we can process
    input_extension = os.path.splitext(args.infile)[1].lower()
    output_extension = os.path.splitext(args.outfile)[1].lower()

    if input_extension not in EXT_TO_READER:
        logger.error(('Input file must be in either .arff, .csv, .jsonlines, '
                      '.libsvm, .megam, .ndj, or .tsv format. You specified: '
                      '{}').format(input_extension))
        sys.exit(1)

    # Build feature and label vectorizers from existing libsvm file if asked
    if args.reuse_libsvm_map and output_extension == '.libsvm':
        feat_map = {}
        label_map = {}
        for line in args.reuse_libsvm_map:
            line = UnicodeDammit(line,
                                 ['utf-8', 'windows-1252']).unicode_markup
            if '#' not in line:
                logger.error('The LibSVM file you want to reuse the map from '
                             'was not created by SKLL and does not actually '
                             'contain the necessary mapping info.')
                sys.exit(1)
            comments = line.split('#')[1]
            _, label_map_str, feat_map_str = comments.split('|')
            feat_map.update(
                _pair_to_dict_tuple(pair)
                for pair in feat_map_str.strip().split())
            label_map.update(
                _pair_to_dict_tuple(pair)
                for pair in label_map_str.strip().split())
        feat_vectorizer = DictVectorizer()
        feat_vectorizer.fit([{name: 1} for name in feat_map])
        feat_vectorizer.vocabulary_ = feat_map
    else:
        feat_vectorizer = None
        label_map = None

    # Iterate through input file and collect the information we need
    reader = EXT_TO_READER[input_extension](args.infile,
                                            quiet=args.quiet,
                                            label_col=args.label_col,
                                            id_col=args.id_col)
    feature_set = reader.read()
    # write out the file in the requested output format
    writer_type = EXT_TO_WRITER[output_extension]
    writer_args = {'quiet': args.quiet}
    if writer_type is DelimitedFileWriter:
        writer_args['label_col'] = args.label_col
        writer_args['id_col'] = args.id_col
    elif writer_type is ARFFWriter:
        writer_args['label_col'] = args.label_col
        writer_args['id_col'] = args.id_col
        writer_args['regression'] = args.arff_regression
        writer_args['relation'] = args.arff_relation
    elif writer_type is LibSVMWriter:
        writer_args['label_map'] = label_map
    writer = writer_type(args.outfile, feature_set, **writer_args)
    writer.write()
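
The map-reuse branch above calls a _pair_to_dict_tuple helper that is not
shown. A sketch of its apparent contract; the 'name=number' pair format is an
assumption inferred from feat_map later serving as a DictVectorizer
vocabulary (if the pairs are actually written number=name, swap the
unpacking):

def _pair_to_dict_tuple(pair):
    """Parse one 'name=number' mapping pair from a SKLL-written LibSVM
    comment into a (name, int) tuple. Sketch only."""
    name, num = pair.split('=')
    return name, int(num)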
Example #9
0
def parse(url, path, name):
    # init variables
    spaces_regex = re.compile(r"^(\s*).*")
    location_regex = re.compile(r"^\s*(INT\.|EXT\.)")

    BLOCK_TYPES = [
        'character', 'speech', 'stage direction', 'location', 'unknown'
    ]
    CHARACTER = 0
    SPEECH = 1
    DIRECTIONS = 2
    LOCATION = 3

    time_start = time.time()

    if url.endswith('.pdf'):
        print('The file @ %s is a PDF' % (url))
        return

    script_text, soup = get_script(url)
    #write raw file:
    if not os.path.exists(path + 'raw/'):
        os.makedirs(path + 'raw/')
    with open(path + 'raw/' + "%s.txt" % name, "w") as text_file:
        text_file.write(str(script_text))
    #####

    space_vector, character_presence = white_space_analysis(script_text, soup)
    usual_spaces, flag = identify_usual_spaces(space_vector,
                                               character_presence)

    # Here we define the variables that we will fill with text
    is_intro = True
    movie_script = []
    intro = []
    last_line_type = -1
    last_character = 'unknown'
    text = []
    characters = []

    for block in script_text.descendants:
        # If block is an instance of bs4.Tag, it is wrapped in HTML tags.
        # The next block will contain the same text without the tags,
        # so we skip this block without parsing it.
        if isinstance(block, Tag):
            continue

        # UnicodeDammit converts any string to UTF-8
        # does not work so well
        block = UnicodeDammit(block, soup.original_encoding).unicode_markup
        # remove leading and trailing newlines
        block = block.strip('\r\n')

        # if the block doesn't have any text, skip it
        if re.search(r'\w', block) is None:
            continue

        for line in block.split('\n'):
            stripped_line = line.strip(' \n\t\r')
            if re.search(r'\w', line) is None:
                continue
            # Counting the number of spaces at the beginning of the line
            spmatch = spaces_regex.search(line)
            space_vector.append(len(spmatch.group(1)))
            #print(block)
            #print(line)
            #print(len(spmatch.group(1)))
            line_type = get_line_type(line, stripped_line, usual_spaces)
            #print(line_type)
            #print(line)

            if (last_line_type == -1  # -1 = not initialized
                    or last_line_type == line_type):
                text.append(stripped_line)
            else:
                if last_line_type == CHARACTER:
                    # regex to suppress (parentheses) & replicate speaker
                    last_character = '\n'.join(text)
                    if last_character not in characters:
                        characters.append(last_character)
                elif last_line_type == SPEECH:
                    movie_script.append({
                        'type': BLOCK_TYPES[last_line_type],
                        BLOCK_TYPES[CHARACTER]: last_character,
                        'text': '\n'.join(text)
                    })
                    #print('We just parsed this JSON block:')
                    #print(movie_script[-1])
                else:
                    movie_script.append({
                        'type': BLOCK_TYPES[last_line_type],
                        'text': '\n'.join(text)
                    })
                    #print('We just parsed this JSON block:')
                    #print(movie_script[-1])
                text = [stripped_line]

            last_line_type = line_type
            #print('----------------')

    result = json_normalize(movie_script)
    if flag:
        write_csv(result, name, path)
        print('      Done parsing script at %s in %s' %
              (url, time.time() - time_start))
        print('-----------------')
        return (result)
    else:
        path = path + 'doubtful/'
        write_csv(result, name, path)
        print('      Done parsing script at %s in %s' %
              (url, time.time() - time_start))
        print('-----------------')
        return (result)
Example #10
0
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    Parameters
    ----------
    argv : list of str
        List of arguments, as if specified on the command-line.
        If None, ``sys.argv[1:]`` is used instead.
    """

    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="Takes an input feature file and converts it to another \
                     format. Formats are determined automatically from file \
                     extensions.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('infile',
                        help='input feature file (ends in .arff, .csv, \
                              .jsonlines, .libsvm, .megam, .ndj, or .tsv)')
    parser.add_argument('outfile',
                        help='output feature file (ends in .arff, .csv, \
                              .jsonlines, .libsvm, .megam, .ndj, or .tsv)')
    parser.add_argument('-i', '--id_col',
                        help='Name of the column which contains the instance \
                              IDs in ARFF, CSV, or TSV files.',
                        default='id')
    label_group = parser.add_mutually_exclusive_group(required=False)
    label_group.add_argument('-l',
                             '--label_col',
                             help='Name of the column which contains the class \
                                   labels in ARFF, CSV, or TSV files. For ARFF \
                                   files, this must be the final column to count as\
                                   the label.',
                             default='y')
    label_group.add_argument('--no_labels',
                             action='store_true',
                             default=False,
                             help='Used to indicate that the input data has no labels.')
    parser.add_argument('-q', '--quiet',
                        help='Suppress printing of "Loading..." messages.',
                        action='store_true')
    parser.add_argument('--arff_regression',
                        help='Create ARFF files for regression, not \
                              classification.',
                        action='store_true')
    parser.add_argument('--arff_relation',
                        help='Relation name to use for ARFF file.',
                        default='skll_relation')
    parser.add_argument('--reuse_libsvm_map',
                        help='If you want to output multiple files that use \
                              the same mapping from labels and features to \
                              numbers when writing libsvm files, you can \
                              specify an existing .libsvm file to reuse the \
                              mapping from.',
                        type=argparse.FileType('rb'))
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'))
    logger = logging.getLogger(__name__)

    # make sure the input file extension is one we can process
    input_extension = os.path.splitext(args.infile)[1].lower()
    output_extension = os.path.splitext(args.outfile)[1].lower()

    if input_extension not in EXT_TO_READER:
        logger.error(('Input file must be in either .arff, .csv, .jsonlines, '
                      '.libsvm, .megam, .ndj, or .tsv format. You specified: '
                      '{}').format(input_extension))
        sys.exit(1)

    # Build feature and label vectorizers from existing libsvm file if asked
    if args.reuse_libsvm_map and output_extension == '.libsvm':
        feat_map = {}
        label_map = {}
        for line in args.reuse_libsvm_map:
            line = UnicodeDammit(line, ['utf-8',
                                        'windows-1252']).unicode_markup
            if '#' not in line:
                logger.error('The LibSVM file you want to reuse the map from '
                             'was not created by SKLL and does not actually '
                             'contain the necessary mapping info.')
                sys.exit(1)
            comments = line.split('#')[1]
            _, label_map_str, feat_map_str = comments.split('|')
            feat_map.update(_pair_to_dict_tuple(pair) for pair in
                            feat_map_str.strip().split())
            label_map.update(_pair_to_dict_tuple(pair) for pair in
                             label_map_str.strip().split())
        feat_vectorizer = DictVectorizer()
        feat_vectorizer.fit([{name: 1} for name in feat_map])
        feat_vectorizer.vocabulary_ = feat_map
    else:
        feat_vectorizer = None
        label_map = None

    label_col = None if args.no_labels else args.label_col

    # Iterate through input file and collect the information we need
    reader = EXT_TO_READER[input_extension](args.infile,
                                            quiet=args.quiet,
                                            label_col=label_col,
                                            id_col=args.id_col)
    feature_set = reader.read()
    # write out the file in the requested output format
    writer_type = EXT_TO_WRITER[output_extension]
    writer_args = {'quiet': args.quiet}
    if writer_type is CSVWriter or writer_type is TSVWriter:
        writer_args['label_col'] = label_col
        writer_args['id_col'] = args.id_col
    elif writer_type is ARFFWriter:
        writer_args['label_col'] = label_col
        writer_args['id_col'] = args.id_col
        writer_args['regression'] = args.arff_regression
        writer_args['relation'] = args.arff_relation
    elif writer_type is LibSVMWriter:
        writer_args['label_map'] = label_map
    writer = writer_type(args.outfile, feature_set, **writer_args)
    writer.write()
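
A minimal sketch of driving this entry point programmatically, using the argv
parameter documented in the docstring (the file names are hypothetical):

# convert a hypothetical MegaM file to LibSVM format, suppressing the
# "Loading..." messages
main(['examples.megam', 'examples.libsvm', '--quiet'])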
Example #11
0
File: afrab0t.py Project: Akendo/afrab0t
	def on_pubmsg(self, c, e):
		nick = e.source.nick
		target = e.target if is_channel(e.target) else nick
		def reply(msg):
			self.send(target, msg)
		def dm(msg):
			self.send(nick, msg)
		line = UnicodeDammit(e.arguments[0]).unicode_markup
		log('   \033[37m{}→{}\033[0m'.format(nick, line))
		a = line.split(":", 1)
		if len(a) > 1 and a[0].lower() == self.nick:
			self.do_command(e, a[1].strip().lower(), nick, target, reply, dm)
			return

		# zeltofilter
		if 'zeltoph' in nick:
			return

		foo = settings.VIPS.get(nick, 0)
		if random() < foo:
			self.kick(nick)
	
		match = re.match('.*┻━┻.*', line)
		if match:
			reply('┬─┬ノ(ಠ_ಠノ)')
			return

		match = re.match('^({} *:)? *chaos-?([☆★☼☀*]|sternchen) *: ?(.*)$'.format(self.nick), line)
		if match:
			newcs = match.group(3)
			self.chaossternchen.append(newcs)
			self.sendchan('Chaos-☆ Nr. {} notiert: {}'.format(len(self.chaossternchen), newcs))
			return

		if line.startswith('.wiki '):
			wikipage = line[len('.wiki '):].strip()
			if re.match(r'^[-_+\w]+$', wikipage):
				wikiurl = 'http://afra-berlin.de/dokuwiki/doku.php?id={}'.format(wikipage)
				if 'Dieses Thema existiert noch nicht' in requests.get(wikiurl).text:
					reply("I'm sorry, I can't find a wiki page with that name.")
				else:
					reply(wikiurl)
			else:
				reply('Try to troll somebot else.')
			return

		if line == 'wat?':
			reply("I don't have a clue.")
			return
		if re.match('^hail eris[.!]* ', line.lower()):
			reply("All Hail Discordia!")
			return
		m = re.findall(r'(^|\s)?(gh?ah?nh?dh?ih?)(\s|$)?', line, re.IGNORECASE)
		for _1,match,_2 in m:
			if not re.match(r'(^|\s)?gandhi(\s|$)?', match, re.IGNORECASE):
				self.kick(nick, "It's spelled Gandhi")
				return
		if re.search('https?://[-a-z0-9.]*facebook.com', line.lower()):
			reply('A facebook link? srsly? Get some self-respect!')
			return
		match = re.search(r'https?://pr0gramm.com/#(newest/\*/[0-9/]*)', line.lower())
		if match:
			reply('Fixed that pr0gramm link for you: http://pr0gramm.com/static/'+match.group(1))
			return
		if line == 'moin':
			self.moincount += 1
			if self.moincount == 5:
				reply('moin')
			return
		else:
			self.moincount = 0
		if line.lstrip('.!#').startswith('eta '):
			eta = line[4:].strip()
			with self.db as db:
				db.execute("DELETE FROM etas WHERE nick=?", (nick,))
				if eta:
					db.execute("INSERT INTO etas VALUES (DATETIME('now'), ?, ?)", (nick, eta))
			dm('ETA registered. Thanks!')
			return
		m = re.findall(URL_REGEX, line.lower())
		for url,*_ in m:
			res = requests.get(url)
			if res.status_code == requests.codes.ok:
				soup = BeautifulSoup(res.text)
				reply(soup.title.string)
		m = re.findall(r'(^|\s)(afra)(\s|$)', line, re.IGNORECASE)
		for _1,match,_2 in m:
			if match != 'AfRA' and match != 'afra' and random() < 0.1:
				reply("I'm sure you meant AfRA, not "+match)
				return
Example #12
0
def main(argv=None):
    '''
    Handles command line arguments and gets things started.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    '''
    # Get command line arguments
    parser = argparse.ArgumentParser(description="Takes an input feature file \
                                                  and converts it to another \
                                                  format. Formats are \
                                                  determined automatically from\
                                                  file extensions.",
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('infile',
                        help='input feature file (ends in .jsonlines, .tsv, \
                              .csv, .arff, or .megam)')
    parser.add_argument('outfile',
                        help='output feature file (ends in .jsonlines, .tsv, \
                              .csv, .arff, or .megam)')
    parser.add_argument('-l', '--label_col',
                        help='Name of the column which contains the class \
                              labels in ARFF, CSV, or TSV files. For ARFF \
                              files, this must be the final column to count as\
                              the label.',
                        default='y')
    parser.add_argument('-q', '--quiet',
                        help='Suppress printing of "Loading..." messages.',
                        action='store_true')
    parser.add_argument('--arff_regression',
                        help='Create ARFF files for regression, not classification.',
                        action='store_true')
    parser.add_argument('--arff_relation',
                        help='Relation name to use for ARFF file.',
                        default='skll_relation')
    parser.add_argument('--reuse_libsvm_map',
                        help='If you want to output multiple files that use \
                              the same mapping from classes and features to \
                              numbers when writing libsvm files, you can \
                              specify an existing .libsvm file to reuse the \
                              mapping from.',
                        type=argparse.FileType('rb'))
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'))
    logger = logging.getLogger(__name__)

    # make sure the input file extension is one we can process
    input_extension = os.path.splitext(args.infile)[1].lower()
    output_extension = os.path.splitext(args.outfile)[1].lower()

    if input_extension == ".tsv":
        example_iter_type = _TSVDictIter
    elif input_extension == ".jsonlines" or input_extension == '.ndj':
        example_iter_type = _JSONDictIter
    elif input_extension == ".libsvm":
        example_iter_type = _LibSVMDictIter
    elif input_extension == ".megam":
        example_iter_type = _MegaMDictIter
    elif input_extension == ".csv":
        example_iter_type = _CSVDictIter
    elif input_extension == ".arff":
        example_iter_type = _ARFFDictIter
    else:
        logger.error(('Input file must be in either .arff, .csv, .jsonlines, '
                      '.libsvm, .megam, .ndj, or .tsv format. You specified: '
                      '{}').format(input_extension))
        sys.exit(1)

    # Build feature and label vectorizers from existing libsvm file if asked
    if args.reuse_libsvm_map and output_extension == '.libsvm':
        feat_map = {}
        label_map = {}
        for line in args.reuse_libsvm_map:
            line = UnicodeDammit(line, ['utf-8',
                                        'windows-1252']).unicode_markup
            if '#' not in line:
                logger.error('The LibSVM file you want to reuse the map from '
                             'was not created by SKLL and does not actually '
                             'contain the necessary mapping info.')
                sys.exit(1)
            comments = line.split('#')[1]
            _, label_map_str, feat_map_str = comments.split('|')
            feat_map.update(_pair_to_dict_tuple(pair) for pair in
                            feat_map_str.strip().split())
            label_map.update(_pair_to_dict_tuple(pair) for pair in
                             label_map_str.strip().split())
        feat_vectorizer = DictVectorizer()
        feat_vectorizer.fit([{name: 1} for name in feat_map])
        feat_vectorizer.vocabulary_ = feat_map
    else:
        feat_vectorizer = None
        label_map = None

    # Iterate through input file and collect the information we need
    ids = []
    classes = []
    feature_dicts = []
    example_iter = example_iter_type(args.infile, quiet=args.quiet,
                                     label_col=args.label_col)
    for example_id, class_name, feature_dict in example_iter:
        feature_dicts.append(feature_dict)
        classes.append(class_name)
        ids.append(example_id)

    # write out the file in the requested output format
    write_feature_file(args.outfile, ids, classes, feature_dicts,
                       arff_regression=args.arff_regression,
                       arff_relation=args.arff_relation,
                       feat_vectorizer=feat_vectorizer,
                       label_map=label_map)
Example #13
0
    if isinstance(block, Tag):
        continue

    # UnicodeDammit converts any string to UTF-8
    # does not work so well
    block = UnicodeDammit(block, soup.original_encoding).unicode_markup
    # remove leading and trailing newlines
    block = block.strip('\n')

    # if the block doesn't have any text, skip it
    if re.search(r'\w', block) is None:
        continue

    # bs4 does not always split the blocks cleanly, so it is better to
    # re-split by paragraph and process the pieces one at a time
    for line in block.split('\n'):
        stripped_line = line.strip(' \n\t\r')
        if re.search(r'\w', line) is None:
            continue

        print('------------------------------ Begin line ------------------------------')
        print(line)
        print('                        ------- End line -------')

        if is_intro:
            print()
            answer = input("Is that still part of the intro? (Y/n) ")

            if answer == 'n' or answer == 'N':
                is_intro = False
                movie_script.append({
Example #14
0
def analyze_content(script_text, encoding):
    print("\n\nStarting script parsing!\n\n")
    print("Start by telling me when the introduction will end.")

    is_intro = True
    movie_script = []
    intro = []
    last_line_type = -1
    last_character = ''
    line_type = None
    text = []
    characters = []
    usual_spaces = [[] for _ in range(len(BLOCK_TYPES))]

    for block in script_text.descendants:
        if isinstance(block, Tag):
            continue

        # UnicodeDammit converts any string to UTF-8
        # does not work so well
        block = UnicodeDammit(block, encoding).unicode_markup

        # remove leading and ending end of lines
        block = block.strip('\n')

        # if the block doesn't have any text, skip it
        if re.search(r'\w', block) is None:
            continue

        for line in block.split('\n'):
            stripped_line = line.strip(' \n\t\r')
            if re.search(r'\w', line) is None:
                continue

            print(
                '------------------------------ Begin line ------------------------------'
            )
            print(line)
            print(
                '------------------------------- End line -------------------------------'
            )

            if is_intro:
                print()
                answer = input("Is that still part of the intro? (Y/n) ")

                if answer == 'n' or answer == 'N':
                    is_intro = False
                    movie_script.append({
                        'type': 'introduction',
                        'text': '\n'.join(intro)
                    })

                    print(movie_script[-1])
                else:
                    print("OK")
                    print()
                    intro.append(stripped_line)
                    continue

            line_type = get_line_type(line, stripped_line, usual_spaces,
                                      characters)
            print("The last line was interpreted as '{}'".format(
                BLOCK_TYPES[line_type]))
            print()

            if last_line_type == -1 or last_line_type == line_type:  # -1 = not initialized
                text.append(stripped_line)
            else:
                if last_line_type == CHARACTER:
                    last_character = '\n'.join(text)
                    if last_character not in characters:
                        characters.append(last_character)
                elif last_line_type == SPEECH:
                    movie_script.append({
                        'type': BLOCK_TYPES[last_line_type],
                        BLOCK_TYPES[CHARACTER]: last_character,
                        'text': '\n'.join(text)
                    })
                    print('We just parsed this JSON block:')
                    print(movie_script[-1])
                else:
                    movie_script.append({
                        'type': BLOCK_TYPES[last_line_type],
                        'text': '\n'.join(text)
                    })
                    print('We just parsed this JSON block:')
                    print(movie_script[-1])
                text = [stripped_line]

            last_line_type = line_type
            print()

        print()
        print()

    movie_script.append({
        'type': BLOCK_TYPES[line_type],
        'text': '\n'.join(text)
    })

    print('We just parsed this JSON block:')
    print(movie_script[-1])
    print()
    print()

    return movie_script
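
Both this example and Example #9 call a get_line_type helper that is not
shown on this page. A minimal sketch of one plausible implementation, based
on the bookkeeping visible above; the usual_spaces structure (one list of
indentation counts per block type) and the ALL-CAPS/INT./EXT. cues are
assumptions borrowed from Examples #5 and #9:

import re

BLOCK_TYPES = ['character', 'speech', 'stage direction', 'location', 'unknown']
CHARACTER, SPEECH, DIRECTIONS, LOCATION, UNKNOWN = range(5)
location_regex = re.compile(r"^\s*(INT\.|EXT\.)")


def get_line_type(line, stripped_line, usual_spaces, characters=()):
    """Sketch: classify a screenplay line by indentation and simple cues.
    usual_spaces[t] is assumed to hold the indentation counts already
    associated with block type t."""
    indent = len(line) - len(line.lstrip(' '))
    if location_regex.match(line):
        return LOCATION
    # short ALL-CAPS lines, or previously seen names, are character cues
    if stripped_line.isupper() and len(stripped_line.split(' ')) <= 3:
        return CHARACTER
    if stripped_line in characters:
        return CHARACTER
    # otherwise fall back to the indentation profile of each block type
    for block_type in (SPEECH, DIRECTIONS):
        if indent in usual_spaces[block_type]:
            return block_type
    return UNKNOWN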