Exemplo n.º 1
0
def build_index(corpus_dir, corpus_type, stop_file, index_file, tag_file,
                word_count_file, synch_freq):
    """Index every document of a corpus into a jar, then synchronize it.

    Two corpus layouts are supported:

    * "phpBB": posts are first extracted into the database ``corpus_dir +
      ".db"`` by ``read_corpora.get_phpBB_posts``; each (title, post) pair
      yielded by ``iter_indexed_posts`` is then added to the jar.
    * "xml": each (file_name, title, heading, text) record yielded by
      ``read_corpora.iter_xml`` has its title, heading and text passed
      through ``index_jar.index_and_count_text`` before being added.

    Args:
        corpus_dir: location of the corpus; also recorded as the ``source``
            of every phpBB document.
        corpus_type: either "phpBB" or "xml".
        stop_file: forwarded to ``jar.Jar`` (presumably a stop-word list --
            confirm against ``jar.Jar``).
        index_file: forwarded to ``jar.Jar``.
        tag_file: forwarded to ``read_corpora.iter_xml``; required when
            ``corpus_type`` is "xml", ignored for "phpBB".
        word_count_file: forwarded to ``jar.Jar``.
        synch_freq: forwarded to ``jar.Jar``.

    Raises:
        AttributeError: if ``corpus_type`` is not "phpBB" or "xml", or if
            ``corpus_type`` is "xml" and no ``tag_file`` was supplied.
    """
    # Reject bad arguments before the jar is constructed, so an invalid call
    # fails fast instead of building a jar that is never used.
    if corpus_type not in ("phpBB", "xml"):
        raise AttributeError(
            "invalid corpus type %s\nmust be one of phpBB or xml"
            % corpus_type)
    if corpus_type == "xml" and not tag_file:
        raise AttributeError(
            "a tag file must be supplied when parsing an xml corpus")

    index_jar = jar.Jar(index_file, word_count_file, synch_freq, stop_file)

    if corpus_type == "phpBB":
        post_db = corpus_dir + ".db"
        read_corpora.get_phpBB_posts(corpus_dir, post_db)
        for title, post in iter_indexed_posts(post_db,
                                              read_corpora.POST_DELIMITER,
                                              index_jar):
            index_jar.add_doc(source=corpus_dir, title=title, text=post)
    else:
        for file_name, title, heading, text in read_corpora.iter_xml(
                corpus_dir, tag_file):
            title = index_jar.index_and_count_text(title)
            heading = index_jar.index_and_count_text(heading)
            text = index_jar.index_and_count_text(text)
            index_jar.add_doc(file_name, title, heading, text)

    # One final synch and then we are done.
    index_jar.synchronize()
Exemplo n.º 2
0
def _ParseGypLists(values):
    """Parse each GYP-list string in |values| and concatenate the results."""
    parsed = []
    for value in values:
        parsed += build_utils.ParseGypList(value)
    return parsed


def main(argv):
    """Compile .java sources with javac and optionally package the results.

    Parses the command line in |argv| (after expanding file arguments),
    collects the .java files given as positional arguments, found in
    --src-gendirs and extracted from --java-srcjars, optionally filters them
    by --javac-includes, and compiles them into a temporary classes
    directory. Depending on the options it then:

    * writes a jar of the classes to --jar-path (with a manifest when
      --main-class or --manifest-entry is given),
    * writes a source jar to --jar-source-path,
    * replaces --classes-dir with the freshly compiled classes,
    * writes a depfile and touches the --stamp file.

    Args:
        argv: command-line arguments, passed to
            build_utils.ExpandFileArgs and then to the option parser.
    """
    colorama.init()

    argv = build_utils.ExpandFileArgs(argv)

    parser = optparse.OptionParser()
    build_utils.AddDepfileOption(parser)

    parser.add_option('--src-gendirs',
                      help='Directories containing generated java files.')
    parser.add_option('--java-srcjars',
                      action='append',
                      default=[],
                      help='List of srcjars to include in compilation.')
    parser.add_option(
        '--bootclasspath',
        action='append',
        default=[],
        help='Boot classpath for javac. If this is specified multiple times, '
        'they will all be appended to construct the classpath.')
    # default=[] matters: without it an omitted 'append' option is None and
    # iterating over it below would raise TypeError.
    parser.add_option(
        '--classpath',
        action='append',
        default=[],
        help='Classpath for javac. If this is specified multiple times, they '
        'will all be appended to construct the classpath.')
    parser.add_option(
        '--javac-includes',
        help='A list of file patterns. If provided, only java files that match '
        'one of the patterns will be compiled.')
    parser.add_option(
        '--jar-excluded-classes',
        default='',
        help='List of .class file patterns to exclude from the jar.')

    parser.add_option(
        '--chromium-code',
        type='int',
        help='Whether code being compiled should be built with stricter '
        'warnings for chromium code.')

    parser.add_option('--use-errorprone-path',
                      help='Use the Errorprone compiler at this path.')

    parser.add_option('--classes-dir',
                      help='Directory for compiled .class files.')
    parser.add_option('--jar-path', help='Jar output path.')
    parser.add_option('--jar-source-path', help='Source jar output path.')
    parser.add_option(
        '--jar-source-base-dir',
        help=
        'Base directory for the source files included in the output source jar.'
    )
    parser.add_option('--main-class',
                      help='The class containing the main method.')
    parser.add_option('--manifest-entry',
                      action='append',
                      help='Key:value pairs to add to the .jar manifest.')

    parser.add_option('--stamp', help='Path to touch on success.')

    options, args = parser.parse_args(argv)

    if options.main_class and not options.jar_path:
        parser.error('--main-class requires --jar-path')

    bootclasspath = _ParseGypLists(options.bootclasspath)
    classpath = _ParseGypLists(options.classpath)
    java_srcjars = _ParseGypLists(options.java_srcjars)

    java_files = list(args)
    if options.src_gendirs:
        src_gendirs = build_utils.ParseGypList(options.src_gendirs)
        java_files += build_utils.FindInDirectories(src_gendirs, '*.java')

    # Snapshot the inputs for the depfile now, before sources extracted into
    # the temporary directory are appended to java_files.
    input_files = bootclasspath + classpath + java_srcjars + java_files
    with build_utils.TempDir() as temp_dir:
        classes_dir = os.path.join(temp_dir, 'classes')
        os.makedirs(classes_dir)
        if java_srcjars:
            java_dir = os.path.join(temp_dir, 'java')
            os.makedirs(java_dir)
            for srcjar in java_srcjars:
                build_utils.ExtractAll(srcjar, path=java_dir, pattern='*.java')
            java_files += build_utils.FindInDirectory(java_dir, '*.java')

        if options.javac_includes:
            javac_includes = build_utils.ParseGypList(options.javac_includes)
            # Keep a file when it matches at least one include pattern.
            java_files = [
                f for f in java_files
                if any(fnmatch.fnmatch(f, include)
                       for include in javac_includes)
            ]

        if java_files:
            DoJavac(bootclasspath, classpath, classes_dir,
                    options.chromium_code, options.use_errorprone_path,
                    java_files)

        if options.jar_path:
            if options.main_class or options.manifest_entry:
                # Split on the first ':' only, so a value that itself
                # contains a colon stays in one piece.
                entries = [
                    entry.split(':', 1)
                    for entry in (options.manifest_entry or [])
                ]
                manifest_file = os.path.join(temp_dir, 'manifest')
                CreateManifest(manifest_file, classpath, options.main_class,
                               entries)
            else:
                manifest_file = None
            jar.JarDirectory(classes_dir,
                             build_utils.ParseGypList(
                                 options.jar_excluded_classes),
                             options.jar_path,
                             manifest_file=manifest_file)

            if options.jar_source_path:
                jar.Jar(java_files, options.jar_source_base_dir,
                        options.jar_source_path)

        if options.classes_dir:
            # Delete the old classes directory. This ensures that all .class
            # files in the output are actually from the input .java files.
            # For example, if a .java file is deleted or an inner class is
            # removed, the classes directory should not contain the
            # corresponding old .class file after running this action.
            build_utils.DeleteDirectory(options.classes_dir)
            shutil.copytree(classes_dir, options.classes_dir)

    if options.depfile:
        build_utils.WriteDepfile(
            options.depfile, input_files + build_utils.GetPythonDependencies())

    if options.stamp:
        build_utils.Touch(options.stamp)