Example #1
import os
import sys

import FileImporter  # Local module used by this example


def import_path(pathname, verbal=False):
    sys.stdout.write("\n")
    path_results = []
    mogu_files = []
    directories = []
    single_file = pathname.endswith(".mogu")
    
    if not single_file:
        mogu_files = [entry for entry in os.listdir(pathname)
                      if entry.endswith(".mogu")]
        directories = [entry for entry in os.listdir(pathname)
                       if os.path.isdir(os.path.join(pathname, entry))]
    else:
        mogu_files = [pathname]
    shortname = os.path.basename(pathname)  # Used in the progress display
    # Recurse into subdirectories before handling this directory's files
    for directory in directories:
        path_results.extend(import_path(os.path.join(pathname, directory),
                verbal))
    for i, mogufile in enumerate(mogu_files):
        # Display nice progress information so the user doesn't think
        # something is wrong
        sys.stdout.write("\r%s " % (" " * 80))  # Clear the previous line
        sys.stdout.flush()
        sys.stdout.write("\r%s Progress: %d%% (%s)" % (shortname,
            ((i + 1.0) / len(mogu_files)) * 100, mogufile))
        sys.stdout.flush()

        # Append lexed results to results
        if single_file:
            sys.stdout.write("Importing single file: %s" % mogufile)
            sys.stdout.flush()
            path_results.extend(FileImporter.import_file(mogufile, verbal))
        else:
            path_results.extend(
                    FileImporter.import_file(
                        os.path.join(pathname,mogufile),verbal))

    sys.stdout.write("\n")
    # The results will be a list of tuples, each of which contains two
    # entries:
    #   index 0: the OrderedDict mapping token names to tokens
    #   index 1: the actual map used to parse the tokens
    return path_results
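
A quick usage sketch for Example #1; the "scripts/" path is a hypothetical placeholder, and the tuple shape follows the comment above:

if __name__ == "__main__":
    # Hypothetical directory containing .mogu files; a path ending in
    # ".mogu" would be imported as a single file instead.
    results = import_path("scripts/", verbal=True)
    for token_dict, parse_map in results:
        # token_dict: OrderedDict of token names to tokens
        # parse_map: the map used to parse the tokens
        print(len(token_dict))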
Example #2
import sys

import FileImporter
import PathImporter
import PythonObjectConverter
import RedisWriter
import SymbolRegistry


def mogu_import(args):
    results = []        # Holds the final result of all consumption operations
    redis_objects = []  # Holds the objects that can be written to Redis
    write = not args.testing  # If testing, don't actually write to Redis

    for path in args.command[1:]:
        if path.endswith(".mogu"):  # The path points to a specific file
            results.extend(FileImporter.import_file(path, args.v))
        else:  # The path is a directory
            results.extend(PathImporter.import_path(path, args.v))

    # Convert each lexed result into objects that can be written to Redis
    converter = PythonObjectConverter.PythonObjectConverter()
    for result in results:
        redis_objects.extend(converter.convert(result))

    # SANITY CHECKS #

    # First, make sure that all symbols referenced are defined.
    for registry in [
            SymbolRegistry.widgetRegistry,
            SymbolRegistry.templateRegistry,
            SymbolRegistry.dataRegistry,
            SymbolRegistry.validatorRegistry,
            SymbolRegistry.policyRegistry
            ]:
        if not registry:  # Falsy when a referenced symbol was never defined
            sys.stderr.write(display_undefined_symbols(registry))
            sys.stderr.write("\n== REFUSING TO CONTINUE ==\n")
            sys.exit()

        if registry.nonreferenced():  # Warn if something was defined but never used
            sys.stderr.write("\n== WARNING: %s contains the following symbols that are defined but never referenced ==\n" %
                    registry.label)
            for symbol in registry.nonreferenced():
                sys.stderr.write("\t- %s\n" % symbol)
            # Give the user a chance to halt the import and fix the problem
            if not args.yes:
                answer = raw_input("Continue anyway? [y to continue, anything else to cancel]: ")
                if answer != 'y':
                    sys.stderr.write("Exiting...\n")
                    sys.exit()

    # Sanity checks complete. Now actually write to Redis.

    # TODO Don't forget to make RedisWriter deal with purging/merging
    if write:
        writer = RedisWriter.RedisWriter(args)  # TODO RedisWriter should read dbconfig.conf instead
        if args.v:
            sys.stderr.write("Writing imported files to database!\n")
        writer.write(redis_objects)
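
For reference, a minimal sketch of the registry interface the sanity checks above rely on. Only the truth test, nonreferenced(), and label are implied by the code; the internal sets and method bodies are assumptions:

class SymbolRegistry(object):
    """Hypothetical sketch: tracks defined vs. referenced symbols."""
    def __init__(self, label):
        self.label = label
        self.defined = set()
        self.referenced = set()

    def undefined(self):
        # Symbols referenced somewhere but never defined
        return self.referenced - self.defined

    def nonreferenced(self):
        # Symbols defined but never referenced
        return self.defined - self.referenced

    def __nonzero__(self):      # Python 2 truth test: falsy if anything is undefined
        return not self.undefined()
    __bool__ = __nonzero__      # Same behavior under Python 3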
Example #3
import csv
import nltk

import chunker
import xml_writer  # alias "as x" dropped: it was shadowed by x = CentroidClusterModule() below
# import chunks
from CentroidClusterModule import CentroidClusterModule
import Minute_chunker
import FileImporter
import TagHandler

the_dir = '/Users/hah661/Documents/Northwestern/MyPHD/social_policy_course/SocPol_Video/transcript_txts/'
files = [
    'F2014_1.txtout.csv', 'F2014_2.txtout.csv', 'F2014_3.txtout.csv',
    'F2014_4.txtout.csv', 'F2014_5.txtout.csv', 'W2014_1.txtout.csv',
    'W2014_2.txtout.csv', 'W2014_3.txtout.csv', 'W2014_4.txtout.csv'
]

# Concatenate every transcript file into one list of spoken entries
all_spoken = []
for filename in files:
    all_spoken.extend(FileImporter.import_file(the_dir + filename))

# Break the combined transcript into one-minute chunks
chunk_tags = Minute_chunker.chunk(all_spoken)

# Attach the chunk tags to each spoken entry, then strip them again
all_spoken = TagHandler.apply_tags(all_spoken, chunk_tags, "minute_chunk")

all_spoken = TagHandler.remove_tag(all_spoken, "minute_chunk")

x = CentroidClusterModule()  # Instantiate the clustering module
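
For context, a minimal sketch of what the TagHandler calls above might do, assuming each spoken entry is a dict and chunk_tags is a parallel list of tag values (both assumptions; the real module is not shown):

def apply_tags(entries, tags, tag_name):
    # Attach tags[i] to entries[i] under tag_name (assumed behavior)
    for entry, tag in zip(entries, tags):
        entry.setdefault("tags", {})[tag_name] = tag
    return entries

def remove_tag(entries, tag_name):
    # Strip the named tag from every entry
    for entry in entries:
        entry.get("tags", {}).pop(tag_name, None)
    return entries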