Example #1
def transcribe(input_file_url):
    """
    Transcribes an audio file fetched from the given URL.

    Params:
        input_file_url (string): The input audio file URL
    """

    # set up the environment
    if not utilities.check_env_vars():
        return
    utilities.create_ancillary_folders()

    # download the podcast file
    filepath = get_podcast_file(input_file_url)

    # convert file to raw audio chunks
    chunks = convert_to_raw_audio_chunks(filepath)

    # transcribe chunks
    transcriber = Transcriber(os.environ['GOOGLE_API_KEY'])
    transcript = transcriber.transcribe_many(chunks)

    # write to the output file
    output_file_name = os.path.split(filepath)[-1]
    utilities.write_output_file(output_file_name, transcript)

    print "Cleaning up...\n"
    utilities.cleanup()
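
A minimal invocation sketch; the URL is hypothetical, and GOOGLE_API_KEY plus the utilities helpers are assumed to be available:

# hypothetical call; requires GOOGLE_API_KEY in the environment
transcribe('https://example.com/podcasts/episode42.mp3')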
Example #2
def add_to_db(Session, k):
    """Add each track in k to the database, skipping rows that already exist."""
    session = Session()
    t = dt.date.today()
    adds = 0
    for line in k:
        clean_name = cleanup(line[0])
        n_ = band(name=line[0],
                  song=line[1],
                  album=line[2],
                  release_year=line[3],
                  source=line[4],
                  dateplayed=line[5],
                  dateadded=t,
                  cleanname=clean_name)
        q = session.query(band).filter(band.name == n_.name,
                                       band.song == n_.song,
                                       band.source == n_.source)
        if q.first() is None:
            session.add(n_)
            adds += 1
        else:
            try:
                print('Already had {0} - {1}'.format(n_.name, n_.song))
            except UnicodeEncodeError:
                print('Already had it. Cannot print. ID is {0}'.format(q.first().id))
        session.commit()

    return adds
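
A sketch of how this might be driven, assuming the band model and SQLite database used in the neighboring examples; the path and the sample row are illustrative:

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

engine = create_engine('sqlite:///../../databases/scout.db')
Session = sessionmaker(bind=engine)

# each row: (name, song, album, release_year, source, dateplayed)
rows = [('Big Thief', 'Not', 'Two Hands', 2019, 'KEXP', '2019-10-01')]
print('Added {0} new tracks'.format(add_to_db(Session, rows)))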
Example #3
def load_to_db(albumlist):
    socket.setdefaulttimeout(15)
    # create the SQL database and the "session" object that is used to manage
    # communications with the database
    engine = create_engine('sqlite:///../../databases/scout.db')
    session_factory = sessionmaker(bind=engine)
    Session = scoped_session(session_factory)
    # assuming band is a declarative model, this creates any missing tables
    band.metadata.create_all(engine)

    session = Session()

    t = dt.date.today()
    adds = 0

    for i in albumlist:
        print(i)
        clean_name = cleanup(i[0])
        n_ = band(name=i[0],
                  album=i[1],
                  source='KEXP Countdown 2018',
                  appeared='KEXP Countdown 2018',
                  dateadded=t,
                  cleanname=clean_name)
        # these rows carry no song, so deduplicate on name + album instead
        q = session.query(band).filter(band.name == n_.name, band.album == n_.album)
        if q.first() is None:
            session.add(n_)
            adds += 1
        else:
            try:
                print('Already had {0} - {1}'.format(n_.name, n_.album))
            except UnicodeEncodeError:
                print('Already had it. Cannot print. ID is {0}'.format(q.first().id))
        session.commit()
    print('Added {0} albums'.format(adds))
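
These examples all assume a band model defined elsewhere; a minimal declarative sketch, with column names inferred from the constructor calls above and types assumed, might look like:

from sqlalchemy import Column, Date, Integer, String
from sqlalchemy.orm import declarative_base

Base = declarative_base()

class band(Base):
    __tablename__ = 'bands'
    id = Column(Integer, primary_key=True)  # used by the "Cannot print" fallback
    name = Column(String)
    song = Column(String)
    album = Column(String)
    release_year = Column(Integer)
    source = Column(String)
    appeared = Column(String)
    dateplayed = Column(String)
    dateadded = Column(Date)
    cleanname = Column(String)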
Example #4
def load_other_bands(Session, choices):
    # this loop pulls down band names from the sources identified.
    # each source, however, needs its own function, since the
    # websites are set up differently

    # at least one of the choices needs to be in bandsources,
    # or else this function doesn't need to proceed
    if not any(src in bandsources for src in choices):
        return
    session = Session()
    today = dt.date.today()
    other_sources = [
        'KEXP Music That Matters', 'Pitchfork Top Tracks', 'Stereogum',
        'Metacritic', 'KCRW', 'Pitchfork', 'KEXP charts', 'KNKX'
    ]
    other_choices = [x for x in choices if x in other_sources]
    for src in other_choices:
        print('\n\n\n', src)
        try:
            bands = grabbands(src)
        except Exception as e:
            print('{0} not provided for in load_other_bands.grabbands'.format(src))
            print(str(e))
            continue  # skip this source rather than reuse a stale or undefined list
        add_count = 0
        for i in bands:
            h = cleanup(i.name)
            if h != '':
                try:
                    q = session.query(band).filter(band.cleanname == h)
                    if q.first() is None:
                        i.source = src
                        i.cleanname = h
                        i.dateadded = today
                        print("Adding {0} (from {1})".format(i.name, i.source))
                        session.add(i)
                        session.commit()
                        add_count += 1
                except Exception as e:
                    print(str(e))
        print('Added {0} entries to the {1} collection. \n\n'.format(
            add_count, src))
    return
Example #5
                msg['content']['url'])

        for category in msg['categories']:
            if category not in pre_payload:
                pre_payload[category] = []
            pre_payload[category].append(msg)

    payload = [dblock()]

    for category in pre_payload:
        payload.append(cblock(category))

        # use a sorted, de-duplicated list so type sections come out in a stable order
        types = sorted({t['type'] for t in pre_payload[category]})

        for t in types:
            payload.append(tblock(t))
            payload.append(
                iblock(
                    [msg for msg in pre_payload[category] if msg['type'] == t],
                    category, t))

        payload.append(dblock())

    ec.parse()
    slacky.send_message(blocks=payload, channel_id=channel_id)

    slacky.delete_set_messages(messages, channel_id=channel_id)

    ut.cleanup()
Example #6
def similar(a, b):
    """Return a 0-to-1 similarity ratio between the cleaned-up forms of a and b."""
    a = cleanup(a)
    b = cleanup(b)
    return SequenceMatcher(None, a, b).ratio()
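
similar() delegates to difflib's SequenceMatcher, whose ratio() falls in [0, 1]; called directly (bypassing cleanup), it behaves like this:

from difflib import SequenceMatcher

print(SequenceMatcher(None, 'the beatles', 'beatles').ratio())  # ~0.78
print(SequenceMatcher(None, 'radiohead', 'radiohead').ratio())  # 1.0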
Example #7
File: BinGeR.py  Project: jrherr/BinGeR
def main(argv=sys.argv[1:]):

    parser = OptionParser(usage=USAGE, version="Version: " + __version__)

    # Required arguments
    requiredOptions = OptionGroup(
        parser, "Required options",
        "These options are required to run BinGeR, and may be supplied in any order."
    )

    requiredOptions.add_option(
        "-l",
        "--sample_list",
        type="string",
        metavar="FILE",
        help="Text file containing all sample names, one per line")

    requiredOptions.add_option(
        "-o",
        "--out_dir",
        type="string",
        metavar="OUTDIR",
        help=
        "Working directory where the results and intermediate files will be stored"
    )

    parser.add_option_group(requiredOptions)

    # Optional arguments that need to be supplied if not the same as default
    optOptions = OptionGroup(
        parser, "Optional parameters",
        "There options are optional, and may be supplied in any order.")

    optOptions.add_option(
        "-b",
        "--bams_dir",
        type="string",
        default="Bams",
        metavar="DIR",
        help=
        "Directory where sorted bam files (reads versus assembly, same sample) are, the naming should follow \"sample.*.bam\" convention. [Default: ./Bams]"
    )

    optOptions.add_option(
        "-c",
        "--coverage_dir",
        type="string",
        default="Coverage",
        metavar="DIR",
        help=
        "Directory where coverage files are, naming follows \"sampleA.vs.sampleB.*.coverage\" convention. [Default: ./Coverage]"
    )

    optOptions.add_option(
        "-a",
        "--assemblies_dir",
        type="string",
        default="Assemblies",
        metavar="DIR",
        help=
        "Directory where assemblies in fasta format are, naming follows \"sample.*.fa\" convention. [Default: ./Assemblies]"
    )

    optOptions.add_option(
        "-z",
        "--zscore_dir",
        type="string",
        default="ZScores",
        metavar="DIR",
        help=
        "Directory where oligo-nt z-score files are, naming follows \"sample.*.ZScore\" convention. [Default: ./ZScores]"
    )

    optOptions.add_option(
        "-s",
        "--hmmscan_dir",
        type="string",
        default="HMMScan",
        metavar="DIR",
        help=
        "Directory where hmmscan files are, naming follows \"sample.*.hmmscan\" convention. [Default: ./HMMScan]"
    )

    optOptions.add_option(
        "-t",
        "--num_proc",
        type="int",
        default=1,
        metavar='INT',
        help="Number of processor for BinGeR to use [default: 1].")

    optOptions.add_option("--blat",
                          type="string",
                          default="blat",
                          help="Path to blat, specify if not in env.")

    parser.add_option_group(optOptions)

    # Binning parameters that could fine tune the process
    clusteringOptions = OptionGroup(
        parser, "Binning parameters",
        "There options are optional, and may be supplied in any order.")

    clusteringOptions.add_option(
        "-m",
        "--min_core",
        type="int",
        default=1e5,
        metavar='INT',
        help="Minimum size to consider as bin core [default: 1e5].")

    clusteringOptions.add_option(
        "-u",
        "--cov_clustering_min_length",
        dest="minCovLength",
        type="int",
        default=1500,
        metavar='INT',
        help=
        "Minimum contig length to be considered in coverage clustering [default: 1500]."
    )

    clusteringOptions.add_option(
        "--min_cov_corrcoef",
        dest="minCovCorrceof",
        type="float",
        default=0.95,
        metavar='FLOAT',
        help=
        "Minimum correlation coefficient cutoff to form a link between contigs using coverage profiles [default: 0.95]."
    )

    clusteringOptions.add_option(
        "--min_zscore_corrcoef",
        dest="minZScoreCorrceof",
        type="float",
        default=0.95,
        metavar='FLOAT',
        help=
        "Minimum correlation coefficient cutoff to form a link between contigs using tri-/tetra-nt frequency Z-Score [default: 0.95]."
    )

    clusteringOptions.add_option(
        "-x",
        "--zscore_clustering_min_length",
        dest="minZLength",
        type="int",
        default=3000,
        metavar='INT',
        help=
        "Minimum contig length to be considered in Z-score clustering [default: 3000]."
    )

    clusteringOptions.add_option(
        "-d",
        "--cpr_alpha",
        type="float",
        default=0.9,
        metavar='FLOAT',
        help=
        "The damping factor, alpha, in community personalized PageRank [default: 0.9, range: (0.75, 0.95)]."
    )

    clusteringOptions.add_option(
        "-e",
        "--cpr_tol",
        type="float",
        default=1e-4,
        metavar='FLOAT',
        help=
        "The error tolerance factor, tol, in community personalized PageRank [default: 1e-4, range: (1e-8, 1e-2)]."
    )

    clusteringOptions.add_option(
        "-i",
        "--cpr_maxiter",
        type="int",
        default=50,
        metavar='INT',
        help=
        "The max iterations performed in community personalized PageRank [default: 50, range: (20, 100)]."
    )

    parser.add_option_group(clusteringOptions)

    # runtime settings that could affect the file saving and message printing
    runtimeSettings = OptionGroup(
        parser, "Runtime settings",
        "There options are optional, and may be supplied in any order.")

    runtimeSettings.add_option(
        "-q",
        "--quiet",
        default=False,
        action="store_true",
        help=
        "Suppress detailed runtime information; only important messages will show [default: False]."
    )

    runtimeSettings.add_option(
        "--no_intermediates",
        action="store_false",
        dest="save_intermediates",
        default=True,
        help=
        "Do not save intermediate files during runtime [default: True (save intermediates)]."
    )

    parser.add_option_group(runtimeSettings)

    (options, args) = parser.parse_args(argv)

    # parser.error() exits with status 2, so no explicit exit() calls are needed
    if options.sample_list is None:
        parser.error("A list of samples and a working directory are required!")

    if options.out_dir is None:
        parser.error("An output directory is required!")

    if options.num_proc < 1:
        parser.error(
            "Number of processors must be an integer >= 1, you supplied %i" %
            options.num_proc)

    if options.min_core < 1e4 or options.min_core > 1e6:
        parser.error(
            "Minimum bin core size must be in range [1e4, 1e6] bp, you supplied %i"
            % options.min_core)

    if options.cpr_alpha < 0.75 or options.cpr_alpha > 0.95:
        parser.error(
            "Community PageRank alpha must be a float in range [0.75, 0.95], you supplied %.3f"
            % options.cpr_alpha)

    if options.cpr_tol < 1e-8 or options.cpr_tol > 1e-2:
        parser.error(
            "Community PageRank tol must be a float in range [1e-8, 1e-2], you supplied %g"
            % options.cpr_tol)

    if options.cpr_maxiter < 20 or options.cpr_maxiter > 100:
        parser.error(
            "Community PageRank maxiter must be an integer in range [20, 100], you supplied %i"
            % options.cpr_maxiter)

    total_start_time = time()
    sys.stdout.write("BinGeR started at %s\n" % (ctime()))
    sys.stdout.flush()

    # test if blat exists; with no arguments blat prints its usage to stdout
    blatTest = Popen(options.blat, shell=True, stdout=PIPE).stdout.read()
    if not blatTest:
        sys.stderr.write("FATAL: blat not found in path!\n")
        exit(1)

    # check sanity of the files in required directories
    projInfo = ProjectInfo()
    projInfo.initProject(options)
    if not options.quiet:
        projInfo.printSamples()

    # build networkx graph for Project
    sys.stdout.write('\nInitializing contig space...\n')
    G = cSpace.ContigSpace(projInfo.samples)

    initCoresPath = projInfo.out_dir + '/initCores'

    if os.path.exists(initCoresPath):
        if len(glob.glob(initCoresPath + '/*.cpickle')) > 0:
            G.refineCores(projInfo, options)
            G.recruitContigs(projInfo, options)
    else:
        G.initSubgraphs(projInfo, options)
        G.forgeCores(projInfo, options)
        G.refineCores(projInfo, options)
        G.recruitContigs(projInfo, options)

    # output bins and the evaluation, extract reads of bins for downstream analysis.
    utilities.outputBins(projInfo, options)

    # get all the reads for the bins
    utilities.extractReadsForBins(projInfo, options)

    # clean up if necessary
    if not options.save_intermediates:
        utilities.cleanup(projInfo)

    total_finish_time = time()
    sys.stdout.write("BinGeR finished at %s\n" % (ctime()))
    sys.stdout.flush()

    return
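
Because main() accepts an argv list, a run can be launched programmatically; the paths below are illustrative:

# hypothetical run with the two required options and four worker processes
main(['-l', 'samples.txt', '-o', 'BinGeR_out', '-t', '4'])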