Exemplo n.º 1
0
def generate_job(func,category,inputs=None,batch_num=1):
    '''
    generate_job
    Build the "python -c" command(s) that call a plugin function and append them
    to the extraction job script of the calling plugin.

    Parameters
    ==========
    func: str
        name of function to call in plugin functions.py
    category: str
        must be one of "terms" or "corpus" or "relations" corresponding to output folder.
        For any other value nothing is written and the function returns.
    inputs: dict
        key should be arg name. A str value is formatted once (format_single_input)
        and repeated in every command; a list value is split across commands in
        batches (format_inputs). Lists must all have the same length: a list whose
        length differs from the first list collected is left out.
        If inputs are not specified, it is assumed that the function will be called once
        with no inputs.
    batch_num: int
        the number of jobs to package into one job. For example, batch_num=100 will run
        func with 100 of the input items specified. Each is still written to its own
        output file.
    '''
    # Get name of calling plugin: the folder that holds the caller's source file
    home = wordfish_home()
    cf = inspect.currentframe()
    caller = inspect.getouterframes(cf, 2)
    tag = os.path.dirname(caller[1][1]).split("/")[-1]
    script = "wordfish.plugins.%s.functions" %(tag)
    output_dir = ' output_dir="%s/%s/%s"' %(home,category,tag)

    # script name to add jobs to
    extraction_script = "%s/scripts/run_extractions_%s.job" %(home,tag)

    # Unknown categories are ignored: no command is generated or written
    if category not in ["corpus","terms","relations"]:
        return

    lines_to_add = []
    if inputs is None:
        lines_to_add.append("python -c 'from %s import %s; %s(%s)'" %(script,func,func,output_dir))
    else:
        formatted_inputs = ""
        input_lists = dict()
        N = None
        # items() (rather than iteritems) iterates the same way on Python 2 and 3
        for varname,elements in inputs.items():
            if isinstance(elements,str):
                # String args are the same for every command
                single_input = format_single_input(varname,elements)
                formatted_inputs = "%s%s" %(formatted_inputs,single_input)
            elif isinstance(elements,list):
                # The first list sets the required length; others must match it
                if N is None:
                    N = len(elements)
                if len(elements) == N:
                    input_lists[varname] = elements

        # If we have no input lists, just write the job with single args
        if len(input_lists) == 0:
            formatted_inputs = formatted_inputs.strip(",")
            lines_to_add.append("python -c 'from %s import %s; %s(%s,%s)'" %(script,func,func,output_dir,formatted_inputs))
        else:
            iters = int(numpy.ceil(N/float(batch_num)))
            for i in range(iters):
                start = i*batch_num
                # Clamp so the last batch never reaches past the end of the lists
                end = min(start+batch_num,N)
                formatted_instance = formatted_inputs
                for varname,elements in input_lists.items():
                    new_input = format_inputs(varname,elements[start:end])
                    formatted_instance = "%s%s" %(formatted_instance,new_input)
                # str.strip returns a new string, so the result must be kept
                formatted_instance = formatted_instance.strip(",")
                lines_to_add.append("python -c 'from %s import %s; %s(%s,%s)'" %(script,func,func,output_dir,formatted_instance))

    # Add lines
    add_lines(script=extraction_script,lines_to_add=lines_to_add)
Exemplo n.º 2
0
def generate_job(func, category, inputs=None, batch_num=1):
    '''
    generate_job
    Append to the calling plugin's extraction job script the "python -c"
    command(s) needed to run one of its functions.

    Parameters
    ==========
    func: str
        name of function to call in plugin functions.py
    category: str
        must be one of "terms" or "corpus" or "relations" corresponding to output folder.
        Any other value results in no command being written.
    inputs: dict
        key should be arg name. str values are formatted with
        format_single_input and shared by every command; list values are
        formatted with format_inputs, one batch per command. Lists are
        expected to have equal length, and a list whose length differs from
        the first list found is not used.
        If inputs are not specified, it is assumed that the function will be called once
        with no inputs.
    batch_num: int
        the number of jobs to package into one job. For example, batch_num=100 will run
        func with 100 of the input items specified. Each is still written to its own
        output file.
    '''
    # Get name of calling plugin (directory containing the caller's file)
    home = wordfish_home()
    cf = inspect.currentframe()
    caller = inspect.getouterframes(cf, 2)
    tag = os.path.dirname(caller[1][1]).split("/")[-1]
    script = "wordfish.plugins.%s.functions" % (tag)
    output_dir = ' output_dir="%s/%s/%s"' % (home, category, tag)

    # script name to add jobs to
    extraction_script = "%s/scripts/run_extractions_%s.job" % (home, tag)

    lines_to_add = []
    if category in ["corpus", "terms", "relations"]:
        if inputs is None:
            lines_to_add.append("python -c 'from %s import %s; %s(%s)'" %
                                (script, func, func, output_dir))
        else:
            formatted_inputs = ""
            # First collect all string args - this means same for all scripts
            # (items() works on both Python 2 and Python 3 dicts)
            for varname, elements in inputs.items():
                if isinstance(elements, str):
                    single_input = format_single_input(varname, elements)
                    formatted_inputs = "%s%s" % (formatted_inputs,
                                                 single_input)

            # Now collect lists, must be equal length
            input_lists = dict()
            expected_length = None
            for varname, elements in inputs.items():
                if isinstance(elements, list):
                    if expected_length is None:
                        expected_length = len(elements)
                    # Lists of a different length are left out
                    if len(elements) == expected_length:
                        input_lists[varname] = elements

            # If we have no input lists, just write the job with single args
            if len(input_lists) == 0:
                formatted_inputs = formatted_inputs.strip(",")
                lines_to_add.append(
                    "python -c 'from %s import %s; %s(%s,%s)'" %
                    (script, func, func, output_dir, formatted_inputs))
            else:
                N = expected_length
                iters = int(numpy.ceil(N / float(batch_num)))
                start = 0
                for i in range(1, iters + 1):
                    formatted_instance = formatted_inputs
                    # The final batch stops at the end of the lists
                    end = min(i * batch_num, N)
                    for varname, elements in input_lists.items():
                        new_input = format_inputs(varname, elements[start:end])
                        formatted_instance = "%s%s" % (formatted_instance,
                                                       new_input)
                    start = end
                    # Strings are immutable: keep the stripped copy
                    formatted_instance = formatted_instance.strip(",")
                    lines_to_add.append(
                        "python -c 'from %s import %s; %s(%s,%s)'" %
                        (script, func, func, output_dir, formatted_instance))

        # Add lines
        add_lines(script=extraction_script, lines_to_add=lines_to_add)
Exemplo n.º 3
0
import urllib2
import pandas
import pickle
import numpy
import re
import os
import sys

# IMPORTS FOR ALL PLUGINS
from wordfish.corpus import save_sentences
from wordfish.terms import save_terms
from wordfish.terms import save_relations
from wordfish.plugin import generate_job
from wordfish.utils import wordfish_home

# Root folder returned by wordfish_home(); plugin paths below are built from it
home = wordfish_home()

# REQUIRED WORDFISH FUNCTION
def go_fish():
    '''
    go_fish
    Required wordfish entry point for this plugin. Reads the two tab-separated
    tables located by download_data() (features and database), reports how many
    unique ids the database holds, and takes the term names from the feature
    columns.
    '''

    # f and d are handed to pandas.read_csv as the features and database tables
    f,d = download_data()
    features = pandas.read_csv(f,sep="\t")  
    database = pandas.read_csv(d,sep="\t")  
    # One entry per distinct value of the "id" column
    pmids = database.id.unique().tolist()
    print "NeuroSynth database has %s unique PMIDs" %(len(pmids))

    # Generate brain maps to extract relationships with
    terms = features.columns.tolist()
    terms.pop(0)  # drop the first column (pmid) so only term names remain
    
    # Folder for the maps, under the plugin's terms output directory
    maps_dir = "%s/terms/neurosynth/maps" %(home)
Exemplo n.º 4
0
#!/usr/bin/python

# IMPORTS #########################################################################
import os
import sys
from wordfish.vm import init_scripts, make_plugin_folders
from wordfish.utils import make_directory, wordfish_home
from wordfish.terms import download_nltk

# DIRECTORIES #####################################################################
# make_directory is called for each output folder under the wordfish home
analysis_dir = wordfish_home()
corpus_output = make_directory("%s/corpus" %(analysis_dir))
terms_output = make_directory("%s/terms" %(analysis_dir))
relations_output = make_directory("%s/relations" %(analysis_dir))
scripts_directory = make_directory("%s/scripts" %(analysis_dir))

# INIT FUNCTIONS ##################################################################
# These are fine to re-run, if already done will not cause harm
download_nltk()
make_plugin_folders(analysis_dir)
init_scripts(scripts_directory,analysis_dir)

# USER INSTRUCTIONS ###############################################################
# Parenthesised so the statement is valid under both Python 2 and Python 3.
# The message is split into adjacent literals only for readability.
# Extraction scripts are named run_extractions_<plugin>.job, so the example uses .job
print((
    "\n\n\n################################ WORDFISH ################################\n\n"
    "Application at %s\n\n"
    "Scripts are generated in scripts folder. First open run_slurm.py to check that the parameters are ok. "
    "Then you will run a script to generate jobs for each plugin:\n\n"
    " python run_slurm.py run_first.job\n\n"
    "This will generate a list of commands to be run for each plugin that you have selected, "
    "and you should submit the list to your cluster (or run with launch) to complete all the extractions:\n\n"
    "python run_slurm.py run_extractions_reddit.job\n\n"
    "Launch is recommended as another method, see: https://github.com/vsoch/poldracklab-launch\n\n"
    "Once corpus, terms, and relationships are extracted, you can run run_analysis.py [NOT YET DEVELOPED]."
) % analysis_dir)