def generate_job(func, category, inputs=None, batch_num=1):
    '''generate_job
    Build "python -c" command lines that call a plugin function and hand
    them to add_lines for the calling plugin's extraction job script.

    Parameters
    ==========
    func: str
        name of function to call in plugin functions.py
    category: str
        must be one of "terms" or "corpus" or "relations"
        corresponding to output folder. Any other value generates no
        command lines.
    inputs: dict
        key should be arg name. A str value is formatted once and included
        in every generated command. A list value is sliced into batches,
        one slice per generated command. Values of any other type are
        ignored. If inputs are not specified, a single command is generated
        that passes only output_dir to func.
    batch_num: int
        the number of list items to package into one command. For example,
        batch_num=100 gives each command up to 100 items of every input
        list.
    '''
    # Get name of calling plugin. This lookup has to stay in this function:
    # caller[1] is the frame that called generate_job directly, and its
    # file's parent folder name is used as the plugin tag.
    home = wordfish_home()
    cf = inspect.currentframe()
    caller = inspect.getouterframes(cf, 2)
    tag = os.path.dirname(caller[1][1]).split("/")[-1]
    script = "wordfish.plugins.%s.functions" % (tag)
    output_dir = ' output_dir="%s/%s/%s"' % (home, category, tag)

    # script name to add jobs to
    extraction_script = "%s/scripts/run_extractions_%s.job" % (home, tag)

    lines_to_add = []
    if category in ["corpus", "terms", "relations"]:
        if inputs is None:
            lines_to_add.append(
                "python -c 'from %s import %s; %s(%s)'"
                % (script, func, func, output_dir))
        else:
            # First collect all string args - these are the same for
            # every generated command
            formatted_inputs = "".join(
                format_single_input(varname, elements)
                for varname, elements in inputs.items()
                if isinstance(elements, str))

            # Now collect lists, must be equal length.
            # NOTE: a list whose length differs from the first list
            # collected is silently left out.
            input_lists = dict()
            for varname, elements in inputs.items():
                if not isinstance(elements, list):
                    continue
                if input_lists:
                    expected = len(next(iter(input_lists.values())))
                    if len(elements) != expected:
                        continue
                input_lists[varname] = elements

            if not input_lists:
                # No input lists, just write one job with the single args
                formatted_inputs = formatted_inputs.strip(",")
                lines_to_add.append(
                    "python -c 'from %s import %s; %s(%s,%s)'"
                    % (script, func, func, output_dir, formatted_inputs))
            else:
                N = len(next(iter(input_lists.values())))
                iters = int(numpy.ceil(N / float(batch_num)))
                for i in range(iters):
                    # Clamp the final batch to the number of items
                    start = i * batch_num
                    end = min(start + batch_num, N)
                    formatted_instance = formatted_inputs
                    for varname, elements in input_lists.items():
                        new_input = format_inputs(varname, elements[start:end])
                        formatted_instance = "%s%s" % (formatted_instance,
                                                       new_input)
                    # str.strip returns a new string, so the result must be
                    # assigned for the commas to actually be removed
                    formatted_instance = formatted_instance.strip(",")
                    lines_to_add.append(
                        "python -c 'from %s import %s; %s(%s,%s)'"
                        % (script, func, func, output_dir,
                           formatted_instance))

    # Add lines
    add_lines(script=extraction_script, lines_to_add=lines_to_add)
def generate_job(func, category, inputs=None, batch_num=1):
    '''generate_job
    Build "python -c" command lines that call a plugin function and hand
    them to add_lines for the calling plugin's extraction job script.

    Parameters
    ==========
    func: str
        name of function to call in plugin functions.py
    category: str
        must be one of "terms" or "corpus" or "relations"
        corresponding to output folder. Any other value generates no
        command lines.
    inputs: dict
        key should be arg name. A str value is formatted once and included
        in every generated command. A list value is sliced into batches,
        one slice per generated command. Values of any other type are
        ignored. If inputs are not specified, a single command is generated
        that passes only output_dir to func.
    batch_num: int
        the number of list items to package into one command. For example,
        batch_num=100 gives each command up to 100 items of every input
        list.
    '''
    # Get name of calling plugin. This lookup has to stay in this function:
    # caller[1] is the frame that called generate_job directly, and its
    # file's parent folder name is used as the plugin tag.
    home = wordfish_home()
    cf = inspect.currentframe()
    caller = inspect.getouterframes(cf, 2)
    tag = os.path.dirname(caller[1][1]).split("/")[-1]
    script = "wordfish.plugins.%s.functions" % (tag)
    output_dir = ' output_dir="%s/%s/%s"' % (home, category, tag)

    # script name to add jobs to
    extraction_script = "%s/scripts/run_extractions_%s.job" % (home, tag)

    lines_to_add = []
    if category in ["corpus", "terms", "relations"]:
        if inputs is None:
            lines_to_add.append(
                "python -c 'from %s import %s; %s(%s)'"
                % (script, func, func, output_dir))
        else:
            # First collect all string args - these are the same for
            # every generated command
            formatted_inputs = "".join(
                format_single_input(varname, elements)
                for varname, elements in inputs.items()
                if isinstance(elements, str))

            # Now collect lists, must be equal length.
            # NOTE: a list whose length differs from the first list
            # collected is silently left out.
            input_lists = dict()
            for varname, elements in inputs.items():
                if not isinstance(elements, list):
                    continue
                if input_lists:
                    expected = len(next(iter(input_lists.values())))
                    if len(elements) != expected:
                        continue
                input_lists[varname] = elements

            if not input_lists:
                # No input lists, just write one job with the single args
                formatted_inputs = formatted_inputs.strip(",")
                lines_to_add.append(
                    "python -c 'from %s import %s; %s(%s,%s)'"
                    % (script, func, func, output_dir, formatted_inputs))
            else:
                N = len(next(iter(input_lists.values())))
                iters = int(numpy.ceil(N / float(batch_num)))
                for i in range(iters):
                    # Clamp the final batch to the number of items
                    start = i * batch_num
                    end = min(start + batch_num, N)
                    formatted_instance = formatted_inputs
                    for varname, elements in input_lists.items():
                        new_input = format_inputs(varname, elements[start:end])
                        formatted_instance = "%s%s" % (formatted_instance,
                                                       new_input)
                    # str.strip returns a new string, so the result must be
                    # assigned for the commas to actually be removed
                    formatted_instance = formatted_instance.strip(",")
                    lines_to_add.append(
                        "python -c 'from %s import %s; %s(%s,%s)'"
                        % (script, func, func, output_dir,
                           formatted_instance))

    # Add lines
    add_lines(script=extraction_script, lines_to_add=lines_to_add)
def go_fish(tag, extraction_script):
    '''go_fish
    Add a command to a job script that imports go_fish from a plugin's
    functions module and calls it with no arguments.

    Parameters
    ==========
    tag: str
        plugin name, substituted into wordfish.plugins.<tag>.functions
    extraction_script: str
        job script passed to add_lines as the script to add the command to
    '''
    template = "python -c 'from wordfish.plugins.%s.functions import go_fish; go_fish()'"
    command = template % tag
    add_lines(script=extraction_script, lines_to_add=[command])
def go_fish(tag, extraction_script):
    '''go_fish
    Add a command to a job script that imports go_fish from a plugin's
    functions module and calls it with no arguments.

    Parameters
    ==========
    tag: str
        plugin name, substituted into wordfish.plugins.<tag>.functions
    extraction_script: str
        job script passed to add_lines as the script to add the command to
    '''
    template = "python -c 'from wordfish.plugins.%s.functions import go_fish; go_fish()'"
    command = template % tag
    add_lines(script=extraction_script, lines_to_add=[command])