Example #1
def build_spectrum_table(spectrum_file, schema, index=None, **kwargs):
    """
        This function factorises out common code required to auto-populate
        an ADR Spectrum resource from a PJNZ file. It uses the ADR resource
        validation schema to build a dataframe and insert into it data from
        the PJNZ file.

        IMPORTANT - This function evaluates snippets of code from the JSON
        schemas. This is not ideal, as the snippets would ideally be brought
        into the Python ecosystem. However, for the time being it was seen as
        the cleanest way to store the complex mapping of data from PJNZ to ADR
        resource.
        """
    # We reference the spectrum file from json schemas - give it a shorthand ref
    sf = spectrum_file

    # Remove the first schema field as this is the header/index
    schema = schema.copy()
    first_field = schema['fields'].pop(0)

    # Assemble the populated data file in dictionaries
    new_table = OrderedDict()
    for field in schema['fields']:
        if field.get('spectrum_file_key', False):

            # Fill row in with spectrum data
            try:
                # IMPORTANT - We evaluate a snippet of code from the JSON file
                data_series = list(eval(field['spectrum_file_key']))
            except Exception:
                logging.error("Failed to evaluate " + field['name'] +
                              " spectrum_file_key: " +
                              field['spectrum_file_key'])
                raise
            new_table[field['name']] = data_series

        else:
            # If no spectrum_file_key given, then leave series empty
            new_table[field['name']] = []

    # Fill in empty series with NaN (must match the other series' lengths)
    max_length = max([len(x) for x in new_table.values()])
    for key, value in new_table.items():
        if len(value) == 0:
            new_table[key] = [np.NaN] * max_length

    new_table = pd.DataFrame.from_dict(new_table, **kwargs)

    # Fix the indices if they are manually specified
    if index:
        new_table.index = index
    # Fix the indices if they are specified with a spectrum_file_key
    elif first_field.get('spectrum_file_key', False):
        new_table.index = list(eval(first_field['spectrum_file_key']))
    new_table.insert(0, first_field['name'], new_table.index)

    return new_table
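A hypothetical call, sketched only to show the shape of the inputs; the field names, the schema layout, and the snippet referencing the spectrum file as `sf` are invented for illustration, not taken from the real ADR schemas:

# Illustrative only - the real schemas and PJNZ accessors live in the project's JSON files.
schema = {
    'fields': [
        {'name': 'year'},  # the first field is popped and used as the index column
        {'name': 'population', 'spectrum_file_key': 'sf.total_population()'},
        {'name': 'notes'},  # no spectrum_file_key, so it is filled with NaN
    ]
}
# table = build_spectrum_table(pjnz_file, schema, index=range(2000, 2021))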
Example #2
def setSelectSampleTextIndex( thisFont, tab=None, marker="### CUSTOM KERN STRING ###"):
	if Glyphs.versionNumber >= 3:
		# Glyphs 3 code
		sampleTexts = OrderedDict([(d['name'], d['text']) for d in Glyphs.defaults["SampleTextsList"]])

		foundSampleString = False
		for sampleTextIndex, k in enumerate(sampleTexts.keys()):
			if marker in k:
				foundSampleString = True
				if not tab:
					tab = thisFont.currentTab
					if not tab:
						tab = thisFont.newTab()
				tab.selectSampleTextArrayController().setSelectionIndex_(sampleTextIndex+1)
				tab.text = list(sampleTexts.values())[sampleTextIndex+1]
				break

		if not foundSampleString:
			print("Warning: Could not find '%s' in sample strings." % marker)
	else:
		# Glyphs 2 code
		sampleTexts = tuple(Glyphs.defaults["SampleTexts"])

		if marker in sampleTexts:
			sampleTextIndex = sampleTexts.index(marker)
			if not tab:
				tab = thisFont.currentTab
				if not tab:
					tab = thisFont.newTab()
			tab.selectSampleTextArrayController().setSelectionIndex_(sampleTextIndex+1)
			tab.text = sampleTexts[sampleTextIndex+1]
		else:
			print("Warning: Could not find '%s' in sample strings." % marker)
Example #3
File: views.py Project: as3adm/tola
def export_silo(request, id):
    
    silo_name = Silo.objects.get(id=id).name
    
    response = HttpResponse(content_type='text/csv')
    response['Content-Disposition'] = 'attachment; filename="%s.csv"' % silo_name
    writer = csv.writer(response)

    silo_data = LabelValueStore.objects(silo_id=id)
    data = []
    num_cols = 0
    cols = OrderedDict()
    if silo_data:
        num_rows = len(silo_data)
        
        for row in silo_data:
            for i, col in enumerate(row):
                if col not in cols.keys():
                    num_cols = num_cols + 1
                    cols[col] = num_cols
        
        # Convert OrderedDict to Python list so that it can be written to CSV writer.
        cols = list(cols)
        writer.writerow(list(cols))
        
        # Populate a 2D list structure matching the number of rows and cols in silo_data
        for i in range(num_rows):
            data.append([0] * num_cols)
        
        for r, row in enumerate(silo_data):
            for col in row:
                # Map values to column names and place them in the correct position in the data array
                data[r][cols.index(col)] = row[col]
            writer.writerow(data[r])
    return response
Example #4
    def ranking(self):
        content_a = [word.strip() for word in open(self.wordset_a)]
        content_b = [word.strip() for word in open(self.wordset_b)]

        result_matrix = self.result_matrix.todense()
        truth_matrix = self.truth_matrix.todense()
        row = 0
        targets = []
        rankings = []
        result_word_list = []
        truth_word_list = []

        for i in content_a:
            targets.append(i)
            column = 0
            result_dict = {}
            truth_dict = {}

            for j in content_b:

                result_dict[str(j)] = result_matrix[row, column]
                truth_dict[str(j)] = truth_matrix[row, column]
                column += 1

            result_sort = OrderedDict(
                reversed(
                    sorted(result_dict.items(),
                           key=lambda t: np.float(t[1])))).keys()

            truth_sort = OrderedDict(
                reversed(
                    sorted(truth_dict.items(),
                           key=lambda t: np.float(t[1])))).keys()

            result_words = []
            truth_words = []
            iteration = 0
            rank = 0
            rank_count = 0
            tr_rank = 0

            for l in range(0, 10):

                result_words.append(result_sort[l])
                truth_words.append(truth_sort[l])
                rank_count += (result_sort.index(truth_sort[l]) + 1)
                iteration += 1
                tr_rank += iteration

            rank = float(rank_count / 10.0)
            reference = float(tr_rank / 10.0)
            result_word_list.append(result_words)
            truth_word_list.append(truth_words)
            rankings.append(rank)

            row += 1

        avg_rank = (float(sum(rankings) / len(rankings)))

        return reference, avg_rank, rankings, result_word_list, truth_word_list, targets
Example #5
    def ranking(self):
        content_a = [word.strip() for word in open(self.wordset_a)]
        content_b = [word.strip() for word in open(self.wordset_b)]

        result_matrix = self.result_matrix.todense()
        truth_matrix = self.truth_matrix.todense()
        row = 0
        targets = []
        rankings = []
        result_word_list = []
        truth_word_list = []

        for i in content_a:
            targets.append(i)
            column = 0
            result_dict = {}
            truth_dict = {}

            for j in content_b:

                result_dict[str(j)] = result_matrix[row, column]
                truth_dict[str(j)] = truth_matrix[row, column]
                column += 1

            result_sort = OrderedDict(reversed(sorted(result_dict.items(),
                key=lambda t: np.float(t[1])))).keys()

            truth_sort = OrderedDict(reversed(sorted(truth_dict.items(),
                key=lambda t: np.float(t[1])))).keys()

            result_words = []
            truth_words = []
            iteration = 0
            rank = 0
            rank_count = 0
            tr_rank = 0

            for l in range(0, 10):

                result_words.append(result_sort[l])
                truth_words.append(truth_sort[l])
                rank_count += (result_sort.index(truth_sort[l]) + 1)
                iteration += 1
                tr_rank += iteration

            rank = float(rank_count / 10.0)
            reference = float(tr_rank / 10.0)
            result_word_list.append(result_words)
            truth_word_list.append(truth_words)
            rankings.append(rank)

            row += 1

        avg_rank = (float(sum(rankings) / len(rankings)))

        return reference, avg_rank, rankings, result_word_list, truth_word_list, targets
Example #6
def test_parse_keywords():
    info = [
        {
            'recscope': 'variable',
            'units': 'none',
            'name': 'cparms_sg000',
            'defval': 'compress Rice',
            'note': '',
            'type': 'string',
        },
        {
            'recscope': 'variable',
            'units': 'none',
            'name': 'mean_bzero',
            'defval': '0',
            'note': '',
            'type': 'double',
        },
        {
            'recscope': 'variable',
            'units': 'none',
            'name': 'mean_bscale',
            'defval': '0.25',
            'note': '',
            'type': 'double',
        },
        {
            'recscope': 'variable',
            'units': 'TAI',
            'name': 'MidTime',
            'defval': '-4712.01.01_11:59_TAI',
            'note': 'Midpoint of averaging interval',
            'type': 'time',
        },
    ]
    exp = OrderedDict([
        ('name', ['cparms_sg000', 'mean_bzero', 'mean_bscale', 'MidTime']),
        ('type', ['string', 'double', 'double', 'time']),
        ('recscope', ['variable', 'variable', 'variable', 'variable']),
        ('defval', ['compress Rice', '0', '0.25', '-4712.01.01_11:59_TAI']),
        ('units', ['none', 'none', 'none', 'TAI']),
        ('note', ['', '', '', 'Midpoint of averaging interval']),
        ('linkinfo', [None, None, None, None]),
        ('is_time', [False, False, False, True]),
        ('is_integer', [False, False, False, False]),
        ('is_real', [False, True, True, False]),
        ('is_numeric', [False, True, True, False]),
    ])

    exp = pd.DataFrame(data=exp)
    exp.index = exp.pop('name')
    assert drms.SeriesInfo._parse_keywords(info).equals(exp)
Example #7
    def merge(self, *args):
        '''
        Merge some continuous and ascending labels of a tensor into a new one with an optional permutation.

        Usage: ``tensor.merge((olds,new,<permutation>),(olds,new,<permutation>),...)``
            * olds: list of Label/int
                The old labels/axes to be merged.
            * new: Label
                The new label.
            * permutation: 1d ndarray of int, optional
                The permutation of the quantum number collection of the new label.

        Returns
        -------
        DTensor
            The new tensor.
        '''
        permutations = {}
        keep = OrderedDict([(i, i) for i in xrange(self.ndim)])
        labels = OrderedDict([(i, label)
                              for i, label in enumerate(self.labels)])
        for arg in args:
            assert len(arg) in (2, 3)
            olds, new, permutation = (arg[0], arg[1],
                                      None) if len(arg) == 2 else arg
            axes = np.array([
                self.axis(old) if isinstance(old, Label) else old
                for old in olds
            ])
            if len(axes) != max(axes) - min(axes) + 1 or not all(
                    axes[1:] > axes[:-1]):
                raise ValueError(
                    'DTensor merge error: the axes to be merged should be continuous and ascending, please call transpose first.'
                )
            permutations[new] = permutation
            keep[axes[0]] = slice(axes[0], axes[-1] + 1)
            labels[axes[0]] = new
            for axis in axes[1:]:
                keep.pop(axis)
                labels.pop(axis)
        data = self.data.reshape(
            tuple(
                np.product(self.data.shape[ax]
                           ) if isinstance(ax, slice) else self.data.shape[ax]
                for ax in keep.itervalues()))
        labels = labels.values()
        for label, permutation in permutations.iteritems():
            data = hm.reorder(data,
                              axes=[labels.index(label)],
                              permutation=permutation)
        return DTensor(data, labels=labels)
Example #8
def initial_rank_based_on_quality(graph):
    # sorted_quality: The result of the rank for all nodes at initial stage
    quality_dict = {}
    for i in range(30):
        quality_dict[i] = graph.node[i]['quality']
    sorted_quality = OrderedDict(
        sorted(quality_dict.items(), key=lambda x: x[1]))
    sorted_quality = list(sorted_quality)

    # Convert sorted_quality to initial rank for each node w.r.t their quality
    quality_rank_list = []
    for i in range(30):
        quality_rank_list.append(sorted_quality.index(i))

    return quality_rank_list
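A small standalone sketch of the rank-inversion step, with five toy nodes instead of the hard-coded 30 and a plain dict standing in for the graph's 'quality' attributes:

from collections import OrderedDict

quality_dict = {0: 0.9, 1: 0.1, 2: 0.5, 3: 0.7, 4: 0.3}
sorted_quality = list(OrderedDict(sorted(quality_dict.items(), key=lambda x: x[1])))
# sorted_quality -> [1, 4, 2, 3, 0]  (node ids ordered by ascending quality)
quality_rank_list = [sorted_quality.index(i) for i in range(5)]
# quality_rank_list -> [4, 0, 2, 3, 1]: node 0 has the highest quality, hence the highest rank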
Example #9
def export_silo(request, id):

    silo_name = Silo.objects.get(id=id).name

    response = HttpResponse(content_type='text/csv')
    response['Content-Disposition'] = 'attachment; filename="%s.csv"' % silo_name
    writer = csv.writer(response)

    # Loads the bson objects from mongo
    bsondata = store.find({"silo_id": int(id)})
    # Now convert bson to a json string, using OrderedDict to maintain field order
    json_string = dumps(bsondata)
    # Now decode the json string into python object
    silo_data = json.loads(json_string, object_pairs_hook=OrderedDict)
    data = []
    num_cols = 0
    cols = OrderedDict()
    if silo_data:
        num_rows = len(silo_data)

        for row in silo_data:
            for i, col in enumerate(row):
                if col not in cols.keys():
                    num_cols += 1
                    col = col.decode("latin-1").encode("utf8")
                    cols[col] = num_cols

        # Convert OrderedDict to Python list so that it can be written to CSV writer.
        cols = list(cols)
        writer.writerow(list(cols))

        # Populate a 2D list structure that corresponds to the number of rows and cols in silo_data
        for i in xrange(num_rows): data += [[0]*num_cols]

        for r, row in enumerate(silo_data):
            for col in row:
                # Map values to column names and place them in the correct position in the data array
                val = row[col]
                if isinstance(val, OrderedDict): val  = val.popitem()
                if isinstance(val, tuple):
                    if val[0] == "$date": val = smart_text(time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(val[1]/1000)))
                    if val[0] == "$oid": val = smart_text(val[1])
                #val = val.decode("latin-1").encode("utf8")
                val = smart_text(val).decode("latin-1").encode("utf8")
                data[r][cols.index(col)] = val
            writer.writerow(data[r])
    return response
Example #10
def convertToMatrix(userls, filesList, census):
    '''Assumes "userls" as a list of followers, "filesList" is a list of file paths,
    "census" is a list of popular users.
    Returns a large matrix which rows are census and columns are userlist.'''
    y = OrderedDict()
    zeros = list()
    for i, j in enumerate(filesList):
        followers = list(pyreadr.read_r(j)['followers']['followers'])
        y[i] = np.isin(userls, followers) * 1
        if sum(y[i]) == 0:
            print(census[i], "is followed by 0 users!")
            zeros.append(census[i])
    y = pd.DataFrame.from_dict(y)
    y.index = userls
    y.columns = census
    y = y.drop(columns=zeros)  # drop census accounts that are followed by zero users
    return y
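A toy sketch of the membership-matrix idea, with in-memory follower lists standing in for the RDS files read via pyreadr (all names below are illustrative):

from collections import OrderedDict
import numpy as np
import pandas as pd

userls = ['u1', 'u2', 'u3']
census = ['popA', 'popB']
followers_by_account = {'popA': ['u1', 'u3'], 'popB': []}

y = OrderedDict()
for i, acct in enumerate(census):
    y[i] = np.isin(userls, followers_by_account[acct]) * 1
y = pd.DataFrame.from_dict(y)
y.index = userls
y.columns = census
# 'popB' has no followers in userls, so convertToMatrix would drop that column.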
Example #11
    def parse_lines(lines):
        """Given a nested line list create a flat list of lines and associated mapping matrix
        
        """

        #flatten linelist and MAKE IT UNIQUE
        lines_flat = OrderedDict()
        for line_components in lines:
            for l in line_components:
                lines_flat[l] = None
        lines_flat = tuple(lines_flat.keys())

        mapping = np.zeros([len(lines), len(lines_flat)], dtype=float)
        for i_out, line_components in enumerate(lines):
            for l in line_components:
                j_in = lines_flat.index(l)
                mapping[i_out, j_in] = 1.

        return lines_flat, mapping
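A standalone sketch of the flatten-and-map step, using two hypothetical blended lines that share one component:

from collections import OrderedDict
import numpy as np

lines = [['Halpha', 'NII'], ['NII', 'SII']]          # nested line list
lines_flat = tuple(OrderedDict((l, None) for comps in lines for l in comps))
# lines_flat -> ('Halpha', 'NII', 'SII')
mapping = np.zeros([len(lines), len(lines_flat)], dtype=float)
for i_out, comps in enumerate(lines):
    for l in comps:
        mapping[i_out, lines_flat.index(l)] = 1.
# mapping -> [[1., 1., 0.],
#             [0., 1., 1.]]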
Example #12
    def widgets(self):
        """Display widgets for all parameters (i.e. property sheet)"""
        # order by param precedence, but with name first and persist last
        params = self.parameterized.params().items()
        ordered_params = list(OrderedDict(sorted(params, key=lambda x: x[1].precedence)).keys())
        ordered_params.insert(0, ordered_params.pop(ordered_params.index('name')))

        widgets = [self.widget(pname) for pname in ordered_params]
        button = None
        if self.p.onchange:
            pass
        elif self.blocked:
            button = 'Run %s' % self.p.execute
        elif self.p.callback:
            button = 'Execute'
        if button:
            display_button = ipywidgets.Button(description=button)
            display_button.on_click(self.execute_widget)
            widgets.append(display_button)
        return widgets
Example #13
    def widgets(self):
        """Display widgets for all parameters (i.e. property sheet)"""
        # order by param precedence, but with name first and persist last
        params = self.parameterized.params().items()
        ordered_params = list(OrderedDict(sorted(params, key=lambda x: x[1].precedence)).keys())
        ordered_params.insert(0, ordered_params.pop(ordered_params.index('name')))

        widgets = [self.widget(pname) for pname in ordered_params]
        button = None
        if self.p.onchange:
            pass
        elif self.blocked:
            button = 'Run %s' % self.p.execute
        elif self.p.callback:
            button = 'Execute'
        if button:
            display_button = ipywidgets.Button(description=button)
            display_button.on_click(self.execute_widget)
            widgets.append(display_button)
        return widgets
Example #14
def test_parse_segments():
    segments = [
        {
            'type': 'int',
            'dims': 'VARxVAR',
            'units': 'Gauss',
            'protocol': 'fits',
            'note': 'magnetogram',
            'name': 'magnetogram',
        },
        {
            'type': 'char',
            'dims': 'VARxVAR',
            'units': 'Enumerated',
            'protocol': 'fits',
            'note': 'Mask for the patch',
            'name': 'bitmap',
        },
        {
            'type': 'int',
            'dims': 'VARxVAR',
            'units': 'm/s',
            'protocol': 'fits',
            'note': 'Dopplergram',
            'name': 'Dopplergram',
        },
    ]
    exp = OrderedDict([
        ('name', ['magnetogram', 'bitmap', 'Dopplergram']),
        ('type', ['int', 'char', 'int']),
        ('units', ['Gauss', 'Enumerated', 'm/s']),
        ('protocol', ['fits', 'fits', 'fits']),
        ('dims', ['VARxVAR', 'VARxVAR', 'VARxVAR']),
        ('note', ['magnetogram', 'Mask for the patch', 'Dopplergram']),
    ])

    exp = pd.DataFrame(data=exp)
    exp.index = exp.pop('name')
    assert drms.SeriesInfo._parse_segments(segments).equals(exp)
Example #15
class PsParser:
    """ parse ps output """
    def __init__(self, command="ps -Al"):
        output = subprocess.check_output(command, shell=True)
        self.result = {}
        self.headers = OrderedDict()
        processes = output.splitlines()
        nfields = len(processes[0].split()) - 1
        self.lines = []
        for k, row in enumerate(processes):
            row = row.decode('utf-8')
            data = row.split(None, nfields)
            if k==0:
                self.headers = data
                for k,header in enumerate(data):
                    self.result[header] = []
                continue
            self.lines.append(data)
            for k,value in enumerate(data):
                self.result[self.headers[k]].append(value)
    def get_pid(self, pid):
        result = [Proc(k, self) for k in self.lines if int(k[self.headers.index('PID')])==pid]
        return result[0] if result else None
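A hypothetical usage on a POSIX system; the available column names depend on the local `ps` implementation:

ps = PsParser()                      # runs "ps -Al" and indexes its output
print(ps.headers)                    # e.g. ['F', 'S', 'UID', 'PID', ...]
proc = ps.get_pid(1)                 # Proc wrapper for PID 1, or None if not found
if proc:
    print(ps.result['CMD'][:5])      # first five command names, column-wise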
Example #16
    def merge(self,*args):
        '''
        Merge some continuous and ascending labels of a tensor into a new one with an optional permutation.

        Usage: ``tensor.merge((olds,new,<permutation>),(olds,new,<permutation>),...)``
            * olds: list of Label/int
                The old labels/axes to be merged.
            * new: Label
                The new label.
            * permutation: 1d ndarray of int, optional
                The permutation of the quantum number collection of the new label.

        Returns
        -------
        DTensor
            The new tensor.
        '''
        permutations={}
        keep=OrderedDict([(i,i) for i in range(self.ndim)])
        labels=OrderedDict([(i,label) for i,label in enumerate(self.labels)])
        for arg in args:
            assert len(arg) in (2,3)
            olds,new,permutation=(arg[0],arg[1],None) if len(arg)==2 else arg
            axes=np.array([self.axis(old) if isinstance(old,Label) else old for old in olds])
            if len(axes)!=max(axes)-min(axes)+1 or not all(axes[1:]>axes[:-1]):
                raise ValueError('DTensor merge error: the axes to be merged should be continuous and ascending, please call transpose first.')
            permutations[new]=permutation
            keep[axes[0]]=slice(axes[0],axes[-1]+1)
            labels[axes[0]]=new
            for axis in axes[1:]:
                keep.pop(axis)
                labels.pop(axis)
        data=self.data.reshape(tuple(np.product(self.data.shape[ax]) if isinstance(ax,slice) else self.data.shape[ax] for ax in keep.values()))
        labels=list(labels.values())
        for label,permutation in permutations.items():
            data=hm.reorder(data,axes=[labels.index(label)],permutation=permutation)
        return DTensor(data,labels=labels)
Example #17
def test_parse_links():
    links = [
        {
            'name': 'BHARP',
            'kind': 'DYNAMIC',
            'note': 'Bharp',
            'target': 'hmi.Bharp_720s'
        },
        {
            'name': 'MHARP',
            'kind': 'DYNAMIC',
            'note': 'Mharp',
            'target': 'hmi.Mharp_720s'
        },
    ]
    exp = OrderedDict([
        ('name', ['BHARP', 'MHARP']),
        ('target', ['hmi.Bharp_720s', 'hmi.Mharp_720s']),
        ('kind', ['DYNAMIC', 'DYNAMIC']),
        ('note', ['Bharp', 'Mharp']),
    ])
    exp = pd.DataFrame(data=exp)
    exp.index = exp.pop('name')
    assert drms.SeriesInfo._parse_links(links).equals(exp)
Example #18
class LiteratureExtension(Extension):
    """ Literature Extension. """

    def __init__(self, *args, **kwargs):
        """ Setup configs. """

        self.config = {
            'PLACE_MARKER':
                ["///Literature Goes Here///",
                 "The text string that marks where the literature references go"],
            'UNIQUE_IDS':
                [False,
                 "Avoid name collisions across "
                 "multiple calls to reset()."],
            "BACKLINK_TEXT":
                ["&#8617;",
                 "The text string that links from the literature reference "
                 "to the reader's place."]
        }
        super().__init__(*args, **kwargs)

        # In multiple invocations, emit links that don't get tangled.
        self.unique_prefix = 0

        self.found_refs = {}
        self.used_refs = set()
        self.reset()

    def extendMarkdown(self, md):
        """ Add pieces to Markdown. """
        md.registerExtension(self)
        self.parser = md.parser
        self.md = md
        # Insert a preprocessor before ReferencePreprocessor
        md.preprocessors.register(LiteraturePreprocessor(self), "literature", 1)

        # Insert an inline pattern before ImageReferencePattern
        LITERATURE_RE = r'\[\=([^\]]*)\]'  # blah blah [=1] blah
        md.inlinePatterns.register(LiteraturePattern(LITERATURE_RE, self), "literature", 80)
        # Insert a tree-processor that would actually add the literatures div
        # This must be before all other treeprocessors (i.e., inline and
        # codehilite) so they can run on the the contents of the div.
        md.treeprocessors.register(LiteratureTreeprocessor(self), "literature", 100)
        # Insert a postprocessor after amp_substitute oricessor
        md.postprocessors.register(LiteraturePostprocessor(self), "literature", 120)

    def reset(self):
        """ Clear literature references on reset, and prepare for distinct document. """
        self.literatures = OrderedDict()
        self.unique_prefix += 1

    def findLiteraturesPlaceholder(self, root):
        """ Return ElementTree Element that contains Literature placeholder. """
        def finder(element):
            for child in element:
                if child.text:
                    if child.text.find(self.getConfig("PLACE_MARKER")) > -1:
                        return child, element, True
                if child.tail:
                    if child.tail.find(self.getConfig("PLACE_MARKER")) > -1:
                        return child, element, False
                child_res = finder(child)
                if child_res is not None:
                    return child_res
            return None

        res = finder(root)
        return res

    def setLiterature(self, identifier, text):
        """ Store a literature for later retrieval. """
        self.literatures[identifier] = text

    def get_separator(self):
        if self.md.output_format in ['html5', 'xhtml5']:
            return '-'
        return ':'

    def makeLiteratureId(self, identifier):
        """ Return literature link identifier. """
        if self.getConfig("UNIQUE_IDS"):
            return 'lit%s%d-%s' % (self.get_separator(), self.unique_prefix, identifier)
        else:
            return 'lit%s%s' % (self.get_separator(), identifier)

    def makeLiteratureRefId(self, identifier):
        """ Return literature back-link identifier. """
        if self.getConfig("UNIQUE_IDS"):
            return 'litref%s%d-%s' % (self.get_separator(),
                                     self.unique_prefix, identifier)
        else:
            return 'litref%s%s' % (self.get_separator(), identifier)

    def makeLiteraturesDiv(self, root):
        """ Return div of literatures as et Element. """

        if not list(self.literatures.keys()):
            return None

        div = etree.Element("div")
        div.set('class', 'literature')
        etree.SubElement(div, "hr")
        ol = etree.SubElement(div, "ol")

        for identifier in self.literatures.keys():
            li = etree.SubElement(ol, "li")
            li.set("id", self.makeLiteratureId(identifier))
            self.parser.parseChunk(li, self.literatures[identifier])
            backlink = etree.Element("a")
            backlink.set("href", "#" + self.makeLiteratureRefId(identifier))
            if self.md.output_format not in ['html5', 'xhtml5']:
                backlink.set("rev", "literature")  # Invalid in HTML5
            backlink.set("class", "literature-backref")
            backlink.set(
                "title",
                "Jump back to literature %d in the text" %
                (list(self.literatures).index(identifier) + 1)
            )
            backlink.text = LIT_BACKLINK_TEXT

            if list(li):
                node = li[-1]
                if node.tag == "p":
                    node.text = node.text + NBSP_PLACEHOLDER
                    node.append(backlink)
                else:
                    p = etree.SubElement(li, "p")
                    p.append(backlink)
        return div
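A hedged usage sketch, assuming the companion classes registered in extendMarkdown (LiteraturePreprocessor, LiteraturePattern, LiteratureTreeprocessor, LiteraturePostprocessor) are defined in the same module:

import markdown

md = markdown.Markdown(extensions=[LiteratureExtension(UNIQUE_IDS=True)])
html = md.convert("A cited claim.[=smith2009]\n\n///Literature Goes Here///")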
Example #19
    'laplace': 'Laplace',
    'binomial': 'Binomial'
}

cols = {
    'sfmt': 'SFMT',
    'dsfmt': 'dSFMT',
    'xoroshiro128plus': 'xoroshiro128+',
    'xorshift1024': 'xorshift1024',
    'pcg64': 'PCG64',
    'mt19937': 'MT19937',
    'random': 'NumPy MT19937'
}

results.columns = [cols[c] for c in results]
results.index = [index[i] for i in results.index]

print(results)

from io import StringIO

sio = StringIO()
results.to_csv(sio)
sio.seek(0)
lines = sio.readlines()
for i, line in enumerate(lines):
    if i == 0:
        line = '    :header: ' + line
    else:
        line = '    ' + line
    lines[i] = line
Example #20
def pmultiquery(corpus, 
    search,
    show = 'words',
    query = 'any', 
    sort_by = 'total', 
    quicksave = False,
    multiprocess = 'default', 
    function_filter = False,
    just_speakers = False,
    root = False,
    note = False,
    print_info = True,
    **kwargs):
    """Parallel process multiple queries or corpora.

    This function is used by interrogator() if:

        a) path is a list of paths
        b) query is a dict of named queries
        c) just_speakers == 'each', or a list of speakers with len(list) > 1
    
    This function needs joblib 0.8.4 or above in order to run properly.
    There's no reason to call it yourself."""
    
    import collections
    import os
    import pandas as pd
    from collections import namedtuple
    from time import strftime, localtime
    import corpkit
    from interrogator import interrogator
    from editor import editor
    from other import save
    from interrogation import Interrogation
    try:
        from joblib import Parallel, delayed
    except:
        pass
        #raise ValueError('joblib, the module used for multiprocessing, cannot be found. ' \
        #                 'Install with:\n\n        pip install joblib')
    import multiprocessing

    def best_num_parallel(num_cores, num_queries):
        """decide how many parallel processes to run

        the idea, more or less, is to balance the load when possible"""
        import corpkit
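        # e.g. 4 queries on 8 cores -> 4 processes (one each);
        #      12 queries on 8 cores -> 8 processes; 64 queries on 8 cores -> 8 processes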
        if num_queries <= num_cores:
            return num_queries
        if num_queries > num_cores:
            if (num_queries / num_cores) == num_cores:
                return int(num_cores)
            if num_queries % num_cores == 0:
                try:
                    return max([int(num_queries / n) for n in range(2, num_cores) if int(num_queries / n) <= num_cores])   
                except ValueError:
                    return num_cores
            else:
                import math
                if (float(math.sqrt(num_queries))).is_integer():
                    square_root = math.sqrt(num_queries)
                    if square_root <= num_queries / num_cores: 
                        return int(square_root)    
        return num_cores

    num_cores = multiprocessing.cpu_count()

    # what is our iterable? ...
    multiple_option = False
    multiple_queries = False
    multiple_speakers = False
    multiple_corpora = False
    multiple_search = False
    mult_corp_are_subs = False
    denom = 1

    if hasattr(corpus, '__iter__'):
        multiple_corpora = True
        num_cores = best_num_parallel(num_cores, len(corpus))
        denom = len(corpus)
        if all(c.__class__ == corpkit.corpus.Subcorpus for c in corpus):
            mult_corp_are_subs = True
    elif (type(query) == list or type(query) == dict) and not hasattr(search, '__iter__'):
            multiple_queries = True
            num_cores = best_num_parallel(num_cores, len(query))
            denom = len(query)
    elif hasattr(search, '__iter__') and type(search) != dict:
        multiple_search = True
        num_cores = best_num_parallel(num_cores, len(list(search.keys())))
        denom = len(list(search.keys()))
    elif hasattr(function_filter, '__iter__'):
        multiple_option = True
        num_cores = best_num_parallel(num_cores, len(list(function_filter.keys())))
        denom = len(list(function_filter.keys()))
    elif just_speakers:
        from build import get_speaker_names_from_xml_corpus
        multiple_speakers = True
        if just_speakers == 'each' or just_speakers == ['each']:
            just_speakers = get_speaker_names_from_xml_corpus(corpus.path)
        if len(just_speakers) == 0:
            print('No speaker name data found.')
            return
        num_cores = best_num_parallel(num_cores, len(just_speakers))
        denom = len(just_speakers)
        
    if type(multiprocess) == int:
        num_cores = multiprocess
    if multiprocess is False:
        num_cores = 1

    # make sure quicksaves are right type
    if quicksave is True:
        raise ValueError('quicksave must be string when using pmultiquery.')
    
    # the options that don't change
    d = {
         #'paralleling': True,
         'function': 'interrogator',
         'root': root,
         'note': note,
         'denominator': denom}
    
    # add kwargs to query
    for k, v in list(kwargs.items()):
        d[k] = v

    # make a list of dicts to pass to interrogator,
    # with the iterable unique in every one
    ds = []
    if multiple_corpora:
        for index, p in enumerate(corpus):
            name = p.name
            a_dict = dict(d)
            a_dict['corpus'] = p
            a_dict['search'] = search
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name.replace('-parsed', '')
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_queries:
        for index, (name, q) in enumerate(query.items()):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = search
            a_dict['query'] = q
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_option:
        for index, (name, q) in enumerate(function_filter.items()):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = search
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['function_filter'] = q
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_speakers:
        for index, name in enumerate(just_speakers):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = search
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = [name]
            a_dict['function_filter'] = function_filter
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_search:
        for index, val in enumerate(search):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = val
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = str(val)
            a_dict['just_speakers'] = just_speakers
            a_dict['function_filter'] = function_filter
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)

    if kwargs.get('do_concordancing') is False:
        message = 'Interrogating'
    elif kwargs.get('do_concordancing') is True:
        message = 'Interrogating and concordancing'
    elif kwargs.get('do_concordancing').lower() == 'only':
        message = 'Concordancing'
    time = strftime("%H:%M:%S", localtime())
    sformat = ''
    for i, (k, v) in enumerate(list(search.items())):
        if type(v) == list:
            vformat = ', '.join(v[:5])
            if len(v) > 5:
                vformat += ' ...'
        else:
            vformat = v
        sformat += '%s: %s' %(k, vformat)
        if i < len(search.keys()) - 1:
            sformat += '\n                  '

    if multiple_corpora and not multiple_option:
        corplist = "\n              ".join([i.name for i in corpus[:20]])
        if len(corpus) > 20:
            corplist += '\n ... and %d more ...\n' % (len(corpus) - 20)
        print(("\n%s: Beginning %d corpus interrogations (in %d parallel processes):\n              %s" \
           "\n          Query: '%s'\n          %s corpus ... \n"  % (time, len(corpus), num_cores, corplist, sformat, message)))

    elif multiple_queries:
        print(("\n%s: Beginning %d corpus interrogations (in %d parallel processes): %s" \
           "\n          Queries: '%s'\n          %s corpus ... \n" % (time, len(search), num_cores, corpus.name, "', '".join(list(search.values())), message) ))

    elif multiple_search:
        print(("\n%s: Beginning %d corpus interrogations (in %d parallel processes): %s" \
           "\n          Queries: '%s'\n          %s corpus ... \n" % (time, len(list(search.keys())), num_cores, corpus.name, str(list(search.values())), message)))

    elif multiple_option:
        print(("\n%s: Beginning %d parallel corpus interrogations (multiple options): %s" \
           "\n          Query: '%s'\n          %s corpus ... \n" % (time, num_cores, corpus.name, sformat, message) ))

    elif multiple_speakers:
        print(("\n%s: Beginning %d parallel corpus interrogations: %s" \
           "\n          Query: '%s'\n          %s corpus ... \n" % (time, num_cores, corpus.name, sformat, message) ))

    # run in parallel, get either a list of tuples (non-c option)
    # or a dataframe (c option)
    #import sys
    #reload(sys)
    #stdout=sys.stdout
    failed = False
    terminal = False
    used_joblib = False
    #ds = ds[::-1]
    if not root:
        from blessings import Terminal
        terminal = Terminal()
        print('\n' * (len(ds) - 2))
        for dobj in ds:
            linenum = dobj['paralleling']
            # this try handles nosetest problems in sublime text
            try:
                with terminal.location(0, terminal.height - (linenum + 1)):
                    # this is a really bad idea.
                    thetime = strftime("%H:%M:%S", localtime())
                    num_spaces = 26 - len(dobj['outname'])
                    print('%s: QUEUED: %s' % (thetime, dobj['outname']))

            except:
                pass

    if not root and multiprocess:
        #res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
        try:
            #ds = sorted(ds, key=lambda k: k['paralleling'], reverse = True) 
            res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
            used_joblib = True
        except:
            failed = True
            print('Multiprocessing failed.')
            raise
        if not res:
            failed = True
    else:
        res = []
        for index, d in enumerate(ds):
            d['startnum'] = (100 / denom) * index
            res.append(interrogator(**d))
        try:
            res = sorted(res)
        except:
            pass

    # multiprocessing way
    #from multiprocessing import Process
    #from interrogator import interrogator
    #jobs = []
    ##for d in ds:
    ##    p = multiprocessing.Process(target=interrogator, kwargs=(**d,))
    ##    jobs.append(p)
    ##    p.start()
    ##    while p.is_alive():
    ##        import time
    ##        time.sleep(2)
    ##        if root:
    ##            root.update()
    #result_queue = multiprocessing.Queue()
    #
    #for d in ds:
    #funs = [interrogator(result_queue, **kwargs) for kwargs in ds]
    #jobs = [multiprocessing.Process(mc) for mc in funs]
    #for job in jobs: job.start()
    #for job in jobs: job.join()
    #results = [result_queue.get() for mc in funs]

    import corpkit
    from interrogation import Concordance
    if kwargs.get('do_concordancing') == 'only':
        concs = pd.concat([x for x in res])
        thetime = strftime("%H:%M:%S", localtime())
        print('\n\n%s: Finished! %d results.\n\n' % (thetime, len(concs.index)))
        return Concordance(concs)

    from collections import OrderedDict
    if not all(type(i.results) == pd.core.series.Series for i in res):
        out = OrderedDict()
        for interrog, d in zip(res, ds):
            for unpicklable in ['note', 'root']:
                interrog.query.pop(unpicklable, None)
            out[interrog.query['outname']] = interrog
    
        if quicksave:
            fullpath = os.path.join('saved_interrogations', quicksave)
            while os.path.isdir(fullpath):
                selection = input("\nSave error: %s already exists in %s.\n\nType 'o' to overwrite, or enter a new name: " % (quicksave, 'saved_interrogations'))
                if selection == 'o' or selection == 'O':
                    import shutil
                    shutil.rmtree(fullpath)
                else:
                    import os
                    fullpath = os.path.join('saved_interrogations', selection)

            for k, v in list(out.items()):
                save(v, k, savedir = fullpath, print_info = False)
        
            time = strftime("%H:%M:%S", localtime())
            print("\n%s: %d files saved to %s" % ( time, len(list(out.keys())), fullpath))

        time = strftime("%H:%M:%S", localtime())
        print("\n\n%s: Finished! Output is a dictionary with keys:\n\n         '%s'\n" % (time, "'\n         '".join(sorted(out.keys()))))
        from interrogation import Interrodict
        return Interrodict(out)
    # make query and total branch, save, return
    else:
        #print sers
        #print ds
        if multiple_corpora and not mult_corp_are_subs:
            sers = [i.results for i in res]
            out = pd.DataFrame(sers, index = [i.query['outname'] for i in res])
            out = out.reindex_axis(sorted(out.columns), axis=1) # sort cols
            out = out.fillna(0) # nan to zero
            out = out.astype(int) # float to int
            out = out.T            
        else:
            out = pd.concat([r.results for r in res], axis = 1)
            # format like normal
            out = out[sorted(list(out.columns))]
            out = out.T
            out = out.fillna(0) # nan to zero
            out = out.astype(int)
            if 'c' in show and mult_corp_are_subs:
                out = out.sum()
                out.index = sorted(list(out.index))

        # sort by total
        if type(out) == pd.core.frame.DataFrame:
            out.ix['Total-tmp'] = out.sum()
            tot = out.ix['Total-tmp']
            out = out[tot.argsort()[::-1]]
            out = out.drop('Total-tmp', axis = 0)
        out = out.edit(sort_by = sort_by, print_info = False, keep_stats = False, \
                      df1_always_df = kwargs.get('df1_always_df'))
        if len(out.results.columns) == 1:
            out.results = out.results.sort_index()   
        if kwargs.get('do_concordancing') is True:
            concs = pd.concat([x.concordance for x in res], ignore_index = True)
            concs = concs.sort_values(by='c')
            concs = concs.reset_index(drop=True)
            out.concordance = Concordance(concs)
        thetime = strftime("%H:%M:%S", localtime())
        if terminal:
            with terminal.location(0, terminal.height):
                print('\n\n%s: Finished! %d unique results, %d total.%s' % (thetime, len(out.results.columns), out.totals.sum(), '\n'))
        else:
            print('\n\n%s: Finished! %d unique results, %d total.%s' % (thetime, len(out.results.columns), out.totals.sum(), '\n'))
        #if used_joblib:
            
        if quicksave:
            from other import save
            save(out, quicksave)
        print('\n')
        return out
Example #21
def assign_variables(assignment_expressions, df, locals_dict, df_alias=None, trace_rows=None):
    """
    Evaluate a set of variable expressions from a spec in the context
    of a given data table.

    Expressions are evaluated using Python's eval function.
    Python expressions have access to variables in locals_dict (with the data
    table accessible as the variable df). They also have access to previously
    assigned targets, under the assigned target name.

    lowercase variables starting with underscore are temp variables (e.g. _local_var)
    and not returned except in trace_results

    uppercase variables starting with underscore are temp scalar variables (e.g. _LOCAL_SCALAR)
    and not returned except in trace_assigned_locals
    This is useful for defining general purpose local constants in expression file

    Users should take care that expressions (other than temp scalar variables) should result in
    a Pandas Series (scalars will be automatically promoted to series.)

    Parameters
    ----------
    assignment_expressions : pandas.DataFrame of target assignment expressions
        target: target column names
        expression: pandas or python expression to evaluate
    df : pandas.DataFrame
    locals_dict : Dict
        This is a dictionary of local variables that will be the environment
        for the evaluation of "python" expressions.
    trace_rows: series or array of bools to use as mask to select target rows to trace

    Returns
    -------
    variables : pandas.DataFrame
        Will have the index of `df` and columns named by target and containing
        the result of evaluating expression
    trace_df : pandas.DataFrame or None
        a dataframe containing the eval result values for each assignment expression
    """

    np_logger = NumpyLogger(logger)

    def is_throwaway(target):
        return target == '_'

    def is_temp_scalar(target):
        return target.startswith('_') and target.isupper()

    def is_temp(target):
        return target.startswith('_')

    def to_series(x):
        if x is None or np.isscalar(x):
            return pd.Series([x] * len(df.index), index=df.index)
        return x

    assert assignment_expressions.shape[0] > 0

    trace_assigned_locals = trace_results = None
    if trace_rows is not None:
        # convert to numpy array so we can slice ndarrays as well as series
        trace_rows = np.asanyarray(trace_rows)
        if trace_rows.any():
            trace_results = OrderedDict()
            trace_assigned_locals = OrderedDict()

    # avoid touching caller's passed-in locals_d parameter (they may be looping)
    _locals_dict = local_utilities()
    if locals_dict is not None:
        _locals_dict.update(locals_dict)
    if df_alias:
        _locals_dict[df_alias] = df
    else:
        _locals_dict['df'] = df
    local_keys = list(_locals_dict.keys())

    # build a dataframe of eval results for non-temp targets
    # since we allow targets to be recycled, we want to only keep the last usage
    variables = OrderedDict()

    # need to be able to identify which variables causes an error, which keeps
    # this from being expressed more parsimoniously
    for e in zip(assignment_expressions.target, assignment_expressions.expression):
        target, expression = e

        assert isinstance(target, str), \
            "expected target '%s' for expression '%s' to be string not %s" % \
            (target, expression, type(target))

        if target in local_keys:
            logger.warning("assign_variables target obscures local_d name '%s'", str(target))

        if is_temp_scalar(target) or is_throwaway(target):
            try:
                x = eval(expression, globals(), _locals_dict)
            except Exception as err:
                logger.error("assign_variables error: %s: %s", type(err).__name__, str(err))
                logger.error("assign_variables expression: %s = %s", str(target), str(expression))
                raise err

            if not is_throwaway(target):
                _locals_dict[target] = x
                if trace_assigned_locals is not None:
                    trace_assigned_locals[uniquify_key(trace_assigned_locals, target)] = x
            continue

        try:

            # FIXME - log any numpy warnings/errors but don't raise
            np_logger.target = str(target)
            np_logger.expression = str(expression)
            saved_handler = np.seterrcall(np_logger)
            save_err = np.seterr(all='log')

            # FIXME should whitelist globals for security?
            globals_dict = {}
            expr_values = to_series(eval(expression, globals_dict, _locals_dict))

            np.seterr(**save_err)
            np.seterrcall(saved_handler)

        except Exception as err:
            logger.error("assign_variables error: %s: %s", type(err).__name__, str(err))
            logger.error("assign_variables expression: %s = %s", str(target), str(expression))
            raise err

        if not is_temp(target):
            variables[target] = expr_values

        if trace_results is not None:
            trace_results[uniquify_key(trace_results, target)] = expr_values[trace_rows]

        # update locals to allows us to ref previously assigned targets
        _locals_dict[target] = expr_values

    if trace_results is not None:

        trace_results = pd.DataFrame.from_dict(trace_results)

        trace_results.index = df[trace_rows].index

        # add df columns to trace_results
        trace_results = pd.concat([df[trace_rows], trace_results], axis=1)

    # we stored result in dict - convert to df
    variables = util.df_from_dict(variables, index=df.index)

    return variables, trace_results, trace_assigned_locals
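A hedged sketch of the expected inputs; the targets and expression strings are illustrative, and the call itself is commented out because the function relies on project helpers (local_utilities, uniquify_key, util.df_from_dict) configured elsewhere:

import pandas as pd

spec = pd.DataFrame({
    'target': ['_MAX_INCOME', 'income_capped', 'income_thousands'],
    'expression': ['200000',
                   'df.income.clip(upper=_MAX_INCOME)',
                   'income_capped / 1000.0'],
})
households = pd.DataFrame({'income': [50000, 250000]})
# variables, trace_df, trace_locals = assign_variables(spec, households, locals_dict={})
# variables would have columns ['income_capped', 'income_thousands'];
# _MAX_INCOME is a temp scalar and is not returned.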
Example #22
def pmultiquery(corpus, 
                search,
                show='words',
                query='any', 
                sort_by='total', 
                save=False,
                multiprocess='default', 
                root=False,
                note=False,
                print_info=True,
                subcorpora=False,
                **kwargs
               ):
    """
    - Parallel process multiple queries or corpora.
    - This function is used by corpkit.interrogator.interrogator()
    - for multiprocessing.
    - There's no reason to call this function yourself.
    """
    import os
    from pandas import DataFrame, Series
    import pandas as pd
    import collections
    from collections import namedtuple, OrderedDict
    from time import strftime, localtime
    import corpkit
    from corpkit.interrogator import interrogator
    from corpkit.interrogation import Interrogation, Interrodict
    from corpkit.process import canpickle
    try:
        from joblib import Parallel, delayed
    except ImportError:
        pass
    import multiprocessing

    locs = locals()
    for k, v in kwargs.items():
        locs[k] = v
    in_notebook = locs.get('in_notebook')

    def best_num_parallel(num_cores, num_queries):
        """decide how many parallel processes to run

        the idea, more or less, is to balance the load when possible"""
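        # e.g. 4 queries on 8 cores -> 4 processes; 12 or 64 queries on 8 cores -> 8 processes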
        import corpkit
        if num_queries <= num_cores:
            return num_queries
        if num_queries > num_cores:
            if (num_queries / num_cores) == num_cores:
                return int(num_cores)
            if num_queries % num_cores == 0:
                try:
                    return max([int(num_queries / n) for n in range(2, num_cores) \
                               if int(num_queries / n) <= num_cores])   
                except ValueError:
                    return num_cores
            else:
                import math
                if (float(math.sqrt(num_queries))).is_integer():
                    square_root = math.sqrt(num_queries)
                    if square_root <= num_queries / num_cores: 
                        return int(square_root)    
        return num_cores

    num_cores = multiprocessing.cpu_count()

    # what is our iterable? ...
    multiple = kwargs.get('multiple', False)
    mult_corp_are_subs = False
    if hasattr(corpus, '__iter__'):
        if all(getattr(x, 'level', False) == 's' for x in corpus):
            mult_corp_are_subs = True

    non_first_sub = None
    if subcorpora:
        non_first_sub = subcorpora[1:] if isinstance(subcorpora, list) else None
        subval = subcorpora if not non_first_sub else subcorpora[0]
        #print(subcorpora, non_first_sub, subval)
        if subcorpora is True:
            import re
            subcorpora = re.compile(r'.*')
        else:
            # strange travis error happened here
            subcorpora = corpus.metadata['fields'][subval]
            if len(subcorpora) == 0:
                print('No %s metadata found.' % str(subval))
                return

    mapcores = {'datalist': [corpus, 'corpus'],
                'multiplecorpora': [corpus, 'corpus'],
                'namedqueriessingle': [query, 'query'],
                'namedqueriesmultiple': [search, 'search'],
                'subcorpora': [subcorpora, 'subcorpora']}

    # a is a dummy, just to produce default one
    toiter, itsname = mapcores.get(multiple, [False, False])
    if isinstance(toiter, dict):
        toiter = toiter.items()
    denom = len(toiter)
    num_cores = best_num_parallel(num_cores, denom)

    # todo: code below makes no sense
    vals = ['eachspeaker', 'multiplespeaker', 'namedqueriesmultiple']
    if multiple == 'multiplecorpora' and any(x is True for x in vals):
        from corpkit.corpus import Corpus, Corpora
        if isinstance(corpus, Corpora):
            multiprocess = False
        else:
            corpus = Corpus(corpus)

    if isinstance(multiprocess, int):
        num_cores = multiprocess
    if multiprocess is False:
        num_cores = 1

    # make sure saves are right type
    if save is True:
        raise ValueError('save must be string when multiprocessing.')

    # make a list of dicts to pass to interrogator,
    # with the iterable unique in every one
    locs['printstatus'] = False
    locs['multiprocess'] = False
    locs['df1_always_df'] = False
    locs['files_as_subcorpora'] = False
    locs['corpus'] = corpus

    if multiple == 'multiplespeaker':
        locs['multispeaker'] = True

    if isinstance(non_first_sub, list) and len(non_first_sub) == 1:
        non_first_sub = non_first_sub[0]

    # make the default query
    locs = {k: v for k, v in locs.items() if canpickle(v)}
    # make a new dict for every iteration
    ds = [dict(**locs) for i in range(denom)]
    for index, (d, bit) in enumerate(zip(ds, toiter)):
        d['paralleling'] = index
        if multiple in ['namedqueriessingle', 'namedqueriesmultiple']:
            d[itsname] = bit[1]
            d['outname'] = bit[0]
        elif multiple in ['multiplecorpora', 'datalist']:
            d['outname'] = bit.name.replace('-parsed', '')
            d[itsname] = bit
        elif multiple in ['subcorpora']:
            d[itsname] = bit
            jmd = {subval: bit}
            # put this earlier
            j2 = kwargs.get('just_metadata', False)
            if not j2:
                j2 = {}
            jmd.update(j2)
    
            d['just_metadata'] = jmd
            d['outname'] = bit
            d['by_metadata'] = False
            d['subcorpora'] = non_first_sub
            if non_first_sub:
                d['print_info'] = False

    # message printer should be a function...
    if kwargs.get('conc') is False:
        message = 'Interrogating'
    elif kwargs.get('conc') is True:
        message = 'Interrogating and concordancing'
    elif kwargs.get('conc').lower() == 'only':
        message = 'Concordancing'

    time = strftime("%H:%M:%S", localtime())
    from corpkit.process import dictformat
    
    if print_info:

        # proper printing for plurals
        # in truth this needs to be revised, it's horrible.
        sformat = dictformat(search, query)

        if num_cores == 1:
            add_es = ''
        else:
            add_es = 'es'
        if multiple in ['multiplecorpora', 'datalist']:
            corplist = "\n              ".join([i.name for i in list(corpus)[:20]])
            if len(corpus) > 20:
                corplist += '\n ... and %d more ...\n' % (len(corpus) - 20)
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel process%s):\n              %s" \
               "\n          Query: %s\n          %s corpus ... \n"  % (time, len(corpus), num_cores, add_es, corplist, sformat, message)))

        elif multiple == 'namedqueriessingle':
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel process%s): %s" \
               "\n          Queries: %s\n          %s corpus ... \n" % (time, len(query), num_cores,  add_es, corpus.name, sformat, message) ))

        elif multiple == 'namedqueriesmultiple':
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel process%s): %s" \
               "\n          Queries: %s\n          %s corpus ... \n" % (time, len(list(search.keys())), num_cores, add_es, corpus.name, sformat, message)))

        elif multiple in ['eachspeaker', 'multiplespeaker']:
            print(("\n%s: Beginning %d parallel corpus interrogation%s: %s" \
               "\n          Query: %s\n          %s corpus ... \n" % (time, num_cores, add_es.lstrip('e'), corpus.name, sformat, message) ))
        elif multiple in ['subcorpora']:
            print(("\n%s: Beginning %d parallel corpus interrogation%s: %s" \
               "\n          Query: %s\n          %s corpus ... \n" % (time, num_cores, add_es.lstrip('e'), corpus.name, sformat, message) ))

    # run in parallel, get either a list of tuples (non-c option)
    # or a dataframe (c option)
    #import sys
    #reload(sys)
    #stdout=sys.stdout
    failed = False
    terminal = False
    used_joblib = False
    #ds = ds[::-1]
    #todo: the number of blank lines to print can be way wrong
    if not root and print_info:
        from blessings import Terminal
        terminal = Terminal()
        print('\n' * (len(ds) - 2))
        for dobj in ds:
            linenum = dobj['paralleling']
            # this try handles nosetest problems in sublime text
            try:
                with terminal.location(0, terminal.height - (linenum + 1)):
                    # this is a really bad idea.
                    thetime = strftime("%H:%M:%S", localtime())
                    num_spaces = 26 - len(dobj['outname'])
                    print('%s: QUEUED: %s' % (thetime, dobj['outname']))
            except:
                pass

    if not root and multiprocess:
        try:
            res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
            used_joblib = True
        except:
            failed = True
            print('Multiprocessing failed.')
            raise
        if not res:
            failed = True
    else:
        res = []
        for index, d in enumerate(ds):
            d['startnum'] = (100 / denom) * index
            res.append(interrogator(**d))
        try:
            res = sorted([i for i in res if i])
        except:
            pass

    # remove unpicklable bits from query
    from types import ModuleType, FunctionType, BuiltinMethodType, BuiltinFunctionType
    badtypes = (ModuleType, FunctionType, BuiltinFunctionType, BuiltinMethodType)
    qlocs = {k: v for k, v in locs.items() if not isinstance(v, badtypes)}

    if hasattr(qlocs.get('corpus', False), 'name'):
        qlocs['corpus'] = qlocs['corpus'].path
    else:
        qlocs['corpus'] = list([i.path for i in qlocs.get('corpus', [])])

    # return just a concordance
    from corpkit.interrogation import Concordance
    if kwargs.get('conc') == 'only':
        concs = pd.concat([x for x in res])
        thetime = strftime("%H:%M:%S", localtime())
        concs = concs.reset_index(drop=True)
        if kwargs.get('maxconc'):
            concs = concs[:kwargs.get('maxconc')]
        lines = Concordance(concs)
        
        if save:
            lines.save(save, print_info=print_info)

        if print_info:
            print('\n\n%s: Finished! %s results.\n\n' % (thetime, format(len(concs.index), ',')))

        return lines

    # return interrodict (to become multiindex)
    if isinstance(res[0], Interrodict) or not all(isinstance(i.results, Series) for i in res):
        out = OrderedDict()
        for interrog, d in zip(res, ds):
            for unpicklable in ['note', 'root']:
                interrog.query.pop(unpicklable, None)
            try:
                out[interrog.query['outname']] = interrog
            except KeyError:
                out[d['outname']] = interrog

        idict = Interrodict(out)
        
        if print_info:
            thetime = strftime("%H:%M:%S", localtime())
            print("\n\n%s: Finished! Output is multi-indexed." % thetime)
        idict.query = qlocs

        if save:
            idict.save(save, print_info=print_info)

        return idict

    # make query and total branch, save, return
    # todo: standardise this so we don't have to guess transposes
    # 
    else:
        if multiple == 'multiplecorpora' and not mult_corp_are_subs:
            sers = [i.results for i in res]
            out = DataFrame(sers, index=[i.query['outname'] for i in res])
            out = out.reindex(sorted(out.columns), axis=1) # sort cols
            out = out.fillna(0) # nan to zero
            out = out.astype(int) # float to int
            out = out.T            
        else:
            # make a series from counts
            if all(len(i.results) == 1 for i in res):
                out = pd.concat([r.results for r in res])
                out = out.sort_index()
            else:
                try:
                    out = pd.concat([r.results for r in res], axis=1)
                    out = out.T
                    out.index = [i.query['outname'] for i in res]
                except ValueError:
                    return None
                # format like normal
                # this sorts subcorpora, which are cls
                out = out[sorted(list(out.columns))]
                # puts subcorpora in the right place
                if not mult_corp_are_subs and multiple != 'subcorpora':
                    out = out.T
                if multiple == 'subcorpora':
                    out = out.sort_index()
                out = out.fillna(0) # nan to zero
                out = out.astype(int)
                if 'c' in show and mult_corp_are_subs:
                    out = out.sum()
                    out.index = sorted(list(out.index))

        # sort by total
        if isinstance(out, DataFrame):

            out = out[list(out.sum().sort_values(ascending=False).index)]

            # really need to figure out the deal with tranposing!
            if all(x.endswith('.xml') for x in list(out.columns)) \
            or all(x.endswith('.txt') for x in list(out.columns)) \
            or all(x.endswith('.conll') for x in list(out.columns)):
                out = out.T
                
            if kwargs.get('nosubmode'):
                out = out.sum()
    
        from corpkit.interrogation import Interrogation
        tt = out.sum(axis=1) if isinstance(out, DataFrame) else out.sum()
        out = Interrogation(results=out, totals=tt, query=qlocs)

        if hasattr(out.results, 'columns') and len(out.results.columns) == 1:
            out.results = out.results.sort_index()

        if kwargs.get('conc') is True:
            try:
                concs = pd.concat([x.concordance for x in res], ignore_index=True)
                concs = concs.sort_values(by='c')
                concs = concs.reset_index(drop=True)
                if kwargs.get('maxconc'):
                    concs = concs[:kwargs.get('maxconc')]
                out.concordance = Concordance(concs)
            except ValueError:
                out.concordance = None

        thetime = strftime("%H:%M:%S", localtime())
        if terminal:
            print(terminal.move(terminal.height-1, 0))
        if print_info:
            if terminal:
                print(terminal.move(terminal.height-1, 0))
            if hasattr(out.results, 'columns'):
                print('%s: Interrogation finished! %s unique results, %s total.' % (thetime, format(len(out.results.columns), ','), format(out.totals.sum(), ',')))
            else:
                print('%s: Interrogation finished! %s matches.' % (thetime, format(tt, ',')))
        if save:
            out.save(save, print_info = print_info)

        if list(out.results.index) == ['0'] and not kwargs.get('df1_always_df'):
            out.results = out.results.iloc[0].sort_index()
        return out
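
The block above fans one keyword dict per job out to interrogator() via joblib when multiprocessing is enabled, and falls back to a plain loop otherwise. A minimal, self-contained sketch of that fan-out pattern, using a stand-in worker rather than corpkit's interrogator():

from joblib import Parallel, delayed

def _work(corpus=None, outname=None, paralleling=0, **kwargs):
    # stand-in worker: each job receives its own kwargs dict, including
    # 'outname' and 'paralleling', just as interrogator() does above
    return outname, paralleling

ds = [{'corpus': 'c1', 'outname': 'first', 'paralleling': 0},
      {'corpus': 'c2', 'outname': 'second', 'paralleling': 1}]
res = Parallel(n_jobs=2)(delayed(_work)(**d) for d in ds)
# res == [('first', 0), ('second', 1)]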
Exemplo n.º 23
0
def pmultiquery(corpus, 
                search,
                show='words',
                query='any', 
                sort_by='total', 
                save=False,
                multiprocess='default', 
                just_speakers=False,
                root=False,
                note=False,
                print_info=True,
                **kwargs
               ):
    """
    - Parallel process multiple queries or corpora.
    - This function is used by corpkit.interrogator.interrogator()
    - for multiprocessing.
    - There's no reason to call this function yourself."""
    import os
    from pandas import DataFrame, Series
    import pandas as pd
    import collections
    from collections import namedtuple, OrderedDict
    from time import strftime, localtime
    import corpkit
    from corpkit.interrogator import interrogator
    from corpkit.interrogation import Interrogation
    try:
        from joblib import Parallel, delayed
    except ImportError:
        pass
    import multiprocessing

    locs = locals()
    for k, v in kwargs.items():
        locs[k] = v
    in_notebook = locs.get('in_notebook')

    def best_num_parallel(num_cores, num_queries):
        """decide how many parallel processes to run

        the idea, more or less, is to balance the load when possible"""
        import corpkit
        if num_queries <= num_cores:
            return num_queries
        if num_queries > num_cores:
            if (num_queries / num_cores) == num_cores:
                return int(num_cores)
            if num_queries % num_cores == 0:
                try:
                    return max([int(num_queries / n) for n in range(2, num_cores) \
                               if int(num_queries / n) <= num_cores])   
                except ValueError:
                    return num_cores
            else:
                import math
                if (float(math.sqrt(num_queries))).is_integer():
                    square_root = math.sqrt(num_queries)
                    if square_root <= num_queries / num_cores: 
                        return int(square_root)    
        return num_cores
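
    # e.g. best_num_parallel(8, 3) -> 3 (fewer queries than cores),
    #      best_num_parallel(4, 16) -> 4, best_num_parallel(8, 16) -> 8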

    num_cores = multiprocessing.cpu_count()

    # what is our iterable? ...
    multiple_option = False
    multiple_queries = False
    multiple_speakers = False
    multiple_corpora = False
    multiple_search = False
    mult_corp_are_subs = False
    denom = 1

    if hasattr(corpus, '__iter__'):
        multiple_corpora = True
        num_cores = best_num_parallel(num_cores, len(corpus))
        denom = len(corpus)
        if all(c.__class__ == corpkit.corpus.Subcorpus for c in corpus):
            mult_corp_are_subs = True
    elif (isinstance(query, (list, dict)) and not hasattr(search, '__iter__')):
            multiple_queries = True
            num_cores = best_num_parallel(num_cores, len(query))
            denom = len(query)
    elif hasattr(search, '__iter__') and all(isinstance(i, dict) for i in list(search.values())):
        multiple_search = True
        num_cores = best_num_parallel(num_cores, len(list(search.keys())))
        denom = len(list(search.keys()))

    elif just_speakers:
        from build import get_speaker_names_from_xml_corpus
        multiple_speakers = True
        if just_speakers == 'each' or just_speakers == ['each']:
            just_speakers = get_speaker_names_from_xml_corpus(corpus.path)
        if len(just_speakers) == 0:
            print('No speaker name data found.')
            return
        num_cores = best_num_parallel(num_cores, len(just_speakers))
        denom = len(just_speakers)

    if multiple_corpora and any(x is True for x in [multiple_speakers, multiple_queries, 
                                                    multiple_search, multiple_option]):
        from corpkit.corpus import Corpus, Corpora
        if isinstance(corpus, Corpora):
            multiprocess = False
        else:
            corpus = Corpus(corpus)

    if isinstance(multiprocess, int):
        num_cores = multiprocess
    if multiprocess is False:
        num_cores = 1

    # make sure saves are right type
    if save is True:
        raise ValueError('save must be string when multiprocessing.')
    
    # the options that don't change
    d = {'function': 'interrogator',
         'root': root,
         'note': note,
         'denominator': denom}
    
    # add kwargs to query
    for k, v in list(kwargs.items()):
        d[k] = v

    # make a list of dicts to pass to interrogator,
    # with the iterable unique in every one
    ds = []
    if multiple_corpora:
        for index, p in enumerate(corpus):
            name = p.name
            a_dict = dict(d)
            a_dict['corpus'] = p
            a_dict['search'] = search
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name.replace('-parsed', '')
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_queries:
        for index, (name, q) in enumerate(query.items()):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = search
            a_dict['query'] = q
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_speakers:
        for index, name in enumerate(just_speakers):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = search
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = [name]
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_search:
        for index, (name, val) in enumerate(search.items()):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = val
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)

    if kwargs.get('conc') is False:
        message = 'Interrogating'
    elif kwargs.get('conc') is True:
        message = 'Interrogating and concordancing'
    elif str(kwargs.get('conc', '')).lower() == 'only':
        message = 'Concordancing'
    else:
        message = 'Interrogating'
    time = strftime("%H:%M:%S", localtime())
    sformat = ''
    if multiple_queries:
        to_it_over = query
    else:
        to_it_over = search
    for i, (k, v) in enumerate(list(to_it_over.items())):
        if isinstance(v, list):
            vformat = ', '.join(v[:5])
            if len(v) > 5:
                vformat += ' ...'
        elif isinstance(v, dict):
            vformat = ''
            for kk, vv in v.items():
                truncated = False
                if isinstance(vv, list):
                    truncated = len(vv) > 5
                    vv = ', '.join(vv[:5])

                vformat += '\n                     %s: %s' % (kk, vv)
                if truncated:
                    vformat += ' ...'
        else:
            try:
                vformat = v.pattern
            except AttributeError:
                vformat = v
        sformat += '%s: %s' %(k, vformat)
        if i < len(to_it_over.keys()) - 1:
            sformat += '\n                   '

    if print_info:
        # proper printing for plurals
        # in truth this needs to be revised, it's horrible.
        if num_cores == 1:
            add_es = ''
        else:
            add_es = 'es'
        if multiple_corpora and not multiple_option:
            corplist = "\n              ".join([i.name for i in corpus[:20]])
            if len(corpus) > 20:
                corplist += '\n ... and %d more ...\n' % (len(corpus) - 20)
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel process%s):\n              %s" \
               "\n          Query: %s\n          %s corpus ... \n"  % (time, len(corpus), num_cores, add_es, corplist, sformat, message)))

        elif multiple_queries:
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel process%s): %s" \
               "\n          Queries: %s\n          %s corpus ... \n" % (time, len(query), num_cores,  add_es, corpus.name, sformat, message) ))

        elif multiple_search:
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel process%s): %s" \
               "\n          Queries: %s\n          %s corpus ... \n" % (time, len(list(search.keys())), num_cores, add_es, corpus.name, sformat, message)))

        elif multiple_option:
            print(("\n%s: Beginning %d parallel corpus interrogation%s (multiple options): %s" \
               "\n          Query: %s\n          %s corpus ... \n" % (time, num_cores, add_es.lstrip('e'), corpus.name, sformat,  message) ))

        elif multiple_speakers:
            print(("\n%s: Beginning %d parallel corpus interrogation%s: %s" \
               "\n          Query: %s\n          %s corpus ... \n" % (time, num_cores, add_es.lstrip('e'), corpus.name, sformat, message) ))

    # run in parallel, get either a list of tuples (non-c option)
    # or a dataframe (c option)
    #import sys
    #reload(sys)
    #stdout=sys.stdout
    failed = False
    terminal = False
    used_joblib = False
    #ds = ds[::-1]
    if not root and print_info:
        from blessings import Terminal
        terminal = Terminal()
        print('\n' * (len(ds) - 2))
        for dobj in ds:
            linenum = dobj['paralleling']
            # this try handles nosetest problems in sublime text
            try:
                with terminal.location(0, terminal.height - (linenum + 1)):
                    # this is a really bad idea.
                    thetime = strftime("%H:%M:%S", localtime())
                    num_spaces = 26 - len(dobj['outname'])
                    print('%s: QUEUED: %s' % (thetime, dobj['outname']))
            except:
                pass

    if not root and multiprocess:
        #res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
        try:
            #ds = sorted(ds, key=lambda k: k['paralleling'], reverse = True) 
            res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
            used_joblib = True
        except:
            failed = True
            print('Multiprocessing failed.')
            raise
        if not res:
            failed = True
    else:
        res = []
        for index, d in enumerate(ds):
            d['startnum'] = (100 / denom) * index
            res.append(interrogator(**d))
        try:
            res = sorted([i for i in res if i])
        except:
            pass

    # remove unpicklable bits from query
    from types import ModuleType, FunctionType, BuiltinMethodType, BuiltinFunctionType
    badtypes = (ModuleType, FunctionType, BuiltinFunctionType, BuiltinMethodType)
    qlocs = {k: v for k, v in locs.items() if not isinstance(v, badtypes)}

    if hasattr(qlocs['corpus'], 'name'):
        qlocs['corpus'] = qlocs['corpus'].path
    else:
        qlocs['corpus'] = list([i.path for i in qlocs['corpus']])

    from corpkit.interrogation import Concordance
    if kwargs.get('conc') == 'only':
        concs = pd.concat([x for x in res])
        thetime = strftime("%H:%M:%S", localtime())
        concs = concs.reset_index(drop=True)
        lines = Concordance(concs)
        
        if save:
            lines.save(save, print_info=print_info)

        if print_info:
            print('\n\n%s: Finished! %d results.\n\n' % (thetime, len(concs.index)))

        return lines

    if not all(isinstance(i.results, Series) for i in res):
        out = OrderedDict()
        for interrog, d in zip(res, ds):
            for unpicklable in ['note', 'root']:
                interrog.query.pop(unpicklable, None)
            try:
                out[interrog.query['outname']] = interrog
            except KeyError:
                out[d['outname']] = interrog

        from corpkit.interrogation import Interrodict
        idict = Interrodict(out)
        
        if print_info:
            time = strftime("%H:%M:%S", localtime())
            print("\n\n%s: Finished! Output is a dictionary with keys:\n\n         '%s'\n" % \
                (time, "'\n         '".join(sorted(out.keys()))))

        idict.query = qlocs

        if save:
            idict.save(save, print_info=print_info)

        return idict
    

    # make query and total branch, save, return
    # todo: standardise this so we don't have to guess transposes
    else:
        if multiple_corpora and not mult_corp_are_subs:
            sers = [i.results for i in res]
            out = DataFrame(sers, index=[i.query['outname'] for i in res])
            out = out.reindex(sorted(out.columns), axis=1) # sort cols
            out = out.fillna(0) # nan to zero
            out = out.astype(int) # float to int
            out = out.T            
        else:
            try:
                out = pd.concat([r.results for r in res], axis=1)
                out = out.T
                out.index = [i.query['outname'] for i in res]
            except ValueError:
                return None
            # format like normal
            # this sorts subcorpora, which are cls
            out = out[sorted(list(out.columns))]
            # puts subcorpora in the right place
            if not mult_corp_are_subs:
                out = out.T
            out = out.fillna(0) # nan to zero
            out = out.astype(int)
            if 'c' in show and mult_corp_are_subs:
                out = out.sum()
                out.index = sorted(list(out.index))

        # sort by total
        if isinstance(out, DataFrame):
            out = out[list(out.sum().sort_values(ascending=False).index)]

            # really need to figure out the deal with tranposing!
            if all(x.endswith('.xml') for x in list(out.columns)) \
            or all(x.endswith('.txt') for x in list(out.columns)):
                out = out.T
        out = out.edit(sort_by=sort_by, print_info=False, keep_stats=False, \
                      df1_always_df=kwargs.get('df1_always_df'))
        out.query = qlocs

        if len(out.results.columns) == 1:
            out.results = out.results.sort_index()   
        if kwargs.get('conc') is True:
            concs = pd.concat([x.concordance for x in res], ignore_index=True)
            concs = concs.sort_values(by='c')
            concs = concs.reset_index(drop=True)
            out.concordance = Concordance(concs)
        thetime = strftime("%H:%M:%S", localtime())
        if terminal and print_info:
            with terminal.location(0, terminal.height):
                print('\n\n%s: Finished! %d unique results, %d total.%s' % (thetime, len(out.results.columns), out.totals.sum(), '\n'))
        else:
            if print_info:
                print('\n\n%s: Finished! %d unique results, %d total.%s' % (thetime, len(out.results.columns), out.totals.sum(), '\n'))
        if save:
            out.save(save, print_info = print_info)
        return out
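
A hedged usage sketch for pmultiquery() above. The corpus path and the single-letter search keys ('w' for word, 'p' for POS tag) are assumptions made for illustration; in normal use interrogator() assembles this call for you, as the docstring notes.

from corpkit.corpus import Corpus

corpus = Corpus('data/psyc-parsed')                      # hypothetical parsed corpus
searches = {'Nouns':  {'p': r'^NN'},                     # one interrogation per named search
            'Modals': {'w': r'^(can|could|may|might)$'}}
result = pmultiquery(corpus, search=searches, show='words', multiprocess=2)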
Exemplo n.º 24
0
def compare_survival(y, group_indicator, return_stats=False):
    """K-sample log-rank hypothesis test of identical survival functions.

    Compares the pooled hazard rate with each group-specific
    hazard rate. The alternative hypothesis is that the hazard
    rate of at least one group differs from the others at some time.

    See [1]_ for more details.

    Parameters
    ----------
    y : structured array, shape = (n_samples,)
        A structured array containing the binary event indicator
        as first field, and time of event or time of censoring as
        second field.

    group_indicator : array-like, shape = (n_samples,)
        Group membership of each sample.

    return_stats : bool, optional, default: False
        Whether to return a data frame with statistics for each group
        and the covariance matrix of the test statistic.

    Returns
    -------
    chisq : float
        Test statistic.
    pvalue : float
        Two-sided p-value with respect to the null hypothesis
        that the hazard rates across all groups are equal.
    stats : pandas.DataFrame
        Summary statistics for each group:  number of samples,
        observed number of events, expected number of events,
        and test statistic.
        Only provided if `return_stats` is True.
    covariance : array, shape=(n_groups, n_groups)
        Covariance matrix of the test statistic.
        Only provided if `return_stats` is True.

    References
    ----------
    .. [1] Fleming, T. R. and Harrington, D. P.
           A Class of Hypothesis Tests for One and Two Samples of Censored Survival Data.
           Communications In Statistics 10 (1981): 763-794.
    """

    group_indicator, event, time = check_arrays_survival(group_indicator,
                                                         y,
                                                         dtype="O",
                                                         ensure_2d=False)

    n_samples = time.shape[0]
    groups, group_counts = numpy.unique(group_indicator, return_counts=True)
    n_groups = groups.shape[0]
    if n_groups == 1:
        raise ValueError("At least two groups must be specified, "
                         "but only one was provided.")

    # sort descending
    o = numpy.argsort(-time, kind="mergesort")
    x = group_indicator[o]
    event = event[o]
    time = time[o]

    at_risk = numpy.zeros(n_groups, dtype=numpy.int_)
    observed = numpy.zeros(n_groups, dtype=numpy.int_)
    expected = numpy.zeros(n_groups, dtype=numpy.float_)
    covar = numpy.zeros((n_groups, n_groups), dtype=numpy.float_)
    k = 0
    while k < n_samples:
        ti = time[k]
        total_events = 0
        while k < n_samples and ti == time[k]:
            idx = numpy.searchsorted(groups, x[k])
            if event[k]:
                observed[idx] += 1
                total_events += 1
            at_risk[idx] += 1
            k += 1

        if total_events != 0:
            total_at_risk = k
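            # under the null hypothesis, a group's expected events at this time
            # point is its share of the risk set times the events observed here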
            expected += at_risk * (total_events / total_at_risk)
            if total_at_risk > 1:
                multiplier = total_events * (total_at_risk - total_events) / (
                    total_at_risk * (total_at_risk - 1))
                for g1 in range(n_groups):
                    temp = at_risk[g1] * multiplier
                    covar[g1, g1] += temp
                    for g2 in range(n_groups):
                        covar[g1, g2] -= temp * at_risk[g2] / total_at_risk
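
    # the test statistic below is z' V^{-1} z over the first n_groups - 1 groups,
    # where z = observed - expected and V is the covariance accumulated above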

    df = n_groups - 1
    zz = observed[:df] - expected[:df]
    chisq = numpy.linalg.solve(covar[:df, :df], zz).dot(zz)
    pval = stats.chi2.sf(chisq, df)

    if return_stats:
        table = OrderedDict()
        table["counts"] = group_counts
        table["observed"] = observed
        table["expected"] = expected
        table["statistic"] = observed - expected
        table = pandas.DataFrame.from_dict(table)
        table.index = pandas.Index(groups, name="group")
        return chisq, pval, table, covar

    return chisq, pval
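
A small usage sketch with synthetic data. It assumes the function is exposed as sksurv.compare.compare_survival and uses scikit-survival's usual structured array of (event indicator, time); the field names and values here are made up.

import numpy
from sksurv.compare import compare_survival

y = numpy.array([(True, 5.0), (False, 8.0), (True, 2.5),
                 (True, 7.0), (False, 9.0), (True, 3.0)],
                dtype=[('event', '?'), ('time', '<f8')])
groups = numpy.array(['A', 'A', 'A', 'B', 'B', 'B'])

chisq, pvalue = compare_survival(y, groups)
# with return_stats=True, the per-group summary table and covariance matrix are returned too
chisq, pvalue, stats_table, covariance = compare_survival(y, groups, return_stats=True)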
Exemplo n.º 25
0
def rank(Input1, Input2, D, R):

    f = h5py.File('all_data', 'w')
    content = [word.strip() for word in open(Input1)]
    test_content = [word.strip() for word in open(Input2)]
 
    k = 0
    rank = []
    tr_rank = []
    ResultMatrix = R.todense()
    TruthMatrix = D.todense()
    ranking = []
    for i in content:
        l = 0
        d = {}
        e = {}
        r = 0
        rc = 0
        tr = 0 
        iter = 0
        trr = 0
        iterr = 0
#        print i

#        print "\t Truth \t\t Calculation"
#        print "\t________________________________"
        for j in test_content:
            d[str(j)] = ResultMatrix[k, l]
            e[str(j)] = TruthMatrix[k, l]
            l += 1

        C = list(OrderedDict(reversed(sorted(d.items(), key=lambda
            t: float(t[1])))).keys())

        T = list(OrderedDict(reversed(sorted(e.items(), key=lambda
            t: float(t[1])))).keys())
        tar_words = []
        list_rank = 0

        for m in range(0,5):
            print('\t', T[m], '\t\t', C[m])
            tar_words.append(C[m])
            rc += (C.index(T[m])+1)

            iter += 1
            tr += iter
            list_rank = float(rc/5.0)
            ranking.append(list_rank)
        print "\t_________________________________"
        print "\t", tr, "\t\t", rc, '\t', list_rank
        k += 1

#        list_ind = []
#        words = []
#
#        for n in T:
#            x = (C.index(n) - T.index(n))
#            r += x ** 2
#            list_ind.append((C.index(n) + 1))
#            words.append(n)
#            iterr += 1
#            trr += iterr

        label_rank = i + '_rank'
        label_words = i + '_words'


        f.create_dataset(label_rank, data=list_rank)
        f.create_dataset(label_words, data=tar_words)

    avg_rank = (float(sum(ranking)/337.0))
    label_avg_rank = '_avg_rank'
    f.create_dataset(label_avg_rank, data=avg_rank)
    print "Average Rank", avg_rank
    f.close()
Exemplo n.º 26
0
            if cat in dictchannels:
                if not cat.startswith("VOD"):
                    for x in dictchannels[cat]:
                        x['serviceRef'] = "{}:0:1:{:x}:0:0:0:0:0:0".format(
                            x['streamType'], num)
                        num += 1
                else:
                    for x in dictchannels[cat]:
                        x['serviceRef'] = "{}:0:1:{:x}:0:0:0:0:0:0".format(
                            x['streamType'], vod_service_id)
            while (catstartnum < num):
                catstartnum += category_offset

        # move all VOD categories to VOD placeholder position
        if ("VOD" in category_order):
            vodindex = category_order.index("VOD")
            vodcategories = list(
                (cat for cat in category_order if cat.startswith('VOD -')))
            if len(vodcategories):
                #remove the multi vod categories from their current location
                category_order = [
                    x for x in category_order if x not in vodcategories
                ]
                #insert the multi vod categories at the placeholder pos
                category_order[vodindex:vodindex] = vodcategories
                category_order.remove("VOD")

        # Check for and parse override map
        self.parse_map_channels_xml(dictchannels)

        # Have a look at what we have
Exemplo n.º 27
0
def pmultiquery(corpus,
                search,
                show='words',
                query='any',
                sort_by='total',
                quicksave=False,
                multiprocess='default',
                just_speakers=False,
                root=False,
                note=False,
                print_info=True,
                **kwargs):
    """Parallel process multiple queries or corpora.

    This function is used by interrogator() for multiprocessing.
    
    There's no reason to call this function yourself."""

    import collections
    import os
    import pandas as pd
    from collections import namedtuple
    from time import strftime, localtime
    import corpkit
    from interrogator import interrogator
    from editor import editor
    from other import save
    from interrogation import Interrogation
    try:
        from joblib import Parallel, delayed
    except:
        pass
        #raise ValueError('joblib, the module used for multiprocessing, cannot be found. ' \
        #                 'Install with:\n\n        pip install joblib')
    import multiprocessing

    locs = locals()
    for k, v in kwargs.items():
        locs[k] = v

    def best_num_parallel(num_cores, num_queries):
        import corpkit
        """decide how many parallel processes to run

        the idea, more or less, is to balance the load when possible"""
        if num_queries <= num_cores:
            return num_queries
        if num_queries > num_cores:
            if (num_queries / num_cores) == num_cores:
                return int(num_cores)
            if num_queries % num_cores == 0:
                try:
                    return max([
                        int(num_queries / n) for n in range(2, num_cores)
                        if int(num_queries / n) <= num_cores
                    ])
                except ValueError:
                    return num_cores
            else:
                import math
                if (float(math.sqrt(num_queries))).is_integer():
                    square_root = math.sqrt(num_queries)
                    if square_root <= num_queries / num_cores:
                        return int(square_root)
        return num_cores

    num_cores = multiprocessing.cpu_count()

    # what is our iterable? ...
    multiple_option = False
    multiple_queries = False
    multiple_speakers = False
    multiple_corpora = False
    multiple_search = False
    mult_corp_are_subs = False
    denom = 1

    if hasattr(corpus, '__iter__'):
        multiple_corpora = True
        num_cores = best_num_parallel(num_cores, len(corpus))
        denom = len(corpus)
        if all(c.__class__ == corpkit.corpus.Subcorpus for c in corpus):
            mult_corp_are_subs = True
    elif (type(query) == list
          or type(query) == dict) and not hasattr(search, '__iter__'):
        multiple_queries = True
        num_cores = best_num_parallel(num_cores, len(query))
        denom = len(query)
    elif hasattr(search, '__iter__') and all(
            type(i) == dict for i in list(search.values())):
        multiple_search = True
        num_cores = best_num_parallel(num_cores, len(list(search.keys())))
        denom = len(list(search.keys()))

    elif just_speakers:
        from build import get_speaker_names_from_xml_corpus
        multiple_speakers = True
        if just_speakers == 'each' or just_speakers == ['each']:
            just_speakers = get_speaker_names_from_xml_corpus(corpus.path)
        if len(just_speakers) == 0:
            print('No speaker name data found.')
            return
        num_cores = best_num_parallel(num_cores, len(just_speakers))
        denom = len(just_speakers)

    # if this thing has already come through multiquery, don't multiprocess this time
    #if kwargs.get('outname'):
    #    multiprocess = False

    if multiple_corpora and any(x is True for x in [
            multiple_speakers, multiple_queries, multiple_search,
            multiple_option
    ]):
        from corpus import Corpus, Corpora
        if corpus.__class__ == Corpora:
            multiprocess = False
        else:
            corpus = Corpus(corpus)

    if type(multiprocess) == int:
        num_cores = multiprocess
    if multiprocess is False:
        num_cores = 1

    # make sure quicksaves are right type
    if quicksave is True:
        raise ValueError('quicksave must be string when using pmultiquery.')

    # the options that don't change
    d = {
        #'paralleling': True,
        'function': 'interrogator',
        'root': root,
        'note': note,
        'denominator': denom
    }

    # add kwargs to query
    for k, v in list(kwargs.items()):
        d[k] = v

    # make a list of dicts to pass to interrogator,
    # with the iterable unique in every one
    ds = []
    if multiple_corpora:
        for index, p in enumerate(corpus):
            name = p.name
            a_dict = dict(d)
            a_dict['corpus'] = p
            a_dict['search'] = search
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name.replace('-parsed', '')
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_queries:
        for index, (name, q) in enumerate(query.items()):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = search
            a_dict['query'] = q
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_speakers:
        for index, name in enumerate(just_speakers):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = search
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = [name]
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)
    elif multiple_search:
        for index, (name, val) in enumerate(search.items()):
            a_dict = dict(d)
            a_dict['corpus'] = corpus
            a_dict['search'] = val
            a_dict['query'] = query
            a_dict['show'] = show
            a_dict['outname'] = name
            a_dict['just_speakers'] = just_speakers
            a_dict['paralleling'] = index
            a_dict['printstatus'] = False
            ds.append(a_dict)

    if kwargs.get('do_concordancing') is False:
        message = 'Interrogating'
    elif kwargs.get('do_concordancing') is True:
        message = 'Interrogating and concordancing'
    elif str(kwargs.get('do_concordancing', '')).lower() == 'only':
        message = 'Concordancing'
    else:
        message = 'Interrogating'
    time = strftime("%H:%M:%S", localtime())
    sformat = ''
    if multiple_queries:
        to_it_over = query
    else:
        to_it_over = search
    for i, (k, v) in enumerate(list(to_it_over.items())):
        if type(v) == list:
            vformat = ', '.join(v[:5])
            if len(v) > 5:
                vformat += ' ...'
        elif type(v) == dict:
            vformat = ''
            for kk, vv in v.items():
                truncated = False
                if type(vv) == list:
                    truncated = len(vv) > 5
                    vv = ', '.join(vv[:5])

                vformat += '\n                     %s: %s' % (kk, vv)
                if truncated:
                    vformat += ' ...'
        else:
            vformat = v
        sformat += '%s: %s' % (k, vformat)
        if i < len(to_it_over.keys()) - 1:
            sformat += '\n                   '

    if print_info:
        if multiple_corpora and not multiple_option:
            corplist = "\n              ".join([i.name for i in corpus[:20]])
            if len(corpus) > 20:
                corplist += '\n ... and %d more ...\n' % (len(corpus) - 20)
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel processes):\n              %s" \
               "\n          Query: %s\n          %s corpus ... \n"  % (time, len(corpus), num_cores, corplist, sformat, message)))

        elif multiple_queries:
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel processes): %s" \
               "\n          Queries: %s\n          %s corpus ... \n" % (time, len(query), num_cores, corpus.name, sformat, message) ))

        elif multiple_search:
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel processes): %s" \
               "\n          Queries: %s\n          %s corpus ... \n" % (time, len(list(search.keys())), num_cores, corpus.name, sformat, message)))

        elif multiple_option:
            print(("\n%s: Beginning %d parallel corpus interrogations (multiple options): %s" \
               "\n          Query: %s\n          %s corpus ... \n" % (time, num_cores, corpus.name, sformat, message) ))

        elif multiple_speakers:
            print(("\n%s: Beginning %d parallel corpus interrogations: %s" \
               "\n          Query: %s\n          %s corpus ... \n" % (time, num_cores, corpus.name, sformat, message) ))

    # run in parallel, get either a list of tuples (non-c option)
    # or a dataframe (c option)
    #import sys
    #reload(sys)
    #stdout=sys.stdout
    failed = False
    terminal = False
    used_joblib = False
    #ds = ds[::-1]
    if not root and print_info:
        from blessings import Terminal
        terminal = Terminal()
        print('\n' * (len(ds) - 2))
        for dobj in ds:
            linenum = dobj['paralleling']
            # this try handles nosetest problems in sublime text
            try:
                with terminal.location(0, terminal.height - (linenum + 1)):
                    # this is a really bad idea.
                    thetime = strftime("%H:%M:%S", localtime())
                    num_spaces = 26 - len(dobj['outname'])
                    print('%s: QUEUED: %s' % (thetime, dobj['outname']))

            except:
                pass

    if not root and multiprocess:
        #res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
        try:
            #ds = sorted(ds, key=lambda k: k['paralleling'], reverse = True)
            res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x)
                                             for x in ds)
            used_joblib = True
        except:
            failed = True
            print('Multiprocessing failed.')
            raise
        if not res:
            failed = True
    else:
        res = []
        for index, d in enumerate(ds):
            d['startnum'] = (100 / denom) * index
            res.append(interrogator(**d))
        try:
            res = sorted(res)
        except:
            pass

    # multiprocessing way
    #from multiprocessing import Process
    #from interrogator import interrogator
    #jobs = []
    ##for d in ds:
    ##    p = multiprocessing.Process(target=interrogator, kwargs=(**d,))
    ##    jobs.append(p)
    ##    p.start()
    ##    while p.is_alive():
    ##        import time
    ##        time.sleep(2)
    ##        if root:
    ##            root.update()
    #result_queue = multiprocessing.Queue()
    #
    #for d in ds:
    #funs = [interrogator(result_queue, **kwargs) for kwargs in ds]
    #jobs = [multiprocessing.Process(mc) for mc in funs]
    #for job in jobs: job.start()
    #for job in jobs: job.join()
    #results = [result_queue.get() for mc in funs]

    import corpkit
    from interrogation import Concordance
    if kwargs.get('do_concordancing') == 'only':
        concs = pd.concat([x for x in res])
        thetime = strftime("%H:%M:%S", localtime())
        if print_info:
            print('\n\n%s: Finished! %d results.\n\n' %
                  (thetime, len(concs.index)))
        return Concordance(concs)

    from collections import OrderedDict
    if not all(type(i.results) == pd.core.series.Series for i in res):
        out = OrderedDict()
        for interrog, d in zip(res, ds):
            for unpicklable in ['note', 'root']:
                interrog.query.pop(unpicklable, None)
            try:
                out[interrog.query['outname']] = interrog
            except KeyError:
                out[d['outname']] = interrog

        if quicksave:
            fullpath = os.path.join('saved_interrogations', quicksave)
            while os.path.isdir(fullpath):
                selection = input(
                    "\nSave error: %s already exists in %s.\n\nType 'o' to overwrite, or enter a new name: "
                    % (quicksave, 'saved_interrogations'))
                if selection == 'o' or selection == 'O':
                    import shutil
                    shutil.rmtree(fullpath)
                else:
                    import os
                    fullpath = os.path.join('saved_interrogations', selection)

            for k, v in list(out.items()):
                save(v, k, savedir=fullpath, print_info=False)

            time = strftime("%H:%M:%S", localtime())
            print("\n%s: %d files saved to %s" %
                  (time, len(list(out.keys())), fullpath))

        time = strftime("%H:%M:%S", localtime())
        if print_info:
            print(
                "\n\n%s: Finished! Output is a dictionary with keys:\n\n         '%s'\n"
                % (time, "'\n         '".join(sorted(out.keys()))))
        from interrogation import Interrodict
        idict = Interrodict(out)

        # remove unpicklable bits from query
        from types import ModuleType, FunctionType, BuiltinMethodType, BuiltinFunctionType
        locs = {k: v for k, v in locs.items() if not isinstance(v, ModuleType) \
                                             and not isinstance(v, FunctionType) \
                                             and not isinstance(v, BuiltinFunctionType) \
                                             and not isinstance(v, BuiltinMethodType)}
        idict.query = locs
        return idict
    # make query and total branch, save, return
    else:
        #print sers
        #print ds
        if multiple_corpora and not mult_corp_are_subs:
            sers = [i.results for i in res]
            out = pd.DataFrame(sers, index=[i.query['outname'] for i in res])
            out = out.reindex(sorted(out.columns), axis=1)  # sort cols
            out = out.fillna(0)  # nan to zero
            out = out.astype(int)  # float to int
            out = out.T
        else:
            try:
                out = pd.concat([r.results for r in res], axis=1)
            except ValueError:
                return None
            # format like normal
            out = out[sorted(list(out.columns))]
            out = out.T
            out = out.fillna(0)  # nan to zero
            out = out.astype(int)
            if 'c' in show and mult_corp_are_subs:
                out = out.sum()
                out.index = sorted(list(out.index))

        # sort by total
        if type(out) == pd.core.frame.DataFrame:
            out.loc['Total-tmp'] = out.sum()
            tot = out.loc['Total-tmp']
            out = out[tot.sort_values(ascending=False).index]
            out = out.drop('Total-tmp', axis=0)
        out = out.edit(sort_by = sort_by, print_info = False, keep_stats = False, \
                      df1_always_df = kwargs.get('df1_always_df'))
        if len(out.results.columns) == 1:
            out.results = out.results.sort_index()
        if kwargs.get('do_concordancing') is True:
            concs = pd.concat([x.concordance for x in res], ignore_index=True)
            concs = concs.sort_values(by='c')
            concs = concs.reset_index(drop=True)
            out.concordance = Concordance(concs)
        thetime = strftime("%H:%M:%S", localtime())
        if terminal and print_info:
            with terminal.location(0, terminal.height):
                print('\n\n%s: Finished! %d unique results, %d total.%s' %
                      (thetime, len(
                          out.results.columns), out.totals.sum(), '\n'))
        else:
            if print_info:
                print('\n\n%s: Finished! %d unique results, %d total.%s' %
                      (thetime, len(
                          out.results.columns), out.totals.sum(), '\n'))
        #if used_joblib:

        if quicksave:
            from other import save
            save(out, quicksave)
        return out
Exemplo n.º 28
0
def assign_variables(assignment_expressions,
                     df,
                     locals_dict,
                     df_alias=None,
                     trace_rows=None):
    """
    Evaluate a set of variable expressions from a spec in the context
    of a given data table.

    Expressions are evaluated using Python's eval function.
    Python expressions have access to variables in locals_dict (and to df,
    accessible as the variable df, or as df_alias if one is given). They also
    have access to previously assigned targets as the assigned target name.

    lowercase variables starting with underscore are temp variables (e.g. _local_var)
    and not returned except in trace_results

    uppercase variables starting with underscore are temp scalar variables (e.g. _LOCAL_SCALAR)
    and not returned except in trace_assigned_locals.
    This is useful for defining general-purpose local constants in the expression file.

    Users should take care that expressions (other than temp scalar variables) result in
    a pandas Series (scalars are automatically promoted to Series).

    Parameters
    ----------
    assignment_expressions : pandas.DataFrame of target assignment expressions
        target: target column name
        expression: pandas or python expression to evaluate
    df : pandas.DataFrame
    locals_dict : dict
        This is a dictionary of local variables that will be the environment
        for an evaluation of a "python" expression.
    df_alias : str, optional
        Name under which `df` is exposed to the expressions (defaults to 'df').
    trace_rows : series or array of bools to use as mask to select target rows to trace

    Returns
    -------
    variables : pandas.DataFrame
        Will have the index of `df` and columns named by target and containing
        the result of evaluating expression
    trace_df : pandas.DataFrame or None
        a dataframe containing the eval result values for each assignment expression
    """

    np_logger = NumpyLogger(logger)

    def is_throwaway(target):
        return target == '_'

    def is_temp_scalar(target):
        return target.startswith('_') and target.isupper()

    def is_temp(target):
        return target.startswith('_')

    def to_series(x):
        if x is None or np.isscalar(x):
            return pd.Series([x] * len(df.index), index=df.index)
        return x

    assert assignment_expressions.shape[0] > 0

    trace_assigned_locals = trace_results = None
    if trace_rows is not None:
        # convert to numpy array so we can slice ndarrays as well as series
        trace_rows = np.asanyarray(trace_rows)
        if trace_rows.any():
            trace_results = OrderedDict()
            trace_assigned_locals = OrderedDict()

    # avoid touching caller's passed-in locals_d parameter (they may be looping)
    _locals_dict = local_utilities()
    if locals_dict is not None:
        _locals_dict.update(locals_dict)
    if df_alias:
        _locals_dict[df_alias] = df
    else:
        _locals_dict['df'] = df
    local_keys = list(_locals_dict.keys())

    # build a dataframe of eval results for non-temp targets
    # since we allow targets to be recycled, we want to only keep the last usage
    variables = OrderedDict()

    # need to be able to identify which variables causes an error, which keeps
    # this from being expressed more parsimoniously
    for e in zip(assignment_expressions.target,
                 assignment_expressions.expression):
        target, expression = e

        assert isinstance(target, str), \
            "expected target '%s' for expression '%s' to be string not %s" % \
            (target, expression, type(target))

        if target in local_keys:
            logger.warning(
                "assign_variables target obscures local_d name '%s'",
                str(target))

        if is_temp_scalar(target) or is_throwaway(target):
            try:
                x = eval(expression, globals(), _locals_dict)
            except Exception as err:
                logger.error("assign_variables error: %s: %s",
                             type(err).__name__, str(err))
                logger.error("assign_variables expression: %s = %s",
                             str(target), str(expression))
                raise err

            if not is_throwaway(target):
                _locals_dict[target] = x
                if trace_assigned_locals is not None:
                    trace_assigned_locals[uniquify_key(trace_assigned_locals,
                                                       target)] = x
            continue

        try:

            # FIXME - log any numpy warnings/errors but don't raise
            np_logger.target = str(target)
            np_logger.expression = str(expression)
            saved_handler = np.seterrcall(np_logger)
            save_err = np.seterr(all='log')

            # FIXME should whitelist globals for security?
            globals_dict = {}
            expr_values = to_series(
                eval(expression, globals_dict, _locals_dict))

            np.seterr(**save_err)
            np.seterrcall(saved_handler)

        except Exception as err:
            logger.error("assign_variables error: %s: %s",
                         type(err).__name__, str(err))
            logger.error("assign_variables expression: %s = %s", str(target),
                         str(expression))
            raise err

        if not is_temp(target):
            variables[target] = expr_values

        if trace_results is not None:
            trace_results[uniquify_key(trace_results,
                                       target)] = expr_values[trace_rows]

        # update locals to allows us to ref previously assigned targets
        _locals_dict[target] = expr_values

    if trace_results is not None:

        trace_results = pd.DataFrame.from_dict(trace_results)

        trace_results.index = df[trace_rows].index

        # add df columns to trace_results
        trace_results = pd.concat([df[trace_rows], trace_results], axis=1)

    # we stored result in dict - convert to df
    variables = util.df_from_dict(variables, index=df.index)

    return variables, trace_results, trace_assigned_locals
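
A minimal, self-contained sketch of the evaluation loop described in the docstring above, not ActivitySim's full assign_variables() (the spec contents, column values, and the _MAX_DIST temp name are made up):

import pandas as pd

spec = pd.DataFrame({
    'target':     ['_MAX_DIST', 'dist_capped', 'is_far'],
    'expression': ['10.0', 'df.dist.clip(upper=_MAX_DIST)', 'dist_capped > 5'],
})
df = pd.DataFrame({'dist': [2.0, 7.5, 12.0]})

_locals = {'df': df}
results = {}
for target, expression in zip(spec.target, spec.expression):
    value = eval(expression, {}, _locals)     # expressions see df and earlier targets
    _locals[target] = value                   # later expressions can reference this target
    if not target.startswith('_'):            # leading-underscore targets are temps, not returned
        results[target] = value

variables = pd.DataFrame(results, index=df.index)
# variables has columns 'dist_capped' and 'is_far', indexed like df
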
class Encoder:
    def __init__(self):
        self.keysets = []
        self.keysetsindex = 0
        self.stringhist = {}
        self.output = []

    def createstringhist(self, obj):
        if type(obj) == str or type(obj) == unicode:
            obj = obj.encode('utf-8')
            self.stringhist[obj] = self.stringhist.get(obj, 0) + 1
            return
        if type(obj) == list:
            for p in obj:
                self.createstringhist(p)
        elif type(obj) == dict:
            for p in obj:
                self.createstringhist(obj[p])

    def encode(self, obj, keysetstoomit=None):
        if keysetstoomit is None: keysetstoomit = []
        self.keysets = [tuple(p) for p in keysetstoomit]
        self.keysetsindex = len(self.keysets)
        self.output = []
        self.stringhist = OrderedDict()
        self.createstringhist(obj)
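        # score each string by the bytes it would save if interned in the lookup
        # table: (len+1)*count to write it inline every time, versus len+2 for
        # the table entry plus a one-byte reference per occurrence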
        self.stringhist = [(p, ((len(p) + 1) * self.stringhist[p]) -
                            (len(p) + 2 + self.stringhist[p]))
                           for p in self.stringhist]
        self.stringhist = [(-q, p) for (p, q) in self.stringhist if q > 0]
        self.stringhist = [p for (q, p) in sorted(self.stringhist)]
        q = self.stringhist
        if len(self.stringhist) > 255:
            self.stringhist = self.stringhist[:255]
        self.write(obj)
        x = self.output
        self.output = []
        self.stringhist = []
        self.output.append(TypeTags.STRLUT)
        self.output.append(len(q))
        for p in q:
            self.writeStr(p)
        self.writeArray(self.keysets[self.keysetsindex:])
        return self.output + x

    def write(self, x):
        if type(x) == type(None):
            self.output.append(TypeTags.NULL)
        elif x == NULL:
            self.output.append(TypeTags.NULL)
        elif x == UNDEFINED:
            self.output.append(TypeTags.UNDEFINED)
        elif x == DEFAULT:
            self.output += [TypeTags.EXTENSION, 0, TypeTags.UNDEFINED]
        elif x == INFINITY:
            self.output += [TypeTags.FLOAT32, 0x7F, 0x80, 0x00, 0x00]
        elif x == NEGINFINITY:
            self.output += [TypeTags.FLOAT32, 0xFF, 0x80, 0x00, 0x00]
        elif x == NAN:
            self.output += [TypeTags.FLOAT32, 0x7F, 0xC0, 0x00, 0x00]
        else:
            ENCODERS[type(x)](self, x)

    def writeBoolean(self, x):
        if x: self.output.append(TypeTags.TRUE)
        else: self.output.append(TypeTags.FALSE)

    def writeInt(self, x):
        if abs(x) > 0xFFFFFFFFFFFFFFFF:
            return self.writeFloat(x)
        if x >= 0:
            if x < 64:
                self.output.append(x)
            elif x <= 0x3FFF:
                self.output += [TypeTags.UINT14_BASE | x >> 8, x & 0xFF]
            elif x <= 0xFFFF:
                self.output += [TypeTags.UINT16, x >> 8, x & 0xFF]
            elif x <= 0xFFFFFF:
                self.output += [
                    TypeTags.UINT24, x >> 16, x >> 8 & 0xFF, x & 0xFF
                ]
            elif x <= 0xFFFFFFFF:
                self.output.append(TypeTags.UINT32)
                self.output += int32tobytes(x)
            else:
                self.output.append(TypeTags.UINT64)
                self.output += int32tobytes(x >> 32 & 0xFFFFFFFF)
                self.output += int32tobytes(x & 0xFFFFFFFF)
        else:
            x = -x
            if x <= 15:
                self.output.append(TypeTags.NINT4_BASE | x)
            elif x <= 0xFF:
                self.output += [TypeTags.NINT8, x]
            elif x <= 0xFFFF:
                self.output += [TypeTags.NINT16, x >> 8, x & 0xFF]
            elif x <= 0xFFFFFFFF:
                self.output.append(TypeTags.NINT32)
                self.output += int32tobytes(x)
            else:
                self.output.append(TypeTags.NINT64)
                self.output += int32tobytes(x >> 32 & 0xFFFFFFFF)
                self.output += int32tobytes(x & 0xFFFFFFFF)

    def writeFloat(self, x):
        m = struct.pack("f", x)
        if x == struct.unpack("f", m)[0]:
            self.output.append(TypeTags.FLOAT32)
        else:
            self.output.append(TypeTags.DOUBLE64)
            m = struct.pack("d", x)
        self.output += [ord(m[i]) for i in range(len(m) - 1, -1, -1)]

    def writeStr(self, x):
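        # strings present in the LUT are written as a two-byte STRREF;
        # short strings (< 32 bytes) use the STR5 length-in-tag form,
        # NUL-free strings become NUL-terminated CSTRINGs, and anything else
        # gets an explicit STR8/STR_ length prefix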
        x = x.encode('utf-8')
        if x in self.stringhist:
            self.output += [TypeTags.STRREF, self.stringhist.index(x)]
            return
        z = False
        xe = []
        for p in x:
            p = ord(p)
            if p == 0:
                z = True
            xe.append(p)
        x = xe
        y = len(x)
        if y < 32:
            self.output.append(TypeTags.STR5_BASE | y)
            self.output += x
        else:
            if not z:
                self.output.append(TypeTags.CSTRING)
                self.output += x
                self.output.append(0)
            else:
                if y <= 255:
                    self.output += [TypeTags.STR8, y]
                else:
                    self.output.append(TypeTags.STR_)
                    self.writeInt(y)
                self.output += x

    def writeArray(self, x):
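        # arrays that are all booleans are bit-packed via bytefrombools (BARRAY tags);
        # everything else is written element by element after an ARRAY length tag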
        isboolarray = True
        for p in x:
            if type(p) != bool:
                isboolarray = False
                break
        if isboolarray:
            if len(x) < 16:
                self.output.append(TypeTags.BARRAY4_BASE | len(x))
            elif len(x) < 256:
                self.output.append(TypeTags.BARRAY8)
                self.output.append(len(x))
            else:
                self.output.append(TypeTags.BARRAY_)
                self.writeInt(len(x))
            self.output += bytefrombools(x)
        else:
            if len(x) < 32:
                self.output.append(TypeTags.ARRAY5_BASE | len(x))
            elif len(x) <= 255:
                self.output.append(TypeTags.ARRAY8)
                self.output.append(len(x))
            else:
                self.output.append(TypeTags.ARRAY_)
                self.writeInt(len(x))
            for p in x:
                self.write(p)

    def writeDict(self, x):
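        # the sorted key tuple is interned in self.keysets and only its index is
        # written, followed by the values in key order (bit-packed when all booleans)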
        y = tuple(sorted(x.keys()))
        try:
            k = self.keysets.index(y)
        except:
            self.keysets.append(y)
            k = len(self.keysets) - 1
        isbool = True
        for p in y:
            if type(x[p]) != bool:
                isbool = False
                break
        if isbool:
            self.output.append(TypeTags.BMAP_)
            self.writeInt(k)
            self.output += bytefrombools([x[p] for p in y])
        else:
            self.output.append(TypeTags.MAP_)
            self.writeInt(k)
            for p in y:
                self.write(x[p])
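# A minimal usage sketch for the Encoder above (hypothetical payload; assumes the
# surrounding module defines TypeTags, the ENCODERS dispatch table, the
# NULL/UNDEFINED/DEFAULT/INFINITY/NEGINFINITY/NAN sentinels, and the
# int32tobytes/bytefrombools helpers, none of which appear in this excerpt).
enc = Encoder()
payload = {
    "name": "example",
    "values": [1, 2, 3],
    "flags": [True, False, True],
}

# encode() returns a flat list of byte values:
# [STRLUT header, repeated-string table, new keysets, encoded body]
byte_values = enc.encode(payload)

# pack into a bytes object for transmission or storage
wire_bytes = bytes(bytearray(byte_values))
print(len(wire_bytes), "bytes")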
def eval_interaction_utilities(spec, df, locals_d, trace_label, trace_rows, estimator=None):
    """
    Compute the utilities for a single-alternative spec evaluated in the context of df

    We could compute the utilities for interaction datasets just as we do for simple_simulate
    specs with multiple alternative columns byt calling eval_variables and then computing the
    utilities by matrix-multiplication of eval results with the utility coefficients in the
    spec alternative columns.

    But interaction simulate computes the utilities of each alternative in the context of a
    separate row in interaction dataset df, and so there is only one alternative in spec.
    This turns out to be quite a bit faster (in this special case) than the pandas dot function.

    For efficiency, we combine eval_variables and multiplication of coefficients into a single step,
    so we don't have to create a separate column for each partial utility. Instead, we simply
    multiply the eval result by a single alternative coefficient and sum the partial utilities.


    spec : dataframe
        one row per spec expression and one col with utility coefficient

    df : dataframe
        cross join (cartesian product) of choosers with alternatives
        combines columns of choosers and alternatives
        len(df) == len(choosers) * len(alternatives)
        index values (non-unique) are index values from alternatives df

    interaction_utilities : dataframe
        the utility of each alternative is sum of the partial utilities determined by the
        various spec expressions and their corresponding coefficients
        yielding a dataframe  with len(interaction_df) rows and one utility column
        having the same index as interaction_df (non-unique values from alternatives df)

    Returns
    -------
    utilities : pandas.DataFrame
        Will have the index of `df` and a single column of utilities

    """
    trace_label = tracing.extend_trace_label(trace_label, "eval_interaction_utils")
    logger.info("Running eval_interaction_utilities on %s rows" % df.shape[0])

    assert(len(spec.columns) == 1)

    # avoid altering caller's passed-in locals_d parameter (they may be looping)
    locals_d = locals_d.copy() if locals_d is not None else {}
    locals_d.update(locals())

    def to_series(x):
        if np.isscalar(x):
            return pd.Series([x] * len(df), index=df.index)
        if isinstance(x, np.ndarray):
            return pd.Series(x, index=df.index)
        return x

    if trace_rows is not None and trace_rows.any():
        # # convert to numpy array so we can slice ndarrays as well as series
        # trace_rows = np.asanyarray(trace_rows)
        assert type(trace_rows) == np.ndarray
        trace_eval_results = OrderedDict()
    else:
        trace_eval_results = None

    check_for_variability = config.setting('check_for_variability')

    # need to be able to identify which variable causes an error, which keeps
    # this from being expressed more parsimoniously

    utilities = pd.DataFrame({'utility': 0.0}, index=df.index)
    no_variability = has_missing_vals = 0

    if estimator:
        # ensure alt_id from interaction_dataset is available in expression_values_df for
        # estimator.write_interaction_expression_values and eventual omnibus table assembly
        alt_id = estimator.get_alt_id()
        assert alt_id in df.columns
        expression_values_df = df[[alt_id]]

        # FIXME estimation_requires_chooser_id_in_df_column
        # estimation requires that chooser_id is either in index or a column of interaction_dataset
        # so it can be reformatted (melted) and indexed by chooser_id and alt_id
        # we assume caller has this under control if index is named
        if df.index.name is None:
            chooser_id = estimator.get_chooser_id()
            assert chooser_id in df.columns, \
                "Expected to find chooser_id column '%s' in interaction dataset" % (chooser_id, )
            assert df.index.name is None
            expression_values_df[chooser_id] = df[chooser_id]

    if isinstance(spec.index, pd.MultiIndex):
        exprs = spec.index.get_level_values(simulate.SPEC_EXPRESSION_NAME)
        labels = spec.index.get_level_values(simulate.SPEC_LABEL_NAME)
    else:
        exprs = spec.index
        labels = spec.index

    for expr, label, coefficient in zip(exprs, labels, spec.iloc[:, 0]):
        try:

            # - allow temps of form _od_DIST@od_skim['DIST']
            if expr.startswith('_'):

                target = expr[:expr.index('@')]
                rhs = expr[expr.index('@') + 1:]
                v = to_series(eval(rhs, globals(), locals_d))

                # update locals to allow us to reference previously assigned targets
                locals_d[target] = v

                if trace_eval_results is not None:
                    trace_eval_results[expr] = v[trace_rows]

                # mem.trace_memory_info("eval_interaction_utilities TEMP: %s" % expr)
                continue

            if expr.startswith('@'):
                v = to_series(eval(expr[1:], globals(), locals_d))
            else:
                v = df.eval(expr)

            if check_for_variability and v.std() == 0:
                logger.info("%s: no variability (%s) in: %s" % (trace_label, v.iloc[0], expr))
                no_variability += 1

            # FIXME - how likely is this to happen? Not sure it is really a problem?
            if check_for_variability and np.count_nonzero(v.isnull().values) > 0:
                logger.info("%s: missing values in: %s" % (trace_label, expr))
                has_missing_vals += 1

            if estimator:
                # insert raw values in case we modified the expression_values_df index
                expression_values_df.insert(loc=len(expression_values_df.columns), column=label,
                                            value=v.values if isinstance(v, pd.Series) else v)

            utilities.utility += (v * coefficient).astype('float')

            if trace_eval_results is not None:

                # expressions should have been uniquified when spec was read
                # (though we could do it here if need be...)
                # expr = assign.uniquify_key(trace_eval_results, expr, template="{} # ({})")
                assert expr not in trace_eval_results

                trace_eval_results[expr] = v[trace_rows]
                k = 'partial utility (coefficient = %s) for %s' % (coefficient, expr)
                trace_eval_results[k] = v[trace_rows] * coefficient

        except Exception as err:
            logger.exception(f"{trace_label} - {type(err).__name__} ({str(err)}) evaluating: {str(expr)}")
            raise err

        # mem.trace_memory_info("eval_interaction_utilities: %s" % expr)

    if estimator:
        estimator.log("eval_interaction_utilities write_interaction_expression_values %s" % trace_label)
        estimator.write_interaction_expression_values(expression_values_df)
        del expression_values_df

    if no_variability > 0:
        logger.warning("%s: %s columns have no variability" % (trace_label, no_variability))

    if has_missing_vals > 0:
        logger.warning("%s: %s columns have missing values" % (trace_label, has_missing_vals))

    if trace_eval_results is not None:
        trace_eval_results['total utility'] = utilities.utility[trace_rows]

        trace_eval_results = pd.DataFrame.from_dict(trace_eval_results)
        trace_eval_results.index = df[trace_rows].index

        # add df columns to trace_results
        trace_eval_results = pd.concat([df[trace_rows], trace_eval_results], axis=1)

    return utilities, trace_eval_results
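# A small self-contained sketch of the single-column-spec idea described in the
# docstring above (toy column names and coefficients, not ActivitySim's actual API).
import pandas as pd

# toy interaction dataset: 2 choosers x 2 alternatives = 4 rows,
# with chooser and alternative columns combined on each row
df_toy = pd.DataFrame({
    'alt_id': [0, 1, 0, 1],
    'income': [30, 30, 80, 80],       # chooser attribute, repeated per alternative
    'dist':   [1.0, 5.0, 1.0, 5.0],   # alternative attribute
})

# single-column spec: one row per expression, one utility coefficient each
spec_toy = pd.DataFrame({'coefficient': [-0.1, 0.002]},
                        index=['dist', 'income * dist'])

utilities_toy = pd.DataFrame({'utility': 0.0}, index=df_toy.index)
for expr, coefficient in zip(spec_toy.index, spec_toy['coefficient']):
    v = df_toy.eval(expr)                            # evaluate the expression row-wise
    utilities_toy['utility'] += v * coefficient      # accumulate the partial utility

print(utilities_toy)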
Exemplo n.º 31
0
    if 'and' in condition_required.keys():
        cond_2 = condition_required['and']
        if cond_2[0].replace('-','').isdigit():
            cond_2.reverse()
            connection = cond_2[1] 
            if cond_2[1] == '<':
                connection = '>'
            if cond_2[1] == '>':
                connection = '<'
            if cond_2[1] == '<=':
                connection = '>='
            if cond_2[1] == '>=':
                connection = '<='

            cond_2[1] = connection
        table_2 = {cond_2.index(x): x.split('.')[0] for x in cond_2 if '.' in x}
        cond_2 = {cond_2.index(x): x.split('.')[-1] for x in cond_2}
        connection = 'and'
    elif 'or' in condition_required.keys():
        cond_2 = condition_required['or']
        if cond_2[0].replace('-','').isdigit():
            cond_2.reverse()
            connection = cond_2[1] 
            if cond_2[1] == '<':
                connection = '>'
            if cond_2[1] == '>':
                connection = '<'
            if cond_2[1] == '<=':
                connection = '>='
            if cond_2[1] == '>=':
                connection = '<='