Example #1
 def convertToMatrix(self, filename):
     negOrder = -1 * (self.order)
     set1 = OrderedSet()
     for k, v in self.d.items():
         for x, y in v.items():             
             # s = x[negOrder:]
             toState =  k 
             fromState = x
             set1.add(toState)
             set1.add(fromState)
     self.array = np.zeros(shape=(len(set1), len(set1)))
     for k, v in self.d.items():
         for x, y in v.items():
             summation = 0
             for m, n in self.d.items():
                 for p, q in n.items():
                     if (p == x):
                         summation += q
             # s = x[negOrder:]
             toState = k
             fromState = x
             self.array[set1.index(fromState)][set1.index(toState)] = y / summation
     print(len(set1))
     # dict1 = {}
     # pd.DataFrame(self.array).to_csv("yash.csv")
     # for row in self.array:
     #      for item in set1:
     #          dict1.update({item:row})
     df = pd.DataFrame(self.array)
     df.to_csv(filename)
    def _get_pmx_crossed_sequence(sequence_a: OrderedSet, sequence_b: OrderedSet,
                                  part_from_a: OrderedSet, part_from_b: OrderedSet,
                                  start_index: int, end_index: int) -> List[int]:
        """
        Returns a sequence, which base is from 'sequence_b' and 'part_from_a' is copied in
        """
        new_sequence = list(sequence_b)

        elements_requiring_correction = {}

        uniques_from_b_part = part_from_b - part_from_a
        for unique_from_b_part in uniques_from_b_part:
            index_in_part = part_from_b.index(unique_from_b_part)
            elements_requiring_correction[unique_from_b_part] = part_from_a[index_in_part]

        for elem_from_b, elem_from_a in elements_requiring_correction.items():
            while elem_from_a in part_from_b:
                index_of_elem_from_b = sequence_b.index(elem_from_a)
                elem_from_a = sequence_a[index_of_elem_from_b]

            new_index = sequence_b.index(elem_from_a)
            new_sequence[new_index] = elem_from_b

        new_sequence[start_index:end_index] = part_from_a

        return new_sequence
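
A minimal usage sketch of the PMX helper above (treated here as a free function), assuming only the OrderedSet used throughout these examples, e.g. from the ordered-set package; the parent permutations and crossover bounds below are made up for illustration:

# Hypothetical parents; the crossover segment is positions [2:5).
parent_a = OrderedSet([1, 2, 3, 4, 5, 6, 7, 8])
parent_b = OrderedSet([3, 7, 5, 1, 6, 8, 2, 4])
start, end = 2, 5
child = _get_pmx_crossed_sequence(
    parent_a, parent_b,
    OrderedSet(parent_a[start:end]),   # segment copied from parent A
    OrderedSet(parent_b[start:end]),   # segment it displaces in parent B
    start, end)
# child == [6, 7, 3, 4, 5, 8, 2, 1], a valid permutation drawing from both parents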
Example #3
def getPaper(url):
    try:
        article = quickSoup(url)
        t = article.get_text()
        if "The abstract you requested was not found" in t:
            return ("{},".format(url))
        title = article.find('h1').get_text().replace("\n", "")
        test_list = OrderedSet(t.split("\n"))
        authors = test_list[0].replace(title,
                                       "").replace(" :: SSRN", "").replace(
                                           " by ", "").replace(", ", ":")
        date = [
            line.replace("Last revised: ", "") for line in test_list
            if "Last revised: " in line
        ]
        if date == []:
            date = [
                line.replace("Posted: ", "") for line in test_list
                if "Posted: " in line
            ]
        date = date[0]
        text = t.split("Abstract\n")[1]
        abstract = "\"{}\"".format(
            text.split("Suggested Citation:")[0].replace("\n", ""))

        # get paper statistics
        stats = OrderedSet(
            article.find('div', attrs={
                'class': 'box-paper-statics'
            }).get_text().split("\n"))
        views, dl, rank, refs = "", "", "", ""
        try:
            views = stats[stats.index('Abstract Views') + 1].strip().replace(
                ",", "")
        except:
            pass
        try:
            dl = stats[stats.index('Downloads') + 1].strip().replace(",", "")
        except:
            pass
        try:
            rank = stats[stats.index('rank') + 1].strip().replace(",", "")
        except:
            pass
        try:
            refs = stats[stats.index('References') + 1].strip().replace(
                ",", "")
        except:
            pass
        results = [
            url, "\"{}\"".format(title), abstract, authors, date, views, dl,
            rank, refs
        ]
        return (",".join(results))
    except:
        return ("{},,,,,,,,".format(url))
def build_from_conceptnet_table(filename, orig_index=(), self_loops=True):
    """
    Read a file of tab-separated association data from ConceptNet, such as
    `data/assoc/reduced.csv`. Return a SciPy sparse matrix of the associations,
    and a pandas Index of labels.

    If you specify `orig_index`, then the index of labels will be pre-populated
    with existing labels, and any new labels will get index numbers that are
    higher than the index numbers the existing labels use. This is important
    for producing a sparse matrix that can be used for retrofitting onto an
    existing dense labeled matrix (see retrofit.py).
    """
    mat = SparseMatrixBuilder()

    labels = OrderedSet(orig_index)

    totals = defaultdict(float)
    with open(str(filename), encoding='utf-8') as infile:
        for line in infile:
            concept1, concept2, value_str, dataset, relation = line.strip(
            ).split('\t')

            index1 = labels.add(replace_numbers(concept1))
            index2 = labels.add(replace_numbers(concept2))
            value = float(value_str)

            mat[index1, index2] = value
            mat[index2, index1] = value
            totals[index1] += value
            totals[index2] += value

    # Link nodes to their more general versions
    for label in labels:
        prefixes = list(uri_prefixes(label, 3))
        if len(prefixes) >= 2:
            parent_uri = prefixes[-2]
            if parent_uri in labels:
                index1 = labels.index(label)
                index2 = labels.index(parent_uri)
                mat[index1, index2] = 1
                mat[index2, index1] = 1
                totals[index1] += 1
                totals[index2] += 1

    # add self-loops on the diagonal with equal weight to the rest of the row
    if self_loops:
        for key, value in totals.items():
            mat[key, key] = value

    shape = (len(labels), len(labels))
    index = pd.Index(labels)
    return normalize(mat.tocsr(shape), norm='l1', axis=1), index
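
The `index1 = labels.add(...)` lines above work because OrderedSet.add returns the position of the item, whether it was just appended or was already present. A small sketch of the `orig_index` behaviour described in the docstring (the URIs are invented):

labels = OrderedSet(['/c/en/cat', '/c/en/dog'])   # pre-populated from an existing index
labels.add('/c/en/cat')    # -> 0: existing labels keep their index numbers
labels.add('/c/en/fish')   # -> 2: new labels get higher index numbers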
Example #5
def test_indexing():
    set1 = OrderedSet('abracadabra')
    assert set1[:] == set1
    assert set1.copy() == set1
    assert set1 is set1
    assert set1[:] is not set1
    assert set1.copy() is not set1

    assert set1[[1, 2]] == OrderedSet(['b', 'r'])
    assert set1[1:3] == OrderedSet(['b', 'r'])
    assert set1.index('b') == 1
    assert set1.index(['b', 'r']) == [1, 2]
    with pytest.raises(KeyError):
        set1.index('br')
Example #6
def build_from_conceptnet_table(filename, orig_index=(), self_loops=True):
    """
    Read a file of tab-separated association data from ConceptNet, such as
    `data/assoc/reduced.csv`. Return a SciPy sparse matrix of the associations,
    and a pandas Index of labels.

    If you specify `orig_index`, then the index of labels will be pre-populated
    with existing labels, and any new labels will get index numbers that are
    higher than the index numbers the existing labels use. This is important
    for producing a sparse matrix that can be used for retrofitting onto an
    existing dense labeled matrix (see retrofit.py).
    """
    mat = SparseMatrixBuilder()

    labels = OrderedSet(orig_index)

    totals = defaultdict(float)
    with open(str(filename), encoding='utf-8') as infile:
        for line in infile:
            concept1, concept2, value_str, dataset, relation = line.strip().split('\t')

            index1 = labels.add(replace_numbers(concept1))
            index2 = labels.add(replace_numbers(concept2))
            value = float(value_str)
            mat[index1, index2] = value
            mat[index2, index1] = value
            totals[index1] += value
            totals[index2] += value

    # Link nodes to their more general versions
    for label in labels:
        prefixes = list(uri_prefixes(label, 3))
        if len(prefixes) >= 2:
            parent_uri = prefixes[-2]
            if parent_uri in labels:
                index1 = labels.index(label)
                index2 = labels.index(parent_uri)
                mat[index1, index2] = 1
                mat[index2, index1] = 1
                totals[index1] += 1
                totals[index2] += 1

    # add self-loops on the diagonal with equal weight to the rest of the row
    if self_loops:
        for key, value in totals.items():
            mat[key, key] = value

    shape = (len(labels), len(labels))
    index = pd.Index(labels)
    return mat.tocsr(shape), index
Example #7
def test_indexing():
    set1 = OrderedSet('abracadabra')
    assert set1[:] == set1
    assert set1.copy() == set1
    assert set1 is set1
    assert set1[:] is not set1
    assert set1.copy() is not set1

    assert set1[[1, 2]] == OrderedSet(['b', 'r'])
    assert set1[1:3] == OrderedSet(['b', 'r'])
    assert set1.index('b') == 1
    assert set1.index(['b', 'r']) == [1, 2]
    with pytest.raises(KeyError):
        set1.index('br')
Example #8
def test_indexing():
    set1 = OrderedSet('abracadabra')
    eq_(set1[:], set1)
    eq_(set1.copy(), set1)
    assert set1[:] is set1
    assert set1.copy() is not set1

    eq_(set1[[1, 2]], OrderedSet(['b', 'r']))
    eq_(set1[1:3], OrderedSet(['b', 'r']))
    eq_(set1.index('b'), 1)
    eq_(set1.index(('b', 'r')), [1, 2])
    try:
        set1.index('br')
        assert False, "Looking up a nonexistent key should be a KeyError"
    except KeyError:
        pass
Example #9
def test_indexing():
    set1 = OrderedSet('abracadabra')
    eq_(set1[:], set1)
    eq_(set1.copy(), set1)
    assert set1[:] is set1
    assert set1.copy() is not set1

    eq_(set1[[1, 2]], OrderedSet(['b', 'r']))
    eq_(set1[1:3], OrderedSet(['b', 'r']))
    eq_(set1.index('b'), 1)
    eq_(set1.index(('b', 'r')), [1, 2])
    try:
        set1.index('br')
        assert False, "Looking up a nonexistent key should be a KeyError"
    except KeyError:
        pass
Example #10
class SkeletonReducer:
    def __init__(self, sparse_skel: SkeletonType):
        self.reduced_to_sparse = OrderedSet(
            sorted(idx for line in sparse_skel.lines_flat for idx in line))
        self.sparse_skel = sparse_skel
        self.dense_skel = SkeletonType(
            map_idxs(sparse_skel.lines,
                     lambda x: self.reduced_to_sparse.index(x)))

    def reduce_arr(self, arr):
        return arr[self.reduced_to_sparse]
Example #11
def make_sparse_assoc(freq_path, parallel_text_path, output_path, languages, vocab_size=100000):
    print("Building vocab")
    vocab = OrderedSet()
    languages.sort()
    for language in languages:
        print('\t{}'.format(language))
        language_freq_path = freq_path / '{}.txt'.format(language)
        with language_freq_path.open(encoding='utf-8') as freq_file:
            for i, line in enumerate(freq_file):
                if i >= vocab_size:
                    break
                word, _rest = line.split('\t')
                uri = make_short_uri(language, word)
                vocab.add(uri)

    vocab_path = output_path / 'vocab.txt'
    with vocab_path.open('w', encoding='utf-8') as vocab_out:
        for uri in vocab:
            print(uri, file=vocab_out)

    coords_path = output_path / 'coords.dat'
    with (output_path / 'coords.dat').open('wb') as coords_out:
        for lang1, lang2 in itertools.combinations(languages, 2):
            print(lang1, lang2)
            parallel_path = parallel_text_path / '{}-{}.txt'.format(lang1, lang2)
            with parallel_path.open(encoding='utf-8') as parallel_file:
                for i, line in enumerate(parallel_file):
                    if i % 100000 == 0:
                        print('\t{}'.format(i))
                    text1, text2 = line.rstrip('\n').split('\t')
                    words1 = [make_short_uri(lang1, word) for word in text1.split()]
                    words2 = [make_short_uri(lang2, word) for word in text2.split()]
                    words = [uri for uri in (words1 + words2) if uri in vocab]
                    for word1 in words:
                        idx1 = vocab.index(word1)
                        for word2 in words:
                            idx2 = vocab.index(word2)
                            coord_bytes = struct.pack('<ii', idx1, idx2)
                            coords_out.write(coord_bytes)
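
Each co-occurrence above is written as a pair of little-endian 32-bit integers. A quick sketch of how one such '<ii' record round-trips, using only the standard-library struct module:

import struct

packed = struct.pack('<ii', 3, 7)              # 8 bytes: two int32 vocabulary indices
assert struct.unpack('<ii', packed) == (3, 7)  # and back again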
Example #12
    def _read_SMILES(self, input_file) -> OrderedSet:
        """
        Reads a SMILES file. Returns an ordered set of ReactionContainer objects that passed the standardization protocol.
        :param input_file: str
        :return: OrderedSet
        """
        data = OrderedSet()
        self.logger.info('Start..')
        with SMILESRead(input_file, ignore=True, store_log=True, remap=self._ignore_mapping, header=True) as ifile, \
                open(input_file) as meta_searcher:
            id_tag_position = meta_searcher.readline().strip().split().index(
                self._id_tag)
            if id_tag_position is None or id_tag_position == 0:
                self.logger.critical(
                    f'No reaction ID tag was found in the header!')
                raise ValueError(
                    f'No reaction ID tag was found in the header!')
            for reaction in ifile._data:
                if isinstance(reaction, tuple):
                    meta_searcher.seek(reaction.position)
                    line = meta_searcher.readline().strip().split()
                    if len(line) <= id_tag_position:
                        self.logger.critical(
                            f'No reaction ID tag was found in line {reaction.number}!'
                        )
                        raise ValueError(
                            f'No reaction ID tag was found in line {reaction.number}!'
                        )
                    r_id = line[id_tag_position]
                    self.logger.critical(
                        f'Reaction {r_id}: Parser has returned an error message\n{reaction.log}'
                    )
                    continue

                standardized_reaction = self.standardize(reaction)
                if standardized_reaction:
                    if standardized_reaction not in data:
                        data.add(standardized_reaction)
                    else:
                        i = data.index(standardized_reaction)
                        if 'Extraction_IDs' not in data[i].meta:
                            data[i].meta['Extraction_IDs'] = ''
                        data[i].meta['Extraction_IDs'] = ','.join(
                            data[i].meta['Extraction_IDs'].split(',') +
                            [reaction.meta[self._id_tag]])
                        self.logger.info(
                            'Reaction {0} is a duplicate of the reaction {1}..'
                            .format(reaction.meta[self._id_tag],
                                    data[i].meta[self._id_tag]))
        return data
Example #13
def test_remove():
    set1 = OrderedSet('abracadabra')

    set1.remove('a')
    set1.remove('b')

    assert set1 == OrderedSet('rcd')
    assert set1[0] == 'r'
    assert set1[1] == 'c'
    assert set1[2] == 'd'

    assert set1.index('r') == 0
    assert set1.index('c') == 1
    assert set1.index('d') == 2

    assert 'a' not in set1
    assert 'b' not in set1
    assert 'r' in set1

    # Make sure we can .discard() something that's already gone, plus
    # something that was never there
    set1.discard('a')
    set1.discard('a')
Example #14
def test_remove():
    set1 = OrderedSet('abracadabra')

    set1.remove('a')
    set1.remove('b')

    assert set1 == OrderedSet('rcd')
    assert set1[0] == 'r'
    assert set1[1] == 'c'
    assert set1[2] == 'd'

    assert set1.index('r') == 0
    assert set1.index('c') == 1
    assert set1.index('d') == 2

    assert 'a' not in set1
    assert 'b' not in set1
    assert 'r' in set1

    # Make sure we can .discard() something that's already gone, plus
    # something that was never there
    set1.discard('a')
    set1.discard('a')
Example #15
 def _read_RDF(self, input_file) -> OrderedSet:
     """
      Reads an RDF file. Returns an ordered set of ReactionContainer objects that passed the standardization protocol.
     :param input_file: str
     :return: OrderedSet
     """
     data = OrderedSet()
     self.logger.info('Start..')
     with RDFRead(input_file, ignore=self._ignore_mapping, store_log=True, remap=self._ignore_mapping) as ifile, \
             open(input_file) as meta_searcher:
         for reaction in ifile._data:
             if isinstance(reaction, tuple):
                 meta_searcher.seek(reaction.position)
                 flag = False
                 for line in meta_searcher:
                     if flag and '$RFMT' in line:
                         self.logger.critical(
                             f'Reaction id extraction problem rised for the reaction '
                             f'#{reaction.number + 1}: a reaction id was expected but $RFMT line '
                             f'was found!')
                     if flag:
                         self.logger.critical(
                             f'Reaction {line.strip().split()[1]}: Parser has returned an error '
                             f'message\n{reaction.log}')
                         break
                     elif '$RFMT' in line:
                         self.logger.critical(
                             f'Reaction #{reaction.number + 1} has no reaction id!'
                         )
                     elif f'$DTYPE {self._id_tag}' in line:
                         flag = True
                 continue
             standardized_reaction = self.standardize(reaction)
             if standardized_reaction:
                 if standardized_reaction not in data:
                     data.add(standardized_reaction)
                 else:
                     i = data.index(standardized_reaction)
                     if 'Extraction_IDs' not in data[i].meta:
                         data[i].meta['Extraction_IDs'] = ''
                     data[i].meta['Extraction_IDs'] = ','.join(
                         data[i].meta['Extraction_IDs'].split(',') +
                         [reaction.meta[self._id_tag]])
                     self.logger.info(
                         'Reaction {0} is a duplicate of the reaction {1}..'
                         .format(reaction.meta[self._id_tag],
                                 data[i].meta[self._id_tag]))
     return data
def standardize_vecs(labels, vecs, merge_mode='weighted'):
    standardized_labels = OrderedSet()
    standardized_vecs = []

    for index, (label, vec) in enumerate(zip(labels, vecs)):
        label = standardize(label)

        if merge_mode == 'weighted':
            vec /= (index + 1)

        if label not in standardized_labels:
            standardized_labels.add(label)
            standardized_vecs.append(vec)
        else:
            if merge_mode != 'first':
                index = standardized_labels.index(label)
                standardized_vecs[index] += vec

    return list(standardized_labels), np.array(standardized_vecs)
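
standardize_vecs leans on the OrderedSet both to deduplicate labels and to locate the slot a duplicate's vector should be merged into. The same pattern in isolation, with plain floats instead of vectors and no conceptnet standardize() call:

labels, values = OrderedSet(), []
for label, value in [('cat', 1.0), ('dog', 2.0), ('cat', 0.5)]:
    if label not in labels:
        labels.add(label)
        values.append(value)
    else:
        values[labels.index(label)] += value   # accumulate into the first occurrence
# list(labels) == ['cat', 'dog'] and values == [1.5, 2.0]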
Example #17
File: block.py Project: iJasonne/pslplay
    def expr_to_matrix(self, expr, row_dict, constr_query, constr_query_symbols):
        """
        First normalizes a given expression with a visitor pattern, then queries the knowledge base for the given query
        and assigns the results to their respective row and column indices defined in the row and column dictionaries.

        :param expr: The expression to be grounded
        :type expr: Sympy Expression| RLPSum
        :param row_dict: An OrderedSet containing the row indices for the lp matrix for the given expression
        :type row_dict: OrderedSet
        :param constr_query: The query originating from a given constraint
        :type constr_query: Sympy Expression | RLPSum
        :param constr_query_symbols: A Set containing the query symbols for the given constraint query
        :type constr_query_symbols: FiniteSet
        :return: A dictionary containing a unique name for the variable and the results returned from the knowledge base.
        """
        expr = Normalizer(expr).result

        if not isinstance(expr, Add):
            summands = [expr]
        else:
            summands = expr.args

        result = {}
        log.debug("\nSummands: %s", str(summands))

        for summand in summands:
            log.debug("\n->summand: %s", str(summand))

            if isinstance(summand, RlpSum):
                summand_query = summand.query
                summand_query_symbols = summand.query_symbols
                coef_query, coef_expr, variable = coefficient_to_query(summand.args[2])
            else:
                summand_query = True
                summand_query_symbols = EmptySet()
                coef_query, coef_expr, variable = coefficient_to_query(summand)

            query_symbols = OrderedSet(constr_query_symbols + summand_query_symbols)

            query = constr_query & summand_query & coef_query

            answers = self.logkb.ask(query_symbols, query, coef_expr)
            variable_qs_indices = []
            if variable is not None:
                variable_qs_indices = [query_symbols.index(arg) for arg in variable.args if isinstance(arg, SubSymbol)]
            constr_qs_indices = [query_symbols.index(symbol) for symbol in constr_query_symbols]

            variable_class = variable.__class__
            col_dict = self.col_dicts.get(variable_class, OrderedSet())
            self.col_dicts[variable_class] = col_dict

            # If the query yields no results we don't have to add anything to the matrix
            if len(answers) == 0:
                continue

            expr_index = len(answers[0]) - 1
            sparse_data = []
            for answer in answers:
                column_record = []

                # use only subsymbols when they occur, otherwise constants
                qs_iterator = iter(variable_qs_indices)
                if variable is not None:
                    for arg in variable.args:
                        if isinstance(arg, SubSymbol):
                            column_record.append(answer[qs_iterator.next()])
                        else:
                            column_record.append(arg)

                col_dict_index = col_dict.add(tuple(column_record))
                row_dict_index = row_dict.add(tuple(answer[i] for i in constr_qs_indices))

                sparse_data.append([np.float(answer[expr_index]), row_dict_index, col_dict_index])

            sparse_data = np.array(sparse_data)
            summand_block = sp.sparse.coo_matrix((sparse_data[:, 0], (sparse_data[:, 1], sparse_data[:, 2]))).todok()

            if variable_class in result:
                shape = (len(row_dict), len(col_dict))
                result[variable_class].resize(shape)
                summand_block.resize(shape)
                result[variable_class] += summand_block
            else:
                result[variable_class] = summand_block

        return result
Example #18
def test_tuples():
    set1 = OrderedSet()
    tup = ('tuple', 1)
    set1.add(tup)
    assert set1.index(tup) == 0
    assert set1[0] == tup
Example #19
def test_tuples():
    set1 = OrderedSet()
    tup = ('tuple', 1)
    set1.add(tup)
    eq_(set1.index(tup), 0)
    eq_(set1[0], tup)
Example #20
class CategoricalDescriptor(Descriptor):
    """A |Descriptor| used to extract a categorical property from a collection of |Record|.

    Args:
        name (str): Name of the |Record| property to describe.
        fetch_fn (Callable): Optional; defaults to identity. A function applied to the |Record| property
            before it is counted in a category.

    Attributes:
        name (str): Name of the |Record| property to describe.

    """
    def __init__(self, name, fetch_fn=None):
        super(CategoricalDescriptor, self).__init__(name)
        self._categories = OrderedSet()
        self._fetch_fn = fetch_fn or identity

    def update(self, *record_collections):
        """Update the set of known categories from |Record| property :attr:`name` value.

        Args:
            *record_collections (|RecordCollection|): |RecordCollection| of which |Record| will be used to update
                set of known categories.

        """
        records = (record for record_collection in record_collections
                   for record in record_collection)
        try:
            for record in records:
                self._categories.add(
                    str(self._fetch_fn(getattr(record, self.name))))
        except AttributeError:
            raise ValueError(
                'Invalid record property name: {} was not found in record.'.
                format(self.name))

    def compute(self, *record_collections):
        """Construct new |RecordCollection| where each enclosed |Record| is added a category number as a property.

        Args:
            *record_collections (|RecordCollection|): |RecordCollection| used to construct new |RecordCollection| with
                described |Record|.

        Returns:
            (|RecordCollection|, ): A described |RecordCollection| tuple.

        """
        for record_collection in record_collections:
            for record in record_collection:
                try:
                    record.properties[self.property_name] = \
                        self._categories.index(str(self._fetch_fn(getattr(record, self.name)))) + 0.5
                except AttributeError:
                    raise ValueError(
                        'Invalid record property name: {} was not found in record.'
                        .format(self.name))
                except KeyError:
                    raise ValueError(
                        'Invalid record property value: '
                        '{} is out of known property range of values.'.format(
                            getattr(record, self.name)))
        return record_collections

    def reset(self):
        """Reset |CategoricalDescriptor| set of known categories to factory values."""
        self._categories = OrderedSet()

    def _make_interface(self):
        return {
            'type': 'categorical',
            'schema': {
                category: index + 0.5
                for index, category in enumerate(self._categories)
            }
        }
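
The descriptor's category codes come straight from insertion order: _categories is an OrderedSet, so each distinct value keeps the position it was first seen at, and _make_interface maps that position to index + 0.5. The idea in isolation (the example values are made up):

categories = OrderedSet()
for value in ['red', 'green', 'red', 'blue']:
    categories.add(str(value))   # duplicates keep their first position
schema = {category: index + 0.5 for index, category in enumerate(categories)}
# schema == {'red': 0.5, 'green': 1.5, 'blue': 2.5}
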
class BundleAdjuster:
    '''
  Bundle Adjustment class that takes in matches (with initial estimates for 
  rotation and focal length of each camera) and minimises the reprojection
  error for all matches' keypoints.
  '''

    # w.r.t. K
    FOCAL_DERIVATIVE = np.array([[1, 0, 0], [0, 1, 0], [0, 0, 0]])

    # w.r.t. K
    PPX_DERIVATIVE = np.array([[0, 0, 1], [0, 0, 0], [0, 0, 0]])

    # w.r.t. K
    PPY_DERIVATIVE = np.array([[0, 0, 0], [0, 0, 1], [0, 0, 0]])

    def __init__(self):
        print('BundleAdjuster initialised')
        self._matches = []
        self._match_count = []
        self._cameras = OrderedSet()

    def matches(self):
        return self._matches

    def added_cameras(self):
        return self._cameras

    def add(self, match):
        '''
    Add a match to the bundle adjuster
    '''
        num_pointwise_matches = sum(
            len(match.inliers) for match in self._matches)
        self._match_count.append(num_pointwise_matches)

        self._matches.append(match)
        for cam in match.cams():
            self._cameras.add(cam)

        print(f'Added match {match}')

    def run(self):
        '''
    Run the bundle adjuster on the current matches to find optimal camera parameters
    '''
        if (len(self._matches) < 1):
            raise ValueError(
                'At least one match must be added before bundle adjustment is run'
            )

        print(f'Running bundle adjustment...')

        initial_state = State()
        initial_state.set_initial_cameras(self._cameras)

        initial_residuals = self._projection_errors(initial_state)
        initial_error = math.sqrt(np.mean(initial_residuals**2))

        print(f'Initial error: {initial_error}')

        print('Initial params')
        for param in initial_state.params:
            print(param)

        itr_count = 0
        non_decrease_count = 0
        best_state = initial_state
        best_residuals = initial_residuals
        best_error = initial_error

        while (itr_count < MAX_ITR):
            # print(f'[{itr_count}] Curr state: \n')
            # for (i, el) in enumerate(best_state.params):
            #   print(f'\t[{i}]: {el}')

            J, JtJ = self._calculate_jacobian(best_state)
            param_update = self._get_next_update(J, JtJ, best_residuals)
            next_state = best_state.updatedState(param_update)

            next_residuals = self._projection_errors(next_state)
            next_error_val = math.sqrt(np.mean(next_residuals**2))
            print(f'Next error: {next_error_val}')
            # return

            if (next_error_val >= best_error - 1e-3):
                non_decrease_count += 1
            else:
                print('Updating state to new best state')
                non_decrease_count = 0
                best_error = next_error_val

                # for i in range(len(best_state.params)):
                #   print(f'{best_state.params[i]} -> {next_state.params[i]}')

                best_state = next_state
                best_residuals = next_residuals

            if (non_decrease_count > 5):
                break

        print(f'BEST ERROR {best_error}')

        # Update actual camera object params
        new_cameras = best_state.cameras
        for i in range(len(new_cameras)):
            # print(f'{self._cameras[i].R} = {new_cameras[i].R}')
            print(f'Final focal: {new_cameras[i].focal}')
            self._cameras[i].focal = new_cameras[i].focal
            self._cameras[i].ppx = new_cameras[i].ppx
            self._cameras[i].ppy = new_cameras[i].ppy
            self._cameras[i].R = new_cameras[i].R

    def _cross_product_matrix(self, x, y, z):
        return np.array([[0, -z, y], [z, 0, -x], [-y, x, 0]], dtype=np.float64)

    def _dR_dvi(self, rotation_matrix, x, y, z):
        '''
    The derivative of the rotation with respect to each rotation parameter
    Returns 3 matrices (dR/dx, dR/dy, dR/dz)
    Calculated using https://arxiv.org/pdf/1312.0788.pdf
    '''
        ssq_params = x * x + y * y + z * z
        if (ssq_params < 1e-14):
            return np.array([
                self._cross_product_matrix(1, 0, 0),
                self._cross_product_matrix(0, 1, 0),
                self._cross_product_matrix(0, 0, 1)
            ])

        cross_product_matrix = self._cross_product_matrix(x, y, z)
        ret = [
            cross_product_matrix, cross_product_matrix, cross_product_matrix
        ]

        ret[0] = ret[0] * x
        ret[1] = ret[1] * y
        ret[2] = ret[2] * z

        I_minus_R = np.identity(3) - rotation_matrix

        for i in range(3):
            x1, y1, z1 = np.cross(np.array([x, y, z]), I_minus_R[:, i])
            ret[i] += self._cross_product_matrix(x1, y1, z1)
            ret[i] = np.multiply(ret[i], 1 / ssq_params)
            ret[i] = ret[i] @ rotation_matrix

        return ret

    def _drdv(self, dhdv, homo, hz_inv, hz_sqr_inv):
        return np.array([
            -dhdv[0] * hz_inv + dhdv[2] * homo[0] * hz_sqr_inv,
            -dhdv[1] * hz_inv + dhdv[2] * homo[1] * hz_sqr_inv
        ],
                        dtype=np.float64)

    def _homogeneous_coordinate_2d(self, coordinate):
        '''
    Convert Cartesian coordinate to homogeneous coordinate
    '''
        return np.append(coordinate, [1])

    def _trans(self, transform, coordinate):
        if (len(coordinate) == 2):
            return self._trans(transform,
                               self._homogeneous_coordinate_2d(coordinate))
        elif (len(coordinate) == 3):
            return transform @ coordinate

    def _calculate_jacobian(self, state):
        with open('ba_test_data.txt', 'w') as f:

            params = state.params
            cameras = state.cameras

            f.write('Params:\n')
            for (i, param) in enumerate(params):
                f.write(f'[{i}] {param}\n')

            f.write('\nCameras:\n')
            for (i, camera) in enumerate(cameras):
                f.write(
                    f'[{i}] Focal: {cameras[i].focal}, R: {cameras[i].R}\n')

            num_cams = len(cameras)
            num_pointwise_matches = sum(
                len(match.inliers) for match in self._matches)

            J = np.zeros((PARAMS_PER_POINT_MATCH * num_pointwise_matches,
                          PARAMS_PER_CAMERA * num_cams),
                         dtype=np.float64)
            JtJ = np.zeros(
                (PARAMS_PER_CAMERA * num_cams, PARAMS_PER_CAMERA * num_cams),
                dtype=np.float64)

            all_dRdvi = []
            for i in range(len(cameras)):
                param_i = i * PARAMS_PER_CAMERA
                x, y, z = params[param_i + 3:param_i + 6]
                dRdvi = self._dR_dvi(cameras[i].R, x, y, z)
                all_dRdvi.append(dRdvi)

            for (i, match) in enumerate(self._matches):
                # print(f'------------\n')
                # print(f'Loop itr: {i}')
                match_count_idx = self._match_count[i] * 2

                cam_to_idx = self._cameras.index(match.cam_to)
                cam_from_idx = self._cameras.index(match.cam_from)

                cam_to = cameras[cam_to_idx]
                cam_from = cameras[cam_from_idx]

                # print(f'from.R: {cam_from.R}')
                # print(f'to.R: {cam_to.R}')

                params_index_from = cam_from_idx * PARAMS_PER_CAMERA
                params_index_to = cam_to_idx * PARAMS_PER_CAMERA

                # print(f'params_index_from: {params_index_from}')
                # print(f'params_index_to: {params_index_to}')

                from_K = cam_from.K
                to_K_inv = np.linalg.pinv(cam_to.K)
                to_R_inv = cam_to.R.T
                from_R = cam_from.R
                d_R_from_vi = all_dRdvi[cam_from_idx]
                d_R_to_vi = np.copy(all_dRdvi[cam_to_idx])
                d_R_to_vi_T = [m.T for m in d_R_to_vi]

                H_to_to_from = (from_K @ from_R) @ (to_R_inv @ to_K_inv)
                # print(f'H_to_to_from: {H_to_to_from}')

                for (pair_index, pair) in enumerate(match.inliers):
                    to_coordinate = pair[1]
                    homo = self._trans(H_to_to_from, to_coordinate)
                    hz_sqr_inv = 1 / (homo[2]**2)
                    hz_inv = 1 / homo[2]

                    d_from = np.zeros(
                        (PARAMS_PER_CAMERA, PARAMS_PER_POINT_MATCH))
                    d_to = np.zeros(
                        (PARAMS_PER_CAMERA, PARAMS_PER_POINT_MATCH))

                    m = from_R @ to_R_inv @ to_K_inv
                    dot_u2 = self._trans(
                        m, to_coordinate
                    )  #m @ self._homogeneous_coordinate_2d(to_coordinate)

                    d_from[0] = self._drdv(
                        self._trans(self.FOCAL_DERIVATIVE, dot_u2), homo,
                        hz_inv, hz_sqr_inv)
                    d_from[1] = self._drdv(
                        self._trans(self.PPX_DERIVATIVE, dot_u2), homo, hz_inv,
                        hz_sqr_inv)
                    d_from[2] = self._drdv(
                        self._trans(self.PPY_DERIVATIVE, dot_u2), homo, hz_inv,
                        hz_sqr_inv)

                    dot_u2 = self._trans((to_R_inv @ to_K_inv), to_coordinate)

                    f.write(f'dot_u2: {dot_u2}\n')
                    f.write(f'from_K: {from_K}\n')
                    f.write(f'd_R_from_vi[0]: {d_R_from_vi[0]}\n')
                    f.write(f'homo: {homo}\n')
                    f.write(f'hz_inv: {hz_inv}\n')
                    f.write(f'hz_sqr_inv: {hz_sqr_inv}\n')

                    d_from[3] = self._drdv(
                        self._trans((from_K @ d_R_from_vi[0]), dot_u2), homo,
                        hz_inv, hz_sqr_inv)
                    d_from[4] = self._drdv(
                        self._trans((from_K @ d_R_from_vi[1]), dot_u2), homo,
                        hz_inv, hz_sqr_inv)
                    d_from[5] = self._drdv(
                        self._trans((from_K @ d_R_from_vi[2]), dot_u2), homo,
                        hz_inv, hz_sqr_inv)

                    m = from_K @ from_R @ to_R_inv @ to_K_inv
                    dot_u2 = self._trans(to_K_inv, to_coordinate) * -1

                    # print(f'dot_u2: {dot_u2}')

                    d_to[0] = self._drdv(
                        self._trans((m @ self.FOCAL_DERIVATIVE), dot_u2), homo,
                        hz_inv, hz_sqr_inv)
                    d_to[1] = self._drdv(
                        self._trans((m @ self.PPX_DERIVATIVE), dot_u2), homo,
                        hz_inv, hz_sqr_inv)
                    d_to[2] = self._drdv(
                        self._trans((m @ self.PPY_DERIVATIVE), dot_u2), homo,
                        hz_inv, hz_sqr_inv)

                    # d_to[1], d_to[2] = d_to[2], d_to[1]

                    m = from_K @ from_R
                    dot_u2 = self._trans(to_K_inv, to_coordinate)

                    d_to[3] = self._drdv(
                        self._trans((m @ d_R_to_vi_T[0]), dot_u2), homo,
                        hz_inv, hz_sqr_inv)
                    d_to[4] = self._drdv(
                        self._trans((m @ d_R_to_vi_T[1]), dot_u2), homo,
                        hz_inv, hz_sqr_inv)
                    d_to[5] = self._drdv(
                        self._trans((m @ d_R_to_vi_T[2]), dot_u2), homo,
                        hz_inv, hz_sqr_inv)

                    # print(f'dfrom: {d_from}')
                    # print(f'dto: {d_to}')

                    f.write(f'dfrom: {d_from}\n')
                    f.write(f'dto: {d_to}\n')

                    # if (pair_index == 0):
                    #     print(f'dfrom: {d_from}')
                    #     print(f'dto: {d_to}')

                    for param_idx in range(PARAMS_PER_CAMERA):
                        # IS pair_index CORRECT HERE?
                        J[match_count_idx,
                          params_index_from + param_idx] = d_from[param_idx][0]
                        # print(f'({match_count_idx}, {params_index_from + param_idx}) dfrom[{param_idx}].x: {d_from[param_idx][0]}')
                        J[match_count_idx,
                          params_index_to + param_idx] = d_to[param_idx][0]
                        # print(f'({match_count_idx}, {params_index_to + param_idx}) dto[{param_idx}].x: {d_to[param_idx][0]}')
                        J[match_count_idx + 1,
                          params_index_from + param_idx] = d_from[param_idx][1]
                        # print(f'({match_count_idx+1}, {params_index_from + param_idx}) dfrom[{param_idx}].y: {d_from[param_idx][1]}')
                        J[match_count_idx + 1,
                          params_index_to + param_idx] = d_to[param_idx][1]
                        # print(f'({match_count_idx+1}, {params_index_to + param_idx}) dto[{param_idx}].y: {d_to[param_idx][1]}')

                        f.write(
                            f'({match_count_idx}, {params_index_from + param_idx}) dfrom[{param_idx}].x: {d_from[param_idx][0]}\n'
                        )
                        f.write(
                            f'({match_count_idx}, {params_index_to + param_idx}) dto[{param_idx}].x: {d_to[param_idx][0]}\n'
                        )
                        f.write(
                            f'({match_count_idx+1}, {params_index_from + param_idx}) dfrom[{param_idx}].y: {d_from[param_idx][1]}\n'
                        )
                        f.write(
                            f'({match_count_idx+1}, {params_index_to + param_idx}) dto[{param_idx}].y: {d_to[param_idx][1]}\n'
                        )

                    for param_idx_i in range(PARAMS_PER_CAMERA):
                        for param_idx_j in range(PARAMS_PER_CAMERA):
                            # f.write(f'[l1] index_from: {params_index_from}, index_to: {params_index_to}, i: {param_idx_i}, j: {param_idx_j}\n')
                            i1 = params_index_from + param_idx_i
                            i2 = params_index_to + param_idx_j
                            val = d_from[param_idx_i] @ d_to[param_idx_j]
                            JtJ[i1][i2] += val
                            JtJ[i2][i1] += val

                            f.write(f'JtJ[{i1}][{i2}] += {val}\n')
                            f.write(f'JtJ[{i2}][{i1}] += {val}\n')

                    for param_idx_i in range(PARAMS_PER_CAMERA):
                        for param_idx_j in range(param_idx_i,
                                                 PARAMS_PER_CAMERA):
                            # f.write(f'[l2] index_from: {params_index_from}, index_to: {params_index_to}, i: {param_idx_i}, j: {param_idx_j}\n')
                            i1 = params_index_from + param_idx_i
                            i2 = params_index_from + param_idx_j
                            val = d_from[param_idx_i] @ d_from[param_idx_j]
                            JtJ[i1][i2] += val
                            f.write(f'JtJ[{i1}][{i2}] += {val}\n')
                            if (param_idx_i != param_idx_j):
                                JtJ[i2][i1] += val
                                f.write(f'JtJ[{i2}][{i1}] += {val}\n')

                            i1 = params_index_to + param_idx_i
                            i2 = params_index_to + param_idx_j
                            val = d_to[param_idx_i] @ d_to[param_idx_j]
                            JtJ[i1][i2] += val
                            f.write(f'JtJ[{i1}][{i2}] += {val}\n')
                            if (param_idx_i != param_idx_j):
                                JtJ[i2][i1] += val
                                f.write(f'JtJ[{i2}][{i1}] += {val}\n')

                    match_count_idx += 2

            return J, JtJ

    def _transform_2d(self, H, coordinate):
        '''
    Converts cartesian coordinate to homogeneous
    Project coordinate with H
    Convert back to cartesian
    '''
        homogeneous_coordinate = self._homogeneous_coordinate_2d(coordinate)
        p = H @ homogeneous_coordinate
        return np.array([p[0] / p[2], p[1] / p[2]])

    def _projection_errors(self, state):
        current_cameras = state.cameras

        num_pointwise_matches = sum(
            len(match.inliers) for match in self._matches)
        error = np.zeros((num_pointwise_matches * PARAMS_PER_POINT_MATCH))

        count = 0
        for match in self._matches:
            cam_from = current_cameras[self._cameras.index(match.cam_from)]
            cam_to = current_cameras[self._cameras.index(match.cam_to)]
            from_K = cam_from.K
            from_R = cam_from.R
            to_K_inv = np.linalg.pinv(cam_to.K)
            to_R_inv = cam_to.R.T
            H_to_to_from = (from_K @ from_R) @ (to_R_inv @ to_K_inv)

            start = count
            for pair in match.inliers:
                from_coordinate = pair[0]
                to_coordinate = pair[1]

                transformed = self._transform_2d(H_to_to_from, to_coordinate)
                error[count] = from_coordinate[0] - transformed[0]
                error[count + 1] = from_coordinate[1] - transformed[1]

                count += 2

            print(
                f'Match from_{match.cam_from.image.filename} to_{match.cam_to.image.filename} error: {math.sqrt(np.mean(error[start:]**2))}'
            )

        # print(f'projection_error ({len(error)}):\n{error}')

        return error

    def _get_next_update(self, J, JtJ, residuals):
        # # Regularisation
        l = random.normalvariate(1, 0.1)
        # print(f'random.normalvariate(10, 20): {random.normalvariate(10, 20)}')
        for i in range(len(self._cameras) * PARAMS_PER_CAMERA):
            if (i % PARAMS_PER_CAMERA >= 3):
                # TODO: Improve regularisation params (currently a bit off)
                JtJ[i][i] += (
                    3.14 / 16) * l  #random.normalvariate(10, 20) * 5000000000
            else:
                JtJ[i][i] += (
                    1500 / 10
                ) * l  # TODO: Use intial focal estimate #random.normalvariate(10, 20) * 5000000000

        # print(f'J.T shape: {J.T.shape}')
        # print(f'residuals: {residuals}')

        # with open('test_error_residuals.txt', 'w') as f:
        #   for r in residuals:
        #     f.write(f'{r}\n')

        # openpano_JtJ = np.zeros((24,24), dtype=np.float64)
        # filename = 'ba_optimize.txt'
        # readingB = False
        # openpano_b = []
        # with open('./match_test_data/' + filename, 'r') as fp:
        #   for line in fp:
        #     if re.match(r'^\(', line):
        #       tings = [x for x in re.findall(r'\-?\d+\.?\d*e?\+?\d*', line)]
        #       # print(float(tings[2]))
        #       openpano_JtJ[int(tings[0])][int(tings[1])] = float(tings[2])
        #     elif re.match(r'b:', line):
        #       readingB = True
        #     elif (readingB and not re.match(r'^\s$', line)):
        #       bVal = [x for x in re.findall(r'\-?\d+\.?\d*e?\+?\d*', line)]
        #       # print(float(bVal[0]))
        #       openpano_b.append(float(bVal[0]))
        #     elif (readingB and re.match(r'^\s$', line)):
        #       readingB = False
        # openpano_b = np.asarray(openpano_b, dtype=np.float64)

        # with open('JtJ_test.txt', 'w') as f:
        #   print(f'JtJ.shape : {JtJ.shape}')
        #   for i in range(JtJ.shape[0]):
        #     for j in range(JtJ.shape[1]):
        #       f.write(f'({i}, {j}) {JtJ[i][j]}\n')

        b = J.T @ residuals

        # with open('JtJ_test_comparison.txt', 'w') as f:
        #   print(f'JtJ.shape : {JtJ.shape}')
        #   for i in range(JtJ.shape[0]):
        #     for j in range(JtJ.shape[1]):
        #       percentDiff = ((openpano_JtJ[i][j] - JtJ[i][j]) / openpano_JtJ[i][j]) * 100
        #       if (abs(percentDiff) > 0.001):
        #         f.write(f'({i}, {j}) JtJ={JtJ[i][j]}, OpenPano_JtJ={openpano_JtJ[i][j]} [Diff={percentDiff}]\n')

        #   f.write('\nb:\n')
        #   for (i, el) in enumerate(b):
        #     percentDiff = ((openpano_b[i] - b[i]) / openpano_b[i]) * 100
        #     if (abs(percentDiff) > 0.001):
        #       f.write(f'({i}) b={b[i]}, openpano_b={openpano_b[i]} [Diff={percentDiff}]\n')

        # JtJ = openpano_JtJ
        # b = openpano_b
        updates = np.linalg.solve(JtJ, b)

        # print('b:')
        # for (i, el) in enumerate(b):
        #   print(f'\t[{i}]: {el}')

        # print('Updates:')
        # for (i, update) in enumerate(updates):
        #   print(f'\t[{i}]: {update}')

        # print('Recomputed b vector:')
        # for (i, newB) in enumerate(JtJ@updates):
        #   print(f'\tnewB: {newB}')

        # updates = []
        # filename = 'ba_optimize.txt'
        # readingB = False
        # with open('./match_test_data/' + filename, 'r') as fp:
        #   for line in fp:
        #     if re.match(r'Update:', line):
        #       readingB = True
        #     elif (readingB and re.match(r'^\t+', line)):
        #       bVal = [x for x in re.findall(r'\-?\d+\.?\d*e?\+?\d*', line)]
        #       # print(float(bVal[0]))
        #       updates.append(float(bVal[0]))
        #     elif (readingB and re.match(r'^\s$', line)):
        #       readingB = False

        # print('Updates')
        # for update in updates:
        #   print(update)
        # updates = np.array(updates, dtype=np.float64)

        return updates
    def _compute_validation_outputs(self, world: List[SpiderWorld], sub_graphs,
                                    sub_graphs_scores, sub_graphs_labels,
                                    candidates, outputs: Dict[str,
                                                              Any]) -> None:
        batch_size = len(world)

        outputs['predicted_sql_query'] = []
        outputs['candidates'] = []

        for i in range(batch_size):
            if world[i].query is not None:
                gold_sql_query = ' '.join(world[i].query)
                difficulty = self._query_difficulty(
                    query_tokens=gold_sql_query.split(),
                    entities=set(world[i].db_context.knowledge_graph.entities))

            num_candidates = self._metric_num_candidates
            example_sub_graphs = sub_graphs[i, :num_candidates]
            example_sub_graphs_scores = sub_graphs_scores[i, :num_candidates]
            example_candidates = candidates[i][:num_candidates]
            if sub_graphs_labels is not None:
                example_sub_graphs_labels = sub_graphs_labels[
                    i, :num_candidates]

            candidate_to_sub_graph_id = {}
            sub_graphs_ids = []
            for sub_graph in example_sub_graphs:
                entities_ids = sub_graph[sub_graph > -1].tolist()
                if len(entities_ids) == 0:
                    continue
                sub_graph = tuple(sorted(entities_ids))
                if sub_graph not in candidate_to_sub_graph_id:
                    candidate_to_sub_graph_id[sub_graph] = len(
                        candidate_to_sub_graph_id)
                sub_graphs_ids.append(candidate_to_sub_graph_id[sub_graph])

            sorted_candidates_ids = example_sub_graphs_scores.sort(
                descending=True)[1].tolist()
            sorted_sub_graphs = OrderedSet([
                sub_graphs_ids[j] for j in sorted_candidates_ids
                if j < len(sub_graphs_ids)
            ])

            candidates_for_final_sort = []
            for original_rank, c in enumerate(example_candidates):
                sub_graph_id = sub_graphs_ids[original_rank]
                if sub_graphs_labels is not None:
                    sg_correct = int(
                        example_sub_graphs_labels[original_rank] == 1)
                else:
                    sg_correct = None
                candidates_for_final_sort.append({
                    'query': c['query'],
                    'original_rank': original_rank,
                    'reranker_sg_rank': sorted_sub_graphs.index(sub_graph_id),
                    'reranker_cand_rank': sorted_candidates_ids.index(original_rank),
                    'sub_graph_correct': sg_correct,
                    'correct': c['correct']
                })

            # sorting sub graphs, then inner-ranking by original beam search order
            candidates_sg_sort = sorted(
                candidates_for_final_sort,
                key=lambda x: (x['reranker_sg_rank'], x['original_rank']))

            if sub_graphs_labels is not None:
                sg_tsk_query_correct = candidates_sg_sort[0]['correct']

                self._update_metric('query_accuracy',
                                    int(sg_tsk_query_correct), difficulty)

            outputs['candidates'].append(
                [c['query'] for c in candidates_sg_sort])
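
Here the OrderedSet deduplicates sub-graph ids while keeping the order in which the score-sorted candidates first mention them, so .index() doubles as a rank lookup. In isolation, with made-up ids:

sorted_sub_graphs = OrderedSet([3, 1, 3, 2, 1])   # ids in score order, duplicates dropped
assert list(sorted_sub_graphs) == [3, 1, 2]
assert sorted_sub_graphs.index(2) == 2            # id 2 was the third distinct sub-graph seen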
Example #23
    for split in splits:
        instances_per_ig[split] = {}
        triples_per_ig[split] = {}
        start_instance_id = 0
        for ig_index, ig in enumerate(data[split]):
            instances = OrderedSet()
            triples = defaultdict(int)
            existing_combinations = set()
            for t in ig:
                object_indices = []
                for pos in [S_n, O_n]:
                    object_ = [t[pos][category]]
                    object_.extend([t[pos][bbox][i] for i in range(3)])
                    object_ = tuple(object_)
                    instances.add(object_)
                    object_indices.append(instances.index(object_))
                existing_combinations.add(tuple(object_indices))
                triples[(t[S_n][category], t[P_n], t[O_n][category],
                         object_indices[0] + start_instance_id,
                         object_indices[1] + start_instance_id)] += 1

            # add unannotated subject-object-pairs as triples with unknown relation
            for sbj_index in range(len(instances)):
                for obj_index in range(len(instances)):
                    if sbj_index != obj_index:
                        if not (sbj_index, obj_index) in existing_combinations:
                            sbj = instances[sbj_index]
                            obj = instances[obj_index]
                            triples[(sbj[0], relations["unknown"], obj[0],
                                     sbj_index + start_instance_id,
                                     obj_index + start_instance_id)] += 1
Example #24
def test_tuples():
    set1 = OrderedSet()
    tup = ('tuple', 1)
    set1.add(tup)
    assert set1.index(tup) == 0
    assert set1[0] == tup
Example #25
class WordVectors:
    def __init__(self, labels, vectors, replacements=None, standardizer=standardize):
        assert(len(labels) == len(vectors))
        self.labels = OrderedSet(labels)
        if not isinstance(vectors, np.memmap):
            normalize(vectors, copy=False)
        self.vectors = vectors
        self.replacements = replacements
        self._standardizer = standardizer
        self._mean_vec = np.mean(self.vectors, axis=0)

    def truncate(self, size):
        return WordVectors(
            list(self.labels)[:size],
            self.vectors[:size],
            self.replacements,
            self._standardizer
        )

    def similarity(self, word1, word2, lang=None):
        try:
            return self.to_vector(word1, lang).dot(self.to_vector(word2, lang))
        except KeyError:
            return 0

    def to_vector(self, word, lang=None, default_zero=False) -> np.ndarray:
        if isinstance(word, list):
            vec = np.zeros(self.vectors.shape[1])
            for actual_word, weight in word:
                vec += self.to_vector(actual_word, lang=lang)
            return normalize_vec(vec)

        if self._standardizer is not None:
            if self._standardizer is standardize and \
                lang is not None:
                word = self._standardizer(word, lang=lang)
            else:
                word = self._standardizer(word)

        max_sim = 1.
        if self.replacements and word in self.replacements:
            while word not in self.labels:
                word, sim = self.replacements[word]
                #max_sim *= np.sqrt(sim)

        if default_zero and word not in self.labels:
            return np.zeros(self.vectors.shape[1])
        vec = normalize_vec(self.vectors[self.labels.index(word)])
        return vec * max_sim

    def similar_to(self, word_or_vector, num=20, only=None):
        if isinstance(self.vectors, np.memmap):
            self.vectors = normalize(self.vectors)

        if isinstance(word_or_vector, str):
            vec = self.to_vector(word_or_vector)
        else:
            vec = word_or_vector

        sim = self.vectors.dot(vec)
        indices = np.argsort(sim)[::-1]

        out = []
        for index in indices:
            if len(out) == num:
                return out
            if only is None or only(self.labels[index]):
                out.append((self.labels[index], sim[index]))

        return out

    def which_relation(self, rel_array, v1, v2):
        if isinstance(v1, str):
            v1 = self.to_vector(v1)
        if isinstance(v2, str):
            v2 = self.to_vector(v2)
        avg_rel = self._mean_vec.dot(rel_array.dot(self._mean_vec))
        rels = v2.dot(rel_array.dot(v1))
        diff = np.maximum(0, rels - avg_rel) ** 2
        return diff / np.sum(diff)

    def analogy_values(self, rel_array, c1, c2, c3, vector_choices):
        # Convert the input concepts to vectors
        v1, v2, v3 = [self.to_vector(c, default_zero=True) for c in (c1, c2, c3)]
        # relA and relB are vectors whose length is the number of relations.
        # They indicate the relative weight with which each relation holds
        # between appropriate pairs of input concepts.
        relA = self.which_relation(rel_array, v1, v2)
        relB = self.which_relation(rel_array, v1, v3)
        # relAr and relBr are matrices that use these combinations of
        # relations to convert one vector into another.
        relAr = rank3_inner_product(relA, rel_array)
        relBr = rank3_inner_product(relB, rel_array)

        # rv1 is the vector that's related to v1 by these relations, and so on.
        rv1 = (relAr + relBr).dot(v1)
        rv2 = relBr.dot(v2)
        rv3 = relAr.dot(v3)
        ratings = weighted_3cosmul(rv1, rv2, rv3, vector_choices)
        return ratings

    def rank_analogies(self, rel_array, c1, c2, c3, only=None, num=20):
        ratings = self.analogy_values(rel_array, c1, c2, c3, self.vectors)
        indices = np.argsort(ratings)[::-1]

        out = []
        for index in indices:
            if len(out) >= num:
                return out
            if only is None or only(self.labels[index]):
                out.append((self.labels[index], ratings[index]))
        return out

    def rate_analogy(self, rel_array, c1, c2, c3, c4):
        v4 = self.to_vector(c4)
        return self.analogy_values(rel_array, c1, c2, c3, v4)
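A minimal usage sketch for the class above (assumed to be named WordVectors, as its truncate() method suggests). It relies on normalize, normalize_vec, and standardize from the surrounding module; the tiny label list and vector array below are made up purely for illustration.

import numpy as np

labels = ['cat', 'dog', 'car']
vectors = np.array([[1.0, 0.0, 0.0],
                    [0.9, 0.1, 0.0],
                    [0.0, 0.0, 1.0]])

wv = WordVectors(labels, vectors, standardizer=None)
print(wv.similarity('cat', 'dog'))   # ~0.99: nearly parallel vectors
print(wv.similarity('cat', 'car'))   # 0.0: orthogonal vectors
print(wv.similar_to('cat', num=2))   # [('cat', 1.0), ('dog', ~0.99)]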
Example #26
0
class Graph:
    """Object to represent a directed graph."""

    def __init__(
        self,
        nodes: Optional[Sequence[Node]] = None,
        edges: Optional[Sequence[Edge]] = None,
        A: Optional[spmatrix] = None,
        nodeprops: Optional[NodeProperties] = None,
        edgeprops: Optional[EdgeProperties] = None,
    ):
        self.A_ = dok_matrix((MAX_N_NODES, MAX_N_NODES), dtype=bool)
        if nodes is None:
            self.nodes_ = OrderedSet()
            if edges is not None:
                self.edges_ = set(edges)
                for x, y in edges:
                    i = self.nodes_.add(x)
                    j = self.nodes_.add(y)
                    self.A_[i, j] = True
            elif A is not None:
                self.nodes_ = OrderedSet(np.arange(A.shape[0]))
                self.edges_ = set()
                for i, j in zip(*A.nonzero()):
                    self.edges_.add((i, j))
                    self.A_[i, j] = True
            else:
                self.edges_ = set()
        else:
            self.nodes_ = OrderedSet(nodes)
            if edges is not None:
                self.edges_ = set(edges)
                for x, y in edges:
                    i = self.nodes_.index(x)
                    j = self.nodes_.index(y)
                    self.A_[i, j] = True
            elif A is not None:
                self.edges_ = set()
                for i, j in zip(*A.nonzero()):
                    self.edges_.add((self.nodes_[i], self.nodes_[j]))
                    self.A_[i, j] = True
            else:
                self.edges_ = set()

        self.nodeprops = ifnone(nodeprops, {})
        self.edgeprops = ifnone(edgeprops, {})

    @property
    def n_nodes(self):
        return len(self.nodes_)

    @property
    def nodes(self):
        return self.nodes_

    @property
    def edges(self):
        return self.edges_

    @property
    def A(self):
        return self.A_.tocsr()[: self.n_nodes, : self.n_nodes]

    def add_node(self, node: Node):
        self.nodes_.add(node)

    def add_nodes(self, nodes: Sequence[Node]):
        for node in nodes:
            self.add_node(node)

    def add_edge(self, edge: Edge):
        self.add_nodes(edge)
        self.edges_.add(edge)
        n1, n2 = edge
        i = self.nodes_.index(n1)
        j = self.nodes_.index(n2)
        self.A_[i, j] = True

    def add_edges(self, edges: Sequence[Edge]):
        for edge in edges:
            self.add_edge(edge)

    def remove_edge(self, edge: Edge):
        try:
            self.edges_.remove(edge)
        except KeyError:
            print(f"Edge {edge} was not found in graph")
        n1, n2 = edge
        i = self.nodes_.index(n1)
        j = self.nodes_.index(n2)
        self.A_[i, j] = False

    def reset(self):
        self.nodes_ = OrderedSet()
        self.edges_ = set()
        self.A_ = dok_matrix((MAX_N_NODES, MAX_N_NODES), dtype=bool)
        self.nodeprops = {}
        self.edgeprops = {}
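A minimal usage sketch for the Graph class above, assuming the module-level constant MAX_N_NODES, the ifnone() helper, and the Node/Edge type aliases come from the surrounding code:

g = Graph(edges=[('a', 'b'), ('b', 'c')])
print(g.n_nodes)        # 3
print(sorted(g.edges))  # [('a', 'b'), ('b', 'c')]
print(g.A.toarray())    # 3x3 boolean adjacency matrix

g.add_edge(('c', 'a'))
g.remove_edge(('a', 'b'))
print(g.A.toarray())    # a->b cleared, c->a set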
Example #27
0
class Featurizer(object):
	#
	# Converts chorales into a matrix of feature indices. Each vector in a matrix represents a specific beat within
	# a chorale. Note that indices are 1-based to comply with Torch. 
	#

	# Initialize with the number of scores to analyze
	def __init__(self, num_scores=20):
		self.num_scores = num_scores
		self.indices = {}
		self.features = []
		self.harmonies = []
		self.max_index = 0
		self.original = [] 			# original, cleaned scores deposited here
		self.training_split = [] 	# training scores
		self.test_split = []		# test scores
		self.percentage_train = 0.8 # percentage of scores to be in the training split
		self.percentage_dev = 0.5 	# percentage of the held-out scores to be used as a dev set

		self.data_dir = "raw_data/"
		self.output_dir = "data/"

		# Training examples created by featurize()
		self.X_train = []
		self.y_train = []
		self.X_test = []
		self.y_test = []


	# Collect all scores and preprocess them
	@timing
	def gather_scores(self):
		from os import listdir
		self.original = []
		for f in glob(self.data_dir + "*.xml"):
			self.original.append(converter.parse(f))
		print "Gathered %d 4-part chorales." % len(self.original)
		
		return self.original

	# Analyze the chorales and determine the possible values for each feature
	@timing
	def analyze(self):
		self.analyzed = [] # to save time, we store the related objects to a score for featurizing

		# Reset feature sets
		self.keys = OrderedSet()
		self.key_modes = OrderedSet()
		self.times = OrderedSet()
		self.beats = OrderedSet()
		self.offset_ends = OrderedSet()
		self.cadence_dists = OrderedSet()
		self.intervals = OrderedSet()
		self.cadences = OrderedSet(['cadence', 'no cadence'])
		self.pitch = OrderedSet(range(RANGE['Soprano']['min'], RANGE['Soprano']['max'] + 1))
		self.numerals = OrderedSet() # output feature
		self.inversions = OrderedSet() # output feature
		# THIS ORDER MATTERS
		self.features = [('key', self.keys), ('mode', self.key_modes), ('time', self.times), \
						('beatstr', self.beats), ('offset', self.offset_ends), ('cadence_dists', self.cadence_dists), \
						('cadence?', self.cadences), ('pitch', self.pitch), ('ibefore', self.intervals), \
						('iafter', self.intervals), ('numeral_prev', self.numerals), ('inv_prev', self.inversions)]

		for idx, score in enumerate(self.original):
			sys.stdout.write("Analyzing #%d 	\r" % (idx + 1))
			sys.stdout.flush()
			# score-wide features
			S, A, T, B = getNotes(score.parts[0]), getNotes(score.parts[1]), getNotes(score.parts[2]), getNotes(score.parts[3])
			assert len(S) == len(A)
			assert len(A) == len(T)
			assert len(T) == len(B)
			time_sig, key_sig = getTimeSignature(score.parts[0]), getKeySignature(score.parts[0])
			key_obj = getKeyFromSignature(key_sig)
			fermata_locations = map(hasFermata, S)

			# Score-wide: Key (sharps, mode) and Time (num, denom)
			self.keys.add(feat_key(key_sig))
			self.key_modes.add(key_sig.mode)
			self.times.add((time_sig.numerator, time_sig.denominator))

			# Note-specific data
			for index, n in enumerate(S):
				# Beat strength
				self.beats.add(feat_beat(n))
				# Offset from the end
				self.offset_ends.add(feat_offset_end(index, len(S)))
				# Distance to next cadence
				self.cadence_dists.add(feat_cadence_dist(n, index, fermata_locations))
				# Intervals
				if index > 0:
					self.intervals.add(feat_interval(S[index - 1], S[index]))
				# Harmony
				numeral, inversion = feat_harmony(S[index], A[index], T[index], B[index], key_obj)
				self.numerals.add(numeral)
				self.inversions.add(inversion)

			# Store objects for featurizing
			self.analyzed.append((score, S, A, T, B, time_sig, key_sig, key_obj, fermata_locations))

		# Add 'None' as an option for previous harmonies (i.e. to say there's no previous harmony for the first beat)
		self.numerals.add('None')
		self.inversions.add('None')
		# Add 'None' as an option for previous and future melodic intervals
		# (i.e. the first note has no previous note, so the 'interval before' is represented as 'None')
		self.intervals.add('None')

		# Set feature indices
		i_max = 1
		for name, values in self.features:
			self.indices[name] = (i_max, i_max + len(values) - 1)
			i_max += len(values)
		self.max_index = i_max # record one past the highest index used
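		# For example, if self.keys has 3 values and self.key_modes has 2, then
		# self.indices['key'] == (1, 3) and self.indices['mode'] == (4, 5); each
		# later feature occupies the next contiguous 1-based block of indices.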

	# Wrapper function for featurize_set():
	@timing
	def featurize(self):
		# Create train-test split
		training, remaining = self.split(self.analyzed, self.percentage_train)
		dev, test = self.split(remaining, self.percentage_dev)
		self.training_split, self.test_split = training, test
		self.X_train, self.y_train, self.X_dev, self.y_dev, self.X_test, self.y_test = [], [], [], [], [], []

		# Training set
		for idx, score in enumerate(training):
			sys.stdout.write("Featurizing #%d 	\r" % (idx + 1))
			sys.stdout.flush()
			X, y = self.featurize_score(score)
			self.X_train.append(X)
			self.y_train.append(y)
		print "Featurized training set."

		# Development set
		for idx, score in enumerate(dev):
			sys.stdout.write("Featurizing #%d 	\r" % (idx + 1))
			sys.stdout.flush()
			X, y = self.featurize_score(score)
			self.X_dev.append(X)
			self.y_dev.append(y)
		print "Featurized development set."

		# Test set
		for idx, score in enumerate(test):
			sys.stdout.write("Featurizing #%d 	\r" % (idx + 1))
			sys.stdout.flush()
			X, y = self.featurize_score(score)
			self.X_test.append(X)
			self.y_test.append(y)
		print "Featurized test set."
		
		print "Training examples size: %d" % len(self.X_train)
		print "Test examples size: %d" % len(self.X_test)

		# Freeze for future use
		freezeObject(self.X_train, "X_train")
		freezeObject(self.y_train, "y_train")
		freezeObject(self.X_dev, "X_dev")
		freezeObject(self.y_dev, "y_dev")
		freezeObject(self.X_test, "X_test")
		freezeObject(self.y_test, "y_test")
		freezeObject(list(self.numerals), "numerals")
		freezeObject(list(self.inversions), "inversions")
		freezeObject(self.indices, "indices")

		
	# After analysis, this generates the training examples (input vectors, output vectors)
	# As scores are examined, the indices of output chords are generated.
	def featurize_score(self, score_packet):
		# feature vectors
		X, y = [], []
		
		# unpack score objects
		score, S, A, T, B, time_sig, key_sig, key_obj, fermata_locations = score_packet

		# Create X vector and y output
		for index, n in enumerate(S):
			# Key
			f_key = self.keys.index(feat_key(key_sig)) + self.indices['key'][0]
			# Key mode
			f_mode = self.key_modes.index(key_sig.mode) + self.indices['mode'][0]
			# Time
			f_time = self.times.index((time_sig.numerator, time_sig.denominator)) + self.indices['time'][0]
			# Beat
			f_beat = self.beats.index(feat_beat(n)) + self.indices['beatstr'][0]
			# Offset end
			f_off_end = self.offset_ends.index(feat_offset_end(index, len(S))) + self.indices['offset'][0]
			# Cadence distance
			f_cadence_dist = self.cadence_dists.index(feat_cadence_dist(n, index, fermata_locations)) + self.indices['cadence_dists'][0]
			# Has cadence?
			f_cadence = feat_cadence(n) + self.indices['cadence?'][0]
			# Pitch
			f_pitch = self.pitch.index(feat_pitch(n)) + self.indices['pitch'][0]
			# Melodic interval before
			ibefore = feat_interval(S[index - 1], S[index]) if index > 0 else 'None'
			f_ibefore = self.intervals.index(ibefore) + self.indices['ibefore'][0]
			# Melodic interval after
			iafter = feat_interval(S[index], S[index + 1]) if index < len(S) - 1 else 'None'
			f_iafter = self.intervals.index(iafter) + self.indices['iafter'][0]
			# Previous harmony
			num_prev, inv_prev = feat_harmony(S[index - 1], A[index - 1], T[index - 1], B[index - 1], key_obj) if index > 0 else ('None', 'None')
			f_num_prev = self.numerals.index(num_prev) + self.indices['numeral_prev'][0]
			f_inv_prev = self.inversions.index(inv_prev) + self.indices['inv_prev'][0]
			# Input vector
			input_vec = [f_key, f_mode, f_time, f_beat, f_off_end, f_cadence_dist, f_cadence, f_pitch, \
						f_ibefore, f_iafter, f_num_prev, f_inv_prev]

			# Output class, 1-indexed for Torch
			f_num, f_inv = feat_harmony(S[index], A[index], T[index], B[index], key_obj)
			output_vec = [self.numerals.index(f_num) + 1, self.inversions.index(f_inv) + 1]

			X.append(input_vec)
			y.append(output_vec)

		return X, y

	# Verify that the feature indices are all in the right ranges
	def verify(self):
		print "Verifying indices..."
		# self.X_train, self.y_train = thawObject("X_train"), thawObject("y_train")
		# self.X_test, self.y_test = thawObject("X_test"), thawObject("y_test")
		# self.indices = thawObject('indices')
		# self.numerals = thawObject('numerals')
		# self.inversions = thawObject('inversions')
		inputs = self.X_train + self.X_test
		outputs = self.y_train + self.y_test
		for i, score in enumerate(inputs):
			s_in = score
			s_out = outputs[i]
			for j, example in enumerate(s_in):
				numeral, inversion = s_out[j]

				# Note the order here corresponds with the order in which the example features were added
				features = ['key', 'mode', 'time', 'beatstr', 'offset', 'cadence_dists', 'cadence?', 'pitch', 'ibefore', 'iafter', 'numeral_prev', 'inv_prev']
				for f_idx, feature in enumerate(features):
					try:
						assert in_range(example[f_idx], self.indices[feature][0], self.indices[feature][1])
					except AssertionError:
						print "Feature '%s' out of range in score %d, example %d" % (feature, i, j)
				try:
					assert in_range(numeral, 1, len(self.numerals))
					assert in_range(inversion, 1, len(self.inversions))
				except AssertionError:
					print "Output class out of range in score %d, example %d" % (i, j)

	# Write the featurized training and test examples to HDF5 files
	def write(self):
		print "Writing to %s..." % self.output_dir
		for idx, score in enumerate(self.X_train):
			with h5py.File(self.output_dir + "train_%d.hdf5" % idx, "w", libver='latest') as f:
				X_matrix = npy.matrix(score)
				f.create_dataset("X", X_matrix.shape, dtype='i', data=X_matrix)
				y_matrix = npy.matrix(self.y_train[idx])
				f.create_dataset("y", y_matrix.shape, dtype='i', data=y_matrix)
		
		for idx, score in enumerate(self.X_test):
			with h5py.File(self.output_dir + "test_%d.hdf5" % idx, "w", libver='latest') as f:
				X_matrix = npy.matrix(score)
				f.create_dataset("X", X_matrix.shape, dtype='i', data=X_matrix)
				y_matrix = npy.matrix(self.y_test[idx])
				f.create_dataset("y", y_matrix.shape, dtype='i', data=y_matrix)

		# Freeze features for evaluation later on
		freezeObject(self.harmonies, "harmonies")

	# Split a list into two sets, with a ratio of pg : 1 - pg (where 0 <= pg <= 1)
	def split(self, lst, pg):
		shuffle(lst)
		split_point = int(len(lst) * pg)
		set1 = lst[:split_point]
		set2 = lst[split_point:]

		# Make sure there is no overlap
		for s in set1:
			for t in set2:
				assert s != t

		return set1, set2

	def run(self):
		self.gather_scores()
		self.analyze()
		self.featurize()
		self.verify()
		self.write()

	def __str__(self):
		s = "\n---------- FEATURIZER RESULTS ----------\n"
		for name, values in self.features:
			s += name + ": " + str(values) + "\n"
		s += "\n"
		s += "Indices:\n"
		for name, values in self.features:
			s+= "'%s': %s\n" % (name, str(self.indices[name]))
		s += "\n"
		s += "Roman numerals (%d total)\n%s\n" % (len(self.numerals), self.numerals)
		s += "\n"
		s += "Inversions (%d total)\n%s\n" % (len(self.inversions), self.inversions)
		s += "\n"
		s += "Test-training split: %d training chorales, %d test chorales\n" % (len(self.training_split), len(self.test_split))
		s += "Test-training examples: %d for training, %d for test\n" % (len(self.X_train), len(self.X_test))
		s += "---------------------------------------\n"
		return s

	__repr__ = __str__
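A minimal usage sketch for the Featurizer above (Python 2, as the print syntax indicates). It assumes raw_data/ holds MusicXML chorales and that the helpers it calls (getNotes, feat_*, timing, freezeObject, in_range, ...) are defined in the surrounding module:

featurizer = Featurizer(num_scores=20)
featurizer.run()    # gather, analyze, featurize, verify, and write HDF5 files
print featurizer    # dump feature sets, index ranges, and split sizes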