Пример #1
0
def main():
    """Concatenate SRP vector files into a single output file.

    Command-line entry point. Optionally builds a prefix lookup cache
    over the resulting file (``--build-cache``), and can skip the
    concatenation step entirely (``--no-concat``) when only the cache
    is wanted.

    Raises:
        ValueError: if the argument combination leaves nothing to do.
    """
    parser = argparse.ArgumentParser(
        description='Concatenate a number of vector files into single file')

    parser.add_argument('outpath',
                        type=str,
                        help='Place to save the new Vector file.')
    parser.add_argument(
        '--mode',
        type=str,
        default='a',
        help=
        'Write mode for output. By default, appends if the file exists, can be switched to \'w\' to overwrite.'
    )
    parser.add_argument('--build-cache',
                        action='store_true',
                        help='Build a prefix cache after concatenation.')
    parser.add_argument(
        '--no-concat',
        action='store_true',
        help="Skip the concatenation, if you're hoping to *just* build cache.")
    parser.add_argument('filepaths',
                        type=str,
                        nargs='*',
                        help='List of vector files being combined.')

    args = parser.parse_args()

    # Validate the argument combination before touching any files.
    # ValueError (a subclass of Exception) keeps existing handlers working.
    if not args.filepaths and not args.no_concat:
        raise ValueError("Nothing to do without input filepaths")
    if args.no_concat and not args.build_cache:
        raise ValueError(
            "If you're not concatenating and not building a cache, you're not doing anything."
        )

    if not args.no_concat:
        # Peek at the first input to learn the vector dimensionality;
        # the output file must be created with matching dims.
        with SRP.Vector_file(args.filepaths[0], mode="r") as vecf:
            dims = vecf.dims

        with SRP.Vector_file(args.outpath, mode=args.mode, dims=dims) as outf:
            for efpath in args.filepaths:
                print("Concatenating:", efpath)
                outf.concatenate_file(efpath)

    if args.build_cache:
        with SRP.Vector_file(args.outpath, offset_cache=True) as outf:
            print("Building prefix lookup cache")
            # NOTE(review): private API of Vector_file; sep='-' matches the
            # "{htid}-{section}" id convention used elsewhere in the project.
            outf._build_prefix_lookup(sep='-', dump_every=2000000)
Пример #2
0
    def _initialize_embeddings(self, chunk_file):
        """
        Read in an embedding file at unit length, and adjust the metadata to match.

        Populates self.matrix, self.ids, self.chunk_metadata,
        self.mtid_lookup, and self.htid_lookup from *chunk_file*.
        """
        # Use a context manager so the vector file handle is closed
        # (the original left it open). Also avoids shadowing builtin `input`.
        with SRP.Vector_file(chunk_file) as vec_file:
            dataset = vec_file.to_matrix(unit_length=True)
        self.matrix = dataset['matrix']

        ids = dataset['names']
        self.ids = ids

        # ids are '-'-joined: either "{htid}-{section}-{start}-{end}" or,
        # in the deprecated old format, just "{htid}-{section}".
        parts = [a.split("-") for a in ids]
        try:
            htids, section_nums, starts, ends = zip(*parts)
        except ValueError:
            # Unpacking mismatch means the old two-field format: deprecated.
            htids, section_nums = zip(*parts)

        chunk_frame = pd.DataFrame({
            'mtid': ids,
            'htid': htids,
            'section': list(map(int, section_nums))
        }).set_index('htid')

        # Left-join per-chunk rows onto volume-level metadata by htid.
        self.chunk_metadata = chunk_frame.join(self.metadata,
                                               how='left').reset_index()

        # Map each mtid to its row index in self.matrix.
        self.mtid_lookup = dict(zip(ids, range(len(ids))))

        # Map each htid to the list of its chunk mtids.
        self.htid_lookup = defaultdict(list)
        for i, htid in enumerate(htids):
            self.htid_lookup[htid].append(ids[i])
Пример #3
0
 def test_error_on_load(self):
     """add_row must raise TypeError for this id/array combination.

     NOTE(review): presumably the space in the id is what is rejected —
     confirm against Vector_file.add_row.
     """
     with tempfile.TemporaryDirectory() as tmpdir:
         # Context manager closes the file even if something other than
         # the expected TypeError escapes (the original only closed it
         # on the happy path). Also avoids shadowing builtin `dir`.
         with SRP.Vector_file(Path(tmpdir, "test.bin"), dims=3,
                              mode="w") as testfile:
             with self.assertRaises(TypeError):
                 testfile.add_row("this is a space", self.array1)