Exemplo n.º 1
0
def main():
    """Report top-1/2/3 hit rates of the spell checker on a random sample.

    Reads space-separated ``correct mis`` pairs from ``data/testdata.txt``,
    samples ``opts.n`` of them, asks the checker for ``opts.topk`` suggestions
    for each misspelling, and prints the fraction of sampled pairs whose
    correct word is found among the first one, two and three suggestions.

    Pairs for which the checker returns ``None`` (or an empty result) count as
    misses at every rank.
    """
    opts = get_parser().parse_args()
    checker = SpellChecker(Aligner(opts.sigma, opts.bayes))

    data = pd.read_csv('data/testdata.txt', sep=" ", header=None)
    data.columns = ["correct", "mis"]
    subdata = data.sample(n=opts.n)

    # hits[k] counts pairs whose correct word is within the first k + 1
    # suggestions, so hits[0] <= hits[1] <= hits[2].
    hits = [0, 0, 0]
    allpairs = 0

    for _, row in tqdm(subdata.iterrows()):
        allpairs += 1
        fs = checker.give_suggestions(row["mis"], opts.topk)
        if fs is None:
            continue
        # Only the first three suggestions matter, however many topk returned.
        top = [candidate[0] for candidate in fs][:3]
        expected = row["correct"].strip()
        if expected in top:
            # A hit at rank r is also a hit for every larger cut-off.
            for k in range(top.index(expected), 3):
                hits[k] += 1

    if allpairs == 0:
        # Avoid dividing by zero when the sample is empty.
        print("No pairs sampled; nothing to evaluate.")
        return

    print("Top 1 precision: ", hits[0] / allpairs)
    print("Top 2 precision: ", hits[1] / allpairs)
    print("Top 3 precision: ", hits[2] / allpairs)
    print("-----------------------------------------")
Exemplo n.º 2
0
 def read_file(self, file):
     """Yield the processed records for ``file``, building them on first use.

     On the first call for a given ``file`` the parent reader's items are
     wrapped with ``BOS``/``EOS``, aligned in one batch with ``Aligner``, and
     turned into ``(source, action, *rest)`` tuples that are cached in
     ``self.data[file]``. Later calls replay the cached list.

     Raises:
         AssertionError: if an item has fewer than two fields, if the aligner
             returns a different number of pairs than it was given, or if
             the number of ``STEP`` actions plus one differs from the
             wrapped source length.
     """
     if file not in self.data:
         wrapped_pairs, extras = [], []
         for item in super().read_file(file):
             assert len(item) >= 2
             src, trg, rest = item[0], item[1], item[2:]
             wrapped_pairs.append(([BOS] + src + [EOS], [BOS] + trg + [EOS]))
             extras.append(rest)

         align = Aligner(wrapped_pairs, align_symbol=ALIGN)
         assert len(wrapped_pairs) == len(extras) == len(align.alignedpairs)

         records = []
         for wrapped, aligned, rest in zip(wrapped_pairs, align.alignedpairs,
                                           extras):
             action = self.gen_act(*aligned)
             step_cnt = sum(int(x == STEP) for x in action)
             # Sanity check on the generated action sequence.
             assert step_cnt + 1 == len(
                 wrapped[0]), "step cnt {}\n{}\n{}\n{}".format(
                     step_cnt, wrapped, action, aligned)
             records.append((wrapped[0], action, *rest))
         self.data[file] = records
     yield from self.data[file]
Exemplo n.º 3
0
                    set(vowelswaps(word)) & real_words
                    or  #  vowels   "weke" => "wake"
                    set(variants(word)) & real_words
                    or  #  other    "nonster" => "monster"
                    set(both(word)) & real_words
                    or  #  both     "CUNsperrICY" => "conspiracy"
                    set(double_variants(word)) & real_words
                    or  #  other    "nmnster" => "manster"
                    {})
        else:
            return ({word} & real_words
                    or (set(reductions(word)) | set(vowelswaps(word))
                        | set(variants(word)) | set(both(word))
                        | set(double_variants(word))) & real_words or {})

    def give_suggestions(self, word: str, topk=3):
        """Return the aligner's final suggestions for ``word``.

        Candidates come from ``self.suggestions(word)``. When that result is
        empty (falsy), ``None`` is returned; otherwise the candidates are
        handed to ``self.aligner.final_suggestions`` together with ``topk``.
        """
        candidates = self.suggestions(word)
        if not candidates:
            return None
        return self.aligner.final_suggestions(word, candidates, topk=topk)


if __name__ == "__main__":
    from align import Aligner
    sc = SpellChecker(Aligner())
    # Interactive loop: read a word, print up to 10 suggestions (or None).
    while True:
        try:
            word = input('>')
        except (EOFError, KeyboardInterrupt):
            # End of input (Ctrl-D / closed pipe) or Ctrl-C: leave the loop
            # quietly instead of terminating with a traceback.
            print()
            break
        fs = sc.give_suggestions(word, topk=10)
        print(fs)
Exemplo n.º 4
0
                        default=1,
                        type=int,
                        metavar='n',
                        dest='n_jobs',
                        help='Set num threads to use (default: 1)')
    args = parser.parse_args()

    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    # Determine ltfs to process.
    if not args.scpf is None:
        with open(args.scpf, 'r') as f:
            args.ltfs = [l.strip() for l in f.readlines()]

    # Initialize chunker, aligner, and encoder.
    chunker = BILOUChunkEncoder()
    aligner = Aligner()
    encf = os.path.join(args.model_dir, 'tagger.enc')
    with open(encf, 'r') as f:
        enc = cPickle.load(f)

    # Perform tagging in parallel, dumping results to args.tagged_dir.
    n_jobs = min(len(args.ltfs), args.n_jobs)
    modelf = os.path.join(args.model_dir, 'tagger.crf')
    f = delayed(tag_file)
    Parallel(n_jobs=n_jobs, verbose=0)(
        f(ltf, aligner, enc, chunker, modelf, args.tagged_dir, args.ext)
        for ltf in args.ltfs)
Exemplo n.º 5
0
# Collect one (source, target, translation) file triple per source document
# found in `directory`, then run the aligner on each triple.
jobs = []

for source_document in [
        d for d in os.listdir(directory) if d.endswith('.' + source_suffix)
]:

    source_document = os.path.join(directory, source_document)
    # Strip the source suffix by computing the cut position explicitly:
    # `[:-len(suffix)]` would yield an empty string for an empty suffix.
    stem = source_document[:len(source_document) - len(source_suffix)]
    target_document = stem + target_suffix
    translation_document = stem + translation_suffix

    # Sanity checks: every file of the triple must exist.
    for f in source_document, target_document, translation_document:
        if not os.path.isfile(f):
            sys.stderr.write(
                'ERROR: File {0} expected, but not found\n'.format(f))
            # Exit with a non-zero status so callers can detect the failure
            # (a bare exit() reports success and depends on the site module).
            sys.exit(1)

    jobs.append((source_document, target_document, translation_document))

for (source_document, target_document, translation_document) in jobs:

    # The same options mapping is updated in place for every job.
    options['srcfile'] = source_document
    options['targetfile'] = target_document
    options['srctotarget'] = [translation_document]
    options['output-src'] = source_document + '.aligned'
    options['output-target'] = target_document + '.aligned'

    a = Aligner(options)
    a.mainloop()
from align import Aligner


if __name__ == "__main__":
    # Align every pair supplied by the aligner's utilities and write out one
    # (first alignment, second alignment, first value returned by align)
    # record per pair.
    aligner = Aligner()
    pairs = aligner.utils.get_pairs()
    outputs = []
    for pair in pairs:
        n, ptr = aligner.align(pair[0], pair[1])
        alignments = aligner.utils.reconstruct_ptr(pair, ptr)
        outputs.append((alignments[0], alignments[1], n))
    aligner.utils.write_output(outputs)

Exemplo n.º 7
0
options['verbosity'] = 1
options['printempty'] = False
options['output'] = None

# Collect one (source, target, translation) file triple per source document
# found in `directory`, then run the aligner on each triple.
jobs = []

for source_document in [d for d in os.listdir(directory) if d.endswith('.' + source_suffix)]:

    source_document = os.path.join(directory, source_document)
    # Strip the source suffix by computing the cut position explicitly:
    # `[:-len(suffix)]` would yield an empty string for an empty suffix.
    stem = source_document[:len(source_document) - len(source_suffix)]
    target_document = stem + target_suffix
    translation_document = stem + translation_suffix

    # Sanity checks: every file of the triple must exist.
    for f in source_document, target_document, translation_document:
        if not os.path.isfile(f):
            sys.stderr.write('ERROR: File {0} expected, but not found\n'.format(f))
            # Exit with a non-zero status so callers can detect the failure
            # (a bare exit() reports success and depends on the site module).
            sys.exit(1)

    jobs.append((source_document, target_document, translation_document))

for (source_document, target_document, translation_document) in jobs:

    # The same options mapping is updated in place for every job.
    options['srcfile'] = source_document
    options['targetfile'] = target_document
    options['srctotarget'] = [translation_document]
    options['output-src'] = source_document + '.aligned'
    options['output-target'] = target_document + '.aligned'

    a = Aligner(options)
    a.mainloop()
Exemplo n.º 8
0
    def __init__(self):
        """Parse command-line options, build the aligner and load the net.

        Options are read from ``sys.argv`` via ``argparse`` and stored in
        ``self.args``. The network definition is the lexicographically last
        ``*.deploy.prototxt`` in ``--model``; the weights are the last
        ``*.caffemodel`` in the order produced by ``human_numeric_sort``.

        Raises:
            IndexError: if ``--model`` contains no matching prototxt or
                caffemodel file.
        """
        parser = argparse.ArgumentParser()
        parser.add_argument('--caffe',
                            type=str,
                            default=os.path.expanduser("~/caffe"),
                            help='--caffe ~/caffe')
        parser.add_argument(
            '--gpu',
            type=int,
            help='--gpu 0 . If --gpu not specified, then cpu will be used')
        parser.add_argument('--model',
                            type=str,
                            default='./',
                            help='--model /root/model/')
        parser.add_argument('--train',
                            type=str,
                            default='train',
                            help='--train /root/train')
        parser.add_argument('--descriptorLayer',
                            type=str,
                            default='embed',
                            help='--descriptorLayer embed')
        # NOTE(review): the default is the truthy string 'true', so verbose
        # output is on whether or not the flag is given. Kept as is to
        # preserve the current output; confirm the intended default.
        parser.add_argument('--verbose', default='true', action='store_true')
        self.args = parser.parse_args()

        # The aligner reads from and writes to the training folder.
        print('Train folder:', self.args.train)
        alignerArgs = properties()
        alignerArgs.inputDir = self.args.train
        alignerArgs.outputDir = self.args.train
        alignerArgs.dlibFacePredictor = 'shape_predictor_68_face_landmarks.dat'
        alignerArgs.align = ''
        alignerArgs.landmarks = 'outerEyesAndNose'
        alignerArgs.size = 128
        alignerArgs.skipMulti = False
        alignerArgs.verbose = False
        alignerArgs.fallbackLfw = False
        self.aligner = Aligner(alignerArgs)

        # TODO: --caffe is parsed but unused; `caffe` must already be
        # importable. Adding os.path.join(self.args.caffe, 'python') to
        # sys.path before importing caffe was considered here.

        # Compare against None: device id 0 is a valid GPU but is falsy.
        if self.args.gpu is not None:
            caffe.set_mode_gpu()
            caffe.set_device(self.args.gpu)
        else:
            caffe.set_mode_cpu()

        deployProtoPath = sorted(
            glob.glob(os.path.join(self.args.model,
                                   '*.deploy.prototxt')))[-1]
        if self.args.verbose:
            print(deployProtoPath)
        caffeModelPath = human_numeric_sort(
            glob.glob(os.path.join(self.args.model, '*.caffemodel')))[-1]
        if self.args.verbose:
            print(caffeModelPath)

        self.net = caffe.Net(deployProtoPath, caffeModelPath, caffe.TEST)
        if self.args.verbose:
            print("blobs {}\nparams {}".format(self.net.blobs.keys(),
                                               self.net.params.keys()))

        # Preprocessing applied to each image before it is fed to the net.
        self.transformer = caffe.io.Transformer(
            {'data': self.net.blobs['data'].data.shape})
        self.transformer.set_transpose(
            'data', (2, 0, 1))  # move image channels to outermost dimension
        self.transformer.set_mean('data', np.array(
            [127.5, 127.5,
             127.5]))  # subtract the dataset-mean value in each channel
        self.transformer.set_raw_scale('data',
                                       255)  # rescale from [0, 1] to [0, 255]
        self.transformer.set_channel_swap(
            'data', (2, 1, 0))  # swap channels from RGB to BGR
Exemplo n.º 9
0
class Reidentifier:
    """Match an image against a folder of training images with a Caffe net.

    Every image is aligned with ``Aligner``, pushed through the network up to
    ``--descriptorLayer``, and compared to the training descriptors by cosine
    similarity.
    """

    def __init__(self):
        """Parse command-line options, build the aligner and load the net.

        Options are read from ``sys.argv`` via ``argparse`` and stored in
        ``self.args``. The network definition is the lexicographically last
        ``*.deploy.prototxt`` in ``--model``; the weights are the last
        ``*.caffemodel`` in the order produced by ``human_numeric_sort``.

        Raises:
            IndexError: if ``--model`` contains no matching prototxt or
                caffemodel file.
        """
        parser = argparse.ArgumentParser()
        parser.add_argument('--caffe',
                            type=str,
                            default=os.path.expanduser("~/caffe"),
                            help='--caffe ~/caffe')
        parser.add_argument(
            '--gpu',
            type=int,
            help='--gpu 0 . If --gpu not specified, then cpu will be used')
        parser.add_argument('--model',
                            type=str,
                            default='./',
                            help='--model /root/model/')
        parser.add_argument('--train',
                            type=str,
                            default='train',
                            help='--train /root/train')
        parser.add_argument('--descriptorLayer',
                            type=str,
                            default='embed',
                            help='--descriptorLayer embed')
        # NOTE(review): the default is the truthy string 'true', so verbose
        # output is on whether or not the flag is given. Kept as is to
        # preserve the current output; confirm the intended default.
        parser.add_argument('--verbose', default='true', action='store_true')
        self.args = parser.parse_args()

        # The aligner reads from and writes to the training folder.
        print('Train folder:', self.args.train)
        alignerArgs = properties()
        alignerArgs.inputDir = self.args.train
        alignerArgs.outputDir = self.args.train
        alignerArgs.dlibFacePredictor = 'shape_predictor_68_face_landmarks.dat'
        alignerArgs.align = ''
        alignerArgs.landmarks = 'outerEyesAndNose'
        alignerArgs.size = 128
        alignerArgs.skipMulti = False
        alignerArgs.verbose = False
        alignerArgs.fallbackLfw = False
        self.aligner = Aligner(alignerArgs)

        # TODO: --caffe is parsed but unused; `caffe` must already be
        # importable. Adding os.path.join(self.args.caffe, 'python') to
        # sys.path before importing caffe was considered here.

        # Compare against None: device id 0 is a valid GPU but is falsy.
        if self.args.gpu is not None:
            caffe.set_mode_gpu()
            caffe.set_device(self.args.gpu)
        else:
            caffe.set_mode_cpu()

        deployProtoPath = sorted(
            glob.glob(os.path.join(self.args.model,
                                   '*.deploy.prototxt')))[-1]
        if self.args.verbose:
            print(deployProtoPath)
        caffeModelPath = human_numeric_sort(
            glob.glob(os.path.join(self.args.model, '*.caffemodel')))[-1]
        if self.args.verbose:
            print(caffeModelPath)

        self.net = caffe.Net(deployProtoPath, caffeModelPath, caffe.TEST)
        if self.args.verbose:
            print("blobs {}\nparams {}".format(self.net.blobs.keys(),
                                               self.net.params.keys()))

        # Preprocessing applied to each image before it is fed to the net.
        self.transformer = caffe.io.Transformer(
            {'data': self.net.blobs['data'].data.shape})
        self.transformer.set_transpose(
            'data', (2, 0, 1))  # move image channels to outermost dimension
        self.transformer.set_mean('data', np.array(
            [127.5, 127.5,
             127.5]))  # subtract the dataset-mean value in each channel
        self.transformer.set_raw_scale('data',
                                       255)  # rescale from [0, 1] to [0, 255]
        self.transformer.set_channel_swap(
            'data', (2, 1, 0))  # swap channels from RGB to BGR

    def reindetify(self, testImagePath, train_image_dir):
        """Find the training image most similar to ``testImagePath``.

        Descriptors are computed for every image returned by
        ``getImages(train_image_dir)`` and for the test image, then compared
        by cosine similarity.

        Returns:
            A ``(class_name, similarity)`` pair, where ``class_name`` is the
            name of the directory containing the best-matching training
            image and ``similarity`` is its cosine similarity.

        Raises:
            ValueError: if no training descriptors could be computed.
        """
        # Central crop window sized to the net input. Cropping is currently
        # disabled, and get_descriptor ignores the value it receives.
        testImage = caffe.io.load_image(testImagePath)
        center = np.array([testImage.shape[0], testImage.shape[1]]) / 2.0
        crop_dims = np.array([
            self.net.blobs['data'].data.shape[2],
            self.net.blobs['data'].data.shape[3]
        ])
        crop = np.tile(center, (1, 2))[0] + np.concatenate(
            [-crop_dims / 2.0, crop_dims / 2.0])
        crop = crop.astype(int)

        trainImages = getImages(train_image_dir)
        print(trainImages)

        trainDescriptors = self.getDescriptors(self.args, trainImages, caffe,
                                               self.net, self.transformer)
        if not trainDescriptors:
            raise ValueError(
                'No training images found in {}'.format(train_image_dir))

        print('Results:')
        descriptor = self.get_descriptor(self.args, caffe, crop, testImagePath,
                                         self.net, self.transformer)
        # Materialise the dict views: on Python 3 they cannot be indexed, and
        # taking both lists from the same dict keeps paths and vectors aligned.
        trainPaths = list(trainDescriptors.keys())
        trainVectors = list(trainDescriptors.values())
        similar = cosine_similarity(
            np.array(descriptor).reshape(1, -1), trainVectors)[0]
        print(similar)
        idx = similar.argsort()[::-1]  # indices by descending similarity
        # The class is the name of the folder that holds the best match.
        class_name = os.path.basename(os.path.dirname(trainPaths[idx[0]]))
        return class_name, similar[idx[0]]

    def getDescriptors(self, args, images, caffe, net, transformer, crop=None):
        """Compute a descriptor for every image path in ``images``.

        Args:
            args: options object; ``args.verbose`` enables progress output.
            images: mapping of folder to an iterable of image paths.
            caffe, net, transformer, crop: forwarded to ``get_descriptor``.

        Returns:
            A dict mapping each image path to its descriptor.
        """
        descriptors = {}
        for folder, imagePaths in images.items():
            for index, imagePath in enumerate(imagePaths):
                if args.verbose:
                    print(folder, index, imagePath)
                descriptors[imagePath] = self.get_descriptor(
                    args, caffe, crop, imagePath, net, transformer)
        return descriptors

    def get_descriptor(self, args, caffe, crop, imagePath, net, transformer):
        """Return the ``args.descriptorLayer`` activations for one image.

        The image is produced by ``self.aligner.align(imagePath)``, divided
        by 255, preprocessed with ``transformer`` and run through ``net`` up
        to the descriptor layer. ``caffe`` and ``crop`` are accepted for
        interface compatibility but are not used.

        Returns:
            The first output row of the descriptor layer, as a list.
        """
        # NOTE(review): assumes align() always returns an array; a None
        # result would fail on .astype. Confirm against Aligner.
        image = self.aligner.align(imagePath).astype(np.float32)
        image /= 255
        transformed_image = transformer.preprocess('data', image)
        # copy the image data into the memory allocated for the net
        net.blobs['data'].data[...] = transformed_image
        output = net.forward(
            end=args.descriptorLayer)[args.descriptorLayer][0].tolist()
        return output