def worker():
    # load the IMDB and AFI datasets into in-memory rltk Datasets
    ds_imdb = rltk.Dataset(reader=rltk.JsonLinesReader(imdb_file),
                           record_class=IMDBRecord,
                           adapter=rltk.MemoryKeyValueAdapter())
    ds_afi = rltk.Dataset(reader=rltk.JsonLinesReader(afi_file),
                          record_class=AFIRecord,
                          adapter=rltk.MemoryKeyValueAdapter())
    valid_match = []  # one entry per IMDB movie
    for r_imdb in ds_imdb:
        # compare this IMDB record against every AFI record, keeping the best
        optimum = (None, MY_TRESH)  # (best AFI record, confidence); a match must beat the threshold
        for r_afi in ds_afi:
            result, confidence = rule_based_method(r_imdb, r_afi)
            if result and confidence > optimum[1]:
                optimum = (r_afi, confidence)

        # record the best AFI match, or None when nothing beat the threshold
        best_afi = optimum[0]
        valid_match.append({
            'imdb_movie': r_imdb.raw_object['url'],
            'afi_movie': best_afi.raw_object['url'] if best_afi is not None else None
        })

    # write all matches out as pretty-printed JSON
    with open(result_file, 'w') as fout:
        fout.write(json.dumps(valid_match, indent=4))
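
# rule_based_method is defined elsewhere in this write-up; the function below
# is only a hedged sketch of what such a rule could look like. It assumes each
# record exposes a name_string property (an assumption, not the actual rule).
def rule_based_method_sketch(r_imdb, r_afi):
    # score title similarity with rltk's Levenshtein similarity (0..1)
    confidence = rltk.levenshtein_similarity(r_imdb.name_string.lower(),
                                             r_afi.name_string.lower())
    # a pair only counts as a candidate match when it beats the threshold
    return confidence > MY_TRESH, confidence
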
import rltk
# to maintain ordered dictionaries
from collections import OrderedDict
# JSON utilities
import json
# regular expressions
import re

# noise tokens ignored when comparing hair/eye attribute values
bad_words_hair_eyes_cat = ['unknown', 'various', 'unrevealed',
                           'eye', 'with', 'formerly', 'none', 'not',
                           'variable', 'n/a', 'hair', 'varying', 'while']
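
# Hypothetical helper (an illustration, not part of the original script)
# showing how the stop-word list above could be used: split an attribute
# string into lowercase tokens and drop the uninformative ones.
def clean_hair_eye_value_sketch(value):
    tokens = re.split(r'[\s,;]+', value.lower())
    return {t for t in tokens if t and t not in bad_words_hair_eyes_cat}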

# publisher identifiers for DC Comics and Marvel records
DC_COMICS_PUB_ID = '_dc-comics_4010-10_'
MARVEL_PUB_ID = '_marvel_4010-31_'

# shared CRF tokenizer, built once and reused instead of one per call
tokenizer = rltk.CrfTokenizer()

'''
Linking directions for the character dictionaries:
    MOVIE_CHARS_DICT --> WIKIA_CHARS_DICT
    ISSUE_CHARS_DICT --> WIKIA_CHARS_DICT
'''
MOVIE_CHARS_DICT = dict()  # characters found in movie records
ISSUE_CHARS_DICT = dict()  # characters found in comic-issue records
WIKIA_CHARS_DICT = dict()  # characters found in wikia records
SIM_CHARS__MOVIE_TO_WIKIA = dict()  # movie character -> matched wikia character
SIM_CHARS__ISSUE_TO_WIKIA = dict()  # issue character -> matched wikia character

'''
Linking direction for the team dictionaries:
    ISSUE_TEAMS_DICT --> WIKIA_TEAMS_DICT
'''
ISSUE_TEAMS_DICT = dict()  # teams found in comic-issue records
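
# Sketch of how the SIM_* mappings above could be filled (assumptions: the
# dictionary keys are entity names; the token-Jaccard score and the 0.5
# default threshold are illustrative, not the write-up's actual rule). For
# each source name, keep the target name with the highest similarity.
def link_entities_sketch(src_dict, dst_dict, threshold=0.5):
    links = dict()
    for src_name in src_dict:
        best_name, best_score = None, threshold
        for dst_name in dst_dict:
            score = rltk.jaccard_index_similarity(
                set(tokenizer.tokenize(src_name)),
                set(tokenizer.tokenize(dst_name)))
            if score > best_score:
                best_name, best_score = dst_name, score
        links[src_name] = best_name
    return links
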
    # record-class helper: the set of tokens in the record's name
    # (reuses the module-level tokenizer rather than constructing one per call)
    def name_tokens(self):
        return set(tokenizer.tokenize(self.name_string))
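
# For context: a method like name_tokens(self) above lives inside an
# rltk.Record subclass. The class below is a hypothetical sketch; the 'url'
# and 'name' fields are assumptions about the JSON-lines input, not the
# write-up's actual schema.
class WikiaRecordSketch(rltk.Record):
    @property
    def id(self):
        # rltk requires each record to expose a unique id
        return self.raw_object['url']

    @property
    def name_string(self):
        return self.raw_object.get('name', '')

    def name_tokens(self):
        return set(tokenizer.tokenize(self.name_string))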