def worker():
    tokenizer = rltk.CrfTokenizer()

    # load datasets
    ds_imdb = rltk.Dataset(reader=rltk.JsonLinesReader(imdb_file),
                           record_class=IMDBRecord,
                           adapter=rltk.MemoryKeyValueAdapter())
    ds_afi = rltk.Dataset(reader=rltk.JsonLinesReader(afi_file),
                          record_class=AFIRecord,
                          adapter=rltk.MemoryKeyValueAdapter())

    valid_match = []
    for r_imdb in ds_imdb:
        # test this record against every AFI record, keeping the best-scoring match
        optimum = (None, MY_TRESH)
        for r_afi in ds_afi:
            result, confidence = rule_based_method(r_imdb, r_afi)
            if result and confidence > optimum[1]:
                optimum = (r_afi, confidence)

        if optimum[0] is not None:
            r_afi, confidence = optimum
            valid_match.append({
                'imdb_movie': r_imdb.raw_object['url'],
                'afi_movie': r_afi.raw_object['url']
            })
        else:
            valid_match.append({
                'imdb_movie': r_imdb.raw_object['url'],
                'afi_movie': None
            })

    with open(result_file, 'w') as fout:
        fout.write(json.dumps(valid_match, indent=4))
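# worker() assumes a rule_based_method(r_imdb, r_afi) returning a
# (matched, confidence) pair. Below is a minimal sketch of such a scorer,
# not the original implementation: it assumes the record classes expose
# name_string / name_tokens properties (see the record-class sketch at the
# end) and reuses the MY_TRESH threshold; the 0.7 / 0.3 weights are
# illustrative.
def rule_based_method(r_imdb, r_afi):
    # token-set overlap of the two titles (Jaccard index)
    token_score = rltk.jaccard_index_similarity(r_imdb.name_tokens,
                                                r_afi.name_tokens)
    # character-level similarity as a tie-breaker
    string_score = rltk.levenshtein_similarity(r_imdb.name_string,
                                               r_afi.name_string)
    confidence = 0.7 * token_score + 0.3 * string_score
    return confidence > MY_TRESH, confidence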
import rltk

# to maintain ordered dictionaries
from collections import OrderedDict
# JSON utilities
import json
# regular expressions
import re

# noise tokens to ignore when comparing hair/eye-color attribute strings
bad_words_hair_eyes_cat = ['unknown', 'various', 'unrevealed',
                           'eye', 'with', 'formerly', 'none', 'not',
                           'variable', 'n/a', 'hair', 'varying', 'while']

DC_COMICS_PUB_ID = '_dc-comics_4010-10_'
MARVEL_PUB_ID = '_marvel_4010-31_'

tokenizer = rltk.CrfTokenizer()

'''
MOVIE_CHARS_DICT --> WIKIA_CHARS_DICT
ISSUE_CHARS_DICT --> WIKIA_CHARS_DICT
'''
MOVIE_CHARS_DICT = dict()
ISSUE_CHARS_DICT = dict()
WIKIA_CHARS_DICT = dict()

SIM_CHARS__MOVIE_TO_WIKIA = dict()
SIM_CHARS__ISSUE_TO_WIKIA = dict()

'''
ISSUE_TEAMS_DICT --> WIKIA_TEAMS_DICT
'''
ISSUE_TEAMS_DICT = dict()
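# Illustrative only: one plausible use of bad_words_hair_eyes_cat is to
# strip noise tokens from a hair/eye attribute string before comparison.
# clean_attribute() is a hypothetical helper, not part of the module above.
def clean_attribute(value):
    tokens = tokenizer.tokenize(value.lower())
    return [t for t in tokens if t not in bad_words_hair_eyes_cat]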
def name_tokens(self):
    # token set of the record's name, for set-based similarity measures
    return set(rltk.CrfTokenizer().tokenize(self.name_string))
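# In RLTK this method would normally live on an rltk.Record subclass. The
# sketch below shows one plausible host class: the 'url' id field matches
# worker() above, while the 'name' field is an assumption about the raw
# JSON, not taken from the original code.
class IMDBRecord(rltk.Record):
    @rltk.cached_property
    def id(self):
        return self.raw_object['url']

    @rltk.cached_property
    def name_string(self):
        return self.raw_object.get('name', '')

    @property
    def name_tokens(self):
        return set(rltk.CrfTokenizer().tokenize(self.name_string))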