def get_titles(self):
    """
    Return the titles found by both the exact and the fuzzy lookup.

    Every (title, year) pair produced by ``self.parser.parse()`` is looked
    up with ``self.db.query_title`` and with ``self.db.query_fuzzy``. Each
    batch of results is flattened, and the set intersection of the two
    batches is returned.
    """
    candidates = self.parser.parse()
    logger.debug("Found possible titles %s for title %s" % (candidates, self.title))

    # NOTE(review): candidates is iterated twice below, so parse() is
    # assumed to return a re-iterable collection -- TODO confirm.
    exact_results = []
    for title, year in candidates:
        exact_results.append(self.db.query_title(title, year))
    exact_matches = flatten(exact_results)

    fuzzy_results = []
    for title, year in candidates:
        fuzzy_results.append(self.db.query_fuzzy(title, year))
    fuzzy_matches = flatten(fuzzy_results)

    # Keep only the titles that both lookups agree on.
    titles = set(exact_matches) & set(fuzzy_matches)
    logger.debug("Found normalized titles %s" % titles)
    return titles
def populate_actors_file(input_files=INPUT_FILES, write_file=OUTPUT_FILE,
                         cleanup=True):
    """
    Merge scraped name files into one de-duplicated output file.

    input_files: iterable of paths, each passed to clean_scraped_data.
    write_file:  destination handed to write_scraped_data.
    cleanup:     when true, delete every file in input_files afterwards.

    The cleaned rows are flattened, converted to tuples and de-duplicated
    through a set. Each unique row is then written as a dict with the keys
    'f_name' (row[0]) and 'l_name' (row[1]).
    """
    # Materialise once: the paths are needed again for cleanup, and the
    # caller may have passed a one-shot iterator.
    input_files = list(input_files)

    cleaned = [clean_scraped_data(path) for path in input_files]
    unique_rows = {tuple(row) for row in flatten(cleaned)}

    # A real list (not a lazy map object) so write_scraped_data receives
    # the same kind of value on Python 2 and Python 3.
    data_dicts = [
        {
            'f_name': row[0],
            'l_name': row[1],
        }
        for row in unique_rows
    ]
    write_scraped_data(write_file, data_dicts)

    if cleanup:
        # Remove the files that were actually processed. The previous code
        # removed the module-level INPUT_FILES regardless of the argument,
        # and did so through map(), which is lazy on Python 3 and therefore
        # never deleted anything there.
        for path in input_files:
            os.remove(path)
def extract_credits(path, wildcard=EXT):
    """
    Collect the cleaned text of every image found at path/*wildcard.

    path:     directory holding the images of a processed movie
    wildcard: file-name suffix used to build the glob pattern

    Each matching file is handed to _text_from_img together with path.
    The resulting texts are passed through StringCleaner.clean and the
    cleaned output is flattened. The flattened value is returned as-is;
    no matching against known names happens in this function.
    """
    logger.debug("Extracting credits at %s" % path)
    cleaner = StringCleaner()

    pattern = "%s/*%s" % (path, wildcard)
    img_text = []
    for img_path in glob(pattern):
        img_text.append(_text_from_img(img_path, path))
    logger.debug("img_text %s" % img_text)

    cleaned_per_image = map(cleaner.clean, img_text)
    clean_text = flatten(cleaned_per_image)
    logger.debug("clean_text %s" % clean_text)
    return clean_text
def _split_text(clean_text):
    """
    Split cleaned text into terms for fuzzy matching.

    clean_text is treated as an iterable of token collections. parse_name
    is applied to every token of every collection, and the per-collection
    results are flattened before being returned.
    """
    def _parse_tokens(tokens):
        # One entry of clean_text -> parse_name applied to each token.
        return map(parse_name, tokens)

    parsed_groups = map(_parse_tokens, clean_text)
    return flatten(parsed_groups)