Пример #1
0
 def get_titles(self):
     """Return the titles confirmed by both the exact and the fuzzy lookup.

     The (title, year) candidates come from ``self.parser.parse()``. Each
     candidate is sent to ``self.db.query_title`` and then to
     ``self.db.query_fuzzy``; both result collections are flattened and
     the set of entries present in both is returned.
     """
     candidates = self.parser.parse()
     logger.debug("Found possible titles %s for title %s" % (candidates, self.title))

     # Run every exact query before any fuzzy query (same order as before).
     exact_results = []
     for name, year in candidates:
         exact_results.append(self.db.query_title(name, year))
     exact_matches = flatten(exact_results)

     fuzzy_results = []
     for name, year in candidates:
         fuzzy_results.append(self.db.query_fuzzy(name, year))
     fuzzy_matches = flatten(fuzzy_results)

     # Keep only the entries reported by both lookups.
     titles = set(exact_matches) & set(fuzzy_matches)
     logger.debug("Found normalized titles %s" % titles)
     return titles
Пример #2
0
def populate_actors_file(input_files=INPUT_FILES,
                         write_file=OUTPUT_FILE, cleanup=True):
    """Merge scraped name records from ``input_files`` into ``write_file``.

    Every input file is run through ``clean_scraped_data``; the results are
    flattened, de-duplicated (as tuples) and handed to
    ``write_scraped_data`` as dicts with the keys ``'f_name'`` and
    ``'l_name'`` (taken from positions 0 and 1 of each record).

    Args:
        input_files: iterable of input file paths; defaults to INPUT_FILES.
        write_file: destination passed to ``write_scraped_data``.
        cleanup: when true, delete the files in ``input_files`` after the
            output has been written.
    """
    # Materialize once: the inputs are walked twice (cleaning and cleanup),
    # which would silently skip the cleanup for a one-shot iterator.
    input_files = list(input_files)

    cleaned = [clean_scraped_data(path) for path in input_files]
    unique_records = {tuple(record) for record in flatten(cleaned)}

    data_dicts = [
        {
            'f_name': record[0],
            'l_name': record[1],
        }
        for record in unique_records
    ]
    write_scraped_data(write_file, data_dicts)

    if cleanup:
        # Explicit loop: a bare map() is lazy on Python 3 and would remove
        # nothing. Remove the files that were actually processed, not the
        # module-level default list.
        for path in input_files:
            os.remove(path)
Пример #3
0
def extract_credits(path, wildcard=EXT):
    """
        Extract and clean the text of every image found under path.

        path: input path where the processed movie is contained
        wildcard: suffix appended to the path/* glob pattern
                  (defaults to EXT)

        Each file matching path/*wildcard is handed to _text_from_img
        (the text-extraction step), every result is passed through
        StringCleaner.clean, and the flattened cleaned results are
        returned.
    """
    logger.debug("Extracting credits at %s" % path)

    cleaner = StringCleaner()

    pattern = "%s/*%s" % (path, wildcard)
    raw_text = []
    for image in glob(pattern):
        raw_text.append(_text_from_img(image, path))
    logger.debug("img_text %s" % raw_text)

    cleaned = flatten(map(cleaner.clean, raw_text))
    logger.debug("clean_text %s" % cleaned)

    return cleaned
Пример #4
0
def _split_text(clean_text):
    """
        Break cleaned text into terms for fuzzy matching.

        parse_name is applied to every token of every entry in
        clean_text, and the per-entry results are flattened.
    """
    def _parse_tokens(tokens):
        # Parse each token of a single entry.
        return map(parse_name, tokens)

    return flatten(map(_parse_tokens, clean_text))