示例#1
0
    def test_date(self):
        """Check ``search_year`` and ``search_date`` on fixed and relative dates.

        Every assertion compares the returned ``(value, span)`` pair with the
        expected value and the expected ``(start, end)`` character offsets of
        the match inside the input string. Inputs that must not produce a
        match are expected to yield ``(None, None)``.
        """
        self.assertEqual(search_year(' in the year 2000... '), (2000, (13, 17)))
        self.assertEqual(search_year(' they arrived in 1492. '), (None, None))

        # Dates relative to the day the test runs: today, ~3 years ahead and
        # ~27 years back.
        today = date.today()
        future = today + timedelta(days=1000)
        past = today - timedelta(days=10000)

        # Two-digit years are kept as zero-padded strings. Converting them to
        # int would turn e.g. 2005 into '5', producing a 7-character date
        # ('20-04-5') that no longer matches the 8-character spans asserted
        # below.
        today_year_2 = '%02d' % (today.year % 100)
        future_year_2 = '%02d' % (future.year % 100)
        past_year_2 = '%02d' % (past.year % 100)

        # ISO dates at different positions in the string
        self.assertEqual(search_date(' Something before 2002-04-22 '), (date(2002, 4, 22), (18, 28)))
        self.assertEqual(search_date(' 2002-04-22 Something after '), (date(2002, 4, 22), (1, 11)))

        # four-digit year first or last
        self.assertEqual(search_date(' This happened on 2002-04-22. '), (date(2002, 4, 22), (18, 28)))
        self.assertEqual(search_date(' This happened on 22-04-2002. '), (date(2002, 4, 22), (18, 28)))

        # day-month-year with a two-digit year
        self.assertEqual(search_date(' This happened on 13-04-%s. ' % (today_year_2,)), (date(today.year, 4, 13), (18, 26)))
        self.assertEqual(search_date(' This happened on 22-04-%s. ' % (future_year_2,)), (date(future.year, 4, 22), (18, 26)))
        self.assertEqual(search_date(' This happened on 20-04-%s. ' % (past_year_2,)), (date(past.year, 4, 20), (18, 26)))

        # explicit year_first hint on an otherwise ambiguous date
        self.assertEqual(search_date(' This happened on 13-06-14. ', year_first=True), (date(2013, 6, 14), (18, 26)))
        self.assertEqual(search_date(' This happened on 13-05-14. ', year_first=False), (date(2014, 5, 13), (18, 26)))

        # month-day-year with a two-digit year
        self.assertEqual(search_date(' This happened on 04-13-%s. ' % (today_year_2,)), (date(today.year, 4, 13), (18, 26)))
        self.assertEqual(search_date(' This happened on 04-22-%s. ' % (future_year_2,)), (date(future.year, 4, 22), (18, 26)))
        self.assertEqual(search_date(' This happened on 04-20-%s. ' % (past_year_2,)), (date(past.year, 4, 20), (18, 26)))

        # out-of-range day/month values must not be reported as dates
        self.assertEqual(search_date(' This happened on 35-12-%s. ' % (today_year_2,)), (None, None))
        self.assertEqual(search_date(' This happened on 37-18-%s. ' % (future_year_2,)), (None, None))
        self.assertEqual(search_date(' This happened on 44-42-%s. ' % (past_year_2,)), (None, None))

        # str(date) renders as YYYY-MM-DD
        self.assertEqual(search_date(' This happened on %s. ' % (today, )), (today, (18, 28)))
        self.assertEqual(search_date(' This happened on %s. ' % (future, )), (future, (18, 28)))
        self.assertEqual(search_date(' This happened on %s. ' % (past, )), (past, (18, 28)))

        self.assertEqual(search_date(' released date: 04-03-1901? '), (None, None))

        self.assertEqual(search_date(' There\'s no date in here. '), (None, None))

        # the same ambiguous string under each ordering hint
        self.assertEqual(search_date(' Something 01-02-03 '), (date(2003, 2, 1), (11, 19)))
        self.assertEqual(search_date(' Something 01-02-03 ', year_first=False, day_first=True), (date(2003, 2, 1), (11, 19)))
        self.assertEqual(search_date(' Something 01-02-03 ', year_first=True), (date(2001, 2, 3), (11, 19)))
        self.assertEqual(search_date(' Something 01-02-03 ', day_first=False), (date(2003, 1, 2), (11, 19)))
示例#2
0
def guess_year_skip_first(string):
    """Guess a year from the text that follows the first year in *string*.

    ``search_year`` locates the first year; ``guess_year`` is then applied to
    the remainder of the string after that match.

    Returns the guess from ``guess_year`` together with its span shifted back
    into the coordinates of *string*, or ``(None, None)`` when either lookup
    finds nothing.
    """
    first_year, first_span = search_year(string)
    if not first_year:
        return None, None

    # everything up to the end of the first match is skipped
    offset = first_span[1]
    later_guess, later_span = guess_year(string[offset:])
    if not later_guess:
        return None, None

    # later_span is relative to the sliced string; shift it back
    return later_guess, (later_span[0] + offset, later_span[1] + offset)
示例#3
0
 def guess_year(string, node=None, options=None):
     """Wrap the result of ``search_year`` into a guess dictionary.

     ``node`` and ``options`` are accepted but not used by this function.

     Returns ``({"year": year}, span)`` when ``search_year`` reports a year,
     otherwise ``(None, None)``.
     """
     found_year, found_span = search_year(string)
     if not found_year:
         return None, None
     return {"year": found_year}, found_span
def guess_year(string):
    """Return ``({'year': year}, span)`` for the year found in *string*.

    The lookup is delegated to ``search_year``; when it reports no year the
    result is ``(None, None)``.
    """
    found_year, found_span = search_year(string)
    return ({'year': found_year}, found_span) if found_year else (None, None)
示例#5
0
 def guess_year(self, string):
     """Wrap the year found by ``search_year`` into a guess dictionary.

     Returns ``({"year": year}, span)`` when a year is reported, otherwise
     ``(None, None)``. No instance state is read.
     """
     detected, where = search_year(string)
     if detected:
         return {"year": detected}, where
     return None, None
示例#6
0
def guess_groups(string, result, filetype):
    """Split ``string`` into groups and pair them with the guesses found.

    A sequence of matchers is run, in order, over a working copy of
    ``string``: dates, years, video/episode regexps, websites, release
    groups, known properties, weak episode numbers and languages. Every
    match is turned into a guess, appended to ``result``, and its span is
    handed to ``blank_region`` so that later matchers see the updated text.
    The order of the matchers therefore matters.

    Args:
        string: the text to analyse.
        result: list that receives every guess found (mutated in place).
        filetype: one of ``movie``, ``moviesubtitle``, ``episode`` or
            ``episodesubtitle``; it selects which matchers are run.

    Returns:
        The ``zip`` of three sequences: the groups cut from ``string``, the
        groups cut from the working copy at the same spans, and the guesses
        whose region starts at each group's offset (``None`` when no region
        starts there).
    """
    # add sentinels so we can match a separator char at either end of
    # our groups, even when they are at the beginning or end of the string
    # we will adjust the span accordingly later
    #
    # filetype can either be movie, moviesubtitle, episode, episodesubtitle
    current = " " + string + " "

    regions = []  # list of ((start, end), guess) for every matched region

    def guessed(match_dict, confidence):
        """Build a formatted guess, record it in ``result`` and return it."""
        guess = format_guess(Guess(match_dict, confidence=confidence))
        result.append(guess)
        log.debug("Found with confidence %.2f: %s" % (confidence, guess))
        return guess

    def update_found(string, guess, span, span_adjust=(0, 0)):
        """Record the (adjusted) span for ``guess`` and blank it in ``string``.

        ``span_adjust`` is added element-wise to ``span``; the regexps use it
        to exclude the separator characters they had to match.
        """
        span = (span[0] + span_adjust[0], span[1] + span_adjust[1])
        regions.append((span, guess))
        return blank_region(string, span)

    # try to find dates first, as they are very specific
    date, span = search_date(current)
    if date:
        guess = guessed({"date": date}, confidence=1.0)
        current = update_found(current, guess, span)

    # for non episodes only, look for year information
    if filetype not in ("episode", "episodesubtitle"):
        year, span = search_year(current)
        if year:
            guess = guessed({"year": year}, confidence=1.0)
            current = update_found(current, guess, span)

    # specific regexps (ie: cd number, season X episode, ...)
    for rexp, confidence, span_adjust in video_rexps:
        match = re.search(rexp, current, re.IGNORECASE)
        if match:
            metadata = match.groupdict()
            # is this the better place to put it? (maybe, as it is at least the soonest that we can catch it)
            if "cdNumberTotal" in metadata and metadata["cdNumberTotal"] is None:
                del metadata["cdNumberTotal"]

            guess = guessed(metadata, confidence=confidence)
            current = update_found(current, guess, match.span(), span_adjust)

    if filetype in ("episode", "episodesubtitle"):
        for rexp, confidence, span_adjust in episode_rexps:
            match = re.search(rexp, current, re.IGNORECASE)
            if match:
                metadata = match.groupdict()
                guess = guessed(metadata, confidence=confidence)
                current = update_found(current, guess, match.span(), span_adjust)

    # Now websites, but as exact string instead of regexps.
    # The confidence is given explicitly: an exact hit on a known site name
    # is not tied to any regexp, so it must not depend on whichever value
    # the loops above happened to leave in ``confidence`` (which differs per
    # filetype and is unbound when those loops run zero iterations).
    website_confidence = 1.0
    clow = current.lower()
    for site in websites:
        pos = clow.find(site.lower())
        if pos != -1:
            guess = guessed({"website": site}, confidence=website_confidence)
            current = update_found(current, guess, (pos, pos + len(site)))
            # the working copy changed, refresh its lowercase mirror
            clow = current.lower()

    # release groups have certain constraints, cannot be included in the previous general regexps
    group_names = [
        r"\.(Xvid)-(?P<releaseGroup>.*?)[ \.]",
        r"\.(DivX)-(?P<releaseGroup>.*?)[\. ]",
        r"\.(DVDivX)-(?P<releaseGroup>.*?)[\. ]",
    ]
    for rexp in group_names:
        match = re.search(rexp, current, re.IGNORECASE)
        if match:
            metadata = match.groupdict()
            # the codec is the first (unnamed) group of each pattern
            metadata.update({"videoCodec": match.group(1)})
            guess = guessed(metadata, confidence=0.8)
            # (1, -1) drops the leading '.' and the trailing separator
            current = update_found(current, guess, match.span(), span_adjust=(1, -1))

    # common well-defined words and regexps
    confidence = 1.0  # for all of them
    for prop, value, pos, end in find_properties(current):
        guess = guessed({prop: value}, confidence=confidence)
        current = update_found(current, guess, (pos, end))

    # weak guesses for episode number, only run it if we don't have an estimate already
    if filetype in ("episode", "episodesubtitle"):
        if not any("episodeNumber" in match for match in result):
            for rexp, _, span_adjust in weak_episode_rexps:
                match = re.search(rexp, current, re.IGNORECASE)
                if match:
                    metadata = match.groupdict()
                    epnum = int(metadata["episodeNumber"])
                    if epnum > 100:
                        # numbers above 100 are read as <season><2-digit episode>
                        guess = guessed({"season": epnum // 100, "episodeNumber": epnum % 100}, confidence=0.6)
                    else:
                        guess = guessed(metadata, confidence=0.3)
                    current = update_found(current, guess, match.span(), span_adjust)

    # try to find languages now
    language, span, confidence = search_language(current)
    while language:
        # is it a subtitle language? (the word "sub" appears before the match)
        if "sub" in clean_string(current[: span[0]]).lower().split(" "):
            guess = guessed({"subtitleLanguage": language}, confidence=confidence)
        else:
            guess = guessed({"language": language}, confidence=confidence)
        current = update_found(current, guess, span)

        language, span, confidence = search_language(current)

    # remove our sentinels now and adjust spans accordingly
    assert current[0] == " " and current[-1] == " "
    current = current[1:-1]
    regions = [((start - 1, end - 1), guess) for (start, end), guess in regions]

    # split into '-' separated subgroups (with required separator chars
    # around the dash)
    # NOTE(review): a dash at index 0 ends this loop before any later dash
    # is looked at -- confirm that this is intended.
    didx = current.find("-")
    while didx > 0:
        regions.append(((didx, didx), None))
        didx = current.find("-", didx + 1)

    # cut our final groups, and rematch the guesses to the group that created
    # it, None if it is a leftover group
    region_spans = [span for span, guess in regions]
    string_groups = split_on_groups(string, region_spans)
    remaining_groups = split_on_groups(current, region_spans)
    guesses = []

    # walk the groups while tracking each one's start offset in ``string``
    pos = 0
    for group in string_groups:
        found = False
        for span, guess in regions:
            if span[0] == pos:
                guesses.append(guess)
                found = True
        if not found:
            guesses.append(None)

        pos += len(group)

    return zip(string_groups, remaining_groups, guesses)