def localTest_parsePosterThumbnailData_None(self):
    """ Checks that translit.detranslify() turns Latin-transliterated text back into Cyrillic.

    NOTE(review): the method name refers to poster thumbnail data, but the body
    only exercises detranslify(); the name is kept as is.
    """
    # Pairs of (transliterated input, value expected after .encode('utf8')).
    samples = (
        ('Operatsiya Y i drugie priklyucheniya Shurika',
         'Операция Ы и другие приключения Шурика'),
        ("D'Artanyan i tri mushketyora[kinokopilka].torrent",
         'Д‘Артанян и три мушкетёра[кинокопилка].торрент'),
    )
    for latinStr, expected in samples:
        actual = translit.detranslify(latinStr).encode('utf8')
        self._assertEquals(expected, actual, 'Wrong translitirated string')
def score_title(entry, fileyear, medianame, idx):
    """ Scores how well a search result entry matches a media title and year.

    Starts from 90, then applies a result-position adjustment, a year bonus or
    penalty, the lowest title penalty found among the plain and
    detransliterated comparisons, and a penalty driven by check_main_poster().

    @param entry Search result mapping; 'nameRU' and 'id' are read
        unconditionally, 'year' and 'nameEN' only when present.
    @param fileyear Year to match against; a falsy value means unknown.
    @param medianame Title to match against the entry titles.
    @param idx Position of the entry in the search results.
    @return The score, capped at 100.
    """
    # Read the year once with .get() so that a missing key cannot break the
    # debug log below; the parsing further down already tolerates its absence.
    entryyear = entry.get('year')

    score = 90
    Log.Debug('### Start scoring %s (%s) <-> %s (%s) idx = %s',
              medianame, fileyear, entry['nameRU'], entryyear, idx)

    # result position penalty
    # NOTE(review): the bonus here checks idx == 1 while the bonus near the end
    # checks idx == 0 - confirm which index base the caller uses.
    if Prefs['search.trust_kp'] is True and idx == 1:
        score = score + 10
    else:
        score = score - idx * const.SCORE_PENALTY_ITEM_ORDER

    yearpenalty = const.SCORE_PENALTY_YEAR / 3  # if we have no year
    mediayear = int(fileyear or 0)
    year = int(re.sub('[^0-9]', '', entryyear or '0') or '0')
    if mediayear != 0 and year != 0:
        yeardiff = abs(mediayear - year)
        if yeardiff < 1:
            # Exact year match: bonus instead of a penalty.
            score = score + 10
            yearpenalty = 0
        elif yeardiff == 1:
            yearpenalty = int(const.SCORE_PENALTY_YEAR / 4)
        elif yeardiff == 2:
            yearpenalty = int(const.SCORE_PENALTY_YEAR / 3)
        else:
            # Beyond two years the penalty grows with the distance.
            yearpenalty = yeardiff * int(const.SCORE_PENALTY_YEAR / 2)
    score = score - yearpenalty

    cleartitle = clear_title(entry['nameRU'])
    titlepenalty = compute_title_penalty(medianame, cleartitle)
    alttitlepenalty = 100
    if 'nameEN' in entry:
        alttitlepenalty = compute_title_penalty(medianame, entry['nameEN'])
    Log.Debug('yearpenalty = %s, titlepenalty = %s, alttitlepenalty = %s',
              yearpenalty, titlepenalty, alttitlepenalty)

    # Retry the comparisons with the detransliterated media name and keep the
    # lowest penalty. This is a best-effort refinement: on failure the
    # penalties computed so far are kept.
    try:
        detranslifiedmedianame = translit.detranslify(medianame)
        detranslifiedtitlepenalty = compute_title_penalty(detranslifiedmedianame, cleartitle)
        titlepenalty = min(detranslifiedtitlepenalty, titlepenalty)
        if 'nameEN' in entry:
            detranslifiedalttitlepenalty = compute_title_penalty(
                detranslifiedmedianame, entry['nameEN'])
            alttitledetranslified = translit.detranslify(entry['nameEN'])
            reverseddetranslifiedalttitlepenalty = compute_title_penalty(
                detranslifiedmedianame, alttitledetranslified)
            alttitlepenalty = min(detranslifiedalttitlepenalty,
                                  reverseddetranslifiedalttitlepenalty,
                                  alttitlepenalty)
    except Exception:
        # Exception (not a bare except) so KeyboardInterrupt/SystemExit propagate.
        Log('Error computing title penalty for %s', medianame)

    titlepenalty = min(titlepenalty, alttitlepenalty)
    score = score - titlepenalty

    if check_main_poster(entry['id']) is True:
        score = score - const.SCORE_PENALTY_MAIN_POSTER

    # Give a weakly scored entry at index 0 a few points back.
    if idx == 0 and score <= 80:
        score = score + 5

    Log.Debug('### End scoring %s', medianame)
    return min(score, 100)
def scoreTitle(entry, media, mediaName, idx):
    """ Scores a search result entry against a media item.

    Starts from 100 and subtracts an item-order penalty, a year penalty and the
    lowest title penalty found among the plain and detransliterated
    comparisons; an entry at index 0 scoring 80 or less gets 5 points back.

    @param entry Search result mapping; 'nameRU' is required, 'year' and
        'nameEN' are optional.
    @param media Object whose `year` attribute is compared with the entry year.
    @param mediaName Title compared with the entry titles.
    @param idx Position of the entry in the search results.
    @return The resulting score.
    """
    score = 100
    score = score - (idx * const.SCORE_PENALTY_ITEM_ORDER)

    yearpenalty = const.SCORE_PENALTY_YEAR
    mediayear = int(media.year or 0)
    # A missing or empty entry year is treated as unknown (0) so that the
    # "unknown year" branch below is reached instead of raising here.
    year = int(re.sub('[^0-9]', '', entry.get('year') or '') or 0)
    if mediayear != 0 and year != 0:
        yeardiff = abs(mediayear - year)
        if not yeardiff:
            yearpenalty = 0
        elif yeardiff == 1:
            yearpenalty = int(const.SCORE_PENALTY_YEAR / 4)
        elif yeardiff == 2:
            yearpenalty = int(const.SCORE_PENALTY_YEAR / 3)
        # A difference of three or more keeps the full penalty set above.
    else:
        # If year is unknown, don't penalize the score too much.
        yearpenalty = int(const.SCORE_PENALTY_YEAR / 3)
    score = score - yearpenalty

    titlepenalty = computeTitlePenalty(mediaName, entry['nameRU'])
    alttitlepenalty = 100
    if 'nameEN' in entry:
        alttitlepenalty = computeTitlePenalty(mediaName, entry['nameEN'])

    # Retry the comparisons with the detransliterated media name and keep the
    # lowest penalty of each kind.
    try:
        detranslifiedmedianame = translit.detranslify(mediaName)
        detranslifiedtitlepenalty = computeTitlePenalty(
            detranslifiedmedianame, entry['nameRU'])
        titlepenalty = min(detranslifiedtitlepenalty, titlepenalty)
        if 'nameEN' in entry:
            detranslifiedalttitlepenalty = computeTitlePenalty(
                detranslifiedmedianame, entry['nameEN'])
            alttitledetranslified = translit.detranslify(entry['nameEN'])
            reverseddetranslifiedalttitlepenalty = computeTitlePenalty(
                detranslifiedmedianame, alttitledetranslified)
            alttitlepenalty = min(detranslifiedalttitlepenalty,
                                  reverseddetranslifiedalttitlepenalty,
                                  alttitlepenalty)
    except Exception:
        # Best-effort refinement: on failure keep the penalties computed so
        # far. Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        pass

    titlepenalty = min(titlepenalty, alttitlepenalty)
    score = score - titlepenalty

    # Give a weakly scored entry at index 0 a few points back.
    if idx == 0 and score <= 80:
        score = score + 5
    return score
def scoreTitle(entry, media, mediaName, idx):
    """ Scores a search result entry against a media item.

    Starts from 100 and subtracts an item-order penalty, a year penalty and the
    lowest title penalty found among the plain and detransliterated
    comparisons; an entry at index 0 scoring 80 or less gets 5 points back.

    @param entry Search result mapping; 'nameRU' is required, 'year' and
        'nameEN' are optional.
    @param media Object whose `year` attribute is compared with the entry year.
    @param mediaName Title compared with the entry titles.
    @param idx Position of the entry in the search results.
    @return The resulting score.
    """
    score = 100
    score = score - (idx * const.SCORE_PENALTY_ITEM_ORDER)

    yearpenalty = const.SCORE_PENALTY_YEAR
    mediayear = int(media.year or 0)
    # A missing or empty entry year is treated as unknown (0) so that the
    # "unknown year" branch below is reached instead of raising here.
    year = int(re.sub('[^0-9]', '', entry.get('year') or '') or 0)
    if mediayear != 0 and year != 0:
        yeardiff = abs(mediayear - year)
        if not yeardiff:
            yearpenalty = 0
        elif yeardiff == 1:
            yearpenalty = int(const.SCORE_PENALTY_YEAR / 4)
        elif yeardiff == 2:
            yearpenalty = int(const.SCORE_PENALTY_YEAR / 3)
        # A difference of three or more keeps the full penalty set above.
    else:
        # If year is unknown, don't penalize the score too much.
        yearpenalty = int(const.SCORE_PENALTY_YEAR / 3)
    score = score - yearpenalty

    titlepenalty = computeTitlePenalty(mediaName, entry['nameRU'])
    alttitlepenalty = 100
    if 'nameEN' in entry:
        alttitlepenalty = computeTitlePenalty(mediaName, entry['nameEN'])

    # Retry the comparisons with the detransliterated media name and keep the
    # lowest penalty of each kind.
    try:
        detranslifiedmedianame = translit.detranslify(mediaName)
        detranslifiedtitlepenalty = computeTitlePenalty(
            detranslifiedmedianame, entry['nameRU'])
        titlepenalty = min(detranslifiedtitlepenalty, titlepenalty)
        if 'nameEN' in entry:
            detranslifiedalttitlepenalty = computeTitlePenalty(
                detranslifiedmedianame, entry['nameEN'])
            alttitledetranslified = translit.detranslify(entry['nameEN'])
            reverseddetranslifiedalttitlepenalty = computeTitlePenalty(
                detranslifiedmedianame, alttitledetranslified)
            alttitlepenalty = min(detranslifiedalttitlepenalty,
                                  reverseddetranslifiedalttitlepenalty,
                                  alttitlepenalty)
    except Exception:
        # Best-effort refinement: on failure keep the penalties computed so
        # far. Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        pass

    titlepenalty = min(titlepenalty, alttitlepenalty)
    score = score - titlepenalty

    # Give a weakly scored entry at index 0 a few points back.
    if idx == 0 and score <= 80:
        score = score + 5
    return score
def fetchAndParseSearchResults(self, mediaName, mediaYear):
    """ Queries KinoPoisk for a movie title and returns the scored matches,
        highest score first.

      @param mediaName Movie title parsed from a filename.
      @param mediaYear Movie year parsed from a filename.
      @return Array of tuples: [kinoPoiskId, title, year, score]
    """
    self.log.Info('Quering kinopoisk...')
    results = self.queryKinoPoisk(mediaName, mediaYear)

    # An all-ASCII media name may be a transliteration, so query once more
    # with the detransliterated name and merge both result sets.
    if common.isAsciiString(mediaName):
        extraResults = self.queryKinoPoisk(translit.detranslify(mediaName), mediaYear)

        # First-query results by id; entries are removed as they get matched.
        pendingById = dict((item[0], item) for item in results)
        merged = []
        for candidate in extraResults:
            earlier = pendingById.pop(candidate[0], None)
            # For an id found by both queries keep the better-scored entry.
            if earlier is None or candidate[3] >= earlier[3]:
                merged.append(candidate)
            else:
                merged.append(earlier)
        # Whatever was found only by the first query goes last.
        merged.extend(pendingById.viewvalues())
        results = merged

    # Order by score, best first (ascending sort followed by a reversal).
    results.sort(key=operator.itemgetter(3))
    results.reverse()

    if self.isDebug:
        self.log.Debug('Search produced %d results:' % len(results))
        for position, item in enumerate(results):
            self.log.Debug(
                ' ... %d: score="%d", id="%s", name="%s", year="%s".' %
                (position, item[3], item[0], item[1], str(item[2])))
    return results
def fetchAndParseSearchResults(self, mediaName, mediaYear):
    """ Queries KinoPoisk for a movie title and returns the scored matches,
        highest score first.

      @param mediaName Movie title parsed from a filename.
      @param mediaYear Movie year parsed from a filename.
      @return Array of tuples: [kinoPoiskId, title, year, score]
    """
    self.log.Info('Quering kinopoisk...')
    results = self.queryKinoPoisk(mediaName, mediaYear)

    # An all-ASCII media name may be a transliteration, so query once more
    # with the detransliterated name and merge both result sets.
    if common.isAsciiString(mediaName):
        extraResults = self.queryKinoPoisk(translit.detranslify(mediaName), mediaYear)

        # First-query results by id; entries are removed as they get matched.
        pendingById = dict((item[0], item) for item in results)
        merged = []
        for candidate in extraResults:
            earlier = pendingById.pop(candidate[0], None)
            # For an id found by both queries keep the better-scored entry.
            if earlier is None or candidate[3] >= earlier[3]:
                merged.append(candidate)
            else:
                merged.append(earlier)
        # Whatever was found only by the first query goes last.
        merged.extend(pendingById.viewvalues())
        results = merged

    # Order by score, best first (ascending sort followed by a reversal).
    results.sort(key=operator.itemgetter(3))
    results.reverse()

    if self.isDebug:
        self.log.Debug('Search produced %d results:' % len(results))
        for position, item in enumerate(results):
            self.log.Debug(' ... %d: id="%s", name="%s", year="%s", score="%d".' %
                (position, item[0], item[1], str(item[2]), item[3]))
    return results
def scoreMediaTitleMatch(mediaName, mediaYear, title, altTitle, year, itemIndex):
    """ Compares page and media titles taking into consideration
        media item's year and title values. Returns score [0, 100].
        Search item scores 100 when:
          - it's first on the list of results; AND
          - it equals the media title (ignoring case) OR all media title words
            are found in the search item; AND
          - search item year equals media year.

        For now, our title scoring is pretty simple - we check if individual words
        from media item's title are found in the title from search results.
        We should also take into consideration order of words, so that "One Two"
        would not have the same score as "Two One". Also, taking into consideration
        year difference.

        @param mediaName Media title to match.
        @param mediaYear Media year; converted with toInteger().
        @param title Title of the search result item.
        @param altTitle Alternative title of the search result item, or None.
        @param year Year of the search result item; converted with toInteger().
        @param itemIndex Position of the item in the search results (0 is the first).
        @return Score as an int.
    """
    if DEBUG_SCORING:
        Log.Debug('>>>>>>> comparing item %d::: "%s (%s)" with "%s (%s)" alt="%s"...' %
            (itemIndex, str(mediaName), str(mediaYear), str(title), str(year), str(altTitle)))

    # Max score is when both title and year match exactly.
    score = 100

    # Item order penalty (the lower it is on the list or results, the larger the penalty).
    score = score - (itemIndex * SCORE_PENALTY_ITEM_ORDER)

    # Compute year penalty: [equal, diff>=3] --> [0, MAX].
    yearPenalty = SCORE_PENALTY_YEAR
    mediaYear = toInteger(mediaYear)
    year = toInteger(year)
    if mediaYear is not None and year is not None:
        yearDiff = abs(mediaYear - year)
        if not yearDiff:
            yearPenalty = 0
        elif yearDiff == 1:
            yearPenalty = int(SCORE_PENALTY_YEAR / 4)
        elif yearDiff == 2:
            yearPenalty = int(SCORE_PENALTY_YEAR / 3)
    else:
        # If year is unknown, don't penalize the score too much.
        yearPenalty = int(SCORE_PENALTY_YEAR / 3)
    score = score - yearPenalty

    # Compute title penalty.
    titlePenalty = computeTitlePenalty(mediaName, title)
    altTitlePenalty = 100
    if altTitle is not None:
        altTitlePenalty = computeTitlePenalty(mediaName, altTitle)

    # Get detransliterated media name (in case filename is in latin characters),
    # compare its score with the original, and pick the min.
    try:
        detranslifiedMediaName = translit.detranslify(mediaName)
        detranslifiedTitlePenalty = computeTitlePenalty(detranslifiedMediaName, title)
        # Remember the pre-merge value so the debug line reports the real
        # original penalty rather than the already-minimized one.
        originalTitlePenalty = titlePenalty
        titlePenalty = min(detranslifiedTitlePenalty, titlePenalty)
        if DEBUG_SCORING:
            Log.Debug('Comparing title penalties: %d (original) and %d (detranslified).' %
                (originalTitlePenalty, detranslifiedTitlePenalty))
        if altTitle is not None:
            detranslifiedAltTitlePenalty = computeTitlePenalty(detranslifiedMediaName, altTitle)
            altTitleDetranslified = translit.detranslify(altTitle)
            reversedDetranslifiedAltTitlePenalty = computeTitlePenalty(
                detranslifiedMediaName, altTitleDetranslified)
            # Same as above: keep the third candidate for the debug line.
            originalAltTitlePenalty = altTitlePenalty
            altTitlePenalty = min(detranslifiedAltTitlePenalty,
                                  reversedDetranslifiedAltTitlePenalty,
                                  altTitlePenalty)
            if DEBUG_SCORING:
                Log.Debug('Comparing alt title penalties: %d (1), %d (2), and %d (3).' %
                    (detranslifiedAltTitlePenalty, reversedDetranslifiedAltTitlePenalty,
                     originalAltTitlePenalty))
    except Exception:
        # Best-effort refinement: on failure keep the penalties computed so
        # far. Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        pass

    titlePenalty = min(titlePenalty, altTitlePenalty)
    if DEBUG_SCORING:
        Log.Debug('Picked the lowest title penalty: %d.' % titlePenalty)
    score = score - titlePenalty

    # If the score is not high enough, add a few points to the first result -
    # let's give KinoPoisk some credit :-).
    if itemIndex == 0 and score <= 80:
        score = score + 5

    # IMPORTANT: always return an int.
    score = int(score)
    if DEBUG_SCORING:
        Log.Debug('***** title scored %d' % score)
    return score
def scoreMediaTitleMatch(mediaName, mediaYear, title, altTitle, year, itemIndex):
    """ Compares page and media titles taking into consideration
        media item's year and title values. Returns score [0, 100].
        Search item scores 100 when:
          - it's first on the list of results; AND
          - it equals the media title (ignoring case) OR all media title words
            are found in the search item; AND
          - search item year equals media year.

        For now, our title scoring is pretty simple - we check if individual words
        from media item's title are found in the title from search results.
        We should also take into consideration order of words, so that "One Two"
        would not have the same score as "Two One". Also, taking into consideration
        year difference.

        @param mediaName Media title to match.
        @param mediaYear Media year; converted with toInteger().
        @param title Title of the search result item.
        @param altTitle Alternative title of the search result item, or None.
        @param year Year of the search result item; converted with toInteger().
        @param itemIndex Position of the item in the search results (0 is the first).
        @return Score as an int.
    """
    if DEBUG_SCORING:
        Log.Debug(
            '>>>>>>> comparing item %d::: "%s (%s)" with "%s (%s)" alt="%s"...' %
            (itemIndex, str(mediaName), str(mediaYear), str(title), str(year), str(altTitle)))

    # Max score is when both title and year match exactly.
    score = 100

    # Item order penalty (the lower it is on the list or results, the larger the penalty).
    score = score - (itemIndex * SCORE_PENALTY_ITEM_ORDER)

    # Compute year penalty: [equal, diff>=3] --> [0, MAX].
    yearPenalty = SCORE_PENALTY_YEAR
    mediaYear = toInteger(mediaYear)
    year = toInteger(year)
    if mediaYear is not None and year is not None:
        yearDiff = abs(mediaYear - year)
        if not yearDiff:
            yearPenalty = 0
        elif yearDiff == 1:
            yearPenalty = int(SCORE_PENALTY_YEAR / 4)
        elif yearDiff == 2:
            yearPenalty = int(SCORE_PENALTY_YEAR / 3)
    else:
        # If year is unknown, don't penalize the score too much.
        yearPenalty = int(SCORE_PENALTY_YEAR / 3)
    score = score - yearPenalty

    # Compute title penalty.
    titlePenalty = computeTitlePenalty(mediaName, title)
    altTitlePenalty = 100
    if altTitle is not None:
        altTitlePenalty = computeTitlePenalty(mediaName, altTitle)

    # Get detransliterated media name (in case filename is in latin characters),
    # compare its score with the original, and pick the min.
    try:
        detranslifiedMediaName = translit.detranslify(mediaName)
        detranslifiedTitlePenalty = computeTitlePenalty(
            detranslifiedMediaName, title)
        # Remember the pre-merge value so the debug line reports the real
        # original penalty rather than the already-minimized one.
        originalTitlePenalty = titlePenalty
        titlePenalty = min(detranslifiedTitlePenalty, titlePenalty)
        if DEBUG_SCORING:
            Log.Debug(
                'Comparing title penalties: %d (original) and %d (detranslified).' %
                (originalTitlePenalty, detranslifiedTitlePenalty))
        if altTitle is not None:
            detranslifiedAltTitlePenalty = computeTitlePenalty(
                detranslifiedMediaName, altTitle)
            altTitleDetranslified = translit.detranslify(altTitle)
            reversedDetranslifiedAltTitlePenalty = computeTitlePenalty(
                detranslifiedMediaName, altTitleDetranslified)
            # Same as above: keep the third candidate for the debug line.
            originalAltTitlePenalty = altTitlePenalty
            altTitlePenalty = min(detranslifiedAltTitlePenalty,
                                  reversedDetranslifiedAltTitlePenalty,
                                  altTitlePenalty)
            if DEBUG_SCORING:
                Log.Debug(
                    'Comparing alt title penalties: %d (1), %d (2), and %d (3).' %
                    (detranslifiedAltTitlePenalty, reversedDetranslifiedAltTitlePenalty,
                     originalAltTitlePenalty))
    except Exception:
        # Best-effort refinement: on failure keep the penalties computed so
        # far. Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        pass

    titlePenalty = min(titlePenalty, altTitlePenalty)
    if DEBUG_SCORING:
        Log.Debug('Picked the lowest title penalty: %d.' % titlePenalty)
    score = score - titlePenalty

    # If the score is not high enough, add a few points to the first result -
    # let's give KinoPoisk some credit :-).
    if itemIndex == 0 and score <= 80:
        score = score + 5

    # IMPORTANT: always return an int.
    score = int(score)
    if DEBUG_SCORING:
        Log.Debug('***** title scored %d' % score)
    return score
tables.update({table_name: ['id integer primary key not null']}) pprint(tables) for line in [ x.strip() for x in open("digraphg.dot").readlines() if x.find('>') != -1 ]: if line.count('->') == 1: table, field = [to_translit(x.strip('"\' ')) for x in line.split('->')] if field.count('['): field = field.split('[')[0].strip('_"') tables[table] += ['%s text' % field] for table in tables: such_name = detranslify(table) if len(tables[table]) == 2: atr_name = tables[table][-1].split()[0] atr_name = detranslify(atr_name) text += [choice(atr_1) % (such_name, atr_name)] elif len(tables[table]) > 2: atr_names = '", "'.join( [detranslify(x.split()[0]) for x in tables[table][1:]]) text += [choice(atr_2) % (such_name, atr_names)] pprint(tables) one2many = [] for line in [