def __init__(self, cfg):
     """Store ``cfg`` and build the collaborator objects from it.

     NOTE(review): this function sits at module level (outside any class) and
     its body is the same as ``DataGatherer.__init__`` defined right after it.
     It looks like a stray copy -- confirm and remove.

     :param cfg: configuration object handed to ``Downloader``, ``Logger``
         and ``DBHandler``.
     """
     self.cfg = cfg
     self.downloader = Downloader(cfg)
     # The logger is created before the parser because the parser receives it.
     self.logger = Logger(cfg)
     self.parser = AnotherHTMLParser(self.logger)
     self.pairs = set()
     self.db_handler = DBHandler(cfg)
     # Starts empty; None means "nothing cached yet".
     self._word_dict = None
class DataGatherer(object):
    def __init__(self, cfg):
        """Store ``cfg`` and build the collaborator objects from it.

        :param cfg: configuration object; it is indexed by key elsewhere in
            the class (e.g. ``self.cfg['train_fixed_path']``) and is also
            handed to ``Downloader``, ``Logger`` and ``DBHandler``.
        """
        self.cfg = cfg
        # Fetches page HTML for a video id (see add_video_by_id).
        self.downloader = Downloader(cfg)
        # The logger is created before the parser because the parser receives it.
        self.logger = Logger(cfg)
        self.parser = AnotherHTMLParser(self.logger)
        # Not read by any method visible in this part of the file.
        self.pairs = set()
        self.db_handler = DBHandler(cfg)
        # Lazily filled word -> Word cache; see get_word_dict_by_word.
        self._word_dict = None

    def read_raw_pairs(self, delimiter=',', limit=0):
        path = cfg['train_path']
        try:
            f = open(path)
        except IOError:
            self.logger.critical("Can't open file '{}'!".format(path))
            sys.exit()

        lines = f.read().split('\n')

        pairs = set()
        i = 0
        for line in lines:
            if not line:
                continue
            if limit and i > limit:
                break

            i += 1
            elements = line.split(delimiter)
            try:
                if elements[2] == 'left':
                    pair = (elements[0], elements[1])
                else:
                    pair = (elements[1], elements[0])
                if pair in pairs:
                    self.logger.warning('pair {} is duplicate!'.format(pair))
                    i -= 1
                pairs.add(pair)
            except IndexError:
                raise AssertionError('line {} is incorrect!'.format(line))
        return pairs

    def read_pairs(self, delimiter=',', limit=0):
        path = cfg['train_fixed_path']
        try:
            f = open(path)
        except IOError:
            self.logger.critical("Can't open file '{}'!".format(path))
            sys.exit()

        lines = f.read().split('\n')

        pairs = set()
        i = 0
        for line in lines:
            if not line:
                continue
            if limit and i > limit:
                break

            i += 1
            elements = line.split(delimiter)
            try:
                pair = tuple(elements)
                if pair in pairs:
                    self.logger.warning('pair {} is duplicate!'.format(pair))
                    i -= 1
                pairs.add(pair)
            except IndexError:
                raise AssertionError('line {} is incorrect!'.format(line))
        return pairs

    def exclude_untracked_videos(self, pairs):
        ids = set(self.db_handler.get_all_video_ids())
        pairs_set = set(pairs)
        for pair in pairs:
            for youtube_id in pair:
                if youtube_id not in ids:
                    pairs_set.remove(pair)
                    break
        return pairs_set

    def rewrite_pairs(self, pairs):
        pairs_fixed = self.exclude_untracked_videos(pairs)
        f = open(self.cfg['train_fixed_path'], 'wb')
        for pair in pairs_fixed:
            f.write(','.join(pair) + '\n')
        f.close()

    def fill_video_catalog(self, pairs, force=False):
        lefts_and_rights = zip(*pairs)
        ids = set(lefts_and_rights[0] + lefts_and_rights[1])

        if not force:
            ids_cache = set(self.db_handler.get_all_video_ids())
            ids.difference_update(ids_cache)

        for i, youtube_id in enumerate(ids):
            if i % 100 == 0:
                self.logger.info('scanned {} lines.'.format(i))
            self.add_video_by_id(youtube_id)

    def update_video_catalog(self, limit=None):
        ids_cache = set(self.db_handler.get_all_video_ids())
        for i, youtube_id in enumerate(ids_cache):
            if limit and i > limit:
                break
            self.update_video_by_id(youtube_id)

    def add_video_by_id(self, youtube_id):
        html = self.downloader.get_html(youtube_id)
        if not self.parser._check_video_availability(html):
            return

        video_item = Video(youtube_id)
        video_item.update(title=self.parser.get_video_title(html))
        self.db_handler.add_entry(video_item)

    def update_video_by_id(self, youtube_id):
        html = self.downloader.get_html(youtube_id)
        if not self.parser._check_video_availability(html):
            return

        video_item = self.db_handler.get_video_by_youtube_id(youtube_id)
        try:
            video_item.update(
                title=self.parser.get_video_title(html),
                views=self.parser.get_view_count(html),
                likes=self.parser.get_likes_count(html),
                dislikes=self.parser.get_dislikes_count(html),
            )
        except ParseError:
            video_item.mark_invalid()
        self.db_handler.commit()

    def update_rank1s(self, pairs):
        videos = self.db_handler.get_all_videos()
        rank1_map = self.get_rank1_map(pairs)

        for video in videos:
            if video.youtube_id in rank1_map:
                video.rank1 = rank1_map[video.youtube_id]
            else:
                self.logger.warning('video {} has no rank calculated!'.format(video.youtube_id))

        self.db_handler.db_session.commit()

    def update_rank2s(self, catalog, pairs):
        videos = self.db_handler.get_all_videos()
        rank2_map = self.get_rank2_map(catalog, pairs)

        for video in videos:
            if video.youtube_id in rank2_map:
                video.rank2 = rank2_map[video.youtube_id]
            else:
                self.logger.warning('video {} has no rank calculated!'.format(video.youtube_id))

        self.db_handler.db_session.commit()

    def update_views(self, force=False):
        if force:
            videos = self.db_handler.get_all_videos()
        else:
            videos = self.db_handler.db_session.query(Video).filter(Video.views == None).all()

        for video in videos:
            try:
                video.views = self.parser.get_view_count(self.downloader.get_html(video.youtube_id))
            except ParseError:
                pass

        self.db_handler.commit()

    def get_video_catalog(self):
        return self.db_handler.get_all_video_data()

    def get_rank1_map(self, pairs):
        ids_above, ids_below = zip(*pairs)
        rank_map = defaultdict(lambda: 0)

        for youtube_id in ids_above:
            rank_map[youtube_id] += 1

        for youtube_id in ids_below:
            rank_map[youtube_id] -= 1

        return rank_map

    def get_rank2_map(self, catalog, pairs):
        """Map every id to the aggregated rank of the chunk containing it.

        ``partial_sort(catalog, pairs)`` yields the chunks and
        ``calculate_aggregated_ranks`` one rank per chunk (asserted).  If an
        id occurs in more than one chunk, the rank of the last such chunk is
        kept.

        :returns: dict mapping id -> rank.
        """
        groups = partial_sort(catalog, pairs)
        group_ranks = calculate_aggregated_ranks(groups)
        assert len(group_ranks) == len(groups)

        return {
            youtube_id: rank
            for rank, group in zip(group_ranks, groups)
            for youtube_id in group
        }

    def get_char_stat(self):
        characters = set()
        videos = self.db_handler.get_all_videos()
        for video in videos:
            if video.title:
                characters.update(video.title)
        return sorted(list(characters))

    def update_lang_stat(self):
        videos = self.db_handler.get_all_videos()
        for video in videos:
            if video.title:
                video.lang = get_lang(video.title)

        self.db_handler.commit()

    def get_all_words(self):
        """Count the prepared words over all stored video titles.

        Prints the configured ``TITLE_DELIMITER`` first, then tallies
        ``prepare_word(word)`` for every word ``extract_words`` finds in each
        title.

        :returns: a ``defaultdict`` (default 0) mapping prepared word -> count.
        """
        tally = defaultdict(int)
        # Parenthesised so the line parses as a statement or a function call.
        print('delimiters: {}'.format(TITLE_DELIMITER))
        for video in self.db_handler.get_all_videos():
            for raw_word in extract_words(video.title):
                tally[prepare_word(raw_word)] += 1

        return tally

    def fill_word_db(self, words):
        """Add one ``Word`` row per entry of ``words`` and commit.

        :param words: mapping of word text -> occurrence count.
        """
        for text, occurrences in words.items():
            # The second constructor argument is passed as None here.
            self.db_handler.db_session.add(Word(text, None, occurrences))
        self.db_handler.commit()

    def fill_words_for_videos(self):
        """Store on each video the serialized ids of the words in its title.

        Title words are normalised with ``prepare_word``; words with no
        matching ``Word`` row are ignored.  The database handler is committed
        at the end.
        """
        by_text = {
            row.word: row
            for row in self.db_handler.db_session.query(Word).all()
        }

        for video in self.db_handler.get_all_videos():
            prepared = (prepare_word(raw) for raw in extract_words(video.title))
            video.wordids = serialize_ids(
                {by_text[text].id for text in prepared if text in by_text})

        self.db_handler.commit()

    def calculate_rank1_for_words(self):
        """Set each frequent word's ``rank1`` to the mean of its videos' rank1.

        Only ``Word`` rows with ``count >= 10`` are considered, and only
        words referenced by at least one video receive a value.  The row for
        the empty word is then forced to rank 0, and everything is committed.
        """
        frequent = {
            row.id: row
            for row in self.db_handler.db_session.query(Word).filter(Word.count >= 10).all()
        }

        ranks_by_word = defaultdict(list)
        for video in self.db_handler.get_all_videos():
            for word_id in deserialize_ids(video.wordids):
                if word_id in frequent:
                    ranks_by_word[word_id].append(video.rank1)

        # Every key collected above is a frequent word with a non-empty list.
        for word_id, ranks in ranks_by_word.items():
            frequent[word_id].rank1 = mean(ranks)

        # Hack: force rank 0 for the empty word ''.
        empty_word = self.db_handler.db_session.query(Word).filter(Word.word == '').one()
        empty_word.rank1 = 0

        self.db_handler.commit()

    def get_word_dict_by_word(self):
        if not self._word_dict:
            words = self.db_handler.db_session.query(Word).all()
            self._word_dict = {}
            for word in words:
                self._word_dict[word.word] = word

        return self._word_dict

    def calculate_title_rank(self, title, f):
        """Sum ``f(word)`` over the known words of ``title``.

        :param title: title text handed to ``extract_words``.
        :param f: callable applied to the ``Word`` object of each title word
            present in the word dictionary.
        :returns: the sum of the ``f`` results; 0 when no word is known.
        """
        known_words = self.get_word_dict_by_word()

        total = 0
        # NOTE(review): words are looked up exactly as extracted, while the
        # other methods of this class pass them through prepare_word() first
        # -- confirm whether that normalisation is missing here.
        for token in extract_words(title):
            if token in known_words:
                total += f(known_words[token])
        return total