Example #1

import logging
import sys
from time import sleep

import requests

import configs
from core import common
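
# `core.common.load_collection` is not shown in these examples; a minimal
# sketch of what it might do, assuming the header file holds the field
# names and the collection stores one tab-separated record per line
# (a hypothetical reconstruction, not the actual implementation):
def _load_collection_sketch(header_path, collection_path, encoding='utf-8'):
    with open(header_path, encoding=encoding) as hf:
        fields = hf.read().split()
    with open(collection_path, encoding=encoding) as cf:
        for line in cf:
            yield dict(zip(fields, line.rstrip('\n').split('\t')))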

if __name__ == '__main__':
    configs.setting_logger()

    argv = sys.argv[1:]
    collection_path = argv[0]
    collection_header_path = argv[1]
    quality_score_path = argv[2]

    corpus = common.load_collection(collection_header_path, collection_path,
                                    configs.ENCODE_DECODE)
    with open(quality_score_path, 'w', encoding=configs.ENCODE_DECODE) as wf:
        for ii, cc in enumerate(corpus):
            v_id = cc['video_id']
            # maxresdefault.jpg is only served for videos that have an HD
            # thumbnail, so its absence is used as a crude quality signal
            img = 'https://img.youtube.com/vi/{}/maxresdefault.jpg'.format(v_id)
            rr = requests.head(img)
            if rr.status_code != 200:
                logging.info('{}\t{}\t{}'.format(v_id, rr.status_code, img))
                wf.write('{}\t{}'.format(v_id, 0))
            else:
                wf.write('{}\t{}'.format(v_id, 1))

            wf.write('\n')
            sleep(0.1)
            if ii and ii % 100 == 0:
                # assumption: periodic progress logging; the original
                # loop body is cut off at this point
                logging.info('checked {} records'.format(ii))
Example #2

import sys

import configs
from core.common import load_collection

if __name__ == '__main__':
    configs.setting_logger()

    argv = sys.argv[1:]
    tokens_path = argv[0]
    tokens_header_path = argv[1]
    indices_path = argv[2]

    corpus = load_collection(tokens_header_path,
                             tokens_path,
                             encoding=configs.ENCODE_DECODE)
    with open(indices_path, 'w', encoding=configs.ENCODE_DECODE) as wf:
        for cc in corpus:
            doc_id = cc['video_id']
            title_tokens = cc['title']

            output = list()
            output.append(doc_id)
            # tokens look like 'termAfreq'; keep the term and attach a
            # fixed 0.0 weight in the same 'A'-separated format
            values = [
                '{}A0.0'.format(tt.split('A')[0])
                for tt in title_tokens.split()
            ]
            output.extend(values)
            wf.write('\t'.join(output))
            wf.write('\n')
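
# Hypothetical demo of the transformation above, assuming tokens of the
# form 'termAfreq' and a made-up id 'vid123':
_demo_tokens = 'deepA3 learningA1'
_demo = ['{}A0.0'.format(tt.split('A')[0]) for tt in _demo_tokens.split()]
print('\t'.join(['vid123'] + _demo))  # -> 'vid123\tdeepA0.0\tlearningA0.0'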
Example #3

import sys

import configs
from core import common
from sklearn.feature_extraction.text import TfidfVectorizer


# Reconstructed head of this snippet (an assumption; the original is cut
# off above this point): build one whitespace-joined document per video
# from its 'termAfreq' caption tokens.
def _make_documents(values):
    _documents = list()
    for vv in values:
        doc = list()
        for tt in vv['caption'].split():
            term, freq = tt.split('A')
            doc.extend([term] * int(freq))
        if doc:
            _documents.append(' '.join(doc))

    return _documents
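
# `_grouping_corpus` is not shown either; a minimal sketch matching its
# use below (assumption: it buckets records by the given key field):
from collections import defaultdict

def _grouping_corpus(corpus, key):
    groups = defaultdict(list)
    for cc in corpus:
        groups[cc[key]].append(cc)
    return groups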


if __name__ == '__main__':
    configs.setting_logger()

    argv = sys.argv[1:]
    tokens_path = argv[0]
    tokens_header_path = argv[1]
    quality_score_path = argv[2]

    corpus = common.load_collection(tokens_header_path, tokens_path,
                                    configs.ENCODE_DECODE)
    g_video = _grouping_corpus(corpus, 'channel_id')
    with open(quality_score_path, 'w', encoding=configs.ENCODE_DECODE) as wf:
        for values in g_video.values():
            documents = _make_documents(values)
            # fit a per-channel tf-idf model; the custom tokenizer just
            # splits on whitespace because the documents are pre-tokenized
            model = TfidfVectorizer(
                tokenizer=lambda x: x.split()).fit(documents)

            for vv in values:
                if not vv['caption']:
                    continue

                title_desc = list()
                for tt in vv['title'].split() + vv['description'].split():
                    term, freq = tt.split('A')
                    freq = int(freq)
                    # assumption: the original, cut off here, expanded
                    # each term by its frequency
                    title_desc.extend([term] * freq)

Example #4

import sys

import configs
from core import common
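
# `_load_expanded_indices` is not shown; a minimal sketch, assuming each
# index file holds the tab-separated lines written by Example #2
# (a video_id followed by 'termAweight' tokens):
def _load_expanded_indices(path):
    indices = dict()
    with open(path, encoding=configs.ENCODE_DECODE) as rf:
        for line in rf:
            cols = line.rstrip('\n').split('\t')
            indices[cols[0]] = cols[1:]
    return indices
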
if __name__ == '__main__':
    configs.setting_logger()

    argv = sys.argv[1:]
    collection_path = argv[0]
    collection_header_path = argv[1]
    title_indices_path = argv[2]
    desc_indices_path = argv[3]
    appended_collection_path = argv[4]
    appended_collection_header_path = argv[5]

    fields = common.load_fields(collection_header_path)
    collection = common.load_collection(collection_header_path,
                                        collection_path,
                                        encoding=configs.ENCODE_DECODE)
    title_indices = _load_expanded_indices(title_indices_path)
    desc_indices = _load_expanded_indices(desc_indices_path)
    with open(appended_collection_path, 'w',
              encoding=configs.ENCODE_DECODE) as wf:
        for cc in collection:
            v_id = cc['video_id']

            title_keywords = list()
            for ii in title_indices.get(v_id, []):
                keyword = ii.split('A')[0]
                title_keywords.append(keyword)

            desc_keywords = list()
            for ii in desc_indices.get(v_id, []):