Пример #1
0
 def __init__(self, company_id, matches_name):
     """Create the loaders and cached collections used for one company.

     Args:
         company_id: Identifier handed to every loader created here.
         matches_name: Name passed to MatchLoader together with the
             company id.
     """
     self._company_id = company_id
     self._matchloader = MatchLoader(company_id, matches_name)
     self._tokens = TokenLoader(company_id)

     # Only the loaded collections are kept, not the loader objects.
     release_loader = ReleaseLoader(company_id)
     self._releases = release_loader.get_releases()
     article_loader = ArticleLoader(company_id)
     self._articles = article_loader.get_articles()

     # Value stored under the 'MARKER_BR' configuration key.
     config = ConfigReader()
     self._br = config.get('MARKER_BR')
Пример #2
0
    def _process(self, company_id):
        """Load the data of one company and walk over its matched releases.

        Creates the match, token and score loaders and fetches the release
        and article collections, then visits every release id reported by
        the match loader.

        Args:
            company_id: Identifier handed to every loader created here.
        """
        matchloader = MatchLoader(company_id, MATCHES_NAME)
        tokens = TokenLoader(company_id)
        releases = ReleaseLoader(company_id).get_releases()
        articles = ArticleLoader(company_id).get_articles()
        scores = ScoreLoader(company_id)

        # Iterate the loader bound above; any other spelling of the name is
        # undefined in this scope and raises NameError.
        for release_id in matchloader.get_release_ids():
            release = releases[release_id]
            # Bounds 0 .. sys.maxint (Python 2 constant); presumably this
            # selects the whole release -- confirm against TokenLoader.
            rel_tokens = tokens.get_stripped_release_token_block(
                release_id, 0, sys.maxint)
Пример #3
0
 def _load_art_hashes(self):
     """Compute an MD5 hex digest for every article of this company.

     The digest input is the concatenation of str(date()), pub(),
     headline() and body() of the article, in that order.

     Returns:
         dict mapping each article id to its hex digest string.
     """
     digests = {}
     loaded = ArticleLoader(self._company_id).get_articles()
     for art_id in loaded:
         art = loaded[art_id]
         # Build the digest input piece by piece, keeping the field order.
         payload = str(art.date())
         payload += art.pub()
         payload += art.headline()
         payload += art.body()
         digests[art_id] = hashlib.md5(payload).hexdigest()
     return digests
Пример #4
0
def test_tokenloader():
    company_id = int(sys.argv[1])
    t = TokenLoader(company_id)
    r = ReleaseLoader(company_id)
    a = ArticleLoader(company_id)
        
    print 'Testing TokenLoader'
    print 'company-id: {0}'.format(company_id)

    articles = a.get_articles()
    article_id = articles.itervalues().next().id()
    print 'article-id: {0}'.format(article_id)
    print 'TOKENS:'
    print t.get_article_tokens(article_id, False)

    releases = r.get_releases()
    release_id = releases.itervalues().next().id()
    print 'release-id: {0}'.format(release_id)
    print 'TOKENS:'
    print t.get_release_tokens(release_id, False)
Пример #5
0
    def __init__(self, company_id, release_ids, article_ids, output_name):
        """Store the selection, load the company data and create helpers.

        Args:
            company_id: Identifier handed to the release and article loaders.
            release_ids: Release ids kept on the instance.
            article_ids: Article ids kept on the instance.
            output_name: Output name kept on the instance.
        """
        # Plain constructor arguments.
        self._company_id = company_id
        self._release_ids = release_ids
        self._article_ids = article_ids
        self._output_name = output_name

        # Loaded collections; the loader objects themselves are not kept.
        release_loader = ReleaseLoader(company_id)
        self._releases = release_loader.get_releases()
        article_loader = ArticleLoader(company_id)
        self._articles = article_loader.get_articles()

        # Helper objects.
        self._tokenizer = Tokenizer()
        self._lexicon = SubjLexiconLoader()

        # Runs last, after every attribute above has been assigned.
        self._make_dirs()
Пример #6
0
    def build_subset_all(self):
        subset_maker = SubsetMaker()
        for company_id in range(1, 41):
            print 'Processing company {0}'.format(company_id)
            releases = ReleaseLoader(company_id).get_releases()
            for release_id in releases:
                subset_maker.add_release(company_id, release_id)

            articles = ArticleLoader(company_id).get_articles()
            for article_id in articles:
                subset_maker.add_article(company_id, article_id)

        subset_maker.save(SUBSET_ALL)
Пример #7
0
def test_duplicateloader():
    company_id = int(sys.argv[1])
    duplicates = DuplicateLoader(company_id)
    releases = ReleaseLoader(company_id).get_releases()
    articles = ArticleLoader(company_id).get_articles()
    print 'Testing DuplicateLoader'
    print 'company-id: {0}'.format(company_id)

    rel_dups = duplicates.get_release_duplicates()
    print 'RELEASE DUPLICATES: {0}'.format(len(rel_dups))
    for d in rel_dups:
        r = releases[int(d)]
        print '{0} : {1}'.format(r.id(), r.title())

    art_dups = duplicates.get_article_duplicates()
    print 'ARTICLE DUPLICATES: {0}'.format(len(art_dups))
    for d in art_dups:
        a = articles[int(d)]
        print '{0} : {1}'.format(a.id(), a.headline())
Пример #8
0
    def __init__(self, company_id, release_ids, article_ids, required_length,
                 min_length, blocks_name_toignore):
        """Store the parameters and load the data for one company.

        Args:
            company_id: Identifier handed to every loader created here.
            release_ids: Release ids kept on the instance.
            article_ids: Article ids kept on the instance.
            required_length: Stored as ``_required_length``.
            min_length: Stored as ``_min_length``.
            blocks_name_toignore: Name passed to BlockLoader; the blocks it
                returns are stored as ``_ignoreblocks``.
        """
        # Plain constructor arguments.
        self._company_id = company_id
        self._release_ids = release_ids
        self._article_ids = article_ids
        self._required_length = required_length
        self._min_length = min_length

        self._tokens = TokenLoader(company_id)

        # Loaded collections; the loader objects themselves are not kept.
        release_loader = ReleaseLoader(company_id)
        self._releases = release_loader.get_releases()
        article_loader = ArticleLoader(company_id)
        self._articles = article_loader.get_articles()

        block_loader = BlockLoader(company_id, blocks_name_toignore)
        self._ignoreblocks = block_loader.get_blocks()
        self._count_ignore = 0  # counter, starts at zero

        # One DuplicateLoader serves both duplicate lookups.
        duplicate_loader = DuplicateLoader(company_id)
        self._rel_duplicates = duplicate_loader.get_release_duplicates()
        self._art_duplicates = duplicate_loader.get_article_duplicates()
Пример #9
0
def test_articleloader():
    company_id = int(sys.argv[1])
    a = ArticleLoader(company_id)
    print 'Testing ArticleLoader'
    print 'company-id: {0}'.format(company_id)
    print 'articles: {0}'.format(len(a.get_articles()))