Exemplo n.º 1
0
 def __init__(self, company_id, release_ids, article_ids, output_name):
     """Store the export parameters and prepare the output directories.

     Keeps the company id, the release ids, the article ids and the output
     name on the instance, creates a ``TokenLoader`` for the company and
     finally calls ``self._make_dirs()``.
     """
     self._output_name = output_name
     self._company_id = company_id
     self._release_ids, self._article_ids = release_ids, article_ids
     # The token loader is created before the directory setup below.
     self._tokens = TokenLoader(company_id)
     self._make_dirs()
Exemplo n.º 2
0
    def write_tags(self, matches_name, company_id):
        """Process the tokens of all matched documents and pickle the results.

        Every release id and every article id reported by the
        ``MatchLoader`` for ``matches_name`` has its tokens fetched from a
        ``TokenLoader`` and handed to ``self._process_tokens`` together with
        a per-document-type dictionary.  Both dictionaries are then passed
        to ``self._pickle`` with a path below
        ``common.get_postags_path()/<matches_name>/<doctype>``.

        A progress line is printed for each document.
        """
        release_tags = {}
        article_tags = {}

        match_source = MatchLoader(company_id, matches_name)
        token_source = TokenLoader(company_id)

        # Releases first ...
        release_ids = match_source.get_release_ids()
        for position, release_id in enumerate(release_ids, 1):
            print('processing release #{0} of {1}'.format(position, len(release_ids)))
            self._process_tokens(
                token_source.get_release_tokens(release_id, False),
                release_tags, release_id)

        # ... then articles, handled the same way.
        article_ids = match_source.get_article_ids()
        for position, article_id in enumerate(article_ids, 1):
            print('processing article #{0} of {1}'.format(position, len(article_ids)))
            self._process_tokens(
                token_source.get_article_tokens(article_id, False),
                article_tags, article_id)

        # One output location per document type under the matches directory.
        base_dir = os.path.join(common.get_postags_path(), matches_name)
        self._pickle(company_id, release_tags,
                     os.path.join(base_dir, common.DOCTYPE_PR))
        self._pickle(company_id, article_tags,
                     os.path.join(base_dir, common.DOCTYPE_NEWS))
Exemplo n.º 3
0
 def __init__(self, company_id, matches_name):
     """Load everything needed to work with one company's matches.

     Creates the match and token loaders, fetches the company's releases
     and articles, and reads the ``MARKER_BR`` configuration value.
     """
     self._company_id = company_id
     self._matchloader = MatchLoader(company_id, matches_name)
     self._tokens = TokenLoader(company_id)

     release_loader = ReleaseLoader(company_id)
     self._releases = release_loader.get_releases()
     article_loader = ArticleLoader(company_id)
     self._articles = article_loader.get_articles()

     settings = ConfigReader()
     self._br = settings.get('MARKER_BR')
Exemplo n.º 4
0
    def _process(self, company_id):
        """Walk over the matched releases of one company.

        Builds the match, token, release, article and score loaders for
        ``company_id`` and, for every release id known to the match loader,
        looks up the release object and its stripped tokens.

        NOTE(review): ``articles``, ``scores``, ``release`` and
        ``rel_tokens`` are not used by the statements visible in this
        method; they are kept because the remainder of the processing is
        presumably meant to use them -- confirm.
        """
        matchloader = MatchLoader(company_id, MATCHES_NAME)
        tokens = TokenLoader(company_id)
        releases = ReleaseLoader(company_id).get_releases()
        articles = ArticleLoader(company_id).get_articles()
        scores = ScoreLoader(company_id)

        # Iterate over the loader bound above; a misspelled name here would
        # raise NameError on the first call.
        for release_id in matchloader.get_release_ids():
            release = releases[release_id]
            rel_tokens = tokens.get_stripped_release_token_block(release_id, 0, sys.maxint)
Exemplo n.º 5
0
    def print_matrix(self):
        """Print a comma-separated matrix with one row per matched pair.

        Companies 1 through 40 are processed.  For every release/article
        pair known to the ``MatchLoader`` for ``self._match_name`` one row
        is produced with these columns:

        * company id, release id, article id
        * release length and article length (number of stripped tokens)
        * ``rel-used``: matched length divided by the release length
        * ``art-added``: 1 minus matched length divided by article length
        * subjectivity score of release and article (subjective sentences
          divided by all sentences)
        * sentiment score of release and article (positive minus negative
          sentences divided by subjective sentences; 0 when there are no
          subjective sentences)

        All ratios are computed with true division so that they stay
        fractional under Python 2 even when the counts are integers.  The
        complete table is printed once at the end.
        """
        sb = ['co-id, rel-id, art-id, rel-len, art-len, rel-used, art-added, rel-subj-score, art-subj-score, rel-sent-score, art-sent-score\n']
        row = '{0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}, {10}\n'

        for company_id in range(1, 41):
            matches = MatchLoader(company_id, self._match_name)
            tokens = TokenLoader(company_id)
            scores = ScoreLoader(company_id)

            for release_id in matches.get_release_ids():
                rel_tokens = tokens.get_stripped_release_token_block(release_id, 0, sys.maxint)

                # release subjectivity score
                rel_subj_count = scores.count_subj_rel_sentences(release_id)
                rel_subj = float(rel_subj_count) / scores.count_all_rel_sentences(release_id)
                # release sentiment score
                if rel_subj_count == 0:
                    rel_sent = 0
                else:
                    pos_minus_neg = scores.count_pos_rel_sentences(release_id) - scores.count_neg_rel_sentences(release_id)
                    rel_sent = float(pos_minus_neg) / rel_subj_count

                for article_id in matches.get_article_ids(release_id):
                    art_tokens = tokens.get_stripped_article_token_block(article_id, 0, sys.maxint)

                    # Total matched length, measured on the release side:
                    # b[1] is the block start in the release, b[2] its length.
                    blocks = matches.get_matches(release_id, article_id)
                    blocklen = sum(
                        len(tokens.get_stripped_release_token_block(
                            release_id, b[1], b[1] + b[2]))
                        for b in blocks)

                    rel_used = float(blocklen) / len(rel_tokens)
                    art_added = 1 - float(blocklen) / len(art_tokens)

                    # article subjectivity score
                    art_subj_count = scores.count_subj_art_sentences(article_id)
                    art_subj = float(art_subj_count) / scores.count_all_art_sentences(article_id)
                    # article sentiment score
                    if art_subj_count == 0:
                        art_sent = 0
                    else:
                        pos_minus_neg = scores.count_pos_art_sentences(article_id) - scores.count_neg_art_sentences(article_id)
                        art_sent = float(pos_minus_neg) / art_subj_count

                    sb.append(row.format(
                        company_id, release_id, article_id,
                        len(rel_tokens), len(art_tokens),
                        rel_used, art_added,
                        rel_subj, art_subj, rel_sent, art_sent))

        text = ''.join(sb)
        print(text)
Exemplo n.º 6
0
    def make_subset(self, matches_name_in, required_length, min_length,
                    subset_name_out):
        """Count matched pairs with valid blocks and save a subset.

        Iterates over companies 1 through 40 and over every release/article
        pair of the ``MatchLoader`` for ``matches_name_in``.  Pairs for which
        ``bfilter.get_blocks`` returns a non-empty list are counted, the
        total is printed and ``subset_maker.save(subset_name_out)`` is
        called.

        NOTE(review): ``bfilter``, ``pr_set`` and ``news_set`` are not bound
        anywhere in this method; unless they are module-level globals the
        method raises NameError on first use -- confirm.
        NOTE(review): ``required_length``, ``min_length`` and the local
        ``tokens`` are never used here, and nothing is ever added to
        ``subset_maker`` before it is saved -- this looks like an unfinished
        refactoring; confirm the intended behaviour before relying on it.
        """
        subset_maker = SubsetMaker()
        pair_counter = 0

        for company_id in range(1, 41):
            print 'Processing company {0}'.format(company_id)

            # NOTE(review): created but not used below.
            tokens = TokenLoader(company_id)
            matches = MatchLoader(company_id, matches_name_in)

            for release_id in matches.get_release_ids():
                for article_id in matches.get_article_ids(release_id):
                    blocks = matches.get_matches(release_id, article_id)
                    # NOTE(review): `bfilter` is not defined in this method.
                    blocklist = bfilter.get_blocks(blocks, release_id,
                                                   article_id)
                    if len(
                            blocklist
                    ) > 0:  #if there are valid blocks according to this criteria
                        # NOTE(review): the next line is a bare expression
                        # with no effect; a method call on `subset_maker`
                        # was presumably intended.
                        subset_maker
                        # NOTE(review): `pr_set` / `news_set` are not defined
                        # in this method.
                        pr_set.add(release_id)
                        news_set.add(article_id)
                        pair_counter += 1

        print 'Total pairs: {0}'.format(pair_counter)
        subset_maker.save(subset_name_out)
Exemplo n.º 7
0
class TextWriter(object):
    """Writes the token text of selected releases and articles to files.

    Each document ends up in its own file below
    ``common.get_text_path(output_name, company_id)``, inside a
    sub-directory named after its document type and named after its id.
    """

    def __init__(self, company_id, release_ids, article_ids, output_name):
        """Store the parameters, create the token loader and the directories."""
        self._output_name = output_name
        self._company_id = company_id
        self._release_ids, self._article_ids = release_ids, article_ids
        self._tokens = TokenLoader(company_id)
        self._make_dirs()

    def write(self):
        """Write one text file per release id and per article id.

        The tokens of a document are joined with single spaces and every
        occurrence of the configured ``MARKER_BR`` value is replaced by a
        newline before the text is written.
        """
        br = ConfigReader().get('MARKER_BR')
        # (document type, ids to export, token getter) -- releases first.
        jobs = (
            (common.DOCTYPE_PR, self._release_ids,
             self._tokens.get_release_tokens),
            (common.DOCTYPE_NEWS, self._article_ids,
             self._tokens.get_article_tokens),
        )
        for doctype, doc_ids, fetch_tokens in jobs:
            for doc_id in doc_ids:
                text = ' '.join(fetch_tokens(doc_id, False)).replace(br, '\n')
                with open(self._get_filepath(doctype, doc_id), 'w') as f:
                    f.write(text)

    def _get_filepath(self, doctype, text_id):
        """Return ``<text path>/<doctype>/<text_id>`` for this writer."""
        return os.path.join(
            common.get_text_path(self._output_name, self._company_id),
            doctype,
            str(text_id))

    def _make_dirs(self):
        """Create the text directory and both document-type sub-directories.

        Directories that already exist are left alone.
        """
        base = common.get_text_path(self._output_name, self._company_id)
        wanted = (
            base,
            os.path.join(base, common.DOCTYPE_PR),
            os.path.join(base, common.DOCTYPE_NEWS),
        )
        for directory in wanted:
            if not os.path.exists(directory):
                os.mkdir(directory)
Exemplo n.º 8
0
    def __init__(self, company_id, release_ids, article_ids, required_length,
                 min_length, blocks_name_toignore):
        """Store the search parameters and load the company's data.

        Besides the plain parameters this loads the tokens, releases and
        articles of the company, the blocks stored under
        ``blocks_name_toignore`` and the release/article duplicates, and
        resets the counter of ignored blocks to zero.
        """
        self._company_id = company_id
        self._release_ids, self._article_ids = release_ids, article_ids
        self._required_length, self._min_length = required_length, min_length

        self._tokens = TokenLoader(company_id)
        self._releases = ReleaseLoader(company_id).get_releases()
        self._articles = ArticleLoader(company_id).get_articles()

        ignore_loader = BlockLoader(company_id, blocks_name_toignore)
        self._ignoreblocks = ignore_loader.get_blocks()
        self._count_ignore = 0

        duplicates = DuplicateLoader(company_id)
        self._rel_duplicates = duplicates.get_release_duplicates()
        self._art_duplicates = duplicates.get_article_duplicates()
Exemplo n.º 9
0
class MatchFilter(object):
    """Derives a filtered set of matches from an existing one.

    Matches are read with ``MatchLoader(company_id, match_name_in)`` and the
    surviving blocks are stored through
    ``MatchMaker(company_id, match_name_out)``.
    """

    def __init__(self, company_id, match_name_in, match_name_out):
        """Remember the company, the input/output names and load the tokens."""
        self._company_id = company_id
        self._match_name_in = match_name_in
        self._match_name_out = match_name_out
        self._tokens = TokenLoader(company_id)

    def filter_exclude_pairs(self, pairs_name):
        """Copy all matches except those of pairs listed under ``pairs_name``.

        A release/article pair is dropped when
        ``PairLoader(company_id, pairs_name).has_pair`` reports it.
        """
        matches = MatchLoader(self._company_id, self._match_name_in)
        maker = MatchMaker(self._company_id, self._match_name_out)
        pairs = PairLoader(self._company_id, pairs_name)

        for release_id in matches.get_release_ids():
            for article_id in matches.get_article_ids(release_id):
                if pairs.has_pair(release_id, article_id):
                    continue
                blocks = matches.get_matches(release_id, article_id)
                maker.add_blocks(release_id, article_id, blocks)
        maker.save()

    def filter_by_min_len(self, min_len):
        """Keep only blocks whose stripped release text has >= ``min_len`` tokens.

        Pairs that lose all of their blocks are not written to the output.
        """
        matches = MatchLoader(self._company_id, self._match_name_in)
        maker = MatchMaker(self._company_id, self._match_name_out)

        for release_id in matches.get_release_ids():
            for article_id in matches.get_article_ids(release_id):
                blocks = matches.get_matches(release_id, article_id)
                newblocks = self._blocks_of_min_len(release_id, blocks,
                                                    min_len)
                # Store the pair exactly once, after all of its blocks have
                # been examined.
                if newblocks:
                    maker.add_blocks(release_id, article_id, newblocks)
        maker.save()

    def _blocks_of_min_len(self, release_id, blocks, min_len):
        """Return the blocks covering at least ``min_len`` stripped tokens.

        ``blocks`` holds triples in which index 1 is the start position in
        the release and index 2 the block length; the original order is
        preserved.
        """
        kept = []
        for b in blocks:
            start = b[1]  # release start
            end = start + b[2]
            tkns = self._tokens.get_stripped_release_token_block(
                release_id, start, end)
            if len(tkns) >= min_len:
                kept.append(b)
        return kept
Exemplo n.º 10
0
def test_tokenloader():
    """Print the tokens of one article and one release of a company.

    The company id is read from the first command-line argument.  The
    article and the release are whichever entry the respective loader's
    dictionary yields first.
    """
    company_id = int(sys.argv[1])
    token_loader = TokenLoader(company_id)
    release_loader = ReleaseLoader(company_id)
    article_loader = ArticleLoader(company_id)

    print('Testing TokenLoader')
    print('company-id: {0}'.format(company_id))

    # (label, document getter, token getter) -- article first, then release.
    samples = (
        ('article-id: {0}', article_loader.get_articles,
         token_loader.get_article_tokens),
        ('release-id: {0}', release_loader.get_releases,
         token_loader.get_release_tokens),
    )
    for label, load_documents, load_tokens in samples:
        doc_id = load_documents().itervalues().next().id()
        print(label.format(doc_id))
        print('TOKENS:')
        print(load_tokens(doc_id, False))
Exemplo n.º 11
0
 def __init__(self, company_id, match_name_in, match_name_out):
     """Remember the company and the input/output match names.

     Also creates the ``TokenLoader`` for the company.
     """
     self._match_name_in, self._match_name_out = match_name_in, match_name_out
     self._company_id = company_id
     self._tokens = TokenLoader(company_id)
Exemplo n.º 12
0
 def __init__(self, company_id, matches_name):
     """Create the match and token loaders and read ``MARKER_BR``.

     The ``MARKER_BR`` value comes from ``ConfigReader`` and is kept in
     ``self._br``.
     """
     match_source = MatchLoader(company_id, matches_name)
     token_source = TokenLoader(company_id)
     settings = ConfigReader()

     self._matchloader = match_source
     self._tokens = token_source
     self._br = settings.get('MARKER_BR')
Exemplo n.º 13
0
class BlockFinder(object):
    """Prints the text of matched release blocks of one company.

    A block is a triple whose index 1 is the start position in the release
    and whose index 2 is the block length; the text of a block is taken
    from the stripped release tokens.
    """

    def __init__(self, company_id, matches_name):
        """Create the match and token loaders and read ``MARKER_BR``."""
        self._matchloader = MatchLoader(company_id, matches_name)
        self._tokens = TokenLoader(company_id)
        self._br = ConfigReader().get('MARKER_BR')

    def print_all_matching_blocks(self, min_len, max_len):
        """Print every matched block with ``min_len <= tokens < max_len``.

        One line is printed per block occurrence; the ``MARKER_BR`` value
        is replaced by a space.
        """
        for release_id in self._matchloader.get_release_ids():
            for text in self._release_block_texts(release_id, min_len,
                                                  max_len):
                print(text)

    def print_all_nondiscrim_release_blocks(self, min_len, max_len):
        """Print blocks that occur in more than one release.

        Such blocks are bad discriminators between releases.  Only blocks
        with ``min_len <= tokens < max_len`` are considered; texts are
        compared lower-cased and stripped, and a block is counted at most
        once per release.
        """
        # Distinct normalised block texts per release.
        blockset_dict = {}
        for release_id in self._matchloader.get_release_ids():
            blockset_dict[release_id] = set(
                text.lower().strip()
                for text in self._release_block_texts(release_id, min_len,
                                                      max_len))

        # Number of releases each block text occurs in.
        bcounts = {}
        for release_id in blockset_dict:
            for b in blockset_dict[release_id]:
                bcounts[b] = bcounts.get(b, 0) + 1

        # Print the blocks shared by at least two releases.
        for key in bcounts:
            if bcounts[key] > 1:
                print(key)

    def _release_block_texts(self, release_id, min_len, max_len):
        """Yield the text of each qualifying block of one release.

        Walks all articles matched to ``release_id`` and yields, for every
        block whose stripped token count lies in ``[min_len, max_len)``,
        the space-joined tokens with ``MARKER_BR`` replaced by a space.
        """
        for article_id in self._matchloader.get_article_ids(release_id):
            for block in self._matchloader.get_matches(release_id,
                                                       article_id):
                start = block[1]
                length = block[2]
                rel_match = self._tokens.get_stripped_release_token_block(
                    release_id, start, start + length)
                if min_len <= len(rel_match) < max_len:
                    yield ' '.join(rel_match).replace(self._br, ' ')
Exemplo n.º 14
0
class MatchFinder(object):
    """Finds token blocks that articles share with earlier releases.

    Releases and articles of one company are compared pairwise with
    ``difflib.SequenceMatcher``.  Matching blocks have the form
    ``(i, j, k)`` where ``i`` is the start in the article (seq1), ``j`` the
    start in the release (seq2) and ``k`` the length.
    """

    def __init__(self, company_id, release_ids, article_ids, required_length,
                 min_length, blocks_name_toignore):
        """Store the search parameters and load the company's data.

        Loads the tokens, releases and articles of the company, the blocks
        stored under ``blocks_name_toignore`` and the release/article
        duplicates, and resets the counter of ignored blocks.
        """
        self._company_id = company_id
        self._release_ids = release_ids
        self._article_ids = article_ids
        self._required_length = required_length
        self._min_length = min_length

        self._tokens = TokenLoader(company_id)

        self._releases = ReleaseLoader(company_id).get_releases()
        self._articles = ArticleLoader(company_id).get_articles()

        self._ignoreblocks = BlockLoader(company_id,
                                         blocks_name_toignore).get_blocks()
        self._count_ignore = 0

        dloader = DuplicateLoader(company_id)
        self._rel_duplicates = dloader.get_release_duplicates()
        self._art_duplicates = dloader.get_article_duplicates()

    def find_matches(self, output_name):
        """Compare every release with every article and save the matches.

        Releases and articles listed as duplicates are skipped, and only
        articles dated on or after the release date are compared.  Pairs
        with at least one valid block (see ``_get_blocks``) are added to a
        ``MatchMaker`` for ``output_name``, which is saved at the end.
        Progress and summary lines are printed along the way.
        """
        matchmaker = MatchMaker(self._company_id, output_name)
        matcher = SequenceMatcher(autojunk=False)
        message = 'Processing company {0}: release {1} of {2}; article {3} of {4}'
        pairs_counter = 0

        for i, release_id in enumerate(self._release_ids):
            if release_id in self._rel_duplicates:
                continue

            # The release is seq2: it is set once and reused for all articles.
            matcher.set_seq2(self._tokens.get_release_tokens(release_id, True))
            release_date = self._releases[release_id].date()

            for j, article_id in enumerate(self._article_ids):
                if article_id in self._art_duplicates:
                    continue

                if j % 100 == 0:
                    print(message.format(self._company_id, i + 1,
                                         len(self._release_ids), j + 1,
                                         len(self._article_ids)))

                # Check the date first so that tokens are only loaded for
                # articles that appeared on or after the release date.
                article_date = self._articles[article_id].date()
                if article_date >= release_date:
                    matcher.set_seq1(
                        self._tokens.get_article_tokens(article_id, True))

                    # get_matching_blocks() always ends with a zero-length
                    # dummy block, so the result is never empty.
                    valid_blocks = self._get_blocks(
                        matcher.get_matching_blocks(), release_id, article_id)
                    if valid_blocks:
                        matchmaker.add_blocks(release_id, article_id,
                                              valid_blocks)
                        print('\tfound match for release={0} and article={1}'.format(
                            release_id, article_id))
                        pairs_counter += 1

        print('total matching pairs: {0}'.format(pairs_counter))
        print('ignored bad discriminators: {0}'.format(self._count_ignore))
        matchmaker.save()

    def _get_blocks(self, blocks, release_id, article_id):
        """Return the valid blocks of a pair, longest first.

        A block is dropped when its lower-cased release text is one of the
        blocks to ignore (this also increments ``self._count_ignore``) or
        when it has fewer than ``self._min_length`` stripped tokens.  The
        remaining blocks are returned only if at least one of them has
        ``self._required_length`` or more tokens; otherwise the result is
        an empty list.

        Raises:
            Exception: if the release and article text of a block differ
                (compared case-insensitively); both texts are printed first.
        """
        blocklist = []
        required_length_check = False

        for b in blocks:
            i = b[0]
            j = b[1]
            k = b[2]

            rel_match = self._tokens.get_stripped_release_token_block(
                release_id, j, j + k)
            art_match = self._tokens.get_stripped_article_token_block(
                article_id, i, i + k)

            rel_temp = ' '.join(rel_match)
            art_temp = ' '.join(art_match)

            if rel_temp.lower() != art_temp.lower():
                print(rel_temp.lower())
                print(art_temp.lower())
                raise Exception("blocks don't match")

            #check against bad discriminators BEFORE updating required_length_check
            if rel_temp.lower() in self._ignoreblocks:
                self._count_ignore += 1
                continue

            #check for min_length BEFORE updating required_length_check
            if len(rel_match) < self._min_length:
                continue

            if len(rel_match) >= self._required_length:
                required_length_check = True

            blocklist.append(b)

        # The flag can only be set when a block was kept, so this also
        # covers the case of an empty list.
        if not required_length_check:
            return []

        #sort by length, descending
        return sorted(blocklist, key=itemgetter(2), reverse=True)
Exemplo n.º 15
0
class MatchWriter(object):
    """Renders the matches of one company as an HTML report.

    For every matched release/article pair the report contains a summary
    table of the matching blocks followed by both texts side by side, with
    the matching blocks wrapped in ``<span class="match matchN">``.  ``N``
    is the index of the block in the list returned by the match loader and
    is the same in the summary, the release text and the article text.

    Blocks have the form ``(i, j, k)``: start in the article, start in the
    release, length.
    """

    def __init__(self, company_id, matches_name):
        """Load matches, tokens, releases and articles of the company."""
        self._company_id = company_id
        self._matchloader = MatchLoader(company_id, matches_name)
        self._tokens = TokenLoader(company_id)
        self._releases = ReleaseLoader(company_id).get_releases()
        self._articles = ArticleLoader(company_id).get_articles()
        self._br = ConfigReader().get('MARKER_BR')

    def write_matches(self, output_path):
        """Write the report to ``<output_path>/<company_id>.html``."""
        html = self._build_html()
        filename = '{0}.html'.format(self._company_id)
        filepath = os.path.join(output_path, filename)
        self._write_html_to_file(filepath, html)

    def _build_html(self):
        """Return the table rows for all releases and their articles.

        Releases and articles are processed in date order.  The number of
        rendered pairs is printed before returning.
        """
        sb = []
        counter = 0
        releases = self._get_sorted_releases()

        for release in releases:
            self._write_release_header(sb, release)
            articles = self._get_sorted_articles(release.id())

            for article in articles:
                # NOTE(review): the company id is compared with the strings
                # '35' and '32'; these special cases only apply when the
                # writer was created with a string id -- confirm.
                #condition for id=35/32 only
                if self._company_id == '35':
                    delta = article.date() - release.date()
                    if delta.days >= TIME_DELTA and \
                            not (release.id() == 246 and article.id() == 944) and \
                            not (release.id() == 189 and article.id() == 1213) and \
                            not (release.id() == 71 and article.id() == 2557):
                        continue

                if self._company_id == '32':
                    delta = article.date() - release.date()
                    if delta.days >= TIME_DELTA:
                        continue

                blocks = self._matchloader.get_matches(release.id(),
                                                       article.id())

                self._write_article_summary(sb, blocks, release, article)
                self._write_texts(sb, blocks, release.id(), article.id())
                counter += 1

        print('{0}'.format(counter))
        return ''.join(sb)

    def _get_sorted_releases(self):
        """Return the matched releases sorted by date."""
        release_ids = self._matchloader.get_release_ids()
        rels = [self._releases[release_id] for release_id in release_ids]
        rels.sort(key=lambda x: x.date())
        return rels

    def _get_sorted_articles(self, release_id):
        """Return the articles matched to ``release_id`` sorted by date."""
        article_ids = self._matchloader.get_article_ids(release_id)
        arts = [self._articles[article_id] for article_id in article_ids]
        arts.sort(key=lambda x: x.date())
        return arts

    def _write_release_header(self, sb, release):
        """Append the title row of a release to ``sb``."""
        sb.append('\n\t<tr>\n\t\t<td colspan="2" class="release-title">')
        sb.append('{0} --- {1} --- {2}\n\t\t</td>\n\t</tr>'.format(
            release.id(), release.date().strftime('%B %d'), release.title()))

    def _write_article_summary(self, sb, blocks, release, article):
        """Append the summary table of one release/article pair to ``sb``.

        The table names the release and the article and lists every block
        with its number, length and text.

        Raises:
            Exception: if the release and article text of a block differ
                (compared case-insensitively); both texts are printed first.
        """
        sb.append('\n\t<tr><td colspan=2>')
        sb.append(
            '\n\t\t<table class="tbl-inner1" cellpadding="5" border="1">')

        sb.append(
            '\n\t\t\t<tr class="tbl-inner1-title"><td colspan="3" class="article-title">'
        )
        sb.append('R: {0} --- {1} --- {2}\n\t\t</td>\n\t</tr>'.format(
            release.id(), release.date().strftime('%B %d'), release.title()))

        sb.append(
            '\n\t\t\t<tr class="tbl-inner1-title"><td colspan="3" class="article-title">'
        )
        sb.append('A: {0} --- {1} --- {2} --- {3}\n\t\t</td>\n\t</tr>'.format(
            article.id(), article.date().strftime('%B %d'),
            article.headline(), article.pub()))

        sb.append(
            '\n\t\t\t<tr class="tbl-inner1-title"><td>#</td><td>length</td><td>match</td></tr>'
        )

        for count, block in enumerate(blocks):
            i = block[0]  #start in article
            j = block[1]  #start in release
            k = block[2]  #length

            rel_match = self._tokens.get_stripped_release_token_block(
                release.id(), j, j + k)
            art_match = self._tokens.get_stripped_article_token_block(
                article.id(), i, i + k)

            rel_temp = ' '.join(rel_match)
            art_temp = ' '.join(art_match)

            rel_temp = rel_temp.replace(self._br, ' ')
            art_temp = art_temp.replace(self._br, ' ')

            if rel_temp.lower() != art_temp.lower():
                print(rel_temp.lower())
                print(art_temp.lower())
                raise Exception("blocks don't match")

            sb.append('\n\t\t\t<tr valign="top">')
            sb.append('\n\t\t\t\t<td>{0}</td>'.format(count + 1))
            sb.append('\n\t\t\t\t<td>{0}</td>'.format(k))
            sb.append(
                '\n\t\t\t\t<td><span class="match match{0}">{1}</span>\n\t\t</td>'
                .format(count, rel_temp))
            sb.append('\n\t\t\t</tr>')

        sb.append('\n\t\t</table>')
        sb.append('\n\t</td></tr>')

    def _write_texts(self, sb, blocks, release_id, article_id):
        """Append the row with the highlighted release and article text."""
        rel_tokens = self._tokens.get_release_tokens(release_id, False)
        art_tokens = self._tokens.get_article_tokens(article_id, False)
        rel_html = self._get_text(blocks, rel_tokens, POS_IN_BLOCK_REL)
        art_html = self._get_text(blocks, art_tokens, POS_IN_BLOCK_ART)

        sb.append('\n\t<tr valign="top">')
        sb.append('\n\t\t<td width="50%">{0}\n\t\t</td>'.format(rel_html))
        sb.append('\n\t\t<td width="50%">{0}\n\t\t</td>'.format(art_html))
        sb.append('\n\t</tr>')

    def _get_text(self, blocks, orig_tokens, pos_in_block):
        """Return the text as HTML with every block wrapped in a span.

        Args:
            blocks: the ``(i, j, k)`` blocks of the pair.
            orig_tokens: tokens of the text; the list is not modified.
            pos_in_block: index inside a block that holds the start
                position for this text.

        The span class ``match{N}`` uses the index of the block in
        ``blocks``, so a block gets the same number no matter which text is
        rendered.  ``MARKER_BR`` is replaced by ``<br/>``.
        """
        span_start = '<span class="match match{0}">'
        span_end = '</span>'
        #work on a copy
        tokens = list(orig_tokens)

        # Number the blocks in their given order first, then process them
        # by position in the text so that the insert offsets add up.
        numbered = sorted(enumerate(blocks),
                          key=lambda pair: pair[1][pos_in_block])
        offset = 0  # tokens inserted so far
        for number, block in numbered:
            pos = block[pos_in_block]  #position in text
            k = block[2]  #length

            tokens.insert(pos + offset, span_start.format(number))
            tokens.insert(pos + k + offset + 1, span_end)
            offset += 2

        html = ' '.join(tokens)
        html = html.replace(self._br, '<br/>')
        return html

    def _write_html_to_file(self, output_path, html):
        """Write the table rows in ``html`` as a complete HTML page."""
        with open(output_path, 'w') as f:
            f.write('<html>\n<head>')
            f.write(
                '\n\t<link rel="stylesheet" type="text/css" href="styles.css">'
            )
            f.write('\n</head>\n<body>\n')
            f.write('\n<table class="tbl-main" cellpadding="5" border="1">')
            f.write(html)
            f.write('\n</table>')
            f.write('\n\n</body>\n</html>')