def write_tags(self, matches_name, company_id):
    dic_rel = {}
    dic_art = {}
    matches = MatchLoader(company_id, matches_name)
    tokens = TokenLoader(company_id)
    rel_ids = matches.get_release_ids()
    for count, release_id in enumerate(rel_ids):
        print 'processing release #{0} of {1}'.format(count + 1, len(rel_ids))
        tmp = tokens.get_release_tokens(release_id, False)
        self._process_tokens(tmp, dic_rel, release_id)
    art_ids = matches.get_article_ids()
    for count, article_id in enumerate(art_ids):
        print 'processing article #{0} of {1}'.format(count + 1, len(art_ids))
        tmp = tokens.get_article_tokens(article_id, False)
        self._process_tokens(tmp, dic_art, article_id)
    path1 = common.get_postags_path()
    path2 = os.path.join(path1, matches_name)
    path = os.path.join(path2, common.DOCTYPE_PR)
    self._pickle(company_id, dic_rel, path)
    path = os.path.join(path2, common.DOCTYPE_NEWS)
    self._pickle(company_id, dic_art, path)
def test_matchloader():
    company_id = int(sys.argv[1])
    matches_name = sys.argv[2]
    m = MatchLoader(company_id, matches_name)
    print 'Testing MatchLoader'
    print 'company-id: {0}'.format(company_id)
    print 'releases: {0}'.format(len(m.get_release_ids()))
    print 'articles: {0}'.format(len(m.get_article_ids()))
    print 'pairs: {0}'.format(m.count_rel_art_pairs())
    print 'blocks: {0}'.format(m.count_matching_blocks())
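# Presumed script entry point (assumption -- the guard is not shown in the
# original dump): test_matchloader() reads its arguments from sys.argv, so it
# is presumably run as e.g. `python test_matchloader.py 7 matches_exact`
# (the script name and matches name here are illustrative only).
if __name__ == '__main__':
    test_matchloader()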
def filter_exclude_pairs(self, pairs_name):
    matches = MatchLoader(self._company_id, self._match_name_in)
    maker = MatchMaker(self._company_id, self._match_name_out)
    pairs = PairLoader(self._company_id, pairs_name)
    for release_id in matches.get_release_ids():
        for article_id in matches.get_article_ids(release_id):
            if not pairs.has_pair(release_id, article_id):
                blocks = matches.get_matches(release_id, article_id)
                maker.add_blocks(release_id, article_id, blocks)
    maker.save()
def print_matrix(self):
    sb = []
    sb.append('co-id, rel-id, art-id, rel-len, art-len, rel-used, art-added, '
              'rel-subj-score, art-subj-score, rel-sent-score, art-sent-score\n')
    # sb.append('co-id rel-id art-id rel-len art-len rel-used art-added '
    #           'rel-subj-score art-subj-score rel-sent-score art-sent-score\n')
    for company_id in range(1, 41):
        matches = MatchLoader(company_id, self._match_name)
        tokens = TokenLoader(company_id)
        scores = ScoreLoader(company_id)
        for release_id in matches.get_release_ids():
            rel_tokens = tokens.get_stripped_release_token_block(release_id, 0, sys.maxint)
            # release subjectivity score (float division to avoid integer truncation)
            rel_subj = float(scores.count_subj_rel_sentences(release_id)) / \
                scores.count_all_rel_sentences(release_id)
            # release sentiment score
            if scores.count_subj_rel_sentences(release_id) == 0:
                rel_sent = 0
            else:
                pos_minus_neg = scores.count_pos_rel_sentences(release_id) - \
                    scores.count_neg_rel_sentences(release_id)
                rel_sent = float(pos_minus_neg) / scores.count_subj_rel_sentences(release_id)
            for article_id in matches.get_article_ids(release_id):
                art_tokens = tokens.get_stripped_article_token_block(article_id, 0, sys.maxint)
                blocks = matches.get_matches(release_id, article_id)
                blocklen = 0
                for b in blocks:
                    start = b[1]
                    length = b[2]
                    end = start + length
                    block_tokens = tokens.get_stripped_release_token_block(release_id, start, end)
                    blocklen += len(block_tokens)
                rel_used = float(blocklen) / len(rel_tokens)
                art_added = 1 - float(blocklen) / len(art_tokens)
                # article subjectivity score
                art_subj = float(scores.count_subj_art_sentences(article_id)) / \
                    scores.count_all_art_sentences(article_id)
                # article sentiment score
                if scores.count_subj_art_sentences(article_id) == 0:
                    art_sent = 0
                else:
                    pos_minus_neg = scores.count_pos_art_sentences(article_id) - \
                        scores.count_neg_art_sentences(article_id)
                    art_sent = float(pos_minus_neg) / scores.count_subj_art_sentences(article_id)
                # sb.append('{0} {1} {2} {3} {4} {5} {6} {7} {8} {9} {10}\n'.format(
                sb.append('{0}, {1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}, {9}, {10}\n'.format(
                    company_id, release_id, article_id, len(rel_tokens), len(art_tokens),
                    rel_used, art_added, rel_subj, art_subj, rel_sent, art_sent))
    text = ''.join(sb)
    print text
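# Illustrative sketch (hypothetical helper, not in the original source): the
# two reuse ratios computed inside print_matrix, shown in isolation. Float
# division matters -- with Python 2 integers, 200/500 would silently be 0.
def _reuse_metrics(block_len, rel_len, art_len):
    rel_used = float(block_len) / rel_len       # fraction of the release that was reused
    art_added = 1 - float(block_len) / art_len  # fraction of the article that is new text
    return rel_used, art_added

# e.g. _reuse_metrics(200, 500, 400) -> (0.4, 0.5)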
def main():
    company_id = int(sys.argv[1])
    input_name = sys.argv[2]
    input_type = sys.argv[3]
    output_name = sys.argv[4]
    if input_type == 'm':
        mloader = MatchLoader(company_id, input_name)
        release_ids = mloader.get_release_ids()
        article_ids = mloader.get_article_ids()
        tw = TextWriter(company_id, release_ids, article_ids, output_name)
        tw.write()
def main():
    company_id = int(sys.argv[1])
    input_name = sys.argv[2]
    output_name = sys.argv[3]
    # note: the loop below overrides the company_id read from sys.argv and
    # processes companies 21-40 in a single run
    for company_id in range(21, 41):
        print 'PROCESSING COMPANY {0}'.format(company_id)
        mloader = MatchLoader(company_id, input_name)
        release_ids = mloader.get_release_ids()
        article_ids = mloader.get_article_ids()
        sw = SentenceWriter(company_id, release_ids, article_ids, output_name)
        sw.write_and_calculate()
def filter_by_min_len(self, min_len):
    matches = MatchLoader(self._company_id, self._match_name_in)
    maker = MatchMaker(self._company_id, self._match_name_out)
    for release_id in matches.get_release_ids():
        for article_id in matches.get_article_ids(release_id):
            blocks = matches.get_matches(release_id, article_id)
            newblocks = []
            for b in blocks:
                start = b[1]  # release start
                length = b[2]
                end = start + length
                tkns = self._tokens.get_stripped_release_token_block(
                    release_id, start, end)
                if len(tkns) >= min_len:
                    newblocks.append(b)
            if len(newblocks) > 0:
                maker.add_blocks(release_id, article_id, newblocks)
    maker.save()
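# Hedged sketch (the class name and constructor wiring are assumptions; only
# the two filter methods above appear in the original): filter_exclude_pairs
# and filter_by_min_len rely on self._company_id, self._match_name_in,
# self._match_name_out and self._tokens, so a minimal owning class would look
# roughly like this.
class MatchFilter(object):
    def __init__(self, company_id, match_name_in, match_name_out):
        self._company_id = company_id
        self._match_name_in = match_name_in
        self._match_name_out = match_name_out
        self._tokens = TokenLoader(company_id)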
def print_pairs(self):
    sb = []
    for company_id in range(1, 41):
        matches = MatchLoader(company_id, self._match_name)
        for release_id in matches.get_release_ids():
            # the original source is truncated here; a plausible completion is
            # to collect every matched (company, release, article) triple
            for article_id in matches.get_article_ids(release_id):
                sb.append('{0}, {1}, {2}\n'.format(company_id, release_id, article_id))
    print ''.join(sb)
class BlockFinder(object):

    def __init__(self, company_id, matches_name):
        self._matchloader = MatchLoader(company_id, matches_name)
        self._tokens = TokenLoader(company_id)
        self._br = ConfigReader().get('MARKER_BR')

    def print_all_matching_blocks(self, min_len, max_len):
        for release_id in self._matchloader.get_release_ids():
            for article_id in self._matchloader.get_article_ids(release_id):
                blocks = self._matchloader.get_matches(release_id, article_id)
                for block in blocks:
                    i = block[0]
                    j = block[1]
                    k = block[2]
                    rel_match = self._tokens.get_stripped_release_token_block(
                        release_id, j, j + k)
                    if len(rel_match) >= min_len and len(rel_match) < max_len:
                        mb = ' '.join(rel_match)
                        mb = mb.replace(self._br, ' ')
                        print mb

    # prints blocks of min_len or larger occurring in more than one release -
    # i.e., bad discriminators between releases
    def print_all_nondiscrim_release_blocks(self, min_len, max_len):
        blockset_dict = {}
        for release_id in self._matchloader.get_release_ids():
            blockset = set()  # set of blocks for the current release
            blockset_dict[release_id] = blockset
            for article_id in self._matchloader.get_article_ids(release_id):
                blocks = self._matchloader.get_matches(release_id, article_id)
                for block in blocks:
                    i = block[0]
                    j = block[1]
                    k = block[2]
                    rel_match = self._tokens.get_stripped_release_token_block(
                        release_id, j, j + k)
                    if len(rel_match) >= min_len and len(rel_match) < max_len:
                        mb = ' '.join(rel_match)
                        mb = mb.replace(self._br, ' ')
                        mb = mb.lower().strip()
                        blockset.add(mb)
        # count, for each block, the number of releases it occurs in
        bcounts = {}
        for release_id in blockset_dict:
            blockset = blockset_dict[release_id]
            for b in blockset:
                if b in bcounts:
                    bcounts[b] += 1
                else:
                    bcounts[b] = 1
        # print blocks which occur in more than one release
        result = [key for key in bcounts if bcounts[key] > 1]
        for r in result:
            print r
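# Hedged usage sketch (the company id, matches name and length bounds below
# are illustrative only, not taken from the original source):
if __name__ == '__main__':
    finder = BlockFinder(7, 'matches_exact')
    finder.print_all_matching_blocks(min_len=5, max_len=50)
    finder.print_all_nondiscrim_release_blocks(min_len=5, max_len=50)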
class MatchWriter(object):

    def __init__(self, company_id, matches_name):
        self._company_id = company_id
        self._matchloader = MatchLoader(company_id, matches_name)
        self._tokens = TokenLoader(company_id)
        self._releases = ReleaseLoader(company_id).get_releases()
        self._articles = ArticleLoader(company_id).get_articles()
        self._br = ConfigReader().get('MARKER_BR')

    def write_matches(self, output_path):
        html = self._build_html()
        filename = '{0}.html'.format(self._company_id)
        filepath = os.path.join(output_path, filename)
        self._write_html_to_file(filepath, html)

    def _build_html(self):
        sb = []
        counter = 0
        releases = self._get_sorted_releases()
        for release in releases:
            self._write_release_header(sb, release)
            articles = self._get_sorted_articles(release.id())
            for article in articles:
                # special-case filtering for companies 35 and 32 only
                # (company_id is compared as a string here, i.e. as read from sys.argv)
                if self._company_id == '35':
                    delta = article.date() - release.date()
                    if delta.days >= TIME_DELTA and \
                            not (release.id() == 246 and article.id() == 944) and \
                            not (release.id() == 189 and article.id() == 1213) and \
                            not (release.id() == 71 and article.id() == 2557):
                        continue
                if self._company_id == '32':
                    delta = article.date() - release.date()
                    if delta.days >= TIME_DELTA:
                        continue
                blocks = self._matchloader.get_matches(release.id(), article.id())
                self._write_article_summary(sb, blocks, release, article)
                self._write_texts(sb, blocks, release.id(), article.id())
                counter += 1
                print '{0}'.format(counter)
        return ''.join(sb)

    def _get_sorted_releases(self):
        ids = self._matchloader.get_release_ids()
        rels = [self._releases[id] for id in ids]
        rels.sort(key=lambda x: x.date())
        return rels

    def _get_sorted_articles(self, release_id):
        ids = self._matchloader.get_article_ids(release_id)
        arts = [self._articles[id] for id in ids]
        arts.sort(key=lambda x: x.date())
        return arts

    def _write_release_header(self, sb, release):
        sb.append('\n\t<tr>\n\t\t<td colspan="2" class="release-title">')
        sb.append('{0} --- {1} --- {2}\n\t\t</td>\n\t</tr>'.format(
            release.id(), release.date().strftime('%B %d'), release.title()))

    def _write_article_summary(self, sb, blocks, release, article):
        sb.append('\n\t<tr><td colspan=2>')
        sb.append('\n\t\t<table class="tbl-inner1" cellpadding="5" border="1">')
        sb.append('\n\t\t\t<tr class="tbl-inner1-title"><td colspan="3" class="article-title">')
        sb.append('R: {0} --- {1} --- {2}\n\t\t</td>\n\t</tr>'.format(
            release.id(), release.date().strftime('%B %d'), release.title()))
        sb.append('\n\t\t\t<tr class="tbl-inner1-title"><td colspan="3" class="article-title">')
        sb.append('A: {0} --- {1} --- {2} --- {3}\n\t\t</td>\n\t</tr>'.format(
            article.id(), article.date().strftime('%B %d'),
            article.headline(), article.pub()))
        sb.append('\n\t\t\t<tr class="tbl-inner1-title"><td>#</td><td>length</td><td>match</td></tr>')
        for count, block in enumerate(blocks):
            i = block[0]  # start in article
            j = block[1]  # start in release
            k = block[2]  # length
            rel_match = self._tokens.get_stripped_release_token_block(
                release.id(), j, j + k)
            art_match = self._tokens.get_stripped_article_token_block(
                article.id(), i, i + k)
            rel_temp = ' '.join(rel_match)
            art_temp = ' '.join(art_match)
            rel_temp = rel_temp.replace(self._br, ' ')
            art_temp = art_temp.replace(self._br, ' ')
            if rel_temp.lower() != art_temp.lower():
                print rel_temp.lower()
                print art_temp.lower()
                raise Exception("blocks don't match")
            sb.append('\n\t\t\t<tr valign="top">')
            sb.append('\n\t\t\t\t<td>{0}</td>'.format(count + 1))
            sb.append('\n\t\t\t\t<td>{0}</td>'.format(k))
            sb.append('\n\t\t\t\t<td><span class="match match{0}">{1}</span>\n\t\t</td>'
                      .format(count, rel_temp))
            sb.append('\n\t\t\t</tr>')
        sb.append('\n\t\t</table>')
        sb.append('\n\t</td></tr>')

    def _write_texts(self, sb, blocks, release_id, article_id):
        rel_tokens = self._tokens.get_release_tokens(release_id, False)
        art_tokens = self._tokens.get_article_tokens(article_id, False)
        rel_html = self._get_text(blocks, rel_tokens, POS_IN_BLOCK_REL)
        art_html = self._get_text(blocks, art_tokens, POS_IN_BLOCK_ART)
        sb.append('\n\t<tr valign="top">')
        sb.append('\n\t\t<td width="50%">{0}\n\t\t</td>'.format(rel_html))
        sb.append('\n\t\t<td width="50%">{0}\n\t\t</td>'.format(art_html))
        sb.append('\n\t</tr>')

    def _get_text(self, blocks, orig_tokens, pos_in_block):
        span_start = '<span class="match match{0}">'
        span_end = '</span>'
        # clone list
        tokens = orig_tokens[:]
        # sort by position in the text (release or article)
        blocks = sorted(blocks, key=itemgetter(pos_in_block))
        a = 0
        for count, block in enumerate(blocks):
            pos = block[pos_in_block]  # position in text
            k = block[2]  # length
            tokens.insert(pos + a, span_start.format(count))
            tokens.insert(pos + k + a + 1, span_end)
            a += 2
        html = ' '.join(tokens)
        html = html.replace(self._br, '<br/>')
        return html

    def _write_html_to_file(self, output_path, html):
        with open(output_path, 'w') as f:
            f.write('<html>\n<head>')
            f.write('\n\t<link rel="stylesheet" type="text/css" href="styles.css">')
            f.write('\n</head>\n<body>\n')
            f.write('\n<table class="tbl-main" cellpadding="5" border="1">')
            f.write(html)
            f.write('\n</table>')
            f.write('\n\n</body>\n</html>')
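# Hedged usage sketch (the matches name and output directory are illustrative
# only; note that company_id is expected as a string by the special-case
# checks in _build_html):
if __name__ == '__main__':
    writer = MatchWriter('35', 'matches_exact')
    writer.write_matches('output_html')  # writes 35.html into that directory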