def do_match(infobox_path, text_path, out_path):
    """Tag TREC documents whose <title> matches an infobox entry.

    Loads the infobox mapping from infobox_path, streams TREC docs from
    text_path, and for every doc whose title has an infobox entry writes
    the tagged version to out_path.  Docs without a matching entry are
    skipped.  Progress is printed every 100 documents.
    """
    import Corpus
    import time

    print('loading......')
    infobox = load_infobox(infobox_path)
    reader = Corpus.TRECReader()
    reader.open(text_path)
    writer = Corpus.TRECWriter(out_path)
    matcher = InfoBoxMatcher()

    t0 = time.time()
    count = 0
    doc = reader.next()
    while doc:
        lines = doc.text.split('\n')
        # Title is expected on the second line wrapped in tags, e.g.
        # "<title>Foo</title>"; take the text between the first '>' and
        # the following '<'.  Guard against short/malformed docs, which
        # previously raised IndexError.
        title = ''
        if len(lines) > 1:
            title_line = lines[1]
            begin = title_line.find('>')
            end = title_line.find('<', begin + 1)
            if begin >= 0 and end >= 0:
                title = title_line[begin + 1:end].strip()
        if title in infobox:  # was infobox.has_key(title) (removed in py3)
            tagged_text = matcher.match(infobox[title], lines[3:])
            # Keep the 3-line TREC header, replace the body with tagged text.
            doc.text = '\n'.join(lines[:3]) + '\n' + tagged_text
            writer.write(doc)
        doc = reader.next()
        count += 1
        if count % 100 == 0:
            print('%s %s' % (count, time.time() - t0))
    reader.close()  # was leaked before
    writer.close()
def do_filter(sample_url_path, corpus_path, sample_corpus_path):
    """Copy to sample_corpus_path only the TREC docs from corpus_path
    whose <title> text appears in the sample URL list.

    Each non-blank line of sample_url_path contributes one name: the
    last '/'-separated component of the line's first whitespace field.
    Progress is printed every 1000 written documents.
    """
    import Corpus

    # Build the name set once; skip blank lines (previously IndexError)
    # and close the file handle (previously leaked).
    with open(sample_url_path) as url_file:
        name_set = set(
            line.strip().split()[0].split('/')[-1]
            for line in url_file if line.strip())

    trec_reader = Corpus.TRECReader()
    trec_reader.open(corpus_path)
    trec_writer = Corpus.TRECWriter(sample_corpus_path)

    start_title_tag = '<title>'
    start_title_tag_len = len(start_title_tag)
    end_title_tag = '</title>'
    count = 0
    doc = trec_reader.next()
    while doc:
        text = doc.text
        start = text.find(start_title_tag)
        end = text.find(end_title_tag)
        title = ''
        if start >= 0 and end >= 0:
            title = text[start + start_title_tag_len:end]
        if title in name_set:  # was name_set.__contains__(title)
            trec_writer.write(doc)
            count += 1
            if count % 1000 == 0:
                print(count)
        doc = trec_reader.next()
    trec_reader.close()
    trec_writer.close()
def do_batch_apply(trec_path, model_dir, pattern_path, out_path, lib_dir):
    """Run the Stanford tagger over a TREC file, apply model-based tags,
    and write the re-assembled documents to out_path.

    NOTE(review): depends on module-level names defined elsewhere in this
    file (class_path, stanford_tag_program, prune_t, label_t,
    get_classpath, check_java_compile, apply_tag, TaggedText, Corpus,
    subprocess, time, os) -- confirm they are in scope before moving.
    """
    get_classpath(lib_dir)
    check_java_compile(lib_dir)
    # First whitespace-separated field of each line is a pattern name.
    pattern_set = set(
        map(lambda line: line.split()[0], open(pattern_path).readlines()))
    base_tag_trec_path = '%s.basetag' % trec_path
    # Large fixed JVM heap (13G) for the Stanford tagger batch run.
    command = ['java', '-Xms13G', '-Xmx13G', '-classpath', class_path,
               stanford_tag_program, '--batch-trec', trec_path,
               base_tag_trec_path]
    print ' '.join(command)
    subprocess.call(command)
    t = time.time()
    reader = Corpus.TRECReader()
    reader.open(base_tag_trec_path)
    doc = reader.next()
    indecies = [0]  # cumulative tagged-text lengths per doc (sic: "indices")
    ids = []
    all_tagged_text = None
    while doc:
        tagged_text = TaggedText()
        # Drop markup lines (those starting with '<') before parsing tags.
        tagged_text.get_from_string('\n'.join(
            filter(lambda line: not line.startswith('<'),
                   doc.text.split('\n'))))
        if all_tagged_text:
            all_tagged_text += tagged_text
        else:
            all_tagged_text = tagged_text
        indecies.append(len(all_tagged_text))
        # NOTE(review): tagged_text is re-bound per iteration, yet the
        # writer loop below slices only the LAST doc's tagged_text using
        # offsets accumulated over all_tagged_text.  Together with the
        # commented-out batch apply_tag call below, this looks like a
        # half-finished refactor -- confirm intended behavior before use.
        tagged_text = apply_tag(trec_path, tagged_text, model_dir,
                                pattern_set)
        ids.append(doc.ID)
        doc = reader.next()
    reader.close()
    os.remove(base_tag_trec_path)
    #tagged_text = apply_tag(trec_path, all_tagged_text, model_dir, pattern_set)
    print len(tagged_text)
    writer = Corpus.TRECWriter(out_path)
    for i in xrange(len(ids)):
        doc = Corpus.Document(
            ids[i], tagged_text[indecies[i]:indecies[i + 1]].__str__())
        writer.write(doc)
    writer.close()
    # prune_t / label_t are timing accumulators updated elsewhere.
    global prune_t, label_t
    print time.time() - t, prune_t, label_t
def do_batch(in_trec, out_trec):
    """Convert each wiki-markup TREC doc in in_trec to plain text and
    write it to out_trec, wrapping the first line in <title> tags.
    Prints progress every 1000 documents."""
    import Corpus

    reader = Corpus.TRECReader()
    reader.open(in_trec)
    writer = Corpus.TRECWriter(out_trec)

    count = 1
    doc = reader.next()
    while doc:
        plain_text = Wiki2Plain(doc.text).text
        newline_pos = plain_text.find('\n')
        if newline_pos > 0:
            # The first line is the page title; tag it, keep the rest as-is.
            head = plain_text[:newline_pos]
            tail = plain_text[newline_pos:]
            plain_text = '<title>%s</title>%s' % (head, tail)
        doc.text = plain_text
        writer.write(doc)
        if count % 1000 == 0:
            print(count)
        count += 1
        doc = reader.next()

    reader.close()
    writer.close()
def __init__(self, names, out_path):
    """Remember the name set and open a TREC writer at out_path;
    start the running id at 1 and the document count at 0."""
    import Corpus

    self.name_set = set(names)
    self.writer = Corpus.TRECWriter(out_path)
    self.id = 1
    self.count = 0