def setup_DUC_basic(task, skip_updates=False):
    """
    task.topic_file: sgml file for DUC
    task.doc_path: path containing source documents
    task.manual_path: path for manual (human) summaries
    """

    ## get all document data
    all_docs = {}
    files = util.get_files(task.doc_path, r'\w{2,3}\d+[\.\-]\d+')
    sys.stderr.write('Loading [%d] files\n' % len(files))
    for file in files:
        id = os.path.basename(file)
        all_docs[id] = file

    ## initialize problems
    problems = []
    data = open(task.topic_file).read().replace('\n', ' ')
    topics = re.findall('<topic>.+?</topic>', data)
    sys.stderr.write('Setting up [%d] problems\n' % len(topics))
    for topic in topics:
        id = util.remove_tags(re.findall('<num>.+?</num>', topic)[0])[:-1]
        title = util.remove_tags(re.findall('<title>.+?</title>', topic)[0])
        narr = util.remove_tags(re.findall('<narr>.+?</narr>', topic)[0])
        docsets = re.findall('<docs.*?>.+?</docs.*?>', topic)
        docsets = map(util.remove_tags, docsets)
        docsets = [d.split() for d in docsets]
        old_docs = []
        for docset_index in range(len(docsets)):

            ## update naming convention is different from the main task
            if len(docsets) > 1:
                id_ext = '-' + 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'[docset_index]
            else:
                id_ext = ''

            new_docs = [all_docs[doc] for doc in docsets[docset_index]]

            ## create a SummaryProblem
            problem = SummaryProblem(id + id_ext, title, narr, new_docs, old_docs)
            old_docs += new_docs

            ## include training data in problem
            if task.manual_path:
                problem._load_training(task.manual_path)

            problems.append(problem)

            ## skip updates?
            if skip_updates:
                break

    task.problems = problems
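## A minimal usage sketch for setup_DUC_basic. The Task container below is
## hypothetical (the real task object is constructed elsewhere in this
## codebase); only the three attributes read above, plus .problems, are
## modeled, and the file paths are illustrative.
class Task:
    def __init__(self, topic_file, doc_path, manual_path=None):
        self.topic_file = topic_file    ## sgml topic file for DUC
        self.doc_path = doc_path        ## directory of source documents
        self.manual_path = manual_path  ## manual (human) summaries, optional
        self.problems = []

task = Task('duc2006_topics.sgml', 'duc2006/docs', 'duc2006/models')
setup_DUC_basic(task, skip_updates=True)
sys.stderr.write('Created [%d] problems\n' % len(task.problems))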
def __init__(self, path, is_clean=False):
    """
    path is the location of the file to process
    is_clean=True means that file has no XML or other markup: just text
    """
    self.id = 'NONE'
    self.date = 'NONE'
    self.source = 'NONE'
    self.paragraphs = []
    self._isempty = True

    ## get generic info: accept either a file path or a raw '<DOC>' string
    if os.path.isfile(path):
        rawdata = open(path).read()
    elif path.strip().startswith('<DOC>'):
        rawdata = path
    else:
        sys.stderr.write('ERROR: could not read: %s\n' % path)
        return

    try:
        self.id = util.remove_tags(re.findall('<DOCNO>[^>]+</DOCNO>', rawdata[:100])[0])
    except:
        ## fall back to the attribute style: <DOC id="AFP_ENG_20050125.0151" type="story" >
        match = re.search('<DOC id=\"([^"]+)\"', rawdata[:100])
        if match:
            self.id = str(match.group(1))
        else:
            sys.stderr.write('ERROR: no <DOCNO>/<DOC id=...> tag: %s\n' % path)

    ## source and date from id (assumes newswire style)
    if self.id != 'NONE':
        self.source = re.findall(r'^[^_\d]*', self.id)[0]
        self.date = self.id.replace(self.source, '')

    ## parse various types of newswire xml
    if is_clean:
        text = self._parse_clean(rawdata)
    else:
        text = self._parse_newswire(rawdata)

    if len(text) == 0:
        #sys.stderr.write('WARNING: no text read for: %s\n' % path)
        return

    self.paragraphs = []
    for paragraph in text:
        fixed_par = self._fix_newswire(paragraph)
        if fixed_par == '':
            continue
        self.paragraphs.append(fixed_par)

    self._isempty = False
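## Hedged usage sketch: this __init__ belongs to a document class whose name
## is not shown in this fragment ('Document' below is an assumption), and it
## accepts either a file path or a raw '<DOC>...' string. The path is
## illustrative only.
doc = Document('duc2006/docs/APW_ENG_20050125.0151', is_clean=False)
if not doc._isempty:
    sys.stderr.write('%s [%s / %s]: %d paragraphs\n'
                     % (doc.id, doc.source, doc.date, len(doc.paragraphs)))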
url = "http://soe.stanford.edu/research/pers_index_results.php?index=%s" % chr(c) doc = util.dl_and_prep(url) results += re.findall(pat, doc) print len(results), 'total professors' output = [] for prof in results: pd = {} pd['lab_website'] = 'http://soe.stanford.edu/research/%s' % prof[0] pd['source'] = 'http://soe.stanford.edu/research/%s' % prof[0] pd['name'] = prof[1] #extract the primary deptmartment from within the <b> tags if '<b>' in prof[2]: pd['department'] = re.findall('<b>(.*?)</b>', prof[2])[0] else: pd['department'] = util.prep_department(util.remove_tags(prof[2])) research = prof[3].replace(' ', '').strip() if len(research) > 0: pd['keywords'] = util.split_and_clean(research, ',') pd['school'] = 'Stanford University' personal_page = util.dl_and_prep(pd['lab_website']) summary = re.findall('<h3>Research Statement</h3><p>(.*?)</p><h3>Degrees</h3>', personal_page) if summary: pd['research_summary'] = util.html_escape(summary[0].strip()) try: pd['image'] = 'http://soe.stanford.edu/research/%s' % re.findall('\'(images/photos_faculty_staff/.*?)\'', personal_page)[0] except Exception: import pdb; pdb.set_trace() pd['title'] = re.findall("Title:</td><td class=\"data\">(.*?)</td>", personal_page)[0] personal_website = re.findall("URL:</TD><TD class=\"data\"><a href='(.*?)'", personal_page)
                ## note: re.sub's fourth positional argument is count, not
                ## flags, so re.UNICODE must be passed as flags= here
                line = re.sub(ur'([^0-9]|^)1([^0-9])', ur'\1一\2', line, flags=re.UNICODE)
                # replace <SB> with period
                #line = re.sub(ur'<SB>$', ur'.', line, flags=re.UNICODE)
                #line = remove_tags(line)
                # TODO: should not remove these lines!!
                #if re.match(ur'[\.,\!\?]+$', line, re.UNICODE):
                #    line = ''
                fout.write('%s\n' % line)
else:
    time_format = check_format(input)
    with codecs.open(input, 'r', encoding='utf-8') as fin, codecs.open(output, 'w', encoding='utf-8') as fout:
        for line in fin:
            line = line.strip()
            if len(line) == 0:
                fout.write('%s\n' % line)
            else:
                line = remove_tags(line)
                line = re.sub(ur'([^0-9]|^)1([^0-9])', ur'\1一\2', line, flags=re.UNICODE)
                line = re.sub(ur'([,\.\?!"])+', ur'\1', line, flags=re.UNICODE)
                if len(line) > 0:
                    if time_format == 'int':
                        try:
                            uid, start, end, text = re.split(r'\s+', line, 3)  # some lines have empty text after removing tags
                        except ValueError:
                            continue
                        fout.write('%04d - %s - %s\n%s\n' % (int(uid), int2time(start), int2time(end), text))
                    else:
                        fout.write('%s\n' % line)
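## check_format and int2time are defined elsewhere in this script. A sketch
## of the assumed int2time behavior follows; the time unit (plain seconds
## here) is an assumption -- the real timestamps may be centiseconds or
## milliseconds.
def int2time(t):
    ## integer timestamp -> 'HH:MM:SS'
    t = int(t)
    return '%02d:%02d:%02d' % (t // 3600, (t % 3600) // 60, t % 60)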