def parse_template_5(self, element):
    """
    A template for a workshop with the conference acronym and year in the name

    Examples:
        - http://ceur-ws.org/Vol-958/
    """
    workshop = {}
    # Summary text up to (and excluding) the "Edited by ..." editor list.
    title = rex.rex(element[1], r'(.*)Edited\s*by.*', re.I | re.S).group(1)
    workshop['volume_number'] = WorkshopSummaryParser.extract_volume_number(element[0].get('href'))
    # "<label> at <ACRONYM> <year>" -> label / conference acronym / 4-digit year
    label_part = rex.rex(element[0].text, r'(.*)\sat\s(\w{2,})\s(\d{4})[\s\.]*', re.I | re.S)
    workshop['label'] = label_part.group(1)
    workshop['conf_acronym'] = label_part.group(2)
    workshop['conf_year'] = label_part.group(3)
    workshop['url'] = element[0].get('href')
    workshop['time'] = utils.parse_date(title)
    try:
        workshop['edition'] = tonumber(
            rex.rex(title,
                    r'.*Proceedings(\s*of)?(\s*the)?\s*(\d{1,}|first|second|third|forth|fourth|fifth)[thrd]*'
                    r'.*Workshop.*', re.I, default=None).group(3))
    except Exception:
        # 'edition' property is optional. Narrowed from a bare `except:` so
        # KeyboardInterrupt/SystemExit are no longer swallowed here.
        pass
    self.add_workshop(workshop)
def parse_template_1(self, element):
    """Extract the volume number and the workshop's short label from the title."""
    match = rex.rex(element[1], r'(.*)Edited\s*by.*', re.I | re.S)
    title = match.group(1).replace('\n', '')
    # Joint proceedings of several workshops are handled by another template.
    joint = re.match(r'^proceedings of the[joint ]*.*workshops.*|^joint proceedings.*',
                     title, re.I | re.S)
    if joint:
        raise DataNotFound()
    labels = rex.rex(title,
                     r".*\((([\da-zA-Z*@\-&:]+?)['\s-]*(\d{2}|\d{4})|"
                     r"([\da-zA-Z*@\-&:]+?)['\s-]*(\d{2}|\d{4})\s+at.*)\).*",
                     re.I | re.S)
    volume = WorkshopSummaryParser.extract_volume_number(element[0].get('href'))
    self.data['volume_number'] = volume
    self.data['short_label'] = labels.group(2)
def parse_template_2(self, element):
    """
    A template for joint proceedings of two workshops:

    Examples:
        - http://ceur-ws.org/Vol-776/
    """
    first = {'id': 1}
    second = {'id': 2}
    summary = rex.rex(
        element[1],
        r'^\s*(proceedings\s+of\s+joint.*on.*\((\w+)\-(\w+)\s+\d+\).*)Edited by.*',
        re.I | re.S)
    if len(summary.groups()) != 3:
        raise DataNotFound()
    title = summary.group(1)
    # Both workshops share the volume, URL and date of the joint proceedings.
    volume = WorkshopSummaryParser.extract_volume_number(element[0].get('href'))
    first['volume_number'] = second['volume_number'] = volume
    first['url'] = second['url'] = element[0].get('href')
    first['time'] = second['time'] = utils.parse_date(title)
    first['short_label'] = summary.group(2)
    second['short_label'] = summary.group(3)
    self.add_workshop(first)
    self.add_workshop(second)
def parse_template_main(self):
    """Parse the CEUR-WS index page into self.data['proceedings_list'].

    The summary table alternates title rows and description rows, hence the
    step of 2. Each parsed entry carries volume_number, url, label, editors
    and submission_date. Raises DataNotFound when nothing could be parsed.
    """
    proceedings_list = []
    tr = self.grab.tree.xpath(XPATH_SUMMARY)
    for i in range(0, len(tr), 2):
        href = tr[i].find(self.XPATH_SUMMARY_TITLE)
        try:
            # Only volumes explicitly requested (or "crawl everything" mode,
            # signalled by a single input URL) are parsed.
            if href.get('href') in config.input_urls or len(config.input_urls) == 1:
                proceedings = dict()
                proceedings['volume_number'] = ProceedingsSummaryParser.extract_volume_number(href.get('href'))
                proceedings['url'] = href.get('href')
                summary_match = rex.rex(
                    tr[i + 1].find('.//td[last()]').text_content(),
                    r'(.*)(\nEdited\s*by\s*:\s*)(.*)(\nSubmitted\s*by\s*:\s*)(.*)(\nPublished\s*on\s*CEUR-WS:\s*)(.*)(\nONLINE)(.*)',
                    re.I | re.M | re.S)
                proceedings['label'] = re.sub(r'\n', '', text.normalize_space(summary_match.group(1), ' \n'))
                proceedings['editors'] = re.split(r",+\s*", text.normalize_space(summary_match.group(3)))
                proceedings['submission_date'] = datetime.strptime(
                    text.normalize_space(summary_match.group(7), ' \n'), '%d-%b-%Y')
                proceedings_list.append(proceedings)
        except Exception:
            # Best-effort: a malformed summary block is reported and skipped.
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # still propagate. Parenthesized print works on Python 2 and 3.
            print("[WORKSHOP %s: ProceedingsSummaryParser] Summary information not found!" % href.get('href'))
            #traceback.print_exc()
    self.data['proceedings_list'] = proceedings_list
    if len(proceedings_list) == 0:
        raise DataNotFound("There is no summary information to parse!")
def is_invited(publication):
    """Return True when the paper's link marks it as a keynote/invited talk."""
    # rex.rex(..., default=None) yields a match object (truthy) or None,
    # so the explicit if/else returning True/False was redundant.
    return rex.rex(publication['link'], r'.*(keynote|invite).*', re.I, default=None) is not None
def check_for_workshop_paper(publication):
    """Return False for front-matter entries and non-PDF links, True otherwise."""
    # Front matter (preface, foreword, ...) is not a workshop paper.
    front_matter = rex.rex(
        publication['name'].strip(),
        r'^(preface|overview|introduction|einleitung|foreword)$',
        re.I, default=None)
    if front_matter:
        return False
    if not publication['link'].endswith('.pdf'):
        return False
    return True
def parse_template_6(self, element):
    """Parse a workshop whose label is the link text itself (dots stripped)."""
    workshop = {}
    # Summary text up to (and excluding) the "Edited by ..." editor list.
    title = rex.rex(element[1], r'(.*)Edited\s*by.*', re.I | re.S).group(1)
    workshop['volume_number'] = WorkshopSummaryParser.extract_volume_number(element[0].get('href'))
    workshop['label'] = element[0].text.replace('.', '')
    workshop['url'] = element[0].get('href')
    workshop['time'] = utils.parse_date(title)
    try:
        workshop['edition'] = tonumber(
            rex.rex(title,
                    r'.*Proceedings(\s*of)?(\s*the)?\s*(\d{1,}|first|second|third|forth|fourth|fifth)[thrd]*'
                    r'.*Workshop.*', re.I, default=None).group(3))
    except Exception:
        # 'edition' property is optional. Narrowed from a bare `except:` so
        # KeyboardInterrupt/SystemExit are no longer swallowed here.
        pass
    self.add_workshop(workshop)
def parse_template_3(self):
    """Parse the volume's table of contents into self.data['publications'].

    Each record carries: name, file_name, link, editors, is_invited.
    Raises DataNotFound when a publication has no editors or when any
    extraction step fails.
    """
    self.begin_template()
    publications = []
    # Primary layout: <li> items carrying an <a href> plus <i>/<em>/<br>.
    elements = self.grab.tree.xpath('//li[a[@href] and (i or em or br)]')
    if elements is None or len(elements) == 0:
        # Fallback layout: <p> items instead of <li>.
        elements = self.grab.tree.xpath('//p[a[@href] and (i or em)]')
    for publication in elements:
        try:
            name = clean_string(publication.find('a').text_content())
            # Skip front matter.
            if rex.rex(name, r'.*(preface|first\s+pages|author\s+list|foreword).*',
                       re.I, default=None):
                #Examples: 180, 186
                continue
            link = publication.find('a').get('href')
            editors = []
            editors_tag = None
            # Editor list usually lives in the last <i>, else in <em>,
            # else in the tail text after the <br>.
            if publication.find('i') is not None:
                editors_tag = publication.findall('i')[-1]
            elif publication.find('em') is not None:
                editors_tag = publication.find('em')
            if editors_tag is None:
                editors_tag_content = publication.find('br').tail
            else:
                editors_tag_content = editors_tag.text_content()
            # Normalize "A, B and C" -> "A,B,C" before splitting on commas.
            editors_tag_content = re.sub(r'\s*[,\s]*and\s+', ',', editors_tag_content,
                                         flags=re.I | re.S).strip()
            if not editors_tag_content:
                #a publication should have non-empty list of authors
                raise DataNotFound(link)
            for publication_editor_name in editors_tag_content.split(","):
                pen = clean_string(publication_editor_name.strip())
                if pen:
                    editors.append(pen)
            # e.g. "papers/paper-03.pdf" -> "paper-03"
            file_name = link.rsplit('.pdf')[0].rsplit('/')[-1]
            publication_object = {
                'name': name,
                'file_name': file_name,
                'link': self.task.url + link,
                'editors': editors
            }
            publication_object['is_invited'] = self.is_invited(publication_object)
            if self.check_for_workshop_paper(publication_object):
                publications.append(publication_object)
        except Exception as ex:
            #traceback.print_exc()
            # NOTE(review): one malformed entry aborts the whole listing.
            raise DataNotFound(ex)
    self.data['publications'] = publications
    self.end_template()
def rex(body, patterns, flags=0, default=rex.NULL):
    """Try each regex in *patterns* against *body* via rex.rex.

    Returns the result of the last rex.rex call that did not raise (a match
    object, or *default* when one was supplied), stopping early only when a
    call returns a falsy value. Raises the most recent DataNotFound when no
    pattern succeeded at all.

    NOTE(review): `if not result: break` stops on a *falsy* result but keeps
    iterating after a successful match, so a later pattern can overwrite an
    earlier match — confirm this is the intended semantics.
    NOTE(review): if defined at module level, this function shadows the
    imported `rex` module in that namespace; verify that the inner
    `rex.rex(...)` call still resolves to the library and not to this
    function object.
    """
    result = None
    lastexception = DataNotFound()  # raised when nothing matched
    found = False
    for pattern in patterns:
        try:
            result = rex.rex(body, pattern, flags, default)
            found = True
            if not result:
                break
        except DataNotFound as dnf:
            lastexception = dnf
    if found:
        return result
    else:
        raise lastexception
def parse_template_1(self):
    """
    Examples:
        - http://ceur-ws.org/Vol-1008/
        - http://ceur-ws.org/Vol-1081/
        - http://ceur-ws.org/Vol-1085/
    """
    self.begin_template()
    spans = self.grab.tree.xpath('//span[@class="CEURCOLOCATED"]/text()')
    try:
        # "<ACRONYM> <year>" or "<ACRONYM>'<yy>" inside the CEURCOLOCATED span.
        colocated = rex.rex(spans[0],
                            r'([a-zA-Z\s*]+)[\s\']*(\d{4}|\d{2})', re.I)
    except IndexError as ex:
        # No CEURCOLOCATED span on the page.
        raise DataNotFound(ex)
    acronym = colocated.group(1)
    year = colocated.group(2)
    self.data['acronym'] = acronym.strip()
    self.data['year'] = extract_year(year)
    self.end_template()
def parse_template_3(self):
    """Collect publication records (name, file name, link, editors) from the TOC."""
    self.begin_template()
    collected = []
    items = self.grab.tree.xpath('//li[a[@href] and (i or em or br)]')
    if not items:
        # Alternate page layout uses <p> instead of <li>.
        items = self.grab.tree.xpath('//p[a[@href] and (i or em)]')
    for item in items:
        try:
            title = clean_string(item.find('a').text_content())
            front_matter = rex.rex(
                title, r'.*(preface|first\s+pages|author\s+list|foreword).*',
                re.I, default=None)
            if front_matter:
                #Examples: 180, 186
                continue
            href = item.find('a').get('href')
            # The editor list lives in the last <i>, else <em>, else the
            # text trailing the <br>.
            if item.find('i') is not None:
                editors_text = item.findall('i')[-1].text_content()
            elif item.find('em') is not None:
                editors_text = item.find('em').text_content()
            else:
                editors_text = item.find('br').tail
            editors_text = re.sub(r'\s*[,\s]*and\s+', ',', editors_text,
                                  flags=re.I | re.S).strip()
            if not editors_text:
                #a publication should have non-empty list of authors
                raise DataNotFound(href)
            editors = []
            for editor_name in editors_text.split(","):
                cleaned = clean_string(editor_name.strip())
                if cleaned:
                    editors.append(cleaned)
            record = {
                'name': title,
                'file_name': href.rsplit('.pdf')[0].rsplit('/')[-1],
                'link': self.task.url + href,
                'editors': editors
            }
            record['is_invited'] = self.is_invited(record)
            if self.check_for_workshop_paper(record):
                collected.append(record)
        except Exception as ex:
            #traceback.print_exc()
            raise DataNotFound(ex)
    self.data['publications'] = collected
    self.end_template()
def parse_template_5(self):
    """
    Examples: VOL 1513
    """
    self.begin_template()
    publications = []
    i = 0
    # RDFa-annotated table of contents: each <li> is a dcterms:hasPart item.
    for publication in self.grab.tree.xpath('//div[@class="CEURTOC"]/*[@rel="dcterms:hasPart"]/li'):
        try:
            if i == 0:
                # First list item (front matter): taken from its <a> tag,
                # with no editors and unknown page numbers.
                # NOTE(review): block structure inferred from a flattened
                # one-line original — confirm the preface check below really
                # belongs inside this first-item branch.
                i += 1
                name = clean_string(publication.find('a').text_content())
                href = publication.find('a').get('href')
                link = href if href.startswith('http://') else self.task.url + href
                num_of_pages, start, end = -1, -1, -1
                publication_object = {
                    'name': name,
                    'file_name': href,
                    'link': link,
                    'editors': '',
                    'num_of_pages': num_of_pages,
                    'start_page': start,
                    'end_page': end
                }
                publication_object['is_invited'] = self.is_invited(publication_object)
                publications.append(publication_object)
                if rex.rex(name, r'.*(preface|first\s+pages|author\s+list|foreword).*', re.I, default=None):
                    continue
            # Regular entries: metadata is in RDFa spans.
            name = clean_string(publication.find('span[@rel="dcterms:relation"]').text_content())
            href = publication.find('span[@rel="dcterms:relation"]//a/span[@property="bibo:uri"]').get('content')
            link = href if href.startswith('http://') else self.task.url + href
            num_of_pages, start, end = -1, -1, -1
            # CEURPAGES span holds "start-end"; derive the page count from it.
            if publication.find('span[@class="CEURPAGES"]'):
                pages = publication.find('span[@class="CEURPAGES"]').text_content().strip().split('-')
                start, end, num_of_pages = pages[0], pages[1], int(pages[1]) - int(pages[0]) + 1
            # get start and end page number from pdf file if page number not present at web page
            if link.endswith('.pdf') and start == -1:
                num_of_pages, start, end = get_online_page_number(link)
            editors = []
            for publication_editor in publication.findall('span[@rel="dcterms:creator"]'):
                editors.append(clean_string(publication_editor.text_content()).strip())
            publication_object = {
                'name': name,
                'file_name': href,
                'link': link,
                'editors': editors,
                'num_of_pages': num_of_pages,
                'start_page': start,
                'end_page': end
            }
            publication_object['is_invited'] = self.is_invited(publication_object)
            if self.check_for_workshop_paper(publication_object):
                publications.append(publication_object)
        except Exception as ex:
            # One malformed entry aborts the whole listing.
            raise DataNotFound(ex)
    self.data['publications'] = publications
    self.end_template()
def extract_volume_number(url):
    """Return the volume number from a CEUR-WS URL, e.g. '.../Vol-958/' -> '958'.

    Propagates rex.rex's no-match behavior (DataNotFound) for URLs that do
    not contain a Vol-<number> segment.
    """
    # Fixed: dots were unescaped (so 'ceur-wsXorg' also matched) and only
    # plain http was accepted; 'https?' keeps http URLs working unchanged.
    return rex.rex(url, r'.*https?://ceur-ws\.org/Vol-(\d+).*').group(1)