pdf = has_pdf(p) pdf = urljoin(URL, pdf) #print(pdf) title=get_title(p) presentation = None if "presentation" in p.text: presentation = has_pdf(p) presentation = urljoin(URL, presentation) title_authors=p.get_text().split('–') if not pdf: try: title=title_authors.split[0] except: pass authors=has_italic(p) if not authors: try: authors=title_authors[1].split('[')[0] except: pass if authors: authors=namify(authors) if not authors and not title and not pdf: continue line = [title, authors, pdf,presentation, track, raw_text] print(line) writer.writerow(line)
raw_text = unspace(p.get_text()) pdf = None #Skip presention line if has_pdf(p) and "presentation" not in p.text: pdf = has_pdf(p) pdf = urljoin(URL, pdf) #print(pdf) presentation = None if "presentation" in p.text: presentation = has_pdf(p) presentation = urljoin(URL, presentation) try: title = re.split(":", p.get_text())[1] title= re.split("--", title)[0] title = namify(title) except: continue print(has_italic(p)) if not has_italic(p): continue authors=has_italic(p) authors=namify(authors) #authors=re.split(":", p.get_text())[0] line = [title, authors, pdf,presentation, track, raw_text] writer.writerow(line)
pdf = has_pdf(p) print(pdf) title = get_title(p) if has_pdf(p1): title2 = get_title(p1) title = title + " " + title2 p2 = page_lines[i + 2] j = 2 p1 = p2 if not pdf: continue while len(p1.text.strip()) <= 3: p1 = page_lines[i + j] j = j + 1 try: pages = extract_pages(p) except: pass pdf = urljoin(URL, pdf) authors = remove_parenthesised(p1.get_text()) authors = namify(authors) if has_italic(p1) == None: authors = None title = unspace(title) line = [title, authors, pdf, presentation, vol, pages, track] writer.writerow(line)
raw_text = unspace(p.get_text()) pdf = None if has_pdf(p) and "presentation" not in p.text: pdf = has_pdf(p) pdf = urljoin(URL, pdf) #print(pdf) presentation = None if "presentation" in p.text: presentation = has_pdf(p) presentation = urljoin(URL, presentation) try: # Remove the first text before ":" title = re.split(":", p.get_text())[1:] title = ":".join(title) title = re.split("--", title)[0] title = unspace(title) except: continue if not has_italic(p): continue authors = has_italic(p) authors = namify(authors) line = [title, authors, pdf, presentation, track, raw_text] writer.writerow(line)
print(pdf) title = get_title(p) if has_pdf(p1): title2 = get_title(p1) title = title + " " + title2 p2 = page_lines[i + 2] j = 2 p1 = p2 if not pdf: continue while len(p1.text.strip()) <= 3: p1 = page_lines[i + j] j = j + 1 try: pages = extract_pages(p) except: pass pdf = urljoin(URL, pdf) if has_italic(p1): authors = p1.get_text().split(",")[0] else: continue authors = namify(authors) title = unspace(title) line = [title, authors, pdf, presentation, vol, pages, track] writer.writerow(line)
raw_text = unspace(p.get_text()) pdf = None authors = None if has_pdf(p): pdf = has_pdf(p) pdf = urljoin(URL, pdf) #print(pdf) else: continue presentation = None try: # Remove the first text before ":" title = get_title(p) except: continue if has_italic(p): authors = has_italic(p) else: authors = has_italic(p1) if authors: authors = remove_parenthesised(authors) authors = namify(authors) if not authors and title == None: continue try: pages = extract_pages(p) except: try: pages = extract_pages(p1) except: pass
track = has_bold(p) continue raw_text = unspace(p.get_text()) pdf = None authors = None if has_pdf(p): pdf = has_pdf(p) pdf = urljoin(URL, pdf) #print(pdf) presentation = None try: # Remove the first text before ":" title = has_italic(p) title = unspace(title) except: continue try: authors = p.text.split("–")[1] except: pass if authors: authors = remove_parenthesised(authors) authors = namify(authors) if not authors and title == None: continue line = [title, authors, pdf, presentation, vol, track] writer.writerow(line)