pdf = urljoin(URL, pdf)
            #print(pdf)

        # NOTE(review): this excerpt begins part-way through a loop body; the
        # loop header and the definitions of p, pdf, title, vol, track and
        # writer are outside the visible code.
        presentation = None
        # Entries whose text mentions "presentation" get the link returned by
        # has_pdf(p), resolved against URL, recorded as the presentation.
        if "presentation" in p.text:
            presentation = has_pdf(p)
            presentation = urljoin(URL, presentation)
        # title is only (re)assigned here when pdf is truthy; any exception
        # raised by get_title skips the whole entry.
        try:
            # Remove the first text before ":"
            if pdf:
                title = get_title(p)
        except:
            continue

        # Authors are taken from the text before the first ":".
        authors = p.text.split(":")[0]
        authors = namify(authors)
        # Skip entries that have no title and at most two items/characters of
        # author data (the return type of namify is not visible here).
        if len(authors) <= 2 and title == None:
            continue
        # Fallback title: the segment after the first ":"; if the text has
        # no ":" the IndexError is swallowed and title stays None.
        if title == None:
            try:
                title = p.text.split(":")[1]
            except:
                pass
        # Best-effort clean-up of the title; failures (for example a None
        # title) are ignored and the title is written as it is.
        try:
            title = unspace(title)
            title = remove_parenthesised(title)
        except:
            pass
        # One six-field row per entry.
        line = [title, authors, pdf, presentation, vol, track]
        writer.writerow(line)
Пример #2
0
    for p in page_lines:
        # Each entry is split on "--": the text before the first "--" is the
        # title, the segment that follows it is the author list.
        authors = None
        presentation = None

        # Resolve the entry's link (if any) against URL.
        # has_pdf is called once; assumes it is a pure lookup on p.
        pdf = None
        link = has_pdf(p)
        if link:
            pdf = urljoin(URL, link)

        try:
            parts = p.text.split("--")
            title = unspace(parts[0])
        except Exception:
            # Best effort: an entry whose title cannot be extracted or
            # normalised is skipped rather than aborting the whole run.
            continue

        # The author segment is optional; entries without "--" keep
        # authors as None.
        if len(parts) > 1:
            authors = parts[1]
        if authors:
            authors = remove_parenthesised(authors)
            authors = namify(authors)
        if not authors and title is None:
            continue
        line = [title, authors, pdf, presentation, vol, track]
        writer.writerow(line)
        # NOTE(review): the lines below appear to come from a different
        # snippet than the code above them (this row has seven fields,
        # including pages); title, pdf, vol, pages and track are expected
        # to be set by code that is not visible here.
        if "abstract" in p.text:
            track = "abstract"
        else:
            pass
        # For entries mentioning "presentation", the link found so far is
        # treated as the presentation and the pdf field is cleared.
        presentation = None
        if "presentation" in p.text:
            presentation=pdf
            pdf=None

        # Authors are taken from the text before the first ":".
        authors = p.text.split(":")[0]
        if title==None or authors==None:
            continue
        authors = namify(authors)
        # Titles of three characters or fewer are skipped.
        if len(title) <= 3:
            continue


        # Best-effort whitespace normalisation (applied twice); any failure
        # is ignored and the title is kept as it is.
        try:
            title = unspace(unspace(title))
            
            
        except:
            pass
        #title= remove_page(title)
        # NOTE(review): debug output; the remove_page call above is commented
        # out, so no page information has actually been removed at this point.
        print("page removed"+title)
            
            
        title= remove_parenthesised(title,exclude="(")
        # One seven-field row per entry.
        line = [title, authors, pdf, presentation, vol, pages, track]
        writer.writerow(line)
Пример #4
0
        # NOTE(review): fragment of a loop body; the loop header and the
        # definitions of p, p1, i, j, left, presentation, vol and track are
        # outside the visible code.
        pdf = None
        if has_pdf(p):
            pdf = has_pdf(p)

            title = get_title(p)
            # When the following entry also carries a link, its title is
            # appended to this one and p1 is moved on to page_lines[i + 2].
            if has_pdf(p1):
                title2 = get_title(p1)
                title = title + " " + title2

                p2 = page_lines[i + 2]
                j = 2
                p1 = p2

        # Entries without a link produce no row.
        if not pdf:
            continue

        # Advance p1 past entries whose stripped text is three characters or
        # fewer. NOTE(review): i + j is not bounds-checked, and j must have
        # been initialised elsewhere when the nested branch above did not run.
        while len(p1.text.strip()) <= 3:
            p1 = page_lines[i + j]
            j = j + 1
        # NOTE(review): `left` is not defined in the visible code. On any
        # failure pages keeps whatever value it had before this iteration.
        try:
            pages = extract_pages(left)
        except:
            pass
        pdf = urljoin(URL, pdf)
        # Authors come from the text of p1 (the entry selected above).
        authors = remove_parenthesised(p1.get_text())
        authors = namify(authors)
        title = unspace(title)
        # One seven-field row per entry.
        line = [title, authors, pdf, presentation, vol, pages, track]
        writer.writerow(line)
Пример #5
0
# The csv module asks for files handed to csv.writer to be opened with
# newline=''; otherwise the writer's "\r\n" row terminator is translated a
# second time on platforms whose native line ending is "\r\n".
with open(FILE_NAME+".tsv", 'w', newline='') as f:
    writer = csv.writer(f, delimiter='\t', quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    # Header row: six tab-separated columns.
    writer.writerow(["Title", "Authors", "Pdf", "Presentation",
                     "Volume_Name","Notes"])

    # Entries are read in pairs: p supplies the title and link, and the
    # following entry p1 supplies the author text. The index loop is kept
    # because i is available to code further down the loop body.
    for i in range(len(page_lines)-1):
        track=None

        p = page_lines[i]
        p1 = page_lines[i+1]
        if "abstract" in p.text:
            track="abstract, full text not available"

        # Entries containing "[" take their title from the whole text with
        # parenthesised parts removed; other entries need a link and take
        # their title via get_title. Everything else is skipped.
        if "[" in p.text:
            title = remove_parenthesised(p.text)
            pdf=has_pdf(p)
        elif has_pdf(p):
            pdf=has_pdf(p)
            title=get_title(p)

        else:
            continue

        authors = namify(p1.text)
        print(authors)
                except:
                    # NOTE(review): the matching try (and the enclosing
                    # if/loop) are outside the visible code; any failure
                    # there skips the current entry.
                    continue
                # Move the look-ahead entry on to page_lines[i+2].
                p2 = page_lines[i+2]
                j = 2
                p1 = p2

        else:
            continue

        if authors==None:
            continue
        # The title is read from p1; entries mentioning "abstract" are
        # tagged as such before parenthesised parts are stripped.
        title=p1.text
        track=None
        if "abstract" in title:
            track= "abstract"
        title=remove_parenthesised(title)

        # Advance p1 past entries whose stripped text is three characters or
        # fewer. NOTE(review): i+j is not bounds-checked.
        while len(p1.text.strip()) <= 3:
            p1 = page_lines[i+j]
            j = j+1
        # Best effort: on failure pages keeps whatever value it had before.
        try:
            pages = extract_pages(p)
        except:
            pass
        # The link is taken from p1 (after the skip-ahead above) and resolved
        # against URL.
        pdf=has_pdf(p1)
        pdf = urljoin(URL, pdf)
        print(authors)
        authors=remove_parenthesised(authors)
        authors = namify(authors)

        title = unspace(title)
Пример #7
0
        # NOTE(review): fragment of a loop body; the loop header and the
        # definitions of p, p1, i and j are outside the visible code.
        pdf = None
        # Linked entries that do not mention "abstract" take their title via
        # get_title.
        if has_pdf(p) and "abstract" not in p.text:
            pdf = has_pdf(p)
            print(pdf)
            title = get_title(p)
            # When the following entry also carries a link, its title is
            # appended to this one and p1 is moved on to page_lines[i + 2].
            if has_pdf(p1):
                title2 = get_title(p1)
                title = title + " " + title2

                p2 = page_lines[i + 2]
                j = 2
                p1 = p2

        # Without a link so far, only entries mentioning "abstract" are kept:
        # their title is the whole text with parenthesised parts removed, and
        # pdf is whatever has_pdf(p) returns (possibly still falsy).
        if not pdf:
            if "abstract" in p.text:
                title = remove_parenthesised(p.text)
                pdf = has_pdf(p)
            else:
                continue

        # Advance p1 past entries whose stripped text is three characters or
        # fewer. NOTE(review): i + j is not bounds-checked.
        while len(p1.text.strip()) <= 3:
            p1 = page_lines[i + j]
            j = j + 1
        # Best effort: on failure pages keeps whatever value it had before.
        try:
            pages = extract_pages(p)
        except:
            pass
        pdf = urljoin(URL, pdf)
        # A join result equal to URL itself is treated as "no link".
        if pdf == URL:
            pdf = None
        authors = remove_parenthesised(p1.get_text())
Пример #8
0
        "Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Pages",
        "Notes"
    ])

    for i in range(len(page_lines) - 1):
        # Entries are read in pairs: p carries the author text and the
        # following entry p1 carries the linked title.
        p = page_lines[i]
        p1 = page_lines[i + 1]

        raw_text = p.get_text().replace("\n", " ")
        print(raw_text)

        # Only pairs whose second entry has a link produce a row.
        # has_pdf is called once; assumes it is a pure lookup on p1.
        link = has_pdf(p1)
        if not link:
            continue
        print(link)
        title = get_title(p1)

        # Page extraction stays best effort, but a failure now yields None
        # for this row instead of silently reusing the previous row's pages
        # (or hitting an unbound name on the first row).
        try:
            pages = extract_pages(p1)
        except Exception:
            pages = None

        pdf = urljoin(URL, link)
        # Authors: p's text without parenthesised parts, minus its last
        # comma-separated field.
        author_fields = remove_parenthesised(p.get_text()).split(",")[:-1]
        authors = namify(",".join(author_fields))
        title = unspace(title)
        line = [title, authors, pdf, presentation, vol, pages, track]
        writer.writerow(line)
Пример #9
0
                        delimiter='\t',
                        quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerow(
        ["Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Notes"])

    for i in range(len(page_lines) - 1):
        p = page_lines[i]
        p1 = page_lines[i + 1]

        # Entries mentioning "abstract" are flagged in the notes column.
        track = ("abstract, full text not available"
                 if "abstract" in p.text else None)

        # Only entries that carry a link produce a row.
        if not has_pdf(p):
            continue
        pdf = has_pdf(p)
        title = get_title(p)

        # Authors come from the entry's first <b> element, with
        # parenthesised parts stripped before the names are normalised.
        bold_text = remove_parenthesised(p.b.text)
        authors = namify(bold_text)

        # A join result equal to URL itself is treated as "no link".
        pdf = urljoin(URL, pdf)
        pdf = None if pdf == URL else pdf

        title = unspace(title)
        line = [title, authors, pdf, presentation, vol, track]
        writer.writerow(line)
        # NOTE(review): the lines below appear to come from a different
        # snippet than the code above them; vol and the initial track,
        # title and authors values are set outside the visible code.
        # An entry for which has_bold(p) is truthy starts a new track and
        # produces no row of its own.
        if has_bold(p):
            # NOTE(review): this prints track before it is reassigned, so the
            # message shows the previous track, not the new one.
            print("New track name:", track)
            track = has_bold(p)

            continue

        raw_text = unspace(p.get_text())
        pdf = None

        # Resolve the entry's link (if any) against URL.
        if has_pdf(p):
            pdf = has_pdf(p)
            pdf = urljoin(URL, pdf)
            #print(pdf)

        presentation = None

        text = unspace(p.get_text())

        # Title: text before the first ":". Authors: the first
        # comma-separated piece of the segment after that ":".
        # NOTE(review): on any failure (e.g. no ":" in the text) the error is
        # swallowed, so authors (and possibly title) keep whatever value they
        # had before this iteration.
        try:
            title = text.split(":")[0]
            title = unspace(remove_parenthesised(title))

            authors = text.split(":")[1].split(",")[0]
            authors = remove_parenthesised(authors)
        except:
            pass
        #authors = namify(authors)

        # One six-field row per entry.
        line = [title, authors, pdf, presentation, vol, track]
        writer.writerow(line)
    ])

    # Entries are read in pairs (p = authors, p1 = the following entry with
    # the title and link), so the last index visited must be len - 2;
    # iterating up to len - 1 made page_lines[i + 1] raise IndexError on the
    # final pass.
    for i in range(len(page_lines) - 1):
        j = 1
        p = page_lines[i]
        p1 = page_lines[i + 1]

        raw_text = p.get_text().replace("\n", " ")

        authors = p.text
        if authors is None:
            continue
        title = remove_parenthesised(p1.text)
        track = None

        # has_pdf may return a falsy value; the join result is then compared
        # with URL and mapped to None (presumably urljoin returns the base
        # unchanged for an empty link -- confirm which urljoin is imported).
        pdf = urljoin(URL, has_pdf(p1))
        if pdf == URL:
            pdf = None
        print(authors)
        authors = namify(remove_parenthesised(authors))

        title = unspace(title)
        # presentation, vol and pages are expected to be set outside this
        # loop; they are not assigned anywhere in its body.
        line = [title, authors, pdf, presentation, vol, pages, track]
        writer.writerow(line)