delimiter='\t',
                        quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerow(
        ["Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Notes"])

    for p in page_lines:
        authors = None
        #get the track
        if has_bold(p):
            print("New track name:", track)
            track = has_bold(p)

            continue

        raw_text = unspace(p.get_text())
        pdf = None

        if has_pdf(p):
            pdf = has_pdf(p)
            pdf = urljoin(URL, pdf)
            #print(pdf)

        presentation = None
        try:
            # Remove the first text before ":"
            title = get_title(p)
        except:
            continue
        text = unspace(p.get_text())
        if text[1] == ".":
示例#2
0

nlp = spacy.load("en_core_web_sm")
track = None
vol=None
pages=None
with open(FILE_NAME+".tsv", 'w') as f:
    writer = csv.writer(f, delimiter='\t', quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerow(["Title", "Authors", "Pdf", "Presentation",
                     "Volume_Name", "Pages", "Notes"])

    for p in page_lines:


        raw_text = unspace(p.get_text())
        pdf = None
        title=None
        if has_pdf(p):
            pdf = has_pdf(p)
            pdf = urljoin(URL, pdf)
            #print(pdf)
        else:
            continue
        presentation = None



        authors = p.text.split(":")[0]
        authors = namify(authors)
        if len(authors) <= 2 and title == None:
示例#3
0
pages = None
pdf = None
presentation = None
title = []
pages_in_pdf = None
# Write one TSV row per entry of page_lines: the cleaned-up title, the page
# value returned by extract_pages and a derived "pages in pdf" number.
# authors is always None here; pdf, presentation, vol and track are not
# computed in this block and are written as found in the enclosing scope.
#
# newline='' is what the csv module documentation requires for files handed
# to csv.writer; without it, platforms that translate "\n" on write turn the
# writer's "\r\n" row terminator into "\r\r\n".
with open(FILE_NAME + ".tsv", 'w', newline='') as f:
    writer = csv.writer(f,
                        delimiter='\t',
                        quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    # Header row; note that the last column ("Notes") receives `track`.
    writer.writerow([
        "Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Pages",
        "pages in pdf", "Notes"
    ])

    # The index was only ever used to fetch the element, so iterate directly.
    for p in page_lines:
        authors = None

        # Pull the page value out of the raw entry first, then strip it from
        # the text that becomes the title.
        pages = extract_pages(p)
        title = remove_page(p)
        title = unspace(title).strip(".")
        # Second unspace pass runs after the "." characters were stripped.
        title = unspace(title)

        # NOTE(review): this prints the value left over from the previous
        # iteration (or whatever pages_in_pdf held before the loop on the
        # first pass), because the assignment only happens on the next line.
        # Kept as-is to leave the script's console output unchanged.
        print(pages_in_pdf)
        # NOTE(review): the +4 is presumably the offset between printed page
        # numbers and PDF page indices for this volume - confirm. int() will
        # raise if extract_pages returns something non-numeric.
        pages_in_pdf = int(pages) + 4

        line = [
            title, authors, pdf, presentation, vol, pages, pages_in_pdf, track
        ]
        writer.writerow(line)
    writer = csv.writer(f,
                        delimiter='\t',
                        quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerow([
        "Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Pages",
        "pages in pdf", "Notes"
    ])

    for i in range(len(page_lines)):
        p = page_lines[i]
        p1 = page_lines[i + 1]

        authors_pages = p1.split(";")
        try:
            pages = unspace(authors_pages[1])
        except:
            print(p1)
            continue
        authors = authors_pages[0]
        title = unspace(p)

        authors = unspace(authors)
        authors = namify(authors)
        vol = """Vol. 1:
MT Researchers’ Track"""

        pages_in_pdf = str(int(pages) + 6)
        if len(title) < 4:
            continue
        line = [
示例#5
0
pages = None
pdf = None
presentation = None
title = []
pages_in_pdf = None
with open(FILE_NAME+".tsv", 'w') as f:
    writer = csv.writer(f, delimiter='\t', quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerow(["Title", "Authors", "Pdf", "Presentation",
                     "Volume_Name", "Pages", "pages in pdf", "Notes"])

    for i in range(len(page_lines)):
        p = page_lines[i]
        p1 = page_lines[i+1]
        p2 = page_lines[i+2]
        title=unspace(p1)
        authors=unspace(p2)
        pages=unspace(p)
        try:
            pages=extract_pages(p)
        except:
            print(p1)
            continue
        
        title=unspace(title)


        authors = unspace(authors)
        authors = namify(authors)
        vol=unspace("""Vol.2:
Commercial MT Users and Translators Track
示例#6
0
    writer = csv.writer(f,
                        delimiter='\t',
                        quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerow([
        "Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Pages",
        "Notes"
    ])

    for i in range(len(page_lines) - 2):
        j = 1
        p = page_lines[i]
        p1 = page_lines[i + 1]

        splitter = ":"
        line = unspace(p.text)
        sliced_line = line.split(splitter)
        try:
            left = splitter.join(sliced_line[:-1])
            right = sliced_line[-1]
        except:
            print(sliced_line)
            continue

        raw_text = p.get_text().replace("\n", " ")

        pdf = None
        if has_pdf(p):
            pdf = has_pdf(p)

            title = get_title(p)
    writer = csv.writer(f,
                        delimiter='\t',
                        quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerow([
        "Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Pages",
        "Notes"
    ])

    for string_line in page_lines:

        title_authors_pages = string_line.split("\"")

        try:
            authors = title_authors_pages[1]
        except:
            track = unspace(string_line)
            continue
        authors = remove_parenthesised(authors)
        authors = remove_page(authors)
        authors = unspace(authors)
        authors = namify(authors)

        title = title_authors_pages[0]
        pages = title_authors_pages[1]
        pages = extract_pages(pages)
        if len(title) < 4:
            continue
        line = [title, authors, pdf, presentation, vol, pages, track]
        writer.writerow(line)
with open(FILE_NAME + ".tsv", 'w') as f:
    writer = csv.writer(f,
                        delimiter='\t',
                        quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerow(
        ["Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Notes"])

    for i in range(len(page_lines) - 2):
        j = 1
        p = page_lines[i]
        p1 = page_lines[i + 1]
        p0 = page_lines[i - 1]
        if is_subtitle(p):
            track = is_subtitle(p)
            track = unspace(track)
            #writer.writerow([vol])
            continue
        authors = None
        if finish_by_digit(p):
            authors = p.text
        else:
            continue
        title = p0.get_text()

        pdf = None
        pdf = has_pdf(p0)
        if pdf:
            title = get_title(p0)

        pdf = urljoin(URL, pdf)
            continue

        raw_text = p.get_text().replace("\n", " ")
        pdf = None
        if has_pdf(p):
            pdf = has_pdf(p)
            print(pdf)
            title = get_title(p)
            if has_pdf(p1):
                title2 = get_title(p1)
                title = title + " " + title2

                p2 = page_lines[i + 2]
                j = 2
                p1 = p2

        if not pdf:
            continue

        while len(p1.text.strip()) <= 3:
            p1 = page_lines[i + j]
            j = j + 1

        pdf = urljoin(URL, pdf)
        authors = remove_parenthesised(p0.get_text())
        authors = unspace(authors).strip("\"")
        title = title.strip("\"")
        title = unspace(title)
        line = [title, authors, pdf, presentation, vol, track]
        writer.writerow(line)
示例#10
0
    writer = csv.writer(f,
                        delimiter='\t',
                        quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerow([
        "Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Pages",
        "pages in pdf", "Notes"
    ])

    for i in range(len(page_lines)):
        p = page_lines[i]

        title_authors = p.split("\"")

        try:
            title = unspace(title_authors[0])
            authors = title_authors[1].split(";")[0]
        except:
            continue

        authors = unspace(authors)
        authors = namify(authors)

        pages = extract_pages(p)
        print(pages)
        try:
            pages = extract_pages(pages)
        except:
            pass
        pages_in_pdf = str(int(pages) + 4)
        if len(title) < 4:
        "Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Pages",
        "Notes"
    ])

    for p in page_lines:
        track = None

        pdf = None
        title = None
        if has_pdf(p):
            pdf = has_pdf(p)
            pdf = urljoin(URL, pdf)
            title = get_title(p)

        if "abstract" in p.text:
            track = "abstract"
        else:
            pass

        authors = None

        try:
            title = unspace(unspace(title))
            title = remove_parenthesised(title)

        except:
            pass
        presentation = None
        line = [title, authors, pdf, presentation, vol, pages, track]
        writer.writerow(line)
示例#12
0
# Write one TSV row per quoted entry of page_lines.
#
# Every entry is split on the double-quote character. Entries that contain no
# double quote yield a single piece; they are not written as rows but are
# remembered (after unspace) as the current `track`, which is then attached
# to the rows that follow. pdf, presentation and vol are not computed in this
# block and are written as found in the enclosing scope.
#
# newline='' is what the csv module documentation requires for files handed
# to csv.writer; without it, platforms that translate "\n" on write turn the
# writer's "\r\n" row terminator into "\r\r\n".
with open(FILE_NAME + ".tsv", 'w', newline='') as f:
    writer = csv.writer(f,
                        delimiter='\t',
                        quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    # Header row; note that the last column ("Notes") receives `track`.
    writer.writerow([
        "Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Pages",
        "Notes"
    ])

    for string_line in page_lines:

        title_authors_pages = string_line.split("\"")

        try:
            authors = title_authors_pages[1]
        except IndexError:
            # Only a missing second piece is expected here; catching just
            # IndexError keeps unrelated errors (and Ctrl-C) visible instead
            # of silently turning them into a track change.
            track = unspace(string_line)
            continue
        authors = remove_parenthesised(authors)
        authors = remove_page(authors)
        authors = namify(authors)

        # The text before the first quote is the title; the piece after it
        # is used for both the authors (above) and the page value.
        title = title_authors_pages[0]
        pages = title_authors_pages[1]
        pages = extract_pages(pages)
        # Entries whose title is shorter than 4 characters are dropped.
        if len(title) < 4:
            continue
        line = [title, authors, pdf, presentation, vol, pages, track]
        writer.writerow(line)
示例#13
0
    writer = csv.writer(f,
                        delimiter='\t',
                        quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerow([
        "Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Pages",
        "Pages_in_pdf", "Notes"
    ])

    for i in range(len(page_lines)):
        p = page_lines[i]
        p1 = page_lines[i + 1]
        authors_title = p.split("\"")

        try:
            title = unspace(authors_title[1])
            authors = authors_title[0]
        except:
            continue
        authors = unspace(authors)
        authors = namify(authors)

        pages = extract_pages(p1)
        print(pages)
        try:
            pages = extract_pages(pages)
        except:
            pass
        pages_in_pdf = str(int(pages) + 5)
        if len(title) < 4:
            continue
示例#14
0
pdf=None
presentation=None
with open(FILE_NAME+".tsv", 'w') as f:
    writer = csv.writer(f, delimiter='\t', quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerow(["Title", "Authors", "Pdf", "Presentation",
                     "Volume_Name", "Pages", "Abstract", "Notes"])
                     
    for i in range(len(page_lines)-2):
        j = 1
        p = page_lines[i]
        p1 = page_lines[i+1]

        if is_subtitle(p):
            track = is_subtitle(p)
            track = unspace(track)
            continue
        authors=None
        if has_pp(p):
            pages=extract_pages(p)
            print(pages)

        else:
            continue
        title_authors = p.get_text().split(":")[1].split("–")
        title = unspace(title_authors[0])
        authors = title_authors[1]
        authors = remove_parenthesised(authors)
        authors = namify(authors)
        abstract = p1.text
with open(FILE_NAME+".tsv", 'w') as f:
    writer = csv.writer(f, delimiter='\t', quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerow(["Title", "Authors", "Pdf", "Presentation",
                     "Volume_Name", "Pages", "Notes"])

    for p in page_lines:
        track = None
        try:
            
            pages = extract_pages(p)
            
        except:
            pass
        splitter = ":"
        line = unspace(p.text)
        sliced_line = line.split(splitter)
        try:
            left = sliced_line[0]

            right = ":".join(sliced_line[1:])
        except:
            print(sliced_line)
            continue

        raw_text = unspace(p.get_text())
        pdf = None
        title = None
        if has_pdf(p):
            pdf = has_pdf(p)
            pdf = urljoin(URL, pdf)
                        quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerow(
        ["Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Notes"])

    for i in range(len(page_lines) - 1):

        p = page_lines[i]
        p1 = page_lines[i + 1]

        if has_pdf(p):
            pdf = has_pdf(p)
            title = get_title(p)
            title = remove_digit(title)

        else:
            continue
        #print(p1)
        authors = remove_digit(p1.text)
        authors = unspace(authors)
        authors = authors.split(",")[0]
        authors = authors.strip(".")
        authors = namify(authors)
        print(authors)

        pdf = urljoin(URL, pdf)

        title = unspace(title)
        line = [title, authors, pdf, presentation, vol, track]
        writer.writerow(line)
    ])

    for i in range(len(page_lines) - 3):

        j = 1
        p = page_lines[i]
        p1 = page_lines[i + 1]

        raw_text = p.get_text().replace("\n", " ")

        pdf = None
        title = p.text

        while len(p1.text.strip()) <= 3:
            p1 = page_lines[i + j]
            j = j + 1
        try:
            pages = extract_pages(p)
        except:
            pass
        #pdf = urljoin(URL, pdf)
        authors = p1.text
        authors = ",".join(authors.split(",")[:-1])
        authors = remove_parenthesised(authors)
        authors = ",".join(authors.split(",")[:-1])
        authors = namify(authors)

        title = unspace(title)
        line = [title, authors, pdf, presentation, vol, pages, track]
        writer.writerow(line)
        title=[]
        j = 1
        p = page_lines[i]

        pj = page_lines[i+j]

        if has_pp(p):
            pages=extract_pages(p)
            
        else:
            continue

        while has_bold(pj):
            title+=pj.text
            j=j+1
            pj = page_lines[i+j]
        
        title=unspace("".join(title))
        print(title)
        authors=namify(pj.text)
        while "Abstract" not in pj.text:
            j=j+1
            pj = page_lines[i+j]
        j=j+1
        pj = page_lines[i+j]
        abstract=pj.text


        line = [title, authors, pdf, presentation, vol, pages, abstract, track]
        writer.writerow(line)
示例#19
0
nlp = spacy.load("en_core_web_sm")
track = None
vol = None
with open(FILE_NAME + ".tsv", 'w') as f:
    writer = csv.writer(f,
                        delimiter='\t',
                        quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerow(
        ["Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Notes"])

    for p in page_lines:

        #get the track
        raw_text = unspace(p.get_text())
        pdf = None
        authors = None
        if has_pdf(p):
            pdf = has_pdf(p)
            pdf = urljoin(URL, pdf)
            #print(pdf)

        presentation = None
        try:
            # Remove the first text before ":"
            title = p.text.split("--")[0]
            title = unspace(title)
        except:
            continue
        try:
                        delimiter='\t',
                        quotechar='"',
                        quoting=csv.QUOTE_MINIMAL)
    writer.writerow(
        ["Title", "Authors", "Pdf", "Presentation", "Volume_Name", "Notes"])

    for p in page_lines:
        authors = None
        #get the track
        if has_bold(p):
            print("New track name:", track)
            track = has_bold(p)

            continue

        raw_text = unspace(p.get_text())
        pdf = None

        if has_pdf(p):
            pdf = has_pdf(p)
            pdf = urljoin(URL, pdf)
            #print(pdf)

        presentation = None

        text = unspace(p.get_text())

        try:
            title = text.split(":")[0]
            title = unspace(remove_parenthesised(title))