Example #1
 def test_load_etext(self):
     loaders = (lambda etextno: load_etext(etextno, refresh_cache=True),
                lambda etextno: load_etext(etextno, refresh_cache=False))
     testcases = (
         SampleMetaData.for_etextno(2701),  # newstyle identifier
         SampleMetaData.for_etextno(5),  # oldstyle identifier
         SampleMetaData.for_etextno(14287),  # unicode text
         SampleMetaData.for_etextno(23962)  # UTF-8 text
     )
     for testcase, loader in itertools.product(testcases, loaders):
         text = loader(testcase.etextno)
         self.assertIsInstance(text, str)
Example #2
 def test_load_etext(self):
     loaders = (lambda etextno: load_etext(etextno, refresh_cache=True),
                lambda etextno: load_etext(etextno, refresh_cache=False))
     testcases = (
         SampleMetaData.for_etextno(2701),   # newstyle identifier
         SampleMetaData.for_etextno(5),      # oldstyle identifier
         SampleMetaData.for_etextno(14287),  # unicode text
         SampleMetaData.for_etextno(23962)   # UTF-8 text
     )
     for testcase, loader in itertools.product(testcases, loaders):
         text = loader(testcase.etextno)
         self.assertIsInstance(text, unicode)
Example #3
    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        del data_dir
        del tmp_dir
        del dataset_split

        # pylint: disable=g-import-not-at-top
        from gutenberg import acquire
        from gutenberg import cleanup
        # pylint: enable=g-import-not-at-top

        books = [
            # bookid, skip N lines
            (19221, 223),
            (15553, 522),
        ]

        for (book_id, toskip) in books:
            text = cleanup.strip_headers(acquire.load_etext(book_id)).strip()
            lines = text.split("\n")[toskip:]
            prev_line = None
            ex_count = 0
            for line in lines:
                # Any line that is all upper case is a title or author name
                if not line or line.upper() == line:
                    prev_line = None
                    continue

                line = re.sub("[^a-z]+", " ", line.strip().lower())
                if prev_line and line:
                    yield {
                        "inputs": prev_line,
                        "targets": line,
                    }
                    ex_count += 1
                prev_line = line
Example #4
def generate_paragraph():
    '''
    Generates a random paragraph from the Gutenberg Project

    :return: Text from the Gutenberg Project with spaces and non-alphabetic
        characters removed and all characters lowercased
    :rtype: str
    '''
    # Get the text from the Gutenberg Project; in this case it's Moby Dick
    text = strip_headers(load_etext(2701)).strip()
    #text = "Jack and Jill ran up the hill to get a pail of water. " +
    #       "Jack fell down and broke his crown and Jill came tumbling after."
    # Split the text into sentences
    sentences = text.split(".")

    # Concatenate two randomly chosen sentences
    paragraph = random.choice(sentences) + random.choice(sentences)

    paragraph = re.sub(r'\s+', '', paragraph)
    regex = re.compile('[^a-zA-Z]')
    paragraph = regex.sub('', paragraph).lower()
    return paragraph
def get_featurelists(book):

    # Preparation for topic features: get 100 most common uni-, bi- and trigrams of the given book
    common_ngrams = get_common_ngrams(book)

    # Extract the features of the given book (book_id is a module-level name in the source project)
    features_book = (book_id, extract_features(book, common_ngrams))

    # Create a new file and write the features of the given book to it
    path_feat_book = "C:/Users/gebruiker/Documents/Master/Text/Project/output_data/features_book.txt"
    with open(path_feat_book, 'w', encoding="utf-8") as output_book:
        output_book.write(str(features_book))

    # Create a new file to write the features of the dataset books to
    path_feat_books = "C:/Users/gebruiker/Documents/Master/Text/Project/output_data/features_dataset.txt"
    output_dataset = open(path_feat_books, 'w', encoding="utf-8")

    # Extract the features of the dataset books
    features_dataset = []
    for i in IDs:
        features_dataset.append((i,
                                 extract_features(
                                     strip_headers(load_etext(i)).strip(),
                                     common_ngrams)))

        # Write the features to the output file
        output_dataset.write("\n Book " + str(i) + ": ")
        output_dataset.write(str(features_dataset[len(features_dataset) - 1]))
    output_dataset.close()

    return features_book, features_dataset
def get_gutenberg_text(id):
    try:
        text = strip_headers(load_etext(id)).strip()
        return text
    except Exception as ex:
        print(ex)
    return ''
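The bare except above swallows every failure the same way. The gutenberg library raises a dedicated exception when an etext has no downloadable source, so a narrower variant can be sketched; the import path for the exception is an assumption about the library's internal layout:

# Sketch: catch only the "no download URI" case.
from gutenberg._domain_model.exceptions import UnknownDownloadUriException
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

def get_gutenberg_text_strict(etext_id):
    try:
        return strip_headers(load_etext(etext_id)).strip()
    except UnknownDownloadUriException as ex:
        print(ex)
        return ''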
Example #7
    def load_gutenberg(self, language='en'):
        texts = get_etexts('author', self.author)
        texts = {
            t: list(get_metadata("title", t))[0]
            for t in texts if list(get_metadata("language", t))[0] == language
        }

        new_texts = dict()
        dupes = list()
        for k, d in texts.items():
            d = d.replace("\r\n", " ")
            if d not in dupes:
                dupes.append(d)
                new_texts[k] = d
                try:
                    self.books[d] = strip_headers(
                        load_etext(k)).strip().split("\r\n\r\n")
                except UnknownDownloadUriException:
                    print(
                        f'Book "{d}" does not have a text format and was not loaded.'
                    )
                    del new_texts[k]
                    dupes.remove(d)
                    continue
                self.tokens[d] = [
                    nltk.pos_tag(nltk.word_tokenize(self.books[d][b]))
                    for b in range(len(self.books[d]))
                ]

        texts = new_texts

        print(texts)
def load_macbeth():
    """
    Sources Macbeth from Project Gutenberg, returns a cleaned dataframe
    of the play split by act, scene, speaker, and sentence.
    """
    raw_text = load_etext(1533)  # Collect the text
    raw_text = strip_headers(raw_text)  # Remove most metadata

    # Remove in-line stage directions

    raw_text = remove_in_line_stage_directions(raw_text)

    # Split the text into sentences

    sentences = separate_sentences(raw_text)

    # Remove introductory data, keeping only the text

    sentences = sentences[110:]

    # Create a dataframe from the sentences

    macbeth = create_play_data_frame(sentences)

    # Clean the dataframe

    macbeth = clean_macbeth(macbeth)

    # Add a token column

    macbeth["tokens"] = create_token_column(macbeth["sentence"])

    # Return the finished dataframe

    return macbeth
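The helper remove_in_line_stage_directions is not shown in this listing. In the Project Gutenberg text of Macbeth, in-line stage directions are bracketed, so a minimal sketch could be (the bracket convention is an assumption about etext 1533's formatting):

import re

def remove_in_line_stage_directions(text):
    # Drop bracketed spans such as "[Aside.]" or "[Exeunt.]"
    return re.sub(r'\[[^\]]*\]', '', text)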
Example #9
    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        del data_dir
        del tmp_dir
        del dataset_split

        books = [
            # bookid, skip N lines
            (19221, 223),
            (15553, 522),
        ]

        for (book_id, toskip) in books:
            text = cleanup.strip_headers(acquire.load_etext(book_id)).strip()
            lines = text.split("\n")[toskip:]
            for line in lines:
                # Any line that is all upper case is a title or author name
                if not line or line.upper() == line:
                    continue

                line = re.sub("[^a-z]+", " ", line.strip().lower())
                if line:
                    yield {
                        "inputs": line,
                        "label": min(len(line), 100),
                    }
Example #10
def main():
    """
    The main method.
    """

    parser = argparse.ArgumentParser(
        description='Word suggestion based on Project Gutenberg books.')
    parser.add_argument('--book-id',
                        dest='book_ids',
                        nargs='+',
                        type=int,
                        required=True,
                        help='the Project Gutenberg book id')
    parser.add_argument('--query',
                        nargs='+',
                        type=str,
                        required=True,
                        help='the list of words to suggest the next word for',
                        action=required_length(1, 5))

    try:
        args = parser.parse_args()
        text_indexer = TextIndexer(len(args.query))

        for book_id in list(dict.fromkeys(args.book_ids)):
            text = strip_headers(load_etext(book_id)).strip()
            text_indexer.add_text(book_id, text)

        print(text_indexer.suggest(*args.query))
    except Exception as exc:  # pylint: disable=W0703
        print(exc)
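The custom action required_length is referenced but not defined in this listing. argparse supports custom actions, so a plausible sketch of the helper (hypothetical, matching how it is called above) is:

import argparse

def required_length(nmin, nmax):
    """Return an argparse action that enforces nmin..nmax values for nargs='+'."""
    class RequiredLength(argparse.Action):
        def __call__(self, parser, namespace, values, option_string=None):
            if not nmin <= len(values) <= nmax:
                parser.error('argument "{}" requires between {} and {} values'.format(
                    self.dest, nmin, nmax))
            setattr(namespace, self.dest, values)
    return RequiredLength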
Example #11
def getSomeBooks(howManyBooks, startingAt=1):
    i = howManyBooks
    ii = startingAt
    nothing = 0
    valError = 0
    otherError = 0
    allTheBooks = []
    while i > len(allTheBooks):  # 54096 ceiling
        try:
            theText = strip_headers(
                load_etext(ii)).strip()  # load the full text into theText
            theLength = len(theText)
            if theLength > 292:
                allTheBooks.append([ii, theText])
                print("one more book in the list, book number:", ii,
                      "book total is:", len(allTheBooks))
            else:
                nothing = nothing + 1
                print("nothing here at number:", ii)
        except ValueError:
            valError = valError + 1
            print("valueError at book number:", ii)
        except Exception:
            otherError = otherError + 1
            print("otherError at book number:", ii)
        ii = ii + 1

    print('all done')
    print(len(allTheBooks))
    return allTheBooks
Example #12
def post_corpora(url, auth_token):
    corpora = acquire_corpora()
    text = strip_headers(load_etext(corpora[0])).strip()

    print(corpora, text[:100])

    authentication_token = {'authentication-token': auth_token}

    # data to post
    files = {'file': io.StringIO(text)}
    data = {
        'label': '{} {}'.format(corpora[1], corpora[3]),
        'source': corpora[2]
    }

    # post
    ru = requests.post(url,
                       headers=authentication_token,
                       files=files,
                       data=data)

    print(ru.url, ru.status_code)
    if ru.ok:
        print(ru.json())
    else:
        print(ru.status_code, ru.reason)
Example #13
def download(cfg):
    print('Downloading Gutenberg data to: ' + cfg.directory)
    # Load language data for all books.
    path = os.path.join('code', 'utils', 'metadata.txt')
    with open(path, encoding='utf-8') as f:
        counter = 0
        for line in f:
            [index, lang, r, author, title] = line.split('\t')

            r = int(r)
            i = int(index)
            if counter < cfg.max_books and r == 1 and lang in cfg.languages:
                # Get the book.
                try:
                    text = strip_headers(load_etext(i)).strip().encode('utf-8')
                except UnknownDownloadUriException:
                    print('Could not download book: ' + str(i))
                    continue

                # Save the file to the correct directory.
                path = os.path.join(cfg.directory, lang)
                if not os.path.exists(path):
                    os.mkdir(path)
                with open(os.path.join(path, str(i) + '.txt'), 'wb') as out:
                    out.write(text)

                counter += 1
                if not counter % 1000:
                    print('Downloaded ' + str(counter) + ' books')
def init_books(author_file, json_file):
    """initialize book list with texts and save it to disk"""
    with open(author_file) as f:
        authors = list(f)

    authors = [i.strip() for i in authors]

    books = []
    for author in authors:
        s = get_etexts('author', author)
        for i in s:
            try:
                if list(get_metadata('language', i))[0] == 'en':
                    title, etext = list(get_metadata(
                        'title', i))[0], strip_headers(load_etext(i)).strip()
                    b = Book(i, title, etext)
                    books.append(b)
            except UnknownDownloadUriException:
                # this book does not have a load_etext corresponding to it.
                pass

    with open(json_file, 'wb') as f:
        pickle.dump(books, f)

    print(len(books))
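Despite the json_file argument name, the book list is written with pickle, so it has to be read back the same way; a minimal loader sketch:

import pickle

def load_books(book_file):
    # The file written by init_books is a pickle, not JSON.
    with open(book_file, 'rb') as f:
        return pickle.load(f)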
def poetry_cleaner(poetry_books=BOOKS):
    with open(INPUT_DATA_WRITE_PATH + OUT_PATH, 'w') as ofp:

        lineno = 0

        for (id_nr, toskip, title) in poetry_books:

            startline = lineno
            text = strip_headers(load_etext(id_nr)).strip()
            lines = text.split('\n')[toskip:]

            for line in lines:

                if (0 < len(line) < 50 and line.upper() != line
                        and not re.match('.*[0-9]+.*', line)):
                    cleaned = re.sub('[^a-z\'\-]+', ' ', line.strip().lower())
                    if lineno < 100:
                        ofp.write(cleaned)
                        ofp.write('\n')
                    lineno = lineno + 1

                else:
                    ofp.write('\n')

            print('Wrote lines {} to {} from {}'.format(startline, lineno, title))
Example #17
def sample_paragraphs(book_id, n_parag, min_length):
    """Get book as text file and randomly sample a fixed number of paragraphs."""
    # Get the book as a string
    book = load_etext(book_id)
    # Remove metadata
    book = strip_headers(book).strip()
    # Remove the character we'll choose as separator
    book = book.replace("|", " ")
    # Split paragraphs
    parag = book.split("\n\n")
    # Remove single line breaks
    parag = [x.replace("\n", " ") for x in parag]
    # Remove paragraphs below a certain length
    parag = [p for p in parag if len(p) > min_length]
    # Exclude first/last 10 parag from sampling as they may contain remaining metadata
    parag = parag[10:-10]

    # Sample paragraphs (seed and randint are assumed to behave like numpy.random's)
    if n_parag is not None:
        if n_parag > len(parag):
            raise ValueError(
                "The number of paragraphs to sample is higher than the "
                "total number of paragraphs."
            )
        seed(42)
        sample_ind = randint(0, len(parag), n_parag)
        parag_sampled = [parag[i] for i in sample_ind]
    else:
        # If n_parag is None, all paragraphs are sampled
        parag_sampled = parag

    return parag_sampled
Example #18
def regular_view(request, book_num):
    name = Book.get_book_name(book_num)
    bookText = strip_headers(load_etext(book_num)).strip()
    filteredText = removeStopWords(bookText)

    args = {'content': [bookText], 'content2': [filteredText], 'name': name}

    return render(request, "pages/regularText.html", args)
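The helper removeStopWords is not shown in this listing; a minimal sketch using NLTK's English stopword list (an assumption about what the helper does):

from nltk.corpus import stopwords  # requires nltk.download('stopwords') once

def removeStopWords(text):
    stops = set(stopwords.words('english'))
    return ' '.join(word for word in text.split() if word.lower() not in stops)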
Example #19
File: prose.py  Project: oscarbyrne/oulipo
def get_raw_book():
    while True:
        try:
            text = load_etext(random.randrange(46000))  # 46000 is approximately the size of the Gutenberg catalogue
        except ValueError:  # in case there is no download method for that text id
            pass
        else:
            return strip_headers(text)
Example #20
def get_joyce_texts():
    joyce_keys = get_etexts('author', 'Joyce, James')
    joyce_titles = []
    joyce_texts = {}
    for key in joyce_keys:
        joyce_titles.append(get_metadata('title', key))
        joyce_texts[key] = strip_headers(load_etext(key)).strip()
    return (joyce_texts)
Example #21
def search_display_options(my_catalog):
    search_result_catalog = book_catalog()

    search_type = input(
        'Please select a search type: Author, Subject, Title [Aa/Ss/Tt]:  ')

    if search_type == 'A' or search_type == 'a':
        search_term = input('Please enter a search term for an Author: ')
    elif search_type == 'T' or search_type == 't':
        search_term = input('Please enter a search term for a Title: ')
    elif search_type == 'S' or search_type == 's':
        search_term = input('Please enter a search term for a Subject: ')
    else:
        print('Invalid search type...')
        return

    # set match flag to false
    match = False
    # fill up a set of all the titles that match the search
    for my_book in my_catalog.get_books():
        if (search_type == 'a' or search_type == 'A') and set(
                my_book.get_book_author().lower().split(' ')).intersection(
                    set(search_term.lower().split(' '))):
            search_result_catalog.add_book(my_book)
            match = True

        if (search_type == 't' or search_type == 'T') and set(
                my_book.get_book_title().lower().split(' ')).intersection(
                    set(search_term.lower().split(' '))):
            search_result_catalog.add_book(my_book)
            match = True

        if (search_type == 's' or search_type == 'S') and set(
                my_book.get_book_subject().lower().split(' ')).intersection(
                    set(search_term.lower().split(' '))):
            search_result_catalog.add_book(my_book)
            match = True

    search_result_catalog.display_titles_by_author()

    if match:
        title_num = input('Please type a title number from the above list: ')

        try:
            my_book = search_result_catalog.get_book(title_num)
            print('Displaying Word Cloud in [Subject: ' +
                  my_book.get_book_subject() + '] for [Title: ' +
                  my_book.get_book_title() + '] by [Author: ' +
                  my_book.get_book_author() + ']')
            # call that gets the book text from gutenberg
            return strip_headers(load_etext(int(title_num))).strip()
        except Exception:
            print('Failed to find a textual download candidate for ' +
                  my_book.get_book_title())
            return None
    else:
        print('No matches found for [' + search_term + ']...')
        return None
Example #22
def text_from_pg(id_number):
    # https://github.com/c-w/Gutenberg
    from gutenberg.acquire import load_etext

    # from gutenberg.cleanup import strip_headers

    # text = strip_headers(load_etext(id_number)).strip()
    text = load_etext(id_number).strip()
    return text
Example #23
def tab():
    with open("BookRoulette.html", "w") as f:
        x = (random.randint(1, 60059))
        book = strip_headers(load_etext(x)).strip()
        f.write(book)
        filename = 'file:///'+os.getcwd()+'/' + 'BookRoulette.html'
        webbrowser.open_new_tab(filename)
        return render_template('BookRoulette.html', book=book)
Example #24
File: main.py  Project: Ryan-M3/minhasher
 def acquire_and_process(name: str, txt_num: int):
     """
     Convenience function that minhashes a Project Gutenberg
     text given the text id number (can be found on the gutenberg.org,
     for instance in the url).
     """
     txt = strip_headers(load_etext(txt_num))
     with open("texts/%s.txt" % name, "w") as f:
         f.write(txt)
     process_file("texts/%s.txt" % name)
Example #25
def download():
    with open("GutProject.doc", "w") as f:
        x = random.randint(1, 60059)
        text = strip_headers(load_etext(x)).strip()
        f.write(text)
    return send_file('GutProject.doc',
                     mimetype='application/msword',
                     attachment_filename='GutProject.doc',
                     as_attachment=True)
Example #26
def downloadBook():
    """If posting, takes in a book number from getty.html, installs the book into
    the database. Otherwise displays getty.html"""
    if request.method == "POST":
        bookNum = int(request.form.get("bookNum"))
        words = strip_headers(load_etext(bookNum)).strip()
        installText(words)
        return render_template("homepage.html")
    else:
        return render_template("getty.html")
Example #27
 def __init__(self, book_number=2701, first_page=20, last_page=20):
     self.text = strip_headers(load_etext(book_number))
     # print(list_supported_metadatas())  # prints (u'author', u'formaturi', u'language', ...)
     # print(get_metadata('title', 2701))  # prints frozenset([u'Moby Dick; Or, The Whale'])
     # print(get_metadata('author', 2701))  # prints frozenset([u'Melville, Herman'])
     # print(text)  # prints 'MOBY DICK; OR THE WHALE\n\nBy Herman Melville ...'
     self.pages = []
     self.first_page = first_page
     self.last_page = last_page
     self.print_book()
Example #28
def gutToText(number, name):
    filename = name + "_raw.txt"
    if not os.path.isfile(filename):
        book = open(filename, "w")
        text = strip_headers(load_etext(number)).strip()
        print "Loaded and writing %s" % (name)
        book.write(text.encode('utf-8'))
        print "Done writing %s" % (name)
        book.close()
Example #29
File: yewno.py  Project: skyballin/yewno
    def get_text(self, title, author):
        """
        This function will access the title and author of a book from the
        Gutenberg project and save the data as a csv file.
        PROBLEM HERE -- Gutenberg goes down a lot, so getting a full text
        did not work. To bypass that, I downloaded some books of mixed languages.
        """
        guten_number = list(get_etexts('title', title))[0]  # get_etexts returns a frozenset
        text = strip_headers(load_etext(guten_number)).strip()
        return text
Example #30
def get_gutenberg_document(url) -> str:
    """Downloads a document (book, etc.) from Project Gutenberg and returns it as a string."""
    # Get Project Gutenberg document ID from url string
    validate_url(url, expected_netloc='gutenberg.org')
    match = re.search(r"(?:files|ebooks|epub)/(\d+)", urlsplit(url).path)
    if not match:
        raise Exception('Not a valid url')
    document_id = int(match.group(1))
    return super_cleaner(strip_headers(load_etext(document_id).strip()),
                         mark_deletions=False)
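The helper validate_url is not defined in this listing; a sketch consistent with how it is called above (a hypothetical helper, not the project's actual implementation):

from urllib.parse import urlsplit

def validate_url(url, expected_netloc):
    # Accept www.gutenberg.org as well as gutenberg.org
    if expected_netloc not in urlsplit(url).netloc:
        raise Exception('Not a valid url')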
Example #32
def generateBooks(lastBookID):
    firstBookID = 1
    # look through and grab each book
    while firstBookID <= lastBookID:
        # load and grab the eBook
        try:
            text = strip_headers(load_etext(firstBookID)).strip()
            gatherMetaData(firstBookID, text)
        except Exception:
            print("error with book", firstBookID)
        firstBookID = firstBookID + 1
def trial():
    text = strip_headers(load_etext(2701)).strip()
    print(text)  # prints 'MOBY DICK; OR THE WHALE\n\nBy Herman Melville ...'
    print(get_metadata(
        'title', 2701))  # prints frozenset([u'Moby Dick; Or, The Whale'])
    print(get_metadata('author',
                       2701))  # prints frozenset([u'Melville, Herman'])

    print(get_etexts(
        'title', 'Moby Dick; Or, The Whale'))  # prints frozenset([2701, ...])
    print(get_etexts('author',
                     'Melville, Herman'))  # prints frozenset([2701, ...])
Example #34
def create_model():
    """Read in Project Gutenberg data, convert each text into a markovify
    Text model object, then combine them into one model. Returns the model.
    """

    eap_1 = strip_headers(load_etext(2147)).strip()  # edgar allan poe vol 1
    eap_2 = strip_headers(load_etext(2148)).strip()  # edgar allan poe vol 2
    dickens = strip_headers(
        load_etext(807)).strip()  # charles dickens crime stories
    moonstone = strip_headers(load_etext(155)).strip()  # collins: the moonstone
    lerouge = strip_headers(
        load_etext(3802)).strip()  # gaboriau: the lerouge case
    orcival = strip_headers(
        load_etext(1651)).strip()  # gaboriau: the mystery of orcival
    calais = strip_headers(
        load_etext(16339)).strip()  # griffiths: the passenger from calais
    leavenworth = strip_headers(
        load_etext(4047)).strip()  # green: the leavenworth case
    agent = strip_headers(load_etext(974)).strip()  # conrad: the secret agent
    thirtynine = strip_headers(
        load_etext(558)).strip()  # buchan: the thirty-nine steps

    eap_1_model = markovify.Text(eap_1, state_size=3)
    eap_2_model = markovify.Text(eap_2, state_size=3)
    dickens_model = markovify.Text(dickens, state_size=3)
    moonstone_model = markovify.Text(moonstone, state_size=3)
    lerouge_model = markovify.Text(lerouge, state_size=3)
    orcival_model = markovify.Text(orcival, state_size=3)
    calais_model = markovify.Text(calais, state_size=3)
    leavenworth_model = markovify.Text(leavenworth, state_size=3)
    agent_model = markovify.Text(agent, state_size=3)
    thirtynine_model = markovify.Text(thirtynine, state_size=3)

    #NOTE: will need to play around with the weighting based on the text lengths so that I don't get all sentences from one book
    all_model = markovify.combine([
        eap_1_model, eap_2_model, dickens_model, moonstone_model,
        lerouge_model, orcival_model, calais_model, leavenworth_model,
        agent_model, thirtynine_model
    ])

    return all_model
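For the weighting mentioned in the NOTE, markovify.combine accepts a parallel list of weights; a minimal sketch, assuming the texts and models built in create_model are in scope:

# Weight each model inversely to its source length so a long book
# does not dominate the combined model; weights are positional and
# must match the order of the models list.
texts = [eap_1, eap_2, dickens, moonstone, lerouge,
         orcival, calais, leavenworth, agent, thirtynine]
models = [eap_1_model, eap_2_model, dickens_model, moonstone_model,
          lerouge_model, orcival_model, calais_model, leavenworth_model,
          agent_model, thirtynine_model]
weights = [1.0 / len(t) for t in texts]
all_model = markovify.combine(models, weights)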
Example #35
def main():
    eap_1 = strip_headers(load_etext(2147)).strip()  # edgar allan poe vol 1
    eap_2 = strip_headers(load_etext(2148)).strip()  # edgar allan poe vol 2
    dickens = strip_headers(
        load_etext(807)).strip()  # charles dickens crime stories
    moonstone = strip_headers(load_etext(155)).strip()  # collins: the moonstone
    lerouge = strip_headers(
        load_etext(3802)).strip()  # gaboriau: the lerouge case
    orcival = strip_headers(
        load_etext(1651)).strip()  # gaboriau: the mystery of orcival

    eap_1_model = markovify.Text(eap_1, state_size=3)
    eap_2_model = markovify.Text(eap_2, state_size=3)
    dickens_model = markovify.Text(dickens, state_size=3)
    moonstone_model = markovify.Text(moonstone, state_size=3)
    lerouge_model = markovify.Text(lerouge, state_size=3)
    orcival_model = markovify.Text(orcival, state_size=3)

    #NOTE: will need to play around with the weighting based on the text lengths so that I don't get all sentences from one book
    all_model = markovify.combine([
        eap_1_model, eap_2_model, dickens_model, moonstone_model,
        lerouge_model, orcival_model
    ])

    #to do: loop to create different chapters - probably make them short (~ten sentences?) at first to make sure that they work properly
    print "\n\n\n Creating Chapters"

    chapters = create_chapters(
    )  #this will be a list of all the chapters, they should be complete at this point (all replacement/etc done)
    """
def main():
    """
    Main function of the test module
    """

    # setting up the API keys from local keys.py file
    goodreads_key = os.environ['GOODREADS_KEY']
    goodreads_secret = os.environ['GOODREADS_SECRET']

    # creating a client for book search and information retrieval
    gc = client.GoodreadsClient(goodreads_key, goodreads_secret)

    current_path = os.getcwd()

    file = open(os.path.join(current_path, "output", "log.json"), "w")

    gutenberg_titles = []

    # Getting the titles of the first few books on Project Gutenberg (EXTREMELY FAST)
    for i in range(1, 10):
        title = list(get_metadata('title', i))
        if title:
            # prepare the string for the file name
            filename = ''.join(
                e for e in title[0] if e.isalnum() or e == ' ') + ".txt"
            gutenberg_titles.append(filename[:-4])
            text = strip_headers(load_etext(i)).strip()
            with open(os.path.join(current_path, "output", filename),
                      "w") as output_file:
                output_file.write(text)

    titles = dict()
    # Searching for the books on Goodreads, reading their metadata
    for book_title in gutenberg_titles:
        try:
            lst = gc.search_books(book_title, search_field='title')

            if not lst:
                continue
            else:
                book = lst[0]

            titles[book.title] = (
                book_title + ".txt", str(book.popular_shelves),
                str(book.similar_books), str(book.authors),
                dict(dict(book.work)['original_publication_year'])['#text'])
        except (request.GoodreadsRequestException, KeyError, TypeError):
            continue

    json.dump(titles, file, indent=4)
    file.close()
Example #37
def get_book_text(csvfile):
    'gets text for book using project gutenberg catalog'
    book_list = open_csv(csvfile)

    for i, value in enumerate(book_list):
        a = int(book_list[i][2])  # a = book number
        print i, a
        author = book_list[i][0]
        title = book_list[i][1]
        try:
            text = strip_headers(load_etext(a)).strip()
        except ValueError:
            pass
Example #38
def process_file(filename, outdir):
    outpath = outdir + '/%s.txt'
    with open(filename) as f:
        for line in f:
            spl = line.split('|')
            book = spl[0]
            uids = map(int, spl[3].strip(string.lowercase + '\n').split(','))
            try:
                with open(outpath % book, 'w') as out:
                    for uid in uids:
                        raw_text = load_etext(uid)
                        try:
                            text = strip_headers(unidecode(raw_text.encode('latin-1').decode('utf-8')))
                        except UnicodeDecodeError:
                            text = strip_headers(raw_text)
                        out.write(text.encode('utf-8'))
            except ValueError as e:
                print '%s|%s' % (book, uid), e
                os.remove(outpath % book)
    def the_text(self):
        try:
            self.novel = load_etext(self.novel_num)
        except Exception:
            rejects.append(self.novel_num)
            return False

        if re.search('Character set encoding: ASCII', self.novel):
            self.novel = strip_headers(self.novel)
            self.novel = self.novel.replace('\n', ' ')
            self.novel = TextBlob(self.novel)
            self.novel_sentences = self.novel.sentences
            self.m = str(self.novel_num)
            with open('novel_' + self.m + 'list_1.csv', 'wb') as f:
                writer = csv.writer(f)
                for sentence in self.novel_sentences:
                    writer.writerow([sentence])
        else:
            rejects_2.append(self.novel_num)
            return False
Example #40
def check_text():
    with open('raw.json') as inputfile:
        data = json.load(inputfile)
    for record in tqdm(data):
        id = record['metadata']['id']
        title = clean(record['book']['title'])
        text = load_etext(id)
        if id in lookup_dates:
            release_date = lookup_dates[id]
        else:
            for line in text.split("\n"):
                if line.startswith('Release Date:'):
                    release_date = line.replace('Release Date:', '').split('[')[0]
                    break
            else:
                print id, title
        record['book']['author'] = record['author']
        author_name = record['book']['author']['name']
        vals.add(record['book']['author']['birth'])
        if record['book']['author']['birth'] is None:
            record['book']['author']['birth'] = 0
        if record['book']['author']['death'] is None:
            record['book']['author']['death'] = 0
        vals2.add(record['book']['author']['birth'])
        record['book']['author']['name'] = clean(author_name) if author_name else "Unknown"
        del record['author']
        month, day, year = extract_date(release_date)
        release_date = release_date.strip()
        record['book']['publication'] = {
            'full': release_date  if month != 'Jan' else release_date.replace('Jan', 'January'),
            'year': year,
            'month name': month if month != 'Jan' else 'January',
            'month': month_lookup[month],
            'day': day
        }
        record['bibliography'] = record['book']
        del record['book']
        record['metrics'] = record['statistics']
        del record['statistics']
    with open('classics-2.json', 'w') as output:
        json.dump(data, output, indent=2)
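The helpers extract_date and month_lookup are not shown in this listing. A sketch consistent with the "Release Date:" lines being parsed above (both names are assumptions about the source project):

import re

MONTHS = ['January', 'February', 'March', 'April', 'May', 'June', 'July',
          'August', 'September', 'October', 'November', 'December']
month_lookup = {name: i + 1 for i, name in enumerate(MONTHS)}
month_lookup['Jan'] = 1  # the data sometimes abbreviates January

def extract_date(release_date):
    """Return (month_name, day, year) from a string like 'June 25, 2008'."""
    m = re.search(r'([A-Za-z]+)\s+(\d{1,2}),\s*(\d{4})', release_date)
    if m:
        return m.group(1), int(m.group(2)), int(m.group(3))
    return 'Jan', 0, 0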
Example #41
File: fetch.py  Project: LSaldyt/Plutus
def fetch_gutenberg(filename=None):
    from gutenberg.acquire import load_etext
    from gutenberg.cleanup import strip_headers
    from gutenbergsettings import popularTitles, saveInterval

    start    = time.time()
    lastsave = start

    with redirect(filename):
        try:
            for title in popularTitles:
                text = strip_headers(load_etext(title)).strip()
                serialize([(title, text)], '../serialized/guten%s' % title)
                sinceLast = time.time() - lastsave
                print('%s since last save' % sinceLast)
                if sinceLast > saveInterval:
                    concatenate('guten')
                    lastsave = time.time()
        except KeyboardInterrupt:
            concatenate('guten')
            sys.exit(0)
Example #42
def extract_subroutine(data, src_dir, century):
    session = model.get_session()
    backoff = 1

    counter = 0
    for metadata in data:
        contents = extract_book_contents(metadata)

        if contents is None:
            backoff *= 1.5
            continue

        title = metadata['title']
        author = metadata['author']
        e_id = metadata['id']

        if isinstance(title, list):
            title = dasherize(title)

        text_file_path = os.path.join(src_dir, dasherize(title.split(" ")))
        text = strip_headers(load_etext(e_id)).strip()
        with open(text_file_path, "w") as f:
            f.write(text)
        book = model.Book(
            title=title,
            author=author,
            e_id=e_id,
            century=century,
            text_file_path=text_file_path
        )
        session.add(book)
        session.commit()
        log.info("successfully added " + title)
        counter += 1
        time.sleep(backoff)

    log.info("---- finished run. added %d books ----" % counter)
Example #43
def main():
    filename = "gutenbergscifi.csv"
    json_filename = "gendered_words.json"
    
    if os.path.isfile(filename):
        print "file exists"
        
    else:
        write_csv = extract_text_urls(filename)
        print "file created"

    book_list = open_csv(filename)
    
    print book_list
    
    for i, value in enumerate(book_list):
        #print i, value
        
        a = int(book_list[i][2]) # a = book number
        print i, a
        author = book_list[i][0]
        title = book_list[i][1]
        try:
            text = strip_headers(load_etext(a)).strip()
        except ValueError:
            continue  # skip books with no downloadable text

        clean_text = remove_punc_html(text)
        ratio = gender_ratio(clean_text)
        
        print author, title, ratio
        uber_key = author
        sub_key = title
        sub_value = ratio
        uber_value = {author: {title:ratio}}

        json_source = read_write_json_object(
            json_filename="gendered_words.json",
            uber_key=uber_key,
            uber_value=uber_value,
            sub_key=sub_key,
            sub_value=sub_value,
            READ=False,
            WRITE=True)
Example #44
def download_book(title, gutenberg_id, data_path, sleep=0):
    print('downloading {:}'.format(title))

    full_text = strip_headers(load_etext(gutenberg_id)).strip()
    summary = downloadSummary(title)

    if full_text is None:
        print('Full text is None. Skipping {:}'.format(title))
        return
    if summary is None:
        print('Summary is None. Skipping {:}'.format(title))
        return

    output_data = {'title': title,
                   'full_text': full_text,
                   'summary': summary}
        
    output_file = os.path.join(data_path,
                               '{:}.json'.format(gutenberg_id))
    with open(output_file, 'w') as f:
        json.dump(output_data, f, ensure_ascii=False)

    time.sleep(sleep)
Example #45
def getBook(bookDetails):
    global timeAtLastFetch
    cachedFilename = "cache/" + fileNameForBook(bookDetails) + ".txt"
    if os.path.isfile(cachedFilename):
        with open(cachedFilename) as bookfile:
            text = bookfile.read()
            return TextBlob(text)

    nowMS = milliseconds()
    timeSinceLastFetch = nowMS - timeAtLastFetch
    if timeSinceLastFetch < gutenbergWaitTimeMS:
        waitTime = gutenbergWaitTimeMS - timeSinceLastFetch
        print "    waiting {}ms for Gutenberg...".format(waitTime)
        time.sleep(waitTime / 1000)

    bookId = bookDetails['id']
    print "Fetching from Gutenberg id {}".format(bookId)
    source = load_etext(bookId)
    print "    cleaning...."
    source = removeUnicodeWords.sub("", source)
    source = removeUnicodeCharacters.sub("", source)
    source = removePossessive.sub("", source)
    source = removeWordsWithApostrophe.sub("", source)
    source = removeHyphens.sub(" ", source)
    source = removeChapterHeaders.sub("", source)
    source = removeRomanNumerals.sub("", source)
    source = removeEllipsis.sub("", source)
    text = strip_headers(source).strip()
    timeAtLastFetch = milliseconds()
    if not os.path.isdir("cache"):
        os.mkdir("cache")
    with open(cachedFilename, 'w') as bookfile:
        bookfile.write(text)
    print "    fetched and cached " + bookDetails['title']
    return TextBlob(text)
Example #46
File: reader.py  Project: sfrapoport/litbot
def generate_tweets(gutenberg_id, total=24):
    document = []
    text = strip_headers(load_etext(gutenberg_id)).strip()
    lines = text.split('\n')
    print get_metadata('title', gutenberg_id)
    for line in lines:
        words = re.findall(regex, line)
        document.extend(words)

    trigrams = zip(document, document[1:], document[2:])
    trigram_transitions = defaultdict(list)
    starts = []

    for prev, current, next in trigrams:
        if prev == ".":
            starts.append(current)
        trigram_transitions[(prev, current)].append(next)

    def generate_using_trigrams():
        current = random.choice(starts)
        prev = "."
        result = [current]
        while True:
            next_word_candidates = trigram_transitions[(prev, current)]
            next_word = random.choice(next_word_candidates)
            prev, current = current, next_word
            if current != ".":
                result.append(current)
            else:
                return " ".join(result) + current

    tweets = []
    while len(tweets) < total:
        tweet = generate_using_trigrams()
        if len(tweet) <= 140:
            tweets.append(tweet)
    return tweets
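The module-level regex used by generate_tweets is not shown; a plausible definition (an assumption) that keeps words and sentence-ending periods as separate tokens, which the trigram logic above relies on:

import re

# Match runs of letters (keeping apostrophes) or a literal period, so "."
# survives tokenization and can mark sentence boundaries for the trigrams.
regex = re.compile(r"[A-Za-z']+|\.")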
# -*- coding:utf-8 -*-

# Libraries
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

texto = load_etext(2000)
texto = strip_headers(texto)


# Replace the fragment "qu" with "k" so that "(q)ue" or "(q)ui" is not counted as a diphthong.
# Replace diphthongs containing "y" by appending "-" so that words where the "y" is a consonant rather than a vowel are not matched.
texto = texto.replace("que", "ke")
texto = texto.replace("qui", "ki")
texto = texto.replace("gue", "ke")
texto = texto.replace("gui", "ki")
texto = texto.replace("ay", "ay-")
texto = texto.replace(u"áy", u"áy-")
texto = texto.replace("ey", "ey-")
texto = texto.replace(u"éy", u"éy-")
texto = texto.replace("oy", "oy-")
texto = texto.replace("uy", "uy-")

texto = texto.lower()


# Split the text into words.
# Diphthong: a combination of an open vowel (/a e o/) with a closed one (/i u/), or vice versa; the closed vowel must not be stressed.
# A space must be used to indicate that the "y" has to be at the end of the word.
palabras = texto.split()
dic_diptongos = {
def split_sentences(text):
    for sentence_separator in [u'. ', u'.\n', u'? ', u'! ', u'?\n', u'!\n', u'; ', u';\n', u'- ', u'--', u'...', u'\n', u'\n\n', u'\n\n\n']:
        text = text.replace(sentence_separator, u'|||')
    # split only after every separator has been replaced
    return text.split(u'|||')


# Report how many books the corpus contains.
print u'Total de libros en español:', len(codigos_libros.es)


# Now load the books and strip their headers.
dic_oraciones_es = {}
total_palabras_es = 0
for codigo_libro_es in codigos_libros.es:
    texto = load_etext(codigo_libro_es)
    texto = strip_headers(texto)

    # In each book, split the sentences, delimited by the ||| symbol.
    oraciones_libro = split_sentences(texto)
    for oracion_libro in oraciones_libro:
        palabras = rufino.split_words(oracion_libro)
        numero_palabras_oracion = len(palabras)
        total_palabras_es += numero_palabras_oracion
        if numero_palabras_oracion not in dic_oraciones_es:
            dic_oraciones_es[numero_palabras_oracion] = 1
        else:
            dic_oraciones_es[numero_palabras_oracion] = dic_oraciones_es[numero_palabras_oracion] + 1


print u'Total de oraciones en español:', len(dic_oraciones_es)
Example #49
import nltk
from nltk.text import Text
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

id = int(raw_input("Input gutenberg id to load: "))
text = strip_headers(load_etext(id)).strip()
raw_input("Enter to print text preview...")
print(text[:1000])

text = text.split()
text = Text(text)

def ask():
    test = raw_input("Which analysis to perform ('list' to see list): ")
    if test == "list":
        print("concordance, dispersionplot, wordcount, lexicaldiversity, frequency, collocations")
        ask()
    if test == "concordance":
        conc = raw_input("word: ")
        text.concordance(conc)
        ask()
    if test == "dispersionplot":
        disp = []
        keepasking = True
        i = 0
        while keepasking:
            word = raw_input("word " + str(i) + " (blank to stop): ")
            if len(word) > 0:
                disp.append(word)
                i = i + 1
            else:
                keepasking = False
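The listing cuts off before the collected words are plotted. NLTK's Text exposes dispersion_plot, so the truncated branch presumably ends by calling it; a self-contained sketch (requires matplotlib):

from nltk.text import Text

words = Text("the quick brown fox jumps over the lazy dog".split())
words.dispersion_plot(["the", "fox", "dog"])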
Example #50
17406, 17430, 17491, 20401, 21651, 23206, 23236, 24536, 24601, 24925, 25317, 25640, 25687, 
25777, 25988, 26284, 26655, 27736, 29497, 29506, 29663, 29799, 29831, 30053, 30122, 30425, 
30535, 30903, 30986, 31013, 31464, 31541, 31613, 31637, 31707, 32235, 32315, 32364, 33690, 
35882, 36253, 36453, 36573, 36940, 37067, 37095, 37139, 37637, 38814, 39444, 39613, 39990, 
41746, 42727,)



guardar_cadena = 0
cadena = u''
interjecciones = {}
contador = 0


for texto in textos:  # Repeat the loop for each book
    texto = load_etext(texto)  # Load the text
    texto = strip_headers(texto).lower()  # Strip the headers
    texto = unicode(texto)
    for caracter in texto:  # walk the text character by character
        if caracter == u'¡':  # If an opening exclamation mark is found,
            guardar_cadena = 1  # set a flag to start saving the string
            cadena = cadena + unicode(caracter)
        if caracter == u'!':  # If a closing exclamation mark is found:
            cadena = cadena + unicode(caracter)  # 1. save that last character (optional)
            if cadena in interjecciones.keys():  # 2. check whether the string is already in the dictionary
                interjecciones[cadena] += 1  # 3. if it is, add one to its counter
            else:  # 4. if it is not,
                interjecciones[cadena] = 1  # add it, starting its count at 1
            guardar_cadena = 0  # 5. clear the flag so nothing more is saved
            cadena = ''
        if guardar_cadena == 1:  # 6. check whether the save flag is set to 1
"""
Created on Wed Aug 12 18:06:45 2015

@author: Tony
Description: Pull etext numbers from Project Gutenberg for an author

1) First pip install gutenberg 0.4.0 library for Python from the command line

"""
 
from gutenberg.query import get_etexts
from gutenberg.query import get_metadata
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers


# get the catalogue numbers of all the texts
# by Wilhelm Grimm in Project Gutenberg
bookList=get_etexts('author', 'Grimm, Wilhelm Carl')
# gives bookList = [12704, 12705, 12706, 12707, 12708, 12709, 12710, 37381, 20051, 28044, 30510, 22555, 20050, 11027, 16846, 12250, 20027, 19068, 2591]

#Once We can associate a number with a title we can pull the text
for number in bookList:
    print(number,get_metadata('title',number))
 
print('\n HHHHHHHHHHHHHHH Now for the full text HHHHHHHHHHHHHHHHHHH \n')
# Once we have the text number we can print the text
# example 11027 is the number for Grimm's Fairy Stories
# can be temperamental, truncating text at the top (console limit?); may need to work around
etext = strip_headers(load_etext(11027)).strip()
print(etext)
    def open_file(self, file_id):
        """
            Opens a file from project gutenberg 
        """

        return load_etext(file_id)
Example #53
print total
i = 0
for (dirpath, dirnames, filenames) in walk(gutenberg_path):
    for filename in filenames:
        f = "/".join([dirpath, filename])
        if f.endswith(".rdf"):
            i += 1
            bf = BeautifulSoup(open(f))
            subjects = bf.find_all("dcterms:subject")
            if subjects:
                for subject in subjects:
                    val = subject.find_all("rdf:value")[0].contents[0]
                    for i_subject in i_subjects:
                        if i_subject in val.lower():
                            id = int(basename(f)[2:-4])
                            fn = str(id).zfill(10) + "_" + i_subject + ".txt"
                            print fn
                            try:
                                text = strip_headers(load_etext(id)).strip().encode("utf-8")
                                wf = "./texts/" + fn
                                with open(wf, "w") as text_file:
                                    text_file.write(text)
                                print i, total, float(i) / total
                            except Exception:
                                print "broken", id
            # for network in tree.findtext('dcterms subject'):
            #     print network
Example #54
# Written in Python 3.5
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

librosCodigo = {"Francés": [13735, 13808], "Español": [24925, 15027],
                "Portugés": [14904, 16384], "Inglés": [10422, 1013]}
dic_idiomas = {}

for idioma in librosCodigo.keys():
    diccionario_largo_palabras = {}

    for indeCo in librosCodigo[idioma]:
        texto = strip_headers(load_etext(indeCo))
        dic_idiomas[idioma] = diccionario_largo_palabras

        # Strip punctuation and other special characters before splitting into words
        for caracter_especial in ['"', "...", "¿", "?", "=", "_", "[", "]", "(", ")", ",", ".", ":", ";", "!", "¡", "«", "»", "*", "~", "' ", " '", "- ", " -", "--"]:
            texto = texto.replace(caracter_especial, " ")
        palabras = texto.split()

        # Tally the length of every word
        for palabra in palabras:
            largo_palabra = len(palabra)
            if largo_palabra in diccionario_largo_palabras:
                diccionario_largo_palabras[largo_palabra] = diccionario_largo_palabras[largo_palabra] + 1
            else:
                diccionario_largo_palabras[largo_palabra] = 1
print(dic_idiomas)
Example #55
"""
Created on Sun Sep 20 19:49:20 2015

@author: weizhi
"""

import nltk
from nltk.corpus import gutenberg
nltk.corpus.gutenberg.fileids()
emma = nltk.corpus.gutenberg.words('austen-emma.txt')

from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers


from gutenberg.acquire import metadata
text = load_etext(201)
print text[:100]
#assert text.startswith('MOBY DICK; OR THE WHALE\n\nBy Herman Melville')

import rdflib
g = rdflib.Graph()



from gutenberg.acquire import metadata


output = metadata._create_metadata_graph(store='Sleepycat')
#downLoad = metadata._download_metadata_archive()

from gutenberg.query.api import get_metadata  # noqa
Example #56
def downloadText(textID):
    print "Downloading", textID
    text = strip_headers(load_etext(textID)).strip()
    return text
Example #57
    for item in ners:
        # Loop over the Stanford NER (per/ person) results,
        # and apply probablepeople, which raises when it fails (so try).
        if "per" in item["tag"].lower():
            result = None
            try:
                result = parse(item.get('string'))
            except Exception:
                log.error("Could not run probablepeople")

            if result:
                pp.append(result)
    ner["pp"] = pp
    return ner


if __name__ == '__main__':
    if len(sys.argv) >= 2 and 'test' in " ".join(sys.argv):
        import doctest
        doctest.testmod(verbose=True)

    if len(sys.argv) >= 2 and 'profile' in " ".join(sys.argv):
        from gutenberg.acquire import load_etext
        from gutenberg.cleanup import strip_headers
        from pycallgraph import PyCallGraph
        from pycallgraph.output import GraphvizOutput

        text = smart_text(strip_headers(load_etext(54807)).strip())
        with PyCallGraph(output=GraphvizOutput()):
            stanford_ner_wrapper(text, 9992, True)
Example #58
            except Exception, e:
                logging.error("%s: %s" % (path, e))
                # raise e

    @classmethod
    def text_from_zip(cls, path, rdf_catalog_path=None):
        """Return a ProjectGutenbergText object from a zip file."""
        archive = zipfile.ZipFile(path)
        inside = archive.filelist
        filenames = [x.filename for x in inside]
        if len(inside) != 1:
            logging.warn("Supposedly plain-text %s has %d files in zip: %s" % (
                    path, len(inside), ", ".join(filenames)))
        possibilities = [x for x in filenames if x.lower().endswith(".txt")]
        data = archive.read(possibilities[0])
        return ProjectGutenbergText(data, path, rdf_catalog_path)

    @property
    def paragraphs(self):
        return self.text.split("\r\n\r\n")
Obj = ProjectGutenbergText(text, name=None, rdf_catalog_path=raw_data)
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

text = strip_headers(load_etext(2701)).strip()
assert text.startswith('MOBY DICK; OR THE WHALE\n\nBy Herman Melville')




Example #59
# -*- coding:utf-8 -*-

from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

dic_cont_interjecciones = {}

textos = load_etext(1619)
texto = strip_headers(textos).lower()
guardar_cadena = 0
cadena = ''
interjecciones = {}

for caracter in texto:  # walk the single loaded text character by character
    if caracter == u'¡':
        guardar_cadena = 1
    if caracter == u'!':
        cadena = cadena + caracter
        if cadena in interjecciones.keys():
            interjecciones[cadena] += 1
        else:
            interjecciones[cadena] = 1
        guardar_cadena = 0
        cadena = ''
    if guardar_cadena == 1:
        cadena = cadena + caracter

for interjeccion in sorted(interjecciones.keys()):
    print interjeccion, interjecciones[interjeccion]

from numpy import random
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers


bookNumber = set(random.randint(10, 50024, size=2000))

metaInfo = []

for item in bookNumber:
    try:
        # load the raw text
        data = load_etext(item)

        # save the txt data path
        filePath = rdfPath + '/' + str(item) + '/' + str(item) + '.txt'
        f = open(filePath, 'w')
        f.write(data.encode('utf8'))
        f.close()
        # get the metadata
        Dict = obj.metaData(data.split("\n"))
        metaInfo.append((Dict, filePath))
        print len(metaInfo)
    except Exception:
        continue
#%% do the data mining on these txts: author, title, release time, etc.; need time to work on this part