Example #1
 def __init__(self, text):
     # pass in the text, and process the text
     self.original_striped_text = strip_headers(text).strip()
     self.text = strip_headers(text).strip()
     self.sentences = []
     self.word_frequencies = {}
     self.sentence_scores = {}
     self.preprocesstext()
Example #2
def generate_paragraph():
    '''
    Generates a random paragraph from the Gutenberg Project

    :return: Text from the Gutenberg Project with spaces and non-alphabetic characters removed
    and all characters converted to lower case
    :rtype: str
    '''
    # Get the text from Project Gutenberg; in this case it's Moby Dick
    text = strip_headers(load_etext(2701)).strip()
    #text = "Jack and Jill ran up the hill to get a pail of water. " +
    #       "Jack fell down and broke his crown and Jill came tumbling after."
    sentences = []
    paragraph = ""

    for sentence in text.split("."):
        sentences.append(sentence)

    #Select 2 random sentences
    paragraph = random.choice(sentences) + random.choice(sentences)

    paragraph = re.sub(r'\s+', '', paragraph)
    regex = re.compile('[^a-zA-Z]')
    paragraph = regex.sub('', paragraph).lower()
    return paragraph
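
A minimal usage sketch for the example above (assuming the gutenberg package is installed and its text cache is reachable; the check itself is illustrative and not part of the original code):

import re

paragraph = generate_paragraph()
# Per the docstring, the cleaned paragraph should contain only lowercase letters a-z.
assert re.fullmatch(r"[a-z]*", paragraph) is not None
print(len(paragraph), paragraph[:80])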
Example #3
File: words.py Project: kwccoin/words
def analyze(file, mode=None, stripmap=None):
    "This function analyzes a plane text file, optionally skipping Gutenberg/Wikipedia header and footer, and returns a dictionary (mapping of keys to values) of words to their frequencies. A map of characters to strip from each word may also be provided for efficiency purposes if calling this function multiple times (as we do for this experiment), but if none is provided it will be generated before processing."
    #Res is an empty dictionary which we will populate with key/value pairs and ultimately return.
    res = {}
    #If we don't have a stripmap, generate one.
    if stripmap is None:
        stripmap = generate_stripmap()
    #We need to determine the character encoding of the file for processing.
    import chardet
    enc = chardet.detect(open(file, 'br').read())['encoding']
    #Fin is the file object. We will open it using the detected encoding to collect its text (sans headers if Gutenberg), then close it.
    fin = open(file, encoding=enc, errors='ignore')
    #words is an empty list which we will populate with all words from the source text.
    words = []
    if mode == 'Gutenberg':
        from gutenberg.cleanup import strip_headers
        text = strip_headers(''.join(fin.readlines()))
    elif mode == 'Wikipedia':
        text = ''
        for line in fin:
            if "<doc" not in line and "</doc" not in line:
                text += line
    else:
        text = ''.join(fin.readlines())
    fin.close()
    #The text we've extracted is full of punctuation, capitalization, and newlines which are undesirable for our purposes. We just want the words.
    for word in text.split():
        words.extend(word.translate(stripmap).lower().split())
    #Analyze words, and generate our frequency map.
    for word in words:
        res[word] = res.setdefault(word, 0) + 1
    return res
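
The example above calls a generate_stripmap() helper that is not shown. A minimal sketch of what such a helper might look like, assuming its only job is to map punctuation to None so str.translate() deletes it, together with an illustrative call to analyze (the file name is hypothetical):

import string

def generate_stripmap():
    # Hypothetical helper: translation table that removes ASCII punctuation.
    return {ord(c): None for c in string.punctuation}

freqs = analyze("pg2701.txt", mode="Gutenberg", stripmap=generate_stripmap())
print(sorted(freqs.items(), key=lambda kv: kv[1], reverse=True)[:10])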
Example #4
    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        del data_dir
        del tmp_dir
        del dataset_split

        # pylint: disable=g-import-not-at-top
        from gutenberg import acquire
        from gutenberg import cleanup
        # pylint: enable=g-import-not-at-top

        books = [
            # bookid, skip N lines
            (19221, 223),
            (15553, 522),
        ]

        for (book_id, toskip) in books:
            text = cleanup.strip_headers(acquire.load_etext(book_id)).strip()
            lines = text.split("\n")[toskip:]
            prev_line = None
            ex_count = 0
            for line in lines:
                # Any line that is all upper case is a title or author name
                if not line or line.upper() == line:
                    prev_line = None
                    continue

                line = re.sub("[^a-z]+", " ", line.strip().lower())
                if prev_line and line:
                    yield {
                        "inputs": prev_line,
                        "targets": line,
                    }
                    ex_count += 1
                prev_line = line
Example #5
def text_invalidates_entry(text):
    """
    Determine if there is anything obvious in the text that would invalidate it as a valid novel

    >>> from gender_novels.corpus_gen import text_invalidates_entry
    >>> text_invalidates_entry("Translator: George Fyler Townsend")
    True
    >>> from gender_novels.corpus_gen import get_novel_text_gutenberg
    >>> import os
    >>> current_dir = os.path.abspath(os.path.dirname(__file__))
    >>> filepath = Path(current_dir, r"corpora/sample_novels/texts/hawthorne_scarlet.txt")
    >>> scarlet_letter = get_novel_text_gutenberg(filepath)
    >>> text_invalidates_entry(scarlet_letter)
    False

    :param text: str
    :return: boolean
    """
    if text.find("Translator: ", 0, 650) != -1:
        return True
    text = strip_headers(text)
    text_length = len(text)
    # Animal Farm is roughly 166700 characters including boilerplate
    # Guinness World Records states that the longest novel is 9,609,000 characters long
    if text_length < 140000 or text_length > 9609000:
        return True
    return False
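
A hedged sketch of how the heuristic above might be used to filter candidate books when building a corpus (the book IDs are arbitrary examples and the wiring is not from the original project):

from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

candidate_ids = [2701, 1342, 11]  # illustrative Project Gutenberg IDs
valid_texts = {}
for book_id in candidate_ids:
    raw = load_etext(book_id)
    if not text_invalidates_entry(raw):
        valid_texts[book_id] = strip_headers(raw).strip()
print(len(valid_texts), "texts passed the translator/length checks")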
Example #6
    def generate_samples(self, data_dir, tmp_dir, dataset_split):
        del data_dir
        del tmp_dir
        del dataset_split

        books = [
            # bookid, skip N lines
            (19221, 223),
            (15553, 522),
        ]

        for (book_id, toskip) in books:
            text = cleanup.strip_headers(acquire.load_etext(book_id)).strip()
            lines = text.split("\n")[toskip:]
            for line in lines:
                # Any line that is all upper case is a title or author name
                if not line or line.upper() == line:
                    continue

                line = re.sub("[^a-z]+", " ", line.strip().lower())
                if line:
                    l = len(line)
                    if l > 100:
                        l = 100
                    yield {
                        "inputs": line,
                        "label": l,
                    }
def sanitize_texts(directory):
    """
    Strip all header and copyright information from downloaded text files in the
    specified directory using gutenberg.strip_headers module and ensure proper
    file encodings.

    :param directory: <String> A string containing the full path to directory containing files to strip
    :return:
    """

    for item in os.listdir(directory):
        file_path = os.path.join(directory, item)
        if os.path.isfile(file_path):

            # Detect file encoding, takes time to run
            with open(file_path, 'rb') as inf:
                text = inf.readlines()
            detector = UniversalDetector()
            for line in text:
                detector.feed(line)
                if detector.done:
                    break
            detector.close()
            encoding = detector.result['encoding']

            # Open file, strip headers, and save result
            with open(file_path, 'r', encoding=encoding) as inf:
                text = inf.read()
            text = strip_headers(text).strip()
            os.remove(file_path)
            with open(file_path, 'w+', encoding=encoding) as outf:
                outf.write(text)
Example #8
def getSomeBooks(howManyBooks, startingAt=1):
    i = howManyBooks
    ii = startingAt
    nothing = 0
    valError = 0
    otherError = 0
    allTheBooks = []
    while i > len(allTheBooks):  # 54096 ceiling
        try:
            theText = strip_headers(
                load_etext(ii)).strip()  #load the full text into theText
            theLength = len(theText)
            if len(theText) > 292:
                allTheBooks.append([ii, theText])
                print("one more book in the list, book number:", i,
                      "book total is:", len(allTheBooks))
            else:
                nothing = nothing + 1
                print("nothing here at number:", i)
        except ValueError:
            valError = valError + 1
            print("valueError at book number:", i)
        except:
            otherError = otherError + 1
            print("otherError at book number:", i)
        ii = ii + 1

    print('all done')
    print(len(allTheBooks))
    return allTheBooks
Example #9
def post_corpora(url, auth_token):
    corpora = acquire_corpora()
    text = strip_headers(load_etext(corpora[0])).strip()

    print(corpora, text[:100])

    authentication_token = {'authentication-token': auth_token}

    # data to post
    files = {'file': io.StringIO(text)}
    data = {
        'label': '{} {}'.format(corpora[1], corpora[3]),
        'source': corpora[2]
    }

    # post
    ru = requests.post(url,
                       headers=authentication_token,
                       files=files,
                       data=data)

    print(ru.url, ru.status_code)
    if ru.ok:
        print(ru.json())
    else:
        print(ru.status_code, ru.reason)
Example #10
  def generate_samples(self, data_dir, tmp_dir, dataset_split):
    del data_dir
    del tmp_dir
    del dataset_split

    # pylint: disable=g-import-not-at-top
    from gutenberg import acquire
    from gutenberg import cleanup
    # pylint: enable=g-import-not-at-top

    books = [
        # bookid, skip N lines
        (19221, 223),
        (15553, 522),
    ]

    for (book_id, toskip) in books:
      text = cleanup.strip_headers(acquire.load_etext(book_id)).strip()
      lines = text.split("\n")[toskip:]
      prev_line = None
      ex_count = 0
      for line in lines:
        # Any line that is all upper case is a title or author name
        if not line or line.upper() == line:
          prev_line = None
          continue

        line = re.sub("[^a-z]+", " ", line.strip().lower())
        if prev_line and line:
          yield {
              "inputs": prev_line,
              "targets": line,
          }
          ex_count += 1
        prev_line = line
Example #11
def sample_paragraphs(book_id, n_parag, min_length):
    """Get book as text file and randomly sample a fixed number of paragraphs."""
    # Get the book as a string
    book = load_etext(book_id)
    # Remove metadata
    book = strip_headers(book).strip()
    # Remove the character we'll choose as separator
    book = book.replace("|", " ")
    # Split paragraphs
    parag = book.split("\n\n")
    # Remove single line breaks
    parag = [x.replace("\n", " ") for x in parag]
    # Remove paragraphs below a certain length
    parag = [p for p in parag if len(p) > min_length]
    # Exclude first/last 10 parag from sampling as they may contain remaining metadata
    parag = parag[10:-10]

    # Sample paragraphs
    seed(42)
    sample_ind = randint(0, len(parag), n_parag)

    if n_parag is not None:
        if n_parag > len(parag):
            raise ValueError(
                "The number of paragraphs to sample is higher than the "
                "total number of paragraphs."
            )
        else:
            parag_sampled = [parag[i] for i in sample_ind]

    else:
        # If n_parag is None, all paragraphs are sampled
        parag_sampled = parag

    return parag_sampled
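
A minimal usage sketch for sample_paragraphs, assuming seed() and randint() come from numpy.random (consistent with randint(0, len(parag), n_parag) returning an array of indices); the book ID and thresholds are illustrative:

# e.g. sample five paragraphs of at least 200 characters from Moby Dick (etext 2701)
sampled = sample_paragraphs(book_id=2701, n_parag=5, min_length=200)
for p in sampled:
    print(p[:80])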
def load_macbeth():
    """
    Sources Macbeth from Project Gutenberg, returns a cleaned dataframe
    of the play split by act, scene, speaker, and sentence.
    """
    raw_text = load_etext(1533)  # Collect the text
    raw_text = strip_headers(raw_text)  # Remove most metadata

    # Remove in-line stage directions

    raw_text = remove_in_line_stage_directions(raw_text)

    # Split the text into sentences

    sentences = separate_sentences(raw_text)

    # Remove introductory data, keeping only the text

    sentences = sentences[110:]

    # Create a dataframe from the sentences

    macbeth = create_play_data_frame(sentences)

    # Clean the dataframe

    macbeth = clean_macbeth(macbeth)

    # Add a token column

    macbeth["tokens"] = create_token_column(macbeth["sentence"])

    # Return the finished dataframe

    return macbeth
Example #13
def download(cfg):
    print('Downloading Gutenberg data to: ' + cfg.directory)
    # Load language data for all books.
    path = os.path.join('code', 'utils', 'metadata.txt')
    with open(path, encoding='utf-8') as f:
        counter = 0
        for line in f:
            [index, lang, r, author, title] = line.split('\t')

            r = int(r)
            i = int(index)
            if counter < cfg.max_books and r == 1 and lang in cfg.languages:
                # Get the book.
                try:
                    text = strip_headers(load_etext(i)).strip().encode('utf-8')
                except UnknownDownloadUriException:
                    print('Could not download book: ' + str(i))
                    continue

                # Save the file to the correct directory.
                path = os.path.join(cfg.directory, lang)
                if not os.path.exists(path):
                    os.mkdir(path)
                with open(os.path.join(path, str(i) + '.txt'), 'wb') as book_file:
                    book_file.write(text)

                counter += 1
                if not counter % 1000:
                    print('Downloaded ' + str(counter) + ' books')
def get_gutenberg_text(id):
    try:
        text = strip_headers(load_etext(id)).strip()
        return text
    except Exception as ex:
        print(ex)
    return ''
Example #15
    def load_gutenberg(self, language='en'):
        texts = get_etexts('author', self.author)
        texts = {
            t: list(get_metadata("title", t))[0]
            for t in texts if list(get_metadata("language", t))[0] == language
        }

        new_texts = dict()
        dupes = list()
        for k, d in texts.items():
            d = d.replace("\r\n", " ")
            if d not in dupes:
                dupes.append(d)
                new_texts[k] = d
                try:
                    self.books[d] = strip_headers(
                        load_etext(k)).strip().split("\r\n\r\n")
                except UnknownDownloadUriException:
                    print(
                        f'Book "{d}" does not have a text format and was not loaded.'
                    )
                    del new_texts[k]
                    dupes.remove(d)
                    continue
                self.tokens[d] = [
                    nltk.pos_tag(nltk.word_tokenize(self.books[d][b]))
                    for b in range(len(self.books[d]))
                ]
            else:
                pass

        texts = new_texts

        print(texts)
def delete_introduction_ending(full_text):
    # this function deletes extraneous text from Peter_Pan_full_text
    from gutenberg.cleanup import strip_headers
    no_intro = strip_headers(full_text).strip()
    # print no_intro
    print('headers are gone')
    return no_intro
Example #17
def main():
    """
    The main method.
    """

    parser = argparse.ArgumentParser(
        description='Word suggestion based on Project Gutenberg books.')
    parser.add_argument('--book-id',
                        dest='book_ids',
                        nargs='+',
                        type=int,
                        required=True,
                        help='the book id of the Project Gutenberg')
    parser.add_argument('--query',
                        nargs='+',
                        type=str,
                        required=True,
                        help='suggest next word for list of string',
                        action=required_length(1, 5))

    try:
        args = parser.parse_args()
        text_indexer = TextIndexer(len(args.query))

        for book_id in list(dict.fromkeys(args.book_ids)):
            text = strip_headers(load_etext(book_id)).strip()
            text_indexer.add_text(book_id, text)

        print(text_indexer.suggest(*args.query))
    except Exception as exc:  # pylint: disable=W0703
        print(exc)
def get_featurelists(book):

    # Preparation for topic features: get 100 most common uni-, bi- and trigrams of the given book
    common_ngrams = get_common_ngrams(book)

    # Extract the features of the given book
    features_book = (book_id, extract_features(book, common_ngrams))

    # Open the output file and write the features of the given book to it
    path_feat_book = "C:/Users/gebruiker/Documents/Master/Text/Project/output_data/features_book.txt"
    with open(path_feat_book, 'r+', encoding="utf-8") as output_book:
        output_book.write(str(features_book))
        output_book.close()

    # Open the output file for the features of the dataset books
    path_feat_books = "C:/Users/gebruiker/Documents/Master/Text/Project/output_data/features_dataset.txt"
    output_dataset = open(path_feat_books, 'r+', encoding="utf-8")

    # Extract the features of the dataset books
    features_dataset = []
    for i in IDs:
        features_dataset.append((i,
                                 extract_features(
                                     strip_headers(load_etext(i)).strip(),
                                     common_ngrams)))

        # Write the features to the output file
        output_dataset.write("\n Book " + str(i) + ": ")
        output_dataset.write(str(features_dataset[len(features_dataset) - 1]))
    output_dataset.close()

    return features_book, features_dataset
def poetry_cleaner(poetry_books=BOOKS):
    with open(INPUT_DATA_WRITE_PATH + OUT_PATH, 'w') as ofp:

        lineno = 0

        for (id_nr, toskip, title) in poetry_books:

            startline = lineno
            text = strip_headers(load_etext(id_nr)).strip()
            lines = text.split('\n')[toskip:]

            for line in lines:

                if 0 < len(line) < 50 and line.upper(
                ) != line and not re.match('.*[0-9]+.*', line):
                    cleaned = re.sub('[^a-z\'\-]+', ' ', line.strip().lower())
                    if lineno < 100:
                        ofp.write(cleaned)
                        ofp.write('\n')
                    lineno = lineno + 1

                else:
                    ofp.write('\n')

        print('Wrote lines {} to {} from {}'.format(startline, lineno, title))
def init_books(author_file, json_file):
    """initialize book list with texts and save it to disk"""
    with open(author_file) as f:
        authors = list(f)

    authors = [i.strip() for i in authors]

    books = []
    for author in authors:
        s = get_etexts('author', author)
        for i in s:
            try:
                if list(get_metadata('language', i))[0] == 'en':
                    title, etext = list(get_metadata(
                        'title', i))[0], strip_headers(load_etext(i)).strip()
                    b = Book(i, title, etext)
                    books.append(b)
            except UnknownDownloadUriException:
                # this book does not have a load_etext corresponding to it.
                pass

    with open(json_file, 'wb') as f:
        pickle.dump(books, f)

    print(len(books))
Example #21
File: words.py Project: codeofdusk/words
def analyze(file,mode=None,stripmap=None):
    "This function analyzes a plane text file, optionally skipping Gutenberg/Wikipedia header and footer, and returns a dictionary (mapping of keys to values) of words to their frequencies. A map of characters to strip from each word may also be provided for efficiency purposes if calling this function multiple times (as we do for this experiment), but if none is provided it will be generated before processing."
    #Res is an empty dictionary which we will populate with key/value pairs and ultimately return.
    res={}
    #If we don't have a stripmap, generate one.
    if stripmap is None:
        stripmap=generate_stripmap()
    #We need to determine the character encoding of the file for processing.
    import chardet
    enc=chardet.detect(open(file,'br').read())['encoding']
    #Fin is the file object. We will open it using the detected encoding to collect its text (sans headers if Gutenberg), then close it.
    fin=open(file,encoding=enc,errors='ignore')
    #words is an empty list which we will populate with all words from the source text.
    words=[]
    if mode=='Gutenberg':
        from gutenberg.cleanup import strip_headers
        text=strip_headers(''.join(fin.readlines()))
    elif mode=='Wikipedia':
        text=''
        for line in fin:
            if "<doc" not in line and "</doc" not in line:
                text+=line
    else:
        text=''.join(fin.readlines())
    fin.close()
    #The text we've extracted is full of punctuation, capitalization, and newlines which are undesirable for our purposes. We just want the words.
    for word in text.split():
        words.extend(word.translate(stripmap).lower().split())
    #Analyze words, and generate our frequency map.
    for word in words:
        res[word]=res.setdefault(word,0)+1
    return res
Example #22
def get_joyce_texts():
    joyce_keys = get_etexts('author', 'Joyce, James')
    joyce_titles = []
    joyce_texts = {}
    for key in joyce_keys:
        joyce_titles.append(get_metadata('title', key))
        joyce_texts[key] = strip_headers(load_etext(key)).strip()
    return (joyce_texts)
Example #23
def regular_view(request, book_num):
    name = Book.get_book_name(book_num)
    bookText = strip_headers(load_etext(book_num)).strip()
    filteredText = removeStopWords(bookText)

    args = {'content': [bookText], 'content2': [filteredText], 'name': name}

    return render(request, "pages/regularText.html", args)
Example #24
File: prose.py Project: oscarbyrne/oulipo
def get_raw_book():
    while True:
        try:
            text = load_etext(random.randrange(46000)) #46000 is approximately size of gutenberg catalogue
        except ValueError: #in case of no download method for that text id
            pass
        else:
            return strip_headers(text)
Example #25
def search_display_options(my_catalog):
    search_result_catalog = book_catalog()

    search_type = input(
        'Please select a search type: Author, Subject, Title [Aa/Ss/Tt]:  ')

    if search_type == 'A' or search_type == 'a':
        search_term = input('Please enter a search term for an Author: ')
    elif search_type == 'T' or search_type == 't':
        search_term = input('Please enter a search term for a Title: ')
    elif search_type == 'S' or search_type == 's':
        search_term = input('Please enter a search term for a Subject: ')
    else:
        print('Invalid search type...')
        return

    # set match flag to false
    match = False
    # fill up a set of all the titles that match the search
    for my_book in my_catalog.get_books():
        if (search_type == 'a' or search_type == 'A') and set(
                my_book.get_book_author().lower().split(' ')).intersection(
                    set(search_term.lower().split(' '))):
            search_result_catalog.add_book(my_book)
            match = True

        if (search_type == 't' or search_type == 'T') and set(
                my_book.get_book_title().lower().split(' ')).intersection(
                    set(search_term.lower().split(' '))):
            search_result_catalog.add_book(my_book)
            match = True

        if (search_type == 's' or search_type == 'S') and set(
                my_book.get_book_subject().lower().split(' ')).intersection(
                    set(search_term.lower().split(' '))):
            search_result_catalog.add_book(my_book)
            match = True

    search_result_catalog.display_titles_by_author()

    if match:
        title_num = input('Please type a title number from the above list: ')

        try:
            my_book = search_result_catalog.get_book(title_num)
            print('Displaying Word Cloud in [Subject: ' +
                  my_book.get_book_subject() + '] for [Title: ' +
                  my_book.get_book_title() + '] by [Author: ' +
                  my_book.get_book_author() + ']')
            return (strip_headers(load_etext(int(title_num))).strip()
                    )  # call that gets book text from gutenberg
        except:
            print('Failed to find a textual download candidate for ' +
                  my_book.get_book_title())
            return (None)
    else:
        print('No matches found for [' + search_term + ']...')
        return (None)
Example #26
def tab():
    with open("BookRoulette.html", "w") as f:
        x = (random.randint(1, 60059))
        book = strip_headers(load_etext(x)).strip()
        f.write(book)
        f.close()
        filename = 'file:///'+os.getcwd()+'/' + 'BookRoulette.html'
        webbrowser.open_new_tab(filename)
        return render_template('BookRoulette.html', book=book)
Example #27
def process_file(filename, outdir):
    outpath = outdir + '/%s.txt'
    with open(filename) as f:
        for line in f:
          spl = line.split('|')
          book = spl[0]
          uids = map(int, spl[3].strip(string.lowercase + '\n').split(','))
          try:
            with open(outpath % book, 'w') as out:
              for uid in uids:
                raw_text = load_etext(uid)
                try:
                  text = strip_headers(unidecode(raw_text.encode('latin-1').decode('utf-8')))
                except UnicodeDecodeError:
                  text = strip_headers(raw_text)
                out.write(text.encode('utf-8'))
          except ValueError as e:
            print '%s|%s' % (book, uid), e
            os.remove(outpath % book)
Example #28
File: main.py Project: Ryan-M3/minhasher
 def acquire_and_process(name: str, txt_num: int):
     """
     Convenience function that minhashes a Project Gutenberg
     text given the text id number (can be found on the gutenberg.org,
     for instance in the url).
     """
     txt = strip_headers(load_etext(txt_num))
     with open("texts/%s.txt" % name, "w") as f:
         f.write(txt)
     process_file("texts/%s.txt" % name)
Example #29
def download():
    with open("GutProject.doc", "w") as f:
        x = (random.randint(1, 60059))
        text = strip_headers(load_etext(x)).strip()
        f.write(text)
        f.close()
    return send_file('GutProject.doc',
                     mimetype='application/msword',
                     attachment_filename='GutProject.doc',
                     as_attachment=True)
Example #30
 def loadNew(self):
     test = os.listdir(self.new)[0]
     testB = open(self.new + test)
     raw = testB.read()
     text = strip_headers(raw).strip()
     text = text.replace('\n', ' ')
     text = text.replace(':', '. ')
     text = sent_tokenize(text)
     text = list(filter(lambda x: len(x) > 5, text))
     return text
def get_gutenberg_text(book_id):
    """
    This function gets the text corresponding to the book_id 
    from Gutenberg database.
    """
    try:
        x = strip_headers(load_etext(int(book_id), prefer_ascii=False)).strip()
    except:
        x = None
    return x
Example #32
def downloadBook():
    """If posting, takes in a book number from getty.html, installs the book into
    the database. Otherwise displays getty.html"""
    if request.method == "POST":
        bookNum = int(request.form.get("bookNum"))
        words = strip_headers(load_etext(bookNum)).strip()
        installText(words)
        return render_template("homepage.html")
    else:
        return render_template("getty.html")
Example #33
 def __init__(self, book_number=2701, first_page=20, last_page=20):
     self.text = strip_headers(load_etext(book_number))
     # print(list_supported_metadatas())  # prints (u'author', u'formaturi', u'language', ...)
     # print(get_metadata('title', 2701))  # prints frozenset([u'Moby Dick; Or, The Whale'])
     # print(get_metadata('author', 2701))  # prints frozenset([u'Melville, Hermann'])
     # print(text)  # prints 'MOBY DICK; OR THE WHALE\n\nBy Herman Melville ...'
     self.pages = []
     self.first_page = first_page
     self.last_page = last_page
     self.print_book()
Example #34
def gutToText(number,name):
    filename = name+"_raw.txt"
    if not os.path.isfile(filename):
        book = open(filename,"w")
        text = strip_headers(load_etext(number)).strip()
        words = text
        print "Loaded and writing %s" % (name)
        book.write(words.encode('utf-8'))
        print "Done writing %s" % (name)
        book.close()
Example #35
def get_gutenberg_document(url) -> str:
    """Downloads a document (book, etc.) from Project Gutenberg and returns it as a string."""
    # Get Project Gutenberg document ID from url string
    validate_url(url, expected_netloc='gutenberg.org')
    match = re.search("(?:files|ebooks|epub)\/(\d+)", urlsplit(url).path)
    if not match:
        raise Exception('Not a valid url')
    document_id = int(match.group(1))
    return super_cleaner(strip_headers(load_etext(document_id).strip()),
                         mark_deletions=False)
Example #37
File: yewno.py Project: skyballin/yewno
    def get_text(self, title, author):
        """
		This function will access the title and author of a book from the
		Gutenberg project and save the data as a csv file
		PROBLEM HERE -- gutenberg goes down a lot, so getting a full text 
		did not work. To bypass that, I downloaded some books of mixed languages.
		"""
        guten_number = list(get_etexts('title', title))[0]
        text = strip_headers(load_etext(guten_number)).strip()
        return (text)
def fetch_etext(etextno):
    """Returns a unicode representation of the full body of a Project Gutenberg
    text. After making an initial remote call to Project Gutenberg's servers,
    the text is persisted locally.

    """
    download_uri = _format_download_uri(etextno)
    if download_uri:
        response = requests.get(download_uri)
        return strip_headers(response.text.strip()).encode('utf-8')
    return ''
def get_word_list():
    """ Reads the specified project Gutenberg book.  Header comments,
		punctuation, and whitespace are stripped away.  The function
		returns a list of the words used in the book as a list.
		All words are converted to lower case.
	"""
    textfile = open("pg32325.txt")
    full_text = textfile.read()
    no_punctuation = full_text.translate(None, string.punctuation)
    no_intro = strip_headers(no_punctuation).strip()
    convert_ascii = no_intro.encode("ascii")
    convert_lowercase = string.lower(convert_ascii)
    list_split = convert_lowercase.split()
    return list_split
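
get_word_list above is Python 2 only (str.translate(None, ...), string.lower). A hedged Python 3 sketch of the same idea, keeping the original file name, might look like:

import string
from gutenberg.cleanup import strip_headers

def get_word_list_py3(path="pg32325.txt"):
    # Read the file, drop the Gutenberg header/footer, strip punctuation,
    # lowercase everything, and split into words.
    with open(path, encoding="utf-8") as f:
        full_text = f.read()
    no_intro = strip_headers(full_text).strip()
    table = str.maketrans("", "", string.punctuation)
    return no_intro.translate(table).lower().split()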
Example #40
def get_book_text(csvfile):
    'gets text for book using project gutenberg catalog'
    book_list = open_csv(csvfile)
 
        
    for i, value in enumerate(book_list):
        #print i, value
        
        a = int(book_list[i][2]) # a = book number
        print i, a
        author = book_list[i][0]
        title = book_list[i][1]
        try:
            text = strip_headers(load_etext(a)).strip()
        except ValueError:
            pass
    def split_chapters(self, full_text):
        """
            Removes header and footer from project gutenberg book. 
            Makes a list of chapters, where each chapter is a sublist of paragraphs
        """
        book = strip_headers(full_text)

        chapter_list = re.split(ur'\n\bchapter\b \w+\.?', book, flags=re.IGNORECASE)

        if len(chapter_list) < 2:
            chapter_list = re.split(ur'\n[IVXCLM]+\n', book)
            
        paragraphs_in_chapter_list = []

        for i in range(len(chapter_list)):
            paragraphs_in_chapter_list.append(chapter_list[i].split('\n\r'))

        # return len(paragraphs_in_chapter_list)
        return paragraphs_in_chapter_list
 def the_text(self): 
      try:
         self.novel = load_etext(self.novel_num)
      except:
         rejects.append(self.novel_num)
         return False
           
      if re.search('Character set encoding: ASCII', self.novel):
         self.novel = strip_headers(self.novel)
         self.novel = self.novel.replace('\n', ' ')
         self.novel = TextBlob(self.novel)
         self.novel_sentences = self.novel.sentences
         self.m = str(self.novel_num)
         with open('novel_'+self.m +'list_1.csv', 'wb') as f:
             writer = csv.writer(f)
             for sentence in self.novel_sentences:
                 writer.writerow([sentence])
      else: 
         rejects_2.append(self.novel_num) 
         return False    
def process_file(direc, file):
    """
    Given a file and directory, extracts the title and author from
    the file if it's an English language text. Then strips the header
    information using Gutenberg module and stores the new file in a
    nameddir directory with filename "title%%%author.txt" for future
    processing.

    :param direc: <String> Path to directory containing the file, no trailing '/'
    :param file: <String> Name of the file
    """
    # Grab author and title from the top if language is English
    title = ""
    author = ""
    lang = False
    text = ""
    enc = 'ISO-8859-1'
    path = direc + '/' + file
    with open(path, 'r', encoding=enc) as inf:
        text = inf.read()
        inf.seek(0)  # reset buffer to read author and title
        for line in inf:
            if "Title:" in line:
                title = line.replace("Title: ", "").strip()
            if "Author:" in line:
                author = line.replace("Author: ", "").strip()
            if "Language:" in line and "English" in line:
                lang = True
                break

    # Generate new file name like 'Title%%%Author.txt' for easy lookup
    filename = title[:min(100, len(title))].replace("/", "") + delim\
               + author[:min(100, len(author))].replace("/", "") + ".txt"

    # Remove copyright and metadata from the file
    text = strip_headers(text).strip()

    # Save the file in 'nameddir' (global var) directory
    if lang:
        with open(nameddir + '/' + filename, 'w+') as outf:
            outf.write(text)
Example #44
File: fetch.py Project: LSaldyt/Plutus
def fetch_gutenberg(filename=None):
    from gutenberg.acquire import load_etext
    from gutenberg.cleanup import strip_headers
    from gutenbergsettings import popularTitles, saveInterval

    start    = time.time()
    lastsave = start

    with redirect(filename):
        try:
            for title in popularTitles:
                text = strip_headers(load_etext(title)).strip()
                serialize([(title, text)], '../serialized/guten%s' % title)
                sinceLast = time.time() - lastsave
                print('%s since last save' % sinceLast)
                if sinceLast > saveInterval:
                    concatenate('guten')
                    lastsave = time.time()
        except KeyboardInterrupt:
            concatenate('guten')
            sys.exit(0)
Example #45
 def test_strip_headers(self):
     for testcase in SampleText.all():
         expected = testcase.clean_text.splitlines()
         actual = strip_headers(testcase.raw_text).splitlines()
         lines = zip(actual, expected)
         for i, (actual_line, expected_line) in enumerate(lines, start=1):
             self.assertEqual(
                 actual_line,
                 expected_line,
                 u'non-matching lines:\n'
                 u'{previous_lines}\n'
                 u'{lineno_separator}\n'
                 u'got "{actual}"\n'
                 u'expected "{expected}"\n'
                 u'{separator}\n'
                 u'{next_lines}'.format(
                     previous_lines=_previous_lines(i, lines, amount=3),
                     next_lines=_next_lines(i, lines, amount=3),
                     actual=actual_line,
                     expected=expected_line,
                     lineno_separator='line {}:'.format(i).center(80, '-'),
                     separator=''.center(80, '-')))
Example #46
def main():
    filename = "gutenbergscifi.csv"
    json_filename = "gendered_words.json"
    
    if os.path.isfile(filename):
        print "file exists"
        
    else:
        write_csv = extract_text_urls(filename)
        print "file created"

    book_list = open_csv(filename)
    
    print book_list
    
    for i, value in enumerate(book_list):
        #print i, value
        
        a = int(book_list[i][2]) # a = book number
        print i, a
        author = book_list[i][0]
        title = book_list[i][1]
        try:
            text = strip_headers(load_etext(a)).strip()
        except ValueError:
            pass
        #print text  
        
        clean_text = remove_punc_html(text)
        ratio = gender_ratio(clean_text)
        
        print author, title, ratio
        uber_key = author
        sub_key = title
        sub_value = ratio
        uber_value = {author: {title:ratio}}

        json_source = read_write_json_object(json_filename="gendered_words.json", uber_key=uber_key, uber_value=uber_value, sub_key=sub_key, sub_value=sub_value, READ=False, WRITE=True)
Example #47
def extract_subroutine(data, src_dir, century):
    session = model.get_session()
    backoff = 1

    counter = 0
    for metadata in data:
        contents = extract_book_contents(metadata)

        if contents is None:
            backoff *= 1.5
            continue

        title = metadata['title']
        author = metadata['author']
        e_id = metadata['id']

        if type(title) == list:
            title = dasherize(title)

        text_file_path = os.path.join(src_dir, dasherize(title.split(" ")))
        text = strip_headers(load_etext(e_id)).strip()
        f = open(text_file_path, "w")
        f.write(text)
        f.close()
        book = model.Book(
            title=title,
            author=author,
            e_id=e_id,
            century=century,
            text_file_path=text_file_path
        )
        session.add(book)
        session.commit()
        log.info("successfully added " + title)
        counter += 1
        time.sleep(backoff)

    log.info("---- finished run. added %d books ----" % counter)
 def test_strip_headers(self):
     for testcase in SampleText.all():
         expected = testcase.clean_text.splitlines()
         actual = strip_headers(testcase.raw_text).splitlines()
         lines = list(zip(actual, expected))
         for i, (actual_line, expected_line) in enumerate(lines, start=1):
             self.assertTrue(
                 actual_line == expected_line,
                 u('non-matching lines for etext {etextno}:\n'
                   '{previous_lines}\n'
                   '{lineno_separator}\n'
                   'got "{actual}"\n'
                   'expected "{expected}"\n'
                   '{separator}\n'
                   '{next_lines}')
                 .format(
                     etextno=testcase.etextno,
                     previous_lines=_previous_lines(i, lines, amount=3),
                     next_lines=_next_lines(i, lines, amount=3),
                     actual=actual_line,
                     expected=expected_line,
                     lineno_separator='line {0}:'.format(i).center(80, '-'),
                     separator=''.center(80, '-')))
Example #49
def download_book(title, gutenberg_id, data_path, sleep=0):
    print('downloading {:}'.format(title))

    full_text = strip_headers(load_etext(gutenberg_id)).strip()
    summary = downloadSummary(title)

    if full_text is None:
        print('Full text is None. Skipping {:}'.format(title))
        return
    if summary is None:
        print('Summary is None. Skipping {:}'.format(title))
        return

    output_data = {'title': title,
                   'full_text': full_text,
                   'summary': summary}
        
    output_file = os.path.join(data_path,
                               '{:}.json'.format(gutenberg_id))
    with open(output_file, 'w') as f:
        json.dump(output_data, f, ensure_ascii=False)

    time.sleep(sleep)
Example #50
def getBook(bookDetails):
    global timeAtLastFetch
    cachedFilename = "cache/" + fileNameForBook(bookDetails) + ".txt"
    if os.path.isfile(cachedFilename):
        with open(cachedFilename) as bookfile:
            text = bookfile.read()
            return TextBlob(text)

    nowMS = milliseconds()
    timeSinceLastFetch = nowMS - timeAtLastFetch
    if timeSinceLastFetch < gutenbergWaitTimeMS:
        waitTime = gutenbergWaitTimeMS - timeSinceLastFetch
        print "    waiting {}ms for Gutenberg...".format(waitTime)
        time.sleep(waitTime / 1000)

    bookId = bookDetails['id']
    print "Fetching from Gutenberg id {}".format(bookId)
    source = load_etext(bookId)
    print "    cleaning...."
    source = removeUnicodeWords.sub("", source)
    source = removeUnicodeCharacters.sub("", source)
    source = removePossessive.sub("", source)
    source = removeWordsWithApostrophe.sub("", source)
    source = removeHyphens.sub(" ", source)
    source = removeChapterHeaders.sub("", source)
    source = removeRomanNumerals.sub("", source)
    source = removeEllipsis.sub("", source)
    text = strip_headers(source).strip()
    timeAtLastFetch = milliseconds()
    if not os.path.isdir("cache"):
        os.mkdir("cache")
    bookfile = open(cachedFilename, 'w')
    bookfile.write(text)
    bookfile.close()
    print "    fetched and cached " + bookDetails['title']
    return TextBlob(text)
Example #51
File: reader.py Project: sfrapoport/litbot
def generate_tweets(gutenberg_id, total=24):
    document = []
    text = strip_headers(load_etext(gutenberg_id)).strip()
    lines = text.split('\n')    
    print get_metadata('title', gutenberg_id)
    for line in lines:
        words = re.findall(regex, line)
        document.extend(words)

    trigrams = zip(document, document[1:], document[2:])
    trigram_transitions = defaultdict(list)
    starts = []

    for prev, current, next in trigrams:
            if prev == ".":
                    starts.append(current)
            trigram_transitions[(prev, current)].append(next)

    def generate_using_trigrams():
            current = random.choice(starts)
            prev = "."
            result = [current]
            while True:
                    next_word_candidates = trigram_transitions[(prev, current)]
                    next_word = random.choice(next_word_candidates)
                    prev, current = current, next_word
                    if current != ".":
                        result.append(current)
                    else:
                        return " ".join(result) + current
    tweets = [];
    while len(tweets) < total:
        tweet = generate_using_trigrams()
        if len(tweet) <= 140:
            tweets.append(tweet)
    return tweets
Example #52
    for item in ners:
        # Loop over the Stanford NER (per / person) results and apply
        # probablepeople, which raises an exception when it fails (hence the try).
        if "per" in item["tag"].lower():
            result = None
            try:
                result = parse(item.get('string'))
            except:
                log.error("Could not run probablepeople")

            if result:
                result = parse(item["string"])
                pp.append(result)
    ner["pp"] = pp
    return ner


if __name__ == '__main__':
    if len(sys.argv) >= 2 and 'test' in " ".join(sys.argv):
        import doctest
        doctest.testmod(verbose=True)

    if len(sys.argv) >= 2 and 'profile' in " ".join(sys.argv):
        from gutenberg.acquire import load_etext
        from gutenberg.cleanup import strip_headers
        from pycallgraph import PyCallGraph
        from pycallgraph.output import GraphvizOutput

        text = smart_text(strip_headers(load_etext(54807)).strip())
        with PyCallGraph(output=GraphvizOutput()):
            stanford_ner_wrapper(text, 9992, True)
for index, record in df.iterrows():
    # Get the key
    url = record['url']
    key = bucket.get_key(url)
    if key is None:
        #Remove the utf8 extension from url
        utf8_extension = url.rfind('.utf8')
        if (utf8_extension != -1):
            url = url[0:utf8_extension]
    key = bucket.get_key(url)
    if key is None:
        continue
    contents = key.get_contents_as_string()
    contents = unicode(contents, 'utf-8')
    book_text = strip_headers(contents).strip()
    book_length = len(book_text)
    noise_size = int(book_length * 0.05) 
    #Compute offsets for content
    start_offset = noise_size
    end_offset = book_length - noise_size
    #Remove the noise from book text
    document = book_text[start_offset:end_offset]
    #Truncate the document at full stops
    start = document.find('.')
    end = document.rfind('.')
    if ((start != -1) and (end != -1)):
        document = document[start+1:end+1]
    #Remove special characters and digits
    pattern = '[^\w+.\s+,:;?\'-]'
    prog = re.compile(pattern,re.UNICODE)
def split_sentences(text):
    for sentence_separator in [u'. ', u'.\n', u'? ', u'! ', u'?\n', u'!\n', u'; ', u';\n', u'- ', u'--', u'...', u'\n', u'\n\n', u'\n\n\n']:
        text = text.replace(sentence_separator, u'|||')
    return text.split(u'|||')


# Find out how many books the corpus contains.
print u'Total de libros en español:',len(codigos_libros.es)


# Now load the books and strip their headers.
dic_oraciones_es={}
total_palabras_es=0
for codigo_libro_es in codigos_libros.es:
	texto=load_etext(codigo_libro_es)
	texto=strip_headers(texto)
	
# In each book, split the sentences and delimit them with the ||| marker.
	oraciones_libro=split_sentences(texto)
	for oracion_libro in oraciones_libro:
		palabras=rufino.split_words(oracion_libro)
		numero_palabras_oracion=len(palabras)
		total_palabras_es+=numero_palabras_oracion
		if numero_palabras_oracion not in dic_oraciones_es:
			dic_oraciones_es[numero_palabras_oracion]=1
		else:
			dic_oraciones_es[numero_palabras_oracion]=dic_oraciones_es[numero_palabras_oracion]+1


print u'Total de oraciones en español:',len(dic_oraciones_es)
print u'Total de palabras en español:',total_palabras_es
"""
Created on Wed Aug 12 18:06:45 2015

@author: Tony
Description: Pull etext numbers from Project Gutenberg for an author

1) First pip install gutenberg 0.4.0 library for Python from the command line

"""
 
from gutenberg.query import get_etexts
from gutenberg.query import get_metadata
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers


# get the catalogue numbers of all the texts
# by Wilhelm Grimm in Project Gutenberg
bookList=get_etexts('author', 'Grimm, Wilhelm Carl')
# gives bookList = [12704, 12705, 12706, 12707, 12708, 12709, 12710, 37381, 20051, 28044, 30510, 22555, 20050, 11027, 16846, 12250, 20027, 19068, 2591]

#Once We can associate a number with a title we can pull the text
for number in bookList:
    print(number,get_metadata('title',number))
 
print('\n HHHHHHHHHHHHHHH Now for the full text HHHHHHHHHHHHHHHHHHH \n')
# Once we have the text number we can print the text
# example 11027 is the number for Grimm's Fairy Stories 
# can be temperamental, truncating text at the top (console limit?); may need to work around it
etext = strip_headers(load_etext(11027)).strip()
print(etext)
Example #56
25777, 25988, 26284, 26655, 27736, 29497, 29506, 29663, 29799, 29831, 30053, 30122, 30425, 
30535, 30903, 30986, 31013, 31464, 31541, 31613, 31637, 31707, 32235, 32315, 32364, 33690, 
35882, 36253, 36453, 36573, 36940, 37067, 37095, 37139, 37637, 38814, 39444, 39613, 39990, 
41746, 42727,)



guardar_cadena=0
cadena=u''
interjecciones={}
contador = 0


for texto in textos:  # Repeat the loop for each book
    texto = load_etext(texto)  # Load the text
    texto = strip_headers(texto).lower()  # Strip the headers
    texto = unicode(texto)
    for caracter in texto:  # Walk through the text character by character
        if caracter == u'¡':  # If an opening exclamation mark is found
            guardar_cadena = 1  # Set a flag to start saving the string
            cadena = cadena + unicode(caracter)
        if caracter == u'!':  # If a closing exclamation mark is found
            cadena = cadena + unicode(caracter)  # 1. Save that last character (this is optional)
            if cadena in interjecciones.keys():  # 2. Check whether the string is already in the dictionary
                interjecciones[cadena] += 1  # 3. If it is, add one to its counter
            else:  # 4. If it is not
                interjecciones[cadena] = 1  # Add it, starting its count at 1
            guardar_cadena = 0  # 5. Reset the flag so nothing more is saved
            cadena = ''
        if guardar_cadena == 1:  # 6. Check whether the save flag is set to 1
            cadena = cadena + unicode(caracter)  # If so, append the current character and repeat the loop
Example #57
def downloadText(textID):
    print "Downloading", textID
    text = strip_headers(load_etext(textID)).strip()
    return text
Example #58
# Written in Python 3.5
from gutenberg.acquire import load_etext 
from gutenberg.cleanup import strip_headers

librosCodigo = {"Francés":[13735,13808],"Español":[24925,15027],"Portugés":[14904,16384],"Inglés":[10422,1013]}
dic_idiomas={}

for idioma in librosCodigo.keys():
    diccionario_largo_palabras={}

    for indeCo in librosCodigo[idioma]:
        texto= strip_headers(load_etext(indeCo))
        dic_idiomas[idioma]= diccionario_largo_palabras

        for caracter_especial in ['"',"...","¿","?","=","_","[","]","(",")",",",".",":",";","!","¡","«","»","*","~","' "," '","- "," -","--"]:
            texto=texto.replace(caracter_especial," ")
            palabras=texto.split()

        for palabra in palabras:
            largo_palabra = len(palabra)
            if largo_palabra in diccionario_largo_palabras:
                diccionario_largo_palabras[largo_palabra] = diccionario_largo_palabras[largo_palabra]+1
            else:
                diccionario_largo_palabras[largo_palabra]= 1
print (dic_idiomas)
Example #59
            except Exception, e:
                logging.error("%s: %s" % (path, e))
                # raise e

    @classmethod
    def text_from_zip(cls, path, rdf_catalog_path=None):
        """Return a ProjectGutenbergText object from a zip file."""
        archive = zipfile.ZipFile(path)
        inside = archive.filelist
        filenames = [x.filename for x in inside]
        if len(inside) != 1:
            logging.warn("Supposedly plain-text %s has %d files in zip: %s" % (
                    path, len(inside), ", ".join(filenames)))
        possibilities = [x for x in filenames if x.lower().endswith(".txt")]
        data = archive.read(possibilities[0])
        return ProjectGutenbergText(data, path, rdf_catalog_path)

    @property
    def paragraphs(self):
        return self.text.split("\r\n\r\n")
Obj = ProjectGutenbergText(text, name=None, rdf_catalog_path=raw_data)
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers

text = strip_headers(load_etext(2701)).strip()
assert text.startswith('MOBY DICK; OR THE WHALE\n\nBy Herman Melville')




Example #60
print total
i = 0
for (dirpath, dirnames, filenames) in walk(gutenberg_path):
    for filename in filenames:
        f =  "/".join([dirpath, filename])
        if(f.endswith(".rdf")):
            #print f
            i+=1
            bf = BeautifulSoup(open(f))
            subjects =  bf.find_all("dcterms:subject")
            if (subjects is not None and len(subjects) > 0):
                for subject in subjects:
                    val =  subject.find_all("rdf:value")[0].contents[0]
                    for i_subject in i_subjects:
                        if(i_subject in val.lower()):
                            #print f, val

                            id =  int(basename(f)[2:-4])
                            fn = str(id).zfill(10) + "_" +  i_subject + ".txt"
                            print fn
                            try:
                                text = strip_headers(load_etext(id)).strip().encode("utf-8")
                                wf = "./texts/" + fn
                                with open(wf, "w") as text_file:
                                    text_file.write(text)
                                print i, total, float(i)/total
                            except:
                                print "broken", id
            # for network in tree.findtext('dcterms subject'):
            #     print network