def __init__(self, corpus, auxiliar=None, use_proccessing=False):
        self.corpus = corpus
        self.auxiliar = auxiliar
        self.proccessing = use_proccessing

        # flat lists of every sentence, in original and preprocessed form
        self.allSentences = []
        self.pAllSentences = []

        # each corpus entry maps a document name to a pair:
        # (original_sentences, preprocessed_sentences)
        for i in self.corpus.items():
            original_sentences = i[1][0]
            preprocesed_sentences = i[1][1]

            self.allSentences.extend(
                extract_sentences(original_sentences, False))
            self.pAllSentences.extend(
                extract_sentences(preprocesed_sentences, True))

        # an optional auxiliary corpus is flattened the same way
        if self.auxiliar is not None:
            for i in self.auxiliar.items():
                original_sentences = i[1][0]
                preprocesed_sentences = i[1][1]

                self.allSentences.extend(
                    extract_sentences(original_sentences, False))
                self.pAllSentences.extend(
                    extract_sentences(preprocesed_sentences, True))
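The indexing `i[1][0]` / `i[1][1]` above implies that both `corpus` and `auxiliar` are dicts mapping a document name to an `(original_sentences, preprocessed_sentences)` pair. A minimal sketch of that assumed shape (the toy data and loop below are illustrative, not taken from the source):

# Toy corpus matching the assumed {doc_name: (original, preprocessed)} layout.
toy_corpus = {
    'doc1': (['A first sentence.', 'A second one.'],
             ['first sentence', 'second one']),
}
for doc_name, (original_sentences, preprocessed_sentences) in toy_corpus.items():
    print(doc_name, len(original_sentences), len(preprocessed_sentences))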
Example #2
def main():
    tags = config.TAGS

    ##### TEST CODE #####
    sentences = utils.extract_sentences("./testfile")
    my_dict = associate_tags(tags, sentences)
    for stuff in my_dict.keys():
        print "**************************************"
        print stuff, "|"
        if isinstance(my_dict[stuff], list):
            for ent in my_dict[stuff]:
                print ent
        else:
            print my_dict[stuff]
        print "**************************************"
    exit(0)
    ##### END TEST CODE #####

    my_dict = {}
    for arg in sys.argv[1:]:  # skip the script name itself
        if os.path.isdir(arg):
            merge_dicts(my_dict, create_dict([], arg))
        else:
            file_dict = associate_tags(tags, utils.extract_sentences(arg))
            merge_dicts(my_dict, file_dict)
Example #3
def main(inputdir: str,
         outputdir: str,
         minwords: int = 3,
         maxwords: int = 15,
         one: bool = False,
         dry: bool = False,
         abbr: bool = False,
         code: bool = False,
         plaintext: bool = False):
    try:
        nlp = spacy.load('fr_core_news_sm')
        nlp.add_pipe(set_custom_boundaries, before='parser')
    except OSError:
        raise OSError('French model not installed. Please run:\n'\
                      'python -m spacy download fr_core_news_sm')
    filenames = list_files(inputdir)
    if one:
        filenames = filenames[0:1]
    for filename in filenames:
        text = parse_epub(filename, abbr, code)
        if plaintext:
            string_final = text
        else:
            # extract sentences using utils module
            sentences = extract_sentences([text],
                                          min_words=minwords,
                                          max_words=maxwords,
                                          nlp=nlp)
            # clean and filter sentences
            sentences = [clean_sentence(x) for x in sentences]
            string_final = '\n'.join(list(sentences))
        if not dry:
            save_text(string_final, filename, inputdir, outputdir)
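`set_custom_boundaries` is used here but not defined in the snippet. The `nlp.add_pipe(func, before='parser')` call points at the spaCy 2.x pattern of a plain function that receives a `Doc`, marks additional sentence starts, and returns it. A hedged sketch of what such a component might look like (the semicolon rule is an assumption, not taken from the source):

def set_custom_boundaries(doc):
    # Illustrative boundary rule (assumed): also start a new sentence
    # after a semicolon, before the parser runs.
    for token in doc[:-1]:
        if token.text == ';':
            doc[token.i + 1].is_sent_start = True
    return doc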
Example #4
def dump_one_play(play, nlp=None):
    print('Treating playid #{}'.format(play))
    try:
        sentences = list(
            extract_sentences(get_one_play(play),
                              args.min_words,
                              args.max_words,
                              nlp=nlp))
        nb_sents = len(sentences)

        if nb_sents < 2:
            print('Too little content: %d. Check %s' %
                  (nb_sents, WORK_TEMPLATE % {
                      'workid': play
                  }))
            return

        output_play_name = os.path.join(args.output, "{}.txt".format(play))
        print('output_play_name', output_play_name)
        if not args.dry:
            with open(output_play_name, 'wb') as output_play:
                bytes = output_play.write('\n'.join(sentences).encode('utf-8'))
                if bytes == 0:
                    print('Empty content for playid #{}'.format(play))
        else:
            print('\n'.join(sentences))
    except ValueError as e:
        print('Unable to fetch play because of', e)
Example #5
    def get_matrix_glove(self):
        # `some_parameters` is assumed to be a configuration dict defined
        # elsewhere; Glove.load reads a saved glove_python model from disk
        model = Glove.load(some_parameters['glove_model'])
        vectors = model.word_vectors
        dictionary = model.dictionary
        corpus_matrix = dict()

        for i in self.corpus.items():
            doc_name = i[0]
            original_sentences = i[1][0]
            preprocesed_sentences = i[1][1]

            ori_sents = extract_sentences(original_sentences, False)
            pp_sents = extract_sentences(preprocesed_sentences, True)

            if self.proccessing:
                matrix = get_glove_matrix(pp_sents, dictionary, vectors)
            else:
                matrix = get_glove_matrix(ori_sents, dictionary, vectors)

            corpus_matrix[doc_name] = matrix

        return corpus_matrix
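`get_glove_matrix` is not shown in this example. A plausible reading, offered only as a hedged sketch, is that it turns each sentence into one row by averaging the GloVe vectors of its in-vocabulary tokens, assuming each sentence is a plain string and that `dictionary` maps a token to its row index in `vectors`, as exposed by glove_python:

import numpy as np

def get_glove_matrix(sentences, dictionary, vectors):
    # Assumed helper, not taken from the source: one averaged vector per sentence.
    rows = []
    for sentence in sentences:
        idx = [dictionary[tok] for tok in sentence.split() if tok in dictionary]
        rows.append(vectors[idx].mean(axis=0) if idx
                    else np.zeros(vectors.shape[1]))
    return np.vstack(rows)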
Example #6
def extract_sentences_with_links(tag, filepath):
    """Build a dict of Entity s from a single file

    dict extract_sentences_with_links(str[] tag_list, str filepath)
    str[] tag: list of tags to search for
    str filepath: path to file to process
    return value: dictionary of Entity s encountered in processed file. Keys are
    entities as strings, values are Entity classes or lists of Entity classes in
    case of conflict.

    IMPORTS: utils"""

    sentences = utils.extract_sentences(filepath)
    return associate_tags([tag], sentences)
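Given the contract in the docstring, a value can be either a single Entity or a list of conflicting ones. A short hedged usage sketch (the tag and path are placeholders, not from the source):

# Hypothetical call; 'PERSON' and the path are illustrative placeholders.
entities = extract_sentences_with_links('PERSON', './data/article.txt')
for key, value in entities.items():
    if isinstance(value, list):   # several Entity objects share this key
        print(key, '->', len(value), 'conflicting entities')
    else:
        print(key, '->', value)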
Example #7
def dump_one_book(book):
    print('Treating bookid #{}'.format(book))
    try:
        sentences = extract_sentences(parse_one_book(book), args.min_words,
                                      args.max_words)

        output_book_name = os.path.join(args.output, "{}.txt".format(book))
        print('output_book_name', output_book_name)
        if not args.dry:
            with open(output_book_name, 'wb') as output_book:
                bytes = output_book.write(
                    '.\n'.join(sentences).encode('utf-8'))
                if bytes == 0:
                    print('Empty content for bookid #{}'.format(book))
        else:
            print('.\n'.join(sentences))
    except UnknownDownloadUriException:
        print('Unable to get bookid #{}'.format(book))
Example #8
def main(inputdir: str,
         outputdir: str,
         minwords: int = 3,
         maxwords: int = 14,
         one: bool = False,
         dry: bool = False,
         abbr: bool = False,
         code: bool = False,
         plaintext: bool = False):
    try:
        nlp = spacy.load('fr_core_news_sm')
        # add max length for the sentence
        nlp.max_length = 5000000
        nlp.add_pipe(set_custom_boundaries, before='parser')
    except OSError:
        raise OSError('French model not installed. Please run:\n'\
                      'python -m spacy download fr_core_news_sm')

    if inputdir == '-1':
        # download file
        subprocess.call(
            ['wget', '-O', FILENAME_TMP, EPUB_LINK, '--no-check-certificate'])
        filenames = [FILENAME_TMP]
    else:
        # list file directory
        filenames = list_files(inputdir)

    for filename in filenames:
        text = parse_epub(filename, abbr, code)
        if plaintext:
            string_final = text
        else:
            # extract sentences using utils module
            sentences = extract_sentences([text],
                                          min_words=minwords,
                                          max_words=maxwords,
                                          nlp=nlp)
            # clean and filter sentences
            sentences = [clean_sentence(x) for x in sentences]
            string_final = '\n'.join(list(sentences))
        if not dry:
            save_text(string_final, filename, inputdir, outputdir)
Example #9
            visited.append(node)

            if node.tagName == "DateSeance":
                if seance_context is not None and 'texte' in seance_context:
                    output_seance_name = os.path.join(
                        args.output, seance_context['DateSeance'])
                    if os.path.isfile(output_seance_name + '.txt'):
                        output_seance_name += str(
                            int(
                                datetime.datetime.timestamp(
                                    datetime.datetime.utcnow())))

                    output_seance_name += '.txt'
                    print('output_seance_name', output_seance_name)
                    sentences = extract_sentences(seance_context['texte'],
                                                  args.min_words,
                                                  args.max_words)
                    if not args.dry:
                        with open(output_seance_name, 'w') as output_seance:
                            output_seance.write('.\n'.join(sentences))
                    else:
                        print('.\n'.join(sentences))

                    if args.one:
                        break

                doc.expandNode(node)
                date_seance = node.firstChild.nodeValue

                if len(date_seance) != 17:
                    print("Bogus DateSeance?", date_seance)
                    continue
Example #10
                    #if we want to retrieve only page creations (faster)
                    elif args.type == "creation" and "new" in contrib.keys():
                        revid_list.append(str(contrib["revid"]))
        #Retrieving the uccontinue value to go to the next page of contributions
        if "continue" in my_json.keys():
            try:
                uccontinue = my_json["continue"]["uccontinue"]
            except KeyError:
                break
        else:
            break
    time.sleep(30)  #Gives Wikimedia servers a rest
    print("Extracting sentences")
    if args.type == "creation":
        text_list = get_article_texts(args.lang, revid_list)
    else:
        text_list = list(filter(None, text_list))
    extracted_sentences = list(
        extract_sentences(text_list, args.min_words, args.max_words, nlp=nlp))
    print(len(extracted_sentences), "sentences retrieved")
    if len(extracted_sentences) > 0:  #If we extracted at least one sentence...
        with open(
                os.path.join(args.output,
                             "_".join([str(user), str(licence)]) + ".txt"),
                "wb") as f:
            for sentence in extracted_sentences:
                f.write(str(sentence + " \n").encode("utf8"))
    print(user, "'s contributions retrieved")
print("Done.")
Example #11
class Vocab(object):
    def __init__(self, sents, vocab_size, freq_cutoff):
        print('initialize vocabulary ..')
        self.train = VocabEntry.from_corpus(sents, vocab_size, freq_cutoff)

    def __repr__(self):
        return 'Vocab(%d words)' % (len(self.train))


if __name__ == '__main__':
    args = parse_arguments()

    # Extracting training and validation data
    sent_no, avg_words = extract_sentences(
        'data/v2_Questions_Train_mscoco/v2_OpenEnded_mscoco_train2014_questions.json',
        'data/train')
    print('Train: ')
    print('Total sentence: {}, avg words / sentence: {}'.format(
        sent_no, avg_words))
    sent_no, avg_words = extract_sentences(
        'data/v2_Questions_Val_mscoco/v2_OpenEnded_mscoco_val2014_questions.json',
        'data/val')
    print('Val: ')
    print('Total sentence: {}, avg words / sentence: {}'.format(
        sent_no, avg_words))
    sent_no, avg_words = extract_sentences(
        'data/v2_Questions_Test_mscoco/v2_OpenEnded_mscoco_test2015_questions.json',
        'data/test')
    print('Test: ')
    print('Total sentence: {}, avg words / sentence: {}'.format(
        sent_no, avg_words))
Example #12
    make_sentences_vectors, make_similarity_matrix, apply_pagerank, ask_top_n_sentences_to_extract, extract_sentences

pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)

dataset_path = Path.cwd() / "data" / "Reviews.csv"
if __name__ == '__main__':
    dataset = pd.read_csv(dataset_path, nrows=100)
    dataset.drop_duplicates(subset=['Text'], inplace=True)
    dataset.dropna(axis=0, inplace=True)

    sentences_list = split_in_sentences(dataset['Text'])
    sentences_list = remove_html_tag(sentences_list)

    pre_processed_sentences = pre_processing(sentences_list)

    embedding_dimensionality = ask_embedding_dim()
    embeddings = get_word_embeddings(embedding_dimensionality)

    sents_vects = make_sentences_vectors(pre_processed_sentences, embeddings, int(embedding_dimensionality))

    similarity_matrix = make_similarity_matrix(sentences_list, sents_vects, int(embedding_dimensionality))

    pagerank_scores = apply_pagerank(similarity_matrix)

    number_sentences_to_extract = ask_top_n_sentences_to_extract()

    for ex_sent in extract_sentences(number_sentences_to_extract, sentences_list, pagerank_scores):
        print(ex_sent, "\n")
Example #13
def apply_to_file(filepath):
    sentences = utils.extract_sentences(filepath)
    merge_dicts(corpus_dict, associate_tags(tag_list, sentences))