def __init__(self, corpus, auxiliar=None, use_proccessing=False):
    self.corpus = corpus
    self.auxiliar = auxiliar
    self.proccessing = use_proccessing
    self.allSentences = []
    self.pAllSentences = []
    # Collect original and preprocessed sentences from the main corpus.
    for i in self.corpus.items():
        original_sentences = i[1][0]
        preprocesed_sentences = i[1][1]
        self.allSentences.extend(
            extract_sentences(original_sentences, False))
        self.pAllSentences.extend(
            extract_sentences(preprocesed_sentences, True))
    # Do the same for the auxiliary corpus, if one was provided.
    if self.auxiliar is not None:
        for i in self.auxiliar.items():
            original_sentences = i[1][0]
            preprocesed_sentences = i[1][1]
            self.allSentences.extend(
                extract_sentences(original_sentences, False))
            self.pAllSentences.extend(
                extract_sentences(preprocesed_sentences, True))

def main():
    tags = config.TAGS

    ##### TEST CODE #####
    sentences = utils.extract_sentences("./testfile")
    my_dict = associate_tags(tags, sentences)
    for stuff in my_dict.keys():
        print("**************************************")
        print(stuff, "|")
        if isinstance(my_dict[stuff], list):
            for ent in my_dict[stuff]:
                print(ent)
        else:
            print(my_dict[stuff])
        print("**************************************")
    exit(0)
    ##### END TEST CODE #####

    my_dict = {}
    for arg in sys.argv[1:]:
        if os.path.isdir(arg):
            merge_dicts(my_dict, create_dict([], arg))
        else:
            file_dict = associate_tags(tags, utils.extract_sentences(arg))
            merge_dicts(my_dict, file_dict)

def main(inputdir: str,
         outputdir: str,
         minwords: int = 3,
         maxwords: int = 15,
         one: bool = False,
         dry: bool = False,
         abbr: bool = False,
         code: bool = False,
         plaintext: bool = False):
    try:
        nlp = spacy.load('fr_core_news_sm')
        nlp.add_pipe(set_custom_boundaries, before='parser')
    except OSError:
        raise OSError('French model not installed. Please run:\n'
                      'python -m spacy download fr_core_news_sm')

    filenames = list_files(inputdir)
    if one:
        filenames = filenames[0:1]

    for filename in filenames:
        text = parse_epub(filename, abbr, code)
        if plaintext:
            string_final = text
        else:
            # extract sentences using utils module
            sentences = extract_sentences([text],
                                          min_words=minwords,
                                          max_words=maxwords,
                                          nlp=nlp)
            # clean and filter sentences
            sentences = [clean_sentence(x) for x in sentences]
            string_final = '\n'.join(list(sentences))
        if not dry:
            save_text(string_final, filename, inputdir, outputdir)

def dump_one_play(play, nlp=None):
    print('Treating playid #{}'.format(play))
    try:
        sentences = list(
            extract_sentences(get_one_play(play),
                              args.min_words,
                              args.max_words,
                              nlp=nlp))
        nb_sents = len(sentences)
        if nb_sents < 2:
            print('Too few content: %d. Check %s' %
                  (nb_sents, WORK_TEMPLATE % {'workid': play}))
            return

        output_play_name = os.path.join(args.output, "{}.txt".format(play))
        print('output_play_name', output_play_name)
        if not args.dry:
            with open(output_play_name, 'wb') as output_play:
                bytes = output_play.write('\n'.join(sentences).encode('utf-8'))
                if bytes == 0:
                    print('Empty content for playid #{}'.format(play))
        else:
            print('\n'.join(sentences))
    except ValueError as e:
        print('Unable to fetch play because of', e)

def get_matrix_glove(self):
    model = Glove.load(some_parameters['glove_model'])
    vectors = model.word_vectors
    dictionary = model.dictionary
    corpus_matrix = dict()
    for i in self.corpus.items():
        doc_name = i[0]
        original_sentences = i[1][0]
        preprocesed_sentences = i[1][1]
        ori_sents = extract_sentences(original_sentences, False)
        pp_sents = extract_sentences(preprocesed_sentences, True)
        # Build the GloVe matrix from either the preprocessed or the original
        # sentences, depending on the configured flag.
        if self.proccessing:
            matrix = get_glove_matrix(pp_sents, dictionary, vectors)
        else:
            matrix = get_glove_matrix(ori_sents, dictionary, vectors)
        corpus_matrix[doc_name] = matrix
    return corpus_matrix

def extract_sentences_with_links(tag, filepath):
    """Build a dict of Entity objects from a single file.

    extract_sentences_with_links(str tag, str filepath)

    str tag: tag to search for
    str filepath: path to the file to process

    Return value: dictionary of Entity objects encountered in the processed
    file. Keys are entities as strings, values are Entity instances or lists
    of Entity instances in case of conflict.

    IMPORTS: utils
    """
    sentences = utils.extract_sentences(filepath)
    return associate_tags([tag], sentences)

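# A minimal usage sketch for extract_sentences_with_links; the tag value and
# file path below are illustrative placeholders, not taken from the original
# project, and it assumes utils and associate_tags are importable as above.
entities = extract_sentences_with_links("PERSON", "./testfile")
for name, value in entities.items():
    # A value is either a single Entity or, on conflict, a list of Entity objects.
    if isinstance(value, list):
        print(name, "->", len(value), "conflicting entities")
    else:
        print(name, "->", value)
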
def dump_one_book(book):
    print('Treating bookid #{}'.format(book))
    try:
        sentences = extract_sentences(parse_one_book(book), args.min_words,
                                      args.max_words)
        output_book_name = os.path.join(args.output, "{}.txt".format(book))
        print('output_book_name', output_book_name)
        if not args.dry:
            with open(output_book_name, 'wb') as output_book:
                bytes = output_book.write(
                    '.\n'.join(sentences).encode('utf-8'))
                if bytes == 0:
                    print('Empty content for bookid #{}'.format(book))
        else:
            print('.\n'.join(sentences))
    except UnknownDownloadUriException:
        print('Unable to get bookid #{}'.format(book))

def main(inputdir: str,
         outputdir: str,
         minwords: int = 3,
         maxwords: int = 14,
         one: bool = False,
         dry: bool = False,
         abbr: bool = False,
         code: bool = False,
         plaintext: bool = False):
    try:
        nlp = spacy.load('fr_core_news_sm')
        # add max length for the sentence
        nlp.max_length = 5000000
        nlp.add_pipe(set_custom_boundaries, before='parser')
    except OSError:
        raise OSError('French model not installed. Please run:\n'
                      'python -m spacy download fr_core_news_sm')

    if inputdir == '-1':
        # download file
        subprocess.call(
            ['wget', '-O', FILENAME_TMP, EPUB_LINK, '--no-check-certificate'])
        filenames = [FILENAME_TMP]
    else:
        # list file directory
        filenames = list_files(inputdir)

    for filename in filenames:
        text = parse_epub(filename, abbr, code)
        if plaintext:
            string_final = text
        else:
            # extract sentences using utils module
            sentences = extract_sentences([text],
                                          min_words=minwords,
                                          max_words=maxwords,
                                          nlp=nlp)
            # clean and filter sentences
            sentences = [clean_sentence(x) for x in sentences]
            string_final = '\n'.join(list(sentences))
        if not dry:
            save_text(string_final, filename, inputdir, outputdir)

visited.append(node)
if node.tagName == "DateSeance":
    # Flush the previous séance, if any, before starting a new one.
    if seance_context is not None and 'texte' in seance_context:
        output_seance_name = os.path.join(args.output,
                                          seance_context['DateSeance'])
        if os.path.isfile(output_seance_name + '.txt'):
            output_seance_name += str(
                int(datetime.datetime.timestamp(datetime.datetime.utcnow())))
        output_seance_name += '.txt'
        print('output_seance_name', output_seance_name)
        sentences = extract_sentences(seance_context['texte'],
                                      args.min_words, args.max_words)
        if not args.dry:
            with open(output_seance_name, 'w') as output_seance:
                output_seance.write('.\n'.join(sentences))
        else:
            print('.\n'.join(sentences))
        if args.one:
            break
    doc.expandNode(node)
    date_seance = node.firstChild.nodeValue
    if len(date_seance) != 17:
        print("Bogus DateSeance?", date_seance)

            continue
        # if we want to retrieve only page creations (faster)
        elif args.type == "creation" and "new" in contrib.keys():
            revid_list.append(str(contrib["revid"]))

    # Retrieving the uccontinue value to go to the next page of contributions
    if "continue" in my_json.keys():
        try:
            uccontinue = my_json["continue"]["uccontinue"]
        except:
            break
    else:
        break
    time.sleep(30)  # Gives Wikimedia servers a rest

print("Extracting sentences")
if args.type == "creation":
    text_list = get_article_texts(args.lang, revid_list)
else:
    text_list = list(filter(None, text_list))
extracted_sentences = list(
    extract_sentences(text_list, args.min_words, args.max_words, nlp=nlp))
print(len(extracted_sentences), "sentences retrieved")
if len(extracted_sentences) > 0:
    # If we extracted at least one sentence...
    with open(
            os.path.join(args.output,
                         "_".join([str(user), str(licence)]) + ".txt"),
            "wb") as f:
        for sentence in extracted_sentences:
            f.write(str(sentence + " \n").encode("utf8"))
    print(user, "'s contributions retrieved")
print("Done.")

class Vocab(object):
    def __init__(self, sents, vocab_size, freq_cutoff):
        print('initialize vocabulary ..')
        self.train = VocabEntry.from_corpus(sents, vocab_size, freq_cutoff)

    def __repr__(self):
        return 'Vocab(%d words)' % (len(self.train))


if __name__ == '__main__':
    args = parse_arguments()

    # Extracting training and validation data
    sent_no, avg_words = extract_sentences(
        'data/v2_Questions_Train_mscoco/v2_OpenEnded_mscoco_train2014_questions.json',
        'data/train')
    print('Train: ')
    print('Total sentence: {}, avg words / sentence: {}'.format(
        sent_no, avg_words))

    sent_no, avg_words = extract_sentences(
        'data/v2_Questions_Val_mscoco/v2_OpenEnded_mscoco_val2014_questions.json',
        'data/val')
    print('Val: ')
    print('Total sentence: {}, avg words / sentence: {}'.format(
        sent_no, avg_words))

    sent_no, avg_words = extract_sentences(
        'data/v2_Questions_Test_mscoco/v2_OpenEnded_mscoco_test2015_questions.json',
        'data/test')
    print('Test: ')
    print('Total sentence: {}, avg words / sentence: {}'.format(
        sent_no, avg_words))

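# A minimal sketch of instantiating the Vocab class above; the toy corpus,
# vocabulary size, and frequency cutoff are illustrative values, and it
# assumes VocabEntry.from_corpus accepts (sents, vocab_size, freq_cutoff)
# exactly as shown in __init__.
toy_sents = [['what', 'color', 'is', 'the', 'cat', '?'],
             ['how', 'many', 'people', 'are', 'there', '?']]
vocab = Vocab(toy_sents, vocab_size=50000, freq_cutoff=2)
print(vocab)  # e.g. Vocab(N words)
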
    make_sentences_vectors, make_similarity_matrix, apply_pagerank, \
    ask_top_n_sentences_to_extract, extract_sentences

pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', -1)

dataset_path = Path.cwd() / "data" / "Reviews.csv"

if __name__ == '__main__':
    dataset = pd.read_csv(dataset_path, nrows=100)
    dataset.drop_duplicates(subset=['Text'], inplace=True)
    dataset.dropna(axis=0, inplace=True)

    sentences_list = split_in_sentences(dataset['Text'])
    sentences_list = remove_html_tag(sentences_list)
    pre_processed_sentences = pre_processing(sentences_list)

    embedding_dimensionality = ask_embedding_dim()
    embeddings = get_word_embeddings(embedding_dimensionality)
    sents_vects = make_sentences_vectors(pre_processed_sentences, embeddings,
                                         int(embedding_dimensionality))
    similarity_matrix = make_similarity_matrix(sentences_list, sents_vects,
                                               int(embedding_dimensionality))
    pagerank_scores = apply_pagerank(similarity_matrix)

    number_sentences_to_extract = ask_top_n_sentences_to_extract()
    for ex_sent in extract_sentences(number_sentences_to_extract,
                                     sentences_list, pagerank_scores):
        print(ex_sent, "\n")

def apply_to_file(filepath):
    sentences = utils.extract_sentences(filepath)
    merge_dicts(corpus_dict, associate_tags(tag_list, sentences))

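# A hypothetical driver loop for apply_to_file; the directory path is a
# placeholder, and it assumes corpus_dict and tag_list exist as module-level
# globals, as the function above implies.
import os

for root, _dirs, files in os.walk("./corpus"):
    for name in files:
        apply_to_file(os.path.join(root, name))
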